210 lines
		
	
	
		
			6.4 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			210 lines
		
	
	
		
			6.4 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| /*
 | |
|  * StatusNet - the distributed open-source microblogging tool
 | |
|  * Copyright (C) 2009, StatusNet, Inc.
 | |
|  *
 | |
|  * This program is free software: you can redistribute it and/or modify
 | |
|  * it under the terms of the GNU Affero General Public License as published by
 | |
|  * the Free Software Foundation, either version 3 of the License, or
 | |
|  * (at your option) any later version.
 | |
|  *
 | |
|  * This program is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|  * GNU Affero General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU Affero General Public License
 | |
|  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 | |
|  */
 | |
| 
 | |
| /**
 | |
|  * @package FeedSubPlugin
 | |
|  * @maintainer Brion Vibber <brion@status.net>
 | |
|  */
 | |
| 
 | |
// Refuse to run when loaded outside the application: neither the current
// (STATUSNET) nor the legacy (LACONICA) marker constant has been defined.
if (!defined('STATUSNET') && !defined('LACONICA')) {
    exit(1);
}
 | |
| 
 | |
/**
 * Raised by FeedDiscovery::discoverFromURL() when the HTTP request for the
 * given URL fails before any response is obtained.
 */
class FeedSubBadURLException extends FeedSubException
{
}
 | |
| 
 | |
/**
 * Raised by FeedDiscovery::initFromResponse() when the HTTP response is not
 * a success; the response code is passed as the exception message.
 */
class FeedSubBadResponseException extends FeedSubException
{
}
 | |
| 
 | |
/**
 * Raised by FeedDiscovery::initFromResponse() when the response carries no
 * body; the source URL is passed as the exception message.
 */
class FeedSubEmptyException extends FeedSubException
{
}
 | |
| 
 | |
/**
 * Raised by FeedDiscovery::discoverFromHTML() when the HTML document cannot
 * be loaded into a DOM tree.
 */
class FeedSubBadHTMLException extends FeedSubException
{
}
 | |
| 
 | |
/**
 * Raised by FeedDiscovery::initFromResponse() when the response Content-Type
 * is not one of the accepted XML/RSS/Atom types; the offending type is passed
 * as the exception message.
 */
class FeedSubUnrecognizedTypeException extends FeedSubException
{
}
 | |
| 
 | |
/**
 * Raised by FeedDiscovery::discoverFromURL() when an HTML page contains no
 * feed autodiscovery link; the requested URL is passed as the exception
 * message.
 */
class FeedSubNoFeedException extends FeedSubException
{
}
 | |
| 
 | |
/**
 * Finds and fetches the feed behind a URL.
 *
 * The URL may point straight at a feed document, or at an HTML page that
 * advertises a feed through a <link rel="alternate"> autodiscovery element.
 * After a successful discoverFromURL() / initFromResponse() call the public
 * properties describe the feed that was fetched.
 */
class FeedDiscovery
{
    /** @var string URL the feed document was actually fetched from */
    public $uri;

    /** @var string Content-Type header value of the feed response */
    public $type;

    /** @var string raw body of the feed response */
    public $body;

    /**
     * Build a FeedMunger for the feed currently held in $this->body.
     *
     * @return FeedMunger
     */
    public function feedMunger()
    {
        require_once 'XML/Feed/Parser.php';
        // The trailing flags are passed positionally to XML_Feed_Parser;
        // see the PEAR documentation for their meaning.
        $feed = new XML_Feed_Parser($this->body, false, false, true); // @fixme
        return new FeedMunger($feed, $this->uri);
    }

    /**
     * Fetch $url and initialise this object from the feed found there.
     *
     * When $htmlOk is true and the response is an HTML page, the page is
     * scanned for a feed autodiscovery link and that link is fetched instead
     * (one level only: the follow-up request is made with $htmlOk = false).
     *
     * @param string $url
     * @param bool $htmlOk whether an HTML page may be used for autodiscovery
     * @return bool true on success; $this->uri, ->type and ->body are then set
     * @throws FeedSubBadURLException
     * @throws FeedSubBadResponseException
     * @throws FeedSubBadHTMLException
     * @throws FeedSubNoFeedException
     * @throws FeedSubEmptyException
     * @throws FeedSubUnrecognizedTypeException
     */
    public function discoverFromURL($url, $htmlOk=true)
    {
        try {
            $client = new HTTPClient();
            $response = $client->get($url);
        } catch (HTTP_Request2_Exception $e) {
            // Pass only the message: handing over the exception object would
            // stringify it, stack trace included, into the new message.
            throw new FeedSubBadURLException($e->getMessage());
        }

        if ($htmlOk) {
            // The header may be absent; normalise to a string for preg_match.
            $type = (string)$response->getHeader('Content-Type');
            $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', $type);
            if ($isHtml) {
                // Do not scrape feed links out of an HTML error page.
                if (!$response->isOk()) {
                    throw new FeedSubBadResponseException($response->getCode());
                }
                $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
                if (!$target) {
                    throw new FeedSubNoFeedException($url);
                }
                return $this->discoverFromURL($target, false);
            }
        }

        return $this->initFromResponse($response);
    }

    /**
     * Validate an HTTP response as a feed and store it on this object.
     *
     * @param object $response HTTP response exposing isOk(), getCode(),
     *                         getUrl(), getBody() and getHeader()
     * @return bool true on success
     * @throws FeedSubBadResponseException if the response is not OK
     * @throws FeedSubEmptyException if the body is empty
     * @throws FeedSubUnrecognizedTypeException if the Content-Type is not
     *         text/xml, application/xml, application/rss+xml or
     *         application/atom+xml
     */
    public function initFromResponse($response)
    {
        if (!$response->isOk()) {
            throw new FeedSubBadResponseException($response->getCode());
        }

        $sourceurl = $response->getUrl();
        $body = $response->getBody();
        if (!$body) {
            throw new FeedSubEmptyException($sourceurl);
        }

        // The header may be absent; normalise to a string for preg_match.
        $type = (string)$response->getHeader('Content-Type');
        if (preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
            $this->uri = $sourceurl;
            $this->type = $type;
            $this->body = $body;
            return true;
        }

        common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
        throw new FeedSubUnrecognizedTypeException($type);
    }

    /**
     * Look for a feed autodiscovery link in an HTML document.
     *
     * A link qualifies when it has rel, type and a non-empty href, its rel
     * token list contains "alternate" (case-insensitive) and its type is
     * application/rss+xml or application/atom+xml (case-insensitive).
     * The first qualifying link in document order wins.
     *
     * @param string $url source URL, used to resolve relative links
     * @param string $body HTML body text
     * @return mixed string with URL or false if no target found
     * @throws FeedSubBadHTMLException if the document is empty or unparseable
     */
    public function discoverFromHTML($url, $body)
    {
        // DOMDocument::loadHTML() rejects empty input (a warning on older
        // PHP, a ValueError on PHP 8), so report it as bad HTML up front.
        if ((string)$body === '') {
            throw new FeedSubBadHTMLException();
        }

        // loadHTML() reports unrecognized elements as warnings; collect them
        // inside libxml instead of changing the global error_reporting mask.
        $previous = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $ok = $dom->loadHTML($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if (!$ok) {
            throw new FeedSubBadHTMLException();
        }

        // Autodiscovery links may be relative to the page's URL or <base href>.
        // Only the first <base> carrying an href counts; later ones are ignored.
        $base = $url;
        foreach ($dom->getElementsByTagName('base') as $node) {
            if (!$node->hasAttribute('href')) {
                continue;
            }
            $baseHref = trim($node->getAttribute('href'));
            if ($baseHref !== '') {
                $base = $this->resolveURI($baseHref, $url);
            }
            break;
        }

        // Ok... now on to the links!
        // @fixme merge with the munger link checks
        $feedTypes = array(
            'application/rss+xml',
            'application/atom+xml',
        );
        foreach ($dom->getElementsByTagName('link') as $node) {
            if (!$node->hasAttribute('rel')
                || !$node->hasAttribute('type')
                || !$node->hasAttribute('href')) {
                continue;
            }

            $href = trim($node->getAttribute('href'));
            if ($href === '') {
                // An empty href would just resolve back to the page itself.
                continue;
            }

            // rel is a whitespace-separated, case-insensitive token list,
            // e.g. rel="alternate feed".
            $relTokens = preg_split(
                '/\s+/',
                strtolower(trim($node->getAttribute('rel'))),
                -1,
                PREG_SPLIT_NO_EMPTY
            );
            $type = strtolower(trim($node->getAttribute('type')));

            if (in_array('alternate', $relTokens, true) && in_array($type, $feedTypes, true)) {
                return $this->resolveURI($href, $base);
            }
        }

        return false;
    }

    /**
     * Resolve a possibly relative URL against some absolute base URL
     *
     * @param string $rel relative or absolute URL
     * @param string $base absolute URL
     * @return string absolute URL, or original URL if could not be resolved.
     */
    public function resolveURI($rel, $base)
    {
        require_once "Net/URL2.php";
        try {
            $relUrl = new Net_URL2($rel);
            if ($relUrl->isAbsolute()) {
                return $rel;
            }
            $baseUrl = new Net_URL2($base);
            $absUrl = $baseUrl->resolve($relUrl);
            return $absUrl->getURL();
        } catch (Exception $e) {
            // Best effort: log and hand back the input unchanged.
            common_log(LOG_WARNING, 'Unable to resolve relative link "' .
                $rel . '" against base "' . $base . '": ' . $e->getMessage());
            return $rel;
        }
    }
}
 |