| 
									
										
										
										
											2009-11-20 09:42:19 -08:00
										 |  |  | <?php | 
					
						
							|  |  |  | /* | 
					
						
							|  |  |  |  * StatusNet - the distributed open-source microblogging tool | 
					
						
							|  |  |  |  * Copyright (C) 2009, StatusNet, Inc. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * This program is free software: you can redistribute it and/or modify | 
					
						
							|  |  |  |  * it under the terms of the GNU Affero General Public License as published by | 
					
						
							|  |  |  |  * the Free Software Foundation, either version 3 of the License, or | 
					
						
							|  |  |  |  * (at your option) any later version. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * This program is distributed in the hope that it will be useful, | 
					
						
							|  |  |  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
					
						
							|  |  |  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
					
						
							|  |  |  |  * GNU Affero General Public License for more details. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * You should have received a copy of the GNU Affero General Public License | 
					
						
							|  |  |  |  * along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /** | 
					
						
							|  |  |  |  * @package FeedSubPlugin | 
					
						
							|  |  |  |  * @maintainer Brion Vibber <brion@status.net> | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-10-08 10:42:59 -07:00
										 // Refuse to run when loaded outside the application: the STATUSNET
// constant must already be defined by whatever includes this file.
defined('STATUSNET') or exit(1);
					
						
							| 
									
										
										
										
											2009-11-20 09:42:19 -08:00
										 |  |  | 
 | 
					
						
							/**
 * Raised by FeedDiscovery::discoverFromURL() when the HTTP request for
 * the given URL fails with an HTTP_Request2_Exception; the message is
 * taken from that underlying exception.
 */
class FeedSubBadURLException extends FeedSubException
{
}
					
						
							|  |  |  | 
 | 
					
						
							/**
 * Raised by FeedDiscovery::initFromResponse() when the HTTP response is
 * not OK; the message carries the response status.
 */
class FeedSubBadResponseException extends FeedSubException
{
}
					
						
							|  |  |  | 
 | 
					
						
							/**
 * Raised by FeedDiscovery::initFromResponse() when the response has an
 * empty body; the message carries the source URL.
 */
class FeedSubEmptyException extends FeedSubException
{
}
					
						
							|  |  |  | 
 | 
					
						
							/**
 * Raised by FeedDiscovery::discoverFromHTML() when the page body cannot
 * be loaded as an HTML document.
 */
class FeedSubBadHTMLException extends FeedSubException
{
}
					
						
							|  |  |  | 
 | 
					
						
							/**
 * Raised by FeedDiscovery::initFromResponse() when the response's
 * Content-Type is not one of the accepted XML/RSS/Atom types; the message
 * carries the offending type.
 */
class FeedSubUnrecognizedTypeException extends FeedSubException
{
}
					
						
							|  |  |  | 
 | 
					
						
							/**
 * Raised by FeedDiscovery::discoverFromURL() when an HTML page was
 * fetched but no feed autodiscovery link could be found in it; the
 * message carries the requested URL.
 */
class FeedSubNoFeedException extends FeedSubException
{
}
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-02-18 21:22:21 +00:00
										 /**
 * Raised by FeedDiscovery::init() when the feed body is not parseable
 * XML, or when its root is neither an RSS document with a <channel> nor
 * an Atom <feed>; the message carries the source URL.
 */
class FeedSubBadXmlException extends FeedSubException
{
}
					
						
							|  |  |  | 
 | 
					
						
							/**
 * Feed-subscription error type that is not thrown anywhere in this file.
 *
 * NOTE(review): presumably signals that a feed advertises no PuSH hub
 * link (see FeedDiscovery::getHubLink()) -- confirm against callers.
 */
class FeedSubNoHubException extends FeedSubException
{
}
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-02-08 11:06:03 -08:00
										 /**
 * Given a web page or feed URL, discover the final location of the feed
 * and return its current contents.
 *
 * After a successful discovery the public properties describe the feed:
 *   - $uri:  source URL of the feed, as reported by the HTTP response
 *   - $type: Content-Type value the feed was served with
 *   - $feed: the parsed feed (DOMDocument)
 *   - $root: the RSS <channel> or Atom <feed> element inside $feed
 *
 * @example
 *   $feed = new FeedDiscovery();
 *   if ($feed->discoverFromURL($url)) {
 *     print $feed->uri;
 *     print $feed->type;
 *     processFeed($feed->feed); // DOMDocument
 *   }
 */
class FeedDiscovery
{
    public $uri;
    public $type;
    public $feed;
    public $root;

    /**
     * Post-initialize query helper: look up a link on the discovered feed.
     *
     * @param string $rel  link relation to look for
     * @param string $type optional link type, passed through unchanged
     * @return mixed result of ActivityUtils::getLink()
     */
    public function getLink($rel, $type=null)
    {
        // @fixme check for non-Atom links in RSS2 feeds as well
        // Instance call (not self::) so subclass overrides are honoured.
        return $this->getAtomLink($rel, $type);
    }

    /**
     * Look up a link on the feed's root element via ActivityUtils::getLink().
     *
     * @param string $rel  link relation to look for
     * @param string $type optional link type, passed through unchanged
     * @return mixed result of ActivityUtils::getLink()
     */
    public function getAtomLink($rel, $type=null)
    {
        return ActivityUtils::getLink($this->root, $rel, $type);
    }

    /**
     * Get the referenced PuSH hub link from an Atom feed.
     *
     * @return mixed string or false
     */
    public function getHubLink()
    {
        return $this->getAtomLink('hub');
    }

    /**
     * Fetch a URL and initialize this object from the feed found there.
     *
     * If the response is an HTML page (and $htmlOk is true), the page is
     * scanned for a feed autodiscovery link and that link is fetched
     * instead; the second fetch never follows HTML again.
     *
     * NOTE(review): the HTML branch does not check $response->isOk(), so an
     * HTML error page is scanned for feed links too -- confirm intended.
     *
     * @param string $url
     * @param bool $htmlOk pass false here if you don't want to follow web pages.
     * @return string with validated URL
     * @throws FeedSubBadURLException
     * @throws FeedSubBadHTMLException
     * @throws FeedSubNoFeedException
     * @throws FeedSubBadResponseException
     * @throws FeedSubEmptyException
     * @throws FeedSubUnrecognizedTypeException
     * @throws FeedSubBadXmlException
     */
    public function discoverFromURL($url, $htmlOk=true)
    {
        try {
            $client = new HTTPClient();
            $response = $client->get($url);
        } catch (HTTP_Request2_Exception $e) {
            common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $e->getMessage());
            throw new FeedSubBadURLException($e->getMessage());
        }

        if ($htmlOk) {
            // A missing header must not reach preg_match() as null.
            $type = (string) $response->getHeader('Content-Type');
            $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', $type);
            if ($isHtml) {
                $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
                if (!$target) {
                    throw new FeedSubNoFeedException($url);
                }
                return $this->discoverFromURL($target, false);
            }
        }

        return $this->initFromResponse($response);
    }

    /**
     * Like discoverFromURL(), but never follows HTML pages.
     *
     * @param string $url
     * @return string with validated URL
     */
    public function discoverFromFeedURL($url)
    {
        return $this->discoverFromURL($url, false);
    }

    /**
     * Validate an HTTP response and initialize this object from its body.
     *
     * @param object $response HTTP response exposing isOk(), getStatus(),
     *                         getUrl(), getBody() and getHeader()
     * @return string source URL of the feed
     * @throws FeedSubBadResponseException if the response is not OK
     * @throws FeedSubEmptyException if the body is empty
     * @throws FeedSubUnrecognizedTypeException if Content-Type is not XML/RSS/Atom
     * @throws FeedSubBadXmlException
     */
    public function initFromResponse($response)
    {
        if (!$response->isOk()) {
            throw new FeedSubBadResponseException($response->getStatus());
        }

        $sourceurl = $response->getUrl();
        $body = $response->getBody();
        if (!$body) {
            throw new FeedSubEmptyException($sourceurl);
        }

        // A missing header must not reach preg_match() as null.
        $type = (string) $response->getHeader('Content-Type');
        if (!preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
            common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
            throw new FeedSubUnrecognizedTypeException($type);
        }

        return $this->init($sourceurl, $type, $body);
    }

    /**
     * Parse a feed body and, if it is a usable RSS or Atom document,
     * populate $uri, $type, $feed and $root.
     *
     * Properties are only assigned once the document has been fully
     * validated, so a failed call leaves the object untouched.
     *
     * @param string $sourceurl URL the body was fetched from
     * @param string $type      Content-Type the body was served with
     * @param string $body      XML text of the feed
     * @return string $sourceurl
     * @throws FeedSubBadXmlException if the body is not XML, or its root is
     *         neither <rss> containing a <channel> nor <feed>
     */
    public function init($sourceurl, $type, $body)
    {
        $feed = new DOMDocument();
        // loadXML() rejects an empty string outright on newer PHP versions,
        // so treat that case as bad XML ourselves.
        if (!is_string($body) || $body === '' || !$feed->loadXML($body)) {
            throw new FeedSubBadXmlException($sourceurl);
        }

        // Looking for the "root" element: RSS channel or Atom feed
        $el = $feed->documentElement;
        if ($el->tagName == 'rss') {
            $channels = $el->getElementsByTagName('channel');
            if ($channels->length == 0) {
                throw new FeedSubBadXmlException($sourceurl);
            }
            $root = $channels->item(0);
        } else if ($el->tagName == 'feed') {
            $root = $el;
        } else {
            throw new FeedSubBadXmlException($sourceurl);
        }

        $this->uri = $sourceurl;
        $this->type = $type;
        $this->feed = $feed;
        $this->root = $root;

        return $this->uri;
    }

    /**
     * Scan an HTML page for a feed autodiscovery <link>.
     *
     * Only rel="alternate" links of type application/atom+xml or
     * application/rss+xml are considered; Atom is preferred over RSS, and
     * within one type the first link in document order wins.
     *
     * @param string $url source URL, used to resolve relative links
     * @param string $body HTML body text
     * @return mixed string with URL or false if no target found
     * @throws FeedSubBadHTMLException if the body is empty or cannot be loaded
     */
    public function discoverFromHTML($url, $body)
    {
        // DOMDocument::loadHTML() throws on an empty string in newer PHP
        // versions; report it as bad HTML like any other unloadable page.
        if (!is_string($body) || $body === '') {
            throw new FeedSubBadHTMLException($url);
        }

        // DOMDocument::loadHTML may throw warnings on unrecognized elements,
        // and notices on unrecognized namespaces.
        $old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE));
        $dom = new DOMDocument();
        $ok = $dom->loadHTML($body);
        error_reporting($old);

        if (!$ok) {
            throw new FeedSubBadHTMLException($url);
        }

        // Autodiscovery links may be relative to the page's URL or <base href>.
        // When several <base href> elements exist, only the first one counts.
        $base = $url;
        foreach ($dom->getElementsByTagName('base') as $node) {
            $baseHref = trim($node->getAttribute('href'));
            if ($baseHref !== '') {
                $base = $this->resolveURI($baseHref, $url);
                break;
            }
        }

        // Ok... now on to the links!
        // Types listed in order of priority -- we'll prefer Atom if available.
        // @fixme merge with the munger link checks
        $feeds = array(
            'application/atom+xml' => false,
            'application/rss+xml' => false,
        );

        foreach ($dom->getElementsByTagName('link') as $node) {
            if (!$node->hasAttribute('rel')
                || !$node->hasAttribute('type')
                || !$node->hasAttribute('href')) {
                continue;
            }

            // rel is a whitespace-separated, case-insensitive token list.
            $rels = preg_split('/\s+/', strtolower($node->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
            // Media types are case-insensitive and may carry parameters
            // (e.g. "; charset=utf-8"); compare the bare type only.
            $type = strtolower(trim(preg_replace('/;.*$/s', '', $node->getAttribute('type'))));
            $href = trim($node->getAttribute('href'));

            if (in_array('alternate', $rels, true) && array_key_exists($type, $feeds) && empty($feeds[$type])) {
                // Save the first feed found of each type...
                $feeds[$type] = $this->resolveURI($href, $base);
            }
        }

        // Return the highest-priority feed found
        foreach ($feeds as $found) {
            if ($found) {
                return $found;
            }
        }

        return false;
    }

    /**
     * Resolve a possibly relative URL against some absolute base URL
     * @param string $rel relative or absolute URL
     * @param string $base absolute URL
     * @return string absolute URL, or original URL if could not be resolved.
     */
    public function resolveURI($rel, $base)
    {
        require_once "Net/URL2.php";
        try {
            $relUrl = new Net_URL2($rel);
            if ($relUrl->isAbsolute()) {
                return $rel;
            }
            $baseUrl = new Net_URL2($base);
            $absUrl = $baseUrl->resolve($relUrl);
            return $absUrl->getURL();
        } catch (Exception $e) {
            // Best effort: log and hand back the input unchanged.
            common_log(LOG_WARNING, 'Unable to resolve relative link "' .
                $rel . '" against base "' . $base . '": ' . $e->getMessage());
            return $rel;
        }
    }
}