gnu-social/plugins/OStatus/lib/feeddiscovery.php
Evan Prodromou f21f78364a Change the workflow to get better discovery
Tried to re-structure the workflow of discovery to get more and richer
data and hints.
2010-03-16 11:25:18 -05:00

260 lines
7.8 KiB
PHP

<?php
/*
* StatusNet - the distributed open-source microblogging tool
* Copyright (C) 2009, StatusNet, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @package FeedSubPlugin
* @maintainer Brion Vibber <brion@status.net>
*/
// Stop with exit status 1 unless one of the application's bootstrap constants
// (STATUSNET or LACONICA) is already defined, so this file does nothing when
// it is executed on its own.
if (!defined('STATUSNET') && !defined('LACONICA')) { exit(1); }
/**
 * Thrown by FeedDiscovery::discoverFromURL() when the HTTP request for the
 * given URL fails with an HTTP_Request2_Exception.
 */
class FeedSubBadURLException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::initFromResponse() when the HTTP response is not
 * reported as OK; constructed with the response status.
 */
class FeedSubBadResponseException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::initFromResponse() when the response body is
 * empty; constructed with the URL of the response.
 */
class FeedSubEmptyException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::discoverFromHTML() when the page body cannot be
 * loaded as an HTML document.
 */
class FeedSubBadHTMLException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::initFromResponse() when the response Content-Type
 * is not one of the accepted XML/RSS/Atom types; constructed with that type.
 */
class FeedSubUnrecognizedTypeException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::discoverFromURL() when an HTML page contains no
 * feed autodiscovery link; constructed with the URL that was requested.
 */
class FeedSubNoFeedException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::init() when the feed body cannot be parsed as XML.
 */
class FeedSubBadXmlException extends FeedSubException
{
}
/**
 * Not thrown anywhere in this file.
 * NOTE(review): presumably raised by callers when a discovered feed declares
 * no hub link -- confirm against the code that uses it.
 */
class FeedSubNoHubException extends FeedSubException
{
}
/**
 * Given a web page or feed URL, discover the final location of the feed
 * and return its current contents.
 *
 * After a successful discovery the public properties hold the result:
 *  - $uri:  URL reported by the HTTP response the feed was read from
 *  - $type: Content-Type header value of that response
 *  - $feed: the feed body parsed into a DOMDocument
 *
 * @example
 *   $feed = new FeedDiscovery();
 *   if ($feed->discoverFromURL($url)) {
 *       print $feed->uri;
 *       print $feed->type;
 *       processFeed($feed->feed); // DOMDocument
 *   }
 */
class FeedDiscovery
{
    /** @var string URL of the discovered feed; set by init() */
    public $uri;

    /** @var string Content-Type of the discovered feed; set by init() */
    public $type;

    /** @var DOMDocument parsed feed document; set by init() */
    public $feed;

    /**
     * Post-initialize query helper: look up a link on the loaded feed.
     *
     * Only valid after a successful discovery, since it reads $this->feed.
     *
     * @param string $rel  link relation to look for
     * @param string $type optional link type to filter on
     * @return mixed whatever ActivityUtils::getLink() returns
     */
    public function getLink($rel, $type=null)
    {
        // @fixme check for non-Atom links in RSS2 feeds as well
        return $this->getAtomLink($rel, $type);
    }

    /**
     * Look up a link on the root element of the loaded feed document.
     *
     * @param string $rel  link relation to look for
     * @param string $type optional link type to filter on
     * @return mixed whatever ActivityUtils::getLink() returns
     */
    public function getAtomLink($rel, $type=null)
    {
        return ActivityUtils::getLink($this->feed->documentElement, $rel, $type);
    }

    /**
     * Fetch a URL and initialize this object from the feed found there.
     *
     * If the response is an HTML page and $htmlOk is true, the page is
     * scanned for a feed autodiscovery link and that link is fetched
     * instead (at most one such hop: the second fetch passes $htmlOk=false).
     *
     * @param string $url
     * @param bool $htmlOk pass false here if you don't want to follow web pages.
     * @return string with validated URL
     * @throws FeedSubBadURLException
     * @throws FeedSubBadHtmlException
     * @throws FeedSubNoFeedException
     * @throws FeedSubBadResponseException
     * @throws FeedSubEmptyException
     * @throws FeedSubUnrecognizedTypeException
     * @throws FeedSubBadXmlException
     */
    public function discoverFromURL($url, $htmlOk=true)
    {
        try {
            $client = new HTTPClient();
            $response = $client->get($url);
        } catch (HTTP_Request2_Exception $e) {
            common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $e->getMessage());
            // NOTE(review): the caught exception object itself is passed as the
            // constructor argument; confirm FeedSubException expects that.
            throw new FeedSubBadURLException($e);
        }

        if ($htmlOk) {
            $type = $response->getHeader('Content-Type');
            $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', $type);
            if ($isHtml) {
                $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
                if (!$target) {
                    throw new FeedSubNoFeedException($url);
                }
                // Do not follow HTML a second time.
                return $this->discoverFromURL($target, false);
            }
        }

        return $this->initFromResponse($response);
    }

    /**
     * Like discoverFromURL(), but never follows HTML autodiscovery links.
     *
     * @param string $url
     * @return string with validated URL
     */
    public function discoverFromFeedURL($url)
    {
        return $this->discoverFromURL($url, false);
    }

    /**
     * Validate an HTTP response and initialize this object from its body.
     *
     * @param object $response HTTP response as returned by HTTPClient::get()
     * @return string URL the feed was read from
     * @throws FeedSubBadResponseException if the response is not OK
     * @throws FeedSubEmptyException if the body is empty
     * @throws FeedSubUnrecognizedTypeException if the Content-Type is not an
     *         accepted XML, RSS or Atom type
     * @throws FeedSubBadXmlException if the body is not parseable XML
     */
    public function initFromResponse($response)
    {
        if (!$response->isOk()) {
            throw new FeedSubBadResponseException($response->getStatus());
        }

        $sourceurl = $response->getUrl();
        $body = $response->getBody();
        if (!$body) {
            throw new FeedSubEmptyException($sourceurl);
        }

        $type = $response->getHeader('Content-Type');
        if (preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
            return $this->init($sourceurl, $type, $body);
        }

        common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
        throw new FeedSubUnrecognizedTypeException($type);
    }

    /**
     * Parse a feed body and store the result on this object.
     *
     * @param string $sourceurl URL the body was fetched from
     * @param string $type      Content-Type of the body
     * @param string $body      XML text of the feed
     * @return string $sourceurl, now also available as $this->uri
     * @throws FeedSubBadXmlException carrying $sourceurl if $body is not
     *         parseable XML
     */
    public function init($sourceurl, $type, $body)
    {
        $feed = new DOMDocument();
        if (!$feed->loadXML($body)) {
            // Report the URL that produced the unparseable body.
            throw new FeedSubBadXmlException($sourceurl);
        }

        $this->uri = $sourceurl;
        $this->type = $type;
        $this->feed = $feed;
        return $this->uri;
    }

    /**
     * Scan an HTML page for Atom/RSS autodiscovery links.
     *
     * @param string $url source URL, used to resolve relative links
     * @param string $body HTML body text
     * @return mixed string with URL or false if no target found
     * @throws FeedSubBadHtmlException if the body cannot be loaded as HTML
     */
    public function discoverFromHTML($url, $body)
    {
        // DOMDocument::loadHTML may throw warnings on unrecognized elements.
        $old = error_reporting(error_reporting() & ~E_WARNING);
        $dom = new DOMDocument();
        $ok = $dom->loadHTML($body);
        error_reporting($old);

        if (!$ok) {
            throw new FeedSubBadHtmlException();
        }

        // Autodiscovery links may be relative to the page's URL or <base href>.
        // If several <base> elements carry an href, the last one wins.
        $base = false;
        foreach ($dom->getElementsByTagName('base') as $node) {
            if ($node->hasAttribute('href')) {
                $base = trim($node->getAttribute('href'));
            }
        }
        if ($base) {
            $base = $this->resolveURI($base, $url);
        } else {
            $base = $url;
        }

        // Ok... now on to the links!
        // Types listed in order of priority -- we'll prefer Atom if available.
        // @fixme merge with the munger link checks
        $feeds = array(
            'application/atom+xml' => false,
            'application/rss+xml' => false,
        );

        foreach ($dom->getElementsByTagName('link') as $node) {
            $complete = $node->hasAttribute('rel')
                && $node->hasAttribute('type')
                && $node->hasAttribute('href');
            if (!$complete) {
                continue;
            }

            $rel = trim($node->getAttribute('rel'));
            $type = trim($node->getAttribute('type'));
            $href = trim($node->getAttribute('href'));

            if ($rel === 'alternate' && array_key_exists($type, $feeds) && empty($feeds[$type])) {
                // Save the first feed found of each type...
                $feeds[$type] = $this->resolveURI($href, $base);
            }
        }

        // Return the highest-priority feed found
        foreach ($feeds as $feedUrl) {
            if ($feedUrl) {
                return $feedUrl;
            }
        }

        return false;
    }

    /**
     * Resolve a possibly relative URL against some absolute base URL
     *
     * @param string $rel relative or absolute URL
     * @param string $base absolute URL
     * @return string absolute URL, or original URL if could not be resolved.
     */
    public function resolveURI($rel, $base)
    {
        require_once "Net/URL2.php";
        try {
            $relUrl = new Net_URL2($rel);
            if ($relUrl->isAbsolute()) {
                return $rel;
            }
            $baseUrl = new Net_URL2($base);
            $absUrl = $baseUrl->resolve($relUrl);
            return $absUrl->getURL();
        } catch (Exception $e) {
            // Best effort: log and hand back the input unchanged.
            common_log(LOG_WARNING, 'Unable to resolve relative link "' .
                $rel . '" against base "' . $base . '": ' . $e->getMessage());
            return $rel;
        }
    }
}