 * StatusNet - the distributed open-source microblogging tool
 * Copyright (C) 2009, StatusNet, Inc.
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.

 * @package FeedSubPlugin
 * @maintainer Brion Vibber <brion@status.net>

// Abort immediately unless the file is being loaded from within a
// StatusNet (or legacy Laconica) bootstrap that defined its marker constant.
if (!(defined('STATUSNET') || defined('LACONICA'))) {
    exit(1);
}

/**
 * Raised by FeedDiscovery::discoverFromURL() when the HTTP client fails
 * to fetch the requested URL (an HTTP_Request2_Exception was caught).
 */
class FeedSubBadURLException extends FeedSubException
{
}

/**
 * Raised by FeedDiscovery::initFromResponse() when the HTTP response
 * does not report success via isOk().
 */
class FeedSubBadResponseException extends FeedSubException
{
}

/**
 * Raised by FeedDiscovery::initFromResponse() when the fetched response
 * has an empty body.
 */
class FeedSubEmptyException extends FeedSubException
{
}

/**
 * Raised by FeedDiscovery::discoverFromHTML() when the page body cannot
 * be parsed as HTML.
 */
class FeedSubBadHTMLException extends FeedSubException
{
}

/**
 * Raised by FeedDiscovery::initFromResponse() when the response's
 * Content-Type is not one of the accepted XML/RSS/Atom types.
 */
class FeedSubUnrecognizedTypeException extends FeedSubException
{
}

/**
 * Raised by FeedDiscovery::discoverFromURL() when an HTML page was
 * fetched but no feed autodiscovery link could be found in it.
 */
class FeedSubNoFeedException extends FeedSubException
{
}

/**
 * Raised by FeedDiscovery::init() when the feed body cannot be parsed
 * as XML.
 */
class FeedSubBadXmlException extends FeedSubException
{
}

/**
 * Not thrown anywhere in this file.
 * NOTE(review): presumably raised by hub-subscription code elsewhere
 * when a feed advertises no hub -- confirm against the callers.
 */
class FeedSubNoHubException extends FeedSubException
{
}

/**
 * Given a web page or feed URL, discover the final location of the feed
 * and return its current contents.
 *
 * @example
 *   $feed = new FeedDiscovery();
 *   if ($feed->discoverFromURL($url)) {
 *     print $feed->uri;
 *     print $feed->type;
 *     processFeed($feed->feed); // DOMDocument
 *   }
 */
class FeedDiscovery
{
    /** @var string|null URL the feed was loaded from; populated by init(). */
    public $uri;

    /** @var string|null Content-Type reported for the feed; populated by init(). */
    public $type;

    /** @var DOMDocument|null Parsed feed document; populated by init(). */
    public $feed;

    /**
     * Post-initialize query helper: look up a link on the loaded feed.
     *
     * Currently this only delegates to getAtomLink().
     *
     * @param mixed $rel  relation to look for, passed through to ActivityUtils::getLink()
     * @param mixed $type optional type filter, passed through to ActivityUtils::getLink()
     * @return mixed result of ActivityUtils::getLink()
     */
    public function getLink($rel, $type=null)
    {
        // @fixme check for non-Atom links in RSS2 feeds as well
        return $this->getAtomLink($rel, $type);
    }

    /**
     * Look up a link on the root element of the loaded feed document.
     *
     * Must only be called after the feed has been loaded, since it
     * dereferences $this->feed.
     *
     * @param mixed $rel  relation to look for
     * @param mixed $type optional type filter
     * @return mixed result of ActivityUtils::getLink()
     */
    public function getAtomLink($rel, $type=null)
    {
        return ActivityUtils::getLink($this->feed->documentElement, $rel, $type);
    }

    /**
     * Fetch a URL and load the feed found there; when the URL turns out
     * to be an HTML page (and $htmlOk is true), follow its feed
     * autodiscovery link once and load that instead.
     *
     * @param string $url
     * @param bool $htmlOk pass false here if you don't want to follow web pages.
     * @return string with validated URL
     * @throws FeedSubBadURLException
     * @throws FeedSubBadHTMLException
     * @throws FeedSubNoFeedException
     * @throws FeedSubBadResponseException
     * @throws FeedSubEmptyException
     * @throws FeedSubUnrecognizedTypeException
     * @throws FeedSubBadXmlException
     */
    public function discoverFromURL($url, $htmlOk=true)
    {
        try {
            $client = new HTTPClient();
            $response = $client->get($url);
        } catch (HTTP_Request2_Exception $e) {
            common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $e->getMessage());
            throw new FeedSubBadURLException($e);
        }

        if ($htmlOk) {
            $type = $response->getHeader('Content-Type');
            $isHtml = preg_match('!^(text/html|application/xhtml\+xml)!i', $type);
            if ($isHtml) {
                $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
                if (!$target) {
                    throw new FeedSubNoFeedException($url);
                }
                // The discovered target must be a feed itself: do not
                // follow a second level of HTML pages.
                return $this->discoverFromURL($target, false);
            }
        }

        return $this->initFromResponse($response);
    }

    /**
     * Fetch a URL that is expected to be a feed; HTML pages are not followed.
     *
     * @param string $url
     * @return string with validated URL
     */
    public function discoverFromFeedURL($url)
    {
        return $this->discoverFromURL($url, false);
    }

    /**
     * Validate a fetched HTTP response and load it as the current feed.
     *
     * @param object $response response object returned by HTTPClient::get()
     * @return string URL the feed was loaded from
     * @throws FeedSubBadResponseException when the response is not OK
     * @throws FeedSubEmptyException when the body is empty
     * @throws FeedSubUnrecognizedTypeException when the Content-Type is not an accepted XML type
     * @throws FeedSubBadXmlException when the body is not parseable XML
     */
    public function initFromResponse($response)
    {
        if (!$response->isOk()) {
            // NOTE(review): if $response is an HTTP_Request2_Response
            // subclass, the status accessor is getStatus(); confirm that
            // getCode() exists on the class HTTPClient returns.
            throw new FeedSubBadResponseException($response->getCode());
        }

        $sourceurl = $response->getUrl();
        $body = $response->getBody();
        if (!$body) {
            throw new FeedSubEmptyException($sourceurl);
        }

        $type = $response->getHeader('Content-Type');
        if (preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
            return $this->init($sourceurl, $type, $body);
        } else {
            common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
            throw new FeedSubUnrecognizedTypeException($type);
        }
    }

    /**
     * Parse a feed body and store it, with its URL and type, on this object.
     *
     * @param string $sourceurl URL the body was fetched from
     * @param string $type Content-Type reported for the body
     * @param string $body raw XML text
     * @return string the stored feed URL
     * @throws FeedSubBadXmlException when the body is not parseable XML
     */
    public function init($sourceurl, $type, $body)
    {
        $feed = new DOMDocument();
        if ($feed->loadXML($body)) {
            $this->uri = $sourceurl;
            $this->type = $type;
            $this->feed = $feed;
            return $this->uri;
        } else {
            // Report the URL we were actually given; there is no $url
            // variable in this scope.
            throw new FeedSubBadXmlException($sourceurl);
        }
    }

    /**
     * Scan an HTML page for feed autodiscovery <link> elements.
     *
     * @param string $url source URL, used to resolve relative links
     * @param string $body HTML body text
     * @return mixed string with URL or false if no target found
     * @throws FeedSubBadHTMLException when the body cannot be parsed
     */
    public function discoverFromHTML($url, $body)
    {
        // DOMDocument::loadHTML may throw warnings on unrecognized elements,
        // so mask E_WARNING while parsing and put the previous level back
        // straight afterwards so the rest of the request is unaffected.
        $old = error_reporting(error_reporting() & ~E_WARNING);
        $dom = new DOMDocument();
        // An empty body can never parse; skip the call because PHP 8 throws
        // a ValueError for it instead of returning false.
        $ok = ((string)$body !== '') && $dom->loadHTML($body);
        error_reporting($old);

        if (!$ok) {
            throw new FeedSubBadHTMLException();
        }

        // Autodiscovery links may be relative to the page's URL or <base href>
        $base = false;
        foreach ($dom->getElementsByTagName('base') as $node) {
            if ($node->hasAttribute('href')) {
                $base = trim($node->getAttribute('href'));
            }
        }
        if ($base) {
            $base = $this->resolveURI($base, $url);
        } else {
            $base = $url;
        }

        // Ok... now on to the links!
        // Types listed in order of priority -- we'll prefer Atom if available.
        // @fixme merge with the munger link checks
        $feeds = array(
            'application/atom+xml' => false,
            'application/rss+xml' => false,
        );
        foreach ($dom->getElementsByTagName('link') as $node) {
            // All three attributes must be present for a usable feed link.
            if (!$node->hasAttribute('rel')
                || !$node->hasAttribute('type')
                || !$node->hasAttribute('href')) {
                continue;
            }

            $rel = trim($node->getAttribute('rel'));
            $type = trim($node->getAttribute('type'));
            $href = trim($node->getAttribute('href'));

            if ($rel === 'alternate' && array_key_exists($type, $feeds) && empty($feeds[$type])) {
                // Save the first feed found of each type...
                $feeds[$type] = $this->resolveURI($href, $base);
            }
        }

        // Return the highest-priority feed found
        foreach ($feeds as $type => $url) {
            if ($url) {
                return $url;
            }
        }

        return false;
    }

    /**
     * Resolve a possibly relative URL against some absolute base URL
     *
     * @param string $rel relative or absolute URL
     * @param string $base absolute URL
     * @return string absolute URL, or original URL if could not be resolved.
     */
    public function resolveURI($rel, $base)
    {
        require_once "Net/URL2.php";
        try {
            $relUrl = new Net_URL2($rel);
            if ($relUrl->isAbsolute()) {
                return $rel;
            }
            $baseUrl = new Net_URL2($base);
            $absUrl = $baseUrl->resolve($relUrl);
            return $absUrl->getURL();
        } catch (Exception $e) {
            // Best effort: log the failure and hand back the input unchanged.
            common_log(LOG_WARNING, 'Unable to resolve relative link "' .
                $rel . '" against base "' . $base . '": ' . $e->getMessage());
            return $rel;
        }
    }
}