Test cases and fixes for Atom and RSS content decoding.

Fix extraction of Atom <content type="text"> and <content type="html">: we were failing to escape plaintext source data to HTML, and were doing an extraneous double de-escape on HTML source, which broke notices containing text that looks like HTML. Previously, only <content type="xhtml"> was working correctly. Fixes for RSS2 content processing: we were failing to load <content:encoded> at all because we used the wrong element name, and we were applying an extraneous de-escape to <description> rather than the escaping that is required to turn plaintext into HTML. (Per spec, <description> must be plaintext.)
2010-04-23 15:40:48 -07:00 · 2010-04-23 15:40:48 -07:00 · 8fd0059bf6
commit 8fd0059bf6
parent 9c8052e755
3 changed files with 98 additions and 5 deletions
--- a/lib/activity.php
+++ b/lib/activity.php
@ -83,6 +83,7 @@ class Activity
    const CREATOR = 'creator';

    const CONTENTNS = 'http://purl.org/rss/1.0/modules/content/';
+    const ENCODED = 'encoded';

    public $actor;   // an ActivityObject
    public $verb;    // a string (the URL)
@ -268,14 +269,21 @@ class Activity

        $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, self::RSS);

-        $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, self::CONTENTNS);
+        $contentEl = ActivityUtils::child($item, self::ENCODED, self::CONTENTNS);

        if (!empty($contentEl)) {
-            $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES);
+            // <content:encoded> XML node's text content is HTML; no further processing needed.
+            $this->content = $contentEl->textContent;
        } else {
            $descriptionEl = ActivityUtils::child($item, self::DESCRIPTION, self::RSS);
            if (!empty($descriptionEl)) {
-                $this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES);
+                // Per spec, <description> must be plaintext.
+                // In practice, often there's HTML... but these days good
+                // feeds are using <content:encoded> which is explicitly
+                // real HTML.
+                // We'll treat this following spec, and do HTML escaping
+                // to convert from plaintext to HTML.
+                $this->content = htmlspecialchars($descriptionEl->textContent);
            }
        }

--- a/lib/activityutils.php
+++ b/lib/activityutils.php
@ -213,11 +213,19 @@ class ActivityUtils
        // slavishly following http://atompub.org/rfc4287.html#rfc.section.4.1.3.3

        if (empty($type) || $type == 'text') {
-            return $el->textContent;
+            // We have plaintext saved as the XML text content.
+            // Since we want HTML, we need to escape any special chars.
+            return htmlspecialchars($el->textContent);
        } else if ($type == 'html') {
+            // We have HTML saved as the XML text content.
+            // No additional processing required once we've got it.
            $text = $el->textContent;
-            return htmlspecialchars_decode($text, ENT_QUOTES);
+            return $text;
        } else if ($type == 'xhtml') {
+            // Per spec, the <content type="xhtml"> contains a single
+            // HTML <div> with XHTML namespace on it as a child node.
+            // We need to pull all of that <div>'s child nodes and
+            // serialize them back to an (X)HTML source fragment.
            $divEl = ActivityUtils::child($el, 'div', 'http://www.w3.org/1999/xhtml');
            if (empty($divEl)) {
                return null;
--- a/tests/ActivityParseTests.php
+++ b/tests/ActivityParseTests.php
@ -32,6 +32,18 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase
        $this->assertEquals('tag:versioncentral.example.org,2009:/change/1643245', $act->objects[0]->id);
    }

+    /**
+     * Parse the $_example2 fixture and check that the content of its
+     * <content type="html"> element is extracted as HTML source.
+     */
+    public function testExample2()
+    {
+        global $_example2;
+        // DOMDocument::loadXML() is an instance method; calling it
+        // statically is deprecated and is a fatal error on PHP 8.
+        $dom = new DOMDocument();
+        // Fail here with a clear message if the fixture is not well-formed.
+        $this->assertTrue($dom->loadXML($_example2));
+        $act = new Activity($dom->documentElement);
+
+        $this->assertFalse(empty($act));
+        // Did we handle <content type="html"> correctly with a typical payload?
+        $this->assertEquals("<p>Geraldine posted a Photo on PhotoPanic</p>\n     " .
+                            "<img src=\"/geraldine/photo1.jpg\">", trim($act->content));
+    }
+
    public function testExample3()
    {
        global $_example3;
@ -305,6 +317,71 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase

    }

+    /**
+     * Check Atom <content> decoding for the text (default), html and
+     * xhtml types.
+     *
+     * Each table row is array(<content> source fragment, expected HTML).
+     * Every fragment is wrapped in a minimal Atom <entry>, parsed into an
+     * Activity, and the trimmed $act->content is compared to the
+     * expected value.
+     */
+    public function testAtomContent()
+    {
+        $tests = array(array("<content>Some regular plain text.</content>",
+                             "Some regular plain text."),
+                       array("<content>&lt;b&gt;this is not HTML&lt;/b&gt;</content>",
+                             "&lt;b&gt;this is not HTML&lt;/b&gt;"),
+                       array("<content type='html'>Some regular plain HTML.</content>",
+                             "Some regular plain HTML."),
+                       array("<content type='html'>&lt;b&gt;this is too HTML&lt;/b&gt;</content>",
+                             "<b>this is too HTML</b>"),
+                       array("<content type='html'>&amp;lt;b&amp;gt;but this is not HTML!&amp;lt;/b&amp;gt;</content>",
+                             "&lt;b&gt;but this is not HTML!&lt;/b&gt;"),
+                       array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>Some regular plain XHTML.</div></content>",
+                             "Some regular plain XHTML."),
+                       array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'><b>This is some XHTML!</b></div></content>",
+                             "<b>This is some XHTML!</b>"),
+                       array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>&lt;b&gt;This is not some XHTML!&lt;/b&gt;</div></content>",
+                             "&lt;b&gt;This is not some XHTML!&lt;/b&gt;"),
+                       array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>&amp;lt;b&amp;gt;This is not some XHTML either!&amp;lt;/b&amp;gt;</div></content>",
+                             "&amp;lt;b&amp;gt;This is not some XHTML either!&amp;lt;/b&amp;gt;"));
+        foreach ($tests as $data) {
+            list($source, $output) = $data;
+            $xml = "<entry xmlns='http://www.w3.org/2005/Atom'>" .
+                   "<id>http://example.com/fakeid</id>" .
+                   "<author><name>Test</name></author>" .
+                   "<title>Atom content tests</title>" .
+                   $source .
+                   "</entry>";
+            // DOMDocument::loadXML() is an instance method; calling it
+            // statically is deprecated and is a fatal error on PHP 8.
+            $dom = new DOMDocument();
+            // Fail here with a clear message if the test XML is malformed.
+            $this->assertTrue($dom->loadXML($xml));
+            $act = new Activity($dom->documentElement);
+
+            $this->assertFalse(empty($act));
+            $this->assertEquals($output, trim($act->content));
+        }
+    }
+
+    /**
+     * Check RSS item content decoding for <content:encoded> and
+     * <description>.
+     *
+     * Each table row is array(item child source fragment, expected HTML).
+     * Every fragment is wrapped in a minimal <item> that declares the
+     * content module namespace, parsed into an Activity, and the trimmed
+     * $act->content is compared to the expected value.
+     */
+    public function testRssContent()
+    {
+        $tests = array(array("<content:encoded>Some regular plain HTML.</content:encoded>",
+                             "Some regular plain HTML."),
+                       array("<content:encoded>Some &lt;b&gt;exciting bold HTML&lt;/b&gt;</content:encoded>",
+                             "Some <b>exciting bold HTML</b>"),
+                       array("<content:encoded>Some &amp;lt;b&amp;gt;escaped non-HTML.&amp;lt;/b&amp;gt;</content:encoded>",
+                             "Some &lt;b&gt;escaped non-HTML.&lt;/b&gt;"),
+                       array("<description>Some plain text.</description>",
+                             "Some plain text."),
+                       array("<description>Some &lt;b&gt;non-HTML text&lt;/b&gt;</description>",
+                             "Some &lt;b&gt;non-HTML text&lt;/b&gt;"),
+                       array("<description>Some &amp;lt;b&amp;gt;double-escaped text&amp;lt;/b&amp;gt;</description>",
+                             "Some &amp;lt;b&amp;gt;double-escaped text&amp;lt;/b&amp;gt;"));
+        foreach ($tests as $data) {
+            list($source, $output) = $data;
+            $xml = "<item xmlns:content='http://purl.org/rss/1.0/modules/content/'>" .
+                   "<guid>http://example.com/fakeid</guid>" .
+                   "<title>RSS content tests</title>" .
+                   $source .
+                   "</item>";
+            // DOMDocument::loadXML() is an instance method; calling it
+            // statically is deprecated and is a fatal error on PHP 8.
+            $dom = new DOMDocument();
+            // Fail here with a clear message if the test XML is malformed.
+            $this->assertTrue($dom->loadXML($xml));
+            $act = new Activity($dom->documentElement);
+
+            $this->assertFalse(empty($act));
+            $this->assertEquals($output, trim($act->content));
+        }
+    }
+
 }

 $_example1 = <<<EXAMPLE1