Test cases and fixes for Atom and RSS content decoding.

Fix extraction of Atom <content type="text"> and <content type="html">; we were failing to escape plaintext source data to HTML, and doing an extraneous double de-escape on HTML source, resulting in breakage of notices containing text that looks like HTML. Only <content type="xhtml"> was working correctly previously.
Fixes for RSS2 content processing: we were failing to load <content:encoded> at all due to using the wrong element name, and were applying an extraneous de-escape to <description> rather than the escaping that is required to turn plaintext into HTML. (Per spec, <description> must be plaintext.)
This commit is contained in:
Brion Vibber 2010-04-23 15:40:48 -07:00
parent 9c8052e755
commit 8fd0059bf6
3 changed files with 98 additions and 5 deletions

View File

@ -83,6 +83,7 @@ class Activity
const CREATOR = 'creator';
const CONTENTNS = 'http://purl.org/rss/1.0/modules/content/';
const ENCODED = 'encoded';
public $actor; // an ActivityObject
public $verb; // a string (the URL)
@ -268,14 +269,21 @@ class Activity
$this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, self::RSS);
$contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, self::CONTENTNS);
$contentEl = ActivityUtils::child($item, self::ENCODED, self::CONTENTNS);
if (!empty($contentEl)) {
$this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES);
// <content:encoded> XML node's text content is HTML; no further processing needed.
$this->content = $contentEl->textContent;
} else {
$descriptionEl = ActivityUtils::child($item, self::DESCRIPTION, self::RSS);
if (!empty($descriptionEl)) {
$this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES);
// Per spec, <description> must be plaintext.
// In practice, often there's HTML... but these days good
// feeds are using <content:encoded> which is explicitly
// real HTML.
// We'll treat this following spec, and do HTML escaping
// to convert from plaintext to HTML.
$this->content = htmlspecialchars($descriptionEl->textContent);
}
}

View File

@ -213,11 +213,19 @@ class ActivityUtils
// slavishly following http://atompub.org/rfc4287.html#rfc.section.4.1.3.3
if (empty($type) || $type == 'text') {
return $el->textContent;
// We have plaintext saved as the XML text content.
// Since we want HTML, we need to escape any special chars.
return htmlspecialchars($el->textContent);
} else if ($type == 'html') {
// We have HTML saved as the XML text content.
// No additional processing required once we've got it.
$text = $el->textContent;
return htmlspecialchars_decode($text, ENT_QUOTES);
return $text;
} else if ($type == 'xhtml') {
// Per spec, the <content type="xhtml"> contains a single
// HTML <div> with XHTML namespace on it as a child node.
// We need to pull all of that <div>'s child nodes and
// serialize them back to an (X)HTML source fragment.
$divEl = ActivityUtils::child($el, 'div', 'http://www.w3.org/1999/xhtml');
if (empty($divEl)) {
return null;

View File

@ -32,6 +32,18 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase
$this->assertEquals('tag:versioncentral.example.org,2009:/change/1643245', $act->objects[0]->id);
}
/**
 * Parse the $_example2 fixture and verify that the activity's content
 * comes out as the expected HTML fragment.
 *
 * The fixture is a global heredoc string defined elsewhere in this file.
 */
public function testExample2()
{
    global $_example2;

    // DOMDocument::loadXML() is an instance method. Calling it statically
    // is deprecated and throws an Error on PHP 8, so create the document
    // first and fail early with a clear message if the fixture is broken.
    $dom = new DOMDocument();
    $this->assertTrue($dom->loadXML($_example2), 'Fixture $_example2 is not well-formed XML');

    $act = new Activity($dom->documentElement);

    $this->assertFalse(empty($act));

    // Did we handle <content type="html"> correctly with a typical payload?
    $this->assertEquals("<p>Geraldine posted a Photo on PhotoPanic</p>\n " .
                        "<img src=\"/geraldine/photo1.jpg\">", trim($act->content));
}
public function testExample3()
{
global $_example3;
@ -305,6 +317,71 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase
}
/**
 * Table-driven check of Atom <content> decoding.
 *
 * Each case is a pair of (<content> element source, expected value of
 * $act->content after trimming). The cases cover the three content
 * flavours exercised here: no type attribute, type='html' and
 * type='xhtml', each with plain, escaped and double-escaped markup.
 */
public function testAtomContent()
{
    $tests = array(
        // No type attribute: the text is expected back HTML-escaped.
        array("<content>Some regular plain text.</content>",
              "Some regular plain text."),
        array("<content>&lt;b&gt;this is not HTML&lt;/b&gt;</content>",
              "&lt;b&gt;this is not HTML&lt;/b&gt;"),

        // type='html': the XML text content is expected back unchanged.
        array("<content type='html'>Some regular plain HTML.</content>",
              "Some regular plain HTML."),
        array("<content type='html'>&lt;b&gt;this is too HTML&lt;/b&gt;</content>",
              "<b>this is too HTML</b>"),
        array("<content type='html'>&amp;lt;b&amp;gt;but this is not HTML!&amp;lt;/b&amp;gt;</content>",
              "&lt;b&gt;but this is not HTML!&lt;/b&gt;"),

        // type='xhtml': the children of the wrapping XHTML <div> are
        // expected back serialized as markup.
        array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>Some regular plain XHTML.</div></content>",
              "Some regular plain XHTML."),
        array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'><b>This is some XHTML!</b></div></content>",
              "<b>This is some XHTML!</b>"),
        array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>&lt;b&gt;This is not some XHTML!&lt;/b&gt;</div></content>",
              "&lt;b&gt;This is not some XHTML!&lt;/b&gt;"),
        array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>&amp;lt;b&amp;gt;This is not some XHTML either!&amp;lt;/b&amp;gt;</div></content>",
              "&amp;lt;b&amp;gt;This is not some XHTML either!&amp;lt;/b&amp;gt;"),
    );

    foreach ($tests as $data) {
        list($source, $output) = $data;

        // Wrap the <content> element in a minimal Atom entry.
        $xml = "<entry xmlns='http://www.w3.org/2005/Atom'>" .
               "<id>http://example.com/fakeid</id>" .
               "<author><name>Test</name></author>" .
               "<title>Atom content tests</title>" .
               $source .
               "</entry>";

        // DOMDocument::loadXML() is an instance method. Calling it
        // statically is deprecated and throws an Error on PHP 8.
        $dom = new DOMDocument();
        $this->assertTrue($dom->loadXML($xml), 'Test XML is not well-formed: ' . $source);

        $act = new Activity($dom->documentElement);

        $this->assertFalse(empty($act));
        // Include the source in the failure message so the failing case
        // can be identified without a debugger.
        $this->assertEquals($output, trim($act->content), 'Unexpected content for source: ' . $source);
    }
}
/**
 * Table-driven check of RSS item content decoding.
 *
 * Each case is a pair of (content element source, expected value of
 * $act->content after trimming). <content:encoded> cases expect the
 * XML text content back as-is; <description> cases expect it back
 * HTML-escaped.
 */
public function testRssContent()
{
    $tests = array(
        // <content:encoded>: expected back unchanged.
        array("<content:encoded>Some regular plain HTML.</content:encoded>",
              "Some regular plain HTML."),
        array("<content:encoded>Some &lt;b&gt;exciting bold HTML&lt;/b&gt;</content:encoded>",
              "Some <b>exciting bold HTML</b>"),
        array("<content:encoded>Some &amp;lt;b&amp;gt;escaped non-HTML.&amp;lt;/b&amp;gt;</content:encoded>",
              "Some &lt;b&gt;escaped non-HTML.&lt;/b&gt;"),

        // <description>: expected back HTML-escaped.
        array("<description>Some plain text.</description>",
              "Some plain text."),
        array("<description>Some &lt;b&gt;non-HTML text&lt;/b&gt;</description>",
              "Some &lt;b&gt;non-HTML text&lt;/b&gt;"),
        array("<description>Some &amp;lt;b&amp;gt;double-escaped text&amp;lt;/b&amp;gt;</description>",
              "Some &amp;lt;b&amp;gt;double-escaped text&amp;lt;/b&amp;gt;"),
    );

    foreach ($tests as $data) {
        list($source, $output) = $data;

        // Wrap the element in a minimal RSS <item> that declares the
        // content module namespace used by <content:encoded>.
        $xml = "<item xmlns:content='http://purl.org/rss/1.0/modules/content/'>" .
               "<guid>http://example.com/fakeid</guid>" .
               "<title>RSS content tests</title>" .
               $source .
               "</item>";

        // DOMDocument::loadXML() is an instance method. Calling it
        // statically is deprecated and throws an Error on PHP 8.
        $dom = new DOMDocument();
        $this->assertTrue($dom->loadXML($xml), 'Test XML is not well-formed: ' . $source);

        $act = new Activity($dom->documentElement);

        $this->assertFalse(empty($act));
        // Include the source in the failure message so the failing case
        // can be identified without a debugger.
        $this->assertEquals($output, trim($act->content), 'Unexpected content for source: ' . $source);
    }
}
}
$_example1 = <<<EXAMPLE1