From 4a6c9e445149e42a4f81d5140296e7770c60bc6c Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Thu, 18 Mar 2010 17:55:21 -0700 Subject: [PATCH 1/3] Work around weird bug with HTML normalization via PHP DOM module; if source had xmlns and xml:lang I ended up with double output, breaking the subsequent parsing. Will have to track this down later and report upstream if not already resolved. --- plugins/OStatus/extlib/hkit/hkit.class.php | 2 +- plugins/OStatus/lib/discoveryhints.php | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/plugins/OStatus/extlib/hkit/hkit.class.php b/plugins/OStatus/extlib/hkit/hkit.class.php index c3a54cff65..fec6f4d8fd 100644 --- a/plugins/OStatus/extlib/hkit/hkit.class.php +++ b/plugins/OStatus/extlib/hkit/hkit.class.php @@ -472,4 +472,4 @@ } -?> \ No newline at end of file +?> diff --git a/plugins/OStatus/lib/discoveryhints.php b/plugins/OStatus/lib/discoveryhints.php index 4da2ec0f1e..0273b5a92c 100644 --- a/plugins/OStatus/lib/discoveryhints.php +++ b/plugins/OStatus/lib/discoveryhints.php @@ -174,6 +174,26 @@ class DiscoveryHints { error_reporting($old); if ($ok) { + // If the original had xmlns or xml:lang attributes on the + // <html>, we seem to end up with duplicates, which causes + // parse errors. Remove em! + // + // For some reason we have to iterate and remove them twice, + // *plus* they don't show up on hasAttribute() or removeAttribute(). + // This might be some weird bug in PHP or libxml2, uncertain if + // it affects other folks consistently. + $root = $dom->documentElement; + foreach ($root->attributes as $i => $x) { + if ($i == 'xmlns' || $i == 'xml:lang') { + $root->removeAttributeNode($x); + } + } + foreach ($root->attributes as $i => $x) { + if ($i == 'xmlns' || $i == 'xml:lang') { + $root->removeAttributeNode($x); + } + } + // hKit doesn't give us a chance to pass the source URL for // resolving relative links, such as the avatar photo on a // Google profile. 
We'll slip it into a <base> tag if there's @@ -192,7 +212,6 @@ class DiscoveryHints { $head = $heads->item(0); } else { $head = $dom->createElement('head'); - $root = $dom->documentRoot; if ($root->firstChild) { $root->insertBefore($head, $root->firstChild); } else { From 17c50f338ceb574780476f6b788f48e2d7d06017 Mon Sep 17 00:00:00 2001 From: Evan Prodromou Date: Thu, 18 Mar 2010 20:52:00 -0500 Subject: [PATCH 2/3] Remove hkit and do our own hcard parsing Parsing hcards for the data we need wasn't hard enough to justify using hkit. It was dependent on a number of external systems (something to run tidy), and only could handle XHTML. We now parse HTML with the PHP dom libraries used elsewhere, and scrape out our own hcards. Seems to work nicer and faster and most of all works with Google Buzz profile URLs. --- plugins/OStatus/extlib/hkit/hcard.profile.php | 105 ---- plugins/OStatus/extlib/hkit/hkit.class.php | 475 ------------------ plugins/OStatus/lib/discoveryhints.php | 202 +++++--- 3 files changed, 134 insertions(+), 648 deletions(-) delete mode 100644 plugins/OStatus/extlib/hkit/hcard.profile.php delete mode 100644 plugins/OStatus/extlib/hkit/hkit.class.php diff --git a/plugins/OStatus/extlib/hkit/hcard.profile.php b/plugins/OStatus/extlib/hkit/hcard.profile.php deleted file mode 100644 index 6ec0dc8906..0000000000 --- a/plugins/OStatus/extlib/hkit/hcard.profile.php +++ /dev/null @@ -1,105 +0,0 @@ -root_class = 'vcard'; - - $this->classes = array( - 'fn', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'), - 'n', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'), - 'adr', array('post-office-box', 'extended-address', 'street-address', 'postal-code', 'country-name', 'type', 'region', 'locality'), - 'label', 'bday', 'agent', 'nickname', 'photo', 'class', - 'email', array('type', 'value'), - 'category', 'key', 'logo', 'mailer', 'note', - 'org', array('organization-name', 
'organization-unit'), - 'tel', array('type', 'value'), - 'geo', array('latitude', 'longitude'), - 'tz', 'uid', 'url', 'rev', 'role', 'sort-string', 'sound', 'title' - ); - - // classes that must only appear once per card - $this->singles = array( - 'fn' - ); - - // classes that are required (not strictly enforced - give at least one!) - $this->required = array( - 'fn' - ); - - $this->att_map = array( - 'fn' => array('IMG|alt'), - 'url' => array('A|href', 'IMG|src', 'AREA|href'), - 'photo' => array('IMG|src'), - 'bday' => array('ABBR|title'), - 'logo' => array('IMG|src'), - 'email' => array('A|href'), - 'geo' => array('ABBR|title') - ); - - - $this->callbacks = array( - 'url' => array($this, 'resolvePath'), - 'photo' => array($this, 'resolvePath'), - 'logo' => array($this, 'resolvePath'), - 'email' => array($this, 'resolveEmail') - ); - - - - function hKit_hcard_post($a) - { - - foreach ($a as &$vcard){ - - hKit_implied_n_optimization($vcard); - hKit_implied_n_from_fn($vcard); - - } - - return $a; - - } - - - function hKit_implied_n_optimization(&$vcard) - { - if (array_key_exists('fn', $vcard) && !is_array($vcard['fn']) && - !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){ - - if (sizeof(explode(' ', $vcard['fn'])) == 2){ - $patterns = array(); - $patterns[] = array('/^(\S+),\s*(\S{1})$/', 2, 1); // Lastname, Initial - $patterns[] = array('/^(\S+)\s*(\S{1})\.*$/', 2, 1); // Lastname Initial(.) 
- $patterns[] = array('/^(\S+),\s*(\S+)$/', 2, 1); // Lastname, Firstname - $patterns[] = array('/^(\S+)\s*(\S+)$/', 1, 2); // Firstname Lastname - - foreach ($patterns as $pattern){ - if (preg_match($pattern[0], $vcard['fn'], $matches) === 1){ - $n = array(); - $n['given-name'] = $matches[$pattern[1]]; - $n['family-name'] = $matches[$pattern[2]]; - $vcard['n'] = $n; - - - break; - } - } - } - } - } - - - function hKit_implied_n_from_fn(&$vcard) - { - if (array_key_exists('fn', $vcard) && is_array($vcard['fn']) - && !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){ - - $vcard['n'] = $vcard['fn']; - } - - if (array_key_exists('fn', $vcard) && is_array($vcard['fn'])){ - $vcard['fn'] = $vcard['fn']['text']; - } - } - -?> \ No newline at end of file diff --git a/plugins/OStatus/extlib/hkit/hkit.class.php b/plugins/OStatus/extlib/hkit/hkit.class.php deleted file mode 100644 index c3a54cff65..0000000000 --- a/plugins/OStatus/extlib/hkit/hkit.class.php +++ /dev/null @@ -1,475 +0,0 @@ -' . implode(', ', $missing) . 
''); - - } - - - public function getByURL($profile='', $url='') - { - - if ($profile=='' || $url == '') return false; - - $this->loadProfile($profile); - - $source = $this->loadURL($url); - - if ($source){ - $tidy_xhtml = $this->tidyThis($source); - - $fragment = false; - - if (strrchr($url, '#')) - $fragment = array_pop(explode('#', $url)); - - $doc = $this->loadDoc($tidy_xhtml, $fragment); - $s = $this->processNodes($doc, $this->classes); - $s = $this->postProcess($profile, $s); - - return $s; - }else{ - return false; - } - } - - public function getByString($profile='', $input_xml='') - { - if ($profile=='' || $input_xml == '') return false; - - $this->loadProfile($profile); - - $doc = $this->loadDoc($input_xml); - $s = $this->processNodes($doc, $this->classes); - $s = $this->postProcess($profile, $s); - - return $s; - - } - - private function processNodes($items, $classes, $allow_includes=true){ - - $out = array(); - - foreach($items as $item){ - $data = array(); - - for ($i=0; $ixpath($xpath); - - if ($results){ - foreach ($results as $result){ - if (isset($classes[$i+1]) && is_array($classes[$i+1])){ - $nodes = $this->processNodes($results, $classes[$i+1]); - if (sizeof($nodes) > 0){ - $nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes); - $data[$classes[$i]] = $nodes; - }else{ - $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]); - } - - }else{ - if (isset($data[$classes[$i]])){ - if (is_array($data[$classes[$i]])){ - // is already an array - append - $data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]); - - }else{ - // make it an array - if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern - $data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]); - }else{ - $old_val = $data[$classes[$i]]; - $data[$classes[$i]] = array($old_val, $this->getNodeValue($result, $classes[$i])); - $old_val = false; - } - } - }else{ - // set as normal value - $data[$classes[$i]] = 
$this->getNodeValue($result, $classes[$i]); - - } - } - - // td@headers pattern - if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){ - $include_ids = explode(' ', $result['headers']); - $doc = $this->doc; - foreach ($include_ids as $id){ - $xpath = "//*[@id='$id']/.."; - $includes = $doc->xpath($xpath); - foreach ($includes as $include){ - $tmp = $this->processNodes($include, $this->classes); - if (is_array($tmp)) $data = array_merge($data, $tmp); - } - } - } - } - } - } - $result = false; - } - - // include-pattern - if ($allow_includes){ - $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]"; - $results = $item->xpath($xpath); - - if ($results){ - foreach ($results as $result){ - $tagName = strtoupper(dom_import_simplexml($result)->tagName); - if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href'])) - && preg_match('/\binclude\b/', $result['class'])){ - $att = ($tagName == "OBJECT" ? 'data' : 'href'); - $id = str_replace('#', '', $result[$att]); - $doc = $this->doc; - $xpath = "//*[@id='$id']"; - $includes = $doc->xpath($xpath); - foreach ($includes as $include){ - $include = simplexml_load_string(''.$include->asXML().''); // don't ask. 
- $tmp = $this->processNodes($include, $this->classes, false); - if (is_array($tmp)) $data = array_merge($data, $tmp); - } - } - } - } - } - $out[] = $data; - } - - if (sizeof($out) > 1){ - return $out; - }else if (isset($data)){ - return $data; - }else{ - return array(); - } - } - - - private function getNodeValue($node, $className) - { - - $tag_name = strtoupper(dom_import_simplexml($node)->tagName); - $s = false; - - // ignore DEL tags - if ($tag_name == 'DEL') return $s; - - // look up att map values - if (array_key_exists($className, $this->att_map)){ - - foreach ($this->att_map[$className] as $map){ - if (preg_match("/$tag_name\|/", $map)){ - $s = ''.$node[array_pop($foo = explode('|', $map))]; - } - } - } - - // if nothing and OBJ, try data. - if (!$s && $tag_name=='OBJECT' && $node['data']) $s = ''.$node['data']; - - // if nothing and IMG, try alt. - if (!$s && $tag_name=='IMG' && $node['alt']) $s = ''.$node['alt']; - - // if nothing and AREA, try alt. - if (!$s && $tag_name=='AREA' && $node['alt']) $s = ''.$node['alt']; - - //if nothing and not A, try title. - if (!$s && $tag_name!='A' && $node['title']) $s = ''.$node['title']; - - - // if nothing found, go with node text - $s = ($s ? 
$s : implode(array_filter($node->xpath('child::node()'), array(&$this, "filterBlankValues")), ' ')); - - // callbacks - if (array_key_exists($className, $this->callbacks)){ - $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1); - } - - // trim and remove line breaks - if ($tag_name != 'PRE'){ - $s = trim(preg_replace('/[\r\n\t]+/', '', $s)); - $s = trim(preg_replace('/(\s{2})+/', ' ', $s)); - } - - return $s; - } - - private function filterBlankValues($s){ - return preg_match("/\w+/", $s); - } - - - private function tidyThis($source) - { - switch ( $this->tidy_mode ) - { - case 'exec': - $tmp_file = $this->tmp_dir.md5($source).'.txt'; - file_put_contents($tmp_file, $source); - exec("tidy -utf8 -indent -asxhtml -numeric -bare -quiet $tmp_file", $tidy); - unlink($tmp_file); - return implode("\n", $tidy); - break; - - case 'php': - $tidy = tidy_parse_string($source); - return tidy_clean_repair($tidy); - break; - - default: - return $source; - break; - } - - } - - - private function loadProfile($profile) - { - require_once("$profile.profile.php"); - } - - - private function loadDoc($input_xml, $fragment=false) - { - $xml = simplexml_load_string($input_xml); - - $this->doc = $xml; - - if ($fragment){ - $doc = $xml->xpath("//*[@id='$fragment']"); - $xml = simplexml_load_string($doc[0]->asXML()); - $doc = null; - } - - // base tag - if ($xml->head->base['href']) $this->base = $xml->head->base['href']; - - // xml:base attribute - PITA with SimpleXML - preg_match('/xml:base="(.*)"/', $xml->asXML(), $matches); - if (is_array($matches) && sizeof($matches)>1) $this->base = $matches[1]; - - return $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]"); - - } - - - private function loadURL($url) - { - $this->url = $url; - - if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){ - $url = $this->tidy_proxy . 
$url; - } - - return @file_get_contents($url); - - } - - - private function postProcess($profile, $s) - { - $required = $this->required; - - if (is_array($s) && array_key_exists($required[0], $s)){ - $s = array($s); - } - - $s = $this->dedupeSingles($s); - - if (function_exists('hKit_'.$profile.'_post')){ - $s = call_user_func('hKit_'.$profile.'_post', $s); - } - - $s = $this->removeTextVals($s); - - return $s; - } - - - private function resolvePath($filepath) - { // ugly code ahoy: needs a serious tidy up - - $filepath = $filepath[0]; - - $base = $this->base; - $url = $this->url; - - if ($base != '' && strpos($base, '://') !== false) - $url = $base; - - $r = parse_url($url); - $domain = $r['scheme'] . '://' . $r['host']; - - if (!isset($r['path'])) $r['path'] = '/'; - $path = explode('/', $r['path']); - $file = explode('/', $filepath); - $new = array(''); - - if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){ - return $filepath; - } - - if ($file[0] == ''){ - // absolute path - return ''.$domain . implode('/', $file); - }else{ - // relative path - if ($path[sizeof($path)-1] == '') array_pop($path); - if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path); - - foreach ($file as $segment){ - if ($segment == '..'){ - array_pop($path); - }else{ - $new[] = $segment; - } - } - return ''.$domain . implode('/', $path) . 
implode('/', $new); - } - } - - private function resolveEmail($v) - { - $parts = parse_url($v[0]); - return ($parts['path']); - } - - - private function dedupeSingles($s) - { - $singles = $this->singles; - - foreach ($s as &$item){ - foreach ($singles as $classname){ - if (array_key_exists($classname, $item) && is_array($item[$classname])){ - if (isset($item[$classname][0])) $item[$classname] = $item[$classname][0]; - } - } - } - - return $s; - } - - private function removeTextVals($s) - { - foreach ($s as $key => &$val){ - if ($key){ - $k = $key; - }else{ - $k = ''; - } - - if (is_array($val)){ - $val = $this->removeTextVals($val); - }else{ - if ($k == 'text'){ - $val = ''; - } - } - } - - return array_filter($s); - } - - } - - -?> \ No newline at end of file diff --git a/plugins/OStatus/lib/discoveryhints.php b/plugins/OStatus/lib/discoveryhints.php index db13793dde..1bb0ad2aea 100644 --- a/plugins/OStatus/lib/discoveryhints.php +++ b/plugins/OStatus/lib/discoveryhints.php @@ -63,49 +63,12 @@ class DiscoveryHints { static function hcardHints($body, $url) { - common_debug("starting tidy"); - - $body = self::_tidy($body); - - common_debug("done with tidy"); - - set_include_path(get_include_path() . PATH_SEPARATOR . INSTALLDIR . 
'/plugins/OStatus/extlib/hkit/'); - require_once('hkit.class.php'); - - $h = new hKit; - - $hcards = $h->getByString('hcard', $body); - - if (empty($hcards)) { - return array(); - } - - if (count($hcards) == 1) { - $hcard = $hcards[0]; - } else { - foreach ($hcards as $try) { - if (array_key_exists('url', $try)) { - if (is_string($try['url']) && $try['url'] == $url) { - $hcard = $try; - break; - } else if (is_array($try['url'])) { - foreach ($try['url'] as $tryurl) { - if ($tryurl == $url) { - $hcard = $try; - break 2; - } - } - } - } - } - // last chance; grab the first one - if (empty($hcard)) { - $hcard = $hcards[0]; - } - } + $hcard = self::_hcard($body, $url); $hints = array(); + // XXX: don't copy stuff into an array and then copy it again + if (array_key_exists('nickname', $hcard)) { $hints['nickname'] = $hcard['nickname']; } @@ -117,7 +80,7 @@ class DiscoveryHints { } if (array_key_exists('photo', $hcard)) { - $hints['avatar'] = $hcard['photo']; + $hints['avatar'] = $hcard['photo'][0]; } if (array_key_exists('note', $hcard)) { @@ -144,39 +107,142 @@ class DiscoveryHints { return $hints; } - private static function _tidy($body) + static function _hcard($body, $url) { - if (function_exists('tidy_parse_string')) { - common_debug("Tidying with extension"); - $text = tidy_parse_string($body); - $text = tidy_clean_repair($text); - return $body; - } else if ($fullpath = self::_findProgram('tidy')) { - common_debug("Tidying with program $fullpath"); - $tempfile = tempnam('/tmp', 'snht'); // statusnet hcard tidy - file_put_contents($tempfile, $source); - exec("$fullpath -utf8 -indent -asxhtml -numeric -bare -quiet $tempfile", $tidy); - unlink($tempfile); - return implode("\n", $tidy); - } else { - common_debug("Not tidying."); - return $body; + // DOMDocument::loadHTML may throw warnings on unrecognized elements. 
+ + $old = error_reporting(error_reporting() & ~E_WARNING); + + $doc = new DOMDocument(); + $doc->loadHTML($body); + + error_reporting($old); + + $xp = new DOMXPath($doc); + + $hcardNodes = self::_getChildrenByClass($doc->documentElement, 'vcard', $xp); + + $hcards = array(); + + for ($i = 0; $i < $hcardNodes->length; $i++) { + + $hcardNode = $hcardNodes->item($i); + + $hcard = self::_hcardFromNode($hcardNode, $xp, $url); + + $hcards[] = $hcard; } - } - private static function _findProgram($name) - { - $path = $_ENV['PATH']; + $repr = null; - $parts = explode(':', $path); - - foreach ($parts as $part) { - $fullpath = $part . '/' . $name; - if (is_executable($fullpath)) { - return $fullpath; + foreach ($hcards as $hcard) { + if (in_array($url, $hcard['url'])) { + $repr = $hcard; + break; } } - return null; + if (!is_null($repr)) { + return $repr; + } else if (count($hcards) > 0) { + return $hcards[0]; + } else { + return null; + } + } + + function _getChildrenByClass($el, $cls, $xp) + { + // borrowed from hkit. Thanks dudes! 
+ + $qry = ".//*[contains(concat(' ',normalize-space(@class),' '),' $cls ')]"; + + $nodes = $xp->query($qry, $el); + + return $nodes; + } + + function _hcardFromNode($hcardNode, $xp, $base) + { + $hcard = array(); + + $hcard['url'] = array(); + + $urlNodes = self::_getChildrenByClass($hcardNode, 'url', $xp); + + for ($j = 0; $j < $urlNodes->length; $j++) { + + $urlNode = $urlNodes->item($j); + + if ($urlNode->hasAttribute('href')) { + $url = $urlNode->getAttribute('href'); + } else { + $url = $urlNode->textContent; + } + + $hcard['url'][] = self::_rel2abs($url, $base); + } + + $hcard['photo'] = array(); + + $photoNodes = self::_getChildrenByClass($hcardNode, 'photo', $xp); + + for ($j = 0; $j < $photoNodes->length; $j++) { + $photoNode = $photoNodes->item($j); + if ($photoNode->hasAttribute('src')) { + $url = $photoNode->getAttribute('src'); + } else if ($photoNode->hasAttribute('href')) { + $url = $photoNode->getAttribute('href'); + } else { + $url = $photoNode->textContent; + } + $hcard['photo'][] = self::_rel2abs($url, $base); + } + + $singles = array('nickname', 'note', 'fn', 'n', 'adr'); + + foreach ($singles as $single) { + + $nodes = self::_getChildrenByClass($hcardNode, $single, $xp); + + if ($nodes->length > 0) { + $node = $nodes->item(0); + $hcard[$single] = $node->textContent; + } + } + + return $hcard; + } + + // XXX: this is a first pass; we probably need + // to handle things like ../ and ./ and so on + + static function _rel2abs($rel, $wrt) + { + $parts = parse_url($rel); + + if ($parts === false) { + return false; + } + + // If it's got a scheme, use it + + if ($parts['scheme'] != '') { + return $rel; + } + + $w = parse_url($wrt); + + $base = $w['scheme'].'://'.$w['host']; + + if ($rel[0] == '/') { + return $base.$rel; + } + + $wp = explode('/', $w['path']); + + array_pop($wp); + + return $base.implode('/', $wp).'/'.$rel; } } From 05e3768e6a833d99a5180d4306d68f59e2d8f8c9 Mon Sep 17 00:00:00 2001 From: Evan Prodromou Date: Fri, 19 Mar 2010 09:48:39 
-0500 Subject: [PATCH 3/3] Parse RSS items as activities First steps to parsing RSS items as activities. RSS feeds don't seem to have enough data to make good remote profiles, but this may work with some "hints". --- lib/activity.php | 272 ++++++++++++++++++++++++++++++----- tests/ActivityParseTests.php | 95 +++++++++++- 2 files changed, 333 insertions(+), 34 deletions(-) diff --git a/lib/activity.php b/lib/activity.php index c67d090f72..5b304020d6 100644 --- a/lib/activity.php +++ b/lib/activity.php @@ -643,38 +643,11 @@ class ActivityObject ); if ($element->tagName == 'author') { - - $this->type = self::PERSON; // XXX: is this fair? - $this->title = $this->_childContent($element, self::NAME); - $this->id = $this->_childContent($element, self::URI); - - if (empty($this->id)) { - $email = $this->_childContent($element, self::EMAIL); - if (!empty($email)) { - // XXX: acct: ? - $this->id = 'mailto:'.$email; - } - } - + $this->_fromAuthor($element); + } else if ($element->tagName == 'item') { + $this->_fromRssItem($element); } else { - - $this->type = $this->_childContent($element, Activity::OBJECTTYPE, - Activity::SPEC); - - if (empty($this->type)) { - $this->type = ActivityObject::NOTE; - } - - $this->id = $this->_childContent($element, self::ID); - $this->title = $this->_childContent($element, self::TITLE); - $this->summary = $this->_childContent($element, self::SUMMARY); - - $this->source = $this->_getSource($element); - - $this->content = ActivityUtils::getContent($element); - - $this->link = ActivityUtils::getPermalink($element); - + $this->_fromAtomEntry($element); } // Some per-type attributes... @@ -697,6 +670,72 @@ class ActivityObject } } + private function _fromAuthor($element) + { + $this->type = self::PERSON; // XXX: is this fair? 
+ $this->title = $this->_childContent($element, self::NAME); + $this->id = $this->_childContent($element, self::URI); + + if (empty($this->id)) { + $email = $this->_childContent($element, self::EMAIL); + if (!empty($email)) { + // XXX: acct: ? + $this->id = 'mailto:'.$email; + } + } + } + + private function _fromAtomEntry($element) + { + $this->type = $this->_childContent($element, Activity::OBJECTTYPE, + Activity::SPEC); + + if (empty($this->type)) { + $this->type = ActivityObject::NOTE; + } + + $this->id = $this->_childContent($element, self::ID); + $this->title = $this->_childContent($element, self::TITLE); + $this->summary = $this->_childContent($element, self::SUMMARY); + + $this->source = $this->_getSource($element); + + $this->content = ActivityUtils::getContent($element); + + $this->link = ActivityUtils::getPermalink($element); + } + + // @fixme rationalize with Activity::_fromRssItem() + + private function _fromRssItem($item) + { + $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, Activity::RSS); + + $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, Activity::CONTENTNS); + + if (!empty($contentEl)) { + $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES); + } else { + $descriptionEl = ActivityUtils::child($item, Activity::DESCRIPTION, Activity::RSS); + if (!empty($descriptionEl)) { + $this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES); + } + } + + $this->link = ActivityUtils::childContent($item, ActivityUtils::LINK, Activity::RSS); + + $guidEl = ActivityUtils::child($item, Activity::GUID, Activity::RSS); + + if (!empty($guidEl)) { + $this->id = $guidEl->textContent; + + if ($guidEl->hasAttribute('isPermaLink')) { + // overwrites + $this->link = $this->id; + } + } + } + private function _childContent($element, $tag, $namespace=ActivityUtils::ATOM) { return ActivityUtils::childContent($element, $tag, $namespace); @@ -1051,6 +1090,21 @@ class Activity const PUBLISHED 
= 'published'; const UPDATED = 'updated'; + const RSS = null; // no namespace! + + const PUBDATE = 'pubDate'; + const DESCRIPTION = 'description'; + const GUID = 'guid'; + const SELF = 'self'; + const IMAGE = 'image'; + const URL = 'url'; + + const DC = 'http://purl.org/dc/elements/1.1/'; + + const CREATOR = 'creator'; + + const CONTENTNS = 'http://purl.org/rss/1.0/modules/content/'; + public $actor; // an ActivityObject public $verb; // a string (the URL) public $object; // an ActivityObject @@ -1081,8 +1135,6 @@ class Activity return; } - $this->entry = $entry; - // Insist on a feed's root DOMElement; don't allow a DOMDocument if ($feed instanceof DOMDocument) { throw new ClientException( @@ -1090,8 +1142,22 @@ class Activity ); } + $this->entry = $entry; $this->feed = $feed; + if ($entry->namespaceURI == Activity::ATOM && + $entry->localName == 'entry') { + $this->_fromAtomEntry($entry, $feed); + } else if ($entry->namespaceURI == Activity::RSS && + $entry->localName == 'item') { + $this->_fromRssItem($entry, $feed); + } else { + throw new Exception("Unknown DOM element: {$entry->namespaceURI} {$entry->localName}"); + } + } + + function _fromAtomEntry($entry, $feed) + { $pubEl = $this->_child($entry, self::PUBLISHED, self::ATOM); if (!empty($pubEl)) { @@ -1177,6 +1243,69 @@ class Activity } } + function _fromRssItem($item, $rss) + { + $verbEl = $this->_child($item, self::VERB); + + if (!empty($verbEl)) { + $this->verb = trim($verbEl->textContent); + } else { + $this->verb = ActivityVerb::POST; + // XXX: do other implied stuff here + } + + $pubDateEl = $this->_child($item, self::PUBDATE, self::RSS); + + if (!empty($pubDateEl)) { + $this->time = strtotime($pubDateEl->textContent); + } + + $authorEl = $this->_child($item, self::AUTHOR, self::RSS); + + if (!empty($authorEl)) { + $this->actor = $this->_fromRssAuthor($authorEl); + } else { + $dcCreatorEl = $this->_child($item, self::CREATOR, self::DC); + if (!empty($dcCreatorEl)) { + $this->actor = 
$this->_fromDcCreator($dcCreatorEl); + } else if (!empty($rss)) { + $this->actor = $this->_fromRss($rss); + } + } + + $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, self::RSS); + + $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, self::CONTENTNS); + + if (!empty($contentEl)) { + $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES); + } else { + $descriptionEl = ActivityUtils::child($item, self::DESCRIPTION, self::RSS); + if (!empty($descriptionEl)) { + $this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES); + } + } + + $this->link = ActivityUtils::childContent($item, ActivityUtils::LINK, self::RSS); + + // @fixme enclosures + // @fixme thumbnails... maybe + + $guidEl = ActivityUtils::child($item, self::GUID, self::RSS); + + if (!empty($guidEl)) { + $this->id = $guidEl->textContent; + + if ($guidEl->hasAttribute('isPermaLink') && $guidEl->getAttribute('isPermaLink') != 'false') { + // overwrites + $this->link = $this->id; + } + } + + $this->object = new ActivityObject($item); + $this->context = new ActivityContext($item); + } + /** * Returns an Atom based on this activity * @@ -1249,6 +1378,83 @@ class Activity return $xs->getString(); } + function _fromRssAuthor($el) + { + $text = $el->textContent; + + if (preg_match('/^(.*?) \((.*)\)$/', $text, $match)) { + $email = $match[1]; + $name = $match[2]; + } else if (preg_match('/^(.*?) 
<(.*)>$/', $text, $match)) { + $name = $match[1]; + $email = $match[2]; + } else if (preg_match('/.*@.*/', $text)) { + $email = $text; + $name = null; + } else { + $name = $text; + $email = null; + } + + // Not really enough info + + $actor = new ActivityObject(); + + $actor->element = $el; + + $actor->type = ActivityObject::PERSON; + $actor->title = $name; + + if (!empty($email)) { + $actor->id = 'mailto:'.$email; + } + + return $actor; + } + + function _fromDcCreator($el) + { + // Not really enough info + + $text = $el->textContent; + + $actor = new ActivityObject(); + + $actor->element = $el; + + $actor->title = $text; + $actor->type = ActivityObject::PERSON; + + return $actor; + } + + function _fromRss($el) + { + $actor = new ActivityObject(); + + $actor->element = $el; + + $actor->type = ActivityObject::PERSON; // @fixme guess better + + $actor->title = ActivityUtils::childContent($el, ActivityObject::TITLE, self::RSS); + $actor->link = ActivityUtils::childContent($el, ActivityUtils::LINK, self::RSS); + $actor->id = ActivityUtils::getLink($el, self::SELF); + + $desc = ActivityUtils::childContent($el, self::DESCRIPTION, self::RSS); + + if (!empty($desc)) { + $actor->content = htmlspecialchars_decode($desc, ENT_QUOTES); + } + + $imageEl = ActivityUtils::child($el, self::IMAGE, self::RSS); + + if (!empty($imageEl)) { + $actor->avatarLinks[] = ActivityUtils::childContent($imageEl, self::URL, self::RSS); + } + + return $actor; + } + private function _child($element, $tag, $namespace=self::SPEC) { return ActivityUtils::child($element, $tag, $namespace); diff --git a/tests/ActivityParseTests.php b/tests/ActivityParseTests.php index 7bf9cec7c4..b6980a6bb9 100644 --- a/tests/ActivityParseTests.php +++ b/tests/ActivityParseTests.php @@ -138,9 +138,38 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase $this->assertEquals($poco->urls[0]->value, 'http://example.com/blog.html'); $this->assertEquals($poco->urls[0]->primary, 'true'); 
$this->assertEquals($act->actor->geopoint, '37.7749295 -122.4194155'); - } + public function testExample6() + { + global $_example6; + + $dom = DOMDocument::loadXML($_example6); + + $rss = $dom->documentElement; + + $channels = $dom->getElementsByTagName('channel'); + + $channel = $channels->item(0); + + $items = $channel->getElementsByTagName('item'); + + $item = $items->item(0); + + $act = new Activity($item, $channel); + + $this->assertEquals($act->verb, ActivityVerb::POST); + + $this->assertEquals($act->id, 'http://en.blog.wordpress.com/?p=3857'); + $this->assertEquals($act->link, 'http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/'); + $this->assertEquals($act->title, 'Rub-a-Dub-Dub in the PubSubHubbub'); + $this->assertEquals($act->time, 1267634892); + + $actor = $act->actor; + + $this->assertFalse(empty($actor)); + $this->assertEquals($actor->title, "Joseph Scott"); + } } $_example1 = << EXAMPLE5; + +$_example6 = << + + + + WordPress.com News + + http://en.blog.wordpress.com + The latest news on WordPress.com and the WordPress community. + Thu, 18 Mar 2010 23:25:35 +0000 + + http://wordpress.com/ + en + hourly + 1 + + + http://www.gravatar.com/blavatar/e6392390e3bcfadff3671c5a5653d95b?s=96&d=http://s2.wp.com/i/buttonw-com.png + WordPress.com News + http://en.blog.wordpress.com + + + +EXAMPLE6; +