Remove hkit and do our own hcard parsing

Parsing hcards for the data we need wasn't hard enough to justify using
hkit. It depended on a number of external systems (something to
run tidy), and could only handle XHTML.

We now parse HTML with the PHP DOM libraries used elsewhere, and
scrape out our own hcards. This seems to work better and faster and, most of
all, it works with Google Buzz profile URLs.
This commit is contained in:
Evan Prodromou 2010-03-18 20:52:00 -05:00
parent dbd44e51a2
commit 17c50f338c
3 changed files with 134 additions and 648 deletions

View File

@ -1,105 +0,0 @@
<?php
// hcard profile for hkit
//
// This file is not a standalone script: every statement assigns to $this, so
// it has to be included from inside a method of the parser object.
// hKit::loadProfile() does that with require_once("$profile.profile.php");
// presumably this file is the 'hcard' profile -- confirm against its file name.

// Class name that marks the root element of one card.
$this->root_class = 'vcard';
// Property class names to look for inside a card. A name that is immediately
// followed by an array is a compound property: the array lists the class
// names of its sub-properties (hKit::processNodes() checks the next entry
// with is_array() to decide this).
$this->classes = array(
'fn', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'),
'n', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'),
'adr', array('post-office-box', 'extended-address', 'street-address', 'postal-code', 'country-name', 'type', 'region', 'locality'),
'label', 'bday', 'agent', 'nickname', 'photo', 'class',
'email', array('type', 'value'),
'category', 'key', 'logo', 'mailer', 'note',
'org', array('organization-name', 'organization-unit'),
'tel', array('type', 'value'),
'geo', array('latitude', 'longitude'),
'tz', 'uid', 'url', 'rev', 'role', 'sort-string', 'sound', 'title'
);
// classes that must only appear once per card
// (hKit::dedupeSingles() keeps element 0 when several values were collected)
$this->singles = array(
'fn'
);
// classes that are required (not strictly enforced - give at least one!)
// hKit::postProcess() only looks at the first entry, using its presence to
// tell a single card apart from a list of cards.
$this->required = array(
'fn'
);
// Per-property attribute overrides in 'TAG|attribute' form: when a property is
// found on an element with that (upper-cased) tag name, hKit::getNodeValue()
// reads the named attribute instead of the element text.
$this->att_map = array(
'fn' => array('IMG|alt'),
'url' => array('A|href', 'IMG|src', 'AREA|href'),
'photo' => array('IMG|src'),
'bday' => array('ABBR|title'),
'logo' => array('IMG|src'),
'email' => array('A|href'),
'geo' => array('ABBR|title')
);
// Per-property value filters. hKit::getNodeValue() hands each one to
// preg_replace_callback(), so the callback receives a match array and the raw
// value is element 0 of that array.
$this->callbacks = array(
'url' => array($this, 'resolvePath'),
'photo' => array($this, 'resolvePath'),
'logo' => array($this, 'resolvePath'),
'email' => array($this, 'resolveEmail')
);
/**
 * Post-processing hook for the hcard profile.
 *
 * Runs both implied-"n" passes over every parsed card and hands the
 * (possibly augmented) list back. hKit::postProcess() looks this function up
 * by name ('hKit_' . $profile . '_post').
 *
 * @param array $a list of parsed vcard arrays
 * @return array the same list with 'n' filled in where it could be implied
 */
function hKit_hcard_post($a)
{
    foreach ($a as &$card) {
        hKit_implied_n_optimization($card);
        hKit_implied_n_from_fn($card);
    }
    // Drop the reference so $card no longer aliases the last element.
    unset($card);

    return $a;
}
/**
 * Implied "n" optimization: derive a structured name from a plain-text 'fn'.
 *
 * Only applies when 'fn' is a plain (non-array) value, the card has no 'n'
 * yet, 'fn' is not simply the organisation name, and 'fn' splits into exactly
 * two pieces on a single space. The first matching rule wins.
 *
 * @param array $vcard parsed card, modified in place ('n' may be added)
 * @return void
 */
function hKit_implied_n_optimization(&$vcard)
{
    if (!array_key_exists('fn', $vcard) || is_array($vcard['fn'])) {
        return;
    }
    if (array_key_exists('n', $vcard)) {
        return;
    }
    if (array_key_exists('org', $vcard) && $vcard['fn'] == $vcard['org']) {
        return;
    }
    if (count(explode(' ', $vcard['fn'])) != 2) {
        return;
    }

    // Each rule: regex, capture index of the given name, capture index of the
    // family name. Order matters; more specific shapes come first.
    $rules = array(
        array('/^(\S+),\s*(\S{1})$/', 2, 1),     // Lastname, Initial
        array('/^(\S+)\s*(\S{1})\.*$/', 2, 1),   // Lastname Initial(.)
        array('/^(\S+),\s*(\S+)$/', 2, 1),       // Lastname, Firstname
        array('/^(\S+)\s*(\S+)$/', 1, 2),        // Firstname Lastname
    );

    foreach ($rules as $rule) {
        list($regex, $givenIdx, $familyIdx) = $rule;
        if (preg_match($regex, $vcard['fn'], $hit) === 1) {
            $vcard['n'] = array(
                'given-name'  => $hit[$givenIdx],
                'family-name' => $hit[$familyIdx],
            );
            return;
        }
    }
}
/**
 * Implied "n" from a structured 'fn'.
 *
 * When 'fn' was parsed as an array (it carried name sub-properties), that
 * array is reused as 'n' unless the card already has an 'n' or 'fn' equals
 * 'org'. In every structured case 'fn' is then collapsed to its 'text' entry.
 *
 * @param array $vcard parsed card, modified in place
 * @return void
 */
function hKit_implied_n_from_fn(&$vcard)
{
    $structuredFn = array_key_exists('fn', $vcard) && is_array($vcard['fn']);
    if (!$structuredFn) {
        return;
    }

    if (!array_key_exists('n', $vcard)
        && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])) {
        $vcard['n'] = $vcard['fn'];
    }

    $vcard['fn'] = $vcard['fn']['text'];
}
?>

View File

@ -1,475 +0,0 @@
<?php
/*
hKit Library for PHP5 - a generic library for parsing Microformats
Copyright (C) 2006 Drew McLellan
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Author
Drew McLellan - http://allinthehead.com/
Contributors:
Scott Reynen - http://www.randomchaos.com/
Version 0.5, 22-Jul-2006
fixed by-ref issue cropping up in PHP 5.0.5
fixed a bug with a@title
added support for new fn=n optimisation
added support for new a.include include-pattern
Version 0.4, 23-Jun-2006
prevented nested includes from causing infinite loops
returns false if URL can't be fetched
added pre-flight check for base support level
added deduping of once-only classnames
prevented accumulation of multiple 'value' values
tuned whitespace handling and treatment of DEL elements
Version 0.3, 21-Jun-2006
added post-processor callback method into profiles
fixed minor problems raised by hcard testsuite
added support for include-pattern
added support for td@headers pattern
added implied-n optimization into default hcard profile
Version 0.2, 20-Jun-2006
added class callback mechanism
added resolvePath & resolveEmail
added basic BASE support
Version 0.1.1, 19-Jun-2006 (different timezone, no time machine)
added external Tidy option
Version 0.1, 20-Jun-2006
initial release
*/
/**
 * Generic microformat parser driven by a "profile" file.
 *
 * A profile ("<name>.profile.php") is included from inside the object and
 * fills in the private configuration below (root class, property classes,
 * attribute map, callbacks). Input must be well-formed XML/XHTML because it
 * is loaded with SimpleXML; tidyThis() can be used to get there.
 */
class hKit
{
    public $tidy_mode = 'proxy'; // 'proxy', 'exec', 'php' or 'none'
    public $tidy_proxy = 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr='; // required only for tidy_mode=proxy
    public $tmp_dir = '/path/to/writable/dir/'; // required only for tidy_mode=exec

    // Set by the profile file (see loadProfile()).
    private $root_class = '';
    private $classes = '';
    private $singles = '';
    private $required = '';
    private $att_map = '';
    private $callbacks = '';
    private $processor = '';

    // State of the document currently being parsed.
    private $url = '';
    private $base = '';
    private $doc = '';

    /**
     * Pre-flight check: abort if the functions the parser relies on are
     * missing from this PHP build.
     */
    public function __construct()
    {
        $required = array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string');
        $missing = array();
        foreach ($required as $f) {
            if (!function_exists($f)) {
                $missing[] = $f . '()';
            }
        }
        if ($missing) {
            die('hKit error: these required functions are not available: <strong>' . implode(', ', $missing) . '</strong>');
        }
    }

    /**
     * Legacy PHP4-style constructor name, kept so explicit calls still work.
     * It is declared after __construct() on purpose: PHP then never treats it
     * as a second constructor.
     */
    public function hKit()
    {
        $this->__construct();
    }

    /**
     * Fetch a URL and parse the microformats described by $profile from it.
     *
     * A "#fragment" on the URL restricts parsing to the element with that id.
     *
     * @param string $profile profile name (file "<profile>.profile.php")
     * @param string $url     document URL
     * @return array|false parsed items, or false on missing arguments or a
     *                     failed fetch
     */
    public function getByURL($profile='', $url='')
    {
        if ($profile == '' || $url == '') return false;

        $this->loadProfile($profile);
        $source = $this->loadURL($url);
        if (!$source) {
            return false;
        }

        $tidy_xhtml = $this->tidyThis($source);

        $fragment = false;
        if (strrchr($url, '#')) {
            // array_pop() takes its argument by reference, so it needs a
            // real variable rather than the explode() result.
            $pieces = explode('#', $url);
            $fragment = array_pop($pieces);
        }

        $doc = $this->loadDoc($tidy_xhtml, $fragment);
        $s = $this->processNodes($doc, $this->classes);
        $s = $this->postProcess($profile, $s);
        return $s;
    }

    /**
     * Parse the microformats described by $profile from an XML string.
     *
     * @param string $profile   profile name (file "<profile>.profile.php")
     * @param string $input_xml well-formed XML/XHTML
     * @return array|false parsed items, or false on missing arguments
     */
    public function getByString($profile='', $input_xml='')
    {
        if ($profile == '' || $input_xml == '') return false;

        $this->loadProfile($profile);
        $doc = $this->loadDoc($input_xml);
        $s = $this->processNodes($doc, $this->classes);
        $s = $this->postProcess($profile, $s);
        return $s;
    }

    /**
     * Extract property values for every class in $classes from each item.
     *
     * $classes is a flat list in which a string may be followed by an array
     * of sub-property names; such a pair is parsed recursively and stored as
     * an array with the node's own value under 'text'.
     *
     * The statement order here is load-bearing (values are accumulated and
     * merged as they are met), so the logic is left exactly as written.
     *
     * @param mixed $items          iterable of SimpleXMLElement roots
     * @param array $classes        class list as described above
     * @param bool  $allow_includes whether to follow the include-pattern
     *                              (disabled on the recursive call so that
     *                              nested includes cannot loop)
     * @return array list of items when more than one was found, otherwise the
     *               single item's data, otherwise an empty array
     */
    private function processNodes($items, $classes, $allow_includes=true){
        $out = array();
        foreach($items as $item){
            $data = array();
            for ($i=0; $i<sizeof($classes); $i++){
                // Arrays are sub-property lists belonging to the previous
                // entry; they are consumed via $classes[$i+1] below.
                if (!is_array($classes[$i])){
                    $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $classes[$i] . " ')]";
                    $results = $item->xpath($xpath);
                    if ($results){
                        foreach ($results as $result){
                            if (isset($classes[$i+1]) && is_array($classes[$i+1])){
                                // compound property: parse sub-properties
                                $nodes = $this->processNodes($results, $classes[$i+1]);
                                if (sizeof($nodes) > 0){
                                    $nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes);
                                    $data[$classes[$i]] = $nodes;
                                }else{
                                    $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
                                }
                            }else{
                                if (isset($data[$classes[$i]])){
                                    if (is_array($data[$classes[$i]])){
                                        // is already an array - append
                                        $data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]);
                                    }else{
                                        // make it an array
                                        if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern
                                            $data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]);
                                        }else{
                                            $old_val = $data[$classes[$i]];
                                            $data[$classes[$i]] = array($old_val, $this->getNodeValue($result, $classes[$i]));
                                            $old_val = false;
                                        }
                                    }
                                }else{
                                    // set as normal value
                                    $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
                                }
                            }
                            // td@headers pattern
                            if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){
                                $include_ids = explode(' ', $result['headers']);
                                $doc = $this->doc;
                                foreach ($include_ids as $id){
                                    $xpath = "//*[@id='$id']/..";
                                    $includes = $doc->xpath($xpath);
                                    foreach ($includes as $include){
                                        $tmp = $this->processNodes($include, $this->classes);
                                        if (is_array($tmp)) $data = array_merge($data, $tmp);
                                    }
                                }
                            }
                        }
                    }
                }
                $result = false;
            }
            // include-pattern
            if ($allow_includes){
                $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]";
                $results = $item->xpath($xpath);
                if ($results){
                    foreach ($results as $result){
                        $tagName = strtoupper(dom_import_simplexml($result)->tagName);
                        if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href']))
                            && preg_match('/\binclude\b/', $result['class'])){
                            $att = ($tagName == "OBJECT" ? 'data' : 'href');
                            $id = str_replace('#', '', $result[$att]);
                            $doc = $this->doc;
                            $xpath = "//*[@id='$id']";
                            $includes = $doc->xpath($xpath);
                            foreach ($includes as $include){
                                $include = simplexml_load_string('<root1><root2>'.$include->asXML().'</root2></root1>'); // don't ask.
                                $tmp = $this->processNodes($include, $this->classes, false);
                                if (is_array($tmp)) $data = array_merge($data, $tmp);
                            }
                        }
                    }
                }
            }
            $out[] = $data;
        }
        if (sizeof($out) > 1){
            return $out;
        }else if (isset($data)){
            return $data;
        }else{
            return array();
        }
    }

    /**
     * Work out the value of one property node.
     *
     * Preference order: attribute named by the profile's att_map for this
     * tag, then OBJECT@data, IMG@alt, AREA@alt, @title (not on A), and finally
     * the node's text. The profile callback for the class (if any) is applied
     * and, except inside PRE, whitespace is normalised.
     *
     * @param SimpleXMLElement $node
     * @param string           $className property class being read
     * @return string|false false for DEL elements
     */
    private function getNodeValue($node, $className)
    {
        $tag_name = strtoupper(dom_import_simplexml($node)->tagName);
        $s = false;

        // ignore DEL tags
        if ($tag_name == 'DEL') return $s;

        // look up att map values ('TAG|attribute')
        if (array_key_exists($className, $this->att_map)){
            foreach ($this->att_map[$className] as $map){
                if (preg_match("/$tag_name\|/", $map)){
                    // array_pop() needs a real variable (by-reference arg).
                    $pieces = explode('|', $map);
                    $s = ''.$node[array_pop($pieces)];
                }
            }
        }

        // if nothing and OBJ, try data.
        if (!$s && $tag_name=='OBJECT' && $node['data']) $s = ''.$node['data'];
        // if nothing and IMG, try alt.
        if (!$s && $tag_name=='IMG' && $node['alt']) $s = ''.$node['alt'];
        // if nothing and AREA, try alt.
        if (!$s && $tag_name=='AREA' && $node['alt']) $s = ''.$node['alt'];
        // if nothing and not A, try title.
        if (!$s && $tag_name!='A' && $node['title']) $s = ''.$node['title'];

        // if nothing found, go with node text
        if (!$s) {
            // Separator first: the (array, separator) order is no longer
            // accepted by implode() on current PHP.
            $s = implode(' ', array_filter($node->xpath('child::node()'), array($this, "filterBlankValues")));
        }

        // callbacks
        if (array_key_exists($className, $this->callbacks)){
            $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1);
        }

        // trim and remove line breaks
        if ($tag_name != 'PRE'){
            $s = trim(preg_replace('/[\r\n\t]+/', '', $s));
            $s = trim(preg_replace('/(\s{2})+/', ' ', $s));
        }
        return $s;
    }

    /**
     * array_filter() callback: keep values that contain a word character.
     */
    private function filterBlankValues($s){
        return preg_match("/\w+/", $s);
    }

    /**
     * Clean up markup according to $tidy_mode.
     *
     * 'exec' shells out to the tidy binary via a temp file in $tmp_dir,
     * 'php' uses the tidy extension; any other mode returns the input as is
     * (in 'proxy' mode the fetch in loadURL() already went through the proxy).
     *
     * @param string $source raw markup
     * @return string
     */
    private function tidyThis($source)
    {
        switch ( $this->tidy_mode )
        {
            case 'exec':
                $tmp_file = $this->tmp_dir.md5($source).'.txt';
                file_put_contents($tmp_file, $source);
                // Quote the path: $tmp_dir is configurable and may contain
                // spaces or shell metacharacters.
                exec('tidy -utf8 -indent -asxhtml -numeric -bare -quiet ' . escapeshellarg($tmp_file), $tidy);
                unlink($tmp_file);
                return implode("\n", $tidy);

            case 'php':
                $tidy = tidy_parse_string($source);
                // tidy_clean_repair() only reports success; the repaired
                // markup has to be read back from the tidy object.
                tidy_clean_repair($tidy);
                return tidy_get_output($tidy);

            default:
                return $source;
        }
    }

    /**
     * Include the profile file, which configures this object via $this.
     *
     * NOTE(review): require_once means a given profile file runs only once
     * per request, so a second hKit instance would keep its empty defaults.
     * It cannot simply become require, because the profile may also declare
     * global functions (which cannot be declared twice).
     */
    private function loadProfile($profile)
    {
        require_once("$profile.profile.php");
    }

    /**
     * Parse the XML, remember it as the current document, record any base
     * URL and return the root-class elements to process.
     *
     * @param string       $input_xml
     * @param string|false $fragment  id of the element to restrict parsing to
     * @return array list of SimpleXMLElement roots; empty when the input is
     *               not well-formed or the fragment id does not exist
     */
    private function loadDoc($input_xml, $fragment=false)
    {
        $xml = simplexml_load_string($input_xml);
        $this->doc = $xml;
        if ($xml === false) {
            // Not well-formed: nothing to process.
            return array();
        }

        if ($fragment){
            $doc = $xml->xpath("//*[@id='$fragment']");
            if (empty($doc)) {
                return array();
            }
            $xml = simplexml_load_string($doc[0]->asXML());
            if ($xml === false) {
                return array();
            }
            $doc = null;
        }

        // base tag
        if ($xml->head->base['href']) $this->base = (string) $xml->head->base['href'];

        // xml:base attribute - PITA with SimpleXML
        preg_match('/xml:base="(.*)"/', $xml->asXML(), $matches);
        if (is_array($matches) && sizeof($matches)>1) $this->base = $matches[1];

        return $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]");
    }

    /**
     * Fetch a URL (through the tidy proxy in 'proxy' mode) and remember the
     * original URL for later path resolution.
     *
     * @return string|false body, or false when the fetch fails
     */
    private function loadURL($url)
    {
        $this->url = $url;

        if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){
            $url = $this->tidy_proxy . $url;
        }

        return @file_get_contents($url);
    }

    /**
     * Normalise the parse result: always a list of items, once-only classes
     * deduplicated, the profile's post hook applied, helper 'text' values
     * removed.
     */
    private function postProcess($profile, $s)
    {
        $required = $this->required;

        // A single item comes back unwrapped; detect it by its required key.
        if (is_array($s) && array_key_exists($required[0], $s)){
            $s = array($s);
        }

        $s = $this->dedupeSingles($s);

        if (function_exists('hKit_'.$profile.'_post')){
            $s = call_user_func('hKit_'.$profile.'_post', $s);
        }

        $s = $this->removeTextVals($s);

        return $s;
    }

    /**
     * preg_replace_callback() callback: turn a relative path into an
     * absolute URL against the document base (or the document URL).
     *
     * @param array $filepath match array; element 0 is the path
     * @return string
     */
    private function resolvePath($filepath)
    { // ugly code ahoy: needs a serious tidy up
        $filepath = $filepath[0];

        $base = $this->base;
        $url = $this->url;

        if ($base != '' && strpos($base, '://') !== false)
            $url = $base;

        $r = parse_url($url);
        $domain = $r['scheme'] . '://' . $r['host'];

        if (!isset($r['path'])) $r['path'] = '/';
        $path = explode('/', $r['path']);
        $file = explode('/', $filepath);
        $new = array('');

        // already absolute, or an inline data: URI
        if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){
            return $filepath;
        }

        if ($file[0] == ''){
            // absolute path
            return ''.$domain . implode('/', $file);
        }else{
            // relative path
            if ($path[sizeof($path)-1] == '') array_pop($path);
            if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path);

            foreach ($file as $segment){
                if ($segment == '..'){
                    array_pop($path);
                }else{
                    $new[] = $segment;
                }
            }
            return ''.$domain . implode('/', $path) . implode('/', $new);
        }
    }

    /**
     * preg_replace_callback() callback: strip the scheme (e.g. "mailto:")
     * from an e-mail link.
     *
     * @param array $v match array; element 0 is the link
     * @return string the address, or '' when the link has no path part
     */
    private function resolveEmail($v)
    {
        $parts = parse_url($v[0]);
        return (is_array($parts) && isset($parts['path'])) ? $parts['path'] : '';
    }

    /**
     * For classes the profile marks as once-only, keep just the first value
     * when several were collected.
     */
    private function dedupeSingles($s)
    {
        $singles = $this->singles;

        foreach ($s as &$item){
            foreach ($singles as $classname){
                if (array_key_exists($classname, $item) && is_array($item[$classname])){
                    if (isset($item[$classname][0])) $item[$classname] = $item[$classname][0];
                }
            }
        }
        unset($item); // do not leave a dangling reference into $s

        return $s;
    }

    /**
     * Recursively blank out 'text' helper entries and drop empty values.
     */
    private function removeTextVals($s)
    {
        foreach ($s as $key => &$val){
            // Falsy keys (notably integer 0) are mapped to '' first so that
            // the loose comparison with 'text' below can never match them.
            if ($key){
                $k = $key;
            }else{
                $k = '';
            }

            if (is_array($val)){
                $val = $this->removeTextVals($val);
            }else{
                if ($k == 'text'){
                    $val = '';
                }
            }
        }
        unset($val); // do not leave a dangling reference into $s

        return array_filter($s);
    }
}
?>

View File

@ -63,49 +63,12 @@ class DiscoveryHints {
static function hcardHints($body, $url) static function hcardHints($body, $url)
{ {
common_debug("starting tidy"); $hcard = self::_hcard($body, $url);
$body = self::_tidy($body);
common_debug("done with tidy");
set_include_path(get_include_path() . PATH_SEPARATOR . INSTALLDIR . '/plugins/OStatus/extlib/hkit/');
require_once('hkit.class.php');
$h = new hKit;
$hcards = $h->getByString('hcard', $body);
if (empty($hcards)) {
return array();
}
if (count($hcards) == 1) {
$hcard = $hcards[0];
} else {
foreach ($hcards as $try) {
if (array_key_exists('url', $try)) {
if (is_string($try['url']) && $try['url'] == $url) {
$hcard = $try;
break;
} else if (is_array($try['url'])) {
foreach ($try['url'] as $tryurl) {
if ($tryurl == $url) {
$hcard = $try;
break 2;
}
}
}
}
}
// last chance; grab the first one
if (empty($hcard)) {
$hcard = $hcards[0];
}
}
$hints = array(); $hints = array();
// XXX: don't copy stuff into an array and then copy it again
if (array_key_exists('nickname', $hcard)) { if (array_key_exists('nickname', $hcard)) {
$hints['nickname'] = $hcard['nickname']; $hints['nickname'] = $hcard['nickname'];
} }
@ -117,7 +80,7 @@ class DiscoveryHints {
} }
if (array_key_exists('photo', $hcard)) { if (array_key_exists('photo', $hcard)) {
$hints['avatar'] = $hcard['photo']; $hints['avatar'] = $hcard['photo'][0];
} }
if (array_key_exists('note', $hcard)) { if (array_key_exists('note', $hcard)) {
@ -144,39 +107,142 @@ class DiscoveryHints {
return $hints; return $hints;
} }
private static function _tidy($body) static function _hcard($body, $url)
{ {
if (function_exists('tidy_parse_string')) { // DOMDocument::loadHTML may throw warnings on unrecognized elements.
common_debug("Tidying with extension");
$text = tidy_parse_string($body); $old = error_reporting(error_reporting() & ~E_WARNING);
$text = tidy_clean_repair($text);
return $body; $doc = new DOMDocument();
} else if ($fullpath = self::_findProgram('tidy')) { $doc->loadHTML($body);
common_debug("Tidying with program $fullpath");
$tempfile = tempnam('/tmp', 'snht'); // statusnet hcard tidy error_reporting($old);
file_put_contents($tempfile, $source);
exec("$fullpath -utf8 -indent -asxhtml -numeric -bare -quiet $tempfile", $tidy); $xp = new DOMXPath($doc);
unlink($tempfile);
return implode("\n", $tidy); $hcardNodes = self::_getChildrenByClass($doc->documentElement, 'vcard', $xp);
$hcards = array();
for ($i = 0; $i < $hcardNodes->length; $i++) {
$hcardNode = $hcardNodes->item($i);
$hcard = self::_hcardFromNode($hcardNode, $xp, $url);
$hcards[] = $hcard;
}
$repr = null;
foreach ($hcards as $hcard) {
if (in_array($url, $hcard['url'])) {
$repr = $hcard;
break;
}
}
if (!is_null($repr)) {
return $repr;
} else if (count($hcards) > 0) {
return $hcards[0];
} else { } else {
common_debug("Not tidying.");
return $body;
}
}
private static function _findProgram($name)
{
$path = $_ENV['PATH'];
$parts = explode(':', $path);
foreach ($parts as $part) {
$fullpath = $part . '/' . $name;
if (is_executable($fullpath)) {
return $fullpath;
}
}
return null; return null;
} }
} }
/**
 * Find every descendant of $el whose class attribute contains $cls as a
 * whole, space-separated token.
 *
 * Declared static because it is only ever invoked as
 * self::_getChildrenByClass() from static methods; calling a non-static
 * method that way is an error on current PHP.
 *
 * @param DOMNode  $el  context node to search under
 * @param string   $cls class token to look for (inserted into the XPath
 *                      expression as is, so pass trusted constants only)
 * @param DOMXPath $xp  XPath evaluator bound to $el's document
 *
 * @return DOMNodeList matching nodes in document order
 */
static function _getChildrenByClass($el, $cls, $xp)
{
    // borrowed from hkit. Thanks dudes!
    $qry = ".//*[contains(concat(' ',normalize-space(@class),' '),' $cls ')]";

    $nodes = $xp->query($qry, $el);

    return $nodes;
}
/**
 * Build an hcard array from one vcard element.
 *
 * The result always has 'url' and 'photo' keys holding (possibly empty)
 * lists of absolute URLs. 'nickname', 'note', 'fn', 'n' and 'adr' are set to
 * the text of the first matching node and are absent when no node matches.
 *
 * Values are whitespace-trimmed: markup is usually indented, and untrimmed
 * text would defeat exact URL comparison and leave padded nicknames.
 *
 * Declared static because it is invoked as self::_hcardFromNode() from a
 * static method.
 *
 * @param DOMElement $hcardNode the vcard root element
 * @param DOMXPath   $xp        XPath evaluator bound to the node's document
 * @param string     $base      URL that relative links are resolved against
 *
 * @return array hcard data as described above
 */
static function _hcardFromNode($hcardNode, $xp, $base)
{
    $hcard = array();

    $hcard['url'] = array();

    $urlNodes = self::_getChildrenByClass($hcardNode, 'url', $xp);

    for ($j = 0; $j < $urlNodes->length; $j++) {

        $urlNode = $urlNodes->item($j);

        // Prefer the link target; fall back to the visible text.
        if ($urlNode->hasAttribute('href')) {
            $url = $urlNode->getAttribute('href');
        } else {
            $url = $urlNode->textContent;
        }

        $hcard['url'][] = self::_rel2abs(trim($url), $base);
    }

    $hcard['photo'] = array();

    $photoNodes = self::_getChildrenByClass($hcardNode, 'photo', $xp);

    for ($j = 0; $j < $photoNodes->length; $j++) {

        $photoNode = $photoNodes->item($j);

        // <img src>, then a link to the image, then the visible text.
        if ($photoNode->hasAttribute('src')) {
            $url = $photoNode->getAttribute('src');
        } else if ($photoNode->hasAttribute('href')) {
            $url = $photoNode->getAttribute('href');
        } else {
            $url = $photoNode->textContent;
        }

        $hcard['photo'][] = self::_rel2abs(trim($url), $base);
    }

    $singles = array('nickname', 'note', 'fn', 'n', 'adr');

    foreach ($singles as $single) {

        $nodes = self::_getChildrenByClass($hcardNode, $single, $xp);

        if ($nodes->length > 0) {
            // Only the first occurrence is used.
            $node = $nodes->item(0);
            $hcard[$single] = trim($node->textContent);
        }
    }

    return $hcard;
}
/**
 * Resolve a possibly-relative URL against a base URL.
 *
 * XXX: this is a first pass; we probably need
 * to handle things like ../ and ./ and so on
 *
 * Handled forms:
 *  - absolute URL (has a scheme): returned unchanged
 *  - protocol-relative ("//host/path"): given the base URL's scheme
 *  - host-relative ("/path"): appended to scheme://host[:port] of the base
 *  - anything else: appended to the directory of the base URL's path
 *
 * @param string $rel URL to resolve
 * @param string $wrt base URL to resolve against ("with respect to")
 *
 * @return string|false resolved URL; false when $rel cannot be parsed;
 *                      $rel unchanged when $wrt has no scheme or host
 */
static function _rel2abs($rel, $wrt)
{
    $parts = parse_url($rel);

    if ($parts === false) {
        return false;
    }

    // If it's got a scheme, use it
    if (!empty($parts['scheme'])) {
        return $rel;
    }

    $w = parse_url($wrt);

    // Without a usable base there is nothing to resolve against.
    if ($w === false || empty($w['scheme']) || empty($w['host'])) {
        return $rel;
    }

    // Protocol-relative reference: it names its own host, so only the
    // scheme comes from the base.
    if (!empty($parts['host'])) {
        return $w['scheme'].':'.$rel;
    }

    $base = $w['scheme'].'://'.$w['host'];

    // Keep a non-default port; dropping it would point at another server.
    if (isset($w['port'])) {
        $base .= ':'.$w['port'];
    }

    if ($rel !== '' && $rel[0] == '/') {
        return $base.$rel;
    }

    // Relative to the directory of the base path: drop its last segment.
    $wp = explode('/', isset($w['path']) ? $w['path'] : '/');

    array_pop($wp);

    return $base.implode('/', $wp).'/'.$rel;
}
}