forked from GNUsocial/gnu-social
475 lines
12 KiB
PHP
475 lines
12 KiB
PHP
<?php

/*

hKit Library for PHP5 - a generic library for parsing Microformats
Copyright (C) 2006 Drew McLellan

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

Author
    Drew McLellan - http://allinthehead.com/

Contributors:
    Scott Reynen - http://www.randomchaos.com/

Version 0.5, 22-Jul-2006
    fixed by-ref issue cropping up in PHP 5.0.5
    fixed a bug with a@title
    added support for new fn=n optimisation
    added support for new a.include include-pattern
Version 0.4, 23-Jun-2006
    prevented nested includes from causing infinite loops
    returns false if URL can't be fetched
    added pre-flight check for base support level
    added deduping of once-only classnames
    prevented accumulation of multiple 'value' values
    tuned whitespace handling and treatment of DEL elements
Version 0.3, 21-Jun-2006
    added post-processor callback method into profiles
    fixed minor problems raised by hcard testsuite
    added support for include-pattern
    added support for td@headers pattern
    added implied-n optimization into default hcard profile
Version 0.2, 20-Jun-2006
    added class callback mechanism
    added resolvePath & resolveEmail
    added basic BASE support
Version 0.1.1, 19-Jun-2006 (different timezone, no time machine)
    added external Tidy option
Version 0.1, 20-Jun-2006
    initial release

*/

/**
 * hKit - generic microformat extractor built on SimpleXML.
 *
 * A "profile" file (loaded by loadProfile()) is expected to configure the
 * private properties below (root class, class list, attribute map, callbacks
 * and so on). The parser then walks every element carrying the root class and
 * collects the values of the configured class names into nested arrays.
 */
class hKit
{
    /** @var string How fetched markup is cleaned: 'proxy', 'exec', 'php' or 'none'. */
    public $tidy_mode = 'proxy';

    /** @var string URL prefix of the tidy web service; required only for tidy_mode=proxy. */
    public $tidy_proxy = 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr=';

    /** @var string Writable directory for temp files; required only for tidy_mode=exec. */
    public $tmp_dir = '/path/to/writable/dir/';

    // Profile configuration. These are presumably assigned by the profile file
    // included from loadProfile() - that file is not visible from this class.
    // The list-like ones default to empty arrays so that the array functions
    // used on them below stay valid even if a profile leaves one unset.
    private $root_class = '';
    private $classes = array();
    private $singles = array();
    private $required = array();
    private $att_map = array();
    private $callbacks = array();
    private $processor = '';

    // State of the document currently being parsed.
    private $url = '';
    private $base = '';
    private $doc = '';

    /**
     * Pre-flight check: aborts the script when a required PHP function is missing.
     *
     * Declared as __construct() because a method named after the class is no
     * longer treated as a constructor by PHP 8.
     */
    public function __construct()
    {
        $required = array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string');
        $missing = array();

        foreach ($required as $f) {
            if (!function_exists($f)) {
                $missing[] = $f . '()';
            }
        }

        if ($missing) {
            die('hKit error: these required functions are not available: <strong>' . implode(', ', $missing) . '</strong>');
        }
    }

    /**
     * Fetches a URL and extracts the microformats described by a profile.
     *
     * @param string $profile Profile name; "<profile>.profile.php" is loaded.
     * @param string $url     Document URL; a "#fragment" limits parsing to the
     *                        element with that id.
     * @return array|false Parsed data, or false when an argument is empty or
     *                     the URL could not be fetched.
     */
    public function getByURL($profile = '', $url = '')
    {
        if ($profile == '' || $url == '') {
            return false;
        }

        $this->loadProfile($profile);

        $source = $this->loadURL($url);
        if (!$source) {
            return false;
        }

        $tidy_xhtml = $this->tidyThis($source);

        $fragment = false;
        if (strrchr($url, '#')) {
            // array_pop() takes its argument by reference, so it needs a real variable.
            $url_parts = explode('#', $url);
            $fragment = array_pop($url_parts);
        }

        $doc = $this->loadDoc($tidy_xhtml, $fragment);
        $s = $this->processNodes($doc, $this->classes);
        $s = $this->postProcess($profile, $s);

        return $s;
    }

    /**
     * Extracts the microformats described by a profile from an XML string.
     *
     * @param string $profile   Profile name; "<profile>.profile.php" is loaded.
     * @param string $input_xml Well-formed XML/XHTML markup.
     * @return array|false Parsed data, or false when an argument is empty.
     */
    public function getByString($profile = '', $input_xml = '')
    {
        if ($profile == '' || $input_xml == '') {
            return false;
        }

        $this->loadProfile($profile);

        $doc = $this->loadDoc($input_xml);
        $s = $this->processNodes($doc, $this->classes);
        $s = $this->postProcess($profile, $s);

        return $s;
    }

    /**
     * Collects the values of the given class names below each item.
     *
     * $classes is a flat list of class names; a class name directly followed
     * by an array is treated as a parent whose sub-properties are that array.
     *
     * @param array|SimpleXMLElement $items          Elements to scan.
     * @param array                  $classes        Class names to look for.
     * @param bool                   $allow_includes Whether the include-pattern
     *                                               is followed (disabled while
     *                                               processing an include).
     * @return array A list of per-item arrays when more than one item was
     *               processed, the single item's array when exactly one was,
     *               and an empty array when there were none.
     */
    private function processNodes($items, $classes, $allow_includes = true)
    {
        $out = array();

        foreach ($items as $item) {
            $data = array();

            for ($i = 0; $i < count($classes); $i++) {
                if (is_array($classes[$i])) {
                    // sub-property lists are consumed together with their parent class
                    continue;
                }

                $class = $classes[$i];
                $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $class . " ')]";
                $results = $item->xpath($xpath);

                if (!$results) {
                    continue;
                }

                foreach ($results as $result) {
                    if (isset($classes[$i + 1]) && is_array($classes[$i + 1])) {
                        // parent class: gather its sub-properties
                        $nodes = $this->processNodes($results, $classes[$i + 1]);
                        if (count($nodes) > 0) {
                            $nodes = array_merge(array('text' => $this->getNodeValue($result, $class)), $nodes);
                            $data[$class] = $nodes;
                        } else {
                            $data[$class] = $this->getNodeValue($result, $class);
                        }
                    } elseif (!isset($data[$class])) {
                        // set as normal value
                        $data[$class] = $this->getNodeValue($result, $class);
                    } elseif (is_array($data[$class])) {
                        // is already an array - append
                        $data[$class][] = $this->getNodeValue($result, $class);
                    } elseif ($class == 'value') {
                        // the 'value' of a type/value pattern is concatenated, never listed
                        $data[$class] .= $this->getNodeValue($result, $class);
                    } else {
                        // second occurrence: turn the scalar into a list
                        $data[$class] = array($data[$class], $this->getNodeValue($result, $class));
                    }

                    // td@headers pattern
                    if (strtoupper(dom_import_simplexml($result)->tagName) == "TD" && $result['headers']) {
                        $include_ids = explode(' ', (string) $result['headers']);
                        foreach ($include_ids as $id) {
                            $includes = $this->doc->xpath("//*[@id='$id']/..");
                            if (!$includes) {
                                continue;
                            }
                            foreach ($includes as $include) {
                                $tmp = $this->processNodes($include, $this->classes);
                                if (is_array($tmp)) {
                                    $data = array_merge($data, $tmp);
                                }
                            }
                        }
                    }
                }
            }

            // include-pattern
            if ($allow_includes) {
                $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]";
                $results = $item->xpath($xpath);

                if ($results) {
                    foreach ($results as $result) {
                        $tagName = strtoupper(dom_import_simplexml($result)->tagName);
                        if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href']))
                            && preg_match('/\binclude\b/', (string) $result['class'])) {
                            $att = ($tagName == "OBJECT" ? 'data' : 'href');
                            $id = str_replace('#', '', (string) $result[$att]);
                            $includes = $this->doc->xpath("//*[@id='$id']");
                            if (!$includes) {
                                continue;
                            }
                            foreach ($includes as $include) {
                                // Re-parse the target inside two wrapper elements so that the
                                // foreach in the recursive call yields an element whose
                                // descendants include the target itself.
                                $include = simplexml_load_string('<root1><root2>' . $include->asXML() . '</root2></root1>');
                                if (!$include) {
                                    continue;
                                }
                                // includes are not followed recursively, to avoid infinite loops
                                $tmp = $this->processNodes($include, $this->classes, false);
                                if (is_array($tmp)) {
                                    $data = array_merge($data, $tmp);
                                }
                            }
                        }
                    }
                }
            }

            $out[] = $data;
        }

        if (count($out) > 1) {
            return $out;
        } elseif (isset($data)) {
            return $data;
        }

        return array();
    }

    /**
     * Works out the value a node contributes for a class name.
     *
     * Order of preference: an attribute named by the attribute map, then
     * OBJECT@data, IMG@alt, AREA@alt, @title (except on A), and finally the
     * node's text. The value is then passed through the class's callback, if
     * any, and whitespace is normalised unless the node is a PRE.
     *
     * @param SimpleXMLElement $node      Element to read.
     * @param string           $className Class name being collected.
     * @return string|false The value, or false for DEL elements.
     */
    private function getNodeValue($node, $className)
    {
        $tag_name = strtoupper(dom_import_simplexml($node)->tagName);
        $s = false;

        // ignore DEL tags
        if ($tag_name == 'DEL') {
            return $s;
        }

        // look up att map values; each entry looks like "TAG|attribute"
        if (is_array($this->att_map) && array_key_exists($className, $this->att_map)) {
            foreach ($this->att_map[$className] as $map) {
                if (preg_match('/' . preg_quote($tag_name, '/') . '\|/', $map)) {
                    $map_parts = explode('|', $map);
                    $s = '' . $node[array_pop($map_parts)];
                }
            }
        }

        // if nothing and OBJ, try data.
        if (!$s && $tag_name == 'OBJECT' && $node['data']) {
            $s = '' . $node['data'];
        }

        // if nothing and IMG, try alt.
        if (!$s && $tag_name == 'IMG' && $node['alt']) {
            $s = '' . $node['alt'];
        }

        // if nothing and AREA, try alt.
        if (!$s && $tag_name == 'AREA' && $node['alt']) {
            $s = '' . $node['alt'];
        }

        // if nothing and not A, try title.
        if (!$s && $tag_name != 'A' && $node['title']) {
            $s = '' . $node['title'];
        }

        // if nothing found, go with node text
        if (!$s) {
            $children = $node->xpath('child::node()');
            if (!is_array($children)) {
                $children = array();
            }
            // separator first: the legacy (array, separator) order is rejected by PHP 8
            $s = implode(' ', array_filter($children, array($this, 'filterBlankValues')));
        }

        // callbacks
        if (is_array($this->callbacks) && array_key_exists($className, $this->callbacks)) {
            $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1);
        }

        // trim and remove line breaks
        if ($tag_name != 'PRE') {
            $s = trim(preg_replace('/[\r\n\t]+/', '', $s));
            $s = trim(preg_replace('/(\s{2})+/', ' ', $s));
        }

        return $s;
    }

    /**
     * array_filter() callback: keeps values containing at least one word character.
     *
     * @param mixed $s Value to test; converted to string first.
     * @return int 1 when a word character is present, 0 otherwise.
     */
    private function filterBlankValues($s)
    {
        return preg_match("/\w+/", (string) $s);
    }

    /**
     * Cleans markup into XHTML according to $tidy_mode.
     *
     * 'exec' runs the tidy command-line tool on a temp file in $tmp_dir,
     * 'php' uses the tidy extension, and every other mode (including 'proxy',
     * where loadURL() already fetched through the tidy service) returns the
     * source untouched.
     *
     * @param string $source Raw markup.
     * @return string Cleaned markup.
     */
    private function tidyThis($source)
    {
        switch ($this->tidy_mode) {
            case 'exec':
                $tmp_file = $this->tmp_dir . md5($source) . '.txt';
                file_put_contents($tmp_file, $source);
                $tidy = array();
                // the path comes from configuration, so quote it for the shell
                exec('tidy -utf8 -indent -asxhtml -numeric -bare -quiet ' . escapeshellarg($tmp_file), $tidy);
                unlink($tmp_file);
                return implode("\n", $tidy);

            case 'php':
                // Options mirror the -asxhtml / -numeric flags of the exec mode.
                $tidy = tidy_parse_string(
                    $source,
                    array('output-xhtml' => true, 'numeric-entities' => true),
                    'utf8'
                );
                if (!$tidy) {
                    return $source;
                }
                // tidy_clean_repair() only reports success; the markup itself
                // has to be read back with tidy_get_output().
                tidy_clean_repair($tidy);
                return tidy_get_output($tidy);

            default:
                return $source;
        }
    }

    /**
     * Includes "<profile>.profile.php" (once per request) from within this
     * object's scope, so the profile file can refer to $this.
     *
     * @param string $profile Profile name; must come from trusted code, as it
     *                        is used to build an include path.
     */
    private function loadProfile($profile)
    {
        require_once("$profile.profile.php");
    }

    /**
     * Parses markup, records the document and its base URL, and returns the
     * elements carrying the profile's root class.
     *
     * @param string       $input_xml Well-formed XML/XHTML markup.
     * @param string|false $fragment  Optional element id to restrict the search to.
     * @return array Matching SimpleXMLElement nodes; empty when the markup
     *               cannot be parsed or the fragment id does not exist.
     */
    private function loadDoc($input_xml, $fragment = false)
    {
        $xml = simplexml_load_string($input_xml);
        if (!$xml) {
            return array();
        }

        // the full document stays available for include / td@headers lookups
        $this->doc = $xml;

        if ($fragment) {
            $doc = $xml->xpath("//*[@id='$fragment']");
            if (!$doc || !isset($doc[0])) {
                return array();
            }
            $xml = simplexml_load_string($doc[0]->asXML());
            if (!$xml) {
                return array();
            }
        }

        // base tag
        if (isset($xml->head->base['href']) && (string) $xml->head->base['href'] != '') {
            $this->base = (string) $xml->head->base['href'];
        }

        // xml:base attribute - PITA with SimpleXML
        // [^"]* rather than .* so the match stops at the attribute's closing quote
        if (preg_match('/xml:base="([^"]*)"/', $xml->asXML(), $matches)) {
            $this->base = $matches[1];
        }

        $found = $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]");

        return $found ? $found : array();
    }

    /**
     * Remembers the URL and downloads it, through the tidy proxy when
     * tidy_mode is 'proxy' and a proxy URL is configured.
     *
     * @param string $url Document URL.
     * @return string|false Response body, or false when the fetch failed.
     */
    private function loadURL($url)
    {
        $this->url = $url;

        if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != '') {
            $url = $this->tidy_proxy . $url;
        }

        // failures are reported through the false return value, not a warning
        return @file_get_contents($url);
    }

    /**
     * Normalises the parsed data: wraps a single item into a list, dedupes
     * once-only classes, runs the optional hKit_<profile>_post() function and
     * strips the helper 'text' values.
     *
     * @param string $profile Profile name.
     * @param array  $s       Data returned by processNodes().
     * @return array Final result.
     */
    private function postProcess($profile, $s)
    {
        $required = $this->required;

        // A single item comes back as a bare map; recognise it by the first
        // required key and wrap it so callers always get a list.
        if (is_array($s) && is_array($required) && isset($required[0]) && array_key_exists($required[0], $s)) {
            $s = array($s);
        }

        $s = $this->dedupeSingles($s);

        if (function_exists('hKit_' . $profile . '_post')) {
            $s = call_user_func('hKit_' . $profile . '_post', $s);
        }

        $s = $this->removeTextVals($s);

        return $s;
    }

    /**
     * preg_replace_callback() callback: turns a relative path into an absolute
     * URL, resolved against the document's base (when it is absolute) or its URL.
     *
     * @param array $filepath Regex matches; element 0 is the path to resolve.
     * @return string Absolute URL, or the input unchanged when it already has
     *                a scheme, is a data: URI, or no base URL is known.
     */
    private function resolvePath($filepath)
    {
        $filepath = $filepath[0];

        if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false) {
            return $filepath;
        }

        $base = (string) $this->base;
        $url = $this->url;

        if ($base != '' && strpos($base, '://') !== false) {
            $url = $base;
        }

        $r = parse_url($url);

        // nothing to resolve against (e.g. the document was parsed from a string)
        if (!is_array($r) || !isset($r['scheme']) || !isset($r['host'])) {
            return $filepath;
        }

        $domain = $r['scheme'] . '://' . $r['host'];
        if (isset($r['port'])) {
            $domain .= ':' . $r['port'];
        }

        if (!isset($r['path'])) {
            $r['path'] = '/';
        }
        $path = explode('/', $r['path']);
        $file = explode('/', $filepath);

        if ($file[0] == '') {
            // absolute path
            return '' . $domain . implode('/', $file);
        }

        // relative path: drop the trailing slash and the file name of the base
        if ($path[count($path) - 1] == '') {
            array_pop($path);
        }
        if ($path && strpos($path[count($path) - 1], '.') !== false) {
            array_pop($path);
        }

        // the leading empty entry makes implode() start with a slash
        $new = array('');
        foreach ($file as $segment) {
            if ($segment == '..') {
                array_pop($path);
            } else {
                $new[] = $segment;
            }
        }

        return '' . $domain . implode('/', $path) . implode('/', $new);
    }

    /**
     * preg_replace_callback() callback: reduces a mailto: URL to its address.
     *
     * @param array $v Regex matches; element 0 is the URL.
     * @return string The path component of the URL, or '' when there is none.
     */
    private function resolveEmail($v)
    {
        $parts = parse_url($v[0]);

        return (is_array($parts) && isset($parts['path'])) ? $parts['path'] : '';
    }

    /**
     * Keeps only the first value of classes that may occur once per item.
     *
     * @param array $s List of items.
     * @return array The list with once-only classes reduced to a single value.
     */
    private function dedupeSingles($s)
    {
        $singles = is_array($this->singles) ? $this->singles : array();

        foreach ($s as &$item) {
            if (!is_array($item)) {
                continue;
            }
            foreach ($singles as $classname) {
                if (array_key_exists($classname, $item)
                    && is_array($item[$classname])
                    && isset($item[$classname][0])) {
                    $item[$classname] = $item[$classname][0];
                }
            }
        }
        unset($item); // break the reference left behind by foreach

        return $s;
    }

    /**
     * Recursively blanks every scalar stored under the key 'text' and then
     * drops all empty values.
     *
     * @param array $s Data to clean.
     * @return array Cleaned data (keys are preserved).
     */
    private function removeTextVals($s)
    {
        foreach ($s as $key => &$val) {
            if (is_array($val)) {
                $val = $this->removeTextVals($val);
            } elseif ($key === 'text') {
                // strict comparison, so the numeric key 0 is never mistaken for 'text'
                $val = '';
            }
        }
        unset($val); // break the reference left behind by foreach

        return array_filter($s);
    }
}
|
|
|
|
|
|
?>
|