<?php
	// Forked from GNUsocial/gnu-social.

	/*

	hKit Library for PHP5 - a generic library for parsing Microformats
	Copyright (C) 2006  Drew McLellan

	This library is free software; you can redistribute it and/or
	modify it under the terms of the GNU Lesser General Public
	License as published by the Free Software Foundation; either
	version 2.1 of the License, or (at your option) any later version.

	This library is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
	Lesser General Public License for more details.

	You should have received a copy of the GNU Lesser General Public
	License along with this library; if not, write to the Free Software
	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

	Author:
		Drew McLellan - http://allinthehead.com/

	Contributors:
		Scott Reynen - http://www.randomchaos.com/

	Version 0.5, 22-Jul-2006
		fixed by-ref issue cropping up in PHP 5.0.5
		fixed a bug with a@title
		added support for new fn=n optimisation
		added support for new a.include include-pattern
	Version 0.4, 23-Jun-2006
		prevented nested includes from causing infinite loops
		returns false if URL can't be fetched
		added pre-flight check for base support level
		added deduping of once-only classnames
		prevented accumulation of multiple 'value' values
		tuned whitespace handling and treatment of DEL elements
	Version 0.3, 21-Jun-2006
		added post-processor callback method into profiles
		fixed minor problems raised by hcard testsuite
		added support for include-pattern
		added support for td@headers pattern
		added implied-n optimization into default hcard profile
	Version 0.2, 20-Jun-2006
		added class callback mechanism
		added resolvePath & resolveEmail
		added basic BASE support
	Version 0.1.1, 19-Jun-2006 (different timezone, no time machine)
		added external Tidy option
	Version 0.1, 20-Jun-2006
		initial release

	*/

	/**
	 * Generic microformat parser built on SimpleXML.
	 *
	 * A "profile" file ("<profile>.profile.php") is included from inside
	 * loadProfile(), so it runs with $this in scope and is expected to fill in
	 * the private configuration properties below (root class, class tree,
	 * attribute map, callbacks ...). The public entry points are getByURL()
	 * and getByString().
	 */
	class hKit
	{

		public $tidy_mode	= 'proxy'; // 'proxy', 'exec', 'php' or 'none'
		public $tidy_proxy	= 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr='; // required only for tidy_mode=proxy
		public $tmp_dir		= '/path/to/writable/dir/'; // required only for tidy_mode=exec

		// Profile configuration; assigned by the included profile file.
		private $root_class = '';
		private $classes	= array();
		private $singles	= array();
		private $required	= array();
		private $att_map	= array();
		private $callbacks	= array();
		private $processor 	= '';

		// State of the document currently being parsed.
		private $url		= '';
		private $base 		= '';
		private $doc		= '';

		// Snapshot of the configuration each profile produced on first load,
		// keyed by profile name (see loadProfile()).
		private static $profile_cache = array();


		/**
		 * Pre-flight check: verifies the functions the parser depends on exist.
		 *
		 * Declared as __construct() because a method merely named after the
		 * class is no longer treated as a constructor by PHP 8.
		 *
		 * @throws RuntimeException when a required function is unavailable
		 */
		public function __construct()
		{
			$required	= array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string');
			$missing	= array();

			foreach ($required as $f){
				if (!function_exists($f)){
					$missing[] 	= $f . '()';
				}
			}

			if ($missing){
				throw new RuntimeException('hKit error: these required functions are not available: ' . implode(', ', $missing));
			}
		}


		/**
		 * Fetches a URL and parses it with the given profile.
		 *
		 * If the URL carries a "#fragment", only the element with that id is
		 * searched for microformats.
		 *
		 * @param string $profile profile name (path prefix of "<profile>.profile.php")
		 * @param string $url     address of the document
		 * @return array|false parsed data, or false when an argument is empty
		 *                     or the document could not be fetched
		 */
		public function getByURL($profile='', $url='')
		{
			if ($profile == '' || $url == '') return false;

			$this->loadProfile($profile);

			$source		= $this->loadURL($url);
			if (!$source) return false;

			$tidy_xhtml	= $this->tidyThis($source);

			// Text after the last '#', if any.
			$fragment	= false;
			$hash		= strrchr($url, '#');
			if ($hash !== false) $fragment = substr($hash, 1);

			$doc		= $this->loadDoc($tidy_xhtml, $fragment);
			$s			= $this->processNodes($doc, $this->classes);

			return $this->postProcess($profile, $s);
		}

		/**
		 * Parses an XML/XHTML string with the given profile.
		 *
		 * @param string $profile   profile name (path prefix of "<profile>.profile.php")
		 * @param string $input_xml well-formed markup to parse
		 * @return array|false parsed data, or false when an argument is empty
		 */
		public function getByString($profile='', $input_xml='')
		{
			if ($profile=='' || $input_xml == '') return false;

			$this->loadProfile($profile);

			$doc	= $this->loadDoc($input_xml);
			$s		= $this->processNodes($doc, $this->classes);

			return $this->postProcess($profile, $s);
		}

		/**
		 * Extracts the values of the given class names from each item.
		 *
		 * $classes is a flat list of class names; an array entry directly
		 * after a name lists the child classes of that name.
		 *
		 * @param array|SimpleXMLElement $items   nodes to scan (iterating an
		 *                                        element visits its children)
		 * @param array                  $classes class tree to look for
		 * @param bool                   $allow_includes follow the include-pattern
		 * @return array a list of per-item arrays when more than one item was
		 *               processed, the single item's array otherwise, or an
		 *               empty array when there were no items
		 */
		private function processNodes($items, $classes, $allow_includes=true)
		{
			$out			= array();
			$class_count	= is_array($classes) ? count($classes) : 0;

			foreach ($items as $item){
				$data	= array();

				for ($i = 0; $i < $class_count; $i++){

					// Array entries are child lists; they are consumed together
					// with the class name preceding them.
					if (is_array($classes[$i])) continue;

					$class		= $classes[$i];
					$results	= $item->xpath(".//*[contains(concat(' ',normalize-space(@class),' '),' " . $class . " ')]");
					if (!$results) continue;

					// Child classes are resolved for ALL matches in one call, so
					// several matches come back as a list. The call does not
					// depend on the match being visited, hence it is made once.
					$nodes	= null;
					if (isset($classes[$i+1]) && is_array($classes[$i+1])){
						$nodes	= $this->processNodes($results, $classes[$i+1]);
					}

					foreach ($results as $result){
						$value	= $this->getNodeValue($result, $class);

						if ($nodes !== null){
							if (count($nodes) > 0){
								$data[$class]	= array_merge(array('text' => $value), $nodes);
							}else{
								$data[$class]	= $value;
							}
						}else if (!isset($data[$class])){
							// first occurrence: plain value
							$data[$class]	= $value;
						}else if (is_array($data[$class])){
							// already a list: append
							$data[$class][]	= $value;
						}else if ($class == 'value'){
							// the 'value' parts of a type/value pattern are concatenated
							$data[$class]	.= $value;
						}else{
							// second occurrence: turn into a list
							$data[$class]	= array($data[$class], $value);
						}

						// td@headers pattern
						if (strtoupper(dom_import_simplexml($result)->tagName) == "TD" && $result['headers']){
							$data	= $this->mergeHeaderCells($result, $data);
						}
					}
				}

				// include-pattern
				if ($allow_includes){
					$data	= $this->mergeIncludes($item, $data);
				}

				$out[]	= $data;
			}

			if (count($out) > 1) return $out;
			if (count($out) == 1) return $out[0];
			return array();
		}

		/**
		 * td@headers pattern: for every id listed in the cell's "headers"
		 * attribute, processes the parent of the element with that id and
		 * merges the result into $data.
		 *
		 * @param SimpleXMLElement $cell TD element carrying a headers attribute
		 * @param array            $data values collected so far
		 * @return array $data with the referenced values merged in
		 */
		private function mergeHeaderCells($cell, $data)
		{
			foreach (explode(' ', (string) $cell['headers']) as $id){
				$includes	= $this->doc->xpath('//*[@id=' . $this->xpathLiteral($id) . ']/..');
				if (!$includes) continue;

				foreach ($includes as $include){
					$tmp	= $this->processNodes($include, $this->classes);
					if (is_array($tmp)) $data = array_merge($data, $tmp);
				}
			}

			return $data;
		}

		/**
		 * include-pattern: resolves <object class="include" data="#id"> and
		 * <a class="include" href="#id"> inside $item and merges the values
		 * found in the referenced element into $data.
		 *
		 * @param SimpleXMLElement $item element being processed
		 * @param array            $data values collected so far
		 * @return array $data with the included values merged in
		 */
		private function mergeIncludes($item, $data)
		{
			$results	= $item->xpath(".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]");
			if (!$results) return $data;

			foreach ($results as $result){
				$tagName	= strtoupper(dom_import_simplexml($result)->tagName);

				if ($tagName == "OBJECT" && $result['data']){
					$att	= 'data';
				}else if ($tagName == "A" && $result['href']){
					$att	= 'href';
				}else{
					continue;
				}

				$id			= str_replace('#', '', (string) $result[$att]);
				$includes	= $this->doc->xpath('//*[@id=' . $this->xpathLiteral($id) . ']');
				if (!$includes) continue;

				foreach ($includes as $include){
					// Two wrapper levels: processNodes() iterates the children of
					// what it is given and then searches their descendants, so the
					// included element itself must sit two levels down to be found.
					$wrapped	= simplexml_load_string('<root1><root2>'.$include->asXML().'</root2></root1>');
					if ($wrapped === false) continue;

					// nested includes are disabled to avoid infinite loops
					$tmp	= $this->processNodes($wrapped, $this->classes, false);
					if (is_array($tmp)) $data = array_merge($data, $tmp);
				}
			}

			return $data;
		}

		/**
		 * Quotes a value for use as an XPath 1.0 string literal.
		 *
		 * XPath 1.0 has no escape character, so a value containing both quote
		 * kinds is assembled with concat().
		 *
		 * @param string $value raw value
		 * @return string XPath expression evaluating to $value
		 */
		private function xpathLiteral($value)
		{
			$value	= (string) $value;

			if (strpos($value, "'") === false) return "'" . $value . "'";
			if (strpos($value, '"') === false) return '"' . $value . '"';

			return "concat('" . str_replace("'", "',\"'\",'", $value) . "')";
		}


		/**
		 * Determines the value a node contributes for a class name.
		 *
		 * Order of precedence: attribute named by the profile's att_map for
		 * this class and tag, OBJECT@data, IMG@alt, AREA@alt, @title (not on
		 * A), then the node's text. The profile callback for the class (if
		 * any) is applied, and whitespace is normalised except inside PRE.
		 *
		 * @param SimpleXMLElement $node      matched element
		 * @param string           $className class the element was matched for
		 * @return string|false the value, or false for DEL elements
		 */
		private function getNodeValue($node, $className)
		{
			$tag_name	= strtoupper(dom_import_simplexml($node)->tagName);

			// ignore DEL tags
			if ($tag_name == 'DEL') return false;

			$s	= false;

			// look up att map values ("TAG|attribute"); the tag has to match a
			// whole segment so that e.g. A does not pick up an AREA mapping
			if (is_array($this->att_map) && array_key_exists($className, $this->att_map)){
				$tag_pattern	= '/(^|\|)' . preg_quote($tag_name, '/') . '\|/';

				foreach ($this->att_map[$className] as $map){
					if (preg_match($tag_pattern, $map)){
						$parts	= explode('|', $map);
						$s		= ''.$node[array_pop($parts)];
					}
				}
			}

			// if nothing and OBJ, try data.
			if (!$s && $tag_name=='OBJECT' && $node['data'])	$s	= ''.$node['data'];

			// if nothing and IMG, try alt.
			if (!$s && $tag_name=='IMG' && $node['alt'])	$s	= ''.$node['alt'];

			// if nothing and AREA, try alt.
			if (!$s && $tag_name=='AREA' && $node['alt'])	$s	= ''.$node['alt'];

			// if nothing and not A, try title.
			if (!$s && $tag_name!='A' && $node['title'])	$s	= ''.$node['title'];

			// if nothing found, go with node text
			if (!$s){
				$children	= $node->xpath('child::node()');
				if (!is_array($children)) $children = array();
				$s	= implode(' ', array_filter($children, array($this, "filterBlankValues")));
			}

			// callbacks
			if (is_array($this->callbacks) && array_key_exists($className, $this->callbacks)){
				$s	= preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1);
			}

			// trim and remove line breaks
			if ($tag_name != 'PRE'){
				$s	= trim(preg_replace('/[\r\n\t]+/', '', $s));
				$s	= trim(preg_replace('/(\s{2})+/', ' ', $s));
			}

			return $s;
		}

		/**
		 * array_filter() callback: keeps values containing a word character.
		 *
		 * @param mixed $s value convertible to string
		 * @return int|false result of preg_match()
		 */
		private function filterBlankValues($s){
			return preg_match("/\w+/", (string) $s);
		}


		/**
		 * Runs the fetched source through HTML Tidy according to $tidy_mode.
		 *
		 * 'exec' shells out to the tidy binary via a temporary file in
		 * $tmp_dir, 'php' uses the tidy extension; any other mode returns the
		 * source untouched (in 'proxy' mode loadURL() already fetched the
		 * document through the proxy).
		 *
		 * @param string $source fetched markup
		 * @return string markup to hand to loadDoc()
		 */
		private function tidyThis($source)
		{
			switch ( $this->tidy_mode )
			{
				case 'exec':
					$tmp_file	= $this->tmp_dir.md5($source).'.txt';
					if (file_put_contents($tmp_file, $source) === false){
						// temp dir not writable: nothing to feed to tidy
						return $source;
					}
					$tidy	= array();
					exec('tidy -utf8 -indent -asxhtml -numeric -bare -quiet ' . escapeshellarg($tmp_file), $tidy);
					unlink($tmp_file);
					return implode("\n", $tidy);

				case 'php':
					// same options as the command line used in 'exec' mode
					$config	= array(
						'output-xhtml'		=> true,
						'numeric-entities'	=> true,
						'indent'			=> true,
						'bare'				=> true
					);
					$tidy 	= tidy_parse_string($source, $config, 'utf8');
					if (!$tidy) return $source;
					// tidy_clean_repair() only reports success; the markup has to
					// be read back separately
					tidy_clean_repair($tidy);
					return tidy_get_output($tidy);

				default:
					return $source;
			}
		}


		/**
		 * Loads a profile's configuration into this instance.
		 *
		 * The profile file is included at most once per request. The values it
		 * assigned are therefore cached on first load and copied back on later
		 * calls, so a second instance, or an instance switching back to an
		 * earlier profile, is configured as well.
		 *
		 * NOTE(review): $profile becomes part of an include path; callers must
		 * not pass untrusted input.
		 *
		 * @param string $profile profile name (path prefix of "<profile>.profile.php")
		 * @return void
		 */
		private function loadProfile($profile)
		{
			if (isset(self::$profile_cache[$profile])){
				foreach (self::$profile_cache[$profile] as $field => $value){
					$this->$field	= $value;
				}

				// callbacks bound to the instance that loaded the profile must
				// point at this instance, which holds the current url/base
				if (is_array($this->callbacks)){
					foreach ($this->callbacks as $name => $callback){
						if (is_array($callback) && isset($callback[0]) && $callback[0] instanceof self){
							$callback[0]			= $this;
							$this->callbacks[$name]	= $callback;
						}
					}
				}
				return;
			}

			require_once("$profile.profile.php");

			self::$profile_cache[$profile]	= array(
				'root_class'	=> $this->root_class,
				'classes'		=> $this->classes,
				'singles'		=> $this->singles,
				'required'		=> $this->required,
				'att_map'		=> $this->att_map,
				'callbacks'		=> $this->callbacks,
				'processor'		=> $this->processor
			);
		}


		/**
		 * Parses the markup and returns the profile's root elements.
		 *
		 * Also records the parsed document in $this->doc and the document's
		 * base address (base@href or xml:base) in $this->base.
		 *
		 * @param string       $input_xml well-formed markup
		 * @param string|false $fragment  id of the element to restrict the search to
		 * @return array elements carrying the profile's root class; empty when
		 *               the markup cannot be parsed or the fragment is missing
		 */
		private function loadDoc($input_xml, $fragment=false)
		{
			// do not carry the base of a previously parsed document over
			$this->base	= '';

			$xml 		= simplexml_load_string($input_xml);
			$this->doc	= $xml;

			if ($xml === false) return array();

			if ($fragment){
				$matches	= $xml->xpath('//*[@id=' . $this->xpathLiteral($fragment) . ']');
				if (!$matches) return array();

				$xml	= simplexml_load_string($matches[0]->asXML());
				if ($xml === false) return array();
			}

			// base tag
			$base_href	= $xml->head->base['href'];
			if ($base_href) $this->base = (string) $base_href;

			// xml:base attribute - PITA with SimpleXML
			if (preg_match('/xml:base="([^"]*)"/', $xml->asXML(), $matches)){
				$this->base	= $matches[1];
			}

			$roots	= $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]");

			return $roots ? $roots : array();
		}


		/**
		 * Fetches the document, through the tidy proxy in 'proxy' mode.
		 *
		 * @param string $url address of the document
		 * @return string|false the response body, or false on failure
		 */
		private function loadURL($url)
		{
			$this->url	= $url;

			if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){
				if (strpos($this->tidy_proxy, '?') !== false){
					// The address becomes a query-string value and has to be
					// encoded; the fragment is dropped first because it is never
					// part of a request.
					$hash_pos	= strpos($url, '#');
					$target		= ($hash_pos === false ? $url : substr($url, 0, $hash_pos));
					$url		= $this->tidy_proxy . urlencode($target);
				}else{
					$url		= $this->tidy_proxy . $url;
				}
			}

			// failure is reported through the false return value, which
			// getByURL() checks; the warning itself is deliberately suppressed
			return @file_get_contents($url);
		}


		/**
		 * Normalises the parsed data: wraps a single result in a list, reduces
		 * once-only classes to one value, runs the optional profile hook
		 * "hKit_<profile>_post" and strips the internal 'text' entries.
		 *
		 * @param string $profile profile name
		 * @param array  $s       output of processNodes()
		 * @return array list of parsed items
		 */
		private function postProcess($profile, $s)
		{
			$required	= $this->required;

			// a lone item (recognised by its first required key) becomes a
			// one-element list
			if (is_array($s) && is_array($required) && isset($required[0]) && array_key_exists($required[0], $s)){
				$s	= array($s);
			}

			$s	= $this->dedupeSingles($s);

			if (function_exists('hKit_'.$profile.'_post')){
				$s		= call_user_func('hKit_'.$profile.'_post', $s);
			}

			return $this->removeTextVals($s);
		}


		/**
		 * preg_replace_callback() handler resolving a path against the
		 * document's base address (or its URL when no absolute base is set).
		 *
		 * @param array $filepath match array; element 0 is the path
		 * @return string absolute address, or the path unchanged when it is
		 *                already absolute or there is nothing to resolve against
		 */
		private function resolvePath($filepath)
		{
			$filepath	= $filepath[0];

			if ($filepath === '') return '';

			// already absolute
			if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){
				return $filepath;
			}

			$base 	= (string) $this->base;
			$url	= $this->url;

			if ($base != '' &&  strpos($base, '://') !== false)
				$url	= $base;

			$r		= parse_url($url);

			// no usable absolute address (e.g. getByString() without a base)
			if (!is_array($r) || !isset($r['scheme'], $r['host'])) return $filepath;

			// protocol-relative reference
			if (strpos($filepath, '//') === 0) return $r['scheme'] . ':' . $filepath;

			$domain	= $r['scheme'] . '://' . $r['host'];
			if (isset($r['port'])) $domain .= ':' . $r['port'];

			// path relative to the host root
			if ($filepath[0] == '/') return $domain . $filepath;

			// relative path: start from the directory of the base address
			$path	= explode('/', isset($r['path']) ? $r['path'] : '/');

			if ($path && $path[count($path)-1] == '') array_pop($path);
			// a last segment containing a dot is taken to be a file name
			if ($path && strpos($path[count($path)-1], '.') !== false) array_pop($path);

			$new	= array('');

			foreach (explode('/', $filepath) as $segment){
				if ($segment == '..'){
					array_pop($path);
				}else if ($segment != '.'){
					$new[]	= $segment;
				}
			}

			return $domain . implode('/', $path) . implode('/', $new);
		}

		/**
		 * preg_replace_callback() handler reducing an address such as
		 * "mailto:user@example.com?subject=x" to its path part.
		 *
		 * @param array $v match array; element 0 is the address
		 * @return string the path component, or '' when there is none
		 */
		private function resolveEmail($v)
		{
			$parts	= parse_url($v[0]);

			return (is_array($parts) && isset($parts['path'])) ? $parts['path'] : '';
		}


		/**
		 * Reduces the profile's once-only classes to their first value when
		 * several were collected.
		 *
		 * @param array $s list of parsed items
		 * @return array the list with once-only classes de-duplicated
		 */
		private function dedupeSingles($s)
		{
			$singles	= is_array($this->singles) ? $this->singles : array();

			foreach ($s as $index => $item){
				if (!is_array($item)) continue;

				foreach ($singles as $classname){
					if (array_key_exists($classname, $item) && is_array($item[$classname]) && isset($item[$classname][0])){
						$s[$index][$classname]	= $item[$classname][0];
					}
				}
			}

			return $s;
		}

		/**
		 * Recursively blanks the scalar 'text' entries added by processNodes()
		 * and drops every empty value.
		 *
		 * @param array $s parsed data
		 * @return array the data without 'text' entries and empty values
		 */
		private function removeTextVals($s)
		{
			foreach ($s as $key => $val){
				if (is_array($val)){
					$s[$key]	= $this->removeTextVals($val);
				}else if ($key === 'text'){
					// strict comparison: the numeric key 0 must not match
					$s[$key]	= '';
				}
			}

			return array_filter($s);
		}

	}

?>