forked from GNUsocial/gnu-social
		
	
		
			
	
	
		
			475 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
		
		
			
		
	
	
			475 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
<?php

	/*

	hKit Library for PHP5 - a generic library for parsing Microformats
	Copyright (C) 2006  Drew McLellan

	This library is free software; you can redistribute it and/or
	modify it under the terms of the GNU Lesser General Public
	License as published by the Free Software Foundation; either
	version 2.1 of the License, or (at your option) any later version.

	This library is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
	Lesser General Public License for more details.

	You should have received a copy of the GNU Lesser General Public
	License along with this library; if not, write to the Free Software
	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

	Author:
		Drew McLellan - http://allinthehead.com/

	Contributors:
		Scott Reynen - http://www.randomchaos.com/

	Version 0.5, 22-Jul-2006
		fixed by-ref issue cropping up in PHP 5.0.5
		fixed a bug with a@title
		added support for new fn=n optimisation
		added support for new a.include include-pattern
	Version 0.4, 23-Jun-2006
		prevented nested includes from causing infinite loops
		returns false if URL can't be fetched
		added pre-flight check for base support level
		added deduping of once-only classnames
		prevented accumulation of multiple 'value' values
		tuned whitespace handling and treatment of DEL elements
	Version 0.3, 21-Jun-2006
		added post-processor callback method into profiles
		fixed minor problems raised by hcard testsuite
		added support for include-pattern
		added support for td@headers pattern
		added implied-n optimization into default hcard profile
	Version 0.2, 20-Jun-2006
		added class callback mechanism
		added resolvePath & resolveEmail
		added basic BASE support
	Version 0.1.1, 19-Jun-2006 (different timezone, no time machine)
		added external Tidy option
	Version 0.1, 20-Jun-2006
		initial release

	*/

/**
 * hKit - profile-driven microformat parser built on SimpleXML.
 *
 * A "profile" is a PHP file named "<profile>.profile.php" that is included
 * from inside loadProfile(). Nothing else in this class assigns the private
 * configuration properties below, so the profile is presumably expected to
 * populate them through $this (TODO confirm against the shipped profiles).
 *
 * Public entry points are getByURL() and getByString().
 */
class hKit
{
    public $tidy_mode  = 'proxy'; // 'proxy', 'exec', 'php' or 'none'
    public $tidy_proxy = 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr='; // required only for tidy_mode=proxy
    public $tmp_dir    = '/path/to/writable/dir/'; // required only for tidy_mode=exec

    // Profile configuration. The list-like members default to empty arrays so
    // that count()/array_key_exists() stay valid before a profile is loaded.
    private $root_class = '';      // class name marking the root element of one record
    private $classes    = array(); // class names to extract; an array entry nests under the preceding name
    private $singles    = array(); // class names that may occur only once per record
    private $required   = array(); // required class names; only index 0 is consulted (postProcess)
    private $att_map    = array(); // class name => list of 'TAG|attribute' mappings
    private $callbacks  = array(); // class name => callable applied to the extracted value
    private $processor  = '';      // not read anywhere in this class

    private $url  = '';   // URL passed to the last loadURL() call
    private $base = '';   // base URL discovered in the last loaded document
    private $doc  = null; // SimpleXMLElement of the last loaded document

    // Configuration captured the first time each profile file is included,
    // keyed by profile name. Needed because require_once never re-runs a file.
    private static $profile_cache  = array();
    private static $profile_fields = array(
        'root_class', 'classes', 'singles', 'required', 'att_map', 'callbacks', 'processor',
    );

    /**
     * Pre-flight check for the functions this class depends on.
     *
     * Declared before the legacy hKit() method so that PHP 5 does not report
     * a redefined constructor.
     *
     * @throws RuntimeException when a required function is unavailable
     */
    public function __construct()
    {
        $required = array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string');
        $missing  = array();

        foreach ($required as $f) {
            if (!function_exists($f)) {
                $missing[] = $f . '()';
            }
        }

        if ($missing) {
            throw new RuntimeException(
                'hKit error: these required functions are not available: ' . implode(', ', $missing)
            );
        }
    }

    /**
     * Legacy PHP4-style constructor name, kept so explicit calls still work.
     * PHP 8 no longer treats this method as a constructor.
     *
     * @deprecated use "new hKit()"
     */
    public function hKit()
    {
        $this->__construct();
    }

    /**
     * Fetch a URL and parse it with the given profile.
     *
     * A "#fragment" in the URL restricts parsing to the element with that id.
     *
     * @param string $profile profile name (see loadProfile())
     * @param string $url     document URL
     * @return array|false parsed data, or false when an argument is empty or
     *                     the URL could not be fetched
     */
    public function getByURL($profile = '', $url = '')
    {
        if ($profile == '' || $url == '') {
            return false;
        }

        $this->loadProfile($profile);

        $source = $this->loadURL($url);
        if (!$source) {
            return false;
        }

        $tidy_xhtml = $this->tidyThis($source);

        // Text after the last '#'. Computed without array_pop(explode()),
        // which passes a non-variable by reference.
        $fragment = false;
        $hash_pos = strrpos($url, '#');
        if ($hash_pos !== false) {
            $fragment = substr($url, $hash_pos + 1);
        }

        $doc = $this->loadDoc($tidy_xhtml, $fragment);
        $s   = $this->processNodes($doc, $this->classes);

        return $this->postProcess($profile, $s);
    }

    /**
     * Parse an XML/XHTML string with the given profile.
     *
     * @param string $profile   profile name (see loadProfile())
     * @param string $input_xml well-formed markup
     * @return array|false parsed data, or false when an argument is empty
     */
    public function getByString($profile = '', $input_xml = '')
    {
        if ($profile == '' || $input_xml == '') {
            return false;
        }

        $this->loadProfile($profile);

        $doc = $this->loadDoc($input_xml);
        $s   = $this->processNodes($doc, $this->classes);

        return $this->postProcess($profile, $s);
    }

    /**
     * Extract the configured class names from each item.
     *
     * @param array|Traversable $items          SimpleXMLElement nodes to scan
     * @param array             $classes        class names; an array entry holds
     *                                          the nested names of the entry before it
     * @param bool              $allow_includes follow the include-pattern; false
     *                                          while already inside an include
     * @return array one data array when a single item was processed, a list of
     *               data arrays for several items, an empty array for none
     */
    private function processNodes($items, $classes, $allow_includes = true)
    {
        $out = array();

        // A failed xpath()/simplexml_load_string() hands us false.
        if (!is_array($items) && !($items instanceof Traversable)) {
            return $out;
        }

        $class_count = count($classes);

        foreach ($items as $item) {
            $data = array();

            for ($i = 0; $i < $class_count; $i++) {
                if (is_array($classes[$i])) {
                    continue; // nested list, consumed together with the preceding name
                }

                $class  = $classes[$i];
                $nested = (isset($classes[$i + 1]) && is_array($classes[$i + 1])) ? $classes[$i + 1] : null;

                $xpath   = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $class . " ')]";
                $results = $item->xpath($xpath);
                if (!$results) {
                    continue;
                }

                foreach ($results as $result) {
                    if ($nested !== null) {
                        $nodes = $this->processNodes($results, $nested);
                        if (sizeof($nodes) > 0) {
                            $data[$class] = array_merge(
                                array('text' => $this->getNodeValue($result, $class)),
                                $nodes
                            );
                        } else {
                            $data[$class] = $this->getNodeValue($result, $class);
                        }
                    } elseif (!isset($data[$class])) {
                        // first occurrence: plain value
                        $data[$class] = $this->getNodeValue($result, $class);
                    } elseif (is_array($data[$class])) {
                        // already a list: append
                        $data[$class][] = $this->getNodeValue($result, $class);
                    } elseif ($class === 'value') {
                        // the 'value' of a type/value pattern is concatenated, not listed
                        $data[$class] .= $this->getNodeValue($result, $class);
                    } else {
                        // second occurrence: promote to a list
                        $data[$class] = array($data[$class], $this->getNodeValue($result, $class));
                    }

                    // td@headers pattern: merge in data found under the parent
                    // of every element referenced by the headers attribute.
                    if (strtoupper(dom_import_simplexml($result)->tagName) == 'TD' && $result['headers']) {
                        $include_ids = explode(' ', (string) $result['headers']);
                        foreach ($include_ids as $id) {
                            $includes = $this->doc->xpath("//*[@id='$id']/..");
                            if (!$includes) {
                                continue; // unknown id, or an id that breaks the expression
                            }
                            foreach ($includes as $include) {
                                $tmp = $this->processNodes($include, $this->classes);
                                if (is_array($tmp)) {
                                    $data = array_merge($data, $tmp);
                                }
                            }
                        }
                    }
                }
            }

            // include-pattern: <object class="include" data="#id"> or <a class="include" href="#id">
            if ($allow_includes) {
                $xpath   = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]";
                $results = $item->xpath($xpath);

                if ($results) {
                    foreach ($results as $result) {
                        $tagName = strtoupper(dom_import_simplexml($result)->tagName);
                        $is_ref  = ($tagName == 'OBJECT' && $result['data']) || ($tagName == 'A' && $result['href']);

                        if (!$is_ref || !preg_match('/\binclude\b/', (string) $result['class'])) {
                            continue;
                        }

                        $att      = ($tagName == 'OBJECT' ? 'data' : 'href');
                        $id       = str_replace('#', '', (string) $result[$att]);
                        $includes = $this->doc->xpath("//*[@id='$id']");
                        if (!$includes) {
                            continue;
                        }

                        foreach ($includes as $include) {
                            // Two wrapper levels: iterating <root1> yields <root2>
                            // as the item, and the descendant search (".//*") run
                            // on it then also reaches the included element itself.
                            $wrapped = simplexml_load_string('<root1><root2>' . $include->asXML() . '</root2></root1>');
                            $tmp     = $this->processNodes($wrapped, $this->classes, false);
                            if (is_array($tmp)) {
                                $data = array_merge($data, $tmp);
                            }
                        }
                    }
                }
            }

            $out[] = $data;
        }

        $found = count($out);
        if ($found > 1) {
            return $out;
        }

        return ($found === 1) ? $out[0] : array();
    }

    /**
     * Work out the value a node contributes for a class name.
     *
     * Order of precedence: attribute named by the att_map, OBJECT@data,
     * IMG@alt, AREA@alt, @title (except on A), then the node's own content.
     * The per-class callback, if any, is applied afterwards.
     *
     * @param SimpleXMLElement $node
     * @param string           $className
     * @return string|false false for DEL elements
     */
    private function getNodeValue($node, $className)
    {
        $tag_name = strtoupper(dom_import_simplexml($node)->tagName);
        $s        = false;

        // ignore DEL tags
        if ($tag_name == 'DEL') {
            return $s;
        }

        // look up att map values; a later matching entry overrides an earlier one
        if (is_array($this->att_map) && array_key_exists($className, $this->att_map)) {
            $tag_pattern = '/' . preg_quote($tag_name, '/') . '\|/';
            foreach ($this->att_map[$className] as $map) {
                if (preg_match($tag_pattern, $map)) {
                    $parts = explode('|', $map); // array_pop() needs a real variable
                    $s     = '' . $node[array_pop($parts)];
                }
            }
        }

        // if nothing and OBJECT, try data.
        if (!$s && $tag_name == 'OBJECT' && $node['data']) {
            $s = '' . $node['data'];
        }

        // if nothing and IMG or AREA, try alt.
        if (!$s && ($tag_name == 'IMG' || $tag_name == 'AREA') && $node['alt']) {
            $s = '' . $node['alt'];
        }

        // if nothing and not A, try title.
        if (!$s && $tag_name != 'A' && $node['title']) {
            $s = '' . $node['title'];
        }

        // if nothing found, go with node text
        if (!$s) {
            $children = $node->xpath('child::node()');
            if (!is_array($children)) {
                $children = array();
            }
            // Separator first: the (array, separator) order was removed in PHP 8.
            $s = implode(' ', array_filter($children, array($this, 'filterBlankValues')));
        }

        // callbacks
        if (is_array($this->callbacks) && array_key_exists($className, $this->callbacks)) {
            $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1);
        }

        // trim and remove line breaks
        if ($tag_name != 'PRE') {
            $s = trim(preg_replace('/[\r\n\t]+/', '', $s));
            $s = trim(preg_replace('/(\s{2})+/', ' ', $s));
        }

        return $s;
    }

    /**
     * array_filter() callback: keep values containing at least one word character.
     *
     * @param mixed $s value castable to string
     * @return int 1 to keep, 0 to drop
     */
    private function filterBlankValues($s)
    {
        return preg_match("/\w+/", (string) $s);
    }

    /**
     * Clean up fetched markup according to $tidy_mode.
     *
     * 'exec' shells out to the tidy binary through a temp file in $tmp_dir,
     * 'php' uses the tidy extension, anything else returns the input as is
     * ('proxy' is applied earlier, in loadURL()).
     *
     * @param string $source
     * @return string
     */
    private function tidyThis($source)
    {
        switch ($this->tidy_mode) {
            case 'exec':
                $tmp_file = $this->tmp_dir . md5($source) . '.txt';
                if (file_put_contents($tmp_file, $source) === false) {
                    return $source; // temp dir not writable: fall back to untidied input
                }
                $tidy = array();
                // Escape the path: $tmp_dir is configuration and may contain shell metacharacters.
                exec('tidy -utf8 -indent -asxhtml -numeric -bare -quiet ' . escapeshellarg($tmp_file), $tidy);
                unlink($tmp_file);
                return implode("\n", $tidy);

            case 'php':
                $tidy = tidy_parse_string($source);
                if (!$tidy) {
                    return $source;
                }
                // tidy_clean_repair() only reports success; the markup comes from tidy_get_output().
                tidy_clean_repair($tidy);
                return tidy_get_output($tidy);

            default:
                return $source;
        }
    }

    /**
     * Load "<profile>.profile.php" and apply its configuration.
     *
     * require_once executes a file only once per request, so the resulting
     * configuration is snapshotted on first load and re-applied on later
     * calls; otherwise a second hKit instance, or a switch back to an earlier
     * profile, would silently keep whatever configuration it already had.
     *
     * NOTE(review): the profile name becomes part of an include path. Never
     * pass untrusted input here.
     *
     * @param string $profile
     * @return void
     */
    private function loadProfile($profile)
    {
        if (isset(self::$profile_cache[$profile])) {
            foreach (self::$profile_cache[$profile] as $field => $value) {
                $this->$field = $value;
            }

            // Callbacks captured as array($instance, 'method') point at the
            // instance that first loaded the profile; re-point them at this one
            // so they read this instance's url/base.
            if (is_array($this->callbacks)) {
                foreach ($this->callbacks as $class => $callback) {
                    if (is_array($callback) && isset($callback[0]) && $callback[0] instanceof self) {
                        $callback[0]             = $this;
                        $this->callbacks[$class] = $callback;
                    }
                }
            }
            return;
        }

        // require_once yields true when the file had already been included.
        $first_load = require_once("$profile.profile.php");

        if ($first_load !== true) {
            $snapshot = array();
            foreach (self::$profile_fields as $field) {
                $snapshot[$field] = $this->$field;
            }
            self::$profile_cache[$profile] = $snapshot;
        }
    }

    /**
     * Parse markup, remember the document and its base URL, and return the
     * elements carrying the profile's root class.
     *
     * @param string       $input_xml well-formed markup
     * @param string|false $fragment  id of the element to restrict parsing to
     * @return array root elements; empty when the markup cannot be parsed or
     *               the requested fragment does not exist
     */
    private function loadDoc($input_xml, $fragment = false)
    {
        $this->base = ''; // do not carry a base URL over from a previous document

        $xml = simplexml_load_string($input_xml);
        if ($xml === false) {
            return array();
        }

        $this->doc = $xml;

        if ($fragment) {
            $matches = $xml->xpath("//*[@id='$fragment']");
            if (!$matches || !isset($matches[0])) {
                return array();
            }
            $xml = simplexml_load_string($matches[0]->asXML());
            if ($xml === false) {
                return array();
            }
        }

        // base tag
        if (isset($xml->head->base['href'])) {
            $href = (string) $xml->head->base['href'];
            if ($href != '') {
                $this->base = $href;
            }
        }

        // xml:base attribute - PITA with SimpleXML. [^"]* stops at the closing
        // quote; a greedy .* would run on to the last quote on the line.
        if (preg_match('/xml:base="([^"]*)"/', $xml->asXML(), $m)) {
            $this->base = $m[1];
        }

        $roots = $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]");

        return is_array($roots) ? $roots : array();
    }

    /**
     * Fetch a URL, going through the tidy proxy when tidy_mode is 'proxy'.
     *
     * @param string $url
     * @return string|false body, or false on failure (warnings are suppressed
     *                      on purpose; getByURL() checks the result)
     */
    private function loadURL($url)
    {
        $this->url = $url;

        if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != '') {
            $url = $this->tidy_proxy . $url;
        }

        return @file_get_contents($url);
    }

    /**
     * Normalise parsed data: wrap a lone record in a list, dedupe once-only
     * classes, run the profile's hKit_<profile>_post() hook when such a
     * function exists, then strip the 'text' helper entries.
     *
     * @param string $profile
     * @param array  $s
     * @return array
     */
    private function postProcess($profile, $s)
    {
        $required = $this->required;

        // A single record is recognised by its first required key.
        if (is_array($s) && is_array($required) && isset($required[0])
                && array_key_exists($required[0], $s)) {
            $s = array($s);
        }

        $s = $this->dedupeSingles($s);

        if (function_exists('hKit_' . $profile . '_post')) {
            $s = call_user_func('hKit_' . $profile . '_post', $s);
        }

        return $this->removeTextVals($s);
    }

    /**
     * preg_replace_callback() handler: resolve a path against the document's
     * base URL (or the fetched URL when no absolute base was found).
     *
     * @param array $filepath match array; index 0 holds the path
     * @return string absolute URL, or the input unchanged when it is already
     *                absolute or there is no usable base
     */
    private function resolvePath($filepath)
    {
        $filepath = $filepath[0];

        // already absolute, or a data: URI
        if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false) {
            return $filepath;
        }

        $url = $this->url;
        if ($this->base != '' && strpos($this->base, '://') !== false) {
            $url = $this->base;
        }

        $r = parse_url((string) $url);
        if (!is_array($r) || !isset($r['scheme'], $r['host'])) {
            return $filepath; // e.g. getByString() input without a base
        }

        // protocol-relative reference: only the scheme is inherited
        if (strpos($filepath, '//') === 0) {
            return $r['scheme'] . ':' . $filepath;
        }

        $domain = $r['scheme'] . '://' . $r['host'];
        if (isset($r['port'])) {
            $domain .= ':' . $r['port'];
        }

        if (!isset($r['path'])) {
            $r['path'] = '/';
        }
        $path = explode('/', $r['path']);
        $file = explode('/', $filepath);

        if ($file[0] == '') {
            // absolute path
            return $domain . $filepath;
        }

        // relative path: drop a trailing empty segment, then a trailing file name
        if ($path && end($path) == '') {
            array_pop($path);
        }
        if ($path && strpos(end($path), '.') !== false) {
            array_pop($path);
        }

        $new = array('');
        foreach ($file as $segment) {
            if ($segment == '..') {
                // step out of what the reference itself added before climbing the base path
                if (count($new) > 1) {
                    array_pop($new);
                } else {
                    array_pop($path);
                }
            } elseif ($segment != '.') {
                $new[] = $segment;
            }
        }

        return $domain . implode('/', $path) . implode('/', $new);
    }

    /**
     * preg_replace_callback() handler: reduce a mailto: URI to its address.
     *
     * @param array $v match array; index 0 holds the URI
     * @return string path component as reported by parse_url(), or '' when
     *                there is none
     */
    private function resolveEmail($v)
    {
        $parts = parse_url($v[0]);

        return (is_array($parts) && isset($parts['path'])) ? $parts['path'] : '';
    }

    /**
     * Reduce once-only classes that were collected as a list to their first value.
     *
     * @param array $s list of records
     * @return array
     */
    private function dedupeSingles($s)
    {
        $singles = is_array($this->singles) ? $this->singles : array();

        foreach ($s as &$item) {
            if (!is_array($item)) {
                continue; // scalar entry: nothing to dedupe
            }
            foreach ($singles as $classname) {
                if (array_key_exists($classname, $item) && is_array($item[$classname])
                        && isset($item[$classname][0])) {
                    $item[$classname] = $item[$classname][0];
                }
            }
        }
        unset($item); // break the reference left behind by the loop

        return $s;
    }

    /**
     * Recursively blank every scalar stored under the key 'text', then drop
     * all empty values (keys are preserved).
     *
     * @param array $s
     * @return array
     */
    private function removeTextVals($s)
    {
        foreach ($s as $key => &$val) {
            if (is_array($val)) {
                $val = $this->removeTextVals($val);
            } elseif ($key === 'text') {
                // strict comparison: integer key 0 must not be mistaken for 'text'
                $val = '';
            }
        }
        unset($val);

        return array_filter($s);
    }
}

?>