' . implode(', ', $missing) . ''); }

    /**
     * Fetch a document by URL and extract the structures described by a profile.
     *
     * @param string $profile Profile name; handed to loadProfile() and postProcess().
     * @param string $url     Address to fetch. Text after the last '#' is treated
     *                        as an element id that limits parsing to that element.
     * @return array|false    Extracted data, or false when an argument is empty
     *                        or nothing could be read from the URL.
     */
    public function getByURL($profile='', $url='')
    {
        if ($profile == '' || $url == '') return false;

        $this->loadProfile($profile);

        $source = $this->loadURL($url);
        if (!$source) return false;

        $tidy_xhtml = $this->tidyThis($source);

        // array_pop() takes its argument by reference, so the exploded parts
        // have to live in a real variable first.
        $fragment = false;
        if (strrchr($url, '#')){
            $url_parts = explode('#', $url);
            $fragment  = array_pop($url_parts);
        }

        $doc = $this->loadDoc($tidy_xhtml, $fragment);
        $s   = $this->processNodes($doc, $this->classes);
        $s   = $this->postProcess($profile, $s);

        return $s;
    }

    /**
     * Extract the structures described by a profile from an XML string.
     *
     * Unlike getByURL() the input is not passed through tidyThis().
     *
     * @param string $profile   Profile name; handed to loadProfile() and postProcess().
     * @param string $input_xml Markup to parse with SimpleXML.
     * @return array|false      Extracted data, or false when an argument is empty.
     */
    public function getByString($profile='', $input_xml='')
    {
        if ($profile == '' || $input_xml == '') return false;

        $this->loadProfile($profile);

        $doc = $this->loadDoc($input_xml);
        $s   = $this->processNodes($doc, $this->classes);
        $s   = $this->postProcess($profile, $s);

        return $s;
    }

    /**
     * Walk a set of nodes and collect the value of every descendant whose
     * class attribute contains one of the given class names.
     *
     * $classes is a flat list of class names; an entry that is itself an array
     * is treated as the list of sub-properties of the entry just before it.
     *
     * @param array|SimpleXMLElement $items          Nodes to iterate over.
     * @param array                  $classes        Class names (and nested lists) to look for.
     * @param bool                   $allow_includes Whether to follow the include-pattern;
     *                                               set to false while processing an include.
     * @return array One entry per item when more than one item was processed,
     *               otherwise the data of the single (last) item, or an empty array.
     */
    private function processNodes($items, $classes, $allow_includes=true)
    {
        $out = array();

        foreach ($items as $item){
            $data = array();

            // NOTE(review): in the copy this was restored from, the text between
            // '<' of the loop condition and '>' of "$item->xpath" was missing.
            // The loop header, the is_array() test and the class XPath below were
            // reconstructed from the brace structure and the sibling XPath
            // expressions - confirm against a pristine copy.
            for ($i=0; $i<sizeof($classes); $i++){

                if (!is_array($classes[$i])){

                    $xpath   = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $classes[$i] . " ')]";
                    $results = $item->xpath($xpath);

                    if ($results){
                        foreach ($results as $result){

                            if (isset($classes[$i+1]) && is_array($classes[$i+1])){
                                // the next entry lists sub-properties of this class
                                $nodes = $this->processNodes($results, $classes[$i+1]);
                                if (sizeof($nodes) > 0){
                                    $nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes);
                                    $data[$classes[$i]] = $nodes;
                                }else{
                                    $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
                                }
                            }else{
                                if (isset($data[$classes[$i]])){
                                    if (is_array($data[$classes[$i]])){
                                        // is already an array - append
                                        $data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]);
                                    }else{
                                        // make it an array
                                        if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern
                                            $data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]);
                                        }else{
                                            $old_val = $data[$classes[$i]];
                                            $data[$classes[$i]] = array($old_val, $this->getNodeValue($result, $classes[$i]));
                                            $old_val = false;
                                        }
                                    }
                                }else{
                                    // set as normal value
                                    $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
                                }
                            }

                            // td@headers pattern
                            if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){
                                $include_ids = explode(' ', $result['headers']);
                                $doc = $this->doc;
                                foreach ($include_ids as $id){
                                    // the parent of the referenced header element is processed
                                    $xpath    = "//*[@id='$id']/..";
                                    $includes = $doc->xpath($xpath);
                                    foreach ($includes as $include){
                                        $tmp = $this->processNodes($include, $this->classes);
                                        if (is_array($tmp)) $data = array_merge($data, $tmp);
                                    }
                                }
                            }
                        }
                    }
                }
                $result = false;
            }

            // include-pattern
            if ($allow_includes){
                $xpath   = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]";
                $results = $item->xpath($xpath);

                if ($results){
                    foreach ($results as $result){
                        $tagName = strtoupper(dom_import_simplexml($result)->tagName);
                        if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href']))
                                && preg_match('/\binclude\b/', $result['class'])){
                            $att = ($tagName == "OBJECT" ? 'data' : 'href');
                            $id  = str_replace('#', '', $result[$att]);
                            $doc = $this->doc;
                            $xpath    = "//*[@id='$id']";
                            $includes = $doc->xpath($xpath);
                            foreach ($includes as $include){
                                // NOTE(review): the two empty-string operands look like
                                // wrapper markup that was lost from this copy - confirm
                                // against a pristine copy before relying on this path.
                                $include = simplexml_load_string(''.$include->asXML().''); // don't ask.
                                $tmp = $this->processNodes($include, $this->classes, false);
                                if (is_array($tmp)) $data = array_merge($data, $tmp);
                            }
                        }
                    }
                }
            }

            $out[] = $data;
        }

        if (sizeof($out) > 1){
            return $out;
        }else if (isset($data)){
            return $data;
        }else{
            return array();
        }
    }

    /**
     * Work out the value a node contributes for a given class name.
     *
     * Order of preference: an attribute named by $this->att_map for this
     * class/tag, then OBJECT@data, IMG@alt, AREA@alt, @title (not for A),
     * and finally the node's non-blank child nodes joined with spaces.
     * The result is passed through the class's callback (if any) and, except
     * for PRE, has line breaks/tabs removed and whitespace pairs collapsed.
     *
     * @param SimpleXMLElement $node      Node to read.
     * @param string           $className Class name the node was matched for.
     * @return string|false    The value, or false for DEL elements.
     */
    private function getNodeValue($node, $className)
    {
        $tag_name = strtoupper(dom_import_simplexml($node)->tagName);
        $s        = false;

        // ignore DEL tags
        if ($tag_name == 'DEL') return $s;

        // look up att map values; entries are matched as "TAG|attribute"
        if (array_key_exists($className, $this->att_map)){
            // quote the tag name so it cannot be read as regex syntax
            $tag_pattern = '/' . preg_quote($tag_name, '/') . '\|/';
            foreach ($this->att_map[$className] as $map){
                if (preg_match($tag_pattern, $map)){
                    // array_pop() needs a variable, not an expression
                    $map_parts = explode('|', $map);
                    $s = ''.$node[array_pop($map_parts)];
                }
            }
        }

        // if nothing and OBJ, try data.
        if (!$s && $tag_name=='OBJECT' && $node['data']) $s = ''.$node['data'];

        // if nothing and IMG, try alt.
        if (!$s && $tag_name=='IMG' && $node['alt']) $s = ''.$node['alt'];

        // if nothing and AREA, try alt.
        if (!$s && $tag_name=='AREA' && $node['alt']) $s = ''.$node['alt'];

        // if nothing and not A, try title.
        if (!$s && $tag_name!='A' && $node['title']) $s = ''.$node['title'];

        // if nothing found, go with node text
        // (separator first: the array-first argument order is gone in PHP 8)
        if (!$s){
            $children = array_filter($node->xpath('child::node()'), array($this, 'filterBlankValues'));
            $s = implode(' ', $children);
        }

        // callbacks
        if (array_key_exists($className, $this->callbacks)){
            $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1);
        }

        // trim and remove line breaks
        if ($tag_name != 'PRE'){
            $s = trim(preg_replace('/[\r\n\t]+/', '', $s));
            $s = trim(preg_replace('/(\s{2})+/', ' ', $s));
        }

        return $s;
    }

    /**
     * array_filter() callback: keep only values containing a word character.
     *
     * @param mixed $s Value to test (used in string context).
     * @return int|false Result of preg_match(): 1 when a word character is present.
     */
    private function filterBlankValues($s)
    {
        return preg_match("/\w+/", $s);
    }

    /**
     * Clean up fetched markup according to $this->tidy_mode.
     *
     * 'exec' writes the source to a temp file in $this->tmp_dir and runs the
     * tidy command-line tool on it; 'php' uses the tidy extension; any other
     * mode returns the source untouched.
     *
     * @param string $source Raw markup.
     * @return string Repaired markup, or $source when no tidying is done.
     */
    private function tidyThis($source)
    {
        switch ($this->tidy_mode){

            case 'exec':
                $tmp_file = $this->tmp_dir.md5($source).'.txt';
                file_put_contents($tmp_file, $source);
                // escape the path: tmp_dir may contain spaces or shell metacharacters
                exec('tidy -utf8 -indent -asxhtml -numeric -bare -quiet ' . escapeshellarg($tmp_file), $tidy);
                unlink($tmp_file);
                return implode("\n", $tidy);

            case 'php':
                // Ask for XHTML with numeric entities, like the 'exec' mode does,
                // so the result can be handed to SimpleXML.
                $tidy = tidy_parse_string(
                    $source,
                    array('output-xhtml' => true, 'numeric-entities' => true),
                    'utf8'
                );
                if (!$tidy) return $source;
                // tidy_clean_repair() only reports success; the repaired markup
                // has to be read back with tidy_get_output().
                tidy_clean_repair($tidy);
                return tidy_get_output($tidy);

            default:
                return $source;
        }
    }

    /**
     * Load "<profile>.profile.php" (once per request).
     *
     * NOTE(review): the profile file is presumably what populates
     * $this->classes, $this->att_map and friends - confirm against a profile.
     * $profile is used verbatim in the file name, so it must not come from
     * untrusted input.
     *
     * @param string $profile Profile name.
     */
    private function loadProfile($profile)
    {
        require_once("$profile.profile.php");
    }

    /**
     * Parse markup and return the elements carrying the profile's root class.
     *
     * Side effects: stores the full parsed document in $this->doc and, when a
     * base element or an xml:base attribute is found, the base URL in $this->base.
     *
     * @param string       $input_xml Markup to parse.
     * @param string|false $fragment  Optional element id; when it matches, only
     *                                that element is searched. An id that cannot
     *                                be found leaves the whole document in use.
     * @return array Matching SimpleXMLElement nodes; empty when the markup
     *               cannot be parsed or nothing matches.
     */
    private function loadDoc($input_xml, $fragment=false)
    {
        $xml = simplexml_load_string($input_xml);
        $this->doc = $xml;

        // unparseable input: nothing to search (previously a fatal error)
        if ($xml === false) return array();

        // An id containing a quote cannot be embedded in the XPath literal below.
        if ($fragment && strpos($fragment, "'") === false){
            $doc = $xml->xpath("//*[@id='$fragment']");
            if ($doc){
                $fragment_xml = simplexml_load_string($doc[0]->asXML());
                if ($fragment_xml !== false) $xml = $fragment_xml;
            }
            $doc = null;
        }

        // base tag
        if (isset($xml->head->base['href'])) $this->base = (string) $xml->head->base['href'];

        // xml:base attribute - PITA with SimpleXML
        // ([^"]* rather than .* so the match stops at the closing quote)
        if (preg_match('/xml:base="([^"]*)"/', $xml->asXML(), $matches)){
            $this->base = $matches[1];
        }

        $roots = $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]");

        return $roots ? $roots : array();
    }

    /**
     * Read the contents of a URL, optionally through the tidy proxy.
     *
     * Remembers the original URL in $this->url. Read errors are suppressed.
     *
     * @param string $url Address to fetch.
     * @return string|false Response body, or false on failure.
     */
    private function loadURL($url)
    {
        $this->url = $url;

        if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){
            $url = $this->tidy_proxy . $url;
        }

        return @file_get_contents($url);
    }

    /**
     * Normalise the output of processNodes().
     *
     * A single result (recognised by the presence of the first required
     * property) is wrapped in a list, properties listed in $this->singles are
     * reduced to one value, the optional hKit_<profile>_post() function is
     * applied, and helper 'text' values are removed.
     *
     * @param string $profile Profile name, used to find the post-processing function.
     * @param array  $s       Data returned by processNodes().
     * @return array Cleaned-up data.
     */
    private function postProcess($profile, $s)
    {
        $required = $this->required;

        if (is_array($s) && isset($required[0]) && array_key_exists($required[0], $s)){
            $s = array($s);
        }

        $s = $this->dedupeSingles($s);

        if (function_exists('hKit_'.$profile.'_post')){
            $s = call_user_func('hKit_'.$profile.'_post', $s);
        }

        $s = $this->removeTextVals($s);

        return $s;
    }

    /**
     * Turn a possibly relative path into an absolute URL.
     *
     * Takes a match array (index 0 is the path). The reference URL is
     * $this->base when that is an absolute URL, otherwise $this->url.
     *
     * @param array $filepath Match array; index 0 holds the path to resolve.
     * @return string Absolute URL; input containing '://' or 'data:' is returned as is.
     */
    private function resolvePath($filepath)
    {
        // ugly code ahoy: needs a serious tidy up

        $filepath = $filepath[0];

        $base = $this->base;
        $url  = $this->url;

        if ($base != '' && strpos($base, '://') !== false) $url = $base;

        $r      = parse_url($url);
        $domain = $r['scheme'] . '://' . $r['host'];

        if (!isset($r['path'])) $r['path'] = '/';
        $path = explode('/', $r['path']);
        $file = explode('/', $filepath);
        $new  = array(''); // leading empty segment yields the '/' separator on implode

        if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){
            return $filepath;
        }

        if ($file[0] == ''){
            // absolute path
            return ''.$domain . implode('/', $file);
        }else{
            // relative path
            if ($path[sizeof($path)-1] == '') array_pop($path);
            if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path);

            foreach ($file as $segment){
                if ($segment == '..'){
                    array_pop($path);
                }else{
                    $new[] = $segment;
                }
            }
            return ''.$domain . implode('/', $path) . implode('/', $new);
        }
    }

    /**
     * Reduce a URL-style address to its path component.
     *
     * Takes a match array (index 0 is the address).
     *
     * @param array $v Match array; index 0 holds the address.
     * @return string Path component as reported by parse_url(), or '' when
     *                there is none.
     */
    private function resolveEmail($v)
    {
        $parts = parse_url($v[0]);

        return (is_array($parts) && isset($parts['path'])) ? $parts['path'] : '';
    }

    /**
     * Reduce properties listed in $this->singles to their first value.
     *
     * @param array $s List of extracted items.
     * @return array The same list with list-valued single properties replaced
     *               by their first element.
     */
    private function dedupeSingles($s)
    {
        if (!is_array($s)) return $s;

        $singles = $this->singles;

        foreach ($s as &$item){
            // scalar entries have no properties to reduce
            if (!is_array($item)) continue;

            foreach ($singles as $classname){
                if (array_key_exists($classname, $item) && is_array($item[$classname])){
                    if (isset($item[$classname][0])) $item[$classname] = $item[$classname][0];
                }
            }
        }
        unset($item); // drop the reference left behind by the loop

        return $s;
    }

    /**
     * Recursively blank every value stored under the key 'text' and drop
     * empty entries.
     *
     * NOTE(review): array_filter() without a callback also drops other falsy
     * values, including the string "0" - confirm that is acceptable.
     *
     * @param array $s Data to clean.
     * @return array Cleaned data.
     */
    private function removeTextVals($s)
    {
        foreach ($s as $key => &$val){
            if (is_array($val)){
                $val = $this->removeTextVals($val);
            }else if ($key === 'text'){
                // strict comparison so the integer key 0 is never taken for 'text'
                $val = '';
            }
        }
        unset($val); // drop the reference left behind by the loop

        return array_filter($s);
    }

}

?>