From da04334028a26f944b0c128dc04e280859bc944e Mon Sep 17 00:00:00 2001 From: Mikael Nordfeldth Date: Mon, 23 Jun 2014 20:52:01 +0200 Subject: [PATCH] Microformats2 parser class added to extlib (MIT license) Retrieved from https://raw.githubusercontent.com/indieweb/php-mf2/master/Mf2/Parser.php GitHub project page: https://github.com/indieweb/php-mf2/ --- extlib/Mf2/Parser.php | 1335 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1335 insertions(+) create mode 100644 extlib/Mf2/Parser.php diff --git a/extlib/Mf2/Parser.php b/extlib/Mf2/Parser.php new file mode 100644 index 0000000000..04d44e2aca --- /dev/null +++ b/extlib/Mf2/Parser.php @@ -0,0 +1,1335 @@ +Barnaby Walters'); + * echo json_encode($output, JSON_PRETTY_PRINT); + * + * Produces: + * + * { + * "items": [ + * { + * "type": ["h-card"], + * "properties": { + * "name": ["Barnaby Walters"] + * } + * } + * ], + * "rels": {} + * } + * + * @param string|DOMDocument $input The HTML string or DOMDocument object to parse + * @param string $url The URL the input document was found at, for relative URL resolution + * @param bool $convertClassic whether or not to convert classic microformats + * @return array Canonical MF2 array structure + */ +function parse($input, $url = null, $convertClassic = true) { + $parser = new Parser($input, $url); + return $parser->parse($convertClassic); +} + +/** + * Fetch microformats2 + * + * Given a URL, fetches it (following up to 5 redirects) and, if the content-type appears to be HTML, returns the parsed + * microformats2 array structure. + * + * Not that even if the response code was a 4XX or 5XX error, if the content-type is HTML-like then it will be parsed + * all the same, as there are legitimate cases where error pages might contain useful microformats (for example a deleted + * h-entry resulting in a 410 Gone page with a stub h-entry explaining the reason for deletion). Look in $curlInfo['http_code'] + * for the actual value. + * + * @param string $url The URL to fetch + * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats + * @param &array $curlInfo (optional) the results of curl_getinfo will be placed in this variable for debugging + * @return array|null canonical microformats2 array structure on success, null on failure + */ +function fetch($url, $convertClassic = true, &$curlInfo=null) { + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); + curl_setopt($ch, CURLOPT_HEADER, 0); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); + curl_setopt($ch, CURLOPT_MAXREDIRS, 5); + $response = curl_exec($ch); + $info = $curlInfo = curl_getinfo($ch); + curl_close($ch); + + if (strpos(strtolower($info['content_type']), 'html') === false) { + // The content was not delivered as HTML, do not attempt to parse it. + return null; + } + + $html = mb_substr($response, $info['header_size']); + return parse($html, $url, $convertClassic); +} + +/** + * Unicode to HTML Entities + * @param string $input String containing characters to convert into HTML entities + * @return string + */ +function unicodeToHtmlEntities($input) { + return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input)); +} + +/** + * Collapse Whitespace + * + * Collapses any sequences of whitespace within a string into a single space + * character. + * + * @deprecated since v0.2.3 + * @param string $str + * @return string + */ +function collapseWhitespace($str) { + return preg_replace('/[\s|\n]+/', ' ', $str); +} + +function unicodeTrim($str) { + // this is cheating. TODO: find a better way if this causes any problems + $str = str_replace(mb_convert_encoding(' ', 'UTF-8', 'HTML-ENTITIES'), ' ', $str); + $str = preg_replace('/^\s+/', '', $str); + return preg_replace('/\s+$/', '', $str); +} + +/** + * Microformat Name From Class string + * + * Given the value of @class, get the relevant mf classnames (e.g. h-card, + * p-name). + * + * @param string $class A space delimited list of classnames + * @param string $prefix The prefix to look for + * @return string|array The prefixed name of the first microfomats class found or false + */ +function mfNamesFromClass($class, $prefix='h-') { + $class = str_replace([' ', ' ', "\n"], ' ', $class); + $classes = explode(' ', $class); + $matches = array(); + + foreach ($classes as $classname) { + if (strpos($classname, $prefix) === 0 && $classname !== $prefix) { + $matches[] = ($prefix === 'h-') ? $classname : substr($classname, strlen($prefix)); + } + } + + return $matches; +} + +/** + * Get Nested µf Property Name From Class + * + * Returns all the p-, u-, dt- or e- prefixed classnames it finds in a + * space-separated string. + * + * @param string $class + * @return array + */ +function nestedMfPropertyNamesFromClass($class) { + $prefixes = array('p-', 'u-', 'dt-', 'e-'); + $propertyNames = array(); + + $class = str_replace([' ', ' ', "\n"], ' ', $class); + foreach (explode(' ', $class) as $classname) { + foreach ($prefixes as $prefix) { + if (strpos($classname, $prefix) === 0 and $classname !== $prefix) { + $propertyNames = array_merge($propertyNames, mfNamesFromClass($classname, ltrim($prefix))); + } + } + } + + return $propertyNames; +} + +/** + * Wraps mfNamesFromClass to handle an element as input (common) + * + * @param DOMElement $e The element to get the classname for + * @param string $prefix The prefix to look for + * @return mixed See return value of mf2\Parser::mfNameFromClass() + */ +function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') { + $class = $e->getAttribute('class'); + return mfNamesFromClass($class, $prefix); +} + +/** + * Wraps nestedMfPropertyNamesFromClass to handle an element as input + */ +function nestedMfPropertyNamesFromElement(\DOMElement $e) { + $class = $e->getAttribute('class'); + return nestedMfPropertyNamesFromClass($class); +} + +/** + * Converts various time formats to HH:MM + * @param string $time The time to convert + * @return string + */ +function convertTimeFormat($time) { + $hh = $mm = $ss = ''; + preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches); + + // if no am/pm specified + if ( empty($matches[4]) ) { + return $time; + } + // else am/pm specified + else { + $meridiem = strtolower(str_replace('.', '', $matches[4])); + + // hours + $hh = $matches[1]; + + // add 12 to the pm hours + if ( $meridiem == 'pm' && ($hh < 12) ) + { + $hh += 12; + } + + $hh = str_pad($hh, 2, '0', STR_PAD_LEFT); + + // minutes + $mm = ( empty($matches[2]) ) ? '00' : $matches[2]; + + // seconds, only if supplied + if ( !empty($matches[3]) ) + { + $ss = $matches[3]; + } + + if ( empty($ss) ) { + return sprintf('%s:%s', $hh, $mm); + } + else { + return sprintf('%s:%s:%s', $hh, $mm, $ss); + } + + } + +} + +/** + * Microformats2 Parser + * + * A class which holds state for parsing microformats2 from HTML. + * + * Example usage: + * + * use Mf2; + * $parser = new Mf2\Parser('

Barnaby Walters

'); + * $output = $parser->parse(); + */ +class Parser { + /** @var string The baseurl (if any) to use for this parse */ + public $baseurl; + + /** @var DOMXPath object which can be used to query over any fragment*/ + public $xpath; + + /** @var DOMDocument */ + public $doc; + + /** @var SplObjectStorage */ + protected $parsed; + + public $jsonMode; + + /** + * Constructor + * + * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument + * @param string $url The URL of the parsed document, for relative URL resolution + * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON. + */ + public function __construct($input, $url = null, $jsonMode = false) { + libxml_use_internal_errors(true); + if (is_string($input)) { + $doc = new DOMDocument(); + @$doc->loadHTML(unicodeToHtmlEntities($input)); + } elseif (is_a($input, 'DOMDocument')) { + $doc = $input; + } else { + $doc = new DOMDocument(); + @$doc->loadHTML(''); + } + + $this->xpath = new DOMXPath($doc); + + $baseurl = $url; + foreach ($this->xpath->query('//base[@href]') as $base) { + $baseElementUrl = $base->getAttribute('href'); + + if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) { + /* The base element URL is relative to the document URL. + * + * :/ + * + * Perhaps the author was high? */ + + $baseurl = resolveUrl($url, $baseElementUrl); + } else { + $baseurl = $baseElementUrl; + } + break; + } + + $this->baseurl = $baseurl; + $this->doc = $doc; + $this->parsed = new SplObjectStorage(); + $this->jsonMode = $jsonMode; + } + + private function elementPrefixParsed(\DOMElement $e, $prefix) { + if (!$this->parsed->contains($e)) + $this->parsed->attach($e, array()); + + $prefixes = $this->parsed[$e]; + $prefixes[] = $prefix; + $this->parsed[$e] = $prefixes; + } + + private function isElementParsed(\DOMElement $e, $prefix) { + if (!$this->parsed->contains($e)) + return false; + + $prefixes = $this->parsed[$e]; + + if (!in_array($prefix, $prefixes)) + return false; + + return true; + } + + // TODO: figure out if this has problems with sms: and geo: URLs + public function resolveUrl($url) { + // If the URL is seriously malformed it’s probably beyond the scope of this + // parser to try to do anything with it. + if (parse_url($url) === false) + return $url; + + $scheme = parse_url($url, PHP_URL_SCHEME); + + if (empty($scheme) and !empty($this->baseurl)) { + return resolveUrl($this->baseurl, $url); + } else { + return $url; + } + } + + // Parsing Functions + + /** + * Parse value-class/value-title on an element, joining with $separator if + * there are multiple. + * + * @param \DOMElement $e + * @param string $separator = '' if multiple value-title elements, join with this string + * @return string|null the parsed value or null if value-class or -title aren’t in use + */ + public function parseValueClassTitle(\DOMElement $e, $separator = '') { + $valueClassElements = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ")]', $e); + + if ($valueClassElements->length !== 0) { + // Process value-class stuff + $val = ''; + foreach ($valueClassElements as $el) { + $val .= $el->textContent; + } + + return unicodeTrim($val); + } + + $valueTitleElements = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value-title ")]', $e); + + if ($valueTitleElements->length !== 0) { + // Process value-title stuff + $val = ''; + foreach ($valueTitleElements as $el) { + $val .= $el->getAttribute('title'); + } + + return unicodeTrim($val); + } + + // No value-title or -class in this element + return null; + } + + /** + * Given an element with class="p-*", get it’s value + * + * @param DOMElement $p The element to parse + * @return string The plaintext value of $p, dependant on type + * @todo Make this adhere to value-class + */ + public function parseP(\DOMElement $p) { + $classTitle = $this->parseValueClassTitle($p, ' '); + + if ($classTitle !== null) + return $classTitle; + + if ($p->tagName == 'img' and $p->getAttribute('alt') !== '') { + $pValue = $p->getAttribute('alt'); + } elseif ($p->tagName == 'area' and $p->getAttribute('alt') !== '') { + $pValue = $p->getAttribute('alt'); + } elseif ($p->tagName == 'abbr' and $p->getAttribute('title') !== '') { + $pValue = $p->getAttribute('title'); + } elseif (in_array($p->tagName, array('data', 'input')) and $p->getAttribute('value') !== '') { + $pValue = $p->getAttribute('value'); + } else { + $pValue = unicodeTrim($p->textContent); + } + + return $pValue; + } + + /** + * Given an element with class="u-*", get the value of the URL + * + * @param DOMElement $u The element to parse + * @return string The plaintext value of $u, dependant on type + * @todo make this adhere to value-class + */ + public function parseU(\DOMElement $u) { + if (($u->tagName == 'a' or $u->tagName == 'area') and $u->getAttribute('href') !== null) { + $uValue = $u->getAttribute('href'); + } elseif ($u->tagName == 'img' and $u->getAttribute('src') !== null) { + $uValue = $u->getAttribute('src'); + } elseif ($u->tagName == 'object' and $u->getAttribute('data') !== null) { + $uValue = $u->getAttribute('data'); + } + + if (isset($uValue)) { + return $this->resolveUrl($uValue); + } + + $classTitle = $this->parseValueClassTitle($u); + + if ($classTitle !== null) { + return $classTitle; + } elseif ($u->tagName == 'abbr' and $u->getAttribute('title') !== null) { + return $u->getAttribute('title'); + } elseif (in_array($u->tagName, array('data', 'input')) and $u->getAttribute('value') !== null) { + return $u->getAttribute('value'); + } else { + return unicodeTrim($u->textContent); + } + } + + /** + * Given an element with class="dt-*", get the value of the datetime as a php date object + * + * @param DOMElement $dt The element to parse + * @param array $dates Array of dates processed so far + * @return string The datetime string found + */ + public function parseDT(\DOMElement $dt, &$dates = array()) { + // Check for value-class pattern + $valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt); + $dtValue = false; + + if ($valueClassChildren->length > 0) { + // They’re using value-class + $dateParts = array(); + + foreach ($valueClassChildren as $e) { + if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) { + $title = $e->getAttribute('title'); + if (!empty($title)) + $dateParts[] = $title; + } + elseif ($e->tagName == 'img' or $e->tagName == 'area') { + // Use @alt + $alt = $e->getAttribute('alt'); + if (!empty($alt)) + $dateParts[] = $alt; + } + elseif ($e->tagName == 'data') { + // Use @value, otherwise innertext + $value = $e->hasAttribute('value') ? $e->getAttribute('value') : unicodeTrim($e->nodeValue); + if (!empty($value)) + $dateParts[] = $value; + } + elseif ($e->tagName == 'abbr') { + // Use @title, otherwise innertext + $title = $e->hasAttribute('title') ? $e->getAttribute('title') : unicodeTrim($e->nodeValue); + if (!empty($title)) + $dateParts[] = $title; + } + elseif ($e->tagName == 'del' or $e->tagName == 'ins' or $e->tagName == 'time') { + // Use @datetime if available, otherwise innertext + $dtAttr = ($e->hasAttribute('datetime')) ? $e->getAttribute('datetime') : unicodeTrim($e->nodeValue); + if (!empty($dtAttr)) + $dateParts[] = $dtAttr; + } + else { + if (!empty($e->nodeValue)) + $dateParts[] = unicodeTrim($e->nodeValue); + } + } + + // Look through dateParts + $datePart = ''; + $timePart = ''; + foreach ($dateParts as $part) { + // Is this part a full ISO8601 datetime? + if (preg_match('/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}(?::\d{2})?(?:Z?[+|-]\d{2}:?\d{2})?$/', $part)) { + // Break completely, we’ve got our value. + $dtValue = $part; + break; + } else { + // Is the current part a valid time(+TZ?) AND no other time representation has been found? + if ((preg_match('/\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $part) or preg_match('/\d{1,2}[a|p]m/', $part)) and empty($timePart)) { + $timePart = $part; + } elseif (preg_match('/\d{4}-\d{2}-\d{2}/', $part) and empty($datePart)) { + // Is the current part a valid date AND no other date representation has been found? + $datePart = $part; + } + + if ( !empty($datePart) && !in_array($datePart, $dates) ) { + $dates[] = $datePart; + } + + $dtValue = ''; + + if ( empty($datePart) && !empty($timePart) ) { + $timePart = convertTimeFormat($timePart); + $dtValue = unicodeTrim($timePart, 'T'); + } + else if ( !empty($datePart) && empty($timePart) ) { + $dtValue = rtrim($datePart, 'T'); + } + else { + $timePart = convertTimeFormat($timePart); + $dtValue = rtrim($datePart, 'T') . 'T' . unicodeTrim($timePart, 'T'); + } + } + } + } else { + // Not using value-class (phew). + if ($dt->tagName == 'img' or $dt->tagName == 'area') { + // Use @alt + // Is it an entire dt? + $alt = $dt->getAttribute('alt'); + if (!empty($alt)) + $dtValue = $alt; + } elseif (in_array($dt->tagName, array('data'))) { + // Use @value, otherwise innertext + // Is it an entire dt? + $value = $dt->getAttribute('value'); + if (!empty($value)) + $dtValue = $value; + else + $dtValue = $dt->nodeValue; + } elseif ($dt->tagName == 'abbr') { + // Use @title, otherwise innertext + // Is it an entire dt? + $title = $dt->getAttribute('title'); + if (!empty($title)) + $dtValue = $title; + else + $dtValue = $dt->nodeValue; + } elseif ($dt->tagName == 'del' or $dt->tagName == 'ins' or $dt->tagName == 'time') { + // Use @datetime if available, otherwise innertext + // Is it an entire dt? + $dtAttr = $dt->getAttribute('datetime'); + if (!empty($dtAttr)) + $dtValue = $dtAttr; + else + $dtValue = $dt->nodeValue; + } else { + $dtValue = $dt->nodeValue; + } + + if ( preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches) ) { + $dates[] = $matches[0]; + } + } + + /** + * if $dtValue is only a time and there are recently parsed dates, + * form the full date-time using the most recnetly parsed dt- value + */ + if ( (preg_match('/^\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $dtValue) or preg_match('/^\d{1,2}[a|p]m/', $dtValue)) && !empty($dates) ) { + $dtValue = convertTimeFormat($dtValue); + $dtValue = end($dates) . 'T' . unicodeTrim($dtValue, 'T'); + } + + return $dtValue; + } + + /** + * Given the root element of some embedded markup, return a string representing that markup + * + * @param DOMElement $e The element to parse + * @return string $e’s innerHTML + * + * @todo need to mark this element as e- parsed so it doesn’t get parsed as it’s parent’s e-* too + */ + public function parseE(\DOMElement $e) { + $classTitle = $this->parseValueClassTitle($e); + + if ($classTitle !== null) + return $classTitle; + + // Expand relative URLs within children of this element + // TODO: as it is this is not relative to only children, make this .// and rerun tests + $hyperlinkChildren = $this->xpath->query('//*[@src or @href or @data]', $e); + + foreach ($hyperlinkChildren as $child) { + if ($child->hasAttribute('href')) + $child->setAttribute('href', $this->resolveUrl($child->getAttribute('href'))); + if ($child->hasAttribute('src')) + $child->setAttribute('src', $this->resolveUrl($child->getAttribute('src'))); + if ($child->hasAttribute('data')) + $child->setAttribute('data', $this->resolveUrl($child->getAttribute('data'))); + } + + $html = ''; + foreach ($e->childNodes as $node) { + $html .= $node->C14N(); + } + + return array( + 'html' => $html, + 'value' => unicodeTrim($e->textContent) + ); + } + + /** + * Recursively parse microformats + * + * @param DOMElement $e The element to parse + * @return array A representation of the values contained within microformat $e + */ + public function parseH(\DOMElement $e) { + // If it’s already been parsed (e.g. is a child mf), skip + if ($this->parsed->contains($e)) + return null; + + // Get current µf name + $mfTypes = mfNamesFromElement($e, 'h-'); + + // Initalise var to store the representation in + $return = array(); + $children = array(); + $dates = array(); + + // Handle nested microformats (h-*) + foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," h-")]', $e) as $subMF) { + // Parse + $result = $this->parseH($subMF); + + // If result was already parsed, skip it + if (null === $result) + continue; + + $result['value'] = $this->parseP($subMF); + + // Does this µf have any property names other than h-*? + $properties = nestedMfPropertyNamesFromElement($subMF); + + if (!empty($properties)) { + // Yes! It’s a nested property µf + foreach ($properties as $property) { + $return[$property][] = $result; + } + } else { + // No, it’s a child µf + $children[] = $result; + } + + // Make sure this sub-mf won’t get parsed as a µf or property + // TODO: Determine if clearing this is required? + $this->elementPrefixParsed($subMF, 'h'); + $this->elementPrefixParsed($subMF, 'p'); + $this->elementPrefixParsed($subMF, 'u'); + $this->elementPrefixParsed($subMF, 'dt'); + $this->elementPrefixParsed($subMF, 'e'); + } + + // Handle p-* + foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) { + if ($this->isElementParsed($p, 'p')) + continue; + + $pValue = $this->parseP($p); + + // Add the value to the array for it’s p- properties + foreach (mfNamesFromElement($p, 'p-') as $propName) { + if (!empty($propName)) + $return[$propName][] = $pValue; + } + + // Make sure this sub-mf won’t get parsed as a top level mf + $this->elementPrefixParsed($p, 'p'); + } + + // Handle u-* + foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) { + if ($this->isElementParsed($u, 'u')) + continue; + + $uValue = $this->parseU($u); + + // Add the value to the array for it’s property types + foreach (mfNamesFromElement($u, 'u-') as $propName) { + $return[$propName][] = $uValue; + } + + // Make sure this sub-mf won’t get parsed as a top level mf + $this->elementPrefixParsed($u, 'u'); + } + + // Handle dt-* + foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) { + if ($this->isElementParsed($dt, 'dt')) + continue; + + $dtValue = $this->parseDT($dt, $dates); + + if ($dtValue) { + // Add the value to the array for dt- properties + foreach (mfNamesFromElement($dt, 'dt-') as $propName) { + $return[$propName][] = $dtValue; + } + } + + // Make sure this sub-mf won’t get parsed as a top level mf + $this->elementPrefixParsed($dt, 'dt'); + } + + // Handle e-* + foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," e-")]', $e) as $em) { + if ($this->isElementParsed($em, 'e')) + continue; + + $eValue = $this->parseE($em); + + if ($eValue) { + // Add the value to the array for e- properties + foreach (mfNamesFromElement($em, 'e-') as $propName) { + $return[$propName][] = $eValue; + } + } + // Make sure this sub-mf won’t get parsed as a top level mf + $this->elementPrefixParsed($em, 'e'); + } + + // Implied Properties + // Check for p-name + if (!array_key_exists('name', $return)) { + try { + // Look for img @alt + if ($e->tagName == 'img' and $e->getAttribute('alt') != '') + throw new Exception($e->getAttribute('alt')); + + if ($e->tagName == 'abbr' and $e->hasAttribute('title')) + throw new Exception($e->getAttribute('title')); + + // Look for nested img @alt + foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) { + if ($em->getAttribute('alt') != '') + throw new Exception($em->getAttribute('alt')); + } + + // Look for double nested img @alt + foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) { + if ($em->getAttribute('alt') != '') + throw new Exception($em->getAttribute('alt')); + } + + throw new Exception($e->nodeValue); + } catch (Exception $exc) { + $return['name'][] = unicodeTrim($exc->getMessage()); + } + } + + // Check for u-photo + if (!array_key_exists('photo', $return)) { + // Look for img @src + try { + if ($e->tagName == 'img') + throw new Exception($e->getAttribute('src')); + + // Look for nested img @src + foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) { + if ($em->getAttribute('src') != '') + throw new Exception($em->getAttribute('src')); + } + + // Look for double nested img @src + foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) { + if ($em->getAttribute('src') != '') + throw new Exception($em->getAttribute('src')); + } + } catch (Exception $exc) { + $return['photo'][] = $this->resolveUrl($exc->getMessage()); + } + } + + // Check for u-url + if (!array_key_exists('url', $return)) { + // Look for img @src + if ($e->tagName == 'a') + $url = $e->getAttribute('href'); + + // Look for nested img @src + foreach ($this->xpath->query('./a[count(preceding-sibling::a)+count(following-sibling::a)=0]', $e) as $em) { + $url = $em->getAttribute('href'); + break; + } + + if (!empty($url)) + $return['url'][] = $this->resolveUrl($url); + } + + // Make sure things are in alphabetical order + sort($mfTypes); + + // Phew. Return the final result. + $parsed = array( + 'type' => $mfTypes, + 'properties' => $return + ); + if (!empty($children)) + $parsed['children'] = array_values(array_filter($children)); + return $parsed; + } + + /** + * Parse Rels and Alternatives + * + * Returns [$rels, $alternatives]. If the $rels value is to be empty, i.e. there are no links on the page + * with a rel value *not* containing `alternate`, then the type of $rels depends on $this->jsonMode. If set + * to true, it will be a stdClass instance, optimising for JSON serialisation. Otherwise (the default case), + * it will be an empty array. + */ + public function parseRelsAndAlternates() { + $rels = array(); + $alternates = array(); + + // Iterate through all a, area and link elements with rel attributes + foreach ($this->xpath->query('//*[@rel and @href]') as $hyperlink) { + if ($hyperlink->getAttribute('rel') == '') + continue; + + // Resolve the href + $href = $this->resolveUrl($hyperlink->getAttribute('href')); + + // Split up the rel into space-separated values + $linkRels = array_filter(explode(' ', $hyperlink->getAttribute('rel'))); + + // If alternate in rels, create alternate structure, append + if (in_array('alternate', $linkRels)) { + $alt = array( + 'url' => $href, + 'rel' => implode(' ', array_diff($linkRels, array('alternate'))) + ); + if ($hyperlink->hasAttribute('media')) + $alt['media'] = $hyperlink->getAttribute('media'); + + if ($hyperlink->hasAttribute('hreflang')) + $alt['hreflang'] = $hyperlink->getAttribute('hreflang'); + + $alternates[] = $alt; + } else { + foreach ($linkRels as $rel) { + $rels[$rel][] = $href; + } + } + } + + if (empty($rels) and $this->jsonMode) { + $rels = new stdClass(); + } + + return array($rels, $alternates); + } + + /** + * Kicks off the parsing routine + * + * If `$htmlSafe` is set, any angle brackets in the results from non e-* properties + * will be HTML-encoded, bringing all output to the same level of encoding. + * + * If a DOMElement is set as the $context, only descendants of that element will + * be parsed for microformats. + * + * @param bool $htmlSafe whether or not to html-encode non e-* properties. Defaults to false + * @param DOMElement $context optionally an element from which to parse microformats + * @return array An array containing all the µfs found in the current document + */ + public function parse($convertClassic = true, DOMElement $context = null) { + $mfs = array(); + + if ($convertClassic) { + $this->convertLegacy(); + } + + $mfElements = null === $context + ? $this->xpath->query('//*[contains(concat(" ", @class), " h-")]') + : $this->xpath->query('.//*[contains(concat(" ", @class), " h-")]', $context); + + // Parser microformats + foreach ($mfElements as $node) { + // For each microformat + $result = $this->parseH($node); + + // Add the value to the array for this property type + $mfs[] = $result; + } + + // Parse rels + list($rels, $alternates) = $this->parseRelsAndAlternates(); + + $top = array( + 'items' => array_values(array_filter($mfs)), + 'rels' => $rels + ); + + if (count($alternates)) + $top['alternates'] = $alternates; + + return $top; + } + + /** + * Parse From ID + * + * Given an ID, parse all microformats which are children of the element with + * that ID. + * + * Note that rel values are still document-wide. + * + * If an element with the ID is not found, an empty skeleton mf2 array structure + * will be returned. + * + * @param string $id + * @param bool $htmlSafe = false whether or not to HTML-encode angle brackets in non e-* properties + * @return array + */ + public function parseFromId($id, $convertClassic=true) { + $matches = $this->xpath->query("//*[@id='{$id}']"); + + if (empty($matches)) + return array('items' => array(), 'rels' => array(), 'alternates' => array()); + + return $this->parse($convertClassic, $matches->item(0)); + } + + /** + * Convert Legacy Classnames + * + * Adds microformats2 classnames into a document containing only legacy + * semantic classnames. + * + * @return Parser $this + */ + public function convertLegacy() { + $doc = $this->doc; + $xp = new DOMXPath($doc); + + // replace all roots + foreach ($this->classicRootMap as $old => $new) { + foreach ($xp->query('//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]') as $el) { + $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new); + } + } + + foreach ($this->classicPropertyMap as $oldRoot => $properties) { + $newRoot = $this->classicRootMap[$oldRoot]; + foreach ($properties as $old => $new) { + foreach ($xp->query('//*[contains(concat(" ", @class, " "), " ' . $oldRoot . ' ")]//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]') as $el) { + $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new); + } + } + } + + return $this; + } + + /** + * XPath Query + * + * Runs an XPath query over the current document. Works in exactly the same + * way as DOMXPath::query. + * + * @param string $expression + * @param DOMNode $context + * @return DOMNodeList + */ + public function query($expression, $context = null) { + return $this->xpath->query($expression, $context); + } + + /** + * Classic Root Classname map + */ + public $classicRootMap = array( + 'vcard' => 'h-card', + 'hfeed' => 'h-feed', + 'hentry' => 'h-entry', + 'hrecipe' => 'h-recipe', + 'hresume' => 'h-resume', + 'hevent' => 'h-event', + 'hreview' => 'h-review' + ); + + public $classicPropertyMap = array( + 'vcard' => array( + 'fn' => 'p-name', + 'url' => 'u-url', + 'honorific-prefix' => 'p-honorific-prefix', + 'given-name' => 'p-given-name', + 'additional-name' => 'p-additional-name', + 'family-name' => 'p-family-name', + 'honorific-suffix' => 'p-honorific-suffix', + 'nickname' => 'p-nickname', + 'email' => 'u-email', + 'logo' => 'u-logo', + 'photo' => 'u-photo', + 'url' => 'u-url', + 'uid' => 'u-uid', + 'category' => 'p-category', + 'adr' => 'p-adr h-adr', + 'extended-address' => 'p-extended-address', + 'street-address' => 'p-street-address', + 'locality' => 'p-locality', + 'region' => 'p-region', + 'postal-code' => 'p-postal-code', + 'country-name' => 'p-country-name', + 'label' => 'p-label', + 'geo' => 'p-geo h-geo', + 'latitude' => 'p-latitude', + 'longitude' => 'p-longitude', + 'tel' => 'p-tel', + 'note' => 'p-note', + 'bday' => 'dt-bday', + 'key' => 'u-key', + 'org' => 'p-org', + 'organization-name' => 'p-organization-name', + 'organization-unit' => 'p-organization-unit', + ), + 'hentry' => array( + 'entry-title' => 'p-name', + 'entry-summary' => 'p-summary', + 'entry-content' => 'e-content', + 'published' => 'dt-published', + 'updated' => 'dt-updated', + 'author' => 'p-author h-card', + 'category' => 'p-category', + 'geo' => 'p-geo h-geo', + 'latitude' => 'p-latitude', + 'longitude' => 'p-longitude', + ), + 'hrecipe' => array( + 'fn' => 'p-name', + 'ingredient' => 'p-ingredient', + 'yield' => 'p-yield', + 'instructions' => 'e-instructions', + 'duration' => 'dt-duration', + 'nutrition' => 'p-nutrition', + 'photo' => 'u-photo', + 'summary' => 'p-summary', + 'author' => 'p-author h-card' + ), + 'hresume' => array( + 'summary' => 'p-summary', + 'contact' => 'h-card p-contact', + 'education' => 'h-event p-education', + 'experience' => 'h-event p-experience', + 'skill' => 'p-skill', + 'affiliation' => 'p-affiliation h-card', + ), + 'hevent' => array( + 'dtstart' => 'dt-start', + 'dtend' => 'dt-end', + 'duration' => 'dt-duration', + 'description' => 'p-description', + 'summary' => 'p-summary', + 'description' => 'p-description', + 'url' => 'u-url', + 'category' => 'p-category', + 'location' => 'h-card', + 'geo' => 'p-location h-geo' + ), + 'hreview' => array( + 'summary' => 'p-name', + 'fn' => 'p-item h-item p-name', // doesn’t work properly, see spec + 'photo' => 'u-photo', // of the item being reviewed (p-item h-item u-photo) + 'url' => 'u-url', // of the item being reviewed (p-item h-item u-url) + 'reviewer' => 'p-reviewer p-author h-card', + 'dtreviewed' => 'dt-reviewed', + 'rating' => 'p-rating', + 'best' => 'p-best', + 'worst' => 'p-worst', + 'description' => 'p-description' + ) + ); +} + +function parseUriToComponents($uri) { + $result = array( + 'scheme' => null, + 'authority' => null, + 'path' => null, + 'query' => null, + 'fragment' => null + ); + + $u = @parse_url($uri); + + if(array_key_exists('scheme', $u)) + $result['scheme'] = $u['scheme']; + + if(array_key_exists('host', $u)) { + if(array_key_exists('user', $u)) + $result['authority'] = $u['user']; + if(array_key_exists('pass', $u)) + $result['authority'] .= ':' . $u['pass']; + if(array_key_exists('user', $u) || array_key_exists('pass', $u)) + $result['authority'] .= '@'; + $result['authority'] .= $u['host']; + if(array_key_exists('port', $u)) + $result['authority'] .= ':' . $u['port']; + } + + if(array_key_exists('path', $u)) + $result['path'] = $u['path']; + + if(array_key_exists('query', $u)) + $result['query'] = $u['query']; + + if(array_key_exists('fragment', $u)) + $result['fragment'] = $u['fragment']; + + return $result; +} + +function resolveUrl($baseURI, $referenceURI) { + $target = array( + 'scheme' => null, + 'authority' => null, + 'path' => null, + 'query' => null, + 'fragment' => null + ); + + # 5.2.1 Pre-parse the Base URI + # The base URI (Base) is established according to the procedure of + # Section 5.1 and parsed into the five main components described in + # Section 3 + $base = parseUriToComponents($baseURI); + + # If base path is blank (http://example.com) then set it to / + # (I can't tell if this is actually in the RFC or not, but seems like it makes sense) + if($base['path'] == null) + $base['path'] = '/'; + + # 5.2.2. Transform References + + # The URI reference is parsed into the five URI components + # (R.scheme, R.authority, R.path, R.query, R.fragment) = parse(R); + $reference = parseUriToComponents($referenceURI); + + # A non-strict parser may ignore a scheme in the reference + # if it is identical to the base URI's scheme. + # TODO + + if($reference['scheme']) { + $target['scheme'] = $reference['scheme']; + $target['authority'] = $reference['authority']; + $target['path'] = removeDotSegments($reference['path']); + $target['query'] = $reference['query']; + } else { + if($reference['authority']) { + $target['authority'] = $reference['authority']; + $target['path'] = removeDotSegments($reference['path']); + $target['query'] = $reference['query']; + } else { + if($reference['path'] == '') { + $target['path'] = $base['path']; + if($reference['query']) { + $target['query'] = $reference['query']; + } else { + $target['query'] = $base['query']; + } + } else { + if(substr($reference['path'], 0, 1) == '/') { + $target['path'] = removeDotSegments($reference['path']); + } else { + $target['path'] = mergePaths($base, $reference); + $target['path'] = removeDotSegments($target['path']); + } + $target['query'] = $reference['query']; + } + $target['authority'] = $base['authority']; + } + $target['scheme'] = $base['scheme']; + } + $target['fragment'] = $reference['fragment']; + + # 5.3 Component Recomposition + $result = ''; + if($target['scheme']) { + $result .= $target['scheme'] . ':'; + } + if($target['authority']) { + $result .= '//' . $target['authority']; + } + $result .= $target['path']; + if($target['query']) { + $result .= '?' . $target['query']; + } + if($target['fragment']) { + $result .= '#' . $target['fragment']; + } elseif($referenceURI == '#') { + $result .= '#'; + } + return $result; +} + +# 5.2.3 Merge Paths +function mergePaths($base, $reference) { + # If the base URI has a defined authority component and an empty + # path, + if($base['authority'] && $base['path'] == null) { + # then return a string consisting of "/" concatenated with the + # reference's path; otherwise, + $merged = '/' . $reference['path']; + } else { + if(($pos=strrpos($base['path'], '/')) !== false) { + # return a string consisting of the reference's path component + # appended to all but the last segment of the base URI's path (i.e., + # excluding any characters after the right-most "/" in the base URI + # path, + $merged = substr($base['path'], 0, $pos + 1) . $reference['path']; + } else { + # or excluding the entire base URI path if it does not contain + # any "/" characters). + $merged = $base['path']; + } + } + return $merged; +} + +# 5.2.4.A Remove leading ../ or ./ +function removeLeadingDotSlash(&$input) { + if(substr($input, 0, 3) == '../') { + $input = substr($input, 3); + } elseif(substr($input, 0, 2) == './') { + $input = substr($input, 2); + } +} + +# 5.2.4.B Replace leading /. with / +function removeLeadingSlashDot(&$input) { + if(substr($input, 0, 3) == '/./') { + $input = '/' . substr($input, 3); + } else { + $input = '/' . substr($input, 2); + } +} + +# 5.2.4.C Given leading /../ remove component from output buffer +function removeOneDirLevel(&$input, &$output) { + if(substr($input, 0, 4) == '/../') { + $input = '/' . substr($input, 4); + } else { + $input = '/' . substr($input, 3); + } + $output = substr($output, 0, strrpos($output, '/')); +} + +# 5.2.4.D Remove . and .. if it's the only thing in the input +function removeLoneDotDot(&$input) { + if($input == '.') { + $input = substr($input, 1); + } else { + $input = substr($input, 2); + } +} + +# 5.2.4.E Move one segment from input to output +function moveOneSegmentFromInput(&$input, &$output) { + if(substr($input, 0, 1) != '/') { + $pos = strpos($input, '/'); + } else { + $pos = strpos($input, '/', 1); + } + + if($pos === false) { + $output .= $input; + $input = ''; + } else { + $output .= substr($input, 0, $pos); + $input = substr($input, $pos); + } +} + +# 5.2.4 Remove Dot Segments +function removeDotSegments($path) { + # 1. The input buffer is initialized with the now-appended path + # components and the output buffer is initialized to the empty + # string. + $input = $path; + $output = ''; + + $step = 0; + + # 2. While the input buffer is not empty, loop as follows: + while($input) { + $step++; + + if(substr($input, 0, 3) == '../' || substr($input, 0, 2) == './') { + # A. If the input buffer begins with a prefix of "../" or "./", + # then remove that prefix from the input buffer; otherwise, + removeLeadingDotSlash($input); + } elseif(substr($input, 0, 3) == '/./' || $input == '/.') { + # B. if the input buffer begins with a prefix of "/./" or "/.", + # where "." is a complete path segment, then replace that + # prefix with "/" in the input buffer; otherwise, + removeLeadingSlashDot($input); + } elseif(substr($input, 0, 4) == '/../' || $input == '/..') { + # C. if the input buffer begins with a prefix of "/../" or "/..", + # where ".." is a complete path segment, then replace that + # prefix with "/" in the input buffer and remove the last + # segment and its preceding "/" (if any) from the output + # buffer; otherwise, + removeOneDirLevel($input, $output); + } elseif($input == '.' || $input == '..') { + # D. if the input buffer consists only of "." or "..", then remove + # that from the input buffer; otherwise, + removeLoneDotDot($input); + } else { + # E. move the first path segment in the input buffer to the end of + # the output buffer and any subsequent characters up to, but not including, + # the next "/" character or the end of the input buffer + moveOneSegmentFromInput($input, $output); + } + } + + return $output; +}