This commit is contained in:
Mikael Nordfeldth 2015-08-15 11:48:39 +02:00
parent b434243416
commit c77bce12e5
1 changed files with 227 additions and 153 deletions

View File

@ -13,17 +13,17 @@ use stdClass;
/** /**
* Parse Microformats2 * Parse Microformats2
* *
* Functional shortcut for the commonest cases of parsing microformats2 from HTML. * Functional shortcut for the commonest cases of parsing microformats2 from HTML.
* *
* Example usage: * Example usage:
* *
* use Mf2; * use Mf2;
* $output = Mf2\parse('<span class="h-card">Barnaby Walters</span>'); * $output = Mf2\parse('<span class="h-card">Barnaby Walters</span>');
* echo json_encode($output, JSON_PRETTY_PRINT); * echo json_encode($output, JSON_PRETTY_PRINT);
* *
* Produces: * Produces:
* *
* { * {
* "items": [ * "items": [
* { * {
@ -35,7 +35,7 @@ use stdClass;
* ], * ],
* "rels": {} * "rels": {}
* } * }
* *
* @param string|DOMDocument $input The HTML string or DOMDocument object to parse * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
* @param string $url The URL the input document was found at, for relative URL resolution * @param string $url The URL the input document was found at, for relative URL resolution
* @param bool $convertClassic whether or not to convert classic microformats * @param bool $convertClassic whether or not to convert classic microformats
@ -84,7 +84,7 @@ function fetch($url, $convertClassic = true, &$curlInfo=null) {
/** /**
* Unicode to HTML Entities * Unicode to HTML Entities
* @param string $input String containing characters to convert into HTML entities * @param string $input String containing characters to convert into HTML entities
* @return string * @return string
*/ */
function unicodeToHtmlEntities($input) { function unicodeToHtmlEntities($input) {
return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input)); return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));
@ -92,10 +92,10 @@ function unicodeToHtmlEntities($input) {
/** /**
* Collapse Whitespace * Collapse Whitespace
* *
* Collapses any sequences of whitespace within a string into a single space * Collapses any sequences of whitespace within a string into a single space
* character. * character.
* *
* @deprecated since v0.2.3 * @deprecated since v0.2.3
* @param string $str * @param string $str
* @return string * @return string
@ -113,10 +113,10 @@ function unicodeTrim($str) {
/** /**
* Microformat Name From Class string * Microformat Name From Class string
* *
* Given the value of @class, get the relevant mf classnames (e.g. h-card, * Given the value of @class, get the relevant mf classnames (e.g. h-card,
* p-name). * p-name).
* *
* @param string $class A space delimited list of classnames * @param string $class A space delimited list of classnames
* @param string $prefix The prefix to look for * @param string $prefix The prefix to look for
* @return string|array The prefixed name of the first microfomats class found or false * @return string|array The prefixed name of the first microfomats class found or false
@ -127,9 +127,9 @@ function mfNamesFromClass($class, $prefix='h-') {
$matches = array(); $matches = array();
foreach ($classes as $classname) { foreach ($classes as $classname) {
$compare_classname = strtolower(' ' . $classname); $compare_classname = ' ' . $classname;
$compare_prefix = strtolower(' ' . $prefix); $compare_prefix = ' ' . $prefix;
if (stristr($compare_classname, $compare_prefix) !== false && ($compare_classname != $compare_prefix)) { if (strstr($compare_classname, $compare_prefix) !== false && ($compare_classname != $compare_prefix)) {
$matches[] = ($prefix === 'h-') ? $classname : substr($classname, strlen($prefix)); $matches[] = ($prefix === 'h-') ? $classname : substr($classname, strlen($prefix));
} }
} }
@ -139,10 +139,10 @@ function mfNamesFromClass($class, $prefix='h-') {
/** /**
* Get Nested µf Property Name From Class * Get Nested µf Property Name From Class
* *
* Returns all the p-, u-, dt- or e- prefixed classnames it finds in a * Returns all the p-, u-, dt- or e- prefixed classnames it finds in a
* space-separated string. * space-separated string.
* *
* @param string $class * @param string $class
* @return array * @return array
*/ */
@ -153,19 +153,24 @@ function nestedMfPropertyNamesFromClass($class) {
$class = str_replace(array(' ', ' ', "\n"), ' ', $class); $class = str_replace(array(' ', ' ', "\n"), ' ', $class);
foreach (explode(' ', $class) as $classname) { foreach (explode(' ', $class) as $classname) {
foreach ($prefixes as $prefix) { foreach ($prefixes as $prefix) {
$compare_classname = strtolower(' ' . $classname); // Check if $classname is a valid property classname for $prefix.
if (stristr($compare_classname, $prefix) && ($compare_classname != $prefix)) { if (mb_substr($classname, 0, mb_strlen($prefix)) == $prefix && $classname != $prefix) {
$propertyNames = array_merge($propertyNames, mfNamesFromClass($classname, ltrim($prefix))); $propertyName = mb_substr($classname, mb_strlen($prefix));
$propertyNames[$propertyName][] = $prefix;
} }
} }
} }
foreach ($propertyNames as $property => $prefixes) {
$propertyNames[$property] = array_unique($prefixes);
}
return $propertyNames; return $propertyNames;
} }
/** /**
* Wraps mfNamesFromClass to handle an element as input (common) * Wraps mfNamesFromClass to handle an element as input (common)
* *
* @param DOMElement $e The element to get the classname for * @param DOMElement $e The element to get the classname for
* @param string $prefix The prefix to look for * @param string $prefix The prefix to look for
* @return mixed See return value of mf2\Parser::mfNameFromClass() * @return mixed See return value of mf2\Parser::mfNameFromClass()
@ -192,28 +197,27 @@ function convertTimeFormat($time) {
$hh = $mm = $ss = ''; $hh = $mm = $ss = '';
preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches); preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);
// if no am/pm specified // If no am/pm is specified:
if (empty($matches[4])) { if (empty($matches[4])) {
return $time; return $time;
} } else {
// else am/pm specified // Otherwise, am/pm is specified.
else {
$meridiem = strtolower(str_replace('.', '', $matches[4])); $meridiem = strtolower(str_replace('.', '', $matches[4]));
// hours // Hours.
$hh = $matches[1]; $hh = $matches[1];
// add 12 to the pm hours // Add 12 to hours if pm applies.
if ($meridiem == 'pm' && ($hh < 12)) { if ($meridiem == 'pm' && ($hh < 12)) {
$hh += 12; $hh += 12;
} }
$hh = str_pad($hh, 2, '0', STR_PAD_LEFT); $hh = str_pad($hh, 2, '0', STR_PAD_LEFT);
// minutes // Minutes.
$mm = (empty($matches[2]) ) ? '00' : $matches[2]; $mm = (empty($matches[2]) ) ? '00' : $matches[2];
// seconds, only if supplied // Seconds, only if supplied.
if (!empty($matches[3])) { if (!empty($matches[3])) {
$ss = $matches[3]; $ss = $matches[3];
} }
@ -229,11 +233,11 @@ function convertTimeFormat($time) {
/** /**
* Microformats2 Parser * Microformats2 Parser
* *
* A class which holds state for parsing microformats2 from HTML. * A class which holds state for parsing microformats2 from HTML.
* *
* Example usage: * Example usage:
* *
* use Mf2; * use Mf2;
* $parser = new Mf2\Parser('<p class="h-card">Barnaby Walters</p>'); * $parser = new Mf2\Parser('<p class="h-card">Barnaby Walters</p>');
* $output = $parser->parse(); * $output = $parser->parse();
@ -244,18 +248,18 @@ class Parser {
/** @var DOMXPath object which can be used to query over any fragment*/ /** @var DOMXPath object which can be used to query over any fragment*/
public $xpath; public $xpath;
/** @var DOMDocument */ /** @var DOMDocument */
public $doc; public $doc;
/** @var SplObjectStorage */ /** @var SplObjectStorage */
protected $parsed; protected $parsed;
public $jsonMode; public $jsonMode;
/** /**
* Constructor * Constructor
* *
* @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument
* @param string $url The URL of the parsed document, for relative URL resolution * @param string $url The URL of the parsed document, for relative URL resolution
* @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON. * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON.
@ -271,20 +275,20 @@ class Parser {
$doc = new DOMDocument(); $doc = new DOMDocument();
@$doc->loadHTML(''); @$doc->loadHTML('');
} }
$this->xpath = new DOMXPath($doc); $this->xpath = new DOMXPath($doc);
$baseurl = $url; $baseurl = $url;
foreach ($this->xpath->query('//base[@href]') as $base) { foreach ($this->xpath->query('//base[@href]') as $base) {
$baseElementUrl = $base->getAttribute('href'); $baseElementUrl = $base->getAttribute('href');
if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) { if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) {
/* The base element URL is relative to the document URL. /* The base element URL is relative to the document URL.
* *
* :/ * :/
* *
* Perhaps the author was high? */ * Perhaps the author was high? */
$baseurl = resolveUrl($url, $baseElementUrl); $baseurl = resolveUrl($url, $baseElementUrl);
} else { } else {
$baseurl = $baseElementUrl; $baseurl = $baseElementUrl;
@ -296,31 +300,31 @@ class Parser {
foreach ($this->xpath->query('//template') as $templateEl) { foreach ($this->xpath->query('//template') as $templateEl) {
$templateEl->parentNode->removeChild($templateEl); $templateEl->parentNode->removeChild($templateEl);
} }
$this->baseurl = $baseurl; $this->baseurl = $baseurl;
$this->doc = $doc; $this->doc = $doc;
$this->parsed = new SplObjectStorage(); $this->parsed = new SplObjectStorage();
$this->jsonMode = $jsonMode; $this->jsonMode = $jsonMode;
} }
private function elementPrefixParsed(\DOMElement $e, $prefix) { private function elementPrefixParsed(\DOMElement $e, $prefix) {
if (!$this->parsed->contains($e)) if (!$this->parsed->contains($e))
$this->parsed->attach($e, array()); $this->parsed->attach($e, array());
$prefixes = $this->parsed[$e]; $prefixes = $this->parsed[$e];
$prefixes[] = $prefix; $prefixes[] = $prefix;
$this->parsed[$e] = $prefixes; $this->parsed[$e] = $prefixes;
} }
private function isElementParsed(\DOMElement $e, $prefix) { private function isElementParsed(\DOMElement $e, $prefix) {
if (!$this->parsed->contains($e)) if (!$this->parsed->contains($e))
return false; return false;
$prefixes = $this->parsed[$e]; $prefixes = $this->parsed[$e];
if (!in_array($prefix, $prefixes)) if (!in_array($prefix, $prefixes))
return false; return false;
return true; return true;
} }
@ -352,72 +356,72 @@ class Parser {
// TODO: figure out if this has problems with sms: and geo: URLs // TODO: figure out if this has problems with sms: and geo: URLs
public function resolveUrl($url) { public function resolveUrl($url) {
// If the URL is seriously malformed its probably beyond the scope of this // If the URL is seriously malformed its probably beyond the scope of this
// parser to try to do anything with it. // parser to try to do anything with it.
if (parse_url($url) === false) if (parse_url($url) === false)
return $url; return $url;
$scheme = parse_url($url, PHP_URL_SCHEME); $scheme = parse_url($url, PHP_URL_SCHEME);
if (empty($scheme) and !empty($this->baseurl)) { if (empty($scheme) and !empty($this->baseurl)) {
return resolveUrl($this->baseurl, $url); return resolveUrl($this->baseurl, $url);
} else { } else {
return $url; return $url;
} }
} }
// Parsing Functions // Parsing Functions
/** /**
* Parse value-class/value-title on an element, joining with $separator if * Parse value-class/value-title on an element, joining with $separator if
* there are multiple. * there are multiple.
* *
* @param \DOMElement $e * @param \DOMElement $e
* @param string $separator = '' if multiple value-title elements, join with this string * @param string $separator = '' if multiple value-title elements, join with this string
* @return string|null the parsed value or null if value-class or -title arent in use * @return string|null the parsed value or null if value-class or -title arent in use
*/ */
public function parseValueClassTitle(\DOMElement $e, $separator = '') { public function parseValueClassTitle(\DOMElement $e, $separator = '') {
$valueClassElements = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ")]', $e); $valueClassElements = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ")]', $e);
if ($valueClassElements->length !== 0) { if ($valueClassElements->length !== 0) {
// Process value-class stuff // Process value-class stuff
$val = ''; $val = '';
foreach ($valueClassElements as $el) { foreach ($valueClassElements as $el) {
$val .= $this->textContent($el); $val .= $this->textContent($el);
} }
return unicodeTrim($val); return unicodeTrim($val);
} }
$valueTitleElements = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value-title ")]', $e); $valueTitleElements = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value-title ")]', $e);
if ($valueTitleElements->length !== 0) { if ($valueTitleElements->length !== 0) {
// Process value-title stuff // Process value-title stuff
$val = ''; $val = '';
foreach ($valueTitleElements as $el) { foreach ($valueTitleElements as $el) {
$val .= $el->getAttribute('title'); $val .= $el->getAttribute('title');
} }
return unicodeTrim($val); return unicodeTrim($val);
} }
// No value-title or -class in this element // No value-title or -class in this element
return null; return null;
} }
/** /**
* Given an element with class="p-*", get its value * Given an element with class="p-*", get its value
* *
* @param DOMElement $p The element to parse * @param DOMElement $p The element to parse
* @return string The plaintext value of $p, dependant on type * @return string The plaintext value of $p, dependant on type
* @todo Make this adhere to value-class * @todo Make this adhere to value-class
*/ */
public function parseP(\DOMElement $p) { public function parseP(\DOMElement $p) {
$classTitle = $this->parseValueClassTitle($p, ' '); $classTitle = $this->parseValueClassTitle($p, ' ');
if ($classTitle !== null) if ($classTitle !== null)
return $classTitle; return $classTitle;
if ($p->tagName == 'img' and $p->getAttribute('alt') !== '') { if ($p->tagName == 'img' and $p->getAttribute('alt') !== '') {
$pValue = $p->getAttribute('alt'); $pValue = $p->getAttribute('alt');
} elseif ($p->tagName == 'area' and $p->getAttribute('alt') !== '') { } elseif ($p->tagName == 'area' and $p->getAttribute('alt') !== '') {
@ -429,13 +433,13 @@ class Parser {
} else { } else {
$pValue = unicodeTrim($this->textContent($p)); $pValue = unicodeTrim($this->textContent($p));
} }
return $pValue; return $pValue;
} }
/** /**
* Given an element with class="u-*", get the value of the URL * Given an element with class="u-*", get the value of the URL
* *
* @param DOMElement $u The element to parse * @param DOMElement $u The element to parse
* @return string The plaintext value of $u, dependant on type * @return string The plaintext value of $u, dependant on type
* @todo make this adhere to value-class * @todo make this adhere to value-class
@ -443,18 +447,18 @@ class Parser {
public function parseU(\DOMElement $u) { public function parseU(\DOMElement $u) {
if (($u->tagName == 'a' or $u->tagName == 'area') and $u->getAttribute('href') !== null) { if (($u->tagName == 'a' or $u->tagName == 'area') and $u->getAttribute('href') !== null) {
$uValue = $u->getAttribute('href'); $uValue = $u->getAttribute('href');
} elseif ($u->tagName == 'img' and $u->getAttribute('src') !== null) { } elseif (in_array($u->tagName, array('img', 'audio', 'video', 'source')) and $u->getAttribute('src') !== null) {
$uValue = $u->getAttribute('src'); $uValue = $u->getAttribute('src');
} elseif ($u->tagName == 'object' and $u->getAttribute('data') !== null) { } elseif ($u->tagName == 'object' and $u->getAttribute('data') !== null) {
$uValue = $u->getAttribute('data'); $uValue = $u->getAttribute('data');
} }
if (isset($uValue)) { if (isset($uValue)) {
return $this->resolveUrl($uValue); return $this->resolveUrl($uValue);
} }
$classTitle = $this->parseValueClassTitle($u); $classTitle = $this->parseValueClassTitle($u);
if ($classTitle !== null) { if ($classTitle !== null) {
return $classTitle; return $classTitle;
} elseif ($u->tagName == 'abbr' and $u->getAttribute('title') !== null) { } elseif ($u->tagName == 'abbr' and $u->getAttribute('title') !== null) {
@ -468,7 +472,7 @@ class Parser {
/** /**
* Given an element with class="dt-*", get the value of the datetime as a php date object * Given an element with class="dt-*", get the value of the datetime as a php date object
* *
* @param DOMElement $dt The element to parse * @param DOMElement $dt The element to parse
* @param array $dates Array of dates processed so far * @param array $dates Array of dates processed so far
* @return string The datetime string found * @return string The datetime string found
@ -477,11 +481,11 @@ class Parser {
// Check for value-class pattern // Check for value-class pattern
$valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt); $valueClassChildren = $this->xpath->query('./*[contains(concat(" ", @class, " "), " value ") or contains(concat(" ", @class, " "), " value-title ")]', $dt);
$dtValue = false; $dtValue = false;
if ($valueClassChildren->length > 0) { if ($valueClassChildren->length > 0) {
// Theyre using value-class // Theyre using value-class
$dateParts = array(); $dateParts = array();
foreach ($valueClassChildren as $e) { foreach ($valueClassChildren as $e) {
if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) { if (strstr(' ' . $e->getAttribute('class') . ' ', ' value-title ')) {
$title = $e->getAttribute('title'); $title = $e->getAttribute('title');
@ -591,16 +595,16 @@ class Parser {
$dtValue = $dt->nodeValue; $dtValue = $dt->nodeValue;
} }
if ( preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches) ) { if (preg_match('/(\d{4}-\d{2}-\d{2})/', $dtValue, $matches)) {
$dates[] = $matches[0]; $dates[] = $matches[0];
} }
} }
/** /**
* if $dtValue is only a time and there are recently parsed dates, * if $dtValue is only a time and there are recently parsed dates,
* form the full date-time using the most recnetly parsed dt- value * form the full date-time using the most recently parsed dt- value
*/ */
if ( (preg_match('/^\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $dtValue) or preg_match('/^\d{1,2}[a|p]m/', $dtValue)) && !empty($dates) ) { if ((preg_match('/^\d{1,2}:\d{1,2}(Z?[+|-]\d{2}:?\d{2})?/', $dtValue) or preg_match('/^\d{1,2}[a|p]m/', $dtValue)) && !empty($dates)) {
$dtValue = convertTimeFormat($dtValue); $dtValue = convertTimeFormat($dtValue);
$dtValue = end($dates) . 'T' . unicodeTrim($dtValue, 'T'); $dtValue = end($dates) . 'T' . unicodeTrim($dtValue, 'T');
} }
@ -613,15 +617,15 @@ class Parser {
* *
* @param DOMElement $e The element to parse * @param DOMElement $e The element to parse
* @return string $es innerHTML * @return string $es innerHTML
* *
* @todo need to mark this element as e- parsed so it doesnt get parsed as its parents e-* too * @todo need to mark this element as e- parsed so it doesnt get parsed as its parents e-* too
*/ */
public function parseE(\DOMElement $e) { public function parseE(\DOMElement $e) {
$classTitle = $this->parseValueClassTitle($e); $classTitle = $this->parseValueClassTitle($e);
if ($classTitle !== null) if ($classTitle !== null)
return $classTitle; return $classTitle;
// Expand relative URLs within children of this element // Expand relative URLs within children of this element
// TODO: as it is this is not relative to only children, make this .// and rerun tests // TODO: as it is this is not relative to only children, make this .// and rerun tests
$this->resolveChildUrls($e); $this->resolveChildUrls($e);
@ -630,7 +634,7 @@ class Parser {
foreach ($e->childNodes as $node) { foreach ($e->childNodes as $node) {
$html .= $node->C14N(); $html .= $node->C14N();
} }
return array( return array(
'html' => $html, 'html' => $html,
'value' => unicodeTrim($this->textContent($e)) 'value' => unicodeTrim($this->textContent($e))
@ -639,7 +643,7 @@ class Parser {
/** /**
* Recursively parse microformats * Recursively parse microformats
* *
* @param DOMElement $e The element to parse * @param DOMElement $e The element to parse
* @return array A representation of the values contained within microformat $e * @return array A representation of the values contained within microformat $e
*/ */
@ -660,26 +664,39 @@ class Parser {
foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," h-")]', $e) as $subMF) { foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," h-")]', $e) as $subMF) {
// Parse // Parse
$result = $this->parseH($subMF); $result = $this->parseH($subMF);
// If result was already parsed, skip it // If result was already parsed, skip it
if (null === $result) if (null === $result)
continue; continue;
// In most cases, the value attribute of the nested microformat should be the p- parsed value of the elemnt.
// The only times this is different is when the microformat is nested under certain prefixes, which are handled below.
$result['value'] = $this->parseP($subMF); $result['value'] = $this->parseP($subMF);
// Does this µf have any property names other than h-*? // Does this µf have any property names other than h-*?
$properties = nestedMfPropertyNamesFromElement($subMF); $properties = nestedMfPropertyNamesFromElement($subMF);
if (!empty($properties)) { if (!empty($properties)) {
// Yes! Its a nested property µf // Yes! Its a nested property µf
foreach ($properties as $property) { foreach ($properties as $property => $prefixes) {
$return[$property][] = $result; // Note: handling microformat nesting under multiple conflicting prefixes is not currently specified by the mf2 parsing spec.
$prefixSpecificResult = $result;
if (in_array('p-', $prefixes)) {
$prefixSpecificResult['value'] = $prefixSpecificResult['properties']['name'][0];
} elseif (in_array('e-', $prefixes)) {
$eParsedResult = $this->parseE($subMF);
$prefixSpecificResult['html'] = $eParsedResult['html'];
$prefixSpecificResult['value'] = $eParsedResult['value'];
} elseif (in_array('u-', $prefixes)) {
$prefixSpecificResult['value'] = $this->parseU($subMF);
}
$return[$property][] = $prefixSpecificResult;
} }
} else { } else {
// No, its a child µf // No, its a child µf
$children[] = $result; $children[] = $result;
} }
// Make sure this sub-mf wont get parsed as a µf or property // Make sure this sub-mf wont get parsed as a µf or property
// TODO: Determine if clearing this is required? // TODO: Determine if clearing this is required?
$this->elementPrefixParsed($subMF, 'h'); $this->elementPrefixParsed($subMF, 'h');
@ -689,19 +706,24 @@ class Parser {
$this->elementPrefixParsed($subMF, 'e'); $this->elementPrefixParsed($subMF, 'e');
} }
if($e->tagName == 'area') {
$coords = $e->getAttribute('coords');
$shape = $e->getAttribute('shape');
}
// Handle p-* // Handle p-*
foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) { foreach ($this->xpath->query('.//*[contains(concat(" ", @class) ," p-")]', $e) as $p) {
if ($this->isElementParsed($p, 'p')) if ($this->isElementParsed($p, 'p'))
continue; continue;
$pValue = $this->parseP($p); $pValue = $this->parseP($p);
// Add the value to the array for its p- properties // Add the value to the array for its p- properties
foreach (mfNamesFromElement($p, 'p-') as $propName) { foreach (mfNamesFromElement($p, 'p-') as $propName) {
if (!empty($propName)) if (!empty($propName))
$return[$propName][] = $pValue; $return[$propName][] = $pValue;
} }
// Make sure this sub-mf wont get parsed as a top level mf // Make sure this sub-mf wont get parsed as a top level mf
$this->elementPrefixParsed($p, 'p'); $this->elementPrefixParsed($p, 'p');
} }
@ -710,32 +732,32 @@ class Parser {
foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) { foreach ($this->xpath->query('.//*[contains(concat(" ", @class)," u-")]', $e) as $u) {
if ($this->isElementParsed($u, 'u')) if ($this->isElementParsed($u, 'u'))
continue; continue;
$uValue = $this->parseU($u); $uValue = $this->parseU($u);
// Add the value to the array for its property types // Add the value to the array for its property types
foreach (mfNamesFromElement($u, 'u-') as $propName) { foreach (mfNamesFromElement($u, 'u-') as $propName) {
$return[$propName][] = $uValue; $return[$propName][] = $uValue;
} }
// Make sure this sub-mf wont get parsed as a top level mf // Make sure this sub-mf wont get parsed as a top level mf
$this->elementPrefixParsed($u, 'u'); $this->elementPrefixParsed($u, 'u');
} }
// Handle dt-* // Handle dt-*
foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) { foreach ($this->xpath->query('.//*[contains(concat(" ", @class), " dt-")]', $e) as $dt) {
if ($this->isElementParsed($dt, 'dt')) if ($this->isElementParsed($dt, 'dt'))
continue; continue;
$dtValue = $this->parseDT($dt, $dates); $dtValue = $this->parseDT($dt, $dates);
if ($dtValue) { if ($dtValue) {
// Add the value to the array for dt- properties // Add the value to the array for dt- properties
foreach (mfNamesFromElement($dt, 'dt-') as $propName) { foreach (mfNamesFromElement($dt, 'dt-') as $propName) {
$return[$propName][] = $dtValue; $return[$propName][] = $dtValue;
} }
} }
// Make sure this sub-mf wont get parsed as a top level mf // Make sure this sub-mf wont get parsed as a top level mf
$this->elementPrefixParsed($dt, 'dt'); $this->elementPrefixParsed($dt, 'dt');
} }
@ -762,22 +784,43 @@ class Parser {
if (!array_key_exists('name', $return)) { if (!array_key_exists('name', $return)) {
try { try {
// Look for img @alt // Look for img @alt
if ($e->tagName == 'img' and $e->getAttribute('alt') != '') if (($e->tagName == 'img' or $e->tagName == 'area') and $e->getAttribute('alt') != '')
throw new Exception($e->getAttribute('alt')); throw new Exception($e->getAttribute('alt'));
if ($e->tagName == 'abbr' and $e->hasAttribute('title')) if ($e->tagName == 'abbr' and $e->hasAttribute('title'))
throw new Exception($e->getAttribute('title')); throw new Exception($e->getAttribute('title'));
// Look for nested img @alt // Look for nested img @alt
foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) { foreach ($this->xpath->query('./img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
if ($em->getAttribute('alt') != '') $emNames = mfNamesFromElement($em, 'h-');
if (empty($emNames) && $em->getAttribute('alt') != '') {
throw new Exception($em->getAttribute('alt')); throw new Exception($em->getAttribute('alt'));
}
} }
// Look for nested area @alt
foreach ($this->xpath->query('./area[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
$emNames = mfNamesFromElement($em, 'h-');
if (empty($emNames) && $em->getAttribute('alt') != '') {
throw new Exception($em->getAttribute('alt'));
}
}
// Look for double nested img @alt // Look for double nested img @alt
foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) { foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/img[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
if ($em->getAttribute('alt') != '') $emNames = mfNamesFromElement($em, 'h-');
if (empty($emNames) && $em->getAttribute('alt') != '') {
throw new Exception($em->getAttribute('alt')); throw new Exception($em->getAttribute('alt'));
}
}
// Look for double nested img @alt
foreach ($this->xpath->query('./*[count(preceding-sibling::*)+count(following-sibling::*)=0]/area[count(preceding-sibling::*)+count(following-sibling::*)=0]', $e) as $em) {
$emNames = mfNamesFromElement($em, 'h-');
if (empty($emNames) && $em->getAttribute('alt') != '') {
throw new Exception($em->getAttribute('alt'));
}
} }
throw new Exception($e->nodeValue); throw new Exception($e->nodeValue);
@ -812,36 +855,58 @@ class Parser {
// Check for u-url // Check for u-url
if (!array_key_exists('url', $return)) { if (!array_key_exists('url', $return)) {
// Look for img @src // Look for img @src
if ($e->tagName == 'a') if ($e->tagName == 'a' or $e->tagName == 'area')
$url = $e->getAttribute('href'); $url = $e->getAttribute('href');
// Look for nested img @src // Look for nested a @href
foreach ($this->xpath->query('./a[count(preceding-sibling::a)+count(following-sibling::a)=0]', $e) as $em) { foreach ($this->xpath->query('./a[count(preceding-sibling::a)+count(following-sibling::a)=0]', $e) as $em) {
$url = $em->getAttribute('href'); $emNames = mfNamesFromElement($em, 'h-');
break; if (empty($emNames)) {
$url = $em->getAttribute('href');
break;
}
} }
// Look for nested area @src
foreach ($this->xpath->query('./area[count(preceding-sibling::area)+count(following-sibling::area)=0]', $e) as $em) {
$emNames = mfNamesFromElement($em, 'h-');
if (empty($emNames)) {
$url = $em->getAttribute('href');
break;
}
}
if (!empty($url)) if (!empty($url))
$return['url'][] = $this->resolveUrl($url); $return['url'][] = $this->resolveUrl($url);
} }
// Make sure things are in alphabetical order // Make sure things are in alphabetical order
sort($mfTypes); sort($mfTypes);
// Phew. Return the final result. // Phew. Return the final result.
$parsed = array( $parsed = array(
'type' => $mfTypes, 'type' => $mfTypes,
'properties' => $return 'properties' => $return
); );
if (!empty($children))
if (!empty($shape)) {
$parsed['shape'] = $shape;
}
if (!empty($coords)) {
$parsed['coords'] = $coords;
}
if (!empty($children)) {
$parsed['children'] = array_values(array_filter($children)); $parsed['children'] = array_values(array_filter($children));
}
return $parsed; return $parsed;
} }
/** /**
* Parse Rels and Alternatives * Parse Rels and Alternatives
* *
* Returns [$rels, $alternatives]. If the $rels value is to be empty, i.e. there are no links on the page * Returns [$rels, $alternatives]. If the $rels value is to be empty, i.e. there are no links on the page
* with a rel value *not* containing `alternate`, then the type of $rels depends on $this->jsonMode. If set * with a rel value *not* containing `alternate`, then the type of $rels depends on $this->jsonMode. If set
* to true, it will be a stdClass instance, optimising for JSON serialisation. Otherwise (the default case), * to true, it will be a stdClass instance, optimising for JSON serialisation. Otherwise (the default case),
* it will be an empty array. * it will be an empty array.
@ -849,18 +914,18 @@ class Parser {
public function parseRelsAndAlternates() { public function parseRelsAndAlternates() {
$rels = array(); $rels = array();
$alternates = array(); $alternates = array();
// Iterate through all a, area and link elements with rel attributes // Iterate through all a, area and link elements with rel attributes
foreach ($this->xpath->query('//*[@rel and @href]') as $hyperlink) { foreach ($this->xpath->query('//*[@rel and @href]') as $hyperlink) {
if ($hyperlink->getAttribute('rel') == '') if ($hyperlink->getAttribute('rel') == '')
continue; continue;
// Resolve the href // Resolve the href
$href = $this->resolveUrl($hyperlink->getAttribute('href')); $href = $this->resolveUrl($hyperlink->getAttribute('href'));
// Split up the rel into space-separated values // Split up the rel into space-separated values
$linkRels = array_filter(explode(' ', $hyperlink->getAttribute('rel'))); $linkRels = array_filter(explode(' ', $hyperlink->getAttribute('rel')));
// If alternate in rels, create alternate structure, append // If alternate in rels, create alternate structure, append
if (in_array('alternate', $linkRels)) { if (in_array('alternate', $linkRels)) {
$alt = array( $alt = array(
@ -869,10 +934,19 @@ class Parser {
); );
if ($hyperlink->hasAttribute('media')) if ($hyperlink->hasAttribute('media'))
$alt['media'] = $hyperlink->getAttribute('media'); $alt['media'] = $hyperlink->getAttribute('media');
if ($hyperlink->hasAttribute('hreflang')) if ($hyperlink->hasAttribute('hreflang'))
$alt['hreflang'] = $hyperlink->getAttribute('hreflang'); $alt['hreflang'] = $hyperlink->getAttribute('hreflang');
if ($hyperlink->hasAttribute('title'))
$alt['title'] = $hyperlink->getAttribute('title');
if ($hyperlink->hasAttribute('type'))
$alt['type'] = $hyperlink->getAttribute('type');
if ($hyperlink->nodeValue)
$alt['text'] = $hyperlink->nodeValue;
$alternates[] = $alt; $alternates[] = $alt;
} else { } else {
foreach ($linkRels as $rel) { foreach ($linkRels as $rel) {
@ -880,38 +954,38 @@ class Parser {
} }
} }
} }
if (empty($rels) and $this->jsonMode) { if (empty($rels) and $this->jsonMode) {
$rels = new stdClass(); $rels = new stdClass();
} }
return array($rels, $alternates); return array($rels, $alternates);
} }
/** /**
* Kicks off the parsing routine * Kicks off the parsing routine
* *
* If `$convertClassic` is set, classic (legacy) microformats classnames are first
* converted to their microformats2 equivalents via convertLegacy() before parsing.
* *
* If a DOMElement is set as the $context, only descendants of that element will * If a DOMElement is set as the $context, only descendants of that element will
* be parsed for microformats. * be parsed for microformats.
* *
* @param bool $convertClassic whether or not to convert classic microformats before parsing. Defaults to true
* @param DOMElement $context optionally an element from which to parse microformats * @param DOMElement $context optionally an element from which to parse microformats
* @return array An array containing all the µfs found in the current document * @return array An array containing all the µfs found in the current document
*/ */
public function parse($convertClassic = true, DOMElement $context = null) { public function parse($convertClassic = true, DOMElement $context = null) {
$mfs = array(); $mfs = array();
if ($convertClassic) { if ($convertClassic) {
$this->convertLegacy(); $this->convertLegacy();
} }
$mfElements = null === $context $mfElements = null === $context
? $this->xpath->query('//*[contains(concat(" ", @class), " h-")]') ? $this->xpath->query('//*[contains(concat(" ", @class), " h-")]')
: $this->xpath->query('.//*[contains(concat(" ", @class), " h-")]', $context); : $this->xpath->query('.//*[contains(concat(" ", @class), " h-")]', $context);
// Parse microformats
foreach ($mfElements as $node) { foreach ($mfElements as $node) {
// For each microformat // For each microformat
@ -920,64 +994,64 @@ class Parser {
// Add the value to the array for this property type // Add the value to the array for this property type
$mfs[] = $result; $mfs[] = $result;
} }
// Parse rels // Parse rels
list($rels, $alternates) = $this->parseRelsAndAlternates(); list($rels, $alternates) = $this->parseRelsAndAlternates();
$top = array( $top = array(
'items' => array_values(array_filter($mfs)), 'items' => array_values(array_filter($mfs)),
'rels' => $rels 'rels' => $rels
); );
if (count($alternates)) if (count($alternates))
$top['alternates'] = $alternates; $top['alternates'] = $alternates;
return $top; return $top;
} }
/** /**
* Parse From ID * Parse From ID
* *
* Given an ID, parse all microformats which are children of the element with * Given an ID, parse all microformats which are children of the element with
* that ID. * that ID.
* *
* Note that rel values are still document-wide. * Note that rel values are still document-wide.
* *
* If an element with the ID is not found, an empty skeleton mf2 array structure * If an element with the ID is not found, an empty skeleton mf2 array structure
* will be returned. * will be returned.
* *
* @param string $id * @param string $id
* @param bool $convertClassic = true whether or not to convert classic microformats before parsing
* @return array * @return array
*/ */
public function parseFromId($id, $convertClassic=true) {
	// Build a safe XPath 1.0 string literal for the ID. XPath has no escape
	// sequences, so pick whichever quote character the ID does not contain,
	// and fall back to concat() when it contains both kinds of quote.
	$id = (string) $id;
	if (strpos($id, "'") === false) {
		$idLiteral = "'" . $id . "'";
	} elseif (strpos($id, '"') === false) {
		$idLiteral = '"' . $id . '"';
	} else {
		$idLiteral = "concat('" . str_replace("'", "', \"'\", '", $id) . "')";
	}

	$matches = $this->xpath->query('//*[@id=' . $idLiteral . ']');

	// DOMXPath::query() returns a DOMNodeList object (or false on a bad
	// expression). empty() is never true for an object, so the number of
	// matches has to be checked through ->length; otherwise a missing ID
	// would hand a null context to parse() and parse the whole document.
	if ($matches === false || $matches->length === 0)
		return array('items' => array(), 'rels' => array(), 'alternates' => array());

	// Only the first element carrying the ID is used as the parsing context.
	return $this->parse($convertClassic, $matches->item(0));
}
/** /**
* Convert Legacy Classnames * Convert Legacy Classnames
* *
* Adds microformats2 classnames into a document containing only legacy * Adds microformats2 classnames into a document containing only legacy
* semantic classnames. * semantic classnames.
* *
* @return Parser $this * @return Parser $this
*/ */
public function convertLegacy() { public function convertLegacy() {
$doc = $this->doc; $doc = $this->doc;
$xp = new DOMXPath($doc); $xp = new DOMXPath($doc);
// replace all roots // replace all roots
foreach ($this->classicRootMap as $old => $new) { foreach ($this->classicRootMap as $old => $new) {
foreach ($xp->query('//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]') as $el) { foreach ($xp->query('//*[contains(concat(" ", @class, " "), " ' . $old . ' ") and not(contains(concat(" ", @class, " "), " ' . $new . ' "))]') as $el) {
$el->setAttribute('class', $el->getAttribute('class') . ' ' . $new); $el->setAttribute('class', $el->getAttribute('class') . ' ' . $new);
} }
} }
foreach ($this->classicPropertyMap as $oldRoot => $properties) { foreach ($this->classicPropertyMap as $oldRoot => $properties) {
$newRoot = $this->classicRootMap[$oldRoot]; $newRoot = $this->classicRootMap[$oldRoot];
foreach ($properties as $old => $new) { foreach ($properties as $old => $new) {
@ -986,16 +1060,16 @@ class Parser {
} }
} }
} }
return $this; return $this;
} }
/** /**
* XPath Query * XPath Query
* *
* Runs an XPath query over the current document. Works in exactly the same * Runs an XPath query over the current document. Works in exactly the same
* way as DOMXPath::query. * way as DOMXPath::query.
* *
* @param string $expression * @param string $expression
* @param DOMNode $context * @param DOMNode $context
* @return DOMNodeList * @return DOMNodeList
@ -1003,7 +1077,7 @@ class Parser {
public function query($expression, $context = null) {
	// Delegate straight to the parser's own DOMXPath instance and hand back
	// whatever it produces, unmodified.
	$nodeList = $this->xpath->query($expression, $context);

	return $nodeList;
}
/** /**
* Classic Root Classname map * Classic Root Classname map
*/ */
@ -1013,11 +1087,11 @@ class Parser {
'hentry' => 'h-entry', 'hentry' => 'h-entry',
'hrecipe' => 'h-recipe', 'hrecipe' => 'h-recipe',
'hresume' => 'h-resume', 'hresume' => 'h-resume',
'hevent' => 'h-event', 'vevent' => 'h-event',
'hreview' => 'h-review', 'hreview' => 'h-review',
'hproduct' => 'h-product' 'hproduct' => 'h-product'
); );
public $classicPropertyMap = array( public $classicPropertyMap = array(
'vcard' => array( 'vcard' => array(
'fn' => 'p-name', 'fn' => 'p-name',
@ -1084,7 +1158,7 @@ class Parser {
'skill' => 'p-skill', 'skill' => 'p-skill',
'affiliation' => 'p-affiliation h-card', 'affiliation' => 'p-affiliation h-card',
), ),
'hevent' => array( 'vevent' => array(
'dtstart' => 'dt-start', 'dtstart' => 'dt-start',
'dtend' => 'dt-end', 'dtend' => 'dt-end',
'duration' => 'dt-duration', 'duration' => 'dt-duration',
@ -1246,7 +1320,7 @@ function resolveUrl($baseURI, $referenceURI) {
# 5.2.3 Merge Paths # 5.2.3 Merge Paths
function mergePaths($base, $reference) { function mergePaths($base, $reference) {
# If the base URI has a defined authority component and an empty # If the base URI has a defined authority component and an empty
# path, # path,
if($base['authority'] && $base['path'] == null) { if($base['authority'] && $base['path'] == null) {
# then return a string consisting of "/" concatenated with the # then return a string consisting of "/" concatenated with the
# reference's path; otherwise, # reference's path; otherwise,