Barnaby Walters');
 * echo json_encode($output, JSON_PRETTY_PRINT);
 *
 * Produces:
 *
 * {
 *   "items": [
 *     {
 *       "type": ["h-card"],
 *       "properties": {
 *         "name": ["Barnaby Walters"]
 *       }
 *     }
 *   ],
 *   "rels": {}
 * }
 *
 * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
 * @param string $url The URL the input document was found at, used to resolve relative URLs
 * @param bool $convertClassic Whether or not classic microformats should be converted
 * @return array Canonical MF2 array structure
 */
function parse($input, $url = null, $convertClassic = true) {
    // One-shot convenience wrapper: build a parser for this input and run it once.
    $mf2Parser = new Parser($input, $url);

    return $mf2Parser->parse($convertClassic);
}

/**
 * Fetch microformats2
 *
 * Given a URL, fetches it (following up to 5 redirects) and, if the content-type appears to be HTML, returns the parsed
 * microformats2 array structure.
 *
 * Note that even if the response code was a 4XX or 5XX error, if the content-type is HTML-like then it will be parsed
 * all the same, as there are legitimate cases where error pages might contain useful microformats (for example a deleted
 * h-entry resulting in a 410 Gone page with a stub h-entry explaining the reason for deletion). Look in $curlInfo['http_code']
 * for the actual value.
*
 * @param string $url The URL to fetch
 * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats
 * @param &array $curlInfo (optional) the results of curl_getinfo will be placed in this variable for debugging
 * @return array|null canonical microformats2 array structure on success, null on failure
 */
function fetch($url, $convertClassic = true, &$curlInfo=null) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    $html = curl_exec($ch);
    $info = $curlInfo = curl_getinfo($ch);
    curl_close($ch);

    // curl_exec() yields false when the transfer itself failed; there is
    // nothing to parse in that case.
    if ($html === false) {
        return null;
    }

    // content_type may be missing or null when the server sent no such header.
    $contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';
    if (stripos($contentType, 'html') === false) {
        // The content was not delivered as HTML, do not attempt to parse it.
        return null;
    }

    // After redirects the document lives at the effective URL, so relative
    // URLs inside it must be resolved against that rather than the URL asked for.
    if (!empty($info['url'])) {
        $url = $info['url'];
    }

    return parse($html, $url, $convertClassic);
}

/**
 * Unicode to HTML Entities
 *
 * Converts the input to the mbstring 'HTML-ENTITIES' encoding, using the
 * source encoding reported by mb_detect_encoding().
 *
 * @param string $input String containing characters to convert into HTML entities
 * @return string
 */
function unicodeToHtmlEntities($input) {
    $encoding = mb_detect_encoding($input);

    if ($encoding === false) {
        // Detection failed; passing false on as the source encoding makes
        // mb_convert_encoding() fail. NOTE(review): UTF-8 is assumed as the
        // fallback here - confirm this suits the inputs seen in practice.
        $encoding = 'UTF-8';
    }

    return mb_convert_encoding($input, 'HTML-ENTITIES', $encoding);
}

/**
 * Collapse Whitespace
 *
 * Collapses any sequences of whitespace within a string into a single space
 * character.
 *
 * @deprecated since v0.2.3
 * @param string $str
 * @return string
 */
function collapseWhitespace($str) {
    // \s already covers newlines. A "|" inside a character class is a literal
    // pipe, so it must not be listed or pipes would be turned into spaces too.
    return preg_replace('/\s+/', ' ', $str);
}

/**
 * Unicode Trim
 *
 * Replaces every no-break space (U+00A0) in the string with a regular space,
 * interior ones included, then strips leading and trailing whitespace.
 *
 * @param string $str
 * @return string
 */
function unicodeTrim($str) {
    // this is cheating. TODO: find a better way if this causes any problems
    // "\xC2\xA0" is the UTF-8 byte sequence of U+00A0 (no-break space).
    $str = str_replace("\xC2\xA0", ' ', $str);
    $str = preg_replace('/^\s+/', '', $str);
    return preg_replace('/\s+$/', '', $str);
}

/**
 * Microformat Name From Class string
 *
 * Given the value of @class, get the relevant mf classnames (e.g. h-card,
 * p-name).
*
 * @param string $class A whitespace delimited list of classnames
 * @param string $prefix The prefix to look for
 * @return array Every classname starting with $prefix (a bare prefix is ignored). For the
 *               'h-' prefix the full classname is returned, for any other prefix the
 *               name with the prefix removed. Empty array when nothing matches.
 */
function mfNamesFromClass($class, $prefix='h-') {
    $matches = array();
    $prefixLength = strlen($prefix);

    // Split on any run of whitespace so tab or newline separated classnames
    // are handled as well as space separated ones.
    foreach (preg_split('/\s+/', (string) $class, -1, PREG_SPLIT_NO_EMPTY) as $classname) {
        // Must start with the prefix and have at least one character after it.
        if (strlen($classname) > $prefixLength && strncmp($classname, $prefix, $prefixLength) === 0) {
            $matches[] = ($prefix === 'h-') ? $classname : substr($classname, $prefixLength);
        }
    }

    return $matches;
}

/**
 * Get Nested µf Property Name From Class
 *
 * Returns all the p-, u-, dt- or e- prefixed classnames it finds in a
 * whitespace-separated string.
 *
 * @param string $class
 * @return array Map of property name (prefix removed) to the list of distinct
 *               prefixes it was found with, e.g. array('name' => array('p-', 'u-'))
 */
function nestedMfPropertyNamesFromClass($class) {
    $prefixes = array('p-', 'u-', 'dt-', 'e-');
    $propertyNames = array();

    foreach (preg_split('/\s+/', (string) $class, -1, PREG_SPLIT_NO_EMPTY) as $classname) {
        foreach ($prefixes as $prefix) {
            // Check if $classname is a valid property classname for $prefix.
            if (mb_substr($classname, 0, mb_strlen($prefix)) === $prefix && $classname !== $prefix) {
                $propertyName = mb_substr($classname, mb_strlen($prefix));
                $propertyNames[$propertyName][] = $prefix;
            }
        }
    }

    // A classname repeated in the attribute would list its prefix twice;
    // de-duplicate and re-index so every value is a plain list.
    foreach ($propertyNames as $property => $foundPrefixes) {
        $propertyNames[$property] = array_values(array_unique($foundPrefixes));
    }

    return $propertyNames;
}

/**
 * Wraps mfNamesFromClass to handle an element as input (common)
 *
 * @param DOMElement $e The element to get the classname for
 * @param string $prefix The prefix to look for
 * @return array See return value of mfNamesFromClass()
 */
function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') {
    return mfNamesFromClass($e->getAttribute('class'), $prefix);
}

/**
 * Wraps nestedMfPropertyNamesFromClass to handle an element as input
 *
 * @param DOMElement $e The element whose class attribute is inspected
 * @return array See return value of nestedMfPropertyNamesFromClass()
 */
function nestedMfPropertyNamesFromElement(\DOMElement $e) {
    return nestedMfPropertyNamesFromClass($e->getAttribute('class'));
}

/**
 * Converts 12-hour times carrying an am/pm marker to 24-hour HH:MM or HH:MM:SS
 *
 * Input without an am/pm marker is returned unchanged. Minutes default to
 * "00" when absent; seconds are only emitted when they were supplied.
 *
 * @param string $time The time to convert
 * @return string
 */
function convertTimeFormat($time) {
    $matched = preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);

    // If no am/pm is specified there is nothing to convert.
    if (!$matched || empty($matches[4])) {
        return $time;
    }

    // Normalise "a.m.", "PM" and friends to plain "am" / "pm".
    $meridiem = strtolower(str_replace('.', '', $matches[4]));

    // Hours.
    $hh = (int) $matches[1];

    if ($meridiem === 'pm' && $hh < 12) {
        // 1pm-11pm become 13-23; 12pm stays 12.
        $hh += 12;
    } elseif ($meridiem === 'am' && $hh === 12) {
        // 12am is midnight, i.e. hour 00 on the 24-hour clock.
        $hh = 0;
    }

    $hh = str_pad((string) $hh, 2, '0', STR_PAD_LEFT);

    // Minutes.
    $mm = empty($matches[2]) ? '00' : $matches[2];

    // Seconds, only if supplied.
    if (empty($matches[3])) {
        return sprintf('%s:%s', $hh, $mm);
    }

    return sprintf('%s:%s:%s', $hh, $mm, $matches[3]);
}

/**
 * Microformats2 Parser
 *
 * A class which holds state for parsing microformats2 from HTML.
 *
 * Example usage:
 *
 * use Mf2;
 * $parser = new Mf2\Parser('

Barnaby Walters

'); * $output = $parser->parse(); */ class Parser { /** @var string The baseurl (if any) to use for this parse */ public $baseurl; /** @var DOMXPath object which can be used to query over any fragment*/ public $xpath; /** @var DOMDocument */ public $doc; /** @var SplObjectStorage */ protected $parsed; public $jsonMode; /** * Constructor * * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument * @param string $url The URL of the parsed document, for relative URL resolution * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON. */ public function __construct($input, $url = null, $jsonMode = false) { libxml_use_internal_errors(true); if (is_string($input)) { $doc = new DOMDocument(); @$doc->loadHTML(unicodeToHtmlEntities($input)); } elseif (is_a($input, 'DOMDocument')) { $doc = $input; } else { $doc = new DOMDocument(); @$doc->loadHTML(''); } $this->xpath = new DOMXPath($doc); $baseurl = $url; foreach ($this->xpath->query('//base[@href]') as $base) { $baseElementUrl = $base->getAttribute('href'); if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) { /* The base element URL is relative to the document URL. * * :/ * * Perhaps the author was high? */ $baseurl = resolveUrl($url, $baseElementUrl); } else { $baseurl = $baseElementUrl; } break; } // Ignore