self::NAMESPACE_HTML,
        'svg' => self::NAMESPACE_SVG,
        'math' => self::NAMESPACE_MATHML,
    );

    /**
     * Holds the always available namespaces (which do not require an XMLNS declaration).
     *
     * @var array
     */
    protected $implicitNamespaces = array(
        'xml' => self::NAMESPACE_XML,
        'xmlns' => self::NAMESPACE_XMLNS,
        'xlink' => self::NAMESPACE_XLINK,
    );

    /**
     * Holds a stack of currently active namespaces.
     *
     * Index 0 is the innermost (current) prefix => URI map; the empty-string
     * key holds the default namespace.
     *
     * @var array
     */
    protected $nsStack = array();

    /**
     * Holds the number of namespaces declared by a node.
     *
     * Keyed by spl_object_hash() of the element; each value is
     * array(pushCount, element).
     *
     * @var array
     */
    protected $pushes = array();

    /**
     * Defined in 8.2.5.
     */
    const IM_INITIAL = 0;

    const IM_BEFORE_HTML = 1;

    const IM_BEFORE_HEAD = 2;

    const IM_IN_HEAD = 3;

    const IM_IN_HEAD_NOSCRIPT = 4;

    const IM_AFTER_HEAD = 5;

    const IM_IN_BODY = 6;

    const IM_TEXT = 7;

    const IM_IN_TABLE = 8;

    const IM_IN_TABLE_TEXT = 9;

    const IM_IN_CAPTION = 10;

    const IM_IN_COLUMN_GROUP = 11;

    const IM_IN_TABLE_BODY = 12;

    const IM_IN_ROW = 13;

    const IM_IN_CELL = 14;

    const IM_IN_SELECT = 15;

    const IM_IN_SELECT_IN_TABLE = 16;

    const IM_AFTER_BODY = 17;

    const IM_IN_FRAMESET = 18;

    const IM_AFTER_FRAMESET = 19;

    const IM_AFTER_AFTER_BODY = 20;

    const IM_AFTER_AFTER_FRAMESET = 21;

    const IM_IN_SVG = 22;

    const IM_IN_MATHML = 23;

    protected $options = array();

    protected $stack = array();

    protected $current; // Pointer in the tag hierarchy.

    protected $rules;

    protected $doc;

    protected $frag;

    protected $processor;

    protected $insertMode = 0;

    /**
     * Track if we are in an element that allows only inline child nodes.
     *
     * @var string|null
     */
    protected $onlyInline;

    /**
     * Quirks mode is enabled by default.
     * Any document that is missing the DT will be considered to be in quirks mode.
     */
    protected $quirks = true;

    protected $errors = array();

    /**
     * Set up the target document, the rules engine and the namespace stack.
     *
     * @param bool  $isFragment When true, nodes are appended to a new document
     *                          fragment and the insert mode starts at IM_IN_BODY.
     * @param array $options    Builder options. Keys read here: self::OPT_TARGET_DOC,
     *                          'encoding', self::OPT_IMPLICIT_NS and 'implicitNamespaces'.
     */
    public function __construct($isFragment = false, array $options = array())
    {
        $this->options = $options;

        if (isset($options[self::OPT_TARGET_DOC])) {
            $this->doc = $options[self::OPT_TARGET_DOC];
        } else {
            $impl = new \DOMImplementation();
            // XXX:
            // Create the doctype. For now, we are always creating HTML5
            // documents, and attempting to up-convert any older DTDs to HTML5.
            $dt = $impl->createDocumentType('html');
            // An empty qualified name creates a document without a root element.
            // Pass '' rather than null: the parameter is a string and null is
            // deprecated on PHP 8.1+.
            $this->doc = $impl->createDocument(null, '', $dt);
            $this->doc->encoding = !empty($options['encoding']) ? $options['encoding'] : 'UTF-8';
        }

        $this->errors = array();

        // Start at the document node itself, not at its document element.
        $this->current = $this->doc;

        // Create a rules engine for tags.
        $this->rules = new TreeBuildingRules();

        $implicitNS = array();
        if (isset($this->options[self::OPT_IMPLICIT_NS])) {
            $implicitNS = $this->options[self::OPT_IMPLICIT_NS];
        } elseif (isset($this->options['implicitNamespaces'])) {
            $implicitNS = $this->options['implicitNamespaces'];
        }

        // Fill $nsStack with the default HTML5 namespaces, plus the "implicitNamespaces" array taken from $options.
        // With "+", the leftmost array wins, so caller-supplied prefixes take precedence.
        array_unshift($this->nsStack, $implicitNS + array('' => self::NAMESPACE_HTML) + $this->implicitNamespaces);

        if ($isFragment) {
            $this->insertMode = static::IM_IN_BODY;
            $this->frag = $this->doc->createDocumentFragment();
            $this->current = $this->frag;
        }
    }

    /**
     * Get the document.
     */
    public function document()
    {
        return $this->doc;
    }

    /**
     * Get the DOM fragment for the body.
     *
     * The result is a single DOMDocumentFragment because a fragment may have
     * zero or more DOMNodes at its root. The fragment is only created when the
     * builder was constructed with $isFragment = true; otherwise this is null.
     *
     * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context
     *
     * @return \DOMDocumentFragment|null
     */
    public function fragment()
    {
        return $this->frag;
    }

    /**
     * Provide an instruction processor.
     *
     * This is used for handling Processor Instructions as they are
     * inserted. If omitted, PI's are inserted directly into the DOM tree.
     *
     * @param InstructionProcessor $proc
     */
    public function setInstructionProcessor(InstructionProcessor $proc)
    {
        $this->processor = $proc;
    }

    /**
     * Handle a DOCTYPE token.
     *
     * Only the quirks flag is recorded; $idType and $id are not used. A DOCTYPE
     * seen after the initial insert mode is reported as a parse error and
     * otherwise ignored.
     *
     * @param string $name
     * @param int    $idType
     * @param string $id
     * @param bool   $quirks
     */
    public function doctype($name, $idType = 0, $id = null, $quirks = false)
    {
        // This is used solely for setting quirks mode. Currently we don't
        // try to preserve the inbound DT. We convert it to HTML5.
        $this->quirks = $quirks;

        if ($this->insertMode > static::IM_INITIAL) {
            $this->parseError('Illegal placement of DOCTYPE tag. Ignoring: ' . $name);

            return;
        }

        $this->insertMode = static::IM_BEFORE_HTML;
    }

    /**
     * Process the start tag.
     *
     * Creates the element (in the namespace resolved from the namespace stack),
     * copies its attributes, inserts it into the tree and updates the insert mode.
     *
     * @todo - XMLNS namespace handling (we need to parse, even if it's not valid)
     *       - XLink, MathML and SVG namespace handling
     *       - Omission rules: 8.1.2.4 Optional tags
     *
     * @param string $name
     * @param array  $attributes
     * @param bool   $selfClosing
     *
     * @return int
     */
    public function startTag($name, $attributes = array(), $selfClosing = false)
    {
        $lname = $this->normalizeTagName($name);

        // Make sure we have an html element.
        if (!$this->doc->documentElement && 'html' !== $name && !$this->frag) {
            $this->startTag('html');
        }

        // Set quirks mode if we're at IM_INITIAL with no doctype.
        if ($this->insertMode === static::IM_INITIAL) {
            $this->quirks = true;
            $this->parseError('No DOCTYPE specified.');
        }

        // SPECIAL TAG HANDLING:
        // Spec says do this, and "don't ask."
        // find the spec where this is defined... looks problematic
        if ('image' === $name && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) {
            $name = 'img';
        }

        // Autoclose p tags where appropriate.
        if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) {
            $this->autoclose('p');
        }

        // Set insert mode:
        switch ($name) {
            case 'html':
                $this->insertMode = static::IM_BEFORE_HEAD;
                break;
            case 'head':
                if ($this->insertMode > static::IM_BEFORE_HEAD) {
                    $this->parseError('Unexpected head tag outside of head context.');
                } else {
                    $this->insertMode = static::IM_IN_HEAD;
                }
                break;
            case 'body':
                $this->insertMode = static::IM_IN_BODY;
                break;
            case 'svg':
                $this->insertMode = static::IM_IN_SVG;
                break;
            case 'math':
                $this->insertMode = static::IM_IN_MATHML;
                break;
            case 'noscript':
                if ($this->insertMode === static::IM_IN_HEAD) {
                    $this->insertMode = static::IM_IN_HEAD_NOSCRIPT;
                }
                break;
        }

        // Special case handling for SVG.
        if ($this->insertMode === static::IM_IN_SVG) {
            $lname = Elements::normalizeSvgElement($lname);
        }

        $pushes = 0;
        // When we find a tag that appears inside $nsRoots, we have to switch the default namespace.
        if (isset($this->nsRoots[$lname]) && $this->nsStack[0][''] !== $this->nsRoots[$lname]) {
            array_unshift($this->nsStack, array(
                '' => $this->nsRoots[$lname],
            ) + $this->nsStack[0]);
            ++$pushes;
        }
        $needsWorkaround = false;
        if (isset($this->options['xmlNamespaces']) && $this->options['xmlNamespaces']) {
            // When xmlNamespaces is true and we find an 'xmlns' or 'xmlns:*' attribute, we add a new item to the $nsStack.
            foreach ($attributes as $aName => $aVal) {
                if ('xmlns' === $aName) {
                    $needsWorkaround = $aVal;
                    array_unshift($this->nsStack, array(
                        '' => $aVal,
                    ) + $this->nsStack[0]);
                    ++$pushes;
                } elseif ('xmlns' === (($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : '')) {
                    array_unshift($this->nsStack, array(
                        substr($aName, $pos + 1) => $aVal,
                    ) + $this->nsStack[0]);
                    ++$pushes;
                }
            }
        }

        if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) {
            $this->autoclose($this->onlyInline);
            $this->onlyInline = null;
        }

        try {
            $prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : '';

            if (false !== $needsWorkaround) {
                // The element is built by parsing a tiny XML document, so the
                // namespace URIs must be escaped: they are literal values here
                // (the same values are handed to createElementNS() below), and a
                // raw '&', '"' or '<' would make the XML ill-formed.
                $nsDecl = 'xmlns="' . htmlspecialchars((string) $needsWorkaround, ENT_QUOTES, 'UTF-8') . '"';
                $prefixDecl = '';
                if (strlen($prefix) && isset($this->nsStack[0][$prefix])) {
                    $prefixDecl = "xmlns:$prefix=\"" . htmlspecialchars((string) $this->nsStack[0][$prefix], ENT_QUOTES, 'UTF-8') . '"';
                }
                $xml = "<$lname $nsDecl $prefixDecl/>";

                $frag = new \DOMDocument('1.0', 'UTF-8');
                if (!$frag->loadXML($xml) || null === $frag->documentElement) {
                    // Recover the same way as for an illegal name instead of
                    // handing null to importNode().
                    throw new \DOMException("Unable to build element <$lname>.");
                }

                $ele = $this->doc->importNode($frag->documentElement, true);
            } else {
                if (!isset($this->nsStack[0][$prefix]) || ('' === $prefix && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) {
                    $ele = $this->doc->createElement($lname);
                } else {
                    $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
                }
            }
        } catch (\DOMException $e) {
            $this->parseError("Illegal tag name: <$lname>. Replaced with <invalid>.");
            $ele = $this->doc->createElement('invalid');
        }

        if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) {
            $this->onlyInline = $lname;
        }

        // When we add some namespaces, we have to track them. Later, when the end tag is processed, we have to remove them.
        // When we are on a void tag, we do not need to care about namespace nesting.
        if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) {
            // PHP tends to free the memory used by DOM,
            // to avoid spl_object_hash collisions we have to avoid garbage collection of $ele by storing it into $pushes
            // see https://bugs.php.net/bug.php?id=67459
            $this->pushes[spl_object_hash($ele)] = array($pushes, $ele);
        }

        foreach ($attributes as $aName => $aVal) {
            // xmlns attributes can't be set
            if ('xmlns' === $aName) {
                continue;
            }

            if ($this->insertMode === static::IM_IN_SVG) {
                $aName = Elements::normalizeSvgAttribute($aName);
            } elseif ($this->insertMode === static::IM_IN_MATHML) {
                $aName = Elements::normalizeMathMlAttribute($aName);
            }

            try {
                $prefix = ($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : false;

                if ('xmlns' === $prefix) {
                    $ele->setAttributeNS(self::NAMESPACE_XMLNS, $aName, $aVal);
                } elseif (false !== $prefix && isset($this->nsStack[0][$prefix])) {
                    $ele->setAttributeNS($this->nsStack[0][$prefix], $aName, $aVal);
                } else {
                    $ele->setAttribute($aName, $aVal);
                }
            } catch (\DOMException $e) {
                $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName");
                continue;
            }

            // This is necessary on a non-DTD schema, like HTML5.
            if ('id' === $aName) {
                $ele->setIdAttribute('id', true);
            }
        }

        if ($this->frag !== $this->current && $this->rules->hasRules($name)) {
            // Some elements have special processing rules. Handle those separately.
            $this->current = $this->rules->evaluate($ele, $this->current);
        } else {
            // Otherwise, it's a standard element.
            $this->current->appendChild($ele);

            if (!Elements::isA($name, Elements::VOID_TAG)) {
                $this->current = $ele;
            }

            // Self-closing tags should only be respected on foreign elements
            // (and are implied on void elements)
            // See: https://www.w3.org/TR/html5/syntax.html#start-tags
            if (Elements::isHtml5Element($name)) {
                $selfClosing = false;
            }
        }

        // This is sort of a last-ditch attempt to correct for cases where no head/body
        // elements are provided.
        if ($this->insertMode <= static::IM_BEFORE_HEAD && 'head' !== $name && 'html' !== $name) {
            $this->insertMode = static::IM_IN_BODY;
        }

        // When we are on a void tag, we do not need to care about namespace nesting,
        // but we have to remove the namespaces pushed to $nsStack.
        if ($pushes > 0 && Elements::isA($name, Elements::VOID_TAG)) {
            // remove the namespaces defined by the current node
            for ($i = 0; $i < $pushes; ++$i) {
                array_shift($this->nsStack);
            }
        }

        if ($selfClosing) {
            $this->endTag($name);
        }

        // Return the element mask, which the tokenizer can then use to set
        // various processing rules.
        return Elements::element($name);
    }

    /**
     * Process an end tag.
     *
     * Pops the namespaces declared by the current element, moves the tree
     * pointer to the parent of the closest matching open element and updates
     * the insert mode for head, body, svg and math.
     *
     * @param string $name
     */
    public function endTag($name)
    {
        $lname = $this->normalizeTagName($name);

        // Ignore closing tags for unary elements.
        if (Elements::isA($name, Elements::VOID_TAG)) {
            return;
        }

        if ($this->insertMode <= static::IM_BEFORE_HTML) {
            // 8.2.5.4.2
            if (in_array($name, array(
                'html',
                'br',
                'head',
                'title',
            ), true)) {
                // startTag('html') moves the insert mode past IM_BEFORE_HTML,
                // so the nested endTag() call does not come back here.
                $this->startTag('html');
                $this->endTag($name);
                $this->insertMode = static::IM_BEFORE_HEAD;

                return;
            }

            // Ignore the tag.
            $this->parseError('Illegal closing tag at global scope.');

            return;
        }

        // Special case handling for SVG.
        if ($this->insertMode === static::IM_IN_SVG) {
            $lname = Elements::normalizeSvgElement($lname);
        }

        $cid = spl_object_hash($this->current);

        // XXX: HTML has no parent. What do we do, though,
        // if this element appears in the wrong place?
        if ('html' === $lname) {
            return;
        }

        // remove the namespaces defined by the current node
        if (isset($this->pushes[$cid])) {
            for ($i = 0; $i < $this->pushes[$cid][0]; ++$i) {
                array_shift($this->nsStack);
            }
            unset($this->pushes[$cid]);
        }

        if (!$this->autoclose($lname)) {
            $this->parseError('Could not find closing tag for ' . $lname);
        }

        switch ($lname) {
            case 'head':
                $this->insertMode = static::IM_AFTER_HEAD;
                break;
            case 'body':
                $this->insertMode = static::IM_AFTER_BODY;
                break;
            case 'svg':
            case 'math': // The MathML root element; this is the name startTag() switches on.
            case 'mathml': // Kept for backward compatibility.
                $this->insertMode = static::IM_IN_BODY;
                break;
        }
    }

    /**
     * Append a comment node at the current position.
     *
     * @param string $cdata
     */
    public function comment($cdata)
    {
        // TODO: Need to handle case where comment appears outside of the HTML tag.
        $node = $this->doc->createComment($cdata);
        $this->current->appendChild($node);
    }

    /**
     * Append a text node at the current position.
     *
     * Text that arrives before the head insert mode is dropped; a parse error
     * is recorded unless it consists only of whitespace.
     *
     * @param string $data
     */
    public function text($data)
    {
        // XXX: Hmmm.... should we really be this strict?
        if ($this->insertMode < static::IM_IN_HEAD) {
            // Per '8.2.5.4.3 The "before head" insertion mode' the characters
            // " \t\n\r\f" should be ignored but no mention of a parse error. This is
            // practical as most documents contain these characters. Other text is not
            // expected here so recording a parse error is necessary.
            $dataTmp = trim($data, " \t\n\r\f");
            // Compare against '' explicitly: empty() would also treat the text "0" as blank.
            if ('' !== $dataTmp) {
                $this->parseError('Unexpected text. Ignoring: ' . $dataTmp);
            }

            return;
        }

        $node = $this->doc->createTextNode($data);
        $this->current->appendChild($node);
    }

    /**
     * Handle end of input. Currently a no-op.
     */
    public function eof()
    {
        // If the $current isn't the $root, do we need to do anything?
    }

    /**
     * Record a parse error.
     *
     * @param string $msg
     * @param int    $line
     * @param int    $col
     */
    public function parseError($msg, $line = 0, $col = 0)
    {
        $this->errors[] = sprintf('Line %d, Col %d: %s', $line, $col, $msg);
    }

    /**
     * Get the parse errors recorded so far.
     *
     * @return array List of formatted error strings.
     */
    public function getErrors()
    {
        return $this->errors;
    }

    /**
     * Append a CDATA section at the current position.
     *
     * @param string $data
     */
    public function cdata($data)
    {
        $node = $this->doc->createCDATASection($data);
        $this->current->appendChild($node);
    }

    /**
     * Handle a processing instruction.
     *
     * If an instruction processor was registered it receives the PI and may
     * return a new current node; otherwise a PI node is appended to the tree.
     *
     * @param string      $name
     * @param string|null $data
     */
    public function processingInstruction($name, $data = null)
    {
        // XXX: Ignore initial XML declaration, per the spec.
        if ($this->insertMode === static::IM_INITIAL && 'xml' === strtolower($name)) {
            return;
        }

        // Important: The processor may modify the current DOM tree however it sees fit.
        if ($this->processor instanceof InstructionProcessor) {
            $res = $this->processor->process($this->current, $name, $data);
            if (!empty($res)) {
                $this->current = $res;
            }

            return;
        }

        // Otherwise, this is just a dumb PI element.
        $node = $this->doc->createProcessingInstruction($name, $data);

        $this->current->appendChild($node);
    }

    // ==========================================================================
    // UTILITIES
    // ==========================================================================

    /**
     * Apply normalization rules to a tag name.
     * See sections 2.9 and 8.1.2.
     *
     * @param string $tagName
     *
     * @return string The normalized tag name.
     */
    protected function normalizeTagName($tagName)
    {
        // Section 2.9 suggests that we should not strip a namespace prefix
        // (everything up to the last ':') from the name, so the tag name is
        // returned unchanged.
        return $tagName;
    }

    /**
     * Placeholder for quirks-mode tree resolution.
     *
     * @param string $name
     *
     * @throws \Exception Always; this is not implemented.
     */
    protected function quirksTreeResolver($name)
    {
        throw new \Exception('Not implemented.');
    }

    /**
     * Automatically climb the tree and close the closest node with the matching $tag.
     *
     * On a match the tree pointer moves to the parent of the matching element.
     * The climb stops, without changing the pointer, at the first non-element node.
     *
     * @param string $tagName
     *
     * @return bool
     */
    protected function autoclose($tagName)
    {
        $working = $this->current;
        do {
            if (XML_ELEMENT_NODE !== $working->nodeType) {
                return false;
            }
            if ($working->tagName === $tagName) {
                $this->current = $working->parentNode;

                return true;
            }
        } while ($working = $working->parentNode);

        return false;
    }

    /**
     * Checks if the given tagname is an ancestor of the present candidate.
     *
     * If $this->current or anything above $this->current matches the given tag
     * name, this returns true.
     *
     * @param string $tagName
     *
     * @return bool
     */
    protected function isAncestor($tagName)
    {
        $candidate = $this->current;
        // parentNode is null at the top of a detached subtree, so check for
        // that before reading nodeType.
        while (null !== $candidate && XML_ELEMENT_NODE === $candidate->nodeType) {
            if ($candidate->tagName === $tagName) {
                return true;
            }
            $candidate = $candidate->parentNode;
        }

        return false;
    }

    /**
     * Returns true if the immediate parent element is of the given tagname.
     *
     * @param string $tagName
     *
     * @return bool
     */
    protected function isParent($tagName)
    {
        // The pointer may be the document or a fragment, which have no tagName.
        return XML_ELEMENT_NODE === $this->current->nodeType
            && $this->current->tagName === $tagName;
    }
}