minor #31257 [DomCrawler] fix HTML5 parser integration (nicolas-grekas)

This PR was merged into the 4.3 branch.

Discussion
----------

[DomCrawler] fix HTML5 parser integration

| Q             | A
| ------------- | ---
| Branch?       | master
| Bug fix?      | yes
| New feature?  | no
| BC breaks?    | no
| Deprecations? | no
| Tests pass?   | yes
| Fixed tickets | -
| License       | MIT
| Doc PR        | -

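Spotted while reviewing #30892.
The current logic is context-dependent: the HTML5 parser is only instantiated once HTML5 content has been added, and it is then reused for every later call, so changing the order of calls can produce different behaviors.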

Commits
-------

ba83bdadb1 [DomCrawler] fix HTML5 parser integration
This commit is contained in:
Nicolas Grekas 2019-05-09 16:02:42 +02:00
commit 4f290d784c

View File

@@ -69,6 +69,7 @@ class Crawler implements \Countable, \IteratorAggregate
{
$this->uri = $uri;
$this->baseHref = $baseHref ?: $uri;
$this->html5Parser = class_exists(HTML5::class) ? new HTML5(['disable_html_ns' => true]) : null;
$this->add($node);
}
@@ -190,13 +191,7 @@ class Crawler implements \Countable, \IteratorAggregate
public function addHtmlContent($content, $charset = 'UTF-8')
{
// Use HTML5 parser if the content is HTML5 and the library is available
if (!$this->html5Parser
&& class_exists(HTML5::class)
&& '<!doctype html>' === strtolower(substr(ltrim($content), 0, 15))) {
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
}
$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
$dom = null !== $this->html5Parser && strspn($content, " \t\r\n") === stripos($content, '<!doctype html>') ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
$this->addDocument($dom);
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
@@ -599,18 +594,16 @@ class Crawler implements \Countable, \IteratorAggregate
throw new \InvalidArgumentException('The current node list is empty.');
}
if (null !== $this->html5Parser) {
$html = '';
foreach ($this->getNode(0)->childNodes as $child) {
$html .= $this->html5Parser->saveHTML($child);
}
$node = $this->getNode(0);
$owner = $node->ownerDocument;
return $html;
if (null !== $this->html5Parser && '<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
$owner = $this->html5Parser;
}
$html = '';
foreach ($this->getNode(0)->childNodes as $child) {
$html .= $child->ownerDocument->saveHTML($child);
foreach ($node->childNodes as $child) {
$html .= $owner->saveHTML($child);
}
return $html;