bug #37803 Fix for issue #37681 (Rav)

This PR was squashed before being merged into the 4.4 branch.

Discussion
----------

Fix for issue #37681

| Q             | A
| ------------- | ---
| Branch?       | 4.4
| Bug fix?      | yes
| New feature?  | no
| Deprecations? | no
| Tickets       | Fix #37681
| License       | MIT
| Doc PR        |

Allow a BOM character and comments before the `<!DOCTYPE html>` declaration when DomCrawler chooses a parser implementation.

Commits
-------

9bc249e0b9 Fix for issue #37681
This commit is contained in:
Fabien Potencier 2020-08-12 08:20:48 +02:00
commit 7e85a6a6c6
2 changed files with 84 additions and 2 deletions

View File

@ -188,8 +188,7 @@ class Crawler implements \Countable, \IteratorAggregate
*/
public function addHtmlContent($content, $charset = 'UTF-8')
{
// Use HTML5 parser if the content is HTML5 and the library is available
$dom = null !== $this->html5Parser && strspn($content, " \t\r\n") === stripos($content, '<!doctype html>') ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
$dom = $this->parseHtmlString($content, $charset);
$this->addDocument($dom);
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
@ -1295,4 +1294,35 @@ class Crawler implements \Countable, \IteratorAggregate
return new CssSelectorConverter($this->isHtml);
}
/**
 * Builds a DOMDocument from an HTML string.
 *
 * The HTML5 parser is used when canParseHtml5String() accepts the content;
 * the libxml-based parseXhtml() handles every other case.
 */
private function parseHtmlString(string $content, string $charset): \DOMDocument
{
    return $this->canParseHtml5String($content)
        ? $this->parseHtml5($content, $charset)
        : $this->parseXhtml($content, $charset);
}
/**
 * Tells whether the content should be handed to the HTML5 parser.
 *
 * This requires an HTML5 parser to be set and the first "<!doctype html>"
 * (matched case-insensitively) to be preceded either by nothing or by a
 * heading that isValidHtml5Heading() accepts.
 */
private function canParseHtml5String(string $content): bool
{
    if (null === $this->html5Parser) {
        return false;
    }

    $doctypeOffset = stripos($content, '<!doctype html>');
    if (false === $doctypeOffset) {
        return false;
    }

    // Offset 0 means the document starts with the doctype: there is no heading to validate.
    if (0 === $doctypeOffset) {
        return true;
    }

    return $this->isValidHtml5Heading(substr($content, 0, $doctypeOffset));
}
/**
 * Checks that the text found before the doctype is allowed to precede it.
 *
 * A heading is accepted when it consists of an optional BOM (U+FEFF) followed
 * by any mix of whitespace and HTML comments. Any other text makes it invalid.
 */
private function isValidHtml5Heading(string $heading): bool
{
    // A comment body runs up to the first "-->" and may itself contain ">"
    // (e.g. "<!-- a > b -->"), so it is matched as "any character that does not
    // start a '-->'" rather than "any character but '>'". The "s" modifier lets
    // the dot span multi-line comments.
    // preg_match() returns false on error (for instance when the subject is not
    // valid UTF-8 under the "u" modifier); the strict comparison treats that as
    // an invalid heading.
    return 1 === preg_match('/^\x{FEFF}?\s*(?:<!--(?:(?!-->).)*+-->\s*)*$/su', $heading);
}
}

View File

@ -25,4 +25,56 @@ class Html5ParserCrawlerTest extends AbstractCrawlerTest
$crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
$this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
}
/**
 * Content whose doctype is preceded by one of the headings supplied by the
 * provider must still yield "Foo" as the text of the //h1 node.
 *
 * @dataProvider validHtml5Provider
 */
public function testHtml5ParserParseContentStartingWithValidHeading(string $content): void
{
    $this->skipTestIfHTML5LibraryNotAvailable();

    $crawler = $this->createCrawler();
    $crawler->addHtmlContent($content);
    $headingText = $crawler->filterXPath('//h1')->text();

    self::assertEquals('Foo', $headingText, '->addHtmlContent() parses valid HTML with comment before doctype');
}
/**
 * Content with arbitrary text before the doctype must leave the //h1 node
 * with an empty text.
 *
 * NOTE(review): presumably the text is empty because the fallback parser does
 * not keep the <p> nested inside the <h1> - confirm against parseXhtml().
 *
 * @dataProvider invalidHtml5Provider
 */
public function testHtml5ParserWithInvalidHeadedContent(string $content): void
{
    $this->skipTestIfHTML5LibraryNotAvailable();

    $crawler = $this->createCrawler();
    $crawler->addHtmlContent($content);
    $headingText = $crawler->filterXPath('//h1')->text();

    self::assertEmpty($headingText, '->addHtmlContent failed as expected');
}
/**
 * Provides documents whose doctype is preceded by a BOM, whitespace,
 * comments, or a combination of them.
 */
public function validHtml5Provider(): iterable
{
    $document = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
    // UTF-8 encoded byte order mark.
    $bom = "\xEF\xBB\xBF";

    $headings = [
        'BOM first' => $bom,
        'Single comment' => '<!-- comment -->',
        'Multiline comment' => "<!-- \n multiline comment \n -->",
        'Several comments' => '<!--c--> <!--cc-->',
        'Whitespaces' => ' ',
        'All together' => $bom.' <!--c-->',
    ];

    foreach ($headings as $name => $heading) {
        yield $name => [$heading.$document];
    }
}
/**
 * Provides documents whose doctype is preceded by plain text, alone or
 * between comments.
 */
public function invalidHtml5Provider(): iterable
{
    $document = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';

    $headings = [
        'Text' => 'hello world',
        'Text between comments' => '<!--c--> test <!--cc-->',
    ];

    foreach ($headings as $name => $heading) {
        yield $name => [$heading.$document];
    }
}
/**
 * Marks the current test as skipped when the Masterminds\HTML5 class cannot be found.
 */
private function skipTestIfHTML5LibraryNotAvailable(): void
{
    if (class_exists(\Masterminds\HTML5::class)) {
        return;
    }

    self::markTestSkipped('HTML5 library is not available');
}
}