This PR was squashed before being merged into the 4.4 branch.
Discussion
----------
Fix for issue #37681
| Q | A
| ------------- | ---
| Branch? | 4.4
| Bug fix? | yes
| New feature? | no
| Deprecations? | no
| Tickets | Fix #37681
| License | MIT
| Doc PR |
Allow a BOM character, whitespace, and comments before the `<!DOCTYPE html>` declaration when DomCrawler chooses which parser implementation to use.
Commits
-------
9bc249e0b9
Fix for issue #37681
This commit is contained in:
commit
7e85a6a6c6
@ -188,8 +188,7 @@ class Crawler implements \Countable, \IteratorAggregate
|
|||||||
*/
|
*/
|
||||||
public function addHtmlContent($content, $charset = 'UTF-8')
|
public function addHtmlContent($content, $charset = 'UTF-8')
|
||||||
{
|
{
|
||||||
// Use HTML5 parser if the content is HTML5 and the library is available
|
$dom = $this->parseHtmlString($content, $charset);
|
||||||
$dom = null !== $this->html5Parser && strspn($content, " \t\r\n") === stripos($content, '<!doctype html>') ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
|
|
||||||
$this->addDocument($dom);
|
$this->addDocument($dom);
|
||||||
|
|
||||||
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
|
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
|
||||||
@ -1295,4 +1294,35 @@ class Crawler implements \Countable, \IteratorAggregate
|
|||||||
|
|
||||||
return new CssSelectorConverter($this->isHtml);
|
return new CssSelectorConverter($this->isHtml);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds a DOMDocument from an HTML string.
 *
 * The HTML5 parser is used when canParseHtml5String() accepts the content;
 * in every other case the libxml parser is used.
 */
private function parseHtmlString(string $content, string $charset): \DOMDocument
{
    return $this->canParseHtml5String($content)
        ? $this->parseHtml5($content, $charset)
        : $this->parseXhtml($content, $charset);
}
|
||||||
|
|
||||||
|
/**
 * Tells whether $content should be handed to the HTML5 parser.
 *
 * That is the case when an HTML5 parser is set and everything preceding the
 * first "<!doctype html>" (matched case-insensitively) is either empty or
 * accepted by isValidHtml5Heading().
 */
private function canParseHtml5String(string $content): bool
{
    if (null === $this->html5Parser) {
        return false;
    }

    $doctypeOffset = stripos($content, '<!doctype html>');
    if (false === $doctypeOffset) {
        return false;
    }

    if (0 === $doctypeOffset) {
        // Nothing precedes the doctype, so there is no heading to validate.
        return true;
    }

    return $this->isValidHtml5Heading(substr($content, 0, $doctypeOffset));
}
|
||||||
|
|
||||||
|
/**
 * Checks that the text found before the doctype may legitimately precede it.
 *
 * Accepted: an optional UTF-8 byte order mark, followed by any mix of
 * whitespace and HTML comments. Anything else (e.g. stray text, also between
 * two comments) is rejected.
 *
 * A comment runs up to the first "-->", so its text may contain ">"
 * characters (e.g. "<!-- a > b -->").
 */
private function isValidHtml5Heading(string $heading): bool
{
    // "(?:(?!-->).)*+" consumes the comment text one character at a time and
    // stops at the first "-->"; being possessive, it can never be stretched
    // over text lying between two comments. The "s" modifier lets "." match
    // newlines (multi-line comments); "u" makes \x{FEFF} match the UTF-8 BOM.
    return 1 === preg_match('/^\x{FEFF}?\s*(?:<!--(?:(?!-->).)*+-->\s*)*$/su', $heading);
}
|
||||||
}
|
}
|
||||||
|
@ -25,4 +25,56 @@ class Html5ParserCrawlerTest extends AbstractCrawlerTest
|
|||||||
$crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
|
$crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
|
||||||
$this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
|
$this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Content whose doctype is preceded only by a BOM, whitespace and/or comments
 * must be parsed so that the <h1> text is "Foo".
 *
 * @dataProvider validHtml5Provider
 */
public function testHtml5ParserParseContentStartingWithValidHeading(string $content): void
{
    $this->skipTestIfHTML5LibraryNotAvailable();

    $crawler = $this->createCrawler();
    $crawler->addHtmlContent($content);

    // The data sets cover BOM, whitespace and comments, so the failure
    // message names all of them rather than comments only.
    self::assertSame(
        'Foo',
        $crawler->filterXPath('//h1')->text(),
        '->addHtmlContent() parses valid HTML with BOM, whitespace or comments before doctype'
    );
}
|
||||||
|
|
||||||
|
/**
 * Content with anything other than an accepted heading before the doctype
 * must yield an empty <h1> text.
 *
 * @dataProvider invalidHtml5Provider
 */
public function testHtml5ParserWithInvalidHeadedContent(string $content): void
{
    $this->skipTestIfHTML5LibraryNotAvailable();

    $crawler = $this->createCrawler();
    $crawler->addHtmlContent($content);

    $headingText = $crawler->filterXPath('//h1')->text();

    self::assertEmpty($headingText, '->addHtmlContent failed as expected');
}
|
||||||
|
|
||||||
|
/**
 * Data sets whose doctype is preceded only by a BOM, whitespace and/or comments.
 */
public function validHtml5Provider(): iterable
{
    // UTF-8 byte order mark (bytes EF BB BF).
    $bom = "\xEF\xBB\xBF";
    $document = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';

    yield 'BOM first' => [$bom.$document];
    yield 'Single comment' => ['<!-- comment -->'.$document];
    yield 'Multiline comment' => ["<!-- \n multiline comment \n -->".$document];
    yield 'Several comments' => ['<!--c--> <!--cc-->'.$document];
    yield 'Whitespaces' => [' '.$document];
    yield 'All together' => [$bom.' '.'<!--c-->'.$document];
}
|
||||||
|
|
||||||
|
/**
 * Data sets in which plain text appears before the doctype.
 */
public function invalidHtml5Provider(): iterable
{
    $document = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';

    $prefixes = [
        'Text' => 'hello world',
        'Text between comments' => '<!--c--> test <!--cc-->',
    ];

    foreach ($prefixes as $label => $prefix) {
        yield $label => [$prefix.$document];
    }
}
|
||||||
|
|
||||||
|
/**
 * Marks the current test as skipped when the \Masterminds\HTML5 class cannot be found.
 */
private function skipTestIfHTML5LibraryNotAvailable(): void
{
    if (class_exists(\Masterminds\HTML5::class)) {
        return;
    }

    self::markTestSkipped('HTML5 library is not available');
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user