diff --git a/src/Symfony/Component/DomCrawler/Crawler.php b/src/Symfony/Component/DomCrawler/Crawler.php
index a45cc86cb2..208181648a 100644
--- a/src/Symfony/Component/DomCrawler/Crawler.php
+++ b/src/Symfony/Component/DomCrawler/Crawler.php
@@ -188,8 +188,7 @@ class Crawler implements \Countable, \IteratorAggregate
      */
     public function addHtmlContent($content, $charset = 'UTF-8')
     {
-        // Use HTML5 parser if the content is HTML5 and the library is available
-        $dom = null !== $this->html5Parser && strspn($content, " \t\r\n") === stripos($content, '<!doctype html>') ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
+        $dom = $this->parseHtmlString($content, $charset);
         $this->addDocument($dom);
 
         $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
@@ -1295,4 +1294,43 @@ class Crawler implements \Countable, \IteratorAggregate
 
         return new CssSelectorConverter($this->isHtml);
     }
+
+    /**
+     * Parses the given content into a DOMDocument, using the HTML5 parser when the library is
+     * available and the content is an HTML5 document; falls back to the libxml parser otherwise.
+     */
+    private function parseHtmlString(string $content, string $charset): \DOMDocument
+    {
+        if ($this->canParseHtml5String($content)) {
+            return $this->parseHtml5($content, $charset);
+        }
+
+        return $this->parseXhtml($content, $charset);
+    }
+
+    /**
+     * Tells whether the HTML5 parser is set and the content contains an HTML5 doctype that is
+     * preceded by nothing but an optional BOM, whitespace and HTML comments.
+     */
+    private function canParseHtml5String(string $content): bool
+    {
+        if (null === $this->html5Parser) {
+            return false;
+        }
+        if (false === ($pos = stripos($content, '<!doctype html>'))) {
+            return false;
+        }
+        $header = substr($content, 0, $pos);
+
+        return '' === $header || $this->isValidHtml5Heading($header);
+    }
+
+    /**
+     * Checks that the text preceding the doctype consists only of an optional BOM, whitespace
+     * and HTML comments. A comment containing ">" is not recognized by the pattern.
+     */
+    private function isValidHtml5Heading(string $heading): bool
+    {
+        return 1 === preg_match('/^\x{FEFF}?\s*(<!--[^>]*?-->\s*)*$/u', $heading);
+    }
 }
diff --git a/src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php b/src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php
index 8ada3794d7..a4aa7740dd 100644
--- a/src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php
+++ b/src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php
@@ -25,4 +25,58 @@ class Html5ParserCrawlerTest extends AbstractCrawlerTest
         $crawler->add($this->getDoctype().'<html><body><h1>Foo</h1></body></html>');
         $this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
     }
+
+    /** @dataProvider validHtml5Provider */
+    public function testHtml5ParserParseContentStartingWithValidHeading(string $content): void
+    {
+        $this->skipTestIfHTML5LibraryNotAvailable();
+
+        $crawler = $this->createCrawler();
+        $crawler->addHtmlContent($content);
+        self::assertEquals(
+            'Foo',
+            $crawler->filterXPath('//h1')->text(),
+            '->addHtmlContent() parses valid HTML with comment before doctype'
+        );
+    }
+
+    /** @dataProvider invalidHtml5Provider */
+    public function testHtml5ParserWithInvalidHeadedContent(string $content): void
+    {
+        $this->skipTestIfHTML5LibraryNotAvailable();
+
+        $crawler = $this->createCrawler();
+        $crawler->addHtmlContent($content);
+        self::assertEmpty($crawler->filterXPath('//h1')->text(), '->addHtmlContent failed as expected');
+    }
+
+    /** Documents whose doctype is preceded only by a BOM, whitespace and/or HTML comments. */
+    public function validHtml5Provider(): iterable
+    {
+        $html = $this->getDoctype().'<html><body><h1>Foo</h1></body></html>';
+        $BOM = \chr(0xEF).\chr(0xBB).\chr(0xBF);
+
+        yield 'BOM first' => [$BOM.$html];
+        yield 'Single comment' => ['<!-- comment -->'.$html];
+        yield 'Multiline comment' => ["<!-- \n multiline comment \n -->".$html];
+        yield 'Several comments' => ['<!--c--> <!--cc-->'.$html];
+        yield 'Whitespaces' => ['    '.$html];
+        yield 'All together' => [$BOM.'  '.'<!--c-->'.$html];
+    }
+
+    /** Documents with plain text before the doctype, alone or between HTML comments. */
+    public function invalidHtml5Provider(): iterable
+    {
+        $html = $this->getDoctype().'<html><body><h1>Foo</h1></body></html>';
+
+        yield 'Text' => ['hello world'.$html];
+        yield 'Text between comments' => ['<!--c--> test <!--cc-->'.$html];
+    }
+
+    private function skipTestIfHTML5LibraryNotAvailable(): void
+    {
+        if (!class_exists(\Masterminds\HTML5::class)) {
+            self::markTestSkipped('HTML5 library is not available');
+        }
+    }
 }