bug #37803 Fix for issue #37681 (Rav)

This PR was squashed before being merged into the 4.4 branch.

Discussion
----------

Fix for issue #37681

| Q             | A
| ------------- | ---
| Branch?       | 4.4
| Bug fix?      | yes
| New feature?  | no
| Deprecations? | no
| Tickets       | Fix #37681
| License       | MIT
| Doc PR        |

Allow a BOM character and comments before the `<!DOCTYPE html>` declaration in DomCrawler when choosing a parser implementation.

Commits
-------

9bc249e0b9 Fix for issue #37681
2020-08-12 08:20:48 +02:00 · 2020-08-12 08:20:48 +02:00 · 7e85a6a6c6
commit 7e85a6a6c6
parent 8761f80268 9bc249e0b9
2 changed files with 84 additions and 2 deletions
--- a/src/Symfony/Component/DomCrawler/Crawler.php
+++ b/src/Symfony/Component/DomCrawler/Crawler.php
@ -188,8 +188,7 @@ class Crawler implements \Countable, \IteratorAggregate
     */
    public function addHtmlContent($content, $charset = 'UTF-8')
    {
-        // Use HTML5 parser if the content is HTML5 and the library is available
+        $dom = $this->parseHtmlString($content, $charset);
-        $dom = null !== $this->html5Parser && strspn($content, " \t\r\n") === stripos($content, '<!doctype html>') ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
        $this->addDocument($dom);
        $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
@ -1295,4 +1294,35 @@ class Crawler implements \Countable, \IteratorAggregate
        return new CssSelectorConverter($this->isHtml);
    }
    /**
     * Turns an HTML string into a DOMDocument.
     *
     * Content accepted by canParseHtml5String() is handed to the HTML5 parser;
     * everything else goes through the libxml-based parser.
     */
    private function parseHtmlString(string $content, string $charset): \DOMDocument
    {
        return $this->canParseHtml5String($content)
            ? $this->parseHtml5($content, $charset)
            : $this->parseXhtml($content, $charset);
    }
    /**
     * Tells whether the given content should be handed to the HTML5 parser.
     *
     * That is the case when an HTML5 parser is set, the content contains a
     * "<!doctype html>" declaration (matched case-insensitively) and whatever
     * precedes its first occurrence is either empty or accepted by
     * isValidHtml5Heading().
     */
    private function canParseHtml5String(string $content): bool
    {
        if (null !== $this->html5Parser) {
            $doctypeOffset = stripos($content, '<!doctype html>');

            if (false !== $doctypeOffset) {
                // Everything that sits in front of the doctype declaration.
                $prefix = substr($content, 0, $doctypeOffset);

                return '' === $prefix ? true : $this->isValidHtml5Heading($prefix);
            }
        }

        return false;
    }
    /**
     * Checks that the text found before the doctype consists only of an
     * optional BOM, whitespace and HTML comments.
     *
     * A comment runs from "<!--" to the first following "-->" and may contain
     * any character in between, including ">" and line breaks. The atomic
     * group keeps a comment from being extended past its first "-->", so text
     * placed between two comments is still rejected.
     *
     * Returns false as well when preg_match() fails (e.g. the heading is not
     * valid UTF-8, which the "u" modifier requires).
     */
    private function isValidHtml5Heading(string $heading): bool
    {
        return 1 === preg_match('/^\x{FEFF}?\s*(?:<!--(?>.*?-->)\s*)*$/us', $heading);
    }
 }
--- a/src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php
+++ b/src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php
@ -25,4 +25,56 @@ class Html5ParserCrawlerTest extends AbstractCrawlerTest
        $crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
        $this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
    }
    /**
     * Content from validHtml5Provider must yield an <h1> whose text is "Foo".
     *
     * @dataProvider validHtml5Provider
     */
    public function testHtml5ParserParseContentStartingWithValidHeading(string $content): void
    {
        $this->skipTestIfHTML5LibraryNotAvailable();

        $crawler = $this->createCrawler();
        $crawler->addHtmlContent($content);

        $headingText = $crawler->filterXPath('//h1')->text();

        self::assertEquals('Foo', $headingText, '->addHtmlContent() parses valid HTML with comment before doctype');
    }
    /**
     * Content from invalidHtml5Provider must yield an <h1> with empty text.
     *
     * @dataProvider invalidHtml5Provider
     */
    public function testHtml5ParserWithInvalidHeadedContent(string $content): void
    {
        $this->skipTestIfHTML5LibraryNotAvailable();

        $crawler = $this->createCrawler();
        $crawler->addHtmlContent($content);

        $headingText = $crawler->filterXPath('//h1')->text();

        self::assertEmpty($headingText, '->addHtmlContent failed as expected');
    }
    /**
     * Documents whose doctype is preceded only by a BOM, whitespace and/or comments.
     */
    public function validHtml5Provider(): iterable
    {
        $document = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
        // UTF-8 byte order mark.
        $bom = "\xEF\xBB\xBF";

        return [
            'BOM first' => [$bom.$document],
            'Single comment' => ['<!-- comment -->'.$document],
            'Multiline comment' => ["<!-- \n multiline comment \n -->".$document],
            'Several comments' => ['<!--c--> <!--cc-->'.$document],
            'Whitespaces' => ['    '.$document],
            'All together' => [$bom.'  <!--c-->'.$document],
        ];
    }
    /**
     * Documents with plain text in front of the doctype declaration.
     */
    public function invalidHtml5Provider(): iterable
    {
        $document = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';

        yield from [
            'Text' => ['hello world'.$document],
            'Text between comments' => ['<!--c--> test <!--cc-->'.$document],
        ];
    }
    /**
     * Marks the running test as skipped when the Masterminds HTML5 class cannot be loaded.
     */
    private function skipTestIfHTML5LibraryNotAvailable(): void
    {
        if (class_exists(\Masterminds\HTML5::class)) {
            return;
        }

        self::markTestSkipped('HTML5 library is not available');
    }
 }