[DomCrawler] Improve detection of when the Crawler needs the HTML5 parser
This commit is contained in:
parent
45fd75ea20
commit
9bbdab68ef
@ -61,24 +61,15 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
private $html5Parser;
|
||||
|
||||
/**
|
||||
* @param mixed $node A Node to use as the base for the crawling
|
||||
* @param string $uri The current URI
|
||||
* @param string $baseHref The base href value
|
||||
* @param bool|null $useHtml5Parser Whether the Crawler should use the HTML5 parser or the native DOM parser
|
||||
* @param mixed $node A Node to use as the base for the crawling
|
||||
* @param string $uri The current URI
|
||||
* @param string $baseHref The base href value
|
||||
*/
|
||||
public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null)
|
||||
public function __construct($node = null, string $uri = null, string $baseHref = null)
|
||||
{
|
||||
$this->uri = $uri;
|
||||
$this->baseHref = $baseHref ?: $uri;
|
||||
|
||||
if ($useHtml5Parser && !class_exists(HTML5::class)) {
|
||||
throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".');
|
||||
}
|
||||
|
||||
if ($useHtml5Parser ?? class_exists(HTML5::class)) {
|
||||
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
|
||||
}
|
||||
|
||||
$this->add($node);
|
||||
}
|
||||
|
||||
@ -198,6 +189,13 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
*/
|
||||
public function addHtmlContent($content, $charset = 'UTF-8')
|
||||
{
|
||||
// Use HTML5 parser if the content is HTML5 and the library is available
|
||||
if (!$this->html5Parser
|
||||
&& class_exists(HTML5::class)
|
||||
&& '<!doctype html>' === strtolower(substr(ltrim($content), 0, 15))) {
|
||||
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
|
||||
}
|
||||
|
||||
$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
|
||||
$this->addDocument($dom);
|
||||
|
||||
@ -1219,6 +1217,7 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
$crawler->isHtml = $this->isHtml;
|
||||
$crawler->document = $this->document;
|
||||
$crawler->namespaces = $this->namespaces;
|
||||
$crawler->html5Parser = $this->html5Parser;
|
||||
|
||||
return $crawler;
|
||||
}
|
||||
|
@ -16,14 +16,12 @@ use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
abstract class AbstractCrawlerTest extends TestCase
|
||||
{
|
||||
/**
|
||||
* @param mixed $node
|
||||
* @param string|null $uri
|
||||
* @param string|null $baseHref
|
||||
*
|
||||
* @return Crawler
|
||||
*/
|
||||
abstract public function createCrawler($node = null, string $uri = null, string $baseHref = null);
|
||||
abstract public function getDoctype(): string;
|
||||
|
||||
protected function createCrawler($node = null, string $uri = null, string $baseHref = null)
|
||||
{
|
||||
return new Crawler($node, $uri, $baseHref);
|
||||
}
|
||||
|
||||
public function testConstructor()
|
||||
{
|
||||
@ -74,7 +72,7 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from a \DOMNode');
|
||||
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->add('<html><body>Foo</body></html>');
|
||||
$crawler->add($this->getDoctype().'<html><body>Foo</body></html>');
|
||||
$this->assertEquals('Foo', $crawler->filterXPath('//body')->text(), '->add() adds nodes from a string');
|
||||
}
|
||||
|
||||
@ -94,13 +92,13 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
public function testAddMultipleDocumentNode()
|
||||
{
|
||||
$crawler = $this->createTestCrawler();
|
||||
$crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');
|
||||
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo"></html>', 'UTF-8');
|
||||
}
|
||||
|
||||
public function testAddHtmlContent()
|
||||
{
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');
|
||||
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo"></html>', 'UTF-8');
|
||||
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addHtmlContent() adds nodes from an HTML string');
|
||||
}
|
||||
@ -108,8 +106,7 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
public function testAddHtmlContentWithBaseTag()
|
||||
{
|
||||
$crawler = $this->createCrawler();
|
||||
|
||||
$crawler->addHtmlContent('<html><head><base href="http://symfony.com"></head><a href="/contact"></a></html>', 'UTF-8');
|
||||
$crawler->addHtmlContent($this->getDoctype().'<html><head><base href="http://symfony.com"></head><a href="/contact"></a></html>', 'UTF-8');
|
||||
|
||||
$this->assertEquals('http://symfony.com', $crawler->filterXPath('//base')->attr('href'), '->addHtmlContent() adds nodes from an HTML string');
|
||||
$this->assertEquals('http://symfony.com/contact', $crawler->filterXPath('//a')->link()->getUri(), '->addHtmlContent() adds nodes from an HTML string');
|
||||
@ -121,7 +118,7 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
public function testAddHtmlContentCharset()
|
||||
{
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addHtmlContent('<html><div class="foo">Tiếng Việt</html>', 'UTF-8');
|
||||
$crawler->addHtmlContent($this->getDoctype().'<html><div class="foo">Tiếng Việt</html>', 'UTF-8');
|
||||
|
||||
$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
|
||||
}
|
||||
@ -129,7 +126,7 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
public function testAddHtmlContentInvalidBaseTag()
|
||||
{
|
||||
$crawler = $this->createCrawler(null, 'http://symfony.com');
|
||||
$crawler->addHtmlContent('<html><head><base target="_top"></head><a href="/contact"></a></html>', 'UTF-8');
|
||||
$crawler->addHtmlContent($this->getDoctype().'<html><head><base target="_top"></head><a href="/contact"></a></html>', 'UTF-8');
|
||||
|
||||
$this->assertEquals('http://symfony.com/contact', current($crawler->filterXPath('//a')->links())->getUri(), '->addHtmlContent() correctly handles a non-existent base tag href attribute');
|
||||
}
|
||||
@ -141,7 +138,7 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
{
|
||||
$crawler = $this->createCrawler();
|
||||
// GBK encoding of <html><p>中文</p></html>
|
||||
$crawler->addHtmlContent(base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk');
|
||||
$crawler->addHtmlContent($this->getDoctype().base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk');
|
||||
|
||||
$this->assertEquals('中文', $crawler->filterXPath('//p')->text());
|
||||
}
|
||||
@ -149,7 +146,7 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
public function testAddXmlContent()
|
||||
{
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addXmlContent('<html><div class="foo"></div></html>', 'UTF-8');
|
||||
$crawler->addXmlContent($this->getDoctype().'<html><div class="foo"></div></html>', 'UTF-8');
|
||||
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addXmlContent() adds nodes from an XML string');
|
||||
}
|
||||
@ -157,7 +154,7 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
public function testAddXmlContentCharset()
|
||||
{
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addXmlContent('<html><div class="foo">Tiếng Việt</div></html>', 'UTF-8');
|
||||
$crawler->addXmlContent($this->getDoctype().'<html><div class="foo">Tiếng Việt</div></html>', 'UTF-8');
|
||||
|
||||
$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
|
||||
}
|
||||
@ -165,23 +162,23 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
public function testAddContent()
|
||||
{
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8');
|
||||
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>', 'text/html; charset=UTF-8');
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string');
|
||||
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8; dir=RTL');
|
||||
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>', 'text/html; charset=UTF-8; dir=RTL');
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string with extended content type');
|
||||
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div class="foo"></html>');
|
||||
$crawler->addContent($this->getDoctype().'<html><div class="foo"></html>');
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() uses text/html as the default type');
|
||||
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml; charset=UTF-8');
|
||||
$crawler->addContent($this->getDoctype().'<html><div class="foo"></div></html>', 'text/xml; charset=UTF-8');
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');
|
||||
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml');
|
||||
$crawler->addContent($this->getDoctype().'<html><div class="foo"></div></html>', 'text/xml');
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');
|
||||
|
||||
$crawler = $this->createCrawler();
|
||||
@ -189,7 +186,7 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
$this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml');
|
||||
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
|
||||
$crawler->addContent($this->getDoctype().'<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
|
||||
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
|
||||
}
|
||||
|
||||
@ -199,7 +196,7 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
public function testAddContentNonUtf8()
|
||||
{
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent(iconv('UTF-8', 'SJIS', '<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>'));
|
||||
$crawler->addContent(iconv('UTF-8', 'SJIS', $this->getDoctype().'<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>'));
|
||||
$this->assertEquals('日本語', $crawler->filterXPath('//body')->text(), '->addContent() can recognize "Shift_JIS" in html5 meta charset tag');
|
||||
}
|
||||
|
||||
@ -314,7 +311,7 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
public function testMissingAttrValueIsNull()
|
||||
{
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div non-empty-attr="sample value" empty-attr=""></div></html>', 'text/html; charset=UTF-8');
|
||||
$crawler->addContent($this->getDoctype().'<html><div non-empty-attr="sample value" empty-attr=""></div></html>', 'text/html; charset=UTF-8');
|
||||
$div = $crawler->filterXPath('//div');
|
||||
|
||||
$this->assertEquals('sample value', $div->attr('non-empty-attr'), '->attr() reads non-empty attributes correctly');
|
||||
@ -670,7 +667,6 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
public function testSelectButtonWithSingleQuotesInNameAttribute()
|
||||
{
|
||||
$html = <<<'HTML'
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<body>
|
||||
<div id="action">
|
||||
@ -683,7 +679,7 @@ abstract class AbstractCrawlerTest extends TestCase
|
||||
</html>
|
||||
HTML;
|
||||
|
||||
$crawler = $this->createCrawler($html);
|
||||
$crawler = $this->createCrawler($this->getDoctype().$html);
|
||||
|
||||
$this->assertCount(1, $crawler->selectButton('Click \'Here\''));
|
||||
}
|
||||
@ -691,7 +687,6 @@ HTML;
|
||||
public function testSelectButtonWithDoubleQuotesInNameAttribute()
|
||||
{
|
||||
$html = <<<'HTML'
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<body>
|
||||
<div id="action">
|
||||
@ -704,7 +699,7 @@ HTML;
|
||||
</html>
|
||||
HTML;
|
||||
|
||||
$crawler = $this->createCrawler($html);
|
||||
$crawler = $this->createCrawler($this->getDoctype().$html);
|
||||
|
||||
$this->assertCount(1, $crawler->selectButton('Click "Here"'));
|
||||
}
|
||||
@ -763,7 +758,6 @@ HTML;
|
||||
public function testSelectLinkAndLinkFiltered()
|
||||
{
|
||||
$html = <<<'HTML'
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<body>
|
||||
<div id="action">
|
||||
@ -776,7 +770,7 @@ HTML;
|
||||
</html>
|
||||
HTML;
|
||||
|
||||
$crawler = $this->createCrawler($html);
|
||||
$crawler = $this->createCrawler($this->getDoctype().$html);
|
||||
$filtered = $crawler->filterXPath("descendant-or-self::*[@id = 'login-form']");
|
||||
|
||||
$this->assertCount(0, $filtered->selectLink('Login'));
|
||||
@ -793,7 +787,7 @@ HTML;
|
||||
|
||||
public function testChaining()
|
||||
{
|
||||
$crawler = $this->createCrawler('<div name="a"><div name="b"><div name="c"></div></div></div>');
|
||||
$crawler = $this->createCrawler($this->getDoctype().'<div name="a"><div name="b"><div name="c"></div></div></div>');
|
||||
|
||||
$this->assertEquals('a', $crawler->filterXPath('//div')->filterXPath('div')->filterXPath('div')->attr('name'));
|
||||
}
|
||||
@ -965,7 +959,6 @@ HTML;
|
||||
public function testFilteredChildren()
|
||||
{
|
||||
$html = <<<'HTML'
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<body>
|
||||
<div id="foo">
|
||||
@ -981,7 +974,7 @@ HTML;
|
||||
</html>
|
||||
HTML;
|
||||
|
||||
$crawler = $this->createCrawler($html);
|
||||
$crawler = $this->createCrawler($this->getDoctype().$html);
|
||||
$foo = $crawler->filter('#foo');
|
||||
|
||||
$this->assertEquals(3, $foo->children()->count());
|
||||
@ -1018,7 +1011,7 @@ HTML;
|
||||
*/
|
||||
public function testBaseTag($baseValue, $linkValue, $expectedUri, $currentUri = null, $description = '')
|
||||
{
|
||||
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
|
||||
$crawler = $this->createCrawler($this->getDoctype().'<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
|
||||
$this->assertEquals($expectedUri, $crawler->filterXPath('//a')->link()->getUri(), $description);
|
||||
}
|
||||
|
||||
@ -1038,7 +1031,7 @@ HTML;
|
||||
*/
|
||||
public function testBaseTagWithForm($baseValue, $actionValue, $expectedUri, $currentUri = null, $description = null)
|
||||
{
|
||||
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
|
||||
$crawler = $this->createCrawler($this->getDoctype().'<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
|
||||
$this->assertEquals($expectedUri, $crawler->filterXPath('//button')->form()->getUri(), $description);
|
||||
}
|
||||
|
||||
@ -1113,7 +1106,7 @@ HTML;
|
||||
public function testInheritedClassCallChildrenWithoutArgument()
|
||||
{
|
||||
$dom = new \DOMDocument();
|
||||
$dom->loadHTML('
|
||||
$dom->loadHTML($this->getDoctype().'
|
||||
<html>
|
||||
<body>
|
||||
<a href="foo">Foo</a>
|
||||
@ -1165,7 +1158,7 @@ HTML;
|
||||
public function testAddHtmlContentUnsupportedCharset()
|
||||
{
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addHtmlContent(file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');
|
||||
$crawler->addHtmlContent($this->getDoctype().file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');
|
||||
|
||||
$this->assertEquals('Žťčýů', $crawler->filterXPath('//p')->text());
|
||||
}
|
||||
@ -1173,7 +1166,7 @@ HTML;
|
||||
public function createTestCrawler($uri = null)
|
||||
{
|
||||
$dom = new \DOMDocument();
|
||||
$dom->loadHTML('
|
||||
$dom->loadHTML($this->getDoctype().'
|
||||
<html>
|
||||
<body>
|
||||
<a href="foo">Foo</a>
|
||||
|
@ -11,12 +11,18 @@
|
||||
|
||||
namespace Symfony\Component\DomCrawler\Tests;
|
||||
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class Html5ParserCrawlerTest extends AbstractCrawlerTest
|
||||
{
|
||||
public function createCrawler($node = null, string $uri = null, string $baseHref = null)
|
||||
public function getDoctype(): string
|
||||
{
|
||||
return new Crawler($node, $uri, $baseHref, true);
|
||||
return '<!DOCTYPE html>';
|
||||
}
|
||||
|
||||
public function testAddHtml5()
|
||||
{
|
||||
// Ensure a bug specific to the DOM extension is fixed (see https://github.com/symfony/symfony/issues/28596)
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
|
||||
$this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
|
||||
}
|
||||
}
|
||||
|
@ -11,13 +11,11 @@
|
||||
|
||||
namespace Symfony\Component\DomCrawler\Tests;
|
||||
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class NativeParserCrawlerTest extends AbstractCrawlerTest
|
||||
{
|
||||
public function createCrawler($node = null, string $uri = null, string $baseHref = null)
|
||||
public function getDoctype(): string
|
||||
{
|
||||
return new Crawler($node, $uri, $baseHref, false);
|
||||
return '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
|
||||
}
|
||||
|
||||
public function testAddHtmlContentWithErrors()
|
||||
@ -26,7 +24,7 @@ class NativeParserCrawlerTest extends AbstractCrawlerTest
|
||||
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addHtmlContent(<<<'EOF'
|
||||
<!DOCTYPE html>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
@ -51,7 +49,7 @@ EOF
|
||||
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addXmlContent(<<<'EOF'
|
||||
<!DOCTYPE html>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
|
Reference in New Issue
Block a user