feature #29306 [DomCrawler] Optionally use html5-php to parse HTML (tgalopin)

This PR was squashed before being merged into the 4.3-dev branch (closes #29306).

Discussion
----------

[DomCrawler] Optionally use html5-php to parse HTML

| Q             | A
| ------------- | ---
| Branch?       | master
| Bug fix?      | no
| New feature?  | yes
| BC breaks?    | no
| Deprecations? | no
| Tests pass?   | WIP
| Fixed tickets | https://github.com/symfony/symfony/issues/29280, https://github.com/symfony/symfony/issues/28596
| License       | MIT
| Doc PR        | https://github.com/symfony/symfony-docs/pull/10700

This PR introduces the possibility to parse HTML content in the Crawler using the html5-php library (https://github.com/Masterminds/html5-php). This provides better support for HTML5 and fixes many unexpected behaviors and inconsistencies of the native DOM extension.

Commits
-------

4050ec4257 [DomCrawler] Optionally use html5-php to parse HTML
This commit is contained in:
Fabien Potencier 2019-04-03 15:17:25 +02:00
commit be66c44079
9 changed files with 246 additions and 137 deletions

View File

@ -101,6 +101,7 @@
"doctrine/orm": "~2.4,>=2.4.5",
"doctrine/reflection": "~1.0",
"doctrine/doctrine-bundle": "~1.4",
"masterminds/html5": "^2.6",
"monolog/monolog": "~1.11",
"nyholm/psr7": "^1.0",
"ocramius/proxy-manager": "~0.4|~1.0|~2.0",
@ -112,6 +113,7 @@
"phpdocumentor/reflection-docblock": "^3.0|^4.0"
},
"conflict": {
"masterminds/html5": "<2.6",
"phpdocumentor/reflection-docblock": "<3.0||>=3.2.0,<3.2.2",
"phpdocumentor/type-resolver": "<0.3.0",
"phpunit/phpunit": "<5.4.3"

View File

@ -28,19 +28,19 @@ class SessionController implements ContainerAwareInterface
// new session case
if (!$session->has('name')) {
if (!$name) {
return new Response('You are new here and gave no name.');
return new Response('<html><body>You are new here and gave no name.</body></html>');
}
// remember name
$session->set('name', $name);
return new Response(sprintf('Hello %s, nice to meet you.', $name));
return new Response(sprintf('<html><body>Hello %s, nice to meet you.</body></html>', $name));
}
// existing session
$name = $session->get('name');
return new Response(sprintf('Welcome back %s, nice to meet you.', $name));
return new Response(sprintf('<html><body>Welcome back %s, nice to meet you.</body></html>', $name));
}
public function cacheableAction()
@ -55,7 +55,7 @@ class SessionController implements ContainerAwareInterface
{
$request->getSession()->invalidate();
return new Response('Session cleared.');
return new Response('<html><body>Session cleared.</body></html>');
}
public function setFlashAction(Request $request, $message)
@ -76,6 +76,6 @@ class SessionController implements ContainerAwareInterface
$output = 'No flash was set.';
}
return new Response($output);
return new Response('<html><body>'.$output.'</body></html>');
}
}

View File

@ -54,11 +54,11 @@ class LocalizedController implements ContainerAwareInterface
public function profileAction()
{
return new Response('Profile');
return new Response('<html><body>Profile</body></html>');
}
public function homepageAction()
{
return new Response('Homepage');
return new Response('<html><body>Homepage</body></html>');
}
}

View File

@ -8,6 +8,8 @@ CHANGELOG
and `CrawlerSelectorTextSame`
* Added return of element name (`_name`) in `extract()` method.
* Added ability to return a default value in `text()` and `html()` instead of throwing an exception when node is empty.
* When available, the [html5-php library](https://github.com/Masterminds/html5-php) is used to
parse HTML added to a Crawler for better support of HTML5 tags.
4.2.0
-----

View File

@ -11,6 +11,7 @@
namespace Symfony\Component\DomCrawler;
use Masterminds\HTML5;
use Symfony\Component\CssSelector\CssSelectorConverter;
/**
@ -55,15 +56,29 @@ class Crawler implements \Countable, \IteratorAggregate
private $isHtml = true;
/**
* @param mixed $node A Node to use as the base for the crawling
* @param string $uri The current URI
* @param string $baseHref The base href value
* @var HTML5|null
*/
public function __construct($node = null, string $uri = null, string $baseHref = null)
private $html5Parser;
/**
 * Builds a Crawler and selects the HTML parsing backend.
 *
 * The backend is chosen from $useHtml5Parser:
 *  - null:  use the html5-php parser only if the HTML5 class can be loaded;
 *  - true:  require the html5-php parser, failing when the class is missing;
 *  - false: never create the html5-php parser.
 *
 * @param mixed     $node           A Node to use as the base for the crawling
 * @param string    $uri            The current URI
 * @param string    $baseHref       The base href value (falls back to $uri when empty)
 * @param bool|null $useHtml5Parser Whether the Crawler should use the HTML5 parser or the native DOM parser
 *
 * @throws \LogicException When the HTML5 parser is explicitly requested but html5-php is not installed
 */
public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null)
{
    $this->uri = $uri;
    $this->baseHref = $baseHref ?: $uri;

    if (null === $useHtml5Parser) {
        // No explicit choice: opt in automatically when the library is available.
        $useHtml5Parser = class_exists(HTML5::class);
    } elseif ($useHtml5Parser && !class_exists(HTML5::class)) {
        throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".');
    }

    if ($useHtml5Parser) {
        $this->html5Parser = new HTML5(['disable_html_ns' => true]);
    }

    $this->add($node);
}
@ -183,29 +198,7 @@ class Crawler implements \Countable, \IteratorAggregate
*/
public function addHtmlContent($content, $charset = 'UTF-8')
{
$internalErrors = libxml_use_internal_errors(true);
$disableEntities = libxml_disable_entity_loader(true);
$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;
set_error_handler(function () { throw new \Exception(); });
try {
// Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
$content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
} catch (\Exception $e) {
}
restore_error_handler();
if ('' !== trim($content)) {
@$dom->loadHTML($content);
}
libxml_use_internal_errors($internalErrors);
libxml_disable_entity_loader($disableEntities);
$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
$this->addDocument($dom);
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
@ -608,6 +601,15 @@ class Crawler implements \Countable, \IteratorAggregate
throw new \InvalidArgumentException('The current node list is empty.');
}
if (null !== $this->html5Parser) {
$html = '';
foreach ($this->getNode(0)->childNodes as $child) {
$html .= $this->html5Parser->saveHTML($child);
}
return $html;
}
$html = '';
foreach ($this->getNode(0)->childNodes as $child) {
$html .= $child->ownerDocument->saveHTML($child);
@ -1112,6 +1114,53 @@ class Crawler implements \Countable, \IteratorAggregate
return $nodes;
}
/**
 * Parses HTML content with the html5-php parser stored in $this->html5Parser.
 *
 * The content is passed through convertToHtmlEntities() before being handed
 * to the parser.
 */
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $entityEncoded = $this->convertToHtmlEntities($htmlContent, $charset);

    return $this->html5Parser->parse($entityEncoded, [], $charset);
}
/**
 * Parses HTML content with the native DOM extension (\DOMDocument::loadHTML()).
 *
 * libxml errors are collected internally while parsing instead of being
 * emitted as PHP warnings; the previous libxml settings are restored before
 * returning. Blank content yields an empty document.
 */
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);

    $internalErrors = libxml_use_internal_errors(true);

    // libxml >= 2.9.0 no longer loads external entities by default and
    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so the
    // loader is only toggled on older libxml builds.
    $toggleEntityLoader = \LIBXML_VERSION < 20900;
    if ($toggleEntityLoader) {
        $disableEntities = libxml_disable_entity_loader(true);
    }

    $dom = new \DOMDocument('1.0', $charset);
    $dom->validateOnParse = true;

    if ('' !== trim($htmlContent)) {
        // Markup problems are reported through libxml's internal error buffer.
        @$dom->loadHTML($htmlContent);
    }

    libxml_use_internal_errors($internalErrors);
    if ($toggleEntityLoader) {
        libxml_disable_entity_loader($disableEntities);
    }

    return $dom;
}
/**
 * Convert charset to HTML-entities to ensure valid parsing.
 *
 * The conversion is attempted directly from $charset first. If that raises
 * a PHP error, the content is transcoded to UTF-8 with iconv() and the
 * entity conversion is retried from UTF-8. If the fallback fails as well,
 * the content is returned in whatever state it reached (the original input,
 * or the iconv() result when only the second conversion failed).
 *
 * @param string $htmlContent The raw content to convert
 * @param string $charset     The charset $htmlContent is declared to be in
 *
 * @return string The converted content, or the best-effort content on failure
 */
private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
// Turn any PHP error raised by the conversion functions into an exception so it can be caught below.
set_error_handler(function () { throw new \Exception(); });
try {
return mb_convert_encoding($htmlContent, 'HTML-ENTITIES', $charset);
} catch (\Exception $e) {
// Direct conversion failed: go through UTF-8 with iconv() and retry.
try {
$htmlContent = iconv($charset, 'UTF-8', $htmlContent);
$htmlContent = mb_convert_encoding($htmlContent, 'HTML-ENTITIES', 'UTF-8');
} catch (\Exception $e) {
// Deliberately ignored: the conversion is best-effort.
}
return $htmlContent;
} finally {
// Always remove the temporary handler, on every return path.
restore_error_handler();
}
}
/**
* @throws \InvalidArgumentException
*/

View File

@ -14,41 +14,50 @@ namespace Symfony\Component\DomCrawler\Tests;
use PHPUnit\Framework\TestCase;
use Symfony\Component\DomCrawler\Crawler;
class CrawlerTest extends TestCase
abstract class AbstractCrawlerTest extends TestCase
{
/**
* @param mixed $node
* @param string|null $uri
* @param string|null $baseHref
*
* @return Crawler
*/
abstract public function createCrawler($node = null, string $uri = null, string $baseHref = null);
public function testConstructor()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$this->assertCount(0, $crawler, '__construct() returns an empty crawler');
$doc = new \DOMDocument();
$node = $doc->createElement('test');
$crawler = new Crawler($node);
$crawler = $this->createCrawler($node);
$this->assertCount(1, $crawler, '__construct() takes a node as a first argument');
}
public function testGetUri()
{
$uri = 'http://symfony.com';
$crawler = new Crawler(null, $uri);
$crawler = $this->createCrawler(null, $uri);
$this->assertEquals($uri, $crawler->getUri());
}
public function testGetBaseHref()
{
$baseHref = 'http://symfony.com';
$crawler = new Crawler(null, null, $baseHref);
$crawler = $this->createCrawler(null, null, $baseHref);
$this->assertEquals($baseHref, $crawler->getBaseHref());
}
public function testAdd()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->add($this->createDomDocument());
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from a \DOMDocument');
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->add($this->createNodeList());
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from a \DOMNodeList');
@ -56,15 +65,15 @@ class CrawlerTest extends TestCase
foreach ($this->createNodeList() as $node) {
$list[] = $node;
}
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->add($list);
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from an array of nodes');
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->add($this->createNodeList()->item(0));
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from a \DOMNode');
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->add('<html><body>Foo</body></html>');
$this->assertEquals('Foo', $crawler->filterXPath('//body')->text(), '->add() adds nodes from a string');
}
@ -74,7 +83,7 @@ class CrawlerTest extends TestCase
*/
public function testAddInvalidType()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->add(1);
}
@ -90,7 +99,7 @@ class CrawlerTest extends TestCase
public function testAddHtmlContent()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addHtmlContent() adds nodes from an HTML string');
@ -98,7 +107,7 @@ class CrawlerTest extends TestCase
public function testAddHtmlContentWithBaseTag()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addHtmlContent('<html><head><base href="http://symfony.com"></head><a href="/contact"></a></html>', 'UTF-8');
@ -111,7 +120,7 @@ class CrawlerTest extends TestCase
*/
public function testAddHtmlContentCharset()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addHtmlContent('<html><div class="foo">Tiếng Việt</html>', 'UTF-8');
$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
@ -119,61 +128,27 @@ class CrawlerTest extends TestCase
public function testAddHtmlContentInvalidBaseTag()
{
$crawler = new Crawler(null, 'http://symfony.com');
$crawler = $this->createCrawler(null, 'http://symfony.com');
$crawler->addHtmlContent('<html><head><base target="_top"></head><a href="/contact"></a></html>', 'UTF-8');
$this->assertEquals('http://symfony.com/contact', current($crawler->filterXPath('//a')->links())->getUri(), '->addHtmlContent() correctly handles a non-existent base tag href attribute');
}
public function testAddHtmlContentUnsupportedCharset()
{
$crawler = new Crawler();
$crawler->addHtmlContent(file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');
$this->assertEquals('Žťčýů', $crawler->filterXPath('//p')->text());
}
/**
* @requires extension mbstring
*/
public function testAddHtmlContentCharsetGbk()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
//gbk encode of <html><p>中文</p></html>
$crawler->addHtmlContent(base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk');
$this->assertEquals('中文', $crawler->filterXPath('//p')->text());
}
public function testAddHtmlContentWithErrors()
{
$internalErrors = libxml_use_internal_errors(true);
$crawler = new Crawler();
$crawler->addHtmlContent(<<<'EOF'
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<nav><a href="#"><a href="#"></nav>
</body>
</html>
EOF
, 'UTF-8');
$errors = libxml_get_errors();
$this->assertCount(1, $errors);
$this->assertEquals("Tag nav invalid\n", $errors[0]->message);
libxml_clear_errors();
libxml_use_internal_errors($internalErrors);
}
public function testAddXmlContent()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addXmlContent('<html><div class="foo"></div></html>', 'UTF-8');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addXmlContent() adds nodes from an XML string');
@ -181,62 +156,39 @@ EOF
public function testAddXmlContentCharset()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addXmlContent('<html><div class="foo">Tiếng Việt</div></html>', 'UTF-8');
$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
}
public function testAddXmlContentWithErrors()
{
$internalErrors = libxml_use_internal_errors(true);
$crawler = new Crawler();
$crawler->addXmlContent(<<<'EOF'
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<nav><a href="#"><a href="#"></nav>
</body>
</html>
EOF
, 'UTF-8');
$this->assertGreaterThan(1, libxml_get_errors());
libxml_clear_errors();
libxml_use_internal_errors($internalErrors);
}
public function testAddContent()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string');
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8; dir=RTL');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string with extended content type');
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></html>');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() uses text/html as the default type');
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml; charset=UTF-8');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml');
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addContent('foo bar', 'text/plain');
$this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml');
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
}
@ -246,14 +198,14 @@ EOF
*/
public function testAddContentNonUtf8()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addContent(iconv('UTF-8', 'SJIS', '<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>'));
$this->assertEquals('日本語', $crawler->filterXPath('//body')->text(), '->addContent() can recognize "Shift_JIS" in html5 meta charset tag');
}
public function testAddDocument()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addDocument($this->createDomDocument());
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addDocument() adds nodes from a \DOMDocument');
@ -261,7 +213,7 @@ EOF
public function testAddNodeList()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addNodeList($this->createNodeList());
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addNodeList() adds nodes from a \DOMNodeList');
@ -274,7 +226,7 @@ EOF
$list[] = $node;
}
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addNodes($list);
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addNodes() adds nodes from an array of nodes');
@ -282,7 +234,7 @@ EOF
public function testAddNode()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addNode($this->createNodeList()->item(0));
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addNode() adds nodes from a \DOMNode');
@ -293,7 +245,7 @@ EOF
$doc = new \DOMDocument();
$node = $doc->createElement('test');
$crawler = new Crawler($node);
$crawler = $this->createCrawler($node);
$crawler->clear();
$this->assertCount(0, $crawler, '->clear() removes all the nodes from the crawler');
}
@ -361,7 +313,7 @@ EOF
public function testMissingAttrValueIsNull()
{
$crawler = new Crawler();
$crawler = $this->createCrawler();
$crawler->addContent('<html><div non-empty-attr="sample value" empty-attr=""></div></html>', 'text/html; charset=UTF-8');
$div = $crawler->filterXPath('//div');
@ -647,7 +599,7 @@ EOF
public function testFilterWithDefaultNamespaceOnly()
{
$crawler = new Crawler('<?xml version="1.0" encoding="UTF-8"?>
$crawler = $this->createCrawler('<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://localhost/foo</loc>
@ -731,7 +683,7 @@ EOF
</html>
HTML;
$crawler = new Crawler($html);
$crawler = $this->createCrawler($html);
$this->assertCount(1, $crawler->selectButton('Click \'Here\''));
}
@ -752,7 +704,7 @@ HTML;
</html>
HTML;
$crawler = new Crawler($html);
$crawler = $this->createCrawler($html);
$this->assertCount(1, $crawler->selectButton('Click "Here"'));
}
@ -824,7 +776,7 @@ HTML;
</html>
HTML;
$crawler = new Crawler($html);
$crawler = $this->createCrawler($html);
$filtered = $crawler->filterXPath("descendant-or-self::*[@id = 'login-form']");
$this->assertCount(0, $filtered->selectLink('Login'));
@ -841,7 +793,7 @@ HTML;
public function testChaining()
{
$crawler = new Crawler('<div name="a"><div name="b"><div name="c"></div></div></div>');
$crawler = $this->createCrawler('<div name="a"><div name="b"><div name="c"></div></div></div>');
$this->assertEquals('a', $crawler->filterXPath('//div')->filterXPath('div')->filterXPath('div')->attr('name'));
}
@ -1000,7 +952,7 @@ HTML;
}
try {
$crawler = new Crawler('<p></p>');
$crawler = $this->createCrawler('<p></p>');
$crawler->filter('p')->children();
$this->assertTrue(true, '->children() does not trigger a notice if the node has no children');
} catch (\PHPUnit\Framework\Error\Notice $e) {
@ -1029,7 +981,7 @@ HTML;
</html>
HTML;
$crawler = new Crawler($html);
$crawler = $this->createCrawler($html);
$foo = $crawler->filter('#foo');
$this->assertEquals(3, $foo->children()->count());
@ -1066,7 +1018,7 @@ HTML;
*/
public function testBaseTag($baseValue, $linkValue, $expectedUri, $currentUri = null, $description = '')
{
$crawler = new Crawler('<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
$this->assertEquals($expectedUri, $crawler->filterXPath('//a')->link()->getUri(), $description);
}
@ -1086,7 +1038,7 @@ HTML;
*/
public function testBaseTagWithForm($baseValue, $actionValue, $expectedUri, $currentUri = null, $description = null)
{
$crawler = new Crawler('<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
$this->assertEquals($expectedUri, $crawler->filterXPath('//button')->form()->getUri(), $description);
}
@ -1104,7 +1056,7 @@ HTML;
public function testCountOfNestedElements()
{
$crawler = new Crawler('<html><body><ul><li>List item 1<ul><li>Sublist item 1</li><li>Sublist item 2</ul></li></ul></body></html>');
$crawler = $this->createCrawler('<html><body><ul><li>List item 1<ul><li>Sublist item 1</li><li>Sublist item 2</ul></li></ul></body></html>');
$this->assertCount(1, $crawler->filter('li:contains("List item 1")'));
}
@ -1151,7 +1103,7 @@ HTML;
*/
public function testEvaluateThrowsAnExceptionIfDocumentIsEmpty()
{
(new Crawler())->evaluate('//form/input[1]');
$this->createCrawler()->evaluate('//form/input[1]');
}
/**
@ -1210,6 +1162,14 @@ HTML;
$crawlerChild->children();
}
/**
 * Checks that addHtmlContent() decodes a Windows-1250 fixture so that the
 * paragraph text is read back with the expected characters.
 */
public function testAddHtmlContentUnsupportedCharset()
{
    $fixture = file_get_contents(__DIR__.'/Fixtures/windows-1250.html');

    $crawler = $this->createCrawler();
    $crawler->addHtmlContent($fixture, 'Windows-1250');

    $paragraphText = $crawler->filterXPath('//p')->text();
    $this->assertEquals('Žťčýů', $paragraphText);
}
public function createTestCrawler($uri = null)
{
$dom = new \DOMDocument();
@ -1259,7 +1219,7 @@ HTML;
</html>
');
return new Crawler($dom, $uri);
return $this->createCrawler($dom, $uri);
}
protected function createTestXmlCrawler($uri = null)
@ -1276,7 +1236,7 @@ HTML;
<media:category label="Music" scheme="http://gdata.youtube.com/schemas/2007/categories.cat">Music</media:category>
</entry>';
return new Crawler($xml, $uri);
return $this->createCrawler($xml, $uri);
}
protected function createDomDocument()

View File

@ -0,0 +1,22 @@
<?php
/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Symfony\Component\DomCrawler\Tests;
use Symfony\Component\DomCrawler\Crawler;
/**
 * Runs the shared AbstractCrawlerTest suite against a Crawler that is
 * explicitly asked to use the html5-php parser.
 */
class Html5ParserCrawlerTest extends AbstractCrawlerTest
{
    /**
     * Creates the Crawler under test with the HTML5 parser flag set to true.
     *
     * @param mixed       $node
     * @param string|null $uri
     * @param string|null $baseHref
     *
     * @return Crawler
     */
    public function createCrawler($node = null, string $uri = null, string $baseHref = null)
    {
        $useHtml5Parser = true;

        return new Crawler($node, $uri, $baseHref, $useHtml5Parser);
    }
}

View File

@ -0,0 +1,70 @@
<?php
/*
* This file is part of the Symfony package.
*
* (c) Fabien Potencier <fabien@symfony.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Symfony\Component\DomCrawler\Tests;
use Symfony\Component\DomCrawler\Crawler;
/**
 * Runs the shared AbstractCrawlerTest suite against a Crawler that is
 * explicitly told not to use the html5-php parser, and adds the tests that
 * rely on libxml error reporting.
 */
class NativeParserCrawlerTest extends AbstractCrawlerTest
{
    /**
     * Creates the Crawler under test with the HTML5 parser flag set to false.
     *
     * @param mixed       $node
     * @param string|null $uri
     * @param string|null $baseHref
     *
     * @return Crawler
     */
    public function createCrawler($node = null, string $uri = null, string $baseHref = null)
    {
        return new Crawler($node, $uri, $baseHref, false);
    }

    /**
     * Invalid HTML must leave exactly one error in libxml's buffer for the caller to inspect.
     */
    public function testAddHtmlContentWithErrors()
    {
        $internalErrors = libxml_use_internal_errors(true);
        // Start from an empty buffer so errors left by earlier code cannot skew the count.
        libxml_clear_errors();

        try {
            $crawler = $this->createCrawler();
            $crawler->addHtmlContent(<<<'EOF'
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<nav><a href="#"><a href="#"></nav>
</body>
</html>
EOF
            , 'UTF-8');

            $errors = libxml_get_errors();
            $this->assertCount(1, $errors);
            $this->assertEquals("Tag nav invalid\n", $errors[0]->message);
        } finally {
            // Restore global libxml state even when an assertion fails.
            libxml_clear_errors();
            libxml_use_internal_errors($internalErrors);
        }
    }

    /**
     * Malformed XML must leave more than one error in libxml's buffer.
     */
    public function testAddXmlContentWithErrors()
    {
        $internalErrors = libxml_use_internal_errors(true);
        // Start from an empty buffer so errors left by earlier code cannot skew the count.
        libxml_clear_errors();

        try {
            $crawler = $this->createCrawler();
            $crawler->addXmlContent(<<<'EOF'
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<nav><a href="#"><a href="#"></nav>
</body>
</html>
EOF
            , 'UTF-8');

            // Compare the number of errors: comparing the array itself with an
            // integer is always true in PHP, which made the assertion vacuous.
            $this->assertGreaterThan(1, \count(libxml_get_errors()));
        } finally {
            // Restore global libxml state even when an assertion fails.
            libxml_clear_errors();
            libxml_use_internal_errors($internalErrors);
        }
    }
}

View File

@ -21,7 +21,11 @@
"symfony/polyfill-mbstring": "~1.0"
},
"require-dev": {
"symfony/css-selector": "~3.4|~4.0"
"symfony/css-selector": "~3.4|~4.0",
"masterminds/html5": "^2.6"
},
"conflict": {
"masterminds/html5": "<2.6"
},
"suggest": {
"symfony/css-selector": ""