feature #29306 [DomCrawler] Optionally use html5-php to parse HTML (tgalopin)
This PR was squashed before being merged into the 4.3-dev branch (closes #29306).
Discussion
----------
[DomCrawler] Optionally use html5-php to parse HTML
| Q | A
| ------------- | ---
| Branch? | master
| Bug fix? | no
| New feature? | yes
| BC breaks? | no
| Deprecations? | no
| Tests pass? | WIP
| Fixed tickets | https://github.com/symfony/symfony/issues/29280, https://github.com/symfony/symfony/issues/28596
| License | MIT
| Doc PR | https://github.com/symfony/symfony-docs/pull/10700
This PR introduces the ability to parse HTML content in the Crawler using the html5-php library (https://github.com/Masterminds/html5-php). This allows for better support of HTML5 and fixes many unexpected behaviors and inconsistencies of the native DOM extension.
Commits
-------
4050ec4257
[DomCrawler] Optionally use html5-php to parse HTML
This commit is contained in:
commit
be66c44079
@ -101,6 +101,7 @@
|
||||
"doctrine/orm": "~2.4,>=2.4.5",
|
||||
"doctrine/reflection": "~1.0",
|
||||
"doctrine/doctrine-bundle": "~1.4",
|
||||
"masterminds/html5": "^2.6",
|
||||
"monolog/monolog": "~1.11",
|
||||
"nyholm/psr7": "^1.0",
|
||||
"ocramius/proxy-manager": "~0.4|~1.0|~2.0",
|
||||
@ -112,6 +113,7 @@
|
||||
"phpdocumentor/reflection-docblock": "^3.0|^4.0"
|
||||
},
|
||||
"conflict": {
|
||||
"masterminds/html5": "<2.6",
|
||||
"phpdocumentor/reflection-docblock": "<3.0||>=3.2.0,<3.2.2",
|
||||
"phpdocumentor/type-resolver": "<0.3.0",
|
||||
"phpunit/phpunit": "<5.4.3"
|
||||
|
@ -28,19 +28,19 @@ class SessionController implements ContainerAwareInterface
|
||||
// new session case
|
||||
if (!$session->has('name')) {
|
||||
if (!$name) {
|
||||
return new Response('You are new here and gave no name.');
|
||||
return new Response('<html><body>You are new here and gave no name.</body></html>');
|
||||
}
|
||||
|
||||
// remember name
|
||||
$session->set('name', $name);
|
||||
|
||||
return new Response(sprintf('Hello %s, nice to meet you.', $name));
|
||||
return new Response(sprintf('<html><body>Hello %s, nice to meet you.</body></html>', $name));
|
||||
}
|
||||
|
||||
// existing session
|
||||
$name = $session->get('name');
|
||||
|
||||
return new Response(sprintf('Welcome back %s, nice to meet you.', $name));
|
||||
return new Response(sprintf('<html><body>Welcome back %s, nice to meet you.</body></html>', $name));
|
||||
}
|
||||
|
||||
public function cacheableAction()
|
||||
@ -55,7 +55,7 @@ class SessionController implements ContainerAwareInterface
|
||||
{
|
||||
$request->getSession()->invalidate();
|
||||
|
||||
return new Response('Session cleared.');
|
||||
return new Response('<html><body>Session cleared.</body></html>');
|
||||
}
|
||||
|
||||
public function setFlashAction(Request $request, $message)
|
||||
@ -76,6 +76,6 @@ class SessionController implements ContainerAwareInterface
|
||||
$output = 'No flash was set.';
|
||||
}
|
||||
|
||||
return new Response($output);
|
||||
return new Response('<html><body>'.$output.'</body></html>');
|
||||
}
|
||||
}
|
||||
|
@ -54,11 +54,11 @@ class LocalizedController implements ContainerAwareInterface
|
||||
|
||||
public function profileAction()
|
||||
{
|
||||
return new Response('Profile');
|
||||
return new Response('<html><body>Profile</body></html>');
|
||||
}
|
||||
|
||||
public function homepageAction()
|
||||
{
|
||||
return new Response('Homepage');
|
||||
return new Response('<html><body>Homepage</body></html>');
|
||||
}
|
||||
}
|
||||
|
@ -8,6 +8,8 @@ CHANGELOG
|
||||
and `CrawlerSelectorTextSame`
|
||||
* Added return of element name (`_name`) in `extract()` method.
|
||||
* Added ability to return a default value in `text()` and `html()` instead of throwing an exception when node is empty.
|
||||
* When available, the [html5-php library](https://github.com/Masterminds/html5-php) is used to
|
||||
parse HTML added to a Crawler for better support of HTML5 tags.
|
||||
|
||||
4.2.0
|
||||
-----
|
||||
|
@ -11,6 +11,7 @@
|
||||
|
||||
namespace Symfony\Component\DomCrawler;
|
||||
|
||||
use Masterminds\HTML5;
|
||||
use Symfony\Component\CssSelector\CssSelectorConverter;
|
||||
|
||||
/**
|
||||
@ -55,15 +56,29 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
private $isHtml = true;
|
||||
|
||||
/**
|
||||
* @param mixed $node A Node to use as the base for the crawling
|
||||
* @param string $uri The current URI
|
||||
* @param string $baseHref The base href value
|
||||
* @var HTML5|null
|
||||
*/
|
||||
public function __construct($node = null, string $uri = null, string $baseHref = null)
|
||||
private $html5Parser;
|
||||
|
||||
/**
|
||||
* @param mixed $node A Node to use as the base for the crawling
|
||||
* @param string $uri The current URI
|
||||
* @param string $baseHref The base href value
|
||||
* @param bool|null $useHtml5Parser Whether the Crawler should use the HTML5 parser or the native DOM parser
|
||||
*/
|
||||
public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null)
|
||||
{
|
||||
$this->uri = $uri;
|
||||
$this->baseHref = $baseHref ?: $uri;
|
||||
|
||||
if ($useHtml5Parser && !class_exists(HTML5::class)) {
|
||||
throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".');
|
||||
}
|
||||
|
||||
if ($useHtml5Parser ?? class_exists(HTML5::class)) {
|
||||
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
|
||||
}
|
||||
|
||||
$this->add($node);
|
||||
}
|
||||
|
||||
@ -183,29 +198,7 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
*/
|
||||
public function addHtmlContent($content, $charset = 'UTF-8')
|
||||
{
|
||||
$internalErrors = libxml_use_internal_errors(true);
|
||||
$disableEntities = libxml_disable_entity_loader(true);
|
||||
|
||||
$dom = new \DOMDocument('1.0', $charset);
|
||||
$dom->validateOnParse = true;
|
||||
|
||||
set_error_handler(function () { throw new \Exception(); });
|
||||
|
||||
try {
|
||||
// Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
|
||||
$content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
|
||||
} catch (\Exception $e) {
|
||||
}
|
||||
|
||||
restore_error_handler();
|
||||
|
||||
if ('' !== trim($content)) {
|
||||
@$dom->loadHTML($content);
|
||||
}
|
||||
|
||||
libxml_use_internal_errors($internalErrors);
|
||||
libxml_disable_entity_loader($disableEntities);
|
||||
|
||||
$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
|
||||
$this->addDocument($dom);
|
||||
|
||||
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
|
||||
@ -608,6 +601,15 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
throw new \InvalidArgumentException('The current node list is empty.');
|
||||
}
|
||||
|
||||
if (null !== $this->html5Parser) {
|
||||
$html = '';
|
||||
foreach ($this->getNode(0)->childNodes as $child) {
|
||||
$html .= $this->html5Parser->saveHTML($child);
|
||||
}
|
||||
|
||||
return $html;
|
||||
}
|
||||
|
||||
$html = '';
|
||||
foreach ($this->getNode(0)->childNodes as $child) {
|
||||
$html .= $child->ownerDocument->saveHTML($child);
|
||||
@ -1112,6 +1114,53 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
return $nodes;
|
||||
}
|
||||
|
||||
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
|
||||
{
|
||||
return $this->html5Parser->parse($this->convertToHtmlEntities($htmlContent, $charset), [], $charset);
|
||||
}
|
||||
|
||||
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
|
||||
{
|
||||
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
|
||||
|
||||
$internalErrors = libxml_use_internal_errors(true);
|
||||
$disableEntities = libxml_disable_entity_loader(true);
|
||||
|
||||
$dom = new \DOMDocument('1.0', $charset);
|
||||
$dom->validateOnParse = true;
|
||||
|
||||
if ('' !== trim($htmlContent)) {
|
||||
@$dom->loadHTML($htmlContent);
|
||||
}
|
||||
|
||||
libxml_use_internal_errors($internalErrors);
|
||||
libxml_disable_entity_loader($disableEntities);
|
||||
|
||||
return $dom;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert charset to HTML-entities to ensure valid parsing.
|
||||
*/
|
||||
private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
|
||||
{
|
||||
set_error_handler(function () { throw new \Exception(); });
|
||||
|
||||
try {
|
||||
return mb_convert_encoding($htmlContent, 'HTML-ENTITIES', $charset);
|
||||
} catch (\Exception $e) {
|
||||
try {
|
||||
$htmlContent = iconv($charset, 'UTF-8', $htmlContent);
|
||||
$htmlContent = mb_convert_encoding($htmlContent, 'HTML-ENTITIES', 'UTF-8');
|
||||
} catch (\Exception $e) {
|
||||
}
|
||||
|
||||
return $htmlContent;
|
||||
} finally {
|
||||
restore_error_handler();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws \InvalidArgumentException
|
||||
*/
|
||||
|
@ -14,41 +14,50 @@ namespace Symfony\Component\DomCrawler\Tests;
|
||||
use PHPUnit\Framework\TestCase;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class CrawlerTest extends TestCase
|
||||
abstract class AbstractCrawlerTest extends TestCase
|
||||
{
|
||||
/**
|
||||
* @param mixed $node
|
||||
* @param string|null $uri
|
||||
* @param string|null $baseHref
|
||||
*
|
||||
* @return Crawler
|
||||
*/
|
||||
abstract public function createCrawler($node = null, string $uri = null, string $baseHref = null);
|
||||
|
||||
public function testConstructor()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$this->assertCount(0, $crawler, '__construct() returns an empty crawler');
|
||||
|
||||
$doc = new \DOMDocument();
|
||||
$node = $doc->createElement('test');
|
||||
|
||||
$crawler = new Crawler($node);
|
||||
$crawler = $this->createCrawler($node);
|
||||
$this->assertCount(1, $crawler, '__construct() takes a node as a first argument');
|
||||
}
|
||||
|
||||
public function testGetUri()
|
||||
{
|
||||
$uri = 'http://symfony.com';
|
||||
$crawler = new Crawler(null, $uri);
|
||||
$crawler = $this->createCrawler(null, $uri);
|
||||
$this->assertEquals($uri, $crawler->getUri());
|
||||
}
|
||||
|
||||
public function testGetBaseHref()
|
||||
{
|
||||
$baseHref = 'http://symfony.com';
|
||||
$crawler = new Crawler(null, null, $baseHref);
|
||||
$crawler = $this->createCrawler(null, null, $baseHref);
|
||||
$this->assertEquals($baseHref, $crawler->getBaseHref());
|
||||
}
|
||||
|
||||
public function testAdd()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->add($this->createDomDocument());
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from a \DOMDocument');
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->add($this->createNodeList());
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from a \DOMNodeList');
|
||||
|
||||
@ -56,15 +65,15 @@ class CrawlerTest extends TestCase
|
||||
foreach ($this->createNodeList() as $node) {
|
||||
$list[] = $node;
|
||||
}
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->add($list);
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from an array of nodes');
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->add($this->createNodeList()->item(0));
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->add() adds nodes from a \DOMNode');
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->add('<html><body>Foo</body></html>');
|
||||
$this->assertEquals('Foo', $crawler->filterXPath('//body')->text(), '->add() adds nodes from a string');
|
||||
}
|
||||
@ -74,7 +83,7 @@ class CrawlerTest extends TestCase
|
||||
*/
|
||||
public function testAddInvalidType()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->add(1);
|
||||
}
|
||||
|
||||
@ -90,7 +99,7 @@ class CrawlerTest extends TestCase
|
||||
|
||||
public function testAddHtmlContent()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');
|
||||
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addHtmlContent() adds nodes from an HTML string');
|
||||
@ -98,7 +107,7 @@ class CrawlerTest extends TestCase
|
||||
|
||||
public function testAddHtmlContentWithBaseTag()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
|
||||
$crawler->addHtmlContent('<html><head><base href="http://symfony.com"></head><a href="/contact"></a></html>', 'UTF-8');
|
||||
|
||||
@ -111,7 +120,7 @@ class CrawlerTest extends TestCase
|
||||
*/
|
||||
public function testAddHtmlContentCharset()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addHtmlContent('<html><div class="foo">Tiếng Việt</html>', 'UTF-8');
|
||||
|
||||
$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
|
||||
@ -119,61 +128,27 @@ class CrawlerTest extends TestCase
|
||||
|
||||
public function testAddHtmlContentInvalidBaseTag()
|
||||
{
|
||||
$crawler = new Crawler(null, 'http://symfony.com');
|
||||
|
||||
$crawler = $this->createCrawler(null, 'http://symfony.com');
|
||||
$crawler->addHtmlContent('<html><head><base target="_top"></head><a href="/contact"></a></html>', 'UTF-8');
|
||||
|
||||
$this->assertEquals('http://symfony.com/contact', current($crawler->filterXPath('//a')->links())->getUri(), '->addHtmlContent() correctly handles a non-existent base tag href attribute');
|
||||
}
|
||||
|
||||
public function testAddHtmlContentUnsupportedCharset()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler->addHtmlContent(file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');
|
||||
|
||||
$this->assertEquals('Žťčýů', $crawler->filterXPath('//p')->text());
|
||||
}
|
||||
|
||||
/**
|
||||
* @requires extension mbstring
|
||||
*/
|
||||
public function testAddHtmlContentCharsetGbk()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
//gbk encode of <html><p>中文</p></html>
|
||||
$crawler->addHtmlContent(base64_decode('PGh0bWw+PHA+1tDOxDwvcD48L2h0bWw+'), 'gbk');
|
||||
|
||||
$this->assertEquals('中文', $crawler->filterXPath('//p')->text());
|
||||
}
|
||||
|
||||
public function testAddHtmlContentWithErrors()
|
||||
{
|
||||
$internalErrors = libxml_use_internal_errors(true);
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler->addHtmlContent(<<<'EOF'
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<nav><a href="#"><a href="#"></nav>
|
||||
</body>
|
||||
</html>
|
||||
EOF
|
||||
, 'UTF-8');
|
||||
|
||||
$errors = libxml_get_errors();
|
||||
$this->assertCount(1, $errors);
|
||||
$this->assertEquals("Tag nav invalid\n", $errors[0]->message);
|
||||
|
||||
libxml_clear_errors();
|
||||
libxml_use_internal_errors($internalErrors);
|
||||
}
|
||||
|
||||
public function testAddXmlContent()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addXmlContent('<html><div class="foo"></div></html>', 'UTF-8');
|
||||
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addXmlContent() adds nodes from an XML string');
|
||||
@ -181,62 +156,39 @@ EOF
|
||||
|
||||
public function testAddXmlContentCharset()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addXmlContent('<html><div class="foo">Tiếng Việt</div></html>', 'UTF-8');
|
||||
|
||||
$this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text());
|
||||
}
|
||||
|
||||
public function testAddXmlContentWithErrors()
|
||||
{
|
||||
$internalErrors = libxml_use_internal_errors(true);
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler->addXmlContent(<<<'EOF'
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<nav><a href="#"><a href="#"></nav>
|
||||
</body>
|
||||
</html>
|
||||
EOF
|
||||
, 'UTF-8');
|
||||
|
||||
$this->assertGreaterThan(1, libxml_get_errors());
|
||||
|
||||
libxml_clear_errors();
|
||||
libxml_use_internal_errors($internalErrors);
|
||||
}
|
||||
|
||||
public function testAddContent()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8');
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string');
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8; dir=RTL');
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an HTML string with extended content type');
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div class="foo"></html>');
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() uses text/html as the default type');
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml; charset=UTF-8');
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div class="foo"></div></html>', 'text/xml');
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() adds nodes from an XML string');
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('foo bar', 'text/plain');
|
||||
$this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml');
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
|
||||
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
|
||||
}
|
||||
@ -246,14 +198,14 @@ EOF
|
||||
*/
|
||||
public function testAddContentNonUtf8()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent(iconv('UTF-8', 'SJIS', '<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>'));
|
||||
$this->assertEquals('日本語', $crawler->filterXPath('//body')->text(), '->addContent() can recognize "Shift_JIS" in html5 meta charset tag');
|
||||
}
|
||||
|
||||
public function testAddDocument()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addDocument($this->createDomDocument());
|
||||
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addDocument() adds nodes from a \DOMDocument');
|
||||
@ -261,7 +213,7 @@ EOF
|
||||
|
||||
public function testAddNodeList()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addNodeList($this->createNodeList());
|
||||
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addNodeList() adds nodes from a \DOMNodeList');
|
||||
@ -274,7 +226,7 @@ EOF
|
||||
$list[] = $node;
|
||||
}
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addNodes($list);
|
||||
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addNodes() adds nodes from an array of nodes');
|
||||
@ -282,7 +234,7 @@ EOF
|
||||
|
||||
public function testAddNode()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addNode($this->createNodeList()->item(0));
|
||||
|
||||
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addNode() adds nodes from a \DOMNode');
|
||||
@ -293,7 +245,7 @@ EOF
|
||||
$doc = new \DOMDocument();
|
||||
$node = $doc->createElement('test');
|
||||
|
||||
$crawler = new Crawler($node);
|
||||
$crawler = $this->createCrawler($node);
|
||||
$crawler->clear();
|
||||
$this->assertCount(0, $crawler, '->clear() removes all the nodes from the crawler');
|
||||
}
|
||||
@ -361,7 +313,7 @@ EOF
|
||||
|
||||
public function testMissingAttrValueIsNull()
|
||||
{
|
||||
$crawler = new Crawler();
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addContent('<html><div non-empty-attr="sample value" empty-attr=""></div></html>', 'text/html; charset=UTF-8');
|
||||
$div = $crawler->filterXPath('//div');
|
||||
|
||||
@ -647,7 +599,7 @@ EOF
|
||||
|
||||
public function testFilterWithDefaultNamespaceOnly()
|
||||
{
|
||||
$crawler = new Crawler('<?xml version="1.0" encoding="UTF-8"?>
|
||||
$crawler = $this->createCrawler('<?xml version="1.0" encoding="UTF-8"?>
|
||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
<url>
|
||||
<loc>http://localhost/foo</loc>
|
||||
@ -731,7 +683,7 @@ EOF
|
||||
</html>
|
||||
HTML;
|
||||
|
||||
$crawler = new Crawler($html);
|
||||
$crawler = $this->createCrawler($html);
|
||||
|
||||
$this->assertCount(1, $crawler->selectButton('Click \'Here\''));
|
||||
}
|
||||
@ -752,7 +704,7 @@ HTML;
|
||||
</html>
|
||||
HTML;
|
||||
|
||||
$crawler = new Crawler($html);
|
||||
$crawler = $this->createCrawler($html);
|
||||
|
||||
$this->assertCount(1, $crawler->selectButton('Click "Here"'));
|
||||
}
|
||||
@ -824,7 +776,7 @@ HTML;
|
||||
</html>
|
||||
HTML;
|
||||
|
||||
$crawler = new Crawler($html);
|
||||
$crawler = $this->createCrawler($html);
|
||||
$filtered = $crawler->filterXPath("descendant-or-self::*[@id = 'login-form']");
|
||||
|
||||
$this->assertCount(0, $filtered->selectLink('Login'));
|
||||
@ -841,7 +793,7 @@ HTML;
|
||||
|
||||
public function testChaining()
|
||||
{
|
||||
$crawler = new Crawler('<div name="a"><div name="b"><div name="c"></div></div></div>');
|
||||
$crawler = $this->createCrawler('<div name="a"><div name="b"><div name="c"></div></div></div>');
|
||||
|
||||
$this->assertEquals('a', $crawler->filterXPath('//div')->filterXPath('div')->filterXPath('div')->attr('name'));
|
||||
}
|
||||
@ -1000,7 +952,7 @@ HTML;
|
||||
}
|
||||
|
||||
try {
|
||||
$crawler = new Crawler('<p></p>');
|
||||
$crawler = $this->createCrawler('<p></p>');
|
||||
$crawler->filter('p')->children();
|
||||
$this->assertTrue(true, '->children() does not trigger a notice if the node has no children');
|
||||
} catch (\PHPUnit\Framework\Error\Notice $e) {
|
||||
@ -1029,7 +981,7 @@ HTML;
|
||||
</html>
|
||||
HTML;
|
||||
|
||||
$crawler = new Crawler($html);
|
||||
$crawler = $this->createCrawler($html);
|
||||
$foo = $crawler->filter('#foo');
|
||||
|
||||
$this->assertEquals(3, $foo->children()->count());
|
||||
@ -1066,7 +1018,7 @@ HTML;
|
||||
*/
|
||||
public function testBaseTag($baseValue, $linkValue, $expectedUri, $currentUri = null, $description = '')
|
||||
{
|
||||
$crawler = new Crawler('<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
|
||||
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><a href="'.$linkValue.'"></a></html>', $currentUri);
|
||||
$this->assertEquals($expectedUri, $crawler->filterXPath('//a')->link()->getUri(), $description);
|
||||
}
|
||||
|
||||
@ -1086,7 +1038,7 @@ HTML;
|
||||
*/
|
||||
public function testBaseTagWithForm($baseValue, $actionValue, $expectedUri, $currentUri = null, $description = null)
|
||||
{
|
||||
$crawler = new Crawler('<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
|
||||
$crawler = $this->createCrawler('<html><base href="'.$baseValue.'"><form method="post" action="'.$actionValue.'"><button type="submit" name="submit"/></form></html>', $currentUri);
|
||||
$this->assertEquals($expectedUri, $crawler->filterXPath('//button')->form()->getUri(), $description);
|
||||
}
|
||||
|
||||
@ -1104,7 +1056,7 @@ HTML;
|
||||
|
||||
public function testCountOfNestedElements()
|
||||
{
|
||||
$crawler = new Crawler('<html><body><ul><li>List item 1<ul><li>Sublist item 1</li><li>Sublist item 2</ul></li></ul></body></html>');
|
||||
$crawler = $this->createCrawler('<html><body><ul><li>List item 1<ul><li>Sublist item 1</li><li>Sublist item 2</ul></li></ul></body></html>');
|
||||
|
||||
$this->assertCount(1, $crawler->filter('li:contains("List item 1")'));
|
||||
}
|
||||
@ -1151,7 +1103,7 @@ HTML;
|
||||
*/
|
||||
public function testEvaluateThrowsAnExceptionIfDocumentIsEmpty()
|
||||
{
|
||||
(new Crawler())->evaluate('//form/input[1]');
|
||||
$this->createCrawler()->evaluate('//form/input[1]');
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1210,6 +1162,14 @@ HTML;
|
||||
$crawlerChild->children();
|
||||
}
|
||||
|
||||
public function testAddHtmlContentUnsupportedCharset()
|
||||
{
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addHtmlContent(file_get_contents(__DIR__.'/Fixtures/windows-1250.html'), 'Windows-1250');
|
||||
|
||||
$this->assertEquals('Žťčýů', $crawler->filterXPath('//p')->text());
|
||||
}
|
||||
|
||||
public function createTestCrawler($uri = null)
|
||||
{
|
||||
$dom = new \DOMDocument();
|
||||
@ -1259,7 +1219,7 @@ HTML;
|
||||
</html>
|
||||
');
|
||||
|
||||
return new Crawler($dom, $uri);
|
||||
return $this->createCrawler($dom, $uri);
|
||||
}
|
||||
|
||||
protected function createTestXmlCrawler($uri = null)
|
||||
@ -1276,7 +1236,7 @@ HTML;
|
||||
<media:category label="Music" scheme="http://gdata.youtube.com/schemas/2007/categories.cat">Music</media:category>
|
||||
</entry>';
|
||||
|
||||
return new Crawler($xml, $uri);
|
||||
return $this->createCrawler($xml, $uri);
|
||||
}
|
||||
|
||||
protected function createDomDocument()
|
@ -0,0 +1,22 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of the Symfony package.
|
||||
*
|
||||
* (c) Fabien Potencier <fabien@symfony.com>
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Symfony\Component\DomCrawler\Tests;
|
||||
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class Html5ParserCrawlerTest extends AbstractCrawlerTest
|
||||
{
|
||||
public function createCrawler($node = null, string $uri = null, string $baseHref = null)
|
||||
{
|
||||
return new Crawler($node, $uri, $baseHref, true);
|
||||
}
|
||||
}
|
@ -0,0 +1,70 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of the Symfony package.
|
||||
*
|
||||
* (c) Fabien Potencier <fabien@symfony.com>
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Symfony\Component\DomCrawler\Tests;
|
||||
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class NativeParserCrawlerTest extends AbstractCrawlerTest
|
||||
{
|
||||
public function createCrawler($node = null, string $uri = null, string $baseHref = null)
|
||||
{
|
||||
return new Crawler($node, $uri, $baseHref, false);
|
||||
}
|
||||
|
||||
public function testAddHtmlContentWithErrors()
|
||||
{
|
||||
$internalErrors = libxml_use_internal_errors(true);
|
||||
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addHtmlContent(<<<'EOF'
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<nav><a href="#"><a href="#"></nav>
|
||||
</body>
|
||||
</html>
|
||||
EOF
|
||||
, 'UTF-8');
|
||||
|
||||
$errors = libxml_get_errors();
|
||||
$this->assertCount(1, $errors);
|
||||
$this->assertEquals("Tag nav invalid\n", $errors[0]->message);
|
||||
|
||||
libxml_clear_errors();
|
||||
libxml_use_internal_errors($internalErrors);
|
||||
}
|
||||
|
||||
public function testAddXmlContentWithErrors()
|
||||
{
|
||||
$internalErrors = libxml_use_internal_errors(true);
|
||||
|
||||
$crawler = $this->createCrawler();
|
||||
$crawler->addXmlContent(<<<'EOF'
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<nav><a href="#"><a href="#"></nav>
|
||||
</body>
|
||||
</html>
|
||||
EOF
|
||||
, 'UTF-8');
|
||||
|
||||
$this->assertGreaterThan(1, libxml_get_errors());
|
||||
|
||||
libxml_clear_errors();
|
||||
libxml_use_internal_errors($internalErrors);
|
||||
}
|
||||
}
|
@ -21,7 +21,11 @@
|
||||
"symfony/polyfill-mbstring": "~1.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"symfony/css-selector": "~3.4|~4.0"
|
||||
"symfony/css-selector": "~3.4|~4.0",
|
||||
"masterminds/html5": "^2.6"
|
||||
},
|
||||
"conflict": {
|
||||
"masterminds/html5": "<2.6"
|
||||
},
|
||||
"suggest": {
|
||||
"symfony/css-selector": ""
|
||||
|
Reference in New Issue
Block a user