[DomCrawler] Added auto-discovery of namespaces in Crawler::filter() and Crawler::filterXPath().

Improved content type guessing.
This commit is contained in:
Jakub Zalas 2013-01-10 09:49:01 +00:00
parent b1542f0620
commit e5b8abb564
2 changed files with 61 additions and 1 deletions

View File

@ -92,7 +92,7 @@ class Crawler extends \SplObjectStorage
public function addContent($content, $type = null)
{
if (empty($type)) {
$type = 'text/html';
$type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
}
// DOM only for HTML/XML content
@ -580,6 +580,15 @@ class Crawler extends \SplObjectStorage
}
$domxpath = new \DOMXPath($document);
if (preg_match_all('/(?P<prefix>[a-zA-Z_][a-zA-Z_0-9\-\.]+):[^:]/', $xpath, $matches)) {
foreach ($matches['prefix'] as $prefix) {
// ask for one namespace, otherwise we'd get a collection with an item for each node
$namespaces = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $prefix));
foreach ($namespaces as $node) {
$domxpath->registerNamespace($node->prefix, $node->nodeValue);
}
}
}
return new static($domxpath->query($xpath), $this->uri);
}

View File

@ -370,11 +370,31 @@ EOF
$this->assertCount(6, $crawler->filterXPath('//li'), '->filterXPath() filters the node list with the XPath expression');
}
/**
 * Runs the unprefixed XPath '//entry/id' against the crawler returned by
 * createTestXmlCrawler() and expects exactly one matching node.
 */
public function testFilterXPathWithDefaultNamespace()
{
    $xmlCrawler = $this->createTestXmlCrawler();
    $matchedNodes = $xmlCrawler->filterXPath('//entry/id');

    $this->assertCount(1, $matchedNodes, '->filterXPath() automatically registers a namespace');
}
/**
 * Runs the prefixed XPath '//yt:accessControl' against the crawler returned
 * by createTestXmlCrawler() and expects two matching nodes.
 */
public function testFilterXPathWithNamespace()
{
    $xmlCrawler = $this->createTestXmlCrawler();
    $matchedNodes = $xmlCrawler->filterXPath('//yt:accessControl');

    $this->assertCount(2, $matchedNodes, '->filterXPath() automatically registers a namespace');
}
/**
 * Runs an XPath that uses two different prefixes ('media' and 'yt') against
 * the crawler returned by createTestXmlCrawler() and expects one matching node.
 */
public function testFilterXPathWithMultipleNamespaces()
{
    $xmlCrawler = $this->createTestXmlCrawler();
    $matchedNodes = $xmlCrawler->filterXPath('//media:group/yt:aspectRatio');

    $this->assertCount(1, $matchedNodes, '->filterXPath() automatically registers multiple namespaces');
}
/**
* @covers Symfony\Component\DomCrawler\Crawler::filter
*/
public function testFilter()
{
$this->markSkippedIfCssSelectorNotPresent();
$crawler = $this->createTestCrawler();
$this->assertNotSame($crawler, $crawler->filter('li'), '->filter() returns a new instance of a crawler');
$this->assertInstanceOf('Symfony\\Component\\DomCrawler\\Crawler', $crawler, '->filter() returns a new instance of a crawler');
@ -384,6 +404,14 @@ EOF
$this->assertCount(6, $crawler->filter('li'), '->filter() filters the node list with the CSS selector');
}
/**
 * Runs the namespaced CSS selector 'yt|accessControl' against the crawler
 * returned by createTestXmlCrawler() and expects two matching nodes.
 *
 * The test is skipped when the CssSelector component is not available.
 */
public function testFilterWithNamespace()
{
    $this->markSkippedIfCssSelectorNotPresent();

    $xmlCrawler = $this->createTestXmlCrawler();
    $matchedNodes = $xmlCrawler->filter('yt|accessControl');

    $this->assertCount(2, $matchedNodes, '->filter() automatically registers namespaces');
}
public function testSelectLink()
{
$crawler = $this->createTestCrawler();
@ -656,6 +684,22 @@ EOF
return new Crawler($dom, $uri);
}
/**
 * Builds a Crawler from a small XML fixture: an <entry> root declaring a
 * default namespace plus the "media" and "yt" prefixes, with one <id>,
 * two <yt:accessControl> and one <media:group> child.
 *
 * @param string|null $uri Passed through unchanged as the Crawler's second constructor argument
 *
 * @return Crawler
 */
protected function createTestXmlCrawler($uri = null)
{
    // The XML declaration is the very first item, so the joined document starts with '<?xml'.
    $documentLines = array(
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<entry xmlns="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/" xmlns:yt="http://gdata.youtube.com/schemas/2007">',
        '<id>tag:youtube.com,2008:video:kgZRZmEc9j4</id>',
        '<yt:accessControl action="comment" permission="allowed"/>',
        '<yt:accessControl action="videoRespond" permission="moderated"/>',
        '<media:group>',
        '<media:title type="plain">Chordates - CrashCourse Biology #24</media:title>',
        '<yt:aspectRatio>widescreen</yt:aspectRatio>',
        '</media:group>',
        '</entry>',
    );

    return new Crawler(implode("\n", $documentLines), $uri);
}
protected function createDomDocument()
{
$dom = new \DOMDocument();
@ -672,4 +716,11 @@ EOF
return $domxpath->query('//div');
}
/**
 * Marks the current test as skipped when the
 * Symfony\Component\CssSelector\CssSelector class cannot be found.
 */
protected function markSkippedIfCssSelectorNotPresent()
{
    // Nothing to do when the component is available.
    if (class_exists('Symfony\Component\CssSelector\CssSelector')) {
        return;
    }

    $this->markTestSkipped('The "CssSelector" component is not available');
}
}