merged branch jakzal/domcrawler-namespace-autodiscovery (PR #6650)

This PR was merged into the master branch.

Discussion
----------

[DomCrawler] Added auto-discovery and explicit registration of namespaces in filter() and filterXPath()

| Q | A
| --- | ---
|Bug fix: | no
|Feature addition: |yes
|Backwards compatibility break: | yes, default namespace is no longer removed in the `addContent` method
|Symfony2 tests pass: | yes|
|Fixes the following tickets: | #4845
|Todo: | -
|License of the code:| MIT
|Documentation PR: | symfony/symfony-docs#2979

* added support for automatic discovery and explicit registration of document namespaces for `Crawler::filterXPath()` and `Crawler::filter()`
* improved content type guessing in `Crawler::addContent()`
* [BC BREAK] `Crawler::addXmlContent()` no longer removes the default document namespace

I mentioned in #4845 that it would probably be possible to use [DOMNode::lookupNamespaceURI()](http://www.php.net/manual/en/domnode.lookupnamespaceuri.php) to find a namespace URI for a given prefix. Unfortunately, we cannot use it here, since we would have to call it on a node that is in the namespace we are looking for.

The current implementation runs the following query to find a namespace:
```php
$domxpath->query('(//namespace::*[name()="media"])[last()]')
```

Commits
-------

77e2fa5 [DomCrawler] Removed checks if CssSelector is present.
9110468 [DomCrawler] Enabled manual namespace registration.
be1e4e6 [DomCrawler] Enabled default namespace prefix overloading.
943d446 [DomCrawler] Updated the CHANGELOG with namespace auto-registration details.
c6fbb13 [DomCrawler] Added support for an automatic default namespace registration.
587e2dd [DomCrawler] Made that default namespace is no longer removed when loading documents with addXmlContent().
c905bba [DomCrawler] Added more tests for namespaced filtering.
6e717a3 [DomCrawler] Made sure only the default namespace is removed when loading an XML content.
e5b8abb [DomCrawler] Added auto-discovery of namespaces in Crawler::filter() and Crawler::filterByXPath().
This commit is contained in:
Fabien Potencier 2013-09-25 08:05:47 +02:00
commit 98c0d38a44
3 changed files with 190 additions and 5 deletions

View File

@ -1,6 +1,15 @@
CHANGELOG
=========
2.4.0
-----
* added support for automatic discovery and explicit registration of document
namespaces for `Crawler::filterXPath()` and `Crawler::filter()`
* improved content type guessing in `Crawler::addContent()`
* [BC BREAK] `Crawler::addXmlContent()` no longer removes the default document
namespace
2.3.0
-----

View File

@ -27,6 +27,16 @@ class Crawler extends \SplObjectStorage
*/
protected $uri;
/**
* @var string The default namespace prefix to be used with XPath and CSS expressions
*/
private $defaultNamespacePrefix = 'default';
/**
* @var array A map of manually registered namespaces
*/
private $namespaces = array();
/**
* Constructor.
*
@ -92,7 +102,7 @@ class Crawler extends \SplObjectStorage
public function addContent($content, $type = null)
{
if (empty($type)) {
$type = 'text/html';
$type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
}
// DOM only for HTML/XML content
@ -195,9 +205,7 @@ class Crawler extends \SplObjectStorage
$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;
// remove the default namespace to make XPath expressions simpler
@$dom->loadXML(str_replace('xmlns', 'ns', $content), LIBXML_NONET);
@$dom->loadXML($content, LIBXML_NONET);
libxml_use_internal_errors($current);
libxml_disable_entity_loader($disableEntities);
@ -579,7 +587,8 @@ class Crawler extends \SplObjectStorage
$root->appendChild($document->importNode($node, true));
}
$domxpath = new \DOMXPath($document);
$prefixes = $this->findNamespacePrefixes($xpath);
$domxpath = $this->createDOMXPath($document, $prefixes);
return new static($domxpath->query($xpath), $this->uri);
}
@ -709,6 +718,25 @@ class Crawler extends \SplObjectStorage
return $form;
}
/**
* Overrides the prefix that stands for the document's default namespace in XPath and CSS expressions.
*
* The prefix is "default" until this method is called. During namespace
* discovery, a prefix equal to this value is resolved to the namespace
* that the document declares without a name.
*
* @param string $prefix The prefix to use for the default namespace
*/
public function setDefaultNamespacePrefix($prefix)
{
$this->defaultNamespacePrefix = $prefix;
}
/**
* Registers a namespace URI under the given prefix.
*
* A manually registered prefix takes precedence over namespace discovery:
* when the prefix is found in this map, the document is not searched for it.
* Registering the same prefix again replaces the previous URI.
*
* @param string $prefix    The prefix to use in XPath and CSS expressions
* @param string $namespace The namespace URI the prefix resolves to
*/
public function registerNamespace($prefix, $namespace)
{
$this->namespaces[$prefix] = $namespace;
}
/**
* Converts string for XPath expressions.
*
@ -792,4 +820,62 @@ class Crawler extends \SplObjectStorage
return $nodes;
}
/**
 * Builds a DOMXPath for the document with a namespace registered for each given prefix.
 *
 * @param \DOMDocument $document The document the XPath object will query
 * @param array        $prefixes The namespace prefixes to resolve and register
 *
 * @return \DOMXPath
 *
 * @throws \InvalidArgumentException When no namespace can be found for one of the prefixes
 */
private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
{
    $xpathEngine = new \DOMXPath($document);

    foreach ($prefixes as $name) {
        // the same XPath object is used for the lookup and for the registration
        $xpathEngine->registerNamespace($name, $this->discoverNamespace($xpathEngine, $name));
    }

    return $xpathEngine;
}
/**
 * Resolves a prefix to a namespace URI.
 *
 * Manually registered namespaces are consulted first; otherwise the
 * namespace is looked up in the document behind the given XPath object.
 *
 * @param \DOMXPath $domxpath The XPath object used to search the document
 * @param string    $prefix   The prefix to resolve
 *
 * @return string The namespace URI
 *
 * @throws \InvalidArgumentException When the prefix is neither registered nor declared in the document
 */
private function discoverNamespace(\DOMXPath $domxpath, $prefix)
{
    // an explicit registration wins over whatever the document declares
    if (isset($this->namespaces[$prefix])) {
        return $this->namespaces[$prefix];
    }

    // the default namespace is declared without a name, so it is searched for as ""
    $declaredName = $prefix === $this->defaultNamespacePrefix ? '' : $prefix;

    // [last()] asks for a single namespace node; without it we'd get an item for each node
    $matches = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $declaredName));
    $declaration = $matches->item(0);

    if (!$declaration) {
        throw new \InvalidArgumentException(sprintf('Could not find a namespace for the prefix: "%s"', $prefix));
    }

    return $declaration->nodeValue;
}
/**
 * Extracts the namespace prefixes used in an XPath expression.
 *
 * A prefix is an identifier immediately followed by a single colon, as in
 * "media:group". Known limitation: a prefix-like token inside a string
 * literal (e.g. 'a:b') is still reported, because the expression is not
 * tokenized.
 *
 * @param string $xpath The XPath expression to scan
 *
 * @return array The distinct prefixes, in order of first appearance (keys are not re-indexed)
 */
private function findNamespacePrefixes($xpath)
{
    // The character after the colon must not be:
    //  - ":"  -> that is an axis separator ("child::li"), not a prefix
    //  - "/" or a quote -> never follows a real prefix, but does occur in string
    //    literals such as "http://example.com", which would otherwise yield a
    //    bogus "http" prefix and make namespace discovery fail
    if (preg_match_all('/(?P<prefix>[a-zA-Z_][a-zA-Z_0-9\-\.]*):[^"\'\/:]/', $xpath, $matches)) {
        return array_unique($matches['prefix']);
    }

    return array();
}
}

View File

@ -11,6 +11,7 @@
namespace Symfony\Component\DomCrawler\Tests;
use Symfony\Component\CssSelector\CssSelector;
use Symfony\Component\DomCrawler\Crawler;
class CrawlerTest extends \PHPUnit_Framework_TestCase
@ -370,6 +371,55 @@ EOF
$this->assertCount(6, $crawler->filterXPath('//li'), '->filterXPath() filters the node list with the XPath expression');
}
public function testFilterXPathWithDefaultNamespace()
{
    // "default" is the built-in prefix for the namespace declared without a name
    $entryIds = $this->createTestXmlCrawler()->filterXPath('//default:entry/default:id');

    $this->assertCount(1, $entryIds, '->filterXPath() automatically registers a namespace');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $entryIds->text());
}
public function testFilterXPathWithCustomDefaultNamespace()
{
    $document = $this->createTestXmlCrawler();
    // from now on "x" stands for the document's default namespace
    $document->setDefaultNamespacePrefix('x');

    $entryIds = $document->filterXPath('//x:entry/x:id');

    $this->assertCount(1, $entryIds, '->filterXPath() lets to override the default namespace prefix');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $entryIds->text());
}
public function testFilterXPathWithNamespace()
{
    // the fixture contains two yt:accessControl elements
    $accessControls = $this->createTestXmlCrawler()->filterXPath('//yt:accessControl');

    $this->assertCount(2, $accessControls, '->filterXPath() automatically registers a namespace');
}
public function testFilterXPathWithMultipleNamespaces()
{
    // both the "media" and the "yt" prefixes have to be resolved for this expression
    $aspectRatio = $this->createTestXmlCrawler()->filterXPath('//media:group/yt:aspectRatio');

    $this->assertCount(1, $aspectRatio, '->filterXPath() automatically registers multiple namespaces');
    $this->assertSame('widescreen', $aspectRatio->text());
}
/**
 * @expectedException \InvalidArgumentException
 * @expectedExceptionMessage Could not find a namespace for the prefix: "foo"
 */
public function testFilterXPathWithAnInvalidNamespace()
{
    $document = $this->createTestXmlCrawler();

    // the fixture does not declare a "foo" prefix, so filtering is expected to throw
    $document->filterXPath('//media:group/foo:aspectRatio');
}
public function testFilterXPathWithManuallyRegisteredNamespace()
{
    $document = $this->createTestXmlCrawler();
    // "m" is not declared in the fixture; it is mapped by hand to the URI the fixture binds to "media"
    $document->registerNamespace('m', 'http://search.yahoo.com/mrss/');

    $aspectRatio = $document->filterXPath('//m:group/yt:aspectRatio');

    $this->assertCount(1, $aspectRatio, '->filterXPath() uses manually registered namespace');
    $this->assertSame('widescreen', $aspectRatio->text());
}
/**
* @covers Symfony\Component\DomCrawler\Crawler::filter
*/
@ -384,6 +434,30 @@ EOF
$this->assertCount(6, $crawler->filter('li'), '->filter() filters the node list with the CSS selector');
}
public function testFilterWithDefaultNamespace()
{
    // CSS counterpart of //default:entry//default:id, using the "prefix|name" notation
    $entryIds = $this->createTestXmlCrawler()->filter('default|entry default|id');

    $this->assertCount(1, $entryIds, '->filter() automatically registers namespaces');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $entryIds->text());
}
public function testFilterWithNamespace()
{
    // NOTE(review): presumably needed because the selector contains a mixed-case element name - confirm
    CssSelector::disableHtmlExtension();

    $accessControls = $this->createTestXmlCrawler()->filter('yt|accessControl');

    $this->assertCount(2, $accessControls, '->filter() automatically registers namespaces');
}
public function testFilterWithMultipleNamespaces()
{
    // NOTE(review): presumably needed because the selector contains a mixed-case element name - confirm
    CssSelector::disableHtmlExtension();

    $aspectRatio = $this->createTestXmlCrawler()->filter('media|group yt|aspectRatio');

    $this->assertCount(1, $aspectRatio, '->filter() automatically registers namespaces');
    $this->assertSame('widescreen', $aspectRatio->text());
}
public function testSelectLink()
{
$crawler = $this->createTestCrawler();
@ -656,6 +730,22 @@ EOF
return new Crawler($dom, $uri);
}
/**
 * Creates a crawler over a small Atom entry that declares a default
 * namespace plus the "media" and "yt" prefixed namespaces.
 *
 * @param string|null $uri Passed through to the Crawler constructor
 *
 * @return Crawler
 */
protected function createTestXmlCrawler($uri = null)
{
    // "<?xml" must stay at the very start of the string: content type
    // guessing in addContent() looks for it at offset 0
    $atomEntry = <<<'XML'
<?xml version="1.0" encoding="UTF-8"?>
<entry xmlns="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/" xmlns:yt="http://gdata.youtube.com/schemas/2007">
<id>tag:youtube.com,2008:video:kgZRZmEc9j4</id>
<yt:accessControl action="comment" permission="allowed"/>
<yt:accessControl action="videoRespond" permission="moderated"/>
<media:group>
<media:title type="plain">Chordates - CrashCourse Biology #24</media:title>
<yt:aspectRatio>widescreen</yt:aspectRatio>
</media:group>
</entry>
XML;

    return new Crawler($atomEntry, $uri);
}
protected function createDomDocument()
{
$dom = new \DOMDocument();