merged branch jakzal/domcrawler-namespace-autodiscovery (PR #6650)
This PR was merged into the master branch.

Discussion
----------

[DomCrawler] Added auto-discovery and explicit registration of namespaces in filter() and filterByXPath()

| Q | A |
| --- | --- |
| Bug fix: | no |
| Feature addition: | yes |
| Backwards compatibility break: | yes, default namespace is no longer removed in the `addContent` method |
| Symfony2 tests pass: | yes |
| Fixes the following tickets: | #4845 |
| Todo: | - |
| License of the code: | MIT |
| Documentation PR: | symfony/symfony-docs#2979 |

* added support for automatic discovery and explicit registration of document namespaces for `Crawler::filterXPath()` and `Crawler::filter()`
* improved content type guessing in `Crawler::addContent()`
* [BC BREAK] `Crawler::addXmlContent()` no longer removes the default document namespace

I mentioned in #4845 that it would probably be possible to use [DOMNode::lookupNamespaceURI()](http://www.php.net/manual/en/domnode.lookupnamespaceuri.php) to find a namespace URI by a given prefix. Unfortunately, we cannot use it here since we'd have to call it on a node in the namespace we're looking for.

The current implementation makes the following query to find a namespace:

```php
$domxpath->query('(//namespace::*[name()="media"])[last()]')
```

Commits
-------

77e2fa5 [DomCrawler] Removed checks if CssSelector is present.
9110468 [DomCrawler] Enabled manual namespace registration.
be1e4e6 [DomCrawler] Enabled default namespace prefix overloading.
943d446 [DomCrawler] Updated the CHANGELOG with namespace auto-registration details.
c6fbb13 [DomCrawler] Added support for an automatic default namespace registration.
587e2dd [DomCrawler] Made that default namespace is no longer removed when loading documents with addXmlContent().
c905bba [DomCrawler] Added more tests for namespaced filtering.
6e717a3 [DomCrawler] Made sure only the default namespace is removed when loading an XML content.
e5b8abb [DomCrawler] Added auto-discovery of namespaces in Crawler::filter() and Crawler::filterByXPath().
This commit is contained in:
commit
98c0d38a44
@ -1,6 +1,15 @@
|
||||
CHANGELOG
|
||||
=========
|
||||
|
||||
2.4.0
|
||||
-----
|
||||
|
||||
* added support for automatic discovery and explicit registration of document
|
||||
namespaces for `Crawler::filterXPath()` and `Crawler::filter()`
|
||||
* improved content type guessing in `Crawler::addContent()`
|
||||
* [BC BREAK] `Crawler::addXmlContent()` no longer removes the default document
|
||||
namespace
|
||||
|
||||
2.3.0
|
||||
-----
|
||||
|
||||
|
@ -27,6 +27,16 @@ class Crawler extends \SplObjectStorage
|
||||
*/
|
||||
protected $uri;
|
||||
|
||||
/**
|
||||
* @var string The default namespace prefix to be used with XPath and CSS expressions
|
||||
*/
|
||||
private $defaultNamespacePrefix = 'default';
|
||||
|
||||
/**
|
||||
* @var array A map of manually registered namespaces
|
||||
*/
|
||||
private $namespaces = array();
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
@ -92,7 +102,7 @@ class Crawler extends \SplObjectStorage
|
||||
public function addContent($content, $type = null)
|
||||
{
|
||||
if (empty($type)) {
|
||||
$type = 'text/html';
|
||||
$type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
|
||||
}
|
||||
|
||||
// DOM only for HTML/XML content
|
||||
@ -195,9 +205,7 @@ class Crawler extends \SplObjectStorage
|
||||
|
||||
$dom = new \DOMDocument('1.0', $charset);
|
||||
$dom->validateOnParse = true;
|
||||
|
||||
// remove the default namespace to make XPath expressions simpler
|
||||
@$dom->loadXML(str_replace('xmlns', 'ns', $content), LIBXML_NONET);
|
||||
@$dom->loadXML($content, LIBXML_NONET);
|
||||
|
||||
libxml_use_internal_errors($current);
|
||||
libxml_disable_entity_loader($disableEntities);
|
||||
@ -579,7 +587,8 @@ class Crawler extends \SplObjectStorage
|
||||
$root->appendChild($document->importNode($node, true));
|
||||
}
|
||||
|
||||
$domxpath = new \DOMXPath($document);
|
||||
$prefixes = $this->findNamespacePrefixes($xpath);
|
||||
$domxpath = $this->createDOMXPath($document, $prefixes);
|
||||
|
||||
return new static($domxpath->query($xpath), $this->uri);
|
||||
}
|
||||
@ -709,6 +718,25 @@ class Crawler extends \SplObjectStorage
|
||||
return $form;
|
||||
}
|
||||
|
||||
/**
 * Changes the prefix that stands for the document's default namespace.
 *
 * The prefix is the one to use in XPath and CSS expressions when addressing
 * elements of the default namespace.
 *
 * @param string $prefix The prefix to use from now on
 */
public function setDefaultNamespacePrefix($prefix)
{
    $this->defaultNamespacePrefix = $prefix;
}
|
||||
|
||||
/**
 * Stores an explicit prefix-to-namespace mapping on this crawler.
 *
 * A later call with the same prefix overwrites the earlier mapping.
 *
 * @param string $prefix    The prefix to register
 * @param string $namespace The namespace to associate with the prefix
 */
public function registerNamespace($prefix, $namespace)
{
    $this->namespaces[$prefix] = $namespace;
}
|
||||
|
||||
/**
|
||||
* Converts string for XPath expressions.
|
||||
*
|
||||
@ -792,4 +820,62 @@ class Crawler extends \SplObjectStorage
|
||||
|
||||
return $nodes;
|
||||
}
|
||||
|
||||
/**
 * Builds a DOMXPath for the document with the given prefixes registered.
 *
 * Each prefix is resolved through discoverNamespace() and then registered
 * on the new DOMXPath instance.
 *
 * @param \DOMDocument $document The document to query
 * @param array        $prefixes Namespace prefixes to resolve and register
 *
 * @return \DOMXPath
 *
 * @throws \InvalidArgumentException
 */
private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
{
    $xpathEngine = new \DOMXPath($document);

    foreach ($prefixes as $name) {
        $xpathEngine->registerNamespace($name, $this->discoverNamespace($xpathEngine, $name));
    }

    return $xpathEngine;
}
|
||||
|
||||
/**
 * Resolves a prefix to its namespace.
 *
 * Manually registered namespaces win. Otherwise the document is searched
 * for a namespace node with that name; the default namespace prefix is
 * looked up under the empty name.
 *
 * @param \DOMXPath $domxpath The XPath object used to search the document
 * @param string    $prefix   The prefix to resolve
 *
 * @return string
 *
 * @throws \InvalidArgumentException When no namespace is found for the prefix
 */
private function discoverNamespace(\DOMXPath $domxpath, $prefix)
{
    if (isset($this->namespaces[$prefix])) {
        return $this->namespaces[$prefix];
    }

    // the default namespace is looked up under an empty name
    $lookupName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;

    // ask for one namespace, otherwise we'd get a collection with an item for each node
    $expression = sprintf('(//namespace::*[name()="%s"])[last()]', $lookupName);
    $found = $domxpath->query($expression)->item(0);

    if (!$found) {
        throw new \InvalidArgumentException(sprintf('Could not find a namespace for the prefix: "%s"', $prefix));
    }

    return $found->nodeValue;
}
|
||||
|
||||
/**
 * Collects the namespace prefixes used in an XPath expression.
 *
 * A prefix is a name followed by a single colon; a double colon is not
 * treated as a prefix separator. String literals are ignored, so a value
 * such as "http://example.com" is not mistaken for a use of the "http"
 * prefix.
 *
 * @param string $xpath The XPath expression to scan
 *
 * @return array The distinct prefixes found (empty when there are none)
 */
private function findNamespacePrefixes($xpath)
{
    // XPath 1.0 literals are delimited by matching quotes and have no escape
    // sequences, so they can be blanked out before looking for prefixes.
    $withoutLiterals = preg_replace('/"[^"]*"|\'[^\']*\'/', ' ', $xpath);
    if (null === $withoutLiterals) {
        // the replacement failed; fall back to scanning the raw expression
        $withoutLiterals = $xpath;
    }

    if (preg_match_all('/(?P<prefix>[a-zA-Z_][a-zA-Z_0-9\-\.]*):[^:]/', $withoutLiterals, $matches)) {
        return array_unique($matches['prefix']);
    }

    return array();
}
|
||||
}
|
||||
|
@ -11,6 +11,7 @@
|
||||
|
||||
namespace Symfony\Component\DomCrawler\Tests;
|
||||
|
||||
use Symfony\Component\CssSelector\CssSelector;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class CrawlerTest extends \PHPUnit_Framework_TestCase
|
||||
@ -370,6 +371,55 @@ EOF
|
||||
$this->assertCount(6, $crawler->filterXPath('//li'), '->filterXPath() filters the node list with the XPath expression');
|
||||
}
|
||||
|
||||
// The "default" prefix addresses the default namespace of the XML fixture.
public function testFilterXPathWithDefaultNamespace()
{
    $xmlCrawler = $this->createTestXmlCrawler();
    $matched = $xmlCrawler->filterXPath('//default:entry/default:id');

    $this->assertCount(1, $matched, '->filterXPath() automatically registers a namespace');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $matched->text());
}
|
||||
|
||||
// The default namespace prefix can be replaced ("x" here) before filtering.
public function testFilterXPathWithCustomDefaultNamespace()
{
    $xmlCrawler = $this->createTestXmlCrawler();
    $xmlCrawler->setDefaultNamespacePrefix('x');

    $matched = $xmlCrawler->filterXPath('//x:entry/x:id');

    $this->assertCount(1, $matched, '->filterXPath() lets to override the default namespace prefix');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $matched->text());
}
|
||||
|
||||
// A prefix used in the XPath expression ("yt") needs no manual registration.
public function testFilterXPathWithNamespace()
{
    $xmlCrawler = $this->createTestXmlCrawler();
    $matched = $xmlCrawler->filterXPath('//yt:accessControl');

    $this->assertCount(2, $matched, '->filterXPath() automatically registers a namespace');
}
|
||||
|
||||
// Several prefixes ("media" and "yt") may appear in one expression.
public function testFilterXPathWithMultipleNamespaces()
{
    $xmlCrawler = $this->createTestXmlCrawler();
    $matched = $xmlCrawler->filterXPath('//media:group/yt:aspectRatio');

    $this->assertCount(1, $matched, '->filterXPath() automatically registers multiple namespaces');
    $this->assertSame('widescreen', $matched->text());
}
|
||||
|
||||
/**
 * Filtering with a prefix ("foo") that the XML fixture does not declare is expected to fail.
 *
 * @expectedException \InvalidArgumentException
 * @expectedExceptionMessage Could not find a namespace for the prefix: "foo"
 */
public function testFilterXPathWithAnInvalidNamespace()
{
    $xmlCrawler = $this->createTestXmlCrawler();
    $xmlCrawler->filterXPath('//media:group/foo:aspectRatio');
}
|
||||
|
||||
// A manually registered prefix ("m") can be mixed with an auto-discovered one ("yt").
public function testFilterXPathWithManuallyRegisteredNamespace()
{
    $xmlCrawler = $this->createTestXmlCrawler();
    $xmlCrawler->registerNamespace('m', 'http://search.yahoo.com/mrss/');

    $matched = $xmlCrawler->filterXPath('//m:group/yt:aspectRatio');

    $this->assertCount(1, $matched, '->filterXPath() uses manually registered namespace');
    $this->assertSame('widescreen', $matched->text());
}
|
||||
|
||||
/**
|
||||
* @covers Symfony\Component\DomCrawler\Crawler::filter
|
||||
*/
|
||||
@ -384,6 +434,30 @@ EOF
|
||||
$this->assertCount(6, $crawler->filter('li'), '->filter() filters the node list with the CSS selector');
|
||||
}
|
||||
|
||||
// CSS selectors address the default namespace through the "default|" prefix.
public function testFilterWithDefaultNamespace()
{
    $xmlCrawler = $this->createTestXmlCrawler();
    $matched = $xmlCrawler->filter('default|entry default|id');

    $this->assertCount(1, $matched, '->filter() automatically registers namespaces');
    $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $matched->text());
}
|
||||
|
||||
// A namespaced CSS selector ("yt|accessControl") works with the HTML extension disabled.
public function testFilterWithNamespace()
{
    CssSelector::disableHtmlExtension();

    $xmlCrawler = $this->createTestXmlCrawler();
    $matched = $xmlCrawler->filter('yt|accessControl');

    $this->assertCount(2, $matched, '->filter() automatically registers namespaces');
}
|
||||
|
||||
// A CSS selector may combine several namespace prefixes ("media" and "yt").
public function testFilterWithMultipleNamespaces()
{
    CssSelector::disableHtmlExtension();

    $xmlCrawler = $this->createTestXmlCrawler();
    $matched = $xmlCrawler->filter('media|group yt|aspectRatio');

    $this->assertCount(1, $matched, '->filter() automatically registers namespaces');
    $this->assertSame('widescreen', $matched->text());
}
|
||||
|
||||
public function testSelectLink()
|
||||
{
|
||||
$crawler = $this->createTestCrawler();
|
||||
@ -656,6 +730,22 @@ EOF
|
||||
return new Crawler($dom, $uri);
|
||||
}
|
||||
|
||||
/**
 * Creates a Crawler from a small XML fixture.
 *
 * The fixture is a single <entry> element that declares a default namespace
 * and the "media" and "yt" prefixes. It contains an <id>, two
 * <yt:accessControl> elements and a <media:group> with a <yt:aspectRatio>.
 *
 * @param string|null $uri Passed as the second argument to the Crawler constructor
 *
 * @return Crawler
 */
protected function createTestXmlCrawler($uri = null)
|
||||
{
|
||||
$xml = '<?xml version="1.0" encoding="UTF-8"?>
|
||||
<entry xmlns="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/" xmlns:yt="http://gdata.youtube.com/schemas/2007">
|
||||
<id>tag:youtube.com,2008:video:kgZRZmEc9j4</id>
|
||||
<yt:accessControl action="comment" permission="allowed"/>
|
||||
<yt:accessControl action="videoRespond" permission="moderated"/>
|
||||
<media:group>
|
||||
<media:title type="plain">Chordates - CrashCourse Biology #24</media:title>
|
||||
<yt:aspectRatio>widescreen</yt:aspectRatio>
|
||||
</media:group>
|
||||
</entry>';
|
||||
|
||||
return new Crawler($xml, $uri);
|
||||
}
|
||||
|
||||
protected function createDomDocument()
|
||||
{
|
||||
$dom = new \DOMDocument();
|
||||
|
Reference in New Issue
Block a user