diff --git a/src/Symfony/Component/DomCrawler/CHANGELOG.md b/src/Symfony/Component/DomCrawler/CHANGELOG.md index 2343a51b4f..66ce6542c4 100644 --- a/src/Symfony/Component/DomCrawler/CHANGELOG.md +++ b/src/Symfony/Component/DomCrawler/CHANGELOG.md @@ -1,6 +1,15 @@ CHANGELOG ========= +2.4.0 +----- + + * added support for automatic discovery and explicit registration of document + namespaces for `Crawler::filterXPath()` and `Crawler::filter()` + * improved content type guessing in `Crawler::addContent()` + * [BC BREAK] `Crawler::addXmlContent()` no longer removes the default document + namespace + 2.3.0 ----- diff --git a/src/Symfony/Component/DomCrawler/Crawler.php b/src/Symfony/Component/DomCrawler/Crawler.php index 412e717e59..2935a51f53 100644 --- a/src/Symfony/Component/DomCrawler/Crawler.php +++ b/src/Symfony/Component/DomCrawler/Crawler.php @@ -27,6 +27,16 @@ class Crawler extends \SplObjectStorage */ protected $uri; + /** + * @var string The default namespace prefix to be used with XPath and CSS expressions + */ + private $defaultNamespacePrefix = 'default'; + + /** + * @var array A map of manually registered namespaces + */ + private $namespaces = array(); + /** * Constructor. 
* @@ -92,7 +102,7 @@ class Crawler extends \SplObjectStorage public function addContent($content, $type = null) { if (empty($type)) { - $type = 'text/html'; + $type = 0 === strpos($content, 'validateOnParse = true; - - // remove the default namespace to make XPath expressions simpler - @$dom->loadXML(str_replace('xmlns', 'ns', $content), LIBXML_NONET); + @$dom->loadXML($content, LIBXML_NONET); libxml_use_internal_errors($current); libxml_disable_entity_loader($disableEntities); @@ -579,7 +587,8 @@ class Crawler extends \SplObjectStorage $root->appendChild($document->importNode($node, true)); } - $domxpath = new \DOMXPath($document); + $prefixes = $this->findNamespacePrefixes($xpath); + $domxpath = $this->createDOMXPath($document, $prefixes); return new static($domxpath->query($xpath), $this->uri); } @@ -709,6 +718,25 @@ class Crawler extends \SplObjectStorage return $form; } + /** + * Overrides the default namespace prefix to be used with XPath and CSS expressions. + * + * @param string $prefix + */ + public function setDefaultNamespacePrefix($prefix) + { + $this->defaultNamespacePrefix = $prefix; + } + + /** + * @param string $prefix + * @param string $namespace + */ + public function registerNamespace($prefix, $namespace) + { + $this->namespaces[$prefix] = $namespace; + } + /** * Converts string for XPath expressions. 
* @@ -792,4 +820,62 @@ class Crawler extends \SplObjectStorage return $nodes; } + + /** + * @param \DOMDocument $document + * @param array $prefixes + * + * @return \DOMXPath + * + * @throws \InvalidArgumentException + */ + private function createDOMXPath(\DOMDocument $document, array $prefixes = array()) + { + $domxpath = new \DOMXPath($document); + + foreach ($prefixes as $prefix) { + $namespace = $this->discoverNamespace($domxpath, $prefix); + $domxpath->registerNamespace($prefix, $namespace); + } + + return $domxpath; + } + + /** + * @param \DOMXPath $domxpath + * @param string $prefix + * + * @return string + * + * @throws \InvalidArgumentException + */ + private function discoverNamespace(\DOMXPath $domxpath, $prefix) + { + if (isset($this->namespaces[$prefix])) { + return $this->namespaces[$prefix]; + } + + // ask for one namespace, otherwise we'd get a collection with an item for each node + $namespaces = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $this->defaultNamespacePrefix === $prefix ? 
'' : $prefix)); + + if ($node = $namespaces->item(0)) { + return $node->nodeValue; + } + + throw new \InvalidArgumentException(sprintf('Could not find a namespace for the prefix: "%s"', $prefix)); + } + + /** + * @param string $xpath + * + * @return array + */ + private function findNamespacePrefixes($xpath) + { + if (preg_match_all('/(?P<prefix>[a-zA-Z_][a-zA-Z_0-9\-\.]*):[^:]/', $xpath, $matches)) { + return array_unique($matches['prefix']); + } + + return array(); + } } diff --git a/src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php b/src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php index f05d81e681..5c27451f6f 100644 --- a/src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php +++ b/src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php @@ -11,6 +11,7 @@ namespace Symfony\Component\DomCrawler\Tests; +use Symfony\Component\CssSelector\CssSelector; use Symfony\Component\DomCrawler\Crawler; class CrawlerTest extends \PHPUnit_Framework_TestCase @@ -370,6 +371,55 @@ EOF $this->assertCount(6, $crawler->filterXPath('//li'), '->filterXPath() filters the node list with the XPath expression'); } + public function testFilterXPathWithDefaultNamespace() + { + $crawler = $this->createTestXmlCrawler()->filterXPath('//default:entry/default:id'); + $this->assertCount(1, $crawler, '->filterXPath() automatically registers a namespace'); + $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $crawler->text()); + } + + public function testFilterXPathWithCustomDefaultNamespace() + { + $crawler = $this->createTestXmlCrawler(); + $crawler->setDefaultNamespacePrefix('x'); + $crawler = $crawler->filterXPath('//x:entry/x:id'); + + $this->assertCount(1, $crawler, '->filterXPath() lets to override the default namespace prefix'); + $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $crawler->text()); + } + + public function testFilterXPathWithNamespace() + { + $crawler = $this->createTestXmlCrawler()->filterXPath('//yt:accessControl'); + $this->assertCount(2, $crawler, 
'->filterXPath() automatically registers a namespace'); + } + + public function testFilterXPathWithMultipleNamespaces() + { + $crawler = $this->createTestXmlCrawler()->filterXPath('//media:group/yt:aspectRatio'); + $this->assertCount(1, $crawler, '->filterXPath() automatically registers multiple namespaces'); + $this->assertSame('widescreen', $crawler->text()); + } + + /** + * @expectedException \InvalidArgumentException + * @expectedExceptionMessage Could not find a namespace for the prefix: "foo" + */ + public function testFilterXPathWithAnInvalidNamespace() + { + $this->createTestXmlCrawler()->filterXPath('//media:group/foo:aspectRatio'); + } + + public function testFilterXPathWithManuallyRegisteredNamespace() + { + $crawler = $this->createTestXmlCrawler(); + $crawler->registerNamespace('m', 'http://search.yahoo.com/mrss/'); + + $crawler = $crawler->filterXPath('//m:group/yt:aspectRatio'); + $this->assertCount(1, $crawler, '->filterXPath() uses manually registered namespace'); + $this->assertSame('widescreen', $crawler->text()); + } + /** * @covers Symfony\Component\DomCrawler\Crawler::filter */ @@ -384,6 +434,30 @@ EOF $this->assertCount(6, $crawler->filter('li'), '->filter() filters the node list with the CSS selector'); } + public function testFilterWithDefaultNamespace() + { + $crawler = $this->createTestXmlCrawler()->filter('default|entry default|id'); + $this->assertCount(1, $crawler, '->filter() automatically registers namespaces'); + $this->assertSame('tag:youtube.com,2008:video:kgZRZmEc9j4', $crawler->text()); + } + + public function testFilterWithNamespace() + { + CssSelector::disableHtmlExtension(); + + $crawler = $this->createTestXmlCrawler()->filter('yt|accessControl'); + $this->assertCount(2, $crawler, '->filter() automatically registers namespaces'); + } + + public function testFilterWithMultipleNamespaces() + { + CssSelector::disableHtmlExtension(); + + $crawler = $this->createTestXmlCrawler()->filter('media|group yt|aspectRatio'); + 
$this->assertCount(1, $crawler, '->filter() automatically registers namespaces'); + $this->assertSame('widescreen', $crawler->text()); + } + public function testSelectLink() { $crawler = $this->createTestCrawler(); @@ -656,6 +730,22 @@ EOF return new Crawler($dom, $uri); } + protected function createTestXmlCrawler($uri = null) + { + $xml = ' + + tag:youtube.com,2008:video:kgZRZmEc9j4 + + + + Chordates - CrashCourse Biology #24 + widescreen + + '; + + return new Crawler($xml, $uri); + } + protected function createDomDocument() { $dom = new \DOMDocument();