merged branch jakzal/domcrawler-xpath-namespace-fix (PR #9129)
This PR was merged into the master branch.
Discussion
----------
[DomCrawler] Fixed an issue with namespace prefix matching being to greedy
| Q | A
| ------------- | ---
| Bug fix? | yes
| New feature? | no
| BC breaks? | no
| Deprecations? | no
| Tests pass? | yes
| Fixed tickets | #9121
| License | MIT
| Doc PR | -
The regexp matching prefixes is naive and matches most of strings followed by a colon: `/(?P<prefix>[a-z_][a-z_0-9\-\.]*):[^"\/]/i`.
It is also incomplete as it does not match all the supported characters (like the unicode ones). Implementing [the actual specification](http://www.w3.org/TR/2004/REC-xml-names11-20040204/#NT-PrefixedName) would mean creating a very complex expression, especially because we have to ignore strings contained in quotes etc.
This would match any XML tag name (names in quotes not ignored):
```php
$ncNameStartChar = 'a-z_\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\x{2FF}\\x{370}-\\x{37D}\\x{37F}-\\x{1FFF}\\x{200C}-\\x{200D}\\x{2070}-\\x{218F}\\x{2C00}-\\x{2FEF}\\x{3001}-\\x{D7FF}\\x{F900}-\\x{FDCF}\\x{FDF0}-\\x{FFFD}\\x{10000}-\\x{EFFFF}';
$ncNameChar = $ncNameStartChar.'.\\-0-9\\xB7\\x{0300}-\\x{036F}\\x{203F}-\\x{2040}';
$ncName = '['.$ncNameStartChar.']['.$ncNameChar.']*';
$regexp = sprintf('/(?P<prefix>%s):%s/iu', $ncName, $ncName);
```
Therefore, I decided NOT to throw an exception if found prefix is actually not a prefix. If a prefix used in xpath doesn't exist we'll still get a warning when running it (`DOMXPath::query(): Undefined namespace prefix`).
The current implementation is simple and will work in most situations. For edge cases, it's still possible to register prefixes manually.
What do you think?
Commits
-------
3292163
[DomCrawler] Fixed an issue with namespace prefix matching being to greedy.
This commit is contained in:
commit
803d434839
@ -835,8 +835,10 @@ class Crawler extends \SplObjectStorage
|
||||
|
||||
foreach ($prefixes as $prefix) {
|
||||
$namespace = $this->discoverNamespace($domxpath, $prefix);
|
||||
if (null !== $namespace) {
|
||||
$domxpath->registerNamespace($prefix, $namespace);
|
||||
}
|
||||
}
|
||||
|
||||
return $domxpath;
|
||||
}
|
||||
@ -861,8 +863,6 @@ class Crawler extends \SplObjectStorage
|
||||
if ($node = $namespaces->item(0)) {
|
||||
return $node->nodeValue;
|
||||
}
|
||||
|
||||
throw new \InvalidArgumentException(sprintf('Could not find a namespace for the prefix: "%s"', $prefix));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -872,7 +872,7 @@ class Crawler extends \SplObjectStorage
|
||||
*/
|
||||
private function findNamespacePrefixes($xpath)
|
||||
{
|
||||
if (preg_match_all('/(?P<prefix>[a-zA-Z_][a-zA-Z_0-9\-\.]*):[^:]/', $xpath, $matches)) {
|
||||
if (preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*):[^"\/]/i', $xpath, $matches)) {
|
||||
return array_unique($matches['prefix']);
|
||||
}
|
||||
|
||||
|
@ -401,15 +401,6 @@ EOF
|
||||
$this->assertSame('widescreen', $crawler->text());
|
||||
}
|
||||
|
||||
/**
|
||||
* @expectedException \InvalidArgumentException
|
||||
* @expectedExceptionMessage Could not find a namespace for the prefix: "foo"
|
||||
*/
|
||||
public function testFilterXPathWithAnInvalidNamespace()
|
||||
{
|
||||
$this->createTestXmlCrawler()->filterXPath('//media:group/foo:aspectRatio');
|
||||
}
|
||||
|
||||
public function testFilterXPathWithManuallyRegisteredNamespace()
|
||||
{
|
||||
$crawler = $this->createTestXmlCrawler();
|
||||
@ -420,6 +411,15 @@ EOF
|
||||
$this->assertSame('widescreen', $crawler->text());
|
||||
}
|
||||
|
||||
public function testFilterXPathWithAnUrl()
|
||||
{
|
||||
$crawler = $this->createTestXmlCrawler();
|
||||
|
||||
$crawler = $crawler->filterXPath('//media:category[@scheme="http://gdata.youtube.com/schemas/2007/categories.cat"]');
|
||||
$this->assertCount(1, $crawler);
|
||||
$this->assertSame('Music', $crawler->text());
|
||||
}
|
||||
|
||||
/**
|
||||
* @covers Symfony\Component\DomCrawler\Crawler::filter
|
||||
*/
|
||||
@ -741,6 +741,7 @@ EOF
|
||||
<media:title type="plain">Chordates - CrashCourse Biology #24</media:title>
|
||||
<yt:aspectRatio>widescreen</yt:aspectRatio>
|
||||
</media:group>
|
||||
<media:category label="Music" scheme="http://gdata.youtube.com/schemas/2007/categories.cat">Music</media:category>
|
||||
</entry>';
|
||||
|
||||
return new Crawler($xml, $uri);
|
||||
|
Reference in New Issue
Block a user