[DomCrawler] Allow pipe (|) character in link tags when using Xpath expressions

This commit is contained in:
Klaus Purer 2016-10-16 22:10:53 +02:00 committed by Nicolas Grekas
parent ef48f5924a
commit 5b26e33261
2 changed files with 46 additions and 3 deletions

View File

@ -856,13 +856,12 @@ class Crawler extends \SplObjectStorage
{
$expressions = array();
$unionPattern = '/\|(?![^\[]*\])/';
// An expression which will never match, used to replace expressions which cannot match in the crawler.
// We cannot simply drop such expressions, so they are substituted with this one instead.
$nonMatchingExpression = 'a[name() = "b"]';
// Split any unions into individual expressions.
foreach (preg_split($unionPattern, $xpath) as $expression) {
foreach ($this->splitUnionParts($xpath) as $expression) {
$expression = trim($expression);
$parenthesis = '';
@ -912,6 +911,47 @@ class Crawler extends \SplObjectStorage
return implode(' | ', $expressions);
}
/**
 * Breaks an XPath expression apart at every top-level union operator (|).
 *
 * A pipe only counts as a union operator when it sits outside of single- or
 * double-quoted string literals and outside of square-bracket predicates.
 * Parentheses are not tracked, so a pipe inside a parenthesised group still
 * splits the expression. The separators are not part of the returned pieces
 * and the pieces are returned untrimmed.
 *
 * @param string $xpath The XPath expression to split
 *
 * @return string[] The pieces found between the union operators, in order
 */
private function splitUnionParts($xpath)
{
    $pieces = array();
    // Quote character of the string literal being scanned, or null when outside of any literal.
    $activeQuote = null;
    $predicateDepth = 0;
    $pieceStart = 0;

    // A character-by-character scan is used because quotes and brackets have
    // to be paired up, which a regular expression cannot do reliably.
    for ($offset = 0, $length = strlen($xpath); $offset < $length; ++$offset) {
        $symbol = $xpath[$offset];

        if (null !== $activeQuote) {
            // Inside a literal everything is opaque; only the matching quote ends it.
            if ($symbol === $activeQuote) {
                $activeQuote = null;
            }

            continue;
        }

        if ("'" === $symbol || '"' === $symbol) {
            $activeQuote = $symbol;

            continue;
        }

        if ('[' === $symbol) {
            ++$predicateDepth;
        } elseif (']' === $symbol) {
            --$predicateDepth;
        } elseif ('|' === $symbol && 0 === $predicateDepth) {
            $pieces[] = substr($xpath, $pieceStart, $offset - $pieceStart);
            $pieceStart = $offset + 1;
        }
    }

    // Whatever follows the last union operator (or the whole string if there was none).
    $pieces[] = substr($xpath, $pieceStart);

    return $pieces;
}
/**
* @param int $position
*

View File

@ -387,6 +387,7 @@ EOF
$this->assertCount(5, $crawler->filterXPath('(//a | //div)//img'));
$this->assertCount(7, $crawler->filterXPath('((//a | //div)//img | //ul)'));
$this->assertCount(7, $crawler->filterXPath('( ( //a | //div )//img | //ul )'));
$this->assertCount(1, $crawler->filterXPath("//a[./@href][((./@id = 'Klausi|Claudiu' or normalize-space(string(.)) = 'Klausi|Claudiu' or ./@title = 'Klausi|Claudiu' or ./@rel = 'Klausi|Claudiu') or .//img[./@alt = 'Klausi|Claudiu'])]"));
}
public function testFilterXPath()
@ -548,7 +549,7 @@ EOF
$this->assertCount(0, $crawler->filterXPath('self::a'), 'The fake root node has no "real" element name');
$this->assertCount(0, $crawler->filterXPath('self::a/img'), 'The fake root node has no "real" element name');
$this->assertCount(9, $crawler->filterXPath('self::*/a'));
$this->assertCount(10, $crawler->filterXPath('self::*/a'));
}
public function testFilter()
@ -969,6 +970,8 @@ HTML;
<a href="?get=param">GetLink</a>
<a href="/example">Klausi|Claudiu</a>
<form action="foo" id="FooFormId">
<input type="text" value="TextValue" name="TextName" />
<input type="submit" value="FooValue" name="FooName" id="FooId" />