[DomCrawler] Allow the pipe (|) character in link tags when using XPath expressions
This commit is contained in:
parent
ef48f5924a
commit
5b26e33261
|
@ -856,13 +856,12 @@ class Crawler extends \SplObjectStorage
|
|||
{
|
||||
$expressions = array();
|
||||
|
||||
$unionPattern = '/\|(?![^\[]*\])/';
|
||||
// An expression which will never match to replace expressions which cannot match in the crawler
|
||||
// We cannot simply drop
|
||||
$nonMatchingExpression = 'a[name() = "b"]';
|
||||
|
||||
// Split any unions into individual expressions.
|
||||
foreach (preg_split($unionPattern, $xpath) as $expression) {
|
||||
foreach ($this->splitUnionParts($xpath) as $expression) {
|
||||
$expression = trim($expression);
|
||||
$parenthesis = '';
|
||||
|
||||
|
@ -912,6 +911,47 @@ class Crawler extends \SplObjectStorage
|
|||
return implode(' | ', $expressions);
|
||||
}
|
||||
|
||||
/**
 * Splits an XPath expression into the parts separated by the union operator.
 *
 * A pipe character only counts as a union operator when it is located
 * outside of any quoted string literal and outside of any square-bracket
 * predicate. Parentheses are not tracked, so a pipe inside a parenthesised
 * group still produces a split. The returned parts are not trimmed.
 *
 * @param string $xpath
 *
 * @return string[]
 */
private function splitUnionParts($xpath)
{
    // Split any unions into individual expressions. We need to iterate
    // through the string to correctly parse opening/closing quotes and
    // braces which is not possible with regular expressions.
    $unionParts = array();
    $inSingleQuotedString = false;
    $inDoubleQuotedString = false;
    $openedBrackets = 0;
    $lastUnion = 0;
    $xpathLength = strlen($xpath);

    for ($i = 0; $i < $xpathLength; ++$i) {
        $char = $xpath[$i];

        if ($char === "'" && !$inDoubleQuotedString) {
            // A single quote toggles the literal state unless it is itself
            // part of a double-quoted literal.
            $inSingleQuotedString = !$inSingleQuotedString;
        } elseif ($char === '"' && !$inSingleQuotedString) {
            $inDoubleQuotedString = !$inDoubleQuotedString;
        } elseif (!$inSingleQuotedString && !$inDoubleQuotedString) {
            if ($char === '[') {
                ++$openedBrackets;
            } elseif ($char === ']') {
                // Never let the depth drop below zero: a stray closing
                // bracket must not cancel out a later opening bracket,
                // otherwise a pipe inside that later predicate would be
                // mistaken for a union operator.
                if ($openedBrackets > 0) {
                    --$openedBrackets;
                }
            } elseif ($char === '|' && $openedBrackets === 0) {
                $unionParts[] = substr($xpath, $lastUnion, $i - $lastUnion);
                $lastUnion = $i + 1;
            }
        }
    }

    // Everything after the last union operator (or the whole expression when
    // there was none). The cast keeps the element a string on PHP versions
    // where substr() returns false once the offset equals the string length.
    $unionParts[] = (string) substr($xpath, $lastUnion);

    return $unionParts;
}
|
||||
|
||||
/**
|
||||
* @param int $position
|
||||
*
|
||||
|
|
|
@ -387,6 +387,7 @@ EOF
|
|||
$this->assertCount(5, $crawler->filterXPath('(//a | //div)//img'));
|
||||
$this->assertCount(7, $crawler->filterXPath('((//a | //div)//img | //ul)'));
|
||||
$this->assertCount(7, $crawler->filterXPath('( ( //a | //div )//img | //ul )'));
|
||||
$this->assertCount(1, $crawler->filterXPath("//a[./@href][((./@id = 'Klausi|Claudiu' or normalize-space(string(.)) = 'Klausi|Claudiu' or ./@title = 'Klausi|Claudiu' or ./@rel = 'Klausi|Claudiu') or .//img[./@alt = 'Klausi|Claudiu'])]"));
|
||||
}
|
||||
|
||||
public function testFilterXPath()
|
||||
|
@ -548,7 +549,7 @@ EOF
|
|||
|
||||
$this->assertCount(0, $crawler->filterXPath('self::a'), 'The fake root node has no "real" element name');
|
||||
$this->assertCount(0, $crawler->filterXPath('self::a/img'), 'The fake root node has no "real" element name');
|
||||
$this->assertCount(9, $crawler->filterXPath('self::*/a'));
|
||||
$this->assertCount(10, $crawler->filterXPath('self::*/a'));
|
||||
}
|
||||
|
||||
public function testFilter()
|
||||
|
@ -969,6 +970,8 @@ HTML;
|
|||
|
||||
<a href="?get=param">GetLink</a>
|
||||
|
||||
<a href="/example">Klausi|Claudiu</a>
|
||||
|
||||
<form action="foo" id="FooFormId">
|
||||
<input type="text" value="TextValue" name="TextName" />
|
||||
<input type="submit" value="FooValue" name="FooName" id="FooId" />
|
||||
|
|
Reference in New Issue