feature #39097 [DomCrawler] Cache discovered namespaces (simonberger, fabpot)
This PR was merged into the 5.3-dev branch.

Discussion
----------

[DomCrawler] Cache discovered namespaces

| Q             | A
| ------------- | ---
| Branch?       | 5.x
| Bug fix?      | no
| New feature?  | yes
| Deprecations? | no
| Tickets       | Address #39067
| License       | MIT

Discovering namespaces is by far the most expensive task when filtering for nodes with an XPath that contains prefixes. When `Crawler::filterRelativeXPath` is called multiple times with (identical) prefixes, the slowdown is huge.

This fix brings the runtime of the linked ticket down from 27 seconds to 9 seconds in my test. Compared to a pure PHP version, which takes < 0.5 seconds, the design of the crawler API is the limiting factor. There are still many repeated namespace queries caused by new Crawler instances. Ideas to solve this are discussed in the ticket.

Commits
-------

a8e85ecbbd Make some CS changes
4c74dead48 Cache discovered namespaces in DomCrawler
This commit is contained in:
commit
bd7a3e1b90
@ -33,6 +33,11 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
*/
|
||||
private $namespaces = [];
|
||||
|
||||
/**
|
||||
* @var \ArrayObject A map of cached namespaces
|
||||
*/
|
||||
private $cachedNamespaces;
|
||||
|
||||
/**
|
||||
* @var string The base href value
|
||||
*/
|
||||
@ -68,6 +73,7 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
$this->uri = $uri;
|
||||
$this->baseHref = $baseHref ?: $uri;
|
||||
$this->html5Parser = class_exists(HTML5::class) ? new HTML5(['disable_html_ns' => true]) : null;
|
||||
$this->cachedNamespaces = new \ArrayObject();
|
||||
|
||||
$this->add($node);
|
||||
}
|
||||
@ -99,6 +105,7 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
{
|
||||
$this->nodes = [];
|
||||
$this->document = null;
|
||||
$this->cachedNamespaces = new \ArrayObject();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -967,12 +974,14 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
*/
|
||||
private function filterRelativeXPath(string $xpath): object
|
||||
{
|
||||
$prefixes = $this->findNamespacePrefixes($xpath);
|
||||
|
||||
$crawler = $this->createSubCrawler(null);
|
||||
if (null === $this->document) {
|
||||
return $crawler;
|
||||
}
|
||||
|
||||
$domxpath = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath));
|
||||
|
||||
foreach ($this->nodes as $node) {
|
||||
$domxpath = $this->createDOMXPath($node->ownerDocument, $prefixes);
|
||||
$crawler->add($domxpath->query($xpath, $node));
|
||||
}
|
||||
|
||||
@ -1189,10 +1198,14 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
return $this->namespaces[$prefix];
|
||||
}
|
||||
|
||||
if (isset($this->cachedNamespaces[$prefix])) {
|
||||
return $this->cachedNamespaces[$prefix];
|
||||
}
|
||||
|
||||
// ask for one namespace, otherwise we'd get a collection with an item for each node
|
||||
$namespaces = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $this->defaultNamespacePrefix === $prefix ? '' : $prefix));
|
||||
|
||||
return ($node = $namespaces->item(0)) ? $node->nodeValue : null;
|
||||
return $this->cachedNamespaces[$prefix] = ($node = $namespaces->item(0)) ? $node->nodeValue : null;
|
||||
}
|
||||
|
||||
private function findNamespacePrefixes(string $xpath): array
|
||||
@ -1217,6 +1230,7 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
$crawler->isHtml = $this->isHtml;
|
||||
$crawler->document = $this->document;
|
||||
$crawler->namespaces = $this->namespaces;
|
||||
$crawler->cachedNamespaces = $this->cachedNamespaces;
|
||||
$crawler->html5Parser = $this->html5Parser;
|
||||
|
||||
return $crawler;
|
||||
|
Reference in New Issue
Block a user