feature #35415 Extracted code to expand an URI to `UriExpander` (lyrixx)
This PR was merged into the 5.1-dev branch.
Discussion
----------
Extracted code to expand an URI to `UriExpander`
| Q | A
| ------------- | ---
| Branch? | master
| Bug fix? | no
| New feature? | yes
| Deprecations? | no
| Tickets |
| License | MIT
| Doc PR |
When building a crawler, we need to extract and expand all the links on a
web page.
ATM, we need to create a DomDocument, attach the href, and ask for the
full URL.
This is a bit slow and unnecessary. This is why I extracted the minimal
code to expand the URL into its own trait for better reusability.
I benched (a specific part of) my application:
* before: 2.16ms
* after: 1.42ms
Commits
-------
0c499c6b35
Extracted code to expand an URI to `UriExpanderTrait`
This commit is contained in:
commit
9b69b08062
|
@ -80,46 +80,7 @@ abstract class AbstractUriElement
|
|||
*/
|
||||
public function getUri()
{
    // The expansion algorithm (absolute URL, empty URI, anchor, query string,
    // scheme-relative URL, absolute path and relative path handling) lives in
    // UriExpander::expand(); delegating keeps a single implementation instead
    // of an inline copy followed by an unreachable second return.
    return UriExpander::expand($this->getRawUri(), $this->currentUri);
}
|
||||
|
||||
/**
|
||||
|
@ -167,36 +128,4 @@ abstract class AbstractUriElement
|
|||
* @throws \LogicException If given node is not an anchor
|
||||
*/
|
||||
abstract protected function setNode(\DOMElement $node);
|
||||
|
||||
/**
 * Strips both the anchor and the query string from the given uri.
 *
 * The anchor is removed first, then the query string is removed from
 * what is left.
 */
private function cleanupUri(string $uri): string
{
    $withoutAnchor = $this->cleanupAnchor($uri);

    return $this->cleanupQuery($withoutAnchor);
}
|
||||
|
||||
/**
 * Returns the part of the uri located before its first "?".
 *
 * The uri is returned unchanged when it contains no "?".
 */
private function cleanupQuery(string $uri): string
{
    $queryStart = strpos($uri, '?');

    return false === $queryStart ? $uri : substr($uri, 0, $queryStart);
}
|
||||
|
||||
/**
 * Returns the part of the uri located before its first "#".
 *
 * The uri is returned unchanged when it contains no "#".
 */
private function cleanupAnchor(string $uri): string
{
    $anchorStart = strpos($uri, '#');

    return false === $anchorStart ? $uri : substr($uri, 0, $anchorStart);
}
|
||||
}
|
||||
|
|
|
@ -5,6 +5,7 @@ CHANGELOG
|
|||
-----
|
||||
|
||||
* Added an internal cache layer on top of the CssSelectorConverter
|
||||
* Added `UriExpander` to expand a URL according to another URL
|
||||
|
||||
5.0.0
|
||||
-----
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of the Symfony package.
|
||||
*
|
||||
* (c) Fabien Potencier <fabien@symfony.com>
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Symfony\Component\DomCrawler\Tests;
|
||||
|
||||
use PHPUnit\Framework\TestCase;
|
||||
use Symfony\Component\DomCrawler\UriExpander;
|
||||
|
||||
/**
 * Checks UriExpander::expand() against a table of
 * (uri, current uri, expected expanded uri) triples.
 */
class UriExpanderTest extends TestCase
{
    /**
     * @dataProvider provideExpandUriTests
     */
    public function testExpandUri(string $uri, string $currentUri, string $expected): void
    {
        // assertSame: the expanded URI must be the exact same string; assertEquals
        // would compare loosely and accept numerically-equal strings.
        $this->assertSame($expected, UriExpander::expand($uri, $currentUri));
    }

    /**
     * Each row is [uri to expand, current uri, expected result].
     */
    public static function provideExpandUriTests(): array
    {
        return [
            ['/foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
            ['/foo', 'http://localhost/bar/foo', 'http://localhost/foo'],
            // leading / trailing whitespace around the uri is ignored
            ["\n/foo", 'http://localhost/bar/foo/', 'http://localhost/foo'],
            ["/foo\n", 'http://localhost/bar/foo', 'http://localhost/foo'],

            ['foo', 'http://localhost/bar/foo/', 'http://localhost/bar/foo/foo'],
            ['foo', 'http://localhost/bar/foo', 'http://localhost/bar/foo'],

            ['', 'http://localhost/bar/', 'http://localhost/bar/'],
            ['#', 'http://localhost/bar/', 'http://localhost/bar/#'],
            ['#bar', 'http://localhost/bar?a=b', 'http://localhost/bar?a=b#bar'],
            ['#bar', 'http://localhost/bar/#foo', 'http://localhost/bar/#bar'],
            ['?a=b', 'http://localhost/bar#foo', 'http://localhost/bar?a=b'],
            ['?a=b', 'http://localhost/bar/', 'http://localhost/bar/?a=b'],

            ['http://login.foo.com/foo', 'http://localhost/bar/', 'http://login.foo.com/foo'],
            ['https://login.foo.com/foo', 'https://localhost/bar/', 'https://login.foo.com/foo'],
            ['mailto:foo@bar.com', 'http://localhost/foo', 'mailto:foo@bar.com'],

            // tests schema relative URL (issue #7169)
            ['//login.foo.com/foo', 'http://localhost/bar/', 'http://login.foo.com/foo'],
            ['//login.foo.com/foo', 'https://localhost/bar/', 'https://login.foo.com/foo'],

            ['?foo=2', 'http://localhost?foo=1', 'http://localhost?foo=2'],
            ['?foo=2', 'http://localhost/?foo=1', 'http://localhost/?foo=2'],
            ['?foo=2', 'http://localhost/bar?foo=1', 'http://localhost/bar?foo=2'],
            ['?foo=2', 'http://localhost/bar/?foo=1', 'http://localhost/bar/?foo=2'],
            ['?bar=2', 'http://localhost?foo=1', 'http://localhost?bar=2'],

            ['foo', 'http://login.foo.com/bar/baz?/query/string', 'http://login.foo.com/bar/foo'],

            ['.', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'],
            ['./', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'],
            ['./foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/foo'],
            ['..', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'],
            ['../', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'],
            ['../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/foo'],
            ['../..', 'http://localhost/foo/bar/baz', 'http://localhost/'],
            ['../../', 'http://localhost/foo/bar/baz', 'http://localhost/'],
            ['../../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo'],
            ['../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
            ['../bar/../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
            ['../bar/./../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'],
            ['../../', 'http://localhost/', 'http://localhost/'],
            ['../../', 'http://localhost', 'http://localhost/'],

            ['/foo', 'http://localhost?bar=1', 'http://localhost/foo'],
            ['/foo', 'http://localhost#bar', 'http://localhost/foo'],
            ['/foo', 'file:///', 'file:///foo'],
            ['/foo', 'file:///bar/baz', 'file:///foo'],
            ['foo', 'file:///', 'file:///foo'],
            ['foo', 'file:///bar/baz', 'file:///bar/foo'],
        ];
    }
}
|
|
@ -0,0 +1,135 @@
|
|||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of the Symfony package.
|
||||
*
|
||||
* (c) Fabien Potencier <fabien@symfony.com>
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Symfony\Component\DomCrawler;
|
||||
|
||||
/**
 * Expands a URI according to a current URI.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 * @author Grégoire Pineau <lyrixx@lyrixx.info>
 */
class UriExpander
{
    /**
     * Expands a URI according to a current URI.
     *
     * For example if $uri=/foo/bar and $currentUri=https://symfony.com it will
     * return https://symfony.com/foo/bar
     *
     * If the $uri is not absolute you must pass an absolute $currentUri
     *
     * @param string      $uri        The URI to expand; surrounding whitespace is ignored
     * @param string|null $currentUri The URI that relative references are expanded against
     *
     * @return string The expanded URI
     *
     * @throws \InvalidArgumentException When $uri has no scheme and $currentUri is null
     */
    public static function expand(string $uri, ?string $currentUri): string
    {
        $uri = trim($uri);

        // absolute URL?
        // parse_url() yields null when there is no scheme; any other result
        // (a scheme, or false for a seriously malformed URL) means the value
        // is returned untouched.
        if (null !== parse_url($uri, PHP_URL_SCHEME)) {
            return $uri;
        }

        if (null === $currentUri) {
            throw new \InvalidArgumentException('The URI is relative, so you must define its base URI passing an absolute URL.');
        }

        // empty URI
        // Strict comparison on purpose: "0" is a valid relative reference and
        // must not be mistaken for an empty URI.
        if ('' === $uri) {
            return $currentUri;
        }

        // an anchor
        if ('#' === $uri[0]) {
            return self::cleanupAnchor($currentUri).$uri;
        }

        $baseUri = self::cleanupUri($currentUri);

        if ('?' === $uri[0]) {
            return $baseUri.$uri;
        }

        // absolute URL with relative schema
        if (0 === strpos($uri, '//')) {
            // keep only what precedes the first "//" of the base (e.g. "http:")
            return preg_replace('#^([^/]*)//.*$#', '$1', $baseUri).$uri;
        }

        // reduce the base to "<scheme>://<authority>"
        $baseUri = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUri);

        // absolute path
        if ('/' === $uri[0]) {
            return $baseUri.$uri;
        }

        // relative path
        // parse_url() returns null when the remainder has no path component
        // (e.g. a current URI such as "http://host?x=1"), hence the cast.
        $path = (string) parse_url(substr($currentUri, \strlen($baseUri)), PHP_URL_PATH);
        $lastSlash = strrpos($path, '/');
        $directory = false === $lastSlash ? '' : substr($path, 0, $lastSlash);
        $path = self::canonicalizePath($directory.'/'.$uri);

        // canonicalization may have dropped the leading slash; restore it
        return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
    }

    /**
     * Returns the canonicalized URI path (see RFC 3986, section 5.2.4).
     *
     * "." segments are dropped and each ".." segment removes the segment
     * that precedes it.
     */
    private static function canonicalizePath(string $path): string
    {
        if ('' === $path || '/' === $path) {
            return $path;
        }

        $segments = explode('/', $path);
        $lastSegment = $segments[\count($segments) - 1];

        // A path ending with a dot segment designates a directory, so the
        // result must end with a slash. Only real dot segments qualify: a
        // file name that merely ends with a dot ("file.") is left alone.
        if ('.' === $lastSegment || '..' === $lastSegment) {
            $segments[] = '';
        }

        $output = [];

        foreach ($segments as $segment) {
            if ('..' === $segment) {
                array_pop($output);
            } elseif ('.' !== $segment) {
                $output[] = $segment;
            }
        }

        return implode('/', $output);
    }

    /**
     * Removes the query string and the anchor from the given uri.
     */
    private static function cleanupUri(string $uri): string
    {
        return self::cleanupQuery(self::cleanupAnchor($uri));
    }

    /**
     * Removes the query string from the uri.
     */
    private static function cleanupQuery(string $uri): string
    {
        if (false !== $pos = strpos($uri, '?')) {
            return substr($uri, 0, $pos);
        }

        return $uri;
    }

    /**
     * Removes the anchor from the uri.
     */
    private static function cleanupAnchor(string $uri): string
    {
        if (false !== $pos = strpos($uri, '#')) {
            return substr($uri, 0, $pos);
        }

        return $uri;
    }
}
|
Reference in New Issue