diff --git a/src/Symfony/Component/DomCrawler/AbstractUriElement.php b/src/Symfony/Component/DomCrawler/AbstractUriElement.php index 3dc67c7cab..4e31b38af6 100644 --- a/src/Symfony/Component/DomCrawler/AbstractUriElement.php +++ b/src/Symfony/Component/DomCrawler/AbstractUriElement.php @@ -80,46 +80,7 @@ abstract class AbstractUriElement */ public function getUri() { - $uri = trim($this->getRawUri()); - - // absolute URL? - if (null !== parse_url($uri, PHP_URL_SCHEME)) { - return $uri; - } - - // empty URI - if (!$uri) { - return $this->currentUri; - } - - // an anchor - if ('#' === $uri[0]) { - return $this->cleanupAnchor($this->currentUri).$uri; - } - - $baseUri = $this->cleanupUri($this->currentUri); - - if ('?' === $uri[0]) { - return $baseUri.$uri; - } - - // absolute URL with relative schema - if (0 === strpos($uri, '//')) { - return preg_replace('#^([^/]*)//.*$#', '$1', $baseUri).$uri; - } - - $baseUri = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUri); - - // absolute path - if ('/' === $uri[0]) { - return $baseUri.$uri; - } - - // relative path - $path = parse_url(substr($this->currentUri, \strlen($baseUri)), PHP_URL_PATH); - $path = $this->canonicalizePath(substr($path, 0, strrpos($path, '/')).'/'.$uri); - - return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path; + return UriExpander::expand($this->getRawUri(), $this->currentUri); } /** @@ -167,36 +128,4 @@ abstract class AbstractUriElement * @throws \LogicException If given node is not an anchor */ abstract protected function setNode(\DOMElement $node); - - /** - * Removes the query string and the anchor from the given uri. - */ - private function cleanupUri(string $uri): string - { - return $this->cleanupQuery($this->cleanupAnchor($uri)); - } - - /** - * Remove the query string from the uri. 
- */ - private function cleanupQuery(string $uri): string - { - if (false !== $pos = strpos($uri, '?')) { - return substr($uri, 0, $pos); - } - - return $uri; - } - - /** - * Remove the anchor from the uri. - */ - private function cleanupAnchor(string $uri): string - { - if (false !== $pos = strpos($uri, '#')) { - return substr($uri, 0, $pos); - } - - return $uri; - } } diff --git a/src/Symfony/Component/DomCrawler/CHANGELOG.md b/src/Symfony/Component/DomCrawler/CHANGELOG.md index b55e781f27..9f0e0dd32a 100644 --- a/src/Symfony/Component/DomCrawler/CHANGELOG.md +++ b/src/Symfony/Component/DomCrawler/CHANGELOG.md @@ -5,6 +5,7 @@ CHANGELOG ----- * Added an internal cache layer on top of the CssSelectorConverter +* Added `UriExpander` to expand a URL according to another URL 5.0.0 ----- diff --git a/src/Symfony/Component/DomCrawler/Tests/UriExpanderTest.php b/src/Symfony/Component/DomCrawler/Tests/UriExpanderTest.php new file mode 100644 index 0000000000..1d783a3b39 --- /dev/null +++ b/src/Symfony/Component/DomCrawler/Tests/UriExpanderTest.php @@ -0,0 +1,86 @@ + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + +namespace Symfony\Component\DomCrawler\Tests; + +use PHPUnit\Framework\TestCase; +use Symfony\Component\DomCrawler\UriExpander; + +class UriExpanderTest extends TestCase +{ + /** + * @dataProvider provideExpandUriTests + */ + public function testExpandUri(string $uri, string $currentUri, string $expected) + { + $this->assertEquals($expected, UriExpander::expand($uri, $currentUri)); + } + + public function provideExpandUriTests() + { + return [ + ['/foo', 'http://localhost/bar/foo/', 'http://localhost/foo'], + ['/foo', 'http://localhost/bar/foo', 'http://localhost/foo'], + [' + /foo', 'http://localhost/bar/foo/', 'http://localhost/foo'], + ['/foo + ', 'http://localhost/bar/foo', 'http://localhost/foo'], + + ['foo', 'http://localhost/bar/foo/', 'http://localhost/bar/foo/foo'], + ['foo', 'http://localhost/bar/foo', 'http://localhost/bar/foo'], + + ['', 'http://localhost/bar/', 'http://localhost/bar/'], + ['#', 'http://localhost/bar/', 'http://localhost/bar/#'], + ['#bar', 'http://localhost/bar?a=b', 'http://localhost/bar?a=b#bar'], + ['#bar', 'http://localhost/bar/#foo', 'http://localhost/bar/#bar'], + ['?a=b', 'http://localhost/bar#foo', 'http://localhost/bar?a=b'], + ['?a=b', 'http://localhost/bar/', 'http://localhost/bar/?a=b'], + + ['http://login.foo.com/foo', 'http://localhost/bar/', 'http://login.foo.com/foo'], + ['https://login.foo.com/foo', 'https://localhost/bar/', 'https://login.foo.com/foo'], + ['mailto:foo@bar.com', 'http://localhost/foo', 'mailto:foo@bar.com'], + + // tests schema relative URL (issue #7169) + ['//login.foo.com/foo', 'http://localhost/bar/', 'http://login.foo.com/foo'], + ['//login.foo.com/foo', 'https://localhost/bar/', 'https://login.foo.com/foo'], + + ['?foo=2', 'http://localhost?foo=1', 'http://localhost?foo=2'], + ['?foo=2', 'http://localhost/?foo=1', 'http://localhost/?foo=2'], + ['?foo=2', 'http://localhost/bar?foo=1', 'http://localhost/bar?foo=2'], + ['?foo=2', 'http://localhost/bar/?foo=1', 'http://localhost/bar/?foo=2'], 
+ ['?bar=2', 'http://localhost?foo=1', 'http://localhost?bar=2'], + + ['foo', 'http://login.foo.com/bar/baz?/query/string', 'http://login.foo.com/bar/foo'], + + ['.', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'], + ['./', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'], + ['./foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/foo'], + ['..', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'], + ['../', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'], + ['../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/foo'], + ['../..', 'http://localhost/foo/bar/baz', 'http://localhost/'], + ['../../', 'http://localhost/foo/bar/baz', 'http://localhost/'], + ['../../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo'], + ['../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'], + ['../bar/../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'], + ['../bar/./../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'], + ['../../', 'http://localhost/', 'http://localhost/'], + ['../../', 'http://localhost', 'http://localhost/'], + + ['/foo', 'http://localhost?bar=1', 'http://localhost/foo'], + ['/foo', 'http://localhost#bar', 'http://localhost/foo'], + ['/foo', 'file:///', 'file:///foo'], + ['/foo', 'file:///bar/baz', 'file:///foo'], + ['foo', 'file:///', 'file:///foo'], + ['foo', 'file:///bar/baz', 'file:///bar/foo'], + ]; + } +} diff --git a/src/Symfony/Component/DomCrawler/UriExpander.php b/src/Symfony/Component/DomCrawler/UriExpander.php new file mode 100644 index 0000000000..51bc408ae3 --- /dev/null +++ b/src/Symfony/Component/DomCrawler/UriExpander.php @@ -0,0 +1,135 @@ + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\Component\DomCrawler; + +/** + * Expands a URI according to a current URI. 
+ *
+ * @author Fabien Potencier
+ * @author Grégoire Pineau
+ */
+class UriExpander
+{
+    /**
+     * Expands a URI relative to a current (base) URI.
+     *
+     * For example if $uri=/foo/bar and $currentUri=https://symfony.com it will
+     * return https://symfony.com/foo/bar; a $uri with a scheme is returned trimmed.
+     *
+     * @throws \InvalidArgumentException if $uri is relative and $currentUri is null
+     */
+    public static function expand(string $uri, ?string $currentUri): string
+    {
+        $uri = trim($uri);
+
+        // absolute URL? (anything parse_url() finds a scheme in is returned untouched)
+        if (null !== parse_url($uri, PHP_URL_SCHEME)) {
+            return $uri;
+        }
+
+        if (null === $currentUri) {
+            throw new \InvalidArgumentException('The URI is relative, so you must define its base URI passing an absolute URL.');
+        }
+
+        // empty URI (strict check: "0" is a valid relative reference, not an empty one)
+        if ('' === $uri) {
+            return $currentUri;
+        }
+
+        // an anchor
+        if ('#' === $uri[0]) {
+            return self::cleanupAnchor($currentUri).$uri;
+        }
+
+        $baseUri = self::cleanupUri($currentUri);
+
+        if ('?' === $uri[0]) {
+            return $baseUri.$uri;
+        }
+
+        // absolute URL with relative schema
+        if (0 === strpos($uri, '//')) {
+            return preg_replace('#^([^/]*)//.*$#', '$1', $baseUri).$uri;
+        }
+
+        $baseUri = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUri);
+
+        // absolute path
+        if ('/' === $uri[0]) {
+            return $baseUri.$uri;
+        }
+
+        // relative path (parse_url() gives null when the base has no path: normalize to '')
+        $path = (string) parse_url(substr($currentUri, \strlen($baseUri)), PHP_URL_PATH);
+        $path = self::canonicalizePath(substr($path, 0, (int) strrpos($path, '/')).'/'.$uri);
+
+        return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
+    }
+
+    /**
+     * Returns the canonicalized URI path (see RFC 3986, section 5.2.4).
+     */
+    private static function canonicalizePath(string $path): string
+    {
+        if ('' === $path || '/' === $path) {
+            return $path;
+        }
+
+        if ('.' === substr($path, -1)) {
+            $path .= '/';
+        }
+
+        $output = [];
+
+        foreach (explode('/', $path) as $segment) {
+            if ('..' === $segment) {
+                array_pop($output);
+            } elseif ('.' !== $segment) {
+                $output[] = $segment;
+            }
+        }
+
+        return implode('/', $output);
+    }
+
+    /**
+     * Removes the query string and the anchor from the given uri.
+     */
+    private static function cleanupUri(string $uri): string
+    {
+        return self::cleanupQuery(self::cleanupAnchor($uri));
+    }
+
+    /**
+     * Removes the query string from the uri.
+     */
+    private static function cleanupQuery(string $uri): string
+    {
+        if (false !== $pos = strpos($uri, '?')) {
+            return substr($uri, 0, $pos);
+        }
+
+        return $uri;
+    }
+
+    /**
+     * Removes the anchor from the uri.
+     */
+    private static function cleanupAnchor(string $uri): string
+    {
+        if (false !== $pos = strpos($uri, '#')) {
+            return substr($uri, 0, $pos);
+        }
+
+        return $uri;
+    }
+}