[DomCrawler] Fix relative path handling in links

Added relative path canonicalization according to RFC 3986, section 5.2.4
This commit is contained in:
Dmitrii Chekaliuk 2013-03-02 16:09:29 +02:00
parent 0e7b5fb3bb
commit a4ec6772df
2 changed files with 52 additions and 2 deletions

View File

@ -120,13 +120,18 @@ class Link
return $baseUri.$uri;
}
$baseUri = preg_replace('#^(.*?//[^/]+)(?:\/.*)?$#', '$1', $this->currentUri);
// absolute path
if ('/' === $uri[0]) {
return preg_replace('#^(.*?//[^/]+)(?:\/.*)?$#', '$1', $this->currentUri).$uri;
return $baseUri.$uri;
}
// relative path
return substr($this->currentUri, 0, strrpos($this->currentUri, '/') + 1).$uri;
$path = parse_url(substr($this->currentUri, strlen($baseUri)), PHP_URL_PATH);
$path = $this->canonicalizePath(substr($path, 0, strrpos($path, '/')).'/'.$uri);
return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
}
/**
@ -139,6 +144,36 @@ class Link
return $this->node->getAttribute('href');
}
/**
 * Returns the canonicalized URI path (see RFC 3986, section 5.2.4)
 *
 * Removes "." segments and resolves ".." segments against the segment
 * that precedes them. A path that ends with a "." or ".." segment is
 * treated as a directory, so the result ends with "/". For a path that
 * starts with "/", a ".." segment never climbs above the root: the
 * leading "/" is always kept.
 *
 * @param string $path URI path
 *
 * @return string
 */
protected function canonicalizePath($path)
{
    if ('' === $path || '/' === $path) {
        return $path;
    }

    $segments = explode('/', $path);

    // Only a trailing dot *segment* ("." or "..") denotes a directory.
    // Testing just the last character would also match ordinary names
    // such as "file." or "..." and wrongly append a slash to them.
    $lastSegment = end($segments);
    if ('.' === $lastSegment || '..' === $lastSegment) {
        $segments[] = '';
    }

    // For an absolute path the first segment is the empty string in front
    // of the leading "/"; it must survive any number of ".." segments.
    $protectedSegments = '/' === $path[0] ? 1 : 0;

    $output = array();
    foreach ($segments as $segment) {
        if ('..' === $segment) {
            if (count($output) > $protectedSegments) {
                array_pop($output);
            }
        } elseif ('.' !== $segment) {
            $output[] = $segment;
        }
    }

    return implode('/', $output);
}
/**
* Sets current \DOMNode instance
*

View File

@ -101,6 +101,21 @@ class LinkTest extends \PHPUnit_Framework_TestCase
array('?foo=2', 'http://localhost/bar?foo=1', 'http://localhost/bar?foo=2'),
array('?foo=2', 'http://localhost/bar/?foo=1', 'http://localhost/bar/?foo=2'),
array('?bar=2', 'http://localhost?foo=1', 'http://localhost?bar=2'),
array('.', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'),
array('./', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'),
array('./foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/foo'),
array('..', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'),
array('../', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'),
array('../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/foo'),
array('../..', 'http://localhost/foo/bar/baz', 'http://localhost/'),
array('../../', 'http://localhost/foo/bar/baz', 'http://localhost/'),
array('../../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo'),
array('../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'),
array('../bar/../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'),
array('../bar/./../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'),
array('../../', 'http://localhost/', 'http://localhost/'),
array('../../', 'http://localhost', 'http://localhost/'),
);
}
}