merged branch lazyhammer/crawler-relative-links-2.1 (PR #7244)
This PR was merged into the 2.1 branch.
Commits
-------
a4ec677
[DomCrawler] Fix relative path handling in links
Discussion
----------
[2.1][DomCrawler] Fix relative path handling in links
| Q | A
| ------------- | ---
| Bug fix? | yes
| New feature? | no
| BC breaks? | no
| Deprecations? | no
| Tests pass? | yes
| Fixed tickets | #7219
| License | MIT
| Doc PR | n/a
Added relative path canonicalization according to RFC 3986, section 5.2.4
---------------------------------------------------------------------------
by vicb at 2013-03-02T14:48:46Z
@fabpot it seems like 2.1 has a deps issue, if you look at the Travis logs
This commit is contained in:
commit
f9d683eb8d
@ -120,13 +120,18 @@ class Link
|
||||
return $baseUri.$uri;
|
||||
}
|
||||
|
||||
$baseUri = preg_replace('#^(.*?//[^/]+)(?:\/.*)?$#', '$1', $this->currentUri);
|
||||
|
||||
// absolute path
|
||||
if ('/' === $uri[0]) {
|
||||
return preg_replace('#^(.*?//[^/]+)(?:\/.*)?$#', '$1', $this->currentUri).$uri;
|
||||
return $baseUri.$uri;
|
||||
}
|
||||
|
||||
// relative path
|
||||
return substr($this->currentUri, 0, strrpos($this->currentUri, '/') + 1).$uri;
|
||||
$path = parse_url(substr($this->currentUri, strlen($baseUri)), PHP_URL_PATH);
|
||||
$path = $this->canonicalizePath(substr($path, 0, strrpos($path, '/')).'/'.$uri);
|
||||
|
||||
return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -139,6 +144,36 @@ class Link
|
||||
return $this->node->getAttribute('href');
|
||||
}
|
||||
|
||||
/**
 * Returns the canonicalized URI path (see RFC 3986, section 5.2.4)
 *
 * "." segments are dropped and each ".." segment removes the segment
 * collected before it. A ".." with nothing left to remove is ignored, which
 * may also strip the leading slash of the result.
 *
 * Only a final segment that is exactly "." or ".." is treated as a directory
 * reference and yields a trailing slash; a name that merely ends with a dot
 * (e.g. "/foo/bar.") is left untouched.
 *
 * @param string $path URI path
 *
 * @return string
 */
protected function canonicalizePath($path)
{
    if ('' === $path || '/' === $path) {
        return $path;
    }

    $segments = explode('/', $path);

    // A trailing dot segment refers to a directory: add an empty segment so
    // the canonical path ends with "/". Comparing the whole last segment
    // (rather than the last character) avoids rewriting names like "file.".
    $last = end($segments);
    if ('.' === $last || '..' === $last) {
        $segments[] = '';
    }

    $output = array();

    foreach ($segments as $segment) {
        if ('..' === $segment) {
            // Go one level up; popping an empty array is a harmless no-op
            array_pop($output);
        } elseif ('.' !== $segment) {
            $output[] = $segment;
        }
    }

    return implode('/', $output);
}
|
||||
|
||||
/**
|
||||
* Sets current \DOMNode instance
|
||||
*
|
||||
|
@ -101,6 +101,21 @@ class LinkTest extends \PHPUnit_Framework_TestCase
|
||||
array('?foo=2', 'http://localhost/bar?foo=1', 'http://localhost/bar?foo=2'),
|
||||
array('?foo=2', 'http://localhost/bar/?foo=1', 'http://localhost/bar/?foo=2'),
|
||||
array('?bar=2', 'http://localhost?foo=1', 'http://localhost?bar=2'),
|
||||
|
||||
array('.', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'),
|
||||
array('./', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/'),
|
||||
array('./foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/bar/foo'),
|
||||
array('..', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'),
|
||||
array('../', 'http://localhost/foo/bar/baz', 'http://localhost/foo/'),
|
||||
array('../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo/foo'),
|
||||
array('../..', 'http://localhost/foo/bar/baz', 'http://localhost/'),
|
||||
array('../../', 'http://localhost/foo/bar/baz', 'http://localhost/'),
|
||||
array('../../foo', 'http://localhost/foo/bar/baz', 'http://localhost/foo'),
|
||||
array('../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'),
|
||||
array('../bar/../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'),
|
||||
array('../bar/./../../foo', 'http://localhost/bar/foo/', 'http://localhost/foo'),
|
||||
array('../../', 'http://localhost/', 'http://localhost/'),
|
||||
array('../../', 'http://localhost', 'http://localhost/'),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user