[DomCrawler] Abstract URI logic and crawl images
This commit is contained in:
parent
5b5970364e
commit
1553b073fa
212
src/Symfony/Component/DomCrawler/AbstractUriElement.php
Normal file
212
src/Symfony/Component/DomCrawler/AbstractUriElement.php
Normal file
@ -0,0 +1,212 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of the Symfony package.
|
||||
*
|
||||
* (c) Fabien Potencier <fabien@symfony.com>
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Symfony\Component\DomCrawler;
|
||||
|
||||
/**
 * Any HTML element that can link to a URI.
 *
 * Concrete elements decide which DOM nodes they accept (setNode()) and which
 * attribute holds the raw URI (getRawUri()); this class resolves that raw
 * value against the URI of the page the element is embedded in.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 */
abstract class AbstractUriElement
{
    /**
     * @var \DOMElement
     */
    protected $node;

    /**
     * @var string|null The upper-cased method to use for the element (null when an empty method was given)
     */
    protected $method;

    /**
     * @var string The URI of the page where the element is embedded (or the base href)
     */
    protected $currentUri;

    /**
     * @param \DOMElement $node       A \DOMElement instance
     * @param string      $currentUri The URI of the page where the element is embedded (or the base href)
     * @param string      $method     The method to use for the element (GET by default)
     *
     * @throws \InvalidArgumentException if the current URI does not start with "http" or "file" (case-insensitive)
     */
    public function __construct(\DOMElement $node, $currentUri, $method = 'GET')
    {
        // Only the first four characters are inspected, so "https" is accepted through its "http" prefix.
        if (!in_array(strtolower(substr($currentUri, 0, 4)), array('http', 'file'), true)) {
            throw new \InvalidArgumentException(sprintf('Current URI must be an absolute URL ("%s").', $currentUri));
        }

        $this->setNode($node);
        $this->method = $method ? strtoupper($method) : null;
        $this->currentUri = $currentUri;
    }

    /**
     * Gets the node associated with this element.
     *
     * @return \DOMElement A \DOMElement instance
     */
    public function getNode()
    {
        return $this->node;
    }

    /**
     * Gets the method associated with this element.
     *
     * @return string|null The upper-cased method, or null when an empty method was passed to the constructor
     */
    public function getMethod()
    {
        return $this->method;
    }

    /**
     * Gets the absolute URI associated with this element.
     *
     * The trimmed raw URI is resolved against the current URI in this order:
     * a value with a scheme is returned untouched, an empty value yields the
     * current URI, then anchors ("#..."), query strings ("?..."),
     * scheme-relative URLs ("//..."), absolute paths ("/...") and finally
     * relative paths are handled.
     *
     * @return string The URI
     */
    public function getUri()
    {
        $uri = trim($this->getRawUri());

        // absolute URL?
        if (null !== parse_url($uri, PHP_URL_SCHEME)) {
            return $uri;
        }

        // empty URI; compared against '' explicitly because a loose check
        // would also swallow the valid relative reference "0"
        if ('' === $uri) {
            return $this->currentUri;
        }

        // an anchor
        if ('#' === $uri[0]) {
            return $this->cleanupAnchor($this->currentUri).$uri;
        }

        $baseUri = $this->cleanupUri($this->currentUri);

        if ('?' === $uri[0]) {
            return $baseUri.$uri;
        }

        // absolute URL with relative schema: keep only the "scheme:" part of the base
        if (0 === strpos($uri, '//')) {
            return preg_replace('#^([^/]*)//.*$#', '$1', $baseUri).$uri;
        }

        // reduce the base to "scheme://authority"
        $baseUri = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUri);

        // absolute path
        if ('/' === $uri[0]) {
            return $baseUri.$uri;
        }

        // relative path: parse_url() yields null when the current URI has no
        // path component, so normalize to a string before slicing
        $path = (string) parse_url(substr($this->currentUri, strlen($baseUri)), PHP_URL_PATH);
        $lastSlash = strrpos($path, '/');
        $directory = false === $lastSlash ? '' : substr($path, 0, $lastSlash);
        $path = $this->canonicalizePath($directory.'/'.$uri);

        return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
    }

    /**
     * Returns raw URI data.
     *
     * @return string
     */
    abstract protected function getRawUri();

    /**
     * Returns the canonicalized URI path (see RFC 3986, section 5.2.4).
     *
     * @param string $path URI path
     *
     * @return string
     */
    protected function canonicalizePath($path)
    {
        if ('' === $path || '/' === $path) {
            return $path;
        }

        // a trailing "." or ".." denotes a directory: keep the trailing slash after resolution
        if ('.' === substr($path, -1)) {
            $path .= '/';
        }

        $output = array();

        foreach (explode('/', $path) as $segment) {
            if ('..' === $segment) {
                array_pop($output);
            } elseif ('.' !== $segment) {
                $output[] = $segment;
            }
        }

        return implode('/', $output);
    }

    /**
     * Sets current \DOMElement instance.
     *
     * @param \DOMElement $node A \DOMElement instance
     *
     * @throws \LogicException If the given node is not an element the concrete class supports
     */
    abstract protected function setNode(\DOMElement $node);

    /**
     * Removes the query string and the anchor from the given uri.
     *
     * @param string $uri The uri to clean
     *
     * @return string
     */
    private function cleanupUri($uri)
    {
        return $this->cleanupQuery($this->cleanupAnchor($uri));
    }

    /**
     * Removes the query string from the uri.
     *
     * @param string $uri
     *
     * @return string
     */
    private function cleanupQuery($uri)
    {
        if (false !== $pos = strpos($uri, '?')) {
            return substr($uri, 0, $pos);
        }

        return $uri;
    }

    /**
     * Removes the anchor from the uri.
     *
     * @param string $uri
     *
     * @return string
     */
    private function cleanupAnchor($uri)
    {
        if (false !== $pos = strpos($uri, '#')) {
            return substr($uri, 0, $pos);
        }

        return $uri;
    }
}
|
@ -1,6 +1,12 @@
|
||||
CHANGELOG
|
||||
=========
|
||||
|
||||
3.1.0
|
||||
-----
|
||||
|
||||
* All the URI parsing logic has been abstracted in the `AbstractUriElement` class. The `Link` class is now a child of `AbstractUriElement`, which provides the common `getNode`, `getMethod` and `getUri` methods.
|
||||
* Added an `Image` class to crawl images and parse their `src` attribute, and `selectImage`, `image`, `images` methods in `Crawler`, the image version of the equivalent `link` methods.
|
||||
|
||||
2.5.0
|
||||
-----
|
||||
|
||||
|
@ -58,8 +58,6 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
private $isHtml = true;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param mixed $node A Node to use as the base for the crawling
|
||||
* @param string $currentUri The current URI
|
||||
* @param string $baseHref The base href value
|
||||
@ -668,6 +666,20 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
return $this->filterRelativeXPath($xpath);
|
||||
}
|
||||
|
||||
/**
 * Selects images whose alt attribute contains the given value.
 *
 * The alt text is whitespace-normalized by the XPath expression before the
 * substring match is applied.
 *
 * @param string $value The image alt
 *
 * @return Crawler A new instance of Crawler with the filtered list of nodes
 */
public function selectImage($value)
{
    $altLiteral = static::xpathLiteral($value);

    return $this->filterRelativeXPath(
        sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', $altLiteral)
    );
}
|
||||
|
||||
/**
|
||||
* Selects a button by name or alt value for images.
|
||||
*
|
||||
@ -730,6 +742,47 @@ class Crawler implements \Countable, \IteratorAggregate
|
||||
return $links;
|
||||
}
|
||||
|
||||
/**
 * Builds an Image object from the first node of the current selection.
 *
 * @return Image An Image instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or its first node is not a \DOMElement
 */
public function image()
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $first = $this->getNode(0);

    if ($first instanceof \DOMElement) {
        return new Image($first, $this->baseHref);
    }

    throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($first)));
}
|
||||
|
||||
/**
 * Builds one Image object per node of the current selection.
 *
 * @return Image[] An array of Image instances
 *
 * @throws \InvalidArgumentException If a node of the selection is not a \DOMElement
 */
public function images()
{
    $result = array();

    foreach ($this as $element) {
        if ($element instanceof \DOMElement) {
            $result[] = new Image($element, $this->baseHref);

            continue;
        }

        throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($element)));
    }

    return $result;
}
|
||||
|
||||
/**
|
||||
* Returns a Form object for the first node in the list.
|
||||
*
|
||||
|
37
src/Symfony/Component/DomCrawler/Image.php
Normal file
37
src/Symfony/Component/DomCrawler/Image.php
Normal file
@ -0,0 +1,37 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of the Symfony package.
|
||||
*
|
||||
* (c) Fabien Potencier <fabien@symfony.com>
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Symfony\Component\DomCrawler;
|
||||
|
||||
/**
|
||||
* Image represents an HTML image (an HTML img tag).
|
||||
*/
|
||||
class Image extends AbstractUriElement
{
    /**
     * @param \DOMElement $node       The img element to wrap
     * @param string      $currentUri The URI of the page where the image is embedded (or the base href)
     */
    public function __construct(\DOMElement $node, $currentUri)
    {
        // The method is always passed as GET for images.
        parent::__construct($node, $currentUri, 'GET');
    }

    /**
     * Returns the unresolved value of the src attribute.
     *
     * @return string
     */
    protected function getRawUri()
    {
        return $this->node->getAttribute('src');
    }

    /**
     * Stores the node after checking that it is an img element.
     *
     * @param \DOMElement $node A \DOMElement instance
     *
     * @throws \LogicException If the given node is not an img tag
     */
    protected function setNode(\DOMElement $node)
    {
        $tagName = $node->nodeName;

        if ('img' === $tagName) {
            $this->node = $node;

            return;
        }

        throw new \LogicException(sprintf('Unable to visualize a "%s" tag.', $tagName));
    }
}
|
@ -16,159 +16,13 @@ namespace Symfony\Component\DomCrawler;
|
||||
*
|
||||
* @author Fabien Potencier <fabien@symfony.com>
|
||||
*/
|
||||
class Link
|
||||
class Link extends AbstractUriElement
|
||||
{
|
||||
/**
|
||||
* @var \DOMElement
|
||||
*/
|
||||
protected $node;
|
||||
|
||||
/**
|
||||
* @var string The method to use for the link
|
||||
*/
|
||||
protected $method;
|
||||
|
||||
/**
|
||||
* @var string The URI of the page where the link is embedded (or the base href)
|
||||
*/
|
||||
protected $currentUri;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param \DOMElement $node A \DOMElement instance
|
||||
* @param string $currentUri The URI of the page where the link is embedded (or the base href)
|
||||
* @param string $method The method to use for the link (get by default)
|
||||
*
|
||||
* @throws \InvalidArgumentException if the node is not a link
|
||||
*/
|
||||
public function __construct(\DOMElement $node, $currentUri, $method = 'GET')
|
||||
{
|
||||
if (!in_array(strtolower(substr($currentUri, 0, 4)), array('http', 'file'))) {
|
||||
throw new \InvalidArgumentException(sprintf('Current URI must be an absolute URL ("%s").', $currentUri));
|
||||
}
|
||||
|
||||
$this->setNode($node);
|
||||
$this->method = $method ? strtoupper($method) : null;
|
||||
$this->currentUri = $currentUri;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the node associated with this link.
|
||||
*
|
||||
* @return \DOMElement A \DOMElement instance
|
||||
*/
|
||||
public function getNode()
|
||||
{
|
||||
return $this->node;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the method associated with this link.
|
||||
*
|
||||
* @return string The method
|
||||
*/
|
||||
public function getMethod()
|
||||
{
|
||||
return $this->method;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the URI associated with this link.
|
||||
*
|
||||
* @return string The URI
|
||||
*/
|
||||
public function getUri()
|
||||
{
|
||||
$uri = trim($this->getRawUri());
|
||||
|
||||
// absolute URL?
|
||||
if (null !== parse_url($uri, PHP_URL_SCHEME)) {
|
||||
return $uri;
|
||||
}
|
||||
|
||||
// empty URI
|
||||
if (!$uri) {
|
||||
return $this->currentUri;
|
||||
}
|
||||
|
||||
// an anchor
|
||||
if ('#' === $uri[0]) {
|
||||
return $this->cleanupAnchor($this->currentUri).$uri;
|
||||
}
|
||||
|
||||
$baseUri = $this->cleanupUri($this->currentUri);
|
||||
|
||||
if ('?' === $uri[0]) {
|
||||
return $baseUri.$uri;
|
||||
}
|
||||
|
||||
// absolute URL with relative schema
|
||||
if (0 === strpos($uri, '//')) {
|
||||
return preg_replace('#^([^/]*)//.*$#', '$1', $baseUri).$uri;
|
||||
}
|
||||
|
||||
$baseUri = preg_replace('#^(.*?//[^/]*)(?:\/.*)?$#', '$1', $baseUri);
|
||||
|
||||
// absolute path
|
||||
if ('/' === $uri[0]) {
|
||||
return $baseUri.$uri;
|
||||
}
|
||||
|
||||
// relative path
|
||||
$path = parse_url(substr($this->currentUri, strlen($baseUri)), PHP_URL_PATH);
|
||||
$path = $this->canonicalizePath(substr($path, 0, strrpos($path, '/')).'/'.$uri);
|
||||
|
||||
return $baseUri.('' === $path || '/' !== $path[0] ? '/' : '').$path;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns raw URI data.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
protected function getRawUri()
|
||||
{
|
||||
return $this->node->getAttribute('href');
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the canonicalized URI path (see RFC 3986, section 5.2.4).
|
||||
*
|
||||
* @param string $path URI path
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
protected function canonicalizePath($path)
|
||||
{
|
||||
if ('' === $path || '/' === $path) {
|
||||
return $path;
|
||||
}
|
||||
|
||||
if ('.' === substr($path, -1)) {
|
||||
$path .= '/';
|
||||
}
|
||||
|
||||
$output = array();
|
||||
|
||||
foreach (explode('/', $path) as $segment) {
|
||||
if ('..' === $segment) {
|
||||
array_pop($output);
|
||||
} elseif ('.' !== $segment) {
|
||||
$output[] = $segment;
|
||||
}
|
||||
}
|
||||
|
||||
return implode('/', $output);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets current \DOMElement instance.
|
||||
*
|
||||
* @param \DOMElement $node A \DOMElement instance
|
||||
*
|
||||
* @throws \LogicException If given node is not an anchor
|
||||
*/
|
||||
protected function setNode(\DOMElement $node)
|
||||
{
|
||||
if ('a' !== $node->nodeName && 'area' !== $node->nodeName && 'link' !== $node->nodeName) {
|
||||
@ -177,48 +31,4 @@ class Link
|
||||
|
||||
$this->node = $node;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes the query string and the anchor from the given uri.
|
||||
*
|
||||
* @param string $uri The uri to clean
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
private function cleanupUri($uri)
|
||||
{
|
||||
return $this->cleanupQuery($this->cleanupAnchor($uri));
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove the query string from the uri.
|
||||
*
|
||||
* @param string $uri
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
private function cleanupQuery($uri)
|
||||
{
|
||||
if (false !== $pos = strpos($uri, '?')) {
|
||||
return substr($uri, 0, $pos);
|
||||
}
|
||||
|
||||
return $uri;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove the anchor from the uri.
|
||||
*
|
||||
* @param string $uri
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
private function cleanupAnchor($uri)
|
||||
{
|
||||
if (false !== $pos = strpos($uri, '#')) {
|
||||
return substr($uri, 0, $pos);
|
||||
}
|
||||
|
||||
return $uri;
|
||||
}
|
||||
}
|
||||
|
@ -657,6 +657,17 @@ EOF
|
||||
$this->assertCount(4, $crawler->selectLink('Bar'), '->selectLink() selects links by the node values');
|
||||
}
|
||||
|
||||
/**
 * Checks that selectImage() returns a new Crawler and filters images by their alt attribute.
 */
public function testSelectImage()
{
    $crawler = $this->createTestCrawler();
    $selection = $crawler->selectImage('Bar');

    $this->assertNotSame($crawler, $selection, '->selectImage() returns a new instance of a crawler');
    // Assert on the returned selection, not on the source crawler, otherwise the check cannot fail.
    $this->assertInstanceOf('Symfony\\Component\\DomCrawler\\Crawler', $selection, '->selectImage() returns a new instance of a crawler');

    $this->assertCount(1, $crawler->selectImage('Fabien\'s Bar'), '->selectImage() selects images by alt attribute');
    $this->assertCount(2, $crawler->selectImage('Fabien"s Bar'), '->selectImage() selects images by alt attribute');
    $this->assertCount(1, $crawler->selectImage('\' Fabien"s Bar'), '->selectImage() selects images by alt attribute');
}
|
||||
|
||||
public function testSelectButton()
|
||||
{
|
||||
$crawler = $this->createTestCrawler();
|
||||
@ -755,6 +766,19 @@ HTML;
|
||||
$crawler->filterXPath('//li/text()')->link();
|
||||
}
|
||||
|
||||
/**
 * Checks that image() wraps the first selected node and rejects an empty selection.
 */
public function testImage()
{
    $selection = $this->createTestCrawler('http://example.com/bar/')->selectImage('Bar');
    $image = $selection->image();
    $this->assertInstanceOf('Symfony\\Component\\DomCrawler\\Image', $image, '->image() returns an Image instance');

    $emptyListMessage = '->image() throws an \InvalidArgumentException if the node list is empty';

    try {
        $this->createTestCrawler()->filterXPath('//ol')->image();
        $this->fail($emptyListMessage);
    } catch (\InvalidArgumentException $e) {
        $this->assertTrue(true, $emptyListMessage);
    }
}
|
||||
|
||||
public function testSelectLinkAndLinkFiltered()
|
||||
{
|
||||
$html = <<<'HTML'
|
||||
@ -805,6 +829,18 @@ HTML;
|
||||
$this->assertEquals(array(), $this->createTestCrawler()->filterXPath('//ol')->links(), '->links() returns an empty array if the node selection is empty');
|
||||
}
|
||||
|
||||
/**
 * Checks that images() returns an array of Image instances, and an empty array for an empty selection.
 */
public function testImages()
{
    $crawler = $this->createTestCrawler('http://example.com/bar/')->selectImage('Bar');
    $this->assertInternalType('array', $crawler->images(), '->images() returns an array');

    $this->assertCount(4, $crawler->images(), '->images() returns an array');
    $images = $crawler->images();
    $this->assertInstanceOf('Symfony\\Component\\DomCrawler\\Image', $images[0], '->images() returns an array of Image instances');

    // Exercise images() here; the previous assertion called links() and left this path untested.
    $this->assertEquals(array(), $this->createTestCrawler()->filterXPath('//ol')->images(), '->images() returns an empty array if the node selection is empty');
}
|
||||
|
||||
public function testForm()
|
||||
{
|
||||
$testCrawler = $this->createTestCrawler('http://example.com/bar/');
|
||||
|
48
src/Symfony/Component/DomCrawler/Tests/ImageTest.php
Normal file
48
src/Symfony/Component/DomCrawler/Tests/ImageTest.php
Normal file
@ -0,0 +1,48 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of the Symfony package.
|
||||
*
|
||||
* (c) Fabien Potencier <fabien@symfony.com>
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Symfony\Component\DomCrawler\Tests;
|
||||
|
||||
use Symfony\Component\DomCrawler\Image;
|
||||
|
||||
class ImageTest extends \PHPUnit_Framework_TestCase
{
    /**
     * An Image must refuse to wrap an element that is not an img tag.
     *
     * @expectedException \LogicException
     */
    public function testConstructorWithANonImgTag()
    {
        $document = new \DOMDocument();
        $document->loadHTML('<html><div><div></html>');
        $div = $document->getElementsByTagName('div')->item(0);

        new Image($div, 'http://www.example.com/');
    }

    /**
     * The src attribute is resolved against the URI of the embedding page.
     *
     * @dataProvider getGetUriTests
     */
    public function testGetUri($url, $currentUri, $expected)
    {
        $markup = sprintf('<html><img alt="foo" src="%s" /></html>', $url);

        $document = new \DOMDocument();
        $document->loadHTML($markup);

        $imgNode = $document->getElementsByTagName('img')->item(0);
        $image = new Image($imgNode, $currentUri);

        $this->assertEquals($expected, $image->getUri());
    }

    /**
     * Provides [src value, current URI, expected absolute URI] triples.
     */
    public function getGetUriTests()
    {
        $currentUri = 'http://localhost/bar/foo/';

        return array(
            array('/foo.png', $currentUri, 'http://localhost/foo.png'),
            array('foo.png', $currentUri, 'http://localhost/bar/foo/foo.png'),
        );
    }
}
|
Reference in New Issue
Block a user