2010-04-15 13:41:42 +01:00
< ? php
/*
2010-04-24 00:22:16 +01:00
* This file is part of the Symfony package .
2010-04-15 13:41:42 +01:00
*
2011-03-06 11:40:06 +00:00
* ( c ) Fabien Potencier < fabien @ symfony . com >
2010-04-15 13:41:42 +01:00
*
* For the full copyright and license information , please view the LICENSE
* file that was distributed with this source code .
*/
2011-01-15 13:29:43 +00:00
namespace Symfony\Component\DomCrawler ;
2015-09-30 10:22:02 +01:00
use Symfony\Component\CssSelector\CssSelectorConverter ;
2011-01-15 13:29:43 +00:00
2010-04-15 13:41:42 +01:00
/**
2015-12-16 16:46:28 +00:00
* Crawler eases navigation of a list of \DOMNode objects .
2010-04-15 13:41:42 +01:00
*
2011-03-06 11:40:06 +00:00
* @ author Fabien Potencier < fabien @ symfony . com >
2010-04-15 13:41:42 +01:00
*/
2015-10-03 00:19:52 +01:00
class Crawler implements \Countable , \IteratorAggregate
2010-04-15 13:41:42 +01:00
{
2012-04-08 08:55:44 +01:00
/**
2014-10-22 08:21:07 +01:00
* @ var string The current URI
2012-04-08 08:55:44 +01:00
*/
2013-02-28 10:23:15 +00:00
protected $uri ;
2010-04-15 13:41:42 +01:00
2013-09-18 13:18:44 +01:00
/**
* @ var string The default namespace prefix to be used with XPath and CSS expressions
*/
private $defaultNamespacePrefix = 'default' ;
2013-09-22 23:29:38 +01:00
/**
* @ var array A map of manually registered namespaces
*/
private $namespaces = array ();
2014-10-22 08:21:07 +01:00
/**
* @ var string The base href value
*/
private $baseHref ;
2015-10-01 23:41:47 +01:00
/**
* @ var \DOMDocument | null
*/
private $document ;
2015-10-02 13:08:08 +01:00
/**
2015-10-03 00:19:52 +01:00
* @ var \DOMElement []
2015-10-02 13:08:08 +01:00
*/
private $nodes = array ();
2015-09-26 23:37:59 +01:00
/**
2015-09-30 10:22:02 +01:00
* Whether the Crawler contains HTML or XML content ( used when converting CSS to XPath ) .
2015-09-26 23:37:59 +01:00
*
* @ var bool
*/
private $isHtml = true ;
2010-05-06 12:25:53 +01:00
/**
 * Creates a crawler and seeds it with an optional initial node.
 *
 * @param mixed  $node       A Node to use as the base for the crawling (forwarded to add())
 * @param string $currentUri The current URI
 * @param string $baseHref   The base href value; when empty, $currentUri is used instead
 */
public function __construct($node = null, $currentUri = null, $baseHref = null)
{
    $this->uri = $currentUri;

    // Any falsy base href (null, '', ...) falls back to the current URI.
    if ($baseHref) {
        $this->baseHref = $baseHref;
    } else {
        $this->baseHref = $currentUri;
    }

    $this->add($node);
}
2010-04-15 13:41:42 +01:00
2016-03-15 22:05:44 +00:00
/**
 * Gives access to the URI stored on this crawler.
 *
 * @return string The current URI
 */
public function getUri()
{
    return $this->uri;
}
2016-03-12 19:13:41 +00:00
/**
 * Gives access to the base href stored on this crawler.
 *
 * @return string The base href value
 */
public function getBaseHref()
{
    return $this->baseHref;
}
2010-05-06 12:25:53 +01:00
/**
 * Empties the crawler: forgets the attached document and every node.
 */
public function clear()
{
    $this->document = null;
    $this->nodes = array();
}
2010-05-06 12:25:53 +01:00
/**
 * Adds a node to the current list of nodes.
 *
 * Dispatches to the specialized add*() method matching the argument type:
 * addNodeList(), addNode(), addNodes() or addContent(). A null argument
 * is silently ignored.
 *
 * @param \DOMNodeList|\DOMNode|array|string|null $node A node
 *
 * @throws \InvalidArgumentException when node is not the expected type
 */
public function add($node)
{
    if (null === $node) {
        return;
    }

    if ($node instanceof \DOMNodeList) {
        $this->addNodeList($node);

        return;
    }

    if ($node instanceof \DOMNode) {
        $this->addNode($node);

        return;
    }

    if (is_array($node)) {
        $this->addNodes($node);

        return;
    }

    if (is_string($node)) {
        $this->addContent($node);

        return;
    }

    $actualType = is_object($node) ? get_class($node) : gettype($node);

    throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', $actualType));
}
2011-03-24 09:00:10 +00:00
/**
 * Adds HTML/XML content.
 *
 * When no content type is given, it is guessed from the content: a string
 * starting with "<?xml" is treated as "application/xml", anything else as
 * "text/html". Content whose type matches neither "xml" nor "html" is ignored.
 *
 * The charset is taken, in order, from the "charset=" parameter of the
 * content type, from a <meta ... charset=...> declaration in the content,
 * and finally defaults to ISO-8859-1, which is the default charset defined
 * by the HTTP 1.1 specification.
 *
 * @param string      $content A string to parse as HTML/XML
 * @param null|string $type    The content type of the string
 */
public function addContent($content, $type = null)
{
    if (empty($type)) {
        $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
    }

    // DOM only for HTML/XML content
    if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
        return;
    }

    $charset = null;
    if (false !== $pos = stripos($type, 'charset=')) {
        $charset = substr($type, $pos + 8);
        if (false !== $pos = strpos($charset, ';')) {
            $charset = substr($charset, 0, $pos);
        }

        // Parameter values may be quoted or padded (e.g. charset="utf-8");
        // strip that decoration and treat an empty value as "not provided".
        $charset = trim((string) $charset, " \t\r\n\"'");
        if ('' === $charset) {
            $charset = null;
        }
    }

    // http://www.w3.org/TR/encoding/#encodings
    // http://www.w3.org/TR/REC-xml/#NT-EncName
    if (null === $charset &&
        preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
        $charset = $matches[1];
    }

    if (null === $charset) {
        $charset = 'ISO-8859-1';
    }

    // The pattern above is case-insensitive, so the captured group may be
    // "X"/"HT" in any case; normalize before deciding between XML and HTML.
    if ('x' === strtolower($xmlMatches[1])) {
        $this->addXmlContent($content, $charset);
    } else {
        $this->addHtmlContent($content, $charset);
    }
}
2010-05-06 12:25:53 +01:00
/**
 * Adds an HTML content to the list of nodes.
 *
 * While the content is parsed, libxml internal error handling is switched on
 * and the entity loader is disabled; both previous settings are restored
 * afterwards.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * After the document is added, the href of the first <base> element found
 * (if any, and not empty) updates the crawler's base href: it is resolved
 * through a Link against the existing base href when there is one, or used
 * as-is otherwise.
 *
 * @param string $content The HTML content
 * @param string $charset The charset
 */
public function addHtmlContent($content, $charset = 'UTF-8')
{
    $previousInternalErrors = libxml_use_internal_errors(true);
    $previousEntityLoader = libxml_disable_entity_loader(true);

    $document = new \DOMDocument('1.0', $charset);
    $document->validateOnParse = true;

    // Any PHP error raised by the conversion below becomes an exception,
    // which is then deliberately swallowed: the content is kept unconverted.
    set_error_handler(function () { throw new \Exception(); });

    try {
        // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
        $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
    } catch (\Exception $e) {
    }

    restore_error_handler();

    if ('' !== trim($content)) {
        @$document->loadHTML($content);
    }

    libxml_use_internal_errors($previousInternalErrors);
    libxml_disable_entity_loader($previousEntityLoader);

    $this->addDocument($document);

    $baseHrefs = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));
    $documentBaseHref = current($baseHrefs);

    // No <base> element, or an empty href: the base href stays untouched.
    if (!count($baseHrefs) || empty($documentBaseHref)) {
        return;
    }

    if (!$this->baseHref) {
        $this->baseHref = $documentBaseHref;

        return;
    }

    // Resolve the document's base href against the one already known.
    $anchor = $document->createElement('a');
    $anchor->setAttribute('href', $documentBaseHref);
    $resolver = new Link($anchor, $this->baseHref);
    $this->baseHref = $resolver->getUri();
}
2010-05-06 12:25:53 +01:00
/**
 * Adds an XML content to the list of nodes.
 *
 * While the content is parsed, libxml internal error handling is switched on
 * and the entity loader is disabled; both previous settings are restored
 * afterwards. The crawler is flagged as holding non-HTML content.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * @param string $content The XML content
 * @param string $charset The charset
 * @param int    $options Bitwise OR of the libxml option constants
 *                        LIBXML_PARSEHUGE is dangerous, see
 *                        http://symfony.com/blog/security-release-symfony-2-0-17-released
 */
public function addXmlContent($content, $charset = 'UTF-8', $options = LIBXML_NONET)
{
    // remove the default namespace if it's the only namespace to make XPath expressions simpler
    $hasPrefixedNamespace = preg_match('/xmlns:/', $content);
    if (!$hasPrefixedNamespace) {
        $content = str_replace('xmlns', 'ns', $content);
    }

    $previousInternalErrors = libxml_use_internal_errors(true);
    $previousEntityLoader = libxml_disable_entity_loader(true);

    $document = new \DOMDocument('1.0', $charset);
    $document->validateOnParse = true;

    if ('' !== trim($content)) {
        @$document->loadXML($content, $options);
    }

    libxml_use_internal_errors($previousInternalErrors);
    libxml_disable_entity_loader($previousEntityLoader);

    $this->addDocument($document);

    $this->isHtml = false;
}
2010-05-06 12:25:53 +01:00
/**
 * Adds a \DOMDocument to the list of nodes.
 *
 * Only the document's root element is added; a document without a root
 * element is ignored.
 *
 * @param \DOMDocument $dom A \DOMDocument instance
 */
public function addDocument(\DOMDocument $dom)
{
    if (!$dom->documentElement) {
        return;
    }

    $this->addNode($dom->documentElement);
}
2010-05-06 12:25:53 +01:00
/**
 * Adds a \DOMNodeList to the list of nodes.
 *
 * Entries that are not \DOMNode instances are skipped.
 *
 * @param \DOMNodeList $nodes A \DOMNodeList instance
 */
public function addNodeList(\DOMNodeList $nodes)
{
    foreach ($nodes as $candidate) {
        if (!$candidate instanceof \DOMNode) {
            continue;
        }

        $this->addNode($candidate);
    }
}
2010-05-06 12:25:53 +01:00
/**
 * Adds an array of \DOMNode instances to the list of nodes.
 *
 * Every entry goes through add(), so each one is dispatched on its own type.
 *
 * @param \DOMNode[] $nodes An array of \DOMNode instances
 */
public function addNodes(array $nodes)
{
    foreach ($nodes as $entry) {
        $this->add($entry);
    }
}
2010-04-15 13:41:42 +01:00
2010-05-06 12:25:53 +01:00
/**
 * Adds a \DOMNode instance to the list of nodes.
 *
 * A \DOMDocument is replaced by its root element; a document without a
 * root element adds nothing. The first node added binds the crawler to that
 * node's owner document, and a node that is already present is not added again.
 *
 * @param \DOMNode $node A \DOMNode instance
 *
 * @throws \InvalidArgumentException When the node belongs to a different document than the nodes already attached
 */
public function addNode(\DOMNode $node)
{
    if ($node instanceof \DOMDocument) {
        $node = $node->documentElement;

        // An empty document has no root element: there is nothing to add, and
        // going on would dereference null and store a null entry in the list.
        if (null === $node) {
            return;
        }
    }

    if (null !== $this->document && $this->document !== $node->ownerDocument) {
        throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
    }

    if (null === $this->document) {
        $this->document = $node->ownerDocument;
    }

    // Don't add duplicate nodes in the Crawler
    if (in_array($node, $this->nodes, true)) {
        return;
    }

    $this->nodes[] = $node;
}
2010-05-06 12:25:53 +01:00
/**
 * Returns a node given its position in the node list.
 *
 * When no node exists at that position, the sub-crawler is created from null.
 *
 * @param int $position The position
 *
 * @return self
 */
public function eq($position)
{
    $selected = isset($this->nodes[$position]) ? $this->nodes[$position] : null;

    return $this->createSubCrawler($selected);
}
/**
 * Calls an anonymous function on each node of the list.
 *
 * The anonymous function receives, in this order, the node wrapped in a
 * Crawler instance and its position in the list.
 *
 * Example:
 *
 *     $crawler->filter('h1')->each(function ($node, $i) {
 *         return $node->text();
 *     });
 *
 * @param \Closure $closure An anonymous function
 *
 * @return array An array of values returned by the anonymous function
 */
public function each(\Closure $closure)
{
    $results = array();

    foreach ($this->nodes as $position => $domNode) {
        $wrapped = $this->createSubCrawler($domNode);
        $results[] = $closure($wrapped, $position);
    }

    return $results;
}
2010-04-15 13:41:42 +01:00
2014-04-13 11:38:36 +01:00
/**
 * Slices the list of nodes by $offset and $length.
 *
 * Both arguments are handed to array_slice() unchanged.
 *
 * @param int      $offset
 * @param int|null $length
 *
 * @return self
 */
public function slice($offset = 0, $length = null)
{
    $selection = array_slice($this->nodes, $offset, $length);

    return $this->createSubCrawler($selection);
}
2010-05-06 12:25:53 +01:00
/**
 * Reduces the list of nodes by calling an anonymous function.
 *
 * The function receives the node wrapped in a Crawler instance and its
 * position. To remove a node from the list, it must return exactly false;
 * any other return value keeps the node.
 *
 * @param \Closure $closure An anonymous function
 *
 * @return self
 */
public function reduce(\Closure $closure)
{
    $kept = array();

    foreach ($this->nodes as $position => $domNode) {
        if (false === $closure($this->createSubCrawler($domNode), $position)) {
            continue;
        }

        $kept[] = $domNode;
    }

    return $this->createSubCrawler($kept);
}
2010-05-06 12:25:53 +01:00
/**
 * Returns the first node of the current selection.
 *
 * Shortcut for eq(0).
 *
 * @return self
 */
public function first()
{
    return $this->eq(0);
}
2010-04-15 13:41:42 +01:00
2010-05-06 12:25:53 +01:00
/**
 * Returns the last node of the current selection.
 *
 * Shortcut for eq() with the index of the last node.
 *
 * @return self
 */
public function last()
{
    $lastPosition = count($this->nodes) - 1;

    return $this->eq($lastPosition);
}
2010-04-15 13:41:42 +01:00
2010-05-06 12:25:53 +01:00
/**
 * Returns the siblings nodes of the current selection.
 *
 * The lookup starts from the first child of the first node's parent and is
 * delegated to sibling().
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function siblings()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstChildOfParent = $this->getNode(0)->parentNode->firstChild;
    $siblingNodes = $this->sibling($firstChildOfParent);

    return $this->createSubCrawler($siblingNodes);
}
2010-05-06 12:25:53 +01:00
/**
 * Returns the next siblings nodes of the current selection.
 *
 * The lookup starts from the first node of the list and is delegated to sibling().
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function nextAll()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $following = $this->sibling($this->getNode(0));

    return $this->createSubCrawler($following);
}
2010-04-15 13:41:42 +01:00
2010-05-06 12:25:53 +01:00
/**
 * Returns the previous sibling nodes of the current selection.
 *
 * The lookup starts from the first node of the list and is delegated to
 * sibling() in the 'previousSibling' direction.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function previousAll()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $preceding = $this->sibling($this->getNode(0), 'previousSibling');

    return $this->createSubCrawler($preceding);
}
2010-05-06 12:25:53 +01:00
/**
 * Returns the parents nodes of the current selection.
 *
 * Walks up from the first node of the list through each parentNode and
 * keeps the ancestors that are element nodes, nearest ancestor first.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function parents()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $ancestors = array();

    for ($ancestor = $this->getNode(0)->parentNode; $ancestor; $ancestor = $ancestor->parentNode) {
        if (XML_ELEMENT_NODE !== $ancestor->nodeType) {
            continue;
        }

        $ancestors[] = $ancestor;
    }

    return $this->createSubCrawler($ancestors);
}
2010-04-15 13:41:42 +01:00
2010-05-06 12:25:53 +01:00
/**
 * Returns the children nodes of the current selection.
 *
 * Starts from the first child of the first node of the list and delegates
 * to sibling(); a node without children yields an empty selection.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function children()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstChild = $this->getNode(0)->firstChild;

    if ($firstChild) {
        $childNodes = $this->sibling($firstChild);
    } else {
        $childNodes = array();
    }

    return $this->createSubCrawler($childNodes);
}
2010-05-06 12:25:53 +01:00
/**
 * Returns the attribute value of the first node of the list.
 *
 * @param string $attribute The attribute name
 *
 * @return string|null The attribute value or null if the attribute does not exist
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function attr($attribute)
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    if (!$firstNode->hasAttribute($attribute)) {
        return null;
    }

    return $firstNode->getAttribute($attribute);
}
2010-04-15 13:41:42 +01:00
2014-07-18 22:16:02 +01:00
/**
 * Returns the node name of the first node of the list.
 *
 * @return string The node name
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function nodeName()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    return $firstNode->nodeName;
}
2010-05-06 12:25:53 +01:00
/**
 * Returns the node value of the first node of the list.
 *
 * @return string The node value
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function text()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    return $firstNode->nodeValue;
}
2013-04-21 01:11:42 +01:00
/**
 * Returns the first node of the list as HTML.
 *
 * The result is the concatenated HTML of the node's children, each one
 * serialized by its owner document; the node's own tag is not included.
 *
 * @return string The node html
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function html()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $markup = '';

    foreach ($this->getNode(0)->childNodes as $childNode) {
        $markup .= $childNode->ownerDocument->saveHTML($childNode);
    }

    return $markup;
}
2016-07-26 09:49:12 +01:00
/**
 * Evaluates an XPath expression.
 *
 * The expression is evaluated once per node of the list, with that node as
 * context. Since an XPath expression might evaluate to either a simple type
 * or a \DOMNodeList, this method will return either an array of simple types
 * or a new Crawler instance; the type of the first result decides which.
 *
 * @param string $xpath An XPath expression
 *
 * @return array|Crawler An array of evaluation results or a new Crawler instance
 *
 * @throws \LogicException When the crawler has no document attached
 */
public function evaluate($xpath)
{
    if (null === $this->document) {
        throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
    }

    $evaluator = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath));

    $results = array();
    foreach ($this->nodes as $contextNode) {
        $results[] = $evaluator->evaluate($xpath, $contextNode);
    }

    $yieldsNodeLists = isset($results[0]) && $results[0] instanceof \DOMNodeList;

    return $yieldsNodeLists ? $this->createSubCrawler($results) : $results;
}
2010-05-06 12:25:53 +01:00
/**
 * Extracts information from the list of nodes.
 *
 * You can extract attributes or/and the node value (_text).
 *
 * Example:
 *
 *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
 *
 * When exactly one attribute is requested, each entry of the result is the
 * extracted value itself; otherwise each entry is an array of values in the
 * order of the requested attributes (an empty array when none is requested).
 *
 * @param array $attributes An array of attributes
 *
 * @return array An array of extracted values
 */
public function extract($attributes)
{
    $attributes = (array) $attributes;
    $count = count($attributes);

    $data = array();
    foreach ($this->nodes as $node) {
        $elements = array();
        foreach ($attributes as $attribute) {
            if ('_text' === $attribute) {
                $elements[] = $node->nodeValue;
            } else {
                $elements[] = $node->getAttribute($attribute);
            }
        }

        // Flatten only for exactly one attribute; with an empty attribute
        // list $elements[0] does not exist and must not be read.
        $data[] = 1 === $count ? $elements[0] : $elements;
    }

    return $data;
}
2010-05-06 12:25:53 +01:00
/**
 * Filters the list of nodes with an XPath expression.
 *
 * The XPath expression is evaluated in the context of the crawler, which
 * is considered as a fake parent of the elements inside it.
 * This means that a child selector "div" or "./div" will match only
 * the div elements of the current crawler, not their children.
 *
 * @param string $xpath An XPath expression
 *
 * @return self
 */
public function filterXPath($xpath)
{
    $relativeXPath = $this->relativize($xpath);

    if ('' !== $relativeXPath) {
        return $this->filterRelativeXPath($relativeXPath);
    }

    // If we dropped all expressions in the XPath while preparing it, there would be no match
    return $this->createSubCrawler(null);
}
/**
 * Filters the list of nodes with a CSS selector.
 *
 * The selector is converted to XPath by a CssSelectorConverter configured
 * with the crawler's HTML/XML flag, so this method only works if you have
 * installed the CssSelector Symfony Component.
 *
 * @param string $selector A CSS selector
 *
 * @return self
 *
 * @throws \RuntimeException if the CssSelector Component is not available
 */
public function filter($selector)
{
    if (class_exists('Symfony\\Component\\CssSelector\\CssSelectorConverter')) {
        $converter = new CssSelectorConverter($this->isHtml);

        // The CssSelector already prefixes the selector with descendant-or-self::
        $xpath = $converter->toXPath($selector);

        return $this->filterRelativeXPath($xpath);
    }

    throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector 2.8+ is not installed (you can use filterXPath instead).');
}
2010-05-06 12:25:53 +01:00
/**
 * Selects links by name or alt value for clickable images.
 *
 * Matches <a> elements whose normalized text contains the value as a
 * space-delimited chunk, or that have an <img> child whose normalized alt
 * attribute does.
 *
 * @param string $value The link text
 *
 * @return self
 */
public function selectLink($value)
{
    // The value is padded with spaces so it only matches whole, space-delimited chunks.
    $paddedLiteral = static::xpathLiteral(' '.$value.' ');

    $xpath = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', $paddedLiteral);
    $xpath .= sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', $paddedLiteral);

    return $this->filterRelativeXPath($xpath);
}
2010-04-15 13:41:42 +01:00
2015-02-07 16:25:03 +00:00
/**
 * Selects images by alt value.
 *
 * Matches <img> elements whose normalized alt attribute contains the value.
 *
 * @param string $value The image alt
 *
 * @return self A new instance of Crawler with the filtered list of nodes
 */
public function selectImage($value)
{
    $altLiteral = static::xpathLiteral($value);

    return $this->filterRelativeXPath(
        sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', $altLiteral)
    );
}
2010-05-06 12:25:53 +01:00
/**
 * Selects a button by name or alt value for images.
 *
 * Matches:
 *  - <input> elements whose lowercased type contains "submit" or "button"
 *    and whose normalized value contains the text as a space-delimited chunk;
 *  - <input> elements whose lowercased type contains "image" and whose
 *    normalized alt contains the text as a space-delimited chunk;
 *  - <input> elements whose id or name equals the text;
 *  - <button> elements whose normalized content contains the text as a
 *    space-delimited chunk, or whose id or name equals the text.
 *
 * @param string $value The button text
 *
 * @return self
 */
public function selectButton($value)
{
    $paddedLiteral = static::xpathLiteral(' '.$value.' ');
    $exactLiteral = static::xpathLiteral($value);

    // XPath 1.0 has no lower-case(); translate() emulates it for the type attribute.
    $lowercasedType = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';

    $inputByValue = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $lowercasedType, $lowercasedType, $paddedLiteral);
    $inputByAltIdOrName = sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $lowercasedType, $paddedLiteral, $exactLiteral, $exactLiteral);
    $buttonElement = sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $paddedLiteral, $exactLiteral, $exactLiteral);

    return $this->filterRelativeXPath($inputByValue.$inputByAltIdOrName.$buttonElement);
}
2010-05-06 12:25:53 +01:00
/**
 * Returns a Link object for the first node in the list.
 *
 * The link is built with the crawler's base href.
 *
 * @param string $method The method for the link (get by default)
 *
 * @return Link A Link instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function link($method = 'get')
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    if ($firstNode instanceof \DOMElement) {
        return new Link($firstNode, $this->baseHref, $method);
    }

    throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($firstNode)));
}
2010-04-15 13:41:42 +01:00
2010-05-06 12:25:53 +01:00
/**
 * Returns an array of Link objects for the nodes in the list.
 *
 * Every link is built with the crawler's base href and the 'get' method.
 *
 * @return Link[] An array of Link instances
 *
 * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
 */
public function links()
{
    $collected = array();

    foreach ($this->nodes as $candidate) {
        if ($candidate instanceof \DOMElement) {
            $collected[] = new Link($candidate, $this->baseHref, 'get');

            continue;
        }

        throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($candidate)));
    }

    return $collected;
}
2015-02-07 16:25:03 +00:00
/**
 * Returns an Image object for the first node in the list.
 *
 * The image is built with the crawler's base href.
 *
 * @return Image An Image instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function image()
{
    if (!count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    if ($firstNode instanceof \DOMElement) {
        return new Image($firstNode, $this->baseHref);
    }

    throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($firstNode)));
}
/**
 * Returns an array of Image objects for the nodes in the list.
 *
 * Every image is built with the crawler's base href.
 *
 * @return Image[] An array of Image instances
 *
 * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
 */
public function images()
{
    $collected = array();

    foreach ($this as $candidate) {
        if ($candidate instanceof \DOMElement) {
            $collected[] = new Image($candidate, $this->baseHref);

            continue;
        }

        throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($candidate)));
    }

    return $collected;
}
2010-05-06 12:25:53 +01:00
/**
 * Returns a Form object for the first node in the list.
 *
 * The form is built with the crawler's current URI and base href. When
 * values are given, they are applied to the form through setValues().
 *
 * @param array  $values An array of values for the form fields
 * @param string $method The method for the form
 *
 * @return Form A Form instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function form(array $values = null, $method = null)
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    if (!$firstNode instanceof \DOMElement) {
        throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($firstNode)));
    }

    $form = new Form($firstNode, $this->uri, $method, $this->baseHref);

    if (null === $values) {
        return $form;
    }

    $form->setValues($values);

    return $form;
}
2013-09-18 13:18:44 +01:00
/**
 * Overrides the default namespace prefix to be used with XPath and CSS expressions.
 *
 * @param string $prefix The prefix that replaces the current default one
 */
public function setDefaultNamespacePrefix($prefix)
{
    $this->defaultNamespacePrefix = $prefix;
}
2013-09-22 23:29:38 +01:00
/**
 * Manually registers a namespace under the given prefix.
 *
 * A namespace already stored under the same prefix is replaced.
 *
 * @param string $prefix    The prefix used as key in the namespace map
 * @param string $namespace The namespace stored for that prefix
 */
public function registerNamespace($prefix, $namespace)
{
    $this->namespaces[$prefix] = $namespace;
}
2012-04-23 18:19:47 +01:00
/**
 * Converts string for XPath expressions.
 *
 * Escaped characters are: quotes (") and apostrophe (').
 *
 * A string without apostrophes is wrapped in apostrophes, a string without
 * quotes is wrapped in quotes, and a string containing both is turned into
 * a concat() call whose arguments are the apostrophe-free chunks separated
 * by a quoted apostrophe.
 *
 *  Examples:
 *  <code>
 *     echo Crawler::xpathLiteral('foo " bar');
 *     //prints 'foo " bar'
 *
 *     echo Crawler::xpathLiteral("foo ' bar");
 *     //prints "foo ' bar"
 *
 *     echo Crawler::xpathLiteral('a\'b"c');
 *     //prints concat('a', "'", 'b"c')
 *  </code>
 *
 * @param string $s String to be escaped
 *
 * @return string Converted string
 */
public static function xpathLiteral($s)
{
    // No apostrophe: the value can be delimited by apostrophes as is.
    if (false === strpos($s, "'")) {
        return sprintf("'%s'", $s);
    }

    // Apostrophes but no quote: delimit with quotes instead.
    if (false === strpos($s, '"')) {
        return sprintf('"%s"', $s);
    }

    // Both delimiters occur: split on apostrophes, wrap every chunk in
    // apostrophes and glue the chunks back together with a quoted apostrophe.
    $parts = array();
    foreach (explode("'", $s) as $chunk) {
        $parts[] = "'".$chunk."'";
    }

    return sprintf('concat(%s)', implode(", \"'\", ", $parts));
}
2014-05-18 01:00:59 +01:00
/**
 * Filters the list of nodes with an XPath expression.
 *
 * The XPath expression should already be processed to apply it in the context of each node.
 * The expression is evaluated once per node of the list and every result is added
 * to a new sub-crawler.
 *
 * @param string $xpath
 *
 * @return self
 */
private function filterRelativeXPath($xpath)
{
    $namespacePrefixes = $this->findNamespacePrefixes($xpath);
    $result = $this->createSubCrawler(null);

    foreach ($this->nodes as $contextNode) {
        // A DOMXPath is built for the owner document of each context node.
        $evaluator = $this->createDOMXPath($contextNode->ownerDocument, $namespacePrefixes);
        $matches = $evaluator->query($xpath, $contextNode);

        $result->add($matches);
    }

    return $result;
}
/**
 * Makes the XPath relative to the current context.
 *
 * The returned XPath will match elements matching the XPath inside the current crawler
 * when running in the context of a node of the crawler.
 *
 * The expression is scanned for "|" operators located outside of quoted strings and
 * outside of "[...]" predicates; each branch of the union is rewritten on its own and
 * the branches are joined back with " | ".
 *
 * @param string $xpath
 *
 * @return string The rewritten expression, or $xpath unchanged when it is detected as
 *                invalid (unterminated quoted string or unbalanced brackets)
 */
private function relativize($xpath)
{
    $expressions = array();

    // An expression which will never match, used to replace expressions which cannot match in the crawler.
    // Such expressions are replaced rather than dropped.
    $nonMatchingExpression = 'a[name() = "b"]';

    $xpathLen = strlen($xpath);
    $openedBrackets = 0;
    $startPosition = strspn($xpath, " \t\n\r\0\x0B");

    for ($i = $startPosition; $i <= $xpathLen; ++$i) {
        // Jump to the next character that matters for splitting: a quote, a bracket or a pipe.
        $i += strcspn($xpath, '"\'[]|', $i);

        if ($i < $xpathLen) {
            switch ($xpath[$i]) {
                case '"':
                case "'":
                    // Skip the whole quoted string by moving to its closing delimiter.
                    if (false === $i = strpos($xpath, $xpath[$i], $i + 1)) {
                        return $xpath; // The XPath expression is invalid
                    }
                    continue 2;
                case '[':
                    ++$openedBrackets;
                    continue 2;
                case ']':
                    --$openedBrackets;
                    continue 2;
            }
        }

        // A pipe (or the end of the string) inside a predicate does not end the current branch.
        if ($openedBrackets) {
            continue;
        }

        if ($startPosition < $xpathLen && '(' === $xpath[$startPosition]) {
            // If the union is inside some braces, we need to preserve the opening braces and apply
            // the change only inside it.
            $j = 1 + strspn($xpath, "( \t\n\r\0\x0B", $startPosition + 1);
            $parenthesis = substr($xpath, $startPosition, $j);
            $startPosition += $j;
        } else {
            $parenthesis = '';
        }

        $expression = rtrim(substr($xpath, $startPosition, $i - $startPosition));

        if (0 === strpos($expression, 'self::*/')) {
            $expression = './'.substr($expression, 8);
        }

        // add prefix before absolute element selector
        if ('' === $expression) {
            $expression = $nonMatchingExpression;
        } elseif (0 === strpos($expression, '//')) {
            $expression = 'descendant-or-self::'.substr($expression, 2);
        } elseif (0 === strpos($expression, './/')) {
            $expression = 'descendant-or-self::'.substr($expression, 3);
        } elseif (0 === strpos($expression, './')) {
            $expression = 'self::'.substr($expression, 2);
        } elseif (0 === strpos($expression, 'child::')) {
            $expression = 'self::'.substr($expression, 7);
        } elseif ('/' === $expression[0] || '.' === $expression[0] || 0 === strpos($expression, 'self::')) {
            $expression = $nonMatchingExpression;
        } elseif (0 === strpos($expression, 'descendant::')) {
            $expression = 'descendant-or-self::'.substr($expression, 12);
        } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) {
            // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
            $expression = $nonMatchingExpression;
        } elseif (0 !== strpos($expression, 'descendant-or-self::')) {
            $expression = 'self::'.$expression;
        }
        $expressions[] = $parenthesis.$expression;

        // The end of the input was reached with balanced brackets: all branches are collected.
        if ($i === $xpathLen) {
            return implode(' | ', $expressions);
        }

        // Skip the whitespace following the pipe; the next branch starts right after it.
        $i += strspn($xpath, " \t\n\r\0\x0B", $i + 1);
        $startPosition = $i + 1;
    }

    return $xpath; // The XPath expression is invalid
}
2013-09-13 12:38:25 +01:00
/**
 * Returns the node stored at the given position of the list.
 *
 * @param int $position Index into the node list
 *
 * @return \DOMElement|null The node at that position, or null when no node is set there
 */
public function getNode($position)
{
    return isset($this->nodes[$position]) ? $this->nodes[$position] : null;
}
2015-10-02 13:08:08 +01:00
/**
 * Returns the number of nodes held by this crawler.
 *
 * @return int
 */
public function count()
{
    $total = count($this->nodes);

    return $total;
}
2015-10-03 00:19:52 +01:00
/**
 * Returns an iterator over the nodes held by this crawler.
 *
 * @return \ArrayIterator
 */
public function getIterator()
{
    $iterator = new \ArrayIterator($this->nodes);

    return $iterator;
}
2013-09-13 12:38:25 +01:00
/**
 * Collects element nodes by walking from a node along one sibling direction.
 *
 * The starting node itself is included when it qualifies; the first node of the
 * crawler is always left out of the result.
 *
 * @param \DOMElement $node       The node the walk starts from
 * @param string      $siblingDir Name of the node property followed at each step
 *
 * @return array The collected nodes, in walking order
 */
protected function sibling($node, $siblingDir = 'nextSibling')
{
    $collected = array();
    $current = $node;

    do {
        // nodeType 1 is an element node; the crawler's own first node is never collected.
        $isElement = 1 === $current->nodeType;
        if ($current !== $this->getNode(0) && $isElement) {
            $collected[] = $current;
        }

        $current = $current->$siblingDir;
    } while ($current);

    return $collected;
}
2013-09-17 18:15:30 +01:00
/**
 * Builds a DOMXPath for a document and registers the namespaces found for the given prefixes.
 *
 * Prefixes for which no namespace is discovered are not registered.
 *
 * @param \DOMDocument $document
 * @param array        $prefixes Namespace prefixes to resolve and register
 *
 * @return \DOMXPath
 */
private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
{
    $evaluator = new \DOMXPath($document);

    foreach ($prefixes as $prefix) {
        $uri = $this->discoverNamespace($evaluator, $prefix);

        if (null === $uri) {
            continue;
        }

        $evaluator->registerNamespace($prefix, $uri);
    }

    return $evaluator;
}
2013-09-22 23:29:38 +01:00
/**
 * Resolves the namespace to use for a prefix.
 *
 * Manually registered namespaces take precedence; otherwise the document is queried
 * for a namespace node carrying the prefix name (the empty name is used for the
 * default namespace prefix).
 *
 * @param \DOMXPath $domxpath
 * @param string    $prefix
 *
 * @return string|null The namespace, or null when none is found
 */
private function discoverNamespace(\DOMXPath $domxpath, $prefix)
{
    if (isset($this->namespaces[$prefix])) {
        return $this->namespaces[$prefix];
    }

    $lookupName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;

    // ask for one namespace, otherwise we'd get a collection with an item for each node
    $found = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $lookupName));
    $namespaceNode = $found->item(0);

    return $namespaceNode ? $namespaceNode->nodeValue : null;
}
2013-09-17 18:15:30 +01:00
/**
 * Extracts the distinct namespace prefixes used in an XPath expression.
 *
 * A prefix is a name followed by a single colon; names followed by "::" are not matched.
 *
 * @param string $xpath
 *
 * @return array The prefixes found (duplicates removed), or an empty array when there are none
 */
private function findNamespacePrefixes($xpath)
{
    $pattern = '/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i';

    return preg_match_all($pattern, $xpath, $found) ? array_unique($found['prefix']) : array();
}
2015-09-26 23:37:59 +01:00
/**
 * Creates a crawler for some subnodes.
 *
 * The new crawler is built with this crawler's URI and base href, and receives
 * its registered namespaces, document and HTML flag.
 *
 * @param \DOMElement|\DOMElement[]|\DOMNodeList|null $nodes
 *
 * @return static
 */
private function createSubCrawler($nodes)
{
    $child = new static($nodes, $this->uri, $this->baseHref);

    $child->namespaces = $this->namespaces;
    $child->document = $this->document;
    $child->isHtml = $this->isHtml;

    return $child;
}
2010-04-15 13:41:42 +01:00
}