2010-03-31 07:42:18 +01:00
|
|
|
<?php
|
|
|
|
|
2010-08-20 22:09:55 +01:00
|
|
|
namespace Symfony\Component\CssSelector;
|
2010-03-31 07:42:18 +01:00
|
|
|
|
|
|
|
/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <fabien.potencier@symfony-project.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
|
|
|
|
|
|
|
|
/**
 * Tokenizer lexes a CSS Selector to tokens.
 *
 * This component is a port of the Python lxml library,
 * which is copyright Infrae and distributed under the BSD license.
 *
 * @author Fabien Potencier <fabien.potencier@symfony-project.com>
 */
|
|
|
|
class Tokenizer
{
    /**
     * Splits a CSS selector string into a flat list of Token objects.
     *
     * CSS comments are removed before lexing. Runs of whitespace are skipped,
     * except that a single ' ' token is emitted when whitespace directly
     * precedes '.', '#' or '[' (and does not start at offset 0).
     *
     * @param string $s The CSS selector to tokenize
     *
     * @return array An array of Token instances
     *
     * @throws SyntaxError When a string is not terminated or an unexpected symbol is found
     */
    public function tokenize($s)
    {
        // When mbstring overloads the string functions (func_overload bit 2),
        // force a single-byte encoding so that strlen()/substr()/strpos()
        // agree with the byte offsets used by the preg_* functions.
        $mbEncoding = null;
        if (function_exists('mb_internal_encoding') && (((int) ini_get('mbstring.func_overload')) & 2)) {
            $mbEncoding = mb_internal_encoding();
            mb_internal_encoding('ASCII');
        }

        try {
            $tokens = $this->doTokenize($s);
        } catch (\Exception $e) {
            // Restore the caller's encoding even when lexing fails.
            if (null !== $mbEncoding) {
                mb_internal_encoding($mbEncoding);
            }

            throw $e;
        }

        if (null !== $mbEncoding) {
            mb_internal_encoding($mbEncoding);
        }

        return $tokens;
    }

    /**
     * Runs the actual lexing loop; the encoding handling lives in tokenize().
     *
     * @param string $s The CSS selector to tokenize
     *
     * @return array An array of Token instances
     *
     * @throws SyntaxError When a string is not terminated or an unexpected symbol is found
     */
    private function doTokenize($s)
    {
        $tokens = array();
        $pos = 0;

        // Strip /* ... */ comments (the "s" modifier lets them span lines).
        $s = preg_replace('#/\*.*?\*/#s', '', $s);
        $length = strlen($s);

        $pairTokens = array('~=', '|=', '^=', '$=', '*=', '::', '!=');
        $singleTokens = array('>', '+', '~', ',', '.', '*', '=', '[', ']', '(', ')', '|', ':', '#');
        $descendantTriggers = array('.', '#', '[');

        while (true) {
            // Skip whitespace, remembering where it started; 0 doubles as
            // "no significant preceding whitespace".
            if (preg_match('#\s+#A', $s, $match, 0, $pos)) {
                $precedingWhitespacePos = $pos;
                $pos += strlen($match[0]);
            } else {
                $precedingWhitespacePos = 0;
            }

            if ($pos >= $length) {
                return $tokens;
            }

            // an+b style expressions; a lone "n" is left to the symbol lexer.
            if (preg_match('#[+-]?\d*n(?:[+-]\d+)?#A', $s, $match, 0, $pos) && 'n' !== $match[0]) {
                $tokens[] = new Token('Symbol', $match[0], $pos);
                $pos += strlen($match[0]);

                continue;
            }

            $c = $s[$pos];
            $c2 = substr($s, $pos, 2);

            if (in_array($c2, $pairTokens, true)) {
                $tokens[] = new Token('Token', $c2, $pos);
                $pos += 2;

                continue;
            }

            if (in_array($c, $singleTokens, true)) {
                if ($precedingWhitespacePos > 0 && in_array($c, $descendantTriggers, true)) {
                    $tokens[] = new Token('Token', ' ', $precedingWhitespacePos);
                }
                $tokens[] = new Token('Token', $c, $pos);
                ++$pos;

                continue;
            }

            $tokenStart = $pos;

            if ('"' === $c || "'" === $c) {
                // Quoted string
                list($sym, $pos) = $this->tokenizeEscapedString($s, $pos);
                $tokens[] = new Token('String', $sym, $tokenStart);

                continue;
            }

            list($sym, $pos) = $this->tokenizeSymbol($s, $pos);
            $tokens[] = new Token('Symbol', $sym, $tokenStart);
        }
    }

    /**
     * Reads a quoted string starting at the opening quote at $pos.
     *
     * @param string  $s   The string being tokenized
     * @param integer $pos The offset of the opening quote
     *
     * @return array The unescaped string contents and the offset just after the closing quote
     *
     * @throws SyntaxError When expected closing is not found
     */
    protected function tokenizeEscapedString($s, $pos)
    {
        $quote = $s[$pos];

        ++$pos;
        $start = $pos;

        while (true) {
            $next = strpos($s, $quote, $pos);
            if (false === $next) {
                throw new SyntaxError(sprintf('Expected closing %s for string in: %s', $quote, substr($s, $start)));
            }

            $result = substr($s, $start, $next - $start);

            // The quote is escaped only when preceded by an odd number of
            // backslashes; an even run is made of escaped backslashes.
            $trailingBackslashes = strlen($result) - strlen(rtrim($result, '\\'));
            if (1 === $trailingBackslashes % 2) {
                // next quote character is escaped
                $pos = $next + 1;

                continue;
            }

            if (false !== strpos($result, '\\')) {
                $result = $this->unescapeStringLiteral($result);
            }

            return array($result, $next + 1);
        }
    }

    /**
     * Resolves backslash escapes inside a string literal.
     *
     * A backslash followed by 1 to 6 hexadecimal digits (and one optional
     * trailing whitespace) is replaced by the UTF-8 encoding of that code
     * point; zero, surrogates and values above U+10FFFF become U+FFFD.
     * A backslash followed by any other character is replaced by that
     * character. A lone trailing backslash is left untouched.
     *
     * @param string $literal The raw contents of the string literal
     *
     * @return string The unescaped string
     */
    protected function unescapeStringLiteral($literal)
    {
        // Group 1: hexadecimal code point, group 2: literally escaped character.
        return preg_replace_callback('#\\\\(?:([A-Fa-f0-9]{1,6})(?:\r\n|\s)?|([^A-Fa-f0-9]))#', function ($matches)
        {
            if ('' === $matches[1]) {
                return $matches[2];
            }

            $codePoint = hexdec($matches[1]);
            if (0 === $codePoint || $codePoint > 0x10FFFF || ($codePoint >= 0xD800 && $codePoint <= 0xDFFF)) {
                $codePoint = 0xFFFD;
            }

            if ($codePoint < 0x80) {
                return chr($codePoint);
            }

            if ($codePoint < 0x800) {
                return chr(0xC0 | ($codePoint >> 6))
                    .chr(0x80 | ($codePoint & 0x3F));
            }

            if ($codePoint < 0x10000) {
                return chr(0xE0 | ($codePoint >> 12))
                    .chr(0x80 | (($codePoint >> 6) & 0x3F))
                    .chr(0x80 | ($codePoint & 0x3F));
            }

            return chr(0xF0 | ($codePoint >> 18))
                .chr(0x80 | (($codePoint >> 12) & 0x3F))
                .chr(0x80 | (($codePoint >> 6) & 0x3F))
                .chr(0x80 | ($codePoint & 0x3F));
        }, $literal);
    }

    /**
     * Reads a symbol (a run of word characters and dashes) starting at $pos.
     *
     * @param string  $s   The string being tokenized
     * @param integer $pos The offset where the symbol starts
     *
     * @return array The symbol and the offset just after it
     *
     * @throws SyntaxError When Unexpected symbol is found
     */
    protected function tokenizeSymbol($s, $pos)
    {
        $start = $pos;

        if (!preg_match('#[^\w\-]#', $s, $match, PREG_OFFSET_CAPTURE, $pos)) {
            // Goes to end of s
            return array(substr($s, $start), strlen($s));
        }

        $matchStart = $match[0][1];

        // The very first character is not a symbol character.
        if ($matchStart === $pos) {
            throw new SyntaxError(sprintf('Unexpected symbol: %s at %s', $s[$pos], $pos));
        }

        return array(substr($s, $start, $matchStart - $start), $matchStart);
    }
}
|