merged branch bronze1man/pr-2.2-crawler (PR #9074)

This PR was squashed before being merged into the 2.2 branch (closes #9074).

Discussion
----------

[DomCrawler] Crawler guesses charset from HTML

| Q             | A
| ------------- | ---
| Bug fix?      | no
| New feature?  | yes
| BC breaks?    | no
| Deprecations? | no
| Tests pass?   | yes
| Fixed tickets |  #9061
| License       | MIT
| Doc PR        | n/a

Commits
-------

e5282e8 [DomCrawler] Crawler guesses charset from HTML
This commit is contained in:
Fabien Potencier 2013-09-19 18:37:12 +02:00
commit f73aa37064
2 changed files with 16 additions and 3 deletions

View File

@@ -92,11 +92,11 @@ class Crawler extends \SplObjectStorage
}
// DOM only for HTML/XML content
if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
return null;
}
$charset = 'ISO-8859-1';
$charset = null;
if (false !== $pos = strpos($type, 'charset=')) {
$charset = substr($type, $pos + 8);
if (false !== $pos = strpos($charset, ';')) {
@@ -104,7 +104,16 @@ class Crawler extends \SplObjectStorage
}
}
if ('x' === $matches[1]) {
if (null === $charset &&
preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9]+)/i', $content, $matches)) {
$charset = $matches[1];
}
if (null === $charset) {
$charset = 'ISO-8859-1';
}
if ('x' === $xmlMatches[1]) {
$this->addXmlContent($content, $charset);
} else {
$this->addHtmlContent($content, $charset);

View File

@@ -207,6 +207,10 @@ EOF
$crawler = new Crawler();
$crawler->addContent('foo bar', 'text/plain');
$this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml');
$crawler = new Crawler();
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
}
/**