merged branch bronze1man/pr-2.2-crawler (PR #9074)
This PR was squashed before being merged into the 2.2 branch (closes #9074).
Discussion
----------
[DomCrawler] Crawler guesses charset from HTML
| Q | A
| ------------- | ---
| Bug fix? | no
| New feature? | yes
| BC breaks? | no
| Deprecations? | no
| Tests pass? | yes
| Fixed tickets | #9061
| License | MIT
| Doc PR | n/a
Commits
-------
e5282e8
[DomCrawler] Crawler guesses charset from HTML
This commit is contained in:
commit
f73aa37064
|
@@ -92,11 +92,11 @@ class Crawler extends \SplObjectStorage
|
|||
}
|
||||
|
||||
// DOM only for HTML/XML content
|
||||
if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
|
||||
if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
$charset = 'ISO-8859-1';
|
||||
$charset = null;
|
||||
if (false !== $pos = strpos($type, 'charset=')) {
|
||||
$charset = substr($type, $pos + 8);
|
||||
if (false !== $pos = strpos($charset, ';')) {
|
||||
|
@@ -104,7 +104,16 @@ class Crawler extends \SplObjectStorage
|
|||
}
|
||||
}
|
||||
|
||||
if ('x' === $matches[1]) {
|
||||
if (null === $charset &&
|
||||
preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9]+)/i', $content, $matches)) {
|
||||
$charset = $matches[1];
|
||||
}
|
||||
|
||||
if (null === $charset) {
|
||||
$charset = 'ISO-8859-1';
|
||||
}
|
||||
|
||||
if ('x' === $xmlMatches[1]) {
|
||||
$this->addXmlContent($content, $charset);
|
||||
} else {
|
||||
$this->addHtmlContent($content, $charset);
|
||||
|
|
|
@@ -207,6 +207,10 @@ EOF
|
|||
$crawler = new Crawler();
|
||||
$crawler->addContent('foo bar', 'text/plain');
|
||||
$this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml');
|
||||
|
||||
$crawler = new Crawler();
|
||||
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
|
||||
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Reference in New Issue