merged branch bronze1man/pr-2.2-crawler (PR #9074)
This PR was squashed before being merged into the 2.2 branch (closes #9074).
Discussion
----------
[DomCrawler] Crawler guesses charset from HTML
| Q | A
| ------------- | ---
| Bug fix? | no
| New feature? | yes
| BC breaks? | no
| Deprecations? | no
| Tests pass? | yes
| Fixed tickets | #9061
| License | MIT
| Doc PR | n/a
Commits
-------
e5282e8
[DomCrawler] Crawler guesses charset from HTML
This commit is contained in:
commit
f73aa37064
|
@ -92,11 +92,11 @@ class Crawler extends \SplObjectStorage
|
||||||
}
|
}
|
||||||
|
|
||||||
// DOM only for HTML/XML content
|
// DOM only for HTML/XML content
|
||||||
if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
|
if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
$charset = 'ISO-8859-1';
|
$charset = null;
|
||||||
if (false !== $pos = strpos($type, 'charset=')) {
|
if (false !== $pos = strpos($type, 'charset=')) {
|
||||||
$charset = substr($type, $pos + 8);
|
$charset = substr($type, $pos + 8);
|
||||||
if (false !== $pos = strpos($charset, ';')) {
|
if (false !== $pos = strpos($charset, ';')) {
|
||||||
|
@ -104,7 +104,16 @@ class Crawler extends \SplObjectStorage
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ('x' === $matches[1]) {
|
if (null === $charset &&
|
||||||
|
preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9]+)/i', $content, $matches)) {
|
||||||
|
$charset = $matches[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (null === $charset) {
|
||||||
|
$charset = 'ISO-8859-1';
|
||||||
|
}
|
||||||
|
|
||||||
|
if ('x' === $xmlMatches[1]) {
|
||||||
$this->addXmlContent($content, $charset);
|
$this->addXmlContent($content, $charset);
|
||||||
} else {
|
} else {
|
||||||
$this->addHtmlContent($content, $charset);
|
$this->addHtmlContent($content, $charset);
|
||||||
|
|
|
@ -207,6 +207,10 @@ EOF
|
||||||
$crawler = new Crawler();
|
$crawler = new Crawler();
|
||||||
$crawler->addContent('foo bar', 'text/plain');
|
$crawler->addContent('foo bar', 'text/plain');
|
||||||
$this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml');
|
$this->assertCount(0, $crawler, '->addContent() does nothing if the type is not (x|ht)ml');
|
||||||
|
|
||||||
|
$crawler = new Crawler();
|
||||||
|
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
|
||||||
|
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Reference in New Issue