[DomCrawler] Fixed charset detection in html5 meta charset tag
This commit is contained in:
parent
29341fab1b
commit
172e75208a
@ -108,8 +108,10 @@ class Crawler extends \SplObjectStorage
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// http://www.w3.org/TR/encoding/#encodings
|
||||||
|
// http://www.w3.org/TR/REC-xml/#NT-EncName
|
||||||
if (null === $charset &&
|
if (null === $charset &&
|
||||||
preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9]+)/i', $content, $matches)) {
|
preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
|
||||||
$charset = $matches[1];
|
$charset = $matches[1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -232,6 +232,10 @@ EOF
|
|||||||
$crawler = new Crawler();
|
$crawler = new Crawler();
|
||||||
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
|
$crawler->addContent('<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
|
||||||
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
|
$this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
|
||||||
|
|
||||||
|
$crawler = new Crawler();
|
||||||
|
$crawler->addContent(mb_convert_encoding('<html><head><meta charset="Shift_JIS"></head><body>日本語</body></html>', 'SJIS', 'UTF-8'));
|
||||||
|
$this->assertEquals('日本語', $crawler->filterXPath('//body')->text(), '->addContent() can recognize "Shift_JIS" in html5 meta charset tag');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
Reference in New Issue
Block a user