From 172e75208aced6f0cbe7d8c49034240d611c0286 Mon Sep 17 00:00:00 2001 From: 77web Date: Sat, 24 May 2014 15:29:23 +0900 Subject: [PATCH] [DomCrawler] Fixed charset detection in html5 meta charset tag --- src/Symfony/Component/DomCrawler/Crawler.php | 4 +++- src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Symfony/Component/DomCrawler/Crawler.php b/src/Symfony/Component/DomCrawler/Crawler.php index de65978614..ae98db7d61 100644 --- a/src/Symfony/Component/DomCrawler/Crawler.php +++ b/src/Symfony/Component/DomCrawler/Crawler.php @@ -108,8 +108,10 @@ class Crawler extends \SplObjectStorage } } + // http://www.w3.org/TR/encoding/#encodings + // http://www.w3.org/TR/REC-xml/#NT-EncName if (null === $charset && - preg_match('/\]+charset *= *["\']?([a-zA-Z\-0-9]+)/i', $content, $matches)) { + preg_match('/\]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) { $charset = $matches[1]; } diff --git a/src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php b/src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php index 04921f693c..d96d1ebf71 100644 --- a/src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php +++ b/src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php @@ -232,6 +232,10 @@ EOF $crawler = new Crawler(); $crawler->addContent('中文'); $this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset'); + + $crawler = new Crawler(); + $crawler->addContent(mb_convert_encoding('日本語', 'SJIS', 'UTF-8')); + $this->assertEquals('日本語', $crawler->filterXPath('//body')->text(), '->addContent() can recognize "Shift_JIS" in html5 meta charset tag'); } /**