merged branch symfony/domcrawler-encoding (PR #4214)

Commits
-------

c9ebe67 [DomCrawler] fixed encoding when using addHtmlContent() (fixes #3881)

Discussion
----------

[DomCrawler] fixed encoding when using addHtmlContent() (fixes #3881)

After looking around, it is clear that loadHTML() resets the encoding set on the DOMDocument instance. So the only workaround that actually works (and is not an ugly hack) is to use `mb_convert_encoding()` when it is available.

---------------------------------------------------------------------------

by Seldaek at 2012-05-07T12:38:43Z

+1 (Side note: Using your fork of symfony for PRs would be good I think, otherwise it creates noisy versions on packagist.)
This commit is contained in:
Fabien Potencier 2012-05-07 19:19:26 +02:00
commit 919604ab71
2 changed files with 26 additions and 0 deletions

View File

@ -129,6 +129,10 @@ class Crawler extends \SplObjectStorage
$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;
if (function_exists('mb_convert_encoding')) {
$content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
}
$current = libxml_use_internal_errors(true);
@$dom->loadHTML($content);
libxml_use_internal_errors($current);

View File

@ -69,6 +69,17 @@ class CrawlerTest extends \PHPUnit_Framework_TestCase
$this->assertEquals('http://symfony.com/contact', $crawler->filterXPath('//a')->link()->getUri(), '->addHtmlContent() adds nodes from an HTML string');
}
/**
 * Checks that addHtmlContent() keeps multi-byte characters intact when the
 * charset is passed explicitly.
 *
 * @covers Symfony\Component\DomCrawler\Crawler::addHtmlContent
 */
public function testAddHtmlContentCharset()
{
    $crawler = new Crawler();
    // NOTE(review): the <div> is never closed in this fixture; presumably it
    // relies on the HTML parser being lenient -- confirm this is intentional.
    $crawler->addHtmlContent('<html><div class="foo">Tiếng Việt</html>', 'UTF-8');

    // The node text must round-trip unchanged; a failure message is supplied
    // so a broken encoding is easy to spot in the test report.
    $this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text(), '->addHtmlContent() keeps the characters of the given charset intact');
}
/**
* @covers Symfony\Component\DomCrawler\Crawler::addHtmlContent
*/
@ -108,6 +119,17 @@ EOF
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addXmlContent() adds nodes from an XML string');
}
/**
 * Checks that addXmlContent() keeps multi-byte characters intact when the
 * charset is passed explicitly.
 *
 * @covers Symfony\Component\DomCrawler\Crawler::addXmlContent
 */
public function testAddXmlContentCharset()
{
    $crawler = new Crawler();
    $crawler->addXmlContent('<html><div class="foo">Tiếng Việt</div></html>', 'UTF-8');

    // The node text must round-trip unchanged; a failure message is supplied
    // so a broken encoding is easy to spot in the test report.
    $this->assertEquals('Tiếng Việt', $crawler->filterXPath('//div')->text(), '->addXmlContent() keeps the characters of the given charset intact');
}
/**
* @covers Symfony\Component\DomCrawler\Crawler::addXmlContent
*/