[DomCrawler] Default to UTF-8 when possible

This commit is contained in:
Nicolas Grekas 2017-09-29 12:22:06 +02:00
parent a442e378e1
commit 73eda66b99

View File

@@ -127,8 +127,8 @@ class Crawler implements \Countable, \IteratorAggregate
/** /**
* Adds HTML/XML content. * Adds HTML/XML content.
* *
* If the charset is not set via the content type, it is assumed * If the charset is not set via the content type, it is assumed to be UTF-8,
* to be ISO-8859-1, which is the default charset defined by the * or ISO-8859-1 as a fallback, which is the default charset defined by the
* HTTP 1.1 specification. * HTTP 1.1 specification.
* *
* @param string $content A string to parse as HTML/XML * @param string $content A string to parse as HTML/XML
@@ -161,7 +161,7 @@ class Crawler implements \Countable, \IteratorAggregate
} }
if (null === $charset) { if (null === $charset) {
$charset = 'ISO-8859-1'; $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';
} }
if ('x' === $xmlMatches[1]) { if ('x' === $xmlMatches[1]) {