[String] improve AbstractUnicodeString::ascii() fallback logic

This commit is contained in:
Nicolas Grekas 2019-09-28 13:56:16 +02:00
parent 335632152e
commit 59069626fd

View File

@@ -73,18 +73,21 @@ abstract class AbstractUnicodeString extends AbstractString
* *
* Install the intl extension for best results. * Install the intl extension for best results.
* *
* @param string[] $rules See "*-Latin" rules from Transliterator::listIDs() * @param string[]|\Transliterator[] $rules See "*-Latin" rules from Transliterator::listIDs()
*/ */
public function ascii(array $rules = []): self public function ascii(array $rules = []): self
{ {
$str = clone $this; $str = clone $this;
$s = $str->string; $s = $str->string;
$str->string = ''; $str->string = '';
$step = 0;
$rules[] = 'nfkd';
$rules[] = '[:nonspacing mark:] remove';
if (\function_exists('transliterator_transliterate')) { if (\function_exists('transliterator_transliterate')) {
$rules[] = 'any-latin/bgn';
$rules[] = 'nfkd';
$rules[] = '[:nonspacing mark:] remove'; $rules[] = '[:nonspacing mark:] remove';
$rules[] = 'any-latin';
} }
while (\strlen($s) !== $i = strspn($s, self::ASCII)) { while (\strlen($s) !== $i = strspn($s, self::ASCII)) {
@@ -93,32 +96,48 @@ abstract class AbstractUnicodeString extends AbstractString
$s = substr($s, $i); $s = substr($s, $i);
} }
if (1 === ++$step) { if ($rules && !$rule = array_shift($rules)) {
if (!normalizer_is_normalized($s, self::NFKD)) { $rules = []; // An empty rule interrupts the next ones
$s = normalizer_normalize($s, self::NFKD); }
}
} elseif (2 === $step) {
$s = str_replace(self::TRANSLIT_FROM, self::TRANSLIT_TO, $s);
} elseif (3 === $step && '' !== $rule = strtolower(array_shift($rules))) {
$step = 2;
if ('[:nonspacing mark:] remove' === $rule) { if ($rules && $rule) {
if ($rule instanceof \Transliterator) {
$s = $rule->transliterate($s);
continue;
}
if ('nfkd' === $rule = strtolower($rule)) {
if (!normalizer_is_normalized($s, self::NFKD)) {
$s = normalizer_normalize($s, self::NFKD);
}
} elseif ('[:nonspacing mark:] remove' === $rule) {
$s = preg_replace('/\p{Mn}++/u', '', $s); $s = preg_replace('/\p{Mn}++/u', '', $s);
} elseif ('de-ascii' === $rule) { } elseif ('de-ascii' === $rule) {
$s = preg_replace("/([AUO])\u{0308}(?=\p{Ll})/u", '$1e', $s); $s = preg_replace("/([AUO])\u{0308}(?=\p{Ll})/u", '$1e', $s);
$s = str_replace(["a\u{0308}", "o\u{0308}", "u\u{0308}", "A\u{0308}", "O\u{0308}", "U\u{0308}"], ['ae', 'oe', 'ue', 'AE', 'OE', 'UE'], $s); $s = str_replace(["a\u{0308}", "o\u{0308}", "u\u{0308}", "A\u{0308}", "O\u{0308}", "U\u{0308}"], ['ae', 'oe', 'ue', 'AE', 'OE', 'UE'], $s);
} elseif (\function_exists('transliterator_transliterate')) { } elseif (\function_exists('transliterator_transliterate')) {
if (null === $transliterator = self::$transliterators[$rule] ?? self::$transliterators[$rule] = \Transliterator::create($rule)) { if (null === $transliterator = self::$transliterators[$rule] ?? self::$transliterators[$rule] = \Transliterator::create($rule)) {
throw new InvalidArgumentException(sprintf('Unknown transliteration rule "%s".', $rule)); if ('any-latin/bgn' === $rule) {
$rule = 'any-latin';
$transliterator = self::$transliterators[$rule] ?? self::$transliterators[$rule] = \Transliterator::create($rule);
}
if (null === $transliterator) {
throw new InvalidArgumentException(sprintf('Unknown transliteration rule "%s".', $rule));
}
self::$transliterators['any-latin/bgn'] = $transliterator;
} }
$s = $transliterator->transliterate($s); $s = $transliterator->transliterate($s);
} }
} elseif (!\function_exists('iconv')) { } elseif (!\function_exists('iconv')) {
$s = str_replace(self::TRANSLIT_FROM, self::TRANSLIT_TO, $s);
$s = preg_replace('/[^\x00-\x7F]/u', '?', $s); $s = preg_replace('/[^\x00-\x7F]/u', '?', $s);
} elseif (\ICONV_IMPL === 'glibc') { } elseif (\ICONV_IMPL === 'glibc') {
$s = iconv('UTF-8', 'ASCII//TRANSLIT', $s); $s = iconv('UTF-8', 'ASCII//TRANSLIT', $s);
} else { } else {
$s = str_replace(self::TRANSLIT_FROM, self::TRANSLIT_TO, $s);
$s = preg_replace_callback('/[^\x00-\x7F]/u', static function ($c) { $s = preg_replace_callback('/[^\x00-\x7F]/u', static function ($c) {
$c = iconv('UTF-8', 'ASCII//IGNORE//TRANSLIT', $c[0]); $c = iconv('UTF-8', 'ASCII//IGNORE//TRANSLIT', $c[0]);