feature #35156 [String] Made AbstractString::width() follow POSIX.1-2001 (fancyweb)
This PR was merged into the 5.1-dev branch.
Discussion
----------
[String] Made AbstractString::width() follow POSIX.1-2001
| Q | A
| ------------- | ---
| Branch? | master
| Bug fix? | no
| New feature? | yes
| Deprecations? | no
| Tickets | -
| License | MIT
| Doc PR | -
This PR ports the wcswidth() function (see http://man7.org/linux/man-pages/man3/wcwidth.3.html and https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c) into the String component. This new method will be useful in the Console component to determine how many columns a character takes.
The data import approach mirrors the strategy already used for Intl.
Commits
-------
347d8252fb
[String] Made AbstractString::width() follow POSIX.1-2001
This commit is contained in:
commit
75fc3fa3e4
2
src/Symfony/Component/String/.gitattributes
vendored
2
src/Symfony/Component/String/.gitattributes
vendored
@ -1,3 +1,5 @@
|
||||
/Resources/bin/update-data.php export-ignore
|
||||
/Resources/WcswidthDataGenerator.php export-ignore
|
||||
/Tests export-ignore
|
||||
/phpunit.xml.dist export-ignore
|
||||
/.gitignore export-ignore
|
||||
|
@ -646,6 +646,9 @@ abstract class AbstractString implements \JsonSerializable
|
||||
*/
|
||||
abstract public function upper(): self;
|
||||
|
||||
/**
|
||||
* Returns the printable length on a terminal.
*
* @param bool $ignoreAnsiDecoration Whether ANSI escape sequences are stripped before measuring;
*                                   when false, only the control characters are dropped, so the
*                                   printable remainder of a sequence (e.g. "[0m") is counted
|
||||
*/
|
||||
abstract public function width(bool $ignoreAnsiDecoration = true): int;
|
||||
|
||||
/**
|
||||
|
@ -352,9 +352,6 @@ abstract class AbstractUnicodeString extends AbstractString
|
||||
return $str;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritdoc}
|
||||
*/
|
||||
public function reverse(): parent
|
||||
{
|
||||
$str = clone $this;
|
||||
@ -444,22 +441,21 @@ abstract class AbstractUnicodeString extends AbstractString
|
||||
$s = str_replace(["\r\n", "\r"], "\n", $s);
|
||||
}
|
||||
|
||||
if (!$ignoreAnsiDecoration) {
|
||||
$s = preg_replace('/[\p{Cc}\x7F]++/u', '', $s);
|
||||
}
|
||||
|
||||
foreach (explode("\n", $s) as $s) {
|
||||
if ($ignoreAnsiDecoration) {
|
||||
$s = preg_replace('/\x1B(?:
|
||||
$s = preg_replace('/(?:\x1B(?:
|
||||
\[ [\x30-\x3F]*+ [\x20-\x2F]*+ [0x40-\x7E]
|
||||
| [P\]X^_] .*? \x1B\\\\
|
||||
| [\x41-\x7E]
|
||||
)/x', '', $s);
|
||||
)|[\p{Cc}\x7F]++)/xu', '', $s);
|
||||
}
|
||||
|
||||
$w = substr_count($s, "\xAD") - substr_count($s, "\x08");
|
||||
$s = preg_replace('/[\x00\x05\x07\p{Mn}\p{Me}\p{Cf}\x{1160}-\x{11FF}\x{200B}]+/u', '', $s);
|
||||
$s = preg_replace('/[\x{1100}-\x{115F}\x{2329}\x{232A}\x{2E80}-\x{303E}\x{3040}-\x{A4CF}\x{AC00}-\x{D7A3}\x{F900}-\x{FAFF}\x{FE10}-\x{FE19}\x{FE30}-\x{FE6F}\x{FF00}-\x{FF60}\x{FFE0}-\x{FFE6}\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}]/u', '', $s, -1, $wide);
|
||||
|
||||
if ($width < $w += mb_strlen($s, 'UTF-8') + ($wide << 1)) {
|
||||
$width = $w;
|
||||
}
|
||||
// Non printable characters have been dropped, so wcswidth cannot logically return -1.
|
||||
$width += $this->wcswidth($s);
|
||||
}
|
||||
|
||||
return $width;
|
||||
@ -503,4 +499,80 @@ abstract class AbstractUnicodeString extends AbstractString
|
||||
throw new InvalidArgumentException('Invalid padding type.');
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Computes the number of terminal columns needed to display a string.
 *
 * Based on https://github.com/jquast/wcwidth, a Python implementation of https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c.
 *
 * Each code point counts for 0, 1 or 2 columns: 0 for the explicitly listed
 * invisible characters and for anything found in the "zero" table, 2 for
 * anything found in the "wide" table, 1 otherwise.
 *
 * @return int The column count, or -1 as soon as a C0/C1 control character or DEL is met
 */
private function wcswidth(string $string): int
{
    // The tables are loaded lazily and kept for the lifetime of the process.
    static $tableZero;
    static $tableWide;

    $width = 0;

    foreach (preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY) as $c) {
        $codePoint = mb_ord($c, 'UTF-8');

        if (0 === $codePoint // NULL
            || 0x034F === $codePoint // COMBINING GRAPHEME JOINER
            || (0x200B <= $codePoint && 0x200F >= $codePoint) // ZERO WIDTH SPACE to RIGHT-TO-LEFT MARK
            || 0x2028 === $codePoint // LINE SEPARATOR
            || 0x2029 === $codePoint // PARAGRAPH SEPARATOR
            || (0x202A <= $codePoint && 0x202E >= $codePoint) // LEFT-TO-RIGHT EMBEDDING to RIGHT-TO-LEFT OVERRIDE
            || (0x2060 <= $codePoint && 0x2063 >= $codePoint) // WORD JOINER to INVISIBLE SEPARATOR
        ) {
            continue;
        }

        // Non printable characters
        if (32 > $codePoint // C0 control characters
            || (0x07F <= $codePoint && 0x0A0 > $codePoint) // C1 control characters and DEL
        ) {
            return -1;
        }

        if (null === $tableZero) {
            $tableZero = require __DIR__.'/Resources/data/wcswidth_table_zero.php';
        }

        if ($this->isInCodePointRanges($codePoint, $tableZero)) {
            continue;
        }

        // The wide table is only needed once a character is known not to be zero-width.
        if (null === $tableWide) {
            $tableWide = require __DIR__.'/Resources/data/wcswidth_table_wide.php';
        }

        $width += $this->isInCodePointRanges($codePoint, $tableWide) ? 2 : 1;
    }

    return $width;
}

/**
 * Tells whether a code point belongs to one of the given ranges, using a binary search.
 *
 * @param int[][] $ranges Inclusive [first, last] pairs; the search requires them to be
 *                        sorted in ascending order and not to overlap
 */
private function isInCodePointRanges(int $codePoint, array $ranges): bool
{
    $lbound = 0;
    $ubound = \count($ranges) - 1;

    // Reject early when the table is empty or the code point lies outside its overall span.
    if (0 > $ubound || $codePoint < $ranges[0][0] || $codePoint > $ranges[$ubound][1]) {
        return false;
    }

    while ($ubound >= $lbound) {
        // Integer midpoint: keeps the array index an int instead of the float returned by floor().
        $mid = ($lbound + $ubound) >> 1;

        if ($codePoint > $ranges[$mid][1]) {
            $lbound = $mid + 1;
        } elseif ($codePoint < $ranges[$mid][0]) {
            $ubound = $mid - 1;
        } else {
            return true;
        }
    }

    return false;
}
|
||||
}
|
||||
|
@ -303,9 +303,6 @@ class ByteString extends AbstractString
|
||||
return $str;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritdoc}
|
||||
*/
|
||||
public function reverse(): parent
|
||||
{
|
||||
$str = clone $this;
|
||||
@ -460,29 +457,8 @@ class ByteString extends AbstractString
|
||||
|
||||
public function width(bool $ignoreAnsiDecoration = true): int
|
||||
{
|
||||
$width = 0;
|
||||
$s = str_replace(["\x00", "\x05", "\x07"], '', $this->string);
|
||||
$string = preg_match('//u', $this->string) ? $this->string : preg_replace('/[\x80-\xFF]/', '?', $this->string);
|
||||
|
||||
if (false !== strpos($s, "\r")) {
|
||||
$s = str_replace(["\r\n", "\r"], "\n", $s);
|
||||
}
|
||||
|
||||
foreach (explode("\n", $s) as $s) {
|
||||
if ($ignoreAnsiDecoration) {
|
||||
$s = preg_replace('/\x1B(?:
|
||||
\[ [\x30-\x3F]*+ [\x20-\x2F]*+ [0x40-\x7E]
|
||||
| [P\]X^_] .*? \x1B\\\\
|
||||
| [\x41-\x7E]
|
||||
)/x', '', $s);
|
||||
}
|
||||
|
||||
$w = substr_count($s, "\xAD") - substr_count($s, "\x08");
|
||||
|
||||
if ($width < $w += \strlen($s)) {
|
||||
$width = $w;
|
||||
}
|
||||
}
|
||||
|
||||
return $width;
|
||||
return (new CodePointString($string))->width($ignoreAnsiDecoration);
|
||||
}
|
||||
}
|
||||
|
@ -5,6 +5,7 @@ CHANGELOG
|
||||
-----
|
||||
|
||||
* Added the `AbstractString::reverse()` method.
|
||||
* Made `AbstractString::width()` follow POSIX.1-2001.
|
||||
|
||||
5.0.0
|
||||
-----
|
||||
|
113
src/Symfony/Component/String/Resources/WcswidthDataGenerator.php
Normal file
113
src/Symfony/Component/String/Resources/WcswidthDataGenerator.php
Normal file
@ -0,0 +1,113 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of the Symfony package.
|
||||
*
|
||||
* (c) Fabien Potencier <fabien@symfony.com>
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Symfony\Component\String\Resources;
|
||||
|
||||
use Symfony\Component\HttpClient\HttpClient;
|
||||
use Symfony\Component\String\Exception\RuntimeException;
|
||||
use Symfony\Component\VarExporter\VarExporter;
|
||||
|
||||
/**
 * Generates the code point range tables used to compute the display width of strings.
 *
 * The data is downloaded from the Unicode Character Database and written to the
 * output directory as two PHP files, "wcswidth_table_wide.php" and
 * "wcswidth_table_zero.php", each returning a sorted list of [first, last] ranges.
 *
 * @internal
 */
final class WcswidthDataGenerator
{
    private $outDir;

    private $client;

    /**
     * @param string $outDir Directory the generated table files are written to
     */
    public function __construct(string $outDir)
    {
        $this->outDir = $outDir;

        $this->client = HttpClient::createForBaseUri('https://www.unicode.org/Public/UNIDATA/');
    }

    /**
     * Downloads the Unicode data and (re)writes both table files.
     *
     * @throws RuntimeException When a data file cannot be parsed or a table cannot be written
     */
    public function generate(): void
    {
        $this->writeWideWidthData();

        $this->writeZeroWidthData();
    }

    /**
     * Extracts the ranges flagged "W" or "F" in EastAsianWidth.txt.
     */
    private function writeWideWidthData(): void
    {
        $content = $this->client->request('GET', 'EastAsianWidth.txt')->getContent();

        if (!preg_match('/^# EastAsianWidth-(\d+\.\d+\.\d+)\.txt/', $content, $matches)) {
            throw new RuntimeException('The Unicode version could not be determined.');
        }

        $version = $matches[1];

        // Code points are hexadecimal (A-F only). Padding around the ";" separator is
        // tolerated so that both the compact and the column-aligned layouts are parsed.
        if (!preg_match_all('/^([A-F\d]{4,})(?:\.\.([A-F\d]{4,}))?[ \t]*;[ \t]*[WF]/m', $content, $matches, PREG_SET_ORDER)) {
            throw new RuntimeException('The wide width pattern did not match anything.');
        }

        $this->write('wcswidth_table_wide.php', $version, $matches);
    }

    /**
     * Extracts the ranges whose general category is "Me" or "Mn" in DerivedGeneralCategory.txt.
     */
    private function writeZeroWidthData(): void
    {
        $content = $this->client->request('GET', 'extracted/DerivedGeneralCategory.txt')->getContent();

        if (!preg_match('/^# DerivedGeneralCategory-(\d+\.\d+\.\d+)\.txt/', $content, $matches)) {
            throw new RuntimeException('The Unicode version could not be determined.');
        }

        $version = $matches[1];

        if (!preg_match_all('/^([A-F\d]{4,})(?:\.\.([A-F\d]{4,}))? *; (?:Me|Mn)/m', $content, $matches, PREG_SET_ORDER)) {
            throw new RuntimeException('The zero width pattern did not match anything.');
        }

        $this->write('wcswidth_table_zero.php', $version, $matches);
    }

    /**
     * Writes a table file made of the generated header followed by the exported ranges.
     *
     * @param array $rawData Matches as returned by preg_match_all() with PREG_SET_ORDER
     */
    private function write(string $fileName, string $version, array $rawData): void
    {
        $content = $this->getHeader($version).'return '.VarExporter::export($this->format($rawData)).";\n";

        // file_put_contents() returns the number of bytes written, or false on failure.
        if (false === file_put_contents($this->outDir.'/'.$fileName, $content)) {
            throw new RuntimeException(sprintf('The "%s" file could not be written.', $fileName));
        }
    }

    /**
     * Returns the PHP opening tag and the comment placed at the top of each generated file.
     */
    private function getHeader(string $version): string
    {
        $date = (new \DateTimeImmutable())->format('c');

        return <<<EOT
<?php

/*
 * This file has been auto-generated by the Symfony String Component for internal use.
 *
 * Unicode version: $version
 * Date: $date
 */


EOT;
    }

    /**
     * Converts regex matches into [first, last] integer pairs sorted by first code point.
     *
     * @param array $rawData Rows holding the range start at index 1 and, for real ranges, the end at index 2
     */
    private function format(array $rawData): array
    {
        $data = array_map(static function (array $row): array {
            $start = $row[1];
            // A single code point has no second capture: it is a range of one.
            $end = $row[2] ?? $start;

            return [hexdec($start), hexdec($end)];
        }, $rawData);

        usort($data, static function (array $a, array $b): int {
            return $a[0] <=> $b[0];
        });

        return $data;
    }
}
|
55
src/Symfony/Component/String/Resources/bin/update-data.php
Normal file
55
src/Symfony/Component/String/Resources/bin/update-data.php
Normal file
@ -0,0 +1,55 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of the Symfony package.
|
||||
*
|
||||
* (c) Fabien Potencier <fabien@symfony.com>
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
use Symfony\Component\String\Resources\WcswidthDataGenerator;
|
||||
|
||||
error_reporting(E_ALL);

// Turn every PHP error into an exception so that any problem stops the script.
set_error_handler(static function (int $severity, string $message, string $file, int $line): void {
    throw new \ErrorException($message, 0, $severity, $file, $line);
});

// Print an uncaught exception, then each exception it wraps, from the outermost to the innermost.
set_exception_handler(static function (\Throwable $exception): void {
    echo "\n";

    for ($current = $exception; null !== $current; $current = $current->getPrevious()) {
        if ($current !== $exception) {
            echo "Caused by\n";
        }

        echo get_class($current).': '.$current->getMessage()."\n";
        echo "\n";
        echo $current->getFile().':'.$current->getLine()."\n";
        echo $current->getTraceAsString()."\n";
    }
});

$autoloadFile = __DIR__.'/../../vendor/autoload.php';

// The generator needs the dependencies installed in the component itself.
if (!file_exists($autoloadFile)) {
    echo wordwrap('You should run "composer install" in the component before running this script.', 75)." Aborting.\n";

    exit(1);
}

require_once $autoloadFile;

echo "Generating wcswidth tables data...\n";

$generator = new WcswidthDataGenerator(dirname(__DIR__).'/data');
$generator->generate();

echo "Done.\n";
|
1095
src/Symfony/Component/String/Resources/data/wcswidth_table_wide.php
Normal file
1095
src/Symfony/Component/String/Resources/data/wcswidth_table_wide.php
Normal file
File diff suppressed because it is too large
Load Diff
1303
src/Symfony/Component/String/Resources/data/wcswidth_table_zero.php
Normal file
1303
src/Symfony/Component/String/Resources/data/wcswidth_table_zero.php
Normal file
File diff suppressed because it is too large
Load Diff
@ -1451,4 +1451,35 @@ abstract class AbstractAsciiTestCase extends TestCase
|
||||
["\n!!!\tTAERG SI ynofmyS ", " Symfony IS GREAT\t!!!\n"],
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Checks that width() reports the expected number of columns for a string.
 *
 * @dataProvider provideWidth
 */
public function testWidth(int $expected, string $origin, bool $ignoreAnsiDecoration = true)
{
    $instance = static::createFromString($origin);

    $this->assertSame($expected, $instance->width($ignoreAnsiDecoration));
}
|
||||
|
||||
/**
 * Data sets for testWidth(): the expected width, the input string and, optionally,
 * the $ignoreAnsiDecoration flag (testWidth() defaults it to true).
 */
public static function provideWidth(): array
|
||||
{
|
||||
return [
|
||||
[0, ''],
|
||||
[1, 'c'],
|
||||
[3, 'foo'],
|
||||
// Each star is expected to take two columns.
[2, '⭐'],
|
||||
[8, 'f⭐o⭐⭐'],
|
||||
[19, 'コンニチハ, セカイ!'],
|
||||
// NUL, escape sequences and control characters are expected to add no column.
[6, "foo\u{0000}bar"],
|
||||
[6, "foo\u{001b}[0mbar"],
|
||||
[6, "foo\u{0001}bar"],
|
||||
[6, "foo\u{0001}bar", false],
|
||||
// The mark placed between the dashes is expected to add no column.
[4, '--ֿ--'],
|
||||
[4, 'café'],
|
||||
[1, 'А҈'],
|
||||
[4, 'ᬓᬨᬮ᭄'],
|
||||
// The soft hyphen is expected to take one column.
[1, "\u{00AD}"],
|
||||
// Same input measured with and without ANSI decoration being ignored: the trailing
// comments show the characters that are expected to be counted in each case.
[14, "\u{007f}\u{007f}f\u{001b}[0moo\u{0001}bar\u{007f}cccïf\u{008e}cy\u{0005}1"], // foobarcccïfcy1
|
||||
[17, "\u{007f}\u{007f}f\u{001b}[0moo\u{0001}bar\u{007f}cccïf\u{008e}cy\u{0005}1", false], // f[0moobarcccïfcy1
|
||||
];
|
||||
}
|
||||
}
|
||||
|
@ -43,4 +43,15 @@ class ByteStringTest extends AbstractAsciiTestCase
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Extends the inherited data sets with strings containing the raw bytes \x80 and \xfe.
 *
 * As the trailing comments show, each of those bytes is expected to be measured as one "?".
 */
public static function provideWidth(): array
|
||||
{
|
||||
return array_merge(
|
||||
parent::provideWidth(),
|
||||
[
|
||||
[10, "f\u{001b}[0moo\x80bar\xfe\xfe1"], // foo?bar??1
|
||||
[13, "f\u{001b}[0moo\x80bar\xfe\xfe1", false], // f[0moo?bar??1
|
||||
]
|
||||
|
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -22,6 +22,10 @@
|
||||
"symfony/polyfill-mbstring": "~1.0",
|
||||
"symfony/translation-contracts": "^1.1|^2"
|
||||
},
|
||||
"require-dev": {
|
||||
"symfony/http-client": "^4.4|^5.0",
|
||||
"symfony/var-exporter": "^4.4|^5.0"
|
||||
},
|
||||
"autoload": {
|
||||
"psr-4": { "Symfony\\Component\\String\\": "" },
|
||||
"files": [ "Resources/functions.php" ],
|
||||
|
Reference in New Issue
Block a user