bug #40763 Fix/Rewrite .gitignore regex builder (mvorisek)

This PR was squashed before being merged into the 4.4 branch.

Discussion
----------

Fix/Rewrite .gitignore regex builder

| Q             | A
| ------------- | ---
| Branch?       | 4.4
| Bug fix?      | yes
| New feature?  | no
| Deprecations? | no
| Tickets       | fix #39257
| License       | MIT
| Doc PR        | no

This PR fixes `.gitignore` parsing: negated (exclude) wildcard rules such as `!a/*/b` were failing with the PHP error `preg_match(): Compilation failed: lookbehind assertion is not fixed length at offset`.

Functionality/performance was verified against large `.gitignore` files:
- https://github.com/PrestaShop/PrestaShop/blob/1.7.7.3/.gitignore
- https://github.com/dotnet/installer/blob/v5.0.202/.gitignore
- https://github.com/dotnet/runtime/blob/v5.0.5/.gitignore

This PR also improves the test cases.

Commits
-------

83f9fd3adf Fix/Rewrite .gitignore regex builder
This commit is contained in:
Fabien Potencier 2021-05-09 11:13:17 +02:00
commit be480d8f97
2 changed files with 287 additions and 142 deletions

View File

@ -14,6 +14,7 @@ namespace Symfony\Component\Finder;
/**
* Gitignore matches against text.
*
* @author Michael Voříšek <vorismi3@fel.cvut.cz>
* @author Ahmed Abdou <mail@ahmd.io>
*/
class Gitignore
@ -21,113 +22,66 @@ class Gitignore
/**
* Returns a regexp which is the equivalent of the gitignore pattern.
*
* @return string The regexp
* Format specification: https://git-scm.com/docs/gitignore#_pattern_format
*
* The whole file content is converted at once: comments are stripped, the
* content is split into lines, and every non-empty line is folded into a
* single combined expression.
*
* NOTE(review): this listing appears to interleave removed and added diff
* lines without +/- markers (e.g. two consecutive return statements at the
* end) - confirm against the repository before relying on any single line.
*/
public static function toRegex(string $gitignoreFileContent): string
{
$gitignoreFileContent = preg_replace('/^[^\\\r\n]*#.*/m', '', $gitignoreFileContent);
$gitignoreLines = preg_split('/\r\n|\r|\n/', $gitignoreFileContent);
// Drop everything from a "#" that is not preceded by a backslash up to the end of its line.
$gitignoreFileContent = preg_replace('~(?<!\\\\)#[^\n\r]*~', '', $gitignoreFileContent);
// Split on CRLF, a lone CR, or LF.
$gitignoreLines = preg_split('~\r\n?|\n~', $gitignoreFileContent);
$positives = [];
$negatives = [];
// Seed with the regex for an empty line, which lineToRegex() builds as a never-matching pattern.
$res = self::lineToRegex('');
foreach ($gitignoreLines as $i => $line) {
$line = trim($line);
if ('' === $line) {
continue;
// Remove trailing spaces/tabs unless the run is preceded by a backslash.
$line = preg_replace('~(?<!\\\\)[ \t]+$~', '', $line);
// A leading "!" marks the line as a negation; the marker itself is stripped.
if ('!' === substr($line, 0, 1)) {
$line = substr($line, 1);
$isNegative = true;
} else {
$isNegative = false;
}
if (1 === preg_match('/^!/', $line)) {
$positives[$i] = null;
$negatives[$i] = self::getRegexFromGitignore(preg_replace('/^!(.*)/', '${1}', $line), true);
continue;
if ('' !== $line) {
if ($isNegative) {
// Negations are prepended as a negative lookahead in front of everything collected so far.
$res = '(?!'.self::lineToRegex($line).'$)'.$res;
} else {
// Other lines are appended as a further alternative.
$res = '(?:'.$res.'|'.self::lineToRegex($line).')';
}
}
$negatives[$i] = null;
$positives[$i] = self::getRegexFromGitignore($line);
}
$index = 0;
$patterns = [];
foreach ($positives as $pattern) {
if (null === $pattern) {
continue;
}
$negativesAfter = array_filter(\array_slice($negatives, ++$index));
if ([] !== $negativesAfter) {
$pattern .= sprintf('(?<!%s)', implode('|', $negativesAfter));
}
$patterns[] = $pattern;
}
return sprintf('/^((%s))$/', implode(')|(', $patterns));
// Anchor the combined expression at the start of the subject.
return '~^(?:'.$res.')~s';
}
/**
* Converts a single gitignore line into a regex fragment (no delimiters).
*
* NOTE(review): this listing appears to interleave the removed
* getRegexFromGitignore() body with the added lineToRegex() body - confirm
* which lines belong to the current version.
*/
private static function getRegexFromGitignore(string $gitignorePattern, bool $negative = false): string
private static function lineToRegex(string $gitignoreLine): string
{
$regex = '';
$isRelativePath = false;
// If there is a separator at the beginning or middle (or both) of the pattern, then the pattern is relative to the directory level of the particular .gitignore file itself
$slashPosition = strpos($gitignorePattern, '/');
if (false !== $slashPosition && \strlen($gitignorePattern) - 1 !== $slashPosition) {
if (0 === $slashPosition) {
$gitignorePattern = substr($gitignorePattern, 1);
}
$isRelativePath = true;
$regex .= '^';
if ('' === $gitignoreLine) {
return '$f'; // always false
}
if ('/' === $gitignorePattern[\strlen($gitignorePattern) - 1]) {
$gitignorePattern = substr($gitignorePattern, 0, -1);
// A first slash anywhere other than the last character makes the line "absolute";
// a leading slash is stripped because the final regex is already anchored by the caller.
$slashPos = strpos($gitignoreLine, '/');
if (false !== $slashPos && \strlen($gitignoreLine) - 1 !== $slashPos) {
if (0 === $slashPos) {
$gitignoreLine = substr($gitignoreLine, 1);
}
$isAbsolute = true;
} else {
$isAbsolute = false;
}
$iMax = \strlen($gitignorePattern);
for ($i = 0; $i < $iMax; ++$i) {
$tripleChars = substr($gitignorePattern, $i, 3);
if ('**/' === $tripleChars || '/**' === $tripleChars) {
$regex .= '.*';
$i += 2;
continue;
}
// Translate every "/"-separated segment on its own.
$parts = array_map(function (string $v): string {
// Remove all backslashes, then quote the segment for use inside a "~"-delimited regex.
$v = preg_quote(str_replace('\\', '', $v), '~');
// Turn quoted "\[...\]" back into a real character class, restoring "-" inside it.
$v = preg_replace_callback('~\\\\\[([^\[\]]*)\\\\\]~', function (array $matches): string {
return '['.str_replace('\\-', '-', $matches[1]).']';
}, $v);
// Quoted "**" becomes one or more path segments.
$v = preg_replace('~\\\\\*\\\\\*~', '[^/]+(?:/[^/]+)*', $v);
// Quoted "*" becomes any run of non-slash characters.
$v = preg_replace('~\\\\\*~', '[^/]*', $v);
// Quoted "?" becomes exactly one non-slash character.
$v = preg_replace('~\\\\\?~', '[^/]', $v);
$doubleChars = substr($gitignorePattern, $i, 2);
if ('**' === $doubleChars) {
$regex .= '.*';
++$i;
continue;
}
if ('*/' === $doubleChars) {
$regex .= '[^\/]*\/?[^\/]*';
++$i;
continue;
}
return $v;
}, explode('/', $gitignoreLine));
$c = $gitignorePattern[$i];
switch ($c) {
case '*':
$regex .= $isRelativePath ? '[^\/]*' : '[^\/]*\/?[^\/]*';
break;
case '/':
case '.':
case ':':
case '(':
case ')':
case '{':
case '}':
$regex .= '\\'.$c;
break;
default:
$regex .= $c;
}
}
if ($negative) {
// a lookbehind assertion has to be a fixed width (it can not have nested '|' statements)
return sprintf('%s$|%s\/$', $regex, $regex);
}
return '(?>'.$regex.'($|\/.*))';
// Non-absolute lines may be preceded by any number of directories. When the last
// segment is non-empty (the line has no trailing slash), it must be followed by the
// end of the subject or by a slash.
return ($isAbsolute ? '' : '(?:[^/]+/)*')
.implode('/', $parts)
.('' !== end($parts) ? '(?:$|/)' : '');
}
}

View File

@ -13,136 +13,327 @@ namespace Symfony\Component\Finder\Tests;
use PHPUnit\Framework\TestCase;
use Symfony\Component\Finder\Gitignore;
/**
* @author Michael Voříšek <vorismi3@fel.cvut.cz>
*/
class GitignoreTest extends TestCase
{
/**
* Checks the regex built by Gitignore::toRegex() against paths that must
* match and paths that must not match.
*
* NOTE(review): this listing appears to interleave removed and added diff
* lines (two signatures, two assertion forms per loop) - confirm which lines
* belong to the current version.
*
* @dataProvider provider
* @dataProvider providerExtended
*/
public function testCases(string $patterns, array $matchingCases, array $nonMatchingCases)
public function testToRegex(array $gitignoreLines, array $matchingCases, array $nonMatchingCases)
{
$patterns = implode("\n", $gitignoreLines);
$regex = Gitignore::toRegex($patterns);
// The generated regex must be the same whichever line-ending style joins the lines.
$this->assertSame($regex, Gitignore::toRegex(implode("\r\n", $gitignoreLines)));
$this->assertSame($regex, Gitignore::toRegex(implode("\r", $gitignoreLines)));
foreach ($matchingCases as $matchingCase) {
$this->assertMatchesRegularExpression($regex, $matchingCase, sprintf('Failed asserting path [%s] matches gitignore patterns [%s] using regex [%s]', $matchingCase, $patterns, $regex));
$this->assertMatchesRegularExpression(
$regex,
$matchingCase,
sprintf(
"Failed asserting path:\n%s\nmatches gitignore patterns:\n%s",
// Prefix every line of the path / patterns in the failure message.
preg_replace('~^~m', ' ', $matchingCase),
preg_replace('~^~m', ' ', $patterns)
)
);
}
foreach ($nonMatchingCases as $nonMatchingCase) {
$this->assertDoesNotMatchRegularExpression($regex, $nonMatchingCase, sprintf('Failed asserting path [%s] not matching gitignore patterns [%s] using regex [%s]', $nonMatchingCase, $patterns, $regex));
$this->assertDoesNotMatchRegularExpression(
$regex,
$nonMatchingCase,
sprintf("Failed asserting path:\n%s\nNOT matching gitignore patterns:\n%s",
preg_replace('~^~m', ' ', $nonMatchingCase),
preg_replace('~^~m', ' ', $patterns)
)
);
}
}
/**
* Base data sets for the gitignore regex test.
*
* @return array return is array of
* [
* [
* '', // Git-ignore Pattern
* [], // array of file paths matching
* [], // array of file paths not matching
* ],
* ]
*
* NOTE(review): testToRegex() declares its first parameter as
* `array $gitignoreLines`, so the first element of each case is presumably a
* list of gitignore lines rather than a single pattern string. This listing
* also contains string-form entries, which look like removed diff lines shown
* without +/- markers - confirm which entries belong to the current version.
*/
public function provider(): array
{
return [
$cases = [
[
'
*
!/bin
!/bin/bash
',
[''],
[],
['a', 'a/b', 'a/b/c', 'aa', 'm.txt', '.txt'],
],
[
['a', 'X'],
['a', 'a/b', 'a/b/c', 'X', 'b/a', 'b/c/a', 'a/X', 'a/X/y', 'b/a/X/y'],
['A', 'x', 'aa', 'm.txt', '.txt', 'aa/b', 'b/aa'],
],
[
['/a', 'x', 'd/'],
['a', 'a/b', 'a/b/c', 'x', 'a/x', 'a/x/y', 'b/a/x/y', 'd/', 'd/u', 'e/d/', 'e/d/u'],
['b/a', 'b/c/a', 'aa', 'm.txt', '.txt', 'aa/b', 'b/aa', 'e/d'],
],
[
['a/', 'x'],
['a/b', 'a/b/c', 'x', 'a/x', 'a/x/y', 'b/a/x/y'],
['a', 'b/a', 'b/c/a', 'aa', 'm.txt', '.txt', 'aa/b', 'b/aa'],
],
[
['*'],
['a', 'a/b', 'a/b/c', 'aa', 'm.txt', '.txt'],
[],
],
[
['/*'],
['a', 'a/b', 'a/b/c', 'aa', 'm.txt', '.txt'],
[],
],
[
['/a', 'm/*'],
['a', 'a/b', 'a/b/c', 'm/'],
['aa', 'm', 'b/m', 'b/m/'],
],
[
['a', '!x'],
['a', 'a/b', 'a/b/c', 'b/a', 'b/c/a'],
['x', 'aa', 'm.txt', '.txt', 'aa/b', 'b/aa'],
],
[
['a', '!a/', 'b', '!b/b'],
['a', 'a/x', 'x/a', 'x/a/x', 'b', 'b'],
['a/', 'x/a/', 'bb', 'b/b', 'bb'],
],
[
['[a-c]', 'x[C-E][][o]', 'g-h'],
['a', 'b', 'c', 'xDo', 'g-h'],
['A', 'xdo', 'u', 'g', 'h'],
],
[
['a?', '*/??b?'],
['ax', 'x/xxbx'],
['a', 'axy', 'xxax', 'x/xxax', 'x/y/xxax'],
],
[
[' ', ' \ ', ' \ ', '/a ', '/b/c \ '],
[' ', ' ', 'x/ ', 'x/ ', 'a', 'a/x', 'b/c '],
[' ', ' ', 'x/ ', 'x/ ', 'a ', 'b/c '],
],
[
['#', ' #', '/ #', ' #', '/ #', ' \ #', ' \ #', 'a #', 'a #', 'a \ #', 'a \ #'],
[' ', ' ', 'a', 'a ', 'a '],
[' ', ' ', 'a ', 'a '],
],
[
["\t", "\t\\\t", " \t\\\t ", "\t#", "a\t#", "a\t\t#", "a \t#", "a\t\t\\\t#", "a \t\t\\\t\t#"],
["\t\t", " \t\t", 'a', "a\t\t\t", "a \t\t\t"],
["\t", "\t\t ", " \t\t ", "a\t", 'a ', "a \t", "a\t\t"],
],
[
[' a', 'b ', '\ ', 'c\ '],
[' a', 'b', ' ', 'c '],
['a', 'b ', 'c'],
],
[
['#a', '\#b', '\#/'],
['#b', '#/'],
['#a', 'a', 'b'],
],
[
['*', '!!', '!!*x', '\!!b'],
['a', '!!', '!!b'],
['!', '!x', '!xx'],
],
[
[
'*',
'!/bin',
'!/bin/bash',
],
['bin/cat', 'abc/bin/cat'],
['bin/bash'],
],
[
'fi#le.txt',
['fi#le.txt'],
[],
['#file.txt'],
],
[
'
/bin/
/usr/local/
!/bin/bash
!/usr/local/bin/bash
',
[
'/bin/',
'/usr/local/',
'!/bin/bash',
'!/usr/local/bin/bash',
],
['bin/cat'],
['bin/bash'],
],
[
'*.py[co]',
['*.py[co]'],
['file.pyc', 'file.pyc'],
['filexpyc', 'file.pycx', 'file.py'],
],
[
'dir1/**/dir2/',
['dir1/**/dir2/'],
['dir1/dirA/dir2/', 'dir1/dirA/dirB/dir2/'],
[],
],
[
'dir1/*/dir2/',
['dir1/*/dir2/'],
['dir1/dirA/dir2/'],
['dir1/dirA/dirB/dir2/'],
],
[
'/*.php',
['/*.php'],
['file.php'],
['app/file.php'],
],
[
'\#file.txt',
['\#file.txt'],
['#file.txt'],
[],
],
[
'*.php',
['*.php'],
['app/file.php', 'file.php'],
['file.phps', 'file.phps', 'filephps'],
],
[
'app/cache/',
['app/cache/'],
['app/cache/file.txt', 'app/cache/dir1/dir2/file.txt'],
['a/app/cache/file.txt'],
],
[
'
#IamComment
/app/cache/',
[
'#IamComment',
'/app/cache/',
],
['app/cache/file.txt', 'app/cache/subdir/ile.txt'],
['a/app/cache/file.txt', '#IamComment', 'IamComment'],
],
[
'
/app/cache/
#LastLineIsComment',
[
'/app/cache/',
'#LastLineIsComment',
],
['app/cache/file.txt', 'app/cache/subdir/ile.txt'],
['a/app/cache/file.txt', '#LastLineIsComment', 'LastLineIsComment'],
],
[
'
/app/cache/
\#file.txt
#LastLineIsComment',
[
'/app/cache/',
'\#file.txt',
'#LastLineIsComment',
],
['app/cache/file.txt', 'app/cache/subdir/ile.txt', '#file.txt'],
['a/app/cache/file.txt', '#LastLineIsComment', 'LastLineIsComment'],
],
[
'
/app/cache/
\#file.txt
#IamComment
another_file.txt',
[
'/app/cache/',
'\#file.txt',
'#IamComment',
'another_file.txt',
],
['app/cache/file.txt', 'app/cache/subdir/ile.txt', '#file.txt', 'another_file.txt'],
['a/app/cache/file.txt', 'IamComment', '#IamComment'],
],
[
'
/app/**
!/app/bin
!/app/bin/test
',
[
'/app/**',
'!/app/bin',
'!/app/bin/test',
],
['app/test/file', 'app/bin/file'],
['app/bin/test'],
],
[
[
'/app/*/img',
'!/app/*/img/src',
],
['app/a/img', 'app/a/img/x', 'app/a/img/src/x'],
['app/a/img/src', 'app/a/img/src/'],
],
[
[
'app/**/img',
'!/app/**/img/src',
],
['app/a/img', 'app/a/img/x', 'app/a/img/src/x', 'app/a/b/img', 'app/a/b/img/x', 'app/a/b/img/src/x', 'app/a/b/c/img'],
['app/a/img/src', 'app/a/b/img/src', 'app/a/c/b/img/src'],
],
[
[
'/*',
'!/foo',
'/foo/*',
'!/foo/bar',
],
['bar', 'foo/ba', 'foo/barx', 'x/foo/bar'],
['foo', 'foo/bar'],
],
[
[
'/example/**',
'!/example/example.txt',
'!/example/packages',
],
['example/test', 'example/example.txt2', 'example/packages/foo.yaml'],
['example/example.txt', 'example/packages', 'example/packages/'],
],
];
return $cases;
}
/**
 * Derives additional data sets from provider().
 *
 * For every base case, six variants are produced whose gitignore lines are
 * the base lines surrounded by (or repeated after) extra rules, while the
 * expected matching and non-matching paths are reused unchanged.
 *
 * @return array list of [gitignore lines, matching paths, non-matching paths]
 */
public function providerExtended(): array
{
    $negatedNever = ['!never', '!**/never2', '!never3/**'];

    // Rule sets placed in front of the base lines, in the order the variants are emitted.
    $prefixes = [
        ['!*'],
        ['*', '!*'],
        ['never', '**/never2', 'never3/**'],
        $negatedNever,
    ];

    $extended = [];
    foreach ($this->provider() as $baseCase) {
        $baseLines = $baseCase[0];
        $matching = $baseCase[1];
        $nonMatching = $baseCase[2];

        // Variant 1: base lines wrapped between a rule and its negation.
        $extended[] = [array_merge(['never'], $baseLines, ['!never']), $matching, $nonMatching];

        // Variants 2-5: base lines preceded by each prefix rule set.
        foreach ($prefixes as $prefix) {
            $extended[] = [array_merge($prefix, $baseLines), $matching, $nonMatching];
        }

        // Variant 6: the base lines repeated 30 times after the negated rules.
        $repeated = [];
        for ($round = 30; $round > 0; --$round) {
            foreach ($baseLines as $baseLine) {
                $repeated[] = $baseLine;
            }
        }
        $extended[] = [array_merge($negatedNever, $repeated), $matching, $nonMatching];
    }

    return $extended;
}
}