minor #34058 [YAML] Improve performance of YAML parser (NamelessCoder)

This PR was merged into the 5.1-dev branch.

Discussion
----------

[YAML] Improve performance of YAML parser

| Q             | A
| ------------- | ---
| Branch?       | master
| Bug fix?      | no
| New feature?  | no
| Deprecations? | no
| License       | MIT

Optimise various methods and conditions to use the
best-performing alternatives where possible. Roughly:

* Uses functions that do not copy memory, e.g. strncmp as an
  alternative to strpos for matching the beginning of a string.
* Reorders some conditions so that the cheapest
  checks are evaluated first.
* Checks input before calling trim() - even when the function
  returns the same string as its input, the call still costs
  memory and introduces unnecessary overhead.
* Extracts variables for repeated identical function calls.
* Uses negative substring offsets instead of strlen + substr.
* Replaces single-char substr usages with substring access.

Profiling method
-----------------

Profiled with a custom script that splits and parses all provided `fixture` files from the YAML component's test directory. The script was run through Blackfire to identify the most frequently called methods.

Refactoring strategy
--------------------

Most important change: switching strpos to strncmp to avoid scanning a full (and potentially very long) YAML line for an occurrence of a substring when only the beginning of the line needs to match.

Whenever possible, I've gone for PHP functions that do not copy memory, and I've replaced single-character substr calls with direct string offset access.

In frequently called methods I've added guard clauses that prevent further processing when a YAML line is, for example, empty - such as avoiding trim() on already empty lines.

Profiling results
----------------

A Blackfire profiling delta can be seen at https://blackfire.io/profiles/compare/90fd3005-8b9f-4534-8bd8-1e66832bf247/graph. It was taken with 200 samples, each of which parses every YAML fixture from the component's test dir.

Commits
-------

7a7c9665da [YAML] Improve performance of YAML parser
This commit is contained in:
Nicolas Grekas 2020-02-07 15:18:18 +01:00
commit cd5c1d6361
2 changed files with 44 additions and 34 deletions

View File

@ -269,7 +269,7 @@ class Inline
*/
public static function parseScalar(string $scalar, int $flags = 0, array $delimiters = null, int &$i = 0, bool $evaluate = true, array $references = [])
{
if (\in_array($scalar[$i], ['"', "'"])) {
if (\in_array($scalar[$i], ['"', "'"], true)) {
// quoted scalar
$output = self::parseQuotedScalar($scalar, $i);
@ -324,7 +324,7 @@ class Inline
throw new ParseException(sprintf('Malformed inline YAML string: %s.', substr($scalar, $i)), self::$parsedLineNumber + 1, $scalar, self::$parsedFilename);
}
$output = substr($match[0], 1, \strlen($match[0]) - 2);
$output = substr($match[0], 1, -1);
$unescaper = new Unescaper();
if ('"' == $scalar[$i]) {
@ -371,7 +371,7 @@ class Inline
$value = self::parseMapping($sequence, $flags, $i, $references);
break;
default:
$isQuoted = \in_array($sequence[$i], ['"', "'"]);
$isQuoted = \in_array($sequence[$i], ['"', "'"], true);
$value = self::parseScalar($sequence, $flags, [',', ']'], $i, null === $tag, $references);
// the value can be an array if a reference has been resolved to an array var
@ -551,9 +551,8 @@ class Inline
private static function evaluateScalar(string $scalar, int $flags, array $references = [])
{
$scalar = trim($scalar);
$scalarLower = strtolower($scalar);
if (0 === strpos($scalar, '*')) {
if ('*' === ($scalar[0] ?? '')) {
if (false !== $pos = strpos($scalar, '#')) {
$value = substr($scalar, 1, $pos - 2);
} else {
@ -572,6 +571,8 @@ class Inline
return $references[$value];
}
$scalarLower = strtolower($scalar);
switch (true) {
case 'null' === $scalarLower:
case '' === $scalar:
@ -583,11 +584,11 @@ class Inline
return false;
case '!' === $scalar[0]:
switch (true) {
case 0 === strpos($scalar, '!!str '):
case 0 === strncmp($scalar, '!!str ', 6):
return (string) substr($scalar, 6);
case 0 === strpos($scalar, '! '):
case 0 === strncmp($scalar, '! ', 2):
return substr($scalar, 2);
case 0 === strpos($scalar, '!php/object'):
case 0 === strncmp($scalar, '!php/object', 11):
if (self::$objectSupport) {
if (!isset($scalar[12])) {
@trigger_error('Using the !php/object tag without a value is deprecated since Symfony 5.1.', E_USER_DEPRECATED);
@ -603,7 +604,7 @@ class Inline
}
return null;
case 0 === strpos($scalar, '!php/const'):
case 0 === strncmp($scalar, '!php/const', 10):
if (self::$constantSupport) {
if (!isset($scalar[11])) {
@trigger_error('Using the !php/const tag without a value is deprecated since Symfony 5.1.', E_USER_DEPRECATED);
@ -623,9 +624,9 @@ class Inline
}
return null;
case 0 === strpos($scalar, '!!float '):
case 0 === strncmp($scalar, '!!float ', 8):
return (float) substr($scalar, 8);
case 0 === strpos($scalar, '!!binary '):
case 0 === strncmp($scalar, '!!binary ', 9):
return self::evaluateBinaryScalar(substr($scalar, 9));
default:
throw new ParseException(sprintf('The string "%s" could not be parsed as it uses an unsupported built-in tag.', $scalar), self::$parsedLineNumber, $scalar, self::$parsedFilename);
@ -633,7 +634,7 @@ class Inline
// Optimize for returning strings.
// no break
case '+' === $scalar[0] || '-' === $scalar[0] || '.' === $scalar[0] || is_numeric($scalar[0]):
case \in_array($scalar[0], ['+', '-', '.'], true) || is_numeric($scalar[0]):
if (Parser::preg_match('{^[+-]?[0-9][0-9_]*$}', $scalar)) {
$scalar = str_replace('_', '', (string) $scalar);
}

View File

@ -28,6 +28,7 @@ class Parser
private $filename;
private $offset = 0;
private $numberOfParsedLines = 0;
private $totalNumberOfLines;
private $lines = [];
private $currentLineNb = -1;
@ -99,6 +100,7 @@ class Parser
}
$this->lines = [];
$this->currentLine = '';
$this->numberOfParsedLines = 0;
$this->refs = [];
$this->skippedLineNumbers = [];
$this->locallySkippedLineNumbers = [];
@ -113,10 +115,11 @@ class Parser
$this->currentLine = '';
$value = $this->cleanup($value);
$this->lines = explode("\n", $value);
$this->numberOfParsedLines = \count($this->lines);
$this->locallySkippedLineNumbers = [];
if (null === $this->totalNumberOfLines) {
$this->totalNumberOfLines = \count($this->lines);
$this->totalNumberOfLines = $this->numberOfParsedLines;
}
if (!$this->moveToNextLine()) {
@ -291,7 +294,7 @@ class Parser
$subTag = null;
if ($mergeNode) {
// Merge keys
} elseif (!isset($values['value']) || '' === $values['value'] || 0 === strpos($values['value'], '#') || (null !== $subTag = $this->getLineTag($values['value'], $flags)) || '<<' === $key) {
} elseif (!isset($values['value']) || '' === $values['value'] || '#' === ($values['value'][0] ?? '') || (null !== $subTag = $this->getLineTag($values['value'], $flags)) || '<<' === $key) {
// hash
// if next line is less indented or equal, then it means that the current value is null
if (!$this->isNextLineIndented() && !$this->isNextLineUnIndentedCollection()) {
@ -430,7 +433,8 @@ class Parser
$value = '';
foreach ($this->lines as $line) {
if ('' !== ltrim($line) && '#' === ltrim($line)[0]) {
$trimmedLine = trim($line);
if ('#' === ($trimmedLine[0] ?? '')) {
continue;
}
// If the indentation is not consistent at offset 0, it is to be considered as a ParseError
@ -442,22 +446,22 @@ class Parser
throw new ParseException('Mapping values are not allowed in multi-line blocks.', $this->getRealCurrentLineNb() + 1, $this->currentLine, $this->filename);
}
if ('' === trim($line)) {
if ('' === $trimmedLine) {
$value .= "\n";
} elseif (!$previousLineWasNewline && !$previousLineWasTerminatedWithBackslash) {
$value .= ' ';
}
if ('' !== trim($line) && '\\' === substr($line, -1)) {
if ('' !== $trimmedLine && '\\' === $line[-1]) {
$value .= ltrim(substr($line, 0, -1));
} elseif ('' !== trim($line)) {
$value .= trim($line);
} elseif ('' !== $trimmedLine) {
$value .= $trimmedLine;
}
if ('' === trim($line)) {
if ('' === $trimmedLine) {
$previousLineWasNewline = true;
$previousLineWasTerminatedWithBackslash = false;
} elseif ('\\' === substr($line, -1)) {
} elseif ('\\' === $line[-1]) {
$previousLineWasNewline = false;
$previousLineWasTerminatedWithBackslash = true;
} else {
@ -481,7 +485,7 @@ class Parser
$data = new TaggedValue($tag, $data);
}
if (Yaml::PARSE_OBJECT_FOR_MAP & $flags && !\is_object($data) && 'mapping' === $context) {
if (Yaml::PARSE_OBJECT_FOR_MAP & $flags && 'mapping' === $context && !\is_object($data)) {
$object = new \stdClass();
foreach ($data as $key => $value) {
@ -545,6 +549,10 @@ class Parser
*/
private function getCurrentLineIndentation(): int
{
if (' ' !== ($this->currentLine[0] ?? '')) {
return 0;
}
return \strlen($this->currentLine) - \strlen(ltrim($this->currentLine, ' '));
}
@ -653,7 +661,7 @@ class Parser
*/
private function moveToNextLine(): bool
{
if ($this->currentLineNb >= \count($this->lines) - 1) {
if ($this->currentLineNb >= $this->numberOfParsedLines - 1) {
return false;
}
@ -689,7 +697,7 @@ class Parser
*/
private function parseValue(string $value, int $flags, string $context)
{
if (0 === strpos($value, '*')) {
if ('*' === ($value[0] ?? '')) {
if (false !== $pos = strpos($value, '#')) {
$value = substr($value, 1, $pos - 2);
} else {
@ -750,7 +758,7 @@ class Parser
$lines[] = trim($this->currentLine);
// quoted string values end with a line that is terminated with the quotation character
if ('' !== $this->currentLine && substr($this->currentLine, -1) === $quotation) {
if ('' !== $this->currentLine && $this->currentLine[-1] === $quotation) {
break;
}
}
@ -944,7 +952,7 @@ class Parser
*/
private function isCurrentLineBlank(): bool
{
return '' == trim($this->currentLine, ' ');
return '' === $this->currentLine || '' === trim($this->currentLine, ' ');
}
/**
@ -955,7 +963,7 @@ class Parser
private function isCurrentLineComment(): bool
{
//checking explicitly the first char of the trim is faster than loops or strpos
$ltrimmedLine = ltrim($this->currentLine, ' ');
$ltrimmedLine = ' ' === $this->currentLine[0] ? ltrim($this->currentLine, ' ') : $this->currentLine;
return '' !== $ltrimmedLine && '#' === $ltrimmedLine[0];
}
@ -1041,7 +1049,7 @@ class Parser
*/
private function isStringUnIndentedCollectionItem(): bool
{
return '-' === rtrim($this->currentLine) || 0 === strpos($this->currentLine, '- ');
return 0 === strncmp($this->currentLine, '- ', 2) || '-' === rtrim($this->currentLine);
}
/**
@ -1144,22 +1152,23 @@ class Parser
$value = '';
for ($i = 0, $linesCount = \count($lines), $previousLineWasNewline = false, $previousLineWasTerminatedWithBackslash = false; $i < $linesCount; ++$i) {
if ('' === trim($lines[$i])) {
$trimmedLine = trim($lines[$i]);
if ('' === $trimmedLine) {
$value .= "\n";
} elseif (!$previousLineWasNewline && !$previousLineWasTerminatedWithBackslash) {
$value .= ' ';
}
if ('' !== trim($lines[$i]) && '\\' === substr($lines[$i], -1)) {
if ('' !== $trimmedLine && '\\' === $lines[$i][-1]) {
$value .= ltrim(substr($lines[$i], 0, -1));
} elseif ('' !== trim($lines[$i])) {
$value .= trim($lines[$i]);
} elseif ('' !== $trimmedLine) {
$value .= $trimmedLine;
}
if ('' === trim($lines[$i])) {
if ('' === $trimmedLine) {
$previousLineWasNewline = true;
$previousLineWasTerminatedWithBackslash = false;
} elseif ('\\' === substr($lines[$i], -1)) {
} elseif ('\\' === $lines[$i][-1]) {
$previousLineWasNewline = false;
$previousLineWasTerminatedWithBackslash = true;
} else {