minor #34058 [YAML] Improve performance of YAML parser (NamelessCoder)

This PR was merged into the 5.1-dev branch.

Discussion
----------

[YAML] Improve performance of YAML parser

| Q             | A
| ------------- | ---
| Branch?       | master
| Bug fix?      | no
| New feature?  | no
| Deprecations? | no
| License       | MIT

Optimise various methods and conditions to use the best-performing
alternatives where possible. Roughly:

* Uses functions that do not copy memory, e.g. strncmp as an
  alternative to strpos when matching the beginning of a string.
* Reorders some conditions so that the cheapest
  checks are evaluated first.
* Checks input before calling trim() - even when the function
  returns the same string as its input, the call still costs
  memory and introduces unnecessary overhead.
* Extracts variables for repeated identical function calls.
* Uses negative substring offsets instead of strlen + substr.
* Replaces single-char substr usages with string offset access (e.g. `$line[-1]`).

Profiling method
-----------------

Profiling was done with a custom script which splits and parses all provided `fixture` files from the YAML component's test directory. The script was then run through Blackfire to identify the most frequently called methods.

Refactoring strategy
--------------------

Most important change: switching from strpos to strncmp to avoid scanning a full (and potentially very long) YAML line for an occurrence of a substring.

Whenever possible, I've gone for PHP functions that do not copy memory, and I've replaced function calls with direct string offset access where the two are equivalent.

In methods which are called frequently I've added guard clauses to prevent further processing if a YAML line is, for example, empty - such as avoiding trim() on already empty lines.

Profiling results
----------------

A Blackfire profiling delta can be seen at https://blackfire.io/profiles/compare/90fd3005-8b9f-4534-8bd8-1e66832bf247/graph. It was taken with 200 samples, each of which parses every YAML fixture from the component's test dir.

Commits
-------

7a7c9665da [YAML] Improve performance of YAML parser
This commit is contained in:
Nicolas Grekas 2020-02-07 15:18:18 +01:00
commit cd5c1d6361
2 changed files with 44 additions and 34 deletions

View File

@ -269,7 +269,7 @@ class Inline
*/ */
public static function parseScalar(string $scalar, int $flags = 0, array $delimiters = null, int &$i = 0, bool $evaluate = true, array $references = []) public static function parseScalar(string $scalar, int $flags = 0, array $delimiters = null, int &$i = 0, bool $evaluate = true, array $references = [])
{ {
if (\in_array($scalar[$i], ['"', "'"])) { if (\in_array($scalar[$i], ['"', "'"], true)) {
// quoted scalar // quoted scalar
$output = self::parseQuotedScalar($scalar, $i); $output = self::parseQuotedScalar($scalar, $i);
@ -324,7 +324,7 @@ class Inline
throw new ParseException(sprintf('Malformed inline YAML string: %s.', substr($scalar, $i)), self::$parsedLineNumber + 1, $scalar, self::$parsedFilename); throw new ParseException(sprintf('Malformed inline YAML string: %s.', substr($scalar, $i)), self::$parsedLineNumber + 1, $scalar, self::$parsedFilename);
} }
$output = substr($match[0], 1, \strlen($match[0]) - 2); $output = substr($match[0], 1, -1);
$unescaper = new Unescaper(); $unescaper = new Unescaper();
if ('"' == $scalar[$i]) { if ('"' == $scalar[$i]) {
@ -371,7 +371,7 @@ class Inline
$value = self::parseMapping($sequence, $flags, $i, $references); $value = self::parseMapping($sequence, $flags, $i, $references);
break; break;
default: default:
$isQuoted = \in_array($sequence[$i], ['"', "'"]); $isQuoted = \in_array($sequence[$i], ['"', "'"], true);
$value = self::parseScalar($sequence, $flags, [',', ']'], $i, null === $tag, $references); $value = self::parseScalar($sequence, $flags, [',', ']'], $i, null === $tag, $references);
// the value can be an array if a reference has been resolved to an array var // the value can be an array if a reference has been resolved to an array var
@ -551,9 +551,8 @@ class Inline
private static function evaluateScalar(string $scalar, int $flags, array $references = []) private static function evaluateScalar(string $scalar, int $flags, array $references = [])
{ {
$scalar = trim($scalar); $scalar = trim($scalar);
$scalarLower = strtolower($scalar);
if (0 === strpos($scalar, '*')) { if ('*' === ($scalar[0] ?? '')) {
if (false !== $pos = strpos($scalar, '#')) { if (false !== $pos = strpos($scalar, '#')) {
$value = substr($scalar, 1, $pos - 2); $value = substr($scalar, 1, $pos - 2);
} else { } else {
@ -572,6 +571,8 @@ class Inline
return $references[$value]; return $references[$value];
} }
$scalarLower = strtolower($scalar);
switch (true) { switch (true) {
case 'null' === $scalarLower: case 'null' === $scalarLower:
case '' === $scalar: case '' === $scalar:
@ -583,11 +584,11 @@ class Inline
return false; return false;
case '!' === $scalar[0]: case '!' === $scalar[0]:
switch (true) { switch (true) {
case 0 === strpos($scalar, '!!str '): case 0 === strncmp($scalar, '!!str ', 6):
return (string) substr($scalar, 6); return (string) substr($scalar, 6);
case 0 === strpos($scalar, '! '): case 0 === strncmp($scalar, '! ', 2):
return substr($scalar, 2); return substr($scalar, 2);
case 0 === strpos($scalar, '!php/object'): case 0 === strncmp($scalar, '!php/object', 11):
if (self::$objectSupport) { if (self::$objectSupport) {
if (!isset($scalar[12])) { if (!isset($scalar[12])) {
@trigger_error('Using the !php/object tag without a value is deprecated since Symfony 5.1.', E_USER_DEPRECATED); @trigger_error('Using the !php/object tag without a value is deprecated since Symfony 5.1.', E_USER_DEPRECATED);
@ -603,7 +604,7 @@ class Inline
} }
return null; return null;
case 0 === strpos($scalar, '!php/const'): case 0 === strncmp($scalar, '!php/const', 10):
if (self::$constantSupport) { if (self::$constantSupport) {
if (!isset($scalar[11])) { if (!isset($scalar[11])) {
@trigger_error('Using the !php/const tag without a value is deprecated since Symfony 5.1.', E_USER_DEPRECATED); @trigger_error('Using the !php/const tag without a value is deprecated since Symfony 5.1.', E_USER_DEPRECATED);
@ -623,9 +624,9 @@ class Inline
} }
return null; return null;
case 0 === strpos($scalar, '!!float '): case 0 === strncmp($scalar, '!!float ', 8):
return (float) substr($scalar, 8); return (float) substr($scalar, 8);
case 0 === strpos($scalar, '!!binary '): case 0 === strncmp($scalar, '!!binary ', 9):
return self::evaluateBinaryScalar(substr($scalar, 9)); return self::evaluateBinaryScalar(substr($scalar, 9));
default: default:
throw new ParseException(sprintf('The string "%s" could not be parsed as it uses an unsupported built-in tag.', $scalar), self::$parsedLineNumber, $scalar, self::$parsedFilename); throw new ParseException(sprintf('The string "%s" could not be parsed as it uses an unsupported built-in tag.', $scalar), self::$parsedLineNumber, $scalar, self::$parsedFilename);
@ -633,7 +634,7 @@ class Inline
// Optimize for returning strings. // Optimize for returning strings.
// no break // no break
case '+' === $scalar[0] || '-' === $scalar[0] || '.' === $scalar[0] || is_numeric($scalar[0]): case \in_array($scalar[0], ['+', '-', '.'], true) || is_numeric($scalar[0]):
if (Parser::preg_match('{^[+-]?[0-9][0-9_]*$}', $scalar)) { if (Parser::preg_match('{^[+-]?[0-9][0-9_]*$}', $scalar)) {
$scalar = str_replace('_', '', (string) $scalar); $scalar = str_replace('_', '', (string) $scalar);
} }

View File

@ -28,6 +28,7 @@ class Parser
private $filename; private $filename;
private $offset = 0; private $offset = 0;
private $numberOfParsedLines = 0;
private $totalNumberOfLines; private $totalNumberOfLines;
private $lines = []; private $lines = [];
private $currentLineNb = -1; private $currentLineNb = -1;
@ -99,6 +100,7 @@ class Parser
} }
$this->lines = []; $this->lines = [];
$this->currentLine = ''; $this->currentLine = '';
$this->numberOfParsedLines = 0;
$this->refs = []; $this->refs = [];
$this->skippedLineNumbers = []; $this->skippedLineNumbers = [];
$this->locallySkippedLineNumbers = []; $this->locallySkippedLineNumbers = [];
@ -113,10 +115,11 @@ class Parser
$this->currentLine = ''; $this->currentLine = '';
$value = $this->cleanup($value); $value = $this->cleanup($value);
$this->lines = explode("\n", $value); $this->lines = explode("\n", $value);
$this->numberOfParsedLines = \count($this->lines);
$this->locallySkippedLineNumbers = []; $this->locallySkippedLineNumbers = [];
if (null === $this->totalNumberOfLines) { if (null === $this->totalNumberOfLines) {
$this->totalNumberOfLines = \count($this->lines); $this->totalNumberOfLines = $this->numberOfParsedLines;
} }
if (!$this->moveToNextLine()) { if (!$this->moveToNextLine()) {
@ -291,7 +294,7 @@ class Parser
$subTag = null; $subTag = null;
if ($mergeNode) { if ($mergeNode) {
// Merge keys // Merge keys
} elseif (!isset($values['value']) || '' === $values['value'] || 0 === strpos($values['value'], '#') || (null !== $subTag = $this->getLineTag($values['value'], $flags)) || '<<' === $key) { } elseif (!isset($values['value']) || '' === $values['value'] || '#' === ($values['value'][0] ?? '') || (null !== $subTag = $this->getLineTag($values['value'], $flags)) || '<<' === $key) {
// hash // hash
// if next line is less indented or equal, then it means that the current value is null // if next line is less indented or equal, then it means that the current value is null
if (!$this->isNextLineIndented() && !$this->isNextLineUnIndentedCollection()) { if (!$this->isNextLineIndented() && !$this->isNextLineUnIndentedCollection()) {
@ -430,7 +433,8 @@ class Parser
$value = ''; $value = '';
foreach ($this->lines as $line) { foreach ($this->lines as $line) {
if ('' !== ltrim($line) && '#' === ltrim($line)[0]) { $trimmedLine = trim($line);
if ('#' === ($trimmedLine[0] ?? '')) {
continue; continue;
} }
// If the indentation is not consistent at offset 0, it is to be considered as a ParseError // If the indentation is not consistent at offset 0, it is to be considered as a ParseError
@ -442,22 +446,22 @@ class Parser
throw new ParseException('Mapping values are not allowed in multi-line blocks.', $this->getRealCurrentLineNb() + 1, $this->currentLine, $this->filename); throw new ParseException('Mapping values are not allowed in multi-line blocks.', $this->getRealCurrentLineNb() + 1, $this->currentLine, $this->filename);
} }
if ('' === trim($line)) { if ('' === $trimmedLine) {
$value .= "\n"; $value .= "\n";
} elseif (!$previousLineWasNewline && !$previousLineWasTerminatedWithBackslash) { } elseif (!$previousLineWasNewline && !$previousLineWasTerminatedWithBackslash) {
$value .= ' '; $value .= ' ';
} }
if ('' !== trim($line) && '\\' === substr($line, -1)) { if ('' !== $trimmedLine && '\\' === $line[-1]) {
$value .= ltrim(substr($line, 0, -1)); $value .= ltrim(substr($line, 0, -1));
} elseif ('' !== trim($line)) { } elseif ('' !== $trimmedLine) {
$value .= trim($line); $value .= $trimmedLine;
} }
if ('' === trim($line)) { if ('' === $trimmedLine) {
$previousLineWasNewline = true; $previousLineWasNewline = true;
$previousLineWasTerminatedWithBackslash = false; $previousLineWasTerminatedWithBackslash = false;
} elseif ('\\' === substr($line, -1)) { } elseif ('\\' === $line[-1]) {
$previousLineWasNewline = false; $previousLineWasNewline = false;
$previousLineWasTerminatedWithBackslash = true; $previousLineWasTerminatedWithBackslash = true;
} else { } else {
@ -481,7 +485,7 @@ class Parser
$data = new TaggedValue($tag, $data); $data = new TaggedValue($tag, $data);
} }
if (Yaml::PARSE_OBJECT_FOR_MAP & $flags && !\is_object($data) && 'mapping' === $context) { if (Yaml::PARSE_OBJECT_FOR_MAP & $flags && 'mapping' === $context && !\is_object($data)) {
$object = new \stdClass(); $object = new \stdClass();
foreach ($data as $key => $value) { foreach ($data as $key => $value) {
@ -545,6 +549,10 @@ class Parser
*/ */
private function getCurrentLineIndentation(): int private function getCurrentLineIndentation(): int
{ {
if (' ' !== ($this->currentLine[0] ?? '')) {
return 0;
}
return \strlen($this->currentLine) - \strlen(ltrim($this->currentLine, ' ')); return \strlen($this->currentLine) - \strlen(ltrim($this->currentLine, ' '));
} }
@ -653,7 +661,7 @@ class Parser
*/ */
private function moveToNextLine(): bool private function moveToNextLine(): bool
{ {
if ($this->currentLineNb >= \count($this->lines) - 1) { if ($this->currentLineNb >= $this->numberOfParsedLines - 1) {
return false; return false;
} }
@ -689,7 +697,7 @@ class Parser
*/ */
private function parseValue(string $value, int $flags, string $context) private function parseValue(string $value, int $flags, string $context)
{ {
if (0 === strpos($value, '*')) { if ('*' === ($value[0] ?? '')) {
if (false !== $pos = strpos($value, '#')) { if (false !== $pos = strpos($value, '#')) {
$value = substr($value, 1, $pos - 2); $value = substr($value, 1, $pos - 2);
} else { } else {
@ -750,7 +758,7 @@ class Parser
$lines[] = trim($this->currentLine); $lines[] = trim($this->currentLine);
// quoted string values end with a line that is terminated with the quotation character // quoted string values end with a line that is terminated with the quotation character
if ('' !== $this->currentLine && substr($this->currentLine, -1) === $quotation) { if ('' !== $this->currentLine && $this->currentLine[-1] === $quotation) {
break; break;
} }
} }
@ -944,7 +952,7 @@ class Parser
*/ */
private function isCurrentLineBlank(): bool private function isCurrentLineBlank(): bool
{ {
return '' == trim($this->currentLine, ' '); return '' === $this->currentLine || '' === trim($this->currentLine, ' ');
} }
/** /**
@ -955,7 +963,7 @@ class Parser
private function isCurrentLineComment(): bool private function isCurrentLineComment(): bool
{ {
//checking explicitly the first char of the trim is faster than loops or strpos //checking explicitly the first char of the trim is faster than loops or strpos
$ltrimmedLine = ltrim($this->currentLine, ' '); $ltrimmedLine = ' ' === $this->currentLine[0] ? ltrim($this->currentLine, ' ') : $this->currentLine;
return '' !== $ltrimmedLine && '#' === $ltrimmedLine[0]; return '' !== $ltrimmedLine && '#' === $ltrimmedLine[0];
} }
@ -1041,7 +1049,7 @@ class Parser
*/ */
private function isStringUnIndentedCollectionItem(): bool private function isStringUnIndentedCollectionItem(): bool
{ {
return '-' === rtrim($this->currentLine) || 0 === strpos($this->currentLine, '- '); return 0 === strncmp($this->currentLine, '- ', 2) || '-' === rtrim($this->currentLine);
} }
/** /**
@ -1144,22 +1152,23 @@ class Parser
$value = ''; $value = '';
for ($i = 0, $linesCount = \count($lines), $previousLineWasNewline = false, $previousLineWasTerminatedWithBackslash = false; $i < $linesCount; ++$i) { for ($i = 0, $linesCount = \count($lines), $previousLineWasNewline = false, $previousLineWasTerminatedWithBackslash = false; $i < $linesCount; ++$i) {
if ('' === trim($lines[$i])) { $trimmedLine = trim($lines[$i]);
if ('' === $trimmedLine) {
$value .= "\n"; $value .= "\n";
} elseif (!$previousLineWasNewline && !$previousLineWasTerminatedWithBackslash) { } elseif (!$previousLineWasNewline && !$previousLineWasTerminatedWithBackslash) {
$value .= ' '; $value .= ' ';
} }
if ('' !== trim($lines[$i]) && '\\' === substr($lines[$i], -1)) { if ('' !== $trimmedLine && '\\' === $lines[$i][-1]) {
$value .= ltrim(substr($lines[$i], 0, -1)); $value .= ltrim(substr($lines[$i], 0, -1));
} elseif ('' !== trim($lines[$i])) { } elseif ('' !== $trimmedLine) {
$value .= trim($lines[$i]); $value .= $trimmedLine;
} }
if ('' === trim($lines[$i])) { if ('' === $trimmedLine) {
$previousLineWasNewline = true; $previousLineWasNewline = true;
$previousLineWasTerminatedWithBackslash = false; $previousLineWasTerminatedWithBackslash = false;
} elseif ('\\' === substr($lines[$i], -1)) { } elseif ('\\' === $lines[$i][-1]) {
$previousLineWasNewline = false; $previousLineWasNewline = false;
$previousLineWasTerminatedWithBackslash = true; $previousLineWasTerminatedWithBackslash = true;
} else { } else {