# Patch summary (the hunks below appear with their original line breaks collapsed).
#
# components/Tag/Tag.php -- Tag::canonicalTag()
#   Before: one 'StemWord' event was dispatched for the whole tag; when it did not
#   return Event::stop the result fell back to Formatting::slugify($tag), and '#'
#   was stripped from the result afterwards.
#   After: '#' is stripped first, the tag is split with Formatting::splitWords(),
#   and each word is sent through the 'StemWord' event separately. When the event
#   does not return Event::stop the word itself is used. Every per-word result is
#   passed through Formatting::slugify() and appended to $result, which is then
#   returned via self::ensureLength().
#   NOTE(review): $temp_res starts as null; if a handler returns Event::stop
#   without assigning it, null reaches Formatting::slugify() -- slugify's
#   signature is not visible here, confirm it tolerates that.
#
# plugins/StemWord/StemWord.php -- StemWord::onStemWord()
#   The by-reference $out parameter changes from `string` to `?string`, matching
#   the caller above, which now passes a variable initialised to null.
#
# src/Util/Formatting.php
#   * Two lines of the transliterator_transliterate() call are replaced by lines
#     whose visible text is the same in this view -- presumably a whitespace-only
#     change; verify against the real patch.
#   * Adds Formatting::splitWords(), which returns preg_split() on `-`, `_`, or a
#     zero-width boundary between a lowercase letter (\p{Ll}) and an uppercase
#     letter (\p{Lu}), using the `u` modifier.
#     NOTE(review): preg_split() can return false on failure while the method
#     declares `: array`; and without PREG_SPLIT_NO_EMPTY, consecutive separators
#     (e.g. "a--b") yield empty-string segments -- confirm both are acceptable.
diff --git a/components/Tag/Tag.php b/components/Tag/Tag.php index 9769b47812..83b475efad 100644 --- a/components/Tag/Tag.php +++ b/components/Tag/Tag.php @@ -97,10 +97,13 @@ class Tag extends Component public static function canonicalTag(string $tag, string $language): string { $result = ''; - if (Event::handle('StemWord', [$language, $tag, &$result]) !== Event::stop) { - $result = Formatting::slugify($tag); + foreach (Formatting::splitWords(str_replace('#', '', $tag)) as $word) { + $temp_res = null; + if (Event::handle('StemWord', [$language, $word, &$temp_res]) !== Event::stop) { + $temp_res = $word; + } + $result .= Formatting::slugify($temp_res); } - $result = str_replace('#', '', $result); return self::ensureLength($result); } diff --git a/plugins/StemWord/StemWord.php b/plugins/StemWord/StemWord.php index 899e8ac529..b7a2c04565 100644 --- a/plugins/StemWord/StemWord.php +++ b/plugins/StemWord/StemWord.php @@ -29,7 +29,7 @@ use Wamania\Snowball\StemmerFactory; class StemWord extends Plugin { - public function onStemWord(string $language, string $word, string &$out) + public function onStemWord(string $language, string $word, ?string &$out) { $out = StemmerFactory::create($language)->stem($word); return Event::stop; diff --git a/src/Util/Formatting.php b/src/Util/Formatting.php index bb0b109801..fb2b8d29ee 100644 --- a/src/Util/Formatting.php +++ b/src/Util/Formatting.php @@ -267,13 +267,13 @@ abstract class Formatting $str = mb_convert_case($str, \MB_CASE_LOWER, 'UTF-8'); return mb_substr($str, 0, $length); } - $str = transliterator_transliterate('Any-Latin;' // any charset to latin compatible + $str = transliterator_transliterate('Any-Latin;' // any charset to latin compatible . 'NFD;' // decompose . '[:Nonspacing Mark:] Remove;' // remove nonspacing marks (accents etc.) . 'NFC;' // composite again . '[:Punctuation:] Remove;' // remove punctuation (.,¿? etc.) . 'Lower();' // turn into lowercase - . 
'Latin-ASCII;', // get ASCII equivalents (ð to d for example) + . 'Latin-ASCII;', // get ASCII equivalents (ð to d for example) $str, ); return mb_substr(preg_replace('/[^\pL\pN]/u', '', $str), 0, $length); } @@ -504,4 +504,12 @@ abstract class Formatting return $output; } + + /** + * Split words by `-`, `_` or lower to upper case transitions + */ + public static function splitWords(string $words): array + { + return preg_split('/-|_|(?<=\p{Ll})(?=\p{Lu})/u', $words); + } }