[COMPONENTS][Tag] Split tag into words and stem each

This commit is contained in:
Hugo Sales 2021-11-25 20:04:59 +00:00
parent 4571b18c60
commit 98b719dca3
Signed by: someonewithpc
GPG Key ID: 7D0C7EAFC9D835A0
3 changed files with 17 additions and 6 deletions

View File

@ -97,10 +97,13 @@ class Tag extends Component
/**
 * Build the canonical form of a tag for the given language.
 *
 * NOTE(review): this hunk is displayed without +/- markers, so lines from before
 * and after the commit appear interleaved and the braces do not balance as shown.
 * Going by the commit title, the intended flow is the `foreach` below: split the
 * tag into words, stem each word, and concatenate the slugified results.
 */
public static function canonicalTag(string $tag, string $language): string
{
$result = '';
// NOTE(review): this line and the next look like the pre-commit whole-tag stemming that the loop below replaces — confirm against the raw diff
if (Event::handle('StemWord', [$language, $tag, &$result]) !== Event::stop) {
$result = Formatting::slugify($tag);
// Strip `#` from the tag before splitting it into words with Formatting::splitWords()
foreach (Formatting::splitWords(str_replace('#', '', $tag)) as $word) {
$temp_res = null;
// When the StemWord event does not return Event::stop (presumably no stemmer handled it), keep the word unstemmed
if (Event::handle('StemWord', [$language, $word, &$temp_res]) !== Event::stop) {
$temp_res = $word;
}
// Slugified words are appended directly, with no separator between them
$result .= Formatting::slugify($temp_res);
}
// NOTE(review): likely a pre-commit line; `#` is already stripped from $tag before the loop above — confirm against the raw diff
$result = str_replace('#', '', $result);
return self::ensureLength($result);
}

View File

@ -29,7 +29,7 @@ use Wamania\Snowball\StemmerFactory;
class StemWord extends Plugin
{
public function onStemWord(string $language, string $word, string &$out)
public function onStemWord(string $language, string $word, ?string &$out)
{
$out = StemmerFactory::create($language)->stem($word);
return Event::stop;

View File

@ -267,13 +267,13 @@ abstract class Formatting
$str = mb_convert_case($str, \MB_CASE_LOWER, 'UTF-8');
return mb_substr($str, 0, $length);
}
$str = transliterator_transliterate('Any-Latin;' // any charset to latin compatible
$str = transliterator_transliterate('Any-Latin;' // any charset to latin compatible
. 'NFD;' // decompose
. '[:Nonspacing Mark:] Remove;' // remove nonspacing marks (accents etc.)
. 'NFC;' // composite again
. '[:Punctuation:] Remove;' // remove punctuation (.,¿? etc.)
. 'Lower();' // turn into lowercase
. 'Latin-ASCII;', // get ASCII equivalents (ð to d for example)
. 'Latin-ASCII;', // get ASCII equivalents (ð to d for example)
$str, );
return mb_substr(preg_replace('/[^\pL\pN]/u', '', $str), 0, $length);
}
@ -504,4 +504,12 @@ abstract class Formatting
return $output;
}
/**
 * Split a string into words on `-`, `_`, or a lower-to-upper case transition
 * (e.g. `fooBar-baz_qux` becomes `['foo', 'Bar', 'baz', 'qux']`).
 *
 * Separators are dropped; the case transition is a zero-width split, so no
 * characters are lost there. Consecutive or leading/trailing separators yield
 * empty strings in the result, and an empty input yields `['']`.
 *
 * @param string $words The text to split
 *
 * @return string[] The pieces, in order; `[$words]` unsplit if the regex engine fails
 */
public static function splitWords(string $words): array
{
    $parts = preg_split('/-|_|(?<=\p{Ll})(?=\p{Lu})/u', $words);

    // preg_split() returns false on failure (e.g. malformed UTF-8 under the `u`
    // modifier); returning that from an `array`-typed method would raise a
    // TypeError, so fall back to treating the whole input as a single word.
    return $parts === false ? [$words] : $parts;
}
}