forked from GNUsocial/gnu-social
[COMPONENTS][Tag] Split tag into words and stem each
This commit is contained in:
parent
4571b18c60
commit
98b719dca3
@ -97,10 +97,13 @@ class Tag extends Component
|
||||
/**
 * Build the canonical form of a tag.
 *
 * In the per-word version below, `#` is removed from the tag, the tag is split
 * with Formatting::splitWords(), each word is offered to the `StemWord` event,
 * each (possibly stemmed) word is slugified, and the pieces are concatenated.
 * The result is finally passed through self::ensureLength().
 *
 * @param string $tag      the raw tag text
 * @param string $language forwarded unchanged to the `StemWord` event
 *
 * @return string whatever self::ensureLength() returns for the assembled tag
 */
public static function canonicalTag(string $tag, string $language): string
|
||||
{
|
||||
$result = '';
|
||||
// NOTE(review): this `if` and the assignment after it look like the pre-change
// lines of the diff (stem the whole tag at once); the opening brace has no
// matching close in this view -- confirm against the commit before relying on it.
if (Event::handle('StemWord', [$language, $tag, &$result]) !== Event::stop) {
|
||||
$result = Formatting::slugify($tag);
|
||||
// Strip `#` first, then handle each word of the tag separately.
foreach (Formatting::splitWords(str_replace('#', '', $tag)) as $word) {
|
||||
$temp_res = null;
|
||||
// If the event does not end with Event::stop, keep the word unstemmed.
if (Event::handle('StemWord', [$language, $word, &$temp_res]) !== Event::stop) {
|
||||
$temp_res = $word;
|
||||
}
|
||||
// Append the slugified word directly; no separator is inserted between words.
$result .= Formatting::slugify($temp_res);
|
||||
}
|
||||
// Remove any `#` still present in the assembled result.
$result = str_replace('#', '', $result);
|
||||
return self::ensureLength($result);
|
||||
}
|
||||
|
||||
|
@ -29,7 +29,7 @@ use Wamania\Snowball\StemmerFactory;
|
||||
|
||||
class StemWord extends Plugin
|
||||
{
|
||||
public function onStemWord(string $language, string $word, string &$out)
|
||||
public function onStemWord(string $language, string $word, ?string &$out)
|
||||
{
|
||||
$out = StemmerFactory::create($language)->stem($word);
|
||||
return Event::stop;
|
||||
|
@ -267,13 +267,13 @@ abstract class Formatting
|
||||
$str = mb_convert_case($str, \MB_CASE_LOWER, 'UTF-8');
|
||||
return mb_substr($str, 0, $length);
|
||||
}
|
||||
$str = transliterator_transliterate('Any-Latin;' // any charset to latin compatible
|
||||
$str = transliterator_transliterate('Any-Latin;' // any charset to latin compatible
|
||||
. 'NFD;' // decompose
|
||||
. '[:Nonspacing Mark:] Remove;' // remove nonspacing marks (accents etc.)
|
||||
. 'NFC;' // composite again
|
||||
. '[:Punctuation:] Remove;' // remove punctuation (.,¿? etc.)
|
||||
. 'Lower();' // turn into lowercase
|
||||
. 'Latin-ASCII;', // get ASCII equivalents (ð to d for example)
|
||||
. 'Latin-ASCII;', // get ASCII equivalents (ð to d for example)
|
||||
$str, );
|
||||
return mb_substr(preg_replace('/[^\pL\pN]/u', '', $str), 0, $length);
|
||||
}
|
||||
@ -504,4 +504,12 @@ abstract class Formatting
|
||||
|
||||
return $output;
|
||||
}
|
||||
|
||||
/**
 * Split a string into words on `-`, `_`, or a lowercase-to-uppercase letter
 * transition, e.g. `fooBar-baz_qux` becomes `foo`, `Bar`, `baz`, `qux`.
 *
 * The `-` and `_` delimiters are dropped from the output. Leading, trailing or
 * adjacent delimiters produce empty-string entries, exactly as preg_split()
 * reports them.
 *
 * If the split itself fails (for instance because the input is not valid
 * UTF-8, which the `u` modifier rejects), the input is returned unchanged as
 * the only element instead of violating the declared return type.
 *
 * @param string $words the text to split
 *
 * @return string[] the pieces, in their original order
 */
public static function splitWords(string $words): array
{
    $parts = preg_split('/-|_|(?<=\p{Ll})(?=\p{Lu})/u', $words);

    // preg_split() returns false on failure; returning that from a method
    // declared `: array` would raise a TypeError, so fall back to the input.
    return $parts === false ? [$words] : $parts;
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user