[DATABASE] Change collation handling

Until now, table definitions could define collations only for MariaDB, using
MariaDB's collation names directly.
Definitions now use a slightly more abstract collation name syntax that
supports only the collations utf8mb4_bin and utf8mb4_unicode_(cs|ci) (wrapped
as utf8_bin, utf8_general_(cs|ci)), because those are the ones that have
practical use for GNU social.

Which also means that on MariaDB the formerly used utf8mb4_general_(cs|ci) have
been superseded by utf8mb4_unicode_(cs|ci), as they are the more modern
replacement.

Introduce collation support on PostgreSQL which results in use of the C (POSIX)
collation as utf8_bin and the und-x-icu collation as utf8_general_cs.
utf8_general_ci is also mapped to und-x-icu, which makes it case-sensitive,
unfortunately.
This commit is contained in:
Alexei Sorokin 2020-08-16 23:41:28 +03:00
parent 5c21816b22
commit 341e34b766
5 changed files with 141 additions and 36 deletions

View File

@ -79,7 +79,7 @@ class Notice extends Managed_DataObject
'id' => array('type' => 'serial', 'not null' => true, 'description' => 'unique identifier'),
'profile_id' => array('type' => 'int', 'not null' => true, 'description' => 'who made the update'),
'uri' => array('type' => 'varchar', 'length' => 191, 'description' => 'universally unique identifier, usually a tag URI'),
'content' => array('type' => 'text', 'description' => 'update content', 'collate' => 'utf8mb4_general_ci'),
'content' => array('type' => 'text', 'description' => 'update content', 'collate' => 'utf8_general_ci'),
'rendered' => array('type' => 'text', 'description' => 'HTML version of the content'),
'url' => array('type' => 'varchar', 'length' => 191, 'description' => 'URL of any attachment (image, video, bookmark, whatever)'),
'created' => array('type' => 'datetime', 'description' => 'date this record was created'),

View File

@ -46,12 +46,12 @@ class Profile extends Managed_DataObject
'description' => 'local and remote users have profiles',
'fields' => array(
'id' => array('type' => 'serial', 'not null' => true, 'description' => 'unique identifier'),
'nickname' => array('type' => 'varchar', 'length' => 64, 'not null' => true, 'description' => 'nickname or username', 'collate' => 'utf8mb4_general_ci'),
'fullname' => array('type' => 'text', 'description' => 'display name', 'collate' => 'utf8mb4_general_ci'),
'nickname' => array('type' => 'varchar', 'length' => 64, 'not null' => true, 'description' => 'nickname or username', 'collate' => 'utf8_general_ci'),
'fullname' => array('type' => 'text', 'description' => 'display name', 'collate' => 'utf8_general_ci'),
'profileurl' => array('type' => 'text', 'description' => 'URL, cached so we dont regenerate'),
'homepage' => array('type' => 'text', 'description' => 'identifying URL', 'collate' => 'utf8mb4_general_ci'),
'bio' => array('type' => 'text', 'description' => 'descriptive biography', 'collate' => 'utf8mb4_general_ci'),
'location' => array('type' => 'text', 'description' => 'physical location', 'collate' => 'utf8mb4_general_ci'),
'homepage' => array('type' => 'text', 'description' => 'identifying URL', 'collate' => 'utf8_general_ci'),
'bio' => array('type' => 'text', 'description' => 'descriptive biography', 'collate' => 'utf8_general_ci'),
'location' => array('type' => 'text', 'description' => 'physical location', 'collate' => 'utf8_general_ci'),
'lat' => array('type' => 'numeric', 'precision' => 10, 'scale' => 7, 'description' => 'latitude'),
'lon' => array('type' => 'numeric', 'precision' => 10, 'scale' => 7, 'description' => 'longitude'),
'location_id' => array('type' => 'int', 'description' => 'location id if possible'),

View File

@ -147,10 +147,8 @@ class MysqlSchema extends Schema
}
}
$table_props = $this->getTableProperties($table, ['TABLE_COLLATION']);
$collate = $row['COLLATION_NAME'];
if (!empty($collate) && $collate !== $table_props['TABLE_COLLATION']) {
$field['collate'] = $collate;
if (!empty($row['COLLATION_NAME'])) {
$field['collate'] = $row['COLLATION_NAME'];
}
$def['fields'][$name] = $field;
@ -471,17 +469,6 @@ class MysqlSchema extends Schema
return in_array(strtolower($cd['type']), $ints);
}
/**
 * Tell whether a column definition describes a character/string column.
 *
 * The check is case-insensitive and looks only at the 'type' entry.
 *
 * @param array $cd column definition; its 'type' key is read
 * @return bool true for char, varchar and text columns
 */
private function isStringType(array $cd): bool
{
    $type = strtolower($cd['type']);

    return $type === 'char'
        || $type === 'varchar'
        || $type === 'text';
}
/**
* Return the proper SQL for creating or
* altering a column.
@ -547,6 +534,34 @@ class MysqlSchema extends Schema
return $type;
}
/**
 * Translate one of our abstract collation names into the MariaDB name.
 *
 * Only utf8_bin, utf8_general_cs and utf8_general_ci are accepted. Any
 * other value is logged as an error and handled as utf8_bin.
 *
 * @param string $collate collation name in our format
 * @return string collation name in MariaDB format
 */
protected function collationToMySQL(string $collate): string
{
    // Our "utf8_general_*" names are backed by the "utf8mb4_unicode_*"
    // family, everything else just gains the "mb4" marker.
    // NOTE(review): confirm that the target server really provides a
    // collation named utf8mb4_unicode_cs.
    $native = [
        'utf8_bin'        => 'utf8mb4_bin',
        'utf8_general_cs' => 'utf8mb4_unicode_cs',
        'utf8_general_ci' => 'utf8mb4_unicode_ci',
    ];

    if (isset($native[$collate])) {
        return $native[$collate];
    }

    common_log(
        LOG_ERR,
        'Collation not supported: "' . $collate . '"'
    );

    // Unsupported names fall back to the binary collation
    return $native['utf8_bin'];
}
public function typeAndSize(string $name, array $column)
{
if ($column['type'] === 'enum') {
@ -581,15 +596,6 @@ class MysqlSchema extends Schema
{
$tableDef = parent::filterDef($tableName, $tableDef);
// Get existing table collation if the table exists.
// To know if collation that's been set is unique for the table.
try {
$table_props = $this->getTableProperties($tableName, ['TABLE_COLLATION']);
$table_collate = $table_props['TABLE_COLLATION'];
} catch (SchemaTableMissingException $e) {
$table_collate = null;
}
foreach ($tableDef['fields'] as $name => &$col) {
switch ($col['type']) {
case 'serial':
@ -603,9 +609,8 @@ class MysqlSchema extends Schema
break;
}
if (!empty($col['collate'])
&& $col['collate'] === $table_collate) {
unset($col['collate']);
if (!empty($col['collate'])) {
$col['collate'] = $this->collationToMySQL($col['collate']);
}
$col['type'] = $this->mapType($col);

View File

@ -132,10 +132,13 @@ class PgsqlSchema extends Schema
) {
$field['auto_increment'] = true;
} elseif (array_key_exists($name, $enum_info)) {
$field['type'] = $type = 'enum';
$field['enum'] = $enum_info[$name];
}
if (!empty($row['collation_name'])) {
$field['collate'] = $row['collation_name'];
}
$def['fields'][$name] = $field;
}
@ -415,6 +418,7 @@ class PgsqlSchema extends Schema
'integer' => 'int',
'char' => 'bpchar',
'datetime' => 'timestamp',
'enum' => 'text',
'blob' => 'bytea'
];
@ -442,6 +446,49 @@ class PgsqlSchema extends Schema
return $type;
}
/**
 * Translate one of our abstract collation names into the PostgreSQL name.
 *
 * Only utf8_bin, utf8_general_cs and utf8_general_ci are accepted. Any
 * other value is logged as an error and handled as utf8_bin.
 *
 * @param string $collate collation name in our format
 * @return string "C" for utf8_bin, "und-x-icu" for both utf8_general_* names
 */
protected function collationToPostgreSQL(string $collate): string
{
    $supported = ['utf8_bin', 'utf8_general_cs', 'utf8_general_ci'];

    if (!in_array($collate, $supported, true)) {
        common_log(
            LOG_ERR,
            'Collation not supported: "' . $collate . '"'
        );
        $collate = 'utf8_bin';
    }

    // @fixme No case-insensitivity support
    // At this point only the three supported names remain, so anything
    // that is not the binary collation is one of the utf8_general_* pair.
    return ($collate === 'utf8_bin') ? 'C' : 'und-x-icu';
}
/**
 * Build the type part of a column definition, adding a COLLATE clause
 * for string columns that carry a non-empty 'collate' entry.
 *
 * @param string $name column name, passed through to the parent
 * @param array $column column definition
 * @return string SQL fragment produced by the parent, optionally followed
 *                by ' COLLATE "<collation>"'
 */
public function typeAndSize(string $name, array $column)
{
    $sql = parent::typeAndSize($name, $column);

    // Collations only make sense on string columns that request one
    if (!$this->isStringType($column) || empty($column['collate'])) {
        return $sql;
    }

    return $sql . ' COLLATE "' . $column['collate'] . '"';
}
/**
* Append an SQL statement with an index definition for a full-text search
* index over one or more columns on a table.
@ -475,14 +522,16 @@ class PgsqlSchema extends Schema
foreach ($tableDef['fields'] as $name => &$col) {
// No convenient support for field descriptions
unset($col['description']);
// @fixme Nor for MariaDB-specific collations
unset($col['collate']);
if ($col['type'] === 'serial') {
$col['type'] = 'int';
$col['auto_increment'] = true;
}
if (!empty($col['collate'])) {
$col['collate'] = $this->collationToPostgreSQL($col['collate']);
}
$col['type'] = $this->mapType($col);
unset($col['size']);
}

View File

@ -891,6 +891,48 @@ class Schema
return null;
}
/**
 * Tell whether a column definition describes a character/string column.
 *
 * The check is case-insensitive and looks only at the 'type' entry.
 *
 * @param array $cd column definition; its 'type' key is read
 * @return bool true for char, varchar, text, bpchar and enum columns
 */
protected function isStringType(array $cd): bool
{
    static $stringTypes = [
        'char'    => true,
        'varchar' => true,
        'text'    => true,
        'bpchar'  => true, // PostgreSQL
        'enum'    => true, // MariaDB
    ];

    return isset($stringTypes[strtolower($cd['type'])]);
}
/**
 * Translate a MariaDB collation name into our abstract format.
 *
 * Names already in our format pass through unchanged. Anything that does
 * not end up as utf8_bin, utf8_general_cs or utf8_general_ci is logged
 * as an error and replaced with utf8_bin.
 *
 * @param string $collate collation name, possibly in MariaDB format
 * @return string collation name in our format
 */
protected function collationFromMySQL(string $collate): string
{
    // Strip the "mb4" marker: utf8mb4_* becomes utf8_*
    if (strncmp($collate, 'utf8mb4_', 8) === 0) {
        $collate = 'utf8_' . substr($collate, 8);
    }

    // The "unicode" family is exposed under our "general" names
    if (strncmp($collate, 'utf8_unicode_', 13) === 0) {
        $collate = 'utf8_general_' . substr($collate, 13);
    }

    $supported = ['utf8_bin', 'utf8_general_cs', 'utf8_general_ci'];
    if (in_array($collate, $supported, true)) {
        return $collate;
    }

    common_log(
        LOG_ERR,
        'Collation not supported: "' . $collate . '"'
    );

    return 'utf8_bin';
}
/**
* Return the proper SQL for creating or
* altering a column.
@ -1059,6 +1101,15 @@ class Schema
if (array_key_exists('not null', $col) && $col['not null'] !== true) {
unset($col['not null']);
}
if ($this->isStringType($col)) {
// Default collation
if (empty($col['collate'])) {
$col['collate'] = 'utf8_bin';
}
// Migration from direct MariaDB collations
$col['collate'] = $this->collationFromMySQL($col['collate']);
}
}
if (common_config('search', 'type') !== 'fulltext') {