323 lines
8.2 KiB
Perl
323 lines
8.2 KiB
Perl
|
:- module(unicode_collate_info,
|
||
|
[ write_unicode_collate_map/2 % +File, +Options
|
||
|
]).
|
||
|
:- use_module(library('unicode/unicode_data')).
|
||
|
:- use_module(library('http/dcg_basics')).
|
||
|
:- use_module(library(debug)).
|
||
|
:- use_module(library(option)).
|
||
|
|
||
|
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||
|
This module creates a simple map for removing diacritics from Unicode
|
||
|
characters and mapping them to lowercase. It defines tables and a
|
||
|
function "int sort_point(int code)". The sort points are defined as
|
||
|
follows:
|
||
|
|
||
|
* The high 24-bit contains the character code after mapping to
|
||
|
lowercase and removing possible diacritics.
|
||
|
* Bit 8 is 0 for characters that are mapped from upper to
|
||
|
lowercase and 1 for all other characters
|
||
|
* The low 7 bits encode the removed diacritics. All removed
|
||
|
diacritics are ordered by their Unicode codepoint
|
||
|
and numbered.sequentially.
|
||
|
|
||
|
As a result, text will be ordered by its basic character, upper case
|
||
|
before lowercase and text with diacritics after text without.
|
||
|
|
||
|
|
||
|
RUNNING
|
||
|
|
||
|
To run thhis module, first check the instructions in
|
||
|
library('unicode/unicode_data') for installing the Unicode datafiles
|
||
|
that are not included in this package.
|
||
|
|
||
|
|
||
|
MOTIVATION
|
||
|
|
||
|
This module is a simple-minded replacement for true Unicode support such
|
||
|
as provided by UCI (http://uci.sourceforge.net/). The advantage of the
|
||
|
do-it-yourself approach adopted here however is that it greatly reduces
|
||
|
the footprint and eliminates installation and maintenance problems
|
||
|
involved in adopting large `can-do-everything' external libraries.
|
||
|
|
||
|
We believe it deals correctly with the Western languages, Greek,
|
||
|
Cyrillic and other languages with similar handling of diacritics.
|
||
|
|
||
|
|
||
|
UPPERCASE
|
||
|
|
||
|
For Prolog it makes more sense to map to lowercase as the cannonical
|
||
|
case. However, we wish to order first on uppercase and to be able to do
|
||
|
prefix matches we need to search on the lowest value. Hence, we use the
|
||
|
uppercase version for sorting.
|
||
|
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
|
||
|
|
||
|
:- dynamic
|
||
|
diacritic/1, % known diacritics
|
||
|
diacritic_code/2. % +Diacritic, -Code (1..N)
|
||
|
|
||
|
/*******************************
|
||
|
* C TABLES *
|
||
|
*******************************/
|
||
|
|
||
|
% write_unicode_collate_map(+File, +Options)
|
||
|
%
|
||
|
% Options supported are:
|
||
|
%
|
||
|
% # first_codepage [0]
|
||
|
% Code page to start
|
||
|
%
|
||
|
% # last_codepage [127]
|
||
|
% Code page to end.
|
||
|
%
|
||
|
% # case(UpperOrLower)
|
||
|
% Canonise to upper (default) or lower case.
|
||
|
|
||
|
write_unicode_collate_map(File, Options) :-
|
||
|
open(File, write, Out),
|
||
|
call_cleanup(write_sort_map(Out, Options),
|
||
|
close(Out)).
|
||
|
|
||
|
write_sort_map(Out, Options) :-
|
||
|
gen_tables(Tables, Options),
|
||
|
write_header(Out, Options),
|
||
|
forall(member(table(CP, Map), Tables),
|
||
|
write_codepage(Out, CP, Map)),
|
||
|
write_map(Out, Tables, Options),
|
||
|
write_footer(Out, Options).
|
||
|
|
||
|
write_codepage(Out, CP, Map) :-
|
||
|
assertion(length(Map, 256)),
|
||
|
cp_name(CP, CPN),
|
||
|
format(Out, 'static const int32_t ~w[256] =~n', [CPN]),
|
||
|
format(Out, '{ ', []),
|
||
|
map_entries(Map, 0, Out),
|
||
|
format(Out, '~N};~n~n', []).
|
||
|
|
||
|
cp_name(CP, CPN) :-
|
||
|
sformat(CPN, 'ucp0x~|~`0t~16r~2+', [CP]).
|
||
|
|
||
|
map_entries([], _, _).
|
||
|
map_entries([H|T], I, Out) :-
|
||
|
( I == 0
|
||
|
-> true
|
||
|
; 0 =:= I mod 8
|
||
|
-> format(Out, ',~n ', [])
|
||
|
; format(Out, ', ', [])
|
||
|
),
|
||
|
format(Out, '~|0x~`0t~16r~8+', [H]),
|
||
|
I2 is I + 1,
|
||
|
map_entries(T, I2, Out).
|
||
|
|
||
|
write_map(Out, Tables, Options) :-
|
||
|
option(last_codepage(Last), Options, 127),
|
||
|
format(Out,
|
||
|
'static const int32_t* const ucoll_map[UNICODE_MAP_SIZE] =~n',
|
||
|
[]),
|
||
|
format(Out, '{ ', []),
|
||
|
map_tables(0, Last, Tables, Out),
|
||
|
format(Out, '~N};~n~n', []).
|
||
|
|
||
|
map_tables(CP, Last, _, _) :-
|
||
|
CP > Last, !.
|
||
|
map_tables(CP, Last, Tables, Out) :-
|
||
|
( CP == 0
|
||
|
-> true
|
||
|
; 0 =:= CP mod 8
|
||
|
-> format(Out, ',~n ', [])
|
||
|
; format(Out, ', ', [])
|
||
|
),
|
||
|
( memberchk(table(CP, _), Tables)
|
||
|
-> cp_name(CP, CPN),
|
||
|
format(Out, '~w', [CPN])
|
||
|
; format(Out, '~|~tNULL~7+', [])
|
||
|
),
|
||
|
CP2 is CP + 1,
|
||
|
map_tables(CP2, Last, Tables, Out).
|
||
|
|
||
|
|
||
|
write_header(Out, Options) :-
|
||
|
option(last_codepage(Last), Options, 127),
|
||
|
Size is Last+1,
|
||
|
format(Out,
|
||
|
'#ifdef WIN32\n\
|
||
|
typedef int int32_t;\n\
|
||
|
#else\n\
|
||
|
#include <inttypes.h>\n\
|
||
|
#endif\n\n', []),
|
||
|
format(Out,
|
||
|
'#ifndef NULL\n\
|
||
|
#define NULL ((void*)0)\n\
|
||
|
#endif\n\n', []),
|
||
|
format(Out,
|
||
|
'#define UNICODE_MAP_SIZE ~d~n~n', [Size]).
|
||
|
|
||
|
write_footer(Out, Options) :-
|
||
|
( memberchk(case(lower), Options)
|
||
|
-> Add = '+0x80'
|
||
|
; Add = ''
|
||
|
),
|
||
|
format(Out,
|
||
|
'static int\n\
|
||
|
sort_point(int code)\n\
|
||
|
{ int cp = code / 256;\n\
|
||
|
\n \
|
||
|
if ( cp < UNICODE_MAP_SIZE && ucoll_map[cp] )\n \
|
||
|
return ucoll_map[cp][code&0xff];\n\
|
||
|
\n \
|
||
|
return (code<<8)~w;\n\
|
||
|
}\n\n', [Add]),
|
||
|
format(Out,
|
||
|
'static int\n\
|
||
|
sort_pointA(int code)\n\
|
||
|
{ return ucp0x00[code&0xff];\n\
|
||
|
}\n\n', []).
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* TABLES *
|
||
|
*******************************/
|
||
|
|
||
|
gen_tables(Tables, Options) :-
|
||
|
forall(rm_diacritics(_, _, _), true),
|
||
|
assign_diacritic_codes,
|
||
|
findall(table(CP,Map), table(CP, Map, Options), Tables).
|
||
|
|
||
|
table(CP, Map, Options) :-
|
||
|
option(first_codepage(First), Options, 0),
|
||
|
option(last_codepage(Last), Options, 127),
|
||
|
between(First, Last, CP),
|
||
|
findall(M, char(CP, M, Options), Map), % now
|
||
|
non_empty_map(CP, Map, Options).
|
||
|
|
||
|
char(CP, Value, Options) :-
|
||
|
between(0, 255, I),
|
||
|
Code is 256*CP+I,
|
||
|
( char_to_code(Code, Value, Options)
|
||
|
-> true
|
||
|
; format('Failed on ~d~n', [Code]),
|
||
|
Value is Code<<8
|
||
|
).
|
||
|
|
||
|
char_to_code(Code, Value, Options) :-
|
||
|
memberchk(case(lower), Options), !,
|
||
|
( utolower(Code, Lower),
|
||
|
Lower \== Code
|
||
|
-> Cc = Lower,
|
||
|
CFlags = 0x00
|
||
|
; Cc = Code,
|
||
|
CFlags = 0x80
|
||
|
),
|
||
|
assertion(integer(Cc)),
|
||
|
( rm_diacritics(Cc, Base, Dia),
|
||
|
assertion(integer(Base))
|
||
|
-> diacritic_code(Dia, DiaV),
|
||
|
Value is Base << 8 \/ CFlags \/ DiaV
|
||
|
; Value is Cc << 8 \/ CFlags
|
||
|
).
|
||
|
char_to_code(Code, Value, _Options) :-
|
||
|
( utoupper(Code, Upper),
|
||
|
Upper \== Code
|
||
|
-> Cc = Upper,
|
||
|
CFlags = 0x80
|
||
|
; Cc = Code,
|
||
|
CFlags = 0x00
|
||
|
),
|
||
|
assertion(integer(Cc)),
|
||
|
( rm_diacritics(Cc, Base, Dia),
|
||
|
assertion(integer(Base))
|
||
|
-> diacritic_code(Dia, DiaV),
|
||
|
Value is Base << 8 \/ CFlags \/ DiaV
|
||
|
; Value is Cc << 8 \/ CFlags
|
||
|
).
|
||
|
|
||
|
non_empty_map(CP, Map, Options) :-
|
||
|
( memberchk(case(lower), Options)
|
||
|
-> Add is 0x80
|
||
|
; Add = 0
|
||
|
),
|
||
|
\+ empty_map(Map, 0, CP, Add).
|
||
|
|
||
|
empty_map([], _, _, _).
|
||
|
empty_map([H|T], I, CP, Add) :-
|
||
|
H =:= ((CP*256+I)<<8) + Add,
|
||
|
I2 is I + 1,
|
||
|
empty_map(T, I2, CP, Add).
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* CASE CONVERSION *
|
||
|
*******************************/
|
||
|
|
||
|
utolower(Code, Lower) :-
|
||
|
unicode_property(Code, simple_lowercase_mapping(Lower)).
|
||
|
|
||
|
utoupper(Code, Upper) :-
|
||
|
unicode_property(Code, simple_uppercase_mapping(Upper)).
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* DIACRITICS *
|
||
|
*******************************/
|
||
|
|
||
|
rm_diacritics(Code, Plain, Dia) :-
|
||
|
unicode_property(Code, decomposition_type(List)),
|
||
|
List \== '',
|
||
|
atomic_list_concat(AtomList, ' ', List),
|
||
|
to_plain(AtomList, Code, Plain, Dia).
|
||
|
|
||
|
to_plain([Special, PlainA], _, Plain, 0) :-
|
||
|
special(Special), !,
|
||
|
atom_hex(PlainA, Plain).
|
||
|
to_plain([PlainA], _Code, Plain, 1) :- !,
|
||
|
atom_hex(PlainA, Plain).
|
||
|
to_plain(List, Code, Plain, Dia) :-
|
||
|
maplist(atom_hex, List, Numbers),
|
||
|
Numbers = [Plain, Dia],
|
||
|
diacritic(Dia, Code), !.
|
||
|
|
||
|
diacritic(Code, For) :- !,
|
||
|
unicode_property(Code, canonical_combining_class(Cc)),
|
||
|
( Cc > 0
|
||
|
-> assert_diacritic(Code)
|
||
|
; debug(diacritic, '~16r: ~16r: Cc = ~d~n', [For, Code, Cc]),
|
||
|
fail
|
||
|
).
|
||
|
|
||
|
assign_diacritic_codes :-
|
||
|
retractall(diacritic_code(_,_)),
|
||
|
findall(D, diacritic(D), Ds),
|
||
|
sort([0,1|Ds], Sorted), % 0 and 1 are specials
|
||
|
assign_codes(Sorted, 1).
|
||
|
|
||
|
assign_codes([], _).
|
||
|
assign_codes([H|T], I) :-
|
||
|
assert(diacritic_code(H, I)),
|
||
|
I2 is I + 1,
|
||
|
assign_codes(T, I2).
|
||
|
|
||
|
assert_diacritic(Code) :-
|
||
|
diacritic(Code), !.
|
||
|
assert_diacritic(Code) :-
|
||
|
assert(diacritic(Code)).
|
||
|
|
||
|
atom_hex(Atom, Hex) :-
|
||
|
atom_codes(Atom, Codes),
|
||
|
phrase(xinteger(Hex), Codes).
|
||
|
|
||
|
special('<font>').
|
||
|
special('<noBreak>').
|
||
|
special('<initial>').
|
||
|
special('<medial>').
|
||
|
special('<final>').
|
||
|
special('<isolated>').
|
||
|
special('<circle>').
|
||
|
special('<super>').
|
||
|
special('<sub>').
|
||
|
special('<vertical>').
|
||
|
special('<wide>').
|
||
|
special('<narrow>').
|
||
|
special('<small>').
|
||
|
special('<square>').
|
||
|
special('<fraction>').
|
||
|
special('<compat> ').
|