323 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			Prolog
		
	
	
	
	
	
			
		
		
	
	
			323 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			Prolog
		
	
	
	
	
	
:- module(unicode_collate_info,
 | 
						|
	  [ write_unicode_collate_map/2	% +File, +Options
 | 
						|
	  ]).
 | 
						|
:- use_module(library('unicode/unicode_data')).
 | 
						|
:- use_module(library('http/dcg_basics')).
 | 
						|
:- use_module(library(debug)).
 | 
						|
:- use_module(library(option)).
 | 
						|
 | 
						|
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 | 
						|
This module creates a simple map   for  removing diacritics from Unicode
 | 
						|
characters and mapping them  to  lowercase.   It  defines  tables  and a
 | 
						|
function "int sort_point(int code)". The  sort   points  are  defined as
 | 
						|
follows:
 | 
						|
 | 
						|
	* The high 24-bit contains the character code after mapping to
 | 
						|
	  lowercase and removing possible diacritics.
 | 
						|
	* Bit 8 is 0 for characters that are mapped from upper to
 | 
						|
	  lowercase and 1 for all other characters
 | 
						|
	* The low 7 bits encode the removed diacritics.  All removed
 | 
						|
	  diacritics are ordered by their Unicode codepoint
 | 
						|
	  and numbered.sequentially.
 | 
						|
 | 
						|
As a result, text will be  ordered   by  its basic character, upper case
 | 
						|
before lowercase and text with diacritics after text without.
 | 
						|
 | 
						|
 | 
						|
RUNNING
 | 
						|
 | 
						|
To   run   thhis   module,    first     check    the   instructions   in
 | 
						|
library('unicode/unicode_data') for installing  the   Unicode  datafiles
 | 
						|
that are not included in this package.
 | 
						|
 | 
						|
 | 
						|
MOTIVATION
 | 
						|
 | 
						|
This module is a simple-minded replacement for true Unicode support such
 | 
						|
as provided by UCI (http://uci.sourceforge.net/).   The advantage of the
 | 
						|
do-it-yourself approach adopted here however is  that it greatly reduces
 | 
						|
the footprint and  eliminates  installation   and  maintenance  problems
 | 
						|
involved in adopting large `can-do-everything' external libraries.
 | 
						|
 | 
						|
We believe it  deals  correctly  with   the  Western  languages,  Greek,
 | 
						|
Cyrillic and other languages with similar handling of diacritics.
 | 
						|
 | 
						|
 | 
						|
UPPERCASE
 | 
						|
 | 
						|
For Prolog it makes more sense  to   map  to lowercase as the cannonical
 | 
						|
case. However, we wish to order first on  uppercase and to be able to do
 | 
						|
prefix matches we need to search on the  lowest value. Hence, we use the
 | 
						|
uppercase version for sorting.
 | 
						|
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
 | 
						|
 | 
						|
:- dynamic
 | 
						|
	diacritic/1,			% known diacritics
 | 
						|
	diacritic_code/2.		% +Diacritic, -Code (1..N)
 | 
						|
 | 
						|
		 /*******************************
 | 
						|
		 *	     C TABLES		*
 | 
						|
		 *******************************/
 | 
						|
 | 
						|
%	write_unicode_collate_map(+File, +Options)
 | 
						|
%
 | 
						|
%	Options supported are:
 | 
						|
%
 | 
						|
%		# first_codepage [0]
 | 
						|
%		Code page to start
 | 
						|
%
 | 
						|
%		# last_codepage [127]
 | 
						|
%		Code page to end.
 | 
						|
%
 | 
						|
%		# case(UpperOrLower)
 | 
						|
%		Canonise to upper (default) or lower case.
 | 
						|
 | 
						|
write_unicode_collate_map(File, Options) :-
 | 
						|
	open(File, write, Out),
 | 
						|
	call_cleanup(write_sort_map(Out, Options),
 | 
						|
		     close(Out)).
 | 
						|
 | 
						|
write_sort_map(Out, Options) :-
 | 
						|
	gen_tables(Tables, Options),
 | 
						|
	write_header(Out, Options),
 | 
						|
	forall(member(table(CP, Map), Tables),
 | 
						|
	       write_codepage(Out, CP, Map)),
 | 
						|
	write_map(Out, Tables, Options),
 | 
						|
	write_footer(Out, Options).
 | 
						|
 | 
						|
write_codepage(Out, CP, Map) :-
 | 
						|
	assertion(length(Map, 256)),
 | 
						|
	cp_name(CP, CPN),
 | 
						|
	format(Out, 'static const int32_t ~w[256] =~n', [CPN]),
 | 
						|
	format(Out, '{ ', []),
 | 
						|
	map_entries(Map, 0, Out),
 | 
						|
	format(Out, '~N};~n~n', []).
 | 
						|
 | 
						|
cp_name(CP, CPN) :-
 | 
						|
	sformat(CPN, 'ucp0x~|~`0t~16r~2+', [CP]).
 | 
						|
 | 
						|
map_entries([], _, _).
 | 
						|
map_entries([H|T], I, Out) :-
 | 
						|
	(   I == 0
 | 
						|
	->  true
 | 
						|
	;   0 =:= I mod 8
 | 
						|
	->  format(Out, ',~n  ', [])
 | 
						|
	;   format(Out, ', ', [])
 | 
						|
	),
 | 
						|
	format(Out, '~|0x~`0t~16r~8+', [H]),
 | 
						|
	I2 is I + 1,
 | 
						|
	map_entries(T, I2, Out).
 | 
						|
 | 
						|
write_map(Out, Tables, Options) :-
 | 
						|
	option(last_codepage(Last), Options, 127),
 | 
						|
	format(Out,
 | 
						|
	       'static const int32_t* const ucoll_map[UNICODE_MAP_SIZE] =~n',
 | 
						|
	       []),
 | 
						|
	format(Out, '{ ', []),
 | 
						|
	map_tables(0, Last, Tables, Out),
 | 
						|
	format(Out, '~N};~n~n', []).
 | 
						|
 | 
						|
map_tables(CP, Last, _, _) :-
 | 
						|
	CP > Last, !.
 | 
						|
map_tables(CP, Last, Tables, Out) :-
 | 
						|
	(   CP == 0
 | 
						|
	->  true
 | 
						|
	;   0 =:= CP mod 8
 | 
						|
	->  format(Out, ',~n  ', [])
 | 
						|
	;   format(Out, ', ', [])
 | 
						|
	),
 | 
						|
	(   memberchk(table(CP, _), Tables)
 | 
						|
	->  cp_name(CP, CPN),
 | 
						|
	    format(Out, '~w', [CPN])
 | 
						|
	;   format(Out, '~|~tNULL~7+', [])
 | 
						|
	),
 | 
						|
	CP2 is CP + 1,
 | 
						|
	map_tables(CP2, Last, Tables, Out).
 | 
						|
 | 
						|
 | 
						|
write_header(Out, Options) :-
 | 
						|
	option(last_codepage(Last), Options, 127),
 | 
						|
	Size is Last+1,
 | 
						|
	format(Out,
 | 
						|
	       '#ifdef WIN32\n\
 | 
						|
		typedef int int32_t;\n\
 | 
						|
		#else\n\
 | 
						|
		#include <inttypes.h>\n\
 | 
						|
		#endif\n\n', []),
 | 
						|
	format(Out,
 | 
						|
	       '#ifndef NULL\n\
 | 
						|
		#define NULL ((void*)0)\n\
 | 
						|
		#endif\n\n', []),
 | 
						|
	format(Out,
 | 
						|
	       '#define UNICODE_MAP_SIZE ~d~n~n', [Size]).
 | 
						|
 | 
						|
write_footer(Out, Options) :-
 | 
						|
	(   memberchk(case(lower), Options)
 | 
						|
	->  Add = '+0x80'
 | 
						|
	;   Add = ''
 | 
						|
	),
 | 
						|
	format(Out,
 | 
						|
	       'static int\n\
 | 
						|
		sort_point(int code)\n\
 | 
						|
		{ int cp = code / 256;\n\
 | 
						|
		\n  \
 | 
						|
		  if ( cp < UNICODE_MAP_SIZE && ucoll_map[cp] )\n    \
 | 
						|
		    return ucoll_map[cp][code&0xff];\n\
 | 
						|
		\n  \
 | 
						|
		  return (code<<8)~w;\n\
 | 
						|
		}\n\n', [Add]),
 | 
						|
	format(Out,
 | 
						|
	       'static int\n\
 | 
						|
		sort_pointA(int code)\n\
 | 
						|
		{ return ucp0x00[code&0xff];\n\
 | 
						|
		}\n\n', []).
 | 
						|
 | 
						|
 | 
						|
		 /*******************************
 | 
						|
		 *	       TABLES		*
 | 
						|
		 *******************************/
 | 
						|
 | 
						|
gen_tables(Tables, Options) :-
 | 
						|
	forall(rm_diacritics(_, _, _), true),
 | 
						|
	assign_diacritic_codes,
 | 
						|
	findall(table(CP,Map), table(CP, Map, Options), Tables).
 | 
						|
 | 
						|
table(CP, Map, Options) :-
 | 
						|
	option(first_codepage(First), Options, 0),
 | 
						|
	option(last_codepage(Last), Options, 127),
 | 
						|
	between(First, Last, CP),
 | 
						|
	findall(M, char(CP, M, Options), Map),	% now
 | 
						|
	non_empty_map(CP, Map, Options).
 | 
						|
 | 
						|
char(CP, Value, Options) :-
 | 
						|
	between(0, 255, I),
 | 
						|
	Code is 256*CP+I,
 | 
						|
	(   char_to_code(Code, Value, Options)
 | 
						|
	->  true
 | 
						|
	;   format('Failed on ~d~n', [Code]),
 | 
						|
	    Value is Code<<8
 | 
						|
	).
 | 
						|
 | 
						|
char_to_code(Code, Value, Options) :-
 | 
						|
	memberchk(case(lower), Options), !,
 | 
						|
	(   utolower(Code, Lower),
 | 
						|
	    Lower \== Code
 | 
						|
	->  Cc = Lower,
 | 
						|
	    CFlags = 0x00
 | 
						|
	;   Cc = Code,
 | 
						|
	    CFlags = 0x80
 | 
						|
	),
 | 
						|
	assertion(integer(Cc)),
 | 
						|
	(   rm_diacritics(Cc, Base, Dia),
 | 
						|
	    assertion(integer(Base))
 | 
						|
	->  diacritic_code(Dia, DiaV),
 | 
						|
	    Value is Base << 8 \/ CFlags \/ DiaV
 | 
						|
	;   Value is Cc << 8 \/ CFlags
 | 
						|
	).
 | 
						|
char_to_code(Code, Value, _Options) :-
 | 
						|
	(   utoupper(Code, Upper),
 | 
						|
	    Upper \== Code
 | 
						|
	->  Cc = Upper,
 | 
						|
	    CFlags = 0x80
 | 
						|
	;   Cc = Code,
 | 
						|
	    CFlags = 0x00
 | 
						|
	),
 | 
						|
	assertion(integer(Cc)),
 | 
						|
	(   rm_diacritics(Cc, Base, Dia),
 | 
						|
	    assertion(integer(Base))
 | 
						|
	->  diacritic_code(Dia, DiaV),
 | 
						|
	    Value is Base << 8 \/ CFlags \/ DiaV
 | 
						|
	;   Value is Cc << 8 \/ CFlags
 | 
						|
	).
 | 
						|
 | 
						|
non_empty_map(CP, Map, Options) :-
 | 
						|
	(   memberchk(case(lower), Options)
 | 
						|
	->  Add is 0x80
 | 
						|
	;   Add = 0
 | 
						|
	),
 | 
						|
	\+ empty_map(Map, 0, CP, Add).
 | 
						|
 | 
						|
empty_map([], _, _, _).
 | 
						|
empty_map([H|T], I, CP, Add) :-
 | 
						|
	H =:= ((CP*256+I)<<8) + Add,
 | 
						|
	I2 is I + 1,
 | 
						|
	empty_map(T, I2, CP, Add).
 | 
						|
 | 
						|
 | 
						|
		 /*******************************
 | 
						|
		 *	 CASE CONVERSION	*
 | 
						|
		 *******************************/
 | 
						|
 | 
						|
utolower(Code, Lower) :-
 | 
						|
	unicode_property(Code, simple_lowercase_mapping(Lower)).
 | 
						|
 | 
						|
utoupper(Code, Upper) :-
 | 
						|
	unicode_property(Code, simple_uppercase_mapping(Upper)).
 | 
						|
 | 
						|
 | 
						|
		 /*******************************
 | 
						|
		 *	     DIACRITICS		*
 | 
						|
		 *******************************/
 | 
						|
 | 
						|
rm_diacritics(Code, Plain, Dia) :-
 | 
						|
	unicode_property(Code, decomposition_type(List)),
 | 
						|
	List \== '',
 | 
						|
	atomic_list_concat(AtomList, ' ', List),
 | 
						|
	to_plain(AtomList, Code, Plain, Dia).
 | 
						|
 | 
						|
to_plain([Special, PlainA], _, Plain, 0) :-
 | 
						|
	special(Special), !,
 | 
						|
	atom_hex(PlainA, Plain).
 | 
						|
to_plain([PlainA], _Code, Plain, 1) :- !,
 | 
						|
	atom_hex(PlainA, Plain).
 | 
						|
to_plain(List, Code, Plain, Dia) :-
 | 
						|
	maplist(atom_hex, List, Numbers),
 | 
						|
	Numbers = [Plain, Dia],
 | 
						|
	diacritic(Dia, Code), !.
 | 
						|
 | 
						|
diacritic(Code, For) :- !,
 | 
						|
	unicode_property(Code, canonical_combining_class(Cc)),
 | 
						|
	(  Cc > 0
 | 
						|
	-> assert_diacritic(Code)
 | 
						|
	;  debug(diacritic, '~16r: ~16r: Cc = ~d~n', [For, Code, Cc]),
 | 
						|
	   fail
 | 
						|
	).
 | 
						|
 | 
						|
assign_diacritic_codes :-
 | 
						|
	retractall(diacritic_code(_,_)),
 | 
						|
	findall(D, diacritic(D), Ds),
 | 
						|
	sort([0,1|Ds], Sorted),		% 0 and 1 are specials
 | 
						|
	assign_codes(Sorted, 1).
 | 
						|
 | 
						|
assign_codes([], _).
 | 
						|
assign_codes([H|T], I) :-
 | 
						|
	assert(diacritic_code(H, I)),
 | 
						|
	I2 is I + 1,
 | 
						|
	assign_codes(T, I2).
 | 
						|
 | 
						|
assert_diacritic(Code) :-
 | 
						|
	diacritic(Code), !.
 | 
						|
assert_diacritic(Code) :-
 | 
						|
	assert(diacritic(Code)).
 | 
						|
 | 
						|
atom_hex(Atom, Hex) :-
 | 
						|
	atom_codes(Atom, Codes),
 | 
						|
	phrase(xinteger(Hex), Codes).
 | 
						|
 | 
						|
special('<font>').
 | 
						|
special('<noBreak>').
 | 
						|
special('<initial>').
 | 
						|
special('<medial>').
 | 
						|
special('<final>').
 | 
						|
special('<isolated>').
 | 
						|
special('<circle>').
 | 
						|
special('<super>').
 | 
						|
special('<sub>').
 | 
						|
special('<vertical>').
 | 
						|
special('<wide>').
 | 
						|
special('<narrow>').
 | 
						|
special('<small>').
 | 
						|
special('<square>').
 | 
						|
special('<fraction>').
 | 
						|
special('<compat>  ').
 |