yap-6.3/packages/semweb/unicode_map.pl

:- module(unicode_collate_info,
	  [ write_unicode_collate_map/2	% +File, +Options
	  ]).
:- use_module(library('unicode/unicode_data')).
:- use_module(library('http/dcg_basics')).
:- use_module(library(debug)).
:- use_module(library(option)).

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
This module creates a simple map   for  removing diacritics from Unicode
characters and mapping them  to  lowercase.   It  defines  tables  and a
function "int sort_point(int code)". The  sort   points  are  defined as
follows:

	* The high 24-bit contains the character code after mapping to
	  lowercase and removing possible diacritics.
	* Bit 8 is 0 for characters that are mapped from upper to
	  lowercase and 1 for all other characters
	* The low 7 bits encode the removed diacritics.  All removed
	  diacritics are ordered by their Unicode codepoint
	  and numbered sequentially.

As a result, text will be  ordered   by  its basic character, upper case
before lowercase and text with diacritics after text without.


RUNNING

To  run  this  module,   first   check   the   instructions   in
library('unicode/unicode_data') for installing  the   Unicode  datafiles
that are not included in this package.


MOTIVATION

This module is a simple-minded replacement for true Unicode support such
as provided by UCI (http://uci.sourceforge.net/).   The advantage of the
do-it-yourself approach adopted here however is  that it greatly reduces
the footprint and  eliminates  installation   and  maintenance  problems
involved in adopting large `can-do-everything' external libraries.

We believe it  deals  correctly  with   the  Western  languages,  Greek,
Cyrillic and other languages with similar handling of diacritics.


UPPERCASE

For Prolog it makes more sense  to   map  to lowercase as the canonical
case. However, we wish to order first on  uppercase and to be able to do
prefix matches we need to search on the  lowest value. Hence, we use the
uppercase version for sorting.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

:- dynamic
	diacritic/1,			% known diacritics
	diacritic_code/2.		% +Diacritic, -Code (1..N)

		 /*******************************
		 *	     C TABLES		*
		 *******************************/

%	write_unicode_collate_map(+File, +Options)
%
%	Generate the C collation tables together with the accessor
%	functions sort_point() and sort_pointA() and write them to File.
%	The file is opened inside setup_call_cleanup/3, so the stream
%	is closed whether generation succeeds, fails or raises, and no
%	interrupt can slip in between opening the file and installing
%	the cleanup handler.
%
%	Options supported are:
%
%		# first_codepage(First) [0]
%		First code page (block of 256 code points) for which
%		a table is computed.
%
%		# last_codepage(Last) [127]
%		Last code page for which a table is computed.  Also
%		fixes the size of the emitted page index (Last+1).
%
%		# case(UpperOrLower)
%		Canonise to upper (default) or lower case.

write_unicode_collate_map(File, Options) :-
	setup_call_cleanup(open(File, write, Out),
			   write_sort_map(Out, Options),
			   close(Out)).

%	write_sort_map(+Out, +Options)
%
%	Compute all tables and emit them to the stream Out in this
%	order: the C preamble, one array per generated code page, the
%	page index and finally the accessor functions.

write_sort_map(Out, Options) :-
	gen_tables(Tables, Options),
	write_header(Out, Options),
	% fails if writing any of the generated pages fails
	\+ ( member(table(Page, Entries), Tables),
	     \+ write_codepage(Out, Page, Entries)
	   ),
	write_map(Out, Tables, Options),
	write_footer(Out, Options).

%	write_codepage(+Out, +CP, +Map)
%
%	Emit the table for code page CP as a C array of 256 int32_t
%	whose identifier is produced by cp_name/2.  Map is the list of
%	sort points of the page; it must hold exactly 256 elements.

write_codepage(Out, CP, Map) :-
	assertion(length(Map, 256)),
	cp_name(CP, ArrayName),
	format(Out, 'static const int32_t ~w[256] =~n{ ', [ArrayName]),
	map_entries(Map, 0, Out),
	format(Out, '~N};~n~n', []).

%	cp_name(+CP, -CPN)
%
%	CPN is the C identifier of the table for code page CP: the
%	prefix `ucp0x' followed by CP in lowercase hexadecimal,
%	zero-padded to two digits (0 -> ucp0x00, 10 -> ucp0x0a).
%
%	The name is built with format/3 on an atom(_) sink instead of
%	the deprecated sformat/3.  CPN is therefore an atom; the callers
%	in this file only print it using ~w.

cp_name(CP, CPN) :-
	format(atom(CPN), 'ucp0x~|~`0t~16r~2+', [CP]).

%	map_entries(+Values, +Index, +Out)
%
%	Write Values to Out as C integer literals in hexadecimal.
%	Index is the position of the first value; it selects the
%	separator so that each output line carries eight entries.

map_entries([], _, _).
map_entries([Value|Rest], Index, Out) :-
	entry_separator(Index, Out),
	format(Out, '~|0x~`0t~16r~8+', [Value]),
	Next is Index + 1,
	map_entries(Rest, Next, Out).

%	entry_separator(+Index, +Out)
%
%	Emit what separates entry Index from its predecessor: nothing
%	before the first entry, a comma and line break before every
%	eighth entry and a plain comma otherwise.

entry_separator(Index, _) :-
	Index == 0, !.
entry_separator(Index, Out) :-
	0 =:= Index mod 8, !,
	format(Out, ',~n  ', []).
entry_separator(_, Out) :-
	format(Out, ', ', []).

%	write_map(+Out, +Tables, +Options)
%
%	Emit the page index ucoll_map: one slot for each code page
%	from 0 up to the last_codepage option (default 127).  The
%	slots themselves are written by map_tables/4.

write_map(Out, Tables, Options) :-
	option(last_codepage(LastPage), Options, 127),
	format(Out,
	       'static const int32_t* const ucoll_map[UNICODE_MAP_SIZE] =~n{ ',
	       []),
	map_tables(0, LastPage, Tables, Out),
	format(Out, '~N};~n~n', []).

%	map_tables(+CP, +Last, +Tables, +Out)
%
%	Write the slots of the page index for the code pages CP..Last
%	to Out, eight per line.  A page that has a table(CP, _) member
%	in Tables is written as the name of its array; any other page
%	is written as NULL, right-aligned in seven columns.

map_tables(CP, Last, Tables, Out) :-
	(   CP > Last
	->  true
	;   table_separator(CP, Out),
	    table_slot(CP, Tables, Out),
	    Next is CP + 1,
	    map_tables(Next, Last, Tables, Out)
	).

%	table_separator(+CP, +Out)
%
%	Emit the separator that precedes the slot of page CP.

table_separator(CP, _) :-
	CP == 0, !.
table_separator(CP, Out) :-
	0 =:= CP mod 8, !,
	format(Out, ',~n  ', []).
table_separator(_, Out) :-
	format(Out, ', ', []).

%	table_slot(+CP, +Tables, +Out)
%
%	Emit the slot for page CP: its array name or NULL.

table_slot(CP, Tables, Out) :-
	memberchk(table(CP, _), Tables), !,
	cp_name(CP, Name),
	format(Out, '~w', [Name]).
table_slot(_, _, Out) :-
	format(Out, '~|~tNULL~7+', []).


%	write_header(+Out, +Options)
%
%	Emit the C preamble to Out: a definition of int32_t (a typedef
%	when WIN32 is defined, <inttypes.h> otherwise), a fallback
%	definition of NULL and the macro UNICODE_MAP_SIZE, which is
%	the last_codepage option (default 127) plus one.

write_header(Out, Options) :-
	option(last_codepage(Last), Options, 127),
	Size is Last+1,			% index covers pages 0..Last
	format(Out,
	       '#ifdef WIN32\n\
		typedef int int32_t;\n\
		#else\n\
		#include <inttypes.h>\n\
		#endif\n\n', []),
	format(Out,
	       '#ifndef NULL\n\
		#define NULL ((void*)0)\n\
		#endif\n\n', []),
	format(Out,
	       '#define UNICODE_MAP_SIZE ~d~n~n', [Size]).

%	write_footer(+Out, +Options)
%
%	Emit the C accessor functions to Out.
%
%	sort_point(code) returns the table entry for code if its page
%	lies inside the index and has a table.  Otherwise it returns
%	code<<8, with 0x80 added when case(lower) is in Options; that
%	is the value char_to_code/3 computes for a character that is
%	neither case-mapped nor decomposed.
%
%	sort_pointA(code) indexes the array ucp0x00 directly.  NOTE:
%	that array is only emitted by write_codepage/3 when code page 0
%	is among the generated tables.

write_footer(Out, Options) :-
	(   memberchk(case(lower), Options)
	->  Add = '+0x80'
	;   Add = ''
	),
	format(Out,
	       'static int\n\
		sort_point(int code)\n\
		{ int cp = code / 256;\n\
		\n  \
		  if ( cp < UNICODE_MAP_SIZE && ucoll_map[cp] )\n    \
		    return ucoll_map[cp][code&0xff];\n\
		\n  \
		  return (code<<8)~w;\n\
		}\n\n', [Add]),
	format(Out,
	       'static int\n\
		sort_pointA(int code)\n\
		{ return ucp0x00[code&0xff];\n\
		}\n\n', []).


		 /*******************************
		 *	       TABLES		*
		 *******************************/

%	gen_tables(-Tables, +Options)
%
%	Tables is a list of table(CP, Map) terms, one for each code
%	page selected by table/3.  Before the tables are computed,
%	rm_diacritics/3 is run over all its solutions purely for its
%	side effect: diacritic/2 records each combining mark it accepts
%	through assert_diacritic/1.  The recorded marks are then
%	numbered by assign_diacritic_codes/0.

gen_tables(Tables, Options) :-
	(   rm_diacritics(_, _, _),
	    fail			% failure-driven: visit all solutions
	;   true
	),
	assign_diacritic_codes,
	findall(table(Page, Entries), table(Page, Entries, Options), Tables).

%	table(-CP, -Map, +Options)
%
%	Enumerate on backtracking the code pages between the options
%	first_codepage (default 0) and last_codepage (default 127)
%	that need a table.  Map holds the sort points of the page as
%	produced by char/3; pages rejected by non_empty_map/3 are
%	skipped.

table(CP, Map, Options) :-
	option(last_codepage(Hi), Options, 127),
	option(first_codepage(Lo), Options, 0),
	between(Lo, Hi, CP),
	findall(Point, char(CP, Point, Options), Map),
	non_empty_map(CP, Map, Options).

%	char(+CP, -Value, +Options)
%
%	Enumerate on backtracking the sort points of the 256 code
%	points of code page CP, in code order.  If char_to_code/3 fails
%	for a code point this is reported on the current output and
%	the sort point defaults to Code<<8.

char(CP, Value, Options) :-
	between(0, 255, Offset),
	Code is CP*256 + Offset,
	char_value(Code, Value, Options).

%	char_value(+Code, -Value, +Options)
%
%	Value is the first solution of char_to_code/3 for Code or,
%	after printing a diagnostic, the fallback Code<<8.

char_value(Code, Value, Options) :-
	char_to_code(Code, Value, Options), !.
char_value(Code, Value, _) :-
	format('Failed on ~d~n', [Code]),
	Value is Code<<8.

%	char_to_code(+Code, -Value, +Options)
%
%	Value is the sort point of Code.
%
%	With case(lower) in Options the character is canonised to
%	lower case: a character that has a different lowercase mapping
%	is replaced by it and gets flag 0x00, all others get flag 0x80.
%	Otherwise it is canonised to upper case: a character that has
%	a different uppercase mapping is replaced by it and gets flag
%	0x80, all others get flag 0x00.
%
%	The canonised character and its flag are combined into the
%	sort point by sort_value/3.

char_to_code(Code, Value, Options) :-
	memberchk(case(lower), Options), !,
	(   utolower(Code, Lower),
	    Lower \== Code
	->  sort_value(Lower, 0x00, Value)
	;   sort_value(Code, 0x80, Value)
	).
char_to_code(Code, Value, _Options) :-
	(   utoupper(Code, Upper),
	    Upper \== Code
	->  sort_value(Upper, 0x80, Value)
	;   sort_value(Code, 0x00, Value)
	).

%	sort_value(+Canonical, +CaseFlag, -Value)
%
%	Combine the case-canonised code point with its flag.  If
%	rm_diacritics/3 can strip Canonical, Value is built from the
%	base character, CaseFlag and the number that diacritic_code/2
%	holds for the removed diacritic; this fails if the diacritic
%	has no number.  Otherwise Value is Canonical shifted left by 8
%	bits, or-ed with CaseFlag.

sort_value(Canonical, CaseFlag, Value) :-
	assertion(integer(Canonical)),
	(   rm_diacritics(Canonical, Base, Dia),
	    assertion(integer(Base))
	->  diacritic_code(Dia, DiaNumber),
	    Value is Base << 8 \/ CaseFlag \/ DiaNumber
	;   Value is Canonical << 8 \/ CaseFlag
	).

%	non_empty_map(+CP, +Map, +Options)
%
%	True if the table Map of code page CP differs somewhere from
%	what the generated sort_point() falls back to without a table,
%	i.e. if the table carries information and must be emitted.

non_empty_map(CP, Map, Options) :-
	default_flag(Options, Flag),
	\+ empty_map(Map, 0, CP, Flag).

%	default_flag(+Options, -Flag)
%
%	Flag is what is added to Code<<8 for a character that is left
%	unchanged: 0x80 with case(lower) in Options and 0 otherwise.

default_flag(Options, Flag) :-
	memberchk(case(lower), Options), !,
	Flag is 0x80.
default_flag(_, 0).

%	empty_map(+Values, +Index, +CP, +Add)
%
%	True if each value equals the default sort point of its code
%	point: the code point (CP*256 plus the position in the list,
%	counting from Index) shifted left by 8 bits, plus Add.

empty_map([], _, _, _).
empty_map([Value|Rest], Index, CP, Add) :-
	Code is CP*256 + Index,
	Value =:= (Code << 8) + Add,
	Next is Index + 1,
	empty_map(Rest, Next, CP, Add).


		 /*******************************
		 *	 CASE CONVERSION	*
		 *******************************/

%	utolower(?Code, ?Lower)
%
%	Lower is the simple_lowercase_mapping property of Code as
%	provided by unicode_property/2.

utolower(Char, LowerChar) :-
	unicode_property(Char, simple_lowercase_mapping(LowerChar)).

%	utoupper(?Code, ?Upper)
%
%	Upper is the simple_uppercase_mapping property of Code as
%	provided by unicode_property/2.

utoupper(Char, UpperChar) :-
	unicode_property(Char, simple_uppercase_mapping(UpperChar)).


		 /*******************************
		 *	     DIACRITICS		*
		 *******************************/

%	rm_diacritics(?Code, -Plain, -Dia)
%
%	Plain is the base character of Code and Dia identifies what
%	was removed.  The decomposition_type property of Code is split
%	on single spaces and interpreted by to_plain/4.  Fails for
%	characters whose decomposition is empty or is not understood
%	by to_plain/4.

rm_diacritics(Code, Plain, Dia) :-
	unicode_property(Code, decomposition_type(Decomposition)),
	Decomposition \== '',
	atomic_list_concat(Fields, ' ', Decomposition),
	to_plain(Fields, Code, Plain, Dia).

%	to_plain(+Fields, +Code, -Plain, -Dia)
%
%	Interpret the fields of a decomposition, each field being an
%	atom.  Three shapes are accepted:
%
%		* [Tag, Hex] with Tag known to special/1
%		Plain is the number denoted by Hex and Dia is 0.
%		* [Hex]
%		Plain is the number denoted by Hex and Dia is 1.
%		* [Hex1, Hex2]
%		Plain and Dia are the numbers denoted by Hex1 and
%		Hex2, provided Dia is accepted by diacritic/2.
%
%	Code is the character being decomposed; it is only passed on
%	to diacritic/2.

to_plain([Tag, PlainA], _, Plain, 0) :-
	special(Tag), !,
	atom_hex(PlainA, Plain).
to_plain([PlainA], _Code, Plain, 1) :- !,
	atom_hex(PlainA, Plain).
to_plain([PlainA, DiaA], Code, Plain, Dia) :-
	atom_hex(PlainA, PlainCode),
	atom_hex(DiaA, DiaCode),
	Plain = PlainCode,
	Dia = DiaCode,
	diacritic(Dia, Code), !.

%	diacritic(+Code, +For)
%
%	True if the canonical_combining_class property of Code is
%	greater than zero.  In that case Code is recorded as a known
%	diacritic using assert_diacritic/1.  Otherwise a message is
%	printed on the debug topic `diacritic' and the call fails.
%	For is the character being decomposed; it is only used in the
%	debug message.

diacritic(Code, For) :-
	unicode_property(Code, canonical_combining_class(Class)),
	(   Class =< 0
	->  debug(diacritic, '~16r: ~16r: Cc = ~d~n', [For, Code, Class]),
	    fail
	;   assert_diacritic(Code)
	).

%	assign_diacritic_codes
%
%	Rebuild diacritic_code/2.  All diacritics recorded in
%	diacritic/1, together with the pseudo diacritics 0 and 1 that
%	to_plain/4 returns for tagged and single-field decompositions,
%	are sorted in standard order and numbered from 1.
%
%	NOTE(review): nothing here bounds the number of codes, while
%	char_to_code/3 or-s the code into the same byte as the 0x80
%	case flag.

assign_diacritic_codes :-
	retractall(diacritic_code(_,_)),
	findall(Known, diacritic(Known), Recorded),
	sort([0,1|Recorded], Ordered),	% 0 and 1 are specials
	assign_codes(Ordered, 1).

%	assign_codes(+Diacritics, +First)
%
%	Assert diacritic_code(D, N) for each D in Diacritics, giving
%	the consecutive numbers First, First+1, ... in list order.

assign_codes([], _).
assign_codes([Diacritic|Rest], Number) :-
	assert(diacritic_code(Diacritic, Number)),
	Next is Number + 1,
	assign_codes(Rest, Next).

%	assert_diacritic(+Code)
%
%	Record Code in diacritic/1 unless it is already there.

assert_diacritic(Code) :-
	(   diacritic(Code)
	->  true
	;   assert(diacritic(Code))
	).

%	atom_hex(+Atom, -Hex)
%
%	Hex is the integer obtained by parsing the complete text of
%	Atom with the xinteger//1 grammar rule.  Fails if the text
%	does not parse as a whole.

atom_hex(Atom, Hex) :-
	atom_codes(Atom, Text),
	phrase(xinteger(Hex), Text).

%	special(?Tag)
%
%	Formatting tags that may precede the mapping in a
%	decomposition.  to_plain/4 treats the two fields `Tag Hex' as
%	the character Hex without a diacritic.
%
%	Tags are compared with the fields that rm_diacritics/3 obtains
%	by splitting the decomposition on single spaces, so a tag can
%	never contain a blank.  '<compat>' used to be written with
%	trailing blanks, which made that fact unreachable.

special('<font>').
special('<noBreak>').
special('<initial>').
special('<medial>').
special('<final>').
special('<isolated>').
special('<circle>').
special('<super>').
special('<sub>').
special('<vertical>').
special('<wide>').
special('<narrow>').
special('<small>').
special('<square>').
special('<fraction>').
special('<compat>').
semweb and http compile now (but they don't work properly yet). 2010-07-15 23:35:37 +01:00			`:- module(unicode_collate_info,`
			`[ write_unicode_collate_map/2 % +File, +Options`
			`]).`
			`:- use_module(library('unicode/unicode_data')).`
			`:- use_module(library('http/dcg_basics')).`
			`:- use_module(library(debug)).`
			`:- use_module(library(option)).`

			`/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -`
			`This module creates a simple map for removing diacritics from Unicode`
			`characters and mapping them to lowercase. It defines tables and a`
			`function "int sort_point(int code)". The sort points are defined as`
			`follows:`

			`* The high 24-bit contains the character code after mapping to`
			`lowercase and removing possible diacritics.`
			`* Bit 8 is 0 for characters that are mapped from upper to`
			`lowercase and 1 for all other characters`
			`* The low 7 bits encode the removed diacritics. All removed`
			`diacritics are ordered by their Unicode codepoint`
			`and numbered.sequentially.`

			`As a result, text will be ordered by its basic character, upper case`
			`before lowercase and text with diacritics after text without.`


			`RUNNING`

			`To run thhis module, first check the instructions in`
			`library('unicode/unicode_data') for installing the Unicode datafiles`
			`that are not included in this package.`


			`MOTIVATION`

			`This module is a simple-minded replacement for true Unicode support such`
			`as provided by UCI (http://uci.sourceforge.net/). The advantage of the`
			`do-it-yourself approach adopted here however is that it greatly reduces`
			`the footprint and eliminates installation and maintenance problems`
			involved in adopting large `can-do-everything' external libraries.

			`We believe it deals correctly with the Western languages, Greek,`
			`Cyrillic and other languages with similar handling of diacritics.`


			`UPPERCASE`

			`For Prolog it makes more sense to map to lowercase as the cannonical`
			`case. However, we wish to order first on uppercase and to be able to do`
			`prefix matches we need to search on the lowest value. Hence, we use the`
			`uppercase version for sorting.`
			`- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */`

			`:- dynamic`
			`diacritic/1, % known diacritics`
			`diacritic_code/2. % +Diacritic, -Code (1..N)`

			`/*******************************`
			`* C TABLES *`
			`*******************************/`

			`% write_unicode_collate_map(+File, +Options)`
			`%`
			`% Options supported are:`
			`%`
			`% # first_codepage [0]`
			`% Code page to start`
			`%`
			`% # last_codepage [127]`
			`% Code page to end.`
			`%`
			`% # case(UpperOrLower)`
			`% Canonise to upper (default) or lower case.`

			`write_unicode_collate_map(File, Options) :-`
			`open(File, write, Out),`
			`call_cleanup(write_sort_map(Out, Options),`
			`close(Out)).`

			`write_sort_map(Out, Options) :-`
			`gen_tables(Tables, Options),`
			`write_header(Out, Options),`
			`forall(member(table(CP, Map), Tables),`
			`write_codepage(Out, CP, Map)),`
			`write_map(Out, Tables, Options),`
			`write_footer(Out, Options).`

			`write_codepage(Out, CP, Map) :-`
			`assertion(length(Map, 256)),`
			`cp_name(CP, CPN),`
			`format(Out, 'static const int32_t ~w[256] =~n', [CPN]),`
			`format(Out, '{ ', []),`
			`map_entries(Map, 0, Out),`
			`format(Out, '~N};~n~n', []).`

			`cp_name(CP, CPN) :-`
			sformat(CPN, 'ucp0x~\|~`0t~16r~2+', [CP]).

			`map_entries([], _, _).`
			`map_entries([H\|T], I, Out) :-`
			`( I == 0`
			`-> true`
			`; 0 =:= I mod 8`
			`-> format(Out, ',~n ', [])`
			`; format(Out, ', ', [])`
			`),`
			format(Out, '~\|0x~`0t~16r~8+', [H]),
			`I2 is I + 1,`
			`map_entries(T, I2, Out).`

			`write_map(Out, Tables, Options) :-`
			`option(last_codepage(Last), Options, 127),`
			`format(Out,`
			`'static const int32_t* const ucoll_map[UNICODE_MAP_SIZE] =~n',`
			`[]),`
			`format(Out, '{ ', []),`
			`map_tables(0, Last, Tables, Out),`
			`format(Out, '~N};~n~n', []).`

			`map_tables(CP, Last, _, _) :-`
			`CP > Last, !.`
			`map_tables(CP, Last, Tables, Out) :-`
			`( CP == 0`
			`-> true`
			`; 0 =:= CP mod 8`
			`-> format(Out, ',~n ', [])`
			`; format(Out, ', ', [])`
			`),`
			`( memberchk(table(CP, _), Tables)`
			`-> cp_name(CP, CPN),`
			`format(Out, '~w', [CPN])`
			`; format(Out, '~\|~tNULL~7+', [])`
			`),`
			`CP2 is CP + 1,`
			`map_tables(CP2, Last, Tables, Out).`


			`write_header(Out, Options) :-`
			`option(last_codepage(Last), Options, 127),`
			`Size is Last+1,`
			`format(Out,`
			`'#ifdef WIN32\n\`
			`typedef int int32_t;\n\`
			`#else\n\`
			`#include <inttypes.h>\n\`
			`#endif\n\n', []),`
			`format(Out,`
			`'#ifndef NULL\n\`
			`#define NULL ((void*)0)\n\`
			`#endif\n\n', []),`
			`format(Out,`
			`'#define UNICODE_MAP_SIZE ~d~n~n', [Size]).`

			`write_footer(Out, Options) :-`
			`( memberchk(case(lower), Options)`
			`-> Add = '+0x80'`
			`; Add = ''`
			`),`
			`format(Out,`
			`'static int\n\`
			`sort_point(int code)\n\`
			`{ int cp = code / 256;\n\`
			`\n \`
			`if ( cp < UNICODE_MAP_SIZE && ucoll_map[cp] )\n \`
			`return ucoll_map[cp][code&0xff];\n\`
			`\n \`
			`return (code<<8)~w;\n\`
			`}\n\n', [Add]),`
			`format(Out,`
			`'static int\n\`
			`sort_pointA(int code)\n\`
			`{ return ucp0x00[code&0xff];\n\`
			`}\n\n', []).`


			`/*******************************`
			`* TABLES *`
			`*******************************/`

			`gen_tables(Tables, Options) :-`
			`forall(rm_diacritics(_, _, _), true),`
			`assign_diacritic_codes,`
			`findall(table(CP,Map), table(CP, Map, Options), Tables).`

			`table(CP, Map, Options) :-`
			`option(first_codepage(First), Options, 0),`
			`option(last_codepage(Last), Options, 127),`
			`between(First, Last, CP),`
			`findall(M, char(CP, M, Options), Map), % now`
			`non_empty_map(CP, Map, Options).`

			`char(CP, Value, Options) :-`
			`between(0, 255, I),`
			`Code is 256*CP+I,`
			`( char_to_code(Code, Value, Options)`
			`-> true`
			`; format('Failed on ~d~n', [Code]),`
			`Value is Code<<8`
			`).`

			`char_to_code(Code, Value, Options) :-`
			`memberchk(case(lower), Options), !,`
			`( utolower(Code, Lower),`
			`Lower \== Code`
			`-> Cc = Lower,`
			`CFlags = 0x00`
			`; Cc = Code,`
			`CFlags = 0x80`
			`),`
			`assertion(integer(Cc)),`
			`( rm_diacritics(Cc, Base, Dia),`
			`assertion(integer(Base))`
			`-> diacritic_code(Dia, DiaV),`
			`Value is Base << 8 \/ CFlags \/ DiaV`
			`; Value is Cc << 8 \/ CFlags`
			`).`
			`char_to_code(Code, Value, _Options) :-`
			`( utoupper(Code, Upper),`
			`Upper \== Code`
			`-> Cc = Upper,`
			`CFlags = 0x80`
			`; Cc = Code,`
			`CFlags = 0x00`
			`),`
			`assertion(integer(Cc)),`
			`( rm_diacritics(Cc, Base, Dia),`
			`assertion(integer(Base))`
			`-> diacritic_code(Dia, DiaV),`
			`Value is Base << 8 \/ CFlags \/ DiaV`
			`; Value is Cc << 8 \/ CFlags`
			`).`

			`non_empty_map(CP, Map, Options) :-`
			`( memberchk(case(lower), Options)`
			`-> Add is 0x80`
			`; Add = 0`
			`),`
			`\+ empty_map(Map, 0, CP, Add).`

			`empty_map([], _, _, _).`
			`empty_map([H\|T], I, CP, Add) :-`
			`H =:= ((CP*256+I)<<8) + Add,`
			`I2 is I + 1,`
			`empty_map(T, I2, CP, Add).`


			`/*******************************`
			`* CASE CONVERSION *`
			`*******************************/`

			`utolower(Code, Lower) :-`
			`unicode_property(Code, simple_lowercase_mapping(Lower)).`

			`utoupper(Code, Upper) :-`
			`unicode_property(Code, simple_uppercase_mapping(Upper)).`


			`/*******************************`
			`* DIACRITICS *`
			`*******************************/`

			`rm_diacritics(Code, Plain, Dia) :-`
			`unicode_property(Code, decomposition_type(List)),`
			`List \== '',`
			`atomic_list_concat(AtomList, ' ', List),`
			`to_plain(AtomList, Code, Plain, Dia).`

			`to_plain([Special, PlainA], _, Plain, 0) :-`
			`special(Special), !,`
			`atom_hex(PlainA, Plain).`
			`to_plain([PlainA], _Code, Plain, 1) :- !,`
			`atom_hex(PlainA, Plain).`
			`to_plain(List, Code, Plain, Dia) :-`
			`maplist(atom_hex, List, Numbers),`
			`Numbers = [Plain, Dia],`
			`diacritic(Dia, Code), !.`

			`diacritic(Code, For) :- !,`
			`unicode_property(Code, canonical_combining_class(Cc)),`
			`( Cc > 0`
			`-> assert_diacritic(Code)`
			`; debug(diacritic, '~16r: ~16r: Cc = ~d~n', [For, Code, Cc]),`
			`fail`
			`).`

			`assign_diacritic_codes :-`
			`retractall(diacritic_code(_,_)),`
			`findall(D, diacritic(D), Ds),`
			`sort([0,1\|Ds], Sorted), % 0 and 1 are specials`
			`assign_codes(Sorted, 1).`

			`assign_codes([], _).`
			`assign_codes([H\|T], I) :-`
			`assert(diacritic_code(H, I)),`
			`I2 is I + 1,`
			`assign_codes(T, I2).`

			`assert_diacritic(Code) :-`
			`diacritic(Code), !.`
			`assert_diacritic(Code) :-`
			`assert(diacritic(Code)).`

			`atom_hex(Atom, Hex) :-`
			`atom_codes(Atom, Codes),`
			`phrase(xinteger(Hex), Codes).`

			`special('<font>').`
			`special('<noBreak>').`
			`special('<initial>').`
			`special('<medial>').`
			`special('<final>').`
			`special('<isolated>').`
			`special('<circle>').`
			`special('<super>').`
			`special('<sub>').`
			`special('<vertical>').`
			`special('<wide>').`
			`special('<narrow>').`
			`special('<small>').`
			`special('<square>').`
			`special('<fraction>').`
			`special('<compat> ').`