323 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
		
		
			
		
	
	
			323 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
|   | :- module(unicode_collate_info, | ||
|  | 	  [ write_unicode_collate_map/2	% +File, +Options | ||
|  | 	  ]). | ||
|  | :- use_module(library('unicode/unicode_data')). | ||
|  | :- use_module(library('http/dcg_basics')). | ||
|  | :- use_module(library(debug)). | ||
|  | :- use_module(library(option)). | ||
|  | 
 | ||
|  | /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
|  | This module creates a simple map   for  removing diacritics from Unicode | ||
|  | characters and mapping them  to  lowercase.   It  defines  tables  and a | ||
|  | function "int sort_point(int code)". The  sort   points  are  defined as | ||
|  | follows: | ||
|  | 
 | ||
|  | 	* The high 24-bit contains the character code after mapping to | ||
|  | 	  lowercase and removing possible diacritics. | ||
|  | 	* Bit 8 is 0 for characters that are mapped from upper to | ||
|  | 	  lowercase and 1 for all other characters | ||
|  | 	* The low 7 bits encode the removed diacritics.  All removed | ||
|  | 	  diacritics are ordered by their Unicode codepoint | ||
|  | 	  and numbered.sequentially. | ||
|  | 
 | ||
|  | As a result, text will be  ordered   by  its basic character, upper case | ||
|  | before lowercase and text with diacritics after text without. | ||
|  | 
 | ||
|  | 
 | ||
|  | RUNNING | ||
|  | 
 | ||
|  | To   run   thhis   module,    first     check    the   instructions   in | ||
|  | library('unicode/unicode_data') for installing  the   Unicode  datafiles | ||
|  | that are not included in this package. | ||
|  | 
 | ||
|  | 
 | ||
|  | MOTIVATION | ||
|  | 
 | ||
|  | This module is a simple-minded replacement for true Unicode support such | ||
|  | as provided by UCI (http://uci.sourceforge.net/).   The advantage of the | ||
|  | do-it-yourself approach adopted here however is  that it greatly reduces | ||
|  | the footprint and  eliminates  installation   and  maintenance  problems | ||
|  | involved in adopting large `can-do-everything' external libraries. | ||
|  | 
 | ||
|  | We believe it  deals  correctly  with   the  Western  languages,  Greek, | ||
|  | Cyrillic and other languages with similar handling of diacritics. | ||
|  | 
 | ||
|  | 
 | ||
|  | UPPERCASE | ||
|  | 
 | ||
|  | For Prolog it makes more sense  to   map  to lowercase as the cannonical | ||
|  | case. However, we wish to order first on  uppercase and to be able to do | ||
|  | prefix matches we need to search on the  lowest value. Hence, we use the | ||
|  | uppercase version for sorting. | ||
|  | - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ | ||
|  | 
 | ||
|  | :- dynamic | ||
|  | 	diacritic/1,			% known diacritics | ||
|  | 	diacritic_code/2.		% +Diacritic, -Code (1..N) | ||
|  | 
 | ||
|  | 		 /******************************* | ||
|  | 		 *	     C TABLES		* | ||
|  | 		 *******************************/ | ||
|  | 
 | ||
|  | %	write_unicode_collate_map(+File, +Options) | ||
|  | % | ||
|  | %	Options supported are: | ||
|  | % | ||
|  | %		# first_codepage [0] | ||
|  | %		Code page to start | ||
|  | % | ||
|  | %		# last_codepage [127] | ||
|  | %		Code page to end. | ||
|  | % | ||
|  | %		# case(UpperOrLower) | ||
|  | %		Canonise to upper (default) or lower case. | ||
|  | 
 | ||
|  | write_unicode_collate_map(File, Options) :- | ||
|  | 	open(File, write, Out), | ||
|  | 	call_cleanup(write_sort_map(Out, Options), | ||
|  | 		     close(Out)). | ||
|  | 
 | ||
|  | write_sort_map(Out, Options) :- | ||
|  | 	gen_tables(Tables, Options), | ||
|  | 	write_header(Out, Options), | ||
|  | 	forall(member(table(CP, Map), Tables), | ||
|  | 	       write_codepage(Out, CP, Map)), | ||
|  | 	write_map(Out, Tables, Options), | ||
|  | 	write_footer(Out, Options). | ||
|  | 
 | ||
|  | write_codepage(Out, CP, Map) :- | ||
|  | 	assertion(length(Map, 256)), | ||
|  | 	cp_name(CP, CPN), | ||
|  | 	format(Out, 'static const int32_t ~w[256] =~n', [CPN]), | ||
|  | 	format(Out, '{ ', []), | ||
|  | 	map_entries(Map, 0, Out), | ||
|  | 	format(Out, '~N};~n~n', []). | ||
|  | 
 | ||
|  | cp_name(CP, CPN) :- | ||
|  | 	sformat(CPN, 'ucp0x~|~`0t~16r~2+', [CP]). | ||
|  | 
 | ||
|  | map_entries([], _, _). | ||
|  | map_entries([H|T], I, Out) :- | ||
|  | 	(   I == 0 | ||
|  | 	->  true | ||
|  | 	;   0 =:= I mod 8 | ||
|  | 	->  format(Out, ',~n  ', []) | ||
|  | 	;   format(Out, ', ', []) | ||
|  | 	), | ||
|  | 	format(Out, '~|0x~`0t~16r~8+', [H]), | ||
|  | 	I2 is I + 1, | ||
|  | 	map_entries(T, I2, Out). | ||
|  | 
 | ||
|  | write_map(Out, Tables, Options) :- | ||
|  | 	option(last_codepage(Last), Options, 127), | ||
|  | 	format(Out, | ||
|  | 	       'static const int32_t* const ucoll_map[UNICODE_MAP_SIZE] =~n', | ||
|  | 	       []), | ||
|  | 	format(Out, '{ ', []), | ||
|  | 	map_tables(0, Last, Tables, Out), | ||
|  | 	format(Out, '~N};~n~n', []). | ||
|  | 
 | ||
|  | map_tables(CP, Last, _, _) :- | ||
|  | 	CP > Last, !. | ||
|  | map_tables(CP, Last, Tables, Out) :- | ||
|  | 	(   CP == 0 | ||
|  | 	->  true | ||
|  | 	;   0 =:= CP mod 8 | ||
|  | 	->  format(Out, ',~n  ', []) | ||
|  | 	;   format(Out, ', ', []) | ||
|  | 	), | ||
|  | 	(   memberchk(table(CP, _), Tables) | ||
|  | 	->  cp_name(CP, CPN), | ||
|  | 	    format(Out, '~w', [CPN]) | ||
|  | 	;   format(Out, '~|~tNULL~7+', []) | ||
|  | 	), | ||
|  | 	CP2 is CP + 1, | ||
|  | 	map_tables(CP2, Last, Tables, Out). | ||
|  | 
 | ||
|  | 
 | ||
|  | write_header(Out, Options) :- | ||
|  | 	option(last_codepage(Last), Options, 127), | ||
|  | 	Size is Last+1, | ||
|  | 	format(Out, | ||
|  | 	       '#ifdef WIN32\n\ | ||
|  | 		typedef int int32_t;\n\ | ||
|  | 		#else\n\ | ||
|  | 		#include <inttypes.h>\n\ | ||
|  | 		#endif\n\n', []), | ||
|  | 	format(Out, | ||
|  | 	       '#ifndef NULL\n\ | ||
|  | 		#define NULL ((void*)0)\n\ | ||
|  | 		#endif\n\n', []), | ||
|  | 	format(Out, | ||
|  | 	       '#define UNICODE_MAP_SIZE ~d~n~n', [Size]). | ||
|  | 
 | ||
|  | write_footer(Out, Options) :- | ||
|  | 	(   memberchk(case(lower), Options) | ||
|  | 	->  Add = '+0x80' | ||
|  | 	;   Add = '' | ||
|  | 	), | ||
|  | 	format(Out, | ||
|  | 	       'static int\n\ | ||
|  | 		sort_point(int code)\n\ | ||
|  | 		{ int cp = code / 256;\n\ | ||
|  | 		\n  \ | ||
|  | 		  if ( cp < UNICODE_MAP_SIZE && ucoll_map[cp] )\n    \ | ||
|  | 		    return ucoll_map[cp][code&0xff];\n\ | ||
|  | 		\n  \ | ||
|  | 		  return (code<<8)~w;\n\ | ||
|  | 		}\n\n', [Add]), | ||
|  | 	format(Out, | ||
|  | 	       'static int\n\ | ||
|  | 		sort_pointA(int code)\n\ | ||
|  | 		{ return ucp0x00[code&0xff];\n\ | ||
|  | 		}\n\n', []). | ||
|  | 
 | ||
|  | 
 | ||
|  | 		 /******************************* | ||
|  | 		 *	       TABLES		* | ||
|  | 		 *******************************/ | ||
|  | 
 | ||
|  | gen_tables(Tables, Options) :- | ||
|  | 	forall(rm_diacritics(_, _, _), true), | ||
|  | 	assign_diacritic_codes, | ||
|  | 	findall(table(CP,Map), table(CP, Map, Options), Tables). | ||
|  | 
 | ||
|  | table(CP, Map, Options) :- | ||
|  | 	option(first_codepage(First), Options, 0), | ||
|  | 	option(last_codepage(Last), Options, 127), | ||
|  | 	between(First, Last, CP), | ||
|  | 	findall(M, char(CP, M, Options), Map),	% now | ||
|  | 	non_empty_map(CP, Map, Options). | ||
|  | 
 | ||
|  | char(CP, Value, Options) :- | ||
|  | 	between(0, 255, I), | ||
|  | 	Code is 256*CP+I, | ||
|  | 	(   char_to_code(Code, Value, Options) | ||
|  | 	->  true | ||
|  | 	;   format('Failed on ~d~n', [Code]), | ||
|  | 	    Value is Code<<8 | ||
|  | 	). | ||
|  | 
 | ||
|  | char_to_code(Code, Value, Options) :- | ||
|  | 	memberchk(case(lower), Options), !, | ||
|  | 	(   utolower(Code, Lower), | ||
|  | 	    Lower \== Code | ||
|  | 	->  Cc = Lower, | ||
|  | 	    CFlags = 0x00 | ||
|  | 	;   Cc = Code, | ||
|  | 	    CFlags = 0x80 | ||
|  | 	), | ||
|  | 	assertion(integer(Cc)), | ||
|  | 	(   rm_diacritics(Cc, Base, Dia), | ||
|  | 	    assertion(integer(Base)) | ||
|  | 	->  diacritic_code(Dia, DiaV), | ||
|  | 	    Value is Base << 8 \/ CFlags \/ DiaV | ||
|  | 	;   Value is Cc << 8 \/ CFlags | ||
|  | 	). | ||
|  | char_to_code(Code, Value, _Options) :- | ||
|  | 	(   utoupper(Code, Upper), | ||
|  | 	    Upper \== Code | ||
|  | 	->  Cc = Upper, | ||
|  | 	    CFlags = 0x80 | ||
|  | 	;   Cc = Code, | ||
|  | 	    CFlags = 0x00 | ||
|  | 	), | ||
|  | 	assertion(integer(Cc)), | ||
|  | 	(   rm_diacritics(Cc, Base, Dia), | ||
|  | 	    assertion(integer(Base)) | ||
|  | 	->  diacritic_code(Dia, DiaV), | ||
|  | 	    Value is Base << 8 \/ CFlags \/ DiaV | ||
|  | 	;   Value is Cc << 8 \/ CFlags | ||
|  | 	). | ||
|  | 
 | ||
|  | non_empty_map(CP, Map, Options) :- | ||
|  | 	(   memberchk(case(lower), Options) | ||
|  | 	->  Add is 0x80 | ||
|  | 	;   Add = 0 | ||
|  | 	), | ||
|  | 	\+ empty_map(Map, 0, CP, Add). | ||
|  | 
 | ||
|  | empty_map([], _, _, _). | ||
|  | empty_map([H|T], I, CP, Add) :- | ||
|  | 	H =:= ((CP*256+I)<<8) + Add, | ||
|  | 	I2 is I + 1, | ||
|  | 	empty_map(T, I2, CP, Add). | ||
|  | 
 | ||
|  | 
 | ||
|  | 		 /******************************* | ||
|  | 		 *	 CASE CONVERSION	* | ||
|  | 		 *******************************/ | ||
|  | 
 | ||
|  | utolower(Code, Lower) :- | ||
|  | 	unicode_property(Code, simple_lowercase_mapping(Lower)). | ||
|  | 
 | ||
|  | utoupper(Code, Upper) :- | ||
|  | 	unicode_property(Code, simple_uppercase_mapping(Upper)). | ||
|  | 
 | ||
|  | 
 | ||
|  | 		 /******************************* | ||
|  | 		 *	     DIACRITICS		* | ||
|  | 		 *******************************/ | ||
|  | 
 | ||
|  | rm_diacritics(Code, Plain, Dia) :- | ||
|  | 	unicode_property(Code, decomposition_type(List)), | ||
|  | 	List \== '', | ||
|  | 	atomic_list_concat(AtomList, ' ', List), | ||
|  | 	to_plain(AtomList, Code, Plain, Dia). | ||
|  | 
 | ||
|  | to_plain([Special, PlainA], _, Plain, 0) :- | ||
|  | 	special(Special), !, | ||
|  | 	atom_hex(PlainA, Plain). | ||
|  | to_plain([PlainA], _Code, Plain, 1) :- !, | ||
|  | 	atom_hex(PlainA, Plain). | ||
|  | to_plain(List, Code, Plain, Dia) :- | ||
|  | 	maplist(atom_hex, List, Numbers), | ||
|  | 	Numbers = [Plain, Dia], | ||
|  | 	diacritic(Dia, Code), !. | ||
|  | 
 | ||
|  | diacritic(Code, For) :- !, | ||
|  | 	unicode_property(Code, canonical_combining_class(Cc)), | ||
|  | 	(  Cc > 0 | ||
|  | 	-> assert_diacritic(Code) | ||
|  | 	;  debug(diacritic, '~16r: ~16r: Cc = ~d~n', [For, Code, Cc]), | ||
|  | 	   fail | ||
|  | 	). | ||
|  | 
 | ||
|  | assign_diacritic_codes :- | ||
|  | 	retractall(diacritic_code(_,_)), | ||
|  | 	findall(D, diacritic(D), Ds), | ||
|  | 	sort([0,1|Ds], Sorted),		% 0 and 1 are specials | ||
|  | 	assign_codes(Sorted, 1). | ||
|  | 
 | ||
|  | assign_codes([], _). | ||
|  | assign_codes([H|T], I) :- | ||
|  | 	assert(diacritic_code(H, I)), | ||
|  | 	I2 is I + 1, | ||
|  | 	assign_codes(T, I2). | ||
|  | 
 | ||
|  | assert_diacritic(Code) :- | ||
|  | 	diacritic(Code), !. | ||
|  | assert_diacritic(Code) :- | ||
|  | 	assert(diacritic(Code)). | ||
|  | 
 | ||
|  | atom_hex(Atom, Hex) :- | ||
|  | 	atom_codes(Atom, Codes), | ||
|  | 	phrase(xinteger(Hex), Codes). | ||
|  | 
 | ||
|  | special('<font>'). | ||
|  | special('<noBreak>'). | ||
|  | special('<initial>'). | ||
|  | special('<medial>'). | ||
|  | special('<final>'). | ||
|  | special('<isolated>'). | ||
|  | special('<circle>'). | ||
|  | special('<super>'). | ||
|  | special('<sub>'). | ||
|  | special('<vertical>'). | ||
|  | special('<wide>'). | ||
|  | special('<narrow>'). | ||
|  | special('<small>'). | ||
|  | special('<square>'). | ||
|  | special('<fraction>'). | ||
|  | special('<compat>  '). |