135 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
		
		
			
		
	
	
			135 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
|   | /*  $Id$ | ||
|  | 
 | ||
|  |     Part of SWI-Prolog | ||
|  | 
 | ||
|  |     Author:        Jan Wielemaker | ||
|  |     E-mail:        wielemak@science.uva.nl | ||
|  |     WWW:           http://www.swi-prolog.org | ||
|  |     Copyright (C): 1985-2005, University of Amsterdam | ||
|  | 
 | ||
|  |     This program is free software; you can redistribute it and/or | ||
|  |     modify it under the terms of the GNU General Public License | ||
|  |     as published by the Free Software Foundation; either version 2 | ||
|  |     of the License, or (at your option) any later version. | ||
|  | 
 | ||
|  |     This program is distributed in the hope that it will be useful, | ||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||
|  |     GNU General Public License for more details. | ||
|  | 
 | ||
|  |     You should have received a copy of the GNU General Public | ||
|  |     License along with this library; if not, write to the Free Software | ||
|  |     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA | ||
|  | 
 | ||
|  |     As a special exception, if you link this library with other files, | ||
|  |     compiled with a Free Software compiler, to produce an executable, this | ||
|  |     library does not by itself cause the resulting executable to be covered | ||
|  |     by the GNU General Public License. This exception does not however | ||
|  |     invalidate any other reasons why the executable file might be covered by | ||
|  |     the GNU General Public License. | ||
|  | */ | ||
|  | 
 | ||
|  | :- module(utf8, | ||
|  | 	  [ utf8_codes//1		% ?String | ||
|  | 	  ]). | ||
|  | 
 | ||
|  | %%	utf8_codes(?Codes)// is det. | ||
|  | % | ||
|  | %	DCG translating between  a  Unicode   code-list  and  its  UTF-8 | ||
|  | %	encoded  byte-string.  The  DCG  works   two  ways.  Encoding  a | ||
|  | %	code-list to a UTF-8 byte string is achieved using | ||
|  | % | ||
|  | %		phrase(utf8_codes(Codes), UTF8) | ||
|  | % | ||
|  | %	The  algorithm  is  a  close  copy    of  the  C-algorithm  used | ||
|  | %	internally and defined in src/pl-utf8.c | ||
|  | % | ||
|  | %	NOTE: in many  cases  you  can   avoid  this  library  and leave | ||
|  | %	encoding and decoding to I/O streams. If   only part of the data | ||
|  | %	is to be encoded the  encoding  of   a  stream  can  be switched | ||
|  | %	temporary using set_stream(Stream, encoding(utf8)) | ||
|  | 
 | ||
|  | utf8_codes([H|T]) --> | ||
|  | 	utf8_code(H), !, | ||
|  | 	utf8_codes(T). | ||
|  | utf8_codes([]) --> | ||
|  | 	[]. | ||
|  | 
 | ||
|  | utf8_code(C) --> | ||
|  | 	[C0], | ||
|  | 	{ nonvar(C0) }, !,		% decoding version | ||
|  | 	(   {C0 < 0x80} | ||
|  | 	->  {C = C0} | ||
|  | 	;   {C0/\0xe0 =:= 0xc0} | ||
|  | 	->  utf8_cont(C1, 0), | ||
|  | 	    {C is (C0/\0x1f)<<6\/C1} | ||
|  | 	;   {C0/\0xf0 =:= 0xe0} | ||
|  | 	->  utf8_cont(C1, 6), | ||
|  | 	    utf8_cont(C2, 0), | ||
|  | 	    {C is ((C0/\0xf)<<12)\/C1\/C2} | ||
|  | 	;   {C0/\0xf8 =:= 0xf0} | ||
|  | 	->  utf8_cont(C1, 12), | ||
|  | 	    utf8_cont(C2, 6), | ||
|  | 	    utf8_cont(C3, 0), | ||
|  | 	    {C is ((C0/\0x7)<<18)\/C1\/C2\/C3} | ||
|  | 	;   {C0/\0xfc =:= 0xf8} | ||
|  | 	->  utf8_cont(C1, 18), | ||
|  | 	    utf8_cont(C2, 12), | ||
|  | 	    utf8_cont(C3, 6), | ||
|  | 	    utf8_cont(C4, 0), | ||
|  | 	    {C is ((C0/\0x3)<<24)\/C1\/C2\/C3\/C4} | ||
|  | 	;   {C0/\0xfe =:= 0xfc} | ||
|  | 	->  utf8_cont(C1, 24), | ||
|  | 	    utf8_cont(C2, 18), | ||
|  | 	    utf8_cont(C3, 12), | ||
|  | 	    utf8_cont(C4, 6), | ||
|  | 	    utf8_cont(C5, 0), | ||
|  | 	    {C is ((C0/\0x1)<<30)\/C1\/C2\/C3\/C4\/C5} | ||
|  | 	). | ||
|  | utf8_code(C) --> | ||
|  | 	{ nonvar(C) }, !,		% encoding version | ||
|  | 	(   { C < 0x80 } | ||
|  | 	->  [C] | ||
|  | 	;   { C < 0x800 } | ||
|  | 	->  { C0 is 0xc0\/((C>>6)/\0x1f), | ||
|  | 	      C1 is 0x80\/(C/\0x3f) | ||
|  | 	    }, | ||
|  | 	    [C0,C1] | ||
|  | 	;   { C < 0x10000 } | ||
|  | 	->  { C0 is 0xe0\/((C>>12)/\0x0f), | ||
|  | 	      C1 is 0x80\/((C>>6)/\0x3f), | ||
|  | 	      C2 is 0x80\/(C/\0x3f) | ||
|  | 	    }, | ||
|  | 	    [C0,C1,C2] | ||
|  | 	;   { C < 0x200000 } | ||
|  | 	->  { C0 is 0xf0\/((C>>18)/\0x07), | ||
|  | 	      C1 is 0x80\/((C>>12)/\0x3f), | ||
|  | 	      C2 is 0x80\/((C>>6)/\0x3f), | ||
|  | 	      C3 is 0x80\/(C/\0x3f) | ||
|  | 	    }, | ||
|  | 	    [C0,C1,C2,C3] | ||
|  | 	;   { C < 0x4000000 } | ||
|  | 	->  { C0 is 0xf8\/((C>>24)/\0x03), | ||
|  | 	      C1 is 0x80\/((C>>18)/\0x3f), | ||
|  | 	      C2 is 0x80\/((C>>12)/\0x3f), | ||
|  | 	      C3 is 0x80\/((C>>6)/\0x3f), | ||
|  | 	      C4 is 0x80\/(C/\0x3f) | ||
|  | 	    }, | ||
|  | 	    [C0,C1,C2,C3,C4] | ||
|  | 	;   { C < 0x80000000 } | ||
|  | 	->  { C0 is 0xfc\/((C>>30)/\0x01), | ||
|  | 	      C1 is 0x80\/((C>>24)/\0x3f), | ||
|  | 	      C2 is 0x80\/((C>>18)/\0x3f), | ||
|  | 	      C3 is 0x80\/((C>>12)/\0x3f), | ||
|  | 	      C4 is 0x80\/((C>>6)/\0x3f), | ||
|  | 	      C5 is 0x80\/(C/\0x3f) | ||
|  | 	    }, | ||
|  | 	    [C0,C1,C2,C3,C4,C5] | ||
|  | 	). | ||
|  | 
 | ||
|  | utf8_cont(Val, Shift) --> | ||
|  | 	[C], | ||
|  | 	{ C/\0xc0 =:= 0x80, | ||
|  | 	  Val is (C/\0x3f)<<Shift | ||
|  | 	}. |