877 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			877 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*  $Id$
 | |
| 
 | |
|     Part of SWI-Prolog
 | |
| 
 | |
|     Author:        Jan Wielemaker
 | |
|     E-mail:        wielemak@science.uva.nl
 | |
|     WWW:           http://www.swi-prolog.org
 | |
|     Copyright (C): 1985-2007, University of Amsterdam
 | |
| 
 | |
|     This library is free software; you can redistribute it and/or
 | |
|     modify it under the terms of the GNU Lesser General Public
 | |
|     License as published by the Free Software Foundation; either
 | |
|     version 2.1 of the License, or (at your option) any later version.
 | |
| 
 | |
|     This library is distributed in the hope that it will be useful,
 | |
|     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
|     Lesser General Public License for more details.
 | |
| 
 | |
|     You should have received a copy of the GNU Lesser General Public
 | |
|     License along with this library; if not, write to the Free Software
 | |
|     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | |
| */
 | |
| 
 | |
| #include "pl-incl.h"
 | |
| #include <ctype.h>
 | |
| #include "pl-ctype.h"
 | |
| 
 | |
| /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 | |
| This module defines:
 | |
| 
 | |
| 	char_type(?Char, ?Type)
 | |
| 	code_type(?Char, ?Type)
 | |
| 
 | |
| See manual for details.
 | |
| - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
 | |
| 
 | |
| #define CTX_CHAR 0			/* Class(Char) */
 | |
| #define CTX_CODE 1			/* Class(Int) */
 | |
| 
 | |
| typedef struct
 | |
| { atom_t	name;			/* name of the class */
 | |
|   int (*test)(wint_t chr);		/* boolean */
 | |
|   int (*reverse)(wint_t chr);		/* reverse mapping */
 | |
|   short		arity;			/* arity of class (i.e. lower('A')) */
 | |
|   short		ctx_type;		/* CTX_* */
 | |
| } char_type;
 | |
| 
 | |
| #define ENUM_NONE	0x00
 | |
| #define ENUM_CHAR	0x01
 | |
| #define ENUM_CLASS	0x02
 | |
| #define ENUM_BOTH	0x03
 | |
| 
 | |
| typedef struct
 | |
| { int		current;		/* current character */
 | |
|   const char_type   *class;		/* current class */
 | |
|   int   	do_enum;		/* what to enumerate */
 | |
| } generator;
 | |
| 
 | |
| 
 | |
| static int
 | |
| iswhite(wint_t chr)
 | |
| { return chr == ' ' || chr == '\t';
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| fiscsym(wint_t chr)
 | |
| { return iswalnum(chr) || chr == '_';
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| fiscsymf(wint_t chr)
 | |
| { return iswalpha(chr) || chr == '_';
 | |
| }
 | |
| 
 | |
| static int
 | |
| iseof(wint_t chr)
 | |
| { return chr == (wint_t)-1;
 | |
| }
 | |
| 
 | |
| static int
 | |
| iseol(wint_t chr)
 | |
| { return chr >= 10 && chr <= 13;
 | |
| }
 | |
| 
 | |
| static int
 | |
| isnl(wint_t chr)
 | |
| { return chr == '\n';
 | |
| }
 | |
| 
 | |
| static int
 | |
| isperiod(wint_t chr)
 | |
| { return chr && strchr(".?!", chr) != NULL;
 | |
| }
 | |
| 
 | |
| static int
 | |
| isquote(wint_t chr)
 | |
| { return chr && strchr("'`\"", chr) != NULL;
 | |
| }
 | |
| 
 | |
| static int
 | |
| fupper(wint_t chr)
 | |
| { return iswlower(chr) ? (int)towupper(chr) : -1;
 | |
| }
 | |
| 
 | |
| static int
 | |
| flower(wint_t chr)
 | |
| { return iswupper(chr) ? (int)towlower(chr) : -1;
 | |
| }
 | |
| 
 | |
| static int
 | |
| ftoupper(wint_t chr)
 | |
| { return towupper(chr);
 | |
| }
 | |
| 
 | |
| static int
 | |
| ftolower(wint_t chr)
 | |
| { return towlower(chr);
 | |
| }
 | |
| 
 | |
| static int
 | |
| fparen(wint_t chr)
 | |
| { switch(chr)
 | |
|   { case '(':
 | |
|       return ')';
 | |
|     case '{':
 | |
|       return '}';
 | |
|     case '[':
 | |
|       return ']';
 | |
|     default:
 | |
|       return -1;
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| rparen(wint_t chr)
 | |
| { switch(chr)
 | |
|   { case ')':
 | |
|       return '(';
 | |
|     case '}':
 | |
|       return '{';
 | |
|     case ']':
 | |
|       return '[';
 | |
|     default:
 | |
|       return -1;
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| fdigit(wint_t chr)
 | |
| { if ( chr <= 0xff && isdigit(chr) )
 | |
|     return chr - '0';
 | |
|   return -1;
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| rdigit(wint_t d)
 | |
| { if ( (int)d >= 0 && d <= 9 )
 | |
|     return d+'0';
 | |
|   return -1;
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| fxdigit(wint_t chr)
 | |
| { if ( chr > 0xff )
 | |
|     return -1;
 | |
|   if ( isdigit(chr) )
 | |
|     return chr - '0';
 | |
|   if ( chr >= 'a' && chr <= 'f' )
 | |
|     return chr - 'a' + 10;
 | |
|   if ( chr >= 'A' && chr <= 'F' )
 | |
|     return chr - 'A' + 10;
 | |
|   return -1;
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| rxdigit(wint_t d)
 | |
| { if ( (int)d >= 0 && d <= 9 )
 | |
|     return d+'0';
 | |
|   if ( d >= 10 && d <= 15 )
 | |
|     return d-10+'a';
 | |
|   return -1;
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| #define mkfunction(name) \
 | |
| 	static int f ## name(wint_t chr) { return name(chr); }
 | |
| 
 | |
| mkfunction(iswalnum)
 | |
| mkfunction(iswalpha)
 | |
| mkfunction(isascii)
 | |
| mkfunction(iswcntrl)
 | |
| mkfunction(iswdigit)
 | |
| mkfunction(iswgraph)
 | |
| mkfunction(iswlower)
 | |
| mkfunction(iswupper)
 | |
| mkfunction(iswpunct)
 | |
| mkfunction(iswspace)
 | |
| 
 | |
| static const char_type char_types[] =
 | |
| { { ATOM_alnum,		fiswalnum },
 | |
|   { ATOM_alpha,		fiswalpha },
 | |
|   { ATOM_csym,		fiscsym },
 | |
|   { ATOM_csymf,		fiscsymf },
 | |
|   { ATOM_ascii,		fisascii },
 | |
|   { ATOM_white,		iswhite },
 | |
|   { ATOM_cntrl,		fiswcntrl },
 | |
|   { ATOM_digit,		fiswdigit },
 | |
|   { ATOM_graph,		fiswgraph },
 | |
|   { ATOM_lower,		fiswlower },
 | |
|   { ATOM_upper,		fiswupper },
 | |
|   { ATOM_punct,		fiswpunct },
 | |
|   { ATOM_space,		fiswspace },
 | |
|   { ATOM_end_of_file,	iseof },
 | |
|   { ATOM_end_of_line,	iseol },
 | |
|   { ATOM_newline,	isnl },
 | |
|   { ATOM_period,	isperiod },
 | |
|   { ATOM_quote,	        isquote },
 | |
|   { ATOM_lower,		fupper,		flower,   1, CTX_CHAR },
 | |
|   { ATOM_upper,		flower,		fupper,   1, CTX_CHAR },
 | |
|   { ATOM_to_lower,	ftoupper,	ftolower, 1, CTX_CHAR },
 | |
|   { ATOM_to_upper,	ftolower,	ftoupper, 1, CTX_CHAR },
 | |
|   { ATOM_paren,		fparen,		rparen,   1, CTX_CHAR },
 | |
|   { ATOM_digit,		fdigit,		rdigit,   1, CTX_CODE  },
 | |
|   { ATOM_xdigit,	fxdigit,	rxdigit,  1, CTX_CODE  },
 | |
|   { NULL_ATOM,		NULL }
 | |
| };
 | |
| 
 | |
| 
 | |
| static const char_type *
 | |
| char_type_by_name(atom_t name, int arity)
 | |
| { const char_type *cc;
 | |
| 
 | |
|   for(cc = char_types; cc->name; cc++)
 | |
|   { if ( cc->name == name && cc->arity == arity )
 | |
|       return cc;
 | |
|   }
 | |
| 
 | |
|   return NULL;
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| advanceGen(generator *gen)
 | |
| { if ( gen->do_enum & ENUM_CHAR )
 | |
|   { if ( ++gen->current == 256 )
 | |
|       fail;
 | |
|   } else
 | |
|   { gen->class++;
 | |
|     if ( !gen->class->name )
 | |
|       fail;
 | |
|   }
 | |
| 
 | |
|   succeed;
 | |
| }
 | |
| 
 | |
| 
 | |
| static int
 | |
| unify_char_type(term_t type, const char_type *ct, int context, int how)
 | |
| { GET_LD
 | |
| 
 | |
|   if ( ct->arity == 0 )
 | |
|   { return PL_unify_atom(type, ct->name);
 | |
|   } else /*if ( ct->arity == 1 )*/
 | |
|   { if ( PL_unify_functor(type, PL_new_functor(ct->name, 1)) )
 | |
|     { term_t a = PL_new_term_ref();
 | |
| 
 | |
|       _PL_get_arg(1, type, a);
 | |
| 
 | |
|       if ( ct->ctx_type == CTX_CHAR )
 | |
| 	return PL_unify_char(a, context, how);
 | |
|       else
 | |
| 	return PL_unify_integer(a, context);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   fail;
 | |
| }
 | |
| 
 | |
| 
 | |
| static foreign_t
 | |
| do_char_type(term_t chr, term_t class, control_t h, int how)
 | |
| { GET_LD
 | |
|   generator *gen;
 | |
|   fid_t fid;
 | |
| 
 | |
|   switch( ForeignControl(h) )
 | |
|   { case FRG_FIRST_CALL:
 | |
|     { const char_type *cc = NULL;
 | |
|       int c;
 | |
|       int do_enum = ENUM_NONE;
 | |
|       atom_t cn;
 | |
|       int arity;
 | |
| 
 | |
|       if ( PL_is_variable(chr) )
 | |
| 	do_enum |= ENUM_CHAR;
 | |
|       if ( PL_is_variable(class) )
 | |
| 	do_enum |= ENUM_CLASS;
 | |
| 
 | |
|       if ( do_enum == ENUM_BOTH )
 | |
| 	return PL_error("char_type", 2, NULL, ERR_INSTANTIATION);
 | |
| 
 | |
|       if ( !(do_enum & ENUM_CHAR) )
 | |
|       { if ( !PL_get_char(chr, &c, TRUE) )
 | |
| 	  fail;
 | |
| 	if ( c == -1 )
 | |
| 	  return PL_unify_atom(class, ATOM_end_of_file);
 | |
|       }
 | |
| 
 | |
|       if ( !(do_enum & ENUM_CLASS) )
 | |
|       { if ( !PL_get_name_arity(class, &cn, &arity) ||
 | |
| 	     !(cc = char_type_by_name(cn, arity)) )
 | |
| 	  return PL_error("char_type", 2, NULL,
 | |
| 			  ERR_TYPE, ATOM_char_type, class);
 | |
|       }
 | |
| 
 | |
|       if ( do_enum == ENUM_NONE )
 | |
|       { if ( arity == 0 )
 | |
| 	  return (*cc->test)((wint_t)c) ? TRUE : FALSE;
 | |
| 	else
 | |
| 	{ int rval = (*cc->test)((wint_t)c);
 | |
| 
 | |
| 	  if ( rval >= 0 )
 | |
| 	  { term_t a = PL_new_term_ref();
 | |
| 	    int ok;
 | |
| 
 | |
| 	    _PL_get_arg(1, class, a);
 | |
| 
 | |
| 	    if ( cc->ctx_type == CTX_CHAR )
 | |
| 	      ok = PL_unify_char(a, rval, how);
 | |
| 	    else
 | |
| 	      ok = PL_unify_integer(a, rval);
 | |
| 
 | |
| 	    if ( ok )
 | |
| 	      return TRUE;
 | |
| 	    else
 | |
| 	      do_enum = ENUM_CHAR;	/* try the other way around */
 | |
| 	  } else
 | |
| 	    fail;
 | |
| 	}
 | |
|       }
 | |
| 
 | |
|       if ( do_enum == ENUM_CHAR && arity == 1 )
 | |
|       {	term_t a = PL_new_term_ref();	/* char_type(X, lower('A')) */
 | |
| 	int ca;
 | |
| 
 | |
| 	_PL_get_arg(1, class, a);
 | |
| 	if ( !PL_is_variable(a) )
 | |
| 	{ if ( PL_get_char(a, &ca, FALSE) )
 | |
| 	  { int c = (*cc->reverse)((wint_t)ca);
 | |
| 
 | |
| 	    if ( c < 0 )
 | |
| 	      fail;
 | |
| 
 | |
| 	    return PL_unify_char(chr, c, how);
 | |
| 	  }
 | |
| 	  fail;				/* error */
 | |
| 	}
 | |
|       }
 | |
| 
 | |
|       gen = allocHeap(sizeof(*gen));
 | |
|       gen->do_enum = do_enum;
 | |
| 
 | |
|       if ( do_enum & ENUM_CHAR )
 | |
|       { gen->class      = cc;
 | |
| 	gen->current    = -1;
 | |
|       } else if ( do_enum & ENUM_CLASS )
 | |
|       { gen->class	= char_types;
 | |
| 	gen->current    = c;
 | |
|       }
 | |
| 
 | |
|       break;
 | |
|     }
 | |
|     case FRG_REDO:
 | |
|       gen = ForeignContextPtr(h);
 | |
|       break;
 | |
|     case FRG_CUTTED:
 | |
|       gen = ForeignContextPtr(h);
 | |
|       if ( gen )
 | |
| 	freeHeap(gen, sizeof(*gen));
 | |
|     default:
 | |
|       succeed;
 | |
|   }
 | |
|   if ( !(fid = PL_open_foreign_frame()) )
 | |
|     goto error;
 | |
| 
 | |
|   for(;;)
 | |
|   { int rval;
 | |
| 
 | |
|     if ( (rval = (*gen->class->test)((wint_t)gen->current)) )
 | |
|     { if ( gen->do_enum & ENUM_CHAR )
 | |
|       { if ( !PL_unify_char(chr, gen->current, how) )
 | |
| 	  goto next;
 | |
|       }
 | |
|       if ( gen->class->arity > 0 )
 | |
|       { if ( rval < 0 ||
 | |
| 	     !unify_char_type(class, gen->class, rval, how) )
 | |
| 	  goto next;
 | |
| 
 | |
|       } else if ( gen->do_enum & ENUM_CLASS )
 | |
|       { if ( !unify_char_type(class, gen->class, rval, how) )
 | |
| 	  goto next;
 | |
|       }
 | |
| 
 | |
|       if ( advanceGen(gen) )		/* ok, found one */
 | |
| 	ForeignRedoPtr(gen);
 | |
|       else
 | |
|       { freeHeap(gen, sizeof(*gen));	/* the only one */
 | |
| 	succeed;
 | |
|       }
 | |
|     }
 | |
|   next:
 | |
|     PL_rewind_foreign_frame(fid);
 | |
| 
 | |
|     if ( !advanceGen(gen) )
 | |
|       break;
 | |
|   }
 | |
| 
 | |
| error:
 | |
|   freeHeap(gen, sizeof(*gen));
 | |
|   fail;
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| static
 | |
| PRED_IMPL("char_type", 2, char_type, PL_FA_NONDETERMINISTIC)
 | |
| { return do_char_type(A1, A2, PL__ctx, PL_CHAR);
 | |
| }
 | |
| 
 | |
| 
 | |
| static
 | |
| PRED_IMPL("code_type", 2, code_type, PL_FA_NONDETERMINISTIC)
 | |
| { return do_char_type(A1, A2, PL__ctx, PL_CODE);
 | |
| }
 | |
| 
 | |
| 
 | |
| #if 0
 | |
| static
 | |
| PRED_IMPL("iswctype", 2, iswctype, 0)
 | |
| { char *s;
 | |
|   int chr;
 | |
|   wctype_t t;
 | |
| 
 | |
|   if ( !PL_get_char_ex(A1, &chr, FALSE) ||
 | |
|        !PL_get_chars_ex(A2, &s, CVT_ATOM) )
 | |
|     return FALSE;
 | |
| 
 | |
|   if ( !(t=wctype(s)) )
 | |
|     return PL_error(NULL, 0, NULL, ERR_EXISTENCE, ATOM_type, A2);
 | |
| 
 | |
|   return iswctype(chr, t) ? TRUE : FALSE;
 | |
| }
 | |
| #endif
 | |
| 
 | |
| 
 | |
| static int
 | |
| init_tout(PL_chars_t *t, size_t len)
 | |
| { switch(t->encoding)
 | |
|   { case ENC_ISO_LATIN_1:
 | |
|       if ( len < sizeof(t->buf) )
 | |
|       { t->text.t = t->buf;
 | |
| 	t->storage = PL_CHARS_LOCAL;
 | |
|       } else
 | |
|       { t->text.t = PL_malloc(len);
 | |
| 	t->storage = PL_CHARS_MALLOC;
 | |
|       }
 | |
|       succeed;
 | |
|     case ENC_WCHAR:
 | |
|       if ( len*sizeof(pl_wchar_t) < sizeof(t->buf) )
 | |
|       { t->text.w = (pl_wchar_t*)t->buf;
 | |
| 	t->storage = PL_CHARS_LOCAL;
 | |
|       } else
 | |
|       { t->text.w = PL_malloc(len*sizeof(pl_wchar_t));
 | |
| 	t->storage = PL_CHARS_MALLOC;
 | |
|       }
 | |
|       succeed;
 | |
|     default:
 | |
|       assert(0);
 | |
|       fail;
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| static inline wint_t
 | |
| get_chr_from_text(const PL_chars_t *t, size_t index)
 | |
| { switch(t->encoding)
 | |
|   { case ENC_ISO_LATIN_1:
 | |
|       return t->text.t[index]&0xff;
 | |
|     case ENC_WCHAR:
 | |
|       return t->text.w[index];
 | |
|     default:
 | |
|       assert(0);
 | |
|       return 0;
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| static foreign_t
 | |
| modify_case_atom(term_t in, term_t out, int down)
 | |
| { GET_LD
 | |
|   PL_chars_t tin, tout;
 | |
| 
 | |
|   if ( !PL_get_text(in, &tin, CVT_ATOMIC|CVT_EXCEPTION) )
 | |
|     return FALSE;
 | |
| 
 | |
|   if ( PL_get_text(out, &tout, CVT_ATOMIC) )
 | |
|   { unsigned int i;
 | |
| 
 | |
|     if ( tin.length != tout.length )
 | |
|       fail;
 | |
| 
 | |
|     for(i=0; i<tin.length; i++)
 | |
|     { wint_t ci = get_chr_from_text(&tin, i);
 | |
|       wint_t co = get_chr_from_text(&tout, i);
 | |
| 
 | |
|       if ( down )
 | |
|       { if ( co != towlower(ci) )
 | |
| 	  fail;
 | |
|       } else
 | |
|       { if ( co != towupper(ci) )
 | |
| 	  fail;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     succeed;
 | |
|   } else if ( PL_is_variable(out) )
 | |
|   { unsigned int i;
 | |
| 
 | |
|     tout.encoding  = tin.encoding;
 | |
|     tout.length    = tin.length;
 | |
|     tout.canonical = FALSE;		/* or TRUE? Can WCHAR map to ISO? */
 | |
| 
 | |
|     init_tout(&tout, tin.length);
 | |
| 
 | |
|     if ( tin.encoding == ENC_ISO_LATIN_1 )
 | |
|     { const unsigned char *in = (const unsigned char*)tin.text.t;
 | |
| 
 | |
|       if ( down )
 | |
|       { for(i=0; i<tin.length; i++)
 | |
| 	{ wint_t c = towlower(in[i]);
 | |
| 
 | |
| 	  if ( c > 255 )
 | |
| 	  { PL_promote_text(&tout);
 | |
| 	    for( ; i<tin.length; i++)
 | |
| 	    { tout.text.w[i] = towlower(in[i]);
 | |
| 	    }
 | |
| 	    break;
 | |
| 	  } else
 | |
| 	  { tout.text.t[i] = (char)c;
 | |
| 	  }
 | |
| 	}
 | |
|       } else				/* upcase */
 | |
|       { for(i=0; i<tin.length; i++)
 | |
| 	{ wint_t c = towupper(in[i]);
 | |
| 
 | |
| 	  if ( c > 255 )
 | |
| 	  { PL_promote_text(&tout);
 | |
| 	    for( ; i<tin.length; i++)
 | |
| 	    { tout.text.w[i] = towupper(in[i]);
 | |
| 	    }
 | |
| 	    break;
 | |
| 	  } else
 | |
| 	  { tout.text.t[i] = (char)c;
 | |
| 	  }
 | |
| 	}
 | |
|       }
 | |
|     } else
 | |
|     { if ( down )
 | |
|       { for(i=0; i<tin.length; i++)
 | |
| 	{ tout.text.w[i] = towlower(tin.text.w[i]);
 | |
| 	}
 | |
|       } else
 | |
|       { for(i=0; i<tin.length; i++)
 | |
| 	{ tout.text.w[i] = towupper(tin.text.w[i]);
 | |
| 	}
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     PL_unify_text(out, 0, &tout, PL_ATOM);
 | |
|     PL_free_text(&tout);
 | |
| 
 | |
|     succeed;
 | |
|   } else
 | |
|   { return PL_error(NULL, 0, NULL, ERR_TYPE, ATOM_atom, out);
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| static
 | |
| PRED_IMPL("downcase_atom", 2, downcase_atom, 0)
 | |
| { return modify_case_atom(A1, A2, TRUE);
 | |
| }
 | |
| 
 | |
| 
 | |
| static
 | |
| PRED_IMPL("upcase_atom", 2, upcase_atom, 0)
 | |
| { return modify_case_atom(A1, A2, FALSE);
 | |
| }
 | |
| 
 | |
| 
 | |
| 		 /*******************************
 | |
| 		 *	    WHITE SPACE		*
 | |
| 		 *******************************/
 | |
| 
 | |
| static int
 | |
| write_normalize_space(IOSTREAM *out, term_t in)
 | |
| { GET_LD
 | |
|   PL_chars_t tin;
 | |
|   size_t i, end;
 | |
| 
 | |
|   if ( !PL_get_text(in, &tin, CVT_ATOMIC|CVT_EXCEPTION) )
 | |
|     return FALSE;
 | |
| 
 | |
|   end = tin.length;
 | |
|   i = 0;
 | |
| 
 | |
|   while(i<end && unicode_separator(get_chr_from_text(&tin, i)))
 | |
|     i++;
 | |
|   while( i<end )
 | |
|   { wint_t c;
 | |
| 
 | |
|     while(i<end && !unicode_separator((c=get_chr_from_text(&tin, i))))
 | |
|     { if ( Sputcode(c, out) < 0 )
 | |
| 	fail;
 | |
|       i++;
 | |
|     }
 | |
|     while(i<end && unicode_separator(get_chr_from_text(&tin, i)))
 | |
|       i++;
 | |
|     if ( i < end )
 | |
|     { if (  Sputcode(' ', out) < 0 )
 | |
| 	fail;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   succeed;
 | |
| }
 | |
| 
 | |
| 
 | |
| static
 | |
| PRED_IMPL("normalize_space", 2, normalize_space, 0)
 | |
| { redir_context ctx;
 | |
|   word rc;
 | |
| 
 | |
|   if ( (rc = setupOutputRedirect(A1, &ctx, FALSE)) )
 | |
|   { if ( (rc = write_normalize_space(ctx.stream, A2)) )
 | |
|       rc = closeOutputRedirect(&ctx);
 | |
|     else
 | |
|       discardOutputRedirect(&ctx);
 | |
|   }
 | |
| 
 | |
|   return rc;
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| 		 /*******************************
 | |
| 		 *	       LOCALE		*
 | |
| 		 *******************************/
 | |
| 
 | |
| #if defined(HAVE_LOCALE_H) && defined(HAVE_SETLOCALE)
 | |
| #include <locale.h>
 | |
| 
 | |
| /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 | |
| Note: on some installations, locale doesn't   work correctly. Printing a
 | |
| message isn't really cute. It would be better to use printMessage(), but
 | |
| the system isn't yet initialised far enough.   Maybe we should store the
 | |
| failure and print a message at the end of the initialisation?
 | |
| 
 | |
| We only return FALSE if LC_CTYPE  fails.   This  is a serious indication
 | |
| that locale support is broken. We don't   depend too much on the others,
 | |
| so we ignore possible problems.
 | |
| - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
 | |
| 
 | |
| static int
 | |
| initLocale(void)
 | |
| { int rc = TRUE;
 | |
| 
 | |
|   if ( !setlocale(LC_CTYPE, "") )
 | |
|   { rc = FALSE;
 | |
|     DEBUG(0, Sdprintf("Failed to set LC_CTYPE locale\n"));
 | |
|   }
 | |
|   if ( !setlocale(LC_TIME, "") )
 | |
|   { DEBUG(0, Sdprintf("Failed to set LC_TIME locale\n"));
 | |
|   }
 | |
|   if ( !setlocale(LC_COLLATE, "") )
 | |
|   { DEBUG(0, Sdprintf("Failed to set LC_COLLATE locale\n"));
 | |
|   }
 | |
| 
 | |
|   return rc;
 | |
| }
 | |
| 
 | |
| typedef struct
 | |
| { int category;
 | |
|   const char *name;
 | |
| } lccat;
 | |
| 
 | |
| static lccat lccats[] =
 | |
| { { LC_ALL,      "all" },
 | |
|   { LC_COLLATE,  "collate" },
 | |
|   { LC_CTYPE,    "ctype" },
 | |
| #ifdef LC_MESSAGES
 | |
|   { LC_MESSAGES, "messages" },
 | |
| #endif
 | |
|   { LC_MONETARY, "monetary" },
 | |
|   { LC_NUMERIC,  "numeric" },
 | |
|   { LC_TIME,     "time" },
 | |
|   { 0,           NULL }
 | |
| };
 | |
| 
 | |
| 
 | |
| static
 | |
| PRED_IMPL("setlocale", 3, setlocale, 0)
 | |
| { PRED_LD
 | |
|   char *what;
 | |
|   char *locale;
 | |
|   const lccat *lcp;
 | |
| 
 | |
| 
 | |
|   if ( !PL_get_chars_ex(A1, &what, CVT_ATOM) )
 | |
|     fail;
 | |
|   if ( PL_is_variable(A3) )
 | |
|     locale = NULL;
 | |
|   else if ( !PL_get_chars_ex(A3, &locale, CVT_ATOM) )
 | |
|     fail;
 | |
| 
 | |
|   for ( lcp = lccats; lcp->name; lcp++ )
 | |
|   { if ( streq(lcp->name, what) )
 | |
|     { char *old = setlocale(lcp->category, NULL);
 | |
| 
 | |
|       if ( !PL_unify_chars(A2, PL_ATOM, -1, old) )
 | |
| 	fail;
 | |
| 
 | |
|       if ( PL_compare(A2, A3) != 0 )
 | |
|       { if ( !setlocale(lcp->category, locale) )
 | |
| 	  return PL_error(NULL, 0, MSG_ERRNO, ERR_SYSCALL, "setlocale");
 | |
|       }
 | |
| 
 | |
|       succeed;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   return PL_error(NULL, 0, NULL, ERR_DOMAIN,
 | |
| 		  PL_new_atom("category"), A1);
 | |
| }
 | |
| 
 | |
| #else
 | |
| 
 | |
| #define initLocale() 1
 | |
| 
 | |
| static
 | |
| PRED_IMPL("setlocale", 3, setlocale, 0)
 | |
| { return notImplemented("setlocale", 3);
 | |
| }
 | |
| 
 | |
| #endif
 | |
| 
 | |
| 
 | |
| 		 /*******************************
 | |
| 		 *      PUBLISH PREDICATES	*
 | |
| 		 *******************************/
 | |
| 
 | |
| BeginPredDefs(ctype)
 | |
|   PRED_DEF("char_type", 2, char_type, PL_FA_NONDETERMINISTIC)
 | |
|   PRED_DEF("code_type", 2, code_type, PL_FA_NONDETERMINISTIC)
 | |
|   PRED_DEF("setlocale", 3, setlocale, 0)
 | |
|   PRED_DEF("downcase_atom", 2, downcase_atom, 0)
 | |
|   PRED_DEF("upcase_atom", 2, upcase_atom, 0)
 | |
|   PRED_DEF("normalize_space", 2, normalize_space, 0)
 | |
| EndPredDefs
 | |
| 
 | |
| 
 | |
| 		 /*******************************
 | |
| 		 *	PROLOG CHARACTERS	*
 | |
| 		 *******************************/
 | |
| 
 | |
| const char _PL_char_types[] = {
 | |
| /* ^@  ^A  ^B  ^C  ^D  ^E  ^F  ^G  ^H  ^I  ^J  ^K  ^L  ^M  ^N  ^O    0-15 */
 | |
|    CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT,
 | |
| /* ^P  ^Q  ^R  ^S  ^T  ^U  ^V  ^W  ^X  ^Y  ^Z  ^[  ^\  ^]  ^^  ^_   16-31 */
 | |
|    CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT,
 | |
| /* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /   32-47 */
 | |
|    SP, SO, DQ, SY, SY, SO, SY, SQ, PU, PU, SY, SY, PU, SY, SY, SY,
 | |
| /*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?   48-63 */
 | |
|    DI, DI, DI, DI, DI, DI, DI, DI, DI, DI, SY, SO, SY, SY, SY, SY,
 | |
| /*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   64-79 */
 | |
|    SY, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC,
 | |
| /*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _   80-95 */
 | |
|    UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, PU, SY, PU, SY, UC,
 | |
| /*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o   96-111 */
 | |
|    SY, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC,
 | |
| /*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  ^?   112-127 */
 | |
|    LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, PU, PU, PU, SY, CT,
 | |
| 			  /* 128-159 (C1 controls) */
 | |
|    CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT,
 | |
|    CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT,
 | |
| 			  /* 160-255 (G1 graphics) */
 | |
| 			  /* ISO Latin 1 is assumed */
 | |
|    SP, SO, SO, SO, SO, SO, SO, SO, SO, SO, LC, SO, SO, SO, SO, SO,
 | |
|    SO, SO, SO, SO, SO, SO, SO, SO, SO, SO, LC, SO, SO, SO, SO, SO,
 | |
|    UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC,
 | |
|    UC, UC, UC, UC, UC, UC, UC, SO, UC, UC, UC, UC, UC, UC, UC, LC,
 | |
|    LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC,
 | |
|    LC, LC, LC, LC, LC, LC, LC, SO, LC, LC, LC, LC, LC, LC, LC, LC
 | |
| };
 | |
| 
 | |
| 
 | |
| typedef struct
 | |
| { const char *name;
 | |
|   IOENC encoding;
 | |
| } enc_map;
 | |
| 
 | |
| static const enc_map map[] =
 | |
| { { "UTF-8",	  ENC_UTF8 },
 | |
|   { "utf8",	  ENC_UTF8 },
 | |
|   { "ISO8859-1",  ENC_ISO_LATIN_1 },
 | |
|   { "ISO8859_1",  ENC_ISO_LATIN_1 },
 | |
|   { "iso88591",   ENC_ISO_LATIN_1 },
 | |
|   { "iso_8859_1", ENC_ISO_LATIN_1 },
 | |
|   { NULL, ENC_UNKNOWN }
 | |
| };
 | |
| 
 | |
| IOENC
 | |
| initEncoding(void)
 | |
| { GET_LD
 | |
| 
 | |
|   if ( LD )
 | |
|   { if ( !LD->encoding )
 | |
|     { char *enc;
 | |
| 
 | |
|       if ( !initLocale() )
 | |
|       { LD->encoding = ENC_ISO_LATIN_1;
 | |
|       } else if ( (enc = setlocale(LC_CTYPE, NULL)) )
 | |
|       { LD->encoding = ENC_ANSI;		/* text encoding */
 | |
| 
 | |
| 	/* this does not work on the mac for some reason */
 | |
| 	if (strchr(enc, '.')) {
 | |
| 	  enc = strchr(enc, '.');
 | |
| 	  /* skip '.' */
 | |
| 	  enc++;
 | |
| 	}
 | |
| 	if ( enc )
 | |
| 	{ const enc_map *m;
 | |
| 
 | |
| 	  for ( m=map; m->name; m++ )
 | |
| 	  { if ( strcmp(enc, m->name) == 0 )
 | |
| 	    { LD->encoding = m->encoding;
 | |
| 	      break;
 | |
| 	    }
 | |
| 	  }
 | |
| 	}
 | |
|       } else
 | |
|       { LD->encoding = ENC_ISO_LATIN_1;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     return LD->encoding;
 | |
|   }
 | |
| 
 | |
|   return ENC_ANSI;
 | |
| }
 | |
| 
 | |
| 
 | |
| void
 | |
| initCharTypes(void)
 | |
| { initEncoding();
 | |
| }
 | |
| 
 |