This repository has been archived on 2023-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
yap-6.3/packages/PLStream/pl-ctype.c
2010-08-04 02:46:26 +01:00

910 lines
19 KiB
C

/* $Id$
Part of SWI-Prolog
Author: Jan Wielemaker
E-mail: wielemak@science.uva.nl
WWW: http://www.swi-prolog.org
Copyright (C): 1985-2007, University of Amsterdam
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "pl-incl.h"
#include <ctype.h>
#include "pl-ctype.h"
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
This module defines:
char_type(?Char, ?Type)
code_type(?Char, ?Type)
See manual for details.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#define CHAR_MODE 0
#define CODE_MODE 1
#define CTX_CHAR 0 /* Class(Char) */
#define CTX_CODE 1 /* Class(Int) */
typedef struct
{ atom_t name; /* name of the class */
int (*test)(wint_t chr); /* boolean */
int (*reverse)(wint_t chr); /* reverse mapping */
short arity; /* arity of class (i.e. lower('A')) */
short ctx_type; /* CTX_* */
} char_type;
#define ENUM_NONE 0x00
#define ENUM_CHAR 0x01
#define ENUM_CLASS 0x02
#define ENUM_BOTH 0x03
typedef struct
{ int current; /* current character */
const char_type *class; /* current class */
int do_enum; /* what to enumerate */
} generator;
static int unicode_separator(wint_t c);
static int
iswhite(wint_t chr)
{ return chr == ' ' || chr == '\t';
}
#ifdef __YAP_PROLOG__
#include "pl-umap.c" /* Unicode map */
#define CharTypeW(c, t, w) \
((unsigned)(c) <= 0xff ? (_PL_char_types[(unsigned)(c)] t) \
: (uflagsW(c) & w))
#define PlBlankW(c) CharTypeW(c, <= SP, U_SEPARATOR)
inline int
unicode_separator(wint_t c)
{ return PlBlankW(c);
}
#endif
static int
fiscsym(wint_t chr)
{ return iswalnum(chr) || chr == '_';
}
static int
fiscsymf(wint_t chr)
{ return iswalpha(chr) || chr == '_';
}
static int
iseof(wint_t chr)
{ return chr == (wint_t)-1;
}
static int
iseol(wint_t chr)
{ return chr >= 10 && chr <= 13;
}
static int
isnl(wint_t chr)
{ return chr == '\n';
}
static int
isperiod(wint_t chr)
{ return chr && strchr(".?!", chr) != NULL;
}
static int
isquote(wint_t chr)
{ return chr && strchr("'`\"", chr) != NULL;
}
static int
fupper(wint_t chr)
{ return iswlower(chr) ? (int)towupper(chr) : -1;
}
static int
flower(wint_t chr)
{ return iswupper(chr) ? (int)towlower(chr) : -1;
}
static int
ftoupper(wint_t chr)
{ return towupper(chr);
}
static int
ftolower(wint_t chr)
{ return towlower(chr);
}
static int
fparen(wint_t chr)
{ switch(chr)
{ case '(':
return ')';
case '{':
return '}';
case '[':
return ']';
default:
return -1;
}
}
static int
rparen(wint_t chr)
{ switch(chr)
{ case ')':
return '(';
case '}':
return '{';
case ']':
return '[';
default:
return -1;
}
}
static int
fdigit(wint_t chr)
{ if ( chr <= 0xff && isdigit(chr) )
return chr - '0';
return -1;
}
static int
rdigit(wint_t d)
{ if ( (int)d >= 0 && d <= 9 )
return d+'0';
return -1;
}
static int
fxdigit(wint_t chr)
{ if ( chr > 0xff )
return -1;
if ( isdigit(chr) )
return chr - '0';
if ( chr >= 'a' && chr <= 'f' )
return chr - 'a' + 10;
if ( chr >= 'A' && chr <= 'F' )
return chr - 'A' + 10;
return -1;
}
static int
rxdigit(wint_t d)
{ if ( (int)d >= 0 && d <= 9 )
return d+'0';
if ( d >= 10 && d <= 15 )
return d-10+'a';
return -1;
}
#define mkfunction(name) \
static int f ## name(wint_t chr) { return name(chr); }
mkfunction(iswalnum)
mkfunction(iswalpha)
mkfunction(isascii)
mkfunction(iswcntrl)
mkfunction(iswdigit)
mkfunction(iswgraph)
mkfunction(iswlower)
mkfunction(iswupper)
mkfunction(iswpunct)
mkfunction(iswspace)
static const char_type char_types[] =
{ { ATOM_alnum, fiswalnum },
{ ATOM_alpha, fiswalpha },
{ ATOM_csym, fiscsym },
{ ATOM_csymf, fiscsymf },
{ ATOM_ascii, fisascii },
{ ATOM_white, iswhite },
{ ATOM_cntrl, fiswcntrl },
{ ATOM_digit, fiswdigit },
{ ATOM_graph, fiswgraph },
{ ATOM_lower, fiswlower },
{ ATOM_upper, fiswupper },
{ ATOM_punct, fiswpunct },
{ ATOM_space, fiswspace },
{ ATOM_end_of_file, iseof },
{ ATOM_end_of_line, iseol },
{ ATOM_newline, isnl },
{ ATOM_period, isperiod },
{ ATOM_quote, isquote },
{ ATOM_lower, fupper, flower, 1, CTX_CHAR },
{ ATOM_upper, flower, fupper, 1, CTX_CHAR },
{ ATOM_to_lower, ftoupper, ftolower, 1, CTX_CHAR },
{ ATOM_to_upper, ftolower, ftoupper, 1, CTX_CHAR },
{ ATOM_paren, fparen, rparen, 1, CTX_CHAR },
{ ATOM_digit, fdigit, rdigit, 1, CTX_CODE },
{ ATOM_xdigit, fxdigit, rxdigit, 1, CTX_CODE },
{ NULL_ATOM, NULL }
};
static const char_type *
char_type_by_name(atom_t name, int arity)
{ const char_type *cc;
for(cc = char_types; cc->name; cc++)
{ if ( cc->name == name && cc->arity == arity )
return cc;
}
return NULL;
}
static int
advanceGen(generator *gen)
{ if ( gen->do_enum & ENUM_CHAR )
{ if ( ++gen->current == 256 )
fail;
} else
{ gen->class++;
if ( !gen->class->name )
fail;
}
succeed;
}
static int
unify_char_type(term_t type, const char_type *ct, int context, int how)
{ if ( ct->arity == 0 )
return PL_unify_atom(type, ct->name);
else /*if ( ct->arity == 1 )*/
{ if ( PL_unify_functor(type, PL_new_functor(ct->name, 1)) )
{ term_t a = PL_new_term_ref();
_PL_get_arg(1, type, a);
if ( ct->ctx_type == CTX_CHAR )
return PL_unify_char(a, context, how);
else
return PL_unify_integer(a, context);
}
}
fail;
}
static foreign_t
do_char_type(term_t chr, term_t class, control_t h, int how)
{ GET_LD
generator *gen;
fid_t fid;
switch( ForeignControl(h) )
{ case FRG_FIRST_CALL:
{ const char_type *cc = NULL;
int c;
int do_enum = ENUM_NONE;
atom_t cn;
int arity;
if ( PL_is_variable(chr) )
do_enum |= ENUM_CHAR;
if ( PL_is_variable(class) )
do_enum |= ENUM_CLASS;
if ( do_enum == ENUM_BOTH )
return PL_error("char_type", 2, NULL, ERR_INSTANTIATION);
if ( !(do_enum & ENUM_CHAR) )
{ if ( !PL_get_char(chr, &c, TRUE) )
fail;
if ( c == -1 )
return PL_unify_atom(class, ATOM_end_of_file);
}
if ( !(do_enum & ENUM_CLASS) )
{ if ( !PL_get_name_arity(class, &cn, &arity) ||
!(cc = char_type_by_name(cn, arity)) )
return PL_error("char_type", 2, NULL,
ERR_TYPE, ATOM_char_type, class);
}
if ( do_enum == ENUM_NONE )
{ if ( arity == 0 )
return (*cc->test)((wint_t)c) ? TRUE : FALSE;
else
{ int rval = (*cc->test)((wint_t)c);
if ( rval >= 0 )
{ term_t a = PL_new_term_ref();
int ok;
_PL_get_arg(1, class, a);
if ( cc->ctx_type == CTX_CHAR )
ok = PL_unify_char(a, rval, how);
else
ok = PL_unify_integer(a, rval);
if ( ok )
return TRUE;
else
do_enum = ENUM_CHAR; /* try the other way around */
} else
fail;
}
}
if ( do_enum == ENUM_CHAR && arity == 1 )
{ term_t a = PL_new_term_ref(); /* char_type(X, lower('A')) */
int ca;
_PL_get_arg(1, class, a);
if ( !PL_is_variable(a) )
{ if ( PL_get_char(a, &ca, FALSE) )
{ int c = (*cc->reverse)((wint_t)ca);
if ( c < 0 )
fail;
return PL_unify_char(chr, c, how);
}
fail; /* error */
}
}
gen = allocHeap(sizeof(*gen));
gen->do_enum = do_enum;
if ( do_enum & ENUM_CHAR )
{ gen->class = cc;
gen->current = -1;
} else if ( do_enum & ENUM_CLASS )
{ gen->class = char_types;
gen->current = c;
}
break;
}
case FRG_REDO:
gen = ForeignContextPtr(h);
break;
case FRG_CUTTED:
gen = ForeignContextPtr(h);
if ( gen )
freeHeap(gen, sizeof(*gen));
default:
succeed;
}
if ( !(fid = PL_open_foreign_frame()) )
goto error;
for(;;)
{ int rval;
if ( (rval = (*gen->class->test)((wint_t)gen->current)) )
{ if ( gen->do_enum & ENUM_CHAR )
{ if ( !PL_unify_char(chr, gen->current, how) )
goto next;
}
if ( gen->class->arity > 0 )
{ if ( rval < 0 ||
!unify_char_type(class, gen->class, rval, how) )
goto next;
} else if ( gen->do_enum & ENUM_CLASS )
{ if ( !unify_char_type(class, gen->class, rval, how) )
goto next;
}
if ( advanceGen(gen) ) /* ok, found one */
ForeignRedoPtr(gen);
else
{ freeHeap(gen, sizeof(*gen)); /* the only one */
succeed;
}
}
next:
PL_rewind_foreign_frame(fid);
if ( !advanceGen(gen) )
break;
}
error:
freeHeap(gen, sizeof(*gen));
fail;
}
static
PRED_IMPL("char_type", 2, char_type, PL_FA_NONDETERMINISTIC)
{ return do_char_type(A1, A2, PL__ctx, PL_CHAR);
}
static
PRED_IMPL("code_type", 2, code_type, PL_FA_NONDETERMINISTIC)
{ return do_char_type(A1, A2, PL__ctx, PL_CODE);
}
#if 0
static
PRED_IMPL("iswctype", 2, iswctype, 0)
{ char *s;
int chr;
wctype_t t;
if ( !PL_get_char_ex(A1, &chr, FALSE) ||
!PL_get_chars_ex(A2, &s, CVT_ATOM) )
return FALSE;
if ( !(t=wctype(s)) )
return PL_error(NULL, 0, NULL, ERR_EXISTENCE, ATOM_type, A2);
return iswctype(chr, t) ? TRUE : FALSE;
}
#endif
static int
init_tout(PL_chars_t *t, size_t len)
{ switch(t->encoding)
{ case ENC_ISO_LATIN_1:
if ( len < sizeof(t->buf) )
{ t->text.t = t->buf;
t->storage = PL_CHARS_LOCAL;
} else
{ t->text.t = PL_malloc(len);
t->storage = PL_CHARS_MALLOC;
}
succeed;
case ENC_WCHAR:
if ( len*sizeof(pl_wchar_t) < sizeof(t->buf) )
{ t->text.w = (pl_wchar_t*)t->buf;
t->storage = PL_CHARS_LOCAL;
} else
{ t->text.w = PL_malloc(len*sizeof(pl_wchar_t));
t->storage = PL_CHARS_MALLOC;
}
succeed;
default:
assert(0);
fail;
}
}
static inline wint_t
get_chr_from_text(const PL_chars_t *t, size_t index)
{ switch(t->encoding)
{ case ENC_ISO_LATIN_1:
return t->text.t[index]&0xff;
case ENC_WCHAR:
return t->text.w[index];
default:
assert(0);
return 0;
}
}
static foreign_t
modify_case_atom(term_t in, term_t out, int down)
{ GET_LD
PL_chars_t tin, tout;
if ( !PL_get_text(in, &tin, CVT_ATOMIC|CVT_EXCEPTION) )
return FALSE;
if ( PL_get_text(out, &tout, CVT_ATOMIC) )
{ unsigned int i;
if ( tin.length != tout.length )
fail;
for(i=0; i<tin.length; i++)
{ wint_t ci = get_chr_from_text(&tin, i);
wint_t co = get_chr_from_text(&tout, i);
if ( down )
{ if ( co != towlower(ci) )
fail;
} else
{ if ( co != towupper(ci) )
fail;
}
}
succeed;
} else if ( PL_is_variable(out) )
{ unsigned int i;
tout.encoding = tin.encoding;
tout.length = tin.length;
tout.canonical = FALSE; /* or TRUE? Can WCHAR map to ISO? */
init_tout(&tout, tin.length);
if ( tin.encoding == ENC_ISO_LATIN_1 )
{ const unsigned char *in = (const unsigned char*)tin.text.t;
if ( down )
{ for(i=0; i<tin.length; i++)
{ wint_t c = towlower(in[i]);
if ( c > 255 )
{ PL_promote_text(&tout);
for( ; i<tin.length; i++)
{ tout.text.w[i] = towlower(in[i]);
}
break;
} else
{ tout.text.t[i] = (char)c;
}
}
} else /* upcase */
{ for(i=0; i<tin.length; i++)
{ wint_t c = towupper(in[i]);
if ( c > 255 )
{ PL_promote_text(&tout);
for( ; i<tin.length; i++)
{ tout.text.w[i] = towupper(in[i]);
}
break;
} else
{ tout.text.t[i] = (char)c;
}
}
}
} else
{ if ( down )
{ for(i=0; i<tin.length; i++)
{ tout.text.w[i] = towlower(tin.text.w[i]);
}
} else
{ for(i=0; i<tin.length; i++)
{ tout.text.w[i] = towupper(tin.text.w[i]);
}
}
}
PL_unify_text(out, 0, &tout, PL_ATOM);
PL_free_text(&tout);
succeed;
} else
{ return PL_error(NULL, 0, NULL, ERR_TYPE, ATOM_atom, out);
}
}
static
PRED_IMPL("downcase_atom", 2, downcase_atom, 0)
{ return modify_case_atom(A1, A2, TRUE);
}
static
PRED_IMPL("upcase_atom", 2, upcase_atom, 0)
{ return modify_case_atom(A1, A2, FALSE);
}
/*******************************
* WHITE SPACE *
*******************************/
static int
write_normalize_space(IOSTREAM *out, term_t in)
{ GET_LD
PL_chars_t tin;
size_t i, end;
if ( !PL_get_text(in, &tin, CVT_ATOMIC|CVT_EXCEPTION) )
return FALSE;
end = tin.length;
i = 0;
while(i<end && unicode_separator(get_chr_from_text(&tin, i)))
i++;
while( i<end )
{ wint_t c;
while(i<end && !unicode_separator((c=get_chr_from_text(&tin, i))))
{ if ( Sputcode(c, out) < 0 )
fail;
i++;
}
while(i<end && unicode_separator(get_chr_from_text(&tin, i)))
i++;
if ( i < end )
{ if ( Sputcode(' ', out) < 0 )
fail;
}
}
succeed;
}
static
PRED_IMPL("normalize_space", 2, normalize_space, 0)
{ redir_context ctx;
word rc;
if ( (rc = setupOutputRedirect(A1, &ctx, FALSE)) )
{ if ( (rc = write_normalize_space(ctx.stream, A2)) )
rc = closeOutputRedirect(&ctx);
else
discardOutputRedirect(&ctx);
}
return rc;
}
/*******************************
* LOCALE *
*******************************/
#if defined(HAVE_LOCALE_H) && defined(HAVE_SETLOCALE)
#include <locale.h>
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Note: on some installations, locale doesn't work correctly. Printing a
message isn't really cute. It would be better to use printMessage(), but
the system isn't yet initialised far enough. Maybe we should store the
failure and print a message at the end of the initialisation?
We only return FALSE if LC_CTYPE fails. This is a serious indication
that locale support is broken. We don't depend too much on the others,
so we ignore possible problems.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
static int
initLocale()
{ int rc = TRUE;
if ( !setlocale(LC_CTYPE, "") )
{ rc = FALSE;
DEBUG(0, Sdprintf("Failed to set LC_CTYPE locale\n"));
}
if ( !setlocale(LC_TIME, "") )
{ DEBUG(0, Sdprintf("Failed to set LC_TIME locale\n"));
}
if ( !setlocale(LC_COLLATE, "") )
{ DEBUG(0, Sdprintf("Failed to set LC_COLLATE locale\n"));
}
return rc;
}
typedef struct
{ int category;
const char *name;
} lccat;
static lccat lccats[] =
{ { LC_ALL, "all" },
{ LC_COLLATE, "collate" },
{ LC_CTYPE, "ctype" },
#ifdef LC_MESSAGES
{ LC_MESSAGES, "messages" },
#endif
{ LC_MONETARY, "monetary" },
{ LC_NUMERIC, "numeric" },
{ LC_TIME, "time" },
{ 0, NULL }
};
static
PRED_IMPL("setlocale", 3, setlocale, 0)
{ PRED_LD
char *what;
char *locale;
const lccat *lcp;
if ( !PL_get_chars_ex(A1, &what, CVT_ATOM) )
fail;
if ( PL_is_variable(A3) )
locale = NULL;
else if ( !PL_get_chars_ex(A3, &locale, CVT_ATOM) )
fail;
for ( lcp = lccats; lcp->name; lcp++ )
{ if ( streq(lcp->name, what) )
{ char *old = setlocale(lcp->category, NULL);
if ( !PL_unify_chars(A2, PL_ATOM, -1, old) )
fail;
if ( PL_compare(A2, A3) != 0 )
{ if ( !setlocale(lcp->category, locale) )
return PL_error(NULL, 0, MSG_ERRNO, ERR_SYSCALL, "setlocale");
}
succeed;
}
}
return PL_error(NULL, 0, NULL, ERR_DOMAIN,
PL_new_atom("category"), A1);
}
#else
#define initLocale() 1
static
PRED_IMPL("setlocale", 3, setlocale, 0)
{ return notImplemented("setlocale", 3);
}
#endif
/*******************************
* PUBLISH PREDICATES *
*******************************/
BeginPredDefs(ctype)
PRED_DEF("swi_char_type", 2, char_type, PL_FA_NONDETERMINISTIC)
PRED_DEF("swi_code_type", 2, code_type, PL_FA_NONDETERMINISTIC)
PRED_DEF("setlocale", 3, setlocale, 0)
PRED_DEF("swi_downcase_atom", 2, downcase_atom, 0)
PRED_DEF("swi_upcase_atom", 2, upcase_atom, 0)
PRED_DEF("normalize_space", 2, normalize_space, 0)
EndPredDefs
/*******************************
* PROLOG CHARACTERS *
*******************************/
const char _PL_char_types[] = {
/* ^@ ^A ^B ^C ^D ^E ^F ^G ^H ^I ^J ^K ^L ^M ^N ^O 0-15 */
CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT,
/* ^P ^Q ^R ^S ^T ^U ^V ^W ^X ^Y ^Z ^[ ^\ ^] ^^ ^_ 16-31 */
CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT,
/* sp ! " # $ % & ' ( ) * + , - . / 32-47 */
SP, SO, DQ, SY, SY, SO, SY, SQ, PU, PU, SY, SY, PU, SY, SY, SY,
/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 48-63 */
DI, DI, DI, DI, DI, DI, DI, DI, DI, DI, SY, SO, SY, SY, SY, SY,
/* @ A B C D E F G H I J K L M N O 64-79 */
SY, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC,
/* P Q R S T U V W X Y Z [ \ ] ^ _ 80-95 */
UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, PU, SY, PU, SY, UC,
/* ` a b c d e f g h i j k l m n o 96-111 */
SY, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC,
/* p q r s t u v w x y z { | } ~ ^? 112-127 */
LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, PU, PU, PU, SY, CT,
/* 128-159 (C1 controls) */
CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT,
CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT,
/* 160-255 (G1 graphics) */
/* ISO Latin 1 is assumed */
SP, SO, SO, SO, SO, SO, SO, SO, SO, SO, LC, SO, SO, SO, SO, SO,
SO, SO, SO, SO, SO, SO, SO, SO, SO, SO, LC, SO, SO, SO, SO, SO,
UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC, UC,
UC, UC, UC, UC, UC, UC, UC, SO, UC, UC, UC, UC, UC, UC, UC, LC,
LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC, LC,
LC, LC, LC, LC, LC, LC, LC, SO, LC, LC, LC, LC, LC, LC, LC, LC
};
typedef struct
{ const char *name;
IOENC encoding;
} enc_map;
static const enc_map map[] =
{ { "UTF-8", ENC_UTF8 },
{ "utf8", ENC_UTF8 },
{ "ISO8859-1", ENC_ISO_LATIN_1 },
{ "ISO8859_1", ENC_ISO_LATIN_1 },
{ "iso88591", ENC_ISO_LATIN_1 },
{ "iso_8859_1", ENC_ISO_LATIN_1 },
{ NULL, ENC_UNKNOWN }
};
IOENC
initEncoding(void)
{ GET_LD
if ( LD )
{ if ( !LD->encoding )
{ char *enc;
if ( !initLocale() )
{ LD->encoding = ENC_ISO_LATIN_1;
} else if ( (enc = setlocale(LC_CTYPE, NULL)) )
{ LD->encoding = ENC_ANSI; /* text encoding */
if ( (enc = strchr(enc, '.')) )
{ const enc_map *m;
enc++; /* skip '.' */
for ( m=map; m->name; m++ )
{ if ( strcmp(enc, m->name) == 0 )
{ LD->encoding = m->encoding;
break;
}
}
}
} else
{ LD->encoding = ENC_ISO_LATIN_1;
}
}
#if __YAP_PROLOG__
PL_register_extensions(PL_predicates_from_ctype);
#endif
return LD->encoding;
}
return ENC_ANSI;
}
void
initCharTypes(void)
{
initEncoding();
}
#if __SWI_PROLOG__
bool
systemMode(bool accept)
{ GET_LD
bool old = SYSTEM_MODE ? TRUE : FALSE;
if ( accept )
debugstatus.styleCheck |= DOLLAR_STYLE;
else
debugstatus.styleCheck &= ~DOLLAR_STYLE;
return old;
}
#endif