/*  $Id$

    Part of SWI-Prolog

    Author:        Jan Wielemaker and Anjo Anjewierden
    E-mail:        jan@swi.psy.uva.nl
    WWW:           http://www.swi-prolog.org
    Copyright (C): 1985-2002, University of Amsterdam

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

/* NOTE(review): this file reached review with its line structure collapsed
   and with every "<identifier ... >" span removed.  The three system
   header names below and the loops marked "reconstructed" further down
   were restored from the surrounding code; please diff them against the
   upstream revision before merging.
*/

#include "pl-incl.h"
#include "pl-ctype.h"
#include "pl-utf8.h"
#include <errno.h>
#include <stdio.h>
#ifdef __WINDOWS__
#include "pl-mswchar.h"                 /* Terrible hack */
#endif
#if HAVE_LIMITS_H
#include <limits.h>                     /* solaris compatibility */
#endif

#undef LD
#define LD LOCAL_LD

#ifdef __SWI_PROLOG__
/* Fetch the dereferenced word stored in term reference `r`. */
static inline word
valHandle__LD(term_t r ARG_LD)
{
  Word p = valTermRef(r);

  deRef(p);
  return *p;
}

#define valHandle(r) valHandle__LD(r PASS_LD)
#define setHandle(h, w) (*valTermRef(h) = (w))
#endif


                 /*******************************
                 *      UNIFIED TEXT STUFF      *
                 *******************************/

/* Number of bytes needed to hold `len` code units of `text`: one byte per
   unit for the 8-bit encodings, sizeof(pl_wchar_t) for ENC_WCHAR.
*/
static inline size_t
bufsize_text(PL_chars_t *text, size_t len)
{
  size_t unit;

  switch (text->encoding) {
    case ENC_ISO_LATIN_1:
    case ENC_ASCII:
    case ENC_UTF8:
    case ENC_ANSI:
      unit = sizeof(char);
      break;
    case ENC_WCHAR:
      unit = sizeof(pl_wchar_t);
      break;
    default:
      assert(0);
      unit = sizeof(char);              /*NOTREACHED*/
  }

  return len * unit;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_save_text(PL_chars_t *text, int flags)

Move the text out of short-lived storage.  With BUF_MALLOC in `flags` and
text that is not already PL_CHARS_MALLOC, the text (including its
terminator) is copied into PL_malloc()ed memory.  Otherwise, text living in
the local buffer (PL_CHARS_LOCAL) is copied to a ring buffer.  All other
cases are left untouched.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

void
PL_save_text(PL_chars_t *text, int flags)
{
  if ((flags & BUF_MALLOC) && text->storage != PL_CHARS_MALLOC) {
    size_t bl = bufsize_text(text, text->length + 1);
    void *new = PL_malloc(bl);

    memcpy(new, text->text.t, bl);
    text->text.t = new;
    text->storage = PL_CHARS_MALLOC;
  } else if (text->storage == PL_CHARS_LOCAL) {
    Buffer b = findBuffer(BUF_RING);
    size_t bl = bufsize_text(text, text->length + 1);

    addMultipleBuffer(b, text->text.t, bl, char);
    text->text.t = baseBuffer(b, char);
    text->storage = PL_CHARS_RING;
  }
}


/* Write term `l` into `text`, trying ISO Latin-1 first and wide characters
   second.  Output goes to text->buf; if the memory stream had to allocate
   a larger area the result is marked PL_CHARS_MALLOC.  Returns TRUE on
   success and FALSE if the term could not be written in either encoding.
*/
static int
write_term_to_text(term_t l, PL_chars_t *text)
{
  IOENC encodings[3];
  IOENC *enc;

  encodings[0] = ENC_ISO_LATIN_1;
  encodings[1] = ENC_WCHAR;
  encodings[2] = ENC_UNKNOWN;           /* end-of-list sentinel */

  for (enc = encodings; *enc != ENC_UNKNOWN; enc++) {
    char *r = text->buf;
    size_t size = sizeof(text->buf);
    IOSTREAM *fd = Sopenmem(&r, &size, "w");

    fd->encoding = *enc;
    if (PL_write_term(fd, l, 1200, 0) &&
        Sputcode(EOS, fd) >= 0 &&
        Sflush(fd) >= 0) {
      text->encoding = *enc;
      text->storage = (r == text->buf ? PL_CHARS_LOCAL : PL_CHARS_MALLOC);
      text->canonical = TRUE;

      /* `size` counts bytes and includes the EOS written above */
      if (*enc == ENC_ISO_LATIN_1) {
        text->length = size - 1;
        text->text.t = r;
      } else {
        text->length = (size / sizeof(pl_wchar_t)) - 1;
        text->text.w = (pl_wchar_t *)r;
      }

      Sclose(fd);
      return TRUE;
    }

    Sclose(fd);
    if (r != text->buf)
      Sfree(r);
  }

  return FALSE;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_get_text__LD(term_t l, PL_chars_t *text, int flags)

Fill `text` from term `l`.  The CVT_* bits in `flags` select which term
types are accepted (CVT_ATOM, CVT_STRING, CVT_INTEGER, CVT_FLOAT, CVT_LIST,
CVT_VARIABLE); CVT_WRITE is the fallback that writes the term.  Returns
TRUE on success.  On failure it returns FALSE, after raising a type error
if CVT_EXCEPTION is set.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_get_text__LD(term_t l, PL_chars_t *text, int flags ARG_LD)
{
  /* NOTE(review): valHandle__LD() above returns `word`, yet this is
     declared `Word`.  Left as found; confirm the intended type against
     pl-incl.h.
  */
  Word w = valHandle(l);

  if ((flags & CVT_ATOM) && isAtom(w)) {
    if (!get_atom_text(w, text))
      goto maybe_write;
  } else if ((flags & CVT_STRING) && isString(w)) {
    if (!get_string_text(w, text PASS_LD))
      goto maybe_write;
  } else if ((flags & CVT_INTEGER) && isInteger(w)) {
    number n;

    PL_get_number(l, &n);
    switch (n.type) {
      case V_INTEGER:
        sprintf(text->buf, INT64_FORMAT, n.value.i);
        text->text.t = text->buf;
        text->length = strlen(text->text.t);
        text->storage = PL_CHARS_LOCAL;
        break;
#ifdef O_GMP
      case V_MPZ: {
        /* +2 as in the original sizing: room beyond the digit count */
        size_t sz = mpz_sizeinbase(n.value.mpz, 10) + 2;
        Buffer b = findBuffer(BUF_RING);

        growBuffer(b, sz);
        mpz_get_str(b->base, 10, n.value.mpz);
        b->top = b->base + strlen(b->base);
        text->text.t = baseBuffer(b, char);
        text->length = entriesBuffer(b, char);
        text->storage = PL_CHARS_RING;
        break;
      }
#endif
      default:
        assert(0);
    }
    text->encoding = ENC_ISO_LATIN_1;
    text->canonical = TRUE;
  } else if ((flags & CVT_FLOAT) && isReal(w)) {
    format_float(valReal(w), text->buf, LD->float_format);
    text->text.t = text->buf;
    text->length = strlen(text->text.t);
    text->encoding = ENC_ISO_LATIN_1;
    text->storage = PL_CHARS_LOCAL;
    text->canonical = TRUE;
  } else if ((flags & CVT_LIST) && (isList(w) || isNil(w))) {
    Buffer b;

    /* try the list as 8-bit text first, then as wide text */
    if ((b = codes_or_chars_to_buffer(l, BUF_RING, FALSE))) {
      text->length = entriesBuffer(b, char);
      addBuffer(b, EOS, char);
      text->text.t = baseBuffer(b, char);
      text->encoding = ENC_ISO_LATIN_1;
    } else if ((b = codes_or_chars_to_buffer(l, BUF_RING, TRUE))) {
      text->length = entriesBuffer(b, pl_wchar_t);
      addBuffer(b, EOS, pl_wchar_t);
      text->text.w = baseBuffer(b, pl_wchar_t);
      text->encoding = ENC_WCHAR;
    } else {
      goto maybe_write;
    }

    text->storage = PL_CHARS_RING;
    text->canonical = TRUE;
  } else if ((flags & CVT_VARIABLE) && isVar(w)) {
    text->text.t = varName(l, text->buf);
    text->length = strlen(text->text.t);
    text->encoding = ENC_ISO_LATIN_1;
    text->storage = PL_CHARS_LOCAL;
    text->canonical = TRUE;
  } else if ((flags & CVT_WRITE)) {
    if (write_term_to_text(l, text))
      return TRUE;
    goto error;
  } else {
    goto error;
  }

  succeed;

maybe_write:
  /* a typed conversion failed: fall back to write if the caller allows */
  if ((flags & CVT_WRITE) && write_term_to_text(l, text))
    return TRUE;

error:
  if ((flags & CVT_EXCEPTION)) {
    atom_t expected;

    if (flags & CVT_LIST)
      expected = ATOM_text;
    else if (flags & CVT_NUMBER)
      expected = ATOM_atomic;
    else
      expected = ATOM_atom;

    return PL_error(NULL, 0, NULL, ERR_TYPE, expected, l);
  }

  fail;
}


/* Canonise `text` and look it up as an atom (8-bit or UCS). */
atom_t
textToAtom(PL_chars_t *text)
{
  PL_canonise_text(text);

  if (text->encoding == ENC_ISO_LATIN_1)
    return lookupAtom(text->text.t, text->length);

  return lookupUCSAtom(text->text.w, text->length);
}


#if __SWI_PROLOG__
/* Canonise `text` and create a string on the global stack from it. */
word
textToString(PL_chars_t *text)
{
  PL_canonise_text(text);

  if (text->encoding == ENC_ISO_LATIN_1)
    return globalString(text->length, text->text.t);

  return globalWString(text->length, text->text.w);
}
#endif


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_unify_text(term_t term, term_t tail, PL_chars_t *text, int type)

Unify `term` with `text` represented as `type`: PL_ATOM, PL_STRING,
PL_CODE_LIST or PL_CHAR_LIST.  For the list types a non-zero `tail` makes
the result a difference list.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_unify_text(term_t term, term_t tail, PL_chars_t *text, int type)
{
  switch (type) {
    case PL_ATOM: {
      atom_t a = textToAtom(text);
      int rval = _PL_unify_atomic(term, a);

      PL_unregister_atom(a);
      return rval;
    }
    case PL_STRING:
#if __SWI_PROLOG__
    {
      word w = textToString(text);

      return _PL_unify_atomic(term, w);
    }
#endif
    case PL_CODE_LIST:
    case PL_CHAR_LIST: {
      if (text->length == 0) {
        if (tail) {
          GET_LD
          PL_put_term(tail, term);
          return TRUE;
        } else {
          return PL_unify_nil(term);
        }
      } else {
        GET_LD
        word p0, p;

        switch (text->encoding) {
          case ENC_ISO_LATIN_1: {
            const unsigned char *s = (const unsigned char *)text->text.t;
            const unsigned char *e = &s[text->length];

            p0 = p = INIT_SEQ_CODES(text->length);
            if (type == PL_CODE_LIST) {
              for (; s < e; s++)
                p = EXTEND_SEQ_CODES(p, *s);
            } else {
              for (; s < e; s++)
                p = EXTEND_SEQ_ATOMS(p, *s);
            }
            break;
          }
          case ENC_WCHAR: {
            const pl_wchar_t *s = (const pl_wchar_t *)text->text.t;
            const pl_wchar_t *e = &s[text->length];

            p0 = p = INIT_SEQ_CODES(text->length);
            if (type == PL_CODE_LIST) {
              for (; s < e; s++)
                p = EXTEND_SEQ_CODES(p, *s);
            } else {
              for (; s < e; s++)
                p = EXTEND_SEQ_ATOMS(p, *s);
            }
            break;
          }
          case ENC_UTF8: {
            const char *s = text->text.t;
            const char *e = &s[text->length];
            size_t len = utf8_strlen(s, text->length);

            p0 = p = INIT_SEQ_CODES(len);
            if (type == PL_CODE_LIST) {
              while (s < e) {
                int chr;

                s = utf8_get_char(s, &chr);
                p = EXTEND_SEQ_CODES(p, chr);
              }
            } else {
              while (s < e) {
                int chr;

                s = utf8_get_char(s, &chr);
                p = EXTEND_SEQ_ATOMS(p, chr);
              }
            }
            break;
          }
          case ENC_ANSI: {
            const char *s = text->text.t;
            size_t rc, n = text->length;
            size_t len = 0;
            mbstate_t mbs;
            wchar_t wc;

            /* pass 1: count the characters so the list can be sized */
            memset(&mbs, 0, sizeof(mbs));
            while (n > 0 && (rc = mbrtowc(&wc, s, n, &mbs)) != (size_t)-1) {
              len++;
              n -= rc;
              s += rc;
            }

            p0 = p = INIT_SEQ_CODES(len);

            /* pass 2: rewind and emit.  The loop condition is the same as
               in pass 1, so exactly `len` cells are produced.  Previously
               `s` was left at the end of the text and conversion errors
               were not checked here.
            */
            memset(&mbs, 0, sizeof(mbs));
            s = text->text.t;
            n = text->length;
            while (n > 0 && (rc = mbrtowc(&wc, s, n, &mbs)) != (size_t)-1) {
              if (type == PL_CODE_LIST)
                p = EXTEND_SEQ_CODES(p, wc);
              else
                p = EXTEND_SEQ_ATOMS(p, wc);

              s += rc;
              n -= rc;
            }
            break;
          }
          default: {
            assert(0);
            return FALSE;
          }
        }

        return CLOSE_SEQ_OF_CODES(p, p0, tail, term);
      }
    }
    default: {
      assert(0);
      return FALSE;
    }
  }
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_unify_text_range(term_t term, PL_chars_t *text,
                    size_t offset, size_t len, int type)

Unify `term` with the `len` code units of `text` starting at `offset`.
Fails if the range does not lie inside the text.  Text that is not
ENC_ISO_LATIN_1 is indexed as wide characters.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_unify_text_range(term_t term, PL_chars_t *text,
                    size_t offset, size_t len, int type)
{
  PL_chars_t sub;
  int rc;

  if (offset == 0 && len == text->length)
    return PL_unify_text(term, 0, text, type);

  /* written so that offset+len cannot wrap around */
  if (offset > text->length || len > text->length - offset)
    return FALSE;

  sub.length = len;
  sub.storage = PL_CHARS_HEAP;          /* borrowed: PL_free_text() skips it */
  if (text->encoding == ENC_ISO_LATIN_1) {
    sub.text.t = text->text.t + offset;
    sub.encoding = ENC_ISO_LATIN_1;
    sub.canonical = TRUE;
  } else {
    sub.text.w = text->text.w + offset;
    sub.encoding = ENC_WCHAR;
    sub.canonical = FALSE;              /* the slice may fit in 8 bits */
  }

  rc = PL_unify_text(term, 0, &sub, type);
  PL_free_text(&sub);

  return rc;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int PL_promote_text(PL_chars_t *text)

Promote a text to UCS if it is currently 8-bit text.  The widened text
stays in the same kind of storage when it was malloc()ed or when it still
fits the local buffer; otherwise it moves to a ring buffer.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_promote_text(PL_chars_t *text)
{
  if (text->encoding != ENC_WCHAR) {
    if (text->storage == PL_CHARS_MALLOC) {
      pl_wchar_t *new = PL_malloc(sizeof(pl_wchar_t) * (text->length + 1));
      pl_wchar_t *t = new;
      const unsigned char *s = (const unsigned char *)text->text.t;
      const unsigned char *e = &s[text->length];

      /* reconstructed: widen each byte, terminate, release the old copy */
      while (s < e)
        *t++ = *s++;
      *t = EOS;

      PL_free(text->text.t);
      text->text.w = new;
      text->encoding = ENC_WCHAR;
    } else if (text->storage == PL_CHARS_LOCAL &&
               (text->length + 1) * sizeof(pl_wchar_t) < sizeof(text->buf)) {
      unsigned char buf[sizeof(text->buf)];
      unsigned char *f = buf;
      unsigned char *e = &buf[text->length];
      pl_wchar_t *t = (pl_wchar_t *)text->buf;

      /* widen via a scratch copy: source and target share text->buf */
      memcpy(buf, text->buf, text->length * sizeof(char));
      while (f < e)                     /* reconstructed */
        *t++ = *f++;
      *t = EOS;
      text->encoding = ENC_WCHAR;
    } else {
      Buffer b = findBuffer(BUF_RING);
      const unsigned char *s = (const unsigned char *)text->text.t;
      const unsigned char *e = &s[text->length];

      for (; s < e; s++)                /* reconstructed */
        addBuffer(b, *s, pl_wchar_t);
      addBuffer(b, EOS, pl_wchar_t);

      text->text.w = baseBuffer(b, pl_wchar_t);
      text->encoding = ENC_WCHAR;
      text->storage = PL_CHARS_RING;
    }
  }

  succeed;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int PL_demote_text(PL_chars_t *text)

Narrow wide text to ISO Latin-1.  Returns FALSE, leaving `text` as it was,
if any character is above 0xff.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_demote_text(PL_chars_t *text)
{
  if (text->encoding != ENC_ISO_LATIN_1) {
    if (text->storage == PL_CHARS_MALLOC) {
      char *new = PL_malloc(sizeof(char) * (text->length + 1));
      char *t = new;
      const pl_wchar_t *s = (const pl_wchar_t *)text->text.t;
      const pl_wchar_t *e = &s[text->length];

      while (s < e) {                   /* loop header reconstructed */
        if (*s > 0xff) {
          PL_free(new);
          return FALSE;
        }
        *t++ = *s++ & 0xff;
      }
      *t = EOS;

      PL_free(text->text.t);
      text->text.t = new;
      text->encoding = ENC_ISO_LATIN_1;
    } else if (text->storage == PL_CHARS_LOCAL) {
      pl_wchar_t buf[sizeof(text->buf) / sizeof(pl_wchar_t)];
      pl_wchar_t *f = buf;
      pl_wchar_t *e = &buf[text->length];
      char *t = text->buf;

      memcpy(buf, text->buf, text->length * sizeof(pl_wchar_t));

      /* Validate before writing: source and target share text->buf, so
         failing half-way through would leave the wide text clobbered.
      */
      for (; f < e; f++) {
        if (*f > 0xff)
          return FALSE;
      }

      for (f = buf; f < e; f++)
        *t++ = *f & 0xff;
      *t = EOS;
      text->encoding = ENC_ISO_LATIN_1;
    } else {
      Buffer b = findBuffer(BUF_RING);
      const pl_wchar_t *s = (const pl_wchar_t *)text->text.w;
      const pl_wchar_t *e = &s[text->length];

      for (; s < e; s++) {              /* loop header reconstructed */
        if (*s > 0xff) {
          unfindBuffer(BUF_RING);
          return FALSE;
        }
        addBuffer(b, *s & 0xff, char);
      }
      addBuffer(b, EOS, char);

      text->text.t = baseBuffer(b, char);
      text->storage = PL_CHARS_RING;
      text->encoding = ENC_ISO_LATIN_1;
    }
  }

  succeed;
}


/* TRUE if `text` is ISO Latin-1 already, or is wide text in which every
   character is <= 0xff.
*/
static int
can_demote(PL_chars_t *text)
{
  if (text->encoding != ENC_ISO_LATIN_1) {
    const pl_wchar_t *w = (const pl_wchar_t *)text->text.w;
    const pl_wchar_t *e = &w[text->length];

    for (; w < e; w++) {                /* loop header reconstructed */
      if (*w > 0xff)
        return FALSE;
    }
  }

  return TRUE;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Convert text to 8-bit according to flags.   May hold REP_UTF8 to convert
to UTF-8, REP_MB to convert to locale 8-bit representation or nothing to
convert to ISO Latin-1. This predicate can fail if the text cannot be
represented.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* Append the locale multibyte encoding of `c` to `buf`.  Returns FALSE if
   wcrtomb() cannot represent the character.
*/
static int
wctobuffer(wchar_t c, mbstate_t *mbs, Buffer buf)
{
  char b[MB_LEN_MAX];
  size_t n;

  if ((n = wcrtomb(b, c, mbs)) != (size_t)-1) {
    size_t i;

    for (i = 0; i < n; i++)             /* reconstructed */
      addBuffer(buf, b[i], char);

    return TRUE;
  }

  return FALSE;                         /* cannot represent */
}


/* Append the UTF-8 encoding of `c` to `buf`.
   NOTE(review): reconstructed helper; the original text between
   wctobuffer() and the body of PL_mb_text() was lost.
*/
static void
utf8tobuffer(wchar_t c, Buffer buf)
{
  if (c <= 0x7f) {
    addBuffer(buf, (char)c, char);
  } else {
    char tmp[8];
    const char *end = utf8_put_char(tmp, (int)c);
    const char *q;

    for (q = tmp; q < end; q++)
      addBuffer(buf, *q, char);
  }
}


/* NOTE(review): the signature, the declarations and the conversion loops
   of this function are reconstructed from the surviving fragments (the
   `target` tests, the use of `norep`, and the sizeOfBuffer(b)-1 length
   computation, which implies a terminator is appended to the buffer).
*/
int
PL_mb_text(PL_chars_t *text, int flags)
{
  int norep = -1;
  IOENC target = ((flags & REP_UTF8) ? ENC_UTF8 :
                  (flags & REP_MB)   ? ENC_ANSI : ENC_ISO_LATIN_1);

  if (text->encoding != target) {
    Buffer b = findBuffer(BUF_RING);

    switch (text->encoding) {
      case ENC_ISO_LATIN_1: {
        const unsigned char *s = (const unsigned char *)text->text.t;
        const unsigned char *e = &s[text->length];

        if (target == ENC_UTF8) {
          for (; s < e; s++)
            utf8tobuffer(*s, b);
          addBuffer(b, 0, char);
        } else {                        /* target == ENC_ANSI */
          mbstate_t mbs;

          memset(&mbs, 0, sizeof(mbs));
          for (; s < e; s++) {
            if (!wctobuffer(*s, &mbs, b)) {
              unfindBuffer(BUF_RING);
              norep = *s;
              goto rep_error;
            }
          }
          wctobuffer(0, &mbs, b);       /* terminator */
        }
        break;
      }
      case ENC_WCHAR: {
        if (target == ENC_ISO_LATIN_1) {
          return PL_demote_text(text);
        } else {
          const pl_wchar_t *w = (const pl_wchar_t *)text->text.w;
          const pl_wchar_t *e = &w[text->length];

          if (target == ENC_UTF8) {
            for (; w < e; w++)
              utf8tobuffer(*w, b);
            addBuffer(b, 0, char);
          } else {                      /* target == ENC_ANSI */
            mbstate_t mbs;

            memset(&mbs, 0, sizeof(mbs));
            for (; w < e; w++) {
              if (!wctobuffer(*w, &mbs, b)) {
                unfindBuffer(BUF_RING);
                norep = *w;
                goto rep_error;
              }
            }
            wctobuffer(0, &mbs, b);     /* terminator */
          }
        }
        break;
      }
      default:
        assert(0);
    }

    text->length = sizeOfBuffer(b) - 1; /* do not count the terminator */
    text->text.t = baseBuffer(b, char);
    text->encoding = target;
    text->storage = PL_CHARS_RING;
  }

  succeed;

rep_error:
  if ((flags & CVT_EXCEPTION)) {
    char msg[128];

    sprintf(msg,
            "Cannot represent char U%04x using %s encoding",
            norep,
            target == ENC_ISO_LATIN_1 ? "ISO Latin-1" : "current locale");

    return PL_error(NULL, 0, msg, ERR_REPRESENTATION, ATOM_encoding);
  }

  fail;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int PL_canonise_text(PL_chars_t *text)

Bring `text` into canonical form: ISO Latin-1 if every character fits in
8 bits, wide characters otherwise.  UTF-8 and locale multibyte (ENC_ANSI)
input is decoded.  Does nothing if text->canonical is already set.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_canonise_text(PL_chars_t *text)
{
  if (!text->canonical) {
    switch (text->encoding) {
      case ENC_ISO_LATIN_1:
        break;                          /* nothing to do */
      case ENC_WCHAR: {
        const pl_wchar_t *w = (const pl_wchar_t *)text->text.w;
        const pl_wchar_t *e = &w[text->length];

        /* Wide text that really needs wide characters yields FALSE and is
           left unchanged (behaviour kept from the original).
        */
        for (; w < e; w++) {            /* loop header reconstructed */
          if (*w > 0xff)
            return FALSE;
        }

        return PL_demote_text(text);
      }
      case ENC_UTF8: {
        const char *s = text->text.t;
        const char *e = &s[text->length];

        /* reconstructed: skip the leading run of plain 7-bit bytes */
        while (s < e && !(*s & 0x80))
          s++;

        if (s == e) {                   /* pure ASCII: relabel in place */
          text->encoding = ENC_ISO_LATIN_1;
          text->canonical = TRUE;
        } else {
          int chr;
          int wide = FALSE;
          size_t len = s - text->text.t;
          /* the decoded copy replaces a malloc()ed input, which must then
             be released (it used to leak)
          */
          void *do_free = (text->storage == PL_CHARS_MALLOC ? text->text.t
                                                            : NULL);

          while (s < e) {               /* loop header reconstructed */
            s = utf8_get_char(s, &chr);
            if (chr > 0xff)             /* requires wide characters */
              wide = TRUE;
            len++;
          }

          s = (const char *)text->text.t;
          text->length = len;

          if (wide) {
            pl_wchar_t *to = PL_malloc(sizeof(pl_wchar_t) * (len + 1));

            text->text.w = to;
            while (s < e) {             /* reconstructed */
              s = utf8_get_char(s, &chr);
              *to++ = chr;
            }
            *to = EOS;

            text->encoding = ENC_WCHAR;
            text->storage = PL_CHARS_MALLOC;
          } else {
            char *to = PL_malloc(len + 1);

            text->text.t = to;
            while (s < e) {             /* reconstructed */
              s = utf8_get_char(s, &chr);
              *to++ = chr;
            }
            *to = EOS;

            text->encoding = ENC_ISO_LATIN_1;
            text->storage = PL_CHARS_MALLOC;
          }

          text->canonical = TRUE;
          if (do_free)
            PL_free(do_free);
        }

        succeed;
      }
      case ENC_ANSI: {
        mbstate_t mbs;
        size_t len = 0;
        int iso = TRUE;
        char *s = text->text.t;
        size_t rc, n = text->length;
        wchar_t wc;

        /* pass 1: count characters and find out whether 8 bits suffice */
        memset(&mbs, 0, sizeof(mbs));
        while (n > 0 && (rc = mbrtowc(&wc, s, n, &mbs)) != (size_t)-1) {
          if (wc > 0xff)
            iso = FALSE;
          len++;
          n -= rc;
          s += rc;
        }

        if (n == 0) {                   /* the whole input decoded */
          const char *from = text->text.t;
          void *do_free;

          n = text->length;
          memset(&mbs, 0, sizeof(mbs));

          if (text->storage == PL_CHARS_MALLOC)
            do_free = text->text.t;
          else
            do_free = NULL;

          if (iso) {
            char *to;

            text->encoding = ENC_ISO_LATIN_1;
            if (len + 1 < sizeof(text->buf)) {
              text->text.t = text->buf;
              text->storage = PL_CHARS_LOCAL;
            } else {
              text->text.t = PL_malloc(len + 1);
              text->storage = PL_CHARS_MALLOC;
            }

            to = text->text.t;
            while (n > 0 &&
                   (rc = mbrtowc(&wc, from, n, &mbs)) != (size_t)-1) {
              *to++ = (char)wc;
              n -= rc;
              from += rc;
            }
            *to = EOS;
          } else {
            wchar_t *to;
            char b2[sizeof(text->buf)];

            text->encoding = ENC_WCHAR;
            if (len + 1 < sizeof(text->buf) / sizeof(wchar_t)) {
              if (text->text.t == text->buf) {
                /* input lives in the buffer we are about to overwrite */
                memcpy(b2, text->buf, sizeof(text->buf));
                from = b2;
              }
              text->text.w = (wchar_t *)text->buf;
              /* The result now lives in text->buf.  Without this the old
                 storage class stayed in effect, so a formerly malloc()ed
                 text would later have text->buf passed to PL_free().
              */
              text->storage = PL_CHARS_LOCAL;
            } else {
              text->text.w = PL_malloc((len + 1) * sizeof(wchar_t));
              text->storage = PL_CHARS_MALLOC;
            }

            to = text->text.w;
            while (n > 0 &&
                   (rc = mbrtowc(&wc, from, n, &mbs)) != (size_t)-1) {
              *to++ = wc;
              n -= rc;
              from += rc;
            }
            *to = EOS;
          }

          text->length = len;
          text->canonical = TRUE;
          if (do_free)
            PL_free(do_free);

          succeed;
        }

        fail;
      }
      default:
        assert(0);
    }
  }

  succeed;
}


/* Release the text if (and only if) it is held in malloc()ed storage. */
void
PL_free_text(PL_chars_t *text)
{
  if (text->storage == PL_CHARS_MALLOC)
    PL_free(text->text.t);
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
void PL_text_recode(PL_chars_t *text, IOENC encoding)

Recode `text` to `encoding`.  Only ENC_UTF8 is supported as target, from
ENC_ASCII, ENC_ISO_LATIN_1 or ENC_WCHAR input.  Converted text is placed
in a ring buffer.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

void
PL_text_recode(PL_chars_t *text, IOENC encoding)
{
  if (text->encoding == encoding)
    return;

  switch (encoding) {
    case ENC_UTF8:
      switch (text->encoding) {
        case ENC_ASCII:
          text->encoding = ENC_UTF8;    /* ASCII is valid UTF-8 as is */
          break;
        case ENC_ISO_LATIN_1: {
          Buffer b = findBuffer(BUF_RING);
          const unsigned char *s = (const unsigned char *)text->text.t;
          const unsigned char *e = &s[text->length];
          char tmp[8];

          /* NOTE(review): loop reconstructed to mirror the ENC_WCHAR case
             below, including the PL_free_text() of the old text.
          */
          for (; s < e; s++) {
            if (*s & 0x80) {
              const char *end = utf8_put_char(tmp, *s);
              const char *q;

              for (q = tmp; q < end; q++)
                addBuffer(b, *q, char);
            } else {
              addBuffer(b, *s, char);
            }
          }
          PL_free_text(text);

          text->length = entriesBuffer(b, char);
          addBuffer(b, EOS, char);
          text->text.t = baseBuffer(b, char);
          text->encoding = ENC_UTF8;
          text->storage = PL_CHARS_RING;
          break;
        }
        case ENC_WCHAR: {
          Buffer b = findBuffer(BUF_RING);
          const pl_wchar_t *s = text->text.w;
          const pl_wchar_t *e = &s[text->length];
          char tmp[8];

          for (; s < e; s++) {          /* loop header reconstructed */
            if (*s > 0x7f) {
              const char *end = utf8_put_char(tmp, (int)*s);
              const char *q = tmp;

              for (q = tmp; q < end; q++)       /* reconstructed */
                addBuffer(b, *q & 0xff, char);
            } else {
              addBuffer(b, *s & 0xff, char);
            }
          }
          PL_free_text(text);           /* reconstructed, see note above */

          text->length = entriesBuffer(b, char);
          addBuffer(b, EOS, char);
          text->text.t = baseBuffer(b, char);
          text->encoding = ENC_UTF8;
          text->storage = PL_CHARS_RING;
          break;
        }
        default:
          assert(0);
      }
      break;
    default:
      assert(0);
  }
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_cmp_text(PL_chars_t *t1, size_t o1, PL_chars_t *t2, size_t o2, size_t len)

Compares two substrings of two text representations.  At most `len`
characters are compared, starting at offset `o1` in `t1` and `o2` in `t2`.
Returns -1, 0 or 1.  If the compared part is equal but one text ended
before `len` characters, the shorter text orders first.  Text that is not
ENC_ISO_LATIN_1 is read as wide characters.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_cmp_text(PL_chars_t *t1, size_t o1, PL_chars_t *t2, size_t o2,
            size_t len)
{
  ssize_t l = len;
  int ifeq = 0;                         /* result if the common part is equal */

  if (l > (ssize_t)(t1->length - o1)) {
    l = t1->length - o1;
    ifeq = -1;                          /* first is short */
  }
  if (l > (ssize_t)(t2->length - o2)) {
    l = t2->length - o2;
    if (ifeq == 0)
      ifeq = 1;
  }

  if (l == 0)                           /* too long offsets */
    return ifeq;

  if (t1->encoding == ENC_ISO_LATIN_1 && t2->encoding == ENC_ISO_LATIN_1) {
    const unsigned char *s = (const unsigned char *)t1->text.t + o1;
    const unsigned char *q = (const unsigned char *)t2->text.t + o2;

    for (; l-- > 0 && *s == *q; s++, q++)
      ;
    if (l < 0)
      return ifeq;
    else
      return *s > *q ? 1 : -1;
  } else if (t1->encoding == ENC_WCHAR && t2->encoding == ENC_WCHAR) {
    const pl_wchar_t *s = t1->text.w + o1;
    const pl_wchar_t *q = t2->text.w + o2;

    for (; l-- > 0 && *s == *q; s++, q++)
      ;
    if (l < 0)
      return ifeq;
    else
      return *s > *q ? 1 : -1;
  } else if (t1->encoding == ENC_ISO_LATIN_1 && t2->encoding == ENC_WCHAR) {
    const unsigned char *s = (const unsigned char *)t1->text.t + o1;
    const pl_wchar_t *q = t2->text.w + o2;

    for (; l-- > 0 && *s == *q; s++, q++)
      ;
    if (l < 0)
      return ifeq;
    else
      return *s > *q ? 1 : -1;
  } else {
    const pl_wchar_t *s = t1->text.w + o1;
    const unsigned char *q = (const unsigned char *)t2->text.t + o2;

    for (; l-- > 0 && *s == *q; s++, q++)
      ;
    if (l < 0)
      return ifeq;
    else
      return *s > *q ? 1 : -1;
  }
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int PL_concat_text(int n, PL_chars_t **text, PL_chars_t *result)

Concatenate the `n` texts in `text` into `result`.  The result is ISO
Latin-1 if every input can be demoted and wide otherwise.  It is stored in
result->buf when it fits and in PL_malloc()ed memory when it does not.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_concat_text(int n, PL_chars_t **text, PL_chars_t *result)
{
  size_t total_length = 0;
  int latin = TRUE;
  int i;

  for (i = 0; i < n; i++) {             /* loop reconstructed */
    if (latin && !can_demote(text[i]))
      latin = FALSE;
    total_length += text[i]->length;
  }

  result->canonical = TRUE;
  result->length = total_length;

  if (latin) {
    char *to;

    result->encoding = ENC_ISO_LATIN_1;
    if (total_length + 1 < sizeof(result->buf)) {
      result->text.t = result->buf;
      result->storage = PL_CHARS_LOCAL;
    } else {
      result->text.t = PL_malloc(total_length + 1);
      result->storage = PL_CHARS_MALLOC;
    }

    for (to = result->text.t, i = 0; i < n; i++) {
      if (text[i]->encoding == ENC_ISO_LATIN_1) {
        memcpy(to, text[i]->text.t, text[i]->length);
        to += text[i]->length;
      } else {
        /* can_demote() also accepts wide input whose characters all fit
           in 8 bits.  Such input must be narrowed per character; copying
           its bytes (as was done before) produces garbage.
        */
        const pl_wchar_t *f = text[i]->text.w;
        const pl_wchar_t *e = &f[text[i]->length];

        while (f < e)
          *to++ = (char)(*f++ & 0xff);
      }
    }
    *to = EOS;
  } else {
    pl_wchar_t *to;

    result->encoding = ENC_WCHAR;
    if (total_length + 1 < sizeof(result->buf) / sizeof(pl_wchar_t)) {
      result->text.w = (pl_wchar_t *)result->buf;
      result->storage = PL_CHARS_LOCAL;
    } else {
      result->text.w = PL_malloc((total_length + 1) * sizeof(pl_wchar_t));
      result->storage = PL_CHARS_MALLOC;
    }

    for (to = result->text.w, i = 0; i < n; i++) {
      if (text[i]->encoding == ENC_WCHAR) {
        memcpy(to, text[i]->text.w, text[i]->length * sizeof(pl_wchar_t));
        to += text[i]->length;
      } else {
        const unsigned char *f = (const unsigned char *)text[i]->text.t;
        const unsigned char *e = &f[text[i]->length];

        while (f < e)                   /* reconstructed */
          *to++ = *f++;
      }
    }
    assert((size_t)(to - result->text.w) == total_length);
    *to = EOS;
  }

  return TRUE;
}


/* Open a read-only stream on `txt`.  Only mode "r" is accepted; any other
   mode sets errno to EINVAL and returns NULL.  The stream takes over the
   encoding of the text.
*/
IOSTREAM *
Sopen_text(PL_chars_t *txt, const char *mode)
{
  IOSTREAM *stream;

  if (!streq(mode, "r")) {
    errno = EINVAL;
    return NULL;
  }

  stream = Sopen_string(NULL,
                        txt->text.t,
                        bufsize_text(txt, txt->length),
                        mode);
  if (stream == NULL)                   /* do not dereference a failed open */
    return NULL;

  stream->encoding = txt->encoding;

  return stream;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int PL_unify_chars(term_t t, int flags, size_t len, const char *s)

Unify `t` with the `len` bytes at `s`.  If `len` is (size_t)-1, strlen(s)
is used.  The REP_UTF8/REP_MB bits of `flags` give the encoding of `s`
(default ISO Latin-1).  PL_DIFF_LIST makes `t+1` the tail of the result.
The remaining bits are passed to PL_unify_text() as the target type.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
PL_unify_chars(term_t t, int flags, size_t len, const char *s)
{
  PL_chars_t text;
  term_t tail;
  int rc;

  if (len == (size_t)-1)
    len = strlen(s);

  text.text.t = (char *)s;
  text.encoding = ((flags & REP_UTF8) ? ENC_UTF8 :
                   (flags & REP_MB)   ? ENC_ANSI : ENC_ISO_LATIN_1);
  text.storage = PL_CHARS_HEAP;
  text.length = len;
  text.canonical = FALSE;

  flags &= ~(REP_UTF8 | REP_MB | REP_ISO_LATIN_1);

  if ((flags & PL_DIFF_LIST)) {
    tail = t + 1;
    flags &= (~PL_DIFF_LIST);
  } else {
    tail = 0;
  }

  rc = PL_unify_text(t, tail, &text, flags);
  PL_free_text(&text);                  /* frees a copy made while canonising */

  return rc;
}