This repository has been archived on 2023-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
yap-6.3/os/pl-text.c
Vítor Santos Costa c629ae9283 doc updates
2014-04-09 12:39:52 +01:00

1282 lines
29 KiB
C

/* $Id$
Part of SWI-Prolog
Author: Jan Wielemaker and Anjo Anjewierden
E-mail: jan@swi.psy.uva.nl
WWW: http://www.swi-prolog.org
Copyright (C): 1985-2002, University of Amsterdam
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "pl-incl.h"
#include "pl-ctype.h"
#include "pl-utf8.h"
#include "pl-codelist.h"
#include <errno.h>
#include <stdio.h>
#if HAVE_LIMITS_H
#include <limits.h> /* solaris compatibility */
#endif
#undef LD
#define LD LOCAL_LD
/*******************************
* UNIFIED TEXT STUFF *
*******************************/
/* Return the number of bytes occupied by `len` characters of `text`,
   based on the character width implied by text->encoding.  Encodings
   other than the ones listed trip the assertion. */
static inline size_t
bufsize_text(PL_chars_t *text, size_t len)
{ switch(text->encoding)
  { case ENC_WCHAR:
      return len*sizeof(pl_wchar_t);
    case ENC_ISO_LATIN_1:
    case ENC_ASCII:
    case ENC_UTF8:
    case ENC_ANSI:
      return len*sizeof(char);
    default:
      assert(0);
      return len*sizeof(char);		/*NOTREACHED*/
  }
}
/* Copy the text (including its terminating character) to storage
   selected by `flags`:

     - BUF_MALLOC requested and the text is not already malloc'ed:
       duplicate it with PL_malloc() and mark it PL_CHARS_MALLOC.
     - otherwise, text held in PL_CHARS_LOCAL or PL_CHARS_MALLOC storage
       is copied into a BUF_RING buffer (a malloc'ed original is
       released) and marked PL_CHARS_RING.

   Text in any other storage class is left untouched. */
void
PL_save_text(PL_chars_t *text, int flags)
{ if ( (flags & BUF_MALLOC) && text->storage != PL_CHARS_MALLOC )
  { size_t nbytes = bufsize_text(text, text->length+1);
    void *copy = PL_malloc(nbytes);

    memcpy(copy, text->text.t, nbytes);
    text->text.t = copy;
    text->storage = PL_CHARS_MALLOC;
    return;
  }

  if ( text->storage == PL_CHARS_LOCAL ||
       text->storage == PL_CHARS_MALLOC )
  { Buffer ring = findBuffer(BUF_RING);
    size_t nbytes = bufsize_text(text, text->length+1);

    addMultipleBuffer(ring, text->text.t, nbytes, char);
    if ( text->storage == PL_CHARS_MALLOC )
      PL_free_text(text);		/* drop the malloc'ed original */
    text->text.t = baseBuffer(ring, char);
    text->storage = PL_CHARS_RING;
  }
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_from_stack_text() moves a string from the stack, so it won't get
corrupted if GC/shift comes along.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* If the text is in PL_CHARS_STACK storage, copy it (with terminator)
   either into text->buf when it fits, or into a BUF_RING buffer
   otherwise, and update text->storage accordingly. */
static void
PL_from_stack_text(PL_chars_t *text)
{ size_t nbytes;

  if ( text->storage != PL_CHARS_STACK )
    return;

  nbytes = bufsize_text(text, text->length+1);

  if ( nbytes >= sizeof(text->buf) )	/* too big for the inline buffer */
  { Buffer ring = findBuffer(BUF_RING);

    addMultipleBuffer(ring, text->text.t, nbytes, char);
    text->text.t = baseBuffer(ring, char);
    text->storage = PL_CHARS_RING;
    return;
  }

  memcpy(text->buf, text->text.t, nbytes);
  text->text.t = text->buf;
  text->storage = PL_CHARS_LOCAL;
}
/* Maximum number of decimal digits of a 64-bit unsigned integer */
#define INT64_DIGITS 20

/* Write the decimal representation of `val` to `out`, followed by a
   terminating NUL.  `out` must have room for INT64_DIGITS+1 bytes.
   Returns a pointer to the terminating NUL (i.e. the END of the text). */
static char *
ui64toa(uint64_t val, char *out)
{ char tmpBuf[INT64_DIGITS + 1];
  char *ptrOrg = tmpBuf + INT64_DIGITS;
  char *ptr = ptrOrg;
  size_t nbDigs;

  do					/* digits are produced backwards */
  { *--ptr = (char)('0' + (int)(val % 10));
    val /= 10;
  } while ( val );

  nbDigs = (size_t)(ptrOrg - ptr);
  memcpy(out, ptr, nbDigs);
  out += nbDigs;
  *out = '\0';

  return out;				/* points to the END */
}

/* Signed variant of ui64toa().  `out` must have room for a sign,
   INT64_DIGITS digits and the terminating NUL.  The magnitude is
   computed in unsigned arithmetic: negating INT64_MIN as a signed value
   overflows, which is undefined behaviour. */
static char *
i64toa(int64_t val, char *out)
{ uint64_t magnitude = (uint64_t)val;

  if ( val < 0 )
  { *out++ = '-';
    magnitude = (uint64_t)0 - magnitude;
  }

  return ui64toa(magnitude, out);
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_get_text__LD() fills `text` with a textual representation of the term
`l`.  Which kinds of terms are accepted is selected by the CVT_* bits in
`flags`; the branches below are tried in order: atom, string, integer,
float, list, variable and finally writing the term.

Return values visible in this function:
  - success (`succeed` or `return TRUE`) with `text` filled in
  - 2 if CVT_VARNOFAIL is set and the term is a partial list or
    canBind(w) holds
  - the result of PL_error() if CVT_EXCEPTION is set and conversion fails
  - failure (`fail`) otherwise
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
int
PL_get_text__LD(term_t l, PL_chars_t *text, int flags ARG_LD)
{ word w = valHandle(l);
/* Atom: fetch the atom's text; if that fails, fall back to writing */
if ( (flags & CVT_ATOM) && isAtom(w) )
#if __YAP_PROLOG__
{ if ( !get_atom_ptr_text(AtomOfTerm(w), text) )
#else
{ if ( !get_atom_text(w, text) )
#endif
goto maybe_write;
} else if ( (flags & CVT_STRING) && isString(w) )
{ if ( !get_string_text(w, text PASS_LD) )
goto maybe_write;
/* move the text away from PL_CHARS_STACK storage, if it is there */
PL_from_stack_text(text);
} else if ( (flags & CVT_INTEGER) && isInteger(w) )
{ number n;
PL_get_number(l, &n);
switch(n.type)
{ case V_INTEGER:
/* format into the inline buffer; i64toa() returns the end pointer */
{ char *ep = i64toa(n.value.i, text->buf);
text->text.t = text->buf;
text->length = ep-text->text.t;
text->storage = PL_CHARS_LOCAL;
break;
}
#ifdef O_GMP
case V_MPZ:
/* reserve the digit count in base 10 plus 2 extra bytes
   (presumably sign and terminator) in a BUF_RING buffer */
{ size_t sz = mpz_sizeinbase(n.value.mpz, 10) + 2;
Buffer b = findBuffer(BUF_RING);
if ( !growBuffer(b, sz) )
outOfCore();
mpz_get_str(b->base, 10, n.value.mpz);
b->top = b->base + strlen(b->base);
text->text.t = baseBuffer(b, char);
text->length = entriesBuffer(b, char);
text->storage = PL_CHARS_RING;
break;
}
#endif
default:
assert(0);
}
text->encoding = ENC_ISO_LATIN_1;
text->canonical = TRUE;
} else if ( (flags & CVT_FLOAT) && isFloat(w) )
{ format_float(valFloat(w), text->buf);
text->text.t = text->buf;
text->length = strlen(text->text.t);
text->encoding = ENC_ISO_LATIN_1;
text->storage = PL_CHARS_LOCAL;
text->canonical = TRUE;
} else if ( (flags & CVT_LIST) )
{ Buffer b;
CVT_result result;
/* First attempt uses FALSE as third argument and yields char entries */
if ( (b = codes_or_chars_to_buffer(l, BUF_RING, FALSE, &result)) )
{ text->length = entriesBuffer(b, char);
addBuffer(b, EOS, char);
text->text.t = baseBuffer(b, char);
text->encoding = ENC_ISO_LATIN_1;
/* If the first attempt reported CVT_wide, retry with TRUE, which
   yields pl_wchar_t entries */
} else if ( result.status == CVT_wide &&
(b = codes_or_chars_to_buffer(l, BUF_RING, TRUE, &result)) )
{ text->length = entriesBuffer(b, pl_wchar_t);
addBuffer(b, EOS, pl_wchar_t);
text->text.w = baseBuffer(b, pl_wchar_t);
text->encoding = ENC_WCHAR;
} else if ( (flags & (CVT_WRITE|CVT_WRITE_CANONICAL)) )
{ goto case_write;
} else
/* Not a usable list: map result.status onto a return value or error */
{ if ( (flags & CVT_VARNOFAIL) && result.status == CVT_partial )
return 2;
if ( (flags & CVT_EXCEPTION) )
{ switch(result.status)
{ case CVT_partial:
return PL_error(NULL, 0, NULL, ERR_INSTANTIATION);
case CVT_nolist:
goto error;
case CVT_nocode:
case CVT_nochar:
/* report the offending element as a type error */
{ term_t culprit = PL_new_term_ref();
atom_t type;
#if __YAP_PROLOG__
YAP_PutInSlot(culprit, result.culprit);
#else
*valTermRef(culprit) = result.culprit;
#endif
if ( result.status == CVT_nocode )
type = ATOM_character_code;
else
type = ATOM_character;
return PL_error(NULL, 0, NULL, ERR_TYPE, type, culprit);
}
default:
break;
}
}
goto error;
}
/* only reached by the two successful list conversions above */
text->storage = PL_CHARS_RING;
text->canonical = TRUE;
} else if ( (flags & CVT_VARIABLE) && isVar(w) )
{ text->text.t = varName(l, text->buf);
text->length = strlen(text->text.t);
text->encoding = ENC_ISO_LATIN_1;
text->storage = PL_CHARS_LOCAL;
text->canonical = TRUE;
} else if ( (flags & (CVT_WRITE|CVT_WRITE_CANONICAL)) )
{ IOENC encodings[3];
IOENC *enc;
char *r;
int wflags;
/* NOTE: case_write is also entered by goto from the list branch and
   from maybe_write, which is why encodings[] is filled by assignment
   after the label rather than by an initializer. */
case_write:
encodings[0] = ENC_ISO_LATIN_1;
encodings[1] = ENC_WCHAR;
encodings[2] = ENC_UNKNOWN;
if ( (flags&CVT_WRITEQ) == CVT_WRITEQ )
wflags = PL_WRT_QUOTED|PL_WRT_NUMBERVARS;
else if ( (flags&CVT_WRITE_CANONICAL) )
wflags = PL_WRT_QUOTED|PL_WRT_IGNOREOPS|PL_WRT_NUMBERVARS;
else
wflags = PL_WRT_NUMBERVARS;
/* Try writing the term in each encoding in turn (ISO Latin-1 first,
   then wide characters); the first encoding for which writing,
   terminating and flushing all succeed wins. */
for(enc = encodings; *enc != ENC_UNKNOWN; enc++)
{ size_t size;
IOSTREAM *fd;
/* memory stream starting out in the inline buffer of `text` */
r = text->buf;
size = sizeof(text->buf);
fd = Sopenmem(&r, &size, "w");
fd->encoding = *enc;
if ( PL_write_term(fd, l, 1200, wflags) &&
Sputcode(EOS, fd) >= 0 &&
Sflush(fd) >= 0 )
{ text->encoding = *enc;
/* r no longer pointing at text->buf is treated as malloc'ed storage
   (presumably the stream reallocated on overflow -- confirm against
   Sopenmem) */
text->storage = (r == text->buf ? PL_CHARS_LOCAL : PL_CHARS_MALLOC);
text->canonical = TRUE;
/* size includes the EOS written above, hence the -1 */
if ( *enc == ENC_ISO_LATIN_1 )
{ text->length = size-1;
text->text.t = r;
} else
{ text->length = (size/sizeof(pl_wchar_t))-1;
text->text.w = (pl_wchar_t *)r;
}
Sclose(fd);
return TRUE;
} else
{ Sclose(fd);
if ( r != text->buf )
Sfree(r);
}
}
goto error;
} else
{ goto error;
}
succeed;
/* Reached when fetching atom or string text failed: write the term
   instead if a CVT_WRITE* flag allows it, otherwise fall into error */
maybe_write:
if ( (flags & (CVT_WRITE|CVT_WRITE_CANONICAL)) )
goto case_write;
error:
if ( canBind(w) && (flags & CVT_VARNOFAIL) )
return 2;
if ( (flags & CVT_EXCEPTION) )
/* pick the expected type for the type error from the requested
   conversions */
{ atom_t expected;
if ( (flags & CVT_LIST) && !(flags&(CVT_ATOM|CVT_NUMBER)) )
expected = ATOM_list; /* List and/or string object */
else if ( flags & CVT_LIST )
expected = ATOM_text;
else if ( flags & CVT_NUMBER )
expected = ATOM_atomic;
else
expected = ATOM_atom;
return PL_error(NULL, 0, NULL, ERR_TYPE, expected, l);
}
fail;
}
/* Canonise `text` and look up the corresponding atom: lookupAtom() for
   ISO Latin-1 text, lookupUCSAtom() otherwise.  Returns 0 if the text
   cannot be canonised. */
atom_t
textToAtom(PL_chars_t *text)
{ if ( PL_canonise_text(text) )
  { if ( text->encoding != ENC_ISO_LATIN_1 )
      return lookupUCSAtom(text->text.w, text->length);

    return lookupAtom(text->text.t, text->length);
  }

  return 0;
}
/* Canonise `text` and create a string from it: globalString() for ISO
   Latin-1 text, globalWString() otherwise.  Returns 0 if the text
   cannot be canonised. */
word
textToString(PL_chars_t *text)
{ if ( PL_canonise_text(text) )
  { if ( text->encoding != ENC_ISO_LATIN_1 )
      return globalWString(text->length, text->text.w);

    return globalString(text->length, text->text.t);
  }

  return 0;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_unify_text() unifies `term` with `text`, represented according to
`type`:

  PL_ATOM        atom built by textToAtom()
  PL_STRING      string built by textToString()
  PL_CODE_LIST   list of character codes
  PL_CHAR_LIST   list of characters

For the list types `tail`, when non-zero, is handed to CLOSE_SEQ_STRING()
together with the list; for empty text with a tail, `term` is simply put
into `tail`.  Returns TRUE on success and FALSE on failure; for ENC_ANSI
text that cannot be decoded the result of PL_error() is returned.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
int
PL_unify_text(term_t term, term_t tail, PL_chars_t *text, int type)
{ switch(type)
  { case PL_ATOM:
    { atom_t a = textToAtom(text);

      if ( a )
      { int rval = _PL_unify_atomic(term, a);

        /* release the reference obtained through textToAtom() */
        PL_unregister_atom(a);
        return rval;
      }
      return FALSE;
    }
    case PL_STRING:
    { word w = textToString(text);

      if ( w )
        return _PL_unify_atomic(term, w);
      else
        return FALSE;
    }
    case PL_CODE_LIST:
    case PL_CHAR_LIST:
    { if ( text->length == 0 )
      { if ( tail )
        { GET_LD
          PL_put_term(tail, term);
          return TRUE;
        } else
        { return PL_unify_nil(term);
        }
      } else
      { GET_LD
        term_t l = PL_new_term_ref();
        Word p0, p;

        switch(text->encoding)
        { case ENC_ISO_LATIN_1:
          { const unsigned char *s = (const unsigned char *)text->text.t;
            const unsigned char *e = &s[text->length];

            if ( !(p0 = p = INIT_SEQ_STRING(text->length)) )
              return FALSE;
            if ( type == PL_CODE_LIST ) {
              for( ; s < e; s++)
                p = EXTEND_SEQ_CODES(p, *s);
            } else {
              for( ; s < e; s++)
                p = EXTEND_SEQ_CHARS(p, *s);
            }
            break;
          }
          case ENC_WCHAR:
          { const pl_wchar_t *s = (const pl_wchar_t *)text->text.t;
            const pl_wchar_t *e = &s[text->length];

            if ( !(p0 = p = INIT_SEQ_STRING(text->length)) )
              return FALSE;
            if ( type == PL_CODE_LIST ) {
              for( ; s < e; s++)
                p = EXTEND_SEQ_CODES(p, *s);
            } else {
              for( ; s < e; s++)
                p = EXTEND_SEQ_CHARS(p, *s);
            }
            break;
          }
          case ENC_UTF8:
          { const char *s = text->text.t;
            const char *e = &s[text->length];
            /* text->length counts bytes; the list needs characters */
            size_t len = utf8_strlen(s, text->length);

            if ( !(p0 = p = INIT_SEQ_STRING(len)) )
              return FALSE;
            if ( type == PL_CODE_LIST ) {
              while (s < e) {
                int chr;

                s = utf8_get_char(s, &chr);
                p = EXTEND_SEQ_CODES(p, chr);
              }
            } else {
              while (s < e) {
                int chr;

                s = utf8_get_char(s, &chr);
                p = EXTEND_SEQ_CHARS(p, chr);
              }
            }
            break;
          }
          case ENC_ANSI:
          { const char *s = text->text.t;
            size_t rc, n = text->length;
            size_t len = 0;
            mbstate_t mbs;
            wchar_t wc;

            /* Pass 1: validate the multibyte text and count characters.
               (size_t)-2 (incomplete sequence at the end of the text)
               must be rejected as well: otherwise `n -= rc` wraps around
               and the loop runs past the end of the text. */
            memset(&mbs, 0, sizeof(mbs));
            while( n > 0 )
            { rc = mbrtowc(&wc, s, n, &mbs);
              if ( rc == (size_t)-1 || rc == (size_t)-2 || rc == 0 )
                return PL_error(NULL, 0, "cannot represent text in current locale",
                                ERR_REPRESENTATION, ATOM_encoding);
              len++;
              n -= rc;
              s += rc;
            }

            if ( !(p0 = p = INIT_SEQ_STRING(len)) )
              return FALSE;

            /* Pass 2: decode again, now emitting the list cells.  The
               input was validated above, so rc is not re-checked. */
            n = text->length;
            s = text->text.t;
            memset(&mbs, 0, sizeof(mbs));
            while(n > 0)
            { rc = mbrtowc(&wc, s, n, &mbs);
              if ( type == PL_CODE_LIST )
                p = EXTEND_SEQ_CODES(p, wc);
              else
                p = EXTEND_SEQ_CHARS(p, wc);
              s += rc;
              n -= rc;
            }
            break;
          }
          default:
          { assert(0);
            return FALSE;
          }
        }

        return CLOSE_SEQ_STRING(p, p0, tail, term, l );
      }
    }
    default:
    { assert(0);
      return FALSE;
    }
  }
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_unify_text_range() unifies `term` with the `len` characters of `text`
starting at character `offset`, represented as `type` (see
PL_unify_text()).  Fails if the range does not lie inside the text.

The sub-text is a view on the original storage (no copy is made).  Only
ISO Latin-1 is treated as 8-bit here; any other encoding is accessed
through text->text.w.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
int
PL_unify_text_range(term_t term, PL_chars_t *text,
                    size_t offset, size_t len, int type)
{ if ( offset == 0 && len == text->length )
  { return PL_unify_text(term, 0, text, type);
  } else
  { PL_chars_t sub;
    int rc;

    /* Compare `len` against the remaining length rather than testing
       offset+len, which can wrap around for huge values and would then
       pass the check. */
    if ( offset > text->length || len > text->length - offset )
      return FALSE;

    if ( len == 1 && type == PL_ATOM )	/* single character atom */
    { GET_LD
      int c;

      if ( text->encoding == ENC_ISO_LATIN_1 )
        c = text->text.t[offset]&0xff;
      else
        c = text->text.w[offset];

      return PL_unify_atom(term, codeToAtom(c));
    }

    sub.length = len;
    sub.storage = PL_CHARS_HEAP;
    if ( text->encoding == ENC_ISO_LATIN_1 )
    { sub.text.t = text->text.t+offset;
      sub.encoding = ENC_ISO_LATIN_1;
      sub.canonical = TRUE;
    } else
    { sub.text.w = text->text.w+offset;
      sub.encoding = ENC_WCHAR;
      sub.canonical = FALSE;		/* the range may fit in 8 bits */
    }

    rc = PL_unify_text(term, 0, &sub, type);
    /* releases storage only if unification changed sub to malloc'ed */
    PL_free_text(&sub);

    return rc;
  }
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int PL_promote_text(PL_chars_t *text)
Promote a text to UCS if it is currently 8-bit text.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Widen text that is not ENC_WCHAR to pl_wchar_t, one character per
   input byte.  Depending on the current storage, the wide text
   replaces the malloc'ed block, is rebuilt inside text->buf (when it
   fits) or is placed in a BUF_RING buffer.  Always succeeds. */
int
PL_promote_text(PL_chars_t *text)
{ size_t count = text->length;
  size_t i;

  if ( text->encoding == ENC_WCHAR )
    succeed;

  if ( text->storage == PL_CHARS_MALLOC )
  { pl_wchar_t *wide = PL_malloc(sizeof(pl_wchar_t)*(count+1));
    const unsigned char *narrow = (const unsigned char *)text->text.t;

    for(i=0; i<count; i++)
      wide[i] = narrow[i];
    wide[count] = EOS;

    PL_free(text->text.t);
    text->text.w = wide;
    text->encoding = ENC_WCHAR;
  } else if ( text->storage == PL_CHARS_LOCAL &&
              (count+1)*sizeof(pl_wchar_t) < sizeof(text->buf) )
  { unsigned char narrow[sizeof(text->buf)];
    pl_wchar_t *wide = (pl_wchar_t*)text->buf;

    /* work from a copy: the wide text overwrites text->buf */
    memcpy(narrow, text->buf, count*sizeof(char));
    for(i=0; i<count; i++)
      wide[i] = narrow[i];
    wide[count] = EOS;

    text->encoding = ENC_WCHAR;
  } else
  { Buffer ring = findBuffer(BUF_RING);
    const unsigned char *narrow = (const unsigned char *)text->text.t;

    for(i=0; i<count; i++)
      addBuffer(ring, narrow[i], pl_wchar_t);
    addBuffer(ring, EOS, pl_wchar_t);

    text->text.w = baseBuffer(ring, pl_wchar_t);
    text->encoding = ENC_WCHAR;
    text->storage = PL_CHARS_RING;
  }

  succeed;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_demote_text() narrows text that is not ISO Latin-1 (it is accessed as
pl_wchar_t) to ISO Latin-1.  Fails, leaving `text` unmodified, if some
character is larger than 0xff.

Depending on the current storage, the narrow text replaces the malloc'ed
block, is rebuilt inside text->buf or is placed in a BUF_RING buffer.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
int
PL_demote_text(PL_chars_t *text)
{ if ( text->encoding != ENC_ISO_LATIN_1 )
  { if ( text->storage == PL_CHARS_MALLOC )
    { char *new = PL_malloc(sizeof(char)*(text->length+1));
      char *t = new;
      const pl_wchar_t *s = (const pl_wchar_t *)text->text.t;
      const pl_wchar_t *e = &s[text->length];

      while(s<e)
      { if ( *s > 0xff )
        { PL_free(new);
          return FALSE;
        }
        *t++ = *s++ & 0xff;
      }
      *t = EOS;

      PL_free(text->text.t);
      text->text.t = new;
      text->encoding = ENC_ISO_LATIN_1;
    } else if ( text->storage == PL_CHARS_LOCAL )
    { pl_wchar_t buf[sizeof(text->buf)/sizeof(pl_wchar_t)];
      pl_wchar_t *f = buf;
      pl_wchar_t *e = &buf[text->length];
      char *t = text->buf;

      memcpy(buf, text->buf, text->length*sizeof(pl_wchar_t));

      /* Validate first: the conversion below overwrites text->buf in
         place, so bailing out halfway would leave the caller with a
         corrupted text. */
      for( ; f<e; f++)
      { if ( *f > 0xff )
          return FALSE;
      }

      for(f=buf; f<e; f++)
        *t++ = *f & 0xff;
      *t = EOS;

      text->encoding = ENC_ISO_LATIN_1;
    } else
    { Buffer b = findBuffer(BUF_RING);
      const pl_wchar_t *s = (const pl_wchar_t*)text->text.w;
      const pl_wchar_t *e = &s[text->length];

      for( ; s<e; s++)
      { if ( *s > 0xff )
        { unfindBuffer(BUF_RING);	/* give the buffer back */
          return FALSE;
        }
        addBuffer(b, *s&0xff, char);
      }
      addBuffer(b, EOS, char);

      text->text.t = baseBuffer(b, char);
      text->storage = PL_CHARS_RING;
      text->encoding = ENC_ISO_LATIN_1;
    }
  }

  succeed;
}
/* TRUE if `text` is ISO Latin-1 already, or if none of its (wide)
   characters exceeds 0xff. */
static int
can_demote(PL_chars_t *text)
{ const pl_wchar_t *cur;
  const pl_wchar_t *end;

  if ( text->encoding == ENC_ISO_LATIN_1 )
    return TRUE;

  cur = (const pl_wchar_t*)text->text.w;
  end = cur + text->length;
  while( cur < end )
  { if ( *cur++ > 0xff )
      return FALSE;
  }

  return TRUE;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Convert text to 8-bit according to flags. May hold REP_UTF8 to convert
to UTF-8, REP_MB to convert to locale 8-bit representation or nothing to
convert to ISO Latin-1. This predicate can fail if the text cannot be
represented.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Append the multibyte encoding of `c`, as produced by wcrtomb() with
   conversion state `mbs`, to `buf`.  Returns FALSE if wcrtomb()
   reports that the character cannot be represented. */
static int
wctobuffer(wchar_t c, mbstate_t *mbs, Buffer buf)
{ char bytes[PL_MB_LEN_MAX];
  size_t count = wcrtomb(bytes, c, mbs);
  size_t k;

  if ( count == (size_t)-1 )
    return FALSE;			/* cannot represent */

  for(k=0; k<count; k++)
    addBuffer(buf, bytes[k], char);

  return TRUE;
}
/* Append the UTF-8 encoding of `c` to `buf`.  Characters up to 0x7f
   are added as a single byte; everything else goes through
   utf8_put_char(). */
static void
utf8tobuffer(wchar_t c, Buffer buf)
{ char encoded[6];
  const char *end;
  const char *p;

  if ( c <= 0x7f )
  { addBuffer(buf, (char)c, char);
    return;
  }

  end = utf8_put_char(encoded, c);
  for(p=encoded; p<end; p++)
    addBuffer(buf, *p, char);
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_mb_text() converts `text` to an 8-bit representation selected by
`flags`: REP_UTF8 selects UTF-8, REP_MB the multibyte encoding of the
current locale (ENC_ANSI) and neither selects ISO Latin-1.  Nothing
happens if the text already has the target encoding.

Except for wide text converted to ISO Latin-1, which is delegated to
PL_demote_text(), the converted text is built in a BUF_RING buffer.

Returns TRUE on success.  If a character cannot be represented the
function fails, or returns the result of PL_error() when CVT_EXCEPTION is
set in `flags`.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
int
PL_mb_text(PL_chars_t *text, int flags)
{ int norep = -1;			/* char that cannot be represented */
  IOENC target = ((flags&REP_UTF8) ? ENC_UTF8 :
                  (flags&REP_MB)   ? ENC_ANSI : ENC_ISO_LATIN_1);

  if ( text->encoding != target )
  { Buffer b = findBuffer(BUF_RING);

    switch(text->encoding)
    { case ENC_ISO_LATIN_1:
      { const unsigned char *s = (const unsigned char*)text->text.t;
        const unsigned char *e = &s[text->length];

        if ( target == ENC_UTF8 )
        { for( ; s<e; s++)
          { utf8tobuffer(*s, b);
          }
          addBuffer(b, 0, char);
        } else /* if ( target == ENC_MB ) */
        { mbstate_t mbs;

          memset(&mbs, 0, sizeof(mbs));
          for( ; s<e; s++)
          { if ( !wctobuffer(*s, &mbs, b) )
            { unfindBuffer(BUF_RING);
              norep = *s;
              goto rep_error;
            }
          }
          wctobuffer(0, &mbs, b);	/* terminator */
        }

        break;
      }
      case ENC_UTF8:
      { const char *s = (const char*)text->text.t;
        const char *e = &s[text->length];

        if ( target == ENC_ISO_LATIN_1 )
        { for( ; s<e; )
          { int ch;

            s = _PL__utf8_get_char(s, &ch);
            if (ch > 0xff) {
              unfindBuffer(BUF_RING);
              /* report the decoded character: `s` has already been
                 advanced past it, so *s is the NEXT byte */
              norep = ch;
              goto rep_error;
            }
            addBuffer(b, ch, char);
          }
          addBuffer(b, 0, char);
        } else
        { mbstate_t mbs;

          memset(&mbs, 0, sizeof(mbs));
          for( ; s<e; )
          { int ch;

            s = _PL__utf8_get_char(s, &ch);
            if ( !wctobuffer(ch, &mbs, b) )
            { unfindBuffer(BUF_RING);
              norep = ch;
              goto rep_error;
            }
          }
          wctobuffer(0, &mbs, b);	/* terminator */
        }

        break;
      }
      case ENC_WCHAR:
      { if ( target == ENC_ISO_LATIN_1 )
        { return PL_demote_text(text);
        } else
        { const pl_wchar_t *w = (const pl_wchar_t*)text->text.w;
          const pl_wchar_t *e = &w[text->length];

          if ( target == ENC_UTF8 )
          { for( ; w<e; w++)
            { utf8tobuffer(*w, b);
            }
            addBuffer(b, 0, char);
          } else /* if ( target == ENC_MB ) */
          { mbstate_t mbs;

            memset(&mbs, 0, sizeof(mbs));
            for( ; w<e; w++)
            { if ( !wctobuffer(*w, &mbs, b) )
              { unfindBuffer(BUF_RING);
                norep = *w;
                goto rep_error;
              }
            }
            wctobuffer(0, &mbs, b);	/* terminator */
          }
        }

        break;
      }
      default:
      { assert(0);
        fail;
      }
    }

    /* the buffer holds the text plus the terminator added above */
    text->length = sizeOfBuffer(b)-1;
    text->text.t = baseBuffer(b, char);
    text->encoding = target;
    text->storage = PL_CHARS_RING;
  }

  succeed;

rep_error:
  if ( (flags & CVT_EXCEPTION) )
  { char msg[128];

    sprintf(msg,
            "Cannot represent char U%04x using %s encoding",
            norep,
            target == ENC_ISO_LATIN_1 ? "ISO Latin-1" : "current locale");

    return PL_error(NULL, 0, msg, ERR_REPRESENTATION, ATOM_encoding);
  }

  fail;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_canonise_text() brings text that is not yet flagged canonical into
one of the two encodings used by the rest of this file: ISO Latin-1 if
all characters fit in 8 bits and wide characters (ENC_WCHAR) otherwise.

  ENC_ISO_LATIN_1  nothing to do
  ENC_WCHAR        demoted to ISO Latin-1 if no character exceeds 0xff
  ENC_UTF8         pure 7-bit text is relabelled; anything else is
                   decoded into a freshly malloc'ed buffer
  ENC_ANSI         decoded using mbrtowc() into text->buf if the result
                   fits and into a malloc'ed buffer otherwise

Returns TRUE on success and FALSE if ENC_ANSI text cannot be decoded or
demotion fails.  When the text is replaced, a previously malloc'ed text
is released.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
int
PL_canonise_text(PL_chars_t *text)
{ if ( !text->canonical )
  { switch(text->encoding )
    { case ENC_ISO_LATIN_1:
        break;				/* nothing to do */
      case ENC_WCHAR:
      { const pl_wchar_t *w = (const pl_wchar_t*)text->text.w;
        const pl_wchar_t *e = &w[text->length];

        for(; w<e; w++)
        { if ( *w > 0xff )		/* really needs wide characters */
            return TRUE;
        }

        return PL_demote_text(text);
      }
      case ENC_UTF8:
      { const char *s = text->text.t;
        const char *e = &s[text->length];

        while(s<e && !(*s & 0x80))	/* skip leading 7-bit characters */
          s++;
        if ( s == e )
        { text->encoding = ENC_ISO_LATIN_1;
          text->canonical = TRUE;
        } else
        { int chr;
          int wide = FALSE;
          size_t len = s - text->text.t;
          void *do_free = NULL;

          /* The decoded text lives in a new buffer.  Remember a
             malloc'ed original so it can be released afterwards;
             without this the original buffer is leaked. */
          if ( text->storage == PL_CHARS_MALLOC )
            do_free = text->text.t;

          while(s<e)			/* count characters */
          { s = utf8_get_char(s, &chr);
            if ( chr > 0xff )		/* requires wide characters */
              wide = TRUE;
            len++;
          }

          s = (const char *)text->text.t;
          text->length = len;

          if ( wide )
          { pl_wchar_t *to = PL_malloc(sizeof(pl_wchar_t)*(len+1));

            text->text.w = to;
            while(s<e)
            { s = utf8_get_char(s, &chr);
              *to++ = chr;
            }
            *to = EOS;

            text->encoding = ENC_WCHAR;
            text->storage = PL_CHARS_MALLOC;
          } else
          { char *to = PL_malloc(len+1);

            text->text.t = to;
            while(s<e)
            { s = utf8_get_char(s, &chr);
              *to++ = chr;
            }
            *to = EOS;

            text->encoding = ENC_ISO_LATIN_1;
            text->storage = PL_CHARS_MALLOC;
          }

          text->canonical = TRUE;
          if ( do_free )
            PL_free(do_free);
        }

        succeed;
      }
      case ENC_ANSI:
      { mbstate_t mbs;
        size_t len = 0;
        int iso = TRUE;
        char *s = text->text.t;
        size_t rc, n = text->length;
        wchar_t wc;
        const char *from;
        void *do_free;

        /* Pass 1: validate, count characters and find out whether wide
           characters are needed.  (size_t)-2 (incomplete sequence at
           the end) is an error too; treating it as a byte count would
           make `n -= rc` wrap around. */
        memset(&mbs, 0, sizeof(mbs));
        while( n > 0 )
        { rc = mbrtowc(&wc, s, n, &mbs);
          if ( rc == (size_t)-1 || rc == (size_t)-2 || rc == 0 )
            return FALSE;		/* encoding error */

          if ( wc > 0xff )
            iso = FALSE;
          len++;
          n -= rc;
          s += rc;
        }

        /* Pass 2: decode into the destination */
        from = text->text.t;
        n = text->length;
        memset(&mbs, 0, sizeof(mbs));

        if ( text->storage == PL_CHARS_MALLOC )
          do_free = text->text.t;
        else
          do_free = NULL;

        if ( iso )
        { char *to;

          text->encoding = ENC_ISO_LATIN_1;
          if ( len+1 < sizeof(text->buf) )
          { text->text.t = text->buf;
            text->storage = PL_CHARS_LOCAL;
          } else
          { text->text.t = PL_malloc(len+1);
            text->storage = PL_CHARS_MALLOC;
          }

          to = text->text.t;
          while( n > 0 )
          { rc = mbrtowc(&wc, from, n, &mbs);

            *to++ = (char)wc;
            n -= rc;
            from += rc;
          }
          *to = EOS;
        } else
        { wchar_t *to;
          char b2[sizeof(text->buf)];

          text->encoding = ENC_WCHAR;
          if ( len+1 < sizeof(text->buf)/sizeof(wchar_t) )
          { if ( text->text.t == text->buf )
            { /* decode from a copy: output overwrites text->buf */
              memcpy(b2, text->buf, sizeof(text->buf));
              from = b2;
            }
            text->text.w = (wchar_t*)text->buf;
            /* The text now lives in text->buf.  Leaving the storage at
               e.g. PL_CHARS_MALLOC would make PL_free_text() free the
               inline buffer. */
            text->storage = PL_CHARS_LOCAL;
          } else
          { text->text.w = PL_malloc((len+1)*sizeof(wchar_t));
            text->storage = PL_CHARS_MALLOC;
          }

          to = text->text.w;
          while( n > 0 )
          { rc = mbrtowc(&wc, from, n, &mbs);

            *to++ = wc;
            n -= rc;
            from += rc;
          }
          *to = EOS;
        }

        text->length = len;
        text->canonical = TRUE;
        if ( do_free )
          PL_free(do_free);

        succeed;
      }
      default:
        assert(0);
    }
  }

  succeed;
}
/* Release the character data of `text` if it is held in malloc'ed
   storage; text in any other storage class is left alone. */
void
PL_free_text(PL_chars_t *text)
{ if ( text->storage != PL_CHARS_MALLOC )
    return;

  PL_free(text->text.t);
}
/* Recode `text` to `encoding`.  Only ENC_UTF8 is supported as target,
   and only from ENC_ASCII (relabelled in place), ENC_ISO_LATIN_1 and
   ENC_WCHAR (both re-encoded into a BUF_RING buffer, after which the old
   text is released with PL_free_text()).  Any other combination trips
   an assertion and leaves the text unchanged. */
void
PL_text_recode(PL_chars_t *text, IOENC encoding)
{ Buffer out;
  char utf8[8];
  const char *u;
  const char *uend;

  if ( text->encoding == encoding )
    return;

  if ( encoding != ENC_UTF8 )
  { assert(0);
    return;
  }

  switch(text->encoding)
  { case ENC_ASCII:			/* ASCII text is valid UTF-8 */
      text->encoding = ENC_UTF8;
      return;
    case ENC_ISO_LATIN_1:
    { const unsigned char *in = (const unsigned char *)text->text.t;
      const unsigned char *stop = &in[text->length];

      out = findBuffer(BUF_RING);
      while( in < stop )
      { if ( *in&0x80 )
        { uend = utf8_put_char(utf8, *in);
          for(u=utf8; u<uend; u++)
            addBuffer(out, *u, char);
        } else
        { addBuffer(out, *in, char);
        }
        in++;
      }
      break;
    }
    case ENC_WCHAR:
    { const pl_wchar_t *in = text->text.w;
      const pl_wchar_t *stop = &in[text->length];

      out = findBuffer(BUF_RING);
      while( in < stop )
      { if ( *in > 0x7f )
        { uend = utf8_put_char(utf8, (int)*in);
          for(u=utf8; u<uend; u++)
            addBuffer(out, *u&0xff, char);
        } else
        { addBuffer(out, *in&0xff, char);
        }
        in++;
      }
      break;
    }
    default:
      assert(0);
      return;
  }

  /* shared tail of the two re-encoding cases */
  PL_free_text(text);
  text->length = entriesBuffer(out, char);
  addBuffer(out, EOS, char);
  text->text.t = baseBuffer(out, char);
  text->encoding = ENC_UTF8;
  text->storage = PL_CHARS_RING;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_cmp_text(PL_chars_t *t1, size_t o1,
PL_chars_t *t2, size_t o2,
size_t len)
Compares two substrings of two text representations.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Compare at most `len` characters of t1 starting at o1 with those of
   t2 starting at o2.  The compared length is clipped to what is
   available in either text.  Returns CMP_LESS or CMP_GREATER at the
   first differing character.  If the compared parts are equal, returns
   CMP_LESS if t1 was clipped, CMP_GREATER if only t2 was clipped and 0
   otherwise.  Text that is not ISO Latin-1 is accessed as wide
   characters. */
int
PL_cmp_text(PL_chars_t *t1, size_t o1, PL_chars_t *t2, size_t o2,
            size_t len)
{ ssize_t left = len;
  int ifeq = 0;
  int narrow1, narrow2;

  if ( left > (ssize_t)(t1->length - o1) )
  { left = t1->length - o1;
    ifeq = CMP_LESS;			/* first is short */
  }
  if ( left > (ssize_t)(t2->length - o2) )
  { left = t2->length - o2;
    if ( ifeq == 0 )
      ifeq = CMP_GREATER;
  }

  if ( left == 0 )			/* too long offsets */
    return ifeq;

  narrow1 = (t1->encoding == ENC_ISO_LATIN_1);
  narrow2 = (t2->encoding == ENC_ISO_LATIN_1);

  if ( narrow1 && narrow2 )
  { const unsigned char *a = (const unsigned char *)t1->text.t+o1;
    const unsigned char *b = (const unsigned char *)t2->text.t+o2;

    while( left > 0 && *a == *b )
    { left--;
      a++;
      b++;
    }
    if ( left <= 0 )
      return ifeq;
    return *a > *b ? CMP_GREATER : CMP_LESS;
  }

  if ( t1->encoding == ENC_WCHAR && t2->encoding == ENC_WCHAR )
  { const pl_wchar_t *a = t1->text.w+o1;
    const pl_wchar_t *b = t2->text.w+o2;

    while( left > 0 && *a == *b )
    { left--;
      a++;
      b++;
    }
    if ( left <= 0 )
      return ifeq;
    return *a > *b ? CMP_GREATER : CMP_LESS;
  }

  if ( narrow1 && t2->encoding == ENC_WCHAR )
  { const unsigned char *a = (const unsigned char *)t1->text.t+o1;
    const pl_wchar_t *b = t2->text.w+o2;

    while( left > 0 && *a == *b )
    { left--;
      a++;
      b++;
    }
    if ( left <= 0 )
      return ifeq;
    return *a > *b ? CMP_GREATER : CMP_LESS;
  }

  /* remaining combinations: t1 read as wide, t2 as 8-bit */
  { const pl_wchar_t *a = t1->text.w+o1;
    const unsigned char *b = (const unsigned char *)t2->text.t+o2;

    while( left > 0 && *a == *b )
    { left--;
      a++;
      b++;
    }
    if ( left <= 0 )
      return ifeq;
    return *a > *b ? CMP_GREATER : CMP_LESS;
  }
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PL_concat_text() concatenates the `n` texts in `text[]` into `result`.
The result is ISO Latin-1 if every input can be demoted (see
can_demote()) and wide otherwise.  It is stored in result->buf if it
fits and in malloc'ed memory otherwise, and is flagged canonical.
Always returns TRUE.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
int
PL_concat_text(int n, PL_chars_t **text, PL_chars_t *result)
{ size_t total_length = 0;
  int latin = TRUE;
  int i;

  for(i=0; i<n; i++)
  { if ( latin && !can_demote(text[i]) )
      latin = FALSE;
    total_length += text[i]->length;
  }

  result->canonical = TRUE;
  result->length = total_length;

  if ( latin )
  { char *to;

    result->encoding = ENC_ISO_LATIN_1;
    if ( total_length+1 < sizeof(result->buf) )
    { result->text.t = result->buf;
      result->storage = PL_CHARS_LOCAL;
    } else
    { result->text.t = PL_malloc(total_length+1);
      result->storage = PL_CHARS_MALLOC;
    }

    for(to=result->text.t, i=0; i<n; i++)
    { if ( text[i]->encoding == ENC_ISO_LATIN_1 )
      { memcpy(to, text[i]->text.t, text[i]->length);
        to += text[i]->length;
      } else
      { /* can_demote() also accepts wide text whose characters all fit
           in 8 bits.  Such text must be narrowed character by
           character; a byte-wise copy would copy the wrong data. */
        const pl_wchar_t *f = text[i]->text.w;
        const pl_wchar_t *e = &f[text[i]->length];

        while(f<e)
          *to++ = (char)(*f++ & 0xff);
      }
    }
    *to = EOS;
  } else
  { pl_wchar_t *to;

    result->encoding = ENC_WCHAR;
    if ( total_length+1 < sizeof(result->buf)/sizeof(pl_wchar_t) )
    { result->text.w = (pl_wchar_t*)result->buf;
      result->storage = PL_CHARS_LOCAL;
    } else
    { result->text.w = PL_malloc((total_length+1)*sizeof(pl_wchar_t));
      result->storage = PL_CHARS_MALLOC;
    }

    for(to=result->text.w, i=0; i<n; i++)
    { if ( text[i]->encoding == ENC_WCHAR )
      { memcpy(to, text[i]->text.w, text[i]->length*sizeof(pl_wchar_t));
        to += text[i]->length;
      } else				/* widen 8-bit input */
      { const unsigned char *f = (const unsigned char *)text[i]->text.t;
        const unsigned char *e = &f[text[i]->length];

        while(f<e)
          *to++ = *f++;
      }
    }
    assert((size_t)(to-result->text.w) == total_length);
    *to = EOS;
  }

  return TRUE;
}
/* Open `txt` as a stream using Sopen_string() over the text's bytes
   (terminator excluded), with the stream encoding set to that of the
   text.  Only mode "r" is accepted; any other mode sets errno to EINVAL
   and returns NULL.  NULL is also returned if Sopen_string() fails. */
IOSTREAM *
Sopen_text(PL_chars_t *txt, const char *mode)
{ IOSTREAM *stream;

  if ( !streq(mode, "r") )
  { errno = EINVAL;
    return NULL;
  }

  stream = Sopen_string(NULL,
                        txt->text.t,
                        bufsize_text(txt, txt->length),
                        mode);
  if ( stream )				/* do not dereference a failed open */
    stream->encoding = txt->encoding;

  return stream;
}