fixes for UTF-8

This commit is contained in:
Vitor Santos Costa 2017-09-18 10:17:01 +01:00
parent b24dc4893d
commit 53822922c8
2 changed files with 35 additions and 32 deletions

View File

@ -239,12 +239,7 @@ static Int SkipListCodes(unsigned char **bufp, Term *l, Term **tailp,
return -REPRESENTATION_ERROR_CHARACTER_CODE; return -REPRESENTATION_ERROR_CHARACTER_CODE;
} else { } else {
AtomEntry *ae = RepAtom(AtomOfTerm(hd)); AtomEntry *ae = RepAtom(AtomOfTerm(hd));
if ((ae->StrOfAE)[1] != '\0') { st = stpcpy(st, ae->StrOfAE);
length = -REPRESENTATION_ERROR_CHARACTER;
} else {
ch = RepAtom(AtomOfTerm(hd))->StrOfAE[0];
*wide |= ch > 0x80;
}
} }
} else if (IsIntegerTerm(hd)) { } else if (IsIntegerTerm(hd)) {
ch = IntegerOfTerm(hd); ch = IntegerOfTerm(hd);
@ -263,12 +258,13 @@ static Int SkipListCodes(unsigned char **bufp, Term *l, Term **tailp,
*tailp = l; *tailp = l;
return length; return length;
} }
}
// now copy char to buffer // now copy char to buffer
int chsz = put_utf8(st, ch); int chsz = put_utf8(st, ch);
if (chsz > 0) { if (chsz > 0) {
st += chsz; st += chsz;
} }
}
l = RepPair(*l) + 1; l = RepPair(*l) + 1;
do_derefa(v, l, derefa2_unk, derefa2_nonvar); do_derefa(v, l, derefa2_unk, derefa2_nonvar);
} while (*l != *s && IsPairTerm(*l)); } while (*l != *s && IsPairTerm(*l));
@ -440,7 +436,7 @@ unsigned char *Yap_readText(seq_tv_t *inp, size_t *lengp) {
if (lengp) if (lengp)
*lengp = sz; *lengp = sz;
if (inp->type & YAP_STRING_WITH_BUFFER) if (inp->type & YAP_STRING_WITH_BUFFER)
return UStringOfTerm(inp->val.t); return (unsigned char*)UStringOfTerm(inp->val.t);
inp->type |= YAP_STRING_IN_TMP; inp->type |= YAP_STRING_IN_TMP;
char *o = Malloc(sz+1); char *o = Malloc(sz+1);
strcpy(o, s); strcpy(o, s);
@ -571,17 +567,19 @@ static Term write_atoms(void *s0, seq_tv_t *out, size_t leng USES_REGS) {
unsigned char *s = s0, *lim = s + strnlen((char *)s, max); unsigned char *s = s0, *lim = s + strnlen((char *)s, max);
unsigned char *cp = s; unsigned char *cp = s;
unsigned char w[10], *wp = w; unsigned char w[10];
int wp = 0;
LOCAL_TERM_ERROR(t, 2 * (lim - s)); LOCAL_TERM_ERROR(t, 2 * (lim - s));
while (cp < lim && *cp) { while (cp < lim && *cp) {
utf8proc_int32_t chr; utf8proc_int32_t chr;
CELL *cl; CELL *cl;
s += get_utf8(s, 1, &chr); s += get_utf8(s, -1, &chr);
if (chr == '\0') { if (chr == '\0') {
wp[0] = '\0'; w[0] = '\0';
break; break;
} }
wp += put_utf8(w, chr); wp = put_utf8(w, chr);
w[wp] = '\0';
cl = HR; cl = HR;
HR += 2; HR += 2;
cl[0] = MkAtomTerm(Yap_ULookupAtom(w)); cl[0] = MkAtomTerm(Yap_ULookupAtom(w));
@ -994,47 +992,52 @@ bool Yap_Concat_Text(int tot, seq_tv_t inp[], seq_tv_t *out USES_REGS) {
// //
bool Yap_Splice_Text(int n, size_t cuts[], seq_tv_t *inp, bool Yap_Splice_Text(int n, size_t cuts[], seq_tv_t *inp,
seq_tv_t outv[] USES_REGS) { seq_tv_t outv[] USES_REGS) {
unsigned char *buf; const unsigned char *buf;
size_t l; size_t b_l, u_l;
inp->type |= YAP_STRING_IN_TMP; inp->type |= YAP_STRING_IN_TMP;
buf = Yap_readText(inp, &l PASS_REGS); buf = Yap_readText(inp, &b_l PASS_REGS);
if (!buf) { if (!buf) {
return false; return false;
} }
u_l = strlen_utf8(buf);
if (!cuts) { if (!cuts) {
if (n == 2) { if (n == 2) {
size_t l0, l1; size_t b_l0, b_l1, u_l0, u_l1;
unsigned char *buf0, *buf1; unsigned char *buf0, *buf1;
if (outv[0].val.t) { if (outv[0].val.t) {
buf0 = Yap_readText(outv, &l0 PASS_REGS); buf0 = Yap_readText(outv, &b_l0 PASS_REGS);
if (!buf0) { if (!buf0) {
return false; return false;
} }
if (cmp_Text(buf, buf0, l0) != 0) { if (bcmp(buf, buf0, b_l0) != 0) {
return false; return false;
} }
l1 = l - l0; u_l0 = strlen_utf8(buf0);
u_l1 = u_l - u_l0;
buf1 = slice(l0, l, buf PASS_REGS); buf1 = slice(u_l0, u_l, buf PASS_REGS);
bool rc = write_Text(buf1, outv + 1, l1 PASS_REGS); b_l1 = strlen(buf1);
bool rc = write_Text(buf1, outv + 1, b_l1 PASS_REGS);
if (!rc) { if (!rc) {
return false; return false;
} }
return rc; return rc;
} else /* if (outv[1].val.t) */ { } else /* if (outv[1].val.t) */ {
buf1 = Yap_readText(outv + 1, &l1 PASS_REGS); buf1 = Yap_readText(outv + 1, &b_l1 PASS_REGS);
if (!buf1) { if (!buf1) {
return false; return false;
} }
l0 = l - l1; u_l1 = strlen_utf8(buf1);
if (cmp_Text(skip_utf8((const unsigned char *)buf, l0), buf1, l1) != b_l0 = b_l - b_l1;
u_l0 = u_l - u_l1;
if (bcmp(skip_utf8((const char *)buf, b_l0), buf1, b_l1) !=
0) { 0) {
return false; return false;
} }
buf0 = slice(0, l0, buf PASS_REGS); buf0 = slice(0, u_l0, buf PASS_REGS);
bool rc = write_Text(buf0, outv, l0 PASS_REGS); bool rc = write_Text(buf0, outv, b_l0 PASS_REGS);
return rc; return rc;
} }
} }
@ -1048,7 +1051,7 @@ bool Yap_Splice_Text(int n, size_t cuts[], seq_tv_t *inp,
if (i > 0 && cuts[i] == 0) if (i > 0 && cuts[i] == 0)
break; break;
void *bufi = slice(next, cuts[i], buf PASS_REGS); void *bufi = slice(next, cuts[i], buf PASS_REGS);
if (!write_Text(bufi, outv + i, cuts[i] - next PASS_REGS)) { if (!write_Text(bufi, outv + i, strlen(bufi) PASS_REGS)) {
return false; return false;
} }
} }