diff --git a/os/getw.h b/os/getw.h
index 0a2fc8859..b2574b0d4 100644
--- a/os/getw.h
+++ b/os/getw.h
@@ -1,4 +1,19 @@
 
+#define utf_cont(ch) (((ch) & 0xc0) == 0x80) // UTF-8 continuation byte: 10xxxxxx
+// Malformed input: ch is dropped and code 1 is passed on in its place.
+#define encoding_error(ch,v,st) post_process_read_wchar(1, v, st)
+// EOF on a FILE stream: clear a pending I/O error and return 1, else normal EOF handling.
+static int post_process_f_weof(StreamDesc *st)
+{
+  if (ferror(st->file)) {
+    clearerr(st->file);
+    return 1;
+  } else {
+    return post_process_weof(st);
+  }
+
+}
+
 /// compose a wide char from a sequence of getchars
 /// this is a slow lane routine, called if no specialised code
 /// isavailable.
@@ -7,20 +22,20 @@ extern int get_wchar(int sno) {
   int ch = st->stream_getc(sno);
 
   if (ch == -1)
-    return post_process_weof(st);
+    return post_process_f_weof(st);
 
   switch (st->encoding) {
   case ENC_OCTET:
-    // no error detection, all characters are ok.
+  // no error detection, all characters are ok.
   case ENC_ISO_LATIN1:
     return post_process_read_wchar(ch, 1, st);
-    // 7 bits code, anything above is bad news
+  // 7 bits code, anything above is bad news
   case ENC_ISO_ASCII:
     if (ch & 0x80) {
       /* error */
     }
     return post_process_read_wchar(ch, 1, st);
-    // default OS encoding, depends on locale.
+  // default OS encoding, depends on locale.
   case ENC_ISO_ANSI: {
     char buf[8];
     int out;
@@ -38,7 +53,7 @@ extern int get_wchar(int sno) {
     }
     return post_process_read_wchar(wch, n, st);
   }
-    // UTF-8 works o 8 bits.
+  // UTF-8 works on 8 bits.
   case ENC_ISO_UTF8: {
     int wch;
     unsigned char buf[8];
@@ -46,28 +61,36 @@ extern int get_wchar(int sno) {
     if (ch < 0x80) {
       return post_process_read_wchar(ch, 1, st);
     }
-    // if ((ch - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
+    if ((unsigned)(ch - 0xc2) > (0xf4-0xc2)) { // unsigned compare: bytes below 0xc2 wrap and are rejected too
+      return encoding_error(ch, 1, st);
+    }
     if (ch < 0xe0) { // 2-byte sequence
       // Must have valid continuation character
       int c1 = buf[0] = st->stream_getc(sno);
       if (c1 == -1)
         return post_process_weof(st);
-      // if (!utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
+      if (!utf_cont(c1)) {
+        return encoding_error(ch, 2, st);
+      }
       wch = ((ch & 0x1f) << 6) | (c1 & 0x3f);
       return post_process_read_wchar(wch, 2, st);
     }
     if (ch < 0xf0) { // 3-byte sequence
-      // if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
-      // return UTF8PROC_ERROR_INVALIDUTF8;
-      // Check for surrogate chars
-      // if (ch == 0xed && *str > 0x9f)
-      // return UTF8PROC_ERROR_INVALIDUTF8;
       int c1 = st->stream_getc(sno);
       if (c1 == -1)
         return post_process_weof(st);
+      // Check for surrogate chars (0xed followed by 0xa0 or above)
+      if (ch == 0xed && c1 > 0x9f) {
+        return encoding_error(ch, 2, st);
+      }
       int c2 = st->stream_getc(sno);
       if (c2 == -1)
         return post_process_weof(st);
+      // Both trailing bytes must be continuation bytes
+      if (!utf_cont(c1) || !utf_cont(c2)) {
+        return encoding_error(ch, 3, st);
+      }
+
       wch = ((ch & 0xf) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f);
       return post_process_read_wchar(wch, 3, st);
     } else {
@@ -80,8 +103,11 @@ extern int get_wchar(int sno) {
       int c3 = st->stream_getc(sno);
       if (c3 == -1)
         return post_process_weof(st);
+      if (!utf_cont(c1) || !utf_cont(c2) || !utf_cont(c3)) {
+        return encoding_error(ch, 4, st);
+      }
       wch = ((ch & 7) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) |
-            (c3 & 0x3f);
+	    (c3 & 0x3f);
       return post_process_read_wchar(wch, 4, st);
     }
   }
@@ -208,7 +234,6 @@ extern int get_wchar(int sno) {
 extern int get_wchar_UTF8(int sno) {
   StreamDesc *st = GLOBAL_Stream + sno;
   int ch = st->stream_getc(sno);
-
   if (ch == -1)
     return post_process_weof(st);
   else {
@@ -224,7 +249,9 @@ extern int get_wchar_UTF8(int sno) {
       int c1 = buf[0] = st->stream_getc(sno);
       if (c1 == -1)
         return post_process_weof(st);
-      // if (!utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
+      if (!utf_cont(c1)) {
+        return encoding_error(ch, 2, st);
+      }
       wch = ((ch & 0x1f) << 6) | (c1 & 0x3f);
       return post_process_read_wchar(wch, 2, st);
     }
@@ -237,23 +264,31 @@ extern int get_wchar_UTF8(int sno) {
       int c1 = st->stream_getc(sno);
       if (c1 == -1)
         return post_process_weof(st);
+      if (ch == 0xed && c1 > 0x9f) // surrogate range U+D800..U+DFFF
+        return encoding_error(ch, 2, st);
       int c2 = st->stream_getc(sno);
       if (c2 == -1)
         return post_process_weof(st);
-      wch = ((ch & 0xf) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f);
+      wch = ((ch & 0xf)<<12) | ((c1 & 0x3f)<<6) | (c2 & 0x3f);
+      if (wch < 0x800) // overlong 3-byte encoding
+        return encoding_error(ch, 3, st);
       return post_process_read_wchar(wch, 3, st);
     } else {
       int c1 = st->stream_getc(sno);
       if (c1 == -1)
-        return post_process_weof(st);
+	return post_process_weof(st);
       int c2 = st->stream_getc(sno);
       if (c2 == -1)
-        return post_process_weof(st);
+	return post_process_weof(st);
       int c3 = st->stream_getc(sno);
       if (c3 == -1)
-        return post_process_weof(st);
-      wch = ((ch & 7) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) |
-            (c3 & 0x3f);
+	return post_process_weof(st);
+      if (ch == 0xf0) {
+        if (c1 < 0x90) return encoding_error(ch, 4, st); // overlong: below U+10000
+      } else if (ch == 0xf4) {
+        if (c1 > 0x8f) return encoding_error(ch, 4, st); // beyond U+10FFFF
+      }
+      wch = ((ch & 7)<<18) | ((c1 & 0x3f)<<12) | ((c2 & 0x3f)<<6) | (c3 & 0x3f);
       return post_process_read_wchar(wch, 4, st);
     }
   }