look for errors

2016-08-15 14:51:36 -05:00
parent fccf3d9f6f
commit cad55bcf71
1 changed files with 56 additions and 21 deletions
--- a/os/getw.h
+++ b/os/getw.h
@@ -1,4 +1,19 @@
 /* Non-zero when the top two bits of ch are 10 (bit pattern 10xxxxxx),
    i.e. the byte has the form of a UTF-8 continuation byte. */
 #define utf_cont(ch)  (((ch) & 0xc0) == 0x80)
 /* Expands to post_process_read_wchar(1, v, st): the value 1 is passed in
    place of a decoded character; the ch argument is not used by the expansion. */
 #define encoding_error(ch,v,st) post_process_read_wchar(1, v, st)
 /*
  * Handle a -1 result from a read on a FILE-backed stream.
  *
  * If the stream's FILE has its error indicator set, the indicator is
  * cleared and 1 is returned. Otherwise the result of
  * post_process_weof(st) is returned.
  */
 static int post_process_f_weof(StreamDesc *st)
 {
   /* No error flagged on the FILE: defer to the regular EOF handler. */
   if (!ferror(st->file))
     return post_process_weof(st);

   /* Error flagged: reset the indicator and report 1 to the caller. */
   clearerr(st->file);
   return 1;
 }
 /// Compose a wide char from a sequence of getchars.
 ///  This is a slow-lane routine, called if no specialised code
 ///  is available.
@@ -7,20 +22,20 @@ extern int get_wchar(int sno) {
  int ch = st->stream_getc(sno);
  if (ch == -1)
-    return post_process_weof(st);
+    return post_process_f_weof(st);
  switch (st->encoding) {
  case ENC_OCTET:
-  // no error detection, all characters are ok.
+    // no error detection, all characters are ok.
  case ENC_ISO_LATIN1:
    return post_process_read_wchar(ch, 1, st);
-  // 7 bits code, anything above is bad news
+    // 7 bits code, anything above is bad news
  case ENC_ISO_ASCII:
    if (ch & 0x80) {
      /* error */
    }
    return post_process_read_wchar(ch, 1, st);
-  // default OS encoding, depends on locale.
+    // default OS encoding, depends on locale.
  case ENC_ISO_ANSI: {
    char buf[8];
    int out;
@@ -38,7 +53,7 @@ extern int get_wchar(int sno) {
    }
    return post_process_read_wchar(wch, n, st);
  }
-  // UTF-8 works o 8 bits.
+    // UTF-8 works o 8 bits.
  case ENC_ISO_UTF8: {
    int wch;
    unsigned char buf[8];
@@ -46,28 +61,36 @@ extern int get_wchar(int sno) {
    if (ch < 0x80) {
      return post_process_read_wchar(ch, 1, st);
    }
-    // if ((ch - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
+    if ((ch - 0xc2) > (0xf4-0xc2)) {
      return encoding_error(ch, 1, st);
    }
    if (ch < 0xe0) { // 2-byte sequence
                     // Must have valid continuation character
      int c1 = buf[0] = st->stream_getc(sno);
      if (c1 == -1)
        return post_process_weof(st);
-      // if (!utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
+      if (!utf_cont(c1)) {
 	return encoding_error(ch, 2, st);
      }
      wch = ((ch & 0x1f) << 6) | (c1 & 0x3f);
      return post_process_read_wchar(wch, 2, st);
    }
    if (ch < 0xf0) { // 3-byte sequence
      // if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
      //   return UTF8PROC_ERROR_INVALIDUTF8;
      // Check for surrogate chars
      // if (ch == 0xed && *str > 0x9f)
      //    return UTF8PROC_ERROR_INVALIDUTF8;
      int c1 = st->stream_getc(sno);
      if (c1 == -1)
        return post_process_weof(st);
      //    return UTF8PROC_ERROR_INVALIDUTF8;
      if (ch == 0xed && c1 > 0x9f) {
        return encoding_error(ch, 1, st);
      }
      int c2 = st->stream_getc(sno);
      if (c2 == -1)
        return post_process_weof(st);
      if ( !utf_cont(c1) || !utf_cont(c2)) {
 	return encoding_error(ch, 2, st);
 	// Check for surrogate chars
      }
      wch = ((ch & 0xf) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f);
      return post_process_read_wchar(wch, 3, st);
    } else {
@@ -80,8 +103,11 @@ extern int get_wchar(int sno) {
      int c3 = st->stream_getc(sno);
      if (c3 == -1)
        return post_process_weof(st);
      if ( !utf_cont(c1) || !utf_cont(c2) || !utf_cont(c3)) {
 	return encoding_error(ch, 3, st);
      }
      wch = ((ch & 7) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) |
-            (c3 & 0x3f);
+	(c3 & 0x3f);
      return post_process_read_wchar(wch, 4, st);
    }
  }
@@ -208,7 +234,6 @@ extern int get_wchar(int sno) {
 extern int get_wchar_UTF8(int sno) {
  StreamDesc *st = GLOBAL_Stream + sno;
  int ch = st->stream_getc(sno);
  if (ch == -1)
    return post_process_weof(st);
  else {
@@ -224,7 +249,9 @@ extern int get_wchar_UTF8(int sno) {
      int c1 = buf[0] = st->stream_getc(sno);
      if (c1 == -1)
        return post_process_weof(st);
-      // if (!utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
+      if (!utf_cont(c1)) {
 	return encoding_error(ch, 2, st);
      }
      wch = ((ch & 0x1f) << 6) | (c1 & 0x3f);
      return post_process_read_wchar(wch, 2, st);
    }
@@ -237,23 +264,31 @@ extern int get_wchar_UTF8(int sno) {
      int c1 = st->stream_getc(sno);
      if (c1 == -1)
        return post_process_weof(st);
      if (ch == 0xed && c1 > 0x9f)
         return  encoding_error(ch, 2, st);
      int c2 = st->stream_getc(sno);
      if (c2 == -1)
        return post_process_weof(st);
-      wch = ((ch & 0xf) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f);
+     wch = ((ch & 0xf)<<12) | ((c1 & 0x3f)<<6) | (c2 & 0x3f);
     if (wch < 0x800)
         return encoding_error(ch, 3, st);
      return post_process_read_wchar(wch, 3, st);
    } else {
      int c1 = st->stream_getc(sno);
      if (c1 == -1)
-        return post_process_weof(st);
+	return post_process_weof(st);
      int c2 = st->stream_getc(sno);
      if (c2 == -1)
-        return post_process_weof(st);
+	return post_process_weof(st);
      int c3 = st->stream_getc(sno);
      if (c3 == -1)
-        return post_process_weof(st);
+	return post_process_weof(st);
-      wch = ((ch & 7) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) |
+   if (ch == 0xf0) {
-            (c3 & 0x3f);
+    if (c1 < 0x90) return  encoding_error(ch, 4, st);
  } else if (c1 == 0xf4) {
    if (c2 > 0x8f) return  encoding_error(ch, 4, st);
  }
   wch = ((ch & 7)<<18) | ((c1 & 0x3f)<<12) | ((c2 & 0x3f)<<6) | (c3 & 0x3f);
      return post_process_read_wchar(wch, 4, st);
    }
  }