support for UNICODE, and other bug fixes.

git-svn-id: https://yap.svn.sf.net/svnroot/yap/trunk@1725 b08c6af1-5177-4d33-ba66-4b1c6b8b522a
2006-11-27 17:42:03 +00:00
parent 0a21ac1b71
commit 0705ca0640
34 changed files with 2128 additions and 632 deletions
--- a/C/scanner.c
+++ b/C/scanner.c
@@ -57,7 +57,7 @@

 STATIC_PROTO(int my_getch, (int (*) (int)));
 STATIC_PROTO(Term float_send, (char *));
-STATIC_PROTO(Term get_num, (int *, int *, int, int (*) (int), int (*) (int),char *,UInt));
+STATIC_PROTO(Term get_num, (wchar_t *, wchar_t *, int, wchar_t (*) (int), wchar_t (*) (int),char *,UInt));

 /* token table with some help from Richard O'Keefe's PD scanner */
 static char chtype0[NUMBER_OF_CHARS+1] =
@@ -231,8 +231,8 @@ read_int_overflow(const char *s, Int base, Int val)
 #endif    
 }

-static unsigned int
-read_quoted_char(int *scan_nextp, int inp_stream, int (*QuotedNxtch)(int))
+static wchar_t
+read_quoted_char(int *scan_nextp, int inp_stream, wchar_t (*QuotedNxtch)(int))
 {
  int ch;

@@ -273,6 +273,46 @@ read_quoted_char(int *scan_nextp, int inp_stream, int (*QuotedNxtch)(int))
    return '\r';
  case 't':
    return '\t';
+  case 'u':
+    {
+      int i;
+      wchar_t wc='\0';
+
+      for (i=0; i< 4; i++) {
+	ch = QuotedNxtch(inp_stream);
+	if (ch>='0' && ch <= '9') {
+	  wc += (ch-'0')<<((3-i)*4);
+	} else if (ch>='a' && ch <= 'f') {
+	  wc += ((ch-'a')+10)<<((3-i)*4);
+	} else if (ch>='A' && ch <= 'F') {
+	  wc += ((ch-'A')+10)<<((3-i)*4);
+	} else {
+	  Yap_ErrorMessage = "invalid escape sequence";
+	  return 0;
+	}
+      }
+      return wc;
+    }
+  case 'U':
+    {
+      int i;
+      wchar_t wc='\0';
+
+      for (i=0; i< 8; i++) {
+	ch = QuotedNxtch(inp_stream);
+	if (ch>='0' && ch <= '9') {
+	  wc += (ch-'0')<<((7-i)*4);
+	} else if (ch>='a' && ch <= 'f') {
+	  wc += ((ch-'a')+10)<<((7-i)*4);
+	} else if (ch>='A' && ch <= 'F') {
+	  wc += ((ch-'A')+10)<<((7-i)*4);
+	} else {
+	  Yap_ErrorMessage = "invalid escape sequence";
+	  return 0;
+	}
+      }
+      return wc;
+    }
  case 'v':
    return '\v';
  case '\\':
@@ -415,7 +455,7 @@ read_quoted_char(int *scan_nextp, int inp_stream, int (*QuotedNxtch)(int))
 /* reads a number, either integer or float */

 static Term
-get_num(int *chp, int *chbuffp, int inp_stream, int (*Nxtch) (int), int (*QuotedNxtch) (int), char *s, UInt max_size)
+get_num(wchar_t *chp, wchar_t *chbuffp, int inp_stream, wchar_t (*Nxtch) (int), wchar_t (*QuotedNxtch) (int), char *s, UInt max_size)
 {
  char *sp = s;
  int ch = *chp;
@@ -450,7 +490,7 @@ get_num(int *chp, int *chbuffp, int inp_stream, int (*Nxtch) (int), int (*Quoted
    *sp++ = ch;
    ch = Nxtch(inp_stream);
    if (base == 0) {
-      Int ascii = ch;
+      wchar_t ascii = ch;
      int scan_extra = TRUE;

      if (ch == '\\' &&
@@ -460,7 +500,7 @@ get_num(int *chp, int *chbuffp, int inp_stream, int (*Nxtch) (int), int (*Quoted
      /* a quick way to represent ASCII */
      if (scan_extra)
 	*chp = Nxtch(inp_stream);
-      return MkIntTerm(ascii);
+      return MkIntegerTerm(ascii);
    } else if (base >= 10 && base <= 36) {
      int upper_case = 'A' - 11 + base;
      int lower_case = 'a' - 11 + base;
@@ -629,11 +669,11 @@ get_num(int *chp, int *chbuffp, int inp_stream, int (*Nxtch) (int), int (*Quoted
 /* given a function Nxtch scan until we  either find the number
   or end of file */
 Term
-Yap_scan_num(int (*Nxtch) (int))
+Yap_scan_num(wchar_t (*Nxtch) (int))
 {
  Term out;
  int sign = 1;
-  int ch, cherr;
+  wchar_t ch, cherr;
  char *ptr;

  Yap_ErrorMessage = NULL;
@@ -655,7 +695,7 @@ Yap_scan_num(int (*Nxtch) (int))
    Yap_clean_tokenizer(NULL, NULL, NULL);
    return TermNil;
  }
-  cherr = 0;
+  cherr = '\0';
  if (ASP-H < 1024)
    return TermNil;
  out = get_num(&ch, &cherr, -1, Nxtch, Nxtch, ptr, 4096);
@@ -672,15 +712,33 @@ Yap_scan_num(int (*Nxtch) (int))
  return(out);
 }

+
+static wchar_t *
+ch_to_wide(char *base, char *charp)
+{
+  int n = charp-base, i;
+  wchar_t *nb = (wchar_t *)base;
+
+  if ((nb+n) + 1024 > (wchar_t *)AuxSp) {
+    Yap_Error_TYPE = OUT_OF_AUXSPACE_ERROR;	  
+    Yap_ErrorMessage = "Heap Overflow While Scanning: please increase code space (-h)";
+    return NULL;
+  }
+  for (i=n; i > 0; i--) {
+    nb[i-1] = base[i-1];
+  }
+  return nb+n;
+}
+
 TokEntry *
 Yap_tokenizer(int inp_stream)
 {
  TokEntry *t, *l, *p;
  enum TokenKinds kind;
  int solo_flag = TRUE;
-  int ch;
-  int (*Nxtch) (int) = Stream[inp_stream].stream_getc_for_read;
-  int (*QuotedNxtch) (int) = Stream[inp_stream].stream_getc;
+  wchar_t ch, *wcharp;
+  wchar_t (*Nxtch) (int) = Stream[inp_stream].stream_wgetc_for_read;
+  wchar_t (*QuotedNxtch) (int) = Stream[inp_stream].stream_wgetc;

  Yap_ErrorMessage = NULL;
  Yap_Error_Size = 0;
@@ -694,7 +752,8 @@ Yap_tokenizer(int inp_stream)
  LOCK(Stream[inp_stream].streamlock);
  ch = Nxtch(inp_stream);
  do {
-    int och, quote, isvar;
+    wchar_t och;
+    int quote, isvar;
    char *charp, *mp;
    unsigned int len;
    char *TokImage = NULL;
@@ -785,7 +844,8 @@ Yap_tokenizer(int inp_stream)

    case NU:
      {
-	int cherr, cha = ch;
+	wchar_t cherr;
+	wchar_t cha = ch;
 	char *ptr;

 	cherr = 0;
@@ -915,12 +975,18 @@ Yap_tokenizer(int inp_stream)
      quote = ch;
      len = 0;
      ch = QuotedNxtch(inp_stream);
-      while (1) {
+      wcharp = NULL;
+
+      while (TRUE) {
 	if (charp + 1024 > (char *)AuxSp) {
 	  Yap_Error_TYPE = OUT_OF_AUXSPACE_ERROR;	  
 	  Yap_ErrorMessage = "Heap Overflow While Scanning: please increase code space (-h)";
 	  break;
 	}
+	if (ch >= 0xff){
+	  /* does not fit in ISO-LATIN */
+	  wcharp = ch_to_wide(TokImage, charp);
+	}
 	if (ch == 10  &&  yap_flags[CHARACTER_ESCAPE_FLAG] == ISO_CHARACTER_ESCAPES) {
 	  /* in ISO a new line terminates a string */
 	  Yap_ErrorMessage = "layout character \n inside quotes";
@@ -930,11 +996,25 @@ Yap_tokenizer(int inp_stream)
 	  ch = QuotedNxtch(inp_stream);
 	  if (ch != quote)
 	    break;
-	  *charp++ = ch;
+	  if (wcharp) 
+	    *wcharp++ = ch;
+	  else
+	    *charp++ = ch;
 	  ch = QuotedNxtch(inp_stream);
 	} else if (ch == '\\' && yap_flags[CHARACTER_ESCAPE_FLAG] != CPROLOG_CHARACTER_ESCAPES) {
 	  int scan_next = TRUE;
-	  *charp++ = read_quoted_char(&scan_next, inp_stream, QuotedNxtch);
+	  if (wcharp) 
+	    *wcharp++ = read_quoted_char(&scan_next, inp_stream, QuotedNxtch);
+	  else {
+	    wchar_t next = read_quoted_char(&scan_next, inp_stream, QuotedNxtch);
+	    if (next >= 0xff){
+	      /* does not fit in ISO-LATIN */
+	      wcharp = ch_to_wide(TokImage, charp);
+	      *wcharp++ = next;
+	    } else {
+	      *charp++ = next;
+	    }
+	  }
 	  if (scan_next) {
 	    ch = QuotedNxtch(inp_stream);
 	  }
@@ -943,7 +1023,10 @@ Yap_tokenizer(int inp_stream)
 	  t->Tok = Ord(kind = eot_tok);
 	  break;
 	} else {
-	  *charp++ = ch;
+	  if (wcharp) 
+	    *wcharp++ = ch;
+	  else
+	    *charp++ = ch;
 	  ch = QuotedNxtch(inp_stream);
 	}
 	++len;
@@ -958,9 +1041,16 @@ Yap_tokenizer(int inp_stream)
 	  return l;
 	}
      }
-      *charp = '\0';
+      if (wcharp) 
+	*wcharp++ = '\0';
+      else
+	*charp = '\0';
      if (quote == '"') {
-	mp = AllocScannerMemory(len + 1);
+	if (wcharp) {
+	  mp = AllocScannerMemory(sizeof(wchar_t)*(len+1));
+	} else {
+	  mp = AllocScannerMemory(len + 1);
+	}
 	if (mp == NULL) {
 	  UNLOCK(Stream[inp_stream].streamlock);
 	  Yap_ErrorMessage = "not enough heap space to read in string or quoted atom";
@@ -968,12 +1058,23 @@ Yap_tokenizer(int inp_stream)
 	  t->Tok = Ord(kind = eot_tok);
 	  return l;
 	}
-	strcpy(mp, TokImage);
+	if (wcharp) 
+	  wcscpy((wchar_t *)mp,(wchar_t *)TokImage);
+	else
+	  strcpy(mp, TokImage);
 	t->TokInfo = Unsigned(mp);
 	Yap_ReleasePreAllocCodeSpace((CODEADDR)TokImage);
-	t->Tok = Ord(kind = String_tok);
+	if (wcharp) {
+	  t->Tok = Ord(kind = WString_tok);
+	} else {
+	  t->Tok = Ord(kind = String_tok);
+	}
      } else {
-	t->TokInfo = Unsigned(Yap_LookupAtom(TokImage));
+	if (wcharp) {
+	  t->TokInfo = Unsigned(Yap_LookupWideAtom((wchar_t *)TokImage));
+	} else {
+	  t->TokInfo = Unsigned(Yap_LookupAtom(TokImage));
+	}
 	Yap_ReleasePreAllocCodeSpace((CODEADDR)TokImage);
 	t->Tok = Ord(kind = Name_tok);
 	if (ch == '(')