yap-6.3/os/encoding.h

/*************************************************************************
*									 *
*	 YAP Prolog 	%W% %G%
*									 *
*	Yap Prolog was developed at NCCUP - Universidade do Porto	 *
*									 *
* Copyright L.Damas, V.S.Costa and Universidade do Porto 1985-2003	 *
*									 *
**************************************************************************
*									 *
* File:		encoding.h						 *
* Last rev:	22/1/03							 *
* mods:									 *
* comments:	UNICODE encoding support (based on SWI-Prolog)		 *
*									 *
*************************************************************************/

#ifndef ENCODING_H

#define ENCODING_H 1

#include "YapError.h"

/// Character encodings understood by the I/O layer.
///
/// ENC_OCTET is zero; every other member occupies its own bit, so each
/// non-zero value is a distinct power of two.
typedef enum {
  ENC_OCTET = 0,               ///< binary files
  ENC_ISO_LATIN1 = 1 << 0,     ///< ISO Latin-1 (US + Western Europe)
  ENC_ISO_ASCII = 1 << 1,      ///< ASCII (US only)
  ENC_ISO_ANSI = 1 << 2,       ///< ANSI
  ENC_ISO_UTF8 = 1 << 3,       ///< UTF-8, the most common choice nowadays
  ENC_UTF16_BE = 1 << 4,       ///< UTF-16, big-endian
  ENC_UTF16_LE = 1 << 5,       ///< UTF-16, little-endian
  ENC_ISO_UTF32_BE = 1 << 6,   ///< UTF-32, big-endian (rarely used)
  ENC_ISO_UTF32_LE = 1 << 7,   ///< UTF-32, little-endian (rarely used)
  ENC_UCS2_BE = 1 << 8,        ///< UCS-2, big-endian (rarely used)
  ENC_UCS2_LE = 1 << 9,        ///< UCS-2, little-endian (rarely used)
} encoding_t;

/* ENC_WCHAR selects the UTF-32 variant according to WORDS_BIGENDIAN:
 * big-endian when the macro is defined to a non-zero value, little-endian
 * otherwise (including when it is not defined at all).
 * NOTE(review): presumably this is the encoding of in-memory wchar_t
 * text -- confirm against the users of ENC_WCHAR. */
#if defined(WORDS_BIGENDIAN) && WORDS_BIGENDIAN
#define ENC_WCHAR ENC_ISO_UTF32_BE
#else
#define ENC_WCHAR ENC_ISO_UTF32_LE
#endif

#ifdef YAP_H

/// Read the current default encoding, as set by the user or as initially
/// configured. enc_id() in this header falls back to this value for
/// unrecognised names. Defined outside this header.
encoding_t Yap_DefaultEncoding(void);
/// Defined outside this header.
/// NOTE(review): presumably reports the encoding of the host environment
/// (locale), as opposed to the user-settable default -- confirm against the
/// definition.
encoding_t Yap_SystemEncoding(void);
/// Defined outside this header.
/// NOTE(review): presumably makes new_encoding the value later returned by
/// Yap_DefaultEncoding() -- confirm against the definition.
void Yap_SetDefaultEncoding(encoding_t new_encoding);

#if HAVE_XLOCALE_H
/// Dense (sequential) numbering of the encodings, 0 .. 8, in the same order
/// as the one-bit-per-encoding values of encoding_t.
/// There are no entries for the two UCS-2 encodings.
typedef enum {
  SEQ_ENC_OCTET = 0,        ///< binary files
  SEQ_ENC_ISO_LATIN1 = 1,   ///< ISO Latin-1 (US + Western Europe)
  SEQ_ENC_ISO_ASCII = 2,    ///< ASCII (US only)
  SEQ_ENC_ISO_ANSI = 3,     ///< ANSI
  SEQ_ENC_ISO_UTF8 = 4,     ///< UTF-8, the most common choice nowadays
  SEQ_ENC_UTF16_BE = 5,     ///< UTF-16, big-endian
  SEQ_ENC_UTF16_LE = 6,     ///< UTF-16, little-endian
  SEQ_ENC_ISO_UTF32_BE = 7, ///< UTF-32, big-endian (rarely used)
  SEQ_ENC_ISO_UTF32_LE = 8  ///< UTF-32, little-endian (rarely used)
} seq_encoding_t;

/// convert from unary to binary representation.
/// Convert a one-bit-per-encoding value (encoding_t) into its sequential
/// index.
///
/// @param inp  an encoding_t value: 0 (ENC_OCTET) or a single set bit.
/// @return 0 for ENC_OCTET, otherwise the 1-based position of the set bit,
///         so ENC_ISO_LATIN1 (1) gives 1 and ENC_ISO_UTF32_LE (128) gives 8,
///         matching the SEQ_ENC_* constants.
///
/// The ffsll variants report the lowest set bit while the portable fallback
/// reports the highest set bit among the low 16; the two agree for the
/// single-bit values of encoding_t.
///
/// NOTE(review): ENC_UCS2_BE / ENC_UCS2_LE yield 9 and 10, for which
/// seq_encoding_t has no named constant.
static inline seq_encoding_t seq_encoding(encoding_t inp) {
#if HAVE__BUILTIN_FFSLL
  return (seq_encoding_t)__builtin_ffsll(inp);
#elif HAVE_FFSLL
  return (seq_encoding_t)ffsll(inp);
#else
  // supports max 16 different encodings.
  unsigned int bits = (unsigned int)inp;
  // Positions are 1-based (as with ffsll), so the count starts at 1; the
  // steps below add the distance of the set bit from bit 0.
  unsigned int out = 1;

  if (bits == 0)
    return (seq_encoding_t)0;
  if (bits & (0xffu << 8)) {
    bits >>= 8;
    out += 8;
  }
  if (bits & (0xfu << 4)) {
    bits >>= 4;
    out += 4;
  }
  if (bits & (0x3u << 2)) {
    bits >>= 2;
    out += 2;
  }
  if (bits & (0x1u << 1))
    out++;
  return (seq_encoding_t)out;
#endif
}

/// Table with one slot per seq_encoding_t constant (indices
/// SEQ_ENC_OCTET .. SEQ_ENC_ISO_UTF32_LE); defined outside this header.
/// NOTE(review): presumably indexed with the result of seq_encoding(); if so,
/// the UCS-2 encodings (which map to 9 and 10) would be out of range --
/// confirm against the users of this table.
extern xlocale enc_locales[SEQ_ENC_ISO_UTF32_LE + 1];
#endif

/// Return a printable name for an encoding.
///
/// @param enc  the encoding to describe.
/// @return a pointer to a string literal (never NULL). ENC_ISO_ANSI shares
///         the name "octet" with ENC_OCTET; any value that is not one of the
///         encoding_t members yields a placeholder string.
static inline const char *enc_name(encoding_t enc) {
  const char *label;

  if (enc == ENC_OCTET || enc == ENC_ISO_ANSI) {
    label = "octet";
  } else if (enc == ENC_ISO_LATIN1) {
    label = "iso_latin_1";
  } else if (enc == ENC_ISO_ASCII) {
    label = "ascii";
  } else if (enc == ENC_ISO_UTF8) {
    label = "utf8";
  } else if (enc == ENC_UTF16_BE) {
    label = "utf16_be";
  } else if (enc == ENC_UTF16_LE) {
    label = "utf16_le";
  } else if (enc == ENC_UCS2_BE) {
    label = "ucs2_be";
  } else if (enc == ENC_UCS2_LE) {
    label = "ucs2_le";
  } else if (enc == ENC_ISO_UTF32_BE) {
    label = "utf32_be";
  } else if (enc == ENC_ISO_UTF32_LE) {
    label = "utf32_le";
  } else {
    label = "thanks for watching!!";
  }
  return label;
}

/// Map an encoding name to its encoding_t value.
///
/// Matching is exact and case-sensitive (strcmp). Accepted spellings are the
/// lower-case underscore forms ("utf8", "iso_latin_1", "unicode_be", ...),
/// the upper-case hyphenated forms ("UTF-8", "ISO-8859-1", "UCS-2BE", ...)
/// and the names enc_name() produces for the encoding_t members, including
/// "ascii", "ucs2_be" and "ucs2_le".
///
/// @param s        NUL-terminated encoding name; must not be NULL.
/// @param enc_bom  encoding suggested by a byte-order mark, or ENC_OCTET when
///                 there is none. It only matters for the endian-neutral
///                 names "UTF-16", "UTF-32" and "UCS-2" (little-endian is
///                 chosen only when enc_bom says so, big-endian otherwise)
///                 and for "default" (enc_bom wins unless it is ENC_OCTET).
/// @return the matching encoding; Yap_DefaultEncoding() for "default" without
///         a BOM hint and for any unrecognised name.
static inline encoding_t enc_id(const char *s, encoding_t enc_bom) {
  // Names whose meaning does not depend on the BOM.
  static const struct {
    const char *name;
    encoding_t enc;
  } fixed_names[] = {
      {"iso_utf8", ENC_ISO_UTF8},
      {"utf8", ENC_ISO_UTF8},
      {"UTF-8", ENC_ISO_UTF8},
      {"utf16_le", ENC_UTF16_LE},
      {"utf16_be", ENC_UTF16_BE},
      {"UTF-16LE", ENC_UTF16_LE},
      {"UTF-16BE", ENC_UTF16_BE},
      {"octet", ENC_OCTET},
      {"iso_latin_1", ENC_ISO_LATIN1},
      {"ISO-8859-1", ENC_ISO_LATIN1},
      {"iso_ascii", ENC_ISO_ASCII},
      {"US_ASCII", ENC_ISO_ASCII},
      {"ascii", ENC_ISO_ASCII}, // spelling returned by enc_name()
      {"iso_ansi", ENC_ISO_ANSI},
      {"utf32_be", ENC_ISO_UTF32_BE},
      {"utf32_le", ENC_ISO_UTF32_LE},
      {"UTF-32BE", ENC_ISO_UTF32_BE},
      {"UTF-32LE", ENC_ISO_UTF32_LE},
      // just for SWI compat, "unicode_*" actually refers to UCS-2
      {"unicode_be", ENC_UCS2_BE},
      {"unicode_le", ENC_UCS2_LE},
      {"UCS-2BE", ENC_UCS2_BE},
      {"UCS-2LE", ENC_UCS2_LE},
      {"ucs2_be", ENC_UCS2_BE}, // spelling returned by enc_name()
      {"ucs2_le", ENC_UCS2_LE}, // spelling returned by enc_name()
  };
  unsigned int i;

  for (i = 0; i < sizeof(fixed_names) / sizeof(fixed_names[0]); i++) {
    if (!strcmp(s, fixed_names[i].name))
      return fixed_names[i].enc;
  }

  // Endian-neutral names: the BOM decides, big-endian when it does not.
  if (!strcmp(s, "UTF-16"))
    return enc_bom == ENC_UTF16_LE ? ENC_UTF16_LE : ENC_UTF16_BE;
  if (!strcmp(s, "UTF-32"))
    return enc_bom == ENC_ISO_UTF32_LE ? ENC_ISO_UTF32_LE : ENC_ISO_UTF32_BE;
  // The UCS-2 check compares against ENC_UTF16_LE, not ENC_UCS2_LE.
  if (!strcmp(s, "UCS-2"))
    return enc_bom == ENC_UTF16_LE ? ENC_UCS2_LE : ENC_UCS2_BE;

  // "default" honours a detected BOM; everything else that reaches this
  // point is unknown and gets the default encoding.
  if (!strcmp(s, "default") && enc_bom != ENC_OCTET)
    return enc_bom;
  return Yap_DefaultEncoding();
}

#endif

#endif
unicode 2015-06-18 01:34:39 +01:00			`/*************************************************************************`
			`* *`
			`* YAP Prolog %W% %G%`
			`* *`
			`* Yap Prolog was developed at NCCUP - Universidade do Porto *`
			`* *`
			`* Copyright L.Damas, V.S.Costa and Universidade do Porto 1985-2003 *`
			`* *`
			`**************************************************************************`
			`* *`
			`* File: yapio.h *`
			`* Last rev: 22/1/03 *`
			`* mods: *`
			`* comments: UNICODE encoding support (based on SWI-Prolog) *`
			`* *`
			`*************************************************************************/`

			`#ifndef ENCODING_H`

			`#define ENCODING_H 1`

missing define 2016-09-21 20:44:49 +01:00			`#include "YapError.h"`

unicode 2015-06-18 01:34:39 +01:00			`typedef enum {`
encoding fixes 2016-02-14 04:14:20 +00:00			`ENC_OCTET = 0, /// binary files`
			`ENC_ISO_LATIN1 = 1, /// US+West Europe`
			`ENC_ISO_ASCII = 2, /// US only`
			`ENC_ISO_ANSI = 4, /// Who cares`
			`ENC_ISO_UTF8 = 8, /// Most everyone nowadays`
			`ENC_UTF16_BE = 16, /// People who made a mistake`
			`ENC_UTF16_LE = 32, /// People who made the same mistake`
unicode 2015-06-18 01:34:39 +01:00			`ENC_ISO_UTF32_BE = 64, /// nobody`
			`ENC_ISO_UTF32_LE = 128, /// yes, nobody`
missing define 2016-09-21 20:44:49 +01:00			`ENC_UCS2_BE = 256, /// nobody`
			`ENC_UCS2_LE = 512, /// yes, nobody`
unicode 2015-06-18 01:34:39 +01:00			`} encoding_t;`
IO patches simplify error handling use get and inject use wide support in OS be stricter in checkin streams and arguments 2015-10-08 02:23:45 +01:00
Fix encoding 2016-02-18 12:10:58 +00:00			`#if WORDS_BIGENDIAN`
IO patches simplify error handling use get and inject use wide support in OS be stricter in checkin streams and arguments 2015-10-08 02:23:45 +01:00			`#define ENC_WCHAR ENC_ISO_UTF32_BE`
			`#else`
			`#define ENC_WCHAR ENC_ISO_UTF32_LE`
			`#endif`

			`#ifdef YAP_H`

unicode 2015-06-18 01:34:39 +01:00			`/// read the current environment, as set by the user or as Initial`
encoding fixes 2016-02-14 04:14:20 +00:00			`encoding_t Yap_DefaultEncoding(void);`
misc_bugs_when_compiling_problog 2016-04-14 12:00:09 +01:00			`encoding_t Yap_SystemEncoding(void);`
unicode 2015-06-18 01:34:39 +01:00			`void Yap_SetDefaultEncoding(encoding_t new_encoding);`

IO patches simplify error handling use get and inject use wide support in OS be stricter in checkin streams and arguments 2015-10-08 02:23:45 +01:00			`#if HAVE_XLOCALE_H`
			`typedef enum {`
missing define 2016-09-21 20:44:49 +01:00			`SEQ_ENC_OCTET, /// binary files`
			`SEQ_ENC_ISO_LATIN1, /// US+West Europe`
			`SEQ_ENC_ISO_ASCII, /// US only`
			`SEQ_ENC_ISO_ANSI, /// Who cares`
			`SEQ_ENC_ISO_UTF8, /// Most everyone nowadays`
			`SEQ_ENC_UTF16_BE, /// People who made a mistake`
			`SEQ_ENC_UTF16_LE, /// People who made the same mistake`
encoding stuff 2016-02-19 19:36:11 +00:00			`SEQ_ENC_ISO_UTF32_BE, /// nobody`
missing define 2016-09-21 20:44:49 +01:00			`SEQ_ENC_ISO_UTF32_LE /// yes, nobody`
encoding fixes 2016-02-14 04:14:20 +00:00			`} seq_encoding_t;`
IO patches simplify error handling use get and inject use wide support in OS be stricter in checkin streams and arguments 2015-10-08 02:23:45 +01:00
			`/// convert from unary to binary representation.`
			`static inline seq_encoding_t seq_encoding(encoding_t inp) {`
			`#if HAVE__BUILTIN_FFSLL`
			`return __builtin_ffsll(inp);`
			`#elif HAVE_FFSLL`
			`return ffsll(inp);`
			`#else`
			`unsigned int out;`
			`// supports max 16 different encodings.`
encoding fixes 2016-02-14 04:14:20 +00:00			`if (inp == 0)`
IO patches simplify error handling use get and inject use wide support in OS be stricter in checkin streams and arguments 2015-10-08 02:23:45 +01:00			`return 0L;`
			`// if (inp & ((CELL)0xffffL << 16)) {inp >>= 16; out += 16;}`
encoding fixes 2016-02-14 04:14:20 +00:00			`if (inp & ((CELL)0xffL << 8)) {`
			`inp >>= 8;`
			`out += 8;`
			`}`
			`if (inp & ((CELL)0xfL << 4)) {`
			`inp >>= 4;`
			`out += 4;`
			`}`
			`if (inp & ((CELL)0x3L << 2)) {`
			`inp >>= 2;`
			`out += 2;`
			`}`
			`if (inp & ((CELL)0x1 << 1))`
			`out++;`
IO patches simplify error handling use get and inject use wide support in OS be stricter in checkin streams and arguments 2015-10-08 02:23:45 +01:00			`#endif`
			`return out;`
			`}`

encoding fixes 2016-02-14 04:14:20 +00:00			`extern xlocale enc_locales[SEQ_ENC_ISO_UTF32_LE + 1];`
IO patches simplify error handling use get and inject use wide support in OS be stricter in checkin streams and arguments 2015-10-08 02:23:45 +01:00			`#endif`

encoding fixes 2016-02-14 04:14:20 +00:00			`static inline const char *enc_name(encoding_t enc) {`
			`switch (enc) {`
			`case ENC_OCTET:`
			`return "octet";`
			`case ENC_ISO_LATIN1:`
			`return "iso_latin_1";`
			`case ENC_ISO_ASCII:`
			`return "ascii";`
			`case ENC_ISO_ANSI:`
			`return "octet";`
			`case ENC_ISO_UTF8:`
			`return "utf8";`
			`case ENC_UTF16_BE:`
			`return "utf16_be";`
			`case ENC_UTF16_LE:`
			`return "utf16_le";`
encoding stuff 2016-02-19 19:36:11 +00:00			`case ENC_UCS2_BE:`
			`return "ucs2_be";`
			`case ENC_UCS2_LE:`
			`return "ucs2_le";`
encoding fixes 2016-02-14 04:14:20 +00:00			`case ENC_ISO_UTF32_BE:`
			`return "utf32_be";`
			`case ENC_ISO_UTF32_LE:`
			`return "utf32_le";`
			`default:`
			`return "thanks for watching!!";`
			`}`
unicode 2015-06-18 01:34:39 +01:00			`}`

Fix encoding 2016-02-18 12:10:58 +00:00			`static inline encoding_t enc_id(const char *s, encoding_t enc_bom) {`
encoding fixes 2016-02-14 04:14:20 +00:00			`{`
			`if (!strcmp(s, "iso_utf8"))`
			`return ENC_ISO_UTF8;`
			`if (!strcmp(s, "utf8"))`
			`return ENC_ISO_UTF8;`
Fix encoding 2016-02-18 12:10:58 +00:00			`if (!strcmp(s, "UTF-8"))`
			`return ENC_ISO_UTF8;`
encoding fixes 2016-02-14 04:14:20 +00:00			`if (!strcmp(s, "utf16_le"))`
			`return ENC_UTF16_LE;`
Fix encoding 2016-02-18 12:10:58 +00:00			`if (!strcmp(s, "utf16_be"))`
			`return ENC_UTF16_BE;`
			`if (!strcmp(s, "UTF-16")) {`
missing define 2016-09-21 20:44:49 +01:00			`if (enc_bom == ENC_UTF16_LE)`
Fix encoding 2016-02-18 12:10:58 +00:00			`return ENC_UTF16_LE;`
			`return ENC_UTF16_BE;`
			`}`
			`if (!strcmp(s, "UTF-16LE"))`
			`return ENC_UTF16_LE;`
encoding stuff 2016-02-19 19:36:11 +00:00			`if (!strcmp(s, "UTF-16BE"))`
Fix encoding 2016-02-18 12:10:58 +00:00			`return ENC_UTF16_BE;`
encoding fixes 2016-02-14 04:14:20 +00:00			`if (!strcmp(s, "octet"))`
			`return ENC_OCTET;`
			`if (!strcmp(s, "iso_latin_1"))`
			`return ENC_ISO_LATIN1;`
			`if (!strcmp(s, "iso_ascii"))`
			`return ENC_ISO_ASCII;`
			`if (!strcmp(s, "iso_ansi"))`
			`return ENC_ISO_ANSI;`
			`if (!strcmp(s, "utf32_be"))`
			`return ENC_ISO_UTF32_BE;`
			`if (!strcmp(s, "utf32_le"))`
			`return ENC_ISO_UTF32_LE;`
Fix encoding 2016-02-18 12:10:58 +00:00			`if (!strcmp(s, "UTF-32")) {`
			`if (enc_bom == ENC_ISO_UTF32_LE)`
			`return ENC_ISO_UTF32_LE;`
			`return ENC_ISO_UTF32_BE;`
			`}`
			`if (!strcmp(s, "UTF-32BE"))`
			`return ENC_ISO_UTF32_BE;`
			`if (!strcmp(s, "UTF-32LE"))`
			`return ENC_ISO_UTF32_LE;`
			`if (!strcmp(s, "ISO-8859-1"))`
			`return ENC_ISO_LATIN1;`
encoding stuff 2016-02-19 19:36:11 +00:00			`if (!strcmp(s, "US_ASCII"))`
			`return ENC_ISO_ASCII;`
Fix encoding 2016-02-18 12:10:58 +00:00			`// just for SWI compat, this actually refers to`
			`// UCS-2`
			`if (!strcmp(s, "unicode_be"))`
encoding stuff 2016-02-19 19:36:11 +00:00			`return ENC_UCS2_BE;`
Fix encoding 2016-02-18 12:10:58 +00:00			`if (!strcmp(s, "unicode_le"))`
encoding stuff 2016-02-19 19:36:11 +00:00			`return ENC_UCS2_LE;`
			`if (!strcmp(s, "UCS-2")) {`
missing define 2016-09-21 20:44:49 +01:00			`if (enc_bom == ENC_UTF16_LE)`
encoding stuff 2016-02-19 19:36:11 +00:00			`return ENC_UCS2_LE;`
			`return ENC_UCS2_BE;`
			`}`
			`if (!strcmp(s, "UCS-2LE"))`
			`return ENC_UCS2_LE;`
			`if (!strcmp(s, "UCS-2BE"))`
			`return ENC_UCS2_BE;`
Fix encoding 2016-02-18 12:10:58 +00:00			`if (!strcmp(s, "default")) {`
missing define 2016-09-21 20:44:49 +01:00			`if (enc_bom != ENC_OCTET)`
			`return enc_bom;`
			`return Yap_DefaultEncoding();`
			`} else {`
Fix encoding 2016-02-18 12:10:58 +00:00			`return Yap_DefaultEncoding();`
unicode 2015-06-18 01:34:39 +01:00			`}`
encoding fixes 2016-02-14 04:14:20 +00:00			`}`
unicode 2015-06-18 01:34:39 +01:00			`}`

IO patches simplify error handling use get and inject use wide support in OS be stricter in checkin streams and arguments 2015-10-08 02:23:45 +01:00			`#endif`
unicode 2015-06-18 01:34:39 +01:00
			`#endif`