encoding stuff
This commit is contained in:
parent
b27c1964bf
commit
bc71e54f20
@ -181,12 +181,12 @@ original program path.
|
|||||||
YAP_FLAG(FAST_FLAG, "fast", true, booleanFlag, "false", NULL), /**< `fast `
|
YAP_FLAG(FAST_FLAG, "fast", true, booleanFlag, "false", NULL), /**< `fast `
|
||||||
|
|
||||||
If `on` allow fast machine code, if `off` (default) disable it. Only
|
If `on` allow fast machine code, if `off` (default) disable it. Only
|
||||||
available in experimental implemexbntations.
|
available in experimental implementations.
|
||||||
*/
|
*/
|
||||||
YAP_FLAG(FILE_NAME_VARIABLES_FLAG, "file_name_variables", true, booleanFlag,
|
YAP_FLAG(FILE_NAME_VARIABLES_FLAG, "file_name_variables", true, booleanFlag,
|
||||||
"true", NULL),
|
"true", NULL),
|
||||||
YAP_FLAG(FLOAT_FORMAT_FLAG, "float_format", true, isatom, "%15e",
|
YAP_FLAG(FLOAT_FORMAT_FLAG, "float_format", true, isatom, "%15e",
|
||||||
NULL), /**< + `float_format `
|
NULL), /**< + `float_format `
|
||||||
|
|
||||||
C-library `printf()` format specification used by write/1 and
|
C-library `printf()` format specification used by write/1 and
|
||||||
friends to determine how floating point numbers are printed. The
|
friends to determine how floating point numbers are printed. The
|
||||||
|
@ -266,9 +266,8 @@ inline static int cmpn_utf8(const utf8proc_uint8_t *pt1,
|
|||||||
|
|
||||||
// UTF16
|
// UTF16
|
||||||
|
|
||||||
#define LEAD_OFFSET (0xD800 - (0x10000 >> 10))
|
#define LEAD_OFFSET ((uint32_t)0xD800 - (uint32_t)(0x10000 >> 10))
|
||||||
#define SURROGATE_OFFSET ( 0x10000 - (0xD800 << 10) - 0xDC00 )
|
#define SURROGATE_OFFSET ( (uint32_t)0x10000 - (uint32_t)(0xD800 << 10) - (uint32_t)0xDC00 )
|
||||||
|
|
||||||
|
|
||||||
const char *Yap_tokRep(TokEntry *tokptr);
|
const char *Yap_tokRep(TokEntry *tokptr);
|
||||||
|
|
||||||
|
190
docs/syntax.md
190
docs/syntax.md
@ -1,12 +1,17 @@
|
|||||||
|
|
||||||
|
|
||||||
|
@file syntax.md
|
||||||
|
|
||||||
@defgroup YAPSyntax YAP Syntax
|
@defgroup YAPSyntax YAP Syntax
|
||||||
@ingroup mainpage
|
@ingroup mainpage
|
||||||
|
|
||||||
We will describe the syntax of YAP at two levels. We first will
|
We will describe the syntax of YAP at two levels. We first will
|
||||||
describe the syntax for Prolog terms. In a second level we describe
|
describe the syntax for Prolog terms. In a second level we describe
|
||||||
the \a tokens from which Prolog \a terms are
|
the tokens from which Prolog terms are
|
||||||
built.
|
built.
|
||||||
|
|
||||||
@section Formal_Syntax Syntax of Terms
|
@defgroup Formal_Syntax Syntax of Terms
|
||||||
|
@ingroup YAPSyntax
|
||||||
|
|
||||||
Below, we describe the syntax of YAP terms from the different
|
Below, we describe the syntax of YAP terms from the different
|
||||||
classes of tokens defined above. The formalism used will be <em>BNF</em>,
|
classes of tokens defined above. The formalism used will be <em>BNF</em>,
|
||||||
@ -81,15 +86,18 @@ dot with single quotes.
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
@section Tokens Prolog Tokens
|
# @defgroup Tokens Prolog Tokens
|
||||||
|
@ingroup YAPSyntax
|
||||||
|
|
||||||
Prolog tokens are grouped into the following categories:
|
Prolog tokens are grouped into the following categories:
|
||||||
|
|
||||||
@subsection Numbers Numbers
|
## @defgroup Numbers Numbers
|
||||||
|
@ingroup Tokens
|
||||||
|
|
||||||
Numbers can be further subdivided into integer and floating-point numbers.
|
Numbers can be further subdivided into integer and floating-point numbers.
|
||||||
|
|
||||||
@subsubsection Integers
|
### @defgroup Integers Integers
|
||||||
|
@ingroup Numbers
|
||||||
|
|
||||||
Integer numbers
|
Integer numbers
|
||||||
are described by the following regular expression:
|
are described by the following regular expression:
|
||||||
@ -136,7 +144,8 @@ the word size of the machine. This is 32 bits in most current machines,
|
|||||||
but 64 in some others, such as the Alpha running Linux or Digital
|
but 64 in some others, such as the Alpha running Linux or Digital
|
||||||
Unix. The scanner will read larger or smaller integers erroneously.
|
Unix. The scanner will read larger or smaller integers erroneously.
|
||||||
|
|
||||||
@subsubsection Floats
|
### @defgroup Floats Floats
|
||||||
|
@ingroup Numbers
|
||||||
|
|
||||||
Floating-point numbers are described by:
|
Floating-point numbers are described by:
|
||||||
|
|
||||||
@ -160,12 +169,13 @@ Examples:
|
|||||||
Floating-point numbers are represented as a double in the target
|
Floating-point numbers are represented as a double in the target
|
||||||
machine. This is usually a 64-bit number.
|
machine. This is usually a 64-bit number.
|
||||||
|
|
||||||
@subsection Strings Character Strings
|
## @defgroup Strings Character Strings
|
||||||
|
|
||||||
Strings are described by the following rules:
|
Strings are described by the following rules:
|
||||||
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~
|
||||||
string --> '"' string_quoted_characters '"'
|
string --> " string_quoted_characters "
|
||||||
|
string --> ` string_quoted_characters `
|
||||||
|
|
||||||
string_quoted_characters --> '"' '"' string_quoted_characters
|
string_quoted_characters --> '"' '"' string_quoted_characters
|
||||||
string_quoted_characters --> '\'
|
string_quoted_characters --> '\'
|
||||||
@ -177,10 +187,25 @@ Strings are described by the following rules:
|
|||||||
escape_sequence --> '\' | '"' | ''' | '`'
|
escape_sequence --> '\' | '"' | ''' | '`'
|
||||||
escape_sequence --> at_most_3_octal_digit_seq_char '\'
|
escape_sequence --> at_most_3_octal_digit_seq_char '\'
|
||||||
escape_sequence --> 'x' at_most_2_hexa_digit_seq_char '\'
|
escape_sequence --> 'x' at_most_2_hexa_digit_seq_char '\'
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~
|
||||||
where `string_character` in any character except the double quote
|
|
||||||
|
where `string_character` is any character except the double quote (back quote)
|
||||||
and escape characters.
|
and escape characters.
|
||||||
|
|
||||||
|
YAP supports four different textual elements:
|
||||||
|
|
||||||
|
+ Atoms, mentioned above, are textual representations of symbols, that are interned in the
|
||||||
|
data-base. They are stored either in ISO-LATIN-1 (first 256 code points), or as UTF-32.
|
||||||
|
|
||||||
|
+ Strings are atomic representations of text. The back-quote character is used to identify these objects in the program. Strings exist as stack objects, in the same way as other Prolog terms. As Prolog unification cannot be used to manipulate strings, YAP includes built-ins such as string_arg/3, sub_string/5, or string_concat to manipulate them efficiently. Strings are stored as opaque objects containing a
|
||||||
|
|
||||||
|
+ Lists of codes represent text as a list of numbers, where each number is a character code. A string of _N_ bytes requires _N_ pairs, that is _2N_ cells, leading to a total of 16 bytes per character on 64-bit machines. Thus, they are a very expensive, but very flexible representation, as one can use unification to construct and access string elements.
|
||||||
|
|
||||||
|
+ Lists of atoms represent text as a list of atoms, where each atom holds a single character. A string of _N_ bytes also requires _N_ pairs, that is _2N_ cells. They have similar properties to lists of codes.
|
||||||
|
|
||||||
|
The flags `double_quotes` and `backquoted_string` change the interpretation of text strings; they can take the
|
||||||
|
values `atom`, `string`, `codes`, and `chars`.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
@ -188,9 +213,7 @@ Examples:
|
|||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
The first string is an empty string, the last string shows the use of
|
The first string is an empty string, the last string shows the use of
|
||||||
double-quoting. The implementation of YAP represents strings as
|
double-quoting.
|
||||||
lists of integers. Since YAP 4.3.0 there is no static limit on string
|
|
||||||
size.
|
|
||||||
|
|
||||||
Escape sequences can be used to include the non-printable characters
|
Escape sequences can be used to include the non-printable characters
|
||||||
`a` (alert), `b` (backspace), `r` (carriage return),
|
`a` (alert), `b` (backspace), `r` (carriage return),
|
||||||
@ -210,13 +233,14 @@ The first three examples return a list including only character 12 (form
|
|||||||
feed). The last example escapes the escape character.
|
feed). The last example escapes the escape character.
|
||||||
|
|
||||||
Escape sequences were not available in C-Prolog and in original
|
Escape sequences were not available in C-Prolog and in original
|
||||||
versions of YAP up to 4.2.0. Escape sequences can be disable by using:
|
versions of YAP up to 4.2.0. Escape sequences can be disabled by using:
|
||||||
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
:- yap_flag(character_escapes,false).
|
:- yap_flag(character_escapes,false).
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
@subsection Atoms Atoms
|
## @defgroup Atoms Atoms
|
||||||
|
@ingroup Tokens
|
||||||
|
|
||||||
Atoms are defined by one of the following rules:
|
Atoms are defined by one of the following rules:
|
||||||
|
|
||||||
@ -256,7 +280,8 @@ Version `4.2.0` of YAP removed the previous limit of 256
|
|||||||
characters on an atom. Size of an atom is now only limited by the space
|
characters on an atom. Size of an atom is now only limited by the space
|
||||||
available in the system.
|
available in the system.
|
||||||
|
|
||||||
@subsection Variables Variables
|
## @defgroup Variables Variables
|
||||||
|
@ingroup Tokens
|
||||||
|
|
||||||
Variables are described by:
|
Variables are described by:
|
||||||
|
|
||||||
@ -276,8 +301,8 @@ variables are known as anonymous variables. Note that different
|
|||||||
occurrences of `_` on the same term represent <em>different</em>
|
occurrences of `_` on the same term represent <em>different</em>
|
||||||
anonymous variables.
|
anonymous variables.
|
||||||
|
|
||||||
@subsection Punctuation_Tokens Punctuation Tokens
|
## @defgroup Punctuation_Tokens Punctuation Tokens
|
||||||
|
@ingroup Tokens
|
||||||
Punctuation tokens consist of one of the following characters:
|
Punctuation tokens consist of one of the following characters:
|
||||||
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
@ -298,7 +323,9 @@ layout characters, the YAP parser behaves as if it had found a
|
|||||||
single blank character. The end of a file also counts as a blank
|
single blank character. The end of a file also counts as a blank
|
||||||
character for this purpose.
|
character for this purpose.
|
||||||
|
|
||||||
@section Encoding Wide Character Support
|
## @defgroup WideChars Encoding and Wide Character Support
|
||||||
|
@ingroup YAPSyntax
|
||||||
|
|
||||||
|
|
||||||
YAP now implements a SWI-Prolog compatible interface to wide
|
YAP now implements a SWI-Prolog compatible interface to wide
|
||||||
characters and the Universal Character Set (UCS). The following text
|
characters and the Universal Character Set (UCS). The following text
|
||||||
@ -319,7 +346,8 @@ atoms. If the text fits in ISO Latin-1, it is represented as an array
|
|||||||
of 8-bit characters. Otherwise the text is represented as an array of
|
of 8-bit characters. Otherwise the text is represented as an array of
|
||||||
wide chars, which may take 16 or 32 bits. This representational issue
|
wide chars, which may take 16 or 32 bits. This representational issue
|
||||||
is completely transparent to the Prolog user. Users of the foreign
|
is completely transparent to the Prolog user. Users of the foreign
|
||||||
language interface sometimes need to be aware of these issues though.
|
language interface sometimes need to be aware of these issues though. Notice that this will likely
|
||||||
|
change in the future; we will probably use a UTF-8 based representation.
|
||||||
|
|
||||||
Character coding comes into view when characters of strings need to be
|
Character coding comes into view when characters of strings need to be
|
||||||
read from or written to file or when they have to be communicated to
|
read from or written to file or when they have to be communicated to
|
||||||
@ -327,76 +355,105 @@ other software components using the foreign language interface. In this
|
|||||||
section we only deal with I/O through streams, which includes file I/O
|
section we only deal with I/O through streams, which includes file I/O
|
||||||
as well as I/O through network sockets.
|
as well as I/O through network sockets.
|
||||||
|
|
||||||
@subsection Stream_Encoding Wide character encodings on streams
|
## @defgroup Stream_Encoding Wide character encodings on streams
|
||||||
|
@ingroup WideChars
|
||||||
|
|
||||||
Although characters are uniquely coded using the UCS standard
|
The UCS standard describes all possible characters (or code points, as they include
|
||||||
internally, streams and files are byte (8-bit) oriented and there are a
|
ideograms, ligatures, and other symbols). The current version, Unicode 8.0, allows
|
||||||
variety of ways to represent the larger UCS codes in an 8-bit octet
|
code points up to 0x10FFFF, and thus allows for 1,114,112 code points. See [Unicode Charts](http://unicode.org/charts/) for the supported languages.
|
||||||
stream. The most popular one, especially in the context of the web, is
|
|
||||||
UTF-8. Bytes 0...127 represent simply the corresponding US-ASCII
|
Notice that most symbols are rarely used. Encodings represent the Unicode characters in a way
|
||||||
|
that is more suited for communication. The most popular encoding, especially in the context of the web and in the Unix/Linux/BSD/Mac communities, is
|
||||||
|
UTF-8. UTF-8 is compact and, as it uses bytes, does not have different endiannesses.
|
||||||
|
Bytes 0...127 represent simply the corresponding US-ASCII
|
||||||
character, while bytes 128...255 are used for multi-byte
|
character, while bytes 128...255 are used for multi-byte
|
||||||
encoding of characters placed higher in the UCS space. Especially on
|
encoding of characters placed higher in the UCS space.
|
||||||
MS-Windows the 16-bit Unicode standard, represented by pairs of bytes is
|
|
||||||
also popular.
|
|
||||||
|
|
||||||
Prolog I/O streams have a property called <em>encoding</em> which
|
Especially on
|
||||||
specifies the used encoding that influence `get_code/2` and
|
MS-Windows and Java the 16-bit Unicode standard, represented by pairs of bytes is
|
||||||
`put_code/2` as well as all the other text I/O predicates.
|
also popular. Originally, Microsoft supported a UCS-2 with 16 bits that
|
||||||
|
could represent only up to 64k characters. This was later extended to support the full
|
||||||
The default encoding for files is derived from the Prolog flag
|
Unicode; we will call the latter version UTF-16. The extension uses a hole in the first 64K code points. Characters above 0xFFFF are divided into two 2-byte words, each one in that hole. There are two versions of UTF-16: big and little
|
||||||
`encoding`, which is initialised from the environment. If the
|
endian. By default, UTF-16 is big endian; in practice it is most often used on Intel
|
||||||
|
hardware that is naturally little endian.
|
||||||
|
|
||||||
|
UTF-32, often called UCS-4, provides a natural interface where a code point is coded as
|
||||||
|
four octets. Unfortunately, it is also more expensive, so it is not as widely used.
|
||||||
|
|
||||||
|
Last, other encodings are also commonly used. One such legacy encoding is ISO-LATIN-1, that
|
||||||
|
supported Latin-based languages in Western Europe. YAP currently uses either ISO-LATIN-1 or UTF-32
|
||||||
|
internally.
|
||||||
|
|
||||||
|
Prolog supports the default encoding used by the Operating System.
|
||||||
|
Namely, YAP checks the variables LANG, LC_ALL and LC_CTYPE. Say, if at boot YAP detects that the
|
||||||
environment variable `LANG` ends in "UTF-8", this encoding is
|
environment variable `LANG` ends in "UTF-8", this encoding is
|
||||||
assumed. Otherwise the default is `text` and the translation is
|
assumed. Otherwise, the default is `text` and the translation is
|
||||||
left to the wide-character functions of the C-library (note that the
|
left to the wide-character functions of the C-library (note that the
|
||||||
Prolog native UTF-8 mode is considerably faster than the generic
|
Prolog native UTF-8 mode is considerably faster than the generic
|
||||||
`mbrtowc()` one). The encoding can be specified explicitly in
|
`mbrtowc()` one).
|
||||||
|
|
||||||
|
Prolog allows the encoding to be specified explicitly in
|
||||||
load_files/2 for loading Prolog source with an alternative
|
load_files/2 for loading Prolog source with an alternative
|
||||||
encoding, `open/4` when opening files or using `set_stream/2` on
|
encoding, `open/4` when opening files or using `set_stream/2` on
|
||||||
any open stream (not yet implemented). For Prolog source files we also
|
any open stream (not yet implemented). For Prolog source files we also
|
||||||
provide the `encoding/1` directive that can be used to switch
|
provide the `encoding/1` directive that can be used to switch
|
||||||
between encodings that are compatible to US-ASCII (`ascii`,
|
between encodings that are compatible to US-ASCII (`ascii`,
|
||||||
`iso_latin_1`, `utf8` and many locales).
|
`iso_latin_1`, `utf8` and many locales).
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
For
|
For
|
||||||
additional information and Unicode resources, please visit
|
additional information and Unicode resources, please visit the
|
||||||
<http://www.unicode.org/>.
|
[unicode](http://www.unicode.org/) organization web page.
|
||||||
|
|
||||||
YAP currently defines and supports the following encodings:
|
YAP currently defines and supports the following encodings:
|
||||||
|
|
||||||
+ octet
|
+ `octet`
|
||||||
Default encoding for <em>binary</em> streams. This causes
|
Default encoding for <em>binary</em> streams. This causes
|
||||||
the stream to be read and written fully untranslated.
|
the stream to be read and written fully untranslated.
|
||||||
|
|
||||||
+ ascii
|
+ `ascii` or `US_ASCII`
|
||||||
7-bit encoding in 8-bit bytes. Equivalent to `iso_latin_1`,
|
7-bit encoding in 8-bit bytes. Equivalent to `iso_latin_1`,
|
||||||
but generates errors and warnings on encountering values above
|
but generates errors and warnings on encountering values above
|
||||||
127.
|
127.
|
||||||
|
|
||||||
+ iso_latin_1
|
+ `iso_latin_1` or `ISO-8859-1`
|
||||||
8-bit encoding supporting many western languages. This causes
|
8-bit encoding supporting many western languages. This causes
|
||||||
the stream to be read and written fully untranslated.
|
the stream to be read and written fully untranslated.
|
||||||
|
|
||||||
+ text
|
+ `text`
|
||||||
C-library default locale encoding for text files. Files are read and
|
C-library default locale encoding for text files. Files are read and
|
||||||
written using the C-library functions `mbrtowc()` and
|
written using the C-library functions `mbrtowc()` and
|
||||||
`wcrtomb()`. This may be the same as one of the other locales,
|
`wcrtomb()`. This may be the same as one of the other locales,
|
||||||
notably it may be the same as `iso_latin_1` for western
|
notably it may be the same as `iso_latin_1` for western
|
||||||
languages and `utf8` in a UTF-8 context.
|
languages and `utf8` in a UTF-8 context.
|
||||||
|
|
||||||
+ utf8
|
+ `utf8`, `iso_utf8`, or `UTF-8`
|
||||||
Multi-byte encoding of full UCS, compatible to `ascii`.
|
Multi-byte encoding of the full Unicode 8, compatible to `ascii`.
|
||||||
See above.
|
See above.
|
||||||
|
|
||||||
+ unicode_be
|
+ `unicode_be` or `UCS-2BE`
|
||||||
Unicode Big Endian. Reads input in pairs of bytes, most
|
Unicode Big Endian. Reads input in pairs of bytes, most
|
||||||
significant byte first. Can only represent 16-bit characters.
|
significant byte first. Can only represent 16-bit characters.
|
||||||
|
|
||||||
+ unicode_le
|
+ `unicode_le` or `UCS-2LE`
|
||||||
Unicode Little Endian. Reads input in pairs of bytes, least
|
Unicode Little Endian. Reads input in pairs of bytes, least
|
||||||
significant byte first. Can only represent 16-bit characters.
|
significant byte first. Can only represent 16-bit characters.
|
||||||
|
|
||||||
|
+ `utf16_le` or `UTF-16LE` (experimental)
|
||||||
|
UTF-16 Little Endian. Reads input in pairs of bytes, least
|
||||||
|
significant byte first. Can represent the full Unicode.
|
||||||
|
|
||||||
|
+ `utf16_be` or `UTF-16BE` (experimental)
|
||||||
|
UTF-16 Big Endian. Reads input in pairs of bytes, most
|
||||||
|
significant byte first. Can represent the full Unicode.
|
||||||
|
|
||||||
|
+ `utf32_le` or `UTF-32LE` (experimental)
|
||||||
|
UTF-32 Little Endian. Reads input in groups of four bytes, least
|
||||||
|
significant byte first. Can represent the full Unicode.
|
||||||
|
|
||||||
|
+ `utf32_be` or `UTF-32BE` (experimental)
|
||||||
|
UTF-32 Big Endian. Reads input in groups of four bytes, most
|
||||||
|
significant byte first. Can represent the full Unicode.
|
||||||
|
|
||||||
|
|
||||||
Note that not all encodings can represent all characters. This implies
|
Note that not all encodings can represent all characters. This implies
|
||||||
that writing text to a stream may cause errors because the stream
|
that writing text to a stream may cause errors because the stream
|
||||||
@ -405,34 +462,31 @@ errors can be controlled using `open/4` or `set_stream/2` (not
|
|||||||
implemented). Initially the terminal stream write the characters using
|
implemented). Initially the terminal stream writes the characters using
|
||||||
Prolog escape sequences while other streams generate an I/O exception.
|
Prolog escape sequences while other streams generate an I/O exception.
|
||||||
|
|
||||||
@subsection BOM BOM: Byte Order Mark
|
|
||||||
|
|
||||||
|
### @defgroup BOM BOM: Byte Order Mark
|
||||||
|
@ingroup WideChars
|
||||||
|
|
||||||
From Stream Encoding, you may have got the impression that
|
From Stream Encoding, you may have got the impression that
|
||||||
text-files are complicated. This section deals with a related topic,
|
text-files are complicated. This section deals with a related topic,
|
||||||
making live often easier for the user, but providing another worry to
|
making life often easier for the user, but providing another worry to
|
||||||
the programmer. *BOM* or <em>Byte Order Marker</em> is a technique
|
the programmer. *BOM* or <em>Byte Order Marker</em> is a technique
|
||||||
for identifying Unicode text-files as well as the encoding they
|
for identifying Unicode text-files as well as the encoding they
|
||||||
use. Such files start with the Unicode character `0xFEFF`, a
|
use. Please read the [W3C](https://www.w3.org/International/questions/qa-byte-order-mark.en.php)
|
||||||
non-breaking, zero-width space character. This is a pretty unique
|
page for a detailed explanation of byte-order marks.
|
||||||
sequence that is not likely to be the start of a non-Unicode file and
|
|
||||||
uniquely distinguishes the various Unicode file formats. As it is a
|
|
||||||
zero-width blank, it even doesn't produce any output. This solves all
|
|
||||||
problems, or ...
|
|
||||||
|
|
||||||
Some formats start of as US-ASCII and may contain some encoding mark to
|
BOMs are necessary on multi-byte encodings, such as UTF-16 and UTF-32. There is a BOM for UTF-8, but it is rarely used.
|
||||||
switch to UTF-8, such as the `encoding="UTF-8"` in an XML header.
|
The BOM is handled by the open/4 predicate. By default, text-files are
|
||||||
Such formats often explicitly forbid the the use of a UTF-8 BOM. In
|
|
||||||
other cases there is additional information telling the encoding making
|
|
||||||
the use of a BOM redundant or even illegal.
|
|
||||||
|
|
||||||
The BOM is handled by the `open/4` predicate. By default, text-files are
|
|
||||||
probed for the BOM when opened for reading. If a BOM is found, the
|
probed for the BOM when opened for reading. If a BOM is found, the
|
||||||
encoding is set accordingly and the property `bom(true)` is
|
encoding is set accordingly and the property `bom(true)` is
|
||||||
available through stream_property/2. When opening a file for
|
available through stream_property/2. When opening a file for
|
||||||
writing, writing a BOM can be requested using the option
|
writing, writing a BOM can be requested using the option
|
||||||
`bom(true)` with `open/4`.
|
`bom(true)` with `open/4`. Do notice that YAP will write a BOM by default on UTF-16 (including UCS-2) and
|
||||||
|
UTF-32; otherwise the default is not to write a BOM. BOMs are not available for ASCII and
|
||||||
@subsection Operators Summary of YAP Predefined Operators
|
ISO-LATIN-1.
|
||||||
|
|
||||||
|
# @defgroup Operators Summary of YAP Predefined Operators
|
||||||
|
@ingroup YAPSyntax
|
||||||
|
|
||||||
The Prolog syntax caters for operators of three main kinds:
|
The Prolog syntax caters for operators of three main kinds:
|
||||||
|
|
||||||
|
59
os/charsio.c
59
os/charsio.c
@ -98,7 +98,7 @@ Int Yap_peek(int sno) {
|
|||||||
CACHE_REGS
|
CACHE_REGS
|
||||||
Int ocharcount, olinecount, olinepos;
|
Int ocharcount, olinecount, olinepos;
|
||||||
StreamDesc *s;
|
StreamDesc *s;
|
||||||
Int ch;
|
uint32_t ch;
|
||||||
|
|
||||||
s = GLOBAL_Stream + sno;
|
s = GLOBAL_Stream + sno;
|
||||||
#if USE_READLINE
|
#if USE_READLINE
|
||||||
@ -141,41 +141,50 @@ Int Yap_peek(int sno) {
|
|||||||
} else if (s->encoding == ENC_UTF16_BE) {
|
} else if (s->encoding == ENC_UTF16_BE) {
|
||||||
/* do the ungetc as if a write .. */
|
/* do the ungetc as if a write .. */
|
||||||
// computations
|
// computations
|
||||||
int lead = LEAD_OFFSET + (ch >> 10);
|
if (ch < 0x10000) {
|
||||||
int trail = 0xDC00 + (ch & 0x3FF);
|
ungetc(ch % 256, s->file);
|
||||||
|
ungetc(ch / 256, s->file);
|
||||||
if (lead) {
|
} else {
|
||||||
ungetc(lead / 256, s->file);
|
|
||||||
ungetc(lead % 256, s->file);
|
|
||||||
}
|
|
||||||
ungetc(trail / 256, s->file);
|
|
||||||
ungetc(trail % 256, s->file);
|
|
||||||
} else if (s->encoding == ENC_UTF16_LE) {
|
|
||||||
/* do the ungetc as if a write .. */
|
|
||||||
// computations
|
|
||||||
uint16_t lead = LEAD_OFFSET + (ch >> 10);
|
uint16_t lead = LEAD_OFFSET + (ch >> 10);
|
||||||
uint16_t trail = 0xDC00 + (ch & 0x3FF);
|
uint16_t trail = 0xDC00 + (ch & 0x3FF);
|
||||||
lead = 0;
|
|
||||||
trail = ch;
|
ungetc(lead % 256, s->file);
|
||||||
if (lead) {
|
ungetc(lead / 256, s->file);
|
||||||
ungetc(lead / 256, s->file);
|
ungetc(trail % 256, s->file);
|
||||||
ungetc(lead % 256, s->file);
|
ungetc(trail / 256, s->file);
|
||||||
}
|
}
|
||||||
if (trail) {
|
} else if (s->encoding == ENC_UTF16_LE) {
|
||||||
ungetc(trail / 256, s->file);
|
if (ch < 0x10000) {
|
||||||
ungetc(trail % 256, s->file);
|
ungetc(ch / 256, s->file);
|
||||||
|
ungetc(ch % 256, s->file);
|
||||||
|
} else {
|
||||||
|
uint16_t lead = LEAD_OFFSET + (ch >> 10);
|
||||||
|
uint16_t trail = 0xDC00 + (ch & 0x3FF);
|
||||||
|
|
||||||
|
ungetc(trail / 256, s->file);
|
||||||
|
ungetc(trail % 256, s->file);
|
||||||
|
ungetc(lead / 256, s->file);
|
||||||
|
ungetc(lead % 256, s->file);
|
||||||
}
|
}
|
||||||
} else if (s->encoding == ENC_ISO_UTF32_LE) {
|
} else if (s->encoding == ENC_ISO_UTF32_LE) {
|
||||||
ungetc( (ch >> 24) & 0xff, s->file);
|
ungetc( (ch >> 24) & 0xff, s->file);
|
||||||
ungetc( (ch >> 16) & 0xff, s->file);
|
ungetc( (ch >> 16) & 0xff, s->file);
|
||||||
ungetc( (ch >> 8) & 0xff, s->file);
|
ungetc( (ch >> 8) & 0xff, s->file);
|
||||||
return ungetc( ch & 0xff, s->file);
|
ungetc( ch & 0xff, s->file);
|
||||||
} else if (s->encoding == ENC_ISO_UTF32_BE) {
|
} else if (s->encoding == ENC_ISO_UTF32_BE) {
|
||||||
ungetc( ch & 0xff, s->file);
|
ungetc( ch & 0xff, s->file);
|
||||||
ungetc( (ch >> 8) & 0xff, s->file);
|
ungetc( (ch >> 8) & 0xff, s->file);
|
||||||
ungetc( (ch >> 16) & 0xff, s->file);
|
ungetc( (ch >> 16) & 0xff, s->file);
|
||||||
return ungetc( (ch >> 24) & 0xff, s->file);
|
ungetc( (ch >> 24) & 0xff, s->file);
|
||||||
}
|
} else if (s->encoding == ENC_UCS2_BE) {
|
||||||
|
/* do the ungetc as if a write .. */
|
||||||
|
// computations
|
||||||
|
ungetc(ch % 256, s->file);
|
||||||
|
ungetc(ch / 256, s->file);
|
||||||
|
} else if (s->encoding == ENC_UCS2_LE) {
|
||||||
|
ungetc(ch / 256, s->file);
|
||||||
|
ungetc(ch % 256, s->file);
|
||||||
|
}
|
||||||
return ch;
|
return ch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -29,6 +29,8 @@ typedef enum {
|
|||||||
ENC_UTF16_LE = 32, /// People who made the same mistake
|
ENC_UTF16_LE = 32, /// People who made the same mistake
|
||||||
ENC_ISO_UTF32_BE = 64, /// nobody
|
ENC_ISO_UTF32_BE = 64, /// nobody
|
||||||
ENC_ISO_UTF32_LE = 128, /// yes, nobody
|
ENC_ISO_UTF32_LE = 128, /// yes, nobody
|
||||||
|
ENC_UCS2_BE = 256, /// nobody
|
||||||
|
ENC_UCS2_LE = 512, /// yes, nobody
|
||||||
} encoding_t;
|
} encoding_t;
|
||||||
|
|
||||||
#if WORDS_BIGENDIAN
|
#if WORDS_BIGENDIAN
|
||||||
@ -52,7 +54,7 @@ typedef enum {
|
|||||||
SEQ_ENC_ISO_UTF8, /// Most everyone nowadays
|
SEQ_ENC_ISO_UTF8, /// Most everyone nowadays
|
||||||
SEQ_ENC_UTF16_BE, /// People who made a mistake
|
SEQ_ENC_UTF16_BE, /// People who made a mistake
|
||||||
SEQ_ENC_UTF16_LE, /// People who made the same mistake
|
SEQ_ENC_UTF16_LE, /// People who made the same mistake
|
||||||
v\ SEQ_ENC_ISO_UTF32_BE, /// nobody
|
SEQ_ENC_ISO_UTF32_BE, /// nobody
|
||||||
SEQ_ENC_ISO_UTF32_LE /// yes, nobody
|
SEQ_ENC_ISO_UTF32_LE /// yes, nobody
|
||||||
} seq_encoding_t;
|
} seq_encoding_t;
|
||||||
|
|
||||||
@ -105,6 +107,10 @@ static inline const char *enc_name(encoding_t enc) {
|
|||||||
return "utf16_be";
|
return "utf16_be";
|
||||||
case ENC_UTF16_LE:
|
case ENC_UTF16_LE:
|
||||||
return "utf16_le";
|
return "utf16_le";
|
||||||
|
case ENC_UCS2_BE:
|
||||||
|
return "ucs2_be";
|
||||||
|
case ENC_UCS2_LE:
|
||||||
|
return "ucs2_le";
|
||||||
case ENC_ISO_UTF32_BE:
|
case ENC_ISO_UTF32_BE:
|
||||||
return "utf32_be";
|
return "utf32_be";
|
||||||
case ENC_ISO_UTF32_LE:
|
case ENC_ISO_UTF32_LE:
|
||||||
@ -133,7 +139,7 @@ static inline encoding_t enc_id(const char *s, encoding_t enc_bom) {
|
|||||||
}
|
}
|
||||||
if (!strcmp(s, "UTF-16LE"))
|
if (!strcmp(s, "UTF-16LE"))
|
||||||
return ENC_UTF16_LE;
|
return ENC_UTF16_LE;
|
||||||
if (!strcmp(s, "UTF16-BE"))
|
if (!strcmp(s, "UTF-16BE"))
|
||||||
return ENC_UTF16_BE;
|
return ENC_UTF16_BE;
|
||||||
if (!strcmp(s, "octet"))
|
if (!strcmp(s, "octet"))
|
||||||
return ENC_OCTET;
|
return ENC_OCTET;
|
||||||
@ -158,12 +164,23 @@ static inline encoding_t enc_id(const char *s, encoding_t enc_bom) {
|
|||||||
return ENC_ISO_UTF32_LE;
|
return ENC_ISO_UTF32_LE;
|
||||||
if (!strcmp(s, "ISO-8859-1"))
|
if (!strcmp(s, "ISO-8859-1"))
|
||||||
return ENC_ISO_LATIN1;
|
return ENC_ISO_LATIN1;
|
||||||
|
if (!strcmp(s, "US_ASCII"))
|
||||||
|
return ENC_ISO_ASCII;
|
||||||
// just for SWI compat, this actually refers to
|
// just for SWI compat, this actually refers to
|
||||||
// UCS-2
|
// UCS-2
|
||||||
if (!strcmp(s, "unicode_be"))
|
if (!strcmp(s, "unicode_be"))
|
||||||
return ENC_UTF16_BE;
|
return ENC_UCS2_BE;
|
||||||
if (!strcmp(s, "unicode_le"))
|
if (!strcmp(s, "unicode_le"))
|
||||||
return ENC_UTF16_LE;
|
return ENC_UCS2_LE;
|
||||||
|
if (!strcmp(s, "UCS-2")) {
|
||||||
|
if (enc_bom == ENC_UTF16_LE)
|
||||||
|
return ENC_UCS2_LE;
|
||||||
|
return ENC_UCS2_BE;
|
||||||
|
}
|
||||||
|
if (!strcmp(s, "UCS-2LE"))
|
||||||
|
return ENC_UCS2_LE;
|
||||||
|
if (!strcmp(s, "UCS-2BE"))
|
||||||
|
return ENC_UCS2_BE;
|
||||||
if (!strcmp(s, "default")) {
|
if (!strcmp(s, "default")) {
|
||||||
if (enc_bom != ENC_OCTET)
|
if (enc_bom != ENC_OCTET)
|
||||||
return enc_bom;
|
return enc_bom;
|
||||||
|
167
os/iopreds.c
167
os/iopreds.c
@ -814,7 +814,7 @@ case ENC_UTF16_LE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
|||||||
if (c1 == -1)
|
if (c1 == -1)
|
||||||
return post_process_weof(st);
|
return post_process_weof(st);
|
||||||
wch = (c1 << 8) + ch;
|
wch = (c1 << 8) + ch;
|
||||||
if (wch >= 0xEFFF) {
|
if (wch >= 0xd800 && wch < 0xdc00) {
|
||||||
int c2 = st->stream_getc(sno);
|
int c2 = st->stream_getc(sno);
|
||||||
if (c2 == -1)
|
if (c2 == -1)
|
||||||
return post_process_weof(st);
|
return post_process_weof(st);
|
||||||
@ -826,6 +826,7 @@ case ENC_UTF16_LE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
|||||||
return wch;
|
return wch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
case ENC_UTF16_BE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
case ENC_UTF16_BE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
||||||
// little-endian: start with big shot
|
// little-endian: start with big shot
|
||||||
{
|
{
|
||||||
@ -834,7 +835,7 @@ case ENC_UTF16_BE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
|||||||
if (c1 == -1)
|
if (c1 == -1)
|
||||||
return post_process_weof(st);
|
return post_process_weof(st);
|
||||||
wch = (c1) + (ch<<8);
|
wch = (c1) + (ch<<8);
|
||||||
if (wch >= 0xEFFF) {
|
if (wch >= 0xd800 && wch < 0xdc00) {
|
||||||
int c3 = st->stream_getc(sno);
|
int c3 = st->stream_getc(sno);
|
||||||
if (c3 == -1)
|
if (c3 == -1)
|
||||||
return post_process_weof(st);
|
return post_process_weof(st);
|
||||||
@ -845,6 +846,31 @@ case ENC_UTF16_BE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
|||||||
}
|
}
|
||||||
return wch;
|
return wch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case ENC_UCS2_BE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
||||||
|
// little-endian: start with big shot
|
||||||
|
{
|
||||||
|
int wch;
|
||||||
|
int c1 = st->stream_getc(sno);
|
||||||
|
if (c1 == -1)
|
||||||
|
return post_process_weof(st);
|
||||||
|
wch = (c1) + (ch<<8);
|
||||||
|
return wch;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
case ENC_UCS2_LE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
||||||
|
// little-endian: start with big shot
|
||||||
|
{
|
||||||
|
int wch;
|
||||||
|
int c1 = st->stream_getc(sno);
|
||||||
|
if (c1 == -1)
|
||||||
|
return post_process_weof(st);
|
||||||
|
wch = (c1 << 8) + ch;
|
||||||
|
|
||||||
|
return wch;
|
||||||
|
}
|
||||||
|
|
||||||
case ENC_ISO_UTF32_BE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
case ENC_ISO_UTF32_BE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
||||||
// little-endian: start with big shot
|
// little-endian: start with big shot
|
||||||
{
|
{
|
||||||
@ -977,64 +1003,94 @@ case ENC_ISO_UTF32_LE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
|||||||
}
|
}
|
||||||
case ENC_ISO_UTF8:
|
case ENC_ISO_UTF8:
|
||||||
if (ch < 0x80) {
|
if (ch < 0x80) {
|
||||||
return GLOBAL_Stream[sno].stream_putc(sno, ch);
|
GLOBAL_Stream[sno].stream_putc(sno, ch);
|
||||||
} else if (ch < 0x800) {
|
} else if (ch < 0x800) {
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, 0xC0 | ch >> 6);
|
GLOBAL_Stream[sno].stream_putc(sno, 0xC0 | ch >> 6);
|
||||||
return GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch & 0x3F));
|
GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch & 0x3F));
|
||||||
} else if (ch < 0x10000) {
|
} else if (ch < 0x10000) {
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, 0xE0 | ch >> 12);
|
GLOBAL_Stream[sno].stream_putc(sno, 0xE0 | ch >> 12);
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch >> 6 & 0x3F));
|
GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch >> 6 & 0x3F));
|
||||||
return GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch & 0x3F));
|
GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch & 0x3F));
|
||||||
} else if (ch < 0x200000) {
|
} else if (ch < 0x200000) {
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, 0xF0 | ch >> 18);
|
GLOBAL_Stream[sno].stream_putc(sno, 0xF0 | ch >> 18);
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch >> 12 & 0x3F));
|
GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch >> 12 & 0x3F));
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch >> 6 & 0x3F));
|
GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch >> 6 & 0x3F));
|
||||||
return GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch & 0x3F));
|
GLOBAL_Stream[sno].stream_putc(sno, 0x80 | (ch & 0x3F));
|
||||||
} else {
|
} else {
|
||||||
/* should never happen */
|
/* should never happen */
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
return ch;
|
||||||
break;
|
break;
|
||||||
|
case ENC_UTF16_LE:
|
||||||
|
{
|
||||||
|
if (ch < 0x10000) {
|
||||||
|
GLOBAL_Stream[sno].stream_putc(sno, (ch & 0xff));
|
||||||
|
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 8));
|
||||||
|
} else {
|
||||||
|
// computations
|
||||||
|
uint16_t ich = ch;
|
||||||
|
uint16_t lead = LEAD_OFFSET + (ich >> 10);
|
||||||
|
uint16_t trail = 0xDC00 + (ich & 0x3FF);
|
||||||
|
|
||||||
|
GLOBAL_Stream[sno].stream_putc(sno, (trail & 0xff));
|
||||||
|
GLOBAL_Stream[sno].stream_putc(sno, (trail >> 8));
|
||||||
|
GLOBAL_Stream[sno].stream_putc(sno, (lead & 0xff));
|
||||||
|
GLOBAL_Stream[sno].stream_putc(sno, (lead >> 8));
|
||||||
|
}
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
case ENC_UTF16_BE:
|
case ENC_UTF16_BE:
|
||||||
{
|
{
|
||||||
// computations
|
// computations
|
||||||
int lead = LEAD_OFFSET + (ch >> 10);
|
if (ch < 0x10000) {
|
||||||
int trail = 0xDC00 + (ch & 0x3FF);
|
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 8));
|
||||||
|
GLOBAL_Stream[sno].stream_putc(sno, (ch & 0xff));
|
||||||
|
} else {
|
||||||
|
uint16_t lead = (uint16_t)LEAD_OFFSET + ((uint16_t)ch >> 10);
|
||||||
|
uint16_t trail = 0xDC00 + ((uint16_t)ch & 0x3FF);
|
||||||
|
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (trail & 0xff));
|
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (trail >> 8));
|
|
||||||
if (trail) {
|
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (lead & 0xff));
|
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (lead >> 8));
|
GLOBAL_Stream[sno].stream_putc(sno, (lead >> 8));
|
||||||
}
|
GLOBAL_Stream[sno].stream_putc(sno, (lead & 0xff));
|
||||||
return lead >> 8;
|
|
||||||
}
|
|
||||||
case ENC_UTF16_LE:
|
|
||||||
{
|
|
||||||
// computations
|
|
||||||
int lead = LEAD_OFFSET + (ch >> 10);
|
|
||||||
int trail = 0xDC00 + (ch & 0x3FF);
|
|
||||||
|
|
||||||
|
|
||||||
if (lead) {
|
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (lead >> 8));
|
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (lead & 0xff));
|
|
||||||
}
|
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (trail >> 8));
|
GLOBAL_Stream[sno].stream_putc(sno, (trail >> 8));
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (trail & 0xff));
|
GLOBAL_Stream[sno].stream_putc(sno, (trail & 0xff));
|
||||||
return lead >> 8;
|
|
||||||
}
|
}
|
||||||
case ENC_ISO_UTF32_LE:
|
return ch;
|
||||||
|
}
|
||||||
|
case ENC_UCS2_LE:
|
||||||
|
{
|
||||||
|
if (ch >= 0x10000) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
GLOBAL_Stream[sno].stream_putc(sno, (ch & 0xff));
|
||||||
|
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 8));
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
|
case ENC_UCS2_BE:
|
||||||
|
{
|
||||||
|
// computations
|
||||||
|
if (ch < 0x10000) {
|
||||||
|
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 8));
|
||||||
|
GLOBAL_Stream[sno].stream_putc(sno, (ch & 0xff));
|
||||||
|
return ch;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
case ENC_ISO_UTF32_BE:
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 24) & 0xff);
|
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 24) & 0xff);
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 16) & 0xff);
|
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 16) & 0xff);
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 8) & 0xff);
|
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 8) & 0xff);
|
||||||
return GLOBAL_Stream[sno].stream_putc(sno, ch & 0xff);
|
GLOBAL_Stream[sno].stream_putc(sno, ch & 0xff);
|
||||||
case ENC_ISO_UTF32_BE:
|
return ch;
|
||||||
|
case ENC_ISO_UTF32_LE:
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, ch & 0xff);
|
GLOBAL_Stream[sno].stream_putc(sno, ch & 0xff);
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 8) & 0xff);
|
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 8) & 0xff);
|
||||||
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 16) & 0xff);
|
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 16) & 0xff);
|
||||||
return GLOBAL_Stream[sno].stream_putc(sno, (ch >> 24) & 0xff);
|
GLOBAL_Stream[sno].stream_putc(sno, (ch >> 24) & 0xff);
|
||||||
|
return ch;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return -1;
|
return -1;
|
||||||
@ -1105,45 +1161,53 @@ case ENC_ISO_UTF32_LE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
|||||||
switch (st->encoding) {
|
switch (st->encoding) {
|
||||||
case ENC_ISO_UTF8:
|
case ENC_ISO_UTF8:
|
||||||
if (st->stream_putc(sno, 0xEF) < 0)
|
if (st->stream_putc(sno, 0xEF) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
if (st->stream_putc(sno, 0xBB) < 0)
|
if (st->stream_putc(sno, 0xBB) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
if (st->stream_putc(sno, 0xBF) < 0)
|
if (st->stream_putc(sno, 0xBF) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
st->status |= HAS_BOM_f;
|
st->status |= HAS_BOM_f;
|
||||||
return TRUE;
|
return true;
|
||||||
case ENC_UTF16_BE:
|
case ENC_UTF16_BE:
|
||||||
|
case ENC_UCS2_BE:
|
||||||
if (st->stream_putc(sno, 0xFE) < 0)
|
if (st->stream_putc(sno, 0xFE) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
if (st->stream_putc(sno, 0xFF) < 0)
|
if (st->stream_putc(sno, 0xFF) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
st->status |= HAS_BOM_f;
|
st->status |= HAS_BOM_f;
|
||||||
return TRUE;
|
return true;
|
||||||
case ENC_UTF16_LE:
|
case ENC_UTF16_LE:
|
||||||
|
case ENC_UCS2_LE:
|
||||||
if (st->stream_putc(sno, 0xFF) < 0)
|
if (st->stream_putc(sno, 0xFF) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
if (st->stream_putc(sno, 0xFE) < 0)
|
if (st->stream_putc(sno, 0xFE) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
case ENC_ISO_UTF32_BE:
|
st->status |= HAS_BOM_f;
|
||||||
|
return true;
|
||||||
|
case ENC_ISO_UTF32_BE:
|
||||||
if (st->stream_putc(sno, 0x00) < 0)
|
if (st->stream_putc(sno, 0x00) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
if (st->stream_putc(sno, 0x00) < 0)
|
if (st->stream_putc(sno, 0x00) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
if (st->stream_putc(sno, 0xFE) < 0)
|
if (st->stream_putc(sno, 0xFE) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
if (st->stream_putc(sno, 0xFF) < 0)
|
if (st->stream_putc(sno, 0xFF) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
|
st->status |= HAS_BOM_f;
|
||||||
|
return true;
|
||||||
case ENC_ISO_UTF32_LE:
|
case ENC_ISO_UTF32_LE:
|
||||||
if (st->stream_putc(sno, 0xFF) < 0)
|
if (st->stream_putc(sno, 0xFF) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
if (st->stream_putc(sno, 0xFE) < 0)
|
if (st->stream_putc(sno, 0xFE) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
if (st->stream_putc(sno, 0x00) < 0)
|
if (st->stream_putc(sno, 0x00) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
if (st->stream_putc(sno, 0x00) < 0)
|
if (st->stream_putc(sno, 0x00) < 0)
|
||||||
return FALSE;
|
return false;
|
||||||
|
st->status |= HAS_BOM_f;
|
||||||
|
return true;
|
||||||
default:
|
default:
|
||||||
return TRUE;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1471,6 +1535,7 @@ case ENC_ISO_UTF32_LE: // check http://unicode.org/faq/utf_bom.html#utf16-3
|
|||||||
}
|
}
|
||||||
// BOM mess
|
// BOM mess
|
||||||
if (encoding == ENC_UTF16_BE || encoding == ENC_UTF16_LE ||
|
if (encoding == ENC_UTF16_BE || encoding == ENC_UTF16_LE ||
|
||||||
|
encoding == ENC_UCS2_BE || encoding == ENC_UCS2_LE ||
|
||||||
encoding == ENC_ISO_UTF32_BE || encoding == ENC_ISO_UTF32_LE) {
|
encoding == ENC_ISO_UTF32_BE || encoding == ENC_ISO_UTF32_LE) {
|
||||||
needs_bom = true;
|
needs_bom = true;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user