fix some encoding stuff and add some documentation.

git-svn-id: https://yap.svn.sf.net/svnroot/yap/trunk@1863 b08c6af1-5177-4d33-ba66-4b1c6b8b522a
This commit is contained in:
vsc 2007-04-03 15:03:11 +00:00
parent 917c777381
commit 35174e0901
6 changed files with 321 additions and 11 deletions

View File

@ -1740,6 +1740,40 @@ get_wchar(int sno)
#define MB_LEN_MAX 6
#endif
/*
 * handle_write_encoding_error(sno, ch)
 *
 * Called when character `ch` cannot be represented in the encoding of
 * stream `sno`.  What happens depends on the stream's status flags:
 *
 *   RepError_Xml_f    - write an XML numeric character reference,
 *                       "&#<decimal code>;", using ASCII characters only.
 *   RepError_Prolog_f - write a Prolog escape using ASCII characters only:
 *                       "\uXXXX" (4 hex digits) for codes up to 0xFFFF,
 *                       "\UXXXXXXXX" (8 hex digits) for larger codes.
 *   neither           - raise REPRESENTATION_ERROR_CHARACTER.
 *
 * Returns `ch` when a replacement sequence was written, -1 on error.
 *
 * The replacement is always written through stream_putc one ASCII
 * character at a time, so it is representable in both the ASCII and the
 * ISO Latin-1 encodings that call this function.
 */
static int
handle_write_encoding_error(int sno, wchar_t ch)
{
  /* wchar_t may be signed: work on an unsigned 32-bit view of the code */
  unsigned long code = ((unsigned long)ch) & 0xFFFFFFFFUL;

  if (Stream[sno].status & RepError_Xml_f) {
    /* use HTML/XML encoding in ASCII */
    char digits[3 * sizeof(unsigned long) + 1];
    int ndigits = 0;

    /* collect decimal digits least-significant first ... */
    do {
      digits[ndigits++] = (char)('0' + (int)(code % 10));
      code /= 10;
    } while (code != 0);
    Stream[sno].stream_putc(sno, '&');
    Stream[sno].stream_putc(sno, '#');
    /* ... and emit them most-significant first */
    while (ndigits > 0)
      Stream[sno].stream_putc(sno, digits[--ndigits]);
    Stream[sno].stream_putc(sno, ';');
    return ch;
  } else if (Stream[sno].status & RepError_Prolog_f) {
    /* write quoted */
    static const char hexdigit[] = "0123456789abcdef";
    int shift;

    Stream[sno].stream_putc(sno, '\\');
    if (code > 0xFFFFUL) {
      /* outside the 16-bit range: 8 hex digits */
      Stream[sno].stream_putc(sno, 'U');
      shift = 28;
    } else {
      Stream[sno].stream_putc(sno, 'u');
      shift = 12;
    }
    for (; shift >= 0; shift -= 4)
      Stream[sno].stream_putc(sno, hexdigit[(code >> shift) & 0xf]);
    return ch;
  } else {
    Yap_Error(REPRESENTATION_ERROR_CHARACTER, MkIntegerTerm(ch),"character %ld cannot be encoded in stream %d",(unsigned long int)ch,sno);
    return -1;
  }
}
static int
put_wchar(int sno, wchar_t ch)
{
@ -1750,12 +1784,12 @@ put_wchar(int sno, wchar_t ch)
return Stream[sno].stream_putc(sno, ch);
case ENC_ISO_LATIN1:
if (ch >= 0xff) {
/* error */
return handle_write_encoding_error(sno,ch);
}
return Stream[sno].stream_putc(sno, ch);
case ENC_ISO_ASCII:
if (ch >= 0x80) {
/* error */
return handle_write_encoding_error(sno,ch);
}
return Stream[sno].stream_putc(sno, ch);
case ENC_ISO_ANSI:
@ -2264,6 +2298,12 @@ p_open (void)
if (opts & 256) {
avoid_bom = TRUE;
}
if (opts & 512) {
st->status |= RepError_Prolog_f;
}
if (opts & 1024) {
st->status |= RepError_Xml_f;
}
}
st->stream_wgetc = get_wchar;
if (CharConversionTable != NULL)
@ -3308,12 +3348,45 @@ p_set_output (void)
/*
 * '$has_bom'(+Stream)
 *
 * Looks the stream up with CheckStream (input or output streams are
 * accepted); fails if CheckStream reports an error (negative result).
 * Otherwise the result is the HAS_BOM_f bit of the stream's status word,
 * i.e. non-zero (success) exactly when that flag is set.
 */
static Int
p_has_bom (void)
{ /* '$has_bom'(+Stream) */
Int sno = CheckStream (ARG1, Input_Stream_f|Output_Stream_f, "has?bom/1");
Int sno = CheckStream (ARG1, Input_Stream_f|Output_Stream_f, "has_bom/1");
if (sno < 0)
return (FALSE);
/* non-zero iff a byte order mark was recorded for this stream */
return ((Stream[sno].status & HAS_BOM_f));
}
/*
 * '$stream_representation_error'(+Stream, ?Mode)
 *
 * Query or set how a stream reacts to characters its encoding cannot
 * represent.  Mode is an integer:
 *
 *   512   - RepError_Prolog_f: write a Prolog escape sequence
 *   1024  - RepError_Xml_f:    write an XML character entity
 *   other - neither flag: raise a representation error (reported as 0
 *           when querying)
 *
 * If Mode is unbound it is unified with the current setting; otherwise
 * the stream's flags are updated so that at most one of the two
 * RepError flags is set.  Fails if CheckStream rejects the stream.
 *
 * NOTE(review): p_encoding in this file calls UNLOCK(Stream[sno].streamlock)
 * after CheckStream; confirm whether this predicate needs to do the same.
 */
static Int
p_representation_error (void)
{ /* '$stream_representation_error'(+Stream,?Mode) */
  Term t;
  Int sno = CheckStream (ARG1, Input_Stream_f|Output_Stream_f, "representation_errors/1");

  if (sno < 0)
    return (FALSE);
  t = Deref(ARG2);
  if (IsVarTerm(t)) {
    /* query: report the handler currently in force */
    if (Stream[sno].status & RepError_Prolog_f) {
      return Yap_unify(ARG2, MkIntegerTerm(512));
    }
    if (Stream[sno].status & RepError_Xml_f) {
      return Yap_unify(ARG2, MkIntegerTerm(1024));
    }
    return Yap_unify(ARG2, MkIntegerTerm(0));
  }
  /* set: the two modes are mutually exclusive */
  switch (IntegerOfTerm(t)) {
  case 512:
    Stream[sno].status &= ~RepError_Xml_f;
    Stream[sno].status |= RepError_Prolog_f;
    break;
  case 1024:
    Stream[sno].status &= ~RepError_Prolog_f;
    Stream[sno].status |= RepError_Xml_f;
    /* without this break the default case would clear the flag just set */
    break;
  default:
    Stream[sno].status &= ~(RepError_Prolog_f|RepError_Xml_f);
    break;
  }
  return TRUE;
}
static Int
p_current_input (void)
{ /* current_input(?Stream) */
@ -5698,11 +5771,15 @@ p_get_default_encoding(void)
}
static Int
p_set_encoding (void)
{ /* '$set_encoding'(Stream,N) */
p_encoding (void)
{ /* '$encoding'(Stream,N) */
int sno = CheckStream (ARG1, Input_Stream_f|Output_Stream_f, "encoding/2");
Term t = Deref(ARG2);
if (sno < 0)
return FALSE;
if (IsVarTerm(t)) {
return Yap_unify(ARG2, MkIntegerTerm(Stream[sno].encoding));
}
Stream[sno].encoding = IntegerOfTerm(Deref(ARG2));
UNLOCK(Stream[sno].streamlock);
return TRUE;
@ -5829,6 +5906,7 @@ Yap_InitIOPreds(void)
Yap_InitCPred ("$peek", 2, p_peek, SafePredFlag|SyncPredFlag),
Yap_InitCPred ("$peek_byte", 2, p_peek_byte, SafePredFlag|SyncPredFlag),
Yap_InitCPred ("$has_bom", 1, p_has_bom, SafePredFlag);
Yap_InitCPred ("$stream_representation_error", 2, p_representation_error, SafePredFlag|SyncPredFlag);
Yap_InitCPred ("current_input", 1, p_current_input, SafePredFlag|SyncPredFlag);
Yap_InitCPred ("current_output", 1, p_current_output, SafePredFlag|SyncPredFlag);
Yap_InitCPred ("prompt", 1, p_setprompt, SafePredFlag|SyncPredFlag);
@ -5849,7 +5927,7 @@ Yap_InitIOPreds(void)
Yap_InitCPred ("$fetch_stream_alias", 2, p_fetch_stream_alias, SafePredFlag|SyncPredFlag|HiddenPredFlag);
Yap_InitCPred ("$stream", 1, p_stream, SafePredFlag|TestPredFlag);
Yap_InitCPred ("$get_default_encoding", 1, p_get_default_encoding, SafePredFlag|TestPredFlag);
Yap_InitCPred ("$set_encoding", 2, p_set_encoding, SafePredFlag|TestPredFlag),
Yap_InitCPred ("$encoding", 2, p_encoding, SafePredFlag|SyncPredFlag),
#if HAVE_SELECT
Yap_InitCPred ("stream_select", 3, p_stream_select, SafePredFlag|SyncPredFlag);
#endif

View File

@ -123,6 +123,8 @@ StreamDesc;
#define Popen_Stream_f 0x080000
#define User_Stream_f 0x100000
#define HAS_BOM_f 0x200000
#define RepError_Prolog_f 0x400000
#define RepError_Xml_f 0x800000
#define StdInStream 0
#define StdOutStream 1

View File

@ -138,6 +138,7 @@ Subnodes of Running
Subnodes of Syntax
* Formal Syntax:: Syntax of Terms
* Tokens:: Syntax of Prolog tokens
* Encoding:: How characters are encoded and Wide Character Support
Subnodes of Tokens
* Numbers:: Integer and Floating-Point Numbers
@ -151,6 +152,10 @@ Subnodes of Numbers
* Integers:: How Integers are read and represented
* Floats:: Floating Point Numbers
Subnodes of Encoding
* Stream Encoding:: How Prolog Streams can be coded
* BOM:: The Byte Order Mark
Subnodes of Loading Programs
* Compiling:: Program Loading and Updating
* Setting the Compiler:: Changing the compiler's parameters
@ -1029,6 +1034,7 @@ built.
@menu
* Formal Syntax:: Syntax of terms
* Tokens:: Syntax of Prolog tokens
* Encoding:: How characters are encoded and Wide Character Support
@end menu
@node Formal Syntax, Tokens, ,Syntax
@ -1116,7 +1122,7 @@ dot with single quotes.
@end itemize
@node Tokens, , Formal Syntax, Syntax
@node Tokens, Encoding, Formal Syntax, Syntax
@section Prolog Tokens
@cindex token
@ -1362,6 +1368,159 @@ layout characters, the YAP parser behaves as if it had found a
single blank character. The end of a file also counts as a blank
character for this purpose.
@node Encoding, , Tokens, Syntax
@section Wide Character Support
@cindex encodings
@menu
* Stream Encoding:: How Prolog Streams can be coded
* BOM:: The Byte Order Mark
@end menu
@cindex UTF-8
@cindex Unicode
@cindex UCS
@cindex internationalization
YAP now implements a SWI-Prolog compatible interface to wide
characters and the Universal Character Set (UCS). The following text
was adapted from the SWI-Prolog manual.
YAP now supports wide characters, characters with character
codes above 255 that cannot be represented in a single byte.
@emph{Universal Character Set} (UCS) is the ISO/IEC 10646 standard
that specifies a unique 31-bits unsigned integer for any character in
any language. It is a superset of 16-bit Unicode, which in turn is
a superset of ISO 8859-1 (ISO Latin-1), a superset of US-ASCII. UCS
can handle strings holding characters from multiple languages and
character classification (uppercase, lowercase, digit, etc.) and
operations such as case-conversion are unambiguously defined.
For this reason YAP, following SWI-Prolog, has two representations for
atoms. If the text fits in ISO Latin-1, it is represented as an array
of 8-bit characters. Otherwise the text is represented as an array of
wide chars, which may take 16 or 32 bits. This representational issue
is completely transparent to the Prolog user. Users of the foreign
language interface sometimes need to be aware of these issues though.
Character coding comes into view when characters of strings need to be
read from or written to file or when they have to be communicated to
other software components using the foreign language interface. In this
section we only deal with I/O through streams, which includes file I/O
as well as I/O through network sockets.
@node Stream Encoding, , BOM, Encoding
@subsection Wide character encodings on streams
Although characters are uniquely coded using the UCS standard
internally, streams and files are byte (8-bit) oriented and there are a
variety of ways to represent the larger UCS codes in an 8-bit octet
stream. The most popular one, especially in the context of the web, is
UTF-8. Bytes 0...127 represent simply the corresponding US-ASCII
character, while bytes 128...255 are used for multi-byte
encoding of characters placed higher in the UCS space. Especially on
MS-Windows the 16-bit Unicode standard, represented by pairs of bytes is
also popular.
Prolog I/O streams have a property called @emph{encoding} which
specifies the encoding in use, and which influences @code{get_code/2} and
@code{put_code/2} as well as all the other text I/O predicates.
The default encoding for files is derived from the Prolog flag
@code{encoding}, which is initialised from the environment. If the
environment variable @env{LANG} ends in "UTF-8", this encoding is
assumed. Otherwise the default is @code{text} and the translation is
left to the wide-character functions of the C-library (note that the
Prolog native UTF-8 mode is considerably faster than the generic
mbrtowc() one). The encoding can be specified explicitly in
@code{load_files/2} for loading Prolog source with an alternative
encoding, @code{open/4} when opening files or using set_stream/2 on
any open stream (not yet implemented). For Prolog source files we also
provide the @code{encoding/1} directive that can be used to switch
between encodings that are compatible to US-ASCII (@code{ascii},
@code{iso_latin_1}, @code{utf8} and many locales).
@c See also
@c \secref{intsrcfile} for writing Prolog files with non-US-ASCII
@c characters and \secref{unicodesyntax} for syntax issues.
For
additional information and Unicode resources, please visit
@uref{http://www.unicode.org/}.
YAP currently defines and supports the following encodings:
@table @code
@item octet
Default encoding for @emph{binary} streams. This causes
the stream to be read and written fully untranslated.
@item ascii
7-bit encoding in 8-bit bytes. Equivalent to @code{iso_latin_1},
but generates errors and warnings on encountering values above
127.
@item iso_latin_1
8-bit encoding supporting many western languages. This causes
the stream to be read and written fully untranslated.
@item text
C-library default locale encoding for text files. Files are read and
written using the C-library functions @code{mbrtowc()} and
@code{wcrtomb()}. This may be the same as one of the other locales,
notably it may be the same as @code{iso_latin_1} for western
languages and @code{utf8} in a UTF-8 context.
@item utf8
Multi-byte encoding of full UCS, compatible to @code{ascii}.
See above.
@item unicode_be
Unicode Big Endian. Reads input in pairs of bytes, most
significant byte first. Can only represent 16-bit characters.
@item unicode_le
Unicode Little Endian. Reads input in pairs of bytes, least
significant byte first. Can only represent 16-bit characters.
@end table
Note that not all encodings can represent all characters. This implies
that writing text to a stream may cause errors because the stream
cannot represent these characters. The behaviour of a stream on these
errors can be controlled using @code{open/4} or @code{set_stream/2} (not
implemented). Initially the terminal streams write the characters using
Prolog escape sequences while other streams generate an I/O exception.
@node BOM, Stream Encoding, , Encoding
@subsection BOM: Byte Order Mark
@cindex BOM
@cindex Byte Order Mark
From @ref{Stream Encoding}, you may have got the impression text-files are
complicated. This section deals with a related topic, making life often
easier for the user, but providing another worry to the programmer.
@strong{BOM} or @emph{Byte Order Marker} is a technique for
identifying Unicode text-files as well as the encoding they use. Such
files start with the Unicode character @code{0xFEFF}, a non-breaking,
zero-width space character. This is a pretty unique sequence that is not
likely to be the start of a non-Unicode file and uniquely distinguishes
the various Unicode file formats. As it is a zero-width blank, it even
doesn't produce any output. This solves all problems, or ...
Some formats start off as US-ASCII and may contain some encoding mark to
switch to UTF-8, such as the @code{encoding="UTF-8"} in an XML header.
Such formats often explicitly forbid the use of a UTF-8 BOM. In
other cases there is additional information telling the encoding making
the use of a BOM redundant or even illegal.
The BOM is handled by the @code{open/4} predicate. By default, text-files are
probed for the BOM when opened for reading. If a BOM is found, the
encoding is set accordingly and the property @code{bom(true)} is
available through @code{stream_property/2}. When opening a file for
writing, writing a BOM can be requested using the option
@code{bom(true)} with @code{open/4}.
@node Loading Programs, Modules, Syntax, Top
@chapter Loading Programs
@ -3381,6 +3540,24 @@ concerning the stream.
The operation will fail and give an error if the alias name is already
in use. YAP allows several aliases for the same file, but only
one is returned by @code{stream_property/2}
@item bom(+@var{Bool})
If present and @code{true}, a BOM (@emph{Byte Order Mark}) was
detected while opening the file for reading or a BOM was written while
opening the stream. See @ref{BOM} for details.
@item encoding(+@var{Encoding})
Set the encoding used for text. See @ref{Encoding} for an overview of
wide character and encoding issues.
@item representation_errors(+@var{Mode})
Change the behaviour when writing characters to the stream that cannot
be represented by the encoding. The behaviour is one of @code{error}
(throw an I/O error exception), @code{prolog} (write @code{\u...\}
escape code) or @code{xml} (write @code{&#...;} XML character entity).
The initial mode is @code{prolog} for the user streams and
@code{error} for all other streams. See also @ref{Encoding}.
@end table
@item close(+@var{S}) [ISO]
@ -3550,6 +3727,24 @@ seekable.
@item type(@var{T})
Whether the stream is a @code{text} stream or a @code{binary} stream.
@item bom(+@var{Bool})
If present and @code{true}, a BOM (@emph{Byte Order Mark}) was
detected while opening the file for reading or a BOM was written while
opening the stream. See @ref{BOM} for details.
@item encoding(+@var{Encoding})
Query the encoding used for text. See @ref{Encoding} for an
overview of wide character and encoding issues in YAP.
@item representation_errors(+@var{Mode})
Behaviour when writing characters to the stream that cannot be
represented by the encoding. The behaviour is one of @code{error}
(throw an I/O error exception), @code{prolog} (write @code{\u...\}
escape code) or @code{xml} (write @code{&#...;} XML character entity).
The initial mode is @code{prolog} for the user streams and
@code{error} for all other streams. See also @ref{Encoding} and
@code{open/4}.
@end table
@end table

View File

@ -45,6 +45,9 @@ true :- true.
;
true
),
'$stream_representation_error'(user_input, 512),
'$stream_representation_error'(user_output, 512),
'$stream_representation_error'(user_error, 512),
'$allocate_default_arena'(1024, 64),
'$enter_system_mode',
set_value(fileerrors,1),

View File

@ -524,16 +524,16 @@ remove_from_path(New) :- '$check_path'(New,Path),
'$valid_encoding'(iso_latin_1, 1).
% UTF-8: default 8 bits but 80 extends to 16bits
'$valid_encoding'(utf8, 8).
% UNICODE: 16 bits throughout, the way Gates does it!
% UNICODE: 16 bits throughout, the way it was supposed to be!
'$valid_encoding'(unicode_be, 16).
'$valid_encoding'(unicode_le, 32).
% whatever the system tells us to do.
'$valid_encoding'(text, 4).
'$default_encoding'(DefCode) :- nonvar(DefCode), !,
'$set_encoding'('$stream'(0),DefCode),
'$set_encoding'('$stream'(1),DefCode),
'$set_encoding'('$stream'(2),DefCode),
'$encoding'('$stream'(0),DefCode),
'$encoding'('$stream'(1),DefCode),
'$encoding'('$stream'(2),DefCode),
set_value('$default_encoding',DefCode).
'$default_encoding'(DefCode) :-
get_value('$default_encoding',DefCode0),

View File

@ -92,6 +92,10 @@ open(F,T,S,Opts) :-
'$process_open_opts'([encoding(Enc)|L], N0, N, Aliases, EncCode) :-
'$valid_encoding'(Enc, EncCode),
'$process_open_opts'(L, N0, N, Aliases, _).
'$process_open_opts'([representation_errors(Mode)|L], N0, N, Aliases, EncCode) :-
'$valid_reperrorhandler'(Mode, Flag),
NI is N0 \/ Flag,
'$process_open_opts'(L, NI, N, Aliases, EncCode).
'$process_open_opts'([bom(BOM)|L], N0, N, Aliases, EncCode) :-
'$valid_bom'(BOM, Flag),
NI is N0 \/ Flag,
@ -114,10 +118,16 @@ open(F,T,S,Opts) :-
'$value_open_opt'(reset,64, X) :- X is 128-32-16.
%128 -> use bom
%256 -> do not use bom
%512 -> do prolog on unrepresentable char
%1024 -> do XML on unrepresentable char
'$valid_bom'(true, 128).
'$valid_bom'(false, 256).
% '$valid_reperrorhandler'(?Mode, ?Flag)
% Table relating each representation_errors/1 mode name to the integer
% flag used for it in the open options word: 512 selects the Prolog
% escape handler, 1024 the XML entity handler, and 0 sets neither bit.
'$valid_reperrorhandler'(error, 0). % default.
'$valid_reperrorhandler'(prolog, 512).
'$valid_reperrorhandler'(xml, 1024).
/* check whether a list of options is valid */
'$check_io_opts'(V,G) :- var(V), !,
'$do_error'(instantiation_error,G).
@ -157,6 +167,8 @@ open(F,T,S,Opts) :-
'$check_open_eof_action_arg'(T, G).
'$check_opt_open'(encoding(T), G) :- !,
'$check_open_encoding'(T, G).
'$check_opt_open'(representation_errors(M), G) :- !,
'$check_open_representation_errors'(M, G).
'$check_opt_open'(bom(T), G) :- !,
'$check_open_bom_arg'(T, G).
'$check_opt_open'(A, G) :-
@ -183,6 +195,8 @@ open(F,T,S,Opts) :-
'$check_opt_sp'(reposition(_), _) :- !.
'$check_opt_sp'(type(_), _) :- !.
'$check_opt_sp'(bom(_), _) :- !.
'$check_opt_sp'(encoding(_), _) :- !.
'$check_opt_sp'(representation_errors(_), _) :- !.
'$check_opt_sp'(A, G) :-
'$do_error'(domain_error(stream_property,A),G).
@ -256,6 +270,13 @@ open(F,T,S,Opts) :-
'$check_open_encoding'(Encoding,G) :-
'$do_error'(domain_error(io_mode,encoding(Encoding)),G).
% '$check_open_representation_errors'(+Handler, +Goal)
% Validates the argument of a representation_errors/1 open option.
% An unbound argument is reported as an instantiation error; a value
% accepted by '$valid_reperrorhandler'/2 succeeds; anything else is
% reported as a domain error.  Goal is passed to '$do_error'/2 as the
% culprit.
'$check_open_representation_errors'(Handler, G) :-
	(   var(Handler)
	->  '$do_error'(instantiation_error,G)
	;   '$valid_reperrorhandler'(Handler,_)
	->  true
	;   '$do_error'(domain_error(io_mode,representation_errors(Handler)),G)
	).
'$check_read_syntax_errors_arg'(X, G) :- var(X), !,
'$do_error'(instantiation_error,G).
'$check_read_syntax_errors_arg'(dec10,_) :- !.
@ -836,6 +857,8 @@ stream_property(Stream, Props) :-
'$generate_prop'(type(_T)).
'$generate_prop'(alias(_A)).
'$generate_prop'(bom(_B)).
'$generate_prop'(encoding(_E)).
'$generate_prop'(representation_errors(_E)).
'$stream_property'(Stream, Props) :-
var(Props), !,
@ -865,6 +888,11 @@ stream_property(Stream, Props) :-
'$process_stream_properties'([position(P)|Props], Stream, F, Mode) :-
'$show_stream_bom'(Stream, P),
'$process_stream_properties'(Props, Stream, F, Mode).
'$process_stream_properties'([encoding(Enc)|Props], Stream, F, Mode) :-
% make sure this runs first, with EncCode unbound.
'$encoding'(Stream, EncCode),
'$valid_encoding'(Enc, EncCode),
'$process_stream_properties'(Props, Stream, F, Mode).
'$process_stream_properties'([bom(B)|Props], Stream, F, Mode) :-
'$show_stream_bom'(Stream, B),
'$process_stream_properties'(Props, Stream, F, Mode).
@ -879,6 +907,10 @@ stream_property(Stream, Props) :-
'$show_stream_flags'(Stream, Fl),
'$show_stream_reposition'(Fl, P),
'$process_stream_properties'(Props, Stream, F, Mode).
'$process_stream_properties'([representation_errors(B)|Props], Stream, F, Mode) :-
'$stream_representation_error'(Stream, ErrorHandler),
'$valid_reperrorhandler'(B, ErrorHandler),
'$process_stream_properties'(Props, Stream, F, Mode).
'$process_stream_properties'([type(P)|Props], Stream, F, Mode) :-
'$show_stream_flags'(Stream, Fl),
'$show_stream_type'(Fl, P),