This repository has been archived on 2023-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
yap-6.3/packages/xml/xml_utilities.pl
2012-02-14 12:41:12 +00:00

500 lines
22 KiB
Prolog

/* XML Utilities
*
* Copyright (C) 2001-2005 Binding Time Limited
* Copyright (C) 2005-2011 John Fletcher
*
* Current Release: $Revision: 3.4 $
*
* TERMS AND CONDITIONS:
*
* This program is offered free of charge, as unsupported source code. You may
* use it, copy it, distribute it, modify it or sell it without restriction,
* but entirely at your own risk.
*/
% Entity and Namespace map operations: these maps are usually quite small, so
% a linear list lookup is okay. They could be substituted by a logarithmic
% data structure - in extremis.
/* empty_map( ?Map ) is true if Map is a null map.
*/
empty_map( [] ).
/* map_member( +Key, +Map, ?Data ) is true if Map is a ordered map structure
* which records the pair Key-Data. Key must be ground.
*/
map_member( Key0, [Key1-Data1|Rest], Data0 ) :-
( Key0 == Key1 ->
Data0 = Data1
; Key0 @> Key1 ->
map_member( Key0, Rest, Data0 )
).
/* map_store( +Map0, +Key, +Data, ?Map1 ) is true if Map0 is an ordered map
* structure, Key must be ground, and Map1 is identical to Map0 except that
* the pair Key-Data is recorded by Map1.
*/
map_store( [], Key, Data, [Key-Data] ).
map_store( [Key0-Data0|Map0], Key, Data, Map ) :-
( Key == Key0 ->
Map = [Key-Data|Map0]
; Key @< Key0 ->
Map = [Key-Data,Key0-Data0|Map0]
; otherwise -> % >
Map = [Key0-Data0|Map1],
map_store( Map0, Key, Data, Map1 )
).
/* context(?Element, ?PreserveSpace, ?CurrentNS, ?DefaultNS, ?Entities, ?Namespaces )
* is an ADT hiding the "state" arguments for XML Acquisition
*/
initial_context(
Controls,
context(void,PreserveSpace,'','',Entities,Empty,
RemoveAttributePrefixes,AllowAmpersand)
) :-
empty_map( Empty ),
( member( extended_characters(false), Controls ) ->
Entities = Empty
; otherwise ->
extended_character_entities(Entities)
),
( member( format(false), Controls ) ->
PreserveSpace = true
; otherwise ->
PreserveSpace = false
),
( member( remove_attribute_prefixes(true), Controls ) ->
RemoveAttributePrefixes = true
; otherwise ->
RemoveAttributePrefixes = false
),
( member( allow_ampersand(true), Controls ) ->
AllowAmpersand = true
; otherwise ->
AllowAmpersand = false
).
context_update( current_namespace, Context0, URI, Context1 ) :-
Context0 = context(Element,Preserve,_Current,Default,Entities,
Namespaces,RemoveAttributePrefixes,Amp),
Context1 = context(Element,Preserve,URI,Default,Entities,
Namespaces,RemoveAttributePrefixes,Amp).
context_update( element, Context0, Tag, Context1 ) :-
Context0 = context(_Element,Preserve,Current,Default,Entities,
Namespaces,RemoveAttributePrefixes,Amp),
Context1 = context(tag(Tag),Preserve,Current,Default,Entities,
Namespaces,RemoveAttributePrefixes,Amp).
context_update( default_namespace, Context0, URI, Context1 ):-
Context0 = context(Element,Preserve,Current,_Default,Entities,
Namespaces,RemoveAttributePrefixes,Amp),
Context1 = context(Element,Preserve,Current,URI,Entities,
Namespaces,RemoveAttributePrefixes,Amp).
context_update( space_preserve, Context0, Boolean, Context1 ):-
Context0 = context(Element,_Preserve,Current,Default,Entities,
Namespaces,RemoveAttributePrefixes,Amp),
Context1 = context(Element,Boolean,Current,Default,Entities,
Namespaces,RemoveAttributePrefixes,Amp).
context_update( ns_prefix(Prefix), Context0, URI, Context1 ) :-
Context0 = context(Element,Preserve,Current,Default,Entities,
Namespaces0,RemoveAttributePrefixes,Amp),
Context1 = context(Element,Preserve,Current,Default,Entities,
Namespaces1,RemoveAttributePrefixes,Amp),
map_store( Namespaces0, Prefix, URI, Namespaces1 ).
context_update( entity(Name), Context0, String, Context1 ) :-
Context0 = context(Element,Preserve,Current,Default,Entities0,
Namespaces,RemoveAttributePrefixes,Amp),
Context1 = context(Element,Preserve,Current,Default,Entities1,
Namespaces,RemoveAttributePrefixes,Amp),
map_store( Entities0, Name, String, Entities1 ).
remove_attribute_prefixes( Context ) :-
Context = context(_Element,_Preserve,_Current,_Default,_Entities,
_Namespaces,true,_Amp).
current_tag( Context, Tag ) :-
Context = context(tag(Tag),_Preserve,_Current,_Default,_Entities,
_Namespaces,_RPFA,_Amp).
current_namespace( Context, Current ) :-
Context = context(_Element,_Preserve,Current,_Default,_Entities,
_Namespaces,_RPFA,_Amp).
default_namespace( Context, Default ) :-
Context = context(_Element,_Preserve,_Current,Default,_Entities,
_Namespaces,_RPFA,_Amp).
space_preserve( Context ) :-
Context = context(tag(_Tag),true,_Current,_Default,_Entities,
_Namespaces,_RPFA,_Amp).
specific_namespace( Prefix, Context, URI ) :-
Context = context(_Element,_Preserve,_Current,_Default,_Entities,
Namespaces,_RPFA,_Amp),
map_member( Prefix, Namespaces, URI ).
defined_entity( Reference, Context, String ) :-
Context = context(_Element,_Preserve,_Current,_Default,Entities,
_Namespaces,_RPFA,_Amp),
map_member( Reference, Entities, String ).
close_context( Context, Terms, WellFormed ) :-
Context = context(Element,_Preserve,_Current,_Default,_Entities,
_Namespaces,_RPFA,_Amp),
close_context1( Element, Terms, WellFormed ).
close_context1( void, [], true ).
close_context1( tag(TagChars), [out_of_context(Tag)], false ) :-
atom_chars( Tag, TagChars ).
void_context(
context(void,_Preserve,_Current,_Default,_Entities,_Names,_RPFA,_Amp)
).
allow_ampersand(
context(_Void,_Preserve,_Current,_Default,_Entities,_Names,_RPFA,true)
).
/* pp_string( +String ) prints String onto the current output stream.
* If String contains only 7-bit chars it is printed in shorthand quoted
* format, otherwise it is written as a list.
* If your Prolog uses " to delimit a special string type, just use write/1.
*/
pp_string( Chars ) :-
( member( Char, Chars ),
not_shorthand( Char ) ->
write( Chars )
; otherwise ->
put_quote,
pp_string1( Chars ),
put_quote
).
not_shorthand( Char ) :-
Char > 255.
not_shorthand( Char ) :-
Char < 9.
not_shorthand( 126 ). % ~ gives syntax errors in LPA Prolog
put_quote :-
put( 0'" ). % '
pp_string1( [] ).
pp_string1( [Char|Chars] ) :-
( Char =:= """" -> % Meta-quote
put( Char ),
put( Char ),
pp_string1( Chars )
; Char =:= 13, % Handle Windows border-settings
Chars = [10|Chars1] ->
put( 10 ),
pp_string1( Chars1 )
; otherwise ->
put( Char ),
pp_string1( Chars )
).
xml_declaration_attributes_valid( [] ).
xml_declaration_attributes_valid( [Name=Value|Attributes] ) :-
xml_declaration_attribute_valid( Name, Value ),
xml_declaration_attributes_valid( Attributes ).
xml_declaration_attribute_valid( Name, Value ) :-
lowercase( Value, Lowercase ),
canonical_xml_declaration_attribute( Name, Lowercase ).
canonical_xml_declaration_attribute( version, "1.0" ).
canonical_xml_declaration_attribute( standalone, "yes" ).
canonical_xml_declaration_attribute( standalone, "no" ).
% The encodings here are all valid for the output produced.
canonical_xml_declaration_attribute( encoding, "utf-8" ).
% canonical_xml_declaration_attribute( encoding, "utf-16" ).
% This is erroneous for the output of this library
canonical_xml_declaration_attribute( encoding, "us-ascii" ).
canonical_xml_declaration_attribute( encoding, "ascii" ).
canonical_xml_declaration_attribute( encoding, "iso-8859-1" ).
canonical_xml_declaration_attribute( encoding, "iso-8859-2" ).
canonical_xml_declaration_attribute( encoding, "iso-8859-15" ).
canonical_xml_declaration_attribute( encoding, "windows-1252" ).
% In general, it's better not to specify an encoding.
/* lowercase( +MixedCase, ?Lowercase ) holds when Lowercase and MixedCase are
* lists of character codes, and Lowercase is identical to MixedCase with
* every uppercase character replaced by its lowercase equivalent.
*/
lowercase( [], [] ).
lowercase( [Char|Chars], [Lower|LowerCase] ) :-
( Char >= "A", Char =< "Z" ->
Lower is Char + "a" - "A"
; otherwise ->
Lower = Char
),
lowercase( Chars, LowerCase ).
extended_character_entities( [
"AElig"-[198], % latin capital letter AE
"Aacute"-[193], % latin capital letter A with acute,
"Acirc"-[194], % latin capital letter A with circumflex,
"Agrave"-[192], % latin capital letter A with grave
"Alpha"-[913], % greek capital letter alpha, U+0391
"Aring"-[197], % latin capital letter A with ring above
"Atilde"-[195], % latin capital letter A with tilde,
"Auml"-[196], % latin capital letter A with diaeresis,
"Beta"-[914], % greek capital letter beta, U+0392
"Ccedil"-[199], % latin capital letter C with cedilla,
"Chi"-[935], % greek capital letter chi, U+03A7
"Dagger"-[8225], % double dagger, U+2021 ISOpub
"Delta"-[916], % greek capital letter delta,
"ETH"-[208], % latin capital letter ETH, U+00D0 ISOlat1>
"Eacute"-[201], % latin capital letter E with acute,
"Ecirc"-[202], % latin capital letter E with circumflex,
"Egrave"-[200], % latin capital letter E with grave,
"Epsilon"-[917], % greek capital letter epsilon, U+0395
"Eta"-[919], % greek capital letter eta, U+0397
"Euml"-[203], % latin capital letter E with diaeresis,
"Gamma"-[915], % greek capital letter gamma,
"Iacute"-[205], % latin capital letter I with acute,
"Icirc"-[206], % latin capital letter I with circumflex,
"Igrave"-[204], % latin capital letter I with grave,
"Iota"-[921], % greek capital letter iota, U+0399
"Iuml"-[207], % latin capital letter I with diaeresis,
"Kappa"-[922], % greek capital letter kappa, U+039A
"Lambda"-[923], % greek capital letter lambda,
"Mu"-[924], % greek capital letter mu, U+039C
"Ntilde"-[209], % latin capital letter N with tilde,
"Nu"-[925], % greek capital letter nu, U+039D
"OElig"-[338], % latin capital ligature OE,
"Oacute"-[211], % latin capital letter O with acute,
"Ocirc"-[212], % latin capital letter O with circumflex,
"Ograve"-[210], % latin capital letter O with grave,
"Omega"-[937], % greek capital letter omega,
"Omicron"-[927], % greek capital letter omicron, U+039F
"Oslash"-[216], % latin capital letter O with stroke
"Otilde"-[213], % latin capital letter O with tilde,
"Ouml"-[214], % latin capital letter O with diaeresis,
"Phi"-[934], % greek capital letter phi,
"Pi"-[928], % greek capital letter pi, U+03A0 ISOgrk3
"Prime"-[8243], % double prime = seconds = inches,
"Psi"-[936], % greek capital letter psi,
"Rho"-[929], % greek capital letter rho, U+03A1
"Scaron"-[352], % latin capital letter S with caron,
"Sigma"-[931], % greek capital letter sigma,
"THORN"-[222], % latin capital letter THORN,
"Tau"-[932], % greek capital letter tau, U+03A4
"Theta"-[920], % greek capital letter theta,
"Uacute"-[218], % latin capital letter U with acute,
"Ucirc"-[219], % latin capital letter U with circumflex,
"Ugrave"-[217], % latin capital letter U with grave,
"Upsilon"-[933], % greek capital letter upsilon,
"Uuml"-[220], % latin capital letter U with diaeresis,
"Xi"-[926], % greek capital letter xi, U+039E ISOgrk3
"Yacute"-[221], % latin capital letter Y with acute,
"Yuml"-[376], % latin capital letter Y with diaeresis,
"Zeta"-[918], % greek capital letter zeta, U+0396
"aacute"-[225], % latin small letter a with acute,
"acirc"-[226], % latin small letter a with circumflex,
"acute"-[180], % acute accent = spacing acute,
"aelig"-[230], % latin small letter ae
"agrave"-[224], % latin small letter a with grave
"alefsym"-[8501], % alef symbol = first transfinite cardinal,
"alpha"-[945], % greek small letter alpha,
"and"-[8743], % logical and = wedge, U+2227 ISOtech
"ang"-[8736], % angle, U+2220 ISOamso
"aring"-[229], % latin small letter a with ring above
"asymp"-[8776], % almost equal to = asymptotic to,
"atilde"-[227], % latin small letter a with tilde,
"auml"-[228], % latin small letter a with diaeresis,
"bdquo"-[8222], % double low-9 quotation mark, U+201E NEW
"beta"-[946], % greek small letter beta, U+03B2 ISOgrk3
"brvbar"-[166], % broken bar = broken vertical bar,
"bull"-[8226], % bullet = black small circle,
"cap"-[8745], % intersection = cap, U+2229 ISOtech
"ccedil"-[231], % latin small letter c with cedilla,
"cedil"-[184], % cedilla = spacing cedilla, U+00B8 ISOdia>
"cent"-[162], % cent sign, U+00A2 ISOnum>
"chi"-[967], % greek small letter chi, U+03C7 ISOgrk3
"circ"-[710], % modifier letter circumflex accent,
"clubs"-[9827], % black club suit = shamrock,
"cong"-[8773], % approximately equal to, U+2245 ISOtech
"copy"-[169], % copyright sign, U+00A9 ISOnum>
"crarr"-[8629], % downwards arrow with corner leftwards
"cup"-[8746], % union = cup, U+222A ISOtech
"curren"-[164], % currency sign, U+00A4 ISOnum>
"dArr"-[8659], % downwards double arrow, U+21D3 ISOamsa
"dagger"-[8224], % dagger, U+2020 ISOpub
"darr"-[8595], % downwards arrow, U+2193 ISOnum
"deg"-[176], % degree sign, U+00B0 ISOnum>
"delta"-[948], % greek small letter delta,
"diams"-[9830], % black diamond suit, U+2666 ISOpub
"divide"-[247], % division sign, U+00F7 ISOnum>
"eacute"-[233], % latin small letter e with acute,
"ecirc"-[234], % latin small letter e with circumflex,
"egrave"-[232], % latin small letter e with grave,
"empty"-[8709], % empty set = null set = diameter,
"emsp"-[8195], % em space, U+2003 ISOpub
"ensp"-[8194], % en space, U+2002 ISOpub
"epsilon"-[949], % greek small letter epsilon,
"equiv"-[8801], % identical to, U+2261 ISOtech
"eta"-[951], % greek small letter eta, U+03B7 ISOgrk3
"eth"-[240], % latin small letter eth, U+00F0 ISOlat1>
"euml"-[235], % latin small letter e with diaeresis,
"euro"-[8364], % euro sign, U+20AC NEW
"exist"-[8707], % there exists, U+2203 ISOtech
"fnof"-[402], % latin small f with hook = function
"forall"-[8704], % for all, U+2200 ISOtech
"frac12"-[189], % vulgar fraction one half
"frac14"-[188], % vulgar fraction one quarter
"frac34"-[190], % vulgar fraction three quarters
"frasl"-[8260], % fraction slash, U+2044 NEW
"gamma"-[947], % greek small letter gamma,
"ge"-[8805], % greater-than or equal to,
"hArr"-[8660], % left right double arrow,
"harr"-[8596], % left right arrow, U+2194 ISOamsa
"hearts"-[9829], % black heart suit = valentine,
"hellip"-[8230], % horizontal ellipsis = three dot leader,
"iacute"-[237], % latin small letter i with acute,
"icirc"-[238], % latin small letter i with circumflex,
"iexcl"-[161], % inverted exclamation mark, U+00A1 ISOnum>
"igrave"-[236], % latin small letter i with grave,
"image"-[8465], % blackletter capital I = imaginary part,
"infin"-[8734], % infinity, U+221E ISOtech
"int"-[8747], % integral, U+222B ISOtech
"iota"-[953], % greek small letter iota, U+03B9 ISOgrk3
"iquest"-[191], % inverted question mark
"isin"-[8712], % element of, U+2208 ISOtech
"iuml"-[239], % latin small letter i with diaeresis,
"kappa"-[954], % greek small letter kappa,
"lArr"-[8656], % leftwards double arrow, U+21D0 ISOtech
"lambda"-[955], % greek small letter lambda,
"lang"-[9001], % left-pointing angle bracket = bra,
"laquo"-[171], % left-pointing double angle quotation mark
"larr"-[8592], % leftwards arrow, U+2190 ISOnum
"lceil"-[8968], % left ceiling = apl upstile,
"ldquo"-[8220], % left double quotation mark,
"le"-[8804], % less-than or equal to, U+2264 ISOtech
"lfloor"-[8970], % left floor = apl downstile,
"lowast"-[8727], % asterisk operator, U+2217 ISOtech
"loz"-[9674], % lozenge, U+25CA ISOpub
"lrm"-[8206], % left-to-right mark, U+200E NEW RFC 2070
"lsaquo"-[8249], % single left-pointing angle quotation mark,
"lsquo"-[8216], % left single quotation mark,
"macr"-[175], % macron = spacing macron = overline
"mdash"-[8212], % em dash, U+2014 ISOpub
"micro"-[181], % micro sign, U+00B5 ISOnum>
"middot"-[183], % middle dot = Georgian comma
"minus"-[8722], % minus sign, U+2212 ISOtech
"mu"-[956], % greek small letter mu, U+03BC ISOgrk3
"nabla"-[8711], % nabla = backward difference,
"nbsp"-[160], % no-break space = non-breaking space,
"ndash"-[8211], % en dash, U+2013 ISOpub
"ne"-[8800], % not equal to, U+2260 ISOtech
"ni"-[8715], % contains as member, U+220B ISOtech
"not"-[172], % not sign, U+00AC ISOnum>
"notin"-[8713], % not an element of, U+2209 ISOtech
"nsub"-[8836], % not a subset of, U+2284 ISOamsn
"ntilde"-[241], % latin small letter n with tilde,
"nu"-[957], % greek small letter nu, U+03BD ISOgrk3
"oacute"-[243], % latin small letter o with acute,
"ocirc"-[244], % latin small letter o with circumflex,
"oelig"-[339], % latin small ligature oe, U+0153 ISOlat2
"ograve"-[242], % latin small letter o with grave,
"oline"-[8254], % overline = spacing overscore,
"omega"-[969], % greek small letter omega,
"omicron"-[959], % greek small letter omicron, U+03BF NEW
"oplus"-[8853], % circled plus = direct sum,
"or"-[8744], % logical or = vee, U+2228 ISOtech
"ordf"-[170], % feminine ordinal indicator, U+00AA ISOnum>
"ordm"-[186], % masculine ordinal indicator,
"oslash"-[248], % latin small letter o with stroke,
"otilde"-[245], % latin small letter o with tilde,
"otimes"-[8855], % circled times = vector product,
"ouml"-[246], % latin small letter o with diaeresis,
"para"-[182], % pilcrow sign = paragraph sign,
"part"-[8706], % partial differential, U+2202 ISOtech
"permil"-[8240], % per mille sign, U+2030 ISOtech
"perp"-[8869], % up tack = orthogonal to = perpendicular,
"phi"-[966], % greek small letter phi, U+03C6 ISOgrk3
"pi"-[960], % greek small letter pi, U+03C0 ISOgrk3
"piv"-[982], % greek pi symbol, U+03D6 ISOgrk3
"plusmn"-[177], % plus-minus sign = plus-or-minus sign,
"pound"-[163], % pound sign, U+00A3 ISOnum>
"prime"-[8242], % prime = minutes = feet, U+2032 ISOtech
"prod"-[8719], % n-ary product = product sign,
"prop"-[8733], % proportional to, U+221D ISOtech
"psi"-[968], % greek small letter psi, U+03C8 ISOgrk3
"rArr"-[8658], % rightwards double arrow,
"radic"-[8730], % square root = radical sign,
"rang"-[9002], % right-pointing angle bracket = ket,
"raquo"-[187], % right-pointing double angle quotation mark
"rarr"-[8594], % rightwards arrow, U+2192 ISOnum
"rceil"-[8969], % right ceiling, U+2309 ISOamsc
"rdquo"-[8221], % right double quotation mark,
"real"-[8476], % blackletter capital R = real part symbol,
"reg"-[174], % registered sign = registered trade mark sign,
"rfloor"-[8971], % right floor, U+230B ISOamsc
"rho"-[961], % greek small letter rho, U+03C1 ISOgrk3
"rlm"-[8207], % right-to-left mark, U+200F NEW RFC 2070
"rsaquo"-[8250], % single right-pointing angle quotation mark,
"rsquo"-[8217], % right single quotation mark,
"sbquo"-[8218], % single low-9 quotation mark, U+201A NEW
"scaron"-[353], % latin small letter s with caron,
"sdot"-[8901], % dot operator, U+22C5 ISOamsb
"sect"-[167], % section sign, U+00A7 ISOnum>
"shy"-[173], % soft hyphen = discretionary hyphen,
"sigma"-[963], % greek small letter sigma,
"sigmaf"-[962], % greek small letter final sigma,
"sim"-[8764], % tilde operator = varies with = similar to,
"spades"-[9824], % black spade suit, U+2660 ISOpub
"sub"-[8834], % subset of, U+2282 ISOtech
"sube"-[8838], % subset of or equal to, U+2286 ISOtech
"sum"-[8721], % n-ary sumation, U+2211 ISOamsb
"sup"-[8835], % superset of, U+2283 ISOtech
"sup1"-[185], % superscript one = superscript digit one,
"sup2"-[178], % superscript two = superscript digit two
"sup3"-[179], % superscript three = superscript digit three
"supe"-[8839], % superset of or equal to,
"szlig"-[223], % latin small letter sharp s = ess-zed,
"tau"-[964], % greek small letter tau, U+03C4 ISOgrk3
"there4"-[8756], % therefore, U+2234 ISOtech
"theta"-[952], % greek small letter theta,
"thetasym"-[977], % greek small letter theta symbol,
"thinsp"-[8201], % thin space, U+2009 ISOpub
"thorn"-[254], % latin small letter thorn with,
"tilde"-[732], % small tilde, U+02DC ISOdia
"times"-[215], % multiplication sign, U+00D7 ISOnum>
"trade"-[8482], % trade mark sign, U+2122 ISOnum
"uArr"-[8657], % upwards double arrow, U+21D1 ISOamsa
"uacute"-[250], % latin small letter u with acute,
"uarr"-[8593], % upwards arrow, U+2191 ISOnum
"ucirc"-[251], % latin small letter u with circumflex,
"ugrave"-[249], % latin small letter u with grave,
"uml"-[168], % diaeresis = spacing diaeresis,
"upsih"-[978], % greek upsilon with hook symbol,
"upsilon"-[965], % greek small letter upsilon,
"uuml"-[252], % latin small letter u with diaeresis,
"weierp"-[8472], % script capital P = power set
"xi"-[958], % greek small letter xi, U+03BE ISOgrk3
"yacute"-[253], % latin small letter y with acute,
"yen"-[165], % yen sign = yuan sign, U+00A5 ISOnum>
"yuml"-[255], % latin small letter y with diaeresis,
"zeta"-[950], % greek small letter zeta, U+03B6 ISOgrk3
"zwj"-[8205], % zero width joiner, U+200D NEW RFC 2070
"zwnj"-[8204] % zero width non-joiner,
] ).
/* chars( ?Chars, ?Plus, ?Minus ) used as chars( ?Chars ) in a DCG to
* copy the list Chars inline.
*
* This is best expressed in terms of append/3 where append/3 is built-in.
* For other Prologs, a straightforward specification can be used:
*
* chars( [] ) --> "".
* chars( [Char|Chars] ) -->
* [Char],
* chars( Chars ).
*/
chars( Chars, Plus, Minus ) :-
append( Chars, Minus, Plus ).