/* XML Utilities * * Copyright (C) 2001-2005 Binding Time Limited * Copyright (C) 2005-2011 John Fletcher * * Current Release: $Revision: 3.4 $ * * TERMS AND CONDITIONS: * * This program is offered free of charge, as unsupported source code. You may * use it, copy it, distribute it, modify it or sell it without restriction, * but entirely at your own risk. */ :- use_module(library(lists)). % Entity and Namespace map operations: these maps are usually quite small, so % a linear list lookup is okay. They could be substituted by a logarithmic % data structure - in extremis. /** empty_map( ?Map ) * * is true if Map is a null map. */ empty_map( [] ). /* map_member( +Key, +Map, ?Data ) * * is true if Map is a ordered map structure * which records the pair Key-Data. Key must be ground. */ map_member( Key0, [Key1-Data1|Rest], Data0 ) :- ( Key0 == Key1 -> Data0 = Data1 ; Key0 @> Key1 -> map_member( Key0, Rest, Data0 ) ). /* map_store( +Map0, +Key, +Data, ?Map1 ) * * * is true if Map0 is an ordered map * structure, Key must be ground, and Map1 is identical to Map0 except that * the pair Key-Data is recorded by Map1. */ map_store( [], Key, Data, [Key-Data] ). map_store( [Key0-Data0|Map0], Key, Data, Map ) :- ( Key == Key0 -> Map = [Key-Data|Map0] ; Key @< Key0 -> Map = [Key-Data,Key0-Data0|Map0] ; otherwise -> % > Map = [Key0-Data0|Map1], map_store( Map0, Key, Data, Map1 ) ). /** context(?Element, ?PreserveSpace, ?CurrentNS, ?DefaultNS, ?Entities, ?Namespaces ) * * is an ADT hiding the "state" arguments for XML Acquisition */ initial_context( Controls, context(void,PreserveSpace,'','',Entities,Empty, RemoveAttributePrefixes,AllowAmpersand) ) :- empty_map( Empty ), ( member( extended_characters(false), Controls ) -> Entities = Empty ; otherwise -> extended_character_entities(Entities) ), ( member( format(false), Controls ) -> PreserveSpace = true ; otherwise -> PreserveSpace = false ), ( member( remove_attribute_prefixes(true), Controls ) -> RemoveAttributePrefixes = true ; otherwise -> RemoveAttributePrefixes = false ), ( member( allow_ampersand(true), Controls ) -> AllowAmpersand = true ; otherwise -> AllowAmpersand = false ). context_update( current_namespace, Context0, URI, Context1 ) :- Context0 = context(Element,Preserve,_Current,Default,Entities, Namespaces,RemoveAttributePrefixes,Amp), Context1 = context(Element,Preserve,URI,Default,Entities, Namespaces,RemoveAttributePrefixes,Amp). context_update( element, Context0, Tag, Context1 ) :- Context0 = context(_Element,Preserve,Current,Default,Entities, Namespaces,RemoveAttributePrefixes,Amp), Context1 = context(tag(Tag),Preserve,Current,Default,Entities, Namespaces,RemoveAttributePrefixes,Amp). context_update( default_namespace, Context0, URI, Context1 ):- Context0 = context(Element,Preserve,Current,_Default,Entities, Namespaces,RemoveAttributePrefixes,Amp), Context1 = context(Element,Preserve,Current,URI,Entities, Namespaces,RemoveAttributePrefixes,Amp). context_update( space_preserve, Context0, Boolean, Context1 ):- Context0 = context(Element,_Preserve,Current,Default,Entities, Namespaces,RemoveAttributePrefixes,Amp), Context1 = context(Element,Boolean,Current,Default,Entities, Namespaces,RemoveAttributePrefixes,Amp). context_update( ns_prefix(Prefix), Context0, URI, Context1 ) :- Context0 = context(Element,Preserve,Current,Default,Entities, Namespaces0,RemoveAttributePrefixes,Amp), Context1 = context(Element,Preserve,Current,Default,Entities, Namespaces1,RemoveAttributePrefixes,Amp), map_store( Namespaces0, Prefix, URI, Namespaces1 ). context_update( entity(Name), Context0, String, Context1 ) :- Context0 = context(Element,Preserve,Current,Default,Entities0, Namespaces,RemoveAttributePrefixes,Amp), Context1 = context(Element,Preserve,Current,Default,Entities1, Namespaces,RemoveAttributePrefixes,Amp), map_store( Entities0, Name, String, Entities1 ). remove_attribute_prefixes( Context ) :- Context = context(_Element,_Preserve,_Current,_Default,_Entities, _Namespaces,true,_Amp). current_tag( Context, Tag ) :- Context = context(tag(Tag),_Preserve,_Current,_Default,_Entities, _Namespaces,_RPFA,_Amp). current_namespace( Context, Current ) :- Context = context(_Element,_Preserve,Current,_Default,_Entities, _Namespaces,_RPFA,_Amp). default_namespace( Context, Default ) :- Context = context(_Element,_Preserve,_Current,Default,_Entities, _Namespaces,_RPFA,_Amp). space_preserve( Context ) :- Context = context(tag(_Tag),true,_Current,_Default,_Entities, _Namespaces,_RPFA,_Amp). specific_namespace( Prefix, Context, URI ) :- Context = context(_Element,_Preserve,_Current,_Default,_Entities, Namespaces,_RPFA,_Amp), map_member( Prefix, Namespaces, URI ). defined_entity( Reference, Context, String ) :- Context = context(_Element,_Preserve,_Current,_Default,Entities, _Namespaces,_RPFA,_Amp), map_member( Reference, Entities, String ). close_context( Context, Terms, WellFormed ) :- Context = context(Element,_Preserve,_Current,_Default,_Entities, _Namespaces,_RPFA,_Amp), close_context1( Element, Terms, WellFormed ). close_context1( void, [], true ). close_context1( tag(TagChars), [out_of_context(Tag)], false ) :- atom_chars( Tag, TagChars ). void_context( context(void,_Preserve,_Current,_Default,_Entities,_Names,_RPFA,_Amp) ). allow_ampersand( context(_Void,_Preserve,_Current,_Default,_Entities,_Names,_RPFA,true) ). /** pp_string( +String ) * * prints String onto the current output stream. * If String contains only 7-bit chars it is printed in shorthand quoted * format, otherwise it is written as a list. * If your Prolog uses " to delimit a special string type, just use write/1. */ pp_string( Chars ) :- ( member( Char, Chars ), not_shorthand( Char ) -> write( Chars ) ; otherwise -> put_quote, pp_string1( Chars ), put_quote ). not_shorthand( Char ) :- Char > 255. not_shorthand( Char ) :- Char < 9. not_shorthand( 126 ). % ~ gives syntax errors in LPA Prolog put_quote :- put( 0'" ). % ' pp_string1( [] ). pp_string1( [Char|Chars] ) :- ( Char =:= """" -> % Meta-quote put( Char ), put( Char ), pp_string1( Chars ) ; Char =:= 13, % Handle Windows border-settings Chars = [10|Chars1] -> put( 10 ), pp_string1( Chars1 ) ; otherwise -> put( Char ), pp_string1( Chars ) ). xml_declaration_attributes_valid( [] ). xml_declaration_attributes_valid( [Name=Value|Attributes] ) :- xml_declaration_attribute_valid( Name, Value ), xml_declaration_attributes_valid( Attributes ). xml_declaration_attribute_valid( Name, Value ) :- lowercase( Value, Lowercase ), canonical_xml_declaration_attribute( Name, Lowercase ). canonical_xml_declaration_attribute( version, "1.0" ). canonical_xml_declaration_attribute( standalone, "yes" ). canonical_xml_declaration_attribute( standalone, "no" ). % The encodings here are all valid for the output produced. canonical_xml_declaration_attribute( encoding, "utf-8" ). % canonical_xml_declaration_attribute( encoding, "utf-16" ). % This is erroneous for the output of this library canonical_xml_declaration_attribute( encoding, "us-ascii" ). canonical_xml_declaration_attribute( encoding, "ascii" ). canonical_xml_declaration_attribute( encoding, "iso-8859-1" ). canonical_xml_declaration_attribute( encoding, "iso-8859-2" ). canonical_xml_declaration_attribute( encoding, "iso-8859-15" ). canonical_xml_declaration_attribute( encoding, "windows-1252" ). % In general, it's better not to specify an encoding. /* lowercase( +MixedCase, ?Lowercase ) holds when Lowercase and MixedCase are * lists of character codes, and Lowercase is identical to MixedCase with * every uppercase character replaced by its lowercase equivalent. */ lowercase( [], [] ). lowercase( [Char|Chars], [Lower|LowerCase] ) :- ( Char >= "A", Char =< "Z" -> Lower is Char + "a" - "A" ; otherwise -> Lower = Char ), lowercase( Chars, LowerCase ). extended_character_entities( [ "AElig"-[198], % latin capital letter AE "Aacute"-[193], % latin capital letter A with acute, "Acirc"-[194], % latin capital letter A with circumflex, "Agrave"-[192], % latin capital letter A with grave "Alpha"-[913], % greek capital letter alpha, U+0391 "Aring"-[197], % latin capital letter A with ring above "Atilde"-[195], % latin capital letter A with tilde, "Auml"-[196], % latin capital letter A with diaeresis, "Beta"-[914], % greek capital letter beta, U+0392 "Ccedil"-[199], % latin capital letter C with cedilla, "Chi"-[935], % greek capital letter chi, U+03A7 "Dagger"-[8225], % double dagger, U+2021 ISOpub "Delta"-[916], % greek capital letter delta, "ETH"-[208], % latin capital letter ETH, U+00D0 ISOlat1> "Eacute"-[201], % latin capital letter E with acute, "Ecirc"-[202], % latin capital letter E with circumflex, "Egrave"-[200], % latin capital letter E with grave, "Epsilon"-[917], % greek capital letter epsilon, U+0395 "Eta"-[919], % greek capital letter eta, U+0397 "Euml"-[203], % latin capital letter E with diaeresis, "Gamma"-[915], % greek capital letter gamma, "Iacute"-[205], % latin capital letter I with acute, "Icirc"-[206], % latin capital letter I with circumflex, "Igrave"-[204], % latin capital letter I with grave, "Iota"-[921], % greek capital letter iota, U+0399 "Iuml"-[207], % latin capital letter I with diaeresis, "Kappa"-[922], % greek capital letter kappa, U+039A "Lambda"-[923], % greek capital letter lambda, "Mu"-[924], % greek capital letter mu, U+039C "Ntilde"-[209], % latin capital letter N with tilde, "Nu"-[925], % greek capital letter nu, U+039D "OElig"-[338], % latin capital ligature OE, "Oacute"-[211], % latin capital letter O with acute, "Ocirc"-[212], % latin capital letter O with circumflex, "Ograve"-[210], % latin capital letter O with grave, "Omega"-[937], % greek capital letter omega, "Omicron"-[927], % greek capital letter omicron, U+039F "Oslash"-[216], % latin capital letter O with stroke "Otilde"-[213], % latin capital letter O with tilde, "Ouml"-[214], % latin capital letter O with diaeresis, "Phi"-[934], % greek capital letter phi, "Pi"-[928], % greek capital letter pi, U+03A0 ISOgrk3 "Prime"-[8243], % double prime = seconds = inches, "Psi"-[936], % greek capital letter psi, "Rho"-[929], % greek capital letter rho, U+03A1 "Scaron"-[352], % latin capital letter S with caron, "Sigma"-[931], % greek capital letter sigma, "THORN"-[222], % latin capital letter THORN, "Tau"-[932], % greek capital letter tau, U+03A4 "Theta"-[920], % greek capital letter theta, "Uacute"-[218], % latin capital letter U with acute, "Ucirc"-[219], % latin capital letter U with circumflex, "Ugrave"-[217], % latin capital letter U with grave, "Upsilon"-[933], % greek capital letter upsilon, "Uuml"-[220], % latin capital letter U with diaeresis, "Xi"-[926], % greek capital letter xi, U+039E ISOgrk3 "Yacute"-[221], % latin capital letter Y with acute, "Yuml"-[376], % latin capital letter Y with diaeresis, "Zeta"-[918], % greek capital letter zeta, U+0396 "aacute"-[225], % latin small letter a with acute, "acirc"-[226], % latin small letter a with circumflex, "acute"-[180], % acute accent = spacing acute, "aelig"-[230], % latin small letter ae "agrave"-[224], % latin small letter a with grave "alefsym"-[8501], % alef symbol = first transfinite cardinal, "alpha"-[945], % greek small letter alpha, "and"-[8743], % logical and = wedge, U+2227 ISOtech "ang"-[8736], % angle, U+2220 ISOamso "aring"-[229], % latin small letter a with ring above "asymp"-[8776], % almost equal to = asymptotic to, "atilde"-[227], % latin small letter a with tilde, "auml"-[228], % latin small letter a with diaeresis, "bdquo"-[8222], % double low-9 quotation mark, U+201E NEW "beta"-[946], % greek small letter beta, U+03B2 ISOgrk3 "brvbar"-[166], % broken bar = broken vertical bar, "bull"-[8226], % bullet = black small circle, "cap"-[8745], % intersection = cap, U+2229 ISOtech "ccedil"-[231], % latin small letter c with cedilla, "cedil"-[184], % cedilla = spacing cedilla, U+00B8 ISOdia> "cent"-[162], % cent sign, U+00A2 ISOnum> "chi"-[967], % greek small letter chi, U+03C7 ISOgrk3 "circ"-[710], % modifier letter circumflex accent, "clubs"-[9827], % black club suit = shamrock, "cong"-[8773], % approximately equal to, U+2245 ISOtech "copy"-[169], % copyright sign, U+00A9 ISOnum> "crarr"-[8629], % downwards arrow with corner leftwards "cup"-[8746], % union = cup, U+222A ISOtech "curren"-[164], % currency sign, U+00A4 ISOnum> "dArr"-[8659], % downwards double arrow, U+21D3 ISOamsa "dagger"-[8224], % dagger, U+2020 ISOpub "darr"-[8595], % downwards arrow, U+2193 ISOnum "deg"-[176], % degree sign, U+00B0 ISOnum> "delta"-[948], % greek small letter delta, "diams"-[9830], % black diamond suit, U+2666 ISOpub "divide"-[247], % division sign, U+00F7 ISOnum> "eacute"-[233], % latin small letter e with acute, "ecirc"-[234], % latin small letter e with circumflex, "egrave"-[232], % latin small letter e with grave, "empty"-[8709], % empty set = null set = diameter, "emsp"-[8195], % em space, U+2003 ISOpub "ensp"-[8194], % en space, U+2002 ISOpub "epsilon"-[949], % greek small letter epsilon, "equiv"-[8801], % identical to, U+2261 ISOtech "eta"-[951], % greek small letter eta, U+03B7 ISOgrk3 "eth"-[240], % latin small letter eth, U+00F0 ISOlat1> "euml"-[235], % latin small letter e with diaeresis, "euro"-[8364], % euro sign, U+20AC NEW "exist"-[8707], % there exists, U+2203 ISOtech "fnof"-[402], % latin small f with hook = function "forall"-[8704], % for all, U+2200 ISOtech "frac12"-[189], % vulgar fraction one half "frac14"-[188], % vulgar fraction one quarter "frac34"-[190], % vulgar fraction three quarters "frasl"-[8260], % fraction slash, U+2044 NEW "gamma"-[947], % greek small letter gamma, "ge"-[8805], % greater-than or equal to, "hArr"-[8660], % left right double arrow, "harr"-[8596], % left right arrow, U+2194 ISOamsa "hearts"-[9829], % black heart suit = valentine, "hellip"-[8230], % horizontal ellipsis = three dot leader, "iacute"-[237], % latin small letter i with acute, "icirc"-[238], % latin small letter i with circumflex, "iexcl"-[161], % inverted exclamation mark, U+00A1 ISOnum> "igrave"-[236], % latin small letter i with grave, "image"-[8465], % blackletter capital I = imaginary part, "infin"-[8734], % infinity, U+221E ISOtech "int"-[8747], % integral, U+222B ISOtech "iota"-[953], % greek small letter iota, U+03B9 ISOgrk3 "iquest"-[191], % inverted question mark "isin"-[8712], % element of, U+2208 ISOtech "iuml"-[239], % latin small letter i with diaeresis, "kappa"-[954], % greek small letter kappa, "lArr"-[8656], % leftwards double arrow, U+21D0 ISOtech "lambda"-[955], % greek small letter lambda, "lang"-[9001], % left-pointing angle bracket = bra, "laquo"-[171], % left-pointing double angle quotation mark "larr"-[8592], % leftwards arrow, U+2190 ISOnum "lceil"-[8968], % left ceiling = apl upstile, "ldquo"-[8220], % left double quotation mark, "le"-[8804], % less-than or equal to, U+2264 ISOtech "lfloor"-[8970], % left floor = apl downstile, "lowast"-[8727], % asterisk operator, U+2217 ISOtech "loz"-[9674], % lozenge, U+25CA ISOpub "lrm"-[8206], % left-to-right mark, U+200E NEW RFC 2070 "lsaquo"-[8249], % single left-pointing angle quotation mark, "lsquo"-[8216], % left single quotation mark, "macr"-[175], % macron = spacing macron = overline "mdash"-[8212], % em dash, U+2014 ISOpub "micro"-[181], % micro sign, U+00B5 ISOnum> "middot"-[183], % middle dot = Georgian comma "minus"-[8722], % minus sign, U+2212 ISOtech "mu"-[956], % greek small letter mu, U+03BC ISOgrk3 "nabla"-[8711], % nabla = backward difference, "nbsp"-[160], % no-break space = non-breaking space, "ndash"-[8211], % en dash, U+2013 ISOpub "ne"-[8800], % not equal to, U+2260 ISOtech "ni"-[8715], % contains as member, U+220B ISOtech "not"-[172], % not sign, U+00AC ISOnum> "notin"-[8713], % not an element of, U+2209 ISOtech "nsub"-[8836], % not a subset of, U+2284 ISOamsn "ntilde"-[241], % latin small letter n with tilde, "nu"-[957], % greek small letter nu, U+03BD ISOgrk3 "oacute"-[243], % latin small letter o with acute, "ocirc"-[244], % latin small letter o with circumflex, "oelig"-[339], % latin small ligature oe, U+0153 ISOlat2 "ograve"-[242], % latin small letter o with grave, "oline"-[8254], % overline = spacing overscore, "omega"-[969], % greek small letter omega, "omicron"-[959], % greek small letter omicron, U+03BF NEW "oplus"-[8853], % circled plus = direct sum, "or"-[8744], % logical or = vee, U+2228 ISOtech "ordf"-[170], % feminine ordinal indicator, U+00AA ISOnum> "ordm"-[186], % masculine ordinal indicator, "oslash"-[248], % latin small letter o with stroke, "otilde"-[245], % latin small letter o with tilde, "otimes"-[8855], % circled times = vector product, "ouml"-[246], % latin small letter o with diaeresis, "para"-[182], % pilcrow sign = paragraph sign, "part"-[8706], % partial differential, U+2202 ISOtech "permil"-[8240], % per mille sign, U+2030 ISOtech "perp"-[8869], % up tack = orthogonal to = perpendicular, "phi"-[966], % greek small letter phi, U+03C6 ISOgrk3 "pi"-[960], % greek small letter pi, U+03C0 ISOgrk3 "piv"-[982], % greek pi symbol, U+03D6 ISOgrk3 "plusmn"-[177], % plus-minus sign = plus-or-minus sign, "pound"-[163], % pound sign, U+00A3 ISOnum> "prime"-[8242], % prime = minutes = feet, U+2032 ISOtech "prod"-[8719], % n-ary product = product sign, "prop"-[8733], % proportional to, U+221D ISOtech "psi"-[968], % greek small letter psi, U+03C8 ISOgrk3 "rArr"-[8658], % rightwards double arrow, "radic"-[8730], % square root = radical sign, "rang"-[9002], % right-pointing angle bracket = ket, "raquo"-[187], % right-pointing double angle quotation mark "rarr"-[8594], % rightwards arrow, U+2192 ISOnum "rceil"-[8969], % right ceiling, U+2309 ISOamsc "rdquo"-[8221], % right double quotation mark, "real"-[8476], % blackletter capital R = real part symbol, "reg"-[174], % registered sign = registered trade mark sign, "rfloor"-[8971], % right floor, U+230B ISOamsc "rho"-[961], % greek small letter rho, U+03C1 ISOgrk3 "rlm"-[8207], % right-to-left mark, U+200F NEW RFC 2070 "rsaquo"-[8250], % single right-pointing angle quotation mark, "rsquo"-[8217], % right single quotation mark, "sbquo"-[8218], % single low-9 quotation mark, U+201A NEW "scaron"-[353], % latin small letter s with caron, "sdot"-[8901], % dot operator, U+22C5 ISOamsb "sect"-[167], % section sign, U+00A7 ISOnum> "shy"-[173], % soft hyphen = discretionary hyphen, "sigma"-[963], % greek small letter sigma, "sigmaf"-[962], % greek small letter final sigma, "sim"-[8764], % tilde operator = varies with = similar to, "spades"-[9824], % black spade suit, U+2660 ISOpub "sub"-[8834], % subset of, U+2282 ISOtech "sube"-[8838], % subset of or equal to, U+2286 ISOtech "sum"-[8721], % n-ary sumation, U+2211 ISOamsb "sup"-[8835], % superset of, U+2283 ISOtech "sup1"-[185], % superscript one = superscript digit one, "sup2"-[178], % superscript two = superscript digit two "sup3"-[179], % superscript three = superscript digit three "supe"-[8839], % superset of or equal to, "szlig"-[223], % latin small letter sharp s = ess-zed, "tau"-[964], % greek small letter tau, U+03C4 ISOgrk3 "there4"-[8756], % therefore, U+2234 ISOtech "theta"-[952], % greek small letter theta, "thetasym"-[977], % greek small letter theta symbol, "thinsp"-[8201], % thin space, U+2009 ISOpub "thorn"-[254], % latin small letter thorn with, "tilde"-[732], % small tilde, U+02DC ISOdia "times"-[215], % multiplication sign, U+00D7 ISOnum> "trade"-[8482], % trade mark sign, U+2122 ISOnum "uArr"-[8657], % upwards double arrow, U+21D1 ISOamsa "uacute"-[250], % latin small letter u with acute, "uarr"-[8593], % upwards arrow, U+2191 ISOnum "ucirc"-[251], % latin small letter u with circumflex, "ugrave"-[249], % latin small letter u with grave, "uml"-[168], % diaeresis = spacing diaeresis, "upsih"-[978], % greek upsilon with hook symbol, "upsilon"-[965], % greek small letter upsilon, "uuml"-[252], % latin small letter u with diaeresis, "weierp"-[8472], % script capital P = power set "xi"-[958], % greek small letter xi, U+03BE ISOgrk3 "yacute"-[253], % latin small letter y with acute, "yen"-[165], % yen sign = yuan sign, U+00A5 ISOnum> "yuml"-[255], % latin small letter y with diaeresis, "zeta"-[950], % greek small letter zeta, U+03B6 ISOgrk3 "zwj"-[8205], % zero width joiner, U+200D NEW RFC 2070 "zwnj"-[8204] % zero width non-joiner, ] ). /* @pred chars( ?Chars, ?Plus, ?Minus ) * * used as chars( ?Chars ) in a DCG to * copy the list Chars inline. * * This is best expressed in terms of append/3 where append/3 is built-in. * For other Prologs, a straightforward specification can be used: * * chars( [] ) --> "". * chars( [Char|Chars] ) --> * [Char], * chars( Chars ). */ chars( Chars, Plus, Minus ) :- append( Chars, Minus, Plus ).