diff --git a/packages/sgml/pltotex.pl b/packages/sgml/pltotex.pl new file mode 100644 index 000000000..7e002aae8 --- /dev/null +++ b/packages/sgml/pltotex.pl @@ -0,0 +1,76 @@ +:- module(pltotex, + [ pltotex/2, + pltotex/0 + ]). +:- use_module(library(doc_latex)). +:- use_module(library(main)). +:- use_module(library(error)). +:- use_module(library(apply)). +:- use_module(library(lists)). + +pltotex(File, Options) :- + file_name_extension(_, txt, File), !, + tex_file(File, Out, Options), + doc_latex(File, Out, + [ stand_alone(false) + | Options + ]). +pltotex(Lib, Options) :- + ( file_name_extension(_, pl, Lib) + -> Spec = Lib + ; atom_to_term(Lib, Spec, _) + ), + absolute_file_name(Spec, File, + [ access(read), + file_type(prolog) + ]), + tex_file(File, Out, Options), + user:use_module(File), % we want the operators in user + doc_latex(File, Out, + [ stand_alone(false) + | Options + ]). + +tex_file(_, TeXFile, Options) :- + option(out(Base), Options), !, + file_name_extension(Base, tex, TeXFile). +tex_file(File, TeXFile, _) :- + file_base_name(File, Local), + file_name_extension(Base0, _, Local), + strip(Base0, 0'_, Base), + file_name_extension(Base, tex, TeXFile). + +strip(In, Code, Out) :- + atom_codes(In, Codes0), + delete(Codes0, Code, Codes), + atom_codes(Out, Codes). + + +%% pltotex +% +% Usage: pl -q -s pltotex.pl -g pltotex -- file ... + +pltotex :- + main. + +main(Argv) :- + partition(is_option, Argv, OptArgs, Files), + maplist(to_option, OptArgs, Options), + maplist(process_file(Options), Files). + +is_option(Arg) :- + sub_atom(Arg, 0, _, _, --). + +to_option('--section', section_level(section)) :- !. +to_option('--subsection', section_level(subsection)) :- !. +to_option('--subsubsection', section_level(subsubsection)) :- !. +to_option(Arg, Option) :- + atom_concat(--, Opt, Arg), + sub_atom(Opt, B, _, A, =), !, + sub_atom(Opt, 0, B, _, Name), + sub_atom(Opt, _, A, 0, Value), + Option =.. [Name, Value]. + +process_file(Options, File) :- + pltotex(File, Options). + diff --git a/packages/sgml/pwp.pl b/packages/sgml/pwp.pl new file mode 100644 index 000000000..dae6d7ec2 --- /dev/null +++ b/packages/sgml/pwp.pl @@ -0,0 +1,638 @@ +:- module(pwp, + [ pwp_files/2, % +FileIn, +FileOut + pwp_stream/3, % +StreamIn, +StreamOut, +Context + pwp_xml/3 % +DomIn, -DOMOut, +Context + ]). + +/** Prolog Well-formed Pages + +PWP is an approach to server-side scripting using Prolog +which is based on a simple key principle: + + - The source form of a PWP should be WELL-FORMED XML + +Especially when generating XML rather than HTML, this is such an +obvious thing to do. We have many kinds of XML checking tools. + + - We can tell whether an XML document is WELL FORMED (all the + punctuation is right, all tags balance) using practically + any decent parser, including SWI Prolog's 'sgml'. + + - If we can write a Document Type Description then we can check + that a document is VALID using tools like Open SP (formerly + James Clark's SP) or SWI Prolog's 'sgml'. This does not + guarantee that the output will be valid, but it does catch + a lot of errors very early. + + - If we can write an XML Schema then we can check that a + document is schema-valid. (SWI Prolog's 'sgml' does not + yet come with a schema validator, but who knows what the + future holds?). + + - Since an XML document is just a data structure, we can use + any checking tool that we can write in Prolog, IF the input + is well-formed so that we can load a template as a Prolog + data structure. + +Having decided that the input should be well formed, that means +*|NO NEW SYNTAX|* + +None of the weird and horrible <% ... %> or whatever not-quite-XML +stuff you see in other template systems, making checking so very hard +(and therefore, making errors so distressingly common). + +That in turns means that PWP "markup" must be based on special +elements or special attributes. The fact that an XML parser must +allow undeclared attributes on any element even when validating, +but must not allow undeclared elements, suggests doing this through +attributes. In particular, one should be able to take an existing +DTD, such as an XHTML DTD, and just use that without modification. +So the design reduces to + + - Allow dynamic data selection, insertion, and transformation + just using a small number of extra attributes. + +This description uses the following name space: + + == + xmlns:pwp='http://www.cs.otago.ac.nz/staffpriv/ok/pwp.pl' + == + +The attributes are + + - pwp:ask = Query + - pwp:use = Term + - pwp:how = text | xml + - pwp:tag = QName or '-' + - pwp:att = '' | 'one non-alphanumeric character' + +Here's what they mean. Each element is expanded in the context +of a set of variable bindings. After expansion, if the tag is +not mapped to '-', all attributes in the pwp: namespace are removed +and the children elements are recursively expanded. + + * pwp:ask = Query + + * Query is a Prolog goal. For each solution of Query, the element + is further processed with the new variables of Query added to + the context. + + * If Query is not a well formed Prolog goal, or if execution of + Query throws an exception, page transformation comes to a complete + halt and no page is generated. + + * pwp:use = Term + * pwp:how = text | xml | text-file | xml-file + + Term is a Prolog term; variables in Term are bound by the context. + An empty Term is regarded as a missing value for this attribute. + The Prolog variable CONTEXT refers to the entire context, a list + of Name = Value, where Name is a Prolog atom holding the name of the context variable and Value is an arbitrary Prolog term. + + - If pwp:how is text, + The value of Term is used to define a sequence of characters. + + - A number produces the same characters that write/1 would. + - An atom produces the same characters that write/1 would. + - A string produces the same characters that write/1 would. + - A list of character codes produces those characters. + + - The following terms produce the same sequence of characters + that the corresponding goal would have sent to the current + output stream: + + - write(Datum) + - writeq(Datum) + - write_canonical(Datum) + - print(Datum) + - print(Datum) + - format(Format) + - format(Format, Arguments) + + - A singleton list [X] defines the characters that X defines. + - Any other term F(T1,...,Tn) defines the characters that T1 + defines, followed by the characters that T2 defines, ..., + followed by the characters that Tn defines. + + - If pwp:how is xml, + The value of Term must be an XML term as defined in the + SGML2PL documentation or a list of such terms. A single + term is taken as if it had been [Term]. The resulting + list of terms replaces the children of the current element + and will not be further processed. + + - If pwp:how is text-file, + The value of Term is used to define a sequence of characters. + That sequence of characters is used as a file name. + The file is read as a sequence of characters, and that + sequence used as character data. + + - If pwp:how is xml-file, + The value of Term is used to define a sequence of characters. + That sequence of characters is used as a file name. + The file is loaded as XML, and the sequence of XML items thus + obtained used. This means that PWP provides XML inclusion + without depending on the parser to support XInclude. + + The default value for pwp:how is text. + + * pwp:tag = QName or '-' + + If pwp:tag is missing or the value is empty, the current element + appears in the output (after further processing) with its present + tag. If pwp:tag is a QName, the current element appears (...) + with that as its tag. That option is most useful in DTDs, where + an "authoring" DTD may use one tag and have it automatically mapped + to another tag in the output, e.g., ->
  • . Finally, if + pwp:tag is '-', the children of the current element (either the + result of pwp:use or the transformed original children, whichever + applies) appear in the output but there is no element around them. + + A missing or empty pwp:ask is just like pwp:ask = 'true'. + + * pwp:att = '' | 'one non-alphanumeric character'. + + Attributes in the pwp namespace are not affected by this attribute. + Such attributes are always stripped out and never substituted into. + + If pwp:att is missing or empty, attributes of the current + element are copied over to the output unchanged. + + If pwp:att = 'c' for some non-alphanumeric character c, + each attribute is examined for occurrences of c(...)c which + are as short as possible. + There is no one character which could be used every time, so you + have to explicitly choose a substitution marker which is safe + for the data you do not want to be altered. None of the pwp + attributes are inherited, least of all this one. + + Text outside c(...)c groups is copied unchanged; text inside + such a group is parsed as a Prolog term and treated as if by + pwp:how = text. + + +Examples: + + 1. *|A "Hello World" like example|* + + == + + + + </head> + <body> + <p><span pwp:use="Greeting" pwp:tag='-'/></p> + </body> + </html> + == + + where msg.pl contains + + == + msg('Hello, World!'). + == + + This example illustrates an important point. Prolog Well-Formed + Pages provide *NO* way to physically incorporate Prolog *clauses* + into a page template. Prolog clauses must be put in separate + files which can be checked by a Prolog syntax checker, compiler, + cross-referencer, &c WITHOUT the Prolog tool in question needing + to know anything whatsoever about PWP. You load the files using + pwp:ask on the root element. + + 2. *|Binding some variables and using them|* + + == + <html + xmlns:pwp="http://www.cs.otago.ac.nz/staffpriv/ok/pwp.pl"> + <head><title>Example 2 + +

    +

    The answer is .

    + + + == + + 3. *|Making a table|* + We are given a Prolog database staff.pl defining + staff(NickName, FullName, Office, Phone, E_Mail_Address). + status(NickName, full_time | part_time). + We want to make a phone list of full time staff. + + == + + + Phone list for Full-Time staff. + + +

    Phone list for Full-Time staff.

    + + + + +
    NamePhone
    + +
    + + + == + + 4. *|Substituting into an attribute|* + Same data base as before, but now we want to make a mailing list + page. + + == + + + Phone list for Full-Time staff. + + +

    Phone list for Full-Time staff.

    + + + + + +
    NameAddress
    +
    + + + == + + 5. *|If-then-else effect|* + A page that displays the value of the 'SHELL' environment + variable if it has one, otherwise displays 'There is no + default shell.' + + == + + $SHELL + +

    The default shell is .

    +

    There is no default shell.

    + + + == + + There is one other criterion for a good server-side template + language: + + It should be possible to compile templates so as to eliminate + most if not all interpretation overhead. + + This particular notation satisfies that criterion with the + limitation that the conversion of a term to character data requires + run-time traversal of terms (because the terms are not known until + run time). + +@author Richard O'Keefe +@tbd Support compilation of PWP input files +*/ + +:- use_module(library(sgml), [load_xml_file/2]). +:- use_module(library(sgml_write), [xml_write/3]). +:- use_module(library(lists), [append/3]). + +:- meta_predicate + pwp_files(:, +), + pwp_stream(:, +, +), + pwp_xml(:, -, +). + + +%% pwp_files(:In:atom, +Out:atom) is det. +% +% loads an Xml document from the file named In, +% transforms it using the PWP attributes, and +% writes the transformed version to the new file named Out. + +pwp_files(M:In, Out) :- + load_xml_file(In, Contents), + pwp_xml(M:Contents, Transformed, []), !, + setup_call_cleanup(open(Out, write, Output), + xml_write(Output, Transformed, []), + close(Output)). + + +%% pwp_stream(:Input:input_stream, +Output:output_stream, +%% +Context:list) is det. +% +% Loads an Xml document from the given Input stream, transforms it +% using the PWP attributes, and writes the transformed version to +% the given Output stream. Context provides initial contextual +% variables and is a list of Name=Value. + +pwp_stream(M:Input, Output, Context) :- + load_xml_file(stream(Input), Contents), + pwp_xml(M:Contents, Transformed, Context), !, + xml_write(Output, Transformed, []). + + +/* Recall that an XML term is one of + + Character Data + sdata(...) SDATA (SGML only) + ndata(...) NDATA + pi(...) Processing instruction + + element(Name, [Att...], [Child...]) + + where Att is Attribute=Value and Child is an XML term. + + We are only concerned with elements; all other XML terms are + left alone. I have given some thought to recognising + + + + processing instructions, executing the Command, and removing + the processing instructions, as a debugging tool. But this + is a proof-of-concept implementation; debugging features can + wait for The Real Thing. +*/ + + + +%% pwp_xml(:In:list(xml), -Out:list(xml), +Context) +% +% maps down a list of XML items, acting specially on elements and +% copying everything else unchanged, including white space. +% The Context is a list of 'VariableName'=CurrentValue bindings. + +pwp_xml(M:In, Out, Context) :- + pwp_list(In, Out, M, Context). + +pwp_list([], [], _, _). +pwp_list([element(Tag0,Atts0,Kids0)|Xs], Ys0, M, Context) :- !, + pwp_attributes(Atts0, Ask, Use, How, Att, Tag1, Atts1), + ( nonvar(Tag1), Tag1 \== '' -> Tag2 = Tag1 + ; Tag2 = Tag0 + ), + ( nonvar(Ask), Ask \== '', Ask \== 'true' + -> atom_to_term(Ask, Query, Bindings), + pwp_unite(Bindings, Context, Context1), + findall(Xml, + ( M:Query, + pwp_element(Tag2, Atts1, Kids0, Use, How, Att, + M, Context1, Xml)), + NewContent) + ; /* Ask is missing, empty, or true */ + pwp_element(Tag2, Atts1, Kids0, Use, How, Att, + M, Context, NewContent) + ), + pwp_attach(NewContent, Ys0, Ys1), + pwp_list(Xs, Ys1, M, Context). +pwp_list([X|Xs], [X|Ys], M, Context) :- + pwp_list(Xs, Ys, M, Context). + + +%% pwp_attributes(+Atts0:list(=(atom,atom)), +%% -Ask:atom, -Use:atom, -How:atom, -Att:atom, +%% -Tag:atom, -Atts1:list(=(atom,atom))) +% +% Walks down a list of AttributeName=ItsValue pairs, stripping out +% those whose AttributeName begins with the 'pwp:' prefix, and +% copying the rest to Atts1. Along the way, Ask/Use/How/Att/Tag +% are bound to the values of the +% pwp:ask/pwp:use/pwp:how/pwp:att/pwp:tag attributes, if any. At +% the end, any of these variables that are still unbound REMAIN +% unbound; they are not bound to default values. + +pwp_attributes([], _, _, _, _, _, []). +pwp_attributes([AV|AVs], Ask, Use, How, Att, Tag, New_Atts1) :- + AV = (Name=Value), + ( pwp_attr(Name, PWPName) + -> ( pwp_attr(PWPName, Value, Ask, Use, How, Att, Tag) + -> New_Atts1 = New_Atts2 + ; New_Atts1 = New_Atts2 + ) + ; New_Atts1 = [AV|New_Atts2] + ), + pwp_attributes(AVs, Ask, Use, How, Att, Tag, New_Atts2). + + +pwp_attr(ask, Value, Value, _Use, _How, _Att, _Tag). +pwp_attr(use, Value, _Ask, Value, _How, _Att, _Tag). +pwp_attr(how, Value, _Ask, _Use, Value, _Att, _Tag). +pwp_attr(att, Value, _Ask, _Use, _How, Value, _Tag). +pwp_attr(tag, Value, _Ask, _Use, _How, _Att, Value). + +%% pwp_attr(+XMLAttr, -PWPLocal) is semidet. +% +% True if PWPLocal is the local name of a pwp:Local expression in +% XML. This predicate deals with the three different XML +% representations: the form is returned of XML namespace +% processing is not enabled. The second if it is enabled and the +% namespace is properly defined and the last if the namespace is +% not defined. + +pwp_attr(Atom, PWP) :- + atom(Atom), + atom_concat('pwp:', PWP, Atom), !. +pwp_attr('http://www.cs.otago.ac.nz/staffpriv/ok/pwp.pl':PWP, PWP) :- !. +pwp_attr('pwp':PWP, PWP) :- !. +pwp_attr('xmlns:pwp', -). + +%% pwp_unite(+Bindings, +Context0, -Context:list(=(atom,any))) +% +% merges the new Bindings with the bindings in the outer Context0, +% constructing a new list of VariableName=CurrentValue bindings in +% Context1. This is only used when the CurrentValue parts of the +% new Bindings are known to be distinct new variables, so the +% Bindings cannot possibly conflict with any existing binding in +% Context0. This is O(|Bindings|.|Context0|), which is not that +% efficient, but since we do not expect there to be very many +% variables it doesn't matter much. + +pwp_unite(Bindings, Context0, Context) :- + pwp_unite(Bindings, Context0, Context0, Context). + + +pwp_unite([], _, Context, Context). +pwp_unite([Binding|Bindings], Context0, Context1, Context) :- + memberchk(Binding, Context0), !, + pwp_unite(Bindings, Context0, Context1, Context). +pwp_unite(['CONTEXT'=Context0|Bindings], Context0, Context1, Context) :- !, + pwp_unite(Bindings, Context0, Context1, Context). +pwp_unite([Binding|Bindings], Context0, Context1, Context) :- + pwp_unite(Bindings, Context0, [Binding|Context1], Context). + + + +%% pwp_unite(+Bindings, +Context0: list(=(atom,any))) +% +% looks up the bindings in Bindings in the outer Context0. +% This is only used for 'pwp:use' terms (and the related terms +% in $(...)$ attribute value substitutions), so that we have +% no interest in forming a new context. (If we did, we'd use +% pwp_unite/3 instead.) This is only used when the CurrentValue +% parts of the new Bindings are known to be distinct new variables, +% so the Bindings cannot possibly conflict with any existing +% binding in Context0. However, there _could_ be new variables +% in Bindings, and that would cause problems. An XML term may +% not contain variables, and a term we want to convert to a list +% of character codes had better not contain variables either. +% One approach would be to just bind such variables to something, +% another is to throw some kind of exception. For the moment we +% call functor/3 so as to get an instantiation error. + +pwp_unite([], _). +pwp_unite([Binding|Bindings], Context) :- + memberchk(Binding, Context), !, + pwp_unite(Bindings, Context). +pwp_unite([_=Value|_], _) :- + functor(Value, _, _). + +%% pwp_attach(+Tree, ?Ys0: list(xml), ?Ys: list(xml)) +% +% is a combination of "flatten" and "append". +% It unifies Ys0\Ys with the result of flattening Tree. + +pwp_attach([], Ys, Ys) :- !. +pwp_attach([X|Xs], Ys0, Ys) :- !, + pwp_attach(X, Ys0, Ys1), + pwp_attach(Xs, Ys1, Ys). +pwp_attach(X, [X|Ys], Ys). + + + +pwp_element('-', _, Kids, Use, How, _, M, Context, Xml) :- !, + pwp_use(Use, How, Kids, M, Context, Xml). +pwp_element(Tag, Atts, Kids, Use, How, Magic, M, Context, + element(Tag,Atts1,Kids1)) :- + ( nonvar(Magic) + -> pwp_substitute(Atts, Magic, Context, Atts1) + ; Atts1 = Atts + ), + pwp_use(Use, How, Kids, M, Context, Kids1). + +pwp_use('', _, Kids, M, Context, Kids1) :- !, + pwp_list(Kids, Kids1, M, Context). +pwp_use(Use, How, _, _, Context, Kids1) :- + atom_to_term(Use, Term, Bindings), + pwp_unite(Bindings, Context), + pwp_how(How, Term, Kids1). + +pwp_how('text', Term, [CData]) :- !, + pwp_use_codes(Term, Codes, []), + atom_codes(CData, Codes). +pwp_how('xml', Term, Kids1) :- + ( Term == [] -> Kids1 = Term + ; Term = [_|_] -> Kids1 = Term + ; Kids1 = [Term] + ). +pwp_how('text-file', Term, [CData]) :- + pwp_use_codes(Term, Codes, []), + atom_codes(FileName, Codes), + read_file_to_codes(FileName, FileCodes, []), + atom_codes(CData, FileCodes). +pwp_how('xml-file', Term, Kids1) :- + pwp_use_codes(Term, Codes, []), + atom_codes(FileName, Codes), + load_xml_file(FileName, Kids1). + + +pwp_substitute([], _, _, []). +pwp_substitute([AV|AVs], Magic, Context, [AV1|Atts1]) :- + AV = (Name = Value), + ( sub_atom(Value, _, _, _, Magic) + -> char_code(Magic, C), + atom_codes(Value, Codes), + pwp_split(Codes, C, B0, T0, A0), !, + pwp_substitute(B0, T0, A0, C, Context, V), + atom_codes(New_Value, V), + AV1 = (Name = New_Value), + pwp_substitute(AVs, Magic, Context, Atts1) + ). +pwp_substitute([AV|AVs], Magic, Context, [AV|Atts1]) :- + pwp_substitute(AVs, Magic, Context, Atts1). + + +pwp_substitute(B0, T0, A0, C, Context, V0) :- + append(B0, V1, V0), + atom_codes(Atom, T0), + atom_to_term(Atom, Term, Bindings), + pwp_unite(Bindings, Context, _), + pwp_use_codes(Term, V1, V2), + ( pwp_split(A0, C, B1, T1, A1) + -> pwp_substitute(B1, T1, A1, C, Context, V2) + ; V2 = A0 + ). + + +pwp_split(Codes, C, Before, Text, After) :- + append(Before, [C,0'(|Rest], Codes), + append(Text, [0'),C|After], Rest), !. + + +pwp_use_codes(format(Format), S0, S) :- !, + pwp_format(Format, [], S0, S). +pwp_use_codes(format(Format,Args), S0, S) :- !, + pwp_format(Format, Args, S0, S). +pwp_use_codes(write_canonical(Datum), S0, S) :- !, + pwp_format('~k', [Datum], S0, S). +pwp_use_codes(print(Datum), S0, S) :- !, + pwp_format('~p', [Datum], S0, S). +pwp_use_codes(writeq(Datum), S0, S) :- !, + pwp_format('~q', [Datum], S0, S). +pwp_use_codes(write(Datum), S0, S) :- !, + pwp_format('~w', [Datum], S0, S). +pwp_use_codes(Atomic, S0, S) :- + atomic(Atomic), !, + ( number(Atomic) -> number_codes(Atomic, Codes) + ; atom(Atomic) -> atom_codes(Atomic, Codes) + ; string(Atomic) -> string_to_list(Atomic, Codes) + ; pwp_format('~w', [Atomic], S0, S) + ), + append(Codes, S, S0). +pwp_use_codes([X|Xs], S0, S) :- + pwp_is_codes([X|Xs]), !, + append([X|Xs], S, S0). +pwp_use_codes([X|Xs], S0, S) :- !, + pwp_use_codes(Xs, X, S0, S). +pwp_use_codes(Compound, S0, S) :- + Compound =.. [_,X|Xs], + pwp_use_codes(Xs, X, S0, S). + + + +pwp_use_codes([], X, S0, S) :- !, + pwp_use_codes(X, S0, S). +pwp_use_codes([Y|Ys], X, S0, S) :- + pwp_use_codes(X, S0, S1), + pwp_use_codes(Ys, Y, S1, S). + + + +%% pwp_is_codes(+String: any) +% +% is true when String is a list of integers and each of those +% integers is a possible Unicode value (in the range U+0000..U+10FFFF). +% Back in the days of ISO Latin 1 we would have checked for 0..255, +% and way back in the days of ASCII for 0..127. Yes, there are more +% than a million possible characters in Unicode; currently about +% 100 000 of them are in use. + +pwp_is_codes([]). +pwp_is_codes([C|Cs]) :- + integer(C), C >= 0, C =< 0x10FFFF, + pwp_is_codes(Cs). + +pwp_format(Format, Arguments, S0, S) :- + format(codes(S0, S), Format, Arguments). diff --git a/packages/sgml/xpath.pl b/packages/sgml/xpath.pl new file mode 100644 index 000000000..e74bd3250 --- /dev/null +++ b/packages/sgml/xpath.pl @@ -0,0 +1,336 @@ +/* This file is part of ClioPatria. + + Author: Jan Wielemaker + HTTP: http://e-culture.multimedian.nl/ + Copyright: 2007, E-Culture/MultimediaN + + ClioPatria is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + ClioPatria is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with ClioPatria. If not, see . +*/ + +:- module(xpath, + [ xpath/3, % +DOM, +Spec, -Value + xpath_chk/3, % +DOM, +Spec, -Value + + op(400, fx, //), + op(400, fx, /), + op(200, fy, @) + ]). +:- use_module(library(record)). +:- use_module(library(lists)). +:- use_module(library(occurs)). +:- use_module(library(debug)). + +/** Select nodes in an XML DOM + +The library xpath.pl provides predicates to select nodes from an XML DOM +tree as produced by library(sgml) based on descriptions inspired by the +XPATH language. + +The predicate xpath/3 selects a sub-structure of the DOM +non-deterministically based on an xpath-like specification. Not all +selectors of XPATH are implemented, but the ability to mix xpath/3 calls +with arbitrary Prolog code provides a powerful tool for extracting +information from XML parse-trees. + +@see http://www.w3.org/TR/xpath +*/ + +:- record + element(name, attributes, content). + +%% xpath_chk(+DOM, +Spec, ?Content) is semidet. +% +% Semi-deterministic version of xpath/3. + +xpath_chk(DOM, Spec, Content) :- + xpath(DOM, Spec, Content), !. + +%% xpath(+DOM, +Spec, ?Content) is nondet. +% +% Match an element in a DOM structure. The syntax is inspired by +% XPath, using () rather than [] to select inside an element. +% First we can construct paths using / and //: +% +% $ =|//|=Term : +% Select any node in the DOM matching term. +% $ =|/|=Term : +% Match the root against Term. +% $ Term : +% Select the immediate children of the root matching Term. +% +% The Terms above are of type _callable_. The functor specifies +% the element name. The element name '*' refers to any element. +% The name =self= refers to the top-element itself and is often +% used for processing matches of an earlier xpath/3 query. A term +% NS:Term refers to an XML name in the namespace NS. Optional +% arguments specify additional constraints and functions. The +% arguments are processed from left to right. Defined conditional +% argument values are: +% +% $ Integer : +% The N-th element with the given name +% $ =last= : +% The last element with the given name. +% $ =last= - IntExpr : +% The IntExpr-th element counting from the last (0-based) +% +% Defined function argument values are: +% +% $ =self= : +% Evaluate to the entire element +% $ =text= : +% Evaluates to all text from the sub-tree as an atom +% $ =normalize_space= : +% As =text=, but uses normalize_space/2 to normalise +% white-space in the output +% $ =number= : +% Extract an integer or float from the value. Ignores +% leading and trailing white-space +% $ =|@|=Attribute : +% Evaluates to the value of the given attribute +% +% In addition, the argument-list can be _conditions_: +% +% $ Left = Right : +% Succeeds if the left-hand unifies with the right-hand. +% E.g. normalize_space = 'euro' +% $ contains(Haystack, Needle) : +% Succeeds if Needle is a sub-string of Haystack. +% +% Examples: +% +% Match each table-row in DOM: +% +% == +% xpath(DOM, //tr, TR) +% == +% +% Match the last cell of each tablerow in DOM. This example +% illustrates that a result can be the input of subsequent xpath/3 +% queries. Using multiple queries on the intermediate TR term +% guarantee that all results come from the same table-row: +% +% == +% xpath(DOM, //tr, TR), +% xpath(TR, /td(last), TD) +% == +% +% Match each =href= attribute in an element +% +% == +% xpath(DOM, //a(@href), HREF) +% == +% +% Suppose we have a table containing rows where each first column +% is the name of a product with a link to details and the second +% is the price (a number). The following predicate matches the +% name, URL and price: +% +% == +% product(DOM, Name, URL, Price) :- +% xpath(DOM, //tr, TR), +% xpath(TR, td(1), C1), +% xpath(C1, /self(normalize_space), Name), +% xpath(C1, a(@href), URL), +% xpath(TR, td(2, number), Price). +% == + +xpath(DOM, Spec, Content) :- + in_dom(Spec, DOM, Content). + +in_dom(//Spec, DOM, Value) :- !, + element_spec(Spec, Name, Modifiers), + sub_dom(I, Len, Name, E, DOM), + modifiers(Modifiers, I, Len, E, Value). +in_dom(/Spec, E, Value) :- !, + element_spec(Spec, Name, Modifiers), + ( Name == self + -> true + ; element_name(E, Name) + ), + modifiers(Modifiers, 1, 1, E, Value). +in_dom(A/B, DOM, Value) :- !, + in_dom(A, DOM, Value0), + in_dom(B, Value0, Value). +in_dom(A//B, DOM, Value) :- !, + in_dom(A, DOM, Value0), + in_dom(//B, Value0, Value). +in_dom(Spec, element(_, _, Content), Value) :- + element_spec(Spec, Name, Modifiers), + count_named_elements(Content, Name, CLen), + CLen > 0, + nth_element(N, Name, E, Content), + modifiers(Modifiers, N, CLen, E, Value). + +element_spec(Var, _, _) :- + var(Var), !, + instantiation_error(Var). +element_spec(NS:Term, NS:Name, Modifiers) :- !, + Term =.. [Name0|Modifiers], + star(Name0, Name). +element_spec(Term, Name, Modifiers) :- !, + Term =.. [Name0|Modifiers], + star(Name0, Name). + +star(*, _) :- !. +star(Name, Name). + + +%% sub_dom(-Index, -Count, +Name, -Sub, +DOM) is nondet. +% +% Sub is a node in DOM with Name. +% +% @param Count is the total number of nodes in the content +% list Sub appears that have the same name. +% @param Index is the 1-based index of Sub of nodes with +% Name. + +sub_dom(1, 1, Name, DOM, DOM) :- + element_name(DOM, Name). +sub_dom(N, Len, Name, E, element(_,_,Content)) :- !, + sub_dom_2(N, Len, Name, E, Content). +sub_dom(N, Len, Name, E, Content) :- + is_list(Content), + sub_dom_2(N, Len, Name, E, Content). + +sub_dom_2(N, Len, Name, Element, Content) :- + ( count_named_elements(Content, Name, Len), + nth_element(N, Name, Element, Content) + ; member(element(_,_,C2), Content), + sub_dom_2(N, Len, Name, Element, C2) + ). + + +%% count_named_elements(+Content, +Name, -Count) is det. +% +% Count is the number of nodes with Name in Content. + +count_named_elements(Content, Name, Count) :- + count_named_elements(Content, Name, 0, Count). + +count_named_elements([], _, Count, Count). +count_named_elements([element(Name,_,_)|T], Name, C0, C) :- !, + C1 is C0+1, + count_named_elements(T, Name, C1, C). +count_named_elements([_|T], Name, C0, C) :- + count_named_elements(T, Name, C0, C). + + +%% nth_element(?N, +Name, -Element, +Content:list) is nondet. +% +% True if Element is the N-th element with name in Content. + +nth_element(N, Name, Element, Content) :- + nth_element_(1, N, Name, Element, Content). + +nth_element_(I, N, Name, E, [H|T]) :- + element_name(H, Name), !, + ( N = I, + E = H + ; I2 is I + 1, + ( nonvar(N), I2 > N + -> !, fail + ; true + ), + nth_element_(I2, N, Name, E, T) + ). +nth_element_(I, N, Name, E, [_|T]) :- + nth_element_(I, N, Name, E, T). + + +%% modifiers(+Modifiers, +I, +Clen, +DOM, -Value) +% +% + +modifiers([], _, _, Value, Value). +modifiers([H|T], I, L, Value0, Value) :- + modifier(H, I, L, Value0, Value1), + modifiers(T, I, L, Value1, Value). + +modifier(N, I, _, Value, Value) :- % Integer + integer(N), !, + N =:= I. +modifier(last, I, L, Value, Value) :- !, % last + I =:= L. +modifier(last-Expr, I, L, Value, Value) :- !, % last-Expr + I =:= L-Expr. +modifier(Function, _, _, In, Out) :- + xpath_function(Function, In, Out). + +xpath_function(self, DOM, Value) :- !, % self + Value = DOM. +xpath_function(text, DOM, Text) :- !, % text + text_of_dom(DOM, Text). +xpath_function(normalize_space, DOM, Text) :- !, % normalize_space + text_of_dom(DOM, Text0), + normalize_space(atom(Text), Text0). +xpath_function(number, DOM, Number) :- !, % number + text_of_dom(DOM, Text0), + normalize_space(string(Text), Text0), + catch(atom_number(Text, Number), _, fail). +xpath_function(@Name, element(_, Attrs, _), Value) :- !, % @Name + memberchk(Name=Value, Attrs). +xpath_function(Left = Right, Value, Value) :- !, % = + var_or_function(Left, Value, LeftValue), + var_or_function(Right, Value, RightValue), + LeftValue = RightValue. +xpath_function(contains(Haystack, Needle), Value, Value) :- !, % contains(Haystack, Needle) + val_or_function(Haystack, Value, HaystackValue), + val_or_function(Needle, Value, NeedleValue), + atom(HaystackValue), atom(NeedleValue), + ( sub_atom(HaystackValue, _, _, _, NeedleValue) + -> true + ). + +var_or_function(Arg, _, Arg) :- + var(Arg), !. +var_or_function(Func, Value0, Value) :- + xpath_function(Func, Value0, Value). + +val_or_function(Arg, _, Arg) :- + var(Arg), !, + instantiation_error(Arg). +val_or_function(Func, Value0, Value) :- % TBD + xpath_function(Func, Value0, Value). + + +%% text_of_dom(+DOM, -Text:atom) is det. +% +% Text is the joined textual content of DOM. + +text_of_dom(DOM, Text) :- + phrase(text_of(DOM), Tokens), + concat_atom(Tokens, Text). + +text_of(element(_,_,Content)) --> + text_of_list(Content). +text_of([]) --> + []. +text_of([H|T]) --> + text_of(H), + text_of(T). + + +text_of_list([]) --> + []. +text_of_list([H|T]) --> + text_of_1(H), + text_of_list(T). + + +text_of_1(element(_,_,Content)) --> !, + text_of_list(Content). +text_of_1(Data) --> + { assertion(atom(Data)) }, + [Data].