add missing files.

This commit is contained in:
Vitor Santos Costa 2010-07-19 14:55:13 +01:00
parent 9ebf1e46f9
commit 8ca1345c09
3 changed files with 1050 additions and 0 deletions

76
packages/sgml/pltotex.pl Normal file
View File

@ -0,0 +1,76 @@
:- module(pltotex,
[ pltotex/2,
pltotex/0
]).
:- use_module(library(doc_latex)).
:- use_module(library(main)).
:- use_module(library(error)).
:- use_module(library(apply)).
:- use_module(library(lists)).
pltotex(File, Options) :-
file_name_extension(_, txt, File), !,
tex_file(File, Out, Options),
doc_latex(File, Out,
[ stand_alone(false)
| Options
]).
pltotex(Lib, Options) :-
( file_name_extension(_, pl, Lib)
-> Spec = Lib
; atom_to_term(Lib, Spec, _)
),
absolute_file_name(Spec, File,
[ access(read),
file_type(prolog)
]),
tex_file(File, Out, Options),
user:use_module(File), % we want the operators in user
doc_latex(File, Out,
[ stand_alone(false)
| Options
]).
tex_file(_, TeXFile, Options) :-
option(out(Base), Options), !,
file_name_extension(Base, tex, TeXFile).
tex_file(File, TeXFile, _) :-
file_base_name(File, Local),
file_name_extension(Base0, _, Local),
strip(Base0, 0'_, Base),
file_name_extension(Base, tex, TeXFile).
strip(In, Code, Out) :-
atom_codes(In, Codes0),
delete(Codes0, Code, Codes),
atom_codes(Out, Codes).
%% pltotex
%
% Usage: pl -q -s pltotex.pl -g pltotex -- file ...
pltotex :-
main.
main(Argv) :-
partition(is_option, Argv, OptArgs, Files),
maplist(to_option, OptArgs, Options),
maplist(process_file(Options), Files).
is_option(Arg) :-
sub_atom(Arg, 0, _, _, --).
to_option('--section', section_level(section)) :- !.
to_option('--subsection', section_level(subsection)) :- !.
to_option('--subsubsection', section_level(subsubsection)) :- !.
to_option(Arg, Option) :-
atom_concat(--, Opt, Arg),
sub_atom(Opt, B, _, A, =), !,
sub_atom(Opt, 0, B, _, Name),
sub_atom(Opt, _, A, 0, Value),
Option =.. [Name, Value].
process_file(Options, File) :-
pltotex(File, Options).

638
packages/sgml/pwp.pl Normal file
View File

@ -0,0 +1,638 @@
:- module(pwp,
[ pwp_files/2, % +FileIn, +FileOut
pwp_stream/3, % +StreamIn, +StreamOut, +Context
pwp_xml/3 % +DomIn, -DOMOut, +Context
]).
/** <module> Prolog Well-formed Pages
PWP is an approach to server-side scripting using Prolog
which is based on a simple key principle:
- The source form of a PWP should be WELL-FORMED XML
Especially when generating XML rather than HTML, this is such an
obvious thing to do. We have many kinds of XML checking tools.
- We can tell whether an XML document is WELL FORMED (all the
punctuation is right, all tags balance) using practically
any decent parser, including SWI Prolog's 'sgml'.
- If we can write a Document Type Description then we can check
that a document is VALID using tools like Open SP (formerly
James Clark's SP) or SWI Prolog's 'sgml'. This does not
guarantee that the output will be valid, but it does catch
a lot of errors very early.
- If we can write an XML Schema then we can check that a
document is schema-valid. (SWI Prolog's 'sgml' does not
yet come with a schema validator, but who knows what the
future holds?).
- Since an XML document is just a data structure, we can use
any checking tool that we can write in Prolog, IF the input
is well-formed so that we can load a template as a Prolog
data structure.
Having decided that the input should be well formed, that means
*|NO NEW SYNTAX|*
None of the weird and horrible <% ... %> or whatever not-quite-XML
stuff you see in other template systems, making checking so very hard
(and therefore, making errors so distressingly common).
That in turns means that PWP "markup" must be based on special
elements or special attributes. The fact that an XML parser must
allow undeclared attributes on any element even when validating,
but must not allow undeclared elements, suggests doing this through
attributes. In particular, one should be able to take an existing
DTD, such as an XHTML DTD, and just use that without modification.
So the design reduces to
- Allow dynamic data selection, insertion, and transformation
just using a small number of extra attributes.
This description uses the following name space:
==
xmlns:pwp='http://www.cs.otago.ac.nz/staffpriv/ok/pwp.pl'
==
The attributes are
- pwp:ask = Query
- pwp:use = Term
- pwp:how = text | xml
- pwp:tag = QName or '-'
- pwp:att = '' | 'one non-alphanumeric character'
Here's what they mean. Each element is expanded in the context
of a set of variable bindings. After expansion, if the tag is
not mapped to '-', all attributes in the pwp: namespace are removed
and the children elements are recursively expanded.
* pwp:ask = Query
* Query is a Prolog goal. For each solution of Query, the element
is further processed with the new variables of Query added to
the context.
* If Query is not a well formed Prolog goal, or if execution of
Query throws an exception, page transformation comes to a complete
halt and no page is generated.
* pwp:use = Term
* pwp:how = text | xml | text-file | xml-file
Term is a Prolog term; variables in Term are bound by the context.
An empty Term is regarded as a missing value for this attribute.
The Prolog variable CONTEXT refers to the entire context, a list
of Name = Value, where Name is a Prolog atom holding the name of the context variable and Value is an arbitrary Prolog term.
- If pwp:how is text,
The value of Term is used to define a sequence of characters.
- A number produces the same characters that write/1 would.
- An atom produces the same characters that write/1 would.
- A string produces the same characters that write/1 would.
- A list of character codes produces those characters.
- The following terms produce the same sequence of characters
that the corresponding goal would have sent to the current
output stream:
- write(Datum)
- writeq(Datum)
- write_canonical(Datum)
- print(Datum)
- print(Datum)
- format(Format)
- format(Format, Arguments)
- A singleton list [X] defines the characters that X defines.
- Any other term F(T1,...,Tn) defines the characters that T1
defines, followed by the characters that T2 defines, ...,
followed by the characters that Tn defines.
- If pwp:how is xml,
The value of Term must be an XML term as defined in the
SGML2PL documentation or a list of such terms. A single
term is taken as if it had been [Term]. The resulting
list of terms replaces the children of the current element
and will not be further processed.
- If pwp:how is text-file,
The value of Term is used to define a sequence of characters.
That sequence of characters is used as a file name.
The file is read as a sequence of characters, and that
sequence used as character data.
- If pwp:how is xml-file,
The value of Term is used to define a sequence of characters.
That sequence of characters is used as a file name.
The file is loaded as XML, and the sequence of XML items thus
obtained used. This means that PWP provides XML inclusion
without depending on the parser to support XInclude.
The default value for pwp:how is text.
* pwp:tag = QName or '-'
If pwp:tag is missing or the value is empty, the current element
appears in the output (after further processing) with its present
tag. If pwp:tag is a QName, the current element appears (...)
with that as its tag. That option is most useful in DTDs, where
an "authoring" DTD may use one tag and have it automatically mapped
to another tag in the output, e.g., <item> -> <li>. Finally, if
pwp:tag is '-', the children of the current element (either the
result of pwp:use or the transformed original children, whichever
applies) appear in the output but there is no element around them.
A missing or empty pwp:ask is just like pwp:ask = 'true'.
* pwp:att = '' | 'one non-alphanumeric character'.
Attributes in the pwp namespace are not affected by this attribute.
Such attributes are always stripped out and never substituted into.
If pwp:att is missing or empty, attributes of the current
element are copied over to the output unchanged.
If pwp:att = 'c' for some non-alphanumeric character c,
each attribute is examined for occurrences of c(...)c which
are as short as possible.
There is no one character which could be used every time, so you
have to explicitly choose a substitution marker which is safe
for the data you do not want to be altered. None of the pwp
attributes are inherited, least of all this one.
Text outside c(...)c groups is copied unchanged; text inside
such a group is parsed as a Prolog term and treated as if by
pwp:how = text.
Examples:
1. *|A "Hello World" like example|*
==
<html
xmlns:pwp="http://www.cs.otago.ac.nz/staffpriv/ok/pwp.pl"
pwp:ask = "ensure_loaded(msg), once(msg(Greeting))">
<head>
<title pwp:use="Greeting"/>
</head>
<body>
<p><span pwp:use="Greeting" pwp:tag='-'/></p>
</body>
</html>
==
where msg.pl contains
==
msg('Hello, World!').
==
This example illustrates an important point. Prolog Well-Formed
Pages provide *NO* way to physically incorporate Prolog *clauses*
into a page template. Prolog clauses must be put in separate
files which can be checked by a Prolog syntax checker, compiler,
cross-referencer, &c WITHOUT the Prolog tool in question needing
to know anything whatsoever about PWP. You load the files using
pwp:ask on the root element.
2. *|Binding some variables and using them|*
==
<html
xmlns:pwp="http://www.cs.otago.ac.nz/staffpriv/ok/pwp.pl">
<head><title>Example 2</title></head>
<body pwp:ask="Hello = 'Hello world', A = 20, B = 22">
<h1 pwp:use="Hello"/>
<p>The answer is <span pwp:use="C" pwp:ask="C is A+B"/>.</p>
</body>
</html>
==
3. *|Making a table|*
We are given a Prolog database staff.pl defining
staff(NickName, FullName, Office, Phone, E_Mail_Address).
status(NickName, full_time | part_time).
We want to make a phone list of full time staff.
==
<html
xmlns:pwp="http://www.cs.otago.ac.nz/staffpriv/ok/pwp.pl"
pwp:ask='ensure_loaded(staff)'>
<head>
<title>Phone list for Full-Time staff.</title>
</head>
<body>
<h1>Phone list for Full-Time staff.</h1>
<table
pwp:ask = "setof(FullName-Phone,
N^O^E^(
status(N, full_time),
staff(N, FullName, O, Phone, E)
),
Staff_List)">
<tr><th>Name</th><th>Phone</th></tr>
<tr pwp:ask="member(FullName-Phone, Staff_List)">
<td pwp:use="FullName"/>
<td pwp:use="Phone"/>
</tr>
</table>
</body>
</html>
==
4. *|Substituting into an attribute|*
Same data base as before, but now we want to make a mailing list
page.
==
<html
xmlns:pwp="http://www.cs.otago.ac.nz/staffpriv/ok/pwp.pl"
pwp:ask='ensure_loaded(staff)'>
<head>
<title>Phone list for Full-Time staff.</title>
</head>
<body>
<h1>Phone list for Full-Time staff.</h1>
<table
pwp:ask = "setof(FullName-E_Mail,
N^O^P^staff(N, FullName, O, P, E_Mail),
Staff_List)">
<tr><th>Name</th><th>Address</th></tr>
<tr pwp:ask="member(FullName-E_Mail, Staff_List)">
<td pwp:use="FullName"/>
<td><a pwp:use="E_Mail"
pwp:att='$' href="mailto:$(E_Mail)$"/></td>
</tr>
</table>
</body>
</html>
==
5. *|If-then-else effect|*
A page that displays the value of the 'SHELL' environment
variable if it has one, otherwise displays 'There is no
default shell.'
==
<html
xmlns:pwp="http://www.cs.otago.ac.nz/staffpriv/ok/pwp.pl">
<head><title>$SHELL</title></head>
<body>
<p pwp:ask="getenv('SHELL', Shell)"
>The default shell is <span pwp:tag="-" pwp:use="Shell"/>.</p>
<p pwp:ask="\+getenv('SHELL',_)">There is no default shell.</p>
</body>
</html>
==
There is one other criterion for a good server-side template
language:
It should be possible to compile templates so as to eliminate
most if not all interpretation overhead.
This particular notation satisfies that criterion with the
limitation that the conversion of a term to character data requires
run-time traversal of terms (because the terms are not known until
run time).
@author Richard O'Keefe
@tbd Support compilation of PWP input files
*/
:- use_module(library(sgml), [load_xml_file/2]).
:- use_module(library(sgml_write), [xml_write/3]).
:- use_module(library(lists), [append/3]).
:- meta_predicate
pwp_files(:, +),
pwp_stream(:, +, +),
pwp_xml(:, -, +).
%% pwp_files(:In:atom, +Out:atom) is det.
%
% loads an Xml document from the file named In,
% transforms it using the PWP attributes, and
% writes the transformed version to the new file named Out.
pwp_files(M:In, Out) :-
load_xml_file(In, Contents),
pwp_xml(M:Contents, Transformed, []), !,
setup_call_cleanup(open(Out, write, Output),
xml_write(Output, Transformed, []),
close(Output)).
%% pwp_stream(:Input:input_stream, +Output:output_stream,
%% +Context:list) is det.
%
% Loads an Xml document from the given Input stream, transforms it
% using the PWP attributes, and writes the transformed version to
% the given Output stream. Context provides initial contextual
% variables and is a list of Name=Value.
pwp_stream(M:Input, Output, Context) :-
load_xml_file(stream(Input), Contents),
pwp_xml(M:Contents, Transformed, Context), !,
xml_write(Output, Transformed, []).
/* Recall that an XML term is one of
<atom> Character Data
sdata(...) SDATA (SGML only)
ndata(...) NDATA
pi(...) Processing instruction
element(Name, [Att...], [Child...])
where Att is Attribute=Value and Child is an XML term.
We are only concerned with elements; all other XML terms are
left alone. I have given some thought to recognising
<?pwp ...Command...?>
processing instructions, executing the Command, and removing
the processing instructions, as a debugging tool. But this
is a proof-of-concept implementation; debugging features can
wait for The Real Thing.
*/
%% pwp_xml(:In:list(xml), -Out:list(xml), +Context)
%
% maps down a list of XML items, acting specially on elements and
% copying everything else unchanged, including white space.
% The Context is a list of 'VariableName'=CurrentValue bindings.
pwp_xml(M:In, Out, Context) :-
pwp_list(In, Out, M, Context).
pwp_list([], [], _, _).
pwp_list([element(Tag0,Atts0,Kids0)|Xs], Ys0, M, Context) :- !,
pwp_attributes(Atts0, Ask, Use, How, Att, Tag1, Atts1),
( nonvar(Tag1), Tag1 \== '' -> Tag2 = Tag1
; Tag2 = Tag0
),
( nonvar(Ask), Ask \== '', Ask \== 'true'
-> atom_to_term(Ask, Query, Bindings),
pwp_unite(Bindings, Context, Context1),
findall(Xml,
( M:Query,
pwp_element(Tag2, Atts1, Kids0, Use, How, Att,
M, Context1, Xml)),
NewContent)
; /* Ask is missing, empty, or true */
pwp_element(Tag2, Atts1, Kids0, Use, How, Att,
M, Context, NewContent)
),
pwp_attach(NewContent, Ys0, Ys1),
pwp_list(Xs, Ys1, M, Context).
pwp_list([X|Xs], [X|Ys], M, Context) :-
pwp_list(Xs, Ys, M, Context).
%% pwp_attributes(+Atts0:list(=(atom,atom)),
%% -Ask:atom, -Use:atom, -How:atom, -Att:atom,
%% -Tag:atom, -Atts1:list(=(atom,atom)))
%
% Walks down a list of AttributeName=ItsValue pairs, stripping out
% those whose AttributeName begins with the 'pwp:' prefix, and
% copying the rest to Atts1. Along the way, Ask/Use/How/Att/Tag
% are bound to the values of the
% pwp:ask/pwp:use/pwp:how/pwp:att/pwp:tag attributes, if any. At
% the end, any of these variables that are still unbound REMAIN
% unbound; they are not bound to default values.
pwp_attributes([], _, _, _, _, _, []).
pwp_attributes([AV|AVs], Ask, Use, How, Att, Tag, New_Atts1) :-
AV = (Name=Value),
( pwp_attr(Name, PWPName)
-> ( pwp_attr(PWPName, Value, Ask, Use, How, Att, Tag)
-> New_Atts1 = New_Atts2
; New_Atts1 = New_Atts2
)
; New_Atts1 = [AV|New_Atts2]
),
pwp_attributes(AVs, Ask, Use, How, Att, Tag, New_Atts2).
pwp_attr(ask, Value, Value, _Use, _How, _Att, _Tag).
pwp_attr(use, Value, _Ask, Value, _How, _Att, _Tag).
pwp_attr(how, Value, _Ask, _Use, Value, _Att, _Tag).
pwp_attr(att, Value, _Ask, _Use, _How, Value, _Tag).
pwp_attr(tag, Value, _Ask, _Use, _How, _Att, Value).
%% pwp_attr(+XMLAttr, -PWPLocal) is semidet.
%
% True if PWPLocal is the local name of a pwp:Local expression in
% XML. This predicate deals with the three different XML
% representations: the form is returned of XML namespace
% processing is not enabled. The second if it is enabled and the
% namespace is properly defined and the last if the namespace is
% not defined.
pwp_attr(Atom, PWP) :-
atom(Atom),
atom_concat('pwp:', PWP, Atom), !.
pwp_attr('http://www.cs.otago.ac.nz/staffpriv/ok/pwp.pl':PWP, PWP) :- !.
pwp_attr('pwp':PWP, PWP) :- !.
pwp_attr('xmlns:pwp', -).
%% pwp_unite(+Bindings, +Context0, -Context:list(=(atom,any)))
%
% merges the new Bindings with the bindings in the outer Context0,
% constructing a new list of VariableName=CurrentValue bindings in
% Context1. This is only used when the CurrentValue parts of the
% new Bindings are known to be distinct new variables, so the
% Bindings cannot possibly conflict with any existing binding in
% Context0. This is O(|Bindings|.|Context0|), which is not that
% efficient, but since we do not expect there to be very many
% variables it doesn't matter much.
pwp_unite(Bindings, Context0, Context) :-
pwp_unite(Bindings, Context0, Context0, Context).
pwp_unite([], _, Context, Context).
pwp_unite([Binding|Bindings], Context0, Context1, Context) :-
memberchk(Binding, Context0), !,
pwp_unite(Bindings, Context0, Context1, Context).
pwp_unite(['CONTEXT'=Context0|Bindings], Context0, Context1, Context) :- !,
pwp_unite(Bindings, Context0, Context1, Context).
pwp_unite([Binding|Bindings], Context0, Context1, Context) :-
pwp_unite(Bindings, Context0, [Binding|Context1], Context).
%% pwp_unite(+Bindings, +Context0: list(=(atom,any)))
%
% looks up the bindings in Bindings in the outer Context0.
% This is only used for 'pwp:use' terms (and the related terms
% in $(...)$ attribute value substitutions), so that we have
% no interest in forming a new context. (If we did, we'd use
% pwp_unite/3 instead.) This is only used when the CurrentValue
% parts of the new Bindings are known to be distinct new variables,
% so the Bindings cannot possibly conflict with any existing
% binding in Context0. However, there _could_ be new variables
% in Bindings, and that would cause problems. An XML term may
% not contain variables, and a term we want to convert to a list
% of character codes had better not contain variables either.
% One approach would be to just bind such variables to something,
% another is to throw some kind of exception. For the moment we
% call functor/3 so as to get an instantiation error.
pwp_unite([], _).
pwp_unite([Binding|Bindings], Context) :-
memberchk(Binding, Context), !,
pwp_unite(Bindings, Context).
pwp_unite([_=Value|_], _) :-
functor(Value, _, _).
%% pwp_attach(+Tree, ?Ys0: list(xml), ?Ys: list(xml))
%
% is a combination of "flatten" and "append".
% It unifies Ys0\Ys with the result of flattening Tree.
pwp_attach([], Ys, Ys) :- !.
pwp_attach([X|Xs], Ys0, Ys) :- !,
pwp_attach(X, Ys0, Ys1),
pwp_attach(Xs, Ys1, Ys).
pwp_attach(X, [X|Ys], Ys).
pwp_element('-', _, Kids, Use, How, _, M, Context, Xml) :- !,
pwp_use(Use, How, Kids, M, Context, Xml).
pwp_element(Tag, Atts, Kids, Use, How, Magic, M, Context,
element(Tag,Atts1,Kids1)) :-
( nonvar(Magic)
-> pwp_substitute(Atts, Magic, Context, Atts1)
; Atts1 = Atts
),
pwp_use(Use, How, Kids, M, Context, Kids1).
pwp_use('', _, Kids, M, Context, Kids1) :- !,
pwp_list(Kids, Kids1, M, Context).
pwp_use(Use, How, _, _, Context, Kids1) :-
atom_to_term(Use, Term, Bindings),
pwp_unite(Bindings, Context),
pwp_how(How, Term, Kids1).
pwp_how('text', Term, [CData]) :- !,
pwp_use_codes(Term, Codes, []),
atom_codes(CData, Codes).
pwp_how('xml', Term, Kids1) :-
( Term == [] -> Kids1 = Term
; Term = [_|_] -> Kids1 = Term
; Kids1 = [Term]
).
pwp_how('text-file', Term, [CData]) :-
pwp_use_codes(Term, Codes, []),
atom_codes(FileName, Codes),
read_file_to_codes(FileName, FileCodes, []),
atom_codes(CData, FileCodes).
pwp_how('xml-file', Term, Kids1) :-
pwp_use_codes(Term, Codes, []),
atom_codes(FileName, Codes),
load_xml_file(FileName, Kids1).
pwp_substitute([], _, _, []).
pwp_substitute([AV|AVs], Magic, Context, [AV1|Atts1]) :-
AV = (Name = Value),
( sub_atom(Value, _, _, _, Magic)
-> char_code(Magic, C),
atom_codes(Value, Codes),
pwp_split(Codes, C, B0, T0, A0), !,
pwp_substitute(B0, T0, A0, C, Context, V),
atom_codes(New_Value, V),
AV1 = (Name = New_Value),
pwp_substitute(AVs, Magic, Context, Atts1)
).
pwp_substitute([AV|AVs], Magic, Context, [AV|Atts1]) :-
pwp_substitute(AVs, Magic, Context, Atts1).
pwp_substitute(B0, T0, A0, C, Context, V0) :-
append(B0, V1, V0),
atom_codes(Atom, T0),
atom_to_term(Atom, Term, Bindings),
pwp_unite(Bindings, Context, _),
pwp_use_codes(Term, V1, V2),
( pwp_split(A0, C, B1, T1, A1)
-> pwp_substitute(B1, T1, A1, C, Context, V2)
; V2 = A0
).
pwp_split(Codes, C, Before, Text, After) :-
append(Before, [C,0'(|Rest], Codes),
append(Text, [0'),C|After], Rest), !.
pwp_use_codes(format(Format), S0, S) :- !,
pwp_format(Format, [], S0, S).
pwp_use_codes(format(Format,Args), S0, S) :- !,
pwp_format(Format, Args, S0, S).
pwp_use_codes(write_canonical(Datum), S0, S) :- !,
pwp_format('~k', [Datum], S0, S).
pwp_use_codes(print(Datum), S0, S) :- !,
pwp_format('~p', [Datum], S0, S).
pwp_use_codes(writeq(Datum), S0, S) :- !,
pwp_format('~q', [Datum], S0, S).
pwp_use_codes(write(Datum), S0, S) :- !,
pwp_format('~w', [Datum], S0, S).
pwp_use_codes(Atomic, S0, S) :-
atomic(Atomic), !,
( number(Atomic) -> number_codes(Atomic, Codes)
; atom(Atomic) -> atom_codes(Atomic, Codes)
; string(Atomic) -> string_to_list(Atomic, Codes)
; pwp_format('~w', [Atomic], S0, S)
),
append(Codes, S, S0).
pwp_use_codes([X|Xs], S0, S) :-
pwp_is_codes([X|Xs]), !,
append([X|Xs], S, S0).
pwp_use_codes([X|Xs], S0, S) :- !,
pwp_use_codes(Xs, X, S0, S).
pwp_use_codes(Compound, S0, S) :-
Compound =.. [_,X|Xs],
pwp_use_codes(Xs, X, S0, S).
pwp_use_codes([], X, S0, S) :- !,
pwp_use_codes(X, S0, S).
pwp_use_codes([Y|Ys], X, S0, S) :-
pwp_use_codes(X, S0, S1),
pwp_use_codes(Ys, Y, S1, S).
%% pwp_is_codes(+String: any)
%
% is true when String is a list of integers and each of those
% integers is a possible Unicode value (in the range U+0000..U+10FFFF).
% Back in the days of ISO Latin 1 we would have checked for 0..255,
% and way back in the days of ASCII for 0..127. Yes, there are more
% than a million possible characters in Unicode; currently about
% 100 000 of them are in use.
pwp_is_codes([]).
pwp_is_codes([C|Cs]) :-
integer(C), C >= 0, C =< 0x10FFFF,
pwp_is_codes(Cs).
pwp_format(Format, Arguments, S0, S) :-
format(codes(S0, S), Format, Arguments).

336
packages/sgml/xpath.pl Normal file
View File

@ -0,0 +1,336 @@
/* This file is part of ClioPatria.
Author: Jan Wielemaker <J.Wielemaker@cs.vu.nl>
HTTP: http://e-culture.multimedian.nl/
Copyright: 2007, E-Culture/MultimediaN
ClioPatria is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
ClioPatria is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with ClioPatria. If not, see <http://www.gnu.org/licenses/>.
*/
:- module(xpath,
[ xpath/3, % +DOM, +Spec, -Value
xpath_chk/3, % +DOM, +Spec, -Value
op(400, fx, //),
op(400, fx, /),
op(200, fy, @)
]).
:- use_module(library(record)).
:- use_module(library(lists)).
:- use_module(library(occurs)).
:- use_module(library(debug)).
/** <module> Select nodes in an XML DOM
The library xpath.pl provides predicates to select nodes from an XML DOM
tree as produced by library(sgml) based on descriptions inspired by the
XPATH language.
The predicate xpath/3 selects a sub-structure of the DOM
non-deterministically based on an xpath-like specification. Not all
selectors of XPATH are implemented, but the ability to mix xpath/3 calls
with arbitrary Prolog code provides a powerful tool for extracting
information from XML parse-trees.
@see http://www.w3.org/TR/xpath
*/
:- record
element(name, attributes, content).
%% xpath_chk(+DOM, +Spec, ?Content) is semidet.
%
% Semi-deterministic version of xpath/3.
xpath_chk(DOM, Spec, Content) :-
xpath(DOM, Spec, Content), !.
%% xpath(+DOM, +Spec, ?Content) is nondet.
%
% Match an element in a DOM structure. The syntax is inspired by
% XPath, using () rather than [] to select inside an element.
% First we can construct paths using / and //:
%
% $ =|//|=Term :
% Select any node in the DOM matching term.
% $ =|/|=Term :
% Match the root against Term.
% $ Term :
% Select the immediate children of the root matching Term.
%
% The Terms above are of type _callable_. The functor specifies
% the element name. The element name '*' refers to any element.
% The name =self= refers to the top-element itself and is often
% used for processing matches of an earlier xpath/3 query. A term
% NS:Term refers to an XML name in the namespace NS. Optional
% arguments specify additional constraints and functions. The
% arguments are processed from left to right. Defined conditional
% argument values are:
%
% $ Integer :
% The N-th element with the given name
% $ =last= :
% The last element with the given name.
% $ =last= - IntExpr :
% The IntExpr-th element counting from the last (0-based)
%
% Defined function argument values are:
%
% $ =self= :
% Evaluate to the entire element
% $ =text= :
% Evaluates to all text from the sub-tree as an atom
% $ =normalize_space= :
% As =text=, but uses normalize_space/2 to normalise
% white-space in the output
% $ =number= :
% Extract an integer or float from the value. Ignores
% leading and trailing white-space
% $ =|@|=Attribute :
% Evaluates to the value of the given attribute
%
% In addition, the argument-list can be _conditions_:
%
% $ Left = Right :
% Succeeds if the left-hand unifies with the right-hand.
% E.g. normalize_space = 'euro'
% $ contains(Haystack, Needle) :
% Succeeds if Needle is a sub-string of Haystack.
%
% Examples:
%
% Match each table-row in DOM:
%
% ==
% xpath(DOM, //tr, TR)
% ==
%
% Match the last cell of each tablerow in DOM. This example
% illustrates that a result can be the input of subsequent xpath/3
% queries. Using multiple queries on the intermediate TR term
% guarantee that all results come from the same table-row:
%
% ==
% xpath(DOM, //tr, TR),
% xpath(TR, /td(last), TD)
% ==
%
% Match each =href= attribute in an <a> element
%
% ==
% xpath(DOM, //a(@href), HREF)
% ==
%
% Suppose we have a table containing rows where each first column
% is the name of a product with a link to details and the second
% is the price (a number). The following predicate matches the
% name, URL and price:
%
% ==
% product(DOM, Name, URL, Price) :-
% xpath(DOM, //tr, TR),
% xpath(TR, td(1), C1),
% xpath(C1, /self(normalize_space), Name),
% xpath(C1, a(@href), URL),
% xpath(TR, td(2, number), Price).
% ==
xpath(DOM, Spec, Content) :-
in_dom(Spec, DOM, Content).
in_dom(//Spec, DOM, Value) :- !,
element_spec(Spec, Name, Modifiers),
sub_dom(I, Len, Name, E, DOM),
modifiers(Modifiers, I, Len, E, Value).
in_dom(/Spec, E, Value) :- !,
element_spec(Spec, Name, Modifiers),
( Name == self
-> true
; element_name(E, Name)
),
modifiers(Modifiers, 1, 1, E, Value).
in_dom(A/B, DOM, Value) :- !,
in_dom(A, DOM, Value0),
in_dom(B, Value0, Value).
in_dom(A//B, DOM, Value) :- !,
in_dom(A, DOM, Value0),
in_dom(//B, Value0, Value).
in_dom(Spec, element(_, _, Content), Value) :-
element_spec(Spec, Name, Modifiers),
count_named_elements(Content, Name, CLen),
CLen > 0,
nth_element(N, Name, E, Content),
modifiers(Modifiers, N, CLen, E, Value).
element_spec(Var, _, _) :-
var(Var), !,
instantiation_error(Var).
element_spec(NS:Term, NS:Name, Modifiers) :- !,
Term =.. [Name0|Modifiers],
star(Name0, Name).
element_spec(Term, Name, Modifiers) :- !,
Term =.. [Name0|Modifiers],
star(Name0, Name).
star(*, _) :- !.
star(Name, Name).
%% sub_dom(-Index, -Count, +Name, -Sub, +DOM) is nondet.
%
% Sub is a node in DOM with Name.
%
% @param Count is the total number of nodes in the content
% list Sub appears that have the same name.
% @param Index is the 1-based index of Sub of nodes with
% Name.
sub_dom(1, 1, Name, DOM, DOM) :-
element_name(DOM, Name).
sub_dom(N, Len, Name, E, element(_,_,Content)) :- !,
sub_dom_2(N, Len, Name, E, Content).
sub_dom(N, Len, Name, E, Content) :-
is_list(Content),
sub_dom_2(N, Len, Name, E, Content).
sub_dom_2(N, Len, Name, Element, Content) :-
( count_named_elements(Content, Name, Len),
nth_element(N, Name, Element, Content)
; member(element(_,_,C2), Content),
sub_dom_2(N, Len, Name, Element, C2)
).
%% count_named_elements(+Content, +Name, -Count) is det.
%
% Count is the number of nodes with Name in Content.
count_named_elements(Content, Name, Count) :-
count_named_elements(Content, Name, 0, Count).
count_named_elements([], _, Count, Count).
count_named_elements([element(Name,_,_)|T], Name, C0, C) :- !,
C1 is C0+1,
count_named_elements(T, Name, C1, C).
count_named_elements([_|T], Name, C0, C) :-
count_named_elements(T, Name, C0, C).
%% nth_element(?N, +Name, -Element, +Content:list) is nondet.
%
% True if Element is the N-th element with name in Content.
nth_element(N, Name, Element, Content) :-
nth_element_(1, N, Name, Element, Content).
nth_element_(I, N, Name, E, [H|T]) :-
element_name(H, Name), !,
( N = I,
E = H
; I2 is I + 1,
( nonvar(N), I2 > N
-> !, fail
; true
),
nth_element_(I2, N, Name, E, T)
).
nth_element_(I, N, Name, E, [_|T]) :-
nth_element_(I, N, Name, E, T).
%% modifiers(+Modifiers, +I, +Clen, +DOM, -Value)
%
%
modifiers([], _, _, Value, Value).
modifiers([H|T], I, L, Value0, Value) :-
modifier(H, I, L, Value0, Value1),
modifiers(T, I, L, Value1, Value).
modifier(N, I, _, Value, Value) :- % Integer
integer(N), !,
N =:= I.
modifier(last, I, L, Value, Value) :- !, % last
I =:= L.
modifier(last-Expr, I, L, Value, Value) :- !, % last-Expr
I =:= L-Expr.
modifier(Function, _, _, In, Out) :-
xpath_function(Function, In, Out).
xpath_function(self, DOM, Value) :- !, % self
Value = DOM.
xpath_function(text, DOM, Text) :- !, % text
text_of_dom(DOM, Text).
xpath_function(normalize_space, DOM, Text) :- !, % normalize_space
text_of_dom(DOM, Text0),
normalize_space(atom(Text), Text0).
xpath_function(number, DOM, Number) :- !, % number
text_of_dom(DOM, Text0),
normalize_space(string(Text), Text0),
catch(atom_number(Text, Number), _, fail).
xpath_function(@Name, element(_, Attrs, _), Value) :- !, % @Name
memberchk(Name=Value, Attrs).
xpath_function(Left = Right, Value, Value) :- !, % =
var_or_function(Left, Value, LeftValue),
var_or_function(Right, Value, RightValue),
LeftValue = RightValue.
xpath_function(contains(Haystack, Needle), Value, Value) :- !, % contains(Haystack, Needle)
val_or_function(Haystack, Value, HaystackValue),
val_or_function(Needle, Value, NeedleValue),
atom(HaystackValue), atom(NeedleValue),
( sub_atom(HaystackValue, _, _, _, NeedleValue)
-> true
).
var_or_function(Arg, _, Arg) :-
var(Arg), !.
var_or_function(Func, Value0, Value) :-
xpath_function(Func, Value0, Value).
val_or_function(Arg, _, Arg) :-
var(Arg), !,
instantiation_error(Arg).
val_or_function(Func, Value0, Value) :- % TBD
xpath_function(Func, Value0, Value).
%% text_of_dom(+DOM, -Text:atom) is det.
%
% Text is the joined textual content of DOM.
text_of_dom(DOM, Text) :-
phrase(text_of(DOM), Tokens),
concat_atom(Tokens, Text).
text_of(element(_,_,Content)) -->
text_of_list(Content).
text_of([]) -->
[].
text_of([H|T]) -->
text_of(H),
text_of(T).
text_of_list([]) -->
[].
text_of_list([H|T]) -->
text_of_1(H),
text_of_list(T).
text_of_1(element(_,_,Content)) --> !,
text_of_list(Content).
text_of_1(Data) -->
{ assertion(atom(Data)) },
[Data].