337 lines
		
	
	
		
			9.6 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
		
		
			
		
	
	
			337 lines
		
	
	
		
			9.6 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
|   | /*  This file is part of ClioPatria. | ||
|  | 
 | ||
|  |     Author:	Jan Wielemaker <J.Wielemaker@cs.vu.nl> | ||
|  |     HTTP:	http://e-culture.multimedian.nl/ | ||
|  |     Copyright:  2007, E-Culture/MultimediaN | ||
|  | 
 | ||
|  |     ClioPatria is free software: you can redistribute it and/or modify | ||
|  |     it under the terms of the GNU General Public License as published by | ||
|  |     the Free Software Foundation, either version 2 of the License, or | ||
|  |     (at your option) any later version. | ||
|  | 
 | ||
|  |     ClioPatria is distributed in the hope that it will be useful, | ||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||
|  |     GNU General Public License for more details. | ||
|  | 
 | ||
|  |     You should have received a copy of the GNU General Public License | ||
|  |     along with ClioPatria.  If not, see <http://www.gnu.org/licenses/>. | ||
|  | */ | ||
|  | 
 | ||
|  | :- module(xpath, | ||
|  | 	  [ xpath/3,			% +DOM, +Spec, -Value | ||
|  | 	    xpath_chk/3,		% +DOM, +Spec, -Value | ||
|  | 
 | ||
|  | 	    op(400, fx, //), | ||
|  | 	    op(400, fx, /), | ||
|  | 	    op(200, fy, @) | ||
|  | 	  ]). | ||
|  | :- use_module(library(record)). | ||
|  | :- use_module(library(lists)). | ||
|  | :- use_module(library(occurs)). | ||
|  | :- use_module(library(debug)). | ||
|  | 
 | ||
|  | /** <module> Select nodes in an XML DOM | ||
|  | 
 | ||
|  | The library xpath.pl provides predicates to select nodes from an XML DOM | ||
|  | tree as produced by library(sgml) based  on descriptions inspired by the | ||
|  | XPATH language. | ||
|  | 
 | ||
|  | The   predicate   xpath/3   selects   a   sub-structure   of   the   DOM | ||
|  | non-deterministically based on an  xpath-like   specification.  Not  all | ||
|  | selectors of XPATH are implemented, but the ability to mix xpath/3 calls | ||
|  | with arbitrary Prolog code  provides  a   powerful  tool  for extracting | ||
|  | information from XML parse-trees. | ||
|  | 
 | ||
|  | @see http://www.w3.org/TR/xpath | ||
|  | */ | ||
|  | 
 | ||
% DOM nodes are element(Name, Attributes, Content) terms.  Declaring
% them with library(record) provides the accessor element_name/2 that
% the predicates below use to read the name of a node.
:- record element(name, attributes, content).
|  | 
 | ||
%%	xpath_chk(+DOM, +Spec, ?Content) is semidet.
%
%	As xpath/3, but commits to the first solution and leaves no
%	choice point behind.

xpath_chk(DOM, Spec, Content) :-
	once(xpath(DOM, Spec, Content)).
|  | 
 | ||
|  | %%	xpath(+DOM, +Spec, ?Content) is nondet. | ||
|  | % | ||
|  | %	Match an element in a DOM structure.   The syntax is inspired by | ||
|  | %	XPath, using () rather than  []   to  select  inside an element. | ||
|  | %	First we can construct paths using / and //: | ||
|  | % | ||
|  | %	    $ =|//|=Term : | ||
|  | %	    Select any node in the DOM matching term. | ||
|  | %	    $ =|/|=Term : | ||
|  | %	    Match the root against Term. | ||
|  | %	    $ Term : | ||
|  | %	    Select the immediate children of the root matching Term. | ||
|  | % | ||
|  | %	The Terms above are of type   _callable_.  The functor specifies | ||
|  | %	the element name. The element name   '*'  refers to any element. | ||
|  | %	The name =self= refers to the   top-element  itself and is often | ||
|  | %	used for processing matches of an  earlier xpath/3 query. A term | ||
|  | %	NS:Term refers to an XML  name   in  the  namespace NS. Optional | ||
|  | %	arguments specify additional  constraints   and  functions.  The | ||
|  | %	arguments are processed from left  to right. Defined conditional | ||
|  | %	argument values are: | ||
|  | % | ||
|  | %	    $ Integer : | ||
|  | %	    The N-th element with the given name | ||
|  | %	    $ =last= : | ||
|  | %	    The last element with the given name. | ||
|  | %	    $ =last= - IntExpr : | ||
|  | %	    The IntExpr-th element counting from the last (0-based) | ||
|  | % | ||
|  | %	Defined function argument values are: | ||
|  | % | ||
|  | %	    $ =self= : | ||
|  | %	    Evaluate to the entire element | ||
|  | %	    $ =text= : | ||
|  | %	    Evaluates to all text from the sub-tree as an atom | ||
|  | %	    $ =normalize_space= : | ||
|  | %	    As =text=, but uses normalize_space/2 to normalise | ||
|  | %	    white-space in the output | ||
|  | %	    $ =number= : | ||
|  | %	    Extract an integer or float from the value.  Ignores | ||
|  | %	    leading and trailing white-space | ||
|  | %	    $ =|@|=Attribute : | ||
|  | %	    Evaluates to the value of the given attribute | ||
|  | % | ||
|  | %	In addition, the argument-list can be _conditions_: | ||
|  | % | ||
|  | %	    $ Left = Right : | ||
|  | %	    Succeeds if the left-hand unifies with the right-hand. | ||
|  | %	    E.g. normalize_space = 'euro' | ||
|  | %	    $ contains(Haystack, Needle) : | ||
|  | %	    Succeeds if Needle is a sub-string of Haystack. | ||
|  | % | ||
|  | %	Examples: | ||
|  | % | ||
|  | %	Match each table-row in DOM: | ||
|  | % | ||
|  | %	    == | ||
|  | %	    xpath(DOM, //tr, TR) | ||
|  | %	    == | ||
|  | % | ||
|  | %	Match the last cell  of  each   tablerow  in  DOM.  This example | ||
|  | %	illustrates that a result can be the input of subsequent xpath/3 | ||
|  | %	queries. Using multiple queries  on   the  intermediate  TR term | ||
|  | %	guarantees that all results come from the same table-row: | ||
|  | % | ||
|  | %	    == | ||
|  | %	    xpath(DOM, //tr, TR), | ||
|  | %	    xpath(TR,  /td(last), TD) | ||
|  | %	    == | ||
|  | % | ||
|  | %	Match each =href= attribute in an <a> element | ||
|  | % | ||
|  | %	    == | ||
|  | %	    xpath(DOM, //a(@href), HREF) | ||
|  | %	    == | ||
|  | % | ||
|  | %	Suppose we have a table containing  rows where each first column | ||
|  | %	is the name of a product with a   link to details and the second | ||
|  | %	is the price (a number).  The   following  predicate matches the | ||
|  | %	name, URL and price: | ||
|  | % | ||
|  | %	    == | ||
|  | %	    product(DOM, Name, URL, Price) :- | ||
|  | %	    	xpath(DOM, //tr, TR), | ||
|  | %	    	xpath(TR, td(1), C1), | ||
|  | %	    	xpath(C1, /self(normalize_space), Name), | ||
|  | %	    	xpath(C1, a(@href), URL), | ||
|  | %	    	xpath(TR, td(2, number), Price). | ||
|  | %	    == | ||
|  | 
 | ||
% Public entry point.  The work is done by in_dom/3, which takes the
% specification as its first argument.
xpath(Dom, PathSpec, Result) :-
	in_dom(PathSpec, Dom, Result).
|  | 
 | ||
%%	in_dom(+Spec, +DOM, -Value) is nondet.
%
%	Worker for xpath/3.  The clause is selected by the shape of Spec:
%
%	    * //Step
%	    Candidate nodes are enumerated by sub_dom/5.
%	    * /Step
%	    DOM itself is the only candidate.  Its name must match the
%	    name of Step, unless that name is =self=.
%	    * A/B and A//B
%	    Evaluate A on DOM, then B (resp. //B) on each result.
%	    * Step
%	    DOM must be an element/3 term; candidates are the members
%	    of its content list that match the name of Step.
%
%	In every case the modifiers of the step are applied to the
%	candidate by modifiers/5 to produce Value.

in_dom(//Step, Dom, Result) :- !,
	element_spec(Step, Tag, Mods),
	sub_dom(Index, Count, Tag, Node, Dom),
	modifiers(Mods, Index, Count, Node, Result).
in_dom(/Step, Node, Result) :- !,
	element_spec(Step, Tag, Mods),
	(   Tag \== self
	->  element_name(Node, Tag)
	;   true
	),
	modifiers(Mods, 1, 1, Node, Result).
in_dom(Head/Tail, Dom, Result) :- !,
	in_dom(Head, Dom, Mid),
	in_dom(Tail, Mid, Result).
in_dom(Head//Tail, Dom, Result) :- !,
	in_dom(Head, Dom, Mid),
	in_dom(//Tail, Mid, Result).
in_dom(Step, element(_, _, Children), Result) :-
	element_spec(Step, Tag, Mods),
	count_named_elements(Children, Tag, Count),
	Count > 0,
	nth_element(Index, Tag, Node, Children),
	modifiers(Mods, Index, Count, Node, Result).
|  | 
 | ||
%%	element_spec(+Spec, -Name, -Modifiers) is det.
%
%	Split a path step into the element name (the functor of Spec)
%	and the list of modifiers (its arguments).  A step of the form
%	NS:Term yields a name of the form NS:Name.  The functor is
%	passed through star/2, which handles the wildcard =|*|=.
%
%	@error instantiation_error if Spec is unbound.

element_spec(Spec, _, _) :-
	var(Spec), !,
	instantiation_error(Spec).
element_spec(NS:Step, NS:Tag, Mods) :- !,
	Step =.. [Functor|Mods],
	star(Functor, Tag).
element_spec(Step, Tag, Mods) :-
	Step =.. [Functor|Mods],
	star(Functor, Tag).
|  | 
 | ||
%%	star(+Name0, -Name) is det.
%
%	Translate the wildcard: if Name0 is the atom =|*|=, Name is left
%	unbound; otherwise Name is Name0.

star(Name0, Name) :-
	(   Name0 == (*)
	->  true
	;   Name = Name0
	).
|  | 
 | ||
|  | 
 | ||
%%	sub_dom(-Index, -Count, +Name, -Sub, +DOM) is nondet.
%
%	Enumerate the nodes named Name in DOM.  If the name of DOM
%	itself matches, DOM is reported first, as number 1 of 1.  After
%	that the content of DOM is searched using sub_dom_2/5.  DOM is
%	either an element/3 term or a list of nodes.
%
%	@param Index	is the 1-based position of Sub among the nodes
%			named Name in the content list that holds Sub.
%	@param Count	is the number of nodes named Name in that
%			content list.

sub_dom(1, 1, Tag, Node, Node) :-
	element_name(Node, Tag).
sub_dom(Index, Count, Tag, Sub, element(_, _, Children)) :- !,
	sub_dom_2(Index, Count, Tag, Sub, Children).
sub_dom(Index, Count, Tag, Sub, Nodes) :-
	is_list(Nodes),
	sub_dom_2(Index, Count, Tag, Sub, Nodes).
|  | 
 | ||
%%	sub_dom_2(-Index, -Count, +Name, -Sub, +Content:list) is nondet.
%
%	Sub is a node named Name that is a member of Content or of the
%	content list of an element nested below it.  The matches in
%	Content itself are produced before descending into its members.

sub_dom_2(Index, Count, Tag, Sub, Nodes) :-
	count_named_elements(Nodes, Tag, Count),
	nth_element(Index, Tag, Sub, Nodes).
sub_dom_2(Index, Count, Tag, Sub, Nodes) :-
	member(element(_, _, Children), Nodes),
	sub_dom_2(Index, Count, Tag, Sub, Children).
|  | 
 | ||
|  | 
 | ||
%%	count_named_elements(+Content, ?Name, -Count) is det.
%
%	Count is the number of element/3 nodes in Content whose name
%	unifies with Name.  Name is only tested, never bound.  An
%	unbound Name (the wildcard =|*|=) therefore counts every element
%	in Content, not only those that share the name of the first one.

count_named_elements(Content, Name, Count) :-
	count_named_elements(Content, Name, 0, Count).

%	count_named_elements(+Content, ?Name, +Count0, -Count)
%
%	Accumulator version of count_named_elements/3.

count_named_elements([], _, Count, Count).
count_named_elements([element(Name0,_,_)|T], Name, C0, C) :-
	\+ Name0 \= Name, !,		% unifiable; leaves Name untouched
	C1 is C0+1,
	count_named_elements(T, Name, C1, C).
count_named_elements([_|T], Name, C0, C) :-
	count_named_elements(T, Name, C0, C).
|  | 
 | ||
|  | 
 | ||
%%	nth_element(?N, ?Name, -Element, +Content:list) is nondet.
%
%	True if Element is the N-th node in Content, counting from 1,
%	among the nodes whose element name unifies with Name.  Name is
%	only tested, never bound.  With an unbound Name (the wildcard
%	=|*|=) all elements of Content are enumerated, not only those
%	that share the name of the first one.

nth_element(N, Name, Element, Content) :-
	nth_element_(1, N, Name, Element, Content).

%	nth_element_(+I, ?N, ?Name, -Element, +Content)
%
%	I is the index the next matching node in Content will get.

nth_element_(I, N, Name, E, [H|T]) :-
	element_name(H, Name0),
	\+ Name0 \= Name, !,		% names unify; leaves Name untouched
	(   N = I,
	    E = H
	;   I2 is I + 1,
	    (	nonvar(N), I2 > N	% a fixed N was asked for and passed
	    ->	!, fail
	    ;	true
	    ),
	    nth_element_(I2, N, Name, E, T)
	).
nth_element_(I, N, Name, E, [_|T]) :-	% H is no matching element: skip
	nth_element_(I, N, Name, E, T).
|  | 
 | ||
|  | 
 | ||
%%	modifiers(+Modifiers, +I, +CLen, +DOM, -Value)
%
%	Apply the list Modifiers from left to right using modifier/5.
%	The output of each modifier is the input of the next.  Value is
%	the output of the last one, or DOM itself if Modifiers is empty.
%	I and CLen are passed unchanged to every modifier.

modifiers([], _, _, Value, Value).
modifiers([Mod|Mods], Index, Count, In, Out) :-
	modifier(Mod, Index, Count, In, Mid),
	modifiers(Mods, Index, Count, Mid, Out).
|  | 
 | ||
%%	modifier(+Modifier, +I, +CLen, +Value0, -Value)
%
%	Apply a single modifier.  The positional modifiers compare the
%	index I of the node and pass Value0 on unchanged:
%
%	    * Integer
%	    True if I equals Integer.
%	    * last
%	    True if I equals CLen.
%	    * last-Expr
%	    True if I equals CLen-Expr.
%
%	Any other modifier is handed to xpath_function/3.

modifier(Nth, Index, _, Value, Value) :-
	integer(Nth), !,
	Index =:= Nth.
modifier(last, Index, Count, Value, Value) :- !,
	Index =:= Count.
modifier(last-Offset, Index, Count, Value, Value) :- !,
	Index =:= Count-Offset.
modifier(Function, _, _, In, Out) :-
	xpath_function(Function, In, Out).
|  | 
 | ||
%%	xpath_function(+Function, +DOM, -Value)
%
%	Evaluate a function or a condition on DOM.  Functions compute
%	a new Value:
%
%	    * self
%	    Value is DOM.
%	    * text
%	    Value is the text of DOM as computed by text_of_dom/2.
%	    * normalize_space
%	    As =text=, passed through normalize_space/2 to an atom.
%	    * number
%	    As =text=, passed through normalize_space/2 and converted
%	    by atom_number/2.  Fails if the conversion fails or raises
%	    an exception.
%	    * @Name
%	    DOM must be an element/3 term; Value is the value of its
%	    attribute Name.  Fails if there is no such attribute.
%
%	Conditions succeed with Value equal to DOM if they hold:
%
%	    * Left = Right
%	    Both sides are evaluated by var_or_function/3 and unified.
%	    * contains(Haystack, Needle)
%	    Both arguments are evaluated by val_or_function/3 and must
%	    yield atoms; holds if Needle is a sub-atom of Haystack.

xpath_function(self, DOM, Value) :- !,
	Value = DOM.
xpath_function(text, DOM, Text) :- !,
	text_of_dom(DOM, Text).
xpath_function(normalize_space, DOM, Normalized) :- !,
	text_of_dom(DOM, Raw),
	normalize_space(atom(Normalized), Raw).
xpath_function(number, DOM, Number) :- !,
	text_of_dom(DOM, Raw),
	normalize_space(string(Trimmed), Raw),
	catch(atom_number(Trimmed, Number), _, fail).
xpath_function(@Attribute, element(_, Attributes, _), Value) :- !,
	memberchk(Attribute=Value, Attributes).
xpath_function(Left = Right, DOM, DOM) :- !,
	var_or_function(Left, DOM, LeftValue),
	var_or_function(Right, DOM, RightValue),
	LeftValue = RightValue.
xpath_function(contains(Haystack, Needle), DOM, DOM) :- !,
	val_or_function(Haystack, DOM, HaystackAtom),
	val_or_function(Needle, DOM, NeedleAtom),
	atom(HaystackAtom),
	atom(NeedleAtom),
	once(sub_atom(HaystackAtom, _, _, _, NeedleAtom)).
|  | 
 | ||
%%	var_or_function(+Arg, +DOM, -Value) is semidet.
%
%	Evaluate an operand of the =|Left = Right|= condition:
%
%	    * If Arg is unbound, Value is Arg itself, so that the
%	    condition can bind it to the value of the other side.
%	    * If Arg names an xpath function or condition, Value is
%	    the result of xpath_function/3 on DOM.
%	    * Any other term is a literal: Value is Arg.  Without this
%	    case a condition such as =|normalize_space = 'euro'|= could
%	    never hold, because xpath_function/3 has no clause for
%	    =euro=.

var_or_function(Arg, _, Arg) :-
	var(Arg), !.
var_or_function(Func, Value0, Value) :-
	is_xpath_function(Func), !,
	xpath_function(Func, Value0, Value).
var_or_function(Literal, _, Literal).

%%	val_or_function(+Arg, +DOM, -Value) is semidet.
%
%	As var_or_function/3, but Arg must be instantiated.  Used for
%	operands that need a value, such as the arguments of
%	contains/2.
%
%	@error instantiation_error if Arg is unbound.

val_or_function(Arg, _, Arg) :-
	var(Arg), !,
	instantiation_error(Arg).
val_or_function(Func, Value0, Value) :-
	is_xpath_function(Func), !,
	xpath_function(Func, Value0, Value).
val_or_function(Literal, _, Literal).

%	is_xpath_function(+Term) is semidet.
%
%	True if Term has the shape of one of the functions or
%	conditions handled by xpath_function/3.  Must be kept in sync
%	with the clauses of that predicate.

is_xpath_function(self).
is_xpath_function(text).
is_xpath_function(normalize_space).
is_xpath_function(number).
is_xpath_function(@_).
is_xpath_function(_ = _).
is_xpath_function(contains(_, _)).
|  | 
 | ||
|  | 
 | ||
%%	text_of_dom(+DOM, -Text:atom) is det.
%
%	Collect the text fragments of DOM using the grammar text_of//1
%	and concatenate them into the single atom Text.

text_of_dom(DOM, Text) :-
	phrase(text_of(DOM), Fragments),
	concat_atom(Fragments, Text).
|  | 
 | ||
%%	text_of(+DOM)// is semidet.
%
%	Emit the text fragments found in DOM, from left to right.  DOM
%	is an element/3 term, a list of nodes or a single text atom.
%	The atom clause lets a list DOM carry text nodes next to its
%	elements; without it such a list made text_of_dom/2 fail.

text_of(element(_,_,Content)) --> !,
	text_of_list(Content).
text_of([]) --> !,		% commit: [] may itself be an atom
	[].
text_of([H|T]) --> !,
	text_of(H),
	text_of(T).
text_of(Data) -->
	{ atom(Data) },
	[Data].
|  | 
 | ||
|  | 
 | ||
%%	text_of_list(+Content:list)//
%
%	Emit the text fragments of every node in a content list, from
%	left to right, using text_of_1//1 for each node.

text_of_list([]) -->
	[].
text_of_list([Node|Nodes]) -->
	text_of_1(Node),
	text_of_list(Nodes).
|  | 
 | ||
|  | 
 | ||
%%	text_of_1(+Node)//
%
%	Emit the text of a single content node.  For an element/3 term
%	this is the text of its content list.  Any other node is
%	asserted to be an atom and emitted as is.

text_of_1(Node) -->
	(   { Node = element(_, _, Children) }
	->  text_of_list(Children)
	;   { assertion(atom(Node)) },
	    [Node]
	).