703 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
		
		
			
		
	
	
			703 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
|   | /*  $Id$ | ||
|  | 
 | ||
|  |     Part of SWI-Prolog | ||
|  | 
 | ||
|  |     Author:        Jan Wielemaker | ||
|  |     E-mail:        wielemak@science.uva.nl | ||
|  |     WWW:           http://www.swi-prolog.org | ||
|  |     Copyright (C): 2006, University of Amsterdam | ||
|  | 
 | ||
|  |     This program is free software; you can redistribute it and/or | ||
|  |     modify it under the terms of the GNU General Public License | ||
|  |     as published by the Free Software Foundation; either version 2 | ||
|  |     of the License, or (at your option) any later version. | ||
|  | 
 | ||
|  |     This program is distributed in the hope that it will be useful, | ||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||
|  |     GNU General Public License for more details. | ||
|  | 
 | ||
|  |     You should have received a copy of the GNU General Public | ||
|  |     License along with this library; if not, write to the Free Software | ||
|  |     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA | ||
|  | 
 | ||
|  |     As a special exception, if you link this library with other files, | ||
|  |     compiled with a Free Software compiler, to produce an executable, this | ||
|  |     library does not by itself cause the resulting executable to be covered | ||
|  |     by the GNU General Public License. This exception does not however | ||
|  |     invalidate any other reasons why the executable file might be covered by | ||
|  |     the GNU General Public License. | ||
|  | */ | ||
|  | 
 | ||
|  | 
 | ||
|  | :- module(rdf_litindex, | ||
|  | 	  [ rdf_set_literal_index_option/1,	% +Options | ||
|  | 	    rdf_tokenize_literal/2,		% +Literal, -Tokens | ||
|  | 	    rdf_find_literals/2,		% +Spec, -ListOfLiterals | ||
|  | 	    rdf_token_expansions/2		% +Spec, -Expansions | ||
|  | 	  ]). | ||
|  | :- use_module(rdf_db). | ||
|  | :- use_module(library(debug)). | ||
|  | :- use_module(library(lists)). | ||
|  | :- use_module(library(error)). | ||
|  | :- use_module(library(porter_stem)). | ||
|  | :- use_module(library(double_metaphone)). | ||
|  | 
 | ||
|  | /** <module> Search literals | ||
|  | This module finds literals of the RDF database based on stemming and | ||
|  | being flexible to ordering of tokens. | ||
|  | */ | ||
|  | 
 | ||
|  | %	Module state and hooks.  literal_map/2 records the literal map | ||
|  | %	created for each index (tokens, porter, metaphone).  new_token/1 | ||
|  | %	clauses are asserted by porter_index/1 and metaphone_index/1 and | ||
|  | %	called from add_tokens/3.  setting/1 holds the option values. | ||
|  | %	tokenization/2 and exclude_from_index/2 are user-definable hooks. | ||
|  | :- dynamic | ||
|  | 	literal_map/2,			% Type, -Map | ||
|  | 	new_token/1,			% Hook | ||
|  | 	setting/1. | ||
|  | :- volatile | ||
|  | 	literal_map/2. | ||
|  | :- multifile | ||
|  | 	tokenization/2,			% +Literal, -Tokens | ||
|  | 	exclude_from_index/2.		% +Which, +Token | ||
|  | 
 | ||
|  | 
 | ||
|  | %	setting(?Option) | ||
|  | % | ||
|  | %	Default option values.  Clauses are replaced one at a time by | ||
|  | %	rdf_set_literal_index_option/1. | ||
|  | setting(verbose(true)).			% print progress messages | ||
|  | setting(index_threads(1)).		% # threads for creating the index | ||
|  | setting(index(default)).		% update thread if cpu_count > 1, else same thread | ||
|  | 
 | ||
|  | %%	rdf_set_literal_index_option(+Options:list) | ||
|  | % | ||
|  | %	Change settings of the literal index package.  Options is either | ||
|  | %	a list of option terms or a single option term.  Known options: | ||
|  | % | ||
|  | %		* verbose(Bool) | ||
|  | %		When =true=, progress messages are printed while the | ||
|  | %		index tables are built. | ||
|  | % | ||
|  | %		* index_threads(+Count) | ||
|  | %		Number of threads used to build the initial index. | ||
|  | % | ||
|  | %		* index(+How) | ||
|  | %		Strategy for indexing literals added later.  How is | ||
|  | %		=self= (same thread), thread(N) (N concurrent threads) | ||
|  | %		or =default= (decided from the number of cores). | ||
|  | 
 | ||
|  | rdf_set_literal_index_option(Spec) :- | ||
|  | 	(   Spec = []			% empty list (or unbound): nothing to do | ||
|  | 	->  true | ||
|  | 	;   Spec = [First|Rest] | ||
|  | 	->  set_option(First), | ||
|  | 	    rdf_set_literal_index_option(Rest) | ||
|  | 	;   set_option(Spec)		% a single, non-list option | ||
|  | 	). | ||
|  | 
 | ||
|  | %	set_option(+Option) | ||
|  | % | ||
|  | %	Validate Option and store it as the only setting/1 clause with | ||
|  | %	the same name and arity. | ||
|  | set_option(Option) :- | ||
|  | 	check_option(Option), | ||
|  | 	functor(Option, Key, ArgCount), | ||
|  | 	functor(Template, Key, ArgCount),	% most general term of this option | ||
|  | 	retractall(setting(Template)), | ||
|  | 	assertz(setting(Option)). | ||
|  | 
 | ||
|  | %	check_option(+Option) | ||
|  | % | ||
|  | %	Validate a single option term before it is stored. | ||
|  | % | ||
|  | %	@error instantiation_error if Option is unbound | ||
|  | %	@error domain_error(literal_option, Option) for an unknown option | ||
|  | %	@error as raised by must_be/2 for an invalid option value | ||
|  | check_option(X) :- | ||
|  | 	var(X), !, | ||
|  | 	instantiation_error(X). | ||
|  | check_option(verbose(X)) :- !, | ||
|  | 	must_be(boolean, X). | ||
|  | check_option(index_threads(Count)) :- !, | ||
|  | 	must_be(nonneg, Count). | ||
|  | check_option(index(How)) :- !, | ||
|  | 	must_be(oneof([default,thread(_),self]), How). | ||
|  | check_option(Option) :- | ||
|  | 	domain_error(literal_option, Option). | ||
|  | 
 | ||
|  | 
 | ||
|  | 		 /******************************* | ||
|  | 		 *	      QUERY		* | ||
|  | 		 *******************************/ | ||
|  | 
 | ||
|  | %%	rdf_find_literals(+Spec, -Literals) | ||
|  | % | ||
|  | %	Find literals in the RDF database matching Spec.  Spec is defined | ||
|  | %	as: | ||
|  | % | ||
|  | %	== | ||
|  | %	Spec ::= and(Spec,Spec) | ||
|  | %	Spec ::= or(Spec,Spec) | ||
|  | %	Spec ::= not(Spec) | ||
|  | %	Spec ::= sounds(Like) | ||
|  | %	Spec ::= stem(Like) | ||
|  | %	Spec ::= prefix(Prefix) | ||
|  | %	Spec ::= case(String)		% presumably case-insensitive match; confirm in rdf_db | ||
|  | %	Spec ::= between(Low, High)	% Numerical between | ||
|  | %	Spec ::= ge(Low)		% Numerical greater-equal | ||
|  | %	Spec ::= le(High)		% Numerical less-equal | ||
|  | %	Spec ::= Token | ||
|  | %	== | ||
|  | % | ||
|  | %	All non-Token leaves map to a disjunction of the tokens found in | ||
|  | %	the indexes.  First we compile the spec to normal form: a | ||
|  | %	disjunction of conjunctions on elementary tokens.  Then we execute | ||
|  | %	all the conjunctions and generate the union by flattening and | ||
|  | %	sorting the per-conjunction results. | ||
|  | % | ||
|  | %	@tbd Exploit ordering of numbers and allow for > N, < N, etc. | ||
|  | 
 | ||
|  | rdf_find_literals(Spec, Literals) :- | ||
|  | 	compile_spec(Spec, DNF), | ||
|  | 	token_index(Map), | ||
|  | 	lookup(DNF, Map, _, SuperSet), | ||
|  | 	flatten(SuperSet, Set0), | ||
|  | 	sort(Set0, Literals). | ||
|  | 
 | ||
|  | %%	rdf_token_expansions(+Spec, -Expansions) | ||
|  | % | ||
|  | %	Determine which expansions of a token contribute to finding | ||
|  | %	literals.  For sounds(Like) and stem(Like) the word Like is first | ||
|  | %	converted to its metaphone code or Porter stem, because those are | ||
|  | %	the keys of the respective indexes (see metaphone/2 and stem/2); | ||
|  | %	this mirrors expand_fuzzy/2.  The clauses are not mutually | ||
|  | %	exclusive: on backtracking the general clause also produces an | ||
|  | %	answer for these specs. | ||
|  | 
 | ||
|  | rdf_token_expansions(prefix(Prefix), [prefix(Prefix, Tokens)]) :- | ||
|  | 	token_index(Map), | ||
|  | 	rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens). | ||
|  | rdf_token_expansions(sounds(Like), [sounds(Like, Tokens)]) :- | ||
|  | 	metaphone_index(Map), | ||
|  | 	double_metaphone(Like, Key),		% index is keyed by sound code | ||
|  | 	rdf_find_literal_map(Map, [Key], Tokens). | ||
|  | rdf_token_expansions(stem(Like), [stem(Like, Tokens)]) :- | ||
|  | 	porter_index(Map), | ||
|  | 	porter_stem(Like, Key),			% index is keyed by stem | ||
|  | 	rdf_find_literal_map(Map, [Key], Tokens). | ||
|  | rdf_token_expansions(Spec, Expansions) :- | ||
|  | 	compile_spec(Spec, DNF), | ||
|  | 	token_index(Map), | ||
|  | 	lookup(DNF, Map, SCS, _), | ||
|  | 	flatten(SCS, CS), | ||
|  | 	sort(CS, Expansions0), | ||
|  | 	join_expansions(Expansions0, Expansions). | ||
|  | 
 | ||
|  | %	join_expansions(+SortedTagged, -Joined) | ||
|  | % | ||
|  | %	Group consecutive tagged tokens that share the same tag.  E.g. | ||
|  | %	[stem(s,a), stem(s,b)] becomes [stem(s,[a,b])].  The input is | ||
|  | %	expected to be sorted so equal tags are adjacent. | ||
|  | join_expansions([], []). | ||
|  | join_expansions([H0|T0], [H|T]) :- | ||
|  | 	untag(H0, Tag, V0), | ||
|  | 	Tag =.. L0, | ||
|  | 	append(L0, [[V0|Values]], L1),		% Values is filled in below | ||
|  | 	H =.. L1, | ||
|  | 	join_expansions_by_tag(T0, Tag, T1, Values), | ||
|  | 	join_expansions(T1, T). | ||
|  | 
 | ||
|  | %	join_expansions_by_tag(+List, +Tag, -Rest, -Values) | ||
|  | % | ||
|  | %	Values are the token values of the leading elements of List that | ||
|  | %	carry Tag; Rest is the remainder of List. | ||
|  | join_expansions_by_tag([H|T0], Tag, T, [V0|VT]) :- | ||
|  | 	untag(H, Tag, V0), !, | ||
|  | 	join_expansions_by_tag(T0, Tag, T, VT). | ||
|  | join_expansions_by_tag(L, _, L, []). | ||
|  | 
 | ||
|  | %	lookup(+DNF, +Map, -Conditions, -Literals) | ||
|  | % | ||
|  | %	Evaluate a formula in disjunctive normal form against Map.  Both | ||
|  | %	outputs are nested lists that follow the or/2 structure of DNF; | ||
|  | %	callers flatten them.  @(false) yields empty results. | ||
|  | lookup(@(false), _, [], []) :- !. | ||
|  | lookup(or(H0,T0), Map, [CH|CT], [H|T]) :- !, | ||
|  | 	lookup(H0, Map, CH, H), | ||
|  | 	lookup(T0, Map, CT, T). | ||
|  | lookup(H0, Map, [C], [H]) :- | ||
|  | 	lookup1(H0, Map, C, H). | ||
|  | 
 | ||
|  | %	lookup1(+Conj, +Map, -Cond, -Literals) | ||
|  | % | ||
|  | %	Find the literals matching all tokens of the conjunction Conj. | ||
|  | %	Cond is the list of tagged (fuzzy) tokens of Conj if at least one | ||
|  | %	literal matches and [] otherwise.  If Conj contains @(false), | ||
|  | %	conj_to_list//1 fails and both outputs are [], so callers that | ||
|  | %	flatten the conditions never see an unbound variable. | ||
|  | lookup1(Conj, Map, Cond, Literals) :- | ||
|  | 	phrase(conj_to_list(Conj), List), !, | ||
|  | 	rdf_find_literal_map(Map, List, Literals), | ||
|  | 	(   Literals \== [] | ||
|  | 	->  phrase(conj_to_cond(Conj), Cond) | ||
|  | 	;   Cond = [] | ||
|  | 	). | ||
|  | lookup1(_, _, [], []). | ||
|  | 
 | ||
|  | %	conj_to_list(+Conj)// | ||
|  | % | ||
|  | %	Translate a conjunction into the list of plain tokens passed to | ||
|  | %	rdf_find_literal_map/3.  Tagged tokens are replaced by their | ||
|  | %	value; any other term is emitted as is.  Fails if the | ||
|  | %	conjunction contains @(false). | ||
|  | conj_to_list(and(A,B)) --> !, | ||
|  | 	conj_to_list(A), | ||
|  | 	conj_to_list(B). | ||
|  | conj_to_list(@(false)) --> !, | ||
|  | 	{fail}. | ||
|  | conj_to_list(Tagged) --> | ||
|  | 	{ untag(Tagged, L) }, !, | ||
|  | 	[L]. | ||
|  | conj_to_list(L) --> | ||
|  | 	[L]. | ||
|  | 
 | ||
|  | 
 | ||
|  | %	conj_to_cond(+Conj)// | ||
|  | % | ||
|  | %	Collect the tagged tokens of a conjunction, i.e. those that | ||
|  | %	resulted from expanding a fuzzy specification.  Untagged terms | ||
|  | %	contribute nothing. | ||
|  | conj_to_cond(and(A,B)) --> !, | ||
|  | 	conj_to_cond(A), | ||
|  | 	conj_to_cond(B). | ||
|  | conj_to_cond(Tagged) --> | ||
|  | 	{ untag(Tagged, _) }, !, | ||
|  | 	[ Tagged ]. | ||
|  | conj_to_cond(_) --> | ||
|  | 	[]. | ||
|  | 
 | ||
|  | 
 | ||
|  | %%	compile_spec(+Spec, -Compiled) | ||
|  | % | ||
|  | %	Translate a search specification into disjunctive normal form | ||
|  | %	in three passes: expand fuzzy leaves into concrete tokens, move | ||
|  | %	negations inwards and finally distribute and/2 over or/2. | ||
|  | 
 | ||
|  | compile_spec(Spec, DNF) :- | ||
|  | 	expand_fuzzy(Spec, Expanded), | ||
|  | 	nnf(Expanded, Normalised), | ||
|  | 	dnf(Normalised, DNF). | ||
|  | 
 | ||
|  | 
 | ||
|  | %	expand_fuzzy(+Spec, -Expanded) | ||
|  | % | ||
|  | %	Replace every fuzzy leaf of Spec by a disjunction of the concrete | ||
|  | %	tokens found in the indexes.  Each token is tagged with the leaf | ||
|  | %	it came from (see tag/3); a leaf without matching tokens becomes | ||
|  | %	@(false).  Atomic tokens are left as they are. | ||
|  | % | ||
|  | %	@error instantiation_error if Spec or a sub-term is unbound | ||
|  | %	@error type_error(boolean_expression, Term) for any other term | ||
|  | expand_fuzzy(Var, _) :- | ||
|  | 	var(Var), !, | ||
|  | 	instantiation_error(Var). | ||
|  | expand_fuzzy(sounds(Like), Or) :- !, | ||
|  | 	metaphone_index(Map), | ||
|  | 	double_metaphone(Like, Key), | ||
|  | 	rdf_find_literal_map(Map, [Key], Tokens), | ||
|  | 	list_to_or(Tokens, sounds(Like), Or). | ||
|  | expand_fuzzy(stem(Like), Or) :- !, | ||
|  | 	porter_index(Map), | ||
|  | 	porter_stem(Like, Key), | ||
|  | 	rdf_find_literal_map(Map, [Key], Tokens), | ||
|  | 	list_to_or(Tokens, stem(Like), Or). | ||
|  | expand_fuzzy(or(A0, B0), E) :- !, | ||
|  | 	expand_fuzzy(A0, A), | ||
|  | 	expand_fuzzy(B0, B), | ||
|  | 	simplify(or(A,B), E). | ||
|  | expand_fuzzy(and(A0, B0), E) :- !, | ||
|  | 	expand_fuzzy(A0, A), | ||
|  | 	expand_fuzzy(B0, B), | ||
|  | 	simplify(and(A,B), E). | ||
|  | expand_fuzzy(not(A0), not(A)) :- !, | ||
|  | 	expand_fuzzy(A0, A). | ||
|  | expand_fuzzy(Spec, Or) :- | ||
|  | 	token_key_spec(Spec), !,	% answered from the keys of the token index | ||
|  | 	token_index(Map), | ||
|  | 	rdf_keys_in_literal_map(Map, Spec, Tokens), | ||
|  | 	list_to_or(Tokens, Spec, Or). | ||
|  | expand_fuzzy(Token, Token) :- | ||
|  | 	atomic(Token), !. | ||
|  | expand_fuzzy(Token, _) :- | ||
|  | 	type_error(boolean_expression, Token).	% ISO order: Type, Culprit | ||
|  | 
 | ||
|  | %	token_key_spec(+Spec) | ||
|  | % | ||
|  | %	True if Spec is passed unchanged to rdf_keys_in_literal_map/3 to | ||
|  | %	select keys of the token index. | ||
|  | token_key_spec(prefix(_)). | ||
|  | token_key_spec(case(_)). | ||
|  | token_key_spec(between(_, _)). | ||
|  | token_key_spec(le(_)). | ||
|  | token_key_spec(ge(_)). | ||
|  | 
 | ||
|  | %	simplify(+Expr0, -Expr) | ||
|  | % | ||
|  | %	Remove @(false) from the top-level and/2 or or/2 of Expr0 using | ||
|  | %	the rewrite rules of simple/2; other terms pass unchanged. | ||
|  | simplify(Expr0, Expr) :- | ||
|  | 	(   simple(Expr0, Expr) | ||
|  | 	->  true | ||
|  | 	;   Expr = Expr0 | ||
|  | 	). | ||
|  | 
 | ||
|  | %	simple(+Expr0, -Expr): a single rewrite step eliminating @(false). | ||
|  | simple(and(@(false), _), @(false)). | ||
|  | simple(and(_, @(false)), @(false)). | ||
|  | simple(or(@(false), X), X). | ||
|  | simple(or(X, @(false)), X). | ||
|  | 
 | ||
|  | 
 | ||
|  | %	list_to_or(+Tokens, +How, -Or) | ||
|  | % | ||
|  | %	Turn a list of tokens into a right-nested or/2 term in which | ||
|  | %	every token is tagged with How (see tag/3).  The empty list | ||
|  | %	yields @(false); a singleton yields just the tagged token. | ||
|  | list_to_or([], _, @(false)) :- !. | ||
|  | list_to_or([X], How, One) :- !, | ||
|  | 	tag(How, X, One). | ||
|  | list_to_or([H0|T0], How, or(H, T)) :- | ||
|  | 	tag(How, H0, H), | ||
|  | 	list_to_or(T0, How, T). | ||
|  | 
 | ||
|  | %	tag(+How, +Token, -Tagged) | ||
|  | % | ||
|  | %	Tagged is How extended with Token as extra last argument, so the | ||
|  | %	origin of an expanded token can be recovered later. | ||
|  | tag(sounds(X),	  Y, sounds(X,Y)). | ||
|  | tag(stem(X),	  Y, stem(X,Y)). | ||
|  | tag(prefix(X),	  Y, prefix(X,Y)). | ||
|  | tag(case(X),	  Y, case(X,Y)). | ||
|  | tag(between(L,H), Y, between(L,H,Y)). | ||
|  | tag(ge(L),	  Y, ge(L,Y)). | ||
|  | tag(le(H),	  Y, le(H,Y)). | ||
|  | 
 | ||
|  | %	untag(+Tagged, -Token) | ||
|  | % | ||
|  | %	Token is the concrete token carried by a tagged term.  Fails for | ||
|  | %	terms that are not tagged. | ||
|  | untag(sounds(_,Y),    Y). | ||
|  | untag(stem(_,Y),      Y). | ||
|  | untag(prefix(_,Y),    Y). | ||
|  | untag(case(_,Y),      Y). | ||
|  | untag(between(_,_,Y), Y). | ||
|  | untag(le(_,Y),	      Y). | ||
|  | untag(ge(_,Y),	      Y). | ||
|  | 
 | ||
|  | %	untag(+Tagged, -How, -Token) | ||
|  | % | ||
|  | %	Inverse of tag/3: split a tagged term into the original | ||
|  | %	specification How and the concrete Token. | ||
|  | untag(sounds(X,Y),    sounds(X),    Y). | ||
|  | untag(stem(X,Y),      stem(X),	    Y). | ||
|  | untag(prefix(X,Y),    prefix(X),    Y). | ||
|  | untag(case(X,Y),      case(X),	    Y). | ||
|  | untag(between(L,H,Y), between(L,H), Y). | ||
|  | untag(ge(L,Y),	      ge(L),	    Y). | ||
|  | untag(le(H,Y),	      le(H),	    Y). | ||
|  | 
 | ||
|  | 
 | ||
|  | %%	nnf(+Formula, -NNF) | ||
|  | % | ||
|  | %	Rewrite to Negation Normal Form, meaning negations only appear | ||
|  | %	around literals.  Conjunctions and disjunctions are traversed as | ||
|  | %	well, so a negation nested below and/2 or or/2, as in | ||
|  | %	and(a, not(or(b,c))), is also pushed inwards. | ||
|  | 
 | ||
|  | nnf(not(not(A0)), A) :- !, | ||
|  | 	nnf(A0, A). | ||
|  | nnf(not(and(A0,B0)), or(A,B)) :- !, | ||
|  | 	nnf(not(A0), A), | ||
|  | 	nnf(not(B0), B). | ||
|  | nnf(not(or(A0,B0)), and(A,B)) :- !, | ||
|  | 	nnf(not(A0), A), | ||
|  | 	nnf(not(B0), B). | ||
|  | nnf(and(A0,B0), and(A,B)) :- !,		% descend into positive connectives | ||
|  | 	nnf(A0, A), | ||
|  | 	nnf(B0, B). | ||
|  | nnf(or(A0,B0), or(A,B)) :- !, | ||
|  | 	nnf(A0, A), | ||
|  | 	nnf(B0, B). | ||
|  | nnf(A, A). | ||
|  | 
 | ||
|  | 
 | ||
|  | %%	dnf(+NNF, -DNF) | ||
|  | % | ||
|  | %	Convert a formula in NNF to Disjunctive Normal Form (DNF).  Both | ||
|  | %	arguments of a conjunction are converted first, after which | ||
|  | %	dnf1/2 distributes the conjunction over their disjunctions. | ||
|  | 
 | ||
|  | dnf(or(A0,B0), or(A, B)) :- !, | ||
|  | 	dnf(A0, A), | ||
|  | 	dnf(B0, B). | ||
|  | dnf(and(A0,B0), DNF):- !, | ||
|  | 	dnf(A0, A1), | ||
|  | 	dnf(B0, B1), | ||
|  | 	dnf1(and(A1,B1), DNF). | ||
|  | dnf(DNF, DNF). | ||
|  | 
 | ||
|  | %	dnf1(+And, -DNF) | ||
|  | % | ||
|  | %	Distribute a conjunction over a disjunction in either argument. | ||
|  | %	The second clause swaps the arguments of the conjunction, so the | ||
|  | %	order of conjuncts in the result may differ from the input. | ||
|  | dnf1(and(A0, or(B,C)), or(P,Q)) :- !, | ||
|  | 	dnf1(and(A0,B), P), | ||
|  | 	dnf1(and(A0,C), Q). | ||
|  | dnf1(and(or(B,C), A0), or(P,Q)) :- !, | ||
|  | 	dnf1(and(A0,B), P), | ||
|  | 	dnf1(and(A0,C), Q). | ||
|  | dnf1(DNF, DNF). | ||
|  | 
 | ||
|  | 
 | ||
|  | 		 /******************************* | ||
|  | 		 *	    TOKEN INDEX		* | ||
|  | 		 *******************************/ | ||
|  | 
 | ||
|  | %%	token_index(-Map) | ||
|  | % | ||
|  | %	Get the index of tokens. If  not   present,  create one from the | ||
|  | %	current database. Once created, the map is kept up-to-date using | ||
|  | %	a monitor hook.  The hook runs in the calling thread | ||
|  | %	(monitor_literal/1) or hands new literals to update threads | ||
|  | %	(thread_monitor_literal/1), depending on the index(How) setting | ||
|  | %	and, for =default=, on the cpu_count flag. | ||
|  | 
 | ||
|  | token_index(Map) :- | ||
|  | 	literal_map(tokens, Map), !. | ||
|  | token_index(Map) :- | ||
|  | 	rdf_new_literal_map(Map), | ||
|  | 	assert(literal_map(tokens, Map)), | ||
|  | 	make_literal_index, | ||
|  | 	verbose('~N', []), | ||
|  | 	Monitor = [ reset, | ||
|  | 		    new_literal, | ||
|  | 		    old_literal | ||
|  | 		  ], | ||
|  | 	(   setting(index(default)) | ||
|  | 	->  (   current_prolog_flag(cpu_count, N), N > 1 | ||
|  | 	    ->	create_update_literal_thread(1), | ||
|  | 		rdf_monitor(thread_monitor_literal, Monitor) | ||
|  | 	    ;	rdf_monitor(monitor_literal, Monitor) | ||
|  | 	    ) | ||
|  | 	;   setting(index(thread(N))) | ||
|  | 	->  create_update_literal_thread(N), | ||
|  | 	    rdf_monitor(thread_monitor_literal, Monitor) | ||
|  | 	;   rdf_monitor(monitor_literal, Monitor) | ||
|  | 	). | ||
|  | 
 | ||
|  | 
 | ||
|  | %%	make_literal_index | ||
|  | % | ||
|  | %	Build the initial literal index.  The number of threads comes | ||
|  | %	from the index_threads setting; without that setting the | ||
|  | %	cpu_count flag is used. | ||
|  | 
 | ||
|  | make_literal_index :- | ||
|  | 	(   setting(index_threads(Threads)) | ||
|  | 	->  true | ||
|  | 	;   current_prolog_flag(cpu_count, Threads) | ||
|  | 	), | ||
|  | 	threaded_literal_index(Threads). | ||
|  | 
 | ||
|  | %	threaded_literal_index(+Threads) | ||
|  | % | ||
|  | %	Register all literals of the database.  With more than one | ||
|  | %	thread the literals are handed to worker threads through a | ||
|  | %	bounded message queue, followed by one done(true) message per | ||
|  | %	worker; otherwise they are registered in the calling thread. | ||
|  | %	The temporary queue is destroyed after all workers are joined. | ||
|  | threaded_literal_index(N) :- | ||
|  | 	N > 1, !, | ||
|  | 	message_queue_create(Q, [max_size(1000)]), | ||
|  | 	create_index_threads(N, Q, Ids), | ||
|  | 	forall(rdf_current_literal(Literal), | ||
|  | 	       thread_send_message(Q, Literal)), | ||
|  | 	forall(between(1, N, _), | ||
|  | 	       thread_send_message(Q, done(true))), | ||
|  | 	maplist(thread_join, Ids, _), | ||
|  | 	message_queue_destroy(Q).		% do not leak the anonymous queue | ||
|  | threaded_literal_index(_) :- | ||
|  | 	forall(rdf_current_literal(Literal), | ||
|  | 	       register_literal(Literal)). | ||
|  | 
 | ||
|  | %	create_index_threads(+Count, +Queue, -ThreadIds) | ||
|  | % | ||
|  | %	Create Count threads running index_worker/1 on Queue.  The | ||
|  | %	stack limits are kept small; presumably they are in kilobytes | ||
|  | %	(verify against the thread_create/3 options of the Prolog | ||
|  | %	version in use). | ||
|  | create_index_threads(N, Q, [Id|T]) :- | ||
|  | 	N > 0, !, | ||
|  | 	thread_create(index_worker(Q), Id, | ||
|  | 		      [ local(1000), | ||
|  | 			global(1000), | ||
|  | 			trail(1000) | ||
|  | 		      ]), | ||
|  | 	N2 is N - 1, | ||
|  | 	create_index_threads(N2, Q, T). | ||
|  | create_index_threads(_, _, []) :- !. | ||
|  | 
 | ||
|  | %	index_worker(+Queue) | ||
|  | % | ||
|  | %	Thread body: process messages from Queue in a failure-driven | ||
|  | %	loop until work/1 succeeds, which happens on done(true). | ||
|  | index_worker(Queue) :- | ||
|  | 	repeat, | ||
|  | 	    thread_get_message(Queue, Msg), | ||
|  | 	    work(Msg). | ||
|  | 
 | ||
|  | %	work(+Message) | ||
|  | % | ||
|  | %	Succeeds on the done(true) end marker.  Any other message is | ||
|  | %	registered as a literal, after which the call fails so the | ||
|  | %	repeat loop of index_worker/1 fetches the next message. | ||
|  | work(done(true)) :- !. | ||
|  | work(Literal) :- | ||
|  | 	register_literal(Literal), | ||
|  | 	fail. | ||
|  | 
 | ||
|  | 
 | ||
|  | %	clean_token_index | ||
|  | % | ||
|  | %	Clean after a reset: empty every literal map recorded in | ||
|  | %	literal_map/2.  The maps themselves are kept. | ||
|  | 
 | ||
|  | clean_token_index :- | ||
|  | 	forall(literal_map(_, Map), | ||
|  | 	       rdf_reset_literal_map(Map)). | ||
|  | 
 | ||
|  | 		 /******************************* | ||
|  | 		 *	  THREADED UPDATE	* | ||
|  | 		 *******************************/ | ||
|  | 
 | ||
|  | %	create_update_literal_thread(+Threads) | ||
|  | % | ||
|  | %	Setup literal monitoring using threads.  While loading databases | ||
|  | %	through rdf_attach_db/2 from  rdf_persistency.pl,   most  of the | ||
|  | %	time is spent updating the literal token database. While loading | ||
|  | %	the RDF triples, most of the time   is spent in updating the AVL | ||
|  | %	tree holding the literals. Updating  the   token  index hangs on | ||
|  | %	updating the AVL trees holding the   tokens.  Both tasks however | ||
|  | %	can run concurrently. | ||
|  | % | ||
|  | %	Creates the queue rdf_literal_monitor_queue and Threads threads | ||
|  | %	with aliases rdf_literal_monitor_1 .. rdf_literal_monitor_N that | ||
|  | %	run monitor_literals/0. | ||
|  | 
 | ||
|  | create_update_literal_thread(Threads) :- | ||
|  | 	message_queue_create(_, | ||
|  | 			     [ alias(rdf_literal_monitor_queue), | ||
|  | 			       max_size(10000) | ||
|  | 			     ]), | ||
|  | 	forall(between(1, Threads, N), | ||
|  | 	       (   atom_concat(rdf_literal_monitor_, N, Alias), | ||
|  | 		   thread_create(monitor_literals, _, | ||
|  | 				 [ alias(Alias), | ||
|  | 				   local(1000), | ||
|  | 				   global(1000), | ||
|  | 				   trail(1000) | ||
|  | 				 ]) | ||
|  | 	       )). | ||
|  | 
 | ||
|  | %	monitor_literals | ||
|  | % | ||
|  | %	Body of an update thread: endlessly take literals from | ||
|  | %	rdf_literal_monitor_queue and register them.  The loop is | ||
|  | %	failure driven and never terminates normally. | ||
|  | monitor_literals :- | ||
|  | 	set_prolog_flag(agc_margin, 0),	% we don't create garbage | ||
|  | 	repeat, | ||
|  | 	    thread_get_message(rdf_literal_monitor_queue, Literal), | ||
|  | 	    register_literal(Literal), | ||
|  | 	fail. | ||
|  | 
 | ||
|  | %	thread_monitor_literal(+Action) | ||
|  | % | ||
|  | %	Monitor hook used with update threads.  New literals are queued | ||
|  | %	for the update threads; all other actions are handled in the | ||
|  | %	calling thread by monitor_literal/1. | ||
|  | % | ||
|  | %	NOTE(review): on reset, monitor_literal/1 changes the event mask | ||
|  | %	registered for monitor_literal rather than for this hook; | ||
|  | %	confirm that this is intended when running threaded. | ||
|  | thread_monitor_literal(new_literal(Literal)) :- !, | ||
|  | 	thread_send_message(rdf_literal_monitor_queue, Literal). | ||
|  | thread_monitor_literal(Action) :- !, | ||
|  | 	monitor_literal(Action). | ||
|  | 
 | ||
|  | 
 | ||
|  | 		 /******************************* | ||
|  | 		 *	 MONITORED UPDATE	* | ||
|  | 		 *******************************/ | ||
|  | 
 | ||
|  | %	monitor_literal(+Action) | ||
|  | % | ||
|  | %	Monitor hook that keeps the token index in sync with the | ||
|  | %	database.  While a reset is in progress old_literal events are | ||
|  | %	switched off and all literal maps are emptied at once. | ||
|  | monitor_literal(new_literal(Literal)) :- | ||
|  | 	register_literal(Literal). | ||
|  | monitor_literal(old_literal(Literal)) :- | ||
|  | 	unregister_literal(Literal). | ||
|  | monitor_literal(transaction(begin, reset)) :- | ||
|  | 	rdf_monitor(monitor_literal, [-old_literal]), | ||
|  | 	clean_token_index. | ||
|  | monitor_literal(transaction(end, reset)) :- | ||
|  | 	rdf_monitor(monitor_literal, [+old_literal]). | ||
|  | 
 | ||
|  | %%	register_literal(+Literal) | ||
|  | % | ||
|  | %	Add Literal to the token index: every token of the literal is | ||
|  | %	mapped to the literal's text.  Literals that cannot be tokenized | ||
|  | %	are silently skipped. | ||
|  | 
 | ||
|  | register_literal(Literal) :- | ||
|  | 	rdf_tokenize_literal(Literal, Tokens), !, | ||
|  | 	text_of(Literal, Text), | ||
|  | 	literal_map(tokens, TokenMap), | ||
|  | 	add_tokens(Tokens, Text, TokenMap). | ||
|  | register_literal(_).			% not tokenizable: nothing to index | ||
|  | 
 | ||
|  | %	add_tokens(+Tokens, +Text, +Map) | ||
|  | % | ||
|  | %	Associate each token with Text in Map.  If the insert binds the | ||
|  | %	key count (presumably only when the token is a new key; confirm | ||
|  | %	in rdf_db), all new_token/1 hooks are run and progress is | ||
|  | %	reported every 1000 keys. | ||
|  | add_tokens([], _, _). | ||
|  | add_tokens([H|T], Literal, Map) :- | ||
|  | 	rdf_insert_literal_map(Map, H, Literal, Keys), | ||
|  | 	(   var(Keys) | ||
|  | 	->  true | ||
|  | 	;   forall(new_token(H), true), | ||
|  | 	    (	Keys mod 1000 =:= 0 | ||
|  | 	    ->	progress(Map, 'Tokens') | ||
|  | 	    ;	true | ||
|  | 	    ) | ||
|  | 	), | ||
|  | 	add_tokens(T, Literal, Map). | ||
|  | 
 | ||
|  | 
 | ||
|  | %%	unregister_literal(+Literal) | ||
|  | % | ||
|  | %	Literal is removed from the database.   As we abstract from lang | ||
|  | %	and type qualifiers we first have to  check this is the last one | ||
|  | %	that is destroyed.  Only if no triple with the same text remains | ||
|  | %	are the tokens of the literal removed from the token map. | ||
|  | 
 | ||
|  | unregister_literal(Literal) :- | ||
|  | 	text_of(Literal, Text), | ||
|  | 	(   rdf(_,_,literal(Text)) | ||
|  | 	->  true			% still something left | ||
|  | 	;   rdf_tokenize_literal(Literal, Tokens), | ||
|  | 	    literal_map(tokens, Map), | ||
|  | 	    del_tokens(Tokens, Text, Map) | ||
|  | 	). | ||
|  | 
 | ||
|  | %	del_tokens(+Tokens, +Text, +Map) | ||
|  | % | ||
|  | %	Remove the association between each token and Text from Map. | ||
|  | del_tokens([], _, _). | ||
|  | del_tokens([H|T], Literal, Map) :- | ||
|  | 	rdf_delete_literal_map(Map, H, Literal), | ||
|  | 	del_tokens(T, Literal, Map). | ||
|  | 
 | ||
|  | 
 | ||
|  | %%	rdf_tokenize_literal(+Literal, -Tokens) is semidet. | ||
|  | % | ||
|  | %	Tokenize a literal. We make  this   hookable  as tokenization is | ||
|  | %	generally domain dependent.  If the tokenization/2 hook does not | ||
|  | %	succeed, the default splits the atom text of the literal using | ||
|  | %	tokenize_atom/2 and filters the result through select_tokens/2. | ||
|  | %	The default fails if the text is not an atom. | ||
|  | 
 | ||
|  | rdf_tokenize_literal(Literal, Tokens) :- | ||
|  | 	tokenization(Literal, Tokens), !. 		% Hook | ||
|  | rdf_tokenize_literal(Literal, Tokens) :- | ||
|  | 	text_of(Literal, Text), | ||
|  | 	atom(Text), | ||
|  | 	tokenize_atom(Text, Tokens0), | ||
|  | 	select_tokens(Tokens0, Tokens). | ||
|  | 
 | ||
|  | %	select_tokens(+Tokens0, -Tokens) | ||
|  | % | ||
|  | %	Filter the raw token list.  Dropped are: tokens rejected by the | ||
|  | %	exclude_from_index/2 hook, numbers that are not integers in the | ||
|  | %	range -2^30 .. 2^30-1, tokens of length 1 and the words listed | ||
|  | %	in no_index_token/1.  The integer range presumably reflects | ||
|  | %	what the literal map can store as a key (confirm in rdf_db). | ||
|  | select_tokens([], []). | ||
|  | select_tokens([H|T0], T) :- | ||
|  | 	(   exclude_from_index(token, H) | ||
|  | 	->  select_tokens(T0, T) | ||
|  | 	;   number(H) | ||
|  | 	->  (   integer(H), | ||
|  | 	        between(-1073741824, 1073741823, H) | ||
|  | 	    ->	T = [H|T1], | ||
|  | 		select_tokens(T0, T1) | ||
|  | 	    ;   select_tokens(T0, T) | ||
|  | 	    ) | ||
|  | 	;   atom_length(H, 1) | ||
|  | 	->  select_tokens(T0, T) | ||
|  | 	;   no_index_token(H) | ||
|  | 	->  select_tokens(T0, T) | ||
|  | 	;   T = [H|T1], | ||
|  | 	    select_tokens(T0, T1) | ||
|  | 	). | ||
|  | 
 | ||
|  | 
 | ||
|  | %	no_index_token/1 | ||
|  | % | ||
|  | %	Tokens we do not wish to index,  as  they create huge amounts of | ||
|  | %	data with little or no value.  Is   there  a more general way to | ||
|  | %	describe this? Experience shows that simply  word count is not a | ||
|  | %	good criterion as it often rules out popular domain terms. | ||
|  | 
 | ||
|  | no_index_token(and). | ||
|  | no_index_token(an). | ||
|  | no_index_token(or). | ||
|  | no_index_token(of). | ||
|  | no_index_token(on). | ||
|  | no_index_token(in). | ||
|  | no_index_token(this). | ||
|  | no_index_token(the). | ||
|  | 
 | ||
|  | 
 | ||
|  | %%	text_of(+LiteralArg, -Text) | ||
|  | % | ||
|  | %	Get the textual  or  (integer)   numerical  information  from  a | ||
|  | %	literal value.  type(_, Text) and lang(_, Text) wrappers are | ||
|  | %	stripped without checking Text; a plain value must be an atom or | ||
|  | %	an integer, otherwise the predicate fails. | ||
|  | 
 | ||
|  | text_of(type(_, Text), Text) :- !. | ||
|  | text_of(lang(_, Text), Text) :- !. | ||
|  | text_of(Text, Text) :- atom(Text), !. | ||
|  | text_of(Text, Text) :- integer(Text). | ||
|  | 
 | ||
|  | 
 | ||
|  | 		 /******************************* | ||
|  | 		 *	   PORTER INDEX		* | ||
|  | 		 *******************************/ | ||
|  | 
 | ||
|  | 
 | ||
|  | %	porter_index(-Map) | ||
|  | % | ||
|  | %	Get the map from Porter stems to tokens.  On first use the map | ||
|  | %	is created and filled from the token index, and a new_token/1 | ||
|  | %	clause is asserted so that later tokens are added as well. | ||
|  | porter_index(Map) :- | ||
|  | 	literal_map(porter, Map), !. | ||
|  | porter_index(Map) :- | ||
|  | 	rdf_new_literal_map(Map), | ||
|  | 	assert(literal_map(porter, Map)), | ||
|  | 	fill_porter_index(Map), | ||
|  | 	assert((new_token(Token) :- add_stem(Token, Map))). | ||
|  | 
 | ||
|  | %	fill_porter_index(+PorterMap) | ||
|  | % | ||
|  | %	Add the stem of every key of the token index to PorterMap. | ||
|  | fill_porter_index(PorterMap) :- | ||
|  | 	token_index(TokenMap), | ||
|  | 	rdf_keys_in_literal_map(TokenMap, all, Tokens), | ||
|  | 	stem(Tokens, PorterMap). | ||
|  | 
 | ||
|  | %	stem(+Tokens, +Map) | ||
|  | % | ||
|  | %	Map the Porter stem of each atom token to the token itself. | ||
|  | %	Non-atom tokens (e.g. integers) are skipped.  Progress is | ||
|  | %	reported whenever the returned key count is a multiple of 1000. | ||
|  | stem([], _). | ||
|  | stem([Token|T], Map) :- | ||
|  | 	(   atom(Token) | ||
|  | 	->  porter_stem(Token, Stem), | ||
|  | 	    rdf_insert_literal_map(Map, Stem, Token, Keys), | ||
|  | 	    (	integer(Keys), | ||
|  | 		Keys mod 1000 =:= 0 | ||
|  | 	    ->  progress(Map, 'Porter') | ||
|  | 	    ;	true | ||
|  | 	    ) | ||
|  | 	;   true | ||
|  | 	), | ||
|  | 	stem(T, Map). | ||
|  | 
 | ||
|  | 
 | ||
|  | %	add_stem(+Token, +Map) | ||
|  | % | ||
|  | %	Body of the new_token/1 hook asserted by porter_index/1: add the | ||
|  | %	stem of a newly seen token to Map. | ||
|  | %	NOTE(review): unlike stem/2 there is no atom(Token) guard here; | ||
|  | %	confirm porter_stem/2 accepts the integer tokens that | ||
|  | %	select_tokens/2 lets through. | ||
|  | add_stem(Token, Map) :- | ||
|  | 	porter_stem(Token, Stem), | ||
|  | 	rdf_insert_literal_map(Map, Stem, Token, _). | ||
|  | 
 | ||
|  | 
 | ||
|  | 		 /******************************* | ||
|  | 		 *	  METAPHONE INDEX	* | ||
|  | 		 *******************************/ | ||
|  | 
 | ||
|  | 
 | ||
|  | %	metaphone_index(-Map) | ||
|  | % | ||
|  | %	Get the map from double-metaphone codes to tokens.  On first use | ||
|  | %	the map is created and filled from the token index, and a | ||
|  | %	new_token/1 clause is asserted so that later tokens are added as | ||
|  | %	well. | ||
|  | metaphone_index(Map) :- | ||
|  | 	literal_map(metaphone, Map), !. | ||
|  | metaphone_index(Map) :- | ||
|  | 	rdf_new_literal_map(Map), | ||
|  | 	assert(literal_map(metaphone, Map)), | ||
|  | 	fill_metaphone_index(Map), | ||
|  | 	assert((new_token(Token) :- add_metaphone(Token, Map))). | ||
|  | 
 | ||
|  | %	fill_metaphone_index(+MetaphoneMap) | ||
|  | % | ||
|  | %	Add the sound code of every key of the token index to | ||
|  | %	MetaphoneMap. | ||
|  | fill_metaphone_index(MetaphoneMap) :- | ||
|  | 	token_index(Tokens), | ||
|  | 	rdf_keys_in_literal_map(Tokens, all, AllKeys), | ||
|  | 	metaphone(AllKeys, MetaphoneMap). | ||
|  | 
 | ||
|  | %	metaphone(+Tokens, +Map) | ||
|  | % | ||
|  | %	Map the double-metaphone code of each atom token to the token | ||
|  | %	itself.  Non-atom tokens are skipped.  Progress is reported | ||
|  | %	whenever the returned key count is a multiple of 1000. | ||
|  | metaphone([], _). | ||
|  | metaphone([Token|T], Map) :- | ||
|  | 	(   atom(Token) | ||
|  | 	->  double_metaphone(Token, SoundEx), | ||
|  | 	    rdf_insert_literal_map(Map, SoundEx, Token, Keys), | ||
|  | 	    (	integer(Keys), | ||
|  | 		Keys mod 1000 =:= 0 | ||
|  | 	    ->	progress(Map, 'Metaphone') | ||
|  | 	    ;	true | ||
|  | 	    ) | ||
|  | 	;   true | ||
|  | 	), | ||
|  | 	metaphone(T, Map). | ||
|  | 
 | ||
|  | 
 | ||
|  | %	add_metaphone(+Token, +Map) | ||
|  | % | ||
|  | %	Body of the new_token/1 hook asserted by metaphone_index/1: add | ||
|  | %	the sound code of a newly seen token to Map. | ||
|  | %	NOTE(review): unlike metaphone/2 there is no atom(Token) guard; | ||
|  | %	confirm double_metaphone/2 accepts integer tokens. | ||
|  | add_metaphone(Token, Map) :- | ||
|  | 	double_metaphone(Token, SoundEx), | ||
|  | 	rdf_insert_literal_map(Map, SoundEx, Token). | ||
|  | 
 | ||
|  | 
 | ||
|  | 		 /******************************* | ||
|  | 		 *	       UTIL		* | ||
|  | 		 *******************************/ | ||
|  | 
 | ||
|  | %	verbose(+Format, +Args) | ||
|  | % | ||
|  | %	Print a message to user_error if the verbose setting is =true=; | ||
|  | %	succeed silently otherwise. | ||
|  | verbose(Fmt, Args) :- | ||
|  | 	(   setting(verbose(true)) | ||
|  | 	->  format(user_error, Fmt, Args) | ||
|  | 	;   true | ||
|  | 	). | ||
|  | 
 | ||
|  | %	progress(+Map, +Which) | ||
|  | % | ||
|  | %	If the verbose setting is =true=, overwrite the current line on | ||
|  | %	user_error with the number of keys and values in Map, labelled | ||
|  | %	Which.  Succeeds silently otherwise. | ||
|  | progress(Map, Which) :- | ||
|  | 	(   setting(verbose(true)) | ||
|  | 	->  rdf_statistics_literal_map(Map, size(KeyCount, ValueCount)), | ||
|  | 	    format(user_error, | ||
|  | 		   '\r~t~w: ~12|Keys: ~t~D~15+; Values: ~t~D~20+', | ||
|  | 		   [Which, KeyCount, ValueCount]) | ||
|  | 	;   true | ||
|  | 	). | ||