703 lines
18 KiB
Perl
703 lines
18 KiB
Perl
|
/* $Id$
|
||
|
|
||
|
Part of SWI-Prolog
|
||
|
|
||
|
Author: Jan Wielemaker
|
||
|
E-mail: wielemak@science.uva.nl
|
||
|
WWW: http://www.swi-prolog.org
|
||
|
Copyright (C): 2006, University of Amsterdam
|
||
|
|
||
|
This program is free software; you can redistribute it and/or
|
||
|
modify it under the terms of the GNU General Public License
|
||
|
as published by the Free Software Foundation; either version 2
|
||
|
of the License, or (at your option) any later version.
|
||
|
|
||
|
This program is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
GNU General Public License for more details.
|
||
|
|
||
|
You should have received a copy of the GNU General Public
|
||
|
License along with this library; if not, write to the Free Software
|
||
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||
|
|
||
|
As a special exception, if you link this library with other files,
|
||
|
compiled with a Free Software compiler, to produce an executable, this
|
||
|
library does not by itself cause the resulting executable to be covered
|
||
|
by the GNU General Public License. This exception does not however
|
||
|
invalidate any other reasons why the executable file might be covered by
|
||
|
the GNU General Public License.
|
||
|
*/
|
||
|
|
||
|
|
||
|
:- module(rdf_litindex,
|
||
|
[ rdf_set_literal_index_option/1, % +Options
|
||
|
rdf_tokenize_literal/2, % +Literal, -Tokens
|
||
|
rdf_find_literals/2, % +Spec, -ListOfLiterals
|
||
|
rdf_token_expansions/2 % +Spec, -Expansions
|
||
|
]).
|
||
|
:- use_module(rdf_db).
|
||
|
:- use_module(library(debug)).
|
||
|
:- use_module(library(lists)).
|
||
|
:- use_module(library(error)).
|
||
|
:- use_module(library(porter_stem)).
|
||
|
:- use_module(library(double_metaphone)).
|
||
|
|
||
|
/** <module> Search literals
|
||
|
This module finds literals of the RDF database based on stemming and
|
||
|
being flexible to ordering of tokens.
|
||
|
*/
|
||
|
|
||
|
:- dynamic
|
||
|
literal_map/2, % Type, -Map
|
||
|
new_token/1, % Hook
|
||
|
setting/1.
|
||
|
:- volatile
|
||
|
literal_map/2.
|
||
|
:- multifile
|
||
|
tokenization/2, % +Literal, -Tokens
|
||
|
exclude_from_index/2. % +Which, +Token
|
||
|
|
||
|
|
||
|
% Default values of the package options.  set_option/1 replaces the
% matching fact when rdf_set_literal_index_option/1 is called.
setting(verbose(true)).         % print progress messages
setting(index_threads(1)).      % # threads for creating the index
setting(index(default)).        % how new literals are indexed
|
||
|
|
||
|
%% rdf_set_literal_index_option(+Options:list)
%
%   Set options for the literal package.  Options is either a list
%   of option terms or a single option term.  Each option is
%   validated by check_option/1 and then stored, replacing any
%   previous value.  Currently defined options:
%
%     * verbose(Bool)
%     If =true=, print progress messages while building the
%     index tables.
%
%     * index_threads(+Count)
%     Number of threads to use for initial indexing of
%     literals.
%
%     * index(+How)
%     How to deal with indexing new literals.  How is one of
%     =self= (execute in the same thread), thread(N) (execute
%     in N concurrent threads) or =default= (depends on number
%     of cores).

rdf_set_literal_index_option(Options) :-
    (   Options = []
    ->  true
    ;   Options = [First|Rest]
    ->  set_option(First),
        rdf_set_literal_index_option(Rest)
    ;   set_option(Options)             % a single, non-list option
    ).
|
||
|
|
||
|
%   set_option(+Option)
%
%   Validate Option and store it as a setting/1 fact.  Any existing
%   setting with the same name and arity is removed first, so at
%   most one value per option is kept.

set_option(Option) :-
    check_option(Option),
    functor(Option, Name, Arity),
    functor(Template, Name, Arity),     % most general term of this option
    retractall(setting(Template)),
    assertz(setting(Option)).
|
||
|
|
||
|
%   check_option(+Option)
%
%   Succeed if Option is a well-formed option of this package.
%
%   @error instantiation_error if Option is unbound
%   @error domain_error(literal_option, Option) if Option is not
%          one of verbose/1, index_threads/1 or index/1
%   @error the errors of must_be/2 if the option value is invalid

check_option(Option) :-
    (   var(Option)
    ->  instantiation_error(Option)
    ;   Option = verbose(Bool)
    ->  must_be(boolean, Bool)
    ;   Option = index_threads(Count)
    ->  must_be(nonneg, Count)
    ;   Option = index(How)
    ->  must_be(oneof([default,thread(_),self]), How)
    ;   domain_error(literal_option, Option)
    ).
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* QUERY *
|
||
|
*******************************/
|
||
|
|
||
|
%% rdf_find_literals(+Spec, -Literals)
%
%   Find literals in the RDF database matching Spec.  Spec is
%   defined as:
%
%   ==
%   Spec ::= and(Spec,Spec)
%   Spec ::= or(Spec,Spec)
%   Spec ::= not(Spec)
%   Spec ::= sounds(Like)
%   Spec ::= stem(Like)
%   Spec ::= prefix(Prefix)
%   Spec ::= case(String)       % key query on the token map
%   Spec ::= between(Low, High) % Numerical between
%   Spec ::= ge(Low)            % Numerical greater-equal
%   Spec ::= le(High)           % Numerical less-equal
%   Spec ::= Token
%   ==
%
%   Fuzzy conditions such as sounds(Like) and stem(Like) map to a
%   disjunction of tokens.  The specification is first compiled to
%   normal form: a disjunction of conjunctions on elementary
%   tokens.  Every conjunction is then looked up in the token index
%   and the results are merged into one sorted, duplicate-free list.
%
%   @tbd Exploit ordering of numbers and allow for > N, < N, etc.

rdf_find_literals(Spec, Literals) :-
    compile_spec(Spec, Disjunction),
    token_index(TokenMap),
    lookup(Disjunction, TokenMap, _Conditions, PerConjunction),
    flatten(PerConjunction, Unsorted),
    sort(Unsorted, Literals).           % also removes duplicates
|
||
|
|
||
|
%% rdf_token_expansions(+Spec, -Extensions)
%
%   Determine which extensions of a token contribute to finding
%   literals.  Extensions is a list of terms that carry the
%   original condition plus the list of index tokens it expands to,
%   e.g. prefix(Prefix, Tokens), sounds(Like, Tokens) or
%   stem(Like, Tokens).
%
%   The first three clauses answer a single prefix/sounds/stem
%   condition directly from the corresponding index.  The last
%   clause handles arbitrary specifications by compiling them and
%   collecting the tagged conditions that actually yielded
%   literals.  The clauses are not mutually exclusive, so a single
%   fuzzy condition can yield a second answer on backtracking.

rdf_token_expansions(prefix(Prefix), [prefix(Prefix, Tokens)]) :-
    token_index(Map),
    rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens).
rdf_token_expansions(sounds(Like), [sounds(Like, Tokens)]) :-
    metaphone_index(Map),
    % The metaphone map is keyed by the metaphone code, not by the
    % word itself (see metaphone/2), so convert before the lookup.
    double_metaphone(Like, Key),
    rdf_find_literal_map(Map, [Key], Tokens).
rdf_token_expansions(stem(Like), [stem(Like, Tokens)]) :-
    porter_index(Map),
    % Likewise the Porter map is keyed by the stem (see stem/2).
    porter_stem(Like, Key),
    rdf_find_literal_map(Map, [Key], Tokens).
rdf_token_expansions(Spec, Expansions) :-
    compile_spec(Spec, DNF),
    token_index(Map),
    lookup(DNF, Map, SCS, _),
    flatten(SCS, CS),
    sort(CS, Expansions0),              % groups equal tags together
    join_expansions(Expansions0, Expansions).
|
||
|
|
||
|
%   join_expansions(+Tagged, -Joined)
%
%   Tagged is a sorted list of tagged tokens (see tag/3).  Adjacent
%   elements that share the same tag are merged into one term that
%   consists of the tag extended with the list of their tokens.
%   Fails if an element is not a tagged token.

join_expansions([], []).
join_expansions([Tagged|Rest0], [Joined|Rest]) :-
    untag(Tagged, Tag, Value),
    join_expansions_by_tag(Rest0, Tag, Remaining, MoreValues),
    Tag =.. TagParts,
    append(TagParts, [[Value|MoreValues]], JoinedParts),
    Joined =.. JoinedParts,             % Tag plus one list argument
    join_expansions(Remaining, Rest).
|
||
|
|
||
|
%   join_expansions_by_tag(+List, +Tag, -Rest, -Values)
%
%   Strip the leading elements of List that carry Tag.  Values are
%   the tokens of the stripped elements and Rest is what remains of
%   List.

join_expansions_by_tag(List, Tag, Rest, Values) :-
    (   List = [Head|Tail],
        Values = [Value|MoreValues],
        untag(Head, Tag, Value)
    ->  join_expansions_by_tag(Tail, Tag, Rest, MoreValues)
    ;   Rest = List,
        Values = []
    ).
|
||
|
|
||
|
%   lookup(+DNF, +Map, -Conditions, -Literals)
%
%   Evaluate a formula in disjunctive normal form against Map.
%   Both Conditions and Literals are (nested) lists with one
%   element per disjunct, as produced by lookup1/4; callers flatten
%   them.  The formula @(false) yields two empty lists.

lookup(@(false), _, [], []) :- !.
lookup(or(Left,Right), Map, [CondL|CondR], [LitL|LitR]) :- !,
    lookup(Left,  Map, CondL, LitL),
    lookup(Right, Map, CondR, LitR).
lookup(Conj, Map, [Cond], [Literals]) :-
    lookup1(Conj, Map, Cond, Literals).
|
||
|
|
||
|
%   lookup1(+Conj, +Map, -Cond, -Literals)
%
%   Evaluate a single conjunction against Map.  Literals is the
%   list of matching literals.  Cond is the list of tagged
%   conditions in Conj if there is at least one match and [] if
%   there is none.
%
%   If Conj cannot be turned into a token list (conj_to_list//1
%   fails because it contains @(false)) nothing can match and both
%   outputs are [].  Cond must be bound in that case as well:
%   rdf_token_expansions/2 flattens and destructures the collected
%   conditions and would otherwise process an unbound variable.

lookup1(Conj, Map, Cond, Literals) :-
    phrase(conj_to_list(Conj), List), !,
    rdf_find_literal_map(Map, List, Literals),
    (   Literals \== []
    ->  phrase(conj_to_cond(Conj), Cond)
    ;   Cond = []
    ).
lookup1(_, _, [], []).
|
||
|
|
||
|
%   conj_to_list(+Conj)//
%
%   Emit the plain tokens of a conjunction.  Tagged tokens are
%   stripped from their tag; anything else is emitted as is.  Fails
%   if the conjunction contains @(false).

conj_to_list(and(Left,Right)) --> !,
    conj_to_list(Left),
    conj_to_list(Right).
conj_to_list(@(false)) --> !,
    {fail}.
conj_to_list(Term) -->
    (   { untag(Term, Token) }
    ->  [Token]
    ;   [Term]
    ).
|
||
|
|
||
|
|
||
|
%   conj_to_cond(+Conj)//
%
%   Emit the tagged members of a conjunction, i.e. those that
%   resulted from expanding a fuzzy condition.  Untagged members
%   are skipped.

conj_to_cond(and(Left,Right)) --> !,
    conj_to_cond(Left),
    conj_to_cond(Right).
conj_to_cond(Term) -->
    (   { untag(Term, _) }
    ->  [Term]
    ;   []
    ).
|
||
|
|
||
|
|
||
|
%% compile_spec(+Spec, -Compiled)
%
%   Compile a specification as accepted by rdf_find_literals/2
%   into disjunctive normal form.  This is a three step pipeline:
%   expand fuzzy conditions into tokens, push negations inwards
%   and finally distribute conjunctions over disjunctions.

compile_spec(Spec, DNF) :-
    expand_fuzzy(Spec, Expanded),
    nnf(Expanded, NNF),
    dnf(NNF, DNF).
|
||
|
|
||
|
|
||
|
%   expand_fuzzy(+Spec, -Expanded)
%
%   Replace every fuzzy or range condition in Spec by a disjunction
%   of the index tokens that satisfy it.  Each token is wrapped
%   using tag/3 so the condition that produced it can be recovered.
%   A condition without matching tokens becomes @(false); and/2 and
%   or/2 nodes are simplified using simplify/2.  Atomic tokens are
%   passed unchanged.
%
%   @error instantiation_error if Spec or a sub-term is unbound
%   @error type_error(boolean_expression, Term) if Term is not a
%          valid specification

expand_fuzzy(Var, _) :-
    var(Var), !,
    instantiation_error(Var).
expand_fuzzy(sounds(Like), Or) :- !,
    metaphone_index(Map),
    double_metaphone(Like, Key),        % map is keyed by metaphone code
    rdf_find_literal_map(Map, [Key], Tokens),
    list_to_or(Tokens, sounds(Like), Or).
expand_fuzzy(stem(Like), Or) :- !,
    porter_index(Map),
    porter_stem(Like, Key),             % map is keyed by Porter stem
    rdf_find_literal_map(Map, [Key], Tokens),
    list_to_or(Tokens, stem(Like), Or).
expand_fuzzy(Spec, Or) :-
    token_key_query(Spec), !,
    token_index(Map),
    rdf_keys_in_literal_map(Map, Spec, Tokens),
    list_to_or(Tokens, Spec, Or).
expand_fuzzy(or(A0, B0), E) :- !,
    expand_fuzzy(A0, A),
    expand_fuzzy(B0, B),
    simplify(or(A,B), E).
expand_fuzzy(and(A0, B0), E) :- !,
    expand_fuzzy(A0, A),
    expand_fuzzy(B0, B),
    simplify(and(A,B), E).
expand_fuzzy(not(A0), not(A)) :- !,
    expand_fuzzy(A0, A).
expand_fuzzy(Token, Token) :-
    atomic(Token), !.
expand_fuzzy(Token, _) :-
    % ISO argument order is type_error(ExpectedType, Culprit).
    type_error(boolean_expression, Token).

%   token_key_query(?Spec)
%
%   True if Spec is a condition that is expanded by handing it
%   unchanged to rdf_keys_in_literal_map/3 on the token map.

token_key_query(prefix(_)).
token_key_query(case(_)).
token_key_query(between(_, _)).
token_key_query(le(_)).
token_key_query(ge(_)).
|
||
|
|
||
|
%   simplify(+Expr0, -Expr)
%
%   Apply the first matching rewrite rule of simple/2 to the top
%   node of Expr0.  Expr0 is returned unchanged if no rule applies.

simplify(Expr0, Expr) :-
    (   simple(Expr0, Expr)
    ->  true
    ;   Expr = Expr0
    ).

%   simple(+Expr0, -Expr)
%
%   Rewrite rules that eliminate the constant @(false) from a
%   conjunction or disjunction.

simple(and(@(false), _), @(false)).
simple(and(_, @(false)), @(false)).
simple(or(@(false), X), X).
simple(or(X, @(false)), X).
|
||
|
|
||
|
|
||
|
%   list_to_or(+Tokens, +How, -Or)
%
%   Turn a list of tokens into a right-nested or/2 term in which
%   every token is tagged with How (see tag/3).  The empty list
%   yields @(false); a singleton yields just the tagged token.

list_to_or(Tokens, How, Or) :-
    (   Tokens = []
    ->  Or = @(false)
    ;   Tokens = [Only]
    ->  tag(How, Only, Or)
    ;   Tokens = [First|Rest],
        Or = or(Tagged, RestOr),
        tag(How, First, Tagged),
        list_to_or(Rest, How, RestOr)
    ).
|
||
|
|
||
|
%   tag(+Condition, +Token, -Tagged)
%
%   Tagged is Condition with Token added as an extra, last
%   argument.  It records which condition expanded to Token.

tag(sounds(Like),       Token, sounds(Like, Token)).
tag(stem(Like),         Token, stem(Like, Token)).
tag(prefix(Prefix),     Token, prefix(Prefix, Token)).
tag(case(String),       Token, case(String, Token)).
tag(between(Low, High), Token, between(Low, High, Token)).
tag(ge(Low),            Token, ge(Low, Token)).
tag(le(High),           Token, le(High, Token)).
|
||
|
|
||
|
%   untag(+Tagged, -Token)
%
%   Token is the token carried by a term created by tag/3.  Fails
%   if Tagged is not such a term.

untag(sounds(_, Token),     Token).
untag(stem(_, Token),       Token).
untag(prefix(_, Token),     Token).
untag(case(_, Token),       Token).
untag(between(_, _, Token), Token).
untag(le(_, Token),         Token).
untag(ge(_, Token),         Token).
|
||
|
|
||
|
%   untag(+Tagged, -Condition, -Token)
%
%   Split a term created by tag/3 into the original Condition and
%   the Token it was tagged with.  Fails if Tagged is not such a
%   term.

untag(sounds(Like, Token),       sounds(Like),       Token).
untag(stem(Like, Token),         stem(Like),         Token).
untag(prefix(Prefix, Token),     prefix(Prefix),     Token).
untag(case(String, Token),       case(String),       Token).
untag(between(Low, High, Token), between(Low, High), Token).
untag(ge(Low, Token),            ge(Low),            Token).
untag(le(High, Token),           le(High),           Token).
|
||
|
|
||
|
|
||
|
%% nnf(+Formula, -NNF)
%
%   Rewrite to Negation Normal Form, meaning negations only appear
%   around literals.  Double negations are removed and negated
%   conjunctions/disjunctions are rewritten using De Morgan's
%   laws.  The rewrite descends into and/2 and or/2 so negations
%   nested below a positive connective are normalised as well.

nnf(not(not(A0)), A) :- !,
    nnf(A0, A).
nnf(not(and(A0,B0)), or(A,B)) :- !,
    nnf(not(A0), A),
    nnf(not(B0), B).
nnf(not(or(A0,B0)), and(A,B)) :- !,
    nnf(not(A0), A),
    nnf(not(B0), B).
nnf(and(A0,B0), and(A,B)) :- !,
    nnf(A0, A),
    nnf(B0, B).
nnf(or(A0,B0), or(A,B)) :- !,
    nnf(A0, A),
    nnf(B0, B).
nnf(A, A).                              % literal or negated literal
|
||
|
|
||
|
|
||
|
%% dnf(+NNF, -DNF)
%
%   Convert a formula in NNF to Disjunctive Normal Form (DNF).
%   Both sides of a connective are converted first; a conjunction
%   is then distributed over the disjunctions of its arguments by
%   dnf1/2.

dnf(or(Left0,Right0), or(Left,Right)) :- !,
    dnf(Left0, Left),
    dnf(Right0, Right).
dnf(and(Left0,Right0), DNF) :- !,
    dnf(Left0, Left),
    dnf(Right0, Right),
    dnf1(and(Left,Right), DNF).
dnf(Literal, Literal).
|
||
|
|
||
|
%   dnf1(+Conjunction, -DNF)
%
%   Distribute a conjunction whose arguments are already in DNF
%   over the disjunctions in those arguments.  Note that for a
%   disjunction on the left the operands are swapped, so the
%   resulting conjunctions list the non-distributed operand first.

dnf1(and(Conj, or(Left,Right)), or(DistLeft,DistRight)) :- !,
    dnf1(and(Conj,Left), DistLeft),
    dnf1(and(Conj,Right), DistRight).
dnf1(and(or(Left,Right), Conj), or(DistLeft,DistRight)) :- !,
    dnf1(and(Conj,Left), DistLeft),
    dnf1(and(Conj,Right), DistRight).
dnf1(Conjunction, Conjunction).
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* TOKEN INDEX *
|
||
|
*******************************/
|
||
|
|
||
|
%% token_index(-Map)
%
%   Get the index of tokens.  If not present, create one from the
%   current database.  Once created, the map is kept up-to-date
%   using a monitor hook registered through rdf_monitor/2.
%
%   The hook runs the update in helper threads when the =index=
%   setting is thread(N), or when it is =default= and the Prolog
%   flag =cpu_count= is greater than one (a single helper thread is
%   used then).  In all other cases the update runs in the thread
%   that triggers the monitor.

token_index(Map) :-
    literal_map(tokens, Map), !.
token_index(Map) :-
    rdf_new_literal_map(Map),
    assertz(literal_map(tokens, Map)),  % publish before filling
    make_literal_index,
    verbose('~N', []),
    Events = [ reset,
               new_literal,
               old_literal
             ],
    (   (   setting(index(default))
        ->  current_prolog_flag(cpu_count, Cores),
            Cores > 1,
            Threads = 1
        ;   setting(index(thread(Threads)))
        )
    ->  create_update_literal_thread(Threads),
        rdf_monitor(thread_monitor_literal, Events)
    ;   rdf_monitor(monitor_literal, Events)
    ).
|
||
|
|
||
|
|
||
|
%% make_literal_index
%
%   Create the initial literal index.  The number of threads is
%   taken from the =index_threads= setting, falling back to the
%   Prolog flag =cpu_count= if that setting is absent.

make_literal_index :-
    (   setting(index_threads(Count))
    ->  true
    ;   current_prolog_flag(cpu_count, Count)
    ),
    threaded_literal_index(Count).
|
||
|
|
||
|
%   threaded_literal_index(+N)
%
%   Register all literals currently known to the RDF database.  If
%   N > 1, the literals are passed through a bounded message queue
%   to N worker threads (see index_worker/1).  Each worker is
%   stopped by a done(true) message; after all workers have been
%   joined the temporary queue is destroyed.  Otherwise the
%   literals are registered in the calling thread.

threaded_literal_index(N) :-
    N > 1, !,
    message_queue_create(Q, [max_size(1000)]),
    create_index_threads(N, Q, Ids),
    forall(rdf_current_literal(Literal),
           thread_send_message(Q, Literal)),
    forall(between(1, N, _),            % one stop message per worker
           thread_send_message(Q, done(true))),
    maplist(thread_join, Ids, _),
    message_queue_destroy(Q).           % do not leak the queue
threaded_literal_index(_) :-
    forall(rdf_current_literal(Literal),
           register_literal(Literal)).
|
||
|
|
||
|
%   create_index_threads(+N, +Queue, -Ids)
%
%   Start N threads running index_worker/1 on Queue.  Ids is the
%   list of created thread identifiers; it is empty if N =< 0.

create_index_threads(N, Queue, Ids) :-
    (   Ids = [Id|MoreIds],
        N > 0
    ->  thread_create(index_worker(Queue), Id,
                      [ local(1000),
                        global(1000),
                        trail(1000)
                      ]),
        Left is N - 1,
        create_index_threads(Left, Queue, MoreIds)
    ;   Ids = []
    ).
|
||
|
|
||
|
%   index_worker(+Queue)
%
%   Thread body for the initial indexing.  Messages are read from
%   Queue and handed to work/1 in a failure-driven loop; the loop
%   ends when work/1 succeeds, which it does for done(true) only.

index_worker(Queue) :-
    repeat,
    thread_get_message(Queue, Message),
    work(Message).
|
||
|
|
||
|
%   work(+Message)
%
%   Process one message of index_worker/1.  Succeeds on the stop
%   message done(true).  Any other message is a literal that is
%   registered, after which the predicate fails to force
%   index_worker/1 to fetch the next message.

work(Message) :-
    (   Message = done(true)
    ->  true
    ;   register_literal(Message),
        fail
    ).
|
||
|
|
||
|
|
||
|
%   clean_token_index
%
%   Clean after a reset: reset every literal map that has been
%   created (token, Porter and metaphone maps alike).

clean_token_index :-
    \+ ( literal_map(_, Map),
         \+ rdf_reset_literal_map(Map)
       ).
|
||
|
|
||
|
/*******************************
|
||
|
* THREADED UPDATE *
|
||
|
*******************************/
|
||
|
|
||
|
%   create_update_literal_thread(+Threads)
%
%   Setup literal monitoring using threads.  Creates the message
%   queue =rdf_literal_monitor_queue= and starts Threads threads
%   running monitor_literals/0, aliased rdf_literal_monitor_1,
%   rdf_literal_monitor_2, etc.
%
%   Rationale: while loading databases through rdf_attach_db/2
%   from rdf_persistency.pl, most of the time is spent updating
%   the literal token database.  While loading the RDF triples,
%   most of the time is spent in updating the AVL tree holding the
%   literals.  Updating the token index hangs on updating the AVL
%   trees holding the tokens.  Both tasks however can run
%   concurrently.

create_update_literal_thread(Threads) :-
    message_queue_create(_,
                         [ alias(rdf_literal_monitor_queue),
                           max_size(10000)
                         ]),
    forall(between(1, Threads, Index),
           (   atom_concat(rdf_literal_monitor_, Index, Alias),
               thread_create(monitor_literals, _,
                             [ alias(Alias),
                               local(1000),
                               global(1000),
                               trail(1000)
                             ])
           )).
|
||
|
|
||
|
%   monitor_literals
%
%   Thread body of the update threads.  Endless failure-driven
%   loop that takes literals from =rdf_literal_monitor_queue= and
%   registers them in the token index.

monitor_literals :-
    set_prolog_flag(agc_margin, 0),     % we don't create garbage
    repeat,
    thread_get_message(rdf_literal_monitor_queue, Literal),
    register_literal(Literal),
    fail.
|
||
|
|
||
|
%   thread_monitor_literal(+Action)
%
%   Monitor hook used when updates are threaded.  New literals are
%   queued for the update threads; every other action is handled
%   synchronously by monitor_literal/1.

thread_monitor_literal(Action) :-
    (   Action = new_literal(Literal)
    ->  thread_send_message(rdf_literal_monitor_queue, Literal)
    ;   monitor_literal(Action)
    ).
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* MONITORED UPDATE *
|
||
|
*******************************/
|
||
|
|
||
|
%   monitor_literal(+Action)
%
%   Monitor hook that keeps the token index in sync with the RDF
%   database.  New literals are registered and destroyed literals
%   are unregistered.  When a reset starts, all maps are cleaned
%   at once.
%
%   NOTE(review): the [-old_literal] / [+old_literal] masks
%   presumably switch off the per-literal old_literal events for
%   the duration of the reset; confirm against rdf_monitor/2.

monitor_literal(new_literal(Literal)) :-
    register_literal(Literal).
monitor_literal(old_literal(Literal)) :-
    unregister_literal(Literal).
monitor_literal(transaction(begin, reset)) :-
    rdf_monitor(monitor_literal, [-old_literal]),
    clean_token_index.
monitor_literal(transaction(end, reset)) :-
    rdf_monitor(monitor_literal, [+old_literal]).
|
||
|
|
||
|
%% register_literal(+Literal)
%
%   Associate the tokens of a literal with the literal itself.
%   Literals that cannot be tokenized are silently ignored.  The
%   value stored in the map is the text of the literal, i.e.
%   without its language or type qualifier.

register_literal(Literal) :-
    rdf_tokenize_literal(Literal, Tokens), !,
    text_of(Literal, Text),
    literal_map(tokens, Map),
    add_tokens(Tokens, Text, Map).
register_literal(_).
|
||
|
|
||
|
%   add_tokens(+Tokens, +Literal, +Map)
%
%   Add Token -> Literal to Map for each token.  If
%   rdf_insert_literal_map/4 reports a key count, all new_token/1
%   hooks are run for the token and progress is reported each time
%   the count is a multiple of 1000.
%
%   NOTE(review): the key count presumably stays unbound when the
%   token was already a key of Map; confirm against
%   rdf_insert_literal_map/4.

add_tokens([], _, _).
add_tokens([Token|Rest], Literal, Map) :-
    rdf_insert_literal_map(Map, Token, Literal, KeyCount),
    (   nonvar(KeyCount)
    ->  forall(new_token(Token), true), % run every hook
        (   KeyCount mod 1000 =:= 0
        ->  progress(Map, 'Tokens')
        ;   true
        )
    ;   true
    ),
    add_tokens(Rest, Literal, Map).
|
||
|
|
||
|
|
||
|
%% unregister_literal(+Literal)
%
%   Literal is removed from the database.  As we abstract from
%   lang and type qualifiers we first have to check this is the
%   last one that is destroyed: the tokens are only removed if no
%   triple with a literal of the same text is left.

unregister_literal(Literal) :-
    text_of(Literal, Text),
    (   \+ rdf(_, _, literal(Text))     % nothing left with this text
    ->  rdf_tokenize_literal(Literal, Tokens),
        literal_map(tokens, Map),
        del_tokens(Tokens, Text, Map)
    ;   true
    ).
|
||
|
|
||
|
%   del_tokens(+Tokens, +Literal, +Map)
%
%   Remove the association Token -> Literal from Map for each
%   token in Tokens.

del_tokens([], _, _).
del_tokens([Token|Rest], Literal, Map) :-
    rdf_delete_literal_map(Map, Token, Literal),
    del_tokens(Rest, Literal, Map).
|
||
|
|
||
|
|
||
|
%% rdf_tokenize_literal(+Literal, -Tokens) is semidet.
%
%   Tokenize a literal.  We make this hookable as tokenization is
%   generally domain dependent: the multifile hook tokenization/2
%   is tried first.  If the hook fails and the text of the literal
%   is an atom, it is split by tokenize_atom/2 and the result is
%   filtered by select_tokens/2.

rdf_tokenize_literal(Literal, Tokens) :-
    tokenization(Literal, Tokens), !.           % Hook
rdf_tokenize_literal(Literal, Tokens) :-
    text_of(Literal, Text),
    atom(Text),
    tokenize_atom(Text, AllTokens),
    select_tokens(AllTokens, Tokens).
|
||
|
|
||
|
%   select_tokens(+Tokens, -Selected)
%
%   Selected holds the tokens that are to be indexed.  A token is
%   dropped if
%
%     - the hook exclude_from_index(token, Token) succeeds,
%     - it is a number other than an integer in the range
%       -1073741824..1073741823,
%     - it has length 1, or
%     - it is listed by no_index_token/1.

select_tokens([], []).
select_tokens([Token|Rest0], Selected) :-
    (   (   exclude_from_index(token, Token)
        ->  fail
        ;   number(Token)
        ->  integer(Token),
            between(-1073741824, 1073741823, Token)
        ;   atom_length(Token, 1)
        ->  fail
        ;   no_index_token(Token)
        ->  fail
        ;   true
        )
    ->  Selected = [Token|Rest]
    ;   Selected = Rest
    ),
    select_tokens(Rest0, Rest).
|
||
|
|
||
|
|
||
|
%   no_index_token(?Token)
%
%   Tokens we do not wish to index, as they create huge amounts of
%   data with little or no value.  Is there a more general way to
%   describe this?  Experience shows that simply using the word
%   count is not a good criterion as it often rules out popular
%   domain terms.

no_index_token(and).
no_index_token(an).
no_index_token(or).
no_index_token(of).
no_index_token(on).
no_index_token(in).
no_index_token(this).
no_index_token(the).
|
||
|
|
||
|
|
||
|
%% text_of(+LiteralArg, -Text)
%
%   Get the textual or (integer) numerical information from a
%   literal value.  The qualifier of type(Type, Text) and
%   lang(Lang, Text) is dropped; plain atoms and integers are
%   returned as they are.  Fails for anything else.

text_of(type(_, Text), Text) :- !.
text_of(lang(_, Text), Text) :- !.
text_of(Text, Text) :-
    (   atom(Text)
    ->  true
    ;   integer(Text)
    ).
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* PORTER INDEX *
|
||
|
*******************************/
|
||
|
|
||
|
|
||
|
%   porter_index(-Map)
%
%   Get the map from Porter stems to tokens.  On first use the map
%   is created, filled from the current token index and a
%   new_token/1 hook clause is added that keeps it up-to-date for
%   tokens added later.

porter_index(Map) :-
    (   literal_map(porter, Map)
    ->  true
    ;   rdf_new_literal_map(Map),
        assertz(literal_map(porter, Map)),
        fill_porter_index(Map),
        assertz((new_token(Token) :- add_stem(Token, Map)))
    ).
|
||
|
|
||
|
%   fill_porter_index(+PorterMap)
%
%   Add the stem of every key of the token index to PorterMap.

fill_porter_index(PorterMap) :-
    token_index(TokenMap),
    rdf_keys_in_literal_map(TokenMap, all, AllTokens),
    stem(AllTokens, PorterMap).
|
||
|
|
||
|
%   stem(+Tokens, +Map)
%
%   Add Stem -> Token to Map for every atom in Tokens, where Stem
%   is the Porter stem of the token.  Non-atom tokens are skipped.
%   Progress is reported whenever the key count returned by
%   rdf_insert_literal_map/4 is a multiple of 1000.

stem([], _).
stem([Token|Rest], Map) :-
    (   \+ atom(Token)
    ->  true
    ;   porter_stem(Token, Stem),
        rdf_insert_literal_map(Map, Stem, Token, KeyCount),
        (   integer(KeyCount),
            KeyCount mod 1000 =:= 0
        ->  progress(Map, 'Porter')
        ;   true
        )
    ),
    stem(Rest, Map).
|
||
|
|
||
|
|
||
|
%   add_stem(+Token, +Map)
%
%   Body of the new_token/1 hook installed by porter_index/1: add
%   the Porter stem of a newly indexed token to Map.

add_stem(Token, Map) :-
    porter_stem(Token, Stem),
    rdf_insert_literal_map(Map, Stem, Token, _KeyCount).
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* METAPHONE INDEX *
|
||
|
*******************************/
|
||
|
|
||
|
|
||
|
%   metaphone_index(-Map)
%
%   Get the map from double-metaphone codes to tokens.  On first
%   use the map is created, filled from the current token index
%   and a new_token/1 hook clause is added that keeps it up-to-date
%   for tokens added later.

metaphone_index(Map) :-
    (   literal_map(metaphone, Map)
    ->  true
    ;   rdf_new_literal_map(Map),
        assertz(literal_map(metaphone, Map)),
        fill_metaphone_index(Map),
        assertz((new_token(Token) :- add_metaphone(Token, Map)))
    ).
|
||
|
|
||
|
%   fill_metaphone_index(+MetaphoneMap)
%
%   Add the metaphone code of every key of the token index to
%   MetaphoneMap.

fill_metaphone_index(MetaphoneMap) :-
    token_index(TokenMap),
    rdf_keys_in_literal_map(TokenMap, all, AllTokens),
    metaphone(AllTokens, MetaphoneMap).
|
||
|
|
||
|
%   metaphone(+Tokens, +Map)
%
%   Add Code -> Token to Map for every atom in Tokens, where Code
%   is computed by double_metaphone/2.  Non-atom tokens are
%   skipped.  Progress is reported whenever the key count returned
%   by rdf_insert_literal_map/4 is a multiple of 1000.

metaphone([], _).
metaphone([Token|Rest], Map) :-
    (   \+ atom(Token)
    ->  true
    ;   double_metaphone(Token, Code),
        rdf_insert_literal_map(Map, Code, Token, KeyCount),
        (   integer(KeyCount),
            KeyCount mod 1000 =:= 0
        ->  progress(Map, 'Metaphone')
        ;   true
        )
    ),
    metaphone(Rest, Map).
|
||
|
|
||
|
|
||
|
%   add_metaphone(+Token, +Map)
%
%   Body of the new_token/1 hook installed by metaphone_index/1:
%   add the metaphone code of a newly indexed token to Map.

add_metaphone(Token, Map) :-
    double_metaphone(Token, Code),
    rdf_insert_literal_map(Map, Code, Token).
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* UTIL *
|
||
|
*******************************/
|
||
|
|
||
|
%   verbose(+Format, +Args)
%
%   Print a message to =user_error= using format/3 if the
%   =verbose= setting is =true=; do nothing otherwise.

verbose(Format, Args) :-
    (   setting(verbose(true))
    ->  format(user_error, Format, Args)
    ;   true
    ).
|
||
|
|
||
|
%   progress(+Map, +Which)
%
%   If the =verbose= setting is =true=, print a one-line progress
%   report for Map to =user_error=, showing its label Which and
%   its number of keys and values.  The line starts with a
%   carriage return so successive reports overwrite each other.

progress(Map, Which) :-
    (   setting(verbose(true))
    ->  rdf_statistics_literal_map(Map, size(Keys, Values)),
        format(user_error,
               '\r~t~w: ~12|Keys: ~t~D~15+; Values: ~t~D~20+',
               [Which, Keys, Values])
    ;   true
    ).
|