From d7bde5b63dd3ad951ca9a0b1536da8371380a6d8 Mon Sep 17 00:00:00 2001 From: kostis Date: Fri, 8 Jun 2007 09:12:37 +0000 Subject: [PATCH] Added a file to preserve version with the complete text. git-svn-id: https://yap.svn.sf.net/svnroot/yap/trunk@1897 b08c6af1-5177-4d33-ba66-4b1c6b8b522a --- docs/index/article.tex | 1317 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1317 insertions(+) create mode 100644 docs/index/article.tex diff --git a/docs/index/article.tex b/docs/index/article.tex new file mode 100644 index 000000000..e2d70aa50 --- /dev/null +++ b/docs/index/article.tex @@ -0,0 +1,1317 @@ +%============================================================================== +\documentclass{llncs} +%------------------------------------------------------------------------------ +\usepackage{a4wide} +\usepackage{float} +\usepackage{alltt} +\usepackage{xspace} +\usepackage{epsfig} +\usepackage{wrapfig} +\usepackage{subfigure} + +\renewcommand{\rmdefault}{ptm} +%------------------------------------------------------------------------------ +\floatstyle{ruled} +\newfloat{Algorithm}{ht}{lop} +%------------------------------------------------------------------------------ +\newcommand{\wamcodesize}{scriptsize} +\newcommand{\code}[1]{\texttt{#1}} +\newcommand{\instr}[1]{\textsf{#1}} +\newcommand{\try}{\instr{try}\xspace} +\newcommand{\retry}{\mbox{\instr{retry}}\xspace} +\newcommand{\trust}{\instr{trust}\xspace} +\newcommand{\TryRetryTrust}{\mbox{\instr{try-retry-trust}}\xspace} +\newcommand{\fail}{\instr{fail}\xspace} +\newcommand{\jump}{\instr{jump}\xspace} +\newcommand{\jitiSTAR}{\mbox{\instr{dindex\_on\_*}}\xspace} +\newcommand{\switchSTAR}{\mbox{\instr{switch\_on\_*}}\xspace} +\newcommand{\jitiONterm}{\mbox{\instr{dindex\_on\_term}}\xspace} +\newcommand{\jitiONconstant}{\mbox{\instr{dindex\_on\_constant}}\xspace} +\newcommand{\jitiONstructure}{\mbox{\instr{dindex\_on\_structure}}\xspace} +\newcommand{\switchONterm}{\mbox{\instr{switch\_on\_term}}\xspace} 
+\newcommand{\switchONconstant}{\mbox{\instr{switch\_on\_constant}}\xspace} +\newcommand{\switchONstructure}{\mbox{\instr{switch\_on\_structure}}\xspace} +\newcommand{\getcon}{\mbox{\instr{get\_constant}}\xspace} +\newcommand{\proceed}{\instr{proceed}\xspace} +\newcommand{\Cline}{\cline{2-3}} +\newcommand{\JITI}{demand-driven indexing\xspace} +%------------------------------------------------------------------------------ +\newcommand{\bench}[1]{\textbf{\textsf{#1}}} +\newcommand{\tcLio}{\bench{tc\_l\_io}\xspace} +\newcommand{\tcRio}{\bench{tc\_r\_io}\xspace} +\newcommand{\tcDio}{\bench{tc\_d\_io}\xspace} +\newcommand{\tcLoo}{\bench{tc\_l\_oo}\xspace} +\newcommand{\tcRoo}{\bench{tc\_r\_oo}\xspace} +\newcommand{\tcDoo}{\bench{tc\_d\_oo}\xspace} +\newcommand{\compress}{\bench{compress}\xspace} +\newcommand{\sgCyl}{\bench{sg\_cyl}\xspace} +\newcommand{\muta}{\bench{mutagenesis}\xspace} +\newcommand{\pta}{\bench{pta}\xspace} +\newcommand{\tea}{\bench{tea}\xspace} +%------------------------------------------------------------------------------ +\newcommand{\BreastCancer}{\bench{BreastCancer}\xspace} +\newcommand{\Carcino}{\bench{Carcinogenesis}\xspace} +\newcommand{\Choline}{\bench{Choline}\xspace} +\newcommand{\GeneExpr}{\bench{GeneExpression}\xspace} +\newcommand{\IEProtein}{\bench{IE-Protein\_Extraction}\xspace} +%\newcommand{\Krki}{\bench{Krki}\xspace} +%\newcommand{\KrkiII}{\bench{Krki~II}\xspace} +\newcommand{\Mesh}{\bench{Mesh}\xspace} +\newcommand{\Pyrimidines}{\bench{Pyrimidines}\xspace} +\newcommand{\Susi}{\bench{Susi}\xspace} +\newcommand{\Thermolysin}{\bench{Thermolysin}\xspace} +%------------------------------------------------------------------------------ +\newenvironment{SmallProg}{\begin{tt}\begin{small}\begin{tabular}[b]{l}}{\end{tabular}\end{small}\end{tt}} +\newenvironment{ScriptProg}{\begin{tt}\begin{scriptsize}\begin{tabular}[b]{l}}{\end{tabular}\end{scriptsize}\end{tt}} 
+\newenvironment{FootProg}{\begin{tt}\begin{footnotesize}\begin{tabular}[c]{l}}{\end{tabular}\end{footnotesize}\end{tt}} + +\newcommand{\TODOcomment}[2]{% + \stepcounter{TODOcounter#1}% + {\scriptsize\bf$^{(\arabic{TODOcounter#1})}$}% + \marginpar[\fbox{ + \parbox{2cm}{\raggedleft + \scriptsize$^{({\bf{\arabic{TODOcounter#1}{#1}}})}$% + \scriptsize #2}}]% + {\fbox{\parbox{2cm}{\raggedright + \scriptsize$^{({\bf{\arabic{TODOcounter#1}{#1}}})}$% + \scriptsize #2}}} +}% +\newcounter{TODOcounter} +\newcommand{\TODO}[1]{\TODOcomment{}{#1}} +%------------------------------------------------------------------------------ + +\title{Demand-Driven Indexing of Prolog Clauses} +\titlerunning{Demand-Driven Indexing of Prolog Clauses} + +\author{V\'{\i}tor Santos Costa\inst{1} \and Konstantinos + Sagonas\inst{2} \and Ricardo Lopes\inst{1}} +\authorrunning{V. Santos Costa, K. Sagonas and R. Lopes} + +\institute{ + University of Porto, Portugal + \and + National Technical University of Athens, Greece +} + +\pagestyle{plain} % For the submission only + +\begin{document} +\maketitle + +\begin{abstract} + As logic programming applications grow in size, Prolog systems need + to efficiently access larger and larger data sets and the need for + any- and multi-argument indexing becomes more and more profound. + Static generation of multi-argument indexing is one alternative, but + applications often rely on features that are inherently dynamic + (e.g., generating hypotheses for ILP data sets during runtime) which + makes static techniques inapplicable or inaccurate. Another + alternative, which has not been investigated so far, is to employ + dynamic schemes for flexible demand-driven indexing of Prolog + clauses. We propose such schemes and discuss issues that need to be + addressed for their efficient implementation in the context of + WAM-based Prolog systems. 
We have implemented demand-driven indexing + in two different Prolog systems and have been able to obtain + non-negligible performance speedups: from a few percent up to orders + of magnitude. Given these results, we see very little reason for + Prolog systems not to incorporate some form of dynamic indexing + based on actual demand. In fact, we see demand-driven indexing as + the first step towards effective runtime optimization of Prolog + programs. +\end{abstract} + + +\section{Introduction} +%===================== +The WAM~\cite{Warren83} has mostly been a blessing but occasionally +also a curse for Prolog systems. Its ingenious design has allowed +implementors to get byte code compilers with decent performance --- it +is not a fluke that most Prolog systems are still based on the WAM. On +the other hand, \emph{because} the WAM gives good performance in many +cases, implementors have not incorporated in their systems many +features that drastically depart from WAM's basic characteristics. +% +For example, first argument indexing is sufficient for many Prolog +applications. However, it is clearly sub-optimal for applications +accessing large data sets; for a long time now, the database community +has recognized that good indexing is the basis for fast query +processing. + +As logic programming applications grow in size, Prolog systems need to +efficiently access larger and larger data sets and the need for any- +and multi-argument indexing becomes more and more profound. Static +generation of multi-argument indexing is one alternative. The problem +is that this alternative is often unattractive because it may +drastically increase the size of the generated byte code and do so +unnecessarily. Static analysis can partly address this concern, but in +applications that rely on features which are inherently dynamic (e.g., +generating hypotheses for inductive logic programming data sets during +runtime) static analysis is inapplicable or grossly inaccurate. 
+Another alternative, which has not been investigated so far, is to do +flexible indexing on demand during program execution. + +This is precisely what we advocate with this paper. More specifically, +we present a small extension to the WAM that allows for flexible +indexing of Prolog clauses during runtime based on actual demand. For +static predicates, the scheme we propose is partly guided by the +compiler; for dynamic code, besides being demand-driven by queries, +the method needs to cater for code updates during runtime. Where our +schemes radically depart from current practice is that they generate +new byte code during runtime, in effect doing a form of just-in-time +compilation. In our experience these schemes pay off. We have +implemented \JITI in two different Prolog systems (YAP and XXX) and +have obtained non-trivial speedups, ranging from a few percent to +orders of magnitude, across a wide range of applications. Given these +results, we see very little reason for Prolog systems not to +incorporate some form of indexing based on actual demand from queries. +In fact, we see \JITI as only the first step towards effective runtime +optimization of Prolog programs. + +This paper is structured as follows. After commenting on the state of +the art and related work concerning indexing in Prolog systems +(Sect.~\ref{sec:related}) we briefly review indexing in the WAM +(Sect.~\ref{sec:prelims}). We then present \JITI schemes for static +(Sect.~\ref{sec:static}) and dynamic (Sect.~\ref{sec:dynamic}) +predicates, their implementation in two Prolog systems +(Sect.~\ref{sec:impl}) and the performance benefits they bring +(Sect.~\ref{sec:perf}). The paper ends with some concluding remarks. 
+ + +\section{State of the Art and Related Work} \label{sec:related} +%============================================================== +% Indexing in Prolog systems: +To the best of our knowledge, many Prolog systems still only support +indexing on the main functor symbol of the first argument. Some +others, like YAP version 4, can look inside some compound +terms~\cite{YAP}. SICStus Prolog supports \emph{shallow + backtracking}~\cite{ShallowBacktracking@ICLP-89}; choice points are +fully populated only when it is certain that execution will enter the +clause body. While shallow backtracking avoids some of the performance +problems of unnecessary choice point creation, it does not offer the +full benefits that indexing can provide. Other systems like +BIM-Prolog~\cite{IndexingProlog@NACLP-89}, SWI-Prolog~\cite{SWI} and +XSB~\cite{XSB} allow for user-controlled multi-argument indexing (via +an \code{:-~index} directive). Notably, ilProlog~\cite{ilProlog} uses +compile-time heuristics and generates code for multi-argument indexing +automatically. In all these systems, this support comes with various +implementation restrictions. For example, in SWI-Prolog at most four +arguments can be indexed; in XSB the compiler does not offer +multi-argument indexing and the predicates need to be asserted +instead; we know of no system where multi-argument indexing looks +inside compound terms. More importantly, requiring users to specify +arguments to index on is neither user-friendly nor guarantees good +performance results. + +% Trees, tries and unification factoring: +Recognizing the need for better indexing, researchers have proposed +more flexible index mechanisms for Prolog. For example, Hickey and +Mudambi proposed \emph{switching trees}~\cite{HickeyMudambi@JLP-89}, +which rely on the presence of mode information. 
Similar proposals were +put forward by Van Roy, Demoen and Willems who investigated indexing +on several arguments in the form of a \emph{selection tree}~\cite{VRDW87} +and by Zhou et al.\ who implemented a \emph{matching tree} oriented +abstract machine for Prolog~\cite{TOAM@ICLP-90}. For static +predicates, the XSB compiler offers support for \emph{unification +factoring}~\cite{UnifFact@POPL-95}; for asserted code, XSB can +represent databases of facts using \emph{tries}~\cite{Tries@JLP-99} +which provide left-to-right multi-argument indexing. However, in XSB +none of these mechanisms is used automatically; instead the user has +to specify appropriate directives. + +% Comparison with static analysis techniques and Mercury: +Long ago, Kliger and Shapiro argued that such tree-based indexing +schemes are not cost effective for the compilation of Prolog +programs~\cite{KligerShapiro@ICLP-88}. Some of their arguments make +sense for certain applications, but, as we shall show, in general +they underestimate the benefits of indexing on EDB predicates. +Nevertheless, it is true that unless the modes of +predicates are known we run the risk of doing indexing on output +arguments, whose only effect is an unnecessary increase in compilation +times and, more importantly, in code size. In a programming language +like Mercury~\cite{Mercury@JLP-96} where modes are known the compiler +can of course avoid this risk; indeed in Mercury modes (and types) are +used to guide the compiler generate good indexing tables. However, the +situation is different for a language like Prolog. Getting accurate +information about the set of all possible modes of predicates requires +a global static analyzer in the compiler --- and most Prolog systems +do not come with one. More importantly, it requires a lot of +discipline from the programmer (e.g., that applications use the module +system religiously and never bypass it). 
As a result, most Prolog +systems currently do not provide the type of indexing that +applications require. Even in systems like Ciao~\cite{Ciao@SCP-05}, +which do come with built-in static analysis and more or less force +such a discipline on the programmer, mode information is not used for +multi-argument indexing. + +% The grand finale: +The situation is actually worse for certain types of Prolog +applications. For example, consider applications in the area of +inductive logic programming. These applications on the one hand have +high demands for effective indexing since they need to efficiently +access big datasets and on the other they are unfit for static +analysis since queries are often ad hoc and generated only during +runtime as new hypotheses are formed or refined. +% +Our thesis is that the abstract machine should be able to adapt +automatically to the runtime requirements of such or, even better, of +all applications by employing increasingly aggressive forms of dynamic +compilation. As a concrete example of what this means in practice, in +this paper we will attack the problem of satisfying the indexing needs +of applications during runtime. Naturally, we will base our technique +on the existing support for indexing that the WAM provides, but we +will extend this support with the technique of \JITI that we describe +in the next sections. + + +\section{Indexing in the WAM} \label{sec:prelims} +%================================================ +To make the paper relatively self-contained we briefly review the +indexing instructions of the WAM and their use. In the WAM, the first +level of dispatching involves a test on the type of the argument. The +\switchONterm instruction checks the tag of the dereferenced value in +the first argument register and implements a four-way branch where one +branch is for the dereferenced register being an unbound variable, one +for being atomic, one for (non-empty) list, and one for structure. 
In
+any case, control goes to a (possibly empty) bucket of clauses. In the
+buckets for constants and structures the second level of dispatching
+involves the value of the register. The \switchONconstant and
+\switchONstructure instructions implement this dispatching: typically
+with a \fail instruction when the bucket is empty, with a \jump
+instruction for only one clause, with a sequential scan when the
+number of clauses is small, and with a hash lookup when the number of
+clauses exceeds a threshold. For this reason the \switchONconstant and
+\switchONstructure instructions take as arguments the hash table
+\instr{T} and the number of clauses \instr{N} the table contains (or
+equivalently, \instr{N} is the size of the hash table). In each bucket
+of this hash table and also in the bucket for the variable case of
+\switchONterm the code sequentially backtracks through the clauses
+using a \TryRetryTrust chain of instructions. The \try instruction
+sets up a choice point, the \retry instructions (if~any) update
+certain fields of this choice point, and the \trust instruction
+removes it.
+
+The WAM has additional indexing instructions (\instr{try\_me\_else}
+and friends) that allow indexing to be interspersed with the code of
+clauses. For simplicity of presentation we will not consider them
+here. This is not a problem since the above scheme handles all programs.
+Also, we will feel free to do some minor modifications and
+optimizations when this simplifies things.
+
+We present an example. Consider the Prolog code shown in
+Fig.~\ref{fig:carc:facts}. It is a fragment of the machine
+learning dataset \textit{Carcinogenesis}~\cite{Carcinogenesis@ILP-97}.
+The five clauses get compiled to the WAM code shown in
+Fig.~\ref{fig:carc:clauses}. The first argument indexing code
+that a Prolog compiler generates is shown in
+Fig.~\ref{fig:carc:index}. 
This code is typically placed before the
+code for the clauses and the \switchONconstant instruction is the
+entry point of the predicate. Note that compared with vanilla WAM this
+instruction has an extra argument: the register on the value of which
+we index ($r_1$). This extra argument will allow us to go beyond
+first argument indexing. Another departure from the WAM is that if
+this argument register contains an unbound variable instead of a
+constant then execution will continue with the next instruction; in
+effect we have merged part of the functionality of \switchONterm into
+the \switchONconstant instruction. This small change in the behavior
+of \switchONconstant will allow us to get \JITI. Let's see how.
+
+%------------------------------------------------------------------------------
+\begin{figure}[t]
+\centering
+\subfigure[Some Prolog clauses\label{fig:carc:facts}]{%
+  \begin{ScriptProg}
+  has\_property(d1,salmonella,p).\\
+  has\_property(d1,salmonella\_n,p).\\
+  has\_property(d2,salmonella,p). \\
+  has\_property(d2,cytogen\_ca,n).\\
+  has\_property(d3,cytogen\_ca,p). 
+ \end{ScriptProg} +}% +\subfigure[WAM indexing\label{fig:carc:index}]{% + \begin{sf} + \begin{\wamcodesize} + \begin{tabular}[b]{l} + \switchONconstant $r_1$ 5 $T_1$ \\ + \try $L_1$ \\ + \retry $L_2$ \\ + \retry $L_3$ \\ + \retry $L_4$ \\ + \trust $L_5$ \\ + \\ + \begin{tabular}[b]{r|c@{\ }|l|} + \Cline + $T_1$: & \multicolumn{2}{c|}{Hash Table Info}\\ \Cline\Cline + \ & d1 & \try $L_1$ \\ + \ & & \trust $L_2$ \\ \Cline + \ & d2 & \try $L_3$ \\ + \ & & \trust $L_4$ \\ \Cline + \ & d3 & \jump $L_5$ \\ + \Cline + \end{tabular} + \end{tabular} + \end{\wamcodesize} + \end{sf} +}% +\subfigure[Code for the clauses\label{fig:carc:clauses}]{% + \begin{sf} + \begin{\wamcodesize} + \begin{tabular}[b]{rl} + $L_1$: & \getcon $r_1$ d1 \\ + \ & \getcon $r_2$ salmonella \\ + \ & \getcon $r_3$ p \\ + \ & \proceed \\ + $L_2$: & \getcon $r_1$ d1 \\ + \ & \getcon $r_2$ salmonella\_n \\ + \ & \getcon $r_3$ p \\ + \ & \proceed \\ + $L_3$: & \getcon $r_1$ d2 \\ + \ & \getcon $r_2$ salmonella \\ + \ & \getcon $r_3$ p \\ + \ & \proceed \\ + $L_4$: & \getcon $r_1$ d2 \\ + \ & \getcon $r_2$ cytogen\_ca \\ + \ & \getcon $r_3$ n \\ + \ & \proceed \\ + $L_5$: & \getcon $r_1$ d3 \\ + \ & \getcon $r_2$ cytogen\_ca \\ + \ & \getcon $r_3$ p \\ + \ & \proceed + \end{tabular} + \end{\wamcodesize} + \end{sf} +}% +\subfigure[Any arg indexing\label{fig:carc:jiti_single:before}]{% + \begin{sf} + \begin{\wamcodesize} + \begin{tabular}[b]{l} + \switchONconstant $r_1$ 5 $T_1$ \\ + \jitiONconstant $r_2$ 5 3 \\ + \jitiONconstant $r_3$ 5 3 \\ + \try $L_1$ \\ + \retry $L_2$ \\ + \retry $L_3$ \\ + \retry $L_4$ \\ + \trust $L_5$ \\ + \\ + \begin{tabular}[b]{r|c@{\ }|l|} + \Cline + $T_1$: & \multicolumn{2}{c|}{Hash Table Info}\\ \Cline\Cline + \ & \code{d1} & \try $L_1$ \\ + \ & & \trust $L_2$ \\ \Cline + \ & \code{d2} & \try $L_3$ \\ + \ & & \trust $L_4$ \\ \Cline + \ & \code{d3} & \jump $L_5$ \\ + \Cline + \end{tabular} + \end{tabular} + \end{\wamcodesize} + \end{sf} +}% +\caption{Part of the Carcinogenesis 
dataset and WAM code that a byte + code compiler generates} +\label{fig:carc} +\end{figure} +%------------------------------------------------------------------------------ + + +\section{Demand-Driven Indexing of Static Predicates} \label{sec:static} +%======================================================================= +For static predicates the compiler has complete information about all +clauses and shapes of their head arguments. It is both desirable and +possible to take advantage of this information at compile time and so +we treat the case of static predicates separately. +% +We will do so with schemes of increasing effectiveness and +implementation complexity. + +\subsection{A simple WAM extension for any argument indexing} +%------------------------------------------------------------ +Let us initially consider the case where the predicates to index +consist only of Datalog facts. This is commonly the case for all +extensional database predicates where indexing is most effective and +called for. + +Refer to the example in Fig.~\ref{fig:carc}. +% +The indexing code of Fig.~\ref{fig:carc:index} incurs a small cost for +a call where the first argument is a variable (namely, executing the +\switchONconstant instruction) but the instruction pays off for calls +where the first argument is bound. On the other hand, for calls where +the first argument is a free variable and some other argument is +bound, a choice point will be created, the \TryRetryTrust chain will +be used, and execution will go through the code of all clauses. This +is clearly inefficient, more so for larger data sets. +% +We can do much better with the relatively simple scheme shown in +Fig.~\ref{fig:carc:jiti_single:before}. Immediately after the +\switchONconstant instruction, we can statically generate +\jitiONconstant (demand indexing) instructions, one for each remaining +argument. Recall that the entry point of the predicate is the +\switchONconstant instruction. 
The \jitiONconstant $r_i$ \instr{N A}
+instruction works as follows:
+\begin{itemize}
+\item if the argument register $r_i$ is a free variable, then
+  execution continues with the next instruction;
+\item otherwise, \JITI kicks in as follows. The abstract machine
+  scans the WAM code of the clauses and creates an index table for the
+  values of the corresponding argument. It can do so because the
+  instruction takes as arguments the number of clauses \instr{N} to
+  index and the arity \instr{A} of the predicate. (In our example, the
+  numbers 5 and 3.) For Datalog facts, this information is sufficient.
+  Because the WAM byte code for the clauses has a very regular
+  structure, the index table can be created very quickly. Upon its
+  creation, the \jitiONconstant instruction gets transformed to a
+  \switchONconstant. Again this is straightforward because the two
+  instructions have similar layouts in memory. Execution of the
+  abstract machine then continues with the \switchONconstant
+  instruction.
+\end{itemize}
+Figure~\ref{fig:carg:jiti_single:after} shows the index table $T_2$
+which is created for our example and how the indexing code looks after
+the execution of a call with mode \code{(out,in,?)}. Note that the
+\jitiONconstant instruction for argument register $r_2$ has been
+appropriately patched. The call that triggered \JITI and subsequent
+calls of the same mode will use table $T_2$. The index for the second
+argument has been created. 
+%------------------------------------------------------------------------------
+\begin{figure}
+  \centering
+  \begin{sf}
+  \begin{\wamcodesize}
+    \begin{tabular}{c@{\hspace*{2em}}c@{\hspace*{2em}}c}
+      \begin{tabular}{l}
+        \switchONconstant $r_1$ 5 $T_1$ \\
+        \switchONconstant $r_2$ 5 $T_2$ \\
+        \jitiONconstant $r_3$ 5 3 \\
+        \try $L_1$ \\
+        \retry $L_2$ \\
+        \retry $L_3$ \\
+        \retry $L_4$ \\
+        \trust $L_5$ \\
+      \end{tabular}
+      &
+      \begin{tabular}{r|c@{\ }|l|}
+        \Cline
+        $T_1$: & \multicolumn{2}{c|}{Hash Table Info}\\ \Cline\Cline
+        \ & \code{d1} & \try $L_1$ \\
+        \ & & \trust $L_2$ \\ \Cline
+        \ & \code{d2} & \try $L_3$ \\
+        \ & & \trust $L_4$ \\ \Cline
+        \ & \code{d3} & \jump $L_5$ \\
+        \Cline
+      \end{tabular}
+      &
+      \begin{tabular}{r|c@{\ }|l|}
+        \Cline
+        $T_2$: & \multicolumn{2}{|c|}{Hash Table Info}\\ \Cline\Cline
+        \ & \code{salmonella} & \try $L_1$ \\
+        \ & & \trust $L_3$ \\ \Cline
+        \ & \code{salmonella\_n} & \jump $L_2$ \\ \Cline
+        \ & \code{cytogen\_ca} & \try $L_4$ \\
+        \ & & \trust $L_5$ \\
+        \Cline
+      \end{tabular}
+    \end{tabular}
+  \end{\wamcodesize}
+  \end{sf}
+  \caption{WAM code after demand-driven indexing for argument 2;
+    table $T_2$ is generated dynamically}
+  \label{fig:carg:jiti_single:after}
+\end{figure}
+%------------------------------------------------------------------------------
+
+The main advantage of this scheme is its simplicity. The compiled code
+(Fig.~\ref{fig:carc:jiti_single:before}) is not significantly bigger
+than the code which a WAM-based compiler would generate
+(Fig.~\ref{fig:carc:index}) and, if \JITI turns out unnecessary
+during runtime (e.g., execution encounters only open calls or with only
+the first argument bound), the extra overhead is minimal: the
+execution of some \jitiONconstant instructions for the open call only.
+%
+In short, this is a simple scheme that allows for \JITI on \emph{any
+single} argument. At least for big sets of Datalog facts, we see
+little reason not to use this indexing scheme. 
+
+\paragraph*{Optimizations.}
+Because we are dealing with static code, there are opportunities for
+some easy optimizations. Suppose we statically determine that there
+will never be any calls with \code{in} mode for some arguments or that
+these arguments are not discriminating enough.\footnote{In our example,
+suppose the third argument of \code{has\_property/3} had the atom
+\code{p} as value throughout.} Then we can avoid generating
+\jitiONconstant instructions for them. Also, suppose we detect or
+heuristically decide that some arguments are more likely than others
+to be used in the \code{in} mode. Then we can simply place the
+\jitiONconstant instructions for these arguments \emph{before} the
+instructions for other arguments. This is possible since all indexing
+instructions take the argument register number as an argument; their
+order does not matter.
+
+\subsection{From any argument indexing to multi-argument indexing}
+%-----------------------------------------------------------------
+The scheme of the previous section gives us only single argument
+indexing. However, all the infrastructure we need is already in place.
+We can use it to obtain any fixed-order multi-argument \JITI in a
+straightforward way.
+
+Note that the compiler knows exactly the set of clauses that need to
+be tried for each query with a specific symbol in the first argument.
+This information is needed in order to construct, at compile time, the
+hash table $T_1$ of Fig.~\ref{fig:carc:index}. For multi-argument
+\JITI, instead of generating for each hash bucket only \TryRetryTrust
+instructions, the compiler can prepend appropriate demand indexing
+instructions. We illustrate this on our running example. The table
+$T_1$ contains four \jitiONconstant instructions: two for each of the
+remaining two arguments of hash buckets with more than one
+alternative. 
For hash buckets with none or only one alternative (e.g.,
+for \code{d3}'s bucket) there is obviously no need to resort to \JITI
+for the remaining arguments. Figure~\ref{fig:carc:jiti_multi} shows
+the state of the hash tables after the execution of queries
+\code{has\_property(C,salmonella,T)}, which creates table $T_2$, and
+\code{has\_property(d2,P,n)} which creates the $T_3$ table and
+transforms the \jitiONconstant instruction for \code{d2} and register
+$r_3$ to the appropriate \switchONconstant instruction.
+
+%------------------------------------------------------------------------------
+\begin{figure}[t]
+  \centering
+  \begin{sf}
+  \begin{\wamcodesize}
+    \begin{tabular}{@{}cccc@{}}
+      \begin{tabular}{l}
+        \switchONconstant $r_1$ 5 $T_1$ \\
+        \switchONconstant $r_2$ 5 $T_2$ \\
+        \jitiONconstant $r_3$ 5 3 \\
+        \try $L_1$ \\
+        \retry $L_2$ \\
+        \retry $L_3$ \\
+        \retry $L_4$ \\
+        \trust $L_5$ \\
+      \end{tabular}
+      &
+      \begin{tabular}{r|c@{\ }|l|}
+        \Cline
+        $T_1$: & \multicolumn{2}{c|}{Hash Table Info}\\ \Cline\Cline
+        \ & \code{d1} & \jitiONconstant $r_2$ 2 3 \\
+        \ & & \jitiONconstant $r_3$ 2 3 \\
+        \ & & \try $L_1$ \\
+        \ & & \trust $L_2$ \\ \Cline
+        \ & \code{d2} & \jitiONconstant $r_2$ 2 3 \\
+        \ & & \switchONconstant $r_3$ 2 $T_3$ \\
+        \ & & \try $L_3$ \\
+        \ & & \trust $L_4$ \\ \Cline
+        \ & \code{d3} & \jump $L_5$ \\
+        \Cline
+      \end{tabular}
+      &
+      \begin{tabular}{r|c@{\ }|l|}
+        \Cline
+        $T_2$: & \multicolumn{2}{|c|}{Hash Table Info}\\ \Cline\Cline
+        \ & \code{salmonella} & \jitiONconstant $r_3$ 2 3 \\
+        \ & & \try $L_1$ \\
+        \ & & \trust $L_3$ \\ \Cline
+        \ & \code{salmonella\_n} & \jump $L_2$ \\ \Cline
+        \ & \code{cytogen\_ca} & \jitiONconstant $r_3$ 2 3 \\
+        \ & & \try $L_4$ \\
+        \ & & \trust $L_5$ \\
+        \Cline
+      \end{tabular}
+      &
+      \begin{tabular}{r|c@{\ }|l|}
+        \Cline
+        $T_3$: & \multicolumn{2}{|c|}{Hash Table Info}\\ \Cline\Cline
+        \ & \code{p} & \jump $L_3$ \\ \Cline
+        \ & \code{n} & \jump $L_4$ \\
+        \Cline
+      \end{tabular}
+    \end{tabular}
+ 
\end{\wamcodesize} + \end{sf} + \caption{\JITI for all argument combinations; + table $T_1$ is static; $T_2$ and $T_3$ are generated dynamically} + \label{fig:carc:jiti_multi} +\end{figure} +%------------------------------------------------------------------------------ + +\paragraph{Implementation issues.} +In the \jitiONconstant instructions of Fig.~\ref{fig:carc:jiti_multi} +notice the integer 2 which denotes the number of clauses that the +instruction will index. Using this number an index table of +appropriate size will be created, such as $T_3$. To fill this table we +need information about the clauses to index and the symbols to hash +on. The clauses can be obtained by scanning the labels of the +\TryRetryTrust instructions following \jitiONconstant; the symbols by +looking at appropriate byte code offsets (based on the argument +register number) from these labels. In our running example, the +symbols can be obtained by looking at the second argument of the +\getcon instruction whose argument register is $r_2$. In the loaded +bytecode, assuming the argument register is represented in one byte, +these symbols are found $sizeof(\getcon) + sizeof(opcode) + 1$ bytes +away from the clause label; see Fig.~\ref{fig:carc:clauses}. Thus, +multi-argument \JITI is easy to get and the creation of index tables +can be extremely fast when indexing Datalog facts. + +\subsection{Beyond Datalog and other implementation issues} +%---------------------------------------------------------- +Indexing on demand clauses with function symbols is not significantly +more difficult. The scheme we have described is applicable but +requires the following extensions: +\begin{enumerate} +\item Besides \jitiONconstant we also need \jitiONterm and + \jitiONstructure instructions. These are the \JITI counterparts of + the WAM's \switchONterm and \switchONstructure. 
+\item Because the byte code for the clause heads does not necessarily + have a regular structure, the abstract machine needs to be able to + ``walk'' the byte code instructions and recover the symbols on which + indexing will be based. Writing such a code walking procedure is not + hard.\footnote{In many Prolog systems, a procedure with similar + functionality often exists for the disassembler, the debugger, etc.} +\item Indexing on a position that contains unconstrained variables + for some clauses is tricky. The WAM needs to group clauses in this + case and without special treatment creates two choice points for + this argument (one for the variables and one per each group of + clauses). However, this issue and how to deal with it is well-known + by now. Possible solutions to it are described in a 1987 paper by + Carlsson~\cite{FreezeIndexing@ICLP-87} and can be readily adapted to + \JITI. Alternatively, in a simple implementation, we can skip \JITI + for positions with variables in some clauses. +\end{enumerate} +Before describing \JITI more formally, we remark on the following +design decisions whose rationale may not be immediately obvious: +\begin{itemize} +\item By default, only table $T_1$ is generated at compile time (as in + the WAM) and the additional index tables $T_2, T_3, \ldots$ are + generated dynamically. This is because we do not want to increase + compiled code size unnecessarily (i.e., when there is no demand for + these indices). +\item On the other hand, we generate \jitiSTAR instructions at compile + time for the head arguments.\footnote{The \jitiSTAR instructions for + the $T_1$ table can be generated either by the compiler or by the + loader.} This does not noticeably increase the generated byte code + but it greatly simplifies code loading. Notice that a nice property + of the scheme we have described is that the loaded byte code can be + patched \emph{without} the need to move any instructions. 
+% The indexing tables are typically not intersperced with the byte code. +\item Finally, one may wonder why the \jitiSTAR instructions create + the dynamic index tables with an additional code walking pass + instead of piggy-backing on the pass which examines all clauses via + the main \TryRetryTrust chain. Main reasons are: 1) in many cases + the code walking can be selective and guided by offsets and 2) by + first creating the index table and then using it we speed up the + execution of the queries encountered during runtime and often avoid + unnecessary choice point creations. +\end{itemize} +This is \JITI as we have implemented it. +% in one of our Prolog systems. +However, we note that these decisions are orthogonal to the main idea +and are under compiler control. If, for example, analysis determines +that some argument sequences will never demand indexing we can simply +avoid generation of \jitiSTAR instructions for these. Similarly, if we +determine that some argument sequences will definitely demand indexing +we can speed up execution by generating the appropriate index tables +at compile time instead of at runtime. + +\subsection{Demand-driven index construction and its properties} +%--------------------------------------------------------------- +The idea behind \JITI can be captured in a single sentence: \emph{we +can generate every index we need during program execution when this +index is demanded}. Subsequent uses of these indices can speed up +execution considerably more than the time it takes to construct them +(more on this below) so this runtime action makes sense.\footnote{In +fact, because choice points are expensive in the WAM, \JITI can speed +up even the execution of the query that triggers the process, not only +subsequent queries.} +% +We describe the process of demand-driven index construction. 
+ +% \subsubsection{Demand-driven index construction} +%------------------------------------------------- +Let $p/k$ be a predicate with $n$ clauses. +% +At a high level, its indices form a tree whose root is the entry point +of the predicate. For simplicity, assume that the root node of the +tree and the interior nodes corresponding to the index table for the +first argument have been constructed at compile time. Leaves of this +tree are the nodes containing the code for the clauses of the +predicate and each clause is identified by a unique label \mbox{$L_i, +1 \leq i \leq n$}. Execution always starts at the first instruction of +the root node and follows Algorithm~\ref{alg:construction}. The +algorithm might look complicated but is actually quite simple. +% +Each non-leaf node contains a sequence of byte code instructions with +groups of the form \mbox{$\langle I_1, \ldots, I_m, T_1, \ldots, T_l +\rangle, 0 \leq m \leq k, 1 \leq l \leq n$} where each of the $I$ +instructions, if any, is either a \switchSTAR or a \jitiSTAR +instruction and the $T$ instructions are either a sequence of +\TryRetryTrust instructions (if $l > 1$) or a \jump instruction (if +\mbox{$l = 1$}). Step~2.2 dynamically constructs an index table $\cal +T$ whose buckets are the newly created interior nodes in the tree. +Each bucket associated with a single clause contains a \jump to the +label of that clause. Each bucket associated with many clauses starts +with the $I$ instructions which are yet to be visited and continues +with a \TryRetryTrust chain pointing to the clauses. When the index +construction is done, the instruction mutates to a \switchSTAR WAM +instruction. 
+%-------------------------------------------------------------------------
+\begin{Algorithm}[t]
+  \caption{Actions of the abstract machine with \JITI}
+  \label{alg:construction}
+  \begin{enumerate}
+  \item if the current instruction $I$ is a \switchSTAR, \try, \retry,
+    \trust or \jump, the action is as in the WAM;
+  \item if the current instruction $I$ is a \jitiSTAR with arguments $r,
+    l$, and $k$ where $r$ is a register then
+    \begin{enumerate}
+    \item[2.1] if register $r$ contains a variable, the action is simply to
+      \instr{goto} the next instruction in the node;
+    \item[2.2] if register $r$ contains a value $v$, the action is to
+      dynamically construct the index as follows:
+      \begin{itemize}
+      \item[2.2.1] collect the subsequent instructions in a list $\cal I$
+        until the next instruction is a \try;\footnote{Note that there
+          will always be a \try following a \jitiSTAR instruction.}
+      \item[2.2.2] for each label $L$ in the \TryRetryTrust chain
+        inspect the code of the clause with label $L$ to find the
+        symbol~$c$ associated with register $r$ in the clause; (This
+        step creates a list of $\langle c, L \rangle$ pairs.)
+ \item[2.2.3] create an index table $\cal T$ out of these pairs as + follows: + \begin{itemize} + \item if $I$ is a \jitiONconstant or a \jitiONstructure then + create an index table for the symbols in the list of pairs; + each entry of the table is identified by a symbol $c$ and + contains: + \begin{itemize} + \item the instruction \jump $L_c$ if $L_c$ is the only label + associated with $c$; + \item the sequence of instructions obtained by appending to + $\cal I$ a \TryRetryTrust chain for the sequence of labels + $L'_1, \ldots, L'_l$ that are associated with $c$ + \end{itemize} + \item if $I$ is a \jitiONterm then + \begin{itemize} + \item partition the sequence of labels $\cal L$ in the list + of pairs into sequences of labels ${\cal L}_c, {\cal L}_l$ + and ${\cal L}_s$ for constants, lists and structures, + respectively; + \item for each of the four sequences ${\cal L}, {\cal L}_c, + {\cal L}_l, {\cal L}_s$ of labels create code as follows: + \begin{itemize} + \item the instruction \fail if the sequence is empty; + \item the instruction \jump $L$ if $L$ is the only label in + the sequence; + \item the sequence of instructions obtained by appending to + $\cal I$ a \TryRetryTrust chain for the current sequence + of labels; + \end{itemize} + \end{itemize} + \end{itemize} + \item[2.2.4] transform the \jitiSTAR $r, l, k$ instruction to + a \switchSTAR $r, l, {\cal T}$ instruction; and + \item[2.2.5] continue execution with this instruction. + \end{itemize} + \end{enumerate} + \end{enumerate} +\end{Algorithm} +%------------------------------------------------------------------------- + +\paragraph*{Complexity properties.} +Index construction during runtime does not change the complexity of +query execution. First, note that each demanded index table will be +constructed at most once. 
Also, a \jitiSTAR instruction will be +encountered only in cases where execution would examine all clauses in +the \TryRetryTrust chain.\footnote{This statement is possibly not +valid in the presence of Prolog cuts.} The construction visits these +clauses \emph{once} and then creates the index table in time linear in +the number of clauses as one pass over the list of $\langle c, L +\rangle$ pairs suffices. After index construction, execution will +visit a subset of these clauses as the index table will be consulted. +%% Finally, note that the maximum number of \jitiSTAR instructions +%% that will be visited for each query is bounded by the maximum +%% number of index positions (symbols) in the clause heads of the +%% predicate. +Thus, in cases where \JITI is not effective, execution of a query will +at most double due to dynamic index construction. In fact, this worst +case is pessimistic and extremely unlikely in practice. On the other +hand, \JITI can change the complexity of query evaluation from $O(n)$ +to $O(1)$ where $n$ is the number of clauses. + +\subsection{More implementation choices} +%--------------------------------------- +The observant reader has no doubt noticed that +Algorithm~\ref{alg:construction} provides multi-argument indexing but +only for the main functor symbol of arguments. For clauses with +compound terms that require indexing in their sub-terms we can either +employ a program transformation like \emph{unification +factoring}~\cite{UnifFact@POPL-95} at compile time or modify the +algorithm to consider index positions inside compound terms. This is +relatively easy to do but requires support from the register allocator +(passing the sub-terms of compound terms in appropriate argument +registers) and/or a new set of instructions. Due to space limitations +we omit further details. 
+ +Algorithm~\ref{alg:construction} relies on a procedure that inspects +the code of a clause and collects the symbols associated with some +particular index position (step~2.2.2). If we are satisfied with +looking only at clause heads, this procedure needs to understand only +the structure of \instr{get} and \instr{unify} instructions. Thus, it +is easy to write. At the cost of increased implementation complexity, +this step can of course take into account other information that may +exist in the body of the clause (e.g., type tests such as +\code{var(X)}, \code{atom(X)}, aliasing constraints such as \code{X = +Y}, numeric constraints such as \code{X > 0}, etc). + +A reasonable concern for \JITI is increased memory consumption during +runtime due to the creation of index tables. In our experience, this +does not seem to be a problem in practice since most applications do +not have demand for indexing on many argument combinations. In +applications where it does become a problem or when running in an +environment with limited memory, we can easily put a bound on the size +of index tables, either globally or for each predicate separately. For +example, the \jitiSTAR instructions can either become inactive when +this limit is reached, or better yet we can recover the space of some +tables. To do so, we can employ any standard recycling algorithm +(e.g., least recently used) and reclaim the memory of index tables +that are no longer in use. This is easy to do by reverting the +corresponding \switchSTAR instructions back to \jitiSTAR instructions. +If the indices are demanded again at a time when memory is available, +they can simply be regenerated. + + +\section{Demand-Driven Indexing of Dynamic Predicates} \label{sec:dynamic} +%========================================================================= +We have so far lived in the comfortable world of static predicates, +where the set of clauses to index is fixed and the compiler can take +advantage of this knowledge. 
Dynamic code introduces several +complications: +\begin{itemize} +\item We need mechanisms to update multiple indices when new clauses + are asserted or retracted. In particular, we need the ability to + expand and possibly shrink multiple code chunks after code updates. +\item We do not know a priori which are the best index positions and + cannot determine whether indexing on some arguments is avoidable. +\item Supporting the so-called logical update (LU) semantics of the + ISO Prolog standard becomes harder. +\end{itemize} +We will briefly discuss possible ways of addressing these issues. +However, we note that Prolog systems typically provide indexing for +dynamic predicates and thus already deal in some way or another with +these issues; \JITI makes the problems more involved but not +fundamentally different than those with only first argument indexing. + +The first complication suggests that we should allocate memory for +dynamic indices in separate chunks, so that these can be expanded and +deallocated independently. Indeed, this is what we do. +% +Regarding the second complication, in the absence of any other +information, the only alternative is to generate indices for all +arguments. As optimizations, we can avoid indexing for predicates with +only one clause (these are often used to simulate global variables) +and we can exclude arguments where some clause has a variable. + +Under logical update semantics calls to dynamic predicates execute in a +``snapshot'' of the corresponding predicate. In other words, each call +sees the clauses that existed at the time when the call was made, even if +some of the clauses were later deleted or new clauses were asserted. +If several calls are alive in the stack, several snapshots will be +alive at the same time. The standard solution to this problem is to +use time stamps to tell which clauses are \emph{live} for which calls. 
+
+%
+This solution complicates freeing index tables because: (1) an index
+table holds references to clauses, and (2) the table may be in use,
+that is, it may be accessible from the execution stacks. An index
+table thus is killed in several steps:
+\begin{enumerate}
+\item Detach the index table from the indexing tree.
+\item Recursively \emph{kill} every child of the current table:
+  if the current table is killed, so are its children.
+\item Wait until the table is not in use, that is, it is not pointed
+  to by anyone.
+\item Walk the table and release any references it may hold.
+\item Physically recover space.
+\end{enumerate}
+%% It is interesting to observe that at the end of an \emph{itemset-node}
+%% the emulator can remove references to the current index, hence freeing
+%% the code it is currently executing. This happens on the last member of
+%% the \emph{itemset-node}, so the emulator reads all the instruction's
+%% arguments before executing the instruction.
+
+
+\section{Implementation in XXX and in YAP} \label{sec:impl}
+%==========================================================
+The implementation of \JITI in XXX follows a variant of the scheme
+presented in Sect.~\ref{sec:static}. The compiler uses heuristics to
+determine the best argument to index on (i.e., this argument is not
+necessarily the first) and employs \switchSTAR instructions for this
+task. It also statically generates \jitiONconstant instructions for
+other arguments that are good candidates for \JITI.
+Currently, an argument is considered a good candidate if it has only
+constants or only structure symbols in all clauses. Thus, XXX uses
+only \jitiONconstant and \jitiONstructure instructions, never a
+\jitiONterm. Also, XXX does not perform \JITI inside structure
+symbols.\footnote{Instead, it prompts its user to request unification
+factoring for predicates that look likely to benefit from indexing
+inside compound terms. 
The user can then use the appropriate compiler +directive for these predicates.} For dynamic predicates, \JITI is +employed only if they consist of Datalog facts; if a clause which is +not a Datalog fact is asserted, all dynamically created index tables +for the predicate are simply removed and the \jitiONconstant +instruction becomes a \instr{noop}. All this is done automatically, +but the user can disable \JITI in compiled code using an appropriate +compiler option. + +YAP implements \JITI since version 5. The current implementation +supports static code, dynamic code, and the internal database. It +differs from the algorithm presented in Sect.~\ref{sec:static} in that +\emph{all indexing code is generated on demand}. Thus, YAP cannot +assume that a \jitiSTAR instruction is followed by a \TryRetryTrust +chain. Instead, by default YAP has to search the whole predicate for +clauses that match the current position in the indexing code. Doing so +for every index expansion was found to be very inefficient for larger +relations: in such cases YAP will maintain a list of matching clauses +at each \jitiSTAR node. Indexing dynamic predicates in YAP follows +very much the same algorithm as static indexing: the key idea is that +most nodes in the index tree must be allocated separately so that they +can grow or shrink independently. YAP can index arguments where some +clauses have unconstrained variables, but only for static predicates, +as in dynamic code this would complicate support for logical update +semantics. + +YAP uses the term JITI (Just-In-Time Indexing) to refer to \JITI. In +the next section we will take the liberty to use this term as a +convenient abbreviation. + +\section{Performance Evaluation} \label{sec:perf} +%================================================ +We evaluate \JITI on a set of benchmarks and applications. +Throughout, we compare performance of JITI with first argument +indexing. 
For the benchmarks of Sect.~\ref{sec:perf:ineffective}
+and~\ref{sec:perf:effective} which involve both systems, we used a
+2.4~GHz P4-based laptop with 512~MB of memory running Linux.
+% and report times in milliseconds.
+For the benchmarks of Sect.~\ref{sec:perf:ILP} which involve
+YAP~5.1.2 only, we used an 8-node cluster, where each node is a
+dual-core AMD~2600+ machine with 2~GB of memory.
+% and report times in seconds.
+
+\subsection{Performance of \JITI when ineffective} \label{sec:perf:ineffective}
+%------------------------------------------------------------------------------
+In some programs, \JITI does not trigger\footnote{In XXX only; as
+mentioned in Sect.~\ref{sec:impl} even 1st argument indexing is
+generated on demand when JITI is used in YAP.} or might trigger but
+have no effect other than an overhead due to runtime index
+construction. We therefore wanted to measure this overhead.
+%
+As both systems support tabling, we decided to use tabling benchmarks
+because they are small and easy to understand, and because they are a
+bad case for JITI in the following sense: tabling avoids generating
+repetitive queries and the benchmarks operate over extensional
+database (EDB) predicates of size approximately equal to the size of
+the program. We used \compress, a tabled program that solves a puzzle
+from an ICLP Prolog programming competition. The other benchmarks are
+different variants of tabled left, right and doubly recursive
+transitive closure over an EDB predicate forming a chain of size shown
+in Table~\ref{tab:ineffective} in parentheses. For each variant of
+transitive closure, we issue two queries: one with mode
+\code{(in,out)} and one with mode \code{(out,out)}.
+%
+For YAP, indices on the first argument and \TryRetryTrust chains are
+built on all benchmarks under \JITI.
+%
+For XXX, \JITI triggers on no benchmark but the \jitiONconstant
+instructions are executed for the three \bench{tc\_?\_oo} benchmarks. 
+
+%
+As can be seen in Table~\ref{tab:ineffective}, \JITI, even when
+ineffective, incurs a runtime overhead that is at the level of noise
+and goes mostly unnoticed.
+%
+We also note that our aim here is \emph{not} to compare the two
+systems, so the \textbf{YAP} and \textbf{XXX} columns should be read
+separately.
+
+\vspace*{-0.5em}
+\subsection{Performance of \JITI when effective} \label{sec:perf:effective}
+%--------------------------------------------------------------------------
+On the other hand, when \JITI is effective, it can significantly
+improve runtime performance. We use the following programs and
+applications:
+%% \TODO{For the journal version we should also add FSA benchmarks
+%% (\bench{k963}, \bench{dg5} and \bench{tl3})}
+%------------------------------------------------------------------------------
+\begin{small}
+\begin{description}
+\item[\sgCyl] The same generation DB benchmark on a $24 \times 24
+  \times 2$ cylinder. We issue the open query.
+\item[\muta] A computationally intensive application where most
+  predicates are defined intensionally.
+\item[\pta] A tabled logic program implementing Andersen's points-to
+  analysis~\cite{anderson-phd}. A medium-sized imperative program is
+  encoded as a set of facts (about 16,000) and properties of interest
+  are encoded using rules. Program properties can then be determined
+  by checking the closure of these rules.
+\item[\tea] Another analyzer using tabling to implement Andersen's
+  points-to analysis. The analyzed program, the \texttt{javac} SPEC
+  benchmark, is encoded in a file of 411,696 facts (62,759,581 bytes
+  in total). As its compilation exceeds the limits of the XXX compiler
+  (w/o JITI), we run this benchmark only in YAP. 
+\end{description} +\end{small} +%------------------------------------------------------------------------------ + +%------------------------------------------------------------------------------ +\begin{table}[t] + \centering + \caption{Performance of some benchmarks with 1st vs. \JITI (times in msecs)} + \setlength{\tabcolsep}{2.5pt} + \subfigure[When JITI is ineffective]{ + \label{tab:ineffective} + \begin{tabular}[b]{|l||r|r||r|r|} \hline + & \multicolumn{2}{|c||}{\bf YAP} & \multicolumn{2}{|c|}{\bf XXX} \\ + \cline{2-5} + Benchmark & 1st & JITI & 1st & JITI \\ + \hline + \tcLio (8000) & 13 & 14 & 4 & 4 \\ + \tcRio (2000) & 1445 & 1469 & 614 & 615 \\ + \tcDio ( 400) & 3208 & 3260 & 2338 & 2300 \\ + \tcLoo (2000) & 3935 & 3987 & 2026 & 2105 \\ + \tcRoo (2000) & 2841 & 2952 & 1502 & 1512 \\ + \tcDoo ( 400) & 3735 & 3805 & 4976 & 4978 \\ + \compress & 3614 & 3595 & 2875 & 2848 \\ + \hline + \end{tabular} + } + \subfigure[When \JITI is effective]{ + \label{tab:effective} + \begin{tabular}[b]{|l||r|r|r||r|r|r|} \hline + & \multicolumn{3}{|c||}{\bf YAP} & \multicolumn{3}{|c|}{\bf XXX} \\ + \cline{2-7} + Benchmark & 1st & JITI &{\bf ratio}& 1st & JITI &{\bf ratio}\\ + \hline + \sgCyl & 2,864 & 24 & $119\times$& 2,390 & 28 & $85\times$\\ + \muta & 30,057 &16,782 &$1.79\times$&26,314 &21,574 &$1.22\times$\\ + \pta & 5,131 & 188 & $27\times$& 4,442 & 279 & $16\times$\\ + \tea &1,478,813 &54,616 & $27\times$& --- & --- & --- \\ + \hline + \end{tabular} + } +\end{table} +%------------------------------------------------------------------------------ + +As can be seen in Table~\ref{tab:effective}, \JITI significantly +improves the performance of these applications. In \muta, which spends +most of its time in recursive predicates, the speed up is only $79\%$ +in YAP and~$22\%$ in XXX. The remaining benchmarks execute several +times (from~$16$ up to~$119$) faster. 
It is important to realize that +\emph{these speedups are obtained automatically}, i.e., without any +programmer intervention or by using any compiler directives, in all +these applications. + +We analyze the \sgCyl program that has the biggest speedup in both +systems and is the only one whose code is small enough to be shown. +With the open call to \texttt{same\_generation/2}, most work in this +benchmark consists of calling \texttt{cyl/2} facts in three different +modes: with both arguments unbound, with the first argument bound, or +with only the second argument bound. Demand-driven indexing improves +performance in the last case only, but this improvement makes a big +difference in this benchmark. + +\begin{alltt}\small + same_generation(X,X) :- cyl(X,_). + same_generation(X,X) :- cyl(_,X). + same_generation(X,Y) :- cyl(X,Z), same_generation(Z,W), cyl(Y,W).\end{alltt} + +%% Our experience with the indexing algorithm described here shows a +%% significant performance improvement over the previous indexing code in +%% our system. Quite often, this has allowed us to tackle applications +%% which previously would not have been feasible. + +\subsection{Performance of \JITI on ILP applications} \label{sec:perf:ILP} +%------------------------------------------------------------------------- +The need for \JITI was originally noticed in inductive logic +programming applications. These applications tend to issue ad hoc +queries during execution and thus their indexing requirements cannot +be determined at compile time. On the other hand, they operate on lots +of data, so memory consumption is a reasonable concern. We evaluate +JITI's time and space performance on some learning tasks using the +Aleph system~\cite{ALEPH} and the datasets of +Fig.~\ref{fig:ilp:datasets} which issue simple queries in an +extensional database. Several of these datasets are standard in the +Machine Learning literature. 
+
+\paragraph*{Time performance.}
+We compare times for 10 runs of the saturation/refinement cycle of the
+ILP system; see Table~\ref{tab:ilp:time}.
+%% The \Krki datasets have small search spaces and small databases, so
+%% they achieve the same performance under both versions: there is no
+%% slowdown.
+The \Mesh and \Pyrimidines applications are the only ones that do not
+benefit much from indexing in the database; they do benefit, though,
+from indexing in the dynamic representation of the search space, as
+their running times improve somewhat with \JITI.
+
+The \BreastCancer and \GeneExpr applications use data in 1NF (i.e.,
+unstructured data). The speedup here is mostly from multiple argument
+indexing. \BreastCancer is particularly interesting. It consists of 40
+binary relations with 65k elements each, where the first argument is
+the key. We know that most calls have the first argument bound, hence
+indexing was not expected to matter much. Instead, the results show
+\JITI to improve running time by more than an order of magnitude. Like in
+\sgCyl, this suggests that even a small percentage of badly indexed
+calls can end up dominating runtime.
+
+\IEProtein and \Thermolysin are example applications that manipulate
+structured data. \IEProtein is the largest dataset we consider, and
+indexing is absolutely critical. The speedup is not just impressive;
+it is simply not possible to run the application in reasonable time
+with only first argument indexing. \Thermolysin is smaller and
+performs some computation per query, but even so, \JITI improves its
+performance by an order of magnitude. The remaining benchmarks improve
+from one to more than two orders of magnitude. 
+ +%------------------------------------------------------------------------------ +\begin{table}[t] + \centering + \caption{Time and space performance of JITI + on Inductive Logic Programming datasets} + \label{tab:ilp} + \setlength{\tabcolsep}{3pt} + \subfigure[Time (in seconds)]{\label{tab:ilp:time} + \begin{tabular}{|l||r|r|r||} \hline + & \multicolumn{3}{|c||}{Time} \\ + \cline{2-4} + Benchmark & 1st & JITI &{\bf ratio} \\ + \hline + \BreastCancer & 1,450 & 88 & $16\times$ \\ + \Carcino & 17,705 & 192 & $92\times$ \\ + \Choline & 14,766 & 1,397 & $11\times$ \\ + \GeneExpr & 193,283 & 7,483 & $26\times$ \\ + \IEProtein & 1,677,146 & 2,909 & $577\times$ \\ +%% \Krki & 0.3 & 0.3 & $1$ \\ +%% \KrkiII & 1.3 & 1.3 & $1$ \\ + \Mesh & 4 & 3 & $1.3\times$ \\ + \Pyrimidines & 487,545 & 253,235 & $1.9\times$ \\ + \Susi & 105,091 & 307 & $342\times$ \\ + \Thermolysin & 50,279 & 5,213 & $10\times$ \\ + \hline + \end{tabular} + } + \subfigure[Memory usage (in KB)]{\label{tab:ilp:memory} + \begin{tabular}{||r|r|r|r||} \hline + \multicolumn{2}{||c|}{Static code} + & \multicolumn{2}{|c||}{Dynamic code} \\ + \hline + \multicolumn{1}{||c|}{Clauses} & \multicolumn{1}{c}{Index} + & \multicolumn{1}{|c|}{Clauses} & \multicolumn{1}{c||}{Index}\\ + \hline + 60,940 & 46,887 & 630 & 14 \\ + 1,801 & 2,678 & 13,512 & 942 \\ + 666 & 174 & 3,172 & 174 \\ + 46,726 & 22,629 & 116,463 & 9,015 \\ + 146,033 & 129,333 & 53,423 & 1,531 \\ +%% 678 & 117 & 2,047 & 24 \\ +%% 1,866 & 715 & 2,055 & 26 \\ + 802 & 161 & 2,149 & 109 \\ + 774 & 218 & 25,840 & 12,291 \\ + 5,007 & 2,509 & 4,497 & 759 \\ + 2,317 & 929 & 116,129 & 7,064 \\ + \hline + \end{tabular} + } +\end{table} +%------------------------------------------------------------------------------ + +%------------------------------------------------------------------------------ +\begin{figure} + \hrule \ \\[-2em] + \begin{description} +%% \item[\Krki] tries to learn rules from a small database of chess end-games; + \item[\GeneExpr] learns rules 
for yeast gene activity given a
+    database of genes, their interactions, and micro-array gene
+    expression data; %~\cite{Regulatory@ILP-06};
+  \item[\BreastCancer] processes real-life patient reports towards
+    predicting whether an abnormality may be
+    malignant; %~\cite{DavisBDPRCS@IJCAI-05-short};
+  \item[\IEProtein] processes information extraction from paper
+    abstracts to search proteins;
+  \item[\Susi] learns from shopping patterns;
+  \item[\Mesh] learns rules for finite-element mesh design;
+  \item[\Carcino, \Choline, \Pyrimidines] try to predict chemical
+    properties of compounds and store them as tables;
+  \item[\Thermolysin] also manipulates chemical compounds but learns
+    from the 3D-structure of a molecule's conformations.
+  \end{description}
+  \hrule
+  \caption{Description of the ILP datasets used in the performance
+    comparison of Table~\ref{tab:ilp}}
+  \label{fig:ilp:datasets}
+\end{figure}
+%------------------------------------------------------------------------------
+
+\paragraph*{Space performance.}
+Table~\ref{tab:ilp:memory} shows memory usage when using \JITI. The
+table presents data obtained at a point near the end of execution;
+memory usage should be at or close to the maximum. These applications
+use a mixture of static and dynamic predicates and we show their
+memory usage separately. On static predicates, memory usage varies
+widely, from only 10\% to the worst case, \Carcino, where the index
+tables take more space than the original program. Hash tables dominate
+usage in \IEProtein and \Susi, whereas \TryRetryTrust chains dominate
+in \BreastCancer. In most other cases no single component dominates
+memory usage. Memory usage for dynamic data is shown in the last two
+columns; note that dynamic data is mostly used to store the search
+space. One can observe that there is a much lower overhead in this
+case. 
A more detailed analysis shows that most space is occupied by +the hash tables and by internal nodes of the tree, and that relatively +little space is occupied by \TryRetryTrust chains, suggesting that +\JITI is behaving well in practice. + + +\section{Concluding Remarks} +%=========================== +Motivated by the needs of applications in the areas of inductive +logic programming, program analysis, deductive databases, etc.\ to +access large datasets efficiently, we have described a novel but also +simple idea: \emph{indexing Prolog clauses on demand during program +execution}. +% +Given the impressive speedups this idea can provide for many LP +applications, we are a bit surprised similar techniques have not been +explored before. In general, Prolog systems have been reluctant to +perform code optimizations during runtime and our feeling is that LP +implementation has been left a bit behind. We hold that this +should change. +% +Indeed, we see \JITI as only a first, very successful, step towards +effective runtime optimization of logic programs.\footnote{The good +results obtained with JITI have motivated recent work on +Just-In-Time compilation of Prolog~\cite{yapc}.} + +As presented, \JITI is a hybrid technique: index generation occurs +during runtime but is partly guided by the compiler, because we want +to combine it with compile-time WAM-style indexing. More flexible +schemes are of course possible. For example, index generation can be +fully dynamic (as in YAP), combined with user declarations, or driven +by static analysis to be even more selective or go beyond fixed-order +indexing. +% +Last, observe that \JITI fully respects Prolog semantics. Better +performance can be achieved in the context of one solution +computations, or in the context of tabling where order of clauses and +solutions does not matter and repeated solutions are discarded. 
+
+\paragraph{Acknowledgments}
+
+This work is dedicated to the memory of our friend and colleague
+Ricardo Lopes. We miss you! V\'{\i}tor Santos Costa was partially
+supported by CNPq and would like to acknowledge support received while
+visiting at UW-Madison and the support of the YAP user community.
+This work has been partially supported by MYDDAS (POSC/EIA/59154/2004)
+and by funds granted to LIACC through the Programa de Financiamento
+Plurianual, Funda\c{c}\~{a}o para a Ci\^{e}ncia e Tecnologia and Programa POSC.
+
+%==============================================================================
+\bibliographystyle{splncs}
+\bibliography{lp}
+%==============================================================================
+
+\end{document}