% yap-6.3/docs/index/iclp07.tex

%==============================================================================
\documentclass{llncs}
%------------------------------------------------------------------------------
\usepackage[latin1]{inputenc}
\usepackage{float}
\usepackage{alltt}
\usepackage{xspace}
\usepackage{epsfig}
\usepackage{wrapfig}
\usepackage{subfigure}

% Use `ptm' as the default roman (serif) font family for the whole document.
\renewcommand{\rmdefault}{ptm}
%------------------------------------------------------------------------------
% \Paragraph{<title>}: a \paragraph heading preceded by a negative vertical
% skip of .5em, i.e. a run-in heading that sits closer to the preceding text.
\newcommand{\Paragraph}[1]{\vspace*{-.5em}\paragraph{#1}}
%------------------------------------------------------------------------------
% Declare (via the float package) a new float type `Algorithm' in the `ruled'
% style, with default placement `ht' and `lop' as its list-file extension.
\floatstyle{ruled}
\newfloat{Algorithm}{ht}{lop}
%------------------------------------------------------------------------------
% Name (without backslash) of the font-size environment used for WAM code in
% the figures; it is used as \begin{\wamcodesize} ... \end{\wamcodesize}.
\newcommand{\wamcodesize}{scriptsize}
% \code{<text>}: Prolog source text, set in typewriter.
\newcommand{\code}[1]{\texttt{#1}}
% \instr{<text>}: name of an abstract machine instruction, set in sans serif.
\newcommand{\instr}[1]{\textsf{#1}}
% Shorthands for individual instructions.  Each ends in \xspace so that it can
% be written in running text without a trailing `\ ' or `{}'; names containing
% hyphens or underscores are additionally wrapped in \mbox so that they are
% never broken across lines.
\newcommand{\try}{\instr{try}\xspace}
\newcommand{\retry}{\mbox{\instr{retry}}\xspace}
\newcommand{\trust}{\instr{trust}\xspace}
\newcommand{\TryRetryTrust}{\mbox{\instr{try-retry-trust}}\xspace}
\newcommand{\fail}{\instr{fail}\xspace}
\newcommand{\jump}{\instr{jump}\xspace}
% Demand-driven indexing instructions (dindex_on_*) and the switch_on_*
% instructions they are paired with in the text; the *STAR variants print the
% generic name with a literal `*'.
\newcommand{\jitiSTAR}{\mbox{\instr{dindex\_on\_*}}\xspace}
\newcommand{\switchSTAR}{\mbox{\instr{switch\_on\_*}}\xspace}
\newcommand{\jitiONterm}{\mbox{\instr{dindex\_on\_term}}\xspace}
\newcommand{\jitiONconstant}{\mbox{\instr{dindex\_on\_constant}}\xspace}
\newcommand{\jitiONstructure}{\mbox{\instr{dindex\_on\_structure}}\xspace}
\newcommand{\switchONterm}{\mbox{\instr{switch\_on\_term}}\xspace}
\newcommand{\switchONconstant}{\mbox{\instr{switch\_on\_constant}}\xspace}
\newcommand{\switchONstructure}{\mbox{\instr{switch\_on\_structure}}\xspace}
\newcommand{\getcon}{\mbox{\instr{get\_constant}}\xspace}
\newcommand{\proceed}{\instr{proceed}\xspace}
% \Cline: horizontal rule under columns 2-3 only; used in the hash table
% pictures, whose first column holds the table label.
\newcommand{\Cline}{\cline{2-3}}
% \JITI: the name of the technique as it appears in running text.  Note that it
% expands to lower-case words, so it cannot start a sentence as is.
\newcommand{\JITI}{demand-driven indexing\xspace}
%------------------------------------------------------------------------------
% \bench{<name>}: name of a benchmark program or dataset, set in bold sans
% serif.  The shorthands below all end in \xspace so they can be used directly
% in running text.
\newcommand{\bench}[1]{\textbf{\textsf{#1}}}
% Benchmark program names.
\newcommand{\tcLio}{\bench{tc\_l\_io}\xspace}
\newcommand{\tcRio}{\bench{tc\_r\_io}\xspace}
\newcommand{\tcDio}{\bench{tc\_d\_io}\xspace}
\newcommand{\tcLoo}{\bench{tc\_l\_oo}\xspace}
\newcommand{\tcRoo}{\bench{tc\_r\_oo}\xspace}
\newcommand{\tcDoo}{\bench{tc\_d\_oo}\xspace}
\newcommand{\compress}{\bench{compress}\xspace}
\newcommand{\sgCyl}{\bench{sg\_cyl}\xspace}
\newcommand{\muta}{\bench{muta}\xspace}
\newcommand{\pta}{\bench{pta}\xspace}
\newcommand{\tea}{\bench{tea}\xspace}
%------------------------------------------------------------------------------
% Dataset names (the two Krki entries are currently disabled).
\newcommand{\BreastCancer}{\bench{BreastCancer}\xspace}
\newcommand{\Carcino}{\bench{Carcinogenesis}\xspace}
\newcommand{\Choline}{\bench{Choline}\xspace}
\newcommand{\GeneExpr}{\bench{GeneExpression}\xspace}
\newcommand{\IEProtein}{\bench{IE-Protein\_Extraction}\xspace}
%\newcommand{\Krki}{\bench{Krki}\xspace}
%\newcommand{\KrkiII}{\bench{Krki~II}\xspace}
\newcommand{\Mesh}{\bench{Mesh}\xspace}
\newcommand{\Pyrimidines}{\bench{Pyrimidines}\xspace}
\newcommand{\Susi}{\bench{Susi}\xspace}
\newcommand{\Thermolysin}{\bench{Thermolysin}\xspace}
%------------------------------------------------------------------------------
% Environments for program listings.  Each one typesets its body as a
% single-column, left-aligned tabular in typewriter, so lines of the listing
% are separated with `\\'.  They differ only in font size and in the vertical
% alignment of the tabular:
%   SmallProg  -- \small,        bottom-aligned ([b])
%   ScriptProg -- \scriptsize,   bottom-aligned ([b])
%   FootProg   -- \footnotesize, centred ([c])
\newenvironment{SmallProg}{\begin{tt}\begin{small}\begin{tabular}[b]{l}}{\end{tabular}\end{small}\end{tt}}
\newenvironment{ScriptProg}{\begin{tt}\begin{scriptsize}\begin{tabular}[b]{l}}{\end{tabular}\end{scriptsize}\end{tt}}
\newenvironment{FootProg}{\begin{tt}\begin{footnotesize}\begin{tabular}[c]{l}}{\end{tabular}\end{footnotesize}\end{tt}}

% \TODOcomment{<tag>}{<text>}: numbered editorial note.
%   #1  tag that selects the counter `TODOcounter<tag>' and is printed after
%       the number in the margin; only the empty tag has a counter declared
%       here, so any other tag needs its own \newcounter{TODOcounter<tag>}.
%   #2  text of the note.
% Steps the counter, puts the number as a superscript at the point of use and
% sets the note in a framed 2cm-wide box in the margin (ragged-left variant for
% left margins, ragged-right variant for right margins).
% Every line end outside the \parbox bodies is protected with `%' so that the
% macro adds no space tokens of its own, neither inside the left-margin \fbox
% nor in the running text after the \marginpar.
\newcommand{\TODOcomment}[2]{%
  \stepcounter{TODOcounter#1}%
  {\scriptsize\bf$^{(\arabic{TODOcounter#1})}$}%
  \marginpar[\fbox{%
    \parbox{2cm}{\raggedleft
      \scriptsize$^{({\bf{\arabic{TODOcounter#1}{#1}}})}$%
      \scriptsize #2}}]%
  {\fbox{\parbox{2cm}{\raggedright
      \scriptsize$^{({\bf{\arabic{TODOcounter#1}{#1}}})}$%
      \scriptsize #2}}}%
}%
% Counter used by the untagged form below.
\newcounter{TODOcounter}
% \TODO{<text>}: untagged editorial note, numbered with TODOcounter.
\newcommand{\TODO}[1]{\TODOcomment{}{#1}}
%------------------------------------------------------------------------------

\title{Demand-Driven Indexing of Prolog Clauses\thanks{Dedicated to
    the memory of our friend, colleague and co-author Ricardo Lopes.
    We miss you!}}
\titlerunning{Demand-Driven Indexing of Prolog Clauses}

\author{V\'{\i}tor Santos Costa\inst{1} \and Konstantinos
  Sagonas\inst{2} \and Ricardo Lopes}
\authorrunning{V. Santos Costa, K. Sagonas and R. Lopes}

\institute{
  LIACC- DCC/FCUP, University of Porto, Portugal
  \and
  National Technical University of Athens, Greece
}

\begin{document}
\maketitle

\begin{abstract}
  As logic programming applications grow in size, Prolog systems need
  to efficiently access larger and larger data sets and the need for
  any- and multi-argument indexing becomes more and more profound.
  Static generation of multi-argument indexing is one alternative, but
  applications often rely on features that are inherently dynamic
  which makes static techniques inapplicable or inaccurate. Another
  alternative is to employ dynamic schemes for flexible demand-driven
  indexing of Prolog clauses. We propose such schemes and discuss
  issues that need to be addressed for their efficient implementation
  in the context of WAM-based Prolog systems. We have implemented
  demand-driven indexing in two different Prolog systems and have been
  able to obtain non-negligible performance speedups: from a few
  percent up to orders of magnitude. Given these results, we see very
  little reason for Prolog systems not to incorporate some form of
  dynamic indexing based on actual demand. In fact, we see
  demand-driven indexing as only the first step towards effective
  runtime optimization of Prolog programs.
\end{abstract}


\section{Introduction}
%=====================
The WAM~\cite{Warren83} has mostly been a blessing but occasionally
also a curse for Prolog systems. Its ingenious design has allowed
implementors to get byte code compilers with decent performance --- it
is not a fluke that most Prolog systems are still based on the WAM. On
the other hand, \emph{because} the WAM gives good performance in many
cases, implementors have not incorporated in their systems many
features that drastically depart from WAM's basic characteristics.
%
For example, first argument indexing is sufficient for many Prolog
applications. However, it is clearly sub-optimal for applications
accessing large data sets; for a long time now, the database community
has recognized that good indexing is the basis for fast query
processing.

As logic programming applications grow in size, Prolog systems need to
efficiently access larger and larger data sets and the need for any-
and multi-argument indexing becomes more and more profound. Static
generation of multi-argument indexing is one alternative. The problem
is that this alternative is often unattractive because it may
drastically increase the size of the generated byte code and do so
unnecessarily. Static analysis can partly address this concern, but in
applications that rely on features which are inherently dynamic (e.g.,
generating hypotheses for inductive logic programming data sets during
runtime) static analysis is inapplicable or grossly inaccurate.
Another alternative, which has not been investigated so far, is to do
flexible indexing on demand during program execution.

This is precisely what we advocate with this paper. More specifically,
we present a small extension to the WAM that allows for flexible
indexing of Prolog clauses during runtime based on actual demand. For
static predicates, the scheme we propose is partly guided by the
compiler; for dynamic code, besides being demand-driven by queries,
the method needs to cater for code updates during runtime. Where our
schemes radically depart from current practice is that they generate
new byte code during runtime, in effect doing a form of just-in-time
compilation. In our experience these schemes pay off. We have
implemented \JITI in two different Prolog systems (YAP and XXX) and
have obtained non-trivial speedups, ranging from a few percent to
orders of magnitude, across a wide range of applications. Given these
results, we see very little reason for Prolog systems not to
incorporate some form of indexing based on actual demand from queries.
In fact, we see \JITI as only the first step towards effective runtime
optimization of Prolog programs.

\Paragraph{Organization.}
%------------------------
After commenting on the state of the art and related work concerning
indexing in Prolog systems (Sect.~\ref{sec:related}) we briefly review
indexing in the WAM (Sect.~\ref{sec:prelims}). We then present \JITI
schemes for static (Sect.~\ref{sec:static}) and dynamic
(Sect.~\ref{sec:dynamic}) predicates, their implementation in two
Prolog systems (Sect.~\ref{sec:impl}) and the performance benefits
they bring (Sect.~\ref{sec:perf}). The paper ends with some concluding
remarks.


\section{State of the Art and Related Work} \label{sec:related}
%==============================================================
% Indexing in Prolog systems:
Many Prolog systems still only support
indexing on the main functor symbol of the first argument. Some
others, such as YAP version 4, can look inside some compound
terms~\cite{YAP}. SICStus Prolog supports \emph{shallow
  backtracking}~\cite{ShallowBacktracking@ICLP-89}; choice points are
fully populated only when it is certain that execution will enter the
clause body. While shallow backtracking avoids some of the performance
problems of unnecessary choice point creation, it does not offer the
full benefits that indexing can provide. Other systems such as
BIM-Prolog~\cite{IndexingProlog@NACLP-89}, SWI-Prolog~\cite{SWI} and
XSB~\cite{XSB} allow for user-controlled multi-argument indexing.
Notably, ilProlog~\cite{ilProlog} uses compile-time heuristics and
generates code for multi-argument indexing automatically. In all these
systems, this support comes with various implementation restrictions.
For example, in SWI-Prolog at most four arguments can be indexed; in
XSB the compiler does not offer multi-argument indexing and the
predicates need to be asserted instead; we know of no system where
multi-argument indexing looks inside compound terms. More importantly,
requiring users to specify arguments to index on is not user-friendly,
nor does it guarantee good performance.

% Trees, tries and unification factoring:
Recognizing the need for better indexing, researchers have proposed
more flexible indexing mechanisms for Prolog. For example, Hickey and
Mudambi proposed \emph{switching trees}~\cite{HickeyMudambi@JLP-89},
which rely on the presence of mode information. Similar proposals were
put forward by Van Roy, Demoen and Willems who investigated indexing
on several arguments in the form of a \emph{selection tree}~\cite{VRDW87}
and by Zhou et al.\ who implemented a \emph{matching tree} oriented
abstract machine for Prolog~\cite{TOAM@ICLP-90}. For static
predicates, the XSB compiler offers support for \emph{unification
factoring}~\cite{UnifFact@POPL-95}; for asserted code, XSB can
represent databases of facts using \emph{tries}~\cite{Tries@JLP-99}
which provide left-to-right multi-argument indexing. However, in XSB
none of these mechanisms is used automatically; instead the user has
to specify appropriate directives.

% Comparison with static analysis techniques and Mercury:
Long ago, Kliger and Shapiro argued that such tree-based indexing
schemes are not cost effective for the compilation of Prolog
programs~\cite{KligerShapiro@ICLP-88}. Some of their arguments make
sense for certain applications, but, as we shall show, in general
they underestimate the benefits of indexing on EDB predicates.
Nevertheless, it is true that unless the modes of
predicates are known we run the risk of doing indexing on output
arguments, whose only effect is an unnecessary increase in compilation
times and, more importantly, in code size. In a programming language
such as Mercury~\cite{Mercury@JLP-96} where modes are known the compiler
can of course avoid this risk; indeed in Mercury modes (and types) are
used to guide the compiler in generating good indexing tables. However, the
situation is different for a language like Prolog. Getting accurate
information about the set of all possible modes of predicates requires
a global static analyzer in the compiler --- and most Prolog systems
do not come with one. More importantly, it requires a lot of
discipline from the programmer (e.g., that applications use the module
system religiously and never bypass it). As a result, most Prolog
systems currently do not provide the type of indexing that
applications require. Even in systems such as Ciao~\cite{Ciao@SCP-05},
which do come with a built-in static analyzer and more or less force
such a discipline on the programmer, mode information is not used for
multi-argument indexing.

% The grand finale:
The situation is actually worse for certain types of Prolog
applications. For example, consider applications in the area of
inductive logic programming. These applications on the one hand have
high demands for effective indexing since they need to efficiently
access big datasets and on the other they are unfit for static
analysis since queries are often ad hoc and generated only during
runtime as new hypotheses are formed or refined.
%
Our thesis is that the abstract machine should be able to adapt
automatically to the runtime requirements of such or, even better, of
all applications by employing increasingly aggressive forms of dynamic
compilation. As a concrete example of what this means in practice, in
this paper we will attack the problem of satisfying the indexing needs
of applications during runtime. Naturally, we will base our technique
on the existing support for indexing that the WAM provides, but we
will extend this support with the technique of \JITI that we describe
in the next sections.


\section{Indexing in the WAM} \label{sec:prelims}
%================================================
To make the paper relatively self-contained we review the indexing
instructions of the WAM and their use. In the WAM, the first level of
dispatching involves a test on the type of the argument. The
\switchONterm instruction checks the tag of the dereferenced value in
the first argument register and implements a four-way branch where one
branch is for the dereferenced register being an unbound variable, one
for being atomic, one for (non-empty) list, and one for structure. In
any case, control goes to a bucket of clauses. In the buckets for
constants and structures the second level of dispatching involves the
value of the register. The \switchONconstant and \switchONstructure
instructions implement this dispatching: typically with a \fail
instruction when the bucket is empty, with a \jump instruction for
only one clause, with a sequential scan when the number of clauses is
small, and with a hash table lookup when the number of clauses exceeds
a threshold. For this reason the \switchONconstant and
\switchONstructure instructions take as arguments the hash table
\instr{T} and the number of clauses \instr{N} the table contains. In
each bucket of this hash table and also in the bucket for the variable
case of \switchONterm the code sequentially backtracks through the
clauses using a \TryRetryTrust chain of instructions. The \try
instruction sets up a choice point, the \retry instructions (if~any)
update certain fields of this choice point, and the \trust instruction
removes it.

The WAM has additional indexing instructions (\instr{try\_me\_else}
and friends) that allow indexing to be interspersed with the code of
clauses. We will not consider them here. This is not a problem since
the above scheme handles all programs. Also, we will feel free to do
some minor modifications and optimizations when this simplifies
things.

Let's see an example. Consider the Prolog code shown in
Fig.~\ref{fig:carc:facts}, a fragment of the machine learning dataset
\textit{Carcinogenesis}.
%
These clauses get compiled to the WAM code shown in
Fig.~\ref{fig:carc:clauses}. The first argument indexing code that a
Prolog compiler generates is shown in Fig.~\ref{fig:carc:index}. This
code is typically placed before the code for the clauses and the
\switchONconstant is the entry point of the predicate. Note that compared
with vanilla WAM this instruction has an extra argument: the register
on the value of which we index ($r_1$). This extra argument will allow
us to go beyond first argument indexing. Another departure from the
WAM is that if this argument register contains an unbound variable
instead of a constant then execution will continue with the next
instruction; in effect we have merged part of the functionality of
\switchONterm into the \switchONconstant instruction. This small
change in the behavior of \switchONconstant will allow us to get
\JITI. Let's see how.

%------------------------------------------------------------------------------
\begin{figure}[t]
\centering
\begin{tabular}[b]{c}
  \subfigure[Some Prolog clauses\label{fig:carc:facts}]{%
    \begin{ScriptProg}
      has\_property(d1,salmonella,p).\\
      has\_property(d1,salmonella\_n,p).\\
      has\_property(d2,salmonella,p). \\
      has\_property(d2,cytogen\_ca,n).\\
      has\_property(d3,cytogen\_ca,p).\\[5pt]
    \end{ScriptProg}
  }\\ \hline\hline%
  \subfigure[WAM indexing\label{fig:carc:index}]{%
    \begin{sf}
      \begin{\wamcodesize}
	\begin{tabular}[b]{l}
          \switchONconstant $r_1$ 5 $T_1$  \\
          \try   $L_1$ \\
          \retry $L_2$ \\
          \retry $L_3$ \\
          \retry $L_4$ \\
          \trust $L_5$ \\
	  \\
	  \begin{tabular}[b]{r|c@{\ }|l|}
	    \Cline
	    $T_1$: & \multicolumn{2}{c|}{Hash Table Info}\\ \Cline\Cline
	    \      & d1 & \try   $L_1$ \\
	    \      &    & \trust $L_2$ \\ \Cline
            \      & d2 & \try   $L_3$ \\
	    \      &    & \trust $L_4$ \\ \Cline
	    \      & d3 & \jump  $L_5$ \\
	    \Cline
	  \end{tabular}\\[3pt]
	\end{tabular}
    \end{\wamcodesize}
    \end{sf}
  }%
\end{tabular}%
\subfigure[Code for the clauses\label{fig:carc:clauses}]{%
  \begin{sf}
    \begin{\wamcodesize}
      \begin{tabular}[b]{rl}
	$L_1$: & \getcon $r_1$ d1            \\
	\      & \getcon $r_2$ salmonella    \\
	\      & \getcon $r_3$ p             \\
        \      & \proceed                    \\
	$L_2$: & \getcon $r_1$ d1            \\
        \      & \getcon $r_2$ salmonella\_n \\
        \      & \getcon $r_3$ p             \\
        \      & \proceed                    \\
	$L_3$: & \getcon $r_1$ d2            \\
        \      & \getcon $r_2$ salmonella    \\
        \      & \getcon $r_3$ p             \\
        \      & \proceed                    \\
	$L_4$: & \getcon $r_1$ d2            \\
	\      & \getcon $r_2$ cytogen\_ca   \\
	\      & \getcon $r_3$ n             \\
	\      & \proceed                    \\
	$L_5$: & \getcon $r_1$ d3            \\
	\      & \getcon $r_2$ cytogen\_ca   \\
	\      & \getcon $r_3$ p             \\
	\      & \proceed
      \end{tabular}
    \end{\wamcodesize}
  \end{sf}
}%
\subfigure[Any arg indexing\label{fig:carc:jiti_single:before}]{%
  \begin{sf}
    \begin{\wamcodesize}
      \begin{tabular}[b]{l}
        \switchONconstant $r_1$ 5 $T_1$  \\
        \jitiONconstant $r_2$   5 3    \\
        \jitiONconstant $r_3$   5 3    \\
        \try   $L_1$ \\
        \retry $L_2$ \\
        \retry $L_3$ \\
        \retry $L_4$ \\
        \trust $L_5$ \\
	\\
	\begin{tabular}[b]{r|c@{\ }|l|}
	  \Cline
	  $T_1$: & \multicolumn{2}{c|}{Hash Table Info}\\ \Cline\Cline
	  \      & \code{d1} & \try   $L_1$ \\
	  \      &           & \trust $L_2$ \\ \Cline
          \      & \code{d2} & \try   $L_3$ \\
	  \      &           & \trust $L_4$ \\ \Cline
	  \      & \code{d3} & \jump  $L_5$ \\
	  \Cline
	\end{tabular}
      \end{tabular}
    \end{\wamcodesize}
  \end{sf}
}%
\caption{Part of the Carcinogenesis dataset and WAM code that a byte
  code compiler generates}
\label{fig:carc}
\vspace*{-1em}
\end{figure}
%------------------------------------------------------------------------------


\section{Demand-Driven Indexing of Static Predicates} \label{sec:static}
%=======================================================================
For static predicates the compiler has complete information about all
clauses and shapes of their head arguments. It is both desirable and
possible to take advantage of this information at compile time and so
we treat the case of static predicates separately.
%
We will do so with schemes of increasing effectiveness and
implementation complexity.

\subsection{A simple WAM extension for any argument indexing}
%------------------------------------------------------------
Let us initially consider the case where the predicates to index
consist only of Datalog facts. This is commonly the case for all
extensional database predicates where indexing is most effective and
called for.

Refer to the example in Fig.~\ref{fig:carc}.
%
The indexing code of Fig.~\ref{fig:carc:index} incurs a small cost for
a call where the first argument is a variable (namely, executing the
\switchONconstant instruction) but the instruction pays off for calls
where the first argument is bound. On the other hand, for calls where
the first argument is a free variable and some other argument is
bound, a choice point will be created, the \TryRetryTrust chain will
be used, and execution will go through the code of all clauses. This
is clearly inefficient, more so for larger data sets.
%
We can do much better with the relatively simple scheme shown in
Fig.~\ref{fig:carc:jiti_single:before}. Immediately after the
\switchONconstant instruction, we can statically generate
\jitiONconstant (demand indexing) instructions, one for each remaining
argument. Recall that the entry point of the predicate is the
\switchONconstant instruction. The \jitiONconstant $r_i$ \instr{N A}
instruction works as follows:
\begin{itemize}
\item if the argument $r_i$ is a free variable,
  execution continues with the next instruction;
\item otherwise, \JITI kicks in as follows. The abstract machine
  scans the WAM code of the clauses and creates an index table for the
  values of the corresponding argument. It can do so because the
  instruction takes as arguments the number of clauses \instr{N} to
  index and the arity \instr{A} of the predicate. (In our example, the
  numbers 5 and 3.) For Datalog facts, this information is sufficient.
  Because the WAM byte code for the clauses has a very regular
  structure, the index table can be created very quickly. Upon its
  creation, the \jitiONconstant instruction gets transformed to a
  \switchONconstant. Again this is straightforward because the two
  instructions have similar layouts in memory. Execution of the
  abstract machine then continues with the \switchONconstant
  instruction.
\end{itemize}
Figure~\ref{fig:carg:jiti_single:after} shows the index table $T_2$
which is created for our example and how the indexing code looks after
the execution of a call with mode \code{(out,in,?)}. Note that the
\jitiONconstant instruction for argument register $r_2$ has been
appropriately patched. The call that triggered \JITI and subsequent
calls of the same mode will use table $T_2$. The index for the second
argument has been created.
%------------------------------------------------------------------------------
\begin{figure}[t]
  \centering
  \begin{sf}
    \begin{\wamcodesize}
      \begin{tabular}{c@{\hspace*{2em}}c@{\hspace*{2em}}c}
	\begin{tabular}{l}
          \switchONconstant $r_1$ 5 $T_1$ \\
          \switchONconstant $r_2$ 5 $T_2$ \\
          \jitiONconstant $r_3$   5 3     \\
          \try $L_1$   \\
          \retry $L_2$ \\
          \retry $L_3$ \\
          \retry $L_4$ \\
          \trust $L_5$ \\
	\end{tabular}
	&
	\begin{tabular}{r|c@{\ }|l|}
	  \Cline
	  $T_1$: & \multicolumn{2}{c|}{Hash Table Info}\\ \Cline\Cline
	  \      & \code{d1} & \try   $L_1$ \\
	  \      &           & \trust $L_2$ \\ \Cline
          \      & \code{d2} & \try   $L_3$ \\
	  \      &           & \trust $L_4$ \\ \Cline
	  \      & \code{d3} & \jump  $L_5$ \\
	  \Cline
	\end{tabular}
	&
	\begin{tabular}{r|c@{\ }|l|}
	  \Cline
	  $T_2$: & \multicolumn{2}{|c|}{Hash Table Info}\\ \Cline\Cline
	  \      & \code{salmonella}    & \try $L_1$   \\
	  \      &                      & \trust $L_3$ \\ \Cline
	  \      & \code{salmonella\_n} & \jump $L_2$  \\ \Cline
	  \      & \code{cytogen\_ca}   & \try $L_4$   \\
	  \      &                      & \trust $L_5$ \\
	  \Cline
	\end{tabular}
      \end{tabular}
    \end{\wamcodesize}
  \end{sf}
  \caption{WAM code after demand-driven indexing for argument 2;
    $T_2$ is generated dynamically}
  \label{fig:carg:jiti_single:after}
\end{figure}
%------------------------------------------------------------------------------

The main advantage of this scheme is its simplicity. The compiled code
(Fig.~\ref{fig:carc:jiti_single:before}) is not significantly bigger
than the code which a WAM-based compiler would generate
(Fig.~\ref{fig:carc:index}) and, if \JITI turns out unnecessary
during runtime (e.g., execution encounters only open calls or calls with
only the first argument bound), the extra overhead is minimal: the
execution of some \jitiONconstant instructions for the open call only.
%
In short, this is a simple scheme that allows for indexing on \emph{any
single} argument. At least for big sets of Datalog facts, we see
little reason not to use it.

\Paragraph{Optimizations.}
Because we are dealing with static code, there are opportunities for
some easy optimizations. Suppose we statically determine that there
will never be any calls with \code{in} mode for some arguments or that
these arguments are not discriminating enough.\footnote{In our
example, suppose the third argument of \code{has\_property/3} was the
atom \code{p} throughout.} Then we can avoid generating
\jitiONconstant instructions for them. Also, suppose we know that some
arguments are more likely than others to be used in the \code{in}
mode. Then we can simply place the \jitiONconstant instructions for
them before the instructions for other arguments. This is possible
since all indexing instructions take the argument register number as
an argument; their order does not matter.

\subsection{From any argument indexing to multi-argument indexing}
%-----------------------------------------------------------------
The scheme of the previous section gives us only single argument
indexing. However, all the infrastructure we need is already in place.
We can use it to obtain any fixed-order multi-argument \JITI in a
straightforward way.

Note that the compiler knows exactly the set of clauses that need to
be tried for each query with a specific symbol in the first argument.
% This information is needed in order to construct, at compile time, the
% hash table $T_1$ of Fig.~\ref{fig:carc:index}.
For multi-argument \JITI, instead of generating for each hash bucket
only \TryRetryTrust instructions, the compiler can prepend appropriate
demand indexing instructions. We illustrate this on our running
example. The table $T_1$ contains four \jitiONconstant instructions:
two for each of the remaining two arguments of hash buckets with more
than one alternative. For hash buckets with none or only one
alternative (e.g., for \code{d3}'s bucket) there is obviously no need
to resort to \JITI for the remaining arguments.
Figure~\ref{fig:carc:jiti_multi} shows the state of the hash tables
after the execution of queries \code{has\_property(C,salmonella,T)},
which creates $T_2$, and \code{has\_property(d2,P,n)} which creates
the $T_3$ table and transforms the \jitiONconstant instruction for
\code{d2} and register $r_3$ to the appropriate \switchONconstant
instruction.

%------------------------------------------------------------------------------
\begin{figure}[t]
  \centering
  \begin{sf}
    \begin{\wamcodesize}
      \begin{tabular}{@{}c@{}c@{}c@{}}
	\begin{tabular}{l}
          \switchONconstant $r_1$ 5 $T_1$ \\
          \switchONconstant $r_2$ 5 $T_2$ \\
          \jitiONconstant $r_3$   5 3     \\
          \try $L_1$   \\
          \retry $L_2$ \\
          \retry $L_3$ \\
          \retry $L_4$ \\
          \trust $L_5$ \\
	\end{tabular}
	&
	\begin{tabular}{r|c@{\ }|l|}
	  \Cline
	  $T_1$: & \multicolumn{2}{c|}{Hash Table Info}\\ \Cline\Cline
	  \      & \code{d1} & \jitiONconstant $r_2$ 2 3 \\
	  \      &           & \jitiONconstant $r_3$ 2 3 \\
	  \      &           & \try   $L_1$ \\
	  \      &           & \trust $L_2$ \\ \Cline
          \      & \code{d2} & \jitiONconstant $r_2$ 2 3 \\
	  \      &           & \switchONconstant $r_3$ 2 $T_3$ \\
	  \      &           & \try   $L_3$ \\
	  \      &           & \trust $L_4$ \\ \Cline
	  \      & \code{d3} & \jump  $L_5$ \\
	  \Cline
	\end{tabular}
	&
	\begin{tabular}{c}
	\begin{tabular}{r|c@{\ }|l|}
	  \Cline
	  $T_2$: & \multicolumn{2}{|c|}{Hash Table Info}\\ \Cline\Cline
	  \      & \code{salmonella}    & \jitiONconstant $r_3$ 2 3 \\
	  \      &                      & \try $L_1$   \\
	  \      &                      & \trust $L_3$ \\ \Cline
	  \      & \code{salmonella\_n} & \jump $L_2$  \\ \Cline
	  \      & \code{cytogen\_ca}   & \jitiONconstant $r_3$ 2 3 \\
	  \      &                      & \try $L_4$   \\
	  \      &                      & \trust $L_5$ \\
	  \Cline
	\end{tabular}
	\\
	\ \\
	\begin{tabular}{r|c@{\ }|l|}
	  \Cline
	  $T_3$: & \multicolumn{2}{|c|}{Hash Table Info}\\ \Cline\Cline
	  \      & \code{p} & \jump $L_3$ \\ \Cline
	  \      & \code{n} & \jump $L_4$ \\
	  \Cline
	\end{tabular}
	\end{tabular}
      \end{tabular}
    \end{\wamcodesize}
  \end{sf}
  \caption{\JITI for all arguments;
    $T_1$ is static; $T_2$ and $T_3$ are created dynamically}
  \label{fig:carc:jiti_multi}
\end{figure}
%------------------------------------------------------------------------------

\Paragraph{Implementation issues.}
In the \jitiONconstant instructions of Fig.~\ref{fig:carc:jiti_multi}
notice the integer 2 which denotes the number of clauses that the
instruction will index. Using this number an index table of
appropriate size will be created, such as $T_3$. To fill this table we
need information about the clauses to index and the symbols to hash
on. The clauses can be obtained by scanning the labels of the
\TryRetryTrust instructions following \jitiONconstant; the symbols by
looking at appropriate byte code offsets (based on the argument
register number) from these labels. In our running example, the
symbols can be obtained by looking at the second argument of the
\getcon instruction whose argument register is $r_2$. In the loaded
bytecode, assuming the argument register is represented in one byte,
these symbols are found $\mathit{sizeof}(\getcon) + \mathit{sizeof}(\mathit{opcode}) + 1$ bytes
away from the clause label; see Fig.~\ref{fig:carc:clauses}. Thus,
multi-argument \JITI is easy to get and the creation of index tables
can be extremely fast when indexing Datalog facts.

\subsection{Beyond Datalog and other implementation issues}
%----------------------------------------------------------
Indexing clauses with function symbols on demand is not significantly
more difficult. The scheme we have described is applicable but
requires the following extensions:
\begin{enumerate}
\item Besides \jitiONconstant we also need \jitiONterm and
  \jitiONstructure instructions. These are the \JITI counterparts of
  the WAM's \switchONterm and \switchONstructure.
\item Because the byte code for the clause heads does not necessarily
  have a regular structure, the abstract machine needs to be able to
  ``walk'' the byte code instructions and recover the symbols on which
  indexing will be based. Writing such a code walking procedure is not
  hard.
\item Indexing on a position that contains unconstrained variables
  for some clauses is tricky. The WAM needs to group clauses in this
  case and without special treatment creates two choice points for
  this argument (one for the variables and one for each group of
  clauses). However, this issue and how to deal with it is well-known
  by now. Possible solutions to it are described in a paper by
  Carlsson~\cite{FreezeIndexing@ICLP-87} and can be readily adapted to
  \JITI. Alternatively, in a simple implementation, we can skip \JITI
  for positions with variables in some clauses.
\end{enumerate}
Before describing \JITI more formally, we remark on the following
design decisions whose rationale may not be immediately obvious:
\begin{itemize}
\item By default, only table $T_1$ is generated at compile time (as in
  the WAM) and the additional index tables $T_2, T_3, \ldots$ are
  generated dynamically. This is because we do not want to increase
  compiled code size unnecessarily (i.e., when there is no demand for
  these indices).
\item On the other hand, we generate \jitiSTAR instructions at compile
  time for the head arguments.\footnote{The \jitiSTAR instructions for
  $T_1$ can be generated either by the compiler or the loader.} This
  does not noticeably increase the generated byte code but it greatly
  simplifies code loading. Notice that a nice property of the scheme
  we have described is that the loaded byte code can be patched
  \emph{without} the need to move any instructions.
% The indexing tables are typically not intersperced with the byte code.
\item Finally, one may wonder why the \jitiSTAR instructions create
  the dynamic index tables with an additional code walking pass
  instead of piggy-backing on the pass which examines all clauses via
  the main \TryRetryTrust chain. The main reasons are: 1) in many cases
  the code walking can be selective and guided by offsets and 2) by
  first creating the index table and then using it we speed up the
  execution of the queries and often avoid unnecessary choice point
  creations.
\end{itemize}
Note that all these decisions are orthogonal to the main idea and are
under compiler control. For example, if analysis determines that some
argument sequences will never demand indexing we can simply avoid
generation of \jitiSTAR instructions for them. Similarly, if some
argument sequences will definitely demand indexing we can speed up
execution by generating the appropriate tables at compile time instead
of dynamically.

\subsection{Demand-driven index construction and its properties}
%---------------------------------------------------------------
The idea behind \JITI can be captured in a single sentence: \emph{we
can generate every index we need during program execution when this
index is demanded}. Subsequent uses of these indices can speed up
execution considerably more than the time it takes to construct them
(more on this below) so this runtime action makes sense.%\footnote{In
%fact, because choice points are expensive in the WAM, \JITI can speed
%up even the execution of the query that triggers the process, not only
%subsequent queries.}
%
%We describe the process of demand-driven index construction.

% \subsubsection{Demand-driven index construction}
%-------------------------------------------------
Let $p/k$ be a predicate with $n$ clauses.
%
At a high level, its indices form a tree whose root is the entry point
of the predicate. For simplicity, assume that the root node of the
tree and the interior nodes corresponding to the index table for the
first argument have been constructed at compile time. Leaves of this
tree are the nodes containing the code for the clauses of the
predicate and each clause is identified by a unique label \mbox{$L_i,
1 \leq i \leq n$}. Execution always starts at the first instruction of
the root node and follows Algorithm~\ref{alg:construction}. The
algorithm might look complicated but is actually quite simple.
%
Each non-leaf node contains a sequence of byte code instructions with
groups of the form \mbox{$\langle I_1, \ldots, I_m, T_1, \ldots, T_l
\rangle, 0 \leq m \leq k, 1 \leq l \leq n$} where each of the $I$
instructions, if any, is either a \switchSTAR or a \jitiSTAR
instruction and each of the $T$ instructions either forms a sequence
of \TryRetryTrust instructions (if $l > 1$) or is a \jump instruction
(if \mbox{$l = 1$}). Step~2.2 dynamically constructs an index table
$\cal T$ whose buckets are the newly created interior nodes in the
tree. Each bucket associated with a single clause contains a \jump to
the label of that clause. Each bucket associated with many clauses
starts with the $I$ instructions which are yet to be visited and
continues with a \TryRetryTrust chain pointing to the clauses. When
the index construction is done, the \jitiSTAR instruction mutates to a
\switchSTAR WAM instruction.
%-------------------------------------------------------------------------
\begin{Algorithm}[t]
  \caption{Actions of the abstract machine with \JITI}
  \label{alg:construction}
  \begin{enumerate}
  \item if the current instruction $I$ is a \switchSTAR, \try, \retry,
    \trust or \jump, act as in the WAM;
  \item if the current instruction $I$ is a \jitiSTAR with arguments $r,
    l$, and $k$ ($r$ is a register) then
    \begin{enumerate}
    \item[2.1] if register $r$ contains a variable, the action is a
      \instr{goto} the next instruction in the node;
    \item[2.2] if register $r$ contains a value $v$, the action is to
      dynamically construct the index:
      \begin{itemize}
      \item[2.2.1] collect the subsequent instructions in a list $\cal I$
	until the next instruction is a \try;
      \item[2.2.2] for each label $L$ in the \TryRetryTrust chain
	inspect the code of the clause with label $L$ to find the
	symbol~$c$ associated with register $r$ in the clause; (This
	step creates a list of $\langle c, L \rangle$ pairs.)
      \item[2.2.3] create an index table $\cal T$ out of these pairs as
	follows:
	\begin{itemize}
	\item if $I$ is a \jitiONconstant or a \jitiONstructure then
	  create an index table for the symbols in the list of pairs;
	  each entry of the table is identified by a symbol $c$ and
	  contains:
	  \begin{itemize}
	  \item the instruction \jump $L_c$ if $L_c$ is the only label
	    associated with $c$;
	  \item otherwise, the sequence of instructions obtained by
	    appending to $\cal I$ a \TryRetryTrust chain for the sequence
	    of labels $L'_1, \ldots, L'_l$ that are associated with $c$
	  \end{itemize}
	\item if $I$ is a \jitiONterm then
	  \begin{itemize}
	  \item partition the sequence of labels $\cal L$ in the list
	    of pairs into sequences of labels ${\cal L}_c, {\cal L}_l$
	    and ${\cal L}_s$ for constants, lists and structures,
	    respectively;
	  \item for each of the four sequences ${\cal L}, {\cal L}_c,
	    {\cal L}_l, {\cal L}_s$ of labels create code:
	    \begin{itemize}
	    \item the instruction \fail if the sequence is empty;
	    \item the instruction \jump $L$ if $L$ is the only label in
	      the sequence;
	    \item otherwise, the sequence of instructions obtained by
	      appending to $\cal I$ a \TryRetryTrust chain for the current
	      sequence of labels;
	    \end{itemize}
	  \end{itemize}
	\end{itemize}
      \item[2.2.4] transform the \jitiSTAR $r, l, k$ instruction to
	a \switchSTAR $r, l, {\cal T}$ instruction;
      \item[2.2.5] continue execution with this instruction.
      \end{itemize}
    \end{enumerate}
  \end{enumerate}
\vspace*{-.7em}
\end{Algorithm}
%-------------------------------------------------------------------------

\Paragraph{Complexity properties.}
Index construction during runtime does not change the complexity of
query execution. First, note that each demanded index table will be
constructed at most once. Also, a \jitiSTAR instruction will be
encountered only in cases where execution would examine all clauses in
the \TryRetryTrust chain.\footnote{This statement is possibly not
valid in the presence of Prolog cuts.} The construction visits these
clauses \emph{once} and then creates the index table in time linear in
the number of clauses as one pass over the list of $\langle c, L
\rangle$ pairs suffices. After index construction, execution will
visit a subset of these clauses as the index table will be consulted.
%% Finally, note that the maximum number of \jitiSTAR instructions
%% that will be visited for each query is bounded by the maximum
%% number of index positions (symbols) in the clause heads of the
%% predicate.
Thus, in cases where \JITI is not effective, the execution time of a query
will at most double due to dynamic index construction. In fact, this worst
case is pessimistic and unlikely in practice. On the other hand, \JITI
can change the complexity of query evaluation from $O(n)$ to $O(1)$
where $n$ is the number of clauses.

\subsection{More implementation choices}
%---------------------------------------
The observant reader has no doubt noticed that
Algorithm~\ref{alg:construction} provides multi-argument indexing but
only for the main functor symbol. For clauses with compound terms that
require indexing in their sub-terms we can either employ a program
transformation such as \emph{unification
factoring}~\cite{UnifFact@POPL-95} at compile time or modify the
algorithm to consider index positions inside compound terms. This is
relatively easy to do but requires support from the register allocator
(passing the sub-terms of compound terms in appropriate registers)
and/or a new set of instructions. Due to space limitations we omit
further details.

Algorithm~\ref{alg:construction} relies on a procedure that inspects
the code of a clause and collects the symbols associated with some
particular index position (step~2.2.2). If we are satisfied with
looking only at clause heads, this procedure needs to understand only
the structure of \instr{get} and \instr{unify} instructions. Thus, it
is easy to write. At the cost of increased implementation complexity,
this step can of course take into account other information that may
exist in the body of the clause (e.g., type tests such as
\code{var(X)}, \code{atom(X)}, aliasing constraints such as \code{X =
Y}, numeric constraints such as \code{X > 0}, etc.).

A reasonable concern for \JITI is increased memory consumption. In our
experience, this does not seem to be a problem in practice since most
applications do not have demand for indexing on many argument
combinations. In applications where it does become a problem or when
running in an environment with limited memory, we can easily put a
bound on the size of index tables, either globally or for each
predicate separately. For example, the \jitiSTAR instructions can
either become inactive when this limit is reached, or better yet we
can recover the space of some tables. To do so, we can employ any
standard recycling algorithm (e.g., LRU) and reclaim the memory of
index tables that are no longer in use. This is easy to do by
reverting the corresponding \switchSTAR instructions back to \jitiSTAR
instructions. If the indices are demanded again at a time when memory
is available, they can simply be regenerated.


\section{Demand-Driven Indexing of Dynamic Predicates} \label{sec:dynamic}
%=========================================================================
We have so far lived in the comfortable world of static predicates,
where the set of clauses to index is fixed and the compiler can take
advantage of this knowledge. Dynamic code introduces several
complications:
\begin{itemize}
\item We need mechanisms to update multiple indices when clauses
  are asserted or retracted. In particular, we need the ability to
  expand and possibly shrink multiple code chunks after code updates.
\item We do not know a priori which are the best index positions and
  cannot determine whether indexing on some arguments is avoidable.
\item Supporting the logical update (LU) semantics of ISO Prolog
  becomes harder.
\end{itemize}
We briefly discuss possible ways of addressing these issues.
However, note that Prolog systems typically provide indexing for
dynamic predicates and thus already deal in some way or another with
these issues; \JITI makes the problems more involved but not
fundamentally different from those with only first argument indexing.

The first complication suggests that we should allocate memory for
dynamic indices in separate chunks, so that these can be expanded and
deallocated independently. Indeed, this is what we do.
%
Regarding the second complication, in the absence of any other
information, the only alternative is to generate indices for all
arguments. As optimizations, we can avoid indexing predicates with
only one clause and exclude arguments where some clause has a
variable.

Under LU semantics, calls to dynamic predicates execute in a
``snapshot'' of the corresponding predicate. Each call sees the
clauses that existed at the time when the call was made, even if some
of the clauses were later retracted or new clauses were asserted. If
several calls are alive in the stack, several snapshots will be alive
at the same time. The standard solution to this problem is to use time
stamps to tell which clauses are \emph{live} for which calls.
%
This solution complicates freeing index tables because: (1) an index
table holds references to clauses, and (2) the table may be in use
(i.e., may be accessible from the execution stacks). An index
table thus is killed in several steps:
\begin{enumerate}
\item Detach the index table from the indexing tree.
\item Recursively \emph{kill} every child of the current table; if a
  table is killed so are its children.
\item Wait until the table is not in use, that is, it is not pointed
  to from anywhere.
\item Walk the table and release any references it may hold.
\item Physically recover space.
\end{enumerate}


\section{Implementation in XXX and in YAP} \label{sec:impl}
%==========================================================
The implementation of \JITI in XXX follows a variant of the scheme
presented in Sect.~\ref{sec:static}. The compiler uses heuristics to
determine the best argument to index on (i.e., this argument is not
necessarily the first) and employs \switchSTAR instructions for this
task. It also statically generates \jitiONconstant instructions for
other arguments that are good candidates for \JITI. Currently, an
argument is considered a good candidate if it has only constants or
only structure symbols in all clauses. Thus, XXX uses only
\jitiONconstant and \jitiONstructure instructions, never a
\jitiONterm. Also, XXX does not perform \JITI inside structure
symbols. For dynamic predicates, \JITI is employed only if they
consist of Datalog facts; if a clause which is not a Datalog fact is
asserted, all dynamically created index tables for the predicate are
simply removed and the \jitiONconstant instruction becomes a
\instr{noop}. All this is done automatically, but the user can disable
\JITI in compiled code using an option.

YAP implements \JITI since version 5. The current implementation
supports static code, dynamic code, and the internal database. It
differs from the algorithm presented in Sect.~\ref{sec:static} in that
\emph{all indexing code is generated on demand}. Thus, YAP cannot
assume that a \jitiSTAR instruction is followed by a \TryRetryTrust
chain. Instead, by default YAP has to search the whole predicate for
clauses that match the current position in the indexing code. Doing so
for every index expansion was found to be very inefficient for larger
relations: in such cases YAP will maintain a list of matching clauses
at each \jitiSTAR node. Indexing dynamic predicates in YAP follows
very much the same algorithm as static indexing: the key idea is that
most nodes in the index tree must be allocated separately so that they
can grow or shrink independently. YAP can index arguments where some
clauses have unconstrained variables, but only for static predicates,
as in dynamic code this would complicate support for LU semantics.

YAP uses the term JITI (Just-In-Time Indexing) to refer to \JITI. In
the next section we will take the liberty to use this term as a
convenient abbreviation.


\section{Performance Evaluation} \label{sec:perf}
%================================================
We evaluate JITI on a set of benchmarks and applications.
Throughout, we compare performance of JITI with first argument
indexing. For the benchmarks of Sect.~\ref{sec:perf:ineffective}
and~\ref{sec:perf:effective} which involve both systems, we used a
2.4~GHz P4-based laptop with 512~MB of memory.
% and report times in milliseconds.
For the benchmarks of Sect.~\ref{sec:perf:ILP} which involve
YAP~5.1.2 only, we used an 8-node cluster, where each node is a
dual-core AMD~2600+ machine with 2~GB of memory.
% and report times in seconds.

%------------------------------------------------------------------------------
\begin{table}[t]
  \centering
  \caption{Performance of some benchmarks with 1st vs.\ \JITI (times in msecs)}
  \vspace*{-1em}
  \subfigure[When JITI is ineffective]{%
    \label{tab:ineffective}
    \begin{tabular}[b]{|l||r|r||r|r|} \hline
      & \multicolumn{2}{|c||}{\bf YAP} & \multicolumn{2}{|c|}{\bf XXX} \\
      \cline{2-5}
      Benchmark     &   1st  &  JITI         &   1st  &  JITI          \\
      \hline
      \tcLio (8000) &     13 &    14         &      4 &     4          \\
      \tcRio (2000) &   1445 &  1469         &    614 &   615          \\
      \tcDio ( 400) &   3208 &  3260         &   2338 &  2300          \\
      \tcLoo (2000) &   3935 &  3987         &   2026 &  2105          \\
      \tcRoo (2000) &   2841 &  2952         &   1502 &  1512          \\
      \tcDoo ( 400) &   3735 &  3805         &   4976 &  4978          \\
      \compress     &   3614 &  3595         &   2875 &  2848          \\
      \hline
    \end{tabular}
  }%
  \subfigure[When JITI is effective]{
    \label{tab:effective}
    \begin{tabular}[b]{|l||r|r|r||r|r|r|} \hline
      & \multicolumn{3}{|c||}{\bf YAP} & \multicolumn{3}{|c|}{\bf XXX} \\
      \cline{2-7}
                &   1st  &  JITI &{\bf ratio}&  1st  &  JITI &{\bf ratio}\\
      \hline
      \sgCyl    &    2,864 &    24 & $119\times$& 2,390 &    28 &  $85\times$\\
      \muta     &   30,057 &16,782 &$1.79\times$&26,314 &21,574 &$1.22\times$\\
      \pta      &    5,131 &   188 &  $27\times$& 4,442 &   279 &  $16\times$\\
      \tea      &1,478,813 &54,616 &  $27\times$&   --- &   --- &      ---   \\
      \hline
    \end{tabular}
  }%
  \vspace*{-1em}
\end{table}
%------------------------------------------------------------------------------

\subsection{Performance of \JITI when ineffective} \label{sec:perf:ineffective}
%------------------------------------------------------------------------------
In some programs, \JITI does not trigger\footnote{In XXX only; even
1st argument indexing is generated on demand when JITI is used in
YAP.} or might trigger but have no effect other than an overhead due
to runtime index construction. We therefore wanted to measure this
overhead.
%
As both systems support tabling, we decided to use tabling benchmarks
because they are small and easy to understand, and because they are a
bad case for JITI in the following sense: tabling avoids generating
repetitive queries and the benchmarks operate over extensional
database (EDB) predicates of size approximately equal to the size of
the program. We used \compress, a tabled program that solves a puzzle
from an ICLP Prolog programming competition. The other benchmarks are
different variants of tabled left, right and doubly recursive
transitive closure over an EDB predicate forming a chain of size shown
in Table~\ref{tab:ineffective} in parentheses. For each variant of
transitive closure, we issue two queries: one with mode
\code{(in,out)} and one with mode \code{(out,out)}.
%
For YAP, indices on the first argument and \TryRetryTrust chains are
built on all benchmarks under \JITI.
%
For XXX, \JITI triggers on no benchmark but the \jitiONconstant
instructions are executed for the three \bench{tc\_?\_oo} benchmarks.
%
As can be seen in Table~\ref{tab:ineffective}, \JITI, even when
ineffective, incurs a runtime overhead that is at the level of noise
and goes mostly unnoticed.
%
We also note that our aim here is \emph{not} to compare the two
systems, so the \textbf{YAP} and \textbf{XXX} columns should be read
separately.

\vspace*{-0.5em}
\subsection{Performance of \JITI when effective} \label{sec:perf:effective}
%--------------------------------------------------------------------------
On the other hand, when \JITI is effective, it can significantly
improve runtime performance. We use the following programs and
applications:
%------------------------------------------------------------------------------
\begin{small}
\begin{description}
\item[\sgCyl] The same generation DB benchmark on a $24 \times 24
  \times 2$ cylinder. We issue the open query.
\item[\muta] A computationally intensive application where most
  predicates are defined intensionally.
\item[\pta] A tabled logic program implementing Andersen's points-to
  analysis. A medium-sized imperative program is encoded as a set of
  facts (about 16,000) and properties of interest are encoded using
  rules. Program properties are then determined by the closure of
  these rules.
\item[\tea] Another implementation of Andersen's points-to analysis.
  The analyzed program, the \texttt{javac} benchmark, is encoded in a
  file of 411,696 facts (62,759,581 bytes in total). Its compilation
  exceeds the limits of the XXX compiler (w/o JITI). So we run this
  benchmark only in YAP.
\end{description}
\end{small}
%------------------------------------------------------------------------------

As can be seen in Table~\ref{tab:effective}, \JITI significantly
improves the performance of these applications. In \muta, which spends
most of its time in recursive predicates, the speedup is only $79\%$
in YAP and~$22\%$ in XXX. The remaining benchmarks execute several
times (from~$16$ up to~$119$) faster. It is important to realize that
\emph{these speedups are obtained automatically}, i.e., without any
programmer intervention or use of compiler directives, in all
these applications.

\subsection{Performance of \JITI on ILP applications} \label{sec:perf:ILP}
%-------------------------------------------------------------------------
The need for \JITI was originally noticed in inductive logic
programming applications. These applications tend to issue ad hoc
queries during execution and thus their indexing requirements cannot
be determined at compile time. On the other hand, they operate on lots
of data, so memory consumption is a reasonable concern. We evaluate
JITI's time and space performance on some learning tasks using the
Aleph system~\cite{ALEPH} and the datasets of
Fig.~\ref{fig:ilp:datasets} which issue simple queries in an
extensional database. Several of these datasets are standard in the
ILP literature.

\Paragraph{Time performance.}
We compare times for 10 runs of the saturation/refinement cycle of the
ILP system; see Table~\ref{tab:ilp:time}.
%% The \Krki datasets have small search spaces and small databases, so
%% they achieve the same performance under both versions: there is no
%% slowdown.
The \Mesh and \Pyrimidines applications are the only ones that do not
benefit much from indexing in the database; they do benefit, though,
from indexing in the dynamic representation of the search space, as
their running times improve somewhat with \JITI.

The \BreastCancer and \GeneExpr applications use unstructured data.
The speedup here is mostly from multiple argument indexing.
\BreastCancer is particularly interesting. It consists of 40 binary
relations with 65k elements each, where the first argument is the key.
We know that most calls have the first argument bound, hence indexing
was not expected to matter much. Instead, the results show \JITI to
improve running time by more than an order of magnitude. This suggests
that even a small percentage of badly indexed calls can end up
dominating runtime.

\IEProtein and \Thermolysin are example applications that manipulate
structured data. \IEProtein is the largest dataset we consider, and
indexing is absolutely critical. The speedup is not just impressive;
it is simply not possible to run the application in reasonable time
with only first argument indexing. \Thermolysin is smaller and
performs some computation per query, but even so, \JITI improves its
performance by an order of magnitude. The remaining benchmarks improve
from one to more than two orders of magnitude.

%------------------------------------------------------------------------------
\begin{table}[t]
  \centering
  \caption{Time and space performance of JITI
    on Inductive Logic Programming datasets}
  \vspace*{-1em}
  \label{tab:ilp}
  \setlength{\tabcolsep}{2.5pt}
  \subfigure[Time (in seconds)]{\label{tab:ilp:time}
    \begin{tabular}{|l||r|r|r||} \hline
                  & \multicolumn{3}{|c||}{Time} \\
    \cline{2-4}
    Benchmark     &    1st    &   JITI  &{\bf ratio} \\
    \hline
    \BreastCancer &     1,450 &      88 &  $16\times$ \\
    \Carcino      &    17,705 &     192 &  $92\times$ \\
    \Choline      &    14,766 &   1,397 &  $11\times$ \\
    \GeneExpr     &   193,283 &   7,483 &  $26\times$ \\
    \IEProtein    & 1,677,146 &   2,909 & $577\times$ \\
%%  \Krki         &       0.3 &     0.3 &   $1$ \\
%%  \KrkiII       &       1.3 &     1.3 &   $1$ \\
    \Mesh         &         4 &       3 & $1.3\times$ \\
    \Pyrimidines  &   487,545 & 253,235 & $1.9\times$ \\
    \Susi         &   105,091 &     307 & $342\times$ \\
    \Thermolysin  &    50,279 &   5,213 &  $10\times$ \\
    \hline
    \end{tabular}
  }%
  \subfigure[Memory usage (in KB)]{\label{tab:ilp:memory}
    \begin{tabular}{||r|r|r|r||} \hline
                \multicolumn{2}{||c|}{Static code}
              & \multicolumn{2}{|c||}{Dynamic code} \\
    \hline
                \multicolumn{1}{||c|}{Clauses} & \multicolumn{1}{c}{Index}
              & \multicolumn{1}{|c|}{Clauses} & \multicolumn{1}{c||}{Index}\\
    \hline
	        60,940 &  46,887 &     630 &     14 \\
	         1,801 &   2,678 &  13,512 &    942 \\
	           666 &     174 &   3,172 &    174 \\
	        46,726 &  22,629 & 116,463 &  9,015 \\
	       146,033 & 129,333 &  53,423 &  1,531 \\
%%	           678 &     117 &   2,047 &     24 \\
%%	         1,866 &     715 &   2,055 &     26 \\
	           802 &     161 &   2,149 &    109 \\
	           774 &     218 &  25,840 & 12,291 \\
 	         5,007 &   2,509 &   4,497 &    759 \\
	         2,317 &     929 & 116,129 &  7,064 \\
    \hline
    \end{tabular}
  }%
\end{table}
%------------------------------------------------------------------------------

%------------------------------------------------------------------------------
\begin{figure}
  \hrule \ \\[-2em]
  \begin{description}
%%  \item[\Krki] tries to learn rules from a small database of chess end-games;
  \item[\GeneExpr] learns rules for yeast gene activity given a
    database of genes, their interactions, and micro-array gene
    expression data; %~\cite{Regulatory@ILP-06};
  \item[\BreastCancer] processes real-life patient reports towards
    predicting whether an abnormality may be
    malignant; %~\cite{DavisBDPRCS@IJCAI-05-short};
  \item[\IEProtein] performs information extraction from paper
    abstracts to search for proteins;
  \item[\Susi] learns from shopping patterns;
  \item[\Mesh] learns rules for finite-element mesh design;
  \item[\Carcino, \Choline, \Pyrimidines] try to predict chemical
    properties of compounds and store them as tables, given their
    chemical composition and major properties;
  \item[\Thermolysin] also manipulates chemical compounds but learns
    from the 3D-structure of a molecule's conformations.
  \end{description}
  \hrule
  \caption{Description of the ILP datasets used in the performance
    comparison of Table~\ref{tab:ilp}}
  \label{fig:ilp:datasets}
\end{figure}
%------------------------------------------------------------------------------

\Paragraph{Space performance.}
Table~\ref{tab:ilp:memory} shows memory usage when using \JITI. The
table presents data obtained at a point near the end of execution;
memory usage should be at the maximum. These applications use a
mixture of static and dynamic predicates and we show their memory
usage separately. On static predicates, memory usage varies widely,
from only 10\% to the worst case, \Carcino, where the index tables
take more space than the original program. Hash tables dominate usage
in \IEProtein and \Susi, whereas \TryRetryTrust chains dominate in
\BreastCancer. In most other cases no single component dominates
memory usage. Memory usage for dynamic predicates is shown in the last
two columns; this data is mostly used to store the search space.
Observe that there is a much lower overhead in this case. A more
detailed analysis shows that most space is occupied by the hash tables
and by internal nodes of the tree, and that relatively little space is
occupied by \TryRetryTrust chains, suggesting that \JITI is behaving
well in practice.


\section{Concluding Remarks}
%===========================
Motivated by the needs of applications in the areas of inductive
logic programming, program analysis, deductive databases, etc.\ to
access large datasets efficiently, we have described a novel but also
simple idea: \emph{indexing Prolog clauses on demand during program
execution}.
%
Given the impressive speedups this idea can provide for many LP
applications, we are a bit surprised similar techniques have not been
explored before. In general, Prolog systems have been reluctant to
perform code optimizations during runtime and our feeling is that LP
implementation has been left a bit behind. We hold that this
should change.
%
Indeed, we see \JITI as only a first, very successful, step towards
effective runtime optimization of logic programs.

As presented, \JITI is a hybrid technique: index generation occurs
during runtime but is partly guided by the compiler, because we want
to combine it with compile-time WAM-style indexing. More flexible
schemes are of course possible. For example, index generation can be
fully dynamic (as in YAP), combined with user declarations, or driven
by static analysis to be even more selective or go beyond fixed-order
indexing.
%
Last, observe that \JITI fully respects Prolog semantics. Better
performance can be achieved in the context of one-solution
computations, or in the context of tabling where the order of clauses and
solutions does not matter and repeated solutions are discarded.

\paragraph{Acknowledgments}

This work is dedicated to the memory of our friend and colleague
Ricardo Lopes. We miss you! V\'{\i}tor Santos Costa was partially
supported by CNPq and would like to acknowledge support received while
visiting at UW-Madison and the support of the YAP user community.
This work has been partially supported by MYDDAS (POSC/EIA/59154/2004)
and by funds granted to LIACC through the Programa de Financiamento
Plurianual, Funda\c{c}\~ao para a Ci\^encia e Tecnologia and Programa POSC.

\Paragraph{Acknowledgments}
%--------------------------
V\'{\i}tor Santos Costa was partially supported by CNPq and would like
to acknowledge support received while visiting at UW-Madison and the
support of the YAP user community. This work has been partially
supported by MYDDAS (POSC/EIA/59154/2004) and by funds granted to
LIACC through the Programa de Financiamento Plurianual, Funda\c{c}\~ao para
a Ci\^encia e Tecnologia and Programa POSC.

%==============================================================================
\begin{thebibliography}{10}

\bibitem{Warren83}
Warren, D.H.D.:
\newblock An abstract {P}rolog instruction set.
\newblock Tech. Note 309, SRI International (1983)

\bibitem{YAP}
Santos~Costa, V., Damas, L., Reis, R., Azevedo, R.:
\newblock {YAP} User's Manual. (2002)

\bibitem{ShallowBacktracking@ICLP-89}
Carlsson, M.:
\newblock On the efficiency of optimising shallow backtracking in compiled
  {Prolog}.
\newblock In Levi, G., Martelli, M., eds.: Proceedings of the Sixth
  ICLP, MIT Press (June 1989)  3--15

\bibitem{IndexingProlog@NACLP-89}
Demoen, B., Mari{\"e}n, A., Callebaut, A.:
\newblock Indexing in {P}rolog.
\newblock In Lusk, E.L., Overbeek, R.A., eds.: Proceedings of NACLP,
  MIT Press (1989)  1001--1012

\bibitem{SWI}
Wielemaker, J.:
\newblock {SWI-Prolog 5.1}: Reference Manual.
\newblock {SWI}, University of Amsterdam, Roetersstraat 15, 1018 WB Amsterdam,
  The Netherlands. (1997--2003)

\bibitem{XSB}
Sagonas, K.F., Swift, T., Warren, D.S., Freire, J., Rao, P.:
\newblock The {XSB} Pro\-grammer's Manual.
\newblock State University of New York at Stony Brook. (1997)

\bibitem{ilProlog}
Tron\c{c}on, R., Janssens, G., Demoen, B., Vandecasteele, H.:
\newblock Fast frequent querying with lazy control flow compilation.
\newblock Theory and Practice of Logic Programming (2007) To appear.

\bibitem{HickeyMudambi@JLP-89}
Hickey, T., Mudambi, S.:
\newblock Global compilation of {P}rolog.
\newblock JLP \textbf{7}(3) (November 1989)  193--230

\bibitem{VRDW87}
{Van Roy}, P., Demoen, B., Willems, Y.D.:
\newblock Improving the execution speed of compiled {Prolog} with modes, clause
  selection and determinism.
\newblock In: TAPSOFT'87, Springer (1987)  111--125

\bibitem{TOAM@ICLP-90}
Zhou, N.F., Takagi, T., Ushijima, K.:
\newblock A matching tree oriented abstract machine for {P}rolog.
\newblock In Warren, D.H.D., Szeredi, P., eds.: ICLP90, MIT Press (1990)
  158--173

\bibitem{UnifFact@POPL-95}
Dawson, S., Ramakrishnan, C.R., Ramakrishnan, I.V., Sagonas, K., Skiena, S.,
  Swift, T., Warren, D.S.:
\newblock Unification factoring for the efficient execution of logic programs.
\newblock In: Conference Record of POPL'95, ACM Press (January 1995)  247--258

\bibitem{Tries@JLP-99}
Ramakrishnan, I.V., Rao, P., Sagonas, K., Swift, T., Warren, D.S.:
\newblock Efficient access mechanisms for tabled logic programs.
\newblock Journal of Logic Programming \textbf{38}(1) (January 1999)  31--54

\bibitem{KligerShapiro@ICLP-88}
Kliger, S., Shapiro, E.:
\newblock A decision tree compilation algorithm for {FCP($|$,:,?)}.
\newblock In: Proceedings of the Fifth ICSLP, MIT Press (August 1988) 1315--1336

\bibitem{Mercury@JLP-96}
Somogyi, Z., Henderson, F., Conway, T.:
\newblock The execution algorithm of {Mercury}, an efficient purely declarative
  logic programming language.
\newblock JLP \textbf{26}(1--3) (December 1996)  17--64

\bibitem{Ciao@SCP-05}
Hermenegildo, M.V., Puebla, G., Bueno, F., L{\'o}pez-Garc\'{\i}a, P.:
\newblock Integrated program debugging, verification, and optimization using
  abstract interpretation (and the {Ciao} system preprocessor).
\newblock Science of Computer Programming \textbf{58}(1--2) (2005)  115--140

\bibitem{FreezeIndexing@ICLP-87}
Carlsson, M.:
\newblock Freeze, indexing, and other implementation issues in the {WAM}.
\newblock In Lassez, J.L., ed.: Proceedings of the Fourth ICLP,
  MIT Press (May 1987)  40--58

\bibitem{ALEPH}
Srinivasan, A.:
\newblock The Aleph Manual. (2001)

\end{thebibliography}
%==============================================================================

\end{document}