From a25072d5b4a2a451622eef115a116ed6630e43a0 Mon Sep 17 00:00:00 2001 From: kostis Date: Fri, 8 Jun 2007 15:34:49 +0000 Subject: [PATCH] Version with all redundancies cut -- fits in 15 pages. git-svn-id: https://yap.svn.sf.net/svnroot/yap/trunk@1899 b08c6af1-5177-4d33-ba66-4b1c6b8b522a --- docs/index/iclp07.tex | 782 +++++++++++++++++++++++------------------- 1 file changed, 427 insertions(+), 355 deletions(-) diff --git a/docs/index/iclp07.tex b/docs/index/iclp07.tex index e2d70aa50..4c444ffe5 100644 --- a/docs/index/iclp07.tex +++ b/docs/index/iclp07.tex @@ -1,7 +1,7 @@ %============================================================================== \documentclass{llncs} %------------------------------------------------------------------------------ -\usepackage{a4wide} +\usepackage[latin1]{inputenc} \usepackage{float} \usepackage{alltt} \usepackage{xspace} @@ -11,6 +11,8 @@ \renewcommand{\rmdefault}{ptm} %------------------------------------------------------------------------------ +\newcommand{\Paragraph}[1]{\vspace*{-.5em}\paragraph{#1}} +%------------------------------------------------------------------------------ \floatstyle{ruled} \newfloat{Algorithm}{ht}{lop} %------------------------------------------------------------------------------ @@ -45,7 +47,7 @@ \newcommand{\tcDoo}{\bench{tc\_d\_oo}\xspace} \newcommand{\compress}{\bench{compress}\xspace} \newcommand{\sgCyl}{\bench{sg\_cyl}\xspace} -\newcommand{\muta}{\bench{mutagenesis}\xspace} +\newcommand{\muta}{\bench{muta}\xspace} \newcommand{\pta}{\bench{pta}\xspace} \newcommand{\tea}{\bench{tea}\xspace} %------------------------------------------------------------------------------ @@ -80,11 +82,13 @@ \newcommand{\TODO}[1]{\TODOcomment{}{#1}} %------------------------------------------------------------------------------ -\title{Demand-Driven Indexing of Prolog Clauses} +\title{Demand-Driven Indexing of Prolog Clauses\thanks{Dedicated to + the memory of our friend, colleague and co-author Ricardo Lopes. 
+ We miss you!}} \titlerunning{Demand-Driven Indexing of Prolog Clauses} \author{V\'{\i}tor Santos Costa\inst{1} \and Konstantinos - Sagonas\inst{2} \and Ricardo Lopes\inst{1}} + Sagonas\inst{2} \and Ricardo Lopes} \authorrunning{V. Santos Costa, K. Sagonas and R. Lopes} \institute{ @@ -93,8 +97,6 @@ National Technical University of Athens, Greece } -\pagestyle{plain} % For the submission only - \begin{document} \maketitle @@ -104,20 +106,18 @@ any- and multi-argument indexing becomes more and more profound. Static generation of multi-argument indexing is one alternative, but applications often rely on features that are inherently dynamic - (e.g., generating hypotheses for ILP data sets during runtime) which - makes static techniques inapplicable or inaccurate. Another - alternative, which has not been investigated so far, is to employ - dynamic schemes for flexible demand-driven indexing of Prolog - clauses. We propose such schemes and discuss issues that need to be - addressed for their efficient implementation in the context of - WAM-based Prolog systems. We have implemented demand-driven indexing - in two different Prolog systems and have been able to obtain - non-negligible performance speedups: from a few percent up to orders - of magnitude. Given these results, we see very little reason for - Prolog systems not to incorporate some form of dynamic indexing - based on actual demand. In fact, we see demand-driven indexing as - the first step towards effective runtime optimization of Prolog - programs. + which makes static techniques inapplicable or inaccurate. Another + alternative is to employ dynamic schemes for flexible demand-driven + indexing of Prolog clauses. We propose such schemes and discuss + issues that need to be addressed for their efficient implementation + in the context of WAM-based Prolog systems. 
We have implemented + demand-driven indexing in two different Prolog systems and have been + able to obtain non-negligible performance speedups: from a few + percent up to orders of magnitude. Given these results, we see very + little reason for Prolog systems not to incorporate some form of + dynamic indexing based on actual demand. In fact, we see + demand-driven indexing as the first step towards effective runtime + optimization of Prolog programs. \end{abstract} @@ -167,20 +167,22 @@ incorporate some form of indexing based on actual demand from queries. In fact, we see \JITI as only the first step towards effective runtime optimization of Prolog programs. -This paper is structured as follows. After commenting on the state of -the art and related work concerning indexing in Prolog systems -(Sect.~\ref{sec:related}) we briefly review indexing in the WAM -(Sect.~\ref{sec:prelims}). We then present \JITI schemes for static -(Sect.~\ref{sec:static}) and dynamic (Sect.~\ref{sec:dynamic}) -predicates, their implementation in two Prolog systems -(Sect.~\ref{sec:impl}) and the performance benefits they bring -(Sect.~\ref{sec:perf}). The paper ends with some concluding remarks. +\Paragraph{Organization} +%----------------------- +After commenting on the state of the art and related work concerning +indexing in Prolog systems (Sect.~\ref{sec:related}) we briefly review +indexing in the WAM (Sect.~\ref{sec:prelims}). We then present \JITI +schemes for static (Sect.~\ref{sec:static}) and dynamic +(Sect.~\ref{sec:dynamic}) predicates, their implementation in two +Prolog systems (Sect.~\ref{sec:impl}) and the performance benefits +they bring (Sect.~\ref{sec:perf}). The paper ends with some concluding +remarks. 
\section{State of the Art and Related Work} \label{sec:related} %============================================================== % Indexing in Prolog systems: -To the best of our knowledge, many Prolog systems still only support +Many Prolog systems still only support indexing on the main functor symbol of the first argument. Some others, like YAP version 4, can look inside some compound terms~\cite{YAP}. SICStus Prolog supports \emph{shallow @@ -190,17 +192,16 @@ clause body. While shallow backtracking avoids some of the performance problems of unnecessary choice point creation, it does not offer the full benefits that indexing can provide. Other systems like BIM-Prolog~\cite{IndexingProlog@NACLP-89}, SWI-Prolog~\cite{SWI} and -XSB~\cite{XSB} allow for user-controlled multi-argument indexing (via -an \code{:-~index} directive). Notably, ilProlog~\cite{ilProlog} uses -compile-time heuristics and generates code for multi-argument indexing -automatically. In all these systems, this support comes with various -implementation restrictions. For example, in SWI-Prolog at most four -arguments can be indexed; in XSB the compiler does not offer -multi-argument indexing and the predicates need to be asserted -instead; we know of no system where multi-argument indexing looks -inside compound terms. More importantly, requiring users to specify -arguments to index on is neither user-friendly nor guarantees good -performance results. +XSB~\cite{XSB} allow for user-controlled multi-argument indexing. +Notably, ilProlog~\cite{ilProlog} uses compile-time heuristics and +generates code for multi-argument indexing automatically. In all these +systems, this support comes with various implementation restrictions. +For example, in SWI-Prolog at most four arguments can be indexed; in +XSB the compiler does not offer multi-argument indexing and the +predicates need to be asserted instead; we know of no system where +multi-argument indexing looks inside compound terms. 
More importantly, +requiring users to specify arguments to index on is neither +user-friendly nor guarantees good performance results. % Trees, tries and unification factoring: Recognizing the need for better indexing, researchers have proposed @@ -265,93 +266,95 @@ in the next sections. \section{Indexing in the WAM} \label{sec:prelims} %================================================ -To make the paper relatively self-contained we briefly review the -indexing instructions of the WAM and their use. In the WAM, the first -level of dispatching involves a test on the type of the argument. The +To make the paper relatively self-contained we review the indexing +instructions of the WAM and their use. In the WAM, the first level of +dispatching involves a test on the type of the argument. The \switchONterm instruction checks the tag of the dereferenced value in the first argument register and implements a four-way branch where one branch is for the dereferenced register being an unbound variable, one for being atomic, one for (non-empty) list, and one for structure. In -any case, control goes to a (possibly empty) bucket of clauses. In the -buckets for constants and structures the second level of dispatching -involves the value of the register. The \switchONconstant and -\switchONstructure instructions implement this dispatching: typically -with a \fail instruction when the bucket is empty, with a \jump -instruction for only one clause, with a sequential scan when the -number of clauses is small, and with a hash lookup when the number of -clauses exceeds a threshold. For this reason the \switchONconstant and +any case, control goes to a bucket of clauses. In the buckets for +constants and structures the second level of dispatching involves the +value of the register. 
The \switchONconstant and \switchONstructure +instructions implement this dispatching: typically with a \fail +instruction when the bucket is empty, with a \jump instruction for +only one clause, with a sequential scan when the number of clauses is +small, and with a hash lookup when the number of clauses exceeds a +threshold. For this reason the \switchONconstant and \switchONstructure instructions take as arguments the hash table -\instr{T} and the number of clauses \instr{N} the table contains (or -equivalently, \instr{N} is the size of the hash table). In each bucket -of this hash table and also in the bucket for the variable case of -\switchONterm the code sequentially backtracks through the clauses -using a \TryRetryTrust chain of instructions. The \try instruction -sets up a choice point, the \retry instructions (if~any) update -certain fields of this choice point, and the \trust instruction +\instr{T} and the number of clauses \instr{N} the table contains. In +each bucket of this hash table and also in the bucket for the variable +case of \switchONterm the code sequentially backtracks through the +clauses using a \TryRetryTrust chain of instructions. The \try +instruction sets up a choice point, the \retry instructions (if~any) +update certain fields of this choice point, and the \trust instruction removes it. The WAM has additional indexing instructions (\instr{try\_me\_else} and friends) that allow indexing to be interspersed with the code of -clauses. For simplicity of presentation we will not consider them -here. This is not a problem since the above scheme handles all programs. -Also, we will feel free to do some minor modifications and -optimizations when this simplifies things. +clauses. We will not consider them here. This is not a problem since +the above scheme handles all programs. Also, we will feel free to do +some minor modifications and optimizations when this simplifies +things. -We present an example. 
Consider the Prolog code shown in -Fig.~\ref{fig:carc:facts}. It is a fragment of the machine -learning dataset \textit{Carcinogenesis}~\cite{Carcinogenesis@ILP-97}. -The five clauses get compiled to the WAM code shown in -Fig.~\ref{fig:carc:clauses}. The first argument indexing indexing code -that a Prolog compiler generates is shown in -Fig.~\ref{fig:carc:index}. This code is typically placed before the -code for the clauses and the \switchONconstant instruction is the -entry point of predicate. Note that compared with vanilla WAM this -instruction has an extra argument: the register on the value of which -we index ($r_1$). This extra argument will allow us to go beyond -first argument indexing. Another departure from the WAM is that if -this argument register contains an unbound variable instead of a -constant then execution will continue with the next instruction; in -effect we have merged part of the functionality of \switchONterm into -the \switchONconstant instruction. This small change in the behavior -of \switchONconstant will allow us to get \JITI. Let's see how. +Let's see an example. Consider the Prolog code shown in +Fig.~\ref{fig:carc:facts}, a fragment of the machine learning dataset +\textit{Carcinogenesis}. +% +These clauses get compiled to the WAM code shown in +Fig.~\ref{fig:carc:clauses}. The first argument indexing code that a +Prolog compiler generates is shown in Fig.~\ref{fig:carc:index}. This +code is typically placed before the code for the clauses and the +\switchONconstant is the entry point of the predicate. Note that compared +with vanilla WAM this instruction has an extra argument: the register +on the value of which we index ($r_1$). This extra argument will allow +us to go beyond first argument indexing. 
Another departure from the +WAM is that if this argument register contains an unbound variable +instead of a constant then execution will continue with the next +instruction; in effect we have merged part of the functionality of +\switchONterm into the \switchONconstant instruction. This small +change in the behavior of \switchONconstant will allow us to get +\JITI. Let's see how. %------------------------------------------------------------------------------ \begin{figure}[t] \centering -\subfigure[Some Prolog clauses\label{fig:carc:facts}]{% - \begin{ScriptProg} - has\_property(d1,salmonella,p).\\ - has\_property(d1,salmonella\_n,p).\\ - has\_property(d2,salmonella,p). \\ - has\_property(d2,cytogen\_ca,n).\\ - has\_property(d3,cytogen\_ca,p). - \end{ScriptProg} -}% -\subfigure[WAM indexing\label{fig:carc:index}]{% - \begin{sf} - \begin{\wamcodesize} - \begin{tabular}[b]{l} - \switchONconstant $r_1$ 5 $T_1$ \\ - \try $L_1$ \\ - \retry $L_2$ \\ - \retry $L_3$ \\ - \retry $L_4$ \\ - \trust $L_5$ \\ - \\ - \begin{tabular}[b]{r|c@{\ }|l|} - \Cline - $T_1$: & \multicolumn{2}{c|}{Hash Table Info}\\ \Cline\Cline - \ & d1 & \try $L_1$ \\ - \ & & \trust $L_2$ \\ \Cline - \ & d2 & \try $L_3$ \\ - \ & & \trust $L_4$ \\ \Cline - \ & d3 & \jump $L_5$ \\ - \Cline +\begin{tabular}[b]{c} + \subfigure[Some Prolog clauses\label{fig:carc:facts}]{% + \begin{ScriptProg} + has\_property(d1,salmonella,p).\\ + has\_property(d1,salmonella\_n,p).\\ + has\_property(d2,salmonella,p). 
\\ + has\_property(d2,cytogen\_ca,n).\\ + has\_property(d3,cytogen\_ca,p).\\[5pt] + \end{ScriptProg} + }\\ \hline\hline% + \subfigure[WAM indexing\label{fig:carc:index}]{% + \begin{sf} + \begin{\wamcodesize} + \begin{tabular}[b]{l} + \switchONconstant $r_1$ 5 $T_1$ \\ + \try $L_1$ \\ + \retry $L_2$ \\ + \retry $L_3$ \\ + \retry $L_4$ \\ + \trust $L_5$ \\ + \\ + \begin{tabular}[b]{r|c@{\ }|l|} + \Cline + $T_1$: & \multicolumn{2}{c|}{Hash Table Info}\\ \Cline\Cline + \ & d1 & \try $L_1$ \\ + \ & & \trust $L_2$ \\ \Cline + \ & d2 & \try $L_3$ \\ + \ & & \trust $L_4$ \\ \Cline + \ & d3 & \jump $L_5$ \\ + \Cline + \end{tabular}\\[3pt] \end{tabular} - \end{tabular} \end{\wamcodesize} - \end{sf} -}% + \end{sf} + }% +\end{tabular}% \subfigure[Code for the clauses\label{fig:carc:clauses}]{% \begin{sf} \begin{\wamcodesize} @@ -410,6 +413,7 @@ of \switchONconstant will allow us to get \JITI. Let's see how. \caption{Part of the Carcinogenesis dataset and WAM code that a byte code compiler generates} \label{fig:carc} +\vspace*{-1em} \end{figure} %------------------------------------------------------------------------------ @@ -450,7 +454,7 @@ argument. Recall that the entry point of the predicate is the \switchONconstant instruction. The \jitiONconstant $r_i$ \instr{N A} instruction works as follows: \begin{itemize} -\item if the argument register $r_i$ is a free variable, then +\item if the argument $r_i$ is a free variable, execution continues with the next instruction; \item otherwise, \JITI kicks in as follows. The abstract machine scans the WAM code of the clauses and creates an index table for the @@ -474,7 +478,7 @@ appropriately patched. The call that triggered \JITI and subsequent calls of the same mode will use table $T_2$. The index for the second argument has been created. 
%------------------------------------------------------------------------------ -\begin{figure} +\begin{figure}[t] \centering \begin{sf} \begin{\wamcodesize} @@ -515,7 +519,7 @@ argument has been created. \end{\wamcodesize} \end{sf} \caption{WAM code after demand-driven indexing for argument 2; - table $T_2$ is generated dynamically} + $T_2$ is generated dynamically} \label{fig:carg:jiti_single:after} \end{figure} %------------------------------------------------------------------------------ @@ -528,24 +532,23 @@ during runtime (e.g. execution encounters only open calls or with only the first argument bound), the extra overhead is minimal: the execution of some \jitiONconstant instructions for the open call only. % -In short, this is a simple scheme that allows for \JITI on \emph{any +In short, this is a simple scheme that allows for indexing on \emph{any single} argument. At least for big sets of Datalog facts, we see -little reason not to use this indexing scheme. +little reason not to use it. -\paragraph*{Optimizations.} +\Paragraph{Optimizations.} Because we are dealing with static code, there are opportunities for some easy optimizations. Suppose we statically determine that there will never be any calls with \code{in} mode for some arguments or that -these arguments are not discriminating enough.\footnote{In our example, -suppose the third argument of \code{has\_property/3} had the atom -\code{p} as value throughout.} Then we can avoid generating -\jitiONconstant instructions for them. Also, suppose we detect or -heuristically decide that some arguments are most likely than others -to be used in the \code{in} mode. Then we can simply place the -\jitiONconstant instructions for these arguments \emph{before} the -instructions for other arguments. This is possible since all indexing -instructions take the argument register number as an argument; their -order does not matter. 
+these arguments are not discriminating enough.\footnote{In our +example, suppose the third argument of \code{has\_property/3} was the +atom \code{p} throughout.} Then we can avoid generating +\jitiONconstant instructions for them. Also, suppose we know that some +arguments are most likely than others to be used in the \code{in} +mode. Then we can simply place the \jitiONconstant instructions for +them \emph{before} the instructions for other arguments. This is +possible since all indexing instructions take the argument register +number as an argument; their order does not matter. \subsection{From any argument indexing to multi-argument indexing} %----------------------------------------------------------------- @@ -556,28 +559,29 @@ straightforward way. Note that the compiler knows exactly the set of clauses that need to be tried for each query with a specific symbol in the first argument. -This information is needed in order to construct, at compile time, the -hash table $T_1$ of Fig.~\ref{fig:carc:index}. For multi-argument -\JITI, instead of generating for each hash bucket only \TryRetryTrust -instructions, the compiler can prepend appropriate demand indexing -instructions. We illustrate this on our running example. The table -$T_1$ contains four \jitiONconstant instructions: two for each of the -remaining two arguments of hash buckets with more than one -alternative. For hash buckets with none or only one alternative (e.g., -for \code{d3}'s bucket) there is obviously no need to resort to \JITI -for the remaining arguments. Figure~\ref{fig:carc:jiti_multi} shows -the state of the hash tables after the execution of queries -\code{has\_property(C,salmonella,T)}, which creates table $T_2$, and -\code{has\_property(d2,P,n)} which creates the $T_3$ table and -transforms the \jitiONconstant instruction for \code{d2} and register -$r_3$ to the appropriate \switchONconstant instruction. 
+% This information is needed in order to construct, at compile time, the +% hash table $T_1$ of Fig.~\ref{fig:carc:index}. +For multi-argument \JITI, instead of generating for each hash bucket +only \TryRetryTrust instructions, the compiler can prepend appropriate +demand indexing instructions. We illustrate this on our running +example. The table $T_1$ contains four \jitiONconstant instructions: +two for each of the remaining two arguments of hash buckets with more +than one alternative. For hash buckets with none or only one +alternative (e.g., for \code{d3}'s bucket) there is obviously no need +to resort to \JITI for the remaining arguments. +Figure~\ref{fig:carc:jiti_multi} shows the state of the hash tables +after the execution of queries \code{has\_property(C,salmonella,T)}, +which creates $T_2$, and \code{has\_property(d2,P,n)} which creates +the $T_3$ table and transforms the \jitiONconstant instruction for +\code{d2} and register $r_3$ to the appropriate \switchONconstant +instruction. %------------------------------------------------------------------------------ \begin{figure}[t] \centering \begin{sf} \begin{\wamcodesize} - \begin{tabular}{@{}cccc@{}} + \begin{tabular}{@{}c@{}c@{}c@{}} \begin{tabular}{l} \switchONconstant $r_1$ 5 $T_1$ \\ \switchONconstant $r_2$ 5 $T_2$ \\ @@ -604,6 +608,7 @@ $r_3$ to the appropriate \switchONconstant instruction. \Cline \end{tabular} & + \begin{tabular}{c} \begin{tabular}{r|c@{\ }|l|} \Cline $T_2$: & \multicolumn{2}{|c|}{Hash Table Info}\\ \Cline\Cline @@ -616,7 +621,8 @@ $r_3$ to the appropriate \switchONconstant instruction. \ & & \trust $L_5$ \\ \Cline \end{tabular} - & + \\ + \ \\ \begin{tabular}{r|c@{\ }|l|} \Cline $T_3$: & \multicolumn{2}{|c|}{Hash Table Info}\\ \Cline\Cline @@ -624,16 +630,17 @@ $r_3$ to the appropriate \switchONconstant instruction. 
\ & \code{n} & \jump $L_4$ \\ \Cline \end{tabular} + \end{tabular} \end{tabular} \end{\wamcodesize} \end{sf} - \caption{\JITI for all argument combinations; - table $T_1$ is static; $T_2$ and $T_3$ are generated dynamically} + \caption{\JITI for all arguments; + $T_1$ is static; $T_2$ and $T_3$ are created dynamically} \label{fig:carc:jiti_multi} \end{figure} %------------------------------------------------------------------------------ -\paragraph{Implementation issues.} +\Paragraph{Implementation issues.} In the \jitiONconstant instructions of Fig.~\ref{fig:carc:jiti_multi} notice the integer 2 which denotes the number of clauses that the instruction will index. Using this number an index table of @@ -664,14 +671,13 @@ requires the following extensions: have a regular structure, the abstract machine needs to be able to ``walk'' the byte code instructions and recover the symbols on which indexing will be based. Writing such a code walking procedure is not - hard.\footnote{In many Prolog systems, a procedure with similar - functionality often exists for the disassembler, the debugger, etc.} + hard. \item Indexing on a position that contains unconstrained variables for some clauses is tricky. The WAM needs to group clauses in this case and without special treatment creates two choice points for this argument (one for the variables and one per each group of clauses). However, this issue and how to deal with it is well-known - by now. Possible solutions to it are described in a 1987 paper by + by now. Possible solutions to it are described in a paper by Carlsson~\cite{FreezeIndexing@ICLP-87} and can be readily adapted to \JITI. Alternatively, in a simple implementation, we can skip \JITI for positions with variables in some clauses. @@ -686,11 +692,11 @@ design decisions whose rationale may not be immediately obvious: these indices). 
\item On the other hand, we generate \jitiSTAR instructions at compile time for the head arguments.\footnote{The \jitiSTAR instructions for - the $T_1$ table can be generated either by the compiler or by the - loader.} This does not noticeably increase the generated byte code - but it greatly simplifies code loading. Notice that a nice property - of the scheme we have described is that the loaded byte code can be - patched \emph{without} the need to move any instructions. + $T_1$ can be generated either by the compiler or the loader.} This + does not noticeably increase the generated byte code but it greatly + simplifies code loading. Notice that a nice property of the scheme + we have described is that the loaded byte code can be patched + \emph{without} the need to move any instructions. % The indexing tables are typically not intersperced with the byte code. \item Finally, one may wonder why the \jitiSTAR instructions create the dynamic index tables with an additional code walking pass @@ -698,18 +704,16 @@ design decisions whose rationale may not be immediately obvious: the main \TryRetryTrust chain. Main reasons are: 1) in many cases the code walking can be selective and guided by offsets and 2) by first creating the index table and then using it we speed up the - execution of the queries encountered during runtime and often avoid - unnecessary choice point creations. + execution of the queries and often avoid unnecessary choice point + creations. \end{itemize} -This is \JITI as we have implemented it. -% in one of our Prolog systems. -However, we note that these decisions are orthogonal to the main idea -and are under compiler control. If, for example, analysis determines -that some argument sequences will never demand indexing we can simply -avoid generation of \jitiSTAR instructions for these. 
Similarly, if we -determine that some argument sequences will definitely demand indexing -we can speed up execution by generating the appropriate index tables -at compile time instead of at runtime. +Note that all these decisions are orthogonal to the main idea and are +under compiler control. For example, if analysis determines that some +argument sequences will never demand indexing we can simply avoid +generation of \jitiSTAR instructions for them. Similarly, if some +argument sequences will definitely demand indexing we can speed up +execution by generating the appropriate tables at compile time instead +of dynamically. \subsection{Demand-driven index construction and its properties} %--------------------------------------------------------------- @@ -717,12 +721,12 @@ The idea behind \JITI can be captured in a single sentence: \emph{we can generate every index we need during program execution when this index is demanded}. Subsequent uses of these indices can speed up execution considerably more than the time it takes to construct them -(more on this below) so this runtime action makes sense.\footnote{In -fact, because choice points are expensive in the WAM, \JITI can speed -up even the execution of the query that triggers the process, not only -subsequent queries.} +(more on this below) so this runtime action makes sense.%\footnote{In +%fact, because choice points are expensive in the WAM, \JITI can speed +%up even the execution of the query that triggers the process, not only +%subsequent queries.} % -We describe the process of demand-driven index construction. +%We describe the process of demand-driven index construction. % \subsubsection{Demand-driven index construction} %------------------------------------------------- @@ -758,18 +762,17 @@ instruction. 
\label{alg:construction} \begin{enumerate} \item if the current instruction $I$ is a \switchSTAR, \try, \retry, - \trust or \jump, the action is an in the WAM; + \trust or \jump, act as in the WAM; \item if the current instruction $I$ is a \jitiSTAR with arguments $r, - l$, and $k$ where $r$ is a register then + l$, and $k$ ($r$ is a register) then \begin{enumerate} - \item[2.1] if register $r$ contains a variable, the action is simply to + \item[2.1] if register $r$ contains a variable, the action is a \instr{goto} the next instruction in the node; \item[2.2] if register $r$ contains a value $v$, the action is to - dynamically construct the index as follows: + dynamically construct the index: \begin{itemize} \item[2.2.1] collect the subsequent instructions in a list $\cal I$ - until the next instruction is a \try;\footnote{Note that there - will always be a \try following a \jitiSTAR instruction.} + until the next instruction is a \try; \item[2.2.2] for each label $L$ in the \TryRetryTrust chain inspect the code of the clause with label $L$ to find the symbol~$c$ associated with register $r$ in the clause; (This @@ -795,7 +798,7 @@ instruction. and ${\cal L}_s$ for constants, lists and structures, respectively; \item for each of the four sequences ${\cal L}, {\cal L}_c, - {\cal L}_l, {\cal L}_s$ of labels create code as follows: + {\cal L}_l, {\cal L}_s$ of labels create code: \begin{itemize} \item the instruction \fail if the sequence is empty; \item the instruction \jump $L$ if $L$ is the only label in @@ -807,15 +810,16 @@ instruction. \end{itemize} \end{itemize} \item[2.2.4] transform the \jitiSTAR $r, l, k$ instruction to - a \switchSTAR $r, l, {\cal T}$ instruction; and + a \switchSTAR $r, l, {\cal T}$ instruction; \item[2.2.5] continue execution with this instruction. 
\end{itemize} \end{enumerate} \end{enumerate} +\vspace*{-.7em} \end{Algorithm} %------------------------------------------------------------------------- -\paragraph*{Complexity properties.} +\Paragraph{Complexity properties.} Index construction during runtime does not change the complexity of query execution. First, note that each demanded index table will be constructed at most once. Also, a \jitiSTAR instruction will be @@ -832,23 +836,23 @@ visit a subset of these clauses as the index table will be consulted. %% predicate. Thus, in cases where \JITI is not effective, execution of a query will at most double due to dynamic index construction. In fact, this worst -case is pessimistic and extremely unlikely in practice. On the other -hand, \JITI can change the complexity of query evaluation from $O(n)$ -to $O(1)$ where $n$ is the number of clauses. +case is pessimistic and unlikely in practice. On the other hand, \JITI +can change the complexity of query evaluation from $O(n)$ to $O(1)$ +where $n$ is the number of clauses. \subsection{More implementation choices} %--------------------------------------- The observant reader has no doubt noticed that Algorithm~\ref{alg:construction} provides multi-argument indexing but -only for the main functor symbol of arguments. For clauses with -compound terms that require indexing in their sub-terms we can either -employ a program transformation like \emph{unification +only for the main functor symbol. For clauses with compound terms that +require indexing in their sub-terms we can either employ a program +transformation like \emph{unification factoring}~\cite{UnifFact@POPL-95} at compile time or modify the algorithm to consider index positions inside compound terms. This is relatively easy to do but requires support from the register allocator -(passing the sub-terms of compound terms in appropriate argument -registers) and/or a new set of instructions. Due to space limitations -we omit further details. 
+(passing the sub-terms of compound terms in appropriate registers) +and/or a new set of instructions. Due to space limitations we omit +further details. Algorithm~\ref{alg:construction} relies on a procedure that inspects the code of a clause and collects the symbols associated with some @@ -861,18 +865,17 @@ exist in the body of the clause (e.g., type tests such as \code{var(X)}, \code{atom(X)}, aliasing constraints such as \code{X = Y}, numeric constraints such as \code{X > 0}, etc). -A reasonable concern for \JITI is increased memory consumption during -runtime due to the creation of index tables. In our experience, this -does not seem to be a problem in practice since most applications do -not have demand for indexing on many argument combinations. In -applications where it does become a problem or when running in an -environment with limited memory, we can easily put a bound on the size -of index tables, either globally or for each predicate separately. For -example, the \jitiSTAR instructions can either become inactive when -this limit is reached, or better yet we can recover the space of some -tables. To do so, we can employ any standard recycling algorithm -(e.g., least recently used) and reclaim the memory of index tables -that are no longer in use. This is easy to do by reverting the +A reasonable concern for \JITI is increased memory consumption. In our +experience, this does not seem to be a problem in practice since most +applications do not have demand for indexing on many argument +combinations. In applications where it does become a problem or when +running in an environment with limited memory, we can easily put a +bound on the size of index tables, either globally or for each +predicate separately. For example, the \jitiSTAR instructions can +either become inactive when this limit is reached, or better yet we +can recover the space of some tables. 
To do so, we can employ any +standard recycling algorithm (e.g., LRU) and reclaim the memory of +index tables no longer in use. This is easy to do by reverting the corresponding \switchSTAR instructions back to \jitiSTAR instructions. If the indices are demanded again at a time when memory is available, they can simply be regenerated. @@ -890,14 +893,14 @@ complications: expand and possibly shrink multiple code chunks after code updates. \item We do not know a priori which are the best index positions and cannot determine whether indexing on some arguments is avoidable. -\item Supporting the so-called logical update (LU) semantics of the - ISO Prolog standard becomes harder. +\item Supporting the logical update (LU) semantics of ISO Prolog + becomes harder. \end{itemize} -We will briefly discuss possible ways of addressing these issues. -However, we note that Prolog systems typically provide indexing for +We briefly discuss possible ways of addressing these issues. +However, note that Prolog systems typically provide indexing for dynamic predicates and thus already deal in some way or another with these issues; \JITI makes the problems more involved but not -fundamentally different than those with only first argument indexing. +fundamentally different than with only first argument indexing. The first complication suggests that we should allocate memory for dynamic indices in separate chunks, so that these can be expanded and @@ -905,28 +908,28 @@ deallocated independently. Indeed, this is what we do. % Regarding the second complication, in the absence of any other information, the only alternative is to generate indices for all -arguments. As optimizations, we can avoid indexing for predicates with -only one clause (these are often used to simulate global variables) -and we can exclude arguments where some clause has a variable. +arguments. 
As optimizations, we can avoid indexing predicates with
+only one clause and exclude arguments where some clause has a
+variable.
 
-Under logical update semantics calls to dynamic predicates execute in a
-``snapshot'' of the corresponding predicate. In other words, each call
-sees the clauses that existed at the time when the call was made, even if
-some of the clauses were later deleted or new clauses were asserted.
-If several calls are alive in the stack, several snapshots will be
-alive at the same time. The standard solution to this problem is to
-use time stamps to tell which clauses are \emph{live} for which calls.
+Under LU semantics, calls to dynamic predicates execute in a
+``snapshot'' of the corresponding predicate. Each call sees the
+clauses that existed at the time when the call was made, even if some
+of the clauses were later deleted or new clauses were asserted. If
+several calls are alive in the stack, several snapshots will be alive
+at the same time. The standard solution to this problem is to use time
+stamps to tell which clauses are \emph{live} for which calls.
 %
 This solution complicates freeing index tables because: (1) an index
-table holds references to clauses, and (2) the table may be in use,
-that is, it may be accessible from the execution stacks. An index
+table holds references to clauses, and (2) the table may be in use
+(i.e., may be accessible from the execution stacks). An index
 table thus is killed in several steps:
 \begin{enumerate}
 \item Detach the index table from the indexing tree.
-\item Recursively \emph{kill} every child of the current table:
-  if the current table is killed, so are its children.
+\item Recursively \emph{kill} every child of the current table; if a
+  table is killed, so are its children.
 \item Wait until the table is not in use, that is, it is not pointed
-  to by someone.
+  to from anywhere.
 \item Walk the table and release any references it may hold.
 \item Physically recover space.
\end{enumerate} @@ -944,21 +947,17 @@ presented in Sect.~\ref{sec:static}. The compiler uses heuristics to determine the best argument to index on (i.e., this argument is not necessarily the first) and employs \switchSTAR instructions for this task. It also statically generates \jitiONconstant instructions for -other arguments that are good candidates for \JITI. -Currently, an argument is considered a good candidate if it has only -constants or only structure symbols in all clauses. Thus, XXX uses -only \jitiONconstant and \jitiONstructure instructions, never a +other arguments that are good candidates for \JITI. Currently, an +argument is considered a good candidate if it has only constants or +only structure symbols in all clauses. Thus, XXX uses only +\jitiONconstant and \jitiONstructure instructions, never a \jitiONterm. Also, XXX does not perform \JITI inside structure -symbols.\footnote{Instead, it prompts its user to request unification -factoring for predicates that look likely to benefit from indexing -inside compound terms. The user can then use the appropriate compiler -directive for these predicates.} For dynamic predicates, \JITI is -employed only if they consist of Datalog facts; if a clause which is -not a Datalog fact is asserted, all dynamically created index tables -for the predicate are simply removed and the \jitiONconstant -instruction becomes a \instr{noop}. All this is done automatically, -but the user can disable \JITI in compiled code using an appropriate -compiler option. +symbols. For dynamic predicates, \JITI is employed only if they +consist of Datalog facts; if a clause which is not a Datalog fact is +asserted, all dynamically created index tables for the predicate are +simply removed and the \jitiONconstant instruction becomes a +\instr{noop}. All this is done automatically, but the user can disable +\JITI in compiled code using an option. YAP implements \JITI since version 5. 
The current implementation supports static code, dynamic code, and the internal database. It @@ -974,33 +973,73 @@ very much the same algorithm as static indexing: the key idea is that most nodes in the index tree must be allocated separately so that they can grow or shrink independently. YAP can index arguments where some clauses have unconstrained variables, but only for static predicates, -as in dynamic code this would complicate support for logical update -semantics. +as in dynamic code this would complicate support for LU semantics. YAP uses the term JITI (Just-In-Time Indexing) to refer to \JITI. In the next section we will take the liberty to use this term as a convenient abbreviation. + \section{Performance Evaluation} \label{sec:perf} %================================================ -We evaluate \JITI on a set of benchmarks and applications. +We evaluate JITI on a set of benchmarks and applications. Throughout, we compare performance of JITI with first argument indexing. For the benchmarks of Sect.~\ref{sec:perf:ineffective} and~\ref{sec:perf:effective} which involve both systems, we used a -2.4~GHz P4-based laptop with 512~MB of memory running Linux. +2.4~GHz P4-based laptop with 512~MB of memory. % and report times in milliseconds. For the benchmarks of Sect.~\ref{sec:perf:ILP} which involve YAP~5.1.2 only, we used a 8-node cluster, where each node is a dual-core AMD~2600+ machine with 2GB of memory. % and report times in seconds. +%------------------------------------------------------------------------------ +\begin{table}[t] + \centering + \caption{Performance of some benchmarks with 1st vs. 
\JITI (times in msecs)} + \vspace*{-1em} + \subfigure[When JITI is ineffective]{% + \label{tab:ineffective} + \begin{tabular}[b]{|l||r|r||r|r|} \hline + & \multicolumn{2}{|c||}{\bf YAP} & \multicolumn{2}{|c|}{\bf XXX} \\ + \cline{2-5} + Benchmark & 1st & JITI & 1st & JITI \\ + \hline + \tcLio (8000) & 13 & 14 & 4 & 4 \\ + \tcRio (2000) & 1445 & 1469 & 614 & 615 \\ + \tcDio ( 400) & 3208 & 3260 & 2338 & 2300 \\ + \tcLoo (2000) & 3935 & 3987 & 2026 & 2105 \\ + \tcRoo (2000) & 2841 & 2952 & 1502 & 1512 \\ + \tcDoo ( 400) & 3735 & 3805 & 4976 & 4978 \\ + \compress & 3614 & 3595 & 2875 & 2848 \\ + \hline + \end{tabular} + }% + \subfigure[When JITI is effective]{ + \label{tab:effective} + \begin{tabular}[b]{|l||r|r|r||r|r|r|} \hline + & \multicolumn{3}{|c||}{\bf YAP} & \multicolumn{3}{|c|}{\bf XXX} \\ + \cline{2-7} + & 1st & JITI &{\bf ratio}& 1st & JITI &{\bf ratio}\\ + \hline + \sgCyl & 2,864 & 24 & $119\times$& 2,390 & 28 & $85\times$\\ + \muta & 30,057 &16,782 &$1.79\times$&26,314 &21,574 &$1.22\times$\\ + \pta & 5,131 & 188 & $27\times$& 4,442 & 279 & $16\times$\\ + \tea &1,478,813 &54,616 & $27\times$& --- & --- & --- \\ + \hline + \end{tabular} + }% + \vspace*{-1em} +\end{table} +%------------------------------------------------------------------------------ + \subsection{Performance of \JITI when ineffective} \label{sec:perf:ineffective} %------------------------------------------------------------------------------ -In some programs, \JITI does not trigger\footnote{In XXX only; as -mentioned in Sect.~\ref{sec:impl} even 1st argument indexing is -generated on demand when JITI is used in YAP.} or might trigger but -have no effect other than an overhead due to runtime index -construction. We therefore wanted to measure this overhead. 
+In some programs, \JITI does not trigger\footnote{In XXX only; even +1st argument indexing is generated on demand when JITI is used in +YAP.} or might trigger but have no effect other than an overhead due +to runtime index construction. We therefore wanted to measure this +overhead. % As both systems support tabling, we decided to use tabling benchmarks because they are small and easy to understand, and because they are a @@ -1035,8 +1074,6 @@ separately. On the other hand, when \JITI is effective, it can significantly improve runtime performance. We use the following programs and applications: -%% \TODO{For the journal version we should also add FSA benchmarks -%% (\bench{k963}, \bench{dg5} and \bench{tl3})} %------------------------------------------------------------------------------ \begin{small} \begin{description} @@ -1045,58 +1082,19 @@ applications: \item[\muta] A computationally intensive application where most predicates are defined intentionally. \item[\pta] A tabled logic program implementing Andersen's points-to - analysis~\cite{anderson-phd}. A medium-sized imperative program is - encoded as a set of facts (about 16,000) and properties of interest - are encoded using rules. Program properties can then be determined - by checking the closure of these rules. -\item[\tea] Another analyzer using tabling to implement Andersen's - points-to analysis. The analyzed program, the \texttt{javac} SPEC - benchmark, is encoded in a file of 411,696 facts (62,759,581 bytes - in total). As its compilation exceeds the limits of the XXX compiler - (w/o JITI), we run this benchmark only in YAP. + analysis. A medium-sized imperative program is encoded as a set of + facts (about 16,000) and properties of interest are encoded using + rules. Program properties are then determined by the closure of + these rules. +\item[\tea] Another implementation of Andersen's points-to analysis. 
+ The analyzed program, the \texttt{javac} benchmark, is encoded in a + file of 411,696 facts (62,759,581 bytes in total). Its compilation + exceeds the limits of the XXX compiler (w/o JITI). So we run this + benchmark only in YAP. \end{description} \end{small} %------------------------------------------------------------------------------ -%------------------------------------------------------------------------------ -\begin{table}[t] - \centering - \caption{Performance of some benchmarks with 1st vs. \JITI (times in msecs)} - \setlength{\tabcolsep}{2.5pt} - \subfigure[When JITI is ineffective]{ - \label{tab:ineffective} - \begin{tabular}[b]{|l||r|r||r|r|} \hline - & \multicolumn{2}{|c||}{\bf YAP} & \multicolumn{2}{|c|}{\bf XXX} \\ - \cline{2-5} - Benchmark & 1st & JITI & 1st & JITI \\ - \hline - \tcLio (8000) & 13 & 14 & 4 & 4 \\ - \tcRio (2000) & 1445 & 1469 & 614 & 615 \\ - \tcDio ( 400) & 3208 & 3260 & 2338 & 2300 \\ - \tcLoo (2000) & 3935 & 3987 & 2026 & 2105 \\ - \tcRoo (2000) & 2841 & 2952 & 1502 & 1512 \\ - \tcDoo ( 400) & 3735 & 3805 & 4976 & 4978 \\ - \compress & 3614 & 3595 & 2875 & 2848 \\ - \hline - \end{tabular} - } - \subfigure[When \JITI is effective]{ - \label{tab:effective} - \begin{tabular}[b]{|l||r|r|r||r|r|r|} \hline - & \multicolumn{3}{|c||}{\bf YAP} & \multicolumn{3}{|c|}{\bf XXX} \\ - \cline{2-7} - Benchmark & 1st & JITI &{\bf ratio}& 1st & JITI &{\bf ratio}\\ - \hline - \sgCyl & 2,864 & 24 & $119\times$& 2,390 & 28 & $85\times$\\ - \muta & 30,057 &16,782 &$1.79\times$&26,314 &21,574 &$1.22\times$\\ - \pta & 5,131 & 188 & $27\times$& 4,442 & 279 & $16\times$\\ - \tea &1,478,813 &54,616 & $27\times$& --- & --- & --- \\ - \hline - \end{tabular} - } -\end{table} -%------------------------------------------------------------------------------ - As can be seen in Table~\ref{tab:effective}, \JITI significantly improves the performance of these applications. 
In \muta, which spends most of its time in recursive predicates, the speed up is only $79\%$ @@ -1106,25 +1104,6 @@ times (from~$16$ up to~$119$) faster. It is important to realize that programmer intervention or by using any compiler directives, in all these applications. -We analyze the \sgCyl program that has the biggest speedup in both -systems and is the only one whose code is small enough to be shown. -With the open call to \texttt{same\_generation/2}, most work in this -benchmark consists of calling \texttt{cyl/2} facts in three different -modes: with both arguments unbound, with the first argument bound, or -with only the second argument bound. Demand-driven indexing improves -performance in the last case only, but this improvement makes a big -difference in this benchmark. - -\begin{alltt}\small - same_generation(X,X) :- cyl(X,_). - same_generation(X,X) :- cyl(_,X). - same_generation(X,Y) :- cyl(X,Z), same_generation(Z,W), cyl(Y,W).\end{alltt} - -%% Our experience with the indexing algorithm described here shows a -%% significant performance improvement over the previous indexing code in -%% our system. Quite often, this has allowed us to tackle applications -%% which previously would not have been feasible. - \subsection{Performance of \JITI on ILP applications} \label{sec:perf:ILP} %------------------------------------------------------------------------- The need for \JITI was originally noticed in inductive logic @@ -1136,9 +1115,9 @@ JITI's time and space performance on some learning tasks using the Aleph system~\cite{ALEPH} and the datasets of Fig.~\ref{fig:ilp:datasets} which issue simple queries in an extensional database. Several of these datasets are standard in the -Machine Learning literature. +ILP literature. -\paragraph*{Time performance.} +\Paragraph{Time performance.} We compare times for 10 runs of the saturation/refinement cycle of the ILP system; see Table~\ref{tab:ilp:time}. 
%% The \Krki datasets have small search spaces and small databases, so @@ -1149,15 +1128,15 @@ benefit much from indexing in the database; they do benefit through from indexing in the dynamic representation of the search space, as their running times improve somewhat with \JITI. -The \BreastCancer and \GeneExpr applications use data in 1NF (i.e., -unstructured data). The speedup here is mostly from multiple argument -indexing. \BreastCancer is particularly interesting. It consists of 40 -binary relations with 65k elements each, where the first argument is -the key. We know that most calls have the first argument bound, hence -indexing was not expected to matter much. Instead, the results show -\JITI to improve running time by more than an order of magnitude. Like in -\sgCyl, this suggests that even a small percentage of badly indexed -calls can end up dominating runtime. +The \BreastCancer and \GeneExpr applications use unstructured data. +The speedup here is mostly from multiple argument indexing. +\BreastCancer is particularly interesting. It consists of 40 binary +relations with 65k elements each, where the first argument is the key. +We know that most calls have the first argument bound, hence indexing +was not expected to matter much. Instead, the results show \JITI to +improve running time by more than an order of magnitude. This suggests +that even a small percentage of badly indexed calls can end up +dominating runtime. \IEProtein and \Thermolysin are example applications that manipulate structured data. \IEProtein is the largest dataset we consider, and @@ -1173,8 +1152,9 @@ from one to more than two orders of magnitude. 
\centering \caption{Time and space performance of JITI on Inductive Logic Programming datasets} + \vspace*{-1em} \label{tab:ilp} - \setlength{\tabcolsep}{3pt} + \setlength{\tabcolsep}{2.5pt} \subfigure[Time (in seconds)]{\label{tab:ilp:time} \begin{tabular}{|l||r|r|r||} \hline & \multicolumn{3}{|c||}{Time} \\ @@ -1194,7 +1174,7 @@ from one to more than two orders of magnitude. \Thermolysin & 50,279 & 5,213 & $10\times$ \\ \hline \end{tabular} - } + }% \subfigure[Memory usage (in KB)]{\label{tab:ilp:memory} \begin{tabular}{||r|r|r|r||} \hline \multicolumn{2}{||c|}{Static code} @@ -1216,7 +1196,7 @@ from one to more than two orders of magnitude. 2,317 & 929 & 116,129 & 7,064 \\ \hline \end{tabular} - } + }% \end{table} %------------------------------------------------------------------------------ @@ -1247,23 +1227,23 @@ from one to more than two orders of magnitude. \end{figure} %------------------------------------------------------------------------------ -\paragraph*{Space performance.} +\Paragraph{Space performance.} Table~\ref{tab:ilp:memory} shows memory usage when using \JITI. The table presents data obtained at a point near the end of execution; -memory usage should be at or close to the maximum. These applications -use a mixture of static and dynamic predicates and we show their -memory usage separately. On static predicates, memory usage varies -widely, from only 10\% to the worst case, \Carcino, where the index -tables take more space than the original program. Hash tables dominate -usage in \IEProtein and \Susi, whereas \TryRetryTrust chains dominate -in \BreastCancer. In most other cases no single component dominates -memory usage. Memory usage for dynamic data is shown in the last two -columns; note that dynamic data is mostly used to store the search -space. One can observe that there is a much lower overhead in this -case. 
A more detailed analysis shows that most space is occupied by -the hash tables and by internal nodes of the tree, and that relatively -little space is occupied by \TryRetryTrust chains, suggesting that -\JITI is behaving well in practice. +memory usage should be at the maximum. These applications use a +mixture of static and dynamic predicates and we show their memory +usage separately. On static predicates, memory usage varies widely, +from only 10\% to the worst case, \Carcino, where the index tables +take more space than the original program. Hash tables dominate usage +in \IEProtein and \Susi, whereas \TryRetryTrust chains dominate in +\BreastCancer. In most other cases no single component dominates +memory usage. Memory usage for dynamic predicates is shown in the last +two columns; this data is mostly used to store the search space. +Observe that there is a much lower overhead in this case. A more +detailed analysis shows that most space is occupied by the hash tables +and by internal nodes of the tree, and that relatively little space is +occupied by \TryRetryTrust chains, suggesting that \JITI is behaving +well in practice. \section{Concluding Remarks} @@ -1282,9 +1262,7 @@ implementation has been left a bit behind. We hold that this should change. % Indeed, we see \JITI as only a first, very successful, step towards -effective runtime optimization of logic programs.\footnote{The good -results obtained with JITI have motivated recent work on -Just-In-Time compilation of Prolog~\cite{yapc}.} +effective runtime optimization of logic programs. As presented, \JITI is a hybrid technique: index generation occurs during runtime but is partly guided by the compiler, because we want @@ -1299,19 +1277,113 @@ performance can be achieved in the context of one solution computations, or in the context of tabling where order of clauses and solutions does not matter and repeated solutions are discarded. 
-\paragraph{Acknowledgments} -This work is dedicated to the memory of our friend and colleague -Ricardo Lopes. We miss you! V\'{\i}tor Santos Costa was partially -supported by CNPq and would like to acknowledge support received while -visiting at UW-Madison and the support of the YAP user community. -This work has been partially supported by MYDDAS (POSC/EIA/59154/2004) -and by funds granted to LIACC through the Programa de Financiamento -Plurianual, Fundação para a Ciência e Tecnologia and Programa POSC. +\Paragraph{Acknowledgments} +%-------------------------- +V\'{\i}tor Santos Costa was partially supported by CNPq and would like +to acknowledge support received while visiting at UW-Madison and the +support of the YAP user community. This work has been partially +supported by MYDDAS (POSC/EIA/59154/2004) and by funds granted to +LIACC through the Programa de Financiamento Plurianual, Fundação para +a Ciência e Tecnologia and Programa POSC. %============================================================================== -\bibliographystyle{splncs} -\bibliography{lp} +\begin{thebibliography}{10} + +\bibitem{Warren83} +Warren, D.H.D.: +\newblock An abstract {P}rolog instruction set. +\newblock Tech. Note 309, SRI International (1983) + +\bibitem{YAP} +Santos~Costa, V., Damas, L., Reis, R., Azevedo, R.: +\newblock {YAP} User's Manual. (2002) + +\bibitem{ShallowBacktracking@ICLP-89} +Carlsson, M.: +\newblock On the efficiency of optimising shallow backtracking in compiled + {Prolog}. +\newblock In Levi, G., Martelli, M., eds.: Proceedings of the Sixth + ICLP, MIT Press (June 1989) 3--15 + +\bibitem{IndexingProlog@NACLP-89} +Demoen, B., Mari{\"e}n, A., Callebaut, A.: +\newblock Indexing in {P}rolog. +\newblock In Lusk, E.L., Overbeek, R.A., eds.: Proceedings of NACLP, + MIT Press (1989) 1001--1012 + +\bibitem{SWI} +Wielemaker, J.: +\newblock {SWI-Prolog 5.1}: Reference Manual. 
+\newblock {SWI}, University of Amsterdam, Roetersstraat 15, 1018 WB Amsterdam,
+  The Netherlands. (1997--2003)
+
+\bibitem{XSB}
+Sagonas, K.F., Swift, T., Warren, D.S., Freire, J., Rao, P.:
+\newblock The {XSB} Pro\-grammer's Manual.
+\newblock State University of New York at Stony Brook. (1997)
+
+\bibitem{ilProlog}
+Tron\c{c}on, R., Janssens, G., Demoen, B., Vandecasteele, H.:
+\newblock Fast frequent querying with lazy control flow compilation.
+\newblock Theory and Practice of Logic Programming (2007) To appear.
+
+\bibitem{HickeyMudambi@JLP-89}
+Hickey, T., Mudambi, S.:
+\newblock Global compilation of {P}rolog.
+\newblock JLP \textbf{7}(3) (November 1989) 193--230
+
+\bibitem{VRDW87}
+{Van Roy}, P., Demoen, B., Willems, Y.D.:
+\newblock Improving the execution speed of compiled {Prolog} with modes, clause
+  selection and determinism.
+\newblock In: TAPSOFT'87, Springer (1987) 111--125
+
+\bibitem{TOAM@ICLP-90}
+Zhou, N.F., Takagi, T., Kazuo, U.:
+\newblock A matching tree oriented abstract machine for {P}rolog.
+\newblock In Warren, D.H.D., Szeredi, P., eds.: ICLP90, MIT Press (1990)
+  158--173
+
+\bibitem{UnifFact@POPL-95}
+Dawson, S., Ramakrishnan, C.R., Ramakrishnan, I.V., Sagonas, K., Skiena, S.,
+  Swift, T., Warren, D.S.:
+\newblock Unification factoring for the efficient execution of logic programs.
+\newblock In: Conference Record of POPL'95, ACM Press (January 1995) 247--258
+
+\bibitem{Tries@JLP-99}
+Ramakrishnan, I.V., Rao, P., Sagonas, K., Swift, T., Warren, D.S.:
+\newblock Efficient access mechanisms for tabled logic programs.
+\newblock Journal of Logic Programming \textbf{38}(1) (January 1999) 31--54
+
+\bibitem{KligerShapiro@ICLP-88}
+Kliger, S., Shapiro, E.:
+\newblock A decision tree compilation algorithm for {FCP($|$,:,?)}.
+\newblock In: Proceedings of the Fifth ICSLP, MIT Press (August 1988) 1315--1336 + +\bibitem{Mercury@JLP-96} +Somogyi, Z., Henderson, F., Conway, T.: +\newblock The execution algorithm of {Mercury}, an efficient purely declarative + logic programming language. +\newblock JLP \textbf{26}(1--3) (December 1996) 17--64 + +\bibitem{Ciao@SCP-05} +Hermenegildo, M.V., Puebla, G., Bueno, F., L{\'o}pez-Garc\'{\i}a, P.: +\newblock Integrated program debugging, verification, and optimization using + abstract interpretation (and the {Ciao} system preprocessor). +\newblock Science of Computer Programming \textbf{58}(1--2) (2005) 115--140 + +\bibitem{FreezeIndexing@ICLP-87} +Carlsson, M.: +\newblock Freeze, indexing, and other implementation issues in the {WAM}. +\newblock In Lassez, J.L., ed.: Proceedings of the Fourth ICLP, + MIT Press (May 1987) 40--58 + +\bibitem{ALEPH} +Srinivasan, A.: +\newblock The Aleph Manual. (2001) + +\end{thebibliography} %============================================================================== \end{document}