hiatory

2016-04-19 23:30:02 +01:00
parent 3d68f0e06b
commit cd41d373db
28 changed files with 3153 additions and 2229 deletions
--- a/C/init.c
+++ b/C/init.c
@@ -1438,5 +1438,6 @@ void Yap_exit(int value) {
    Yap_ShutdownLoadForeign();
  }
  Yap_CloseStreams(false);
  Yap_CloseReadline();
  exit(value);
 }
--- a/H/YapGFlagInfo.h
+++ b/H/YapGFlagInfo.h
@@ -185,7 +185,7 @@ available in experimental implementations.
 */
    YAP_FLAG(FILE_NAME_VARIABLES_FLAG, "file_name_variables", true, booleanFlag,
             "true", NULL),
-    YAP_FLAG(FLOAT_FORMAT_FLAG, "float_format", true, isatom, "%15e",
+    YAP_FLAG(FLOAT_FORMAT_FLAG, "float_format", true, isatom, "%15f",
             NULL),                                    /**< + `float_format `
                                    C-library `printf()` format specification used by write/1 and
--- a/H/Yapproto.h
+++ b/H/Yapproto.h
@@ -301,6 +301,7 @@ extern void Yap_DebugErrorPutc(int n);
 extern void Yap_DebugErrorPuts(const char *s);
 extern void Yap_DebugWriteIndicator(struct pred_entry *ap);
 void Yap_PlWriteToStream(Term, int, int);
 void Yap_CloseReadline(void);
 /* depth_lim.c */
 bool   Yap_InitReadline(Term t);
 void Yap_InitItDeepenPreds(void);
--- a/H/Yatom.h
+++ b/H/Yatom.h
@@ -659,6 +659,19 @@ INLINE_ONLY inline EXTERN PropFlags IsPredProperty(int flags) {
  return (PropFlags)((flags == PEProp));
 }
 INLINE_ONLY inline EXTERN Atom NameOfPred(PredEntry *pe);
 INLINE_ONLY inline EXTERN Atom NameOfPred(PredEntry *pe) {
  if (pe->ModuleOfPred == IDB_MODULE) {
    return NULL;
  } else if (pe->ArityOfPE == 0) {
    return (Atom)pe->FunctorOfPred;
  } else {
    Functor f = pe->FunctorOfPred;
    return NameOfFunctor(f);
  }
 }
 /* Flags for code or dbase entry */
 /* There are several flags for code and data base entries */
 typedef enum {
@@ -1322,7 +1335,6 @@ INLINE_ONLY inline EXTERN bool IsFlagProperty(PropFlags);
 INLINE_ONLY inline EXTERN bool IsFlagProperty(PropFlags flags) {
  return flags == FlagProperty;
 }
 /* Proto types */
--- a/H/generated/dlocals.h
+++ b/H/generated/dlocals.h
@@ -474,6 +474,8 @@
 #define LOCAL_search_atoms LOCAL->search_atoms_
 #define REMOTE_search_atoms(wid) REMOTE(wid)->search_atoms_
 #define LOCAL_SearchPreds LOCAL->SearchPreds_
 #define REMOTE_SearchPreds(wid) REMOTE(wid)->SearchPreds_
 #define LOCAL_CurSlot LOCAL->CurSlot_
 #define REMOTE_CurSlot(wid) REMOTE(wid)->CurSlot_
--- a/H/generated/hlocals.h
+++ b/H/generated/hlocals.h
@@ -268,6 +268,7 @@ const char*  Error_Function_;
  UInt  exo_arg_;
 // atom completion
  struct scan_atoms*  search_atoms_;
  struct pred_entry*  SearchPreds_;
 // Slots
  yhandle_t  CurSlot_;
  yhandle_t  NSlots_;
--- a/H/generated/ilocals.h
+++ b/H/generated/ilocals.h
@@ -269,6 +269,7 @@ static void InitWorker(int wid) {
  REMOTE_CurSlot(wid) = 0;
  REMOTE_NSlots(wid) = 0;
  REMOTE_SlotBase(wid) = InitHandles(wid);
--- a/H/generated/rlocals.h
+++ b/H/generated/rlocals.h
@@ -279,4 +279,5 @@ static void RestoreWorker(int wid USES_REGS) {
 }
--- a/misc/LOCALS
+++ b/misc/LOCALS
@@ -312,6 +312,7 @@ UInt				exo_arg					=0
 // atom completion
 struct scan_atoms*		search_atoms				void
 struct pred_entry*		SearchPreds				void
 // Slots
 yhandle_t				CurSlot					=0
--- a/os/charsio.c
+++ b/os/charsio.c
@@ -470,20 +470,13 @@ code with  _C_.
 */
 static Int get_byte(USES_REGS1) { /* '$get_byte'(Stream,-N) */
-  int sno = Yap_CheckStream(ARG1, Input_Stream_f, "get_byte/2");
+  int sno = Yap_CheckBinaryStream(ARG1, Input_Stream_f, "get_byte/2");
  Int status;
  Term out;
  if (sno < 0)
    return (FALSE);
  status = GLOBAL_Stream[sno].status;
  if (!(status & Binary_Stream_f)
      //&& strictISOFlag()
      ) {
    UNLOCK(GLOBAL_Stream[sno].streamlock);
    Yap_Error(PERMISSION_ERROR_INPUT_STREAM, ARG1, "get_byte/2");
    return (FALSE);
  }
  out = MkIntTerm(GLOBAL_Stream[sno].stream_getc(sno));
  UNLOCK(GLOBAL_Stream[sno].streamlock);
  return Yap_unify_constant(ARG2, out);
@@ -812,16 +805,9 @@ static Int put_byte(USES_REGS1) { /* '$put_byte'(Stream,N)                 */
    Yap_Error(DOMAIN_ERROR_OUT_OF_RANGE, t2, "put_code/1");
    return FALSE;
  }
-  int sno = Yap_CheckStream(ARG1, Output_Stream_f, "put/2");
+  int sno = Yap_CheckBinaryStream(ARG1, Output_Stream_f, "put/2");
  if (sno < 0)
    return (FALSE);
  if (!(GLOBAL_Stream[sno].status & Binary_Stream_f)
      // && strictISOFlag()
      ) {
    UNLOCK(GLOBAL_Stream[sno].streamlock);
    Yap_Error(PERMISSION_ERROR_OUTPUT_BINARY_STREAM, ARG1, NULL);
    return false;
  }
  GLOBAL_Stream[sno].stream_putc(sno, ch);
  /*
   * if (!(GLOBAL_Stream[sno].status & Null_Stream_f))
--- a/os/iopreds.c
+++ b/os/iopreds.c
@@ -1576,6 +1576,24 @@ int Yap_CheckTextStream__(const char *file, const char *f, int line, Term arg,
  return sno;
 }
 int Yap_CheckBinaryStream__(const char *file, const char *f, int line, Term arg,
                          int kind, const char *msg) {
  int sno;
  if ((sno = CheckStream__(file, f, line, arg, kind, msg)) < 0)
    return -1;
  if ((GLOBAL_Stream[sno].status & Binary_Stream_f)) {
    UNLOCK(GLOBAL_Stream[sno].streamlock);
    if (kind == Input_Stream_f)
      PlIOError__(file, f, line, PERMISSION_ERROR_INPUT_TEXT_STREAM, arg,
                  msg);
    else
      PlIOError__(file, f, line, PERMISSION_ERROR_OUTPUT_TEXT_STREAM, arg,
                  msg);
    return -1;
  }
  return sno;
 }
 /* used from C-interface */
 int Yap_GetFreeStreamDForReading(void) {
  int sno = GetFreeStreamD();
--- a/os/iopreds.h
+++ b/os/iopreds.h
@@ -45,6 +45,10 @@ extern int Yap_CheckStream__(const char *, const char *, int, Term, int,
  Yap_CheckTextStream__(__FILE__, __FUNCTION__, __LINE__, arg, kind, msg)
 extern int Yap_CheckTextStream__(const char *, const char *, int, Term, int,
                                 const char *);
 #define Yap_CheckBinaryStream(arg, kind, msg)                                    \
  Yap_CheckBinaryStream__(__FILE__, __FUNCTION__, __LINE__, arg, kind, msg)
 extern int Yap_CheckBinaryStream__(const char *, const char *, int, Term, int,
                                 const char *);
 extern bool Yap_initStream(int sno, FILE *fd, const char *name, Term file_name,
                           encoding_t encoding, stream_flags_t flags,
--- a/os/readline.c
+++ b/os/readline.c
@@ -168,8 +168,7 @@ static char *predicate_enumerate(const char *prefix, int state) {
      p = mod->PredForME;
    }
    char *c = RepAtom(ap = NameOfPred(p))->StrOfAE;
-         if (strlen(c) > strlen(prefix) &&
+    if (strlen(c) > strlen(prefix) && strstr(c, prefix) == c &&
            strstr(c, prefix) == c &&
        !(p->PredFlags & HiddenPredFlag)) {
      LOCAL_SearchPreds = p;
      arity_t ar = p->ArityOfPE;
@@ -177,13 +176,10 @@ static char *predicate_enumerate(const char *prefix, int state) {
      if (Yap_IsPrefixOp(AbsAtom(ap), &l, &r) && ar == 1) {
        return c;
      }
          size_t sz = strlen(c);
          chain_t *el = (chain_t *)malloc(sizeof(chain_t)+sz);
      strncpy(LOCAL_FileNameBuf, c, YAP_FILENAME_MAX);
      strncat(LOCAL_FileNameBuf, "(", YAP_FILENAME_MAX);
      return LOCAL_FileNameBuf;
    }
  }
  LOCAL_SearchPreds = NULL;
  return NULL;
@@ -209,7 +205,7 @@ static char *predicate_enumerate(const char *prefix, int state) {
  if (start == 0 && isalpha(text[0])) {
    int i = 0;
    while (i < end) {
-        if (isalnum(text[i]))
+      if (isalnum(text[i]) || text[i] == '_')
        i++;
      else
        break;
@@ -227,14 +223,18 @@ static char *predicate_enumerate(const char *prefix, int state) {
    if ((strstr(p, "[") == p) || (strstr(p, "compile(") == p) ||
        (strstr(p, "consult(") == p) || (strstr(p, "load_files(") == p) ||
-          (strstr(p,"reconsult(") == p) || (strstr(p,"use_module(") == p))
+        (strstr(p, "reconsult(") == p) || (strstr(p, "use_module(") == p) ||
        (strstr(p, "cd(") == p))
      matches = rl_completion_matches((char *)text, /* for pre-4.2 */
                                      rl_filename_completion_function);
    return matches;
  }
  int i = end, ch = '\0';
  while (i > start) {
-      ch = text[-i];
+    ch = text[--i];
    if (ch == '\'')
      return rl_completion_matches((char *)text, /* for pre-4.2 */
                                   rl_filename_completion_function);
    if (isalnum(text[i]))
      continue;
    break;
@@ -303,12 +303,13 @@ static char *predicate_enumerate(const char *prefix, int state) {
  GLOBAL_Stream[StdInStream].u.irl.buf = NULL;
  GLOBAL_Stream[StdInStream].u.irl.ptr = NULL;
  GLOBAL_Stream[StdInStream].status |= Readline_Stream_f;
-#if _MSC_VER || defined(__MINGW32__)
+#if _WIN32
  rl_instream = stdin;
 #endif
  rl_outstream = stderr;
  using_history();
  const char *s = Yap_AbsoluteFile("~/.YAP.history", NULL, true);
  history_file = s;
  if (read_history(s) != 0) {
    FILE *f = fopen(s, "a");
    if (f) {
@@ -362,7 +363,6 @@ static char *predicate_enumerate(const char *prefix, int state) {
    return false;
  if (myrl_line[0] != '\0' && myrl_line[1] != '\0') {
    add_history((char *)myrl_line);
      write_history(history_file);
    fflush(NULL);
  }
  s->u.irl.ptr = s->u.irl.buf = myrl_line;
@@ -473,6 +473,12 @@ static char *predicate_enumerate(const char *prefix, int state) {
  }
 }
 void Yap_CloseReadline(void) {
 #if USE_READLINE
  write_history(history_file);
 #endif
 }
 static Int has_readline(USES_REGS1) {
 #if USE_READLINE
  return true;
--- a/os/writeterm.c
+++ b/os/writeterm.c
@@ -390,6 +390,8 @@ write1 ( USES_REGS1 )
  if (output_stream == -1) output_stream = 1;
  xarg * args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END  );
  if (args == NULL) {
   if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
      LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
     if (LOCAL_Error_TYPE)
      Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
    return false;
@@ -415,6 +417,8 @@ write_canonical1 ( USES_REGS1 )
  if (output_stream == -1) output_stream = 1;
  xarg * args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END  );
  if (args == NULL) {
    if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
      LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
    if (LOCAL_Error_TYPE)
      Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
    return false;
@@ -440,6 +444,8 @@ write_canonical ( USES_REGS1 )
     we cannot make recursive Prolog calls */
  xarg * args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END  );
  if (args == NULL) {
    if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
      LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
    if (LOCAL_Error_TYPE)
      Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
    return false;
@@ -467,6 +473,8 @@ writeq1 ( USES_REGS1 )
     we cannot make recursive Prolog calls */
  xarg *args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END  );
  if (args == NULL) {
     if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
      LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
   if (LOCAL_Error_TYPE)
      Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
    return false;
@@ -495,6 +503,8 @@ writeq ( USES_REGS1 )
     we cannot make recursive Prolog calls */
  xarg *args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END  );
  if (args == NULL) {
    if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
      LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
    if (LOCAL_Error_TYPE)
      Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
    return false;
@@ -523,6 +533,8 @@ print1 ( USES_REGS1 )
     we cannot make recursive Prolog calls */
  xarg *args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END  );
  if (args == NULL) {
   if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
      LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
     if (LOCAL_Error_TYPE)
      Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
    return false;
@@ -551,6 +563,8 @@ print ( USES_REGS1 )
     we cannot make recursive Prolog calls */
  xarg *args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END  );
  if (args == NULL) {
   if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
      LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
     if (LOCAL_Error_TYPE)
      Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
    return false;
--- a/packages/cuda/CMakeLists.txt
+++ b/packages/cuda/CMakeLists.txt
@@ -54,8 +54,25 @@ if (CUDA_FOUND)
  macro_optional_find_package (Thrust ON)
  set (CUDA_SOURCES
 CC_CSSTree.cu
 bpreds.cu
 dbio.cu
 lista.cu
 memory.cu
 selectproyect.cu
 treeb.cu
 union2.cu
    )
  set (CXX_SOURCES
 bpredscpu.cpp
 joincpu.cpp
 selectproyectcpu.cpp
 unioncpu2.cpp
 )
  set (C_SOURCES
 creator2.c
 cuda.c
 )
--- a/packages/cuda/Makefile.in
+++ b/packages/cuda/Makefile.in
@@ -23,7 +23,7 @@ CC=@CC@
 NVCC=@NVCC@
 CFLAGS= @SHLIB_CFLAGS@ $(YAP_EXTRAS) $(DEFS) -I$(srcdir) -I../.. -I$(srcdir)/../../include  @CUDA_CPPFLAGS@
 NVCCFLAGS=@CUDA_CPPFLAGS@ -I$(srcdir) -I../.. -I$(srcdir)/../../include
-CUDA_LDFLAGS=@CUDA_LDFLAGS@
+LDFLAGS=@LDFLAGS@
 #
 #
 # You shouldn't need to change what follows.
@@ -39,7 +39,7 @@ SO=@SO@
 CWD=$(PWD)
 #
-CUDA_PROLOG= \
+BDD_PROLOG= \
 	$(srcdir)/cuda.yap
 OBJS=cuda.o memory.o lista.o
@@ -62,16 +62,11 @@ memory.o: $(srcdir)/memory.cu $(srcdir)/pred.h
@DO_SECOND_LD@cuda.@SO@: $(OBJS)
@DO_SECOND_LD@	@CUDA_SHLIB_LD@ $(CUDA_LDFLAGS) -o cuda.@SO@ $(OBJS)
-install: all install-examples
+install: all
 	mkdir -p $(DESTDIR)$(SHAREDIR)
-	for h in $(CUDA_PROLOG); do $(INSTALL_DATA) $$h $(DESTDIR)$(SHAREDIR); done
+	for h in $(BDD_PROLOG); do $(INSTALL_DATA) $$h $(DESTDIR)$(SHAREDIR); done
 	$(INSTALL_PROGRAM) $(SOBJS) $(DESTDIR)$(YAPLIBDIR)
 install-examples:
 clean:
-	rm -f *.o *~ $(OBJS) *.BAK
+	rm -f *.o *~ $(OBJS) $(SOBJS) *.BAK
 distclean: clean
 	rm -f $(SOBJS) Makefile
--- a/packages/cuda/bpreds.cu
+++ b/packages/cuda/bpreds.cu
@@ -1,4 +1,113 @@
-__global__ void predicates(int *dop1, int rows, int cols, int *cons, int numc, int *res)
+#include <thrust/device_vector.h>
 #include <thrust/scan.h>
 #include <cstdarg>
 #include "pred.h"
 /*Determines the maximum from a set of values*/
 int maximo(int count, ...)
 {
 	va_list ap;
    	int j, temp, mx = 0;
    	va_start(ap, count);
 	for(j = 0; j < count; j++)
 	{
 		temp = va_arg(ap, int);
 		if(temp > mx)
 			mx = temp;
 	}
    	va_end(ap);
    	return mx;
 }
 __global__ void bpreds(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
 {
 	extern __shared__ int shared[];
 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 	int x, rowact, rowact1, op1, op2;
 	if(threadIdx.x < numc)
 		shared[threadIdx.x] = cons[threadIdx.x];
 	__syncthreads();
 	if(id < rows)
 	{
 		rowact1 = id * of1;
 		rowact = id * of2;
 		for(x = nx; x < numc; x += 3)
 		{
 			op1 = shared[x+1];
 			if(op1 < 0)
 				op1 = dop1[rowact1 - op1 - 1];
 			else
 				op1 = dop2[rowact + op1];
 			op2 = shared[x+2];
 			if(op2 < 0)
 				op2 = dop1[rowact1 - op2 - 1];
 			else
 				op2 = dop2[rowact + op2];
 			switch(shared[x] - BPOFFSET)
 			{
 				case SBG_EQ: if(op1 != op2)
 						return;
 				break;
 				case SBG_GT: if(op1 <= op2)
 						return;
 				break;
 				case SBG_LT: if(op1 >= op2)
 						return;
 				break;
 				case SBG_GE: if(op1 < op2)
 						return;
 				break;
 				case SBG_LE: if(op1 > op2)
 						return;
 				break;
 				case SBG_DF: if(op1 == op2)
 						return;
 			}
 		}
 		if(res2 != NULL)
 			res2[id] = 1; 
 		for(x = 0; x < nx; x += 3)
 		{
 			op1 = shared[x+1];
 			if(op1 < 0)
 				op1 *= -1;
 			else
 				op1 = dop2[rowact + op1];
 			op2 = shared[x+2];
 			if(op2 < 0)
 				op2 *= -1;
 			else
 				op2 = dop2[rowact + op2];
 			switch(shared[x])
 			{
 				case SBG_EQ: if(op1 != op2)
 						return;
 				break;
 				case SBG_GT: if(op1 <= op2)
 						return;
 				break;
 				case SBG_LT: if(op1 >= op2)
 						return;
 				break;
 				case SBG_GE: if(op1 < op2)
 						return;
 				break;
 				case SBG_LE: if(op1 > op2)
 						return;
 				break;
 				case SBG_DF: if(op1 == op2)
 						return;
 			}
 		}
 		res[id] = 1;
 	}
 }
 /*Mark all rows that comply with the comparison predicates*/
 __global__ void bpredsnormal2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
 	extern __shared__ int shared[];
 	int id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -8,7 +117,7 @@ __global__ void predicates(int *dop1, int rows, int cols, int *cons, int numc, i
 	__syncthreads();
 	if(id < rows)
 	{
-		rowact = id * cols;
+		rowact = id * of1; 
 		for(x = 0; x < numc; x += 3)
 		{
 			op1 = shared[x+1];
@@ -46,98 +155,306 @@ __global__ void predicates(int *dop1, int rows, int cols, int *cons, int numc, i
 	}
 }
-int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)
+/*Unmark all rows that do not comply with the comparison predicates*/
 __global__ void bpredsnormal(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
-	int *temp;
+ 	extern __shared__ int shared[];
-	int tmplen = rows + 1;
+	int id = blockIdx.x * blockDim.x + threadIdx.x;
-	int size = tmplen * sizeof(int);
+	int x, rowact, op1, op2;
-	reservar(&temp, size);
+	if(threadIdx.x < numc)
-#ifdef DEBUG_MEM
+		shared[threadIdx.x] = cons[threadIdx.x];
-	 cerr << "+ " << temp << " temp bpreds " << size << endl;
+	__syncthreads();
-#endif
+	if(id < rows)
-	cudaMemset(temp, 0, size);
+	{
-
+		if(res[id] == 0)
-#if TIMER
+			return;
-	cuda_stats.builtins++;
+		rowact = id * of1; 
-#endif
+		for(x = 0; x < numc; x += 3)
-	int *dhead;
+		{
-	int predn = numpreds.x * 3;
+			op1 = shared[x+1];
-	int spredn = predn * sizeof(int);
+			if(op1 < 0)
-	int sproj = numpreds.z * sizeof(int);
+				op1 *= -1;
 	int hsize;
 	if(predn > numpreds.z)
 		hsize = spredn;
 			else
-		hsize = sproj;
+				op1 = dop1[rowact + op1];
-	reservar(&dhead, hsize);
+			op2 = shared[x+2];
-#ifdef DEBUG_MEM
+			if(op2 < 0)
-	cerr << "+ " << dhead << " dhead  " << hsize << endl;
+				op2 *= -1;
-#endif
+			else
-	cudaMemcpy(dhead, bin, spredn, cudaMemcpyHostToDevice);
+				op2 = dop1[rowact + op2];
-
+			switch(shared[x])
 	int blockllen = rows / 1024 + 1;
 	int numthreads = 1024;
 	/*int x;
 	cout << "arraypreds = ";
 	for(x = 0; x < predn; x++)
 		cout << bin[x] << " ";
 	cout << endl;
 	cout << "temptable = ";
 	for(x = 0; x < numpreds.z; x++)
 		cout << bin[x+predn] << " ";
 	cout << endl; 
 	int y;
 	int *hop1 = (int *)malloc(numpreds.y * rows * sizeof(int));
 	cudaMemcpy(hop1, dop1, numpreds.y * rows * sizeof(int), cudaMemcpyDeviceToHost);
 	for(x = 0; x < rows; x++)
 			{
-		for(y = 0; y < numpreds.y; y++)
+				case SBG_EQ: if(op1 != op2)
 			cout << hop1[x * numpreds.y + y] << " ";
 		cout << endl;
 	}
 	free(hop1);*/
 	predicates<<<blockllen, numthreads, spredn>>>(dop1, rows, numpreds.y, dhead, predn, temp + 1);
 	/*int y;
 	int *hop1 = (int *)malloc((rows + 1) * sizeof(int));
 	cudaMemcpy(hop1, temp, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost);
 	for(x = 0; x < (rows + 1); x++)
 		cout << hop1[x] << " ";
 	cout << endl;
 	free(hop1);*/
 	thrust::device_ptr<int> res;
 	res = thrust::device_pointer_cast(temp);
 	thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
 	int num = res[rows];
 	if(num == 0)
 		return 0;
 	int *fres;
 	reservar(&fres, num * sproj);
 #ifdef DEBUG_MEM
 	cerr << "+ " << fres << " fres  " << num * sproj << endl;
 #endif
 	cudaMemcpy(dhead, bin + predn, sproj, cudaMemcpyHostToDevice);
 	llenarproyectar<<<blockllen, numthreads, sproj>>>(dop1, rows, numpreds.y, temp, dhead, numpreds.z, fres);
 	/*int y;
 	int *hop1 = (int *)malloc(numpreds.z * num * sizeof(int));
 	cudaMemcpy(hop1, fres, numpreds.z * num * sizeof(int), cudaMemcpyDeviceToHost);
 	for(x = 0; x < num; x++)
 					     {
-		for(y = 0; y < numpreds.z; y++)
+						res[id] = 0;
-			cout << hop1[x * numpreds.z + y] << " ";
+						return;
 		cout << endl;
 					     }
-	free(hop1);*/
+				break;
-
+				case SBG_GT: if(op1 <= op2)
-	liberar(dhead, hsize);
+					     {
-	liberar(temp, size);
+						res[id] = 0;
-	liberar(dop1, rows * cols * sizeof(int));
+						return;
 	*ret = fres;
 	return num;
 					     }
 				break;
 				case SBG_LT: if(op1 >= op2)
 					     {
 						res[id] = 0;
 						return;
 					     }
 				break;
 				case SBG_GE: if(op1 < op2)
 					     {
 						res[id] = 0;
 						return;
 					     }
 				break;
 				case SBG_LE: if(op1 > op2)
 					     {
 						res[id] = 0;
 						return;
 					     }
 				break;
 				case SBG_DF: if(op1 == op2)
 					     {
 						res[id] = 0;
 						return;
 					     }
 			}
 		}
 	}
 }
 __global__ void bpredsOR(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
 {
 	extern __shared__ int shared[];
 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 	int x, rowact, rowact1, op1, op2;
 	if(threadIdx.x < numc)
 		shared[threadIdx.x] = cons[threadIdx.x];
 	__syncthreads();
 	if(id < rows)
 	{
 		rowact1 = id * of1;
 		rowact = id * of2;
 		for(x = nx; x < numc; x += 3)
 		{
 			op1 = shared[x+1];
 			if(op1 < 0)
 				op1 = dop1[rowact1 - op1 - 1];
 			else
 				op1 = dop2[rowact + op1];
 			op2 = shared[x+2];
 			if(op2 < 0)
 				op2 = dop1[rowact1 - op2 - 1];
 			else
 				op2 = dop2[rowact + op2];
 			switch(shared[x] - BPOFFSET)
 			{
 				case SBG_EQ: if(op1 == op2)
 					     {
 						res2[id] = 1;
 						x = numc;
 					     }
 				break;
 				case SBG_GT: if(op1 > op2)
 					     {
 						res2[id] = 1;
 						x = numc;
 					     }
 				break;
 				case SBG_LT: if(op1 < op2)
 					     {
 						res2[id] = 1;
 						x = numc;
 					     }
 				break;
 				case SBG_GE: if(op1 >= op2)
 					     {
 						res2[id] = 1;
 						x = numc;
 					     }
 				break;
 				case SBG_LE: if(op1 <= op2)
 					     {
 						res2[id] = 1;
 						x = numc;
 					     }
 				break;
 				case SBG_DF: if(op1 != op2)
 					     {
 						res2[id] = 1;
 						x = numc;
 					     }
 			}
 		}
 		for(x = 0; x < nx; x += 3)
 		{
 			op1 = shared[x+1];
 			if(op1 < 0)
 				op1 *= -1;
 			else
 				op1 = dop2[rowact + op1];
 			op2 = shared[x+2];
 			if(op2 < 0)
 				op2 *= -1;
 			else
 				op2 = dop2[rowact + op2];
 			switch(shared[x])
 			{
 				case SBG_EQ: if(op1 == op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 				break;
 				case SBG_GT: if(op1 > op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 				break;
 				case SBG_LT: if(op1 < op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 				break;
 				case SBG_GE: if(op1 >= op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 				break;
 				case SBG_LE: if(op1 <= op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 				break;
 				case SBG_DF: if(op1 != op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 			}
 		}
 	}
 }
 /*Mark all rows that comply with the comparison predicates using disjunctions (i.e. a row is marked if it complies with at least one predicate)*/
 __global__ void bpredsorlogic2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
 	extern __shared__ int shared[];
 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 	int x, rowact, op1, op2;
 	if(threadIdx.x < numc)
 		shared[threadIdx.x] = cons[threadIdx.x];
 	__syncthreads();
 	if(id < rows)
 	{
 		rowact = id * of1; 
 		for(x = 0; x < numc; x += 3)
 		{
 			op1 = shared[x+1];
 			if(op1 < 0)
 				op1 *= -1;
 			else
 				op1 = dop1[rowact + op1];
 			op2 = shared[x+2];
 			if(op2 < 0)
 				op2 *= -1;
 			else
 				op2 = dop1[rowact + op2];
 			switch(shared[x])
 			{
 				case SBG_EQ: if(op1 == op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 				break;
 				case SBG_GT: if(op1 > op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 				break;
 				case SBG_LT: if(op1 < op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 				break;
 				case SBG_GE: if(op1 >= op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 				break;
 				case SBG_LE: if(op1 <= op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 				break;
 				case SBG_DF: if(op1 != op2)
 					     {
 						res[id] = 1;
 						return;
 					     }
 			}
 		}
 	}
 }
 /*Unmark all rows that do not comply with the comparison predicates using disjunctions (i.e. a row is unmarked only if it complies with none of the predicates)*/
 __global__ void bpredsorlogic(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
 	extern __shared__ int shared[];
 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 	int x, rowact, op1, op2;
 	if(threadIdx.x < numc)
 		shared[threadIdx.x] = cons[threadIdx.x];
 	__syncthreads();
 	if(id < rows)
 	{
 		if(res[id] == 0)
 			return;
 		rowact = id * of1; 
 		for(x = 0; x < numc; x += 3)
 		{
 			op1 = shared[x+1];
 			if(op1 < 0)
 				op1 *= -1;
 			else
 				op1 = dop1[rowact + op1];
 			op2 = shared[x+2];
 			if(op2 < 0)
 				op2 *= -1;
 			else
 				op2 = dop1[rowact + op2];
 			switch(shared[x])
 			{
 				case SBG_EQ: if(op1 == op2)
 						return;
 				break;
 				case SBG_GT: if(op1 > op2)
 						return;
 				break;
 				case SBG_LT: if(op1 < op2)
 						return;
 				break;
 				case SBG_GE: if(op1 >= op2)
 						return;
 				break;
 				case SBG_LE: if(op1 <= op2)
 						return;
 				break;
 				case SBG_DF: if(op1 != op2)
 						return;
 			}
 		}
 		res[id] = 0;
 	}
 }
--- a/packages/cuda/cuda.c
+++ b/packages/cuda/cuda.c
@@ -6,19 +6,25 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <inttypes.h>
 #include "pred.h"
 #define MAXARG 100
 YAP_Atom AtomEq,
  AtomGt,
  AtomLt,
  AtomGe,
  AtomLe,
-  AtomDf;
+  AtomDf,
  AtomNt;
-predicate *facts[100]; /*Temporary solution to maintain facts and rules*/
+predicate *facts[MAXARG]; /*Temporary solution to maintain facts and rules*/
-predicate *rules[100];
+predicate *rules[MAXARG];
 int32_t cf = 0, cr = 0;
 char names[1024];
 // initialize CUDA system
 void Cuda_Initialize( void );
@@ -39,6 +45,19 @@ void init_cuda( void );
 //#define DEBUG_INTERFACE 1
 #ifdef ROCKIT
 static int32_t query[100];
 static int32_t qcont = 0;
 static int cuda_init_query(void)
 {
 	int32_t pname = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG1));
 	query[qcont] = pname;
 	qcont++;
 	query[qcont] = 0;
 	return TRUE;
 }
 #endif
 #if DEBUG_INTERFACE
 static void
 dump_mat(int32_t mat[], int32_t nrows, int32_t ncols)
@@ -83,8 +102,18 @@ int32_t Cuda_NewFacts(predicate *pe)
 #if DEBUG_INTERFACE
  dump_mat( pe->address_host_table, pe->num_rows, pe->num_columns );
 #endif
 #ifdef ROCKIT
  if(cf >= 0)
  {
  	facts[cf] = pe;
 	cf++;
  }
 #else
  facts[cf] = pe;
  cf++;
 #endif
  return TRUE;
 }
@@ -115,7 +144,7 @@ int32_t Cuda_Erase(predicate *pe)
  return TRUE;
 }
-static YAP_Bool
+static int
 load_facts( void ) {
  int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
@@ -164,15 +193,18 @@ load_facts( void ) {
 static int currentFact = 0;
 static predicate *currentPred = NULL;
-static YAP_Bool
+static int
 cuda_init_facts( void ) {
  int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
-  int32_t ncols = YAP_IntOfTerm(YAP_ARG2), i = 0;
+  int32_t ncols = YAP_IntOfTerm(YAP_ARG2);
  int32_t *mat = (int32_t *)malloc(sizeof(int32_t)*nrows*ncols);
  int32_t pname = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG3));
  predicate *pred;
 	strcat(names, YAP_AtomName(YAP_AtomOfTerm(YAP_ARG3)));
 	strcat(names, " ");
  if (!mat)
    return FALSE;
  if (YAP_IsVarTerm( YAP_ARG4)) {
@@ -198,14 +230,16 @@ cuda_init_facts( void ) {
  }
 }
-static YAP_Bool
+static int
 cuda_load_fact( void ) {
  YAP_Term th = YAP_ARG1;
-  int i, j;
+  int i = currentFact;
 #if defined(DATALOG) || defined(TUFFY)
  YAP_Term th = YAP_ARG1;
  int ncols = currentPred->num_columns;
  int j;
  int *mat = currentPred->address_host_table;
  i = currentFact;
  for (j = 0; j < ncols; j++) {
    YAP_Term ta = YAP_ArgOfTerm(j+1, th);
    if (YAP_IsAtomTerm(ta)) {
@@ -214,6 +248,8 @@ cuda_load_fact( void ) {
      mat[i*ncols+j] = YAP_IntOfTerm(ta);
    }
  }
 #endif
  i++;
  if (i == currentPred->num_rows) {
    Cuda_NewFacts(currentPred);
@@ -225,21 +261,26 @@ cuda_load_fact( void ) {
  return TRUE;
 }
-static YAP_Bool
+static int
 load_rule( void ) {
  // maximum of 2K symbols per rule, should be enough for ILP
-  int32_t vec[2048], *ptr = vec, *nvec;
+  int32_t vec[2048], *ptr = vec, *nvec, neg[2048];
  // qK different variables;
  YAP_Term vars[1024];
-  int32_t nvars = 0;
+  int32_t nvars = 0, x;
  int32_t ngoals = YAP_IntOfTerm(YAP_ARG1);   /* gives the number of goals */
  int32_t ncols = YAP_IntOfTerm(YAP_ARG2);
  YAP_Term t3 = YAP_ARG3;
-  int32_t pname = YAP_AtomToInt(YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3))));
+	YAP_Atom name = YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3)));
  int32_t pname = YAP_AtomToInt(name);
 	const char *strname = YAP_AtomName(name);
  predicate *pred;
  int32_t cont = 0;
  memset(neg, 0x0, 2048 * sizeof(int32_t));
  while(YAP_IsPairTerm(t3)) {
-    int32_t j = 0;
+    int32_t j = 0, m;
    YAP_Term th = YAP_HeadOfTerm(t3);
    YAP_Functor f = YAP_FunctorOfTerm( th );
    int32_t n = YAP_ArityOfFunctor( f ); 
@@ -257,8 +298,17 @@ load_rule( void ) {
      *ptr++ = SBG_LE;
    else if (at == AtomDf)
      *ptr++ = SBG_DF;
    else if (at == AtomNt)
 	{
      		neg[cont] = 1;
 		cont++;
 	}
    else
 	{
      		*ptr++ = YAP_AtomToInt( at );
 		cont++;
 	}
    for (j = 0; j < n; j++) {
      YAP_Term ta = YAP_ArgOfTerm(j+1, th);
@@ -277,6 +327,34 @@ load_rule( void ) {
 	}
      } else if (YAP_IsAtomTerm(ta))  {
 	*ptr++ = -YAP_AtomToInt(YAP_AtomOfTerm(ta));
      } else if (YAP_IsApplTerm(ta))  {
 	f = YAP_FunctorOfTerm( ta );
 	at = YAP_NameOfFunctor( f );
 	m = YAP_ArityOfFunctor( f );
 	*ptr++ = YAP_AtomToInt( at );
 	for (x = 0; x < m; x++) {
      		YAP_Term ta2 = YAP_ArgOfTerm(x+1, ta);
      		if (YAP_IsVarTerm(ta2)) {
 			int32_t k;
 			for (k = 0; k < nvars; k++) {
 	  			if (vars[k] == ta2) {
 	    				*ptr++ = k+1;
 	    				break;
 	  			}
 			}
 			if (k == nvars) {
 	  			vars[k] = ta2;
 	  			*ptr++ = k+1;
 	  			nvars++;
 			}
      		} else if (YAP_IsAtomTerm(ta2))  {
 			*ptr++ = -YAP_AtomToInt(YAP_AtomOfTerm(ta));
      		} else {
 			*ptr++ = -YAP_IntOfTerm(ta);
      		}
    	}
      } else {
 	*ptr++ = -YAP_IntOfTerm(ta);
      }
@@ -296,53 +374,136 @@ load_rule( void ) {
  pred->num_rows = ngoals;
  pred->num_columns = ncols;
  pred->is_fact = FALSE;
 	x = (strlen(strname) + 1) * sizeof(char);
 	pred->predname = (char *)malloc(x);
 	memcpy(pred->predname, strname, x); 
  nvec = (int32_t *)malloc(sizeof(int32_t)*(ptr-vec));
  memcpy(nvec, vec, sizeof(int32_t)*(ptr-vec));
  pred->address_host_table =  nvec;
  pred->negatives = (int32_t *)malloc(sizeof(int32_t) * cont);
  memcpy(pred->negatives, neg, sizeof(int32_t) * cont);
  Cuda_NewRule( pred );
  return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
 }
-static YAP_Bool
+static int
 cuda_erase( void )
 {
  predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
  return Cuda_Erase( ptr );
 }
-static YAP_Bool
+void setQuery(YAP_Term t1, int32_t **res)
 {
 	int32_t *query = (int32_t *)malloc(MAXARG * sizeof(int32_t));
 	int32_t x, y = 0, *itr;
 	predicate *ptr = NULL;
 	if(YAP_IsPairTerm(t1))
 	{
 		while(YAP_IsPairTerm(t1))
 		{
 			ptr = (predicate *)YAP_IntOfTerm(YAP_HeadOfTerm(t1));
 			query[y] = ptr->name;
 			itr = ptr->address_host_table;
 			x = 2;
 			while(itr[x] != 0)
 				x++;
 			query[y+1] = itr[x+1];
 			t1 = YAP_TailOfTerm(t1);
 			y+=2;
 		}
 	}
 	else
 	{
 		ptr = (predicate *)YAP_IntOfTerm(t1);
 		query[y] = ptr->name;
 		itr = ptr->address_host_table;
 		x = 2;
 		while(itr[x] != 0)
 			x++;
 		query[y+1] = itr[x+1];
 		y += 2;
 	}
 	query[y] = -1;
 	query[y+1] = -1;
 	*res = query;
 }
 static int
 cuda_eval( void )
 {
  int32_t *mat;
 #if defined(DATALOG) || defined(TUFFY)
 	int32_t *query = NULL;
 	setQuery(YAP_ARG1, &query);
 #endif
 	int32_t finalDR = YAP_IntOfTerm(YAP_ARG3);
  int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, names, finalDR);
 #ifdef TUFFY
 	cf = 0;
 #endif
 #ifdef ROCKIT
 	if(cf > 0)
 		cf *= -1;
 #endif
 #if defined(TUFFY) || defined(ROCKIT)
 	cr = 0;
 	names[0] = '\0';
 	return FALSE;
 #else
  int32_t i;
  predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
  int32_t n = Cuda_Eval(facts, cf, rules, cr, ptr, & mat);
  int32_t ncols = ptr->num_columns;
  YAP_Term out = YAP_TermNil();
  YAP_Functor f = YAP_MkFunctor(YAP_IntToAtom(ptr->name), ncols);
  YAP_Term vec[256];
-  int32_t i;
+
 	YAP_Atom at;
  if (n < 0)
    return FALSE;
  for (i=0; i<n; i++) {
    int32_t ni = ((n-1)-i)*ncols, j;
 	printf("%s(", YAP_AtomName(YAP_IntToAtom(ptr->name)));
    for (j=0; j<ncols; j++) {
      vec[j] = YAP_MkIntTerm(mat[ni+j]);
 	at = YAP_IntToAtom(mat[ni+j]);
 	if(at != NULL)
 		printf("%s", YAP_AtomName(at));
 	else
 		printf("%d", mat[ni+j]);	
 	if(j < (ncols - 1))
 		printf(",");
    }
    out = YAP_MkPairTerm(YAP_MkApplTerm( f, ncols, vec ), out);
 	printf(")\n");
  }
  if (n > 0)
    free( mat );
  return YAP_Unify(YAP_ARG2, out);
 #endif
 }
-static YAP_Bool
+static int
 cuda_coverage( void )
 {
  int32_t *mat;
-  predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
+
-  int32_t n = Cuda_Eval(facts, cf, rules, cr, ptr, & mat);
+#if defined(DATALOG) || defined(TUFFY)
-  int32_t ncols = ptr->num_columns;
+	int32_t *query = NULL;
 	setQuery(YAP_ARG1, &query);
 #endif
  int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, 0, 0);
  int32_t post = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG2));
  int32_t i = n/2, min = 0, max = n-1;
  int32_t t0, t1;
@@ -384,11 +545,16 @@ cuda_coverage( void )
  } while ( TRUE );
 }
-static YAP_Bool cuda_count( void )
+static int cuda_count( void )
 {
  int32_t *mat;
-  predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
+
-  int32_t n = Cuda_Eval(facts, cf, rules, cr, ptr, & mat);
+#if defined(DATALOG) || defined(TUFFY)
 	int32_t *query = NULL;
 	setQuery(YAP_ARG1, &query);
 #endif
  int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, 0, 0);
  if (n < 0)
    return FALSE;
@@ -396,7 +562,7 @@ static YAP_Bool cuda_count( void )
  return YAP_Unify(YAP_ARG2, YAP_MkIntTerm(n));
 }
-static YAP_Bool cuda_statistics( void )
+static int cuda_statistics( void )
 {
  Cuda_Statistics();
  return TRUE;
@@ -416,14 +582,20 @@ init_cuda(void)
  AtomGe = YAP_LookupAtom(">=");
  AtomLe = YAP_LookupAtom("=<");
  AtomDf = YAP_LookupAtom("\\=");
  AtomNt = YAP_LookupAtom("not");
  YAP_UserCPredicate("load_facts", load_facts, 4);
  YAP_UserCPredicate("cuda_init_facts", cuda_init_facts, 4);
  YAP_UserCPredicate("cuda_load_fact", cuda_load_fact, 1);
  YAP_UserCPredicate("load_rule", load_rule, 4);
  YAP_UserCPredicate("cuda_erase", cuda_erase, 1);
-  YAP_UserCPredicate("cuda_eval", cuda_eval, 2);
+  YAP_UserCPredicate("cuda_eval", cuda_eval, 3);
  YAP_UserCPredicate("cuda_coverage", cuda_coverage, 4);
  YAP_UserCPredicate("cuda_count", cuda_count, 2);
  YAP_UserCPredicate("cuda_statistics", cuda_statistics, 0);
 #ifdef ROCKIT
  YAP_UserCPredicate("cuda_init_query", cuda_init_query, 1);
 #endif
 }
--- a/packages/cuda/cuda.yap
+++ b/packages/cuda/cuda.yap
@@ -2,10 +2,11 @@
 		 cuda_inline/2,
 		 cuda_rule/2,
 		 cuda_erase/1,
-		 cuda_eval/2,
+		 cuda_eval/3,
 		 cuda_coverage/4,
 		 cuda_statistics/0,
-		 cuda_count/2]).
+		 cuda_count/2,
 		 cuda_query/1]).
 tell_warning :-
 	print_message(warning,functionality(cuda)).
@@ -40,7 +41,7 @@ count_answers(G, N) :-
 cuda_rule((Head :- Body) , IdRules) :-
 	body_to_list( Body, L, [], 1, N),
-	functor(Head, _Na, Ar),
+	functor(Head, Na, Ar),
 	load_rule( N, Ar, [Head|L], IdRules ).
@@ -54,3 +55,5 @@ body_to_list( B, NL, L, N0, N) :-
 body_to_list( B, [B|L], L, N0, N) :-
 	N is N0+1.
 cuda_query(Call) :-
 	cuda_init_query(Call).
--- a/packages/cuda/lista.cu
+++ b/packages/cuda/lista.cu
--- a/packages/cuda/lista.h
+++ b/packages/cuda/lista.h
@@ -25,8 +25,11 @@ typedef struct auxiliar{
 	int *numselfj;
 	int **wherejoin;
 	int *numjoin;
-	int3 num_bpreds;
+	int totalpreds;
-	int *builtin;
+	int **preds;
 	int2 *numpreds;
 	int *negatives;
 	char *rulename;
 	int gen_act;
 	int gen_ant;
 }rulenode;
--- a/packages/cuda/memory.cu
+++ b/packages/cuda/memory.cu
@@ -5,63 +5,101 @@
 #include <thrust/device_vector.h>
 #include "lista.h"
 #include "memory.h"
 #include "pred.h"
 #define MAX_REC 200
 #define HALF_REC (MAX_REC / 2)
 #define MAX_FIX_POINTS 100
 unsigned int avmem;
 memnode temp_storage[MAX_REC];
 /*List used to store information (address, size, etc.) about facts and rule results loaded in the GPU*/
 list<memnode> GPUmem;
 /*List used to store information about rule results offloaded from the GPU to the CPU*/
 list<memnode> CPUmem;
 /*Auxiliary function to sort rule list*/
 bool comparer(const rulenode &r1, const rulenode &r2)
 {
 	return (r1.name > r2.name); 
 }
 /*Used in search functions to compare iterations*/
 bool compareiteration(const memnode &r1, const memnode &r2)
 {
 	return (r1.iteration < r2.iteration); 
 }
 /*Used in search functions to compare names*/
 bool comparename(const memnode &r1, const memnode &r2)
 {
 	return (r1.name > r2.name); 
 }
-void calcular_mem(int dev)
+/*Linear search of 'name' fact*/
 {
 	cudaDeviceProp p;
 	cudaGetDeviceProperties(&p, dev);
 	avmem = p.totalGlobalMem;
 	temp_storage[0].dev_address = NULL;
 	temp_storage[0].size = 0;
 	temp_storage[HALF_REC].dev_address = NULL;
 	temp_storage[HALF_REC].size = 0;
 	//cout << "Initial memory available " << avmem << endl;
 }
 template<class InputIterator>
 InputIterator buscarhecho(InputIterator first, InputIterator last, int name)
 {
 	while(first!=last) 
 	{
-		if(first->name == name) return first;
+		if(first->name == name && first->isrule == 0) return first;
 			++first;
 	}
 	return last;
 }
-list<memnode>::iterator buscarpornombre(int name, int itr, int *totalrows, int *gpunum)
+/*Finds all results of rule 'name' in iteration 'itr' in both CPU and GPU memory. Every result found is removed from its respective list*/
 list<memnode>::iterator buscarpornombre(int name, int itr, int *totalrows, int *gpunum, int *cpunum)
 {
-	int x = 1, sum = 0;
+	int x = 0, sum = 0;
 	memnode temp;
-
+	list<memnode>::iterator i;
 	temp.name = name;
 	temp.iteration = itr;
 	pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);
 	while(rec.first != rec.second)
 	{
 		if(rec.first->name == name && rec.first->isrule == 1)
 		{
 			temp_storage[x] = *rec.first;
 			rec.first = GPUmem.erase(rec.first);
 			sum += temp_storage[x].rows;
 			x++;
 		}	
 		else
 			rec.first++;
 	}
 	*gpunum = x;
 	temp.name = name;
 	temp.isrule = 1;
 	i = GPUmem.insert(rec.first, temp);
 	rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
-		//cout << "itr = " << itr << " rec.first = " << rec.first->name << endl;	
+	while(rec.first != rec.second)
 	{				
 		if(rec.first->name == name && rec.first->isrule == 1)
 		{
 			temp_storage[x] = *rec.first;
 			rec.first = CPUmem.erase(rec.first);
 			sum += temp_storage[x].rows;
 			x++;
 		}	
 		else
 			rec.first++;
 	}
 	*totalrows = sum;
 	*cpunum = x;
 	return i;
 }
 list<memnode>::iterator buscarpornombrecpu(int name, int itr, int *totalrows, int *gpunum, int *cpunum)
 {
 	int x = 0, sum = 0;
 	memnode temp;
 	list<memnode>::iterator i;
 	temp.iteration = itr;
 	pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);
 	while(rec.first != rec.second)
 	{				
 		if(rec.first->name == name)
 		{
 			temp_storage[x] = *rec.first;
@@ -72,22 +110,11 @@ list<memnode>::iterator buscarpornombre(int name, int itr, int *totalrows, int *
 		else
 			rec.first++;
 	}
-	//if(x > 1)
+
 	rec.first = GPUmem.insert(rec.first, temp);
 	*totalrows = sum;
 	*gpunum = x;
-	return rec.first;
+	temp.name = name;
-}
+	temp.isrule = 1;
-
+	rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
 int buscarpornombrecpu(int name, int itr, int *totalrows)
 {
 	int x = HALF_REC + 1, sum = 0;
 	memnode temp;
 	temp.iteration = itr;
 	pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
 	/*if(rec.first != rec.second)
 		cout << "bscnomcpu = " << rec.first->name << " " << rec.first->iteration << endl;*/
 	while(rec.first != rec.second)
 	{				
@@ -101,18 +128,24 @@ int buscarpornombrecpu(int name, int itr, int *totalrows)
 		else
 			rec.first++;
 	}
-	*totalrows += sum;
+	i = CPUmem.insert(rec.first, temp);
-	return x;
+	*totalrows = sum;
 	*cpunum = x;
 	return i;
 }
 /*Removes the least recently used memory block from GPU memory, sending it to CPU memory if it's a rule result. 
 If there are no used memory blocks in the GPU and we still don't have enough memory, the program exits with error*/
 void limpiar(const char s[], size_t sz)
 {
 	list<memnode>::iterator ini;
 	memnode temp;
 	size_t free, total;
 	if(GPUmem.size() == 0)
 	{
-		cerr << s << ": not enough GPU memory: have " << avmem << ", need " << sz << " bytes." << endl;
+		cudaMemGetInfo(&free,&total);
 		cerr << s << ": not enough GPU memory: have " << free << " of " << total << ", need " << sz << " bytes." << endl;
 		exit(1);
 	}		
@@ -122,80 +155,32 @@ void limpiar(const char s[], size_t sz)
 		temp = *ini;
 		temp.dev_address = (int *)malloc(ini->size);
 		cudaMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
-		CPUmem.push_back(temp);
+		list<memnode>::iterator pos = lower_bound(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
 		CPUmem.insert(pos, temp);
 	}
-	liberar(ini->dev_address, ini->size);
+	cudaFree(ini->dev_address);
 	GPUmem.erase(ini);
 }
-void limpiartodo(int *p1, int *p2)
+/*Allocs 'size' amount of bytes in GPU memory. If not enough memory is available, removes least recently used memory blocks until 
 enough space is available*/
 void reservar(int **ptr, size_t size)
 {
-	list<memnode>::iterator ini;
+	size_t free, total;
 	memnode temp;
 	int cont = 0;
 	if(p1 != NULL)
 		cont++;	
 	if(p2 != NULL)
 		cont++;
 	ini = GPUmem.begin();
 	/*cout << "ANTES" << endl;
 	mostrar_memoria();
 	mostrar_memcpu();
 	cout << "FIN ANTES" << endl;*/
 	//cout << "mem = " << GPUmem.size() << " " << avmem << endl;
 	while(GPUmem.size() > cont)
 	{
 		if(ini->dev_address == p1 || ini->dev_address == p2)
 		{
 			ini++;
 			continue;
 		}
 		if(ini->isrule)
 		{
 			temp = *ini; 
 			temp.dev_address = (int *)malloc(ini->size);
 			cudaMemcpy(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
 			CPUmem.push_back(temp);
 		}
 		liberar(ini->dev_address, temp.size);
 		ini = GPUmem.erase(ini);
 	}
 	/*cout << "DESPUES" << endl;
 	mostrar_memoria();
 	mostrar_memcpu();
 	cout << "FIN DESPUES" << endl;*/
 	//cout << "memfinal = " << GPUmem.size() << " " << avmem << endl;
 }
 void liberar(int *ptr, int size)
 {
 	//cout << "L " << avmem << " " << size; 
 	cudaFree(ptr);
 #ifdef DEBUG_MEM
 	cerr << "- " << ptr << " " << size << endl;
 #endif
 	avmem += size;
 	//cout << " " << avmem << endl;
 }
 void reservar(int **ptr, int size)
 {
  //size_t free, total;
  //cudaMemGetInfo(      &free, &total	 );
  //	cerr << "? " << free << " " << size << endl;
        if (size == 0) { 
                *ptr = NULL; 
                return;
        }
-	while(avmem < size)
+
 	cudaMemGetInfo(&free, &total);
 	while(free < size)
 	{
 		cout << "Se limpio memoria " << free << " " << total << endl;
 		limpiar("not enough memory", size);
 		cudaMemGetInfo(&free, &total);
 	}
 	while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
 		limpiar("Error in memory allocation", size);
 	if (! *ptr ) {
@@ -205,11 +190,9 @@ void reservar(int **ptr, int size)
 	  cerr << "Exiting CUDA...." << endl;
 	  exit(1);
 	}
 	avmem -= size;
 	// cout << " " << avmem << endl;
 }
 /*Creates a new entry in the GPU memory list*/
 void registrar(int name, int num_columns, int *ptr, int rows, int itr, int rule)
 {
 	memnode temp;
@@ -222,6 +205,19 @@ void registrar(int name, int num_columns, int *ptr, int rows, int itr, int rule)
 	GPUmem.push_back(temp);
 }
 void registrarcpu(int name, int num_columns, int *ptr, int rows, int itr, int rule)
 {
 	memnode temp;
 	temp.name = name;
 	temp.dev_address = ptr;
 	temp.rows = rows;
 	temp.size = rows * num_columns * sizeof(int);
 	temp.iteration = itr;
 	temp.isrule = rule;
 	CPUmem.push_back(temp);
 }
 /*Updates the information of an element in a list*/
 template<class InputIterator>
 void actualizar(int num_columns, int *ptr, int rows, InputIterator i)
 {
@@ -230,6 +226,7 @@ void actualizar(int num_columns, int *ptr, int rows, InputIterator i)
 	i->size = rows * num_columns * sizeof(int);
 }
 /*Count the total number of rows generated by rule 'name' in iteration 'iter'*/
 int numrows(int name, int itr)
 {
 	int sum = 0;
@@ -252,16 +249,17 @@ int numrows(int name, int itr)
 	return sum;
 }
 	extern "C" void * YAP_IntToAtom(int);
 	extern  "C" char * YAP_AtomName(void *);
-
+/*Loads facts or rule results in GPU memory. If a fact is already in GPU memory, its pointer is simply returned. Otherwise, 
 memory is reserved and the fact is loaded. Rule results are loaded based on the current iteration 'itr' and both GPU and 
 CPU memories are searched for all instances of said results. The instances are combined into a single one in GPU memory.*/
 int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_host_table, int **ptr, int itr)
 {
 	int numgpu, numcpu, totalrows = 0;
 	int *temp, x;
-	int size, itrant;
+	int size, itrant, inc = 0;
 	list<memnode>::iterator i;
 	memnode fact;
@@ -279,9 +277,6 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
 		}
 		size = num_rows * num_columns * sizeof(int);
 		reservar(&temp, size);
 #ifdef DEBUG_MEM
 		cerr << "+ " << temp << " temp  " << size << endl;
 #endif
 		cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
 		registrar(name, num_columns, temp, num_rows, itr, 0);
 		*ptr = temp;
@@ -290,28 +285,25 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
 	if(itr > 0)
 	{
 		itrant = itr - 1;
-		i = buscarpornombre(name, itrant, &totalrows, &numgpu);
+		i = buscarpornombre(name, itrant, &totalrows, &numgpu, &numcpu);
-		numcpu = buscarpornombrecpu(name, itrant, &totalrows);
+		if((numgpu == 1) && (numcpu == 1))
 		if((numgpu == 2) && (numcpu == (HALF_REC + 1)))
 		{
-			actualizar(num_columns, temp_storage[1].dev_address, temp_storage[1].rows, i);
+			actualizar(num_columns, temp_storage[0].dev_address, temp_storage[0].rows, i);
-			*ptr = temp_storage[1].dev_address;
+			*ptr = temp_storage[0].dev_address;
-			return temp_storage[1].rows;
+			return temp_storage[0].rows;
 		}
 		size = totalrows * num_columns * sizeof(int);
 		reservar(&temp, size);
-#ifdef DEBUG_MEM
+		for(x = 0; x < numgpu; x++)
 		cerr << "+ " << temp << " temp 2  " << size << endl;
 #endif
 		for(x = 1; x < numgpu; x++)
 		{
-			cudaMemcpyAsync(temp + temp_storage[x-1].size, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
+			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
-			liberar(temp_storage[x].dev_address, temp_storage[x].size);
+			inc += temp_storage[x].size / sizeof(int);
 			cudaFree(temp_storage[x].dev_address);
 		}
-		for(x = HALF_REC + 1; x < numcpu; x++)
+		for(; x < numcpu; x++)
 		{
-			cudaMemcpyAsync(temp + temp_storage[x-1].size, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
+			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
 			inc += temp_storage[x].size / sizeof(int);
 			free(temp_storage[x].dev_address);
 		}
 		actualizar(num_columns, temp, totalrows, i);
@@ -321,9 +313,54 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
 	return 0;
 }
 int cargarcpu(int name, int num_rows, int num_columns, int is_fact, int *address_host_table, int **ptr, int itr)
 {
 	int numgpu, numcpu, totalrows = 0;
 	int *temp, x;
 	int size, itrant, inc = 0;
 	list<memnode>::iterator i;
 	if(is_fact)
 	{
 		*ptr = address_host_table;
 		return num_rows;
 	}
 	if(itr > 0)
 	{
 		itrant = itr - 1;
 		i = buscarpornombrecpu(name, itrant, &totalrows, &numgpu, &numcpu);
 		if((numgpu == 0) && (numcpu == 1))
 		{
 			actualizar(num_columns, temp_storage[0].dev_address, temp_storage[0].rows, i);
 			*ptr = temp_storage[0].dev_address;
 			return temp_storage[0].rows;
 		}
 		size = totalrows * num_columns * sizeof(int);
 		temp = (int *)malloc(size);
 		for(x = 0; x < numgpu; x++)
 		{
 			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToHost);
 			inc += temp_storage[x].size / sizeof(int);
 			cudaFree(temp_storage[x].dev_address);
 		}
 		for(; x < numcpu; x++)
 		{
 			memcpy(temp + inc, temp_storage[x].dev_address, temp_storage[x].size);
 			inc += temp_storage[x].size / sizeof(int);
 			free(temp_storage[x].dev_address);
 		}
 		actualizar(num_columns, temp, totalrows, i);
 		*ptr = temp;
 		return totalrows;
 	}
 	return 0;
 }
 /*Loads all results of rule 'name' from both GPU and CPU memories into the GPU*/
 int cargafinal(int name, int cols, int **ptr)
 {
-	int *temp, *ini, cont = 0;
+	int *temp, *ini, cont = 0, numg = 0, numc = 0;
 	memnode bus;
 	bus.name = name;
 	GPUmem.sort(comparename);
@@ -335,6 +372,7 @@ int cargafinal(int name, int cols, int **ptr)
 	while(pos != endg && pos->name == name)
 	{
 		cont += pos->rows;
 		numg++;
 		pos++;
 	}
 	pos = lower_bound(CPUmem.begin(), endc, bus, comparename);
@@ -342,15 +380,41 @@ int cargafinal(int name, int cols, int **ptr)
 	while(pos != endc && pos->name == name)
 	{
 		cont += pos->rows;
 		numc++;
 		pos++;
 	}
-	reservar(&temp, cont * cols * sizeof(int));
+	if(numg == 0 && numc == 0)
-#ifdef DEBUG_MEM
+		return 0;
-	cerr << "+ " << temp << " temp 3 " << cont * cols * sizeof(int) << endl;
+	if(numg == 1 && numc == 0) 
 	{
 		pos = gpu;
 		*ptr = pos->dev_address;
 		cont = pos->rows;
 		GPUmem.erase(pos);
 		#ifdef TUFFY
 		return -cont;
 		#else
 		return cont;
 		#endif
-	ini = temp;	
+	}
 	if(numg == 0 && numc == 1)
 	{
 		pos = cpu;
 		cont = pos->rows;
 		#ifdef TUFFY
 		reservar(&temp, pos->size);
 		cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
 		*ptr = temp;
 		#else
 		*ptr = pos->dev_address;
 		#endif
 		CPUmem.erase(pos);
 		return -cont;
 	}
 	reservar(&temp, cont * cols * sizeof(int));
 	ini = temp;
 	pos = gpu;
 	while(pos != endg && pos->name == name)
 	{
@@ -365,23 +429,13 @@ int cargafinal(int name, int cols, int **ptr)
 		temp += pos->size / sizeof(int);
 		pos++;
 	}
 	/*int x, y;
 	int *hop1 = (int *)malloc(cont * cols * sizeof(int));
 	cudaMemcpy(hop1, ini, cont * cols * sizeof(int), cudaMemcpyDeviceToHost);
 	cout << "select finala" << endl;
 	for(x = 0; x < cont; x++)
 	{
 		for(y = 0; y < cols; y++)
 			cout << hop1[x * cols + y] << " ";
 		cout << endl;
 	}
 	cout << "select finala" << endl;*/
 	*ptr = ini;
 	return cont;
 }
 /*Compares the results of the current iteration against the results of older iterations. 
 Used to avoid infinite computations when the result is not a single fixed-point, but an 
 orbit of points.*/
 bool generadas(int name, int filas, int cols, int itr)
 {
 	int r1, r2, x, fin;
@@ -401,46 +455,26 @@ bool generadas(int name, int filas, int cols, int itr)
 			thrust::device_ptr<int> pt2 = thrust::device_pointer_cast(dop2);
 			r1 = cargar(name, filas, cols, 0, NULL, &dop1, itr - x + 1);
 			thrust::device_ptr<int> pt1 = thrust::device_pointer_cast(dop1);
 			/*int y;
 			int *a = (int *)malloc(r1 * cols * sizeof(int));
 			cudaMemcpy(a, dop1, r1 * cols * sizeof(int), cudaMemcpyDeviceToHost);
 			for(x = 0; x < r1; x++)
 			{
 				for(y = 0; y < cols; y++)
 					cout << a[x * cols + y] << " ";
 			}
 			cout << endl;
 			cudaMemcpy(a, dop2, r1 * cols * sizeof(int), cudaMemcpyDeviceToHost);
 			for(x = 0; x < r1; x++)
 			{
 				for(y = 0; y < cols; y++)
 					cout << a[x * cols + y] << " ";
 			}
 			cout << endl;
 			free(a);*/
 			if(thrust::equal(pt1, pt1 + r1, pt2) == true)
 				return true;
 		}
 	}
 	return false;
 }
 void mostrar_memoria()
 {
-	int x;
+	unsigned int x;
 	list<memnode>::iterator i = GPUmem.begin();
 	cout << "Memoria inicio GPU" << endl;
 	for(x = 0; x < GPUmem.size(); x++, i++)
-		cout << i->name << " " << i->iteration << " " << i->size << endl;
+		cout << i->name << " " << i->iteration << " " << i->isrule << " " << i->rows << " " << i->size << endl;
 	cout << "Memoria fin GPU" << endl;
 }
 void mostrar_memcpu()
 {
-	int x;
+	unsigned int x;
 	list<memnode>::iterator i = CPUmem.begin();
 	cout << "Memoria inicio CPU" << endl;
 	for(x = 0; x < CPUmem.size(); x++, i++)
@@ -448,53 +482,7 @@ void mostrar_memcpu()
 	cout << "Memoria fin CPU" << endl;
 }
-void resultados(vector<rulenode>::iterator first, vector<rulenode>::iterator last)
+/*Clear all rule results from both GPU and CPU memory*/
 {
 	GPUmem.sort(comparename);
 	CPUmem.sort(comparename);
 	list<memnode>::iterator gpu = GPUmem.begin();
 	list<memnode>::iterator cpu = CPUmem.begin();
 	int x, y, of, cols;
 	int *temp, cont = 0;
 	while(first != last)
 	{
 		while(first->name == gpu->name)
 		{
 			temp = (int *)malloc(gpu->size);
 			cudaMemcpy(temp, gpu->dev_address, gpu->size, cudaMemcpyDeviceToHost);
 			cols = gpu->size / (gpu->rows * sizeof(int));
 			cont += gpu->rows;
 			for(x = 0, of = 0; x < gpu->rows; x++)
 			{
 				for(y = 0; y < cols; y++, of++)
 					cout << temp[of] << " ";
 				cout << endl;
 			}
 			cudaFree(gpu->dev_address);
 #ifdef DEBUG_MEM
 			cerr << "- " << gpu->dev_address << " gpu->dev_address" << endl;
 #endif
 			free(temp);
 			gpu++;
 		}
 		while(first->name == cpu->name)
 		{
 			cols = cpu->size / (cpu->rows * sizeof(int));
 			cont += cpu->rows;
 			for(x = 0, of = 0; x < cpu->rows; x++)
 			{
 				for(y = 0; y < cols; y++, of++)
 					cout << cpu->dev_address[of] << " ";
 				cout << endl;
 			}
 			free(cpu->dev_address);
 			cpu++;
 		}
 		first++;
 	}
 	cout << cont << endl;
 }
 void clear_memory()
 {
 	list<memnode>::iterator ini;
@@ -503,15 +491,13 @@ void clear_memory()
 	fin = GPUmem.end();
 	while(ini != fin)
 	{
-	  if (ini->isrule) {
+		if(ini->isrule)
 		{
 			cudaFree(ini->dev_address);
 #ifdef DEBUG_MEM
 	    cerr << "- " << ini->dev_address << " ini->dev_address" << endl;
 #endif
 			ini = GPUmem.erase(ini);
 	  } else {
 	    ini++;
 		}
 		else
 			ini++;
 	}
 	ini = CPUmem.begin();
 	fin = CPUmem.end();
@@ -522,3 +508,68 @@ void clear_memory()
 	}
 	CPUmem.clear();
 }
 /*Clear everything from both GPU and CPU memory*/
 void clear_memory_all()
 {
 	list<memnode>::iterator ini;
 	list<memnode>::iterator fin;
       	ini = GPUmem.begin();
 	fin = GPUmem.end();
 	while(ini != fin)
 	{
 		cudaFree(ini->dev_address);
 		ini++;
 	}
 	GPUmem.clear();
 	ini = CPUmem.begin();
 	fin = CPUmem.end();
 	while(ini != fin)
 	{
 		free(ini->dev_address);
 		ini++;
 	}
 	CPUmem.clear();
 }
 /*Remove all instances of fact 'name' from both CPU and GPU memories*/
 void liberar(int name)
 {
 	list<memnode>::iterator i;
 	memnode fact;
 	i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
 	if(i != GPUmem.end())
 	{
 		fact = *i;
 		GPUmem.erase(i);
 		cudaFree(fact.dev_address);
 	}
 	i = buscarhecho(CPUmem.begin(), CPUmem.end(), name);
 	if(i != CPUmem.end())
 	{
 		fact = *i;
 		CPUmem.erase(i);
 		free(fact.dev_address);
 	}
 }
 /*Add all rows in 'dop1' to the fact 'name' by creating a new array capable of holding both.*/
 void sumar(int name, int *dop1, int cols, int rows)
 {
 	list<memnode>::iterator i;
 	memnode fact;
 	i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
 	int *res, newrows, offset;
 	if(i != GPUmem.end())
 	{
 		fact = *i;
 		newrows = rows + fact.rows;
 		reservar(&res, newrows * cols * sizeof(int));
 		offset = fact.rows * cols;
 		cudaMemcpyAsync(res, fact.dev_address, offset * sizeof(int), cudaMemcpyDeviceToDevice);
 		GPUmem.erase(i);
 		registrar(name, cols, res, newrows, 0, 0);
 		cudaMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), cudaMemcpyDeviceToDevice);
 		cudaFree(fact.dev_address);
 	}
 }
--- a/packages/cuda/memory.h
+++ b/packages/cuda/memory.h
@@ -1,26 +1,27 @@
 #ifndef _MEMORY_H_
 #define _MEMORY_H_
 //#include <thrust/device_vector.h>
 #include <list>
 #include <vector>
 #include "lista.h"
 using namespace std;
 //using namespace thrust;
-void calcular_mem(int);
+bool comparer(const rulenode&, const rulenode&);
 void liberar(int*, int);
 void limpiar(const char [], size_t);
 void limpiartodo(int*, int*);
 int cargar(int, int, int, int, int*, int**, int);
 int cargarcpu(int, int, int, int, int*, int**, int);
 int cargafinal(int, int, int**);
-void reservar(int**, int);
+void reservar(int**, size_t);
 void registrar(int, int, int*, int, int, int);
 void registrarcpu(int, int, int*, int, int, int);
 bool generadas(int, int, int, int);
 void sumar(int, int*, int, int);
 void liberar(int);
 void mostrar_memoria(void);
 void mostrar_memcpu(void);
 void clear_memory(void);
-void resultados(vector<rulenode>::iterator, vector<rulenode>::iterator);
+void clear_memory_all(void);
 #endif
--- a/packages/cuda/pred.h
+++ b/packages/cuda/pred.h
@@ -9,11 +9,17 @@ typedef struct Nodo{
 	int num_columns;
 	int is_fact;
 	int *address_host_table;
 	int *negatives;
 	char *predname;
 	double *weight;
 }gpunode;
 typedef gpunode predicate;
 //#define TIMER 1
 #define DATALOG 1
 #define NUM_T 4
 #define INISIZE 1000000
 #if TIMER
 typedef struct Stats{
@@ -27,6 +33,8 @@ typedef struct Stats{
 extern statinfo cuda_stats;
 #endif
 /*Constants used to mark comparison predicates*/
 #define BPOFFSET (-6)
 #define SBG_EQ  (-1)
 #define SBG_GT  (-2)
 #define SBG_LT  (-3)
@@ -34,6 +42,6 @@ extern statinfo cuda_stats;
 #define SBG_LE  (-5)
 #define SBG_DF  (-6)
-int Cuda_Eval(predicate**, int, predicate**, int, predicate*, int**);
+int Cuda_Eval(predicate**, int, predicate**, int, int*, int**, char*, int);
 void  Cuda_Statistics( void );
 #endif
--- a/packages/cuda/selectproyect.cu
+++ b/packages/cuda/selectproyect.cu
@@ -1,10 +1,11 @@
 #include <thrust/device_vector.h>
 //#include <thrust/device_ptr.h>
 #include <thrust/scan.h>
 #include <stdlib.h>
 #include "memory.h"
 #include "bpreds.h"
-__global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res) /*a libreria*/
+/*Mark all rows that comply with the selections*/
 __global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res)
 {
 	extern __shared__ int shared[];
 	int id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -24,14 +25,14 @@ __global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *
 		res[id] = 1;
 	}
 }
-
+/*If we already have an array of marks (perhaps because the selfjoin was applied first), 
-__global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res) /*a libreria*/
+we unmark any rows that do not comply with the selections*/
 __global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res)
 {
 	extern __shared__ int shared[];
    	int *spos = &shared[numc];
 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 	int x, rowact, posact;
-	if(threadIdx.x < (numc * 2))
+	if(threadIdx.x < numc)
 		shared[threadIdx.x] = cons[threadIdx.x];
 	__syncthreads();
 	if(id < rows)
@@ -39,10 +40,10 @@ __global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int
 		if(res[id] == 0)
 			return;
 		rowact = id * cols;
-		for(x = 0; x < numc; x++)
+		for(x = 0; x < numc; x += 2)
 		{
-			posact = rowact + spos[x];
+			posact = rowact + shared[x];
-			if(dop1[posact] != shared[x])
+			if(dop1[posact] != shared[x+1])
 			{
 				res[id] = 0;
 				return;
@@ -51,6 +52,7 @@ __global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int
 	}
 }
 /*Unmark all rows that do not comply with the selfjoins.*/
 __global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
 {
 	extern __shared__ int shared[];
@@ -66,12 +68,12 @@ __global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, in
 		pos = id * cols;
 		for(x = 0; x < cont; x++)
 		{
-			temp = shared[x];
+			temp = dop1[pos+shared[x]];
 			y = x + 1;
 			temp2 = shared[y];
 			while(temp2 > -1)
 			{
-				if(dop1[temp+pos] != dop1[temp2+pos])
+				if(temp != dop1[temp2+pos])
 				{
 					res[id] = 0;
 					return;
@@ -84,6 +86,7 @@ __global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, in
 	}
 }
 /*Mark all rows that comply with the selfjoins*/
 __global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
 {
 	extern __shared__ int shared[];
@@ -97,12 +100,12 @@ __global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, i
 		pos = id * cols;
 		for(x = 0; x < cont; x++)
 		{
-			temp = shared[x];
+			temp = dop1[pos+shared[x]];
 			y = x + 1;
 			temp2 = shared[y];
 			while(temp2 > -1)
 			{
-				if(dop1[temp+pos] != dop1[temp2+pos])
+				if(temp != dop1[temp2+pos])
 					return;
 				y++;
 				temp2 = shared[y];
@@ -113,6 +116,7 @@ __global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, i
 	}
 }
 /*Project all columns found in 'dhead' to a new array 'res'*/
 __global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize, int *res)
 {
 	extern __shared__ int shared[];
@@ -130,76 +134,31 @@ __global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize,
 	}
 }
 /*Project all columns found in 'dhead' using only the rows marked as valid (i.e. those that complied with 
 selections, selfjoins, etc.). The array 'temp' holds the result of the prefix sum of said marks.*/
 __global__ void llenarproyectar(int *dop1, int rows, int cols, int *temp, int *dhead, int hsize, int *res)
 {
 	extern __shared__ int shared[];
 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 	int pos, posr, x;
-	if(threadIdx.x < cols)
+	if(threadIdx.x < hsize)
 		shared[threadIdx.x] = dhead[threadIdx.x];
 	__syncthreads();
 	if(id < rows)
 	{		
-		posr = temp[id+1];
+		posr = temp[id];
-		if(temp[id] != posr && posr > 0)
+		if(temp[id+1] != posr)
 		{
 			pos = id * cols;
-			posr = (posr - 1) * hsize;			
+			posr *= hsize;			
 			for(x = 0; x < hsize; x++, posr++)
 				res[posr] = dop1[pos+shared[x]];
 		}
 	}
 }
-/*__global__ void removedup()
+/*Performs selections, selfjoins and comparison predicates when the rule has a single normal predicate.*/
-{
+int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int numselect, int *selfjoin, int numselfj, int *preds, int numpreds, int *project, int **ret, int ANDlogic)
 	extern __shared__ int shared[];
 	int id = blockIdx.x * blockDim.x + threadIdx.x;
 	if(threadIdx.x < cols)
 		shared[threadIdx.x] = dhead[threadIdx.x];
 	if(id < rows)
 	{
 	}
 }*/
 template<typename T> /*a libreria*/
 struct suma : public binary_function<T,T,T>
 {
 	__host__ __device__ 
 	T operator()(const T &r1, const T &r2)
 	{
 		if(r1 > -1)
 		{
 			if(r2 > 0)
 				return r1 + r2;
 			return -r1;
 		}
 		else
 		{
 			if(r2 > 0)
 				return abs(r1) + r2;
 			return r1;
 		}
 	}
 };
 int mayor(int a, int b, int c)
 {
 	if(a > b)
 	{
 		if(a > c)
 			return a;
 	}
 	else
 	{
 		if(b > c)
 			return b;
 	}
 	return c;
 }
 int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int numselect, int *selfjoin, int numselfj, int *project, int **ret)
 {
 	int *fres = NULL, *temp = NULL;
 	int *dhead = NULL, tmplen;
@@ -209,30 +168,27 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 #if TIMER
 	cuda_stats.selects++;
 #endif
-	int head_bytes = mayor(numselect, numselfj, head_size) * sizeof(int);
+
 	int head_bytes = maximo(4, numselect, numselfj, numpreds, head_size) * sizeof(int);
 	reservar(&dhead, head_bytes);
-#ifdef DEBUG_MEM
+	int numthreads = 1024;
-	cerr << "+ " << dhead << " dhead  " << head_bytes << endl;
+	//int numthreads = 32;
 	int blockllen = rows / numthreads + 1;
 	#ifdef ROCKIT
 		ANDlogic = 1;
 	#endif
 	int blockllen = rows / 1024 + 1;
 	int numthreads = 1024;
 	//removerep(dop1, rows, cols, dhead,) 
 	if(numselect > 0)
 	{		
 		tmplen = rows + 1;
 		size2 = tmplen * sizeof(int);
 		reservar(&temp, size2);
 #ifdef DEBUG_MEM
 		cerr << "+ " << temp << " temp  select " << size2 << endl;
 #endif
 		cudaMemset(temp, 0, size2);
 		size = numselect * sizeof(int);
 		cudaMemcpy(dhead, select, size, cudaMemcpyHostToDevice);
-		marcar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);
+		marcar2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);
 		if(numselfj > 0)
 		{
@@ -241,6 +197,16 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 			samejoin<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
 		}
 		if(numpreds > 0)
 		{
 			size = numpreds * sizeof(int);
 			cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
 			if(ANDlogic)
 				bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
 			else
 				bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
 		}
 		res = thrust::device_pointer_cast(temp);
 		thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
 		num = res[rows];
@@ -249,13 +215,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 		size = head_size * sizeof(int);
 		reservar(&fres, num * size);
 #ifdef DEBUG_MEM
 		cerr << "+ " << fres << " fres select  " << num*size << endl;
 #endif
 		cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
 		llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
-		liberar(dhead, head_bytes);
+		cudaFree(dhead);
-		liberar(temp, size2);
+		cudaFree(temp);
 		*ret = fres;
 		return num;
 	}
@@ -266,15 +229,22 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 			tmplen = rows + 1;
 			size2 = tmplen * sizeof(int);
 			reservar(&temp, size2);
 #ifdef DEBUG_MEM
 			cerr << "+ " << temp << " temp select  " << size2 << endl;
 #endif
 			cudaMemset(temp, 0, size2);
 			size = numselfj * sizeof(int);
 			cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
 			samejoin2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
 			if(numpreds > 0)
 			{
 				size = numpreds * sizeof(int);
 				cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
 				if(ANDlogic)
 					bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
 				else
 					bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
 			}
 			res = thrust::device_pointer_cast(temp);
 			thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
 			num = res[rows];
@@ -283,13 +253,41 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 			size = head_size * sizeof(int);
 			reservar(&fres, num * size);
 #ifdef DEBUG_MEM
 			cerr << "+ " << fres << " fres select again  " << num*size << endl;
 #endif
 			cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
 			llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
-			liberar(dhead, head_bytes);
+			cudaFree(dhead);
-			liberar(temp, size2);
+			cudaFree(temp);
 			*ret = fres;
 			return num;
 		}
 		else
 		{
 			if(numpreds > 0)
 			{
 				tmplen = rows + 1;
 				size2 = tmplen * sizeof(int);
 				reservar(&temp, size2);
 				cudaMemset(temp, 0, size2);		
 				size = numpreds * sizeof(int);
 				cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
 				if(ANDlogic)
 					bpredsnormal2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);					
 				else
 					bpredsorlogic2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
 				res = thrust::device_pointer_cast(temp);
 				thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
 				num = res[rows];
 				if(num == 0)
 					return 0;
 				size = head_size * sizeof(int);
 				reservar(&fres, num * size);
 				cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
 				llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
 				cudaFree(dhead);
 				cudaFree(temp);
 				*ret = fres;
 				return num;
 			}
@@ -297,14 +295,12 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 			{
 				size = head_size * sizeof(int);
 				reservar(&fres, rows * size);
 #ifdef DEBUG_MEM
 			cerr << "+ " << fres << " fres select third  " << rows*size << endl;
 #endif
 				cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
 				proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
-			liberar(dhead, head_bytes);
+				cudaFree(dhead);
 				*ret = fres;
 				return rows;
 			}
 		}
 	}
 }
--- a/packages/cuda/treeb.cu
+++ b/packages/cuda/treeb.cu
--- a/packages/cuda/union2.cu
+++ b/packages/cuda/union2.cu
--- a/packages/python/python.c
+++ b/packages/python/python.c
@@ -2158,12 +2158,14 @@ static foreign_t init_python(void) {
  char **argv;
  term_t t = PL_new_term_ref();
  YAP_Argv(&argv);
  if (argv) {
 #if PY_MAJOR_VERSION < 3
    Py_SetProgramName(argv[0]);
 #else
    wchar_t *buf = Py_DecodeLocale(argv[0], NULL);
    Py_SetProgramName(buf);
 #endif
  }
  Py_Initialize();
  py_Main = PyImport_AddModule("__main__");
  py_Builtin = PyImport_AddModule("__builtin__");
`@@ -279,4 +279,5 @@ static void RestoreWorker(int wid USES_REGS) {`




	`}`	`}`