new version of cuda interface

commit d3599da6dc
parent c6d174841a
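This commit ports the CUDA Datalog engine to AMD's HIP runtime: the diff below is almost entirely mechanical one-for-one renames of CUDA runtime calls. As orientation only (not part of the commit), a minimal sketch of the mapping, restricted to calls that actually appear in the diff:

    #include "hip/hip_runtime.h"
    #include <cstddef>

    // The renames this commit applies, e.g.:
    //   cudaMemcpy / cudaMemcpyAsync  -> hipMemcpy / hipMemcpyAsync
    //   cudaMalloc / cudaFree         -> hipMalloc / hipFree
    //   cudaMemGetInfo                -> hipMemGetInfo
    //   cudaSetDevice                 -> hipSetDevice
    //   cudaEvent* functions          -> hipEvent* functions
    //   cudaMemcpyDeviceToHost etc.   -> hipMemcpyDeviceToHost etc.
    int copy_back(int *host, const int *dev, size_t bytes)
    {
        // Same signature shape as the CUDA call it replaces.
        return hipMemcpy(host, dev, bytes, hipMemcpyDeviceToHost) == hipSuccess ? 0 : -1;
    }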
.gitignore (vendored; 2 changes)

@@ -179,3 +179,5 @@ packages/myddas/hh
 packages/myddas/DaysInHospital_Y3.csv

 packages/myddas/agile.csv
+
+*.pyc
packages/cuda/CC_CSSTree.cu (0 changes; Executable file → Normal file)
packages/cuda/CC_CSSTree.h (0 changes; Executable file → Normal file)
packages/cuda/Makefile.in (0 changes; Executable file → Normal file)
packages/cuda/bpreds.cu (37 changes; Executable file → Normal file)

@@ -1,3 +1,4 @@
+#include "hip/hip_runtime.h"
 #include <thrust/device_vector.h>
 #include <thrust/scan.h>
 #include <cstdarg>
@@ -25,10 +26,10 @@ int maximo(int count, ...)
 __global__ void bpreds(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int x, rowact, rowact1, op1, op2;
-    if(threadIdx.x < numc)
-        shared[threadIdx.x] = cons[threadIdx.x];
+    if(hipThreadIdx_x < numc)
+        shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
@@ -110,10 +111,10 @@ __global__ void bpreds(int *dop1, int *dop2, int rows, int of1, int of2, int *co
 __global__ void bpredsnormal2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int x, rowact, op1, op2;
-    if(threadIdx.x < numc)
-        shared[threadIdx.x] = cons[threadIdx.x];
+    if(hipThreadIdx_x < numc)
+        shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
@@ -159,10 +160,10 @@ __global__ void bpredsnormal2(int *dop1, int rows, int of1, int *cons, int numc,
 __global__ void bpredsnormal(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int x, rowact, op1, op2;
-    if(threadIdx.x < numc)
-        shared[threadIdx.x] = cons[threadIdx.x];
+    if(hipThreadIdx_x < numc)
+        shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
@@ -226,10 +227,10 @@ __global__ void bpredsnormal(int *dop1, int rows, int of1, int *cons, int numc,
 __global__ void bpredsOR(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int x, rowact, rowact1, op1, op2;
-    if(threadIdx.x < numc)
-        shared[threadIdx.x] = cons[threadIdx.x];
+    if(hipThreadIdx_x < numc)
+        shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
@@ -344,10 +345,10 @@ __global__ void bpredsOR(int *dop1, int *dop2, int rows, int of1, int of2, int *
 __global__ void bpredsorlogic2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int x, rowact, op1, op2;
-    if(threadIdx.x < numc)
-        shared[threadIdx.x] = cons[threadIdx.x];
+    if(hipThreadIdx_x < numc)
+        shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
@@ -411,10 +412,10 @@ __global__ void bpredsorlogic2(int *dop1, int rows, int of1, int *cons, int numc
 __global__ void bpredsorlogic(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int x, rowact, op1, op2;
-    if(threadIdx.x < numc)
-        shared[threadIdx.x] = cons[threadIdx.x];
+    if(hipThreadIdx_x < numc)
+        shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
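A note on the kernel hunks above: at the time of this port, HIP exposed thread and block coordinates through hipThreadIdx_x-style built-ins rather than CUDA's threadIdx.x (newer HIP also accepts the CUDA spellings). A minimal self-contained sketch of the indexing and shared-memory idiom shared by all six kernels; the kernel name and body are hypothetical:

    #include "hip/hip_runtime.h"

    // Same idiom as bpreds/bpredsnormal/...: stage the comparison constants
    // in shared memory, then process one row per thread.
    __global__ void stage_constants(int *cons, int numc, int rows, int *out)
    {
        extern __shared__ int shared[];
        int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
        if (hipThreadIdx_x < numc)
            shared[hipThreadIdx_x] = cons[hipThreadIdx_x]; // one constant per thread
        __syncthreads();                                   // constants visible to whole block
        if (id < rows)                                     // last block may overshoot
            out[id] = shared[id % numc];
    }

Because the shared array is declared extern, its size must be supplied at launch time as the dynamic shared-memory argument (numc * sizeof(int) here).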
packages/cuda/bpreds.h (1 change; Executable file → Normal file)

@@ -1,3 +1,4 @@
+#include "hip/hip_runtime.h"
 #ifndef _BPREDS_H_
 #define _BPREDS_H_

packages/cuda/bpredscpu.cpp (0 changes; Executable file → Normal file)
packages/cuda/clamp.rb (new file, 52 lines)

require "formula"

# Documentation: https://github.com/Homebrew/homebrew/wiki/Formula-Cookbook
# /usr/local/Library/Contributions/example-formula.rb
# PLEASE REMOVE ALL GENERATED COMMENTS BEFORE SUBMITTING YOUR PULL REQUEST!

class Clamp < Formula
  homepage "https://bitbucket.org/multicoreware/cppamp-driver-ng/wiki/Home"
  version "0.0.1-3"
  url "https://bitbucket.org/multicoreware/cppamp-driver-ng/get/milestone3.tar.bz2"
  head "https://bitbucket.org/multicoreware/cppamp-driver-ng.git"
  sha1 "b8b88306561a60942f8ecbd8ff20554661c4e5f9"

  depends_on "cmake" => :build
  depends_on "wget" => :build
  depends_on "git" => :build
  depends_on "hg" => :build
  depends_on "subversion" => :build
  # depends_on :x11 # if your formula requires any X11/XQuartz components

  def install
    # ENV.deparallelize # if your formula fails when building in parallel

    # Remove unrecognized options if warned by configure
    # system "./configure", "--disable-debug",
    #                       "--disable-dependency-tracking",
    #                       "--disable-silent-rules",
    #                       "--prefix=#{prefix}"
    mkdir "macbuild" do
      args = std_cmake_args
      args << "-DCLANG_URL=https://bitbucket.org/multicoreware/cppamp-ng.git"
      args << "-DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=CBackend"
      args << "-DGMAC_URL=https://bitbucket.org/multicoreware/gmac"
      system 'cmake', "..", *args
      system "make", "world"
      system "cd libc++; make install"
      system "make", "install" # if this fails, try separate make/make install steps
    end
  end

  test do
    # `test do` will create, run in and delete a temporary directory.
    #
    # This test will fail and we won't accept that! It's enough to just replace
    # "false" with the main program this formula installs, but it'd be nice if you
    # were more thorough. Run the test with `brew test milestone`.
    #
    # The installed folder is not in the path, so use the entire path to any
    # executables being tested: `system "#{bin}/program", "do", "something"`.
    system "make", "test"
  end
end
packages/cuda/creator2.c (4 changes; Executable file → Normal file)

@@ -66,7 +66,7 @@ int main(int argc, char *argv[])
     fprintf(cuda, "\t\t\t{\n");
     fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
     fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
-    fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
+    fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);\n");
     fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
     fprintf(cuda, "\t\t\t\t*ret = nres;\n");
     fprintf(cuda, "\t\t\t}\n");
@@ -103,7 +103,7 @@ int main(int argc, char *argv[])
     fprintf(cuda, "\t\t\t{\n");
     fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
     fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
-    fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
+    fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);\n");
     fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
     fprintf(cuda, "\t\t\t\t*ret = nres;\n");
     fprintf(cuda, "\t\t\t}\n");
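creator2.c emits CUDA source as fprintf strings, and the two hunks above swap only the copy-kind constant inside the generated text, leaving the generated cudaMemcpyAsync call name untouched. For reference, a hedged sketch of the equivalent direct HIP call; nres, res and size are hypothetical names mirroring the generated strings:

    #include "hip/hip_runtime.h"

    // Device-to-device copy of a freshly reserved result buffer, as the
    // generated code performs it. Illustration only, not the generator itself.
    void copy_result(int *nres, const int *res, size_t size)
    {
        // Asynchronous with respect to the host; ordered on the null stream.
        hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice, 0);
    }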
packages/cuda/cuda.c (0 changes; Executable file → Normal file)
packages/cuda/cuda.yap (0 changes; Executable file → Normal file)

packages/cuda/dbio.cu (file name inferred from the datalogWrite/postgresWrite/mysqlWrite context; the rendered page dropped this header)

@@ -27,8 +27,8 @@ void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode
             res_rows = unir(dop1, res_rows, cols1, &dop1, 0);
             tipo = res_rows * cols1 * sizeof(int);
             hres = (int *)malloc(tipo);
-            cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
-            cudaFree(dop1);
+            hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
+            hipFree(dop1);
             *result = hres;
         }
         else
@@ -39,13 +39,13 @@ void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode
                 int *dop2;
                 tipo = res_rows * cols1 * sizeof(int);
                 reservar(&dop2, tipo);
-                cudaMemcpy(dop2, dop1, tipo, cudaMemcpyHostToDevice);
+                hipMemcpy(dop2, dop1, tipo, hipMemcpyHostToDevice);
                 free(dop1);
                 res_rows = unir(dop2, res_rows, cols1, &dop2, 0);
                 tipo = res_rows * cols1 * sizeof(int);
                 hres = (int *)malloc(tipo);
-                cudaMemcpy(hres, dop2, tipo, cudaMemcpyDeviceToHost);
-                cudaFree(dop2);
+                hipMemcpy(hres, dop2, tipo, hipMemcpyDeviceToHost);
+                hipFree(dop2);
                 *result = hres;
             }
             else
@@ -315,8 +315,8 @@ void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str,

             tipo = res_rows * cols1 * sizeof(int);
             hres = (int *)malloc(tipo);
-            cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
-            cudaFree(dop1);
+            hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
+            hipFree(dop1);
             w = z + 1;

             strtok(qposr->rulename, "_");
@@ -353,8 +353,8 @@ void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str,
             res_rows = abs(res_rows);
             tipo = res_rows * cols1 * sizeof(int);
             hres = (int *)malloc(tipo);
-            cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
-            cudaFree(dop1);
+            hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
+            hipFree(dop1);

             char file[] = "/dev/shm/buffer.csv";
             FILE *fp;
@@ -554,7 +554,7 @@ void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator f
         sign = tmpfact.predname;
         tipo = res_rows * cols1 * sizeof(int);
         hres = (int *)malloc(tipo);
-        cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
+        hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
         if(sign[0] == 'f' && sign[1] >= '0' && sign[1] <= '9')
             sumar(tmpfact.name, dop1, cols1, res_rows);
     }
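All five hunks above apply the same read-back idiom: copy the device-resident result into a freshly malloc'd host buffer, then release the device copy. A self-contained sketch of that pattern with error checking added (the code in the diff calls hipMemcpy and hipFree unchecked):

    #include "hip/hip_runtime.h"
    #include <cstdlib>

    // Read back a rows*cols int matrix from the device and free the device copy.
    // Returns NULL on failure; the error checks are an illustration, not in the diff.
    int *read_back(int *dev, int rows, int cols)
    {
        size_t bytes = (size_t)rows * cols * sizeof(int);
        int *host = (int *)malloc(bytes);
        if (host == NULL)
            return NULL;
        if (hipMemcpy(host, dev, bytes, hipMemcpyDeviceToHost) != hipSuccess) {
            free(host);
            return NULL;
        }
        hipFree(dev);
        return host;
    }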
packages/cuda/hippy/hippy (new file, empty)
packages/cuda/joincpu.cpp (62 changes; Executable file → Normal file)

@@ -324,11 +324,11 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
     }

     #ifdef TIMER
-    cudaEvent_t start, stop;
+    hipEvent_t start, stop;
     float time;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-    cudaEventRecord(start, 0);
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    hipEventRecord(start, 0);
     #endif

     if(nsel1 > 0 || nsj1 > 0)
@@ -359,16 +359,16 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
     }

     #ifdef TIMER
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&time, start, stop);
+    hipEventRecord(stop, 0);
+    hipEventSynchronize(stop);
+    hipEventElapsedTime(&time, start, stop);
     cuda_stats.select1_time += time;

-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-    cudaEventRecord(start, 0);
+    hipEventDestroy(start);
+    hipEventDestroy(stop);
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    hipEventRecord(start, 0);
     #endif

     if(nsel2 > 0 || nsj2 > 0)
@@ -381,16 +381,16 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
         Snl = sLen;

     #ifdef TIMER
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&time, start, stop);
+    hipEventRecord(stop, 0);
+    hipEventSynchronize(stop);
+    hipEventElapsedTime(&time, start, stop);
     cuda_stats.select2_time += time;

-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-    cudaEventRecord(start, 0);
+    hipEventDestroy(start);
+    hipEventDestroy(stop);
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    hipEventRecord(start, 0);
     #endif

     //cout << "antes" << endl;
@@ -406,16 +406,16 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
     thrust::stable_sort_by_key(thrust::omp::par, Rres, Rres + Rnl, permutation);

     #ifdef TIMER
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&time, start, stop);
+    hipEventRecord(stop, 0);
+    hipEventSynchronize(stop);
+    hipEventElapsedTime(&time, start, stop);
     cuda_stats.sort_time += time;

-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-    cudaEventRecord(start, 0);
+    hipEventDestroy(start);
+    hipEventDestroy(stop);
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    hipEventRecord(start, 0);
     #endif

     /*cout << "despues" << endl;
@@ -482,9 +482,9 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
     *ret = fres;

     #ifdef TIMER
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&time, start, stop);
+    hipEventRecord(stop, 0);
+    hipEventSynchronize(stop);
+    hipEventElapsedTime(&time, start, stop);
     cuda_stats.join_time += time;
     #endif

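The TIMER blocks above port CUDA's event-based timing one for one. A minimal self-contained sketch of the full lifecycle of that pattern, outside the join code:

    #include "hip/hip_runtime.h"
    #include <cstdio>

    // Time a section of GPU work with HIP events, mirroring the #ifdef TIMER
    // blocks in joincpu.cpp and lista.cu.
    int main()
    {
        hipEvent_t start, stop;
        float ms = 0.0f;
        hipEventCreate(&start);
        hipEventCreate(&stop);

        hipEventRecord(start, 0);      // mark the beginning on the null stream
        /* ... kernel launches / memcpys to be timed ... */
        hipEventRecord(stop, 0);       // mark the end
        hipEventSynchronize(stop);     // wait until the stop event has completed
        hipEventElapsedTime(&ms, start, stop);

        printf("elapsed: %f ms\n", ms);
        hipEventDestroy(start);
        hipEventDestroy(stop);
        return 0;
    }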
packages/cuda/lista.cu (40 changes; Executable file → Normal file)

@@ -967,7 +967,7 @@ vector<gpunode> L;
 extern "C"
 int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr, int *inpquery, int **result, char *names, int finalDR)
 {
-    cudaSetDevice(0);
+    hipSetDevice(0);
     vector<rulenode> rules;
     int x;

@@ -1029,11 +1029,11 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
     vector<rulenode>::iterator qposr;

     #if TIMER
-    cudaEvent_t start, stop;
+    hipEvent_t start, stop;
     float time;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-    cudaEventRecord(start, 0);
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    hipEventRecord(start, 0);
     #endif

     while(reglas.size()) /*Here's the main loop*/
@@ -1084,7 +1084,7 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
             {
                 num_refs = rows1 * cols1 * sizeof(int);
                 reservar(&res, num_refs);
-                cudaMemcpyAsync(res, dop1, num_refs, cudaMemcpyDeviceToDevice);
+                hipMemcpyAsync(res, dop1, num_refs, hipMemcpyDeviceToDevice);
                 registrar(rul_act->name, cols1, res, rows1, itr, 1);
                 genflag = 1;
                 rul_act->gen_ant = rul_act->gen_act;
@@ -1251,10 +1251,10 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
             if(x == num_refs)
             {
                 #ifdef TIMER
-                cudaEvent_t start2, stop2;
-                cudaEventCreate(&start2);
-                cudaEventCreate(&stop2);
-                cudaEventRecord(start2, 0);
+                hipEvent_t start2, stop2;
+                hipEventCreate(&start2);
+                hipEventCreate(&stop2);
+                hipEventRecord(start2, 0);
                 #endif

                 //cout << rul_act->name << " res_rows = " << res_rows << endl;
@@ -1263,11 +1263,11 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
                 res_rows = unir(res, res_rows, rul_act->num_columns, &res, 0);

                 #ifdef TIMER
-                cudaEventRecord(stop2, 0);
-                cudaEventSynchronize(stop2);
-                cudaEventElapsedTime(&time, start2, stop2);
-                cudaEventDestroy(start2);
-                cudaEventDestroy(stop2);
+                hipEventRecord(stop2, 0);
+                hipEventSynchronize(stop2);
+                hipEventElapsedTime(&time, start2, stop2);
+                hipEventDestroy(start2);
+                hipEventDestroy(stop2);
                 //cout << "Union = " << time << endl;
                 cuda_stats.union_time += time;
                 #endif
@@ -1319,16 +1319,16 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
     #endif

     #if TIMER
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&time, start, stop);
+    hipEventRecord(stop, 0);
+    hipEventSynchronize(stop);
+    hipEventElapsedTime(&time, start, stop);
     cuda_stats.total_time += time;
     if (time > cuda_stats.max_time)
         cuda_stats.max_time = time;
     if (time < cuda_stats.min_time || cuda_stats.calls == 1)
         cuda_stats.min_time = time;
-    cudaEventDestroy(start);
-    cudaEventDestroy(stop);
+    hipEventDestroy(start);
+    hipEventDestroy(stop);
     Cuda_Statistics();
     #endif

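Cuda_Eval now opens by pinning all work to device 0 with hipSetDevice(0), unconditionally. A small sketch of a more defensive variant; the device-count check is an addition for illustration, not in the diff:

    #include "hip/hip_runtime.h"
    #include <cstdio>

    int select_device(void)
    {
        int count = 0;
        // Fail cleanly when no HIP device is present instead of erroring later.
        if (hipGetDeviceCount(&count) != hipSuccess || count == 0) {
            fprintf(stderr, "no HIP-capable device found\n");
            return -1;
        }
        return hipSetDevice(0) == hipSuccess ? 0 : -1;
    }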
packages/cuda/lista.h (0 changes; Executable file → Normal file)
packages/cuda/memory.cu (44 changes; Executable file → Normal file)

@@ -144,7 +144,7 @@ void limpiar(const char s[], size_t sz)

     if(GPUmem.size() == 0)
     {
-        cudaMemGetInfo(&free,&total);
+        hipMemGetInfo(&free,&total);
         cerr << s << ": not enough GPU memory: have " << free << " of " << total << ", need " << sz << " bytes." << endl;
         exit(1);
     }
@@ -154,11 +154,11 @@ void limpiar(const char s[], size_t sz)
     {
         temp = *ini;
         temp.dev_address = (int *)malloc(ini->size);
-        cudaMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
+        hipMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, hipMemcpyDeviceToHost);
         list<memnode>::iterator pos = lower_bound(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
         CPUmem.insert(pos, temp);
     }
-    cudaFree(ini->dev_address);
+    hipFree(ini->dev_address);
     GPUmem.erase(ini);
 }

@@ -173,19 +173,19 @@ void reservar(int **ptr, size_t size)
         return;
     }

-    cudaMemGetInfo(&free, &total);
+    hipMemGetInfo(&free, &total);
     while(free < size)
     {
         cout << "Se limpio memoria " << free << " " << total << endl;
         limpiar("not enough memory", size);
-        cudaMemGetInfo(&free, &total);
+        hipMemGetInfo(&free, &total);
     }

-    while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
+    while(hipMalloc(ptr, size) == hipErrorMemoryAllocation)
         limpiar("Error in memory allocation", size);
     if (! *ptr ) {
         size_t free, total;
-        cudaMemGetInfo( &free, &total );
+        hipMemGetInfo( &free, &total );
         cerr << "Could not allocate " << size << " bytes, only " << free << " avaliable from total of " << total << " !!!" << endl;
         cerr << "Exiting CUDA...." << endl;
         exit(1);
@@ -277,7 +277,7 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
     }
     size = num_rows * num_columns * sizeof(int);
     reservar(&temp, size);
-    cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
+    hipMemcpyAsync(temp, address_host_table, size, hipMemcpyHostToDevice);
     registrar(name, num_columns, temp, num_rows, itr, 0);
     *ptr = temp;
     return num_rows;
@@ -296,13 +296,13 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
     reservar(&temp, size);
     for(x = 0; x < numgpu; x++)
     {
-        cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
+        hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyDeviceToDevice);
         inc += temp_storage[x].size / sizeof(int);
-        cudaFree(temp_storage[x].dev_address);
+        hipFree(temp_storage[x].dev_address);
     }
     for(; x < numcpu; x++)
     {
-        cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
+        hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyHostToDevice);
         inc += temp_storage[x].size / sizeof(int);
         free(temp_storage[x].dev_address);
     }
@@ -340,9 +340,9 @@ int cargarcpu(int name, int num_rows, int num_columns, int is_fact, int *address
     temp = (int *)malloc(size);
     for(x = 0; x < numgpu; x++)
     {
-        cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToHost);
+        hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyDeviceToHost);
         inc += temp_storage[x].size / sizeof(int);
-        cudaFree(temp_storage[x].dev_address);
+        hipFree(temp_storage[x].dev_address);
     }
     for(; x < numcpu; x++)
     {
@@ -404,7 +404,7 @@ int cargafinal(int name, int cols, int **ptr)
     cont = pos->rows;
     #ifdef TUFFY
     reservar(&temp, pos->size);
-    cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
+    hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyHostToDevice);
     *ptr = temp;
     #else
     *ptr = pos->dev_address;
@@ -418,14 +418,14 @@ int cargafinal(int name, int cols, int **ptr)
     pos = gpu;
     while(pos != endg && pos->name == name)
     {
-        cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyDeviceToDevice);
+        hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyDeviceToDevice);
         temp += pos->size / sizeof(int);
         pos++;
     }
     pos = cpu;
     while(pos != endc && pos->name == name)
     {
-        cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
+        hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyHostToDevice);
         temp += pos->size / sizeof(int);
         pos++;
     }
@@ -493,7 +493,7 @@ void clear_memory()
     {
         if(ini->isrule)
         {
-            cudaFree(ini->dev_address);
+            hipFree(ini->dev_address);
             ini = GPUmem.erase(ini);
         }
         else
@@ -518,7 +518,7 @@ void clear_memory_all()
     fin = GPUmem.end();
     while(ini != fin)
     {
-        cudaFree(ini->dev_address);
+        hipFree(ini->dev_address);
         ini++;
     }
     GPUmem.clear();
@@ -542,7 +542,7 @@ void liberar(int name)
     {
         fact = *i;
         GPUmem.erase(i);
-        cudaFree(fact.dev_address);
+        hipFree(fact.dev_address);
     }
     i = buscarhecho(CPUmem.begin(), CPUmem.end(), name);
     if(i != CPUmem.end())
@@ -566,10 +566,10 @@ void sumar(int name, int *dop1, int cols, int rows)
         newrows = rows + fact.rows;
         reservar(&res, newrows * cols * sizeof(int));
         offset = fact.rows * cols;
-        cudaMemcpyAsync(res, fact.dev_address, offset * sizeof(int), cudaMemcpyDeviceToDevice);
+        hipMemcpyAsync(res, fact.dev_address, offset * sizeof(int), hipMemcpyDeviceToDevice);
         GPUmem.erase(i);
         registrar(name, cols, res, newrows, 0, 0);
-        cudaMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), cudaMemcpyDeviceToDevice);
-        cudaFree(fact.dev_address);
+        hipMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), hipMemcpyDeviceToDevice);
+        hipFree(fact.dev_address);
     }
 }
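reservar() above implements allocate-or-evict: query free memory with hipMemGetInfo, spill cached tables to the host via limpiar(), and retry hipMalloc while it returns hipErrorMemoryAllocation. A compact standalone sketch of that strategy; evict() is a hypothetical stand-in for limpiar():

    #include "hip/hip_runtime.h"
    #include <cstdlib>

    // Placeholder for limpiar(): spill one GPU-resident table to the host and
    // free its device buffer. Returns false when nothing is left to evict.
    static bool evict(void) { return false; }

    // Allocate `size` bytes on the device, evicting cached tables on pressure,
    // in the spirit of reservar() in memory.cu.
    int *alloc_with_eviction(size_t size)
    {
        size_t free_mem = 0, total = 0;
        hipMemGetInfo(&free_mem, &total);
        while (free_mem < size && evict())
            hipMemGetInfo(&free_mem, &total);

        int *ptr = NULL;
        while (hipMalloc(&ptr, size) == hipErrorMemoryAllocation) {
            if (!evict())
                return NULL;   // out of memory and nothing left to spill
        }
        return ptr;
    }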
packages/cuda/memory.h (0 changes; Executable file → Normal file)
packages/cuda/old/cuda.c (new executable file, 601 lines)

// interface to CUDD Datalog evaluation
#include "config.h"
#include "YapInterface.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>
#include "pred.h"

#define MAXARG 100

YAP_Atom AtomEq,
  AtomGt,
  AtomLt,
  AtomGe,
  AtomLe,
  AtomDf,
  AtomNt;

predicate *facts[MAXARG]; /*Temporary solution to maintain facts and rules*/
predicate *rules[MAXARG];
int32_t cf = 0, cr = 0;

char names[1024];

// initialize CUDA system
void Cuda_Initialize( void );

// add/replace a set of facts for predicate pred
int32_t Cuda_NewFacts(predicate *pred);

// add/replace a rule for predicate pred
int32_t Cuda_NewRule(predicate *pred);

// erase predicate pred
int32_t Cuda_Erase(predicate *pred);

// evaluate predicate pred, mat is bound to a vector of solutions, and
// output the count
//int32_t Cuda_Eval(predicate *pred, int32_t **mat); This functions arguments were changed, please see pred.h

void init_cuda( void );

//#define DEBUG_INTERFACE 1

#ifdef ROCKIT
static int32_t query[100];
static int32_t qcont = 0;
static int cuda_init_query(void)
{
  int32_t pname = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG1));
  query[qcont] = pname;
  qcont++;
  query[qcont] = 0;
  return TRUE;
}
#endif

#if DEBUG_INTERFACE
static void
dump_mat(int32_t mat[], int32_t nrows, int32_t ncols)
{
  return;
  int32_t i, j;
  for ( i=0; i< nrows; i++) {
    printf("%d", mat[i*ncols]);
    for (j=1; j < ncols; j++) {
      printf(", %d", mat[i*ncols+j]);
    }
    printf("\n");
  }
}

static void
dump_vec(int32_t vec[], int32_t rows)
{
  int32_t i = 1;
  int32_t j = 0;

  for (j = 0; j < rows; j++) {
    for ( ; vec[i]; i++ ) {
      printf(", %d", vec[i]);
    }
    printf(", 0");
    i++;
  }
  printf("\n");
}
#endif /* DEBUG_INTERFACE */


// stubs, will point at Carlos code.

void Cuda_Initialize( void )
{
}

int32_t Cuda_NewFacts(predicate *pe)
{
#if DEBUG_INTERFACE
  dump_mat( pe->address_host_table, pe->num_rows, pe->num_columns );
#endif

#ifdef ROCKIT
  if(cf >= 0)
  {
    facts[cf] = pe;
    cf++;
  }
#else
  facts[cf] = pe;
  cf++;
#endif

  return TRUE;
}

int32_t Cuda_NewRule(predicate *pe)
{
#if DEBUG_INTERFACE
  dump_vec( pe->address_host_table, pe->num_rows);
#endif
  rules[cr] = pe;
  cr++;
  return TRUE;
}

int32_t Cuda_Erase(predicate *pe)
{
  int i = 0;
  while ( rules[i] != pe )
    i++;
  while (i < cr-1) {
    rules[i] = rules[i+1];
    i++;
  }
  rules[i] = NULL;
  cr--;
  if (pe->address_host_table)
    free( pe->address_host_table );
  free( pe );
  return TRUE;
}

static int
load_facts( void ) {

  int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
  int32_t ncols = YAP_IntOfTerm(YAP_ARG2), i = 0;
  YAP_Term t3 = YAP_ARG3;
  int32_t *mat = (int32_t *)malloc(sizeof(int32_t)*nrows*ncols);
  int32_t pname = YAP_AtomToInt(YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3))));
  predicate *pred;

  while(YAP_IsPairTerm(t3)) {
    int32_t j = 0;
    YAP_Term th = YAP_HeadOfTerm(t3);

    for (j = 0; j < ncols; j++) {
      YAP_Term ta = YAP_ArgOfTerm(j+1, th);
      if (YAP_IsAtomTerm(ta)) {
        mat[i*ncols+j] = YAP_AtomToInt(YAP_AtomOfTerm(ta));
      } else {
        mat[i*ncols+j] = YAP_IntOfTerm(ta);
      }
    }
    t3 = YAP_TailOfTerm( t3 );
    i++;
  }
  if (YAP_IsVarTerm( YAP_ARG4)) {
    // new
    pred = (predicate *)malloc(sizeof(predicate));
  } else {
    pred = (predicate *)YAP_IntOfTerm(YAP_ARG4);
    if (pred->address_host_table)
      free( pred->address_host_table );
  }
  pred->name = pname;
  pred->num_rows = nrows;
  pred->num_columns = ncols;
  pred->is_fact = TRUE;
  pred->address_host_table = mat;
  Cuda_NewFacts(pred);
  if (YAP_IsVarTerm( YAP_ARG4)) {
    return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
  } else {
    return TRUE;
  }
}

static int currentFact = 0;
static predicate *currentPred = NULL;

static int
cuda_init_facts( void ) {

  int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
  int32_t ncols = YAP_IntOfTerm(YAP_ARG2);
  int32_t *mat = (int32_t *)malloc(sizeof(int32_t)*nrows*ncols);
  int32_t pname = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG3));
  predicate *pred;

  strcat(names, YAP_AtomName(YAP_AtomOfTerm(YAP_ARG3)));
  strcat(names, " ");

  if (!mat)
    return FALSE;
  if (YAP_IsVarTerm( YAP_ARG4)) {
    // new
    pred = (predicate *)malloc(sizeof(predicate));
  } else {
    pred = (predicate *)YAP_IntOfTerm(YAP_ARG4);
    if (pred->address_host_table)
      free( pred->address_host_table );
  }
  pred->name = pname;
  pred->num_rows = nrows;
  pred->num_columns = ncols;
  pred->is_fact = TRUE;
  pred->address_host_table = mat;
  currentPred = pred;
  currentFact = 0;

  if (YAP_IsVarTerm( YAP_ARG4)) {
    return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
  } else {
    return TRUE;
  }
}

static int
cuda_load_fact( void ) {

  int i = currentFact;

#if defined(DATALOG) || defined(TUFFY)
  YAP_Term th = YAP_ARG1;
  int ncols = currentPred->num_columns;
  int j;
  int *mat = currentPred->address_host_table;
  for (j = 0; j < ncols; j++) {
    YAP_Term ta = YAP_ArgOfTerm(j+1, th);
    if (YAP_IsAtomTerm(ta)) {
      mat[i*ncols+j] = YAP_AtomToInt(YAP_AtomOfTerm(ta));
    } else {
      mat[i*ncols+j] = YAP_IntOfTerm(ta);
    }
  }
#endif

  i++;
  if (i == currentPred->num_rows) {
    Cuda_NewFacts(currentPred);
    currentPred = NULL;
    currentFact = 0;
  } else {
    currentFact = i;
  }
  return TRUE;
}

static int
load_rule( void ) {
  // maximum of 2K symbols per rule, should be enough for ILP
  int32_t vec[2048], *ptr = vec, *nvec, neg[2048];
  // qK different variables;
  YAP_Term vars[1024];
  int32_t nvars = 0, x;
  int32_t ngoals = YAP_IntOfTerm(YAP_ARG1); /* gives the number of goals */
  int32_t ncols = YAP_IntOfTerm(YAP_ARG2);
  YAP_Term t3 = YAP_ARG3;
  YAP_Atom name = YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3)));
  int32_t pname = YAP_AtomToInt(name);

  const char *strname = YAP_AtomName(name);
  predicate *pred;
  int32_t cont = 0;
  memset(neg, 0x0, 2048 * sizeof(int32_t));

  while(YAP_IsPairTerm(t3)) {
    int32_t j = 0, m;
    YAP_Term th = YAP_HeadOfTerm(t3);
    YAP_Functor f = YAP_FunctorOfTerm( th );
    int32_t n = YAP_ArityOfFunctor( f );
    YAP_Atom at = YAP_NameOfFunctor( f );

    if (at == AtomEq)
      *ptr++ = SBG_EQ;
    else if (at == AtomGt)
      *ptr++ = SBG_GT;
    else if (at == AtomLt)
      *ptr++ = SBG_LT;
    else if (at == AtomGe)
      *ptr++ = SBG_GE;
    else if (at == AtomLe)
      *ptr++ = SBG_LE;
    else if (at == AtomDf)
      *ptr++ = SBG_DF;
    else if (at == AtomNt)
    {
      neg[cont] = 1;
      cont++;
    }
    else
    {
      *ptr++ = YAP_AtomToInt( at );
      cont++;
    }

    for (j = 0; j < n; j++) {
      YAP_Term ta = YAP_ArgOfTerm(j+1, th);

      if (YAP_IsVarTerm(ta)) {
        int32_t k;
        for (k = 0; k< nvars; k++) {
          if (vars[k] == ta) {
            *ptr++ = k+1;
            break;
          }
        }
        if (k == nvars) {
          vars[k] = ta;
          *ptr++ = k+1;
          nvars++;
        }
      } else if (YAP_IsAtomTerm(ta)) {
        *ptr++ = -YAP_AtomToInt(YAP_AtomOfTerm(ta));
      } else if (YAP_IsApplTerm(ta)) {
        f = YAP_FunctorOfTerm( ta );
        at = YAP_NameOfFunctor( f );
        m = YAP_ArityOfFunctor( f );
        *ptr++ = YAP_AtomToInt( at );

        for (x = 0; x < m; x++) {
          YAP_Term ta2 = YAP_ArgOfTerm(x+1, ta);

          if (YAP_IsVarTerm(ta2)) {
            int32_t k;
            for (k = 0; k < nvars; k++) {
              if (vars[k] == ta2) {
                *ptr++ = k+1;
                break;
              }
            }
            if (k == nvars) {
              vars[k] = ta2;
              *ptr++ = k+1;
              nvars++;
            }
          } else if (YAP_IsAtomTerm(ta2)) {
            *ptr++ = -YAP_AtomToInt(YAP_AtomOfTerm(ta));
          } else {
            *ptr++ = -YAP_IntOfTerm(ta);
          }
        }
      } else {
        *ptr++ = -YAP_IntOfTerm(ta);
      }
    }
    *ptr++ = 0;
    t3 = YAP_TailOfTerm( t3 );
  }
  if (YAP_IsVarTerm( YAP_ARG4)) {
    // new
    pred = (predicate *)malloc(sizeof(predicate));
  } else {
    pred = (predicate *)YAP_IntOfTerm(YAP_ARG4);
    if (pred->address_host_table)
      free( pred->address_host_table );
  }
  pred->name = pname;
  pred->num_rows = ngoals;
  pred->num_columns = ncols;
  pred->is_fact = FALSE;
  x = (strlen(strname) + 1) * sizeof(char);
  pred->predname = (char *)malloc(x);
  memcpy(pred->predname, strname, x);
  nvec = (int32_t *)malloc(sizeof(int32_t)*(ptr-vec));
  memcpy(nvec, vec, sizeof(int32_t)*(ptr-vec));
  pred->address_host_table = nvec;
  pred->negatives = (int32_t *)malloc(sizeof(int32_t) * cont);
  memcpy(pred->negatives, neg, sizeof(int32_t) * cont);
  Cuda_NewRule( pred );
  return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
}

static int
cuda_erase( void )
{
  predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
  return Cuda_Erase( ptr );
}

void setQuery(YAP_Term t1, int32_t **res)
{
  int32_t *query = (int32_t *)malloc(MAXARG * sizeof(int32_t));
  int32_t x, y = 0, *itr;
  predicate *ptr = NULL;
  if(YAP_IsPairTerm(t1))
  {
    while(YAP_IsPairTerm(t1))
    {
      ptr = (predicate *)YAP_IntOfTerm(YAP_HeadOfTerm(t1));
      query[y] = ptr->name;
      itr = ptr->address_host_table;
      x = 2;
      while(itr[x] != 0)
        x++;
      query[y+1] = itr[x+1];
      t1 = YAP_TailOfTerm(t1);
      y+=2;
    }
  }
  else
  {
    ptr = (predicate *)YAP_IntOfTerm(t1);
    query[y] = ptr->name;
    itr = ptr->address_host_table;
    x = 2;
    while(itr[x] != 0)
      x++;
    query[y+1] = itr[x+1];
    y += 2;
  }
  query[y] = -1;
  query[y+1] = -1;
  *res = query;
}

static int
cuda_eval( void )
{
  int32_t *mat;

#if defined(DATALOG) || defined(TUFFY)
  int32_t *query = NULL;
  setQuery(YAP_ARG1, &query);
#endif

  int32_t finalDR = YAP_IntOfTerm(YAP_ARG3);
  int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, names, finalDR);

#ifdef TUFFY
  cf = 0;
#endif
#ifdef ROCKIT
  if(cf > 0)
    cf *= -1;
#endif
#if defined(TUFFY) || defined(ROCKIT)
  cr = 0;
  names[0] = '\0';
  return FALSE;
#else
  int32_t i;
  predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
  int32_t ncols = ptr->num_columns;
  YAP_Term out = YAP_TermNil();
  YAP_Functor f = YAP_MkFunctor(YAP_IntToAtom(ptr->name), ncols);
  YAP_Term vec[256];

  YAP_Atom at;

  if (n < 0)
    return FALSE;
  for (i=0; i<n; i++) {
    int32_t ni = ((n-1)-i)*ncols, j;

    printf("%s(", YAP_AtomName(YAP_IntToAtom(ptr->name)));

    for (j=0; j<ncols; j++) {
      vec[j] = YAP_MkIntTerm(mat[ni+j]);

      at = YAP_IntToAtom(mat[ni+j]);
      if(at != NULL)
        printf("%s", YAP_AtomName(at));
      else
        printf("%d", mat[ni+j]);
      if(j < (ncols - 1))
        printf(",");

    }
    out = YAP_MkPairTerm(YAP_MkApplTerm( f, ncols, vec ), out);

    printf(")\n");

  }
  if (n > 0)
    free( mat );
  return YAP_Unify(YAP_ARG2, out);
#endif
}

static int
cuda_coverage( void )
{
  int32_t *mat;

#if defined(DATALOG) || defined(TUFFY)
  int32_t *query = NULL;
  setQuery(YAP_ARG1, &query);
#endif

  int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, 0, 0);
  int32_t post = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG2));
  int32_t i = n/2, min = 0, max = n-1;
  int32_t t0, t1;

  if (n < 0)
    return FALSE;
  if (n == 0) {
    return YAP_Unify(YAP_ARG4, YAP_MkIntTerm(0)) &&
      YAP_Unify(YAP_ARG3, YAP_MkIntTerm(0));
  }
  t0 = mat[0], t1 = mat[(n-1)*2];
  if (t0 == t1) { /* all sametype */
    free( mat );
    /* all pos */
    if (t0 == post)
      return YAP_Unify(YAP_ARG3, YAP_MkIntTerm(n)) &&
        YAP_Unify(YAP_ARG4, YAP_MkIntTerm(0));
    /* all neg */
    return YAP_Unify(YAP_ARG4, YAP_MkIntTerm(n)) &&
      YAP_Unify(YAP_ARG3, YAP_MkIntTerm(0));
  }
  do {
    i = (min+max)/2;
    if (i == min) i++;
    if (mat[i*2] == t0) {
      min = i;
    } else {
      max = i;
    }
    if (min+1 == max) {
      free( mat );
      if (t0 == post)
        return YAP_Unify(YAP_ARG3, YAP_MkIntTerm(max)) &&
          YAP_Unify(YAP_ARG4, YAP_MkIntTerm(n-max));
      /* all neg */
      return YAP_Unify(YAP_ARG4, YAP_MkIntTerm(max)) &&
        YAP_Unify(YAP_ARG3, YAP_MkIntTerm(n-max));
    }
  } while ( TRUE );
}

static int cuda_count( void )
{
  int32_t *mat;

#if defined(DATALOG) || defined(TUFFY)
  int32_t *query = NULL;
  setQuery(YAP_ARG1, &query);
#endif

  int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, 0, 0);

  if (n < 0)
    return FALSE;
  free( mat );
  return YAP_Unify(YAP_ARG2, YAP_MkIntTerm(n));
}

static int cuda_statistics( void )
{
  Cuda_Statistics();
  return TRUE;
}

static int first_time = TRUE;

void
init_cuda(void)
{
  if (first_time) Cuda_Initialize();
  first_time = FALSE;

  AtomEq = YAP_LookupAtom("=");
  AtomGt = YAP_LookupAtom(">");
  AtomLt = YAP_LookupAtom("<");
  AtomGe = YAP_LookupAtom(">=");
  AtomLe = YAP_LookupAtom("=<");
  AtomDf = YAP_LookupAtom("\\=");
  AtomNt = YAP_LookupAtom("not");
  YAP_UserCPredicate("load_facts", load_facts, 4);
  YAP_UserCPredicate("cuda_init_facts", cuda_init_facts, 4);
  YAP_UserCPredicate("cuda_load_fact", cuda_load_fact, 1);
  YAP_UserCPredicate("load_rule", load_rule, 4);
  YAP_UserCPredicate("cuda_erase", cuda_erase, 1);
  YAP_UserCPredicate("cuda_eval", cuda_eval, 3);
  YAP_UserCPredicate("cuda_coverage", cuda_coverage, 4);
  YAP_UserCPredicate("cuda_count", cuda_count, 2);
  YAP_UserCPredicate("cuda_statistics", cuda_statistics, 0);

#ifdef ROCKIT
  YAP_UserCPredicate("cuda_init_query", cuda_init_query, 1);
#endif

}
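old/cuda.c preserves the original YAP glue: plain C functions exported to Prolog through YAP_UserCPredicate. A minimal sketch of that registration pattern, reduced to a single hypothetical predicate and using only YAP calls that appear above:

    #include "YapInterface.h"

    // Hypothetical example: expose a C counter to Prolog as fact_count/1.
    // Mirrors the registration idiom of init_cuda() above; not part of the commit.
    static int32_t nfacts = 0;

    static int fact_count(void)
    {
        // Unify the predicate's single argument with the current counter.
        return YAP_Unify(YAP_ARG1, YAP_MkIntTerm(nfacts));
    }

    void init_example(void)
    {
        YAP_UserCPredicate("fact_count", fact_count, 1);
    }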
603
packages/cuda/old/dbio.cu
Normal file
603
packages/cuda/old/dbio.cu
Normal file
@ -0,0 +1,603 @@
|
|||||||
|
#include <iostream>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "memory.h"
|
||||||
|
#include "union2.h"
|
||||||
|
#include "dbio.h"
|
||||||
|
|
||||||
|
#ifdef DATALOG
|
||||||
|
//template<class InputIterator>
|
||||||
|
//void datalogWrite(int query, InputIterator rul_str, InputIterator fin, int finalDR, int **result)
|
||||||
|
void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, int finalDR, int **result)
|
||||||
|
{
|
||||||
|
rulenode tmprule;
|
||||||
|
vector<rulenode>::iterator qposr;
|
||||||
|
int *dop1, *hres;
|
||||||
|
int cols1, res_rows, tipo;
|
||||||
|
tmprule.name = query;
|
||||||
|
qposr = lower_bound(rul_str, fin, tmprule, comparer);
|
||||||
|
cols1 = qposr->num_columns;
|
||||||
|
res_rows = cargafinal(query, cols1, &dop1);
|
||||||
|
|
||||||
|
if(res_rows != 0)
|
||||||
|
{
|
||||||
|
if(res_rows > 0)
|
||||||
|
{
|
||||||
|
if(finalDR)
|
||||||
|
res_rows = unir(dop1, res_rows, cols1, &dop1, 0);
|
||||||
|
tipo = res_rows * cols1 * sizeof(int);
|
||||||
|
hres = (int *)malloc(tipo);
|
||||||
|
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
|
||||||
|
cudaFree(dop1);
|
||||||
|
*result = hres;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
res_rows *= -1;
|
||||||
|
if(finalDR)
|
||||||
|
{
|
||||||
|
int *dop2;
|
||||||
|
tipo = res_rows * cols1 * sizeof(int);
|
||||||
|
reservar(&dop2, tipo);
|
||||||
|
cudaMemcpy(dop2, dop1, tipo, cudaMemcpyHostToDevice);
|
||||||
|
free(dop1);
|
||||||
|
res_rows = unir(dop2, res_rows, cols1, &dop2, 0);
|
||||||
|
tipo = res_rows * cols1 * sizeof(int);
|
||||||
|
hres = (int *)malloc(tipo);
|
||||||
|
cudaMemcpy(hres, dop2, tipo, cudaMemcpyDeviceToHost);
|
||||||
|
cudaFree(dop2);
|
||||||
|
*result = hres;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
*result = dop1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef TUFFY
|
||||||
|
void postgresRead(PGconn **ret, vector<gpunode> *L, int *inpquery, char *names, int finalDR)
|
||||||
|
{
|
||||||
|
PGresult *pgr;
|
||||||
|
int x, y;
|
||||||
|
int *mat, *mat2;
|
||||||
|
char *tok, sel[1024], **qrs;
|
||||||
|
int w, z = 0, numt, numc, numc2, start = 0, start2, val;
|
||||||
|
PGconn *conn = PQconnectdb("host=localhost port=5432 dbname = prueba user=tuffer password=root");
|
||||||
|
if(PQstatus(conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Connection to database failed: %s", PQerrorMessage(conn));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
pgr = PQexec(conn, "Select nspname from pg_catalog.pg_namespace where oid = (select max(oid) from pg_catalog.pg_namespace)");
|
||||||
|
sprintf(sel, "SET search_path = %s", PQgetvalue(pgr, 0, 0));
|
||||||
|
PQclear(pgr);
|
||||||
|
PQexec(conn, sel);
|
||||||
|
tok = strtok(names, " ");
|
||||||
|
if(finalDR)
|
||||||
|
{
|
||||||
|
qrs = (char **)malloc(100 * sizeof(char *));
|
||||||
|
while(tok != NULL)
|
||||||
|
{
|
||||||
|
sprintf(sel, "Select * from %s limit 0", tok);
|
||||||
|
pgr = PQexec(conn, sel);
|
||||||
|
numc = L->at(z).num_columns;
|
||||||
|
if(tok[0] == 'c')
|
||||||
|
{
|
||||||
|
sprintf(sel, "Select ");
|
||||||
|
numt = numc + 1;
|
||||||
|
for(x = 1; x < numt; x++)
|
||||||
|
{
|
||||||
|
strcat(sel, PQfname(pgr, x));
|
||||||
|
strcat(sel, ", ");
|
||||||
|
}
|
||||||
|
sel[strlen(sel)-2] = '\0';
|
||||||
|
sprintf(sel, "%s from %s", sel, tok);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sprintf(sel, "Select id, Club, ");
|
||||||
|
numt = numc + 6;
|
||||||
|
for(x = 8; x < numt; x++)
|
||||||
|
{
|
||||||
|
strcat(sel, PQfname(pgr, x));
|
||||||
|
strcat(sel, ", ");
|
||||||
|
}
|
||||||
|
sel[strlen(sel)-2] = '\0';
|
||||||
|
sprintf(sel, "%s from %s", sel, tok);
|
||||||
|
}
|
||||||
|
PQclear(pgr);
|
||||||
|
pgr = PQexec(conn, sel);
|
||||||
|
numt = PQntuples(pgr);
|
||||||
|
mat = (int *)malloc(numt * numc * sizeof(int));
|
||||||
|
if(tok[0] == 'c')
|
||||||
|
{
|
||||||
|
for(x = 0; x < numt; x++)
|
||||||
|
{
|
||||||
|
start = x * numc;
|
||||||
|
for(y = 0; y < numc; y++)
|
||||||
|
mat[start + y] = atoi(PQgetvalue(pgr, x, y));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
numc2 = numc - 2;
|
||||||
|
mat2 = (int *)malloc(numt * numc2 * sizeof(int));
|
||||||
|
start = 0;
|
||||||
|
start2 = 0;
|
||||||
|
for(x = 0; x < numt; x++)
|
||||||
|
{
|
||||||
|
w = atoi(PQgetvalue(pgr, x, 1));
|
||||||
|
if(w < 2)
|
||||||
|
{
|
||||||
|
mat[start] = atoi(PQgetvalue(pgr, x, 0));
|
||||||
|
start++;
|
||||||
|
mat[start] = w;
|
||||||
|
start++;
|
||||||
|
if(w > 0)
|
||||||
|
{
|
||||||
|
for(y = 2; y < numc; y++)
|
||||||
|
{
|
||||||
|
val = atoi(PQgetvalue(pgr, x, y));
|
||||||
|
mat[start] = val;
|
||||||
|
mat2[start2] = val;
|
||||||
|
start++;
|
||||||
|
start2++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for(y = 2; y < numc; y++)
|
||||||
|
{
|
||||||
|
val = atoi(PQgetvalue(pgr, x, y));
|
||||||
|
mat[start] = val;
|
||||||
|
start++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for(y = 2; y < numc; y++)
|
||||||
|
{
|
||||||
|
val = atoi(PQgetvalue(pgr, x, y));
|
||||||
|
mat2[start2] = val;
|
||||||
|
start2++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
L->at(z+1).address_host_table = mat2;
|
||||||
|
L->at(z+1).num_rows = start2 / numc2;
|
||||||
|
}
|
||||||
|
L->at(z).address_host_table = mat;
|
||||||
|
L->at(z).num_rows = start / numc;
|
||||||
|
PQclear(pgr);
|
||||||
|
|
||||||
|
x = 1;
|
||||||
|
while(inpquery[x] != -1)
|
||||||
|
{
|
||||||
|
if(L->at(z).name == inpquery[x])
|
||||||
|
{
|
||||||
|
numt = (strlen(tok) + 1) * sizeof(char);
|
||||||
|
qrs[x] = (char *)malloc(numt);
|
||||||
|
memcpy(qrs[x], tok, numt);
|
||||||
|
}
|
||||||
|
x += 2;
|
||||||
|
}
|
||||||
|
if(tok[0] == 'c')
|
||||||
|
{
|
||||||
|
tok = strtok(NULL, " ");
|
||||||
|
z++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
strtok(NULL, " ");
|
||||||
|
tok = strtok(NULL, " ");
|
||||||
|
z += 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
while(tok != NULL)
|
||||||
|
{
|
||||||
|
sprintf(sel, "Select * from %s limit 0", tok);
|
||||||
|
pgr = PQexec(conn, sel);
|
||||||
|
numc = L->at(z).num_columns;
|
||||||
|
if(tok[0] == 'c')
|
||||||
|
{
|
||||||
|
sprintf(sel, "Select weight, myid, ");
|
||||||
|
start = 1;
|
||||||
|
numt = numc + 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sprintf(sel, "Select truth, Club, atomID, ");
|
||||||
|
start = 8;
|
||||||
|
numt = numc + 5;
|
||||||
|
}
|
||||||
|
for(x = start; x < numt; x++)
|
||||||
|
{
|
||||||
|
strcat(sel, PQfname(pgr, x));
|
||||||
|
strcat(sel, ", ");
|
||||||
|
}
|
||||||
|
sel[strlen(sel)-2] = '\0';
|
||||||
|
sprintf(sel, "%s from %s", sel, tok);
|
||||||
|
PQclear(pgr);
|
||||||
|
pgr = PQexec(conn, sel);
|
||||||
|
numt = PQntuples(pgr);
|
||||||
|
mat = (int *)malloc(numt * numc * sizeof(int));
|
||||||
|
L->at(z).weight = (double *)malloc(numt * sizeof(double));
|
||||||
|
L->at(z).num_rows = numt;
|
||||||
|
|
||||||
|
for(x = 0; x < numt; x++)
|
||||||
|
{
|
||||||
|
start = x * numc;
|
||||||
|
for(y = 1; y < numc; y++)
|
||||||
|
mat[start + y] = atoi(PQgetvalue(pgr, x, y));
|
||||||
|
}
|
||||||
|
|
||||||
|
numt *= numc;
|
||||||
|
double flo;
|
||||||
|
if(tok[0] == 'c')
|
||||||
|
{
|
||||||
|
for(x = 0, y = 0; x < numt; x+=numc, y++)
|
||||||
|
{
|
||||||
|
flo = atof(PQgetvalue(pgr, y, 0));
|
||||||
|
L->at(z).weight[y] = flo;
|
||||||
|
if(flo > 0)
|
||||||
|
mat[x] = y + 1;
|
||||||
|
else
|
||||||
|
mat[x] = -y - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for(x = 0, y = 0; x < numt; x+=numc, y++)
|
||||||
|
{
|
||||||
|
if(PQgetvalue(pgr, y, 0)[0] == 't')
|
||||||
|
mat[x] = 2;
|
||||||
|
else
|
||||||
|
mat[x] = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
L->at(z).address_host_table = mat;
|
||||||
|
numc = (strlen(tok) + 1) * sizeof(char);
|
||||||
|
L->at(z).predname = (char *)malloc(numc);
|
||||||
|
memcpy(L->at(z).predname, tok, numc);
|
||||||
|
PQclear(pgr);
|
||||||
|
tok = strtok(NULL, " ");
|
||||||
|
z++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*ret = conn;
|
||||||
|
}
|
||||||
|
|
||||||
|
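/*Illustrative sketch, not part of the original sources: the column-list
  construction used above, assuming a live libpq connection `conn`. The
  helper name `build_select` and buffer sizes are hypothetical. The
  "limit 0" probe returns only metadata, so PQnfields()/PQfname() can
  enumerate column names without fetching rows.*/
static void build_select(PGconn *conn, const char *tbl, char *out, size_t outsz)
{
	char probe[256];
	PGresult *r;
	int i, nf;
	size_t len;

	snprintf(probe, sizeof(probe), "Select * from %s limit 0", tbl);
	r = PQexec(conn, probe);			/* metadata-only result set */
	nf = PQnfields(r);
	len = snprintf(out, outsz, "Select ");
	for(i = 0; i < nf && len < outsz; i++)
		len += snprintf(out + len, outsz - len, i + 1 < nf ? "%s, " : "%s", PQfname(r, i));
	if(len < outsz)
		snprintf(out + len, outsz - len, " from %s", tbl);
	PQclear(r);
}
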
void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, PGconn *conn, int finalDR)
{
	char sel[1024];
	double *matw = NULL;
	int qname, cols1, res_rows, tipo, *dop1;
	int x, w, z, y, *hres;
	rulenode tmprule;
	vector<rulenode>::iterator qposr;
	if(finalDR)
	{
		char file[] = "/dev/shm/mln0_atoms.csv";
		z = 0;
		int seqid = 1;
		FILE *fp;
		fp = fopen(file, "w");
		if(fp == NULL)
		{
			cerr << "Failed to create main memory temporary file, attempting to use hard drive" << endl;
			sprintf(file, "./temp/mln0_atoms.csv");
			fp = fopen(file, "w");
			if(fp == NULL)
			{
				cerr << "Failed to create temporary file on disk" << endl;
				exit(1);
			}
		}
		while((qname = inpquery[z]) != -1)
		{
			tmprule.name = qname;
			qposr = lower_bound(rul_str, fin, tmprule, comparer);
			cols1 = qposr->num_columns;
			res_rows = cargafinal(qname, cols1, &dop1);

			if(res_rows != 0)
			{
				if(res_rows < 0)
					res_rows = unir(dop1, -res_rows, cols1, &dop1, 0); /*duplicate elimination on result*/
				else
					res_rows = unir(dop1, res_rows, cols1, &dop1, finalDR);

				tipo = res_rows * cols1 * sizeof(int);
				hres = (int *)malloc(tipo);
				cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
				cudaFree(dop1);
				w = z + 1;

				strtok(qposr->rulename, "_");
				strtok(NULL, "_");
				int prid = atoi(strtok(NULL, "_"));

				for(x = 0, w = 0; x < res_rows; x++, w+=2)
				{
					if(hres[w+1])
						fprintf(fp, "%d,%d,%d,true\n", seqid, hres[w], prid);
					else
						fprintf(fp, "%d,%d,%d,false\n", seqid, hres[w], prid);
					seqid++;
				}
				free(hres);
			}
			z += 2;
		}
		fclose(fp);
		sprintf(sel, "Copy mln0_atoms(atomid,tupleID,predID,isquery) from '%s' CSV", file);
		PQexec(conn, sel);
	}
	else
	{
		while(rul_str != fin)
		{
			cols1 = rul_str->num_columns;
			res_rows = cargafinal(rul_str->name, cols1, &dop1);
			if(res_rows == 0)
			{
				rul_str++;
				continue;
			}
			res_rows = abs(res_rows);
			tipo = res_rows * cols1 * sizeof(int);
			hres = (int *)malloc(tipo);
			cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
			cudaFree(dop1);

			char file[] = "/dev/shm/buffer.csv";
			FILE *fp;
			fp = fopen(file, "w");
			if(fp == NULL)
			{
				cerr << "Failed to create main memory temporary file, attempting to use hard drive" << endl;
				sprintf(file, "./temp/buffer.csv");
				fp = fopen(file, "w");
				if(fp == NULL)
				{
					cerr << "Failed to create temporary file on disk" << endl;
					exit(1);
				}
			}

			if(rul_str->rulename[0] == 'z')
			{
				char *name = rul_str->rulename + 1;
				for(x = 0; x < ninpf; x++)
				{
					if(strncmp(L->at(x).predname, name, strlen(name)) == 0)
					{
						matw = L->at(x).weight;
						break;
					}
				}

				cols1 -= 3;
				for(x = 0, z = 0; x < res_rows; x++, z+=3)
				{
					for(y = 0; y < cols1; y++, z++)
						fprintf(fp, "%d,", hres[z]);
					fprintf(fp, "%d,%lf,%d\n", hres[z], matw[abs(hres[z+1])-1], hres[z+2]);
				}
				fclose(fp);
				sprintf(sel, "Copy %s from '%s' CSV", name, file);
				PQexec(conn, sel);
			}
			else
			{
				cols1--;
				for(x = 0, z = 0; x < res_rows; x++, z++)
				{
					for(y = 0; y < cols1; y++, z++)
						fprintf(fp, "%d,", hres[z]);
					fprintf(fp, "%d\n", hres[z]);
				}
				fclose(fp);
				sprintf(sel, "Copy %s from '%s' CSV", rul_str->rulename, file);
				PQexec(conn, sel);
			}
			free(hres);
			rul_str++;
		}
	}
	PQfinish(conn);
	if(finalDR)
		clear_memory_all();
}
#endif

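/*Illustrative sketch, not part of the original sources: the write path
  above stages tuples in a CSV under /dev/shm (falling back to ./temp)
  and bulk-loads them with COPY, which is far faster than row-at-a-time
  INSERTs. The table name `facts`, file name and row data below are
  hypothetical; COPY FROM 'file' requires the file to be readable by the
  database server, as the original code also assumes.*/
#include <stdio.h>
#include <libpq-fe.h>

static void copy_csv_example(PGconn *conn)
{
	const char file[] = "/dev/shm/facts.csv";
	FILE *fp = fopen(file, "w");
	if(fp == NULL)
		return;
	for(int i = 0; i < 3; i++)
		fprintf(fp, "%d,%d\n", i, i * i);	/* stage rows as CSV */
	fclose(fp);
	char sel[256];
	snprintf(sel, sizeof(sel), "Copy facts from '%s' CSV", file);
	PGresult *r = PQexec(conn, sel);	/* server-side bulk load */
	PQclear(r);
}
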
#ifdef ROCKIT
void mysqlRead(MYSQL **ret, int *qrs, vector<gpunode> *L, int ninpf, char *names, int finalDR)
{
	char *tok, sel[1024];
	int w, x, y, z = 0, numt, numc;
	int *mat;
	MYSQL *con = mysql_init(NULL);
	if(con == NULL)
	{
		fprintf(stderr, "mysql_init() failed\n");
		exit(1);
	}
	mysql_options(con, MYSQL_OPT_LOCAL_INFILE, NULL);
	mysql_real_connect(con, "localhost", "root", "root", "rockit", 0, NULL, 0);
	if(finalDR)
	{
		y = 0;
		while(qrs[y] != 0)
		{
			for(z = 0; z < ninpf; z++)
			{
				if(qrs[y] == L->at(z).name)
				{
					MYSQL_ROW row;
					sprintf(sel, "Select count(*) from %s", L->at(z).predname);
					mysql_query(con, sel);
					MYSQL_RES *result = mysql_store_result(con);
					row = mysql_fetch_row(result);
					numt = atoi(row[0]);
					mysql_free_result(result);

					if(numt != L->at(z).num_rows)
					{
						liberar(L->at(z).name);
						numc = L->at(z).num_columns;
						sprintf(sel, "Select * from %s", L->at(z).predname);
						mysql_query(con, sel);
						MYSQL_RES *result = mysql_store_result(con);
						mat = (int *)malloc(numt * numc * sizeof(int));
						w = 0;
						while ((row = mysql_fetch_row(result)))
						{
							for(x = 0; x < numc; x++, w++)
								mat[w] = atoi(row[x]);
						}

						mysql_free_result(result);
						if(L->at(z).address_host_table != NULL)
							free(L->at(z).address_host_table);
						L->at(z).address_host_table = mat;
						L->at(z).num_rows = numt;
					}
				}
			}
			y++;
		}
	}
	else
	{
		tok = strtok(names, " ");
		while(tok != NULL)
		{
			numc = L->at(z).num_columns;
			sprintf(sel, "Select * from %s", tok);
			mysql_query(con, sel);
			MYSQL_RES *result = mysql_store_result(con);
			numt = mysql_num_rows(result);

			MYSQL_ROW row;
			mat = (int *)malloc(numt * numc * sizeof(int));
			w = 0;
			if(tok[0] == 'f' && tok[1] >= '0' && tok[1] <= '9')
			{
				while ((row = mysql_fetch_row(result)))
				{
					for(x = 1; x <= numc; x++, w++)
						mat[w] = atoi(row[x]);
				}
			}
			else
			{
				while ((row = mysql_fetch_row(result)))
				{
					for(x = 0; x < numc; x++, w++)
						mat[w] = atoi(row[x]);
				}
			}
			mysql_free_result(result);
			L->at(z).address_host_table = mat;
			L->at(z).num_rows = numt;

			numc = (strlen(tok) + 1) * sizeof(char);
			L->at(z).predname = (char *)malloc(numc);
			strcpy(L->at(z).predname, tok);
			tok = strtok(NULL, " ");
			z++;
		}
	}
	*ret = con;
}

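/*Illustrative sketch, not part of the original sources: mysqlRead stores
  each relation as one row-major int array of num_rows * num_columns
  entries, the flat layout the GPU kernels expect. The helper name
  `fetch_flat` is hypothetical and error handling is omitted.*/
#include <stdio.h>
#include <stdlib.h>
#include <mysql/mysql.h>

static int *fetch_flat(MYSQL *con, const char *table, int numc, int *rows_out)
{
	char sel[256];
	snprintf(sel, sizeof(sel), "Select * from %s", table);
	mysql_query(con, sel);
	MYSQL_RES *result = mysql_store_result(con);
	int numt = mysql_num_rows(result);
	int *mat = (int *)malloc(numt * numc * sizeof(int));
	MYSQL_ROW row;
	int w = 0;
	while((row = mysql_fetch_row(result)))
		for(int x = 0; x < numc; x++, w++)
			mat[w] = atoi(row[x]);	/* row i starts at mat[i * numc] */
	mysql_free_result(result);
	*rows_out = numt;
	return mat;
}
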
void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, MYSQL *con)
{
	int x, y, z, cols1, cols2, res_rows, tipo;
	int *hres, *dop1;
	char *id, *sign, *q1, *q2;
	char sel[1024], weight[1024];
	gpunode tmpfact;
	while(rul_str != fin)
	{
		cols1 = rul_str->num_columns;
		res_rows = cargafinal(rul_str->name, cols1, &dop1);
		id = strtok(rul_str->rulename, "_");
		sprintf(sel, "create table if not exists %s(weight double, ", id);
		for(x = 0; x < cols1; x++)
		{
			sprintf(weight, "a%d char(10), ", x);
			strcat(sel, weight);
		}
		sel[strlen(sel)-2] = ')';
		strcat(sel, "ENGINE = MEMORY DEFAULT CHARSET=latin1");
		mysql_query(con, sel);
		sprintf(sel, "truncate %s", id);
		mysql_query(con, sel);

		if(res_rows == 0)
		{
			rul_str++;
			continue;
		}

		if(res_rows > 0)
		{
			tmpfact = L->at(-rul_str->referencias[rul_str->num_rows - 2] - 1);
			sign = tmpfact.predname;
			tipo = res_rows * cols1 * sizeof(int);
			hres = (int *)malloc(tipo);
			cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
			if(sign[0] == 'f' && sign[1] >= '0' && sign[1] <= '9')
				sumar(tmpfact.name, dop1, cols1, res_rows);
		}
		else
		{
			hres = dop1;
			res_rows = -res_rows;
		}

		sign = strtok(NULL, "_");
		q1 = strtok(NULL, "_");
		q2 = strtok(NULL, "_");
		if(sign[0] == '0')
			sprintf(weight, "%s.%s", q1, q2);
		else
			sprintf(weight, "-%s.%s", q1, q2);

		FILE *fp;
		char file[512];
		sprintf(file, "/dev/shm/%s.tsv", id);
		fp = fopen(file, "w");
		if(fp == NULL)
		{
			cerr << "Failed to create main memory temporary file, attempting to use hard drive" << endl;
			sprintf(file, "./temp/%s.tsv", id);
			fp = fopen(file, "w");
			if(fp == NULL)	/* give up if the disk fallback also fails */
			{
				cerr << "Failed to create temporary file on disk" << endl;
				exit(1);
			}
		}

		cols2 = cols1 - 1;
		for(x = 0, z = 0; x < res_rows; x++, z++)
		{
			fprintf(fp, "%s\t", weight);
			for(y = 0; y < cols2; y++, z++)
				fprintf(fp, "%d\t", hres[z]);
			fprintf(fp, "%d\n", hres[z]);
		}
		fclose(fp);

		sprintf(sel, "LOAD DATA LOCAL INFILE '%s' INTO TABLE %s", file, id);
		mysql_query(con, sel);
		rul_str++;
	}
	mysql_close(con);
}
#endif

28
packages/cuda/old/dbio.h
Normal file
@ -0,0 +1,28 @@
#ifndef _DBIO_H_
#define _DBIO_H_

#include "pred.h"
#ifdef TUFFY
#include <libpq-fe.h>
#endif
#ifdef ROCKIT
#include <mysql/mysql.h>
#endif
#include <vector>
#include "lista.h"

using namespace std;

#ifdef TUFFY
void postgresRead(PGconn **ret, vector<gpunode> *L, int *inpquery, char *names, int finalDR);
void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, PGconn *conn, int finalDR);
#endif
#ifdef ROCKIT
void mysqlRead(MYSQL **ret, int *qrs, vector<gpunode> *L, int ninpf, char *names, int finalDR);
void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, MYSQL *con);
#endif
#ifdef DATALOG
void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, int finalDR, int **result);
#endif

#endif
1337
packages/cuda/old/lista.cu
Executable file
File diff suppressed because it is too large
44
packages/cuda/old/lista.h
Executable file
@ -0,0 +1,44 @@
#ifndef _LISTA_H_
#define _LISTA_H_

/*Entry of the GPU/CPU memory lists: one block of rows belonging to a fact or rule result*/
typedef struct Node{
	int name;
	int *dev_address;
	int rows;
	int size;
	int iteration;
	int isrule;
}memnode;

/*Compiled form of a rule: per-predicate selections, projections, selfjoins and joins*/
typedef struct auxiliar{
	int name;
	int num_rows;
	int num_columns;
	int *address_host_table;
	int *rule_names;
	int *referencias;
	int **select;
	int *numsel;
	int **project;
	int2 *projpos;
	int **selfjoin;
	int *numselfj;
	int **wherejoin;
	int *numjoin;
	int totalpreds;
	int **preds;
	int2 *numpreds;
	int *negatives;
	char *rulename;
	int gen_act;
	int gen_ant;
}rulenode;

typedef struct completed{
	int name;
	int numrules;
	int reduce;
	int reset;
}compnode;

#endif
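/*Illustrative sketch, not part of the original sources: rulenode vectors
  are kept sorted by name (descending, via comparer() defined in
  memory.cu below), so individual rules can be located with
  std::lower_bound, as postgresWrite does above. The helper name
  `find_rule` and the vector contents are hypothetical.*/
#include <algorithm>
#include <vector>

static rulenode *find_rule(std::vector<rulenode> &rules, int name)
{
	rulenode key;
	key.name = name;
	/* rules must already be sorted with the same comparer */
	std::vector<rulenode>::iterator it = std::lower_bound(rules.begin(), rules.end(), key, comparer);
	if(it != rules.end() && it->name == name)
		return &(*it);
	return NULL;
}
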
575
packages/cuda/old/memory.cu
Executable file
@ -0,0 +1,575 @@
#include <list>
#include <iostream>
#include <stdlib.h>
#include <algorithm>
#include <thrust/device_vector.h>
#include "lista.h"
#include "memory.h"
#include "pred.h"

#define MAX_REC 200
#define MAX_FIX_POINTS 100

memnode temp_storage[MAX_REC];
/*List used to store information (address, size, etc.) about facts and rule results loaded in the GPU*/
list<memnode> GPUmem;
/*List used to store information about rule results offloaded from the GPU to the CPU*/
list<memnode> CPUmem;

/*Auxiliary function to sort the rule list*/
bool comparer(const rulenode &r1, const rulenode &r2)
{
	return (r1.name > r2.name);
}

/*Used in search functions to compare iterations*/
bool compareiteration(const memnode &r1, const memnode &r2)
{
	return (r1.iteration < r2.iteration);
}

/*Used in search functions to compare names*/
bool comparename(const memnode &r1, const memnode &r2)
{
	return (r1.name > r2.name);
}

/*Linear search for fact 'name'*/
template<class InputIterator>
InputIterator buscarhecho(InputIterator first, InputIterator last, int name)
{
	while(first != last)
	{
		if(first->name == name && first->isrule == 0) return first;
		++first;
	}
	return last;
}

/*Finds all results of rule 'name' in iteration 'itr' in both CPU and GPU memory. Every result found is removed from its respective list*/
list<memnode>::iterator buscarpornombre(int name, int itr, int *totalrows, int *gpunum, int *cpunum)
{
	int x = 0, sum = 0;
	memnode temp;
	list<memnode>::iterator i;
	temp.iteration = itr;
	pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);

	while(rec.first != rec.second)
	{
		if(rec.first->name == name && rec.first->isrule == 1)
		{
			temp_storage[x] = *rec.first;
			rec.first = GPUmem.erase(rec.first);
			sum += temp_storage[x].rows;
			x++;
		}
		else
			rec.first++;
	}
	*gpunum = x;
	temp.name = name;
	temp.isrule = 1;
	i = GPUmem.insert(rec.first, temp);
	rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);

	while(rec.first != rec.second)
	{
		if(rec.first->name == name && rec.first->isrule == 1)
		{
			temp_storage[x] = *rec.first;
			rec.first = CPUmem.erase(rec.first);
			sum += temp_storage[x].rows;
			x++;
		}
		else
			rec.first++;
	}
	*totalrows = sum;
	*cpunum = x;
	return i;
}

list<memnode>::iterator buscarpornombrecpu(int name, int itr, int *totalrows, int *gpunum, int *cpunum)
{
	int x = 0, sum = 0;
	memnode temp;
	list<memnode>::iterator i;
	temp.iteration = itr;
	pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);

	while(rec.first != rec.second)
	{
		if(rec.first->name == name)
		{
			temp_storage[x] = *rec.first;
			rec.first = GPUmem.erase(rec.first);
			sum += temp_storage[x].rows;
			x++;
		}
		else
			rec.first++;
	}

	*gpunum = x;
	temp.name = name;
	temp.isrule = 1;
	rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);

	while(rec.first != rec.second)
	{
		if(rec.first->name == name)
		{
			temp_storage[x] = *rec.first;
			rec.first = CPUmem.erase(rec.first);
			sum += temp_storage[x].rows;
			x++;
		}
		else
			rec.first++;
	}
	i = CPUmem.insert(rec.first, temp);
	*totalrows = sum;
	*cpunum = x;
	return i;
}

/*Removes the least recently used memory block from GPU memory, sending it to CPU memory if it's a rule result.
If there are no used memory blocks in the GPU and we still don't have enough memory, the program exits with an error*/
void limpiar(const char s[], size_t sz)
{
	list<memnode>::iterator ini;
	memnode temp;
	size_t free, total;

	if(GPUmem.size() == 0)
	{
		cudaMemGetInfo(&free, &total);
		cerr << s << ": not enough GPU memory: have " << free << " of " << total << ", need " << sz << " bytes." << endl;
		exit(1);
	}

	ini = GPUmem.begin();
	if(ini->isrule)
	{
		temp = *ini;
		temp.dev_address = (int *)malloc(ini->size);
		cudaMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
		list<memnode>::iterator pos = lower_bound(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
		CPUmem.insert(pos, temp);
	}
	cudaFree(ini->dev_address);
	GPUmem.erase(ini);
}

/*Allocates 'size' bytes in GPU memory. If not enough memory is available, removes least recently used memory blocks until
enough space is available*/
void reservar(int **ptr, size_t size)
{
	size_t free, total;

	if (size == 0) {
		*ptr = NULL;
		return;
	}

	cudaMemGetInfo(&free, &total);
	while(free < size)
	{
		cout << "Memory freed " << free << " " << total << endl;
		limpiar("not enough memory", size);
		cudaMemGetInfo(&free, &total);
	}

	while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
		limpiar("Error in memory allocation", size);
	if (! *ptr ) {
		cudaMemGetInfo( &free, &total );
		cerr << "Could not allocate " << size << " bytes, only " << free << " available from total of " << total << " !!!" << endl;
		cerr << "Exiting CUDA...." << endl;
		exit(1);
	}
}

/*Creates a new entry in the GPU memory list*/
void registrar(int name, int num_columns, int *ptr, int rows, int itr, int rule)
{
	memnode temp;
	temp.name = name;
	temp.dev_address = ptr;
	temp.rows = rows;
	temp.size = rows * num_columns * sizeof(int);
	temp.iteration = itr;
	temp.isrule = rule;
	GPUmem.push_back(temp);
}

void registrarcpu(int name, int num_columns, int *ptr, int rows, int itr, int rule)
{
	memnode temp;
	temp.name = name;
	temp.dev_address = ptr;
	temp.rows = rows;
	temp.size = rows * num_columns * sizeof(int);
	temp.iteration = itr;
	temp.isrule = rule;
	CPUmem.push_back(temp);
}

/*Updates the information of an element in a list*/
template<class InputIterator>
void actualizar(int num_columns, int *ptr, int rows, InputIterator i)
{
	i->dev_address = ptr;
	i->rows = rows;
	i->size = rows * num_columns * sizeof(int);
}

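/*Illustrative sketch, not part of the original sources: typical usage of
  the allocator above. reservar() evicts least recently used blocks until
  the request fits, and registrar() makes the new block visible to later
  cargar() calls. The sizes and the predicate name 42 are hypothetical.*/
void ejemplo_reservar(void)
{
	int rows = 1024, cols = 3;
	int *dbuf = NULL;
	reservar(&dbuf, rows * cols * sizeof(int));	/* may trigger limpiar() internally */
	registrar(42, cols, dbuf, rows, /*itr*/ 0, /*rule*/ 1);
}
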
/*Counts the total number of rows generated by rule 'name' in iteration 'itr'*/
int numrows(int name, int itr)
{
	int sum = 0;
	memnode temp;
	temp.iteration = itr;
	pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);
	while(rec.first != rec.second)
	{
		if(rec.first->name == name)
			sum += rec.first->rows;
		rec.first++;
	}
	rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
	while(rec.first != rec.second)
	{
		if(rec.first->name == name)
			sum += rec.first->rows;
		rec.first++;
	}
	return sum;
}

extern "C" void * YAP_IntToAtom(int);
extern "C" char * YAP_AtomName(void *);

/*Loads facts or rule results into GPU memory. If a fact is already in GPU memory, its pointer is simply returned. Otherwise,
memory is reserved and the fact is loaded. Rule results are loaded based on the current iteration 'itr' and both GPU and
CPU memories are searched for all instances of said results. The instances are combined into a single one in GPU memory.*/
int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_host_table, int **ptr, int itr)
{
	int numgpu, numcpu, totalrows = 0;
	int *temp, x;
	int size, itrant, inc = 0;
	list<memnode>::iterator i;
	memnode fact;

	if(is_fact)
	{
		i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
		if(i != GPUmem.end())
		{
			fact = *i;
			GPUmem.erase(i);
			fact.iteration = itr;
			*ptr = fact.dev_address;
			GPUmem.push_back(fact);
			return fact.rows;
		}
		size = num_rows * num_columns * sizeof(int);
		reservar(&temp, size);
		cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
		registrar(name, num_columns, temp, num_rows, itr, 0);
		*ptr = temp;
		return num_rows;
	}
	if(itr > 0)
	{
		itrant = itr - 1;
		i = buscarpornombre(name, itrant, &totalrows, &numgpu, &numcpu);
		if((numgpu == 1) && (numcpu == 1))
		{
			actualizar(num_columns, temp_storage[0].dev_address, temp_storage[0].rows, i);
			*ptr = temp_storage[0].dev_address;
			return temp_storage[0].rows;
		}
		size = totalrows * num_columns * sizeof(int);
		reservar(&temp, size);
		for(x = 0; x < numgpu; x++)
		{
			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
			inc += temp_storage[x].size / sizeof(int);
			cudaFree(temp_storage[x].dev_address);
		}
		for(; x < numcpu; x++)
		{
			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
			inc += temp_storage[x].size / sizeof(int);
			free(temp_storage[x].dev_address);
		}
		actualizar(num_columns, temp, totalrows, i);
		*ptr = temp;
		return totalrows;
	}
	return 0;
}

int cargarcpu(int name, int num_rows, int num_columns, int is_fact, int *address_host_table, int **ptr, int itr)
{
	int numgpu, numcpu, totalrows = 0;
	int *temp, x;
	int size, itrant, inc = 0;
	list<memnode>::iterator i;

	if(is_fact)
	{
		*ptr = address_host_table;
		return num_rows;
	}
	if(itr > 0)
	{
		itrant = itr - 1;
		i = buscarpornombrecpu(name, itrant, &totalrows, &numgpu, &numcpu);

		if((numgpu == 0) && (numcpu == 1))
		{
			actualizar(num_columns, temp_storage[0].dev_address, temp_storage[0].rows, i);
			*ptr = temp_storage[0].dev_address;
			return temp_storage[0].rows;
		}
		size = totalrows * num_columns * sizeof(int);
		temp = (int *)malloc(size);
		for(x = 0; x < numgpu; x++)
		{
			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToHost);
			inc += temp_storage[x].size / sizeof(int);
			cudaFree(temp_storage[x].dev_address);
		}
		for(; x < numcpu; x++)
		{
			memcpy(temp + inc, temp_storage[x].dev_address, temp_storage[x].size);
			inc += temp_storage[x].size / sizeof(int);
			free(temp_storage[x].dev_address);
		}
		actualizar(num_columns, temp, totalrows, i);
		*ptr = temp;
		return totalrows;
	}
	return 0;
}

/*Loads all results of rule 'name' from both GPU and CPU memories into the GPU*/
int cargafinal(int name, int cols, int **ptr)
{
	int *temp, *ini, cont = 0, numg = 0, numc = 0;
	memnode bus;
	bus.name = name;
	GPUmem.sort(comparename);
	CPUmem.sort(comparename);
	list<memnode>::iterator endg = GPUmem.end();
	list<memnode>::iterator endc = CPUmem.end();
	list<memnode>::iterator pos = lower_bound(GPUmem.begin(), endg, bus, comparename);
	list<memnode>::iterator gpu = pos;
	while(pos != endg && pos->name == name)
	{
		cont += pos->rows;
		numg++;
		pos++;
	}
	pos = lower_bound(CPUmem.begin(), endc, bus, comparename);
	list<memnode>::iterator cpu = pos;
	while(pos != endc && pos->name == name)
	{
		cont += pos->rows;
		numc++;
		pos++;
	}

	if(numg == 0 && numc == 0)
		return 0;
	if(numg == 1 && numc == 0)
	{
		pos = gpu;
		*ptr = pos->dev_address;
		cont = pos->rows;
		GPUmem.erase(pos);
#ifdef TUFFY
		return -cont;
#else
		return cont;
#endif
	}
	if(numg == 0 && numc == 1)
	{
		pos = cpu;
		cont = pos->rows;
#ifdef TUFFY
		reservar(&temp, pos->size);
		cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
		*ptr = temp;
#else
		*ptr = pos->dev_address;
#endif
		CPUmem.erase(pos);
		return -cont;
	}

	reservar(&temp, cont * cols * sizeof(int));
	ini = temp;
	pos = gpu;
	while(pos != endg && pos->name == name)
	{
		cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyDeviceToDevice);
		temp += pos->size / sizeof(int);
		pos++;
	}
	pos = cpu;
	while(pos != endc && pos->name == name)
	{
		cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
		temp += pos->size / sizeof(int);
		pos++;
	}
	*ptr = ini;
	return cont;
}

/*Compares the results of the current iteration against the results of older iterations.
Used to avoid infinite computations when the result is not a single fixed point, but an
orbit of points.*/
bool generadas(int name, int filas, int cols, int itr)
{
	int r1, r2, x, fin;
	int *dop1, *dop2;

	r2 = numrows(name, itr);
	if(itr < MAX_FIX_POINTS)
		fin = itr;
	else
		fin = MAX_FIX_POINTS;
	for(x = 1; x <= fin; x++)
	{
		r1 = numrows(name, itr - x);
		if(r1 == r2)
		{
			r2 = cargar(name, filas, cols, 0, NULL, &dop2, itr + 1);
			thrust::device_ptr<int> pt2 = thrust::device_pointer_cast(dop2);
			r1 = cargar(name, filas, cols, 0, NULL, &dop1, itr - x + 1);
			thrust::device_ptr<int> pt1 = thrust::device_pointer_cast(dop1);
			if(thrust::equal(pt1, pt1 + r1, pt2) == true)
				return true;
		}
	}
	return false;
}

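/*Illustrative sketch, not part of the original sources: the fixed-point
  test above boils down to thrust::equal over two raw device arrays. A
  minimal standalone version of that comparison, with hypothetical data:*/
#include <thrust/device_vector.h>
#include <thrust/equal.h>

bool ejemplo_equal(void)
{
	int h[4] = {1, 2, 3, 4};
	thrust::device_vector<int> a(h, h + 4);
	thrust::device_vector<int> b(h, h + 4);
	/* true iff every element of a matches the corresponding element of b */
	return thrust::equal(a.begin(), a.end(), b.begin());
}
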
void mostrar_memoria()
{
	unsigned int x;
	list<memnode>::iterator i = GPUmem.begin();
	cout << "GPU memory list start" << endl;
	for(x = 0; x < GPUmem.size(); x++, i++)
		cout << i->name << " " << i->iteration << " " << i->isrule << " " << i->rows << " " << i->size << endl;
	cout << "GPU memory list end" << endl;
}

void mostrar_memcpu()
{
	unsigned int x;
	list<memnode>::iterator i = CPUmem.begin();
	cout << "CPU memory list start" << endl;
	for(x = 0; x < CPUmem.size(); x++, i++)
		cout << i->name << " " << i->iteration << endl;
	cout << "CPU memory list end" << endl;
}

/*Clears all rule results from both GPU and CPU memory*/
void clear_memory()
{
	list<memnode>::iterator ini;
	list<memnode>::iterator fin;
	ini = GPUmem.begin();
	fin = GPUmem.end();
	while(ini != fin)
	{
		if(ini->isrule)
		{
			cudaFree(ini->dev_address);
			ini = GPUmem.erase(ini);
		}
		else
			ini++;
	}
	ini = CPUmem.begin();
	fin = CPUmem.end();
	while(ini != fin)
	{
		free(ini->dev_address);
		ini++;
	}
	CPUmem.clear();
}

/*Clears everything from both GPU and CPU memory*/
void clear_memory_all()
{
	list<memnode>::iterator ini;
	list<memnode>::iterator fin;
	ini = GPUmem.begin();
	fin = GPUmem.end();
	while(ini != fin)
	{
		cudaFree(ini->dev_address);
		ini++;
	}
	GPUmem.clear();
	ini = CPUmem.begin();
	fin = CPUmem.end();
	while(ini != fin)
	{
		free(ini->dev_address);
		ini++;
	}
	CPUmem.clear();
}

/*Removes all instances of fact 'name' from both CPU and GPU memories*/
void liberar(int name)
{
	list<memnode>::iterator i;
	memnode fact;
	i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
	if(i != GPUmem.end())
	{
		fact = *i;
		GPUmem.erase(i);
		cudaFree(fact.dev_address);
	}
	i = buscarhecho(CPUmem.begin(), CPUmem.end(), name);
	if(i != CPUmem.end())
	{
		fact = *i;
		CPUmem.erase(i);
		free(fact.dev_address);
	}
}

/*Adds all rows in 'dop1' to the fact 'name' by creating a new array capable of holding both.*/
void sumar(int name, int *dop1, int cols, int rows)
{
	list<memnode>::iterator i;
	memnode fact;
	i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
	int *res, newrows, offset;
	if(i != GPUmem.end())
	{
		fact = *i;
		newrows = rows + fact.rows;
		reservar(&res, newrows * cols * sizeof(int));
		offset = fact.rows * cols;
		cudaMemcpyAsync(res, fact.dev_address, offset * sizeof(int), cudaMemcpyDeviceToDevice);
		GPUmem.erase(i);
		registrar(name, cols, res, newrows, 0, 0);
		cudaMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), cudaMemcpyDeviceToDevice);
		cudaFree(fact.dev_address);
	}
}

27
packages/cuda/old/memory.h
Executable file
@ -0,0 +1,27 @@
#ifndef _MEMORY_H_
#define _MEMORY_H_

#include <list>
#include <vector>
#include "lista.h"

using namespace std;

bool comparer(const rulenode&, const rulenode&);
void limpiar(const char [], size_t);
void limpiartodo(int*, int*);
int cargar(int, int, int, int, int*, int**, int);
int cargarcpu(int, int, int, int, int*, int**, int);
int cargafinal(int, int, int**);
void reservar(int**, size_t);
void registrar(int, int, int*, int, int, int);
void registrarcpu(int, int, int*, int, int, int);
bool generadas(int, int, int, int);
void sumar(int, int*, int, int);
void liberar(int);
void mostrar_memoria(void);
void mostrar_memcpu(void);
void clear_memory(void);
void clear_memory_all(void);

#endif
47
packages/cuda/old/pred.h
Executable file
@ -0,0 +1,47 @@
#ifndef _PRED_H_
#define _PRED_H_

// #define DEBUG_MEM 1

typedef struct Nodo{
	int name;
	int num_rows;
	int num_columns;
	int is_fact;
	int *address_host_table;
	int *negatives;
	char *predname;
	double *weight;
}gpunode;

typedef gpunode predicate;

//#define TIMER 1
#define DATALOG 1
#define NUM_T 4
#define INISIZE 1000000

#if TIMER
typedef struct Stats{
	size_t joins, selects, unions, builtins;
	size_t calls;
	double total_time;
	float max_time, min_time;
	float select1_time, select2_time, join_time, sort_time, union_time, pred_time;
}statinfo;

extern statinfo cuda_stats;
#endif

/*Constants used to mark comparison predicates*/
#define BPOFFSET (-6)
#define SBG_EQ (-1)
#define SBG_GT (-2)
#define SBG_LT (-3)
#define SBG_GE (-4)
#define SBG_LE (-5)
#define SBG_DF (-6)

int Cuda_Eval(predicate**, int, predicate**, int, int*, int**, char*, int);
void Cuda_Statistics( void );
#endif
306
packages/cuda/old/selectproyect.cu
Executable file
@ -0,0 +1,306 @@
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <stdlib.h>
#include "memory.h"
#include "bpreds.h"

/*Marks all rows that comply with the selections*/
__global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int x, rowact, posact;
	if(threadIdx.x < numc)
		shared[threadIdx.x] = cons[threadIdx.x];
	__syncthreads();
	if(id < rows)
	{
		rowact = id * cols;
		for(x = 0; x < numc; x += 2)
		{
			posact = rowact + shared[x];
			if(dop1[posact] != shared[x+1])
				return;
		}
		res[id] = 1;
	}
}

/*If we already have an array of marks (perhaps because the selfjoin was applied first),
we unmark any rows that do not comply with the selections*/
__global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int x, rowact, posact;
	if(threadIdx.x < numc)
		shared[threadIdx.x] = cons[threadIdx.x];
	__syncthreads();
	if(id < rows)
	{
		if(res[id] == 0)
			return;
		rowact = id * cols;
		for(x = 0; x < numc; x += 2)
		{
			posact = rowact + shared[x];
			if(dop1[posact] != shared[x+1])
			{
				res[id] = 0;
				return;
			}
		}
	}
}

/*Unmarks all rows that do not comply with the selfjoins.*/
__global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int temp, temp2, pos, x, y;
	if(threadIdx.x < cont)
		shared[threadIdx.x] = dhead[threadIdx.x];
	__syncthreads();
	if(id < rows)
	{
		if(res[id] == 0)
			return;
		pos = id * cols;
		for(x = 0; x < cont; x++)
		{
			temp = dop1[pos+shared[x]];
			y = x + 1;
			temp2 = shared[y];
			while(temp2 > -1)
			{
				if(temp != dop1[temp2+pos])
				{
					res[id] = 0;
					return;
				}
				y++;
				temp2 = shared[y];
			}
			x = y;
		}
	}
}

/*Marks all rows that comply with the selfjoins*/
__global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int temp, temp2, pos, x, y;
	if(threadIdx.x < cont)
		shared[threadIdx.x] = dhead[threadIdx.x];
	__syncthreads();
	if(id < rows)
	{
		pos = id * cols;
		for(x = 0; x < cont; x++)
		{
			temp = dop1[pos+shared[x]];
			y = x + 1;
			temp2 = shared[y];
			while(temp2 > -1)
			{
				if(temp != dop1[temp2+pos])
					return;
				y++;
				temp2 = shared[y];
			}
			x = y;
		}
		res[id] = 1;
	}
}

/*Projects all columns found in 'dhead' to a new array 'res'*/
__global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int pos, posr, x;
	if(threadIdx.x < hsize)
		shared[threadIdx.x] = dhead[threadIdx.x];
	__syncthreads();
	if(id < rows)
	{
		pos = id * cols;
		posr = id * hsize;
		for(x = 0; x < hsize; x++, posr++)
			res[posr] = dop1[pos+shared[x]];
	}
}

/*Projects all columns found in 'dhead' using only the rows marked as valid (i.e. those that complied with
selections, selfjoins, etc.). The array 'temp' holds the result of the prefix sum of said marks.*/
__global__ void llenarproyectar(int *dop1, int rows, int cols, int *temp, int *dhead, int hsize, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int pos, posr, x;
	if(threadIdx.x < hsize)
		shared[threadIdx.x] = dhead[threadIdx.x];
	__syncthreads();
	if(id < rows)
	{
		posr = temp[id];
		if(temp[id+1] != posr)
		{
			pos = id * cols;
			posr *= hsize;
			for(x = 0; x < hsize; x++, posr++)
				res[posr] = dop1[pos+shared[x]];
		}
	}
}

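/*Illustrative sketch, not part of the original sources: marcar2/samejoin2
  plus an inclusive scan plus llenarproyectar form a classic stream
  compaction pipeline: mark each surviving row with 1, prefix-sum the
  marks to get output slots, then scatter the projected rows. A minimal
  host-side version of the same idea using thrust only; the data here is
  hypothetical.*/
#include <thrust/device_vector.h>
#include <thrust/scan.h>

void ejemplo_compactar(void)
{
	/* marks[i] == 1 when row i passed all filters */
	int h_marks[6] = {1, 0, 1, 1, 0, 1};
	thrust::device_vector<int> marks(h_marks, h_marks + 6);
	thrust::device_vector<int> slots(7, 0);
	/* slots[i] holds the output position of row i; slots[6] is the survivor count */
	thrust::inclusive_scan(marks.begin(), marks.end(), slots.begin() + 1);
	/* a kernel like llenarproyectar would now write row i to slots[i] whenever slots[i+1] != slots[i] */
}
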
/*Performs selections, selfjoins and comparison predicates when the rule has a single normal predicate.*/
int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int numselect, int *selfjoin, int numselfj, int *preds, int numpreds, int *project, int **ret, int ANDlogic)
{
	int *fres = NULL, *temp = NULL;
	int *dhead = NULL, tmplen;
	int size, size2, num;
	thrust::device_ptr<int> res;

#if TIMER
	cuda_stats.selects++;
#endif

	int head_bytes = maximo(4, numselect, numselfj, numpreds, head_size) * sizeof(int);
	reservar(&dhead, head_bytes);
	int numthreads = 1024;
	//int numthreads = 32;
	int blockllen = rows / numthreads + 1;

#ifdef ROCKIT
	ANDlogic = 1;
#endif

	if(numselect > 0)
	{
		tmplen = rows + 1;
		size2 = tmplen * sizeof(int);
		reservar(&temp, size2);
		cudaMemset(temp, 0, size2);
		size = numselect * sizeof(int);
		cudaMemcpy(dhead, select, size, cudaMemcpyHostToDevice);

		marcar2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);

		if(numselfj > 0)
		{
			size = numselfj * sizeof(int);
			cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
			samejoin<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
		}

		if(numpreds > 0)
		{
			size = numpreds * sizeof(int);
			cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
			if(ANDlogic)
				bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
			else
				bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
		}

		res = thrust::device_pointer_cast(temp);
		thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
		num = res[rows];
		if(num == 0)
			return 0;

		size = head_size * sizeof(int);
		reservar(&fres, num * size);
		cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
		llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
		cudaFree(dhead);
		cudaFree(temp);
		*ret = fres;
		return num;
	}
	else
	{
		if(numselfj > 0)
		{
			tmplen = rows + 1;
			size2 = tmplen * sizeof(int);
			reservar(&temp, size2);
			cudaMemset(temp, 0, size2);
			size = numselfj * sizeof(int);
			cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
			samejoin2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);

			if(numpreds > 0)
			{
				size = numpreds * sizeof(int);
				cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
				if(ANDlogic)
					bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
				else
					bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
			}

			res = thrust::device_pointer_cast(temp);
			thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
			num = res[rows];
			if(num == 0)
				return 0;

			size = head_size * sizeof(int);
			reservar(&fres, num * size);
			cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
			llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
			cudaFree(dhead);
			cudaFree(temp);
			*ret = fres;
			return num;
		}
		else
		{
			if(numpreds > 0)
			{
				tmplen = rows + 1;
				size2 = tmplen * sizeof(int);
				reservar(&temp, size2);
				cudaMemset(temp, 0, size2);
				size = numpreds * sizeof(int);
				cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);

				if(ANDlogic)
					bpredsnormal2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
				else
					bpredsorlogic2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
				res = thrust::device_pointer_cast(temp);
				thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
				num = res[rows];

				if(num == 0)
					return 0;

				size = head_size * sizeof(int);
				reservar(&fres, num * size);
				cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
				llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
				cudaFree(dhead);
				cudaFree(temp);
				*ret = fres;
				return num;
			}
			else
			{
				size = head_size * sizeof(int);
				reservar(&fres, rows * size);
				cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
				proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
				cudaFree(dhead);
				*ret = fres;
				return rows;
			}
		}
	}
}
1279
packages/cuda/old/treeb.cu
Executable file
File diff suppressed because it is too large
763
packages/cuda/old/union2.cu
Executable file
@ -0,0 +1,763 @@
/*Computer-generated file to remove duplicates. Since Thrust's unique and sort, unlike their std counterparts, don't have a way to specify the size of each element in
the array, comparing pairs, triplets and other sets is not possible without defining a new pointer and all related operations for each set. If you have a better idea to do
this, please don't hesitate to email us.*/

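/*Illustrative sketch, not part of the original sources: a reconstruction
  of what the generated code below instantiates for width 2. The real
  s2/o2/p2/q2 definitions live in union2.h; the names and layout here are
  assumptions made for the sketch. A fixed-size struct gives
  thrust::sort/unique a typed element of the right width, and the
  functors supply ordering and equality over its columns.*/
typedef struct ejemplo_s2 {
	int v[2];
} ejemplo_s2;

struct ejemplo_o2 {	/* lexicographic ordering for sort */
	__host__ __device__ bool operator()(const ejemplo_s2 &a, const ejemplo_s2 &b) const {
		if(a.v[0] != b.v[0]) return a.v[0] < b.v[0];
		return a.v[1] < b.v[1];
	}
};

struct ejemplo_p2 {	/* full-row equality for unique */
	__host__ __device__ bool operator()(const ejemplo_s2 &a, const ejemplo_s2 &b) const {
		return a.v[0] == b.v[0] && a.v[1] == b.v[1];
	}
};
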
#include <thrust/device_vector.h>
#include <thrust/unique.h>
#include <thrust/distance.h>
#include <thrust/sort.h>
#include <iostream>
#include "memory.h"
#include "union2.h"

int unir(int *res, int rows, int tipo, int **ret, int final)
{
	thrust::device_ptr<int> pt, re;
	thrust::device_ptr<s2> pt2, re2;
	thrust::device_ptr<s3> pt3, re3;
	thrust::device_ptr<s4> pt4, re4;
	thrust::device_ptr<s5> pt5, re5;
	thrust::device_ptr<s6> pt6, re6;
	thrust::device_ptr<s7> pt7, re7;
	thrust::device_ptr<s8> pt8, re8;
	thrust::device_ptr<s9> pt9, re9;
	thrust::device_ptr<s10> pt10, re10;
	thrust::device_ptr<s11> pt11, re11;
	thrust::device_ptr<s12> pt12, re12;
	thrust::device_ptr<s13> pt13, re13;
	thrust::device_ptr<s14> pt14, re14;
	thrust::device_ptr<s15> pt15, re15;
	thrust::device_ptr<s16> pt16, re16;
	thrust::device_ptr<s17> pt17, re17;
	thrust::device_ptr<s18> pt18, re18;
	thrust::device_ptr<s19> pt19, re19;
	thrust::device_ptr<s20> pt20, re20;
	s2 *t2;
	s3 *t3;
	s4 *t4;
	s5 *t5;
	s6 *t6;
	s7 *t7;
	s8 *t8;
	s9 *t9;
	s10 *t10;
	s11 *t11;
	s12 *t12;
	s13 *t13;
	s14 *t14;
	s15 *t15;
	s16 *t16;
	s17 *t17;
	s18 *t18;
	s19 *t19;
	s20 *t20;
	int flag, nrows, *nres, size;

#if TIMER
	cuda_stats.unions++;
#endif

	switch(tipo)
	{
		case 1:
		{
			pt = thrust::device_pointer_cast(res);
			flag = 0;
			while(flag != 1)
			{
				try
				{
					thrust::sort(pt, pt + rows);
					if(final)
					{
						re = thrust::unique(pt, pt + rows, q1());
						re = thrust::unique(pt, re);
					}
					else
						re = thrust::unique(pt, pt + rows);
					flag = 1;
				}
				catch(std::bad_alloc &e)
				{
					limpiar("sort/unique in unir", 0);
				}
			}
			nrows = thrust::distance(pt, re);
			if(nrows < rows / 2)
			{
				size = nrows * tipo * sizeof(int);
				reservar(&nres, size);
				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
				cudaFree(*ret);
				*ret = nres;
			}
			return nrows;
		}
		case 2:
		{
			t2 = (s2*)res;
			pt2 = thrust::device_pointer_cast(t2);
			flag = 0;
			while(flag != 1)
			{
				try
				{
					thrust::sort(pt2, pt2 + rows, o2());
					if(final)
					{
						re2 = thrust::unique(pt2, pt2 + rows, q2());
						re2 = thrust::unique(pt2, re2, p2());
					}
					else
						re2 = thrust::unique(pt2, pt2 + rows, p2());
					flag = 1;
				}
				catch(std::bad_alloc &e)
				{
					limpiar("sort/unique in unir", 0);
				}
			}
			nrows = thrust::distance(pt2, re2);
			if(nrows < rows / 2)
			{
				size = nrows * tipo * sizeof(int);
				reservar(&nres, size);
				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
				cudaFree(*ret);
				*ret = nres;
			}
			return nrows;
		}
		case 3:
		{
			t3 = (s3*)res;
			pt3 = thrust::device_pointer_cast(t3);
			flag = 0;
			while(flag != 1)
			{
				try
				{
					thrust::sort(pt3, pt3 + rows, o3());
					if(final)
					{
						re3 = thrust::unique(pt3, pt3 + rows, q3());
						re3 = thrust::unique(pt3, re3, p3());
					}
					else
						re3 = thrust::unique(pt3, pt3 + rows, p3());
					flag = 1;
				}
				catch(std::bad_alloc &e)
				{
					limpiar("sort/unique in unir", 0);
				}
			}
			nrows = thrust::distance(pt3, re3);
			if(nrows < rows / 2)
			{
				size = nrows * tipo * sizeof(int);
				reservar(&nres, size);
				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
				cudaFree(*ret);
				*ret = nres;
			}
			return nrows;
		}
		case 4:
		{
			t4 = (s4*)res;
			pt4 = thrust::device_pointer_cast(t4);
			flag = 0;
			while(flag != 1)
			{
				try
				{
					thrust::sort(pt4, pt4 + rows, o4());
					if(final)
					{
						re4 = thrust::unique(pt4, pt4 + rows, q4());
						re4 = thrust::unique(pt4, re4, p4());
					}
					else
						re4 = thrust::unique(pt4, pt4 + rows, p4());
					flag = 1;
				}
				catch(std::bad_alloc &e)
				{
					limpiar("sort/unique in unir", 0);
				}
			}
			nrows = thrust::distance(pt4, re4);
			if(nrows < rows / 2)
			{
				size = nrows * tipo * sizeof(int);
				reservar(&nres, size);
				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
				cudaFree(*ret);
				*ret = nres;
			}
			return nrows;
		}
		case 5:
		{
			t5 = (s5*)res;
			pt5 = thrust::device_pointer_cast(t5);
			flag = 0;
			while(flag != 1)
			{
				try
				{
					thrust::sort(pt5, pt5 + rows, o5());
					if(final)
					{
						re5 = thrust::unique(pt5, pt5 + rows, q5());
						re5 = thrust::unique(pt5, re5, p5());
					}
					else
						re5 = thrust::unique(pt5, pt5 + rows, p5());
					flag = 1;
				}
				catch(std::bad_alloc &e)
				{
					limpiar("sort/unique in unir", 0);
				}
			}
			nrows = thrust::distance(pt5, re5);
			if(nrows < rows / 2)
			{
				size = nrows * tipo * sizeof(int);
				reservar(&nres, size);
				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
				cudaFree(*ret);
				*ret = nres;
			}
			return nrows;
		}
		case 6:
		{
			t6 = (s6*)res;
			pt6 = thrust::device_pointer_cast(t6);
			flag = 0;
			while(flag != 1)
			{
				try
				{
					thrust::sort(pt6, pt6 + rows, o6());
					if(final)
					{
						re6 = thrust::unique(pt6, pt6 + rows, q6());
						re6 = thrust::unique(pt6, re6, p6());
					}
					else
						re6 = thrust::unique(pt6, pt6 + rows, p6());
					flag = 1;
				}
				catch(std::bad_alloc &e)
				{
					limpiar("sort/unique in unir", 0);
				}
			}
			nrows = thrust::distance(pt6, re6);
			if(nrows < rows / 2)
			{
				size = nrows * tipo * sizeof(int);
				reservar(&nres, size);
				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
				cudaFree(*ret);
				*ret = nres;
			}
			return nrows;
		}
		case 7:
		{
			t7 = (s7*)res;
			pt7 = thrust::device_pointer_cast(t7);
			flag = 0;
			while(flag != 1)
			{
				try
				{
					thrust::sort(pt7, pt7 + rows, o7());
					if(final)
					{
						re7 = thrust::unique(pt7, pt7 + rows, q7());
						re7 = thrust::unique(pt7, re7, p7());
					}
					else
						re7 = thrust::unique(pt7, pt7 + rows, p7());
					flag = 1;
				}
				catch(std::bad_alloc &e)
				{
					limpiar("sort/unique in unir", 0);
				}
			}
			nrows = thrust::distance(pt7, re7);
			if(nrows < rows / 2)
			{
				size = nrows * tipo * sizeof(int);
				reservar(&nres, size);
				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
				cudaFree(*ret);
				*ret = nres;
			}
			return nrows;
		}
		case 8:
		{
			t8 = (s8*)res;
			pt8 = thrust::device_pointer_cast(t8);
			flag = 0;
			while(flag != 1)
			{
				try
				{
					thrust::sort(pt8, pt8 + rows, o8());
					if(final)
					{
						re8 = thrust::unique(pt8, pt8 + rows, q8());
						re8 = thrust::unique(pt8, re8, p8());
					}
					else
						re8 = thrust::unique(pt8, pt8 + rows, p8());
					flag = 1;
				}
				catch(std::bad_alloc &e)
				{
					limpiar("sort/unique in unir", 0);
				}
			}
			nrows = thrust::distance(pt8, re8);
			if(nrows < rows / 2)
			{
				size = nrows * tipo * sizeof(int);
				reservar(&nres, size);
				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
				cudaFree(*ret);
				*ret = nres;
			}
			return nrows;
		}
		case 9:
		{
			t9 = (s9*)res;
			pt9 = thrust::device_pointer_cast(t9);
			flag = 0;
			while(flag != 1)
			{
				try
				{
					thrust::sort(pt9, pt9 + rows, o9());
					if(final)
					{
						re9 = thrust::unique(pt9, pt9 + rows, q9());
						re9 = thrust::unique(pt9, re9, p9());
					}
					else
						re9 = thrust::unique(pt9, pt9 + rows, p9());
					flag = 1;
				}
				catch(std::bad_alloc &e)
				{
					limpiar("sort/unique in unir", 0);
				}
			}
			nrows = thrust::distance(pt9, re9);
			if(nrows < rows / 2)
			{
				size = nrows * tipo * sizeof(int);
				reservar(&nres, size);
				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
				cudaFree(*ret);
				*ret = nres;
			}
			return nrows;
		}
case 10:
|
||||||
|
{
|
||||||
|
t10 = (s10*)res;
|
||||||
|
pt10 = thrust::device_pointer_cast(t10);
|
||||||
|
flag = 0;
|
||||||
|
while(flag != 1)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
thrust::sort(pt10, pt10 + rows, o10());
|
||||||
|
if(final)
|
||||||
|
{
|
||||||
|
re10 = thrust::unique(pt10, pt10 + rows, q10());
|
||||||
|
re10 = thrust::unique(pt10, re10, p10());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
re10 = thrust::unique(pt10, pt10 + rows, p10());
|
||||||
|
flag = 1;
|
||||||
|
}
|
||||||
|
catch(std::bad_alloc &e)
|
||||||
|
{
|
||||||
|
limpiar("sort/unique in unir", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nrows = thrust::distance(pt10, re10);
|
||||||
|
if(nrows < rows / 2)
|
||||||
|
{
|
||||||
|
size = nrows * tipo * sizeof(int);
|
||||||
|
reservar(&nres, size);
|
||||||
|
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaFree(*ret);
|
||||||
|
*ret = nres;
|
||||||
|
}
|
||||||
|
return nrows;
|
||||||
|
}
|
||||||
|
case 11:
|
||||||
|
{
|
||||||
|
t11 = (s11*)res;
|
||||||
|
pt11 = thrust::device_pointer_cast(t11);
|
||||||
|
flag = 0;
|
||||||
|
while(flag != 1)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
thrust::sort(pt11, pt11 + rows, o11());
|
||||||
|
if(final)
|
||||||
|
{
|
||||||
|
re11 = thrust::unique(pt11, pt11 + rows, q11());
|
||||||
|
re11 = thrust::unique(pt11, re11, p11());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
re11 = thrust::unique(pt11, pt11 + rows, p11());
|
||||||
|
flag = 1;
|
||||||
|
}
|
||||||
|
catch(std::bad_alloc &e)
|
||||||
|
{
|
||||||
|
limpiar("sort/unique in unir", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nrows = thrust::distance(pt11, re11);
|
||||||
|
if(nrows < rows / 2)
|
||||||
|
{
|
||||||
|
size = nrows * tipo * sizeof(int);
|
||||||
|
reservar(&nres, size);
|
||||||
|
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaFree(*ret);
|
||||||
|
*ret = nres;
|
||||||
|
}
|
||||||
|
return nrows;
|
||||||
|
}
|
||||||
|
case 12:
|
||||||
|
{
|
||||||
|
t12 = (s12*)res;
|
||||||
|
pt12 = thrust::device_pointer_cast(t12);
|
||||||
|
flag = 0;
|
||||||
|
while(flag != 1)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
thrust::sort(pt12, pt12 + rows, o12());
|
||||||
|
if(final)
|
||||||
|
{
|
||||||
|
re12 = thrust::unique(pt12, pt12 + rows, q12());
|
||||||
|
re12 = thrust::unique(pt12, re12, p12());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
re12 = thrust::unique(pt12, pt12 + rows, p12());
|
||||||
|
flag = 1;
|
||||||
|
}
|
||||||
|
catch(std::bad_alloc &e)
|
||||||
|
{
|
||||||
|
limpiar("sort/unique in unir", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nrows = thrust::distance(pt12, re12);
|
||||||
|
if(nrows < rows / 2)
|
||||||
|
{
|
||||||
|
size = nrows * tipo * sizeof(int);
|
||||||
|
reservar(&nres, size);
|
||||||
|
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaFree(*ret);
|
||||||
|
*ret = nres;
|
||||||
|
}
|
||||||
|
return nrows;
|
||||||
|
}
|
||||||
|
case 13:
|
||||||
|
{
|
||||||
|
t13 = (s13*)res;
|
||||||
|
pt13 = thrust::device_pointer_cast(t13);
|
||||||
|
flag = 0;
|
||||||
|
while(flag != 1)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
thrust::sort(pt13, pt13 + rows, o13());
|
||||||
|
if(final)
|
||||||
|
{
|
||||||
|
re13 = thrust::unique(pt13, pt13 + rows, q13());
|
||||||
|
re13 = thrust::unique(pt13, re13, p13());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
re13 = thrust::unique(pt13, pt13 + rows, p13());
|
||||||
|
flag = 1;
|
||||||
|
}
|
||||||
|
catch(std::bad_alloc &e)
|
||||||
|
{
|
||||||
|
limpiar("sort/unique in unir", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nrows = thrust::distance(pt13, re13);
|
||||||
|
if(nrows < rows / 2)
|
||||||
|
{
|
||||||
|
size = nrows * tipo * sizeof(int);
|
||||||
|
reservar(&nres, size);
|
||||||
|
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaFree(*ret);
|
||||||
|
*ret = nres;
|
||||||
|
}
|
||||||
|
return nrows;
|
||||||
|
}
|
||||||
|
case 14:
|
||||||
|
{
|
||||||
|
t14 = (s14*)res;
|
||||||
|
pt14 = thrust::device_pointer_cast(t14);
|
||||||
|
flag = 0;
|
||||||
|
while(flag != 1)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
thrust::sort(pt14, pt14 + rows, o14());
|
||||||
|
if(final)
|
||||||
|
{
|
||||||
|
re14 = thrust::unique(pt14, pt14 + rows, q14());
|
||||||
|
re14 = thrust::unique(pt14, re14, p14());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
re14 = thrust::unique(pt14, pt14 + rows, p14());
|
||||||
|
flag = 1;
|
||||||
|
}
|
||||||
|
catch(std::bad_alloc &e)
|
||||||
|
{
|
||||||
|
limpiar("sort/unique in unir", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nrows = thrust::distance(pt14, re14);
|
||||||
|
if(nrows < rows / 2)
|
||||||
|
{
|
||||||
|
size = nrows * tipo * sizeof(int);
|
||||||
|
reservar(&nres, size);
|
||||||
|
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaFree(*ret);
|
||||||
|
*ret = nres;
|
||||||
|
}
|
||||||
|
return nrows;
|
||||||
|
}
|
||||||
|
case 15:
|
||||||
|
{
|
||||||
|
t15 = (s15*)res;
|
||||||
|
pt15 = thrust::device_pointer_cast(t15);
|
||||||
|
flag = 0;
|
||||||
|
while(flag != 1)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
thrust::sort(pt15, pt15 + rows, o15());
|
||||||
|
if(final)
|
||||||
|
{
|
||||||
|
re15 = thrust::unique(pt15, pt15 + rows, q15());
|
||||||
|
re15 = thrust::unique(pt15, re15, p15());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
re15 = thrust::unique(pt15, pt15 + rows, p15());
|
||||||
|
flag = 1;
|
||||||
|
}
|
||||||
|
catch(std::bad_alloc &e)
|
||||||
|
{
|
||||||
|
limpiar("sort/unique in unir", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nrows = thrust::distance(pt15, re15);
|
||||||
|
if(nrows < rows / 2)
|
||||||
|
{
|
||||||
|
size = nrows * tipo * sizeof(int);
|
||||||
|
reservar(&nres, size);
|
||||||
|
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaFree(*ret);
|
||||||
|
*ret = nres;
|
||||||
|
}
|
||||||
|
return nrows;
|
||||||
|
}
|
||||||
|
case 16:
|
||||||
|
{
|
||||||
|
t16 = (s16*)res;
|
||||||
|
pt16 = thrust::device_pointer_cast(t16);
|
||||||
|
flag = 0;
|
||||||
|
while(flag != 1)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
thrust::sort(pt16, pt16 + rows, o16());
|
||||||
|
if(final)
|
||||||
|
{
|
||||||
|
re16 = thrust::unique(pt16, pt16 + rows, q16());
|
||||||
|
re16 = thrust::unique(pt16, re16, p16());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
re16 = thrust::unique(pt16, pt16 + rows, p16());
|
||||||
|
flag = 1;
|
||||||
|
}
|
||||||
|
catch(std::bad_alloc &e)
|
||||||
|
{
|
||||||
|
limpiar("sort/unique in unir", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nrows = thrust::distance(pt16, re16);
|
||||||
|
if(nrows < rows / 2)
|
||||||
|
{
|
||||||
|
size = nrows * tipo * sizeof(int);
|
||||||
|
reservar(&nres, size);
|
||||||
|
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaFree(*ret);
|
||||||
|
*ret = nres;
|
||||||
|
}
|
||||||
|
return nrows;
|
||||||
|
}
|
||||||
|
case 17:
|
||||||
|
{
|
||||||
|
t17 = (s17*)res;
|
||||||
|
pt17 = thrust::device_pointer_cast(t17);
|
||||||
|
flag = 0;
|
||||||
|
while(flag != 1)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
thrust::sort(pt17, pt17 + rows, o17());
|
||||||
|
if(final)
|
||||||
|
{
|
||||||
|
re17 = thrust::unique(pt17, pt17 + rows, q17());
|
||||||
|
re17 = thrust::unique(pt17, re17, p17());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
re17 = thrust::unique(pt17, pt17 + rows, p17());
|
||||||
|
flag = 1;
|
||||||
|
}
|
||||||
|
catch(std::bad_alloc &e)
|
||||||
|
{
|
||||||
|
limpiar("sort/unique in unir", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nrows = thrust::distance(pt17, re17);
|
||||||
|
if(nrows < rows / 2)
|
||||||
|
{
|
||||||
|
size = nrows * tipo * sizeof(int);
|
||||||
|
reservar(&nres, size);
|
||||||
|
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaFree(*ret);
|
||||||
|
*ret = nres;
|
||||||
|
}
|
||||||
|
return nrows;
|
||||||
|
}
|
||||||
|
case 18:
|
||||||
|
{
|
||||||
|
t18 = (s18*)res;
|
||||||
|
pt18 = thrust::device_pointer_cast(t18);
|
||||||
|
flag = 0;
|
||||||
|
while(flag != 1)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
thrust::sort(pt18, pt18 + rows, o18());
|
||||||
|
if(final)
|
||||||
|
{
|
||||||
|
re18 = thrust::unique(pt18, pt18 + rows, q18());
|
||||||
|
re18 = thrust::unique(pt18, re18, p18());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
re18 = thrust::unique(pt18, pt18 + rows, p18());
|
||||||
|
flag = 1;
|
||||||
|
}
|
||||||
|
catch(std::bad_alloc &e)
|
||||||
|
{
|
||||||
|
limpiar("sort/unique in unir", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nrows = thrust::distance(pt18, re18);
|
||||||
|
if(nrows < rows / 2)
|
||||||
|
{
|
||||||
|
size = nrows * tipo * sizeof(int);
|
||||||
|
reservar(&nres, size);
|
||||||
|
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaFree(*ret);
|
||||||
|
*ret = nres;
|
||||||
|
}
|
||||||
|
return nrows;
|
||||||
|
}
|
||||||
|
case 19:
|
||||||
|
{
|
||||||
|
t19 = (s19*)res;
|
||||||
|
pt19 = thrust::device_pointer_cast(t19);
|
||||||
|
flag = 0;
|
||||||
|
while(flag != 1)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
thrust::sort(pt19, pt19 + rows, o19());
|
||||||
|
if(final)
|
||||||
|
{
|
||||||
|
re19 = thrust::unique(pt19, pt19 + rows, q19());
|
||||||
|
re19 = thrust::unique(pt19, re19, p19());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
re19 = thrust::unique(pt19, pt19 + rows, p19());
|
||||||
|
flag = 1;
|
||||||
|
}
|
||||||
|
catch(std::bad_alloc &e)
|
||||||
|
{
|
||||||
|
limpiar("sort/unique in unir", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nrows = thrust::distance(pt19, re19);
|
||||||
|
if(nrows < rows / 2)
|
||||||
|
{
|
||||||
|
size = nrows * tipo * sizeof(int);
|
||||||
|
reservar(&nres, size);
|
||||||
|
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaFree(*ret);
|
||||||
|
*ret = nres;
|
||||||
|
}
|
||||||
|
return nrows;
|
||||||
|
}
|
||||||
|
case 20:
|
||||||
|
{
|
||||||
|
t20 = (s20*)res;
|
||||||
|
pt20 = thrust::device_pointer_cast(t20);
|
||||||
|
flag = 0;
|
||||||
|
while(flag != 1)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
thrust::sort(pt20, pt20 + rows, o20());
|
||||||
|
if(final)
|
||||||
|
{
|
||||||
|
re20 = thrust::unique(pt20, pt20 + rows, q20());
|
||||||
|
re20 = thrust::unique(pt20, re20, p20());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
re20 = thrust::unique(pt20, pt20 + rows, p20());
|
||||||
|
flag = 1;
|
||||||
|
}
|
||||||
|
catch(std::bad_alloc &e)
|
||||||
|
{
|
||||||
|
limpiar("sort/unique in unir", 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nrows = thrust::distance(pt20, re20);
|
||||||
|
if(nrows < rows / 2)
|
||||||
|
{
|
||||||
|
size = nrows * tipo * sizeof(int);
|
||||||
|
reservar(&nres, size);
|
||||||
|
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaFree(*ret);
|
||||||
|
*ret = nres;
|
||||||
|
}
|
||||||
|
return nrows;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
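Cases 2 through 20 above are mechanically identical: only the tuple struct (s2 ... s20) and its comparison functors (oN for sorting, qN and pN for deduplication) change with the arity. Each case sorts the relation, removes duplicates with thrust::unique (a second pass with a different equality functor when 'final' is set), retries after limpiar() frees memory on std::bad_alloc, and finally compacts the result buffer when at least half the rows were removed. The listing keeps the cuda* memory calls in this file; as a hedged sketch (not part of this commit), the compaction step under the hip* naming used in the rest of the patch would read:

    /* Sketch only: assumes union2.h is later hipified like the other files in
       this commit. 'reservar' is the project's device-allocation wrapper and
       'tipo' is the tuple width in ints. */
    nrows = thrust::distance(pt3, re3);
    if(nrows < rows / 2)    /* shrink only when at least half the rows went away */
    {
        size = nrows * tipo * sizeof(int);
        reservar(&nres, size);
        hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);    /* copy survivors into the smaller buffer */
        hipFree(*ret);
        *ret = nres;
    }
    return nrows;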
1005    packages/cuda/old/union2.h    Executable file
(File diff suppressed because it is too large.)
0    packages/cuda/pred.h    Executable file → Normal file
103    packages/cuda/selectproyect.cu    Executable file → Normal file
@@ -1,3 +1,4 @@
+#include "hip/hip_runtime.h"
 #include <thrust/device_vector.h>
 #include <thrust/scan.h>
 #include <stdlib.h>
@@ -8,10 +9,10 @@
 __global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int x, rowact, posact;
-    if(threadIdx.x < numc)
-        shared[threadIdx.x] = cons[threadIdx.x];
+    if(hipThreadIdx_x < numc)
+        shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
@@ -30,10 +31,10 @@ we unmark any rows that do not comply with the selections*/
 __global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int x, rowact, posact;
-    if(threadIdx.x < numc)
-        shared[threadIdx.x] = cons[threadIdx.x];
+    if(hipThreadIdx_x < numc)
+        shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
@@ -56,10 +57,10 @@ __global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *
 __global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int temp, temp2, pos, x, y;
-    if(threadIdx.x < cont)
-        shared[threadIdx.x] = dhead[threadIdx.x];
+    if(hipThreadIdx_x < cont)
+        shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
@@ -90,10 +91,10 @@ __global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, in
 __global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int temp, temp2, pos, x, y;
-    if(threadIdx.x < cont)
-        shared[threadIdx.x] = dhead[threadIdx.x];
+    if(hipThreadIdx_x < cont)
+        shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
@@ -120,10 +121,10 @@ __global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, i
 __global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize, int *res)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int pos, posr, x;
-    if(threadIdx.x < hsize)
-        shared[threadIdx.x] = dhead[threadIdx.x];
+    if(hipThreadIdx_x < hsize)
+        shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
@@ -139,10 +140,10 @@ selections, selfjoins, etc.). The array 'temp' holds the result of the prefix su
 __global__ void llenarproyectar(int *dop1, int rows, int cols, int *temp, int *dhead, int hsize, int *res)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int pos, posr, x;
-    if(threadIdx.x < hsize)
-        shared[threadIdx.x] = dhead[threadIdx.x];
+    if(hipThreadIdx_x < hsize)
+        shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
     __syncthreads();
     if(id < rows)
     {
@@ -184,27 +185,27 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
         tmplen = rows + 1;
         size2 = tmplen * sizeof(int);
         reservar(&temp, size2);
-        cudaMemset(temp, 0, size2);
+        hipMemset(temp, 0, size2);
         size = numselect * sizeof(int);
-        cudaMemcpy(dhead, select, size, cudaMemcpyHostToDevice);
+        hipMemcpy(dhead, select, size, hipMemcpyHostToDevice);
 
-        marcar2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);
+        hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselect, temp + 1);
 
         if(numselfj > 0)
         {
             size = numselfj * sizeof(int);
-            cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
-            samejoin<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
+            hipMemcpy(dhead, selfjoin, size, hipMemcpyHostToDevice);
+            hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselfj, temp + 1);
         }
 
         if(numpreds > 0)
         {
             size = numpreds * sizeof(int);
-            cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
+            hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);
             if(ANDlogic)
-                bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
+                hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
             else
-                bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
+                hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
         }
 
         res = thrust::device_pointer_cast(temp);
@@ -215,10 +216,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 
         size = head_size * sizeof(int);
         reservar(&fres, num * size);
-        cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
-        llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
-        cudaFree(dhead);
-        cudaFree(temp);
+        hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
+        hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
+        hipFree(dhead);
+        hipFree(temp);
         *ret = fres;
         return num;
     }
@@ -229,19 +230,19 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
         tmplen = rows + 1;
         size2 = tmplen * sizeof(int);
         reservar(&temp, size2);
-        cudaMemset(temp, 0, size2);
+        hipMemset(temp, 0, size2);
         size = numselfj * sizeof(int);
-        cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
-        samejoin2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
+        hipMemcpy(dhead, selfjoin, size, hipMemcpyHostToDevice);
+        hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselfj, temp + 1);
 
         if(numpreds > 0)
         {
             size = numpreds * sizeof(int);
-            cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
+            hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);
             if(ANDlogic)
-                bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
+                hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
             else
-                bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
+                hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
 
         }
 
@@ -253,10 +254,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 
         size = head_size * sizeof(int);
         reservar(&fres, num * size);
-        cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
-        llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
-        cudaFree(dhead);
-        cudaFree(temp);
+        hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
+        hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
+        hipFree(dhead);
+        hipFree(temp);
         *ret = fres;
         return num;
     }
@@ -267,14 +268,14 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
         tmplen = rows + 1;
         size2 = tmplen * sizeof(int);
         reservar(&temp, size2);
-        cudaMemset(temp, 0, size2);
+        hipMemset(temp, 0, size2);
         size = numpreds * sizeof(int);
-        cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
+        hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);
 
         if(ANDlogic)
-            bpredsnormal2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
+            hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
         else
-            bpredsorlogic2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
+            hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
         res = thrust::device_pointer_cast(temp);
         thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
         num = res[rows];
@@ -284,10 +285,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 
         size = head_size * sizeof(int);
         reservar(&fres, num * size);
-        cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
-        llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
-        cudaFree(dhead);
-        cudaFree(temp);
+        hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
+        hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
+        hipFree(dhead);
+        hipFree(temp);
         *ret = fres;
         return num;
     }
@@ -295,9 +296,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
     {
         size = head_size * sizeof(int);
         reservar(&fres, rows * size);
-        cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
-        proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
-        cudaFree(dhead);
+        hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
+        hipLaunchKernel(HIP_KERNEL_NAME(proyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, head_size, fres);
+        hipFree(dhead);
         *ret = fres;
         return rows;
     }
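Every kernel launch in this file follows the same mechanical rewrite: the CUDA triple-chevron syntax becomes a call to the hipLaunchKernel macro, with the grid and block sizes wrapped in dim3, then the dynamic shared-memory size and the stream (0 here), and finally the original argument list. A minimal sketch of the pattern with a toy kernel (illustrative names, not from this file; note that later HIP releases renamed this macro hipLaunchKernelGGL):

    #include "hip/hip_runtime.h"

    __global__ void scale(int *v, int n, int k)
    {
        int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
        if(i < n)
            v[i] *= k;    /* multiply each element in place */
    }

    /* CUDA: scale<<<blocks, threads>>>(d_v, n, 2);
       HIP:  grid/block become explicit dim3, then shared-mem bytes and stream: */
    hipLaunchKernel(HIP_KERNEL_NAME(scale), dim3(blocks), dim3(threads), 0, 0, d_v, n, 2);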
0    packages/cuda/selectproyectcpu.cpp    Executable file → Normal file
347    packages/cuda/treeb.cu    Executable file → Normal file
@@ -1,3 +1,4 @@
+#include "hip/hip_runtime.h"
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
@@ -160,11 +161,11 @@ __device__ int firstMatchingKeyInDataNode2(Record records[], IKeyType key)
 
 __global__ void gCreateIndex(IDataNode data[], IDirectoryNode dir[], int dirSize, int tree_size, int bottom_start, int nNodesPerBlock)
 {
-    int startIdx = blockIdx.x * nNodesPerBlock;
+    int startIdx = hipBlockIdx_x * nNodesPerBlock;
     int endIdx = startIdx + nNodesPerBlock;
     if(endIdx > dirSize)
         endIdx = dirSize;
-    int keyIdx = threadIdx.x;
+    int keyIdx = hipThreadIdx_x;
 
     // Proceed only when in internal nodes
     for(int nodeIdx = startIdx; nodeIdx < endIdx; nodeIdx++)
@@ -191,11 +192,11 @@ __global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir
 {
     // Bringing the root node (visited by every tuple) to the faster shared memory
     __shared__ IKeyType RootNodeKeys[TREE_NODE_SIZE];
-    RootNodeKeys[threadIdx.x] = dir->keys[threadIdx.x];
+    RootNodeKeys[hipThreadIdx_x] = dir->keys[hipThreadIdx_x];
 
     __syncthreads();
 
-    int OverallThreadIdx = blockIdx.x * THRD_PER_BLCK_search + threadIdx.x;
+    int OverallThreadIdx = hipBlockIdx_x * THRD_PER_BLCK_search + hipThreadIdx_x;
 
     for(int keyIdx = OverallThreadIdx; keyIdx < nSearchKeys; keyIdx += THRD_PER_GRID_search)
     {
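gSearchTree above combines two idioms worth noting: the root node's keys are staged once per block into __shared__ memory (every search visits the root, so the global-memory read is amortized across the block), and each thread then walks the search keys with a grid-wide stride of THRD_PER_GRID_search, so a fixed-size grid covers any number of keys. A minimal, self-contained sketch of the grid-stride part under the same HIP builtins (hypothetical kernel, not from treeb.cu):

    #include "hip/hip_runtime.h"

    /* Each thread handles keys i, i + stride, i + 2*stride, ... */
    __global__ void visit_keys(const int *keys, int nkeys, int *out)
    {
        int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
        int stride = hipGridDim_x * hipBlockDim_x;
        for(; i < nkeys; i += stride)
            out[i] = keys[i];    /* stand-in for the actual tree descent */
    }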
@@ -219,7 +220,7 @@ __global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir
 /*Counts the number of times a row in 'S' is to be joined to a row in 'R'.*/
 __global__ void gIndexJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[])
 {
-    int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+    int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
     if(s_cur < sLen)
     {
@@ -246,11 +247,11 @@ in 'g_locations' those rows that have equal values in the checked columns.*/
 __global__ void gIndexMultiJoinNegative(int *R, int *S, int g_locations[], int rLen, int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
 {
     extern __shared__ int shared[];
-    int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
+    int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int posr, poss, x;
 
-    if(threadIdx.x < wj)
-        shared[threadIdx.x] = muljoin[threadIdx.x];
+    if(hipThreadIdx_x < wj)
+        shared[hipThreadIdx_x] = muljoin[hipThreadIdx_x];
     __syncthreads();
 
     if(r_cur < rLen)
@@ -287,11 +288,11 @@ times a row in 'S' is to be joined to its corresponding row in 'R', storing the
 __global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[], int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
 {
     extern __shared__ int shared[];
-    int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+    int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int posr, poss, x;
 
-    if(threadIdx.x < wj)
-        shared[threadIdx.x] = muljoin[threadIdx.x];
+    if(hipThreadIdx_x < wj)
+        shared[hipThreadIdx_x] = muljoin[hipThreadIdx_x];
     __syncthreads();
 
     if(s_cur < sLen)
@@ -330,10 +331,10 @@ __global__ void multiJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums
 {
     extern __shared__ int shared[];
     int *extjoins = &shared[lenrul];
-    int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+    int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
-    if(threadIdx.x < (lenrul + wj))
-        shared[threadIdx.x] = rule[threadIdx.x];
+    if(hipThreadIdx_x < (lenrul + wj))
+        shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
     __syncthreads();
 
     if(s_cur < sLen)
@@ -382,10 +383,10 @@ __global__ void multiJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSum
 {
     extern __shared__ int shared[];
     int *extjoins = &shared[cols];
-    int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+    int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
-    if(threadIdx.x < (cols + wj))
-        shared[threadIdx.x] = rule[threadIdx.x];
+    if(hipThreadIdx_x < (cols + wj))
+        shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
     __syncthreads();
 
     if(s_cur < sLen)
@@ -432,11 +433,11 @@ predicate are projected.*/
 __global__ void gJoinWithWriteNegative(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int halfrul, int *mloc)
 {
     extern __shared__ int shared[];
-    int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
+    int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int posr;
 
-    if(threadIdx.x < halfrul)
-        shared[threadIdx.x] = rule[threadIdx.x];
+    if(hipThreadIdx_x < halfrul)
+        shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
     __syncthreads();
 
     if(r_cur < rLen)
@@ -461,11 +462,11 @@ predicate are projected.*/
 __global__ void gJoinWithWriteNegative2(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int cols, int *mloc)
 {
     extern __shared__ int shared[];
-    int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
+    int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int posr;
 
-    if(threadIdx.x < cols)
-        shared[threadIdx.x] = rule[threadIdx.x];
+    if(hipThreadIdx_x < cols)
+        shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
     __syncthreads();
 
     if(r_cur < rLen)
@@ -489,10 +490,10 @@ __global__ void gJoinWithWriteNegative2(int g_locations[], int rLen, int g_joinR
 __global__ void gJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc)
 {
     extern __shared__ int shared[];
-    int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+    int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
-    if(threadIdx.x < lenrul)
-        shared[threadIdx.x] = rule[threadIdx.x];
+    if(hipThreadIdx_x < lenrul)
+        shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
     __syncthreads();
 
     if(s_cur < sLen)
@@ -525,10 +526,10 @@ projection, which is performed based on the variables in the head of the rule.*/
 __global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc)
 {
     extern __shared__ int shared[];
-    int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+    int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
-    if(threadIdx.x < cols)
-        shared[threadIdx.x] = rule[threadIdx.x];
+    if(hipThreadIdx_x < cols)
+        shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
     __syncthreads();
 
     if(s_cur < sLen)
@@ -563,7 +564,7 @@ __global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[],
 /*Load part of column 'wj' of 'p' in 'R'. Which values are loaded is defined by the prefix sum results in 'pos'.*/
 __global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *ids)
 {
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int cond;
     if(id < len)
     {
@@ -579,7 +580,7 @@ __global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *i
 /*Load an entire column from 'p' into 'R'.*/
 __global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
 {
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     if(id < len)
         R[id] = p[id * of + wj];
 }
@@ -587,10 +588,10 @@ __global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
 __global__ void projectfinal(int *res, int rows, int cols, int *rule, int *out)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
-    if(threadIdx.x < cols)
-        shared[threadIdx.x] = rule[threadIdx.x];
+    if(hipThreadIdx_x < cols)
+        shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
     __syncthreads();
 
     if(id < rows)
@@ -614,26 +615,26 @@ void project(int *res, int resrows, int numcols1, int numcols2, int *proj, int *
         int *pt = (int *)malloc(sizepro);
         for(z = 0; z < numcols2; z++)
             pt[z] = proj[z] - 1;
-        cudaMemcpy(dcons, pt, sizepro, cudaMemcpyHostToDevice);
-        //cudaDeviceSynchronize(); //Small cudaMemcpys are asynchronous, uncomment this line if the pointer is being liberated before it is copied.
+        hipMemcpy(dcons, pt, sizepro, hipMemcpyHostToDevice);
+        //hipDeviceSynchronize(); //Small cudaMemcpys are asynchronous, uncomment this line if the pointer is being liberated before it is copied.
         free(pt);
     }
     else
-        cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+        hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
     reservar(&d_Rout, resrows * sizepro);
-    projectfinal<<<blockllen, numthreads, sizepro>>>(res, resrows, numcols1, dcons, d_Rout);
-    cudaFree(dcons);
-    cudaFree(*ret);
+    hipLaunchKernel(HIP_KERNEL_NAME(projectfinal), dim3(blockllen), dim3(numthreads), sizepro, 0, res, resrows, numcols1, dcons, d_Rout);
+    hipFree(dcons);
+    hipFree(*ret);
     *ret = d_Rout;
 }
 
 __global__ void projectadd(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, int *dhead, int hsize, int *res)
 {
     extern __shared__ int shared[];
-    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
     int pos2, posr, x, y, cond;
-    if(threadIdx.x < hsize)
-        shared[threadIdx.x] = dhead[threadIdx.x];
+    if(hipThreadIdx_x < hsize)
+        shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
     __syncthreads();
     if(id < rows2)
     {
@@ -662,10 +663,10 @@ void juntar(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, in
     int blockllen = rows2 / numthreads + 1;
     sizepro = pcols * sizeof(int);
     reservar(&dcons, sizepro);
-    cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+    hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
     reservar(&d_Rout, rows1 * rows2 * sizepro);
-    projectadd<<<blockllen, numthreads, sizepro>>>(dop1, dop2, rows1, rows2, cols1, cols2, dcons, pcols, d_Rout);
-    cudaFree(dcons);
+    hipLaunchKernel(HIP_KERNEL_NAME(projectadd), dim3(blockllen), dim3(numthreads), sizepro, 0, dop1, dop2, rows1, rows2, cols1, cols2, dcons, pcols, d_Rout);
+    hipFree(dcons);
     *ret = d_Rout;
 }
 
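All of the kernels above share one staging idiom: a short host-built descriptor array (the selection constants, self-join positions, or rule head) is copied into dynamic shared memory by the first few threads of each block before __syncthreads(), so the per-row loop reads it from on-chip memory rather than global memory. The launch must then pass the descriptor's byte size as the dynamic shared-memory argument, which is exactly the 'size' value threaded through the launches in this file. A reduced sketch of the idiom (hypothetical kernel, assuming blockDim >= numc just as the real kernels do):

    #include "hip/hip_runtime.h"

    __global__ void uses_small_consts(const int *in, int rows, const int *cons, int numc, int *out)
    {
        extern __shared__ int shared[];
        int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
        if(hipThreadIdx_x < numc)
            shared[hipThreadIdx_x] = cons[hipThreadIdx_x];    /* stage the descriptor */
        __syncthreads();    /* every thread in the block now sees the staged copy */
        if(id < rows)
            out[id] = in[id] + shared[id % numc];    /* stand-in for the real predicate test */
    }

    /* Launch with numc * sizeof(int) bytes of dynamic shared memory:
       hipLaunchKernel(HIP_KERNEL_NAME(uses_small_consts), dim3(blocks), dim3(threads),
                       numc * sizeof(int), 0, in, rows, cons, numc, out); */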
@ -743,51 +744,51 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
|||||||
|
|
||||||
#ifdef TIMER
|
#ifdef TIMER
|
||||||
//cout << "INICIO" << endl;
|
//cout << "INICIO" << endl;
|
||||||
cudaEvent_t start, stop;
|
hipEvent_t start, stop;
|
||||||
float time;
|
float time;
|
||||||
cudaEventCreate(&start);
|
hipEventCreate(&start);
|
||||||
cudaEventCreate(&stop);
|
hipEventCreate(&stop);
|
||||||
cudaEventRecord(start, 0);
|
hipEventRecord(start, 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if(npred2.x > 0 || npred2.y > 0 || nsel2 > 0 || nsj2 > 0)
|
if(npred2.x > 0 || npred2.y > 0 || nsel2 > 0 || nsj2 > 0)
|
||||||
{
|
{
|
||||||
newLen = sLen + 1;
|
newLen = sLen + 1;
|
||||||
cudaMemsetAsync(temp, 0, newLen * sizeof(int));
|
hipMemsetAsync(temp, 0, newLen * sizeof(int));
|
||||||
}
|
}
|
||||||
|
|
||||||
if(npred2.x > 0 || npred2.y > 0)
|
if(npred2.x > 0 || npred2.y > 0)
|
||||||
{
|
{
|
||||||
size = npred2tot * sizeof(int);
|
size = npred2tot * sizeof(int);
|
||||||
cudaMemcpy(dcons, pred2, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, pred2, size, hipMemcpyHostToDevice);
|
||||||
|
|
||||||
if(npred2.y > 0) /*Fix case when a(X,Y),b(Y,Z),Z > Y*/
|
if(npred2.y > 0) /*Fix case when a(X,Y),b(Y,Z),Z > Y*/
|
||||||
{
|
{
|
||||||
reservar(&temp2, sizet2);
|
reservar(&temp2, sizet2);
|
||||||
cudaMemsetAsync(temp2, 0, newLen * sizeof(int));
|
hipMemsetAsync(temp2, 0, newLen * sizeof(int));
|
||||||
//res = thrust::device_pointer_cast(temp2);
|
//res = thrust::device_pointer_cast(temp2);
|
||||||
bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, temp2 + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(bpreds), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, temp2 + 1);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if(negative)
|
if(negative)
|
||||||
bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
|
hipLaunchKernel(HIP_KERNEL_NAME(bpreds), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
|
||||||
else
|
else
|
||||||
bpredsOR<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
|
hipLaunchKernel(HIP_KERNEL_NAME(bpredsOR), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(nsel2 > 0)
|
if(nsel2 > 0)
|
||||||
{
|
{
|
||||||
size = nsel2 * sizeof(int);
|
size = nsel2 * sizeof(int);
|
||||||
cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, sel2, size, hipMemcpyHostToDevice);
|
||||||
marcar<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(marcar), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsel2, temp + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(nsj2 > 0)
|
if(nsj2 > 0)
|
||||||
{
|
{
|
||||||
size = nsj2 * sizeof(int);
|
size = nsj2 * sizeof(int);
|
||||||
cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
|
||||||
samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -795,14 +796,14 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
|||||||
if(nsel2 > 0)
|
if(nsel2 > 0)
|
||||||
{
|
{
|
||||||
size = nsel2 * sizeof(int);
|
size = nsel2 * sizeof(int);
|
||||||
cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, sel2, size, hipMemcpyHostToDevice);
|
||||||
marcar2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsel2, temp + 1);
|
||||||
|
|
||||||
if(nsj2 > 0)
|
if(nsj2 > 0)
|
||||||
{
|
{
|
||||||
size = nsj2 * sizeof(int);
|
size = nsj2 * sizeof(int);
|
||||||
cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
|
||||||
samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -810,15 +811,15 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
|||||||
if(nsj2 > 0)
|
if(nsj2 > 0)
|
||||||
{
|
{
|
||||||
size = nsj2 * sizeof(int);
|
size = nsj2 * sizeof(int);
|
||||||
cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
|
||||||
samejoin2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
sizem32S = m32sLen * sizeof(int);
|
sizem32S = m32sLen * sizeof(int);
|
||||||
reservar(&d_S, sizem32S);
|
reservar(&d_S, sizem32S);
|
||||||
cudaMemsetAsync(d_S + sLen, 0x7f, extraspaceS * sizeof(int));
|
hipMemsetAsync(d_S + sLen, 0x7f, extraspaceS * sizeof(int));
|
||||||
llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]);
|
hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p2, d_S, sLen, of2, wherej[1]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -842,8 +843,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
|||||||
|
|
||||||
if(newLen == 0) // && !negative) ARREGLAR
|
if(newLen == 0) // && !negative) ARREGLAR
|
||||||
{
|
{
|
||||||
cudaFree(temp);
|
hipFree(temp);
|
||||||
cudaFree(dcons);
|
hipFree(dcons);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -854,24 +855,24 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
|||||||
|
|
||||||
reservar(&d_S, sizem32S);
|
reservar(&d_S, sizem32S);
|
||||||
reservar(&posS, sizem32S);
|
reservar(&posS, sizem32S);
|
||||||
cudaMemsetAsync(d_S + newLen, 0x7f, sizextra);
|
hipMemsetAsync(d_S + newLen, 0x7f, sizextra);
|
||||||
cudaMemsetAsync(posS + newLen, 0x7f, sizextra);
|
hipMemsetAsync(posS + newLen, 0x7f, sizextra);
|
||||||
llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
|
hipLaunchKernel(HIP_KERNEL_NAME(llenar), dim3(blockllen), dim3(numthreads), 0, 0, p2, d_S, sLen, of2, wherej[1], temp, posS);
|
||||||
sLen = newLen;
|
sLen = newLen;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef TIMER
|
#ifdef TIMER
|
||||||
cudaEventRecord(stop, 0);
|
hipEventRecord(stop, 0);
|
||||||
cudaEventSynchronize(stop);
|
hipEventSynchronize(stop);
|
||||||
cudaEventElapsedTime(&time, start, stop);
|
hipEventElapsedTime(&time, start, stop);
|
||||||
//cout << "Select1 = " << time << endl;
|
//cout << "Select1 = " << time << endl;
|
||||||
cuda_stats.select1_time += time;
|
cuda_stats.select1_time += time;
|
||||||
|
|
||||||
cudaEventDestroy(start);
|
hipEventDestroy(start);
|
||||||
cudaEventDestroy(stop);
|
hipEventDestroy(stop);
|
||||||
cudaEventCreate(&start);
|
hipEventCreate(&start);
|
||||||
cudaEventCreate(&stop);
|
hipEventCreate(&stop);
|
||||||
cudaEventRecord(start, 0);
|
hipEventRecord(start, 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blockllen = rLen / numthreads + 1;
|
blockllen = rLen / numthreads + 1;
|
||||||
@ -880,30 +881,30 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
|||||||
{
|
{
|
||||||
if(temp2 != NULL)
|
if(temp2 != NULL)
|
||||||
{
|
{
|
||||||
cudaFree(temp);
|
hipFree(temp);
|
||||||
temp = temp2;
|
temp = temp2;
|
||||||
res = thrust::device_pointer_cast(temp);
|
res = thrust::device_pointer_cast(temp);
|
||||||
newLen = rLen + 1;
|
newLen = rLen + 1;
|
||||||
if(nsel1 > 0)
|
if(nsel1 > 0)
|
||||||
{
|
{
|
||||||
size = nsel1 * sizeof(int);
|
size = nsel1 * sizeof(int);
|
||||||
cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, sel1, size, hipMemcpyHostToDevice);
|
||||||
marcar<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(marcar), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsel1, temp + 1);
|
||||||
}
|
}
|
||||||
if(nsj1 > 0)
|
if(nsj1 > 0)
|
||||||
{
|
{
|
||||||
size = nsj1 * sizeof(int);
|
size = nsj1 * sizeof(int);
|
||||||
cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
|
||||||
samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);
|
||||||
}
|
}
|
||||||
if(npred1.x > 0)
|
if(npred1.x > 0)
|
||||||
{
|
{
|
||||||
size = npred1.x * sizeof(int);
|
size = npred1.x * sizeof(int);
|
||||||
cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
|
||||||
if(ANDlogic)
|
if(ANDlogic)
|
||||||
bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||||
else
|
else
|
||||||
bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -911,30 +912,30 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
|||||||
if(npred1.x > 0 || nsel1 > 0 || nsj1 > 0)
|
if(npred1.x > 0 || nsel1 > 0 || nsj1 > 0)
|
||||||
{
|
{
|
||||||
newLen = rLen + 1;
|
newLen = rLen + 1;
|
||||||
cudaMemsetAsync(temp, 0, newLen * sizeof(int));
|
hipMemsetAsync(temp, 0, newLen * sizeof(int));
|
||||||
}
|
}
|
||||||
|
|
||||||
if(nsel1 > 0)
|
if(nsel1 > 0)
|
||||||
{
|
{
|
||||||
size = nsel1 * sizeof(int);
|
size = nsel1 * sizeof(int);
|
||||||
cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, sel1, size, hipMemcpyHostToDevice);
|
||||||
marcar2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsel1, temp + 1);
|
||||||
|
|
||||||
if(nsj1 > 0)
|
if(nsj1 > 0)
|
||||||
{
|
{
|
||||||
size = nsj1 * sizeof(int);
|
size = nsj1 * sizeof(int);
|
||||||
cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
|
||||||
samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(npred1.x > 0)
|
if(npred1.x > 0)
|
||||||
{
|
{
|
||||||
size = npred1.x * sizeof(int);
|
size = npred1.x * sizeof(int);
|
||||||
cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
|
hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
|
||||||
if(ANDlogic)
|
if(ANDlogic)
|
||||||
bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||||
else
|
else
|
||||||
bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -942,17 +943,17 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 if(nsj1 > 0)
 {
 size = nsj1 * sizeof(int);
-cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
-samejoin2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
+hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
+hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);

 if(npred1.x > 0)
 {
 size = npred1.x * sizeof(int);
-cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
+hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
 if(ANDlogic)
-bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
+hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
 else
-bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
+hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
 }
 }
 else
@@ -960,11 +961,11 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 if(npred1.x > 0)
 {
 size = npred1.x * sizeof(int);
-cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
+hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
 if(ANDlogic)
-bpredsnormal2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
+hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
 else
-bpredsorlogic2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
+hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
 }
 }
 }
@@ -976,11 +977,11 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 newLen = res[rLen];
 if(newLen == 0)
 {
-cudaFree(temp);
-cudaFree(dcons);
-cudaFree(d_S);
+hipFree(temp);
+hipFree(dcons);
+hipFree(d_S);
 if(posS != NULL)
-cudaFree(posS);
+hipFree(posS);
 return 0;
 }

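The runtime memory API renames the same way, one CUDA call to one HIP call: cudaMemcpy/cudaMemcpyAsync, cudaMemsetAsync and cudaFree become hipMemcpy/hipMemcpyAsync, hipMemsetAsync and hipFree, and direction flags such as cudaMemcpyHostToDevice become hipMemcpyHostToDevice. A minimal sketch of the correspondence (hypothetical buffer names; hipMalloc stands in for this package's reservar allocator, which is an assumption about that helper):

#include "hip/hip_runtime.h"

void device_buffer_demo(const int *host_in, int n)
{
	int *dbuf;
	size_t size = n * sizeof(int);
	hipMalloc((void **)&dbuf, size);                        /* CUDA: cudaMalloc */
	hipMemsetAsync(dbuf, 0, size);                          /* CUDA: cudaMemsetAsync */
	hipMemcpy(dbuf, host_in, size, hipMemcpyHostToDevice);  /* CUDA: cudaMemcpy(..., cudaMemcpyHostToDevice) */
	hipFree(dbuf);                                          /* CUDA: cudaFree */
}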
@@ -991,41 +992,41 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:

 reservar(&d_R, sizem32);
 reservar(&posR, sizem32);
-cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
-cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
-llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
+hipMemsetAsync(d_R + newLen, 0x7f, sizextra);
+hipMemsetAsync(posR + newLen, 0x7f, sizextra);
+hipLaunchKernel(HIP_KERNEL_NAME(llenar), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0], temp, posR);
 rLen = newLen;
 }
 else
 {
 sizem32 = m32rLen * sizeof(int);
 reservar(&d_R, sizem32);
-cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
-llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
+hipMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
+hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0]);
 }
 }
 else
 {
 sizem32 = m32rLen * sizeof(int);
 reservar(&d_R, sizem32);
-cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
-llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
+hipMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
+hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0]);
 }

 #ifdef TIMER
-cudaEventRecord(stop, 0);
-cudaEventSynchronize(stop);
-cudaEventElapsedTime(&time, start, stop);
+hipEventRecord(stop, 0);
+hipEventSynchronize(stop);
+hipEventElapsedTime(&time, start, stop);
 //cout << "Select2 = " << time << endl;
 cuda_stats.select2_time += time;
 #endif

 #ifdef TIMER
-cudaEventDestroy(start);
-cudaEventDestroy(stop);
-cudaEventCreate(&start);
-cudaEventCreate(&stop);
-cudaEventRecord(start, 0);
+hipEventDestroy(start);
+hipEventDestroy(stop);
+hipEventCreate(&start);
+hipEventCreate(&stop);
+hipEventRecord(start, 0);
 #endif

 thrust::device_ptr<Record> dvp1;
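The TIMER instrumentation ports just as mechanically: each cudaEvent_t call becomes its hipEvent_t counterpart with an identical shape. A minimal sketch of the timing idiom wrapped around each phase here (assuming hipEvent_t start, stop and a float time, as in the surrounding code):

hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start, 0);
/* ... the work being timed: kernel launches, copies ... */
hipEventRecord(stop, 0);
hipEventSynchronize(stop);                /* wait until the stop event has actually occurred */
hipEventElapsedTime(&time, start, stop);  /* elapsed time between the events, in milliseconds */
hipEventDestroy(start);
hipEventDestroy(stop);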
@@ -1084,17 +1085,17 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 }

 #ifdef TIMER
-cudaEventRecord(stop, 0);
-cudaEventSynchronize(stop);
-cudaEventElapsedTime(&time, start, stop);
+hipEventRecord(stop, 0);
+hipEventSynchronize(stop);
+hipEventElapsedTime(&time, start, stop);
 //cout << "Sort = " << time << endl;
 cuda_stats.sort_time += time;

-cudaEventDestroy(start);
-cudaEventDestroy(stop);
-cudaEventCreate(&start);
-cudaEventCreate(&stop);
-cudaEventRecord(start, 0);
+hipEventDestroy(start);
+hipEventDestroy(stop);
+hipEventCreate(&start);
+hipEventCreate(&stop);
+hipEventRecord(start, 0);
 #endif

 IDataNode* d_data;
@@ -1123,7 +1124,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 dim3 Dbc(THRD_PER_BLCK_create, 1, 1);
 dim3 Dgc(BLCK_PER_GRID_create, 1, 1);

-gCreateIndex <<<Dgc, Dbc>>> (d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);
+hipLaunchKernel(HIP_KERNEL_NAME(gCreateIndex), dim3(Dgc), dim3(Dbc), 0, 0, d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);

 int *d_locations;
 int memSizeR;
@@ -1132,7 +1133,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 {
 memSizeR = (rLen + 1) * sizeof(int);
 reservar(&d_locations, memSizeR);
-cudaMemsetAsync(d_locations, 0, sizeof(int));
+hipMemsetAsync(d_locations, 0, sizeof(int));
 nSearchKeys = rLen;
 }
 else
@@ -1146,13 +1147,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 unsigned int nKeysPerThread = uintCeilingDiv(nSearchKeys, THRD_PER_GRID_search);
 if(negative)
 {
-gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_R, d_locations + 1, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
-cudaMemsetAsync(temp, 0, memSizeR);
+hipLaunchKernel(HIP_KERNEL_NAME(gSearchTree), dim3(Dgs), dim3(Dbs), 0, 0, d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_R, d_locations + 1, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
+hipMemsetAsync(temp, 0, memSizeR);
 }
 else
 {
-gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
-cudaMemsetAsync(temp, 0, memSizeS);
+hipLaunchKernel(HIP_KERNEL_NAME(gSearchTree), dim3(Dgs), dim3(Dbs), 0, 0, d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
+hipMemsetAsync(temp, 0, memSizeS);
 }

 int muljoin = 0, muljoinsize = 0, sum;
@@ -1165,8 +1166,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 {
 muljoin = numj - 2;
 muljoinsize = muljoin * sizeof(int);
-cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
-gIndexMultiJoinNegative<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations + 1, rLen, p1, p2, of1, of2, posR, posS, dcons, muljoin);
+hipMemcpy(dcons, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
+hipLaunchKernel(HIP_KERNEL_NAME(gIndexMultiJoinNegative), dim3(blockllen), dim3(numthreads), muljoinsize, 0, d_R, d_S, d_locations + 1, rLen, p1, p2, of1, of2, posR, posS, dcons, muljoin);
 }

 res = thrust::device_pointer_cast(d_locations);
@@ -1177,21 +1178,21 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 if(pos == (rule->num_rows - 3))
 {
 sizepro = rule->num_columns * sizeof(int);
-cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
 resSize = sum * sizepro;
 reservar(&d_Rout, resSize);
-gJoinWithWriteNegative2<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, rule->num_columns, posR);
+hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWriteNegative2), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, rLen, d_Rout, p1, of1, dcons, rule->num_columns, posR);
 }
 else
 {
 sizepro = projp.x * sizeof(int);
-cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
 resSize = sum * sizepro;
 reservar(&d_Rout, resSize);
-gJoinWithWriteNegative<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, projp.x, posR);
+hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWriteNegative), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, rLen, d_Rout, p1, of1, dcons, projp.x, posR);
 }
-cudaFree(d_R);
-cudaFree(d_S);
+hipFree(d_R);
+hipFree(d_S);
 }
 else
 {
@@ -1200,26 +1201,26 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 {
 muljoin = numj - 2;
 muljoinsize = muljoin * sizeof(int);
-cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
-gIndexMultiJoin<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
+hipMemcpy(dcons, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
+hipLaunchKernel(HIP_KERNEL_NAME(gIndexMultiJoin), dim3(blockllen), dim3(numthreads), muljoinsize, 0, d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
 }
 else
-gIndexJoin<<<blockllen, numthreads>>> (d_R, d_S, d_locations, sLen, temp);
-cudaFree(d_R);
-cudaFree(d_S);
+hipLaunchKernel(HIP_KERNEL_NAME(gIndexJoin), dim3(blockllen), dim3(numthreads), 0, 0, d_R, d_S, d_locations, sLen, temp);
+hipFree(d_R);
+hipFree(d_S);

 sum = res[sLen-1];
 thrust::exclusive_scan(res, res + sLen, res);
 sum += res[sLen-1];
 if(sum == 0)
 {
-cudaFree(dcons);
-cudaFree(d_locations);
-cudaFree(temp);
+hipFree(dcons);
+hipFree(d_locations);
+hipFree(temp);
 if(posS != NULL)
-cudaFree(posS);
+hipFree(posS);
 if(posR != NULL)
-cudaFree(posR);
+hipFree(posR);
 return 0;
 }
 res[sLen] = sum;
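Note that the Thrust calls (thrust::device_pointer_cast, thrust::exclusive_scan) pass through the port untouched; only the runtime API and the launch syntax change. The scan is the usual count-then-write join pattern: a counting pass stores per-row match counts, the in-place exclusive scan turns them into write offsets, and the saved last count plus the last offset gives the total output size. A self-contained sketch of that offset computation (illustrative values, not data from this commit):

#include <thrust/device_vector.h>
#include <thrust/scan.h>

void scan_offsets_demo()
{
	thrust::device_vector<int> counts(4);   /* per-row match counts from a counting pass */
	counts[0] = 2; counts[1] = 0; counts[2] = 3; counts[3] = 1;

	int last = counts[3];
	/* in-place exclusive scan: counts becomes {0, 2, 2, 5}, the write offsets */
	thrust::exclusive_scan(counts.begin(), counts.end(), counts.begin());
	int total = last + counts[3];           /* 1 + 5 = 6 rows in the join result */
	(void)total;
}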
@@ -1227,49 +1228,49 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 if(pos == (rule->num_rows - 3))
 {
 sizepro = rule->num_columns * sizeof(int);
-cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
 resSize = sum * sizepro;
 reservar(&d_Rout, resSize);
 if(numj > 2)
 {
-cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
-multiJoinWithWrite2<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
+hipMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
+hipLaunchKernel(HIP_KERNEL_NAME(multiJoinWithWrite2), dim3(blockllen), dim3(numthreads), sizepro + muljoinsize, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
 }
 else
-gJoinWithWrite2<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
+hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWrite2), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
 }
 else
 {
 sizepro = projp.y * sizeof(int);
-cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
 resSize = sum * sizepro;
 reservar(&d_Rout, resSize);
 if(numj > 2)
 {
-cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
-multiJoinWithWrite<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
+hipMemcpy(dcons + projp.y, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
+hipLaunchKernel(HIP_KERNEL_NAME(multiJoinWithWrite), dim3(blockllen), dim3(numthreads), sizepro + muljoinsize, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
 }
 else
-gJoinWithWrite<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
+hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWrite), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
 }
 }

-cudaFree(dcons);
-cudaFree(d_locations);
-cudaFree(temp);
+hipFree(dcons);
+hipFree(d_locations);
+hipFree(temp);
 if(posS != NULL)
-cudaFree(posS);
+hipFree(posS);
 if(posR != NULL)
-cudaFree(posR);
+hipFree(posR);

 if(*ret != NULL)
-cudaFree(*ret);
+hipFree(*ret);
 *ret = d_Rout;

 #ifdef TIMER
-cudaEventRecord(stop, 0);
-cudaEventSynchronize(stop);
-cudaEventElapsedTime(&time, start, stop);
+hipEventRecord(stop, 0);
+hipEventSynchronize(stop);
+hipEventElapsedTime(&time, start, stop);
 //cout << "Join = " << time << endl;
 //cout << "FIN" << endl;
 cuda_stats.join_time += time;
80
packages/cuda/union2.cu
Executable file → Normal file
@@ -87,8 +87,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -122,8 +122,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -157,8 +157,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -192,8 +192,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -227,8 +227,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -262,8 +262,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -297,8 +297,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -332,8 +332,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -367,8 +367,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -402,8 +402,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -437,8 +437,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -472,8 +472,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -507,8 +507,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -542,8 +542,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -577,8 +577,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -612,8 +612,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -647,8 +647,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -682,8 +682,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -717,8 +717,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
@@ -752,8 +752,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 {
 size = nrows * tipo * sizeof(int);
 reservar(&nres, size);
-cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-cudaFree(*ret);
+hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+hipFree(*ret);
 *ret = nres;
 }
 return nrows;
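The unir hunks above all make the same two-line substitution: the device-to-device copy that moves the deduplicated result into a freshly sized buffer switches from cudaMemcpyAsync with cudaMemcpyDeviceToDevice to hipMemcpyAsync with hipMemcpyDeviceToDevice, and the cudaFree of the superseded buffer becomes hipFree. A minimal sketch of that replace-the-result step (hipMalloc stands in for reservar, which is an assumption about that helper):

#include "hip/hip_runtime.h"

void replace_result(int **ret, int *res, int nrows, int tipo)
{
	int *nres;
	size_t size = nrows * tipo * sizeof(int);
	hipMalloc((void **)&nres, size);                          /* stands in for reservar(&nres, size) */
	hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice); /* was cudaMemcpyAsync(..., cudaMemcpyDeviceToDevice) */
	hipFree(*ret);                                            /* was cudaFree(*ret); frees the old result buffer */
	*ret = nres;                                              /* hand the compacted copy back to the caller */
}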
0
packages/cuda/union2.h
Executable file → Normal file
0
packages/cuda/unioncpu2.cpp
Executable file → Normal file