new version of cuda interface
This commit is contained in:
parent
c6d174841a
commit
d3599da6dc
2
.gitignore
vendored
2
.gitignore
vendored
@ -179,3 +179,5 @@ packages/myddas/hh
|
||||
packages/myddas/DaysInHospital_Y3.csv
|
||||
|
||||
packages/myddas/agile.csv
|
||||
|
||||
*.pyc
|
||||
|
0
packages/cuda/CC_CSSTree.cu
Executable file → Normal file
0
packages/cuda/CC_CSSTree.cu
Executable file → Normal file
0
packages/cuda/CC_CSSTree.h
Executable file → Normal file
0
packages/cuda/CC_CSSTree.h
Executable file → Normal file
0
packages/cuda/Makefile.in
Executable file → Normal file
0
packages/cuda/Makefile.in
Executable file → Normal file
37
packages/cuda/bpreds.cu
Executable file → Normal file
37
packages/cuda/bpreds.cu
Executable file → Normal file
@ -1,3 +1,4 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/scan.h>
|
||||
#include <cstdarg>
|
||||
@ -25,10 +26,10 @@ int maximo(int count, ...)
|
||||
__global__ void bpreds(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int x, rowact, rowact1, op1, op2;
|
||||
if(threadIdx.x < numc)
|
||||
shared[threadIdx.x] = cons[threadIdx.x];
|
||||
if(hipThreadIdx_x < numc)
|
||||
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
@ -110,10 +111,10 @@ __global__ void bpreds(int *dop1, int *dop2, int rows, int of1, int of2, int *co
|
||||
__global__ void bpredsnormal2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int x, rowact, op1, op2;
|
||||
if(threadIdx.x < numc)
|
||||
shared[threadIdx.x] = cons[threadIdx.x];
|
||||
if(hipThreadIdx_x < numc)
|
||||
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
@ -159,10 +160,10 @@ __global__ void bpredsnormal2(int *dop1, int rows, int of1, int *cons, int numc,
|
||||
__global__ void bpredsnormal(int *dop1, int rows, int of1, int *cons, int numc, int *res)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int x, rowact, op1, op2;
|
||||
if(threadIdx.x < numc)
|
||||
shared[threadIdx.x] = cons[threadIdx.x];
|
||||
if(hipThreadIdx_x < numc)
|
||||
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
@ -226,10 +227,10 @@ __global__ void bpredsnormal(int *dop1, int rows, int of1, int *cons, int numc,
|
||||
__global__ void bpredsOR(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int x, rowact, rowact1, op1, op2;
|
||||
if(threadIdx.x < numc)
|
||||
shared[threadIdx.x] = cons[threadIdx.x];
|
||||
if(hipThreadIdx_x < numc)
|
||||
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
@ -344,10 +345,10 @@ __global__ void bpredsOR(int *dop1, int *dop2, int rows, int of1, int of2, int *
|
||||
__global__ void bpredsorlogic2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int x, rowact, op1, op2;
|
||||
if(threadIdx.x < numc)
|
||||
shared[threadIdx.x] = cons[threadIdx.x];
|
||||
if(hipThreadIdx_x < numc)
|
||||
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
@ -411,10 +412,10 @@ __global__ void bpredsorlogic2(int *dop1, int rows, int of1, int *cons, int numc
|
||||
__global__ void bpredsorlogic(int *dop1, int rows, int of1, int *cons, int numc, int *res)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int x, rowact, op1, op2;
|
||||
if(threadIdx.x < numc)
|
||||
shared[threadIdx.x] = cons[threadIdx.x];
|
||||
if(hipThreadIdx_x < numc)
|
||||
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
|
1
packages/cuda/bpreds.h
Executable file → Normal file
1
packages/cuda/bpreds.h
Executable file → Normal file
@ -1,3 +1,4 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
#ifndef _BPREDS_H_
|
||||
#define _BPREDS_H_
|
||||
|
||||
|
0
packages/cuda/bpredscpu.cpp
Executable file → Normal file
0
packages/cuda/bpredscpu.cpp
Executable file → Normal file
52
packages/cuda/clamp.rb
Normal file
52
packages/cuda/clamp.rb
Normal file
@ -0,0 +1,52 @@
|
||||
require "formula"
|
||||
|
||||
# Documentation: https://github.com/Homebrew/homebrew/wiki/Formula-Cookbook
|
||||
# /usr/local/Library/Contributions/example-formula.rb
|
||||
# PLEASE REMOVE ALL GENERATED COMMENTS BEFORE SUBMITTING YOUR PULL REQUEST!
|
||||
|
||||
class Clamp < Formula
|
||||
homepage "https://bitbucket.org/multicoreware/cppamp-driver-ng/wiki/Home"
|
||||
version "0.0.1-3"
|
||||
url "https://bitbucket.org/multicoreware/cppamp-driver-ng/get/milestone3.tar.bz2"
|
||||
head "https://bitbucket.org/multicoreware/cppamp-driver-ng.git"
|
||||
sha1 "b8b88306561a60942f8ecbd8ff20554661c4e5f9"
|
||||
|
||||
depends_on "cmake" => :build
|
||||
depends_on "wget" => :build
|
||||
depends_on "git" => :build
|
||||
depends_on "hg" => :build
|
||||
depends_on "subversion" => :build
|
||||
# depends_on :x11 # if your formula requires any X11/XQuartz components
|
||||
|
||||
def install
|
||||
# ENV.deparallelize # if your formula fails when building in parallel
|
||||
|
||||
# Remove unrecognized options if warned by configure
|
||||
# system "./configure", "--disable-debug",
|
||||
# "--disable-dependency-tracking",
|
||||
# "--disable-silent-rules",
|
||||
# "--prefix=#{prefix}"
|
||||
mkdir "macbuild" do
|
||||
args = std_cmake_args
|
||||
args << "-DCLANG_URL=https://bitbucket.org/multicoreware/cppamp-ng.git"
|
||||
args << "-DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=CBackend"
|
||||
args << "-DGMAC_URL=https://bitbucket.org/multicoreware/gmac"
|
||||
system 'cmake', "..", *args
|
||||
system "make", "world"
|
||||
system "cd libc++; make install"
|
||||
system "make", "install" # if this fails, try separate make/make install steps
|
||||
end
|
||||
end
|
||||
|
||||
test do
|
||||
# `test do` will create, run in and delete a temporary directory.
|
||||
#
|
||||
# This test will fail and we won't accept that! It's enough to just replace
|
||||
# "false" with the main program this formula installs, but it'd be nice if you
|
||||
# were more thorough. Run the test with `brew test milestone`.
|
||||
#
|
||||
# The installed folder is not in the path, so use the entire path to any
|
||||
# executables being tested: `system "#{bin}/program", "do", "something"`.
|
||||
system "make", "test"
|
||||
end
|
||||
end
|
4
packages/cuda/creator2.c
Executable file → Normal file
4
packages/cuda/creator2.c
Executable file → Normal file
@ -66,7 +66,7 @@ int main(int argc, char *argv[])
|
||||
fprintf(cuda, "\t\t\t{\n");
|
||||
fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
|
||||
fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
|
||||
fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
|
||||
fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);\n");
|
||||
fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
|
||||
fprintf(cuda, "\t\t\t\t*ret = nres;\n");
|
||||
fprintf(cuda, "\t\t\t}\n");
|
||||
@ -103,7 +103,7 @@ int main(int argc, char *argv[])
|
||||
fprintf(cuda, "\t\t\t{\n");
|
||||
fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
|
||||
fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
|
||||
fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
|
||||
fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);\n");
|
||||
fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
|
||||
fprintf(cuda, "\t\t\t\t*ret = nres;\n");
|
||||
fprintf(cuda, "\t\t\t}\n");
|
||||
|
0
packages/cuda/cuda.c
Executable file → Normal file
0
packages/cuda/cuda.c
Executable file → Normal file
0
packages/cuda/cuda.yap
Executable file → Normal file
0
packages/cuda/cuda.yap
Executable file → Normal file
@ -27,8 +27,8 @@ void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode
|
||||
res_rows = unir(dop1, res_rows, cols1, &dop1, 0);
|
||||
tipo = res_rows * cols1 * sizeof(int);
|
||||
hres = (int *)malloc(tipo);
|
||||
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
|
||||
cudaFree(dop1);
|
||||
hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
|
||||
hipFree(dop1);
|
||||
*result = hres;
|
||||
}
|
||||
else
|
||||
@ -39,13 +39,13 @@ void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode
|
||||
int *dop2;
|
||||
tipo = res_rows * cols1 * sizeof(int);
|
||||
reservar(&dop2, tipo);
|
||||
cudaMemcpy(dop2, dop1, tipo, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dop2, dop1, tipo, hipMemcpyHostToDevice);
|
||||
free(dop1);
|
||||
res_rows = unir(dop2, res_rows, cols1, &dop2, 0);
|
||||
tipo = res_rows * cols1 * sizeof(int);
|
||||
hres = (int *)malloc(tipo);
|
||||
cudaMemcpy(hres, dop2, tipo, cudaMemcpyDeviceToHost);
|
||||
cudaFree(dop2);
|
||||
hipMemcpy(hres, dop2, tipo, hipMemcpyDeviceToHost);
|
||||
hipFree(dop2);
|
||||
*result = hres;
|
||||
}
|
||||
else
|
||||
@ -315,8 +315,8 @@ void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str,
|
||||
|
||||
tipo = res_rows * cols1 * sizeof(int);
|
||||
hres = (int *)malloc(tipo);
|
||||
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
|
||||
cudaFree(dop1);
|
||||
hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
|
||||
hipFree(dop1);
|
||||
w = z + 1;
|
||||
|
||||
strtok(qposr->rulename, "_");
|
||||
@ -353,8 +353,8 @@ void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str,
|
||||
res_rows = abs(res_rows);
|
||||
tipo = res_rows * cols1 * sizeof(int);
|
||||
hres = (int *)malloc(tipo);
|
||||
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
|
||||
cudaFree(dop1);
|
||||
hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
|
||||
hipFree(dop1);
|
||||
|
||||
char file[] = "/dev/shm/buffer.csv";
|
||||
FILE *fp;
|
||||
@ -554,7 +554,7 @@ void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator f
|
||||
sign = tmpfact.predname;
|
||||
tipo = res_rows * cols1 * sizeof(int);
|
||||
hres = (int *)malloc(tipo);
|
||||
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
|
||||
hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
|
||||
if(sign[0] == 'f' && sign[1] >= '0' && sign[1] <= '9')
|
||||
sumar(tmpfact.name, dop1, cols1, res_rows);
|
||||
}
|
||||
|
0
packages/cuda/hippy/hippy
Normal file
0
packages/cuda/hippy/hippy
Normal file
62
packages/cuda/joincpu.cpp
Executable file → Normal file
62
packages/cuda/joincpu.cpp
Executable file → Normal file
@ -324,11 +324,11 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
|
||||
}
|
||||
|
||||
#ifdef TIMER
|
||||
cudaEvent_t start, stop;
|
||||
hipEvent_t start, stop;
|
||||
float time;
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
cudaEventRecord(start, 0);
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
hipEventRecord(start, 0);
|
||||
#endif
|
||||
|
||||
if(nsel1 > 0 || nsj1 > 0)
|
||||
@ -359,16 +359,16 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
|
||||
}
|
||||
|
||||
#ifdef TIMER
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventElapsedTime(&time, start, stop);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
hipEventElapsedTime(&time, start, stop);
|
||||
cuda_stats.select1_time += time;
|
||||
|
||||
cudaEventDestroy(start);
|
||||
cudaEventDestroy(stop);
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
cudaEventRecord(start, 0);
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
hipEventRecord(start, 0);
|
||||
#endif
|
||||
|
||||
if(nsel2 > 0 || nsj2 > 0)
|
||||
@ -381,16 +381,16 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
|
||||
Snl = sLen;
|
||||
|
||||
#ifdef TIMER
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventElapsedTime(&time, start, stop);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
hipEventElapsedTime(&time, start, stop);
|
||||
cuda_stats.select2_time += time;
|
||||
|
||||
cudaEventDestroy(start);
|
||||
cudaEventDestroy(stop);
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
cudaEventRecord(start, 0);
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
hipEventRecord(start, 0);
|
||||
#endif
|
||||
|
||||
//cout << "antes" << endl;
|
||||
@ -406,16 +406,16 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
|
||||
thrust::stable_sort_by_key(thrust::omp::par, Rres, Rres + Rnl, permutation);
|
||||
|
||||
#ifdef TIMER
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventElapsedTime(&time, start, stop);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
hipEventElapsedTime(&time, start, stop);
|
||||
cuda_stats.sort_time += time;
|
||||
|
||||
cudaEventDestroy(start);
|
||||
cudaEventDestroy(stop);
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
cudaEventRecord(start, 0);
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
hipEventRecord(start, 0);
|
||||
#endif
|
||||
|
||||
/*cout << "despues" << endl;
|
||||
@ -482,9 +482,9 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
|
||||
*ret = fres;
|
||||
|
||||
#ifdef TIMER
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventElapsedTime(&time, start, stop);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
hipEventElapsedTime(&time, start, stop);
|
||||
cuda_stats.join_time += time;
|
||||
#endif
|
||||
|
||||
|
40
packages/cuda/lista.cu
Executable file → Normal file
40
packages/cuda/lista.cu
Executable file → Normal file
@ -967,7 +967,7 @@ vector<gpunode> L;
|
||||
extern "C"
|
||||
int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr, int *inpquery, int **result, char *names, int finalDR)
|
||||
{
|
||||
cudaSetDevice(0);
|
||||
hipSetDevice(0);
|
||||
vector<rulenode> rules;
|
||||
int x;
|
||||
|
||||
@ -1029,11 +1029,11 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
|
||||
vector<rulenode>::iterator qposr;
|
||||
|
||||
#if TIMER
|
||||
cudaEvent_t start, stop;
|
||||
hipEvent_t start, stop;
|
||||
float time;
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
cudaEventRecord(start, 0);
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
hipEventRecord(start, 0);
|
||||
#endif
|
||||
|
||||
while(reglas.size()) /*Here's the main loop*/
|
||||
@ -1084,7 +1084,7 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
|
||||
{
|
||||
num_refs = rows1 * cols1 * sizeof(int);
|
||||
reservar(&res, num_refs);
|
||||
cudaMemcpyAsync(res, dop1, num_refs, cudaMemcpyDeviceToDevice);
|
||||
hipMemcpyAsync(res, dop1, num_refs, hipMemcpyDeviceToDevice);
|
||||
registrar(rul_act->name, cols1, res, rows1, itr, 1);
|
||||
genflag = 1;
|
||||
rul_act->gen_ant = rul_act->gen_act;
|
||||
@ -1251,10 +1251,10 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
|
||||
if(x == num_refs)
|
||||
{
|
||||
#ifdef TIMER
|
||||
cudaEvent_t start2, stop2;
|
||||
cudaEventCreate(&start2);
|
||||
cudaEventCreate(&stop2);
|
||||
cudaEventRecord(start2, 0);
|
||||
hipEvent_t start2, stop2;
|
||||
hipEventCreate(&start2);
|
||||
hipEventCreate(&stop2);
|
||||
hipEventRecord(start2, 0);
|
||||
#endif
|
||||
|
||||
//cout << rul_act->name << " res_rows = " << res_rows << endl;
|
||||
@ -1263,11 +1263,11 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
|
||||
res_rows = unir(res, res_rows, rul_act->num_columns, &res, 0);
|
||||
|
||||
#ifdef TIMER
|
||||
cudaEventRecord(stop2, 0);
|
||||
cudaEventSynchronize(stop2);
|
||||
cudaEventElapsedTime(&time, start2, stop2);
|
||||
cudaEventDestroy(start2);
|
||||
cudaEventDestroy(stop2);
|
||||
hipEventRecord(stop2, 0);
|
||||
hipEventSynchronize(stop2);
|
||||
hipEventElapsedTime(&time, start2, stop2);
|
||||
hipEventDestroy(start2);
|
||||
hipEventDestroy(stop2);
|
||||
//cout << "Union = " << time << endl;
|
||||
cuda_stats.union_time += time;
|
||||
#endif
|
||||
@ -1319,16 +1319,16 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
|
||||
#endif
|
||||
|
||||
#if TIMER
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventElapsedTime(&time, start, stop);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
hipEventElapsedTime(&time, start, stop);
|
||||
cuda_stats.total_time += time;
|
||||
if (time > cuda_stats.max_time)
|
||||
cuda_stats.max_time = time;
|
||||
if (time < cuda_stats.min_time || cuda_stats.calls == 1)
|
||||
cuda_stats.min_time = time;
|
||||
cudaEventDestroy(start);
|
||||
cudaEventDestroy(stop);
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
Cuda_Statistics();
|
||||
#endif
|
||||
|
||||
|
0
packages/cuda/lista.h
Executable file → Normal file
0
packages/cuda/lista.h
Executable file → Normal file
44
packages/cuda/memory.cu
Executable file → Normal file
44
packages/cuda/memory.cu
Executable file → Normal file
@ -144,7 +144,7 @@ void limpiar(const char s[], size_t sz)
|
||||
|
||||
if(GPUmem.size() == 0)
|
||||
{
|
||||
cudaMemGetInfo(&free,&total);
|
||||
hipMemGetInfo(&free,&total);
|
||||
cerr << s << ": not enough GPU memory: have " << free << " of " << total << ", need " << sz << " bytes." << endl;
|
||||
exit(1);
|
||||
}
|
||||
@ -154,11 +154,11 @@ void limpiar(const char s[], size_t sz)
|
||||
{
|
||||
temp = *ini;
|
||||
temp.dev_address = (int *)malloc(ini->size);
|
||||
cudaMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
|
||||
hipMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, hipMemcpyDeviceToHost);
|
||||
list<memnode>::iterator pos = lower_bound(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
|
||||
CPUmem.insert(pos, temp);
|
||||
}
|
||||
cudaFree(ini->dev_address);
|
||||
hipFree(ini->dev_address);
|
||||
GPUmem.erase(ini);
|
||||
}
|
||||
|
||||
@ -173,19 +173,19 @@ void reservar(int **ptr, size_t size)
|
||||
return;
|
||||
}
|
||||
|
||||
cudaMemGetInfo(&free, &total);
|
||||
hipMemGetInfo(&free, &total);
|
||||
while(free < size)
|
||||
{
|
||||
cout << "Se limpio memoria " << free << " " << total << endl;
|
||||
limpiar("not enough memory", size);
|
||||
cudaMemGetInfo(&free, &total);
|
||||
hipMemGetInfo(&free, &total);
|
||||
}
|
||||
|
||||
while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
|
||||
while(hipMalloc(ptr, size) == hipErrorMemoryAllocation)
|
||||
limpiar("Error in memory allocation", size);
|
||||
if (! *ptr ) {
|
||||
size_t free, total;
|
||||
cudaMemGetInfo( &free, &total );
|
||||
hipMemGetInfo( &free, &total );
|
||||
cerr << "Could not allocate " << size << " bytes, only " << free << " avaliable from total of " << total << " !!!" << endl;
|
||||
cerr << "Exiting CUDA...." << endl;
|
||||
exit(1);
|
||||
@ -277,7 +277,7 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
|
||||
}
|
||||
size = num_rows * num_columns * sizeof(int);
|
||||
reservar(&temp, size);
|
||||
cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
|
||||
hipMemcpyAsync(temp, address_host_table, size, hipMemcpyHostToDevice);
|
||||
registrar(name, num_columns, temp, num_rows, itr, 0);
|
||||
*ptr = temp;
|
||||
return num_rows;
|
||||
@ -296,13 +296,13 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
|
||||
reservar(&temp, size);
|
||||
for(x = 0; x < numgpu; x++)
|
||||
{
|
||||
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
|
||||
hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyDeviceToDevice);
|
||||
inc += temp_storage[x].size / sizeof(int);
|
||||
cudaFree(temp_storage[x].dev_address);
|
||||
hipFree(temp_storage[x].dev_address);
|
||||
}
|
||||
for(; x < numcpu; x++)
|
||||
{
|
||||
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
|
||||
hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyHostToDevice);
|
||||
inc += temp_storage[x].size / sizeof(int);
|
||||
free(temp_storage[x].dev_address);
|
||||
}
|
||||
@ -340,9 +340,9 @@ int cargarcpu(int name, int num_rows, int num_columns, int is_fact, int *address
|
||||
temp = (int *)malloc(size);
|
||||
for(x = 0; x < numgpu; x++)
|
||||
{
|
||||
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToHost);
|
||||
hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyDeviceToHost);
|
||||
inc += temp_storage[x].size / sizeof(int);
|
||||
cudaFree(temp_storage[x].dev_address);
|
||||
hipFree(temp_storage[x].dev_address);
|
||||
}
|
||||
for(; x < numcpu; x++)
|
||||
{
|
||||
@ -404,7 +404,7 @@ int cargafinal(int name, int cols, int **ptr)
|
||||
cont = pos->rows;
|
||||
#ifdef TUFFY
|
||||
reservar(&temp, pos->size);
|
||||
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyHostToDevice);
|
||||
*ptr = temp;
|
||||
#else
|
||||
*ptr = pos->dev_address;
|
||||
@ -418,14 +418,14 @@ int cargafinal(int name, int cols, int **ptr)
|
||||
pos = gpu;
|
||||
while(pos != endg && pos->name == name)
|
||||
{
|
||||
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyDeviceToDevice);
|
||||
hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyDeviceToDevice);
|
||||
temp += pos->size / sizeof(int);
|
||||
pos++;
|
||||
}
|
||||
pos = cpu;
|
||||
while(pos != endc && pos->name == name)
|
||||
{
|
||||
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyHostToDevice);
|
||||
temp += pos->size / sizeof(int);
|
||||
pos++;
|
||||
}
|
||||
@ -493,7 +493,7 @@ void clear_memory()
|
||||
{
|
||||
if(ini->isrule)
|
||||
{
|
||||
cudaFree(ini->dev_address);
|
||||
hipFree(ini->dev_address);
|
||||
ini = GPUmem.erase(ini);
|
||||
}
|
||||
else
|
||||
@ -518,7 +518,7 @@ void clear_memory_all()
|
||||
fin = GPUmem.end();
|
||||
while(ini != fin)
|
||||
{
|
||||
cudaFree(ini->dev_address);
|
||||
hipFree(ini->dev_address);
|
||||
ini++;
|
||||
}
|
||||
GPUmem.clear();
|
||||
@ -542,7 +542,7 @@ void liberar(int name)
|
||||
{
|
||||
fact = *i;
|
||||
GPUmem.erase(i);
|
||||
cudaFree(fact.dev_address);
|
||||
hipFree(fact.dev_address);
|
||||
}
|
||||
i = buscarhecho(CPUmem.begin(), CPUmem.end(), name);
|
||||
if(i != CPUmem.end())
|
||||
@ -566,10 +566,10 @@ void sumar(int name, int *dop1, int cols, int rows)
|
||||
newrows = rows + fact.rows;
|
||||
reservar(&res, newrows * cols * sizeof(int));
|
||||
offset = fact.rows * cols;
|
||||
cudaMemcpyAsync(res, fact.dev_address, offset * sizeof(int), cudaMemcpyDeviceToDevice);
|
||||
hipMemcpyAsync(res, fact.dev_address, offset * sizeof(int), hipMemcpyDeviceToDevice);
|
||||
GPUmem.erase(i);
|
||||
registrar(name, cols, res, newrows, 0, 0);
|
||||
cudaMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), cudaMemcpyDeviceToDevice);
|
||||
cudaFree(fact.dev_address);
|
||||
hipMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), hipMemcpyDeviceToDevice);
|
||||
hipFree(fact.dev_address);
|
||||
}
|
||||
}
|
||||
|
0
packages/cuda/memory.h
Executable file → Normal file
0
packages/cuda/memory.h
Executable file → Normal file
601
packages/cuda/old/cuda.c
Executable file
601
packages/cuda/old/cuda.c
Executable file
@ -0,0 +1,601 @@
|
||||
|
||||
// interface to CUDD Datalog evaluation
|
||||
#include "config.h"
|
||||
#include "YapInterface.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include "pred.h"
|
||||
|
||||
#define MAXARG 100
|
||||
|
||||
YAP_Atom AtomEq,
|
||||
AtomGt,
|
||||
AtomLt,
|
||||
AtomGe,
|
||||
AtomLe,
|
||||
AtomDf,
|
||||
AtomNt;
|
||||
|
||||
predicate *facts[MAXARG]; /*Temporary solution to maintain facts and rules*/
|
||||
predicate *rules[MAXARG];
|
||||
int32_t cf = 0, cr = 0;
|
||||
|
||||
char names[1024];
|
||||
|
||||
// initialize CUDA system
|
||||
void Cuda_Initialize( void );
|
||||
|
||||
// add/replace a set of facts for predicate pred
|
||||
int32_t Cuda_NewFacts(predicate *pred);
|
||||
|
||||
// add/replace a rule for predicate pred
|
||||
int32_t Cuda_NewRule(predicate *pred);
|
||||
|
||||
// erase predicate pred
|
||||
int32_t Cuda_Erase(predicate *pred);
|
||||
|
||||
// evaluate predicate pred, mat is bound to a vector of solutions, and
|
||||
// output the count
|
||||
//int32_t Cuda_Eval(predicate *pred, int32_t **mat); This functions arguments were changed, please see pred.h
|
||||
|
||||
void init_cuda( void );
|
||||
|
||||
//#define DEBUG_INTERFACE 1
|
||||
|
||||
#ifdef ROCKIT
|
||||
static int32_t query[100];
|
||||
static int32_t qcont = 0;
|
||||
static int cuda_init_query(void)
|
||||
{
|
||||
int32_t pname = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG1));
|
||||
query[qcont] = pname;
|
||||
qcont++;
|
||||
query[qcont] = 0;
|
||||
return TRUE;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if DEBUG_INTERFACE
|
||||
static void
|
||||
dump_mat(int32_t mat[], int32_t nrows, int32_t ncols)
|
||||
{
|
||||
return;
|
||||
int32_t i, j;
|
||||
for ( i=0; i< nrows; i++) {
|
||||
printf("%d", mat[i*ncols]);
|
||||
for (j=1; j < ncols; j++) {
|
||||
printf(", %d", mat[i*ncols+j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
dump_vec(int32_t vec[], int32_t rows)
|
||||
{
|
||||
int32_t i = 1;
|
||||
int32_t j = 0;
|
||||
|
||||
for (j = 0; j < rows; j++) {
|
||||
for ( ; vec[i]; i++ ) {
|
||||
printf(", %d", vec[i]);
|
||||
}
|
||||
printf(", 0");
|
||||
i++;
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
#endif /* DEBUG_INTERFACE */
|
||||
|
||||
|
||||
// stubs, will point at Carlos code.
|
||||
|
||||
void Cuda_Initialize( void )
|
||||
{
|
||||
}
|
||||
|
||||
int32_t Cuda_NewFacts(predicate *pe)
|
||||
{
|
||||
#if DEBUG_INTERFACE
|
||||
dump_mat( pe->address_host_table, pe->num_rows, pe->num_columns );
|
||||
#endif
|
||||
|
||||
#ifdef ROCKIT
|
||||
if(cf >= 0)
|
||||
{
|
||||
facts[cf] = pe;
|
||||
cf++;
|
||||
}
|
||||
#else
|
||||
facts[cf] = pe;
|
||||
cf++;
|
||||
#endif
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
int32_t Cuda_NewRule(predicate *pe)
|
||||
{
|
||||
#if DEBUG_INTERFACE
|
||||
dump_vec( pe->address_host_table, pe->num_rows);
|
||||
#endif
|
||||
rules[cr] = pe;
|
||||
cr++;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
int32_t Cuda_Erase(predicate *pe)
|
||||
{
|
||||
int i = 0;
|
||||
while ( rules[i] != pe )
|
||||
i++;
|
||||
while (i < cr-1) {
|
||||
rules[i] = rules[i+1];
|
||||
i++;
|
||||
}
|
||||
rules[i] = NULL;
|
||||
cr--;
|
||||
if (pe->address_host_table)
|
||||
free( pe->address_host_table );
|
||||
free( pe );
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static int
|
||||
load_facts( void ) {
|
||||
|
||||
int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
|
||||
int32_t ncols = YAP_IntOfTerm(YAP_ARG2), i = 0;
|
||||
YAP_Term t3 = YAP_ARG3;
|
||||
int32_t *mat = (int32_t *)malloc(sizeof(int32_t)*nrows*ncols);
|
||||
int32_t pname = YAP_AtomToInt(YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3))));
|
||||
predicate *pred;
|
||||
|
||||
while(YAP_IsPairTerm(t3)) {
|
||||
int32_t j = 0;
|
||||
YAP_Term th = YAP_HeadOfTerm(t3);
|
||||
|
||||
for (j = 0; j < ncols; j++) {
|
||||
YAP_Term ta = YAP_ArgOfTerm(j+1, th);
|
||||
if (YAP_IsAtomTerm(ta)) {
|
||||
mat[i*ncols+j] = YAP_AtomToInt(YAP_AtomOfTerm(ta));
|
||||
} else {
|
||||
mat[i*ncols+j] = YAP_IntOfTerm(ta);
|
||||
}
|
||||
}
|
||||
t3 = YAP_TailOfTerm( t3 );
|
||||
i++;
|
||||
}
|
||||
if (YAP_IsVarTerm( YAP_ARG4)) {
|
||||
// new
|
||||
pred = (predicate *)malloc(sizeof(predicate));
|
||||
} else {
|
||||
pred = (predicate *)YAP_IntOfTerm(YAP_ARG4);
|
||||
if (pred->address_host_table)
|
||||
free( pred->address_host_table );
|
||||
}
|
||||
pred->name = pname;
|
||||
pred->num_rows = nrows;
|
||||
pred->num_columns = ncols;
|
||||
pred->is_fact = TRUE;
|
||||
pred->address_host_table = mat;
|
||||
Cuda_NewFacts(pred);
|
||||
if (YAP_IsVarTerm( YAP_ARG4)) {
|
||||
return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
|
||||
} else {
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
static int currentFact = 0;
|
||||
static predicate *currentPred = NULL;
|
||||
|
||||
static int
|
||||
cuda_init_facts( void ) {
|
||||
|
||||
int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
|
||||
int32_t ncols = YAP_IntOfTerm(YAP_ARG2);
|
||||
int32_t *mat = (int32_t *)malloc(sizeof(int32_t)*nrows*ncols);
|
||||
int32_t pname = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG3));
|
||||
predicate *pred;
|
||||
|
||||
strcat(names, YAP_AtomName(YAP_AtomOfTerm(YAP_ARG3)));
|
||||
strcat(names, " ");
|
||||
|
||||
if (!mat)
|
||||
return FALSE;
|
||||
if (YAP_IsVarTerm( YAP_ARG4)) {
|
||||
// new
|
||||
pred = (predicate *)malloc(sizeof(predicate));
|
||||
} else {
|
||||
pred = (predicate *)YAP_IntOfTerm(YAP_ARG4);
|
||||
if (pred->address_host_table)
|
||||
free( pred->address_host_table );
|
||||
}
|
||||
pred->name = pname;
|
||||
pred->num_rows = nrows;
|
||||
pred->num_columns = ncols;
|
||||
pred->is_fact = TRUE;
|
||||
pred->address_host_table = mat;
|
||||
currentPred = pred;
|
||||
currentFact = 0;
|
||||
|
||||
if (YAP_IsVarTerm( YAP_ARG4)) {
|
||||
return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
|
||||
} else {
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
cuda_load_fact( void ) {
|
||||
|
||||
int i = currentFact;
|
||||
|
||||
#if defined(DATALOG) || defined(TUFFY)
|
||||
YAP_Term th = YAP_ARG1;
|
||||
int ncols = currentPred->num_columns;
|
||||
int j;
|
||||
int *mat = currentPred->address_host_table;
|
||||
for (j = 0; j < ncols; j++) {
|
||||
YAP_Term ta = YAP_ArgOfTerm(j+1, th);
|
||||
if (YAP_IsAtomTerm(ta)) {
|
||||
mat[i*ncols+j] = YAP_AtomToInt(YAP_AtomOfTerm(ta));
|
||||
} else {
|
||||
mat[i*ncols+j] = YAP_IntOfTerm(ta);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
i++;
|
||||
if (i == currentPred->num_rows) {
|
||||
Cuda_NewFacts(currentPred);
|
||||
currentPred = NULL;
|
||||
currentFact = 0;
|
||||
} else {
|
||||
currentFact = i;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static int
|
||||
load_rule( void ) {
|
||||
// maximum of 2K symbols per rule, should be enough for ILP
|
||||
int32_t vec[2048], *ptr = vec, *nvec, neg[2048];
|
||||
// qK different variables;
|
||||
YAP_Term vars[1024];
|
||||
int32_t nvars = 0, x;
|
||||
int32_t ngoals = YAP_IntOfTerm(YAP_ARG1); /* gives the number of goals */
|
||||
int32_t ncols = YAP_IntOfTerm(YAP_ARG2);
|
||||
YAP_Term t3 = YAP_ARG3;
|
||||
YAP_Atom name = YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3)));
|
||||
int32_t pname = YAP_AtomToInt(name);
|
||||
|
||||
const char *strname = YAP_AtomName(name);
|
||||
predicate *pred;
|
||||
int32_t cont = 0;
|
||||
memset(neg, 0x0, 2048 * sizeof(int32_t));
|
||||
|
||||
while(YAP_IsPairTerm(t3)) {
|
||||
int32_t j = 0, m;
|
||||
YAP_Term th = YAP_HeadOfTerm(t3);
|
||||
YAP_Functor f = YAP_FunctorOfTerm( th );
|
||||
int32_t n = YAP_ArityOfFunctor( f );
|
||||
YAP_Atom at = YAP_NameOfFunctor( f );
|
||||
|
||||
if (at == AtomEq)
|
||||
*ptr++ = SBG_EQ;
|
||||
else if (at == AtomGt)
|
||||
*ptr++ = SBG_GT;
|
||||
else if (at == AtomLt)
|
||||
*ptr++ = SBG_LT;
|
||||
else if (at == AtomGe)
|
||||
*ptr++ = SBG_GE;
|
||||
else if (at == AtomLe)
|
||||
*ptr++ = SBG_LE;
|
||||
else if (at == AtomDf)
|
||||
*ptr++ = SBG_DF;
|
||||
else if (at == AtomNt)
|
||||
{
|
||||
neg[cont] = 1;
|
||||
cont++;
|
||||
}
|
||||
else
|
||||
{
|
||||
*ptr++ = YAP_AtomToInt( at );
|
||||
cont++;
|
||||
}
|
||||
|
||||
for (j = 0; j < n; j++) {
|
||||
YAP_Term ta = YAP_ArgOfTerm(j+1, th);
|
||||
|
||||
if (YAP_IsVarTerm(ta)) {
|
||||
int32_t k;
|
||||
for (k = 0; k< nvars; k++) {
|
||||
if (vars[k] == ta) {
|
||||
*ptr++ = k+1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (k == nvars) {
|
||||
vars[k] = ta;
|
||||
*ptr++ = k+1;
|
||||
nvars++;
|
||||
}
|
||||
} else if (YAP_IsAtomTerm(ta)) {
|
||||
*ptr++ = -YAP_AtomToInt(YAP_AtomOfTerm(ta));
|
||||
} else if (YAP_IsApplTerm(ta)) {
|
||||
f = YAP_FunctorOfTerm( ta );
|
||||
at = YAP_NameOfFunctor( f );
|
||||
m = YAP_ArityOfFunctor( f );
|
||||
*ptr++ = YAP_AtomToInt( at );
|
||||
|
||||
for (x = 0; x < m; x++) {
|
||||
YAP_Term ta2 = YAP_ArgOfTerm(x+1, ta);
|
||||
|
||||
if (YAP_IsVarTerm(ta2)) {
|
||||
int32_t k;
|
||||
for (k = 0; k < nvars; k++) {
|
||||
if (vars[k] == ta2) {
|
||||
*ptr++ = k+1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (k == nvars) {
|
||||
vars[k] = ta2;
|
||||
*ptr++ = k+1;
|
||||
nvars++;
|
||||
}
|
||||
} else if (YAP_IsAtomTerm(ta2)) {
|
||||
*ptr++ = -YAP_AtomToInt(YAP_AtomOfTerm(ta));
|
||||
} else {
|
||||
*ptr++ = -YAP_IntOfTerm(ta);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
*ptr++ = -YAP_IntOfTerm(ta);
|
||||
}
|
||||
}
|
||||
*ptr++ = 0;
|
||||
t3 = YAP_TailOfTerm( t3 );
|
||||
}
|
||||
if (YAP_IsVarTerm( YAP_ARG4)) {
|
||||
// new
|
||||
pred = (predicate *)malloc(sizeof(predicate));
|
||||
} else {
|
||||
pred = (predicate *)YAP_IntOfTerm(YAP_ARG4);
|
||||
if (pred->address_host_table)
|
||||
free( pred->address_host_table );
|
||||
}
|
||||
pred->name = pname;
|
||||
pred->num_rows = ngoals;
|
||||
pred->num_columns = ncols;
|
||||
pred->is_fact = FALSE;
|
||||
x = (strlen(strname) + 1) * sizeof(char);
|
||||
pred->predname = (char *)malloc(x);
|
||||
memcpy(pred->predname, strname, x);
|
||||
nvec = (int32_t *)malloc(sizeof(int32_t)*(ptr-vec));
|
||||
memcpy(nvec, vec, sizeof(int32_t)*(ptr-vec));
|
||||
pred->address_host_table = nvec;
|
||||
pred->negatives = (int32_t *)malloc(sizeof(int32_t) * cont);
|
||||
memcpy(pred->negatives, neg, sizeof(int32_t) * cont);
|
||||
Cuda_NewRule( pred );
|
||||
return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
|
||||
}
|
||||
|
||||
static int
|
||||
cuda_erase( void )
|
||||
{
|
||||
predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
|
||||
return Cuda_Erase( ptr );
|
||||
}
|
||||
|
||||
void setQuery(YAP_Term t1, int32_t **res)
|
||||
{
|
||||
int32_t *query = (int32_t *)malloc(MAXARG * sizeof(int32_t));
|
||||
int32_t x, y = 0, *itr;
|
||||
predicate *ptr = NULL;
|
||||
if(YAP_IsPairTerm(t1))
|
||||
{
|
||||
while(YAP_IsPairTerm(t1))
|
||||
{
|
||||
ptr = (predicate *)YAP_IntOfTerm(YAP_HeadOfTerm(t1));
|
||||
query[y] = ptr->name;
|
||||
itr = ptr->address_host_table;
|
||||
x = 2;
|
||||
while(itr[x] != 0)
|
||||
x++;
|
||||
query[y+1] = itr[x+1];
|
||||
t1 = YAP_TailOfTerm(t1);
|
||||
y+=2;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ptr = (predicate *)YAP_IntOfTerm(t1);
|
||||
query[y] = ptr->name;
|
||||
itr = ptr->address_host_table;
|
||||
x = 2;
|
||||
while(itr[x] != 0)
|
||||
x++;
|
||||
query[y+1] = itr[x+1];
|
||||
y += 2;
|
||||
}
|
||||
query[y] = -1;
|
||||
query[y+1] = -1;
|
||||
*res = query;
|
||||
}
|
||||
|
||||
static int
cuda_eval( void )
{
  /* cuda_eval/3: evaluate the loaded program for the query in ARG1 and, in
     the plain-Datalog build, unify ARG2 with the list of answer tuples.
     ARG3 tells the engine whether to run the final duplicate-removal pass. */
  int32_t *mat;

#if defined(DATALOG) || defined(TUFFY)
  int32_t *query = NULL;
  setQuery(YAP_ARG1, &query);
#endif
  /* NOTE(review): `query` is declared only under DATALOG/TUFFY but used
     unconditionally below — confirm ROCKIT builds also define one of those
     macros. It is also never freed (setQuery mallocs it). */

  int32_t finalDR = YAP_IntOfTerm(YAP_ARG3);
  int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, names, finalDR);

#ifdef TUFFY
  cf = 0;
#endif
#ifdef ROCKIT
  if(cf > 0)
    cf *= -1;
#endif
#if defined(TUFFY) || defined(ROCKIT)
  /* MLN back ends consume results through the database: reset per-run
     counters and report failure to Prolog. */
  cr = 0;
  names[0] = '\0';
  return FALSE;
#else
  int32_t i;
  predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
  int32_t ncols = ptr->num_columns;
  YAP_Term out = YAP_TermNil();
  YAP_Functor f = YAP_MkFunctor(YAP_IntToAtom(ptr->name), ncols);
  YAP_Term vec[256];   /* one cell per column; assumes ncols <= 256 */

  YAP_Atom at;

  if (n < 0)
    return FALSE;
  /* Build the answer list back-to-front (mat row (n-1)-i) so the Prolog list
     comes out in engine order, printing each tuple on stdout as we go. */
  for (i=0; i<n; i++) {
    int32_t ni = ((n-1)-i)*ncols, j;

    printf("%s(", YAP_AtomName(YAP_IntToAtom(ptr->name)));

    for (j=0; j<ncols; j++) {
      vec[j] = YAP_MkIntTerm(mat[ni+j]);

      /* values that map back to an atom are printed symbolically */
      at = YAP_IntToAtom(mat[ni+j]);
      if(at != NULL)
        printf("%s", YAP_AtomName(at));
      else
        printf("%d", mat[ni+j]);
      if(j < (ncols - 1))
        printf(",");

    }
    out = YAP_MkPairTerm(YAP_MkApplTerm( f, ncols, vec ), out);

    printf(")\n");

  }
  /* mat is only valid when the engine produced rows */
  if (n > 0)
    free( mat );
  return YAP_Unify(YAP_ARG2, out);
#endif
}
|
||||
|
||||
static int
cuda_coverage( void )
{
  /* cuda_coverage/4: evaluate the query in ARG1, whose n answers are
     (type, value) pairs grouped by type, and unify ARG3/ARG4 with the number
     of tuples whose type matches / does not match the atom in ARG2
     (positive vs. negative examples, ILP-style). */
  int32_t *mat;

#if defined(DATALOG) || defined(TUFFY)
  int32_t *query = NULL;
  setQuery(YAP_ARG1, &query);
#endif

  int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, 0, 0);
  int32_t post = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG2));  /* "positive" tag */
  int32_t i = n/2, min = 0, max = n-1;
  int32_t t0, t1;

  if (n < 0)
    return FALSE;
  if (n == 0) {
    /* no answers: zero coverage on both sides */
    return YAP_Unify(YAP_ARG4, YAP_MkIntTerm(0)) &&
      YAP_Unify(YAP_ARG3, YAP_MkIntTerm(0));
  }
  /* each tuple occupies two ints: mat[2k] = type, mat[2k+1] = value */
  t0 = mat[0], t1 = mat[(n-1)*2];
  if (t0 == t1) { /* all sametype */
    free( mat );
    /* all pos */
    if (t0 == post)
      return YAP_Unify(YAP_ARG3, YAP_MkIntTerm(n)) &&
        YAP_Unify(YAP_ARG4, YAP_MkIntTerm(0));
    /* all neg */
    return YAP_Unify(YAP_ARG4, YAP_MkIntTerm(n)) &&
      YAP_Unify(YAP_ARG3, YAP_MkIntTerm(0));
  }
  /* Binary search for the boundary index `max` where the type flips from t0
     to t1; assumes the pairs are sorted/grouped by type. */
  do {
    i = (min+max)/2;
    if (i == min) i++;
    if (mat[i*2] == t0) {
      min = i;
    } else {
      max = i;
    }
    if (min+1 == max) {
      free( mat );
      /* `max` tuples carry t0, the remaining n-max carry t1 */
      if (t0 == post)
        return YAP_Unify(YAP_ARG3, YAP_MkIntTerm(max)) &&
          YAP_Unify(YAP_ARG4, YAP_MkIntTerm(n-max));
      /* all neg */
      return YAP_Unify(YAP_ARG4, YAP_MkIntTerm(max)) &&
        YAP_Unify(YAP_ARG3, YAP_MkIntTerm(n-max));
    }
  } while ( TRUE );
}
|
||||
|
||||
static int cuda_count( void )
|
||||
{
|
||||
int32_t *mat;
|
||||
|
||||
#if defined(DATALOG) || defined(TUFFY)
|
||||
int32_t *query = NULL;
|
||||
setQuery(YAP_ARG1, &query);
|
||||
#endif
|
||||
|
||||
int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, 0, 0);
|
||||
|
||||
if (n < 0)
|
||||
return FALSE;
|
||||
free( mat );
|
||||
return YAP_Unify(YAP_ARG2, YAP_MkIntTerm(n));
|
||||
}
|
||||
|
||||
static int cuda_statistics( void )
{
  /* cuda_statistics/0: ask the engine to report its runtime statistics;
     always succeeds. */
  Cuda_Statistics();
  return TRUE;
}
|
||||
|
||||
static int first_time = TRUE;
|
||||
|
||||
void
|
||||
init_cuda(void)
|
||||
{
|
||||
if (first_time) Cuda_Initialize();
|
||||
first_time = FALSE;
|
||||
|
||||
AtomEq = YAP_LookupAtom("=");
|
||||
AtomGt = YAP_LookupAtom(">");
|
||||
AtomLt = YAP_LookupAtom("<");
|
||||
AtomGe = YAP_LookupAtom(">=");
|
||||
AtomLe = YAP_LookupAtom("=<");
|
||||
AtomDf = YAP_LookupAtom("\\=");
|
||||
AtomNt = YAP_LookupAtom("not");
|
||||
YAP_UserCPredicate("load_facts", load_facts, 4);
|
||||
YAP_UserCPredicate("cuda_init_facts", cuda_init_facts, 4);
|
||||
YAP_UserCPredicate("cuda_load_fact", cuda_load_fact, 1);
|
||||
YAP_UserCPredicate("load_rule", load_rule, 4);
|
||||
YAP_UserCPredicate("cuda_erase", cuda_erase, 1);
|
||||
YAP_UserCPredicate("cuda_eval", cuda_eval, 3);
|
||||
YAP_UserCPredicate("cuda_coverage", cuda_coverage, 4);
|
||||
YAP_UserCPredicate("cuda_count", cuda_count, 2);
|
||||
YAP_UserCPredicate("cuda_statistics", cuda_statistics, 0);
|
||||
|
||||
#ifdef ROCKIT
|
||||
YAP_UserCPredicate("cuda_init_query", cuda_init_query, 1);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
603
packages/cuda/old/dbio.cu
Normal file
603
packages/cuda/old/dbio.cu
Normal file
@ -0,0 +1,603 @@
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <stdio.h>
|
||||
#include "memory.h"
|
||||
#include "union2.h"
|
||||
#include "dbio.h"
|
||||
|
||||
#ifdef DATALOG
|
||||
//template<class InputIterator>
|
||||
//void datalogWrite(int query, InputIterator rul_str, InputIterator fin, int finalDR, int **result)
|
||||
void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, int finalDR, int **result)
{
	/* Fetch the final result table of `query` from the engine into host
	   memory at *result. cargafinal()'s row count is signed: positive means
	   dop1 is a device pointer (copied back with cudaMemcpy), negative means
	   the data already lives in host memory. When finalDR is set, a final
	   duplicate-elimination pass (unir) is applied first.
	   NOTE(review): *result and the caller-visible row count are untouched
	   when res_rows == 0; cudaMemcpy/cudaFree return codes are unchecked. */
	rulenode tmprule;
	vector<rulenode>::iterator qposr;
	int *dop1, *hres;
	int cols1, res_rows, tipo;   /* tipo = table size in bytes */
	tmprule.name = query;
	/* rules are kept sorted by name (comparer), so binary search works */
	qposr = lower_bound(rul_str, fin, tmprule, comparer);
	cols1 = qposr->num_columns;
	res_rows = cargafinal(query, cols1, &dop1);

	if(res_rows != 0)
	{
		if(res_rows > 0)
		{
			/* result on device: optionally dedupe, then copy to host */
			if(finalDR)
				res_rows = unir(dop1, res_rows, cols1, &dop1, 0);
			tipo = res_rows * cols1 * sizeof(int);
			hres = (int *)malloc(tipo);
			cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
			cudaFree(dop1);
			*result = hres;
		}
		else
		{
			res_rows *= -1;
			if(finalDR)
			{
				/* result already on host, but unir() works on device memory:
				   round-trip it through a temporary device buffer */
				int *dop2;
				tipo = res_rows * cols1 * sizeof(int);
				reservar(&dop2, tipo);
				cudaMemcpy(dop2, dop1, tipo, cudaMemcpyHostToDevice);
				free(dop1);
				res_rows = unir(dop2, res_rows, cols1, &dop2, 0);
				tipo = res_rows * cols1 * sizeof(int);
				hres = (int *)malloc(tipo);
				cudaMemcpy(hres, dop2, tipo, cudaMemcpyDeviceToHost);
				cudaFree(dop2);
				*result = hres;
			}
			else
				*result = dop1;
		}
	}
}
|
||||
#endif
|
||||
|
||||
#ifdef TUFFY
|
||||
void postgresRead(PGconn **ret, vector<gpunode> *L, int *inpquery, char *names, int finalDR)
{
	/* Tuffy back end: load predicate tables from PostgreSQL into the
	   host-side gpunode vector L. `names` is the space-separated table-name
	   list (consumed destructively by strtok). Tables whose name starts with
	   'c' are clause tables; others are atom tables, read together with the
	   following entry of L. Returns the open connection through *ret.
	   NOTE(review): sprintf(sel, "%s from %s", sel, tok) uses `sel` as both
	   source and destination — undefined behavior per the C standard; build
	   into a second buffer instead.
	   NOTE(review): `qrs` is allocated under finalDR but never returned or
	   freed; PQexec results are not checked for errors. */
	PGresult *pgr;
	int x, y;
	int *mat, *mat2;
	char *tok, sel[1024], **qrs;
	int w, z = 0, numt, numc, numc2, start = 0, start2, val;
	PGconn *conn = PQconnectdb("host=localhost port=5432 dbname = prueba user=tuffer password=root");
	if(PQstatus(conn) != CONNECTION_OK)
	{
		fprintf(stderr, "Connection to database failed: %s", PQerrorMessage(conn));
		exit(1);
	}

	/* switch to the most recently created schema */
	pgr = PQexec(conn, "Select nspname from pg_catalog.pg_namespace where oid = (select max(oid) from pg_catalog.pg_namespace)");
	sprintf(sel, "SET search_path = %s", PQgetvalue(pgr, 0, 0));
	PQclear(pgr);
	PQexec(conn, sel);
	tok = strtok(names, " ");
	if(finalDR)
	{
		/* final pass: reload tables and remember which table backs each
		   query predicate listed in inpquery */
		qrs = (char **)malloc(100 * sizeof(char *));
		while(tok != NULL)
		{
			/* empty select just to obtain the column names */
			sprintf(sel, "Select * from %s limit 0", tok);
			pgr = PQexec(conn, sel);
			numc = L->at(z).num_columns;
			if(tok[0] == 'c')
			{
				sprintf(sel, "Select ");
				numt = numc + 1;
				for(x = 1; x < numt; x++)
				{
					strcat(sel, PQfname(pgr, x));
					strcat(sel, ", ");
				}
				sel[strlen(sel)-2] = '\0';
				sprintf(sel, "%s from %s", sel, tok);
			}
			else
			{
				/* atom tables: skip the internal columns 2..7 */
				sprintf(sel, "Select id, Club, ");
				numt = numc + 6;
				for(x = 8; x < numt; x++)
				{
					strcat(sel, PQfname(pgr, x));
					strcat(sel, ", ");
				}
				sel[strlen(sel)-2] = '\0';
				sprintf(sel, "%s from %s", sel, tok);
			}
			PQclear(pgr);
			pgr = PQexec(conn, sel);
			numt = PQntuples(pgr);
			mat = (int *)malloc(numt * numc * sizeof(int));
			if(tok[0] == 'c')
			{
				/* clause table: straight row-major copy */
				for(x = 0; x < numt; x++)
				{
					start = x * numc;
					for(y = 0; y < numc; y++)
						mat[start + y] = atoi(PQgetvalue(pgr, x, y));
				}
			}
			else
			{
				/* atom table: split rows between this node (mat) and the
				   companion node (mat2) based on the Club value in col 1 */
				numc2 = numc - 2;
				mat2 = (int *)malloc(numt * numc2 * sizeof(int));
				start = 0;
				start2 = 0;
				for(x = 0; x < numt; x++)
				{
					w = atoi(PQgetvalue(pgr, x, 1));
					if(w < 2)
					{
						mat[start] = atoi(PQgetvalue(pgr, x, 0));
						start++;
						mat[start] = w;
						start++;
						if(w > 0)
						{
							/* goes to both tables */
							for(y = 2; y < numc; y++)
							{
								val = atoi(PQgetvalue(pgr, x, y));
								mat[start] = val;
								mat2[start2] = val;
								start++;
								start2++;
							}
						}
						else
						{
							for(y = 2; y < numc; y++)
							{
								val = atoi(PQgetvalue(pgr, x, y));
								mat[start] = val;
								start++;
							}
						}
					}
					else
					{
						/* companion table only */
						for(y = 2; y < numc; y++)
						{
							val = atoi(PQgetvalue(pgr, x, y));
							mat2[start2] = val;
							start2++;
						}
					}
				}
				L->at(z+1).address_host_table = mat2;
				L->at(z+1).num_rows = start2 / numc2;
			}
			L->at(z).address_host_table = mat;
			L->at(z).num_rows = start / numc;
			PQclear(pgr);

			/* remember the backing table name for each query predicate */
			x = 1;
			while(inpquery[x] != -1)
			{
				if(L->at(z).name == inpquery[x])
				{
					numt = (strlen(tok) + 1) * sizeof(char);
					qrs[x] = (char *)malloc(numt);
					memcpy(qrs[x], tok, numt);
				}
				x += 2;
			}
			if(tok[0] == 'c')
			{
				tok = strtok(NULL, " ");
				z++;
			}
			else
			{
				/* atom tables consume two entries of L but one name pair */
				strtok(NULL, " ");
				tok = strtok(NULL, " ");
				z += 2;
			}
		}
	}
	else
	{
		/* initial load: also read weights / truth values from column 0 */
		while(tok != NULL)
		{
			sprintf(sel, "Select * from %s limit 0", tok);
			pgr = PQexec(conn, sel);
			numc = L->at(z).num_columns;
			if(tok[0] == 'c')
			{
				sprintf(sel, "Select weight, myid, ");
				start = 1;
				numt = numc + 1;
			}
			else
			{
				sprintf(sel, "Select truth, Club, atomID, ");
				start = 8;
				numt = numc + 5;
			}
			for(x = start; x < numt; x++)
			{
				strcat(sel, PQfname(pgr, x));
				strcat(sel, ", ");
			}
			sel[strlen(sel)-2] = '\0';
			sprintf(sel, "%s from %s", sel, tok);
			PQclear(pgr);
			pgr = PQexec(conn, sel);
			numt = PQntuples(pgr);
			mat = (int *)malloc(numt * numc * sizeof(int));
			L->at(z).weight = (double *)malloc(numt * sizeof(double));
			L->at(z).num_rows = numt;

			/* columns 1..numc-1 copied verbatim; column 0 handled below */
			for(x = 0; x < numt; x++)
			{
				start = x * numc;
				for(y = 1; y < numc; y++)
					mat[start + y] = atoi(PQgetvalue(pgr, x, y));
			}

			numt *= numc;
			double flo;
			if(tok[0] == 'c')
			{
				/* clause table: store the weight and encode its sign in the
				   first cell as a 1-based signed row id */
				for(x = 0, y = 0; x < numt; x+=numc, y++)
				{
					flo = atof(PQgetvalue(pgr, y, 0));
					L->at(z).weight[y] = flo;
					if(flo > 0)
						mat[x] = y + 1;
					else
						mat[x] = -y - 1;
				}
			}
			else
			{
				/* atom table: truth value 't' -> 2, otherwise 1 */
				for(x = 0, y = 0; x < numt; x+=numc, y++)
				{
					if(PQgetvalue(pgr, y, 0)[0] == 't')
						mat[x] = 2;
					else
						mat[x] = 1;
				}
			}
			L->at(z).address_host_table = mat;
			numc = (strlen(tok) + 1) * sizeof(char);
			L->at(z).predname = (char *)malloc(numc);
			memcpy(L->at(z).predname, tok, numc);
			PQclear(pgr);
			tok = strtok(NULL, " ");
			z++;
		}
	}
	*ret = conn;
}
|
||||
|
||||
void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, PGconn *conn, int finalDR)
{
	/* Tuffy back end: write rule results back to PostgreSQL. With finalDR
	   set, dump the atoms of each query predicate in inpquery into
	   mln0_atoms via a CSV COPY; otherwise dump every rule's tuples into its
	   own table. Closes `conn`, and on the final pass releases all GPU
	   memory held by the memory manager.
	   NOTE(review): DB and CUDA call results are unchecked throughout. */
	char sel[1024];
	double *matw = NULL;
	int qname, cols1, res_rows, tipo, *dop1;
	int x, w, z, y, *hres;
	rulenode tmprule;
	vector<rulenode>::iterator qposr;
	if(finalDR)
	{
		char file[] = "/dev/shm/mln0_atoms.csv";
		z = 0;
		int seqid = 1;   /* running atom id across all query predicates */
		FILE *fp;
		fp = fopen(file, "w");
		if(fp == NULL)
		{
			/* /dev/shm unavailable: fall back to an on-disk temp file */
			cerr << "Failed to create main memory temporary file, attempting to use hardrive" << endl;
			sprintf(file, "./temp/mln0_atoms.csv");
			fp = fopen(file, "w");
			if(fp == NULL)
			{
				cerr << "Failed to create main memory temporary file" << endl;
				exit(1);
			}
		}
		/* inpquery holds (name, tag) pairs terminated by -1 */
		while((qname = inpquery[z]) != -1)
		{
			tmprule.name = qname;
			qposr = lower_bound(rul_str, fin, tmprule, comparer);
			cols1 = qposr->num_columns;
			res_rows = cargafinal(qname, cols1, &dop1);

			if(res_rows != 0)
			{
				if(res_rows < 0)
					res_rows = unir(dop1, -res_rows, cols1, &dop1, 0); /*duplicate elimination on result*/
				else
					res_rows = unir(dop1, res_rows, cols1, &dop1, finalDR);

				tipo = res_rows * cols1 * sizeof(int);
				hres = (int *)malloc(tipo);
				cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
				cudaFree(dop1);
				w = z + 1;   /* NOTE(review): dead store — overwritten by the loop init below */

				/* the predicate id is the third '_'-separated token of the rulename */
				strtok(qposr->rulename, "_");
				strtok(NULL, "_");
				int prid = atoi(strtok(NULL, "_"));

				/* each tuple is a (value, truth) pair */
				for(x = 0, w = 0; x < res_rows; x++, w+=2)
				{
					if(hres[w+1])
						fprintf(fp, "%d,%d,%d,true\n", seqid, hres[w], prid);
					else
						fprintf(fp, "%d,%d,%d,false\n", seqid, hres[w], prid);
					seqid++;
				}
				free(hres);
			}
			z += 2;
		}
		fclose(fp);
		sprintf(sel, "Copy mln0_atoms(atomid,tupleID,predID,isquery) from '%s' CSV", file);
		PQexec(conn, sel);
	}
	else
	{
		while(rul_str != fin)
		{
			cols1 = rul_str->num_columns;
			res_rows = cargafinal(rul_str->name, cols1, &dop1);
			if(res_rows == 0)
			{
				rul_str++;
				continue;
			}
			/* sign of res_rows encodes device vs. host memory elsewhere,
			   but here the data is always copied from the device */
			res_rows = abs(res_rows);
			tipo = res_rows * cols1 * sizeof(int);
			hres = (int *)malloc(tipo);
			cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
			cudaFree(dop1);

			char file[] = "/dev/shm/buffer.csv";
			FILE *fp;
			fp = fopen(file, "w");
			if(fp == NULL)
			{
				cerr << "Failed to create main memory temporary file, attempting to use hardrive" << endl;
				sprintf(file, "./temp/buffer.csv");
				fp = fopen(file, "w");
				if(fp == NULL)
				{
					cerr << "Failed to create main memory temporary file" << endl;
					exit(1);
				}
			}

			if(rul_str->rulename[0] == 'z')
			{
				/* 'z'-prefixed rules: last three columns are
				   (id, weight-index, extra); look the weight table up by the
				   predicate whose name follows the 'z' */
				char *name = rul_str->rulename + 1;
				for(x = 0; x < ninpf; x++)
				{
					if(strncmp(L->at(x).predname, name, strlen(name)) == 0)
					{
						matw = L->at(x).weight;
						break;
					}
				}

				cols1 -= 3;
				for(x = 0, z = 0; x < res_rows; x++, z+=3)
				{
					for(y = 0; y < cols1; y++, z++)
						fprintf(fp, "%d,", hres[z]);
					fprintf(fp, "%d,%lf,%d\n", hres[z], matw[abs(hres[z+1])-1], hres[z+2]);
				}
				fclose(fp);
				sprintf(sel, "Copy %s from '%s' CSV", name, file);
				PQexec(conn, sel);
			}
			else
			{
				cols1--;
				for(x = 0, z = 0; x < res_rows; x++, z++)
				{
					for(y = 0; y < cols1; y++, z++)
						fprintf(fp, "%d,", hres[z]);
					fprintf(fp, "%d\n", hres[z]);
				}
				fclose(fp);
				sprintf(sel, "Copy %s from '%s' CSV", rul_str->rulename, file);
				PQexec(conn, sel);
			}
			free(hres);
			rul_str++;
		}
	}
	PQfinish(conn);
	if(finalDR)
		clear_memory_all();
}
|
||||
#endif
|
||||
|
||||
#ifdef ROCKIT
|
||||
void mysqlRead(MYSQL **ret, int *qrs, vector<gpunode> *L, int ninpf, char *names, int finalDR)
{
	/* RockIt back end: (re)load predicate tables from MySQL into L.
	   With finalDR set, only refresh the query predicates (listed in `qrs`,
	   0-terminated) whose row count changed, dropping their GPU copy via
	   liberar(). Otherwise perform the initial load of every table named in
	   `names` (consumed destructively by strtok); tables named f<digit>...
	   skip their first column. Returns the open connection through *ret.
	   NOTE(review): mysql_real_connect/mysql_query results are unchecked and
	   credentials are hard-coded. */
	char *tok, sel[1024];
	int w, x, y, z = 0, numt, numc;
	int *mat;
	MYSQL *con = mysql_init(NULL);
	if(con == NULL)
	{
		fprintf(stderr, "mysql_init() failed\n");
		exit(1);
	}
	mysql_options(con, MYSQL_OPT_LOCAL_INFILE, NULL);
	mysql_real_connect(con, "localhost", "root", "root", "rockit", 0, NULL, 0);
	if(finalDR)
	{
		y = 0;
		while(qrs[y] != 0)
		{
			for(z = 0; z < ninpf; z++)
			{
				if(qrs[y] == L->at(z).name)
				{
					/* compare the current DB row count with the cached one */
					MYSQL_ROW row;
					sprintf(sel, "Select count(*) from %s", L->at(z).predname);
					mysql_query(con, sel);
					MYSQL_RES *result = mysql_store_result(con);
					row = mysql_fetch_row(result);
					numt = atoi(row[0]);
					mysql_free_result(result);

					if(numt != L->at(z).num_rows)
					{
						/* table changed: invalidate the GPU copy and reload */
						liberar(L->at(z).name);
						numc = L->at(z).num_columns;
						sprintf(sel, "Select * from %s", L->at(z).predname);
						mysql_query(con, sel);
						MYSQL_RES *result = mysql_store_result(con);
						mat = (int *)malloc(numt * numc * sizeof(int));
						w = 0;
						while ((row = mysql_fetch_row(result)))
						{
							for(x = 0; x < numc; x++, w++)
								mat[w] = atoi(row[x]);
						}

						mysql_free_result(result);
						if(L->at(z).address_host_table != NULL)
							free(L->at(z).address_host_table);
						L->at(z).address_host_table = mat;
						L->at(z).num_rows = numt;
					}
				}
			}
			y++;
		}
	}
	else
	{
		/* initial load of every table in `names` */
		tok = strtok(names, " ");
		while(tok != NULL)
		{
			numc = L->at(z).num_columns;
			sprintf(sel, "Select * from %s", tok);
			mysql_query(con, sel);
			MYSQL_RES *result = mysql_store_result(con);
			numt = mysql_num_rows(result);

			MYSQL_ROW row;
			mat = (int *)malloc(numt * numc * sizeof(int));
			w = 0;
			if(tok[0] == 'f' && tok[1] >= '0' && tok[1] <= '9')
			{
				/* f<digit> tables: skip column 0, take the next numc columns */
				while ((row = mysql_fetch_row(result)))
				{
					for(x = 1; x <= numc; x++, w++)
						mat[w] = atoi(row[x]);
				}
			}
			else
			{
				while ((row = mysql_fetch_row(result)))
				{
					for(x = 0; x < numc; x++, w++)
						mat[w] = atoi(row[x]);
				}
			}
			mysql_free_result(result);
			L->at(z).address_host_table = mat;
			L->at(z).num_rows = numt;

			/* cache the table name so the finalDR pass can requery it */
			numc = (strlen(tok) + 1) * sizeof(char);
			L->at(z).predname = (char *)malloc(numc);
			strcpy(L->at(z).predname, tok);
			tok = strtok(NULL, " ");
			z++;
		}
	}
	*ret = con;
}
|
||||
|
||||
void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, MYSQL *con)
{
	/* RockIt back end: dump every rule result into an in-memory MySQL table
	   named after the first '_'-separated token of the rulename, via a TSV
	   file and LOAD DATA LOCAL INFILE. The rulename also encodes the weight:
	   <id>_<sign>_<int>_<frac>. cargafinal()'s signed row count selects
	   device memory (positive: copied back, and for f<digit> fact inputs
	   accumulated on the GPU via sumar) versus host memory (negative).
	   NOTE(review): hres is never freed and dop1 is not cudaFree'd on the
	   positive path; if both fopen attempts fail, fp stays NULL and the
	   fprintf calls below crash. */
	int x, y, z, cols1, cols2, res_rows, tipo;
	int *hres, *dop1;
	char *id, *sign, *q1, *q2;
	char sel[1024], weight[1024];
	gpunode tmpfact;
	while(rul_str != fin)
	{
		cols1 = rul_str->num_columns;
		res_rows = cargafinal(rul_str->name, cols1, &dop1);
		/* create (or reuse) and empty the destination table */
		id = strtok(rul_str->rulename, "_");
		sprintf(sel, "create table if not exists %s(weight double, ", id);
		for(x = 0; x < cols1; x++)
		{
			sprintf(weight, "a%d char(10), ", x);
			strcat(sel, weight);
		}
		sel[strlen(sel)-2] = ')';
		strcat(sel, "ENGINE = MEMORY DEFAULT CHARSET=latin1");
		mysql_query(con, sel);
		sprintf(sel, "truncate %s", id);
		mysql_query(con, sel);

		if(res_rows == 0)
		{
			rul_str++;
			continue;
		}

		if(res_rows > 0)
		{
			/* device result: the last referenced predicate (negative entry in
			   referencias) identifies the backing fact node */
			tmpfact = L->at(-rul_str->referencias[rul_str->num_rows - 2] - 1);
			sign = tmpfact.predname;
			tipo = res_rows * cols1 * sizeof(int);
			hres = (int *)malloc(tipo);
			cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
			if(sign[0] == 'f' && sign[1] >= '0' && sign[1] <= '9')
				sumar(tmpfact.name, dop1, cols1, res_rows);
		}
		else
		{
			/* negative count: data already in host memory */
			hres = dop1;
			res_rows = -res_rows;
		}

		/* continue tokenizing rulename: sign flag + weight digits */
		sign = strtok(NULL, "_");
		q1 = strtok(NULL, "_");
		q2 = strtok(NULL, "_");
		if(sign[0] == '0')
			sprintf(weight, "%s.%s", q1, q2);
		else
			sprintf(weight, "-%s.%s", q1, q2);

		FILE *fp;
		char file[512];
		sprintf(file, "/dev/shm/%s.tsv", id);
		fp = fopen(file, "w");
		if(fp == NULL)
		{
			cerr << "Failed to create main memory temporary file, attempting to use hardrive" << endl;
			sprintf(file, "./temp/%s.tsv", id);
			fp = fopen(file, "w");
		}

		/* one TSV line per tuple: weight followed by cols1 values */
		cols2 = cols1 - 1;
		for(x = 0, z = 0; x < res_rows; x++, z++)
		{
			fprintf(fp, "%s\t", weight);
			for(y = 0; y < cols2; y++, z++)
				fprintf(fp, "%d\t", hres[z]);
			fprintf(fp, "%d\n", hres[z]);
		}
		fclose(fp);

		sprintf(sel, "LOAD DATA LOCAL INFILE '%s' INTO TABLE %s", file, id);
		mysql_query(con, sel);
		rul_str++;
	}
	mysql_close(con);
}
|
||||
#endif
|
||||
|
28
packages/cuda/old/dbio.h
Normal file
28
packages/cuda/old/dbio.h
Normal file
@ -0,0 +1,28 @@
|
||||
#ifndef _DBIO_H_
|
||||
#define _DBIO_H_
|
||||
|
||||
#include "pred.h"
|
||||
#ifdef TUFFY
|
||||
#include <libpq-fe.h>
|
||||
#endif
|
||||
#ifdef ROCKIT
|
||||
#include <mysql/mysql.h>
|
||||
#endif
|
||||
#include <vector>
|
||||
#include "lista.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
#ifdef TUFFY
|
||||
void postgresRead(PGconn **ret, vector<gpunode> *L, int *inpquery, char *names, int finalDR);
|
||||
void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, PGconn *conn, int finalDR);
|
||||
#endif
|
||||
#ifdef ROCKIT
|
||||
void mysqlRead(MYSQL **ret, int *qrs, vector<gpunode> *L, int ninpf, char *names, int finalDR);
|
||||
void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, MYSQL *con);
|
||||
#endif
|
||||
#ifdef DATALOG
|
||||
void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, int finalDR, int **result);
|
||||
#endif
|
||||
|
||||
#endif
|
1337
packages/cuda/old/lista.cu
Executable file
1337
packages/cuda/old/lista.cu
Executable file
File diff suppressed because it is too large
Load Diff
44
packages/cuda/old/lista.h
Executable file
44
packages/cuda/old/lista.h
Executable file
@ -0,0 +1,44 @@
|
||||
#ifndef _LISTA_H_
|
||||
#define _LISTA_H_
|
||||
|
||||
typedef struct Node{
	int name;         /* predicate identifier */
	int *dev_address; /* table base address: device pointer while listed in
	                     GPUmem, host pointer after offload to CPUmem
	                     (see limpiar()) */
	int rows;         /* number of tuples in the block */
	int size;         /* allocation size in bytes */
	int iteration;    /* fixpoint iteration that produced the block; both
	                     lists are kept sorted by it (compareiteration) */
	int isrule;       /* 1 = rule result, 0 = fact table */
}memnode;
|
||||
|
||||
/* Per-rule descriptor. Most fields are filled and consumed by the rule
   analyser/evaluator (lista.cu, not visible here); only the fields this
   chunk demonstrably touches are documented as fact, the rest are left for
   the engine's documentation. */
typedef struct auxiliar{
	int name;                /* head predicate identifier; rules are sorted by
	                            it descending (see comparer) */
	int num_rows;
	int num_columns;         /* arity of the rule's result table */
	int *address_host_table;
	int *rule_names;
	int *referencias;        /* referenced predicates; negative entries index
	                            fact nodes (see mysqlWrite) — confirm */
	int **select;
	int *numsel;
	int **project;
	int2 *projpos;
	int **selfjoin;
	int *numselfj;
	int **wherejoin;
	int *numjoin;
	int totalpreds;
	int **preds;
	int2 *numpreds;
	int *negatives;
	char *rulename;          /* textual rule name; the DB back ends parse
	                            metadata out of its '_'-separated tokens */
	int gen_act;
	int gen_ant;
}rulenode;
|
||||
|
||||
/* Bookkeeping for a predicate whose fixpoint evaluation completed.
   Field semantics are defined by the engine in lista.cu (not visible in
   this chunk) — verify there before relying on them. */
typedef struct completed{
	int name;      /* predicate identifier */
	int numrules;
	int reduce;
	int reset;
}compnode;
|
||||
|
||||
#endif
|
575
packages/cuda/old/memory.cu
Executable file
575
packages/cuda/old/memory.cu
Executable file
@ -0,0 +1,575 @@
|
||||
#include <list>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#include <algorithm>
|
||||
#include <thrust/device_vector.h>
|
||||
#include "lista.h"
|
||||
#include "memory.h"
|
||||
#include "pred.h"
|
||||
|
||||
#define MAX_REC 200
|
||||
#define MAX_FIX_POINTS 100
|
||||
|
||||
memnode temp_storage[MAX_REC];
|
||||
/*List used to store information (address, size, etc.) about facts and rule results loaded in the GPU*/
|
||||
list<memnode> GPUmem;
|
||||
/*List used to store information about rule results offloaded from the GPU to the CPU*/
|
||||
list<memnode> CPUmem;
|
||||
|
||||
/*Auxiliary function to sort rule list*/
|
||||
bool comparer(const rulenode &r1, const rulenode &r2)
|
||||
{
|
||||
return (r1.name > r2.name);
|
||||
}
|
||||
|
||||
/*Used in search functions to compare iterations*/
|
||||
bool compareiteration(const memnode &r1, const memnode &r2)
|
||||
{
|
||||
return (r1.iteration < r2.iteration);
|
||||
}
|
||||
|
||||
/*Used in search functions to compare names*/
|
||||
bool comparename(const memnode &r1, const memnode &r2)
|
||||
{
|
||||
return (r1.name > r2.name);
|
||||
}
|
||||
|
||||
/*Linear search of 'name' fact*/
|
||||
template<class InputIterator>
InputIterator buscarhecho(InputIterator first, InputIterator last, int name)
{
	/* Linear scan for the fact table (isrule == 0) named `name`;
	   returns `last` when no such entry exists. */
	for(; first != last; ++first)
	{
		if(first->isrule == 0 && first->name == name)
			return first;
	}
	return last;
}
|
||||
|
||||
/*Finds all results of rule 'name' in iteration 'itr' in both CPU and GPU memory. Every result found is removed from its respective list*/
|
||||
list<memnode>::iterator buscarpornombre(int name, int itr, int *totalrows, int *gpunum, int *cpunum)
{
	/* Collects into the global temp_storage[] every result block of rule
	   `name` produced in iteration `itr`, removing them from GPUmem first
	   and then CPUmem. Inserts a placeholder entry in GPUmem (where the
	   merged result will presumably live — confirm against the caller in
	   lista.cu) and returns an iterator to it.
	   Outputs: *totalrows = summed row count of all collected blocks,
	   *gpunum = number of GPU blocks (temp_storage[0..*gpunum-1]),
	   *cpunum = index one past the last CPU block.
	   NOTE(review): no bound check of the block count against MAX_REC. */
	int x = 0, sum = 0;
	memnode temp;
	list<memnode>::iterator i;
	temp.iteration = itr;
	/* lists are sorted by iteration, so equal_range narrows the scan */
	pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);

	while(rec.first != rec.second)
	{
		if(rec.first->name == name && rec.first->isrule == 1)
		{
			temp_storage[x] = *rec.first;
			rec.first = GPUmem.erase(rec.first);
			sum += temp_storage[x].rows;
			x++;
		}
		else
			rec.first++;
	}
	*gpunum = x;
	temp.name = name;
	temp.isrule = 1;
	/* placeholder: rows/size/dev_address are left unset here */
	i = GPUmem.insert(rec.first, temp);
	rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);

	while(rec.first != rec.second)
	{
		if(rec.first->name == name && rec.first->isrule == 1)
		{
			temp_storage[x] = *rec.first;
			rec.first = CPUmem.erase(rec.first);
			sum += temp_storage[x].rows;
			x++;
		}
		else
			rec.first++;
	}
	*totalrows = sum;
	*cpunum = x;
	return i;
}
|
||||
|
||||
list<memnode>::iterator buscarpornombrecpu(int name, int itr, int *totalrows, int *gpunum, int *cpunum)
{
	/* Variant of buscarpornombre() that inserts the placeholder entry into
	   CPUmem instead of GPUmem, for results that are to be merged on the
	   host. Unlike the GPU variant it matches blocks by name only, without
	   requiring isrule == 1 — intentional or not, confirm in lista.cu.
	   Outputs have the same meaning as in buscarpornombre().
	   NOTE(review): no bound check of the block count against MAX_REC. */
	int x = 0, sum = 0;
	memnode temp;
	list<memnode>::iterator i;
	temp.iteration = itr;
	pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);

	while(rec.first != rec.second)
	{
		if(rec.first->name == name)
		{
			temp_storage[x] = *rec.first;
			rec.first = GPUmem.erase(rec.first);
			sum += temp_storage[x].rows;
			x++;
		}
		else
			rec.first++;
	}

	*gpunum = x;
	temp.name = name;
	temp.isrule = 1;
	rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);

	while(rec.first != rec.second)
	{
		if(rec.first->name == name)
		{
			temp_storage[x] = *rec.first;
			rec.first = CPUmem.erase(rec.first);
			sum += temp_storage[x].rows;
			x++;
		}
		else
			rec.first++;
	}
	/* placeholder goes into the CPU list here */
	i = CPUmem.insert(rec.first, temp);
	*totalrows = sum;
	*cpunum = x;
	return i;
}
|
||||
|
||||
/*Removes the least recently used memory block from GPU memory, sending it to CPU memory
  if it's a rule result. If there are no used memory blocks in the GPU and we still don't
  have enough memory, the program exits with error.
  's' names the caller for the error message; 'sz' is the allocation size that triggered
  the eviction (reported on failure).*/
void limpiar(const char s[], size_t sz)
{
	list<memnode>::iterator ini;
	memnode temp;
	size_t free, total;

	if(GPUmem.size() == 0)
	{
		cudaMemGetInfo(&free,&total);
		cerr << s << ": not enough GPU memory: have " << free << " of " << total << ", need " << sz << " bytes." << endl;
		exit(1);
	}

	/*GPUmem is kept in LRU order, so the front is the eviction victim.*/
	ini = GPUmem.begin();
	if(ini->isrule)
	{
		temp = *ini;
		temp.dev_address = (int *)malloc(ini->size);
		/*Fix: check the host allocation; the original wrote through an unchecked
		  malloc result.*/
		if(temp.dev_address == NULL)
		{
			cerr << s << ": out of host memory while evicting " << ini->size << " bytes." << endl;
			exit(1);
		}
		/*Fix: use a synchronous copy. The original issued cudaMemcpyAsync into
		  pageable memory and immediately freed the device source below; a blocking
		  copy makes the ordering explicit and safe.*/
		cudaMemcpy(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
		/*Keep CPUmem sorted by iteration when inserting the evicted block.*/
		list<memnode>::iterator pos = lower_bound(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
		CPUmem.insert(pos, temp);
	}
	cudaFree(ini->dev_address);
	GPUmem.erase(ini);
}
|
||||
|
||||
/*Allocs 'size' amount of bytes in GPU memory. If not enough memory is available, removes
  least recently used memory blocks until enough space is available. A zero-byte request
  yields a NULL pointer. On unrecoverable allocation failure the process exits.*/
void reservar(int **ptr, size_t size)
{
	size_t free, total;

	if (size == 0) {
		*ptr = NULL;
		return;
	}

	/*Evict until the free-memory counter says the request could fit.*/
	cudaMemGetInfo(&free, &total);
	while(free < size)
	{
		cout << "Se limpio memoria " << free << " " << total << endl;
		limpiar("not enough memory", size);
		cudaMemGetInfo(&free, &total);
	}

	/*Fix: initialize *ptr so the failure check below never reads an uninitialized
	  pointer (the original left it untouched when cudaMalloc failed with an error
	  other than cudaErrorMemoryAllocation).*/
	*ptr = NULL;
	cudaError_t err;
	/*Free memory can be fragmented, so cudaMalloc may still fail; keep evicting.
	  limpiar exits the process if nothing is left to evict, so this terminates.*/
	while((err = cudaMalloc(ptr, size)) == cudaErrorMemoryAllocation)
		limpiar("Error in memory allocation", size);
	/*Fix: treat any remaining error (not just a NULL pointer) as fatal; the original
	  silently ignored non-allocation errors. Note the inner shadowed free/total
	  declarations were removed.*/
	if (err != cudaSuccess || ! *ptr ) {
		cudaMemGetInfo( &free, &total );
		cerr << "Could not allocate " << size << " bytes, only " << free << " avaliable from total of " << total << " !!!" << endl;
		cerr << "Exiting CUDA...." << endl;
		exit(1);
	}
}
|
||||
|
||||
/*Creates a new entry in the GPU memory list, recording where rule/fact 'name' lives on
  the device, its dimensions and the iteration that produced it. 'rule' is nonzero for
  rule results (which may be evicted to the CPU later).*/
void registrar(int name, int num_columns, int *ptr, int rows, int itr, int rule)
{
	memnode entry;
	entry.name = name;
	entry.iteration = itr;
	entry.isrule = rule;
	entry.dev_address = ptr;
	entry.rows = rows;
	entry.size = rows * num_columns * sizeof(int);
	/*Appending keeps the list in LRU / insertion order.*/
	GPUmem.push_back(entry);
}
|
||||
|
||||
/*Creates a new entry in the CPU memory list; the mirror of registrar() for host-resident
  blocks (dev_address holds a host pointer in this list).*/
void registrarcpu(int name, int num_columns, int *ptr, int rows, int itr, int rule)
{
	memnode entry;
	entry.name = name;
	entry.iteration = itr;
	entry.isrule = rule;
	entry.dev_address = ptr;
	entry.rows = rows;
	entry.size = rows * num_columns * sizeof(int);
	CPUmem.push_back(entry);
}
|
||||
|
||||
/*Updates the information of an element in a list: new buffer address, row count and the
  derived byte size. Works on iterators from either GPUmem or CPUmem.*/
template<class InputIterator>
void actualizar(int num_columns, int *ptr, int rows, InputIterator i)
{
	i->dev_address = ptr;
	i->rows = rows;
	i->size = rows * num_columns * sizeof(int);
}
|
||||
|
||||
/*Count the total number of rows generated by rule 'name' in iteration 'itr', summing over
  every matching block in both the GPU and the CPU memory lists.*/
int numrows(int name, int itr)
{
	int total = 0;
	memnode key;
	key.iteration = itr;

	/*Both lists are ordered by iteration, so equal_range brackets the candidates.*/
	pair<list<memnode>::iterator, list<memnode>::iterator> range =
		equal_range(GPUmem.begin(), GPUmem.end(), key, compareiteration);
	for(list<memnode>::iterator it = range.first; it != range.second; ++it)
	{
		if(it->name == name)
			total += it->rows;
	}

	range = equal_range(CPUmem.begin(), CPUmem.end(), key, compareiteration);
	for(list<memnode>::iterator it = range.first; it != range.second; ++it)
	{
		if(it->name == name)
			total += it->rows;
	}
	return total;
}
|
||||
|
||||
extern "C" void * YAP_IntToAtom(int);
|
||||
extern "C" char * YAP_AtomName(void *);
|
||||
|
||||
/*Loads facts or rule results in GPU memory. If a fact is already in GPU memory, its pointer is simply returned. Otherwise,
memory is reserved and the fact is loaded. Rule results are loaded based on the current iteration 'itr' and both GPU and
CPU memories are searched for all instances of said results. The instances are combined into a single one in GPU memory.
Returns the number of rows made available through *ptr (0 when itr == 0 and not a fact).*/
int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_host_table, int **ptr, int itr)
{
	int numgpu, numcpu, totalrows = 0;
	int *temp, x;
	int size, itrant, inc = 0;
	list<memnode>::iterator i;
	memnode fact;

	if(is_fact)
	{
		i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
		if(i != GPUmem.end())
		{
			/*Cache hit: refresh the node's iteration and move it to the back of
			  the list (most recently used position).*/
			fact = *i;
			GPUmem.erase(i);
			fact.iteration = itr;
			*ptr = fact.dev_address;
			GPUmem.push_back(fact);
			return fact.rows;
		}
		/*Cache miss: upload the fact table from host memory and register it.*/
		size = num_rows * num_columns * sizeof(int);
		reservar(&temp, size);
		cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
		registrar(name, num_columns, temp, num_rows, itr, 0);
		*ptr = temp;
		return num_rows;
	}
	if(itr > 0)
	{
		/*Rule results come from the previous iteration. buscarpornombre moves all
		  matching blocks into temp_storage and returns an iterator to a fresh
		  placeholder node; numcpu is the total block count (GPU blocks first).*/
		itrant = itr - 1;
		i = buscarpornombre(name, itrant, &totalrows, &numgpu, &numcpu);
		if((numgpu == 1) && (numcpu == 1))
		{
			/*Exactly one block and it is already on the GPU: reuse it in place.*/
			actualizar(num_columns, temp_storage[0].dev_address, temp_storage[0].rows, i);
			*ptr = temp_storage[0].dev_address;
			return temp_storage[0].rows;
		}
		/*Otherwise concatenate every block into one new device buffer.*/
		size = totalrows * num_columns * sizeof(int);
		reservar(&temp, size);
		for(x = 0; x < numgpu; x++)
		{
			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
			inc += temp_storage[x].size / sizeof(int);
			/*Source buffer is no longer referenced by any list node.
			  NOTE(review): freed right after an async copy on the default stream —
			  cudaFree should order after the copy, but confirm.*/
			cudaFree(temp_storage[x].dev_address);
		}
		/*x continues from numgpu: the remaining entries are host-resident.*/
		for(; x < numcpu; x++)
		{
			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
			inc += temp_storage[x].size / sizeof(int);
			free(temp_storage[x].dev_address);
		}
		/*Point the placeholder node (inserted by buscarpornombre) at the merged buffer.*/
		actualizar(num_columns, temp, totalrows, i);
		*ptr = temp;
		return totalrows;
	}
	return 0;
}
|
||||
|
||||
/*CPU-side counterpart of cargar: materializes all instances of rule result 'name' from
  iteration itr-1 into a single host-memory buffer. Facts are returned as-is (the host
  table pointer). Returns the row count (0 when itr == 0 and not a fact).*/
int cargarcpu(int name, int num_rows, int num_columns, int is_fact, int *address_host_table, int **ptr, int itr)
{
	int numgpu, numcpu, totalrows = 0;
	int *temp, x;
	int size, itrant, inc = 0;
	list<memnode>::iterator i;

	if(is_fact)
	{
		/*Facts already live in host memory; no copy needed.*/
		*ptr = address_host_table;
		return num_rows;
	}
	if(itr > 0)
	{
		itrant = itr - 1;
		/*Collect all blocks of the previous iteration into temp_storage; the
		  placeholder iterator 'i' points into CPUmem. numcpu counts GPU + CPU
		  blocks cumulatively.*/
		i = buscarpornombrecpu(name, itrant, &totalrows, &numgpu, &numcpu);

		if((numgpu == 0) && (numcpu == 1))
		{
			/*Single block, already host-resident: reuse it in place.*/
			actualizar(num_columns, temp_storage[0].dev_address, temp_storage[0].rows, i);
			*ptr = temp_storage[0].dev_address;
			return temp_storage[0].rows;
		}
		/*Merge everything into one freshly malloc'ed host buffer.
		  NOTE(review): malloc result is not checked — confirm acceptable.*/
		size = totalrows * num_columns * sizeof(int);
		temp = (int *)malloc(size);
		for(x = 0; x < numgpu; x++)
		{
			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToHost);
			inc += temp_storage[x].size / sizeof(int);
			cudaFree(temp_storage[x].dev_address);
		}
		/*Remaining entries (from index numgpu on) are host pointers.*/
		for(; x < numcpu; x++)
		{
			memcpy(temp + inc, temp_storage[x].dev_address, temp_storage[x].size);
			inc += temp_storage[x].size / sizeof(int);
			free(temp_storage[x].dev_address);
		}
		actualizar(num_columns, temp, totalrows, i);
		*ptr = temp;
		return totalrows;
	}
	return 0;
}
|
||||
|
||||
/*Loads all results of rule 'name' from both GPU and CPU memories into the GPU.
  Return convention: 0 if nothing found; a positive row count when *ptr is a device
  pointer the caller should treat normally; a negative row count (-rows) signals the
  single-CPU-block case (and, under TUFFY, flips which single-block case is negated).
  In the multi-block path the source list nodes are NOT erased or freed here —
  NOTE(review): confirm the caller (or a later clear) releases them.*/
int cargafinal(int name, int cols, int **ptr)
{
	int *temp, *ini, cont = 0, numg = 0, numc = 0;
	memnode bus;
	bus.name = name;
	/*Re-sort both lists by name so all blocks of this rule are contiguous.*/
	GPUmem.sort(comparename);
	CPUmem.sort(comparename);
	list<memnode>::iterator endg = GPUmem.end();
	list<memnode>::iterator endc = CPUmem.end();
	list<memnode>::iterator pos = lower_bound(GPUmem.begin(), endg, bus, comparename);
	list<memnode>::iterator gpu = pos;
	/*First pass: count blocks and total rows on the GPU side.*/
	while(pos != endg && pos->name == name)
	{
		cont += pos->rows;
		numg++;
		pos++;
	}
	pos = lower_bound(CPUmem.begin(), endc, bus, comparename);
	list<memnode>::iterator cpu = pos;
	/*...and on the CPU side.*/
	while(pos != endc && pos->name == name)
	{
		cont += pos->rows;
		numc++;
		pos++;
	}

	if(numg == 0 && numc == 0)
		return 0;
	if(numg == 1 && numc == 0)
	{
		/*Single GPU block: hand it over directly and drop it from the list.*/
		pos = gpu;
		*ptr = pos->dev_address;
		cont = pos->rows;
		GPUmem.erase(pos);
#ifdef TUFFY
		return -cont;
#else
		return cont;
#endif
	}
	if(numg == 0 && numc == 1)
	{
		/*Single CPU block: under TUFFY it is copied up to the device first;
		  otherwise the host pointer itself is returned (hence the negative count).*/
		pos = cpu;
		cont = pos->rows;
#ifdef TUFFY
		reservar(&temp, pos->size);
		cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
		*ptr = temp;
#else
		*ptr = pos->dev_address;
#endif
		CPUmem.erase(pos);
		return -cont;
	}

	/*General case: concatenate all GPU blocks, then all CPU blocks, into one new
	  device buffer of cont rows.*/
	reservar(&temp, cont * cols * sizeof(int));
	ini = temp;
	pos = gpu;
	while(pos != endg && pos->name == name)
	{
		cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyDeviceToDevice);
		temp += pos->size / sizeof(int);
		pos++;
	}
	pos = cpu;
	while(pos != endc && pos->name == name)
	{
		cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
		temp += pos->size / sizeof(int);
		pos++;
	}
	*ptr = ini;
	return cont;
}
|
||||
|
||||
/*Compares the results of the current iteration against the results of older iterations.
Used to avoid infinite computations when the result is not a single fixed-point, but an
orbit of points. Looks back up to MAX_FIX_POINTS iterations; only iterations whose row
count matches the current one are compared element-wise on the device.*/
bool generadas(int name, int filas, int cols, int itr)
{
	int r1, r2, x, fin;
	int *dop1, *dop2;

	r2 = numrows(name, itr);
	/*Clamp the look-back window to the iterations that actually exist.*/
	if(itr < MAX_FIX_POINTS)
		fin = itr;
	else
		fin = MAX_FIX_POINTS;
	for(x = 1; x <= fin; x++)
	{
		r1 = numrows(name, itr - x);
		if(r1 == r2)
		{
			/*Row counts match: load both result sets into the GPU (cargar takes
			  the previous iteration, hence the +1 offsets) and compare.*/
			r2 = cargar(name, filas, cols, 0, NULL, &dop2, itr + 1);
			thrust::device_ptr<int> pt2 = thrust::device_pointer_cast(dop2);
			r1 = cargar(name, filas, cols, 0, NULL, &dop1, itr - x + 1);
			thrust::device_ptr<int> pt1 = thrust::device_pointer_cast(dop1);
			/*NOTE(review): compares r1 ints, not r1 * cols — if r1 is a row
			  count this only checks the first r1 elements; confirm intended.*/
			if(thrust::equal(pt1, pt1 + r1, pt2) == true)
				return true;
		}
	}
	return false;
}
|
||||
|
||||
/*Debug helper: dumps every entry of the GPU memory list (name, iteration, isrule, rows,
  size) to stdout, bracketed by begin/end markers.*/
void mostrar_memoria()
{
	cout << "Memoria inicio GPU" << endl;
	for(list<memnode>::iterator it = GPUmem.begin(); it != GPUmem.end(); ++it)
		cout << it->name << " " << it->iteration << " " << it->isrule << " " << it->rows << " " << it->size << endl;
	cout << "Memoria fin GPU" << endl;
}
|
||||
|
||||
/*Debug helper: dumps every entry of the CPU memory list (name and iteration) to stdout,
  bracketed by begin/end markers.*/
void mostrar_memcpu()
{
	cout << "Memoria inicio CPU" << endl;
	for(list<memnode>::iterator it = CPUmem.begin(); it != CPUmem.end(); ++it)
		cout << it->name << " " << it->iteration << endl;
	cout << "Memoria fin CPU" << endl;
}
|
||||
|
||||
/*Clear all rule results from both GPU and CPU memory. GPU-side facts (isrule == 0) are
  kept; the CPU list is released entirely (host blocks in CPUmem are rule results).*/
void clear_memory()
{
	list<memnode>::iterator it = GPUmem.begin();
	while(it != GPUmem.end())
	{
		if(it->isrule)
		{
			cudaFree(it->dev_address);
			it = GPUmem.erase(it);
		}
		else
			++it;
	}
	for(it = CPUmem.begin(); it != CPUmem.end(); ++it)
		free(it->dev_address);
	CPUmem.clear();
}
|
||||
|
||||
/*Clear everything from both GPU and CPU memory: frees every tracked buffer (device and
  host) and empties both lists, facts included.*/
void clear_memory_all()
{
	for(list<memnode>::iterator it = GPUmem.begin(); it != GPUmem.end(); ++it)
		cudaFree(it->dev_address);
	GPUmem.clear();

	for(list<memnode>::iterator it = CPUmem.begin(); it != CPUmem.end(); ++it)
		free(it->dev_address);
	CPUmem.clear();
}
|
||||
|
||||
/*Remove all instances of fact 'name' from both CPU and GPU memories, releasing the
  backing buffer with the allocator matching each list (cudaFree vs free).*/
void liberar(int name)
{
	list<memnode>::iterator pos = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
	if(pos != GPUmem.end())
	{
		int *devbuf = pos->dev_address;
		GPUmem.erase(pos);
		cudaFree(devbuf);
	}

	pos = buscarhecho(CPUmem.begin(), CPUmem.end(), name);
	if(pos != CPUmem.end())
	{
		int *hostbuf = pos->dev_address;
		CPUmem.erase(pos);
		free(hostbuf);
	}
}
|
||||
|
||||
/*Add all rows in 'dop1' to the fact 'name' by creating a new array capable of holding both.
  'dop1' is a device pointer holding rows * cols ints. Silently does nothing if the fact
  is not currently in GPU memory — NOTE(review): confirm that is the intended behavior.*/
void sumar(int name, int *dop1, int cols, int rows)
{
	list<memnode>::iterator i;
	memnode fact;
	i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
	int *res, newrows, offset;
	if(i != GPUmem.end())
	{
		fact = *i;
		newrows = rows + fact.rows;
		/*New buffer sized for old + new rows; old rows are copied first.*/
		reservar(&res, newrows * cols * sizeof(int));
		offset = fact.rows * cols;
		cudaMemcpyAsync(res, fact.dev_address, offset * sizeof(int), cudaMemcpyDeviceToDevice);
		/*Replace the old list entry with one describing the enlarged buffer.*/
		GPUmem.erase(i);
		registrar(name, cols, res, newrows, 0, 0);
		cudaMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), cudaMemcpyDeviceToDevice);
		/*cudaFree orders after the pending default-stream copies before releasing
		  the old buffer — TODO confirm this is relied upon.*/
		cudaFree(fact.dev_address);
	}
}
|
27
packages/cuda/old/memory.h
Executable file
27
packages/cuda/old/memory.h
Executable file
@ -0,0 +1,27 @@
|
||||
#ifndef _MEMORY_H_
#define _MEMORY_H_

#include <list>
#include <vector>
#include "lista.h"

using namespace std;

/*Ordering predicate for rule nodes (definition elsewhere).*/
bool comparer(const rulenode&, const rulenode&);
/*Evict the least recently used GPU block (caller tag, requested size).*/
void limpiar(const char [], size_t);
void limpiartodo(int*, int*);
/*Load a fact or rule result into GPU memory; returns its row count.*/
int cargar(int, int, int, int, int*, int**, int);
/*Host-side counterpart of cargar; merges results into CPU memory.*/
int cargarcpu(int, int, int, int, int*, int**, int);
/*Gather every result of a rule into a single GPU buffer.*/
int cargafinal(int, int, int**);
/*Allocate device memory, evicting LRU blocks on pressure.*/
void reservar(int**, size_t);
/*Register a block in the GPU memory list.*/
void registrar(int, int, int*, int, int, int);
/*Register a block in the CPU memory list.*/
void registrarcpu(int, int, int*, int, int, int);
/*Fixed-point / orbit detection across iterations.*/
bool generadas(int, int, int, int);
/*Append device rows to an existing fact.*/
void sumar(int, int*, int, int);
/*Release all instances of a fact from both memories.*/
void liberar(int);
/*Debug dumps of the GPU / CPU memory lists.*/
void mostrar_memoria(void);
void mostrar_memcpu(void);
/*Free rule results only / free everything.*/
void clear_memory(void);
void clear_memory_all(void);

#endif
|
47
packages/cuda/old/pred.h
Executable file
47
packages/cuda/old/pred.h
Executable file
@ -0,0 +1,47 @@
|
||||
#ifndef _PRED_H_
#define _PRED_H_

// #define DEBUG_MEM 1

/*Description of one predicate (fact or rule) as handed over from the Prolog side.*/
typedef struct Nodo{
	int name;               /* numeric identifier (atom encoded as int) */
	int num_rows;
	int num_columns;
	int is_fact;            /* nonzero for facts, zero for rules */
	int *address_host_table; /* host-resident tuple data (row-major ints) */
	int *negatives;
	char *predname;
	double *weight;
}gpunode;

typedef gpunode predicate;

//#define TIMER 1
#define DATALOG 1
#define NUM_T 4
#define INISIZE 1000000

#if TIMER
/*Counters and timings accumulated across Cuda_Eval calls (see Cuda_Statistics).*/
typedef struct Stats{
	size_t joins, selects, unions, builtins;
	size_t calls;
	double total_time;
	float max_time, min_time;
	float select1_time, select2_time, join_time, sort_time, union_time, pred_time;
}statinfo;

extern statinfo cuda_stats;
#endif

/*Constants used to mark comparison predicates*/
#define BPOFFSET (-6)
#define SBG_EQ (-1)
#define SBG_GT (-2)
#define SBG_LT (-3)
#define SBG_GE (-4)
#define SBG_LE (-5)
#define SBG_DF (-6)

/*Main entry point: evaluate a Datalog program on the GPU.*/
int Cuda_Eval(predicate**, int, predicate**, int, int*, int**, char*, int);
void Cuda_Statistics( void );
#endif
|
306
packages/cuda/old/selectproyect.cu
Executable file
306
packages/cuda/old/selectproyect.cu
Executable file
@ -0,0 +1,306 @@
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/scan.h>
|
||||
#include <stdlib.h>
|
||||
#include "memory.h"
|
||||
#include "bpreds.h"
|
||||
|
||||
/*Mark all rows that comply with the selections. 'cons' holds numc ints as
  (column, constant) pairs; a row is marked (res[row] = 1) only if every selected column
  equals its constant. Requires numc <= blockDim.x and numc * sizeof(int) dynamic shared
  memory; res must be zero-initialized by the caller.*/
__global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res)
{
	extern __shared__ int shared[];
	int row = blockIdx.x * blockDim.x + threadIdx.x;

	/*Stage the selection pairs in shared memory (one int per thread).*/
	if(threadIdx.x < numc)
		shared[threadIdx.x] = cons[threadIdx.x];
	__syncthreads();

	if(row >= rows)
		return;

	int base = row * cols;
	for(int k = 0; k < numc; k += 2)
	{
		/*shared[k] = column index, shared[k+1] = required constant.*/
		if(dop1[base + shared[k]] != shared[k + 1])
			return;
	}
	res[row] = 1;
}
|
||||
/*If we already have an array of marks (perhaps because the selfjoin was applied first),
  we unmark any rows that do not comply with the selections. Same pair encoding and
  shared-memory requirements as marcar2; rows already unmarked are skipped.*/
__global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res)
{
	extern __shared__ int shared[];
	int row = blockIdx.x * blockDim.x + threadIdx.x;

	/*Stage the selection pairs in shared memory.*/
	if(threadIdx.x < numc)
		shared[threadIdx.x] = cons[threadIdx.x];
	__syncthreads();

	if(row >= rows)
		return;
	if(res[row] == 0)
		return;

	int base = row * cols;
	for(int k = 0; k < numc; k += 2)
	{
		if(dop1[base + shared[k]] != shared[k + 1])
		{
			res[row] = 0;
			return;
		}
	}
}
|
||||
|
||||
/*Unmark all rows that do not comply with the selfjoins. 'dhead' (cont ints, staged in
  dynamic shared memory; requires cont <= blockDim.x) encodes groups of column indices:
  a group starts at shared[x] and continues while the following values are > -1;
  presumably each group is terminated by a negative sentinel — TODO confirm encoding.
  Every column in a group must hold the same value for the row to stay marked.*/
__global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int temp, temp2, pos, x, y;
	if(threadIdx.x < cont)
		shared[threadIdx.x] = dhead[threadIdx.x];
	__syncthreads();
	if(id < rows)
	{
		/*Skip rows a previous filter already rejected.*/
		if(res[id] == 0)
			return;
		pos = id * cols;
		for(x = 0; x < cont; x++)
		{
			/*Group leader: the value every other column in the group must match.*/
			temp = dop1[pos+shared[x]];
			y = x + 1;
			temp2 = shared[y];
			while(temp2 > -1)
			{
				if(temp != dop1[temp2+pos])
				{
					res[id] = 0;
					return;
				}
				y++;
				temp2 = shared[y];
			}
			/*Jump past this group's sentinel (the for's x++ lands on the next
			  group leader).*/
			x = y;
		}
	}
}
||||
|
||||
/*Mark all rows that comply with the selfjoins. Same sentinel-delimited group encoding of
  'dhead' as samejoin, but used as the FIRST filter: instead of clearing marks it sets
  res[id] = 1 only for rows where every group's columns agree. res must be
  zero-initialized by the caller. Requires cont <= blockDim.x and cont * sizeof(int)
  dynamic shared memory.*/
__global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int temp, temp2, pos, x, y;
	if(threadIdx.x < cont)
		shared[threadIdx.x] = dhead[threadIdx.x];
	__syncthreads();
	if(id < rows)
	{
		pos = id * cols;
		for(x = 0; x < cont; x++)
		{
			/*Group leader value; all following columns (until the negative
			  sentinel) must match it, else the row stays unmarked.*/
			temp = dop1[pos+shared[x]];
			y = x + 1;
			temp2 = shared[y];
			while(temp2 > -1)
			{
				if(temp != dop1[temp2+pos])
					return;
				y++;
				temp2 = shared[y];
			}
			/*Skip the sentinel; x++ moves to the next group leader.*/
			x = y;
		}
		res[id] = 1;
	}
}
|
||||
|
||||
/*Project all columns found in 'dhead' to a new array 'res': for each input row, the
  hsize columns listed in dhead are copied (in order) into a packed hsize-wide output
  row. Requires hsize <= blockDim.x and hsize * sizeof(int) dynamic shared memory.*/
__global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize, int *res)
{
	extern __shared__ int shared[];
	int row = blockIdx.x * blockDim.x + threadIdx.x;

	/*Stage the projection column list in shared memory.*/
	if(threadIdx.x < hsize)
		shared[threadIdx.x] = dhead[threadIdx.x];
	__syncthreads();

	if(row >= rows)
		return;

	int src = row * cols;
	int dst = row * hsize;
	for(int k = 0; k < hsize; k++)
		res[dst + k] = dop1[src + shared[k]];
}
|
||||
|
||||
/*Project all columns found in 'dhead' using only the rows marked as valid (i.e. those
  that complied with selections, selfjoins, etc.). 'temp' is the inclusive prefix sum of
  the marks with a leading zero, so temp[id] is this row's output slot and
  temp[id+1] != temp[id] means the row was marked. Requires hsize <= blockDim.x and
  hsize * sizeof(int) dynamic shared memory.*/
__global__ void llenarproyectar(int *dop1, int rows, int cols, int *temp, int *dhead, int hsize, int *res)
{
	extern __shared__ int shared[];
	int row = blockIdx.x * blockDim.x + threadIdx.x;

	/*Stage the projection column list in shared memory.*/
	if(threadIdx.x < hsize)
		shared[threadIdx.x] = dhead[threadIdx.x];
	__syncthreads();

	if(row >= rows)
		return;

	int slot = temp[row];
	if(temp[row + 1] == slot)
		return; /* row was filtered out */

	int src = row * cols;
	int dst = slot * hsize;
	for(int k = 0; k < hsize; k++)
		res[dst + k] = dop1[src + shared[k]];
}
|
||||
|
||||
/*Performs selections, selfjoins and comparison predicates when the rule has a single
  normal predicate, then projects the surviving rows. Writes the resulting device buffer
  to *ret and returns its row count (0 if no row survives; *ret untouched in that case).
  The original three near-identical branches are unified: the first active filter uses
  the marking kernel variant (marcar2 / samejoin2 / bpreds*2), later filters use the
  unmarking variants. Fixes a device-memory leak: dhead and temp were never freed on the
  num == 0 early returns.*/
int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int numselect, int *selfjoin, int numselfj, int *preds, int numpreds, int *project, int **ret, int ANDlogic)
{
	int *fres = NULL, *temp = NULL;
	int *dhead = NULL, tmplen;
	int size, size2, num;
	thrust::device_ptr<int> res;

#if TIMER
	cuda_stats.selects++;
#endif

	/*One scratch buffer big enough for the largest column list we will upload.*/
	int head_bytes = maximo(4, numselect, numselfj, numpreds, head_size) * sizeof(int);
	reservar(&dhead, head_bytes);
	int numthreads = 1024;
	int blockllen = rows / numthreads + 1;

#ifdef ROCKIT
	ANDlogic = 1;
#endif

	/*Fast path: nothing to filter, project every row.*/
	if(numselect == 0 && numselfj == 0 && numpreds == 0)
	{
		size = head_size * sizeof(int);
		reservar(&fres, rows * size);
		cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
		proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
		cudaFree(dhead);
		*ret = fres;
		return rows;
	}

	/*Mark array with one extra leading zero so the inclusive scan below doubles as
	  an exclusive scan of the marks.*/
	tmplen = rows + 1;
	size2 = tmplen * sizeof(int);
	reservar(&temp, size2);
	cudaMemset(temp, 0, size2);

	int first = 1; /* first filter marks rows; subsequent filters unmark */
	if(numselect > 0)
	{
		size = numselect * sizeof(int);
		cudaMemcpy(dhead, select, size, cudaMemcpyHostToDevice);
		marcar2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);
		first = 0;
	}
	if(numselfj > 0)
	{
		size = numselfj * sizeof(int);
		cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
		if(first)
			samejoin2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
		else
			samejoin<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
		first = 0;
	}
	if(numpreds > 0)
	{
		size = numpreds * sizeof(int);
		cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
		if(first)
		{
			if(ANDlogic)
				bpredsnormal2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
			else
				bpredsorlogic2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
		}
		else
		{
			if(ANDlogic)
				bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
			else
				bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
		}
	}

	/*Prefix-sum the marks; temp[rows] is then the surviving-row count and temp
	  doubles as the compaction index for llenarproyectar.*/
	res = thrust::device_pointer_cast(temp);
	thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
	num = res[rows];
	if(num == 0)
	{
		/*Fix: the original leaked both buffers here.*/
		cudaFree(dhead);
		cudaFree(temp);
		return 0;
	}

	size = head_size * sizeof(int);
	reservar(&fres, num * size);
	cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
	llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
	cudaFree(dhead);
	cudaFree(temp);
	*ret = fres;
	return num;
}
|
1279
packages/cuda/old/treeb.cu
Executable file
1279
packages/cuda/old/treeb.cu
Executable file
File diff suppressed because it is too large
Load Diff
763
packages/cuda/old/union2.cu
Executable file
763
packages/cuda/old/union2.cu
Executable file
@ -0,0 +1,763 @@
|
||||
/*Computer generated file to remove duplicates. Since Thrust's unique and sort, unlike their std's counterparts, don't have a way to specify the size of each element in
|
||||
the array, comparing pairs, triplets and other sets is not possible without defining a new pointer and all related operations for each set. If you have a better idea to do
|
||||
this, please don't hesitate to email us.*/
|
||||
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/unique.h>
|
||||
#include <thrust/distance.h>
|
||||
#include <thrust/sort.h>
|
||||
#include <iostream>
|
||||
#include "memory.h"
|
||||
#include "union2.h"
|
||||
|
||||
/* Sort-and-deduplicate 'rows' tuples of 'tipo' ints each, stored flat in the
   device buffer 'res'. Tuple widths 1..20 map to the generated comparator
   functors oN (ordering), pN (equality) and qN (equality used for the extra
   'final' dedup pass). When the result shrank below half the input, the data
   is compacted into a freshly reserved buffer and *ret is repointed to it.
   Returns the number of distinct tuples, or 0 for an unsupported 'tipo'.
   NOTE(review): on std::bad_alloc the sort/unique is retried after limpiar()
   releases memory — this loop spins until the operation succeeds, matching
   the original generated code. */
int unir(int *res, int rows, int tipo, int **ret, int final)
{
	int flag, nrows, *nres, size;

#if TIMER
	cuda_stats.unions++;
#endif

/* Each tuple width N >= 2 runs the identical sort/unique retry protocol,
   differing only in the element struct sN and its functors; the macro below
   replaces the 19 hand-expanded copies from the generated original. */
#define UNIR_CASE(N)								\
	case N:									\
	{									\
		thrust::device_ptr<s##N> ini =					\
			thrust::device_pointer_cast((s##N*)res);		\
		thrust::device_ptr<s##N> fin;					\
		flag = 0;							\
		while(flag != 1)						\
		{								\
			try							\
			{							\
				thrust::sort(ini, ini + rows, o##N());		\
				if(final)					\
				{						\
					fin = thrust::unique(ini, ini + rows, q##N()); \
					fin = thrust::unique(ini, fin, p##N());	\
				}						\
				else						\
					fin = thrust::unique(ini, ini + rows, p##N()); \
				flag = 1;					\
			}							\
			catch(std::bad_alloc &e)				\
			{							\
				limpiar("sort/unique in unir", 0);		\
			}							\
		}								\
		nrows = thrust::distance(ini, fin);				\
		break;								\
	}

	switch(tipo)
	{
		case 1:
		{
			/* Single-int tuples: default ordering/equality, q1 only
			   for the extra 'final' pass. */
			thrust::device_ptr<int> ini = thrust::device_pointer_cast(res);
			thrust::device_ptr<int> fin;
			flag = 0;
			while(flag != 1)
			{
				try
				{
					thrust::sort(ini, ini + rows);
					if(final)
					{
						fin = thrust::unique(ini, ini + rows, q1());
						fin = thrust::unique(ini, fin);
					}
					else
						fin = thrust::unique(ini, ini + rows);
					flag = 1;
				}
				catch(std::bad_alloc &e)
				{
					limpiar("sort/unique in unir", 0);
				}
			}
			nrows = thrust::distance(ini, fin);
			break;
		}
		UNIR_CASE(2)
		UNIR_CASE(3)
		UNIR_CASE(4)
		UNIR_CASE(5)
		UNIR_CASE(6)
		UNIR_CASE(7)
		UNIR_CASE(8)
		UNIR_CASE(9)
		UNIR_CASE(10)
		UNIR_CASE(11)
		UNIR_CASE(12)
		UNIR_CASE(13)
		UNIR_CASE(14)
		UNIR_CASE(15)
		UNIR_CASE(16)
		UNIR_CASE(17)
		UNIR_CASE(18)
		UNIR_CASE(19)
		UNIR_CASE(20)
		default:
			return 0;
	}
#undef UNIR_CASE

	/* Compact only when at least half the rows were duplicates, trading a
	   device-to-device copy for the reclaimed memory. */
	if(nrows < rows / 2)
	{
		size = nrows * tipo * sizeof(int);
		reservar(&nres, size);
		cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
		cudaFree(*ret);
		*ret = nres;
	}
	return nrows;
}
|
1005
packages/cuda/old/union2.h
Executable file
1005
packages/cuda/old/union2.h
Executable file
File diff suppressed because it is too large
Load Diff
0
packages/cuda/pred.h
Executable file → Normal file
0
packages/cuda/pred.h
Executable file → Normal file
103
packages/cuda/selectproyect.cu
Executable file → Normal file
103
packages/cuda/selectproyect.cu
Executable file → Normal file
@ -1,3 +1,4 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/scan.h>
|
||||
#include <stdlib.h>
|
||||
@ -8,10 +9,10 @@
|
||||
__global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int x, rowact, posact;
|
||||
if(threadIdx.x < numc)
|
||||
shared[threadIdx.x] = cons[threadIdx.x];
|
||||
if(hipThreadIdx_x < numc)
|
||||
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
@ -30,10 +31,10 @@ we unmark any rows that do not comply with the selections*/
|
||||
__global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int x, rowact, posact;
|
||||
if(threadIdx.x < numc)
|
||||
shared[threadIdx.x] = cons[threadIdx.x];
|
||||
if(hipThreadIdx_x < numc)
|
||||
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
@ -56,10 +57,10 @@ __global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *
|
||||
__global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int temp, temp2, pos, x, y;
|
||||
if(threadIdx.x < cont)
|
||||
shared[threadIdx.x] = dhead[threadIdx.x];
|
||||
if(hipThreadIdx_x < cont)
|
||||
shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
@ -90,10 +91,10 @@ __global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, in
|
||||
__global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int temp, temp2, pos, x, y;
|
||||
if(threadIdx.x < cont)
|
||||
shared[threadIdx.x] = dhead[threadIdx.x];
|
||||
if(hipThreadIdx_x < cont)
|
||||
shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
@ -120,10 +121,10 @@ __global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, i
|
||||
__global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize, int *res)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int pos, posr, x;
|
||||
if(threadIdx.x < hsize)
|
||||
shared[threadIdx.x] = dhead[threadIdx.x];
|
||||
if(hipThreadIdx_x < hsize)
|
||||
shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
@ -139,10 +140,10 @@ selections, selfjoins, etc.). The array 'temp' holds the result of the prefix su
|
||||
__global__ void llenarproyectar(int *dop1, int rows, int cols, int *temp, int *dhead, int hsize, int *res)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int pos, posr, x;
|
||||
if(threadIdx.x < hsize)
|
||||
shared[threadIdx.x] = dhead[threadIdx.x];
|
||||
if(hipThreadIdx_x < hsize)
|
||||
shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows)
|
||||
{
|
||||
@ -184,27 +185,27 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
|
||||
tmplen = rows + 1;
|
||||
size2 = tmplen * sizeof(int);
|
||||
reservar(&temp, size2);
|
||||
cudaMemset(temp, 0, size2);
|
||||
hipMemset(temp, 0, size2);
|
||||
size = numselect * sizeof(int);
|
||||
cudaMemcpy(dhead, select, size, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dhead, select, size, hipMemcpyHostToDevice);
|
||||
|
||||
marcar2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselect, temp + 1);
|
||||
|
||||
if(numselfj > 0)
|
||||
{
|
||||
size = numselfj * sizeof(int);
|
||||
cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
|
||||
samejoin<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
|
||||
hipMemcpy(dhead, selfjoin, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselfj, temp + 1);
|
||||
}
|
||||
|
||||
if(numpreds > 0)
|
||||
{
|
||||
size = numpreds * sizeof(int);
|
||||
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);
|
||||
if(ANDlogic)
|
||||
bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
else
|
||||
bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
}
|
||||
|
||||
res = thrust::device_pointer_cast(temp);
|
||||
@ -215,10 +216,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
|
||||
|
||||
size = head_size * sizeof(int);
|
||||
reservar(&fres, num * size);
|
||||
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
|
||||
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
|
||||
cudaFree(dhead);
|
||||
cudaFree(temp);
|
||||
hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
|
||||
hipFree(dhead);
|
||||
hipFree(temp);
|
||||
*ret = fres;
|
||||
return num;
|
||||
}
|
||||
@ -229,19 +230,19 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
|
||||
tmplen = rows + 1;
|
||||
size2 = tmplen * sizeof(int);
|
||||
reservar(&temp, size2);
|
||||
cudaMemset(temp, 0, size2);
|
||||
hipMemset(temp, 0, size2);
|
||||
size = numselfj * sizeof(int);
|
||||
cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
|
||||
samejoin2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
|
||||
hipMemcpy(dhead, selfjoin, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselfj, temp + 1);
|
||||
|
||||
if(numpreds > 0)
|
||||
{
|
||||
size = numpreds * sizeof(int);
|
||||
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);
|
||||
if(ANDlogic)
|
||||
bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
else
|
||||
bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
|
||||
}
|
||||
|
||||
@ -253,10 +254,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
|
||||
|
||||
size = head_size * sizeof(int);
|
||||
reservar(&fres, num * size);
|
||||
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
|
||||
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
|
||||
cudaFree(dhead);
|
||||
cudaFree(temp);
|
||||
hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
|
||||
hipFree(dhead);
|
||||
hipFree(temp);
|
||||
*ret = fres;
|
||||
return num;
|
||||
}
|
||||
@ -267,14 +268,14 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
|
||||
tmplen = rows + 1;
|
||||
size2 = tmplen * sizeof(int);
|
||||
reservar(&temp, size2);
|
||||
cudaMemset(temp, 0, size2);
|
||||
hipMemset(temp, 0, size2);
|
||||
size = numpreds * sizeof(int);
|
||||
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);
|
||||
|
||||
if(ANDlogic)
|
||||
bpredsnormal2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
else
|
||||
bpredsorlogic2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
|
||||
res = thrust::device_pointer_cast(temp);
|
||||
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
|
||||
num = res[rows];
|
||||
@ -284,10 +285,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
|
||||
|
||||
size = head_size * sizeof(int);
|
||||
reservar(&fres, num * size);
|
||||
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
|
||||
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
|
||||
cudaFree(dhead);
|
||||
cudaFree(temp);
|
||||
hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
|
||||
hipFree(dhead);
|
||||
hipFree(temp);
|
||||
*ret = fres;
|
||||
return num;
|
||||
}
|
||||
@ -295,9 +296,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
|
||||
{
|
||||
size = head_size * sizeof(int);
|
||||
reservar(&fres, rows * size);
|
||||
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
|
||||
proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
|
||||
cudaFree(dhead);
|
||||
hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(proyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, head_size, fres);
|
||||
hipFree(dhead);
|
||||
*ret = fres;
|
||||
return rows;
|
||||
}
|
||||
|
0
packages/cuda/selectproyectcpu.cpp
Executable file → Normal file
0
packages/cuda/selectproyectcpu.cpp
Executable file → Normal file
347
packages/cuda/treeb.cu
Executable file → Normal file
347
packages/cuda/treeb.cu
Executable file → Normal file
@ -1,3 +1,4 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
#include <thrust/host_vector.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/sequence.h>
|
||||
@ -160,11 +161,11 @@ __device__ int firstMatchingKeyInDataNode2(Record records[], IKeyType key)
|
||||
|
||||
__global__ void gCreateIndex(IDataNode data[], IDirectoryNode dir[], int dirSize, int tree_size, int bottom_start, int nNodesPerBlock)
|
||||
{
|
||||
int startIdx = blockIdx.x * nNodesPerBlock;
|
||||
int startIdx = hipBlockIdx_x * nNodesPerBlock;
|
||||
int endIdx = startIdx + nNodesPerBlock;
|
||||
if(endIdx > dirSize)
|
||||
endIdx = dirSize;
|
||||
int keyIdx = threadIdx.x;
|
||||
int keyIdx = hipThreadIdx_x;
|
||||
|
||||
// Proceed only when in internal nodes
|
||||
for(int nodeIdx = startIdx; nodeIdx < endIdx; nodeIdx++)
|
||||
@ -191,11 +192,11 @@ __global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir
|
||||
{
|
||||
// Bringing the root node (visited by every tuple) to the faster shared memory
|
||||
__shared__ IKeyType RootNodeKeys[TREE_NODE_SIZE];
|
||||
RootNodeKeys[threadIdx.x] = dir->keys[threadIdx.x];
|
||||
RootNodeKeys[hipThreadIdx_x] = dir->keys[hipThreadIdx_x];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
int OverallThreadIdx = blockIdx.x * THRD_PER_BLCK_search + threadIdx.x;
|
||||
int OverallThreadIdx = hipBlockIdx_x * THRD_PER_BLCK_search + hipThreadIdx_x;
|
||||
|
||||
for(int keyIdx = OverallThreadIdx; keyIdx < nSearchKeys; keyIdx += THRD_PER_GRID_search)
|
||||
{
|
||||
@ -219,7 +220,7 @@ __global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir
|
||||
/*Counts the number of times a row in 'S' is to be joined to a row in 'R'.*/
|
||||
__global__ void gIndexJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[])
|
||||
{
|
||||
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
|
||||
if(s_cur < sLen)
|
||||
{
|
||||
@ -246,11 +247,11 @@ in 'g_locations' those rows that have equal values in the checked columns.*/
|
||||
__global__ void gIndexMultiJoinNegative(int *R, int *S, int g_locations[], int rLen, int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int posr, poss, x;
|
||||
|
||||
if(threadIdx.x < wj)
|
||||
shared[threadIdx.x] = muljoin[threadIdx.x];
|
||||
if(hipThreadIdx_x < wj)
|
||||
shared[hipThreadIdx_x] = muljoin[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
|
||||
if(r_cur < rLen)
|
||||
@ -287,11 +288,11 @@ times a row in 'S' is to be joined to its corresponding row in 'R', storing the
|
||||
__global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[], int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int posr, poss, x;
|
||||
|
||||
if(threadIdx.x < wj)
|
||||
shared[threadIdx.x] = muljoin[threadIdx.x];
|
||||
if(hipThreadIdx_x < wj)
|
||||
shared[hipThreadIdx_x] = muljoin[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
|
||||
if(s_cur < sLen)
|
||||
@ -330,10 +331,10 @@ __global__ void multiJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int *extjoins = &shared[lenrul];
|
||||
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
|
||||
if(threadIdx.x < (lenrul + wj))
|
||||
shared[threadIdx.x] = rule[threadIdx.x];
|
||||
if(hipThreadIdx_x < (lenrul + wj))
|
||||
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
|
||||
if(s_cur < sLen)
|
||||
@ -382,10 +383,10 @@ __global__ void multiJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSum
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int *extjoins = &shared[cols];
|
||||
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
|
||||
if(threadIdx.x < (cols + wj))
|
||||
shared[threadIdx.x] = rule[threadIdx.x];
|
||||
if(hipThreadIdx_x < (cols + wj))
|
||||
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
|
||||
if(s_cur < sLen)
|
||||
@ -432,11 +433,11 @@ predicate are projected.*/
|
||||
__global__ void gJoinWithWriteNegative(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int halfrul, int *mloc)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int posr;
|
||||
|
||||
if(threadIdx.x < halfrul)
|
||||
shared[threadIdx.x] = rule[threadIdx.x];
|
||||
if(hipThreadIdx_x < halfrul)
|
||||
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
|
||||
if(r_cur < rLen)
|
||||
@ -461,11 +462,11 @@ predicate are projected.*/
|
||||
__global__ void gJoinWithWriteNegative2(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int cols, int *mloc)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int posr;
|
||||
|
||||
if(threadIdx.x < cols)
|
||||
shared[threadIdx.x] = rule[threadIdx.x];
|
||||
if(hipThreadIdx_x < cols)
|
||||
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
|
||||
if(r_cur < rLen)
|
||||
@ -489,10 +490,10 @@ __global__ void gJoinWithWriteNegative2(int g_locations[], int rLen, int g_joinR
|
||||
__global__ void gJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
|
||||
if(threadIdx.x < lenrul)
|
||||
shared[threadIdx.x] = rule[threadIdx.x];
|
||||
if(hipThreadIdx_x < lenrul)
|
||||
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
|
||||
if(s_cur < sLen)
|
||||
@ -525,10 +526,10 @@ projection, which is performed based on the variables in the head of the rule.*/
|
||||
__global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
|
||||
if(threadIdx.x < cols)
|
||||
shared[threadIdx.x] = rule[threadIdx.x];
|
||||
if(hipThreadIdx_x < cols)
|
||||
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
|
||||
if(s_cur < sLen)
|
||||
@ -563,7 +564,7 @@ __global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[],
|
||||
/*Load part of column 'wj' of 'p' in 'R'. Which values are loaded is defined by the prefix sum results in 'pos'.*/
|
||||
__global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *ids)
|
||||
{
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int cond;
|
||||
if(id < len)
|
||||
{
|
||||
@ -579,7 +580,7 @@ __global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *i
|
||||
/*Load an entire column from 'p' into 'R'.*/
|
||||
__global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
|
||||
{
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
if(id < len)
|
||||
R[id] = p[id * of + wj];
|
||||
}
|
||||
@ -587,10 +588,10 @@ __global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
|
||||
__global__ void projectfinal(int *res, int rows, int cols, int *rule, int *out)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
|
||||
if(threadIdx.x < cols)
|
||||
shared[threadIdx.x] = rule[threadIdx.x];
|
||||
if(hipThreadIdx_x < cols)
|
||||
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
|
||||
if(id < rows)
|
||||
@ -614,26 +615,26 @@ void project(int *res, int resrows, int numcols1, int numcols2, int *proj, int *
|
||||
int *pt = (int *)malloc(sizepro);
|
||||
for(z = 0; z < numcols2; z++)
|
||||
pt[z] = proj[z] - 1;
|
||||
cudaMemcpy(dcons, pt, sizepro, cudaMemcpyHostToDevice);
|
||||
//cudaDeviceSynchronize(); //Small cudaMemcpys are asynchronous, uncomment this line if the pointer is being liberated before it is copied.
|
||||
hipMemcpy(dcons, pt, sizepro, hipMemcpyHostToDevice);
|
||||
//hipDeviceSynchronize(); //Small cudaMemcpys are asynchronous, uncomment this line if the pointer is being liberated before it is copied.
|
||||
free(pt);
|
||||
}
|
||||
else
|
||||
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
|
||||
reservar(&d_Rout, resrows * sizepro);
|
||||
projectfinal<<<blockllen, numthreads, sizepro>>>(res, resrows, numcols1, dcons, d_Rout);
|
||||
cudaFree(dcons);
|
||||
cudaFree(*ret);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(projectfinal), dim3(blockllen), dim3(numthreads), sizepro, 0, res, resrows, numcols1, dcons, d_Rout);
|
||||
hipFree(dcons);
|
||||
hipFree(*ret);
|
||||
*ret = d_Rout;
|
||||
}
|
||||
|
||||
__global__ void projectadd(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, int *dhead, int hsize, int *res)
|
||||
{
|
||||
extern __shared__ int shared[];
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
int pos2, posr, x, y, cond;
|
||||
if(threadIdx.x < hsize)
|
||||
shared[threadIdx.x] = dhead[threadIdx.x];
|
||||
if(hipThreadIdx_x < hsize)
|
||||
shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
|
||||
__syncthreads();
|
||||
if(id < rows2)
|
||||
{
|
||||
@ -662,10 +663,10 @@ void juntar(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, in
|
||||
int blockllen = rows2 / numthreads + 1;
|
||||
sizepro = pcols * sizeof(int);
|
||||
reservar(&dcons, sizepro);
|
||||
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
|
||||
reservar(&d_Rout, rows1 * rows2 * sizepro);
|
||||
projectadd<<<blockllen, numthreads, sizepro>>>(dop1, dop2, rows1, rows2, cols1, cols2, dcons, pcols, d_Rout);
|
||||
cudaFree(dcons);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(projectadd), dim3(blockllen), dim3(numthreads), sizepro, 0, dop1, dop2, rows1, rows2, cols1, cols2, dcons, pcols, d_Rout);
|
||||
hipFree(dcons);
|
||||
*ret = d_Rout;
|
||||
}
|
||||
|
||||
@ -743,51 +744,51 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
|
||||
#ifdef TIMER
|
||||
//cout << "INICIO" << endl;
|
||||
cudaEvent_t start, stop;
|
||||
hipEvent_t start, stop;
|
||||
float time;
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
cudaEventRecord(start, 0);
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
hipEventRecord(start, 0);
|
||||
#endif
|
||||
|
||||
if(npred2.x > 0 || npred2.y > 0 || nsel2 > 0 || nsj2 > 0)
|
||||
{
|
||||
newLen = sLen + 1;
|
||||
cudaMemsetAsync(temp, 0, newLen * sizeof(int));
|
||||
hipMemsetAsync(temp, 0, newLen * sizeof(int));
|
||||
}
|
||||
|
||||
if(npred2.x > 0 || npred2.y > 0)
|
||||
{
|
||||
size = npred2tot * sizeof(int);
|
||||
cudaMemcpy(dcons, pred2, size, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dcons, pred2, size, hipMemcpyHostToDevice);
|
||||
|
||||
if(npred2.y > 0) /*Fix case when a(X,Y),b(Y,Z),Z > Y*/
|
||||
{
|
||||
reservar(&temp2, sizet2);
|
||||
cudaMemsetAsync(temp2, 0, newLen * sizeof(int));
|
||||
hipMemsetAsync(temp2, 0, newLen * sizeof(int));
|
||||
//res = thrust::device_pointer_cast(temp2);
|
||||
bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, temp2 + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpreds), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, temp2 + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(negative)
|
||||
bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpreds), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
|
||||
else
|
||||
bpredsOR<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsOR), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
|
||||
}
|
||||
|
||||
if(nsel2 > 0)
|
||||
{
|
||||
size = nsel2 * sizeof(int);
|
||||
cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
|
||||
marcar<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
|
||||
hipMemcpy(dcons, sel2, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(marcar), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsel2, temp + 1);
|
||||
}
|
||||
|
||||
if(nsj2 > 0)
|
||||
{
|
||||
size = nsj2 * sizeof(int);
|
||||
cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
|
||||
samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
|
||||
hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -795,14 +796,14 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
if(nsel2 > 0)
|
||||
{
|
||||
size = nsel2 * sizeof(int);
|
||||
cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
|
||||
marcar2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
|
||||
hipMemcpy(dcons, sel2, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsel2, temp + 1);
|
||||
|
||||
if(nsj2 > 0)
|
||||
{
|
||||
size = nsj2 * sizeof(int);
|
||||
cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
|
||||
samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
|
||||
hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -810,15 +811,15 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
if(nsj2 > 0)
|
||||
{
|
||||
size = nsj2 * sizeof(int);
|
||||
cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
|
||||
samejoin2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
|
||||
hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
sizem32S = m32sLen * sizeof(int);
|
||||
reservar(&d_S, sizem32S);
|
||||
cudaMemsetAsync(d_S + sLen, 0x7f, extraspaceS * sizeof(int));
|
||||
llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]);
|
||||
hipMemsetAsync(d_S + sLen, 0x7f, extraspaceS * sizeof(int));
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p2, d_S, sLen, of2, wherej[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -842,8 +843,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
|
||||
if(newLen == 0) // && !negative) ARREGLAR
|
||||
{
|
||||
cudaFree(temp);
|
||||
cudaFree(dcons);
|
||||
hipFree(temp);
|
||||
hipFree(dcons);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -854,24 +855,24 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
|
||||
reservar(&d_S, sizem32S);
|
||||
reservar(&posS, sizem32S);
|
||||
cudaMemsetAsync(d_S + newLen, 0x7f, sizextra);
|
||||
cudaMemsetAsync(posS + newLen, 0x7f, sizextra);
|
||||
llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
|
||||
hipMemsetAsync(d_S + newLen, 0x7f, sizextra);
|
||||
hipMemsetAsync(posS + newLen, 0x7f, sizextra);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(llenar), dim3(blockllen), dim3(numthreads), 0, 0, p2, d_S, sLen, of2, wherej[1], temp, posS);
|
||||
sLen = newLen;
|
||||
}
|
||||
|
||||
#ifdef TIMER
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventElapsedTime(&time, start, stop);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
hipEventElapsedTime(&time, start, stop);
|
||||
//cout << "Select1 = " << time << endl;
|
||||
cuda_stats.select1_time += time;
|
||||
|
||||
cudaEventDestroy(start);
|
||||
cudaEventDestroy(stop);
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
cudaEventRecord(start, 0);
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
hipEventRecord(start, 0);
|
||||
#endif
|
||||
|
||||
blockllen = rLen / numthreads + 1;
|
||||
@ -880,30 +881,30 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
{
|
||||
if(temp2 != NULL)
|
||||
{
|
||||
cudaFree(temp);
|
||||
hipFree(temp);
|
||||
temp = temp2;
|
||||
res = thrust::device_pointer_cast(temp);
|
||||
newLen = rLen + 1;
|
||||
if(nsel1 > 0)
|
||||
{
|
||||
size = nsel1 * sizeof(int);
|
||||
cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
|
||||
marcar<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
|
||||
hipMemcpy(dcons, sel1, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(marcar), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsel1, temp + 1);
|
||||
}
|
||||
if(nsj1 > 0)
|
||||
{
|
||||
size = nsj1 * sizeof(int);
|
||||
cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
|
||||
samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
|
||||
hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);
|
||||
}
|
||||
if(npred1.x > 0)
|
||||
{
|
||||
size = npred1.x * sizeof(int);
|
||||
cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
|
||||
if(ANDlogic)
|
||||
bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
else
|
||||
bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -911,30 +912,30 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
if(npred1.x > 0 || nsel1 > 0 || nsj1 > 0)
|
||||
{
|
||||
newLen = rLen + 1;
|
||||
cudaMemsetAsync(temp, 0, newLen * sizeof(int));
|
||||
hipMemsetAsync(temp, 0, newLen * sizeof(int));
|
||||
}
|
||||
|
||||
if(nsel1 > 0)
|
||||
{
|
||||
size = nsel1 * sizeof(int);
|
||||
cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
|
||||
marcar2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
|
||||
hipMemcpy(dcons, sel1, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsel1, temp + 1);
|
||||
|
||||
if(nsj1 > 0)
|
||||
{
|
||||
size = nsj1 * sizeof(int);
|
||||
cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
|
||||
samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
|
||||
hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);
|
||||
}
|
||||
|
||||
if(npred1.x > 0)
|
||||
{
|
||||
size = npred1.x * sizeof(int);
|
||||
cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
|
||||
if(ANDlogic)
|
||||
bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
else
|
||||
bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -942,17 +943,17 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
if(nsj1 > 0)
|
||||
{
|
||||
size = nsj1 * sizeof(int);
|
||||
cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
|
||||
samejoin2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
|
||||
hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);
|
||||
|
||||
if(npred1.x > 0)
|
||||
{
|
||||
size = npred1.x * sizeof(int);
|
||||
cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
|
||||
if(ANDlogic)
|
||||
bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
else
|
||||
bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -960,11 +961,11 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
if(npred1.x > 0)
|
||||
{
|
||||
size = npred1.x * sizeof(int);
|
||||
cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
|
||||
if(ANDlogic)
|
||||
bpredsnormal2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
else
|
||||
bpredsorlogic2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -976,11 +977,11 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
newLen = res[rLen];
|
||||
if(newLen == 0)
|
||||
{
|
||||
cudaFree(temp);
|
||||
cudaFree(dcons);
|
||||
cudaFree(d_S);
|
||||
hipFree(temp);
|
||||
hipFree(dcons);
|
||||
hipFree(d_S);
|
||||
if(posS != NULL)
|
||||
cudaFree(posS);
|
||||
hipFree(posS);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -991,41 +992,41 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
|
||||
reservar(&d_R, sizem32);
|
||||
reservar(&posR, sizem32);
|
||||
cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
|
||||
cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
|
||||
llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
|
||||
hipMemsetAsync(d_R + newLen, 0x7f, sizextra);
|
||||
hipMemsetAsync(posR + newLen, 0x7f, sizextra);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(llenar), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0], temp, posR);
|
||||
rLen = newLen;
|
||||
}
|
||||
else
|
||||
{
|
||||
sizem32 = m32rLen * sizeof(int);
|
||||
reservar(&d_R, sizem32);
|
||||
cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
|
||||
llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
|
||||
hipMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
sizem32 = m32rLen * sizeof(int);
|
||||
reservar(&d_R, sizem32);
|
||||
cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
|
||||
llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
|
||||
hipMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0]);
|
||||
}
|
||||
|
||||
#ifdef TIMER
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventElapsedTime(&time, start, stop);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
hipEventElapsedTime(&time, start, stop);
|
||||
//cout << "Select2 = " << time << endl;
|
||||
cuda_stats.select2_time += time;
|
||||
#endif
|
||||
|
||||
#ifdef TIMER
|
||||
cudaEventDestroy(start);
|
||||
cudaEventDestroy(stop);
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
cudaEventRecord(start, 0);
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
hipEventRecord(start, 0);
|
||||
#endif
|
||||
|
||||
thrust::device_ptr<Record> dvp1;
|
||||
@ -1084,17 +1085,17 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
}
|
||||
|
||||
#ifdef TIMER
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventElapsedTime(&time, start, stop);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
hipEventElapsedTime(&time, start, stop);
|
||||
//cout << "Sort = " << time << endl;
|
||||
cuda_stats.sort_time += time;
|
||||
|
||||
cudaEventDestroy(start);
|
||||
cudaEventDestroy(stop);
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
cudaEventRecord(start, 0);
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
hipEventRecord(start, 0);
|
||||
#endif
|
||||
|
||||
IDataNode* d_data;
|
||||
@ -1123,7 +1124,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
dim3 Dbc(THRD_PER_BLCK_create, 1, 1);
|
||||
dim3 Dgc(BLCK_PER_GRID_create, 1, 1);
|
||||
|
||||
gCreateIndex <<<Dgc, Dbc>>> (d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(gCreateIndex), dim3(Dgc), dim3(Dbc), 0, 0, d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);
|
||||
|
||||
int *d_locations;
|
||||
int memSizeR;
|
||||
@ -1132,7 +1133,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
{
|
||||
memSizeR = (rLen + 1) * sizeof(int);
|
||||
reservar(&d_locations, memSizeR);
|
||||
cudaMemsetAsync(d_locations, 0, sizeof(int));
|
||||
hipMemsetAsync(d_locations, 0, sizeof(int));
|
||||
nSearchKeys = rLen;
|
||||
}
|
||||
else
|
||||
@ -1146,13 +1147,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
unsigned int nKeysPerThread = uintCeilingDiv(nSearchKeys, THRD_PER_GRID_search);
|
||||
if(negative)
|
||||
{
|
||||
gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_R, d_locations + 1, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
|
||||
cudaMemsetAsync(temp, 0, memSizeR);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(gSearchTree), dim3(Dgs), dim3(Dbs), 0, 0, d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_R, d_locations + 1, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
|
||||
hipMemsetAsync(temp, 0, memSizeR);
|
||||
}
|
||||
else
|
||||
{
|
||||
gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
|
||||
cudaMemsetAsync(temp, 0, memSizeS);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(gSearchTree), dim3(Dgs), dim3(Dbs), 0, 0, d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
|
||||
hipMemsetAsync(temp, 0, memSizeS);
|
||||
}
|
||||
|
||||
int muljoin = 0, muljoinsize = 0, sum;
|
||||
@ -1165,8 +1166,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
{
|
||||
muljoin = numj - 2;
|
||||
muljoinsize = muljoin * sizeof(int);
|
||||
cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
|
||||
gIndexMultiJoinNegative<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations + 1, rLen, p1, p2, of1, of2, posR, posS, dcons, muljoin);
|
||||
hipMemcpy(dcons, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(gIndexMultiJoinNegative), dim3(blockllen), dim3(numthreads), muljoinsize, 0, d_R, d_S, d_locations + 1, rLen, p1, p2, of1, of2, posR, posS, dcons, muljoin);
|
||||
}
|
||||
|
||||
res = thrust::device_pointer_cast(d_locations);
|
||||
@ -1177,21 +1178,21 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
if(pos == (rule->num_rows - 3))
|
||||
{
|
||||
sizepro = rule->num_columns * sizeof(int);
|
||||
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
|
||||
resSize = sum * sizepro;
|
||||
reservar(&d_Rout, resSize);
|
||||
gJoinWithWriteNegative2<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, rule->num_columns, posR);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWriteNegative2), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, rLen, d_Rout, p1, of1, dcons, rule->num_columns, posR);
|
||||
}
|
||||
else
|
||||
{
|
||||
sizepro = projp.x * sizeof(int);
|
||||
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
|
||||
resSize = sum * sizepro;
|
||||
reservar(&d_Rout, resSize);
|
||||
gJoinWithWriteNegative<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, projp.x, posR);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWriteNegative), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, rLen, d_Rout, p1, of1, dcons, projp.x, posR);
|
||||
}
|
||||
cudaFree(d_R);
|
||||
cudaFree(d_S);
|
||||
hipFree(d_R);
|
||||
hipFree(d_S);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1200,26 +1201,26 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
{
|
||||
muljoin = numj - 2;
|
||||
muljoinsize = muljoin * sizeof(int);
|
||||
cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
|
||||
gIndexMultiJoin<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
|
||||
hipMemcpy(dcons, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(gIndexMultiJoin), dim3(blockllen), dim3(numthreads), muljoinsize, 0, d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
|
||||
}
|
||||
else
|
||||
gIndexJoin<<<blockllen, numthreads>>> (d_R, d_S, d_locations, sLen, temp);
|
||||
cudaFree(d_R);
|
||||
cudaFree(d_S);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(gIndexJoin), dim3(blockllen), dim3(numthreads), 0, 0, d_R, d_S, d_locations, sLen, temp);
|
||||
hipFree(d_R);
|
||||
hipFree(d_S);
|
||||
|
||||
sum = res[sLen-1];
|
||||
thrust::exclusive_scan(res, res + sLen, res);
|
||||
sum += res[sLen-1];
|
||||
if(sum == 0)
|
||||
{
|
||||
cudaFree(dcons);
|
||||
cudaFree(d_locations);
|
||||
cudaFree(temp);
|
||||
hipFree(dcons);
|
||||
hipFree(d_locations);
|
||||
hipFree(temp);
|
||||
if(posS != NULL)
|
||||
cudaFree(posS);
|
||||
hipFree(posS);
|
||||
if(posR != NULL)
|
||||
cudaFree(posR);
|
||||
hipFree(posR);
|
||||
return 0;
|
||||
}
|
||||
res[sLen] = sum;
|
||||
@ -1227,49 +1228,49 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
|
||||
if(pos == (rule->num_rows - 3))
|
||||
{
|
||||
sizepro = rule->num_columns * sizeof(int);
|
||||
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
|
||||
resSize = sum * sizepro;
|
||||
reservar(&d_Rout, resSize);
|
||||
if(numj > 2)
|
||||
{
|
||||
cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
|
||||
multiJoinWithWrite2<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
|
||||
hipMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(multiJoinWithWrite2), dim3(blockllen), dim3(numthreads), sizepro + muljoinsize, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
|
||||
}
|
||||
else
|
||||
gJoinWithWrite2<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWrite2), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
|
||||
}
|
||||
else
|
||||
{
|
||||
sizepro = projp.y * sizeof(int);
|
||||
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
|
||||
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
|
||||
resSize = sum * sizepro;
|
||||
reservar(&d_Rout, resSize);
|
||||
if(numj > 2)
|
||||
{
|
||||
cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
|
||||
multiJoinWithWrite<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
|
||||
hipMemcpy(dcons + projp.y, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(multiJoinWithWrite), dim3(blockllen), dim3(numthreads), sizepro + muljoinsize, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
|
||||
}
|
||||
else
|
||||
gJoinWithWrite<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWrite), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
|
||||
}
|
||||
}
|
||||
|
||||
cudaFree(dcons);
|
||||
cudaFree(d_locations);
|
||||
cudaFree(temp);
|
||||
hipFree(dcons);
|
||||
hipFree(d_locations);
|
||||
hipFree(temp);
|
||||
if(posS != NULL)
|
||||
cudaFree(posS);
|
||||
hipFree(posS);
|
||||
if(posR != NULL)
|
||||
cudaFree(posR);
|
||||
hipFree(posR);
|
||||
|
||||
if(*ret != NULL)
|
||||
cudaFree(*ret);
|
||||
hipFree(*ret);
|
||||
*ret = d_Rout;
|
||||
|
||||
#ifdef TIMER
|
||||
cudaEventRecord(stop, 0);
|
||||
cudaEventSynchronize(stop);
|
||||
cudaEventElapsedTime(&time, start, stop);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
hipEventElapsedTime(&time, start, stop);
|
||||
//cout << "Join = " << time << endl;
|
||||
//cout << "FIN" << endl;
|
||||
cuda_stats.join_time += time;
|
||||
|
80
packages/cuda/union2.cu
Executable file → Normal file
80
packages/cuda/union2.cu
Executable file → Normal file
@ -87,8 +87,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -122,8 +122,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -157,8 +157,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -192,8 +192,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -227,8 +227,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -262,8 +262,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -297,8 +297,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -332,8 +332,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -367,8 +367,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -402,8 +402,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -437,8 +437,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -472,8 +472,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -507,8 +507,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -542,8 +542,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -577,8 +577,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -612,8 +612,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -647,8 +647,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -682,8 +682,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -717,8 +717,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
@ -752,8 +752,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
|
||||
{
|
||||
size = nrows * tipo * sizeof(int);
|
||||
reservar(&nres, size);
|
||||
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
|
||||
cudaFree(*ret);
|
||||
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
|
||||
hipFree(*ret);
|
||||
*ret = nres;
|
||||
}
|
||||
return nrows;
|
||||
|
0
packages/cuda/union2.h
Executable file → Normal file
0
packages/cuda/union2.h
Executable file → Normal file
0
packages/cuda/unioncpu2.cpp
Executable file → Normal file
0
packages/cuda/unioncpu2.cpp
Executable file → Normal file
Reference in New Issue
Block a user