python support

2016-06-28 23:47:09 +01:00
parent c2fb631106
commit 3f59ec40cd
30 changed files with 1684 additions and 386 deletions
--- a/packages/ProbLog/problog/timer.yap
+++ b/packages/ProbLog/problog/timer.yap
@@ -240,7 +240,6 @@ timer_stop(Name,Duration) :-
 	->
 	 statistics(walltime,[StopTime,_]),
 	 Duration is StopTime-StartTime;
-
 	 throw(timer_not_started(timer_stop(Name,Duration)))
 	).

--- a/packages/ProbLog/problog_learning.yap
+++ b/packages/ProbLog/problog_learning.yap
@@ -1,4 +1,4 @@
-1%%% -*- Mode: Prolog; -*-
+%%% -*- Mode: Prolog; -*-

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %
@@ -567,6 +567,8 @@ init_learning :-
 	   set_problog_flag(alpha,Alpha)
 	  )
 	 )
+	;
+	 true
 	),

 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
--- a/packages/cuda/CC_CSSTree.cu
+++ b/packages/cuda/CC_CSSTree.cu
--- a/packages/cuda/CC_CSSTree.h
+++ b/packages/cuda/CC_CSSTree.h
--- a/packages/cuda/Makefile.in
+++ b/packages/cuda/Makefile.in
--- a/packages/cuda/bpreds.cu
+++ b/packages/cuda/bpreds.cu
@@ -1,3 +1,4 @@
+#include "hip/hip_runtime.h"
 #include <thrust/device_vector.h>
 #include <thrust/scan.h>
 #include <cstdarg>
@@ -25,10 +26,10 @@ int maximo(int count, ...)
 __global__ void bpreds(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int x, rowact, rowact1, op1, op2;
-	if(threadIdx.x < numc)
-		shared[threadIdx.x] = cons[threadIdx.x];
+	if(hipThreadIdx_x < numc)
+		shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{
@@ -110,10 +111,10 @@ __global__ void bpreds(int *dop1, int *dop2, int rows, int of1, int of2, int *co
 __global__ void bpredsnormal2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int x, rowact, op1, op2;
-	if(threadIdx.x < numc)
-		shared[threadIdx.x] = cons[threadIdx.x];
+	if(hipThreadIdx_x < numc)
+		shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{
@@ -159,10 +160,10 @@ __global__ void bpredsnormal2(int *dop1, int rows, int of1, int *cons, int numc,
 __global__ void bpredsnormal(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int x, rowact, op1, op2;
-	if(threadIdx.x < numc)
-		shared[threadIdx.x] = cons[threadIdx.x];
+	if(hipThreadIdx_x < numc)
+		shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{
@@ -226,10 +227,10 @@ __global__ void bpredsnormal(int *dop1, int rows, int of1, int *cons, int numc,
 __global__ void bpredsOR(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int x, rowact, rowact1, op1, op2;
-	if(threadIdx.x < numc)
-		shared[threadIdx.x] = cons[threadIdx.x];
+	if(hipThreadIdx_x < numc)
+		shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{
@@ -344,10 +345,10 @@ __global__ void bpredsOR(int *dop1, int *dop2, int rows, int of1, int of2, int *
 __global__ void bpredsorlogic2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int x, rowact, op1, op2;
-	if(threadIdx.x < numc)
-		shared[threadIdx.x] = cons[threadIdx.x];
+	if(hipThreadIdx_x < numc)
+		shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{
@@ -411,10 +412,10 @@ __global__ void bpredsorlogic2(int *dop1, int rows, int of1, int *cons, int numc
 __global__ void bpredsorlogic(int *dop1, int rows, int of1, int *cons, int numc, int *res)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int x, rowact, op1, op2;
-	if(threadIdx.x < numc)
-		shared[threadIdx.x] = cons[threadIdx.x];
+	if(hipThreadIdx_x < numc)
+		shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{
--- a/packages/cuda/bpreds.h
+++ b/packages/cuda/bpreds.h
@@ -1,3 +1,4 @@
+#include "hip/hip_runtime.h"
 #ifndef _BPREDS_H_
 #define _BPREDS_H_

--- a/packages/cuda/bpredscpu.cpp
+++ b/packages/cuda/bpredscpu.cpp
--- a/packages/cuda/creator2.c
+++ b/packages/cuda/creator2.c
@@ -66,7 +66,7 @@ int main(int argc, char *argv[])
 	fprintf(cuda, "\t\t\t{\n");
 	fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
 	fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
-	fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
-	fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
+	fprintf(cuda, "\t\t\t\thipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);\n");
+	fprintf(cuda, "\t\t\t\thipFree(*ret);\n");
 	fprintf(cuda, "\t\t\t\t*ret = nres;\n");
 	fprintf(cuda, "\t\t\t}\n");
@@ -103,7 +103,7 @@ int main(int argc, char *argv[])
 		fprintf(cuda, "\t\t\t{\n");
 		fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
 		fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
-		fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
-		fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
+		fprintf(cuda, "\t\t\t\thipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);\n");
+		fprintf(cuda, "\t\t\t\thipFree(*ret);\n");
 		fprintf(cuda, "\t\t\t\t*ret = nres;\n");
 		fprintf(cuda, "\t\t\t}\n");
--- a/packages/cuda/cuda.c
+++ b/packages/cuda/cuda.c
--- a/packages/cuda/cuda.yap
+++ b/packages/cuda/cuda.yap
--- a/packages/cuda/dbio.cu
+++ b/packages/cuda/dbio.cu
@@ -27,8 +27,8 @@ void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode
 				res_rows = unir(dop1, res_rows, cols1, &dop1, 0);
 			tipo = res_rows * cols1 * sizeof(int);
 			hres = (int *)malloc(tipo);
-			cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
-			cudaFree(dop1);
+			hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
+			hipFree(dop1);
 			*result = hres;
 		}
 		else
@@ -39,13 +39,13 @@ void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode
 				int *dop2;
 				tipo = res_rows * cols1 * sizeof(int);
 				reservar(&dop2, tipo); 
-				cudaMemcpy(dop2, dop1, tipo, cudaMemcpyHostToDevice);
+				hipMemcpy(dop2, dop1, tipo, hipMemcpyHostToDevice);
 				free(dop1);
 				res_rows = unir(dop2, res_rows, cols1, &dop2, 0);
 				tipo = res_rows * cols1 * sizeof(int);
 				hres = (int *)malloc(tipo);
-				cudaMemcpy(hres, dop2, tipo, cudaMemcpyDeviceToHost);
-				cudaFree(dop2);
+				hipMemcpy(hres, dop2, tipo, hipMemcpyDeviceToHost);
+				hipFree(dop2);
 				*result = hres;
 			}
 			else
@@ -315,8 +315,8 @@ void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str,

 				tipo = res_rows * cols1 * sizeof(int);
 				hres = (int *)malloc(tipo);
-				cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
-				cudaFree(dop1);
+				hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
+				hipFree(dop1);
 				w = z + 1;

 				strtok(qposr->rulename, "_");
@@ -353,8 +353,8 @@ void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str,
 			res_rows = abs(res_rows);
 			tipo = res_rows * cols1 * sizeof(int);
 			hres = (int *)malloc(tipo);
-			cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
-			cudaFree(dop1);
+			hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
+			hipFree(dop1);

 			char file[] = "/dev/shm/buffer.csv";
 			FILE *fp;
@@ -554,7 +554,7 @@ void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator f
 			sign = tmpfact.predname;
 			tipo = res_rows * cols1 * sizeof(int);
 			hres = (int *)malloc(tipo);
-			cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
+			hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
 			if(sign[0] == 'f' && sign[1] >= '0' && sign[1] <= '9')
 				sumar(tmpfact.name, dop1, cols1, res_rows);
 		}
--- a/packages/cuda/joincpu.cpp
+++ b/packages/cuda/joincpu.cpp
@@ -324,11 +324,11 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
 	}

 	#ifdef TIMER
-	cudaEvent_t start, stop;
+	hipEvent_t start, stop;
 	float time;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
+	hipEventCreate(&start);
+	hipEventCreate(&stop);
+	hipEventRecord(start, 0);
 	#endif

 	if(nsel1 > 0 || nsj1 > 0)
@@ -359,16 +359,16 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
 	}

 	#ifdef TIMER
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	cudaEventElapsedTime(&time, start, stop);
+	hipEventRecord(stop, 0);
+	hipEventSynchronize(stop);
+	hipEventElapsedTime(&time, start, stop);
 	cuda_stats.select1_time += time;

-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
+	hipEventDestroy(start);
+	hipEventDestroy(stop);
+	hipEventCreate(&start);
+	hipEventCreate(&stop);
+	hipEventRecord(start, 0);
 	#endif

 	if(nsel2 > 0 || nsj2 > 0)
@@ -381,16 +381,16 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
 		Snl = sLen;

 	#ifdef TIMER
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	cudaEventElapsedTime(&time, start, stop);
+	hipEventRecord(stop, 0);
+	hipEventSynchronize(stop);
+	hipEventElapsedTime(&time, start, stop);
 	cuda_stats.select2_time += time;

-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
+	hipEventDestroy(start);
+	hipEventDestroy(stop);
+	hipEventCreate(&start);
+	hipEventCreate(&stop);
+	hipEventRecord(start, 0);
 	#endif

 	//cout << "antes" << endl;
@@ -406,16 +406,16 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
 	thrust::stable_sort_by_key(thrust::omp::par, Rres, Rres + Rnl, permutation);

 	#ifdef TIMER
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	cudaEventElapsedTime(&time, start, stop);
+	hipEventRecord(stop, 0);
+	hipEventSynchronize(stop);
+	hipEventElapsedTime(&time, start, stop);
 	cuda_stats.sort_time += time;
 	
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
+	hipEventDestroy(start);
+	hipEventDestroy(stop);
+	hipEventCreate(&start);
+	hipEventCreate(&stop);
+	hipEventRecord(start, 0);
 	#endif

 	/*cout << "despues" << endl;
@@ -482,9 +482,9 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
 	*ret = fres;

 	#ifdef TIMER
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	cudaEventElapsedTime(&time, start, stop);
+	hipEventRecord(stop, 0);
+	hipEventSynchronize(stop);
+	hipEventElapsedTime(&time, start, stop);
 	cuda_stats.join_time += time;
 	#endif

--- a/packages/cuda/lista.cu
+++ b/packages/cuda/lista.cu
@@ -967,7 +967,7 @@ vector<gpunode> L;
 extern "C"
 int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr, int *inpquery, int **result, char *names, int finalDR)
 {
-	cudaSetDevice(0);
+	hipSetDevice(0);
 	vector<rulenode> rules;
 	int x;

@@ -1029,11 +1029,11 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 	vector<rulenode>::iterator qposr;

 #if TIMER
-	cudaEvent_t start, stop;
+	hipEvent_t start, stop;
 	float time;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
+	hipEventCreate(&start);
+	hipEventCreate(&stop);
+	hipEventRecord(start, 0);
 #endif

 	while(reglas.size()) /*Here's the main loop*/
@@ -1084,7 +1084,7 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 				{
 					num_refs = rows1 * cols1 * sizeof(int);
 					reservar(&res, num_refs);
-					cudaMemcpyAsync(res, dop1, num_refs, cudaMemcpyDeviceToDevice);
+					hipMemcpyAsync(res, dop1, num_refs, hipMemcpyDeviceToDevice);
 					registrar(rul_act->name, cols1, res, rows1, itr, 1);
 					genflag = 1;
 					rul_act->gen_ant = rul_act->gen_act;
@@ -1251,10 +1251,10 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 			if(x == num_refs)
 			{
 				#ifdef TIMER
-				cudaEvent_t start2, stop2;
-				cudaEventCreate(&start2);
-				cudaEventCreate(&stop2);
-				cudaEventRecord(start2, 0);
+				hipEvent_t start2, stop2;
+				hipEventCreate(&start2);
+				hipEventCreate(&stop2);
+				hipEventRecord(start2, 0);
 				#endif

 				//cout << rul_act->name << " res_rows = " << res_rows << endl;
@@ -1263,11 +1263,11 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 					res_rows = unir(res, res_rows, rul_act->num_columns, &res, 0);

 				#ifdef TIMER
-				cudaEventRecord(stop2, 0);
-				cudaEventSynchronize(stop2);
-				cudaEventElapsedTime(&time, start2, stop2);
-				cudaEventDestroy(start2);
-				cudaEventDestroy(stop2);
+				hipEventRecord(stop2, 0);
+				hipEventSynchronize(stop2);
+				hipEventElapsedTime(&time, start2, stop2);
+				hipEventDestroy(start2);
+				hipEventDestroy(stop2);
 				//cout << "Union = " << time << endl;
 				cuda_stats.union_time += time;
 				#endif					
@@ -1319,16 +1319,16 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 	#endif

 #if TIMER
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	cudaEventElapsedTime(&time, start, stop);
+	hipEventRecord(stop, 0);
+	hipEventSynchronize(stop);
+	hipEventElapsedTime(&time, start, stop);
 	cuda_stats.total_time += time;
 	if (time > cuda_stats.max_time) 
 	  cuda_stats.max_time = time;
 	if (time < cuda_stats.min_time || cuda_stats.calls == 1) 
 	  cuda_stats.min_time = time;
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
+	hipEventDestroy(start);
+	hipEventDestroy(stop);
 	Cuda_Statistics();
 #endif

--- a/packages/cuda/lista.h
+++ b/packages/cuda/lista.h
--- a/packages/cuda/memory.cu
+++ b/packages/cuda/memory.cu
@@ -144,7 +144,7 @@ void limpiar(const char s[], size_t sz)

 	if(GPUmem.size() == 0)
 	{
-		cudaMemGetInfo(&free,&total);
+		hipMemGetInfo(&free,&total);
 		cerr << s << ": not enough GPU memory: have " << free << " of " << total << ", need " << sz << " bytes." << endl;
 		exit(1);
 	}		
@@ -154,11 +154,11 @@ void limpiar(const char s[], size_t sz)
 	{	
 		temp = *ini;
 		temp.dev_address = (int *)malloc(ini->size);
-		cudaMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
+		hipMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, hipMemcpyDeviceToHost);
 		list<memnode>::iterator pos = lower_bound(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
 		CPUmem.insert(pos, temp);
 	}
-	cudaFree(ini->dev_address);
+	hipFree(ini->dev_address);
 	GPUmem.erase(ini);
 }

@@ -173,19 +173,19 @@ void reservar(int **ptr, size_t size)
                return;
        }

-	cudaMemGetInfo(&free, &total);
+	hipMemGetInfo(&free, &total);
 	while(free < size)
 	{
 		cout << "Se limpio memoria " << free << " " << total << endl;
 		limpiar("not enough memory", size);
-		cudaMemGetInfo(&free, &total);
+		hipMemGetInfo(&free, &total);
 	}

-	while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
+	while(hipMalloc(ptr, size) == hipErrorMemoryAllocation)
 		limpiar("Error in memory allocation", size);
 	if (! *ptr ) {
 	  size_t free, total;
-	  cudaMemGetInfo(      &free, &total	 );
+	  hipMemGetInfo(      &free, &total	 );
 	  cerr << "Could not allocate " << size << " bytes, only " << free << " avaliable from total of " << total << " !!!" << endl;
 	  cerr << "Exiting CUDA...." << endl;
 	  exit(1);
@@ -277,7 +277,7 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
 		}
 		size = num_rows * num_columns * sizeof(int);
 		reservar(&temp, size);
-		cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
+		hipMemcpyAsync(temp, address_host_table, size, hipMemcpyHostToDevice);
 		registrar(name, num_columns, temp, num_rows, itr, 0);
 		*ptr = temp;
 		return num_rows;
@@ -296,13 +296,13 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
 		reservar(&temp, size);
 		for(x = 0; x < numgpu; x++)
 		{
-			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
+			hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyDeviceToDevice);
 			inc += temp_storage[x].size / sizeof(int);
-			cudaFree(temp_storage[x].dev_address);
+			hipFree(temp_storage[x].dev_address);
 		}
 		for(; x < numcpu; x++)
 		{
-			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
+			hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyHostToDevice);
 			inc += temp_storage[x].size / sizeof(int);
 			free(temp_storage[x].dev_address);
 		}
@@ -340,9 +340,9 @@ int cargarcpu(int name, int num_rows, int num_columns, int is_fact, int *address
 		temp = (int *)malloc(size);
 		for(x = 0; x < numgpu; x++)
 		{
-			cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToHost);
+			hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyDeviceToHost);
 			inc += temp_storage[x].size / sizeof(int);
-			cudaFree(temp_storage[x].dev_address);
+			hipFree(temp_storage[x].dev_address);
 		}
 		for(; x < numcpu; x++)
 		{
@@ -404,7 +404,7 @@ int cargafinal(int name, int cols, int **ptr)
 		cont = pos->rows;
 		#ifdef TUFFY
 		reservar(&temp, pos->size);
-		cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
+		hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyHostToDevice);
 		*ptr = temp;
 		#else
 		*ptr = pos->dev_address;
@@ -418,14 +418,14 @@ int cargafinal(int name, int cols, int **ptr)
 	pos = gpu;
 	while(pos != endg && pos->name == name)
 	{
-		cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyDeviceToDevice);
+		hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyDeviceToDevice);
 		temp += pos->size / sizeof(int);
 		pos++;
 	}
 	pos = cpu;
 	while(pos != endc && pos->name == name)
 	{
-		cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
+		hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyHostToDevice);
 		temp += pos->size / sizeof(int);
 		pos++;
 	}
@@ -493,7 +493,7 @@ void clear_memory()
 	{
 		if(ini->isrule)
 		{
-			cudaFree(ini->dev_address);
+			hipFree(ini->dev_address);
 			ini = GPUmem.erase(ini);
 		}
 		else
@@ -518,7 +518,7 @@ void clear_memory_all()
 	fin = GPUmem.end();
 	while(ini != fin)
 	{
-		cudaFree(ini->dev_address);
+		hipFree(ini->dev_address);
 		ini++;
 	}
 	GPUmem.clear();
@@ -542,7 +542,7 @@ void liberar(int name)
 	{
 		fact = *i;
 		GPUmem.erase(i);
-		cudaFree(fact.dev_address);
+		hipFree(fact.dev_address);
 	}
 	i = buscarhecho(CPUmem.begin(), CPUmem.end(), name);
 	if(i != CPUmem.end())
@@ -566,10 +566,10 @@ void sumar(int name, int *dop1, int cols, int rows)
 		newrows = rows + fact.rows;
 		reservar(&res, newrows * cols * sizeof(int));
 		offset = fact.rows * cols;
-		cudaMemcpyAsync(res, fact.dev_address, offset * sizeof(int), cudaMemcpyDeviceToDevice);
+		hipMemcpyAsync(res, fact.dev_address, offset * sizeof(int), hipMemcpyDeviceToDevice);
 		GPUmem.erase(i);
 		registrar(name, cols, res, newrows, 0, 0);
-		cudaMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), cudaMemcpyDeviceToDevice);
-		cudaFree(fact.dev_address);
+		hipMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), hipMemcpyDeviceToDevice);
+		hipFree(fact.dev_address);
 	}
 }
--- a/packages/cuda/memory.h
+++ b/packages/cuda/memory.h
--- a/packages/cuda/pred.h
+++ b/packages/cuda/pred.h
--- a/packages/cuda/selectproyect.cu
+++ b/packages/cuda/selectproyect.cu
@@ -1,3 +1,4 @@
+#include "hip/hip_runtime.h"
 #include <thrust/device_vector.h>
 #include <thrust/scan.h>
 #include <stdlib.h>
@@ -8,10 +9,10 @@
 __global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int x, rowact, posact;
-	if(threadIdx.x < numc)
-		shared[threadIdx.x] = cons[threadIdx.x];
+	if(hipThreadIdx_x < numc)
+		shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{
@@ -30,10 +31,10 @@ we unmark any rows that do not comply with the selections*/
 __global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int x, rowact, posact;
-	if(threadIdx.x < numc)
-		shared[threadIdx.x] = cons[threadIdx.x];
+	if(hipThreadIdx_x < numc)
+		shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{
@@ -56,10 +57,10 @@ __global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *
 __global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int temp, temp2, pos, x, y;
-	if(threadIdx.x < cont)
-		shared[threadIdx.x] = dhead[threadIdx.x];
+	if(hipThreadIdx_x < cont)
+		shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{	
@@ -90,10 +91,10 @@ __global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, in
 __global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int temp, temp2, pos, x, y;
-	if(threadIdx.x < cont)
-		shared[threadIdx.x] = dhead[threadIdx.x];
+	if(hipThreadIdx_x < cont)
+		shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{	
@@ -120,10 +121,10 @@ __global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, i
 __global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize, int *res)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int pos, posr, x;
-	if(threadIdx.x < hsize)
-		shared[threadIdx.x] = dhead[threadIdx.x];
+	if(hipThreadIdx_x < hsize)
+		shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{	
@@ -139,10 +140,10 @@ selections, selfjoins, etc.). The array 'temp' holds the result of the prefix su
 __global__ void llenarproyectar(int *dop1, int rows, int cols, int *temp, int *dhead, int hsize, int *res)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int pos, posr, x;
-	if(threadIdx.x < hsize)
-		shared[threadIdx.x] = dhead[threadIdx.x];
+	if(hipThreadIdx_x < hsize)
+		shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows)
 	{		
@@ -184,27 +185,27 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 		tmplen = rows + 1;
 		size2 = tmplen * sizeof(int);
 		reservar(&temp, size2);
-		cudaMemset(temp, 0, size2);
+		hipMemset(temp, 0, size2);
 		size = numselect * sizeof(int);
-		cudaMemcpy(dhead, select, size, cudaMemcpyHostToDevice);
+		hipMemcpy(dhead, select, size, hipMemcpyHostToDevice);

-		marcar2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);
+		hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselect, temp + 1);
 		
 		if(numselfj > 0)
 		{
 			size = numselfj * sizeof(int);
-			cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
-			samejoin<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
+			hipMemcpy(dhead, selfjoin, size, hipMemcpyHostToDevice);
+			hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselfj, temp + 1);
 		}

 		if(numpreds > 0)
 		{
 			size = numpreds * sizeof(int);
-			cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
+			hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);
 			if(ANDlogic)
-				bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
+				hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
 			else
-				bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
+				hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
 		}

 		res = thrust::device_pointer_cast(temp);
@@ -215,10 +216,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int

 		size = head_size * sizeof(int);
 		reservar(&fres, num * size);
-		cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
-		llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
-		cudaFree(dhead);
-		cudaFree(temp);
+		hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
+		hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
+		hipFree(dhead);
+		hipFree(temp);
 		*ret = fres;
 		return num;
 	}
@@ -229,19 +230,19 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 			tmplen = rows + 1;
 			size2 = tmplen * sizeof(int);
 			reservar(&temp, size2);
-			cudaMemset(temp, 0, size2);
+			hipMemset(temp, 0, size2);
 			size = numselfj * sizeof(int);
-			cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
-			samejoin2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
+			hipMemcpy(dhead, selfjoin, size, hipMemcpyHostToDevice);
+			hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselfj, temp + 1);

 			if(numpreds > 0)
 			{
 				size = numpreds * sizeof(int);
-				cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
+				hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);
 				if(ANDlogic)
-					bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
+					hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
 				else
-					bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
+					hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);

 			}

@@ -253,10 +254,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int

 			size = head_size * sizeof(int);
 			reservar(&fres, num * size);
-			cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
-			llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
-			cudaFree(dhead);
-			cudaFree(temp);
+			hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
+			hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
+			hipFree(dhead);
+			hipFree(temp);
 			*ret = fres;
 			return num;
 		}
@@ -267,14 +268,14 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 				tmplen = rows + 1;
 				size2 = tmplen * sizeof(int);
 				reservar(&temp, size2);
-				cudaMemset(temp, 0, size2);		
+				hipMemset(temp, 0, size2);		
 				size = numpreds * sizeof(int);
-				cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
+				hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);

 				if(ANDlogic)
-					bpredsnormal2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);					
+					hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);					
 				else
-					bpredsorlogic2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
+					hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
 				res = thrust::device_pointer_cast(temp);
 				thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
 				num = res[rows];
@@ -284,10 +285,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int

 				size = head_size * sizeof(int);
 				reservar(&fres, num * size);
-				cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
-				llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
-				cudaFree(dhead);
-				cudaFree(temp);
+				hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
+				hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
+				hipFree(dhead);
+				hipFree(temp);
 				*ret = fres;
 				return num;
 			}
@@ -295,9 +296,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 			{
 				size = head_size * sizeof(int);
 				reservar(&fres, rows * size);
-				cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
-				proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
-				cudaFree(dhead);
+				hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
+				hipLaunchKernel(HIP_KERNEL_NAME(proyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, head_size, fres);
+				hipFree(dhead);
 				*ret = fres;
 				return rows;
 			}
--- a/packages/cuda/selectproyectcpu.cpp
+++ b/packages/cuda/selectproyectcpu.cpp
--- a/packages/cuda/treeb.cu
+++ b/packages/cuda/treeb.cu
@@ -1,3 +1,4 @@
+#include "hip/hip_runtime.h"
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
@@ -160,11 +161,11 @@ __device__ int firstMatchingKeyInDataNode2(Record records[], IKeyType key)

 __global__ void gCreateIndex(IDataNode data[], IDirectoryNode dir[], int dirSize, int tree_size, int bottom_start, int nNodesPerBlock)
 {
-        int startIdx = blockIdx.x * nNodesPerBlock;
+        int startIdx = hipBlockIdx_x * nNodesPerBlock;
        int endIdx = startIdx + nNodesPerBlock;
        if(endIdx > dirSize)
                endIdx = dirSize;
-        int keyIdx = threadIdx.x;
+        int keyIdx = hipThreadIdx_x;

        // Proceed only when in internal nodes
        for(int nodeIdx = startIdx; nodeIdx < endIdx; nodeIdx++)
@@ -191,11 +192,11 @@ __global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir
 {
 	// Bringing the root node (visited by every tuple) to the faster shared memory
 	__shared__ IKeyType RootNodeKeys[TREE_NODE_SIZE];
-	RootNodeKeys[threadIdx.x] = dir->keys[threadIdx.x];
+	RootNodeKeys[hipThreadIdx_x] = dir->keys[hipThreadIdx_x];

 	__syncthreads();

-	int OverallThreadIdx = blockIdx.x * THRD_PER_BLCK_search + threadIdx.x;
+	int OverallThreadIdx = hipBlockIdx_x * THRD_PER_BLCK_search + hipThreadIdx_x;

 	for(int keyIdx = OverallThreadIdx; keyIdx < nSearchKeys; keyIdx += THRD_PER_GRID_search)
 	{
@@ -219,7 +220,7 @@ __global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir
 /*Counts the number of times a row in 'S' is to be joined to a row in 'R'.*/
 __global__ void gIndexJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[])
 {
-	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+	int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

 	if(s_cur < sLen) 
 	{
@@ -246,11 +247,11 @@ in 'g_locations' those rows that have equal values in the checked columns.*/
 __global__ void gIndexMultiJoinNegative(int *R, int *S, int g_locations[], int rLen, int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
 {
 	extern __shared__ int shared[];
-	int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
+	int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int posr, poss, x;

-	if(threadIdx.x < wj)
-		shared[threadIdx.x] = muljoin[threadIdx.x];
+	if(hipThreadIdx_x < wj)
+		shared[hipThreadIdx_x] = muljoin[hipThreadIdx_x];
 	__syncthreads();

 	if(r_cur < rLen) 
@@ -287,11 +288,11 @@ times a row in 'S' is to be joined to its corresponding row in 'R', storing the
 __global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[], int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
 {
 	extern __shared__ int shared[];
-	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+	int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int posr, poss, x;

-	if(threadIdx.x < wj)
-		shared[threadIdx.x] = muljoin[threadIdx.x];
+	if(hipThreadIdx_x < wj)
+		shared[hipThreadIdx_x] = muljoin[hipThreadIdx_x];
 	__syncthreads();

 	if(s_cur < sLen) 
@@ -330,10 +331,10 @@ __global__ void multiJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums
 {
 	extern __shared__ int shared[];
 	int *extjoins = &shared[lenrul];
-	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+	int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-	if(threadIdx.x < (lenrul + wj))
-		shared[threadIdx.x] = rule[threadIdx.x];
+	if(hipThreadIdx_x < (lenrul + wj))
+		shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
 	__syncthreads();

 	if(s_cur < sLen)
@@ -382,10 +383,10 @@ __global__ void multiJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSum
 {
 	extern __shared__ int shared[];
 	int *extjoins = &shared[cols];
-	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+	int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-	if(threadIdx.x < (cols + wj))
-		shared[threadIdx.x] = rule[threadIdx.x];
+	if(hipThreadIdx_x < (cols + wj))
+		shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
 	__syncthreads();

 	if(s_cur < sLen)
@@ -432,11 +433,11 @@ predicate are projected.*/
 __global__ void gJoinWithWriteNegative(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int halfrul, int *mloc)
 {
 	extern __shared__ int shared[];
-	int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
+	int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int posr;

-	if(threadIdx.x < halfrul)
-		shared[threadIdx.x] = rule[threadIdx.x];
+	if(hipThreadIdx_x < halfrul)
+		shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
 	__syncthreads();

 	if(r_cur < rLen)
@@ -461,11 +462,11 @@ predicate are projected.*/
 __global__ void gJoinWithWriteNegative2(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int cols, int *mloc)
 {
 	extern __shared__ int shared[];
-	int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
+	int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int posr;

-	if(threadIdx.x < cols)
-		shared[threadIdx.x] = rule[threadIdx.x];
+	if(hipThreadIdx_x < cols)
+		shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
 	__syncthreads();

 	if(r_cur < rLen)
@@ -489,10 +490,10 @@ __global__ void gJoinWithWriteNegative2(int g_locations[], int rLen, int g_joinR
 __global__ void gJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc)
 {
 	extern __shared__ int shared[];
-	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+	int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-	if(threadIdx.x < lenrul)
-		shared[threadIdx.x] = rule[threadIdx.x];
+	if(hipThreadIdx_x < lenrul)
+		shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
 	__syncthreads();

 	if(s_cur < sLen)
@@ -525,10 +526,10 @@ projection, which is performed based on the variables in the head of the rule.*/
 __global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc)
 {
 	extern __shared__ int shared[];
-	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
+	int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-	if(threadIdx.x < cols)
-		shared[threadIdx.x] = rule[threadIdx.x];
+	if(hipThreadIdx_x < cols)
+		shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
 	__syncthreads();

 	if(s_cur < sLen)
@@ -563,7 +564,7 @@ __global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[],
 /*Load part of column 'wj' of 'p' in 'R'. Which values are loaded is defined by the prefix sum results in 'pos'.*/
 __global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *ids)
 {
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int cond;
 	if(id < len)
 	{
@@ -579,7 +580,7 @@ __global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *i
 /*Load an entire column from 'p' into 'R'.*/
 __global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
 {
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	if(id < len)
 		R[id] = p[id * of + wj];
 }
@@ -587,10 +588,10 @@ __global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
 __global__ void projectfinal(int *res, int rows, int cols, int *rule, int *out)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

-	if(threadIdx.x < cols)
-		shared[threadIdx.x] = rule[threadIdx.x];
+	if(hipThreadIdx_x < cols)
+		shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
 	__syncthreads();
 	
 	if(id < rows)
@@ -614,26 +615,26 @@ void project(int *res, int resrows, int numcols1, int numcols2, int *proj, int *
 		int *pt = (int *)malloc(sizepro);
 		for(z = 0; z < numcols2; z++)
 			pt[z] = proj[z] - 1;
-		cudaMemcpy(dcons, pt, sizepro, cudaMemcpyHostToDevice); 
-		//cudaDeviceSynchronize(); //Small cudaMemcpys are asynchronous, uncomment this line if the pointer is being liberated before it is copied.
+		hipMemcpy(dcons, pt, sizepro, hipMemcpyHostToDevice); 
+		//hipDeviceSynchronize(); //Small hipMemcpy calls may be asynchronous; uncomment this line if the host pointer is being freed before it has been copied.
 		free(pt);
 	}
 	else
-		cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+		hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
 	reservar(&d_Rout, resrows * sizepro);
-	projectfinal<<<blockllen, numthreads, sizepro>>>(res, resrows, numcols1, dcons, d_Rout);
-	cudaFree(dcons);
-	cudaFree(*ret);
+	hipLaunchKernel(HIP_KERNEL_NAME(projectfinal), dim3(blockllen), dim3(numthreads), sizepro, 0, res, resrows, numcols1, dcons, d_Rout);
+	hipFree(dcons);
+	hipFree(*ret);
 	*ret = d_Rout;
 }

 __global__ void projectadd(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, int *dhead, int hsize, int *res)
 {
 	extern __shared__ int shared[];
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
+	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 	int pos2, posr, x, y, cond;
-	if(threadIdx.x < hsize)
-		shared[threadIdx.x] = dhead[threadIdx.x];
+	if(hipThreadIdx_x < hsize)
+		shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
 	__syncthreads();
 	if(id < rows2)
 	{
@@ -662,10 +663,10 @@ void juntar(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, in
 	int blockllen = rows2 / numthreads + 1;
 	sizepro = pcols * sizeof(int);
 	reservar(&dcons, sizepro);
-	cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+	hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
 	reservar(&d_Rout, rows1 * rows2 * sizepro);
-	projectadd<<<blockllen, numthreads, sizepro>>>(dop1, dop2, rows1, rows2, cols1, cols2, dcons, pcols, d_Rout);
-	cudaFree(dcons);
+	hipLaunchKernel(HIP_KERNEL_NAME(projectadd), dim3(blockllen), dim3(numthreads), sizepro, 0, dop1, dop2, rows1, rows2, cols1, cols2, dcons, pcols, d_Rout);
+	hipFree(dcons);
 	*ret = d_Rout;
 }

@@ -743,51 +744,51 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:

 	#ifdef TIMER
 	//cout << "INICIO" << endl;
-	cudaEvent_t start, stop;
+	hipEvent_t start, stop;
 	float time;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
+	hipEventCreate(&start);
+	hipEventCreate(&stop);
+	hipEventRecord(start, 0);
 	#endif

 	if(npred2.x > 0 || npred2.y > 0 || nsel2 > 0 || nsj2 > 0)
 	{
 		newLen = sLen + 1;
-		cudaMemsetAsync(temp, 0, newLen * sizeof(int));
+		hipMemsetAsync(temp, 0, newLen * sizeof(int));
 	}

 	if(npred2.x > 0 || npred2.y > 0)
 	{
 		size = npred2tot * sizeof(int);
-		cudaMemcpy(dcons, pred2, size, cudaMemcpyHostToDevice);
+		hipMemcpy(dcons, pred2, size, hipMemcpyHostToDevice);

 		if(npred2.y > 0) /*Fix case when a(X,Y),b(Y,Z),Z > Y*/
 		{
 			reservar(&temp2, sizet2);
-			cudaMemsetAsync(temp2, 0, newLen * sizeof(int));
+			hipMemsetAsync(temp2, 0, newLen * sizeof(int));
 			//res = thrust::device_pointer_cast(temp2);
-			bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, temp2 + 1);
+			hipLaunchKernel(HIP_KERNEL_NAME(bpreds), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, temp2 + 1);
 		}
 		else
 		{
 			if(negative)
-				bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
+				hipLaunchKernel(HIP_KERNEL_NAME(bpreds), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
 			else
-				bpredsOR<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
+				hipLaunchKernel(HIP_KERNEL_NAME(bpredsOR), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
 		}

 		if(nsel2 > 0)
 		{
 			size = nsel2 * sizeof(int);
-			cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
-			marcar<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
+			hipMemcpy(dcons, sel2, size, hipMemcpyHostToDevice);
+			hipLaunchKernel(HIP_KERNEL_NAME(marcar), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsel2, temp + 1);
 		}

 		if(nsj2 > 0)
 		{
 			size = nsj2 * sizeof(int);
-			cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
-			samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
+			hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
+			hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);
 		}
 	}
 	else
@@ -795,14 +796,14 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 		if(nsel2 > 0)
 		{
 			size = nsel2 * sizeof(int);
-			cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
-			marcar2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
+			hipMemcpy(dcons, sel2, size, hipMemcpyHostToDevice);
+			hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsel2, temp + 1);

 			if(nsj2 > 0)
 			{
 				size = nsj2 * sizeof(int);
-				cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
-				samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
+				hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
+				hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);
 			}
 		}
 		else
@@ -810,15 +811,15 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 			if(nsj2 > 0)
 			{
 				size = nsj2 * sizeof(int);
-				cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
-				samejoin2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);	
+				hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
+				hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);	
 			}
 			else
 			{
 				sizem32S = m32sLen * sizeof(int);
 				reservar(&d_S, sizem32S);
-				cudaMemsetAsync(d_S + sLen, 0x7f, extraspaceS * sizeof(int));
-				llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]);
+				hipMemsetAsync(d_S + sLen, 0x7f, extraspaceS * sizeof(int));
+				hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p2, d_S, sLen, of2, wherej[1]);
 			}
 		}
 	}
@@ -842,8 +843,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	
 		if(newLen == 0) // && !negative) ARREGLAR
 		{
-			cudaFree(temp);
-			cudaFree(dcons);
+			hipFree(temp);
+			hipFree(dcons);
 			return 0;
 		}

@@ -854,24 +855,24 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:

 		reservar(&d_S, sizem32S);
 		reservar(&posS, sizem32S);
-		cudaMemsetAsync(d_S + newLen, 0x7f, sizextra);
-		cudaMemsetAsync(posS + newLen, 0x7f, sizextra);
-		llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
+		hipMemsetAsync(d_S + newLen, 0x7f, sizextra);
+		hipMemsetAsync(posS + newLen, 0x7f, sizextra);
+		hipLaunchKernel(HIP_KERNEL_NAME(llenar), dim3(blockllen), dim3(numthreads), 0, 0, p2, d_S, sLen, of2, wherej[1], temp, posS);
 		sLen = newLen;
 	}

 	#ifdef TIMER
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	cudaEventElapsedTime(&time, start, stop);
+	hipEventRecord(stop, 0);
+	hipEventSynchronize(stop);
+	hipEventElapsedTime(&time, start, stop);
 	//cout << "Select1 = " << time << endl;
 	cuda_stats.select1_time += time;

-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
+	hipEventDestroy(start);
+	hipEventDestroy(stop);
+	hipEventCreate(&start);
+	hipEventCreate(&stop);
+	hipEventRecord(start, 0);
 	#endif

 	blockllen = rLen / numthreads + 1;
@@ -880,30 +881,30 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	{
 		if(temp2 != NULL)
 		{
-			cudaFree(temp);
+			hipFree(temp);
 			temp = temp2;
 			res = thrust::device_pointer_cast(temp);
 			newLen = rLen + 1;
 			if(nsel1 > 0)
 			{
 				size = nsel1 * sizeof(int);
-				cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
-				marcar<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
+				hipMemcpy(dcons, sel1, size, hipMemcpyHostToDevice);
+				hipLaunchKernel(HIP_KERNEL_NAME(marcar), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsel1, temp + 1);
 			}
 			if(nsj1 > 0)
 			{
 				size = nsj1 * sizeof(int);
-				cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
-				samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
+				hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
+				hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);
 			}
 			if(npred1.x > 0)
 			{
 				size = npred1.x * sizeof(int);
-				cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
+				hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
 				if(ANDlogic)
-					bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
+					hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
 				else
-					bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
+					hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
 			}
 		}
 		else
@@ -911,30 +912,30 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 			if(npred1.x > 0 || nsel1 > 0 || nsj1 > 0)
 			{
 				newLen = rLen + 1;
-				cudaMemsetAsync(temp, 0, newLen * sizeof(int));
+				hipMemsetAsync(temp, 0, newLen * sizeof(int));
 			}

 			if(nsel1 > 0)
 			{
 				size = nsel1 * sizeof(int);
-				cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
-				marcar2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
+				hipMemcpy(dcons, sel1, size, hipMemcpyHostToDevice);
+				hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsel1, temp + 1);

 				if(nsj1 > 0)
 				{
 					size = nsj1 * sizeof(int);
-					cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
-					samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
+					hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
+					hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);
 				}

 				if(npred1.x > 0)
 				{
 					size = npred1.x * sizeof(int);
-					cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
+					hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
 					if(ANDlogic)
-						bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
+						hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
 					else
-						bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);		
+						hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);		
 				}
 			}
 			else
@@ -942,17 +943,17 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 				if(nsj1 > 0)
 				{
 					size = nsj1 * sizeof(int);
-					cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
-					samejoin2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
+					hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
+					hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);

 					if(npred1.x > 0)
 					{
 						size = npred1.x * sizeof(int);
-						cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
+						hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
 						if(ANDlogic)
-							bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
+							hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
 						else
-							bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
+							hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
 					}
 				}
 				else
@@ -960,11 +961,11 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 					if(npred1.x > 0)
 					{
 						size = npred1.x * sizeof(int);
-						cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
+						hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
 						if(ANDlogic)
-							bpredsnormal2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
+							hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
 						else
-							bpredsorlogic2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);	
+							hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);	
 					}
 				}
 			}
@@ -976,11 +977,11 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 			newLen = res[rLen];
 			if(newLen == 0)
 			{
-				cudaFree(temp);
-				cudaFree(dcons);
-				cudaFree(d_S);
+				hipFree(temp);
+				hipFree(dcons);
+				hipFree(d_S);
 				if(posS != NULL)
-					cudaFree(posS);
+					hipFree(posS);
 				return 0;
 			}	

@@ -991,41 +992,41 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:

 			reservar(&d_R, sizem32);
 			reservar(&posR, sizem32);
-			cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
-			cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
-			llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
+			hipMemsetAsync(d_R + newLen, 0x7f, sizextra);
+			hipMemsetAsync(posR + newLen, 0x7f, sizextra);
+			hipLaunchKernel(HIP_KERNEL_NAME(llenar), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0], temp, posR);
 			rLen = newLen;
 		}
 		else
 		{
 			sizem32 = m32rLen * sizeof(int);
 			reservar(&d_R, sizem32);
-			cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
-			llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
+			hipMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
+			hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0]);
 		}
 	}
 	else
 	{
 		sizem32 = m32rLen * sizeof(int);
 		reservar(&d_R, sizem32);
-		cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
-		llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
+		hipMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
+		hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0]);
 	}

 	#ifdef TIMER
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	cudaEventElapsedTime(&time, start, stop);
+	hipEventRecord(stop, 0);
+	hipEventSynchronize(stop);
+	hipEventElapsedTime(&time, start, stop);
 	//cout << "Select2 = " << time << endl;
 	cuda_stats.select2_time += time;
 	#endif

 	#ifdef TIMER
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
+	hipEventDestroy(start);
+	hipEventDestroy(stop);
+	hipEventCreate(&start);
+	hipEventCreate(&stop);
+	hipEventRecord(start, 0);
 	#endif

 	thrust::device_ptr<Record> dvp1;
@@ -1084,17 +1085,17 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	}

 	#ifdef TIMER
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	cudaEventElapsedTime(&time, start, stop);
+	hipEventRecord(stop, 0);
+	hipEventSynchronize(stop);
+	hipEventElapsedTime(&time, start, stop);
 	//cout << "Sort = " << time << endl;
 	cuda_stats.sort_time += time;
 	
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
+	hipEventDestroy(start);
+	hipEventDestroy(stop);
+	hipEventCreate(&start);
+	hipEventCreate(&stop);
+	hipEventRecord(start, 0);
 	#endif

 	IDataNode* d_data;
@@ -1123,7 +1124,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	dim3 Dbc(THRD_PER_BLCK_create, 1, 1);
 	dim3 Dgc(BLCK_PER_GRID_create, 1, 1);

-	gCreateIndex <<<Dgc, Dbc>>> (d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);
+	hipLaunchKernel(HIP_KERNEL_NAME(gCreateIndex), dim3(Dgc), dim3(Dbc), 0, 0, d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);

 	int *d_locations;
 	int memSizeR;
@@ -1132,7 +1133,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	{
 		memSizeR = (rLen + 1) * sizeof(int);
 		reservar(&d_locations, memSizeR);
-		cudaMemsetAsync(d_locations, 0, sizeof(int));
+		hipMemsetAsync(d_locations, 0, sizeof(int));
 		nSearchKeys = rLen;
 	}
 	else
@@ -1146,13 +1147,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	unsigned int nKeysPerThread = uintCeilingDiv(nSearchKeys, THRD_PER_GRID_search);
 	if(negative)
 	{
-		gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_R, d_locations + 1, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
-		cudaMemsetAsync(temp, 0, memSizeR);
+		hipLaunchKernel(HIP_KERNEL_NAME(gSearchTree), dim3(Dgs), dim3(Dbs), 0, 0, d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_R, d_locations + 1, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
+		hipMemsetAsync(temp, 0, memSizeR);
 	}
 	else
 	{
-		gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
-		cudaMemsetAsync(temp, 0, memSizeS);
+		hipLaunchKernel(HIP_KERNEL_NAME(gSearchTree), dim3(Dgs), dim3(Dbs), 0, 0, d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
+		hipMemsetAsync(temp, 0, memSizeS);
 	}

 	int muljoin = 0, muljoinsize = 0, sum;
@@ -1165,8 +1166,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 		{
 			muljoin = numj - 2;
 			muljoinsize = muljoin * sizeof(int);
-			cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
-			gIndexMultiJoinNegative<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations + 1, rLen, p1, p2, of1, of2, posR, posS, dcons, muljoin);
+			hipMemcpy(dcons, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
+			hipLaunchKernel(HIP_KERNEL_NAME(gIndexMultiJoinNegative), dim3(blockllen), dim3(numthreads), muljoinsize, 0, d_R, d_S, d_locations + 1, rLen, p1, p2, of1, of2, posR, posS, dcons, muljoin);
 		}

 		res = thrust::device_pointer_cast(d_locations);	
@@ -1177,21 +1178,21 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 		if(pos == (rule->num_rows - 3))
 		{
 			sizepro = rule->num_columns * sizeof(int);
-			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+			hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
 			resSize = sum * sizepro;
 			reservar(&d_Rout, resSize);
-			gJoinWithWriteNegative2<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, rule->num_columns, posR);
+			hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWriteNegative2), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, rLen, d_Rout, p1, of1, dcons, rule->num_columns, posR);
 		}
 		else
 		{	
 			sizepro = projp.x * sizeof(int);
-			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+			hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
 			resSize = sum * sizepro;
 			reservar(&d_Rout, resSize);
-			gJoinWithWriteNegative<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, projp.x, posR);
+			hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWriteNegative), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, rLen, d_Rout, p1, of1, dcons, projp.x, posR);
 		}
-		cudaFree(d_R);
-		cudaFree(d_S);
+		hipFree(d_R);
+		hipFree(d_S);
 	}
 	else
 	{
@@ -1200,26 +1201,26 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 		{
 			muljoin = numj - 2;
 			muljoinsize = muljoin * sizeof(int);
-			cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
-			gIndexMultiJoin<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
+			hipMemcpy(dcons, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
+			hipLaunchKernel(HIP_KERNEL_NAME(gIndexMultiJoin), dim3(blockllen), dim3(numthreads), muljoinsize, 0, d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
 		}
 		else
-			gIndexJoin<<<blockllen, numthreads>>> (d_R, d_S, d_locations, sLen, temp);
-		cudaFree(d_R);
-		cudaFree(d_S);
+			hipLaunchKernel(HIP_KERNEL_NAME(gIndexJoin), dim3(blockllen), dim3(numthreads), 0, 0, d_R, d_S, d_locations, sLen, temp);
+		hipFree(d_R);
+		hipFree(d_S);

 		sum = res[sLen-1];
 		thrust::exclusive_scan(res, res + sLen, res);
 		sum += res[sLen-1];
 		if(sum == 0)
 		{
-			cudaFree(dcons);
-			cudaFree(d_locations);
-			cudaFree(temp);
+			hipFree(dcons);
+			hipFree(d_locations);
+			hipFree(temp);
 			if(posS != NULL)
-				cudaFree(posS);
+				hipFree(posS);
 			if(posR != NULL)
-				cudaFree(posR);
+				hipFree(posR);
 			return 0;
 		}
 		res[sLen] = sum;
@@ -1227,49 +1228,49 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 		if(pos == (rule->num_rows - 3))
 		{
 			sizepro = rule->num_columns * sizeof(int);
-			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+			hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
 			resSize = sum * sizepro;
 			reservar(&d_Rout, resSize);
 			if(numj > 2)
 			{
-				cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
-				multiJoinWithWrite2<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
+				hipMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
+				hipLaunchKernel(HIP_KERNEL_NAME(multiJoinWithWrite2), dim3(blockllen), dim3(numthreads), sizepro + muljoinsize, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
 			}
 			else
-				gJoinWithWrite2<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
+				hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWrite2), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
 		}
 		else
 		{
 			sizepro = projp.y * sizeof(int);
-			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
+			hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
 			resSize = sum * sizepro;
 			reservar(&d_Rout, resSize);
 			if(numj > 2)
 			{
-				cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
-				multiJoinWithWrite<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
+				hipMemcpy(dcons + projp.y, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
+				hipLaunchKernel(HIP_KERNEL_NAME(multiJoinWithWrite), dim3(blockllen), dim3(numthreads), sizepro + muljoinsize, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
 			}
 			else
-				gJoinWithWrite<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
+				hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWrite), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
 		}
 	}

-	cudaFree(dcons);
-	cudaFree(d_locations);
-	cudaFree(temp);
+	hipFree(dcons);
+	hipFree(d_locations);
+	hipFree(temp);
 	if(posS != NULL)
-		cudaFree(posS);
+		hipFree(posS);
 	if(posR != NULL)
-		cudaFree(posR);
+		hipFree(posR);
 	
 	if(*ret != NULL)
-		cudaFree(*ret);
+		hipFree(*ret);
 	*ret = d_Rout;

 	#ifdef TIMER
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	cudaEventElapsedTime(&time, start, stop);
+	hipEventRecord(stop, 0);
+	hipEventSynchronize(stop);
+	hipEventElapsedTime(&time, start, stop);
 	//cout << "Join = " << time << endl;
 	//cout << "FIN" << endl;
 	cuda_stats.join_time += time;
--- a/packages/cuda/union2.cu
+++ b/packages/cuda/union2.cu
@@ -87,8 +87,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -122,8 +122,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -157,8 +157,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -192,8 +192,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -227,8 +227,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -262,8 +262,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -297,8 +297,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -332,8 +332,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -367,8 +367,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -402,8 +402,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -437,8 +437,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -472,8 +472,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -507,8 +507,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -542,8 +542,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -577,8 +577,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -612,8 +612,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -647,8 +647,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -682,8 +682,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -717,8 +717,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
@@ -752,8 +752,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
 			{
 				size = nrows * tipo * sizeof(int);
 				reservar(&nres, size);
-				cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
-				cudaFree(*ret);
+				hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
+				hipFree(*ret);
 				*ret = nres;
 			}
 			return nrows;
--- a/packages/cuda/union2.h
+++ b/packages/cuda/union2.h
--- a/packages/cuda/unioncpu2.cpp
+++ b/packages/cuda/unioncpu2.cpp
--- a/packages/myddas/example
+++ b/packages/myddas/example
@@ -2,19 +2,21 @@
 sqlite3 side:
 create table test (id integer, x integer, y integer);
 .separator ","
-.import /home/vsc/Yap/ILP/HH/DaysInHospital_Y3.csv test
+.import DaysInHospital_Y3.csv   hh
+.export hh
+.save hh

 myddas side:
 use_module(library(myddas)).
-db_open(sqlite3,con,'../hh',x,x).
+db_open(sqlite3,con,'hh',x,x).
 db_close(con).

 test 2:

 use_module(library(myddas)).
 assert((
-t2 :- 
-db_open(sqlite3,con,'../hh',x,x),
+t2 :-
+db_open(sqlite3,con,'hh',x,x),
 db_import(con,test,test),
 test(A,B,C),
 writeln(test(A,B,C)),
@@ -22,4 +24,3 @@ fail
 )).
 trace.
 t2.
-
--- a/packages/myddas/pl/myddas.ypp
+++ b/packages/myddas/pl/myddas.ypp
@@ -15,7 +15,12 @@
  *									 *
  *************************************************************************/

-#if USE_MYDDAS
+#if 1 //USE_MYDDAS
+
+:- load_foreign_files([myddas], [], init_myddas).
+
+/* Initialize MYDDAS GLOBAL STRUCTURES */
+:- c_db_initialize_myddas.

 #ifdef DEBUG
 :- yap_flag(single_var_warnings,on).
@@ -800,8 +805,6 @@
 				       ]).

 #ifdef MYDDAS_MYSQL
-:- load_foreign_files([], [], init_mysql).
-
 :- use_module(myddas_mysql,[
 			    db_my_result_set/1,
 			    db_datalog_describe/1,
@@ -867,7 +870,7 @@
 				% db_open/4
 				%

-#if MYDDAS_DECLARATIONS
+#if 1 // MYDDAS_DECLARATIONS
 :- db_open(Protocol) extra_arguments
 db=Db,
 port=Port,
--- a/packages/myddas/sqlite3/CMakeLists.txt
+++ b/packages/myddas/sqlite3/CMakeLists.txt
@@ -23,12 +23,11 @@ Android/jni/sqlite/nativehelper/jni.h
 #sqlite3 is now in the system

 set (SQLITE3_FOUND ON PARENT_SCOPE)
-  macro_log_feature (SQLITE3_FOUND "Sqlite3"
-      "Sqlite3 Data-Base "
-  "http://www.sqlite3ql.org" FALSE)
+  message (
+      "   * Sqlite3 Data-Base (http://www.sqlite3ql.org) is distributed with
+      MYDDAS" )

-
-  include_directories (${SQLITE3_INCLUDE_DIRECTORIES}  .. . Android/jni/sqlite Android/jni/sqlite/nativehelper)
+  include_directories (  .. . Android/jni/sqlite Android/jni/sqlite/nativehelper)

  set (MYDDAS_FLAGS ${MYDDAS_FLAGS} -DMYDDAS_SQLITE3=1 PARENT_SCOPE)
  set_property( DIRECTORY .. APPEND PROPERTY COMPILE_DEFINITIONS  MYDDAS_SQLITE3=1 )
--- a/packages/python/setup.py.cmake
+++ b/packages/python/setup.py.cmake
@@ -0,0 +1,10 @@
+from setuptools import setup, Extension
+
+
+setup(
+    name = "yapex",
+    version = "0.1",
+package_dir = {'': '${CMAKE_SOURCE_DIR}/packages/python'  },
+py_modules = ['yapex']
+
+)
--- a/packages/python/yap_kernel/prolog.js
+++ b/packages/python/yap_kernel/prolog.js
--- a/packages/yap-lbfgs/yap_lbfgs.c
+++ b/packages/yap-lbfgs/yap_lbfgs.c
@@ -78,14 +78,11 @@ static lbfgsfloatval_t evaluate(

  a1 = YAP_ArgOfTerm(1,call);
  if (YAP_IsFloatTerm(a1)) {
-      YAP_ShutdownGoal( TRUE );
      return (lbfgsfloatval_t) YAP_FloatOfTerm(a1);
  } else if (YAP_IsIntTerm(a1)) {
-    YAP_ShutdownGoal( TRUE );
    return (lbfgsfloatval_t) YAP_IntOfTerm(a1);
  }

-  YAP_ShutdownGoal( TRUE );
  fprintf(stderr, "ERROR: The evaluate call back function did not return a number as first argument.\n");
  return 0;
 }