CUDA: the missing files

2016-04-22 18:21:05 +01:00 · 2016-04-22 18:21:05 +01:00 · ea90785c6a
commit ea90785c6a
parent f966a5b912
11 changed files with 3071 additions and 0 deletions
--- a/packages/cuda/CC_CSSTree.cu
+++ b/packages/cuda/CC_CSSTree.cu
@ -0,0 +1,30 @@
+#include "CC_CSSTree.h"
+
+//return the start position of searching the key.
+int CC_CSSTree::search(int key)
+{
+	int i=0;
+	int curIndex=0;
+	int curNode=0;
+	int j=0;
+	//search
+	for(i=0;i<level;i++)
+	{
+		for(j=0;j<blockSize;j++)
+		{
+			if(ntree[curIndex+j]==-1)
+				break;
+			if(key<=ntree[curIndex+j])
+				break;
+		}
+		curNode=(fanout*(curNode)+j+1);
+		curIndex=curNode*blockSize;
+//#ifdef DEBUG
+//		cout<<curNode<<", "<<j<<", "<<ntree[curIndex]<<";   ";
+//#endif
+	}
+	curIndex=(curNode-numNode)*blockSize;
+	if(curIndex>numRecord) curIndex=numRecord-1;
+	//cout<<"I: "<<curIndex<<", ";//cout<<endl;
+	return curIndex;
+}
--- a/packages/cuda/CC_CSSTree.h
+++ b/packages/cuda/CC_CSSTree.h
@ -0,0 +1,146 @@
+#ifndef CSSTREE_H
+#define CSSTREE_H
+
+#include <iostream>
+#include <math.h>
+using namespace std;
+
+#define divRoundUp(n,s)    (((n) / (s)) + ((((n) % (s)) > 0) ? 1 : 0))
+#define CSS_TREE_FANOUT 33
+//we use implicit pointer to perform the addressing.
+
+typedef int Record;
+
+class CC_GenericTree
+{
+public:
+	int numRecord;
+	Record *data;
+	//we use the BFS layout as the default layout.
+	int numNode;
+	int level;
+	int gResult;
+	CC_GenericTree(){}
+	//we assume that numR=2^i. Otherwise, we pad the array with -1 from the beginning.
+	//we also assume that the record are sorted by the key.
+	CC_GenericTree(Record *d, int numR)
+	{
+		data=d;
+		numRecord=numR;	
+	}
+	virtual ~CC_GenericTree()
+	{
+	}
+	virtual int search(int key)=0;
+
+};
+
+class CC_CSSTree:public CC_GenericTree
+{
+public:
+	int *ntree;
+	int fanout;
+	int blockSize;
+	int *vStart;
+	int *vG;//vG[0] is used in computing the position for level 1.
+	int numKey;
+	CC_CSSTree(Record *d, int numR, int f):CC_GenericTree(d,numR)
+	{
+		fanout=f;
+		blockSize=fanout-1;
+		int numLeaf=divRoundUp(numR,blockSize);		
+		level=1;
+		int temp=numLeaf;
+		while(temp>1)
+		{
+			temp=divRoundUp(temp, fanout);
+			level++;
+		}
+		numNode=(int)((pow((double)fanout,(double)level)-1)/(fanout-1));
+		numKey=numNode*blockSize;
+		ntree=new int[numKey];
+		vStart=new int[level];
+		vG=new int[level];
+#ifdef DEBUG
+		cout<<numLeaf<<","<<level<<", "<<numNode<<endl;
+#endif
+		//layout the tree from bottom up.
+		int i=0,j=0,k=0;
+		int startNode=0;
+		int endNode=0;
+		int startKey, endKey;
+		int curIndex;
+		for(i=0;i<numNode;i++)
+			ntree[i]=-1;
+		//for <level-1>, i.e., the leaf level. [start,end]
+		for(i=0;i<level;i++)//level
+		{
+			startNode=(int)((pow((double)fanout,(double)i)-1)/(fanout-1));
+			endNode=(int)((pow((double)fanout,(double)(i+1))-1)/(fanout-1));
+			for(j= startNode;j< endNode;j++)//which node
+			{
+				startKey=j*blockSize;
+				endKey=startKey+blockSize;
+				for(k=startKey;k<endKey;k++)
+				{
+					curIndex=(int)(blockSize*pow((double)fanout,(double)(level-i-1))*(k+1-startNode*blockSize+(j-startNode))-1);
+					if(curIndex<numRecord+blockSize)
+					{
+						if(curIndex>=numRecord)
+							curIndex=numRecord-1;
+						ntree[k]=data[curIndex];
+					}
+					else
+						break;
+				}
+			}
+		}
+	}
+	~CC_CSSTree()
+	{
+		delete [] ntree;
+		delete [] vStart;
+		delete [] vG;
+	}
+	virtual int search(int key);
+	void print()
+	{
+		int i=0, j=0;
+		int k=0;
+		int startNode=0;
+		int endNode=0;
+		int startKey, endKey;
+		for(i=0;i<level;i++)//level
+		{
+			cout<<"Level, "<<i<<endl;
+			startNode=(int)((pow((double)fanout,(double)i)-1)/(fanout-1));
+			endNode=(int)((pow((double)fanout,(double)(i+1))-1)/(fanout-1));
+			for(j= startNode;j< endNode;j++)//which node
+			{
+				cout<<"Level, "<<i<<", Node, "<<j<<": ";
+				startKey=j*blockSize;
+				endKey=startKey+blockSize;
+				for(k=startKey;k<endKey;k++)
+				{
+					cout<<ntree[k]<<", ";	
+				}
+				cout<<endl;
+			}
+		}
+		for(i=0;i<numRecord;i++)
+		{
+			cout<<data[i]<<", ";
+			if(i%(fanout-1)==(fanout-2))
+			cout<<"*"<<endl;
+		}
+
+		
+
+	}
+
+
+};
+
+
+#endif
+
--- a/packages/cuda/bpreds.h
+++ b/packages/cuda/bpreds.h
@ -0,0 +1,12 @@
+#ifndef _BPREDS_H_
+#define _BPREDS_H_
+
+int maximo(int, ...);
+__global__ void bpreds(int*, int*, int, int, int, int*, int, int, int*, int*);
+__global__ void bpredsnormal(int*, int, int, int*, int, int*);
+__global__ void bpredsnormal2(int*, int, int, int*, int, int*);
+__global__ void bpredsOR(int*, int*, int, int, int, int*, int, int, int*, int*);
+__global__ void bpredsorlogic(int*, int, int, int*, int, int*);
+__global__ void bpredsorlogic2(int*, int, int, int*, int, int*);
+
+#endif
--- a/packages/cuda/bpredscpu.cpp
+++ b/packages/cuda/bpredscpu.cpp
@ -0,0 +1,88 @@
+#include "pred.h"
+
+int bpredscpu(int *dop1, int rows, int *bin, int3 numpreds, int **ret)
+{
+	int i, x, y, op1, op2;
+	int size = 0, rowact, flag = 0;
+	int predn = numpreds.x * 3;
+	int total = predn + numpreds.z;
+	int *fres, *ptr;
+	int div, fin, ini[NUM_T + 1];
+	vector<int> vec[NUM_T];
+
+	for(x = 0; x < NUM_T; x++)
+		vec[x].reserve(INISIZE);
+
+	//omp_set_num_threads(NUM_T);
+	div = rows / NUM_T;
+	ini[0] = 0;
+	for(x = 1; x < NUM_T; x++)
+		ini[x] = div * x;
+	ini[NUM_T] = rows;
+
+	#pragma omp parallel for private(x,rowact,y,fin,op1,op2) firstprivate(flag,total)
+	for(i = 0; i < NUM_T; i++)
+	{
+		fin = ini[i+1];
+		for(x = ini[i]; x < fin; x++)
+		{
+			rowact = x * numpreds.y;
+			for(y = 0; y < predn; y += 3)
+			{
+				op1 = bin[y+1];
+				if(op1 < 0)
+					op1 *= -1;
+				else
+					op1 = dop1[rowact + op1];
+				op2 = bin[y+2];
+				if(op2 < 0)
+					op2 *= -1;
+				else
+					op2 = dop1[rowact + op2];
+				switch(bin[y])
+				{
+					case SBG_EQ: if(op1 != op2)
+							flag = 1;
+							break;
+					case SBG_GT: if(op1 <= op2)
+							flag = 1;
+							break;
+					case SBG_LT: if(op1 >= op2)
+							flag = 1;
+							break;
+					case SBG_GE: if(op1 < op2)
+							flag = 1;
+							break;
+					case SBG_LE: if(op1 > op2)
+							flag = 1;
+							break;
+					case SBG_DF: if(op1 == op2)
+							flag = 1;
+				}
+				if(flag)
+					break;
+			}
+			if(flag != 1)
+			{
+				for(y = predn; y < total; y++)
+					vec[i].push_back(dop1[rowact+bin[y]]);
+			}
+			else
+				flag = 0;
+		}
+	}
+	for(x = 0; x < NUM_T; x++)
+	{
+		ini[x] = vec[x].size();
+		size += ini[x];
+	}
+	fres = (int *)malloc(size * sizeof(int));
+	ptr = fres;
+	for(x = 0; x < NUM_T; x++)
+	{
+		memcpy(ptr, vec[x].data(), ini[x] * sizeof(int));
+		ptr += ini[x];
+	}
+	*ret = fres;
+	return size / numpreds.z;
+}
--- a/packages/cuda/creator2.c
+++ b/packages/cuda/creator2.c
@ -0,0 +1,197 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+/*Program used to generate union2.cu and union2.h. A new pointer and all its operations are generated for each set (pairs, triplets, etc.).
+ Arguments are the cardinality of the biggest set and the name of the cuda file. For example, executing "creator2 20 union2" will generate 
+ all pointers and operations for all sets from 1 to 20 in the files union2.cu and union2.h.*/
+
+int main(int argc, char *argv[])
+{
+	int num = atoi(argv[1]);
+	int x;
+	char *str = (char *)malloc((strlen(argv[2]) + 4) * sizeof(char));
+	sprintf(str, "%s.cu", argv[2]);
+	FILE *cuda = fopen(str, "w");
+
+	fprintf(cuda, "/*Computer generated file to remove duplicates. Since Thrust's unique and sort, unlike their std's counterparts, don't have a way to specify the size of each element in\n");
+	fprintf(cuda, "the array, comparing pairs, triplets and other sets is not possible without defining a new pointer and all related operations for each set. If you have a better idea to do\n");
+	fprintf(cuda, "this, please don't hesitate to email us.*/\n\n");
+	fprintf(cuda, "#include <thrust/device_vector.h>\n");
+	fprintf(cuda, "#include <thrust/unique.h>\n");
+	fprintf(cuda, "#include <thrust/distance.h>\n");
+	fprintf(cuda, "#include <thrust/sort.h>\n");
+	fprintf(cuda, "#include <iostream>\n");
+	fprintf(cuda, "#include \"memory.h\"\n");
+	fprintf(cuda, "#include \"%s.h\"\n\n", argv[2]);
+	fprintf(cuda, "int unir(int *res, int rows, int tipo, int **ret, int final)\n");
+	fprintf(cuda, "{\n");
+	fprintf(cuda, "\tthrust::device_ptr<int> pt, re;\n");
+	for(x = 2; x <= num; x++)
+		fprintf(cuda, "\tthrust::device_ptr<s%d> pt%d, re%d;\n", x, x ,x);
+	for(x = 2; x <= num; x++)
+		fprintf(cuda, "\ts%d *t%d;\n", x, x);
+	fprintf(cuda, "\tint flag, nrows, *nres, size;\n\n");
+	fprintf(cuda, "#if TIMER\n");
+	fprintf(cuda, "\tcuda_stats.unions++;\n");
+	fprintf(cuda, "#endif\n\n");
+	fprintf(cuda, "\tswitch(tipo)\n");
+	fprintf(cuda, "\t{\n");
+	fprintf(cuda, "\t\tcase 1:\n");
+	fprintf(cuda, "\t\t{\n");
+	fprintf(cuda, "\t\t\tpt = thrust::device_pointer_cast(res);\n");
+	fprintf(cuda, "\t\t\tflag = 0;\n");
+	fprintf(cuda, "\t\t\twhile(flag != 1)\n");
+	fprintf(cuda, "\t\t\t{\n");
+	fprintf(cuda, "\t\t\t\ttry\n");
+	fprintf(cuda, "\t\t\t\t{\n");
+	fprintf(cuda, "\t\t\t\t\tthrust::sort(pt, pt + rows);\n");
+	fprintf(cuda, "\t\t\t\t\tif(final)\n");
+	fprintf(cuda, "\t\t\t\t\t{\n");
+	fprintf(cuda, "\t\t\t\t\t\tre = thrust::unique(pt, pt + rows, q1());\n");
+	fprintf(cuda, "\t\t\t\t\t\tre = thrust::unique(pt, re);\n");
+	fprintf(cuda, "\t\t\t\t\t}\n");
+	fprintf(cuda, "\t\t\t\t\telse\n");
+	fprintf(cuda, "\t\t\t\t\t\tre = thrust::unique(pt, pt + rows);\n");
+	fprintf(cuda, "\t\t\t\t\tflag = 1;\n");
+	fprintf(cuda, "\t\t\t\t}\n");
+	fprintf(cuda, "\t\t\t\tcatch(std::bad_alloc &e)\n");
+	fprintf(cuda, "\t\t\t\t{\n");
+	fprintf(cuda, "\t\t\t\t\tlimpiar(\"sort/unique in unir\", 0);\n");
+	fprintf(cuda, "\t\t\t\t}\n");
+	fprintf(cuda, "\t\t\t}\n");
+	fprintf(cuda, "\t\t\tnrows = thrust::distance(pt, re);\n");
+	fprintf(cuda, "\t\t\tif(nrows < rows / 2)\n");
+	fprintf(cuda, "\t\t\t{\n");
+	fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
+	fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
+	fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
+	fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
+	fprintf(cuda, "\t\t\t\t*ret = nres;\n");
+	fprintf(cuda, "\t\t\t}\n");
+	fprintf(cuda, "\t\t\treturn nrows;\n");
+	fprintf(cuda, "\t\t}\n");
+	for(x = 2; x <= num; x++)
+	{
+		fprintf(cuda, "\t\tcase %d:\n", x);
+		fprintf(cuda, "\t\t{\n");
+		fprintf(cuda, "\t\t\tt%d = (s%d*)res;\n", x, x);
+		fprintf(cuda, "\t\t\tpt%d = thrust::device_pointer_cast(t%d);\n", x, x);
+		fprintf(cuda, "\t\t\tflag = 0;\n");
+		fprintf(cuda, "\t\t\twhile(flag != 1)\n");
+		fprintf(cuda, "\t\t\t{\n");
+		fprintf(cuda, "\t\t\t\ttry\n");
+		fprintf(cuda, "\t\t\t\t{\n");
+		fprintf(cuda, "\t\t\t\t\tthrust::sort(pt%d, pt%d + rows, o%d());\n", x, x, x);
+		fprintf(cuda, "\t\t\t\t\tif(final)\n");
+		fprintf(cuda, "\t\t\t\t\t{\n");
+		fprintf(cuda, "\t\t\t\t\t\tre%d = thrust::unique(pt%d, pt%d + rows, q%d());\n", x, x, x, x);
+		fprintf(cuda, "\t\t\t\t\t\tre%d = thrust::unique(pt%d, re%d, p%d());\n", x, x, x, x);
+		fprintf(cuda, "\t\t\t\t\t}\n");
+		fprintf(cuda, "\t\t\t\t\telse\n");
+		fprintf(cuda, "\t\t\t\t\t\tre%d = thrust::unique(pt%d, pt%d + rows, p%d());\n", x, x, x, x);
+		fprintf(cuda, "\t\t\t\t\tflag = 1;\n");
+		fprintf(cuda, "\t\t\t\t}\n");
+		fprintf(cuda, "\t\t\t\tcatch(std::bad_alloc &e)\n");
+		fprintf(cuda, "\t\t\t\t{\n");
+		fprintf(cuda, "\t\t\t\t\tlimpiar(\"sort/unique in unir\", 0);\n");
+		fprintf(cuda, "\t\t\t\t}\n");
+		fprintf(cuda, "\t\t\t}\n");
+		fprintf(cuda, "\t\t\tnrows = thrust::distance(pt%d, re%d);\n", x, x);
+		fprintf(cuda, "\t\t\tif(nrows < rows / 2)\n");
+		fprintf(cuda, "\t\t\t{\n");
+		fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
+		fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
+		fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
+		fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
+		fprintf(cuda, "\t\t\t\t*ret = nres;\n");
+		fprintf(cuda, "\t\t\t}\n");
+		fprintf(cuda, "\t\t\treturn nrows;\n");
+		fprintf(cuda, "\t\t}\n");
+	}
+	fprintf(cuda, "\t}\n");
+	fprintf(cuda, "\treturn 0;\n");
+	fprintf(cuda, "}\n");
+
+	fclose(cuda);
+	sprintf(str, "%s.h", argv[2]);
+	cuda = fopen(str, "w"); /*tipo de archivo cambiar*/
+
+	fprintf(cuda, "#ifndef _");
+	for(x = 0; x < strlen(argv[2]); x++)
+		fprintf(cuda, "%c", toupper(argv[2][x]));
+	fprintf(cuda, "_H_\n");
+	fprintf(cuda, "#define _");
+	for(x = 0; x < strlen(argv[2]); x++)
+		fprintf(cuda, "%c", toupper(argv[2][x]));
+	fprintf(cuda, "_H_\n\n");
+	fprintf(cuda, "int unir(int *res, int rows, int tipo, int **ret, int final);\n\n");
+	for(x = 2; x <= num; x++)
+	{
+		fprintf(cuda, "typedef struct n%d\n", x);
+		fprintf(cuda, "{\n");
+		fprintf(cuda, "\tint v[%d];\n", x);
+		fprintf(cuda, "}s%d;\n\n", x);
+	}
+	fprintf(cuda, "struct q1\n");
+	fprintf(cuda, "{\n");
+	fprintf(cuda, "\t__host__ __device__\n");
+	fprintf(cuda, "\tbool operator()(const int &r1, const int &r2)\n");
+	fprintf(cuda, "\t{\n");
+	fprintf(cuda, "\t\tif(r1 != r2)\n");
+	fprintf(cuda, "\t\t\treturn true;\n");
+	fprintf(cuda, "\t\treturn false;\n");
+	fprintf(cuda, "\t}\n");
+	fprintf(cuda, "};\n\n");
+	for(x = 2; x <= num; x++)
+	{
+		fprintf(cuda, "struct p%d\n", x);
+		fprintf(cuda, "{\n");
+		fprintf(cuda, "\t__host__ __device__\n");
+		fprintf(cuda, "\tbool operator()(const s%d &r1, const s%d &r2)\n", x, x);
+		fprintf(cuda, "\t{\n");
+     		fprintf(cuda, "\t\tint x;\n");
+		fprintf(cuda, "\t\tfor(x = 0; x < %d; x++)\n", x);
+		fprintf(cuda, "\t\t{\n");
+		fprintf(cuda, "\t\t\tif(r1.v[x] != r2.v[x])\n");
+		fprintf(cuda, "\t\t\t\treturn false;\n");
+		fprintf(cuda, "\t\t}\n");
+		fprintf(cuda, "\t\treturn true;\n");
+    		fprintf(cuda, "\t}\n");
+		fprintf(cuda, "};\n\n");
+		fprintf(cuda, "struct q%d\n", x);
+		fprintf(cuda, "{\n");
+		fprintf(cuda, "\t__host__ __device__\n");
+		fprintf(cuda, "\tbool operator()(const s%d &r1, const s%d &r2)\n", x, x);
+		fprintf(cuda, "\t{\n");
+     		fprintf(cuda, "\t\tint x;\n");
+		fprintf(cuda, "\t\tfor(x = 0; x < %d; x++)\n", x);
+		fprintf(cuda, "\t\t{\n");
+		fprintf(cuda, "\t\t\tif(r1.v[x] != r2.v[x])\n");
+		fprintf(cuda, "\t\t\t\treturn true;\n");
+		fprintf(cuda, "\t\t}\n");
+		fprintf(cuda, "\t\treturn false;\n");
+    		fprintf(cuda, "\t}\n");
+		fprintf(cuda, "};\n\n");
+		fprintf(cuda, "struct o%d\n", x);
+		fprintf(cuda, "{\n");
+		fprintf(cuda, "\t__host__ __device__\n");
+		fprintf(cuda, "\tbool operator()(const s%d &r1, const s%d &r2)\n", x, x);
+		fprintf(cuda, "\t{\n");
+     		fprintf(cuda, "\t\tint x;\n");
+		fprintf(cuda, "\t\tfor(x = 0; x < %d; x++)\n", x);
+		fprintf(cuda, "\t\t{\n");
+		fprintf(cuda, "\t\t\tif(r1.v[x] > r2.v[x])\n");
+		fprintf(cuda, "\t\t\t\treturn true;\n");
+		fprintf(cuda, "\t\t\tif(r1.v[x] < r2.v[x])\n");
+		fprintf(cuda, "\t\t\t\treturn false;\n");
+		fprintf(cuda, "\t\t}\n");
+		fprintf(cuda, "\t\treturn false;\n");
+    		fprintf(cuda, "\t}\n");
+		fprintf(cuda, "};\n\n");
+	}
+	fprintf(cuda, "#endif\n");
+	fclose(cuda);
+	free(str);
+}
--- a/packages/cuda/dbio.cu
+++ b/packages/cuda/dbio.cu
@ -0,0 +1,603 @@
+#include <iostream>
+#include <algorithm>
+#include <stdio.h>
+#include "memory.h"
+#include "union2.h"
+#include "dbio.h"
+
+#ifdef DATALOG
+//template<class InputIterator>
+//void datalogWrite(int query, InputIterator rul_str, InputIterator fin, int finalDR, int **result)
+void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, int finalDR, int **result)
+{
+	rulenode tmprule;
+	vector<rulenode>::iterator qposr;
+	int *dop1, *hres;
+	int cols1, res_rows, tipo;
+	tmprule.name = query;
+	qposr = lower_bound(rul_str, fin, tmprule, comparer);
+	cols1 = qposr->num_columns;
+	res_rows = cargafinal(query, cols1, &dop1);
+
+	if(res_rows != 0)
+	{	
+		if(res_rows > 0)
+		{
+			if(finalDR)
+				res_rows = unir(dop1, res_rows, cols1, &dop1, 0);
+			tipo = res_rows * cols1 * sizeof(int);
+			hres = (int *)malloc(tipo);
+			cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
+			cudaFree(dop1);
+			*result = hres;
+		}
+		else
+		{
+			res_rows *= -1;
+			if(finalDR)
+			{
+				int *dop2;
+				tipo = res_rows * cols1 * sizeof(int);
+				reservar(&dop2, tipo); 
+				cudaMemcpy(dop2, dop1, tipo, cudaMemcpyHostToDevice);
+				free(dop1);
+				res_rows = unir(dop2, res_rows, cols1, &dop2, 0);
+				tipo = res_rows * cols1 * sizeof(int);
+				hres = (int *)malloc(tipo);
+				cudaMemcpy(hres, dop2, tipo, cudaMemcpyDeviceToHost);
+				cudaFree(dop2);
+				*result = hres;
+			}
+			else
+				*result = dop1;
+		}
+	}
+}
+#endif
+
+#ifdef TUFFY
+void postgresRead(PGconn **ret, vector<gpunode> *L, int *inpquery, char *names, int finalDR)
+{
+	PGresult *pgr;
+	int x, y;
+	int *mat, *mat2;
+	char *tok, sel[1024], **qrs;
+	int w, z = 0, numt, numc, numc2, start = 0, start2, val;
+	PGconn *conn = PQconnectdb("host=localhost port=5432 dbname = prueba user=tuffer password=root");
+	if(PQstatus(conn) != CONNECTION_OK)
+    	{
+        	fprintf(stderr, "Connection to database failed: %s", PQerrorMessage(conn));
+		exit(1);
+        }
+
+	pgr = PQexec(conn, "Select nspname from pg_catalog.pg_namespace where oid = (select max(oid) from pg_catalog.pg_namespace)");
+	sprintf(sel, "SET search_path = %s", PQgetvalue(pgr, 0, 0)); 
+	PQclear(pgr);
+	PQexec(conn, sel);
+	tok = strtok(names, " ");	
+	if(finalDR)
+	{
+		qrs = (char **)malloc(100 * sizeof(char *));
+		while(tok != NULL)
+		{
+			sprintf(sel, "Select * from %s limit 0", tok);
+			pgr = PQexec(conn, sel);
+			numc = L->at(z).num_columns;
+			if(tok[0] == 'c')
+			{
+				sprintf(sel, "Select ");
+				numt = numc + 1;
+				for(x = 1; x < numt; x++)
+				{
+					strcat(sel, PQfname(pgr, x));
+					strcat(sel, ", ");
+				}
+				sel[strlen(sel)-2] = '\0';
+				sprintf(sel, "%s from %s", sel, tok);
+			}
+			else
+			{
+				sprintf(sel, "Select id, Club, ");
+				numt = numc + 6;
+				for(x = 8; x < numt; x++)
+				{
+					strcat(sel, PQfname(pgr, x));
+					strcat(sel, ", ");
+				}
+				sel[strlen(sel)-2] = '\0';
+				sprintf(sel, "%s from %s", sel, tok);
+			}
+			PQclear(pgr);
+			pgr = PQexec(conn, sel);
+			numt = PQntuples(pgr);
+			mat = (int *)malloc(numt * numc * sizeof(int));
+			if(tok[0] == 'c')
+			{
+				for(x = 0; x < numt; x++)
+				{
+					start = x * numc;
+					for(y = 0; y < numc; y++)
+						mat[start + y] = atoi(PQgetvalue(pgr, x, y));
+				}
+			}
+			else
+			{
+				numc2 = numc - 2;
+				mat2 = (int *)malloc(numt * numc2 * sizeof(int));
+				start = 0;
+				start2 = 0;
+				for(x = 0; x < numt; x++)
+				{
+					w = atoi(PQgetvalue(pgr, x, 1));
+					if(w < 2)
+					{
+						mat[start] = atoi(PQgetvalue(pgr, x, 0));
+						start++;
+						mat[start] = w;
+						start++;
+						if(w > 0)
+						{
+							for(y = 2; y < numc; y++)
+							{
+								val = atoi(PQgetvalue(pgr, x, y));
+								mat[start] = val;
+								mat2[start2] = val;
+								start++;
+								start2++;
+							}
+						}
+						else
+						{
+							for(y = 2; y < numc; y++)
+							{
+								val = atoi(PQgetvalue(pgr, x, y));
+								mat[start] = val;
+								start++;
+							}
+						}
+					}
+					else
+					{
+						for(y = 2; y < numc; y++)
+						{
+							val = atoi(PQgetvalue(pgr, x, y));
+							mat2[start2] = val;
+							start2++;
+						}
+					}
+				}
+				L->at(z+1).address_host_table = mat2;
+				L->at(z+1).num_rows = start2 / numc2;
+			}
+			L->at(z).address_host_table = mat;
+			L->at(z).num_rows = start / numc;
+			PQclear(pgr);
+			
+			x = 1;
+			while(inpquery[x] != -1)
+			{
+				if(L->at(z).name == inpquery[x])
+				{
+					numt = (strlen(tok) + 1) * sizeof(char);
+					qrs[x] = (char *)malloc(numt);
+					memcpy(qrs[x], tok, numt);
+				}
+				x += 2;
+			}
+			if(tok[0] == 'c')
+			{
+				tok = strtok(NULL, " ");
+				z++;
+			}
+			else
+			{
+				strtok(NULL, " ");	
+				tok = strtok(NULL, " ");
+				z += 2;
+			}
+		}
+	}
+	else
+	{
+		while(tok != NULL)
+		{
+			sprintf(sel, "Select * from %s limit 0", tok);
+			pgr = PQexec(conn, sel);
+			numc = L->at(z).num_columns;
+			if(tok[0] == 'c')
+			{
+				sprintf(sel, "Select weight, myid, ");
+				start = 1;
+				numt = numc + 1;
+			}
+			else
+			{
+				sprintf(sel, "Select truth, Club, atomID, ");
+				start = 8;
+				numt = numc + 5;
+			}
+			for(x = start; x < numt; x++)
+			{
+				strcat(sel, PQfname(pgr, x));
+				strcat(sel, ", ");
+			}
+			sel[strlen(sel)-2] = '\0';
+			sprintf(sel, "%s from %s", sel, tok);
+			PQclear(pgr);
+			pgr = PQexec(conn, sel);
+			numt = PQntuples(pgr);
+			mat = (int *)malloc(numt * numc * sizeof(int)); 
+			L->at(z).weight = (double *)malloc(numt * sizeof(double));
+			L->at(z).num_rows = numt;
+
+			for(x = 0; x < numt; x++)
+			{	
+				start = x * numc;
+				for(y = 1; y < numc; y++)
+					mat[start + y] = atoi(PQgetvalue(pgr, x, y));
+			}
+
+			numt *= numc;
+			double flo;
+			if(tok[0] == 'c')
+			{
+				for(x = 0, y = 0; x < numt; x+=numc, y++)
+				{
+					flo = atof(PQgetvalue(pgr, y, 0));
+					L->at(z).weight[y] = flo;
+					if(flo > 0)
+						mat[x] = y + 1;
+					else
+						mat[x] = -y - 1;
+				}
+			}
+			else
+			{
+				for(x = 0, y = 0; x < numt; x+=numc, y++)
+				{
+					if(PQgetvalue(pgr, y, 0)[0] == 't')
+						mat[x] = 2;
+					else
+						mat[x] = 1;
+				}				
+			}
+			L->at(z).address_host_table = mat;
+			numc = (strlen(tok) + 1) * sizeof(char);
+			L->at(z).predname = (char *)malloc(numc);
+			memcpy(L->at(z).predname, tok, numc);
+			PQclear(pgr);
+			tok = strtok(NULL, " ");
+			z++;
+		}
+	}
+	*ret = conn;
+}
+
+void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, PGconn *conn, int finalDR)
+{
+	char sel[1024];
+	double *matw = NULL;
+	int qname, cols1, res_rows, tipo, *dop1;
+	int x, w, z, y, *hres;
+	rulenode tmprule;
+	vector<rulenode>::iterator qposr;
+	if(finalDR)
+	{
+		char file[] = "/dev/shm/mln0_atoms.csv";
+		z = 0;
+		int seqid = 1;
+		FILE *fp;
+		fp = fopen(file, "w");
+		if(fp == NULL)
+		{
+			cerr << "Failed to create main memory temporary file, attempting to use hardrive" << endl;
+			sprintf(file, "./temp/mln0_atoms.csv");
+			fp = fopen(file, "w");
+			if(fp == NULL)
+			{
+				cerr << "Failed to create main memory temporary file" << endl;
+				exit(1);
+			}
+		}
+		while((qname = inpquery[z]) != -1)
+		{
+			tmprule.name = qname;
+			qposr = lower_bound(rul_str, fin, tmprule, comparer);
+			cols1 = qposr->num_columns;
+			res_rows = cargafinal(qname, cols1, &dop1);
+
+			if(res_rows != 0)
+			{
+				if(res_rows < 0)
+					res_rows = unir(dop1, -res_rows, cols1, &dop1, 0);  /*duplicate elimination on result*/
+				else
+					res_rows = unir(dop1, res_rows, cols1, &dop1, finalDR);
+
+				tipo = res_rows * cols1 * sizeof(int);
+				hres = (int *)malloc(tipo);
+				cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
+				cudaFree(dop1);
+				w = z + 1;
+
+				strtok(qposr->rulename, "_");
+				strtok(NULL, "_");
+				int prid = atoi(strtok(NULL, "_"));
+
+				for(x = 0, w = 0; x < res_rows; x++, w+=2)
+				{
+					if(hres[w+1])
+						fprintf(fp, "%d,%d,%d,true\n", seqid, hres[w], prid);
+					else
+						fprintf(fp, "%d,%d,%d,false\n", seqid, hres[w], prid);
+					seqid++;
+				}
+				free(hres);
+			}
+			z += 2;
+		}
+		fclose(fp);
+		sprintf(sel, "Copy mln0_atoms(atomid,tupleID,predID,isquery) from '%s' CSV", file);
+		PQexec(conn, sel);
+	}
+	else
+	{
+		while(rul_str != fin)
+		{
+			cols1 = rul_str->num_columns;
+			res_rows = cargafinal(rul_str->name, cols1, &dop1);
+			if(res_rows == 0)
+			{
+				rul_str++;
+				continue;
+			}
+			res_rows = abs(res_rows);
+			tipo = res_rows * cols1 * sizeof(int);
+			hres = (int *)malloc(tipo);
+			cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
+			cudaFree(dop1);
+
+			char file[] = "/dev/shm/buffer.csv";
+			FILE *fp;
+			fp = fopen(file, "w");
+			if(fp == NULL)
+			{
+				cerr << "Failed to create main memory temporary file, attempting to use hardrive" << endl;
+				sprintf(file, "./temp/buffer.csv");
+				fp = fopen(file, "w");
+				if(fp == NULL)
+				{
+					cerr << "Failed to create main memory temporary file" << endl;
+					exit(1);
+				}
+			}
+
+			if(rul_str->rulename[0] == 'z')
+			{
+				char *name = rul_str->rulename + 1;
+				for(x = 0; x < ninpf; x++)
+				{
+					if(strncmp(L->at(x).predname, name, strlen(name)) == 0)
+					{
+						matw = L->at(x).weight;
+						break;
+					}
+				}
+
+				cols1 -= 3;
+				for(x = 0, z = 0; x < res_rows; x++, z+=3)
+				{
+					for(y = 0; y < cols1; y++, z++)
+						fprintf(fp, "%d,", hres[z]);
+					fprintf(fp, "%d,%lf,%d\n", hres[z], matw[abs(hres[z+1])-1], hres[z+2]);
+				}
+				fclose(fp);
+				sprintf(sel, "Copy %s from '%s' CSV", name, file);
+				PQexec(conn, sel);
+			}
+			else
+			{
+				cols1--;
+				for(x = 0, z = 0; x < res_rows; x++, z++)
+				{
+					for(y = 0; y < cols1; y++, z++)
+						fprintf(fp, "%d,", hres[z]);
+					fprintf(fp, "%d\n", hres[z]);
+				}
+				fclose(fp);
+				sprintf(sel, "Copy %s from '%s' CSV", rul_str->rulename, file);
+				PQexec(conn, sel);
+			}
+			free(hres);
+			rul_str++;
+		}
+	}
+	PQfinish(conn);
+	if(finalDR)
+		clear_memory_all();
+}
+#endif
+
+#ifdef ROCKIT
+void mysqlRead(MYSQL **ret, int *qrs, vector<gpunode> *L, int ninpf, char *names, int finalDR)
+{
+	char *tok, sel[1024];
+	int w, x, y, z = 0, numt, numc;
+	int *mat;
+	MYSQL *con = mysql_init(NULL);
+	if(con == NULL)
+	{
+		fprintf(stderr, "mysql_init() failed\n");
+      		exit(1);
+	}
+	mysql_options(con, MYSQL_OPT_LOCAL_INFILE, NULL);
+	mysql_real_connect(con, "localhost", "root", "root", "rockit", 0, NULL, 0);
+	if(finalDR)
+	{
+		y = 0;
+		while(qrs[y] != 0)
+		{
+			for(z = 0; z < ninpf; z++)
+			{
+				if(qrs[y] == L->at(z).name)
+				{
+					MYSQL_ROW row;
+					sprintf(sel, "Select count(*) from %s", L->at(z).predname);
+					mysql_query(con, sel);
+					MYSQL_RES *result = mysql_store_result(con);
+					row = mysql_fetch_row(result);
+					numt = atoi(row[0]);
+					mysql_free_result(result);
+
+					if(numt != L->at(z).num_rows)
+					{
+						liberar(L->at(z).name);
+						numc = L->at(z).num_columns;
+						sprintf(sel, "Select * from %s", L->at(z).predname);
+						mysql_query(con, sel);
+						MYSQL_RES *result = mysql_store_result(con);
+						mat = (int *)malloc(numt * numc * sizeof(int));
+						w = 0;
+						while ((row = mysql_fetch_row(result))) 
+						{
+							for(x = 0; x < numc; x++, w++)
+								mat[w] = atoi(row[x]);
+						}
+
+						mysql_free_result(result);
+						if(L->at(z).address_host_table != NULL)
+							free(L->at(z).address_host_table);
+						L->at(z).address_host_table = mat;
+						L->at(z).num_rows = numt;
+					}
+				}
+			}
+			y++;
+		}
+	}
+	else
+	{
+		tok = strtok(names, " ");
+		while(tok != NULL)
+		{
+			numc = L->at(z).num_columns;
+			sprintf(sel, "Select * from %s", tok);
+			mysql_query(con, sel);
+			MYSQL_RES *result = mysql_store_result(con);
+			numt = mysql_num_rows(result);
+
+			MYSQL_ROW row;
+			mat = (int *)malloc(numt * numc * sizeof(int));
+			w = 0;
+			if(tok[0] == 'f' && tok[1] >= '0' && tok[1] <= '9')
+			{
+				while ((row = mysql_fetch_row(result))) 
+				{
+					for(x = 1; x <= numc; x++, w++)
+						mat[w] = atoi(row[x]);
+				}
+			}
+			else
+			{
+				while ((row = mysql_fetch_row(result))) 
+				{
+					for(x = 0; x < numc; x++, w++)
+						mat[w] = atoi(row[x]);
+				}
+			}
+			mysql_free_result(result);
+			L->at(z).address_host_table = mat;
+			L->at(z).num_rows = numt;
+
+			numc = (strlen(tok) + 1) * sizeof(char);
+			L->at(z).predname = (char *)malloc(numc);
+			strcpy(L->at(z).predname, tok);
+			tok = strtok(NULL, " ");
+			z++;
+		}
+	}
+	*ret = con;
+}
+
+void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, MYSQL *con)
+{
+	int x, y, z, cols1, cols2, res_rows, tipo;
+	int *hres, *dop1;
+	char *id, *sign, *q1, *q2;
+	char sel[1024], weight[1024];
+	gpunode tmpfact;
+	while(rul_str != fin)
+	{
+		cols1 = rul_str->num_columns;
+		res_rows = cargafinal(rul_str->name, cols1, &dop1);
+		id = strtok(rul_str->rulename, "_");
+		sprintf(sel, "create table if not exists %s(weight double, ", id);
+		for(x = 0; x < cols1; x++)
+		{
+			sprintf(weight, "a%d char(10), ", x);
+			strcat(sel, weight);
+		}
+		sel[strlen(sel)-2] = ')';
+		strcat(sel, "ENGINE = MEMORY DEFAULT CHARSET=latin1");
+		mysql_query(con, sel);
+		sprintf(sel, "truncate %s", id);
+		mysql_query(con, sel);
+
+		if(res_rows == 0)
+		{
+			rul_str++;
+			continue;
+		}
+
+		if(res_rows > 0)
+		{
+			tmpfact = L->at(-rul_str->referencias[rul_str->num_rows - 2] - 1);
+			sign = tmpfact.predname;
+			tipo = res_rows * cols1 * sizeof(int);
+			hres = (int *)malloc(tipo);
+			cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
+			if(sign[0] == 'f' && sign[1] >= '0' && sign[1] <= '9')
+				sumar(tmpfact.name, dop1, cols1, res_rows);
+		}
+		else
+		{
+			hres = dop1;
+			res_rows = -res_rows;
+		}
+
+		sign = strtok(NULL, "_");
+		q1 = strtok(NULL, "_");
+		q2 = strtok(NULL, "_");
+		if(sign[0] == '0')
+			sprintf(weight, "%s.%s", q1, q2);
+		else
+			sprintf(weight, "-%s.%s", q1, q2);
+
+		FILE *fp;
+		char file[512];
+		sprintf(file, "/dev/shm/%s.tsv", id);
+		fp = fopen(file, "w");
+		if(fp == NULL)
+		{
+			cerr << "Failed to create main memory temporary file, attempting to use hardrive" << endl;
+			sprintf(file, "./temp/%s.tsv", id);
+			fp = fopen(file, "w");
+		}
+		
+		cols2 = cols1 - 1;
+		for(x = 0, z = 0; x < res_rows; x++, z++)
+		{
+			fprintf(fp, "%s\t", weight);
+			for(y = 0; y < cols2; y++, z++)
+				fprintf(fp, "%d\t", hres[z]);
+			fprintf(fp, "%d\n", hres[z]);
+		}
+		fclose(fp);
+
+		sprintf(sel, "LOAD DATA LOCAL INFILE '%s' INTO TABLE %s", file, id);
+		mysql_query(con, sel);
+		rul_str++;
+	}
+	mysql_close(con);
+}
+#endif
+
--- a/packages/cuda/dbio.h
+++ b/packages/cuda/dbio.h
@ -0,0 +1,28 @@
+#ifndef _DBIO_H_
+#define _DBIO_H_
+
+#include "pred.h"
+#ifdef TUFFY
+#include <libpq-fe.h>
+#endif
+#ifdef ROCKIT
+#include <mysql/mysql.h>
+#endif
+#include <vector>
+#include "lista.h"
+
+using namespace std;
+
+#ifdef TUFFY
+void postgresRead(PGconn **ret, vector<gpunode> *L, int *inpquery, char *names, int finalDR);
+void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, PGconn *conn, int finalDR);
+#endif
+#ifdef ROCKIT
+void mysqlRead(MYSQL **ret, int *qrs, vector<gpunode> *L, int ninpf, char *names, int finalDR);
+void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, MYSQL *con);
+#endif
+#ifdef DATALOG
+void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, int finalDR, int **result);
+#endif
+
+#endif
--- a/packages/cuda/joincpu.cpp
+++ b/packages/cuda/joincpu.cpp
@ -0,0 +1,500 @@
+#include "CC_CSSTree.h"
+#include <vector>
+#include <thrust/sort.h>
+#include <thrust/system/omp/execution_policy.h>
+#include "pred.h"
+
+void partInlj(Record *R, int rLen, CC_CSSTree *tree, Record *S, int startS, int endS, int of1, int of2, vector<int> *res, int *p1, int *p2, int *perm, int *proj, int wj, int halfrul, int lenrul)
+{
+	//set_thread_affinity(cpuid,NUM_T);
+	int i=0;
+	int k=0;
+	int curIndex=0;
+	int keyForSearch;
+	int y, posS, posR;
+	for(k=startS; k<endS; k++)
+	{
+		if(S == NULL)
+			posS = k * of2;
+		else
+			posS = S[k] * of2;
+		keyForSearch=p2[posS + wj];
+		curIndex=tree->search(keyForSearch);
+		for(i=curIndex-1;i>0;i--)
+		{
+			if(keyForSearch == R[i])
+			{
+
+				//cout << keyForSearch << endl;
+
+				posR = perm[i] * of1;
+				for(y = 0; y < halfrul; y++)
+					res->push_back(p1[posR + proj[y]]);
+				for(; y < lenrul; y++)
+					res->push_back(p2[posS + proj[y]]);
+			}
+			else
+				if(R[i]<keyForSearch)
+				break;
+		}
+		for(i=curIndex;i<rLen;i++)
+		{	
+			if(keyForSearch == R[i])
+			{
+
+				//cout << -i << " " << keyForSearch << endl;
+
+				posR = perm[i] * of1;
+				for(y = 0; y < halfrul; y++)
+					res->push_back(p1[posR + proj[y]]);
+				for(; y < lenrul; y++)
+					res->push_back(p2[posS + proj[y]]);
+			}
+			else
+				if(R[i]>keyForSearch)
+				break;
+		}
+	}
+}
+
+void partInlj2(Record *R, int rLen, CC_CSSTree *tree, Record *S, int startS, int endS, int of1, int of2, vector<int> *res, int *p1, int *p2, int *perm, int *proj, int cols, int wj)
+{
+	//set_thread_affinity(cpuid,NUM_T);
+	int i=0;
+	int k=0;
+	int curIndex=0;
+	int keyForSearch;
+	int y, cond, posS, posR;
+	for(k=startS; k<endS; k++)
+	{
+		if(S == NULL)
+			posS = k * of2;
+		else
+			posS = S[k] * of2;
+		keyForSearch=p2[posS + wj];
+		curIndex=tree->search(keyForSearch);
+		for(i=curIndex-1;i>0;i--)
+		{
+			if(keyForSearch == R[i])
+			{
+
+				//cout << keyForSearch << endl;
+
+				posR = perm[i] * of1 - 1;
+				for(y = 0; y < cols; y++)
+				{
+					cond = proj[y];
+					if(cond > 0)
+						res->push_back(p1[posR + cond]);
+					else
+						res->push_back(p2[posS - cond - 1]);
+				}
+			}
+			else
+				if(R[i]<keyForSearch)
+				break;
+		}
+		for(i=curIndex;i<rLen;i++)
+		{	
+			if(keyForSearch == R[i])
+			{
+
+				//cout << -i << " " << keyForSearch << endl;
+
+				posR = perm[i] * of1 - 1;
+				for(y = 0; y < cols; y++)
+				{
+					cond = proj[y];
+					if(cond > 0)
+						res->push_back(p1[posR + cond]);
+					else
+						res->push_back(p2[posS - cond - 1]);
+				}
+			}
+			else
+				if(R[i]>keyForSearch)
+				break;
+		}
+	}
+}
+
+void multipartInlj(Record *R, int rLen, CC_CSSTree *tree, Record *S, int startS, int endS, int of1, int of2, vector<int> *res, int *p1, int *p2, int *perm, int *proj, int *wj, int numj, int halfrul, int lenrul)
+{
+	//set_thread_affinity(cpuid,NUM_T);
+	int i=0;
+	int k=0;
+	int curIndex=0;
+	int keyForSearch;
+	int y, posS, posR;
+	for(k=startS; k<endS; k++)
+	{
+		if(S == NULL)
+			posS = k * of2;
+		else
+			posS = S[k] * of2;
+		keyForSearch=p2[posS + wj[1]];
+		curIndex=tree->search(keyForSearch);
+		for(i=curIndex-1;i>0;i--)
+		{
+			if(keyForSearch == R[i])
+			{
+				posR = perm[i] * of1;
+				for(y = 2; y < numj; y += 2)
+				{
+					if(p1[posR + wj[y]] != p2[posS + wj[y+1]])
+						break;
+				}
+				if(y < numj)
+					continue;
+				for(y = 0; y < halfrul; y++)
+					res->push_back(p1[posR + proj[y]]);
+				for(; y < lenrul; y++)
+					res->push_back(p2[posS + proj[y]]);
+			}
+			else
+				if(R[i]<keyForSearch)
+				break;
+		}
+		for(i=curIndex;i<rLen;i++)
+		{	
+			if(keyForSearch == R[i])
+			{
+				posR = perm[i] * of1;
+				for(y = 2; y < numj; y += 2)
+				{
+					if(p1[posR + wj[y]] != p2[posS + wj[y+1]])
+						break;
+				}
+				if(y < numj)
+					continue;
+				for(y = 0; y < halfrul; y++)
+					res->push_back(p1[posR + proj[y]]);
+				for(; y < lenrul; y++)
+					res->push_back(p2[posS + proj[y]]);
+			}
+			else
+				if(R[i]>keyForSearch)
+				break;
+		}
+	}
+}
+
+void multipartInlj2(Record *R, int rLen, CC_CSSTree *tree, Record *S, int startS, int endS, int of1, int of2, vector<int> *res, int *p1, int *p2, int *perm, int *proj, int cols, int *wj, int numj)
+{
+	//set_thread_affinity(cpuid,NUM_T);
+	int i=0;
+	int k=0;
+	int curIndex=0;
+	int keyForSearch;
+	int y, cond, posS, posR;
+	for(k=startS; k<endS; k++)
+	{
+		if(S == NULL)
+			posS = k * of2;
+		else
+			posS = S[k] * of2;
+		keyForSearch=p2[posS + wj[1]];
+		curIndex=tree->search(keyForSearch);
+		for(i=curIndex-1;i>0;i--)
+		{
+			if(keyForSearch == R[i])
+			{
+				posR = perm[i] * of1;
+				for(y = 2; y < numj; y += 2)
+				{
+					if(p1[posR + wj[y]] != p2[posS + wj[y+1]])
+						break;
+				}
+				if(y < numj)
+					continue;
+				for(y = 0; y < cols; y++)
+				{
+					cond = proj[y];
+					if(cond > 0)
+						res->push_back(p1[posR + cond - 1]);
+					else
+						res->push_back(p2[posS - cond - 1]);
+				}
+			}
+			else
+				if(R[i]<keyForSearch)
+				break;
+		}
+		for(i=curIndex;i<rLen;i++)
+		{	
+			if(keyForSearch == R[i])
+			{
+				posR = perm[i] * of1;
+				for(y = 2; y < numj; y += 2)
+				{
+					if(p1[posR + wj[y]] != p2[posS + wj[y+1]])
+						break;
+				}
+				if(y < numj)
+					continue;
+				for(y = 0; y < cols; y++)
+				{
+					cond = proj[y];
+					if(cond > 0)
+						res->push_back(p1[posR + cond - 1]);
+					else
+						res->push_back(p2[posS - cond - 1]);
+				}
+			}
+			else
+				if(R[i]>keyForSearch)
+				break;
+		}
+	}
+}
+
+void inlj_omp(Record *R, int rLen, CC_CSSTree *tree, Record *S, int sLen, int of1, int of2, vector<int> *res, int *p1, int *p2, int *perm, int *proj, int2 projp, int cols, int* wj, int numj, int tipo)
+{
+	int i=0;
+	int j=0;
+	int *startS=new int[NUM_T];
+	int *endS=new int[NUM_T];
+	int chunkSize=sLen/NUM_T;
+	for(i=0;i<NUM_T;i++)
+	{
+		startS[i]=i*chunkSize;
+		if(i==(NUM_T-1))
+			endS[i]=sLen;
+		else
+			endS[i]=(i+1)*chunkSize;
+
+		//cout<<"T"<<i<<", "<<endS[i]-startS[i]<<"; ";
+
+	}
+
+	//cout<<endl;
+
+	//omp_set_num_threads(NUM_T);
+
+	//cout << "inicio" << endl;
+
+	#pragma omp parallel for
+	for(j=0;j<NUM_T;j++)
+	{
+		if(tipo)
+		{
+			if(numj > 2)
+				multipartInlj2(R, rLen, tree, S, startS[j], endS[j], of1, of2, &res[j], p1, p2, perm, proj, cols, wj, numj);
+			else
+				partInlj2(R, rLen, tree, S, startS[j], endS[j], of1, of2, &res[j], p1, p2, perm, proj, cols, wj[1]);
+		}
+		else
+		{
+			if(numj > 2)
+				multipartInlj(R, rLen, tree, S, startS[j], endS[j], of1, of2, &res[j], p1, p2, perm, proj, wj, numj, projp.x, projp.y);
+			else
+				partInlj(R, rLen, tree, S, startS[j], endS[j], of1, of2, &res[j], p1, p2, perm, proj, wj[1], projp.x, projp.y);
+		}
+	}
+	
+	//cout << "fin" << endl;
+	
+	delete startS;
+	delete endS;
+}
+
+int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>::iterator rule, int pos, int bothops, int **ret)
+{
+	int pos2 = pos + 1;
+	int *sel1, nsel1 = 0;
+	int *sel2 = rule->select[pos2];
+	int nsel2 = rule->numsel[pos2];
+	int *proj = rule->project[pos];
+	int2 projp = rule->projpos[pos];
+	int *sjoin1, nsj1 = 0;
+	int *sjoin2 = rule->selfjoin[pos2];
+	int nsj2 = rule->numselfj[pos2];
+	int *wherej = rule->wherejoin[pos];
+	int numj = rule->numjoin[pos];
+	int size, *fres, ini[NUM_T], *temp;
+	int x, tipo = 0;
+	int *Sres = NULL, *Rres, Snl, Rnl, *permutation;
+
+	if(bothops)
+	{
+		sel1 = rule->select[pos];
+		nsel1 = rule->numsel[pos];
+		sjoin1 = rule->selfjoin[pos];
+		nsj1 = rule->numselfj[pos];
+	}
+
+	#ifdef TIMER
+	cudaEvent_t start, stop;
+	float time;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord(start, 0);
+	#endif
+
+	if(nsel1 > 0 || nsj1 > 0)
+		Rnl = selectproyectcpu2(p1, rLen, of1, sel1, nsel1, sjoin1, nsj1, wherej[0], &Rres, &permutation);
+	else
+	{
+
+		/*cout << "sin sel" << endl;
+		cout << "valores = " << rLen << " " << of1 << " " << wherej[0] << endl;
+		for(x = 0; x < 100; x++)
+			cout << p1[x] << " ";
+		cout << endl;
+		cout << "ultimo = " << p1[of1 * rLen - 1] << endl;*/ 
+
+		Rnl = rLen;
+		size = Rnl * sizeof(int);
+		permutation = (int *)malloc(size);
+		Rres = (int *)malloc(size);
+		#pragma omp parallel for firstprivate(of1)
+		for(x = 0; x < Rnl; x++)
+		{
+			permutation[x] = x;
+			Rres[x] = p1[of1 * x + wherej[0]];
+		}
+
+		//cout << "sin sel fin" << endl;
+
+	}
+
+	#ifdef TIMER
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	cuda_stats.select1_time += time;
+
+	cudaEventDestroy(start);
+	cudaEventDestroy(stop);
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord(start, 0);
+	#endif
+
+	if(nsel2 > 0 || nsj2 > 0)
+	{
+		//cout << "con sel S" << endl;
+
+		Snl = selectproyectcpu2(p2, sLen, of2, sel2, nsel2, sjoin2, nsj2, wherej[1], &Sres, NULL);
+	}
+	else
+		Snl = sLen;
+
+	#ifdef TIMER
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	cuda_stats.select2_time += time;
+
+	cudaEventDestroy(start);
+	cudaEventDestroy(stop);
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord(start, 0);
+	#endif
+
+	//cout << "antes" << endl;
+
+	/*cout << "antes" << endl;
+	for(x = 0; x < Rnl; x++)
+		cout << permutation[x] << " ";
+	cout << endl;
+	for(x = 0; x < 100; x++)
+		cout << Rres[x] << " ";
+	cout << endl;*/
+
+	thrust::stable_sort_by_key(thrust::omp::par, Rres, Rres + Rnl, permutation);
+
+	#ifdef TIMER
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	cuda_stats.sort_time += time;
+	
+	cudaEventDestroy(start);
+	cudaEventDestroy(stop);
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	cudaEventRecord(start, 0);
+	#endif
+
+	/*cout << "despues" << endl;
+	for(x = 0; x < Rnl; x++)
+		cout << permutation[x] << " ";
+	cout << endl;
+	for(x = 0; x < Rnl; x++)
+		cout << Rres[x] << " ";
+	cout << endl;*/
+
+	//cout << "despues sort" << endl;
+
+	vector<int> *res = new vector<int>[NUM_T];
+	for(x = 0; x < NUM_T; x++)
+		res[x].reserve(INISIZE);
+	CC_CSSTree *tree = new CC_CSSTree(Rres, Rnl, CSS_TREE_FANOUT);
+	if(pos == (rule->num_rows - 3)) // && rule->num_bpreds.x == 0)
+		tipo = 1;
+	inlj_omp(Rres, Rnl, tree, Sres, Snl, of1, of2, res, p1, p2, permutation, proj, projp, rule->num_columns, wherej, numj, tipo);
+
+	/*cout << "proj = ";
+	for(x = 0; x < rule->num_columns; x++)
+		cout << proj[x] << " ";
+	cout << endl;
+	int y,z;
+	for(x = 0; x < NUM_T; x++)
+	{
+		cout << "Thread " << x << endl;
+		for(y = 0; y < res[x].size() / projp.y; y++)
+		{	
+			for(z = 0; z < projp.y; z++)
+				cout << res[x][y * projp.y + z] << " ";
+			cout << endl;
+		}
+	}
+	cout << "Tamanios" << endl;*/
+
+	size = 0;
+	for(x = 0; x < NUM_T; x++)
+	{
+		ini[x] = res[x].size();
+		size += ini[x];
+		
+		//cout << ini[x] << " " << size << endl;
+
+	}
+	fres = (int *)malloc(size * sizeof(int));
+	temp = fres;
+	for(x = 0; x < NUM_T; x++)
+	{
+		memcpy(temp, res[x].data(), ini[x] * sizeof(int));
+		temp += ini[x];
+	}
+
+	if(*ret != NULL)
+		free(*ret);
+	free(Rres);
+	free(permutation);
+	if(Sres != NULL)
+		free(Sres);
+	delete tree;
+	delete [] res;
+
+	*ret = fres;
+
+	#ifdef TIMER
+	cudaEventRecord(stop, 0);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&time, start, stop);
+	cuda_stats.join_time += time;
+	#endif
+
+	//cout << "Projp.x = " << projp.x << " projp.y = " << projp.y << endl;
+
+	/*if(numj > 2)
+	{
+		cout << "total = " << rLen << " " << size / projp.y << " " << projp.y << " " << rule->num_columns << endl;
+		exit(1);
+	}*/
+
+	return size / projp.y;
+}
--- a/packages/cuda/selectproyectcpu.cpp
+++ b/packages/cuda/selectproyectcpu.cpp
@ -0,0 +1,394 @@
+#include <stdlib.h>
+#include <string.h>
+#include <omp.h>
+#include <vector>
+#include "pred.h"
+
+using namespace std;
+
+int selectproyectcpu(int *dop1, int rows, int cols, int head_size, int *select, int numselect, int *selfjoin, int numselfj, int *project, int **ret)
+{
+	int size = 0, pos, temp;
+	int i, x, y, z, w;
+	int *fres, *ptr;
+	int div, fin, ini[NUM_T + 1];
+	vector<int> vec[NUM_T];
+
+	for(x = 0; x < NUM_T; x++)
+		vec[x].reserve(INISIZE);
+
+	//omp_set_num_threads(NUM_T);
+	div = rows / NUM_T;
+	ini[0] = 0;
+	for(x = 1; x < NUM_T; x++)
+		ini[x] = div * x;
+	ini[NUM_T] = rows;
+
+	if(numselect > 0)
+	{	
+		#pragma omp parallel for private(x,pos,y,z,fin,temp,w)
+		for(i = 0; i < NUM_T; i++)
+		{
+			fin = ini[i+1];
+			for(x = ini[i]; x < fin; x++)
+			{
+				pos = x * cols;
+				for(y = 0; y < numselect; y += 2)
+				{
+					if(dop1[pos+select[y]] != select[y+1])
+						break;
+				}
+				for(z = 0; z < numselfj; z++)
+				{
+					temp = dop1[pos+selfjoin[z]];
+					w = z + 1;
+					while(selfjoin[w] > -1)
+					{
+						if(temp != dop1[pos+selfjoin[w]])
+							break;
+						w++;
+					}
+					z = w;
+					if(selfjoin[w] != -1)
+						break;
+				}
+				if(y == numselect && z == numselfj)
+				{
+					for(y = 0; y < head_size; y++)
+						vec[i].push_back(dop1[pos+project[y]]);
+				}
+			}
+		}
+	}
+	else
+	{
+		if(numselfj > 0)
+		{
+			#pragma omp parallel for private(x,pos,y,z,fin,w,temp)
+			for(i = 0; i < NUM_T; i++)
+			{
+				fin = ini[i+1];
+				for(x = ini[i]; x < fin; x++)
+				{
+					pos = x * cols;
+					for(z = 0; z < numselfj; z++)
+					{
+						temp = dop1[pos+selfjoin[z]];
+						w = z + 1;
+						while(selfjoin[w] > -1)
+						{
+							if(temp != dop1[pos+selfjoin[w]])
+								break;
+							w++;
+						}
+						z = w;
+						if(selfjoin[w] != -1)
+							break;
+					}
+					if(z == numselfj)
+					{
+						for(y = 0; y < head_size; y++)
+							vec[i].push_back(dop1[pos+project[y]]);
+					}
+				}
+			}
+		}
+		else
+		{
+			fres = (int *)malloc(rows * cols * sizeof(int));
+			#pragma omp parallel for private(pos,y,z)
+			for(x = 0; x < rows; x++)
+			{
+				pos = x * cols;
+				z = pos;
+				for(y = 0; y < head_size; y++, z++)
+					fres[z] = dop1[pos+project[y]];
+			}
+			*ret = fres;
+			return rows;
+		}
+	}
+	for(x = 0; x < NUM_T; x++)
+	{
+		ini[x] = vec[x].size();
+		size += ini[x];
+	}
+	fres = (int *)malloc(size * sizeof(int));
+	ptr = fres;
+	for(x = 0; x < NUM_T; x++)
+	{
+		memcpy(ptr, vec[x].data(), ini[x] * sizeof(int));
+		ptr += ini[x];
+	}
+	*ret = fres;
+	return size / head_size;
+}
+
+int selectproyectcpu2(int *dop1, int rows, int cols, int *select, int numselect, int *selfjoin, int numselfj, int wj, int **ret, int **list)
+{
+	int size = 0, pos, temp;
+	int i, x, y, z, w;
+	int *fres, *lres, *ptr;
+	int div, fin, ini[NUM_T + 1];
+	int *vec[NUM_T];
+	int *lis[NUM_T];
+	int cont[NUM_T];
+	
+	//omp_set_num_threads(NUM_T);
+	div = rows / NUM_T;
+	ini[0] = 0;
+	for(x = 1; x < NUM_T; x++)
+		ini[x] = div * x;
+	ini[NUM_T] = rows;
+
+	pos = div + NUM_T;
+	for(x = 0; x < NUM_T; x++)
+	{
+		vec[x] = (int *)malloc(pos * sizeof(int));
+		lis[x] = (int *)malloc(pos * sizeof(int));
+		cont[x] = 0;
+	}
+
+	/*cout << "numselect = " << numselect << endl;
+	for(x = 0; x < numselect; x++)
+		cout << select[x] << " ";
+	cout << endl;*/
+
+	if(numselect > 0)
+	{	
+		#pragma omp parallel for private(x,pos,y,z,fin,temp,w)
+		for(i = 0; i < NUM_T; i++)
+		{
+			fin = ini[i+1];
+			for(x = ini[i]; x < fin; x++)
+			{
+				pos = x * cols;
+				for(y = 0; y < numselect; y += 2)
+				{
+					if(dop1[pos+select[y]] != select[y+1])
+						break;
+				}
+				for(z = 0; z < numselfj; z++)
+				{
+					temp = dop1[pos+selfjoin[z]];
+					w = z + 1;
+					while(selfjoin[w] > -1)
+					{
+						if(temp != dop1[pos+selfjoin[w]])
+							break;
+						w++;
+					}
+					z = w;
+					if(selfjoin[w] != -1)
+						break;
+				}
+				if(y == numselect && z == numselfj)
+				{
+					lis[i][cont[i]] = x;
+					if(list != NULL)
+						vec[i][cont[i]] = dop1[pos+wj];
+					cont[i]++;
+				}
+			}
+		}
+	}
+	else
+	{
+		if(numselfj > 0)
+		{
+			#pragma omp parallel for private(x,pos,y,z,fin,w,temp) firstprivate(cont)
+			for(i = 0; i < NUM_T; i++)
+			{
+				fin = ini[i+1];
+				for(x = ini[i]; x < fin; x++)
+				{
+					pos = x * cols;
+					for(z = 0; z < numselfj; z++)
+					{
+						temp = dop1[pos+selfjoin[z]];
+						w = z + 1;
+						while(selfjoin[w] > -1)
+						{
+							if(temp != dop1[pos+selfjoin[w]])
+								break;
+							w++;
+						}
+						z = w;
+						if(selfjoin[w] != -1)
+							break;
+					}
+					if(z == numselfj)
+					{
+						lis[i][cont[i]] = x;
+						if(list != NULL)
+							vec[i][cont[i]] = dop1[pos+wj];
+						cont[i]++;
+					}
+				}
+			}
+		}
+	}
+
+	//cout << "despues sel" << endl;
+
+	for(x = 0; x < NUM_T; x++)
+		size += cont[x];
+	lres = (int *)malloc(size * sizeof(int));
+	ptr = lres;
+	for(x = 0; x < NUM_T; x++)
+	{
+		memcpy(ptr, lis[x], cont[x] * sizeof(int));
+		ptr += cont[x];
+	}
+	if(list != NULL)
+	{
+		fres = (int *)malloc(size * sizeof(int));
+		ptr = fres;
+		for(x = 0; x < NUM_T; x++)
+		{
+			memcpy(ptr, vec[x], cont[x] * sizeof(int));
+			ptr += cont[x];
+		}
+		*ret = fres;	
+		*list = lres;
+	}
+	else
+		*ret = lres;
+	for(x = 0; x < NUM_T; x++)
+	{
+		free(lis[x]);
+		free(vec[x]);
+	}
+	return size;
+}
+
+int selectproyectcpu3(int *dop1, int rows, int cols, int *select, int numselect, int *selfjoin, int numselfj, int wj, int **ret, int **list)
+{
+	int size = 0, pos, temp;
+	int i, x, y, z, w;
+	int *fres, *lres, *ptr;
+	int div, fin, ini[NUM_T + 1];
+	vector<int> vec[NUM_T];
+	vector<int> lis[NUM_T];
+
+	for(x = 0; x < NUM_T; x++)
+	{
+		vec[x].reserve(INISIZE);
+		lis[x].reserve(INISIZE);
+	}
+	
+	//omp_set_num_threads(NUM_T);
+	div = rows / NUM_T;
+	ini[0] = 0;
+	for(x = 1; x < NUM_T; x++)
+		ini[x] = div * x;
+	ini[NUM_T] = rows;
+
+	/*cout << "numselect = " << numselect << endl;
+	for(x = 0; x < numselect; x++)
+		cout << select[x] << " ";
+	cout << endl;*/
+
+	if(numselect > 0)
+	{	
+		#pragma omp parallel for private(x,pos,y,z,fin,temp,w)
+		for(i = 0; i < NUM_T; i++)
+		{
+			fin = ini[i+1];
+			for(x = ini[i]; x < fin; x++)
+			{
+				pos = x * cols;
+				for(y = 0; y < numselect; y += 2)
+				{
+					if(dop1[pos+select[y]] != select[y+1])
+						break;
+				}
+				for(z = 0; z < numselfj; z++)
+				{
+					temp = dop1[pos+selfjoin[z]];
+					w = z + 1;
+					while(selfjoin[w] > -1)
+					{
+						if(temp != dop1[pos+selfjoin[w]])
+							break;
+						w++;
+					}
+					z = w;
+					if(selfjoin[w] != -1)
+						break;
+				}
+				if(y == numselect && z == numselfj)
+				{
+					lis[i].push_back(x);
+					if(list != NULL)
+						vec[i].push_back(dop1[pos+wj]);
+				}
+			}
+		}
+	}
+	else
+	{
+		if(numselfj > 0)
+		{
+			#pragma omp parallel for private(x,pos,y,z,fin,w,temp)
+			for(i = 0; i < NUM_T; i++)
+			{
+				fin = ini[i+1];
+				for(x = ini[i]; x < fin; x++)
+				{
+					pos = x * cols;
+					for(z = 0; z < numselfj; z++)
+					{
+						temp = dop1[pos+selfjoin[z]];
+						w = z + 1;
+						while(selfjoin[w] > -1)
+						{
+							if(temp != dop1[pos+selfjoin[w]])
+								break;
+							w++;
+						}
+						z = w;
+						if(selfjoin[w] != -1)
+							break;
+					}
+					if(z == numselfj)
+					{
+						lis[i].push_back(x);
+						if(list != NULL)
+							vec[i].push_back(dop1[pos+wj]);
+					}
+				}
+			}
+		}
+	}
+
+	//cout << "despues sel" << endl;
+
+	for(x = 0; x < NUM_T; x++)
+	{
+		ini[x] = lis[x].size();
+		size += ini[x];
+	}
+	lres = (int *)malloc(size * sizeof(int));
+	ptr = lres;
+	for(x = 0; x < NUM_T; x++)
+	{
+		memcpy(ptr, lis[x].data(), ini[x] * sizeof(int));
+		ptr += ini[x];
+	}
+	if(list != NULL)
+	{
+		fres = (int *)malloc(size * sizeof(int));
+		ptr = fres;
+		for(x = 0; x < NUM_T; x++)
+		{
+			memcpy(ptr, vec[x].data(), ini[x] * sizeof(int));
+			ptr += ini[x];
+		}
+		*ret = fres;	
+		*list = lres;
+	}
+	else
+		*ret = lres;
+	return size;
+}
--- a/packages/cuda/union2.h
+++ b/packages/cuda/union2.h
--- a/packages/cuda/unioncpu2.cpp
+++ b/packages/cuda/unioncpu2.cpp
@ -0,0 +1,68 @@
+#include <thrust/unique.h>
+#include <thrust/distance.h>
+#include <thrust/system/omp/execution_policy.h>
+#include <iostream>
+#include "union2.h"
+
+int unircpu(int *res, int rows, int tipo, int **ret)
+{
+
+	//cout << "En union = " << rows << " " << tipo << endl;
+
+	s2 *t2, *re2;
+	s3 *t3, *re3;
+	int nrows, *nres;
+	//int size;
+
+	switch(tipo)
+	{
+		case 1: 
+		{
+			thrust::sort(thrust::omp::par, res, res + rows);
+			nres = thrust::unique(thrust::omp::par, res, res + rows);
+			nrows = thrust::distance(res, nres);
+			/*if(nrows < rows / 2)
+			{
+				size = nrows * tipo * sizeof(int);
+				nres = (int *)malloc(size);
+				memcpy(nres, res, size);
+				free(*ret);
+				*ret = nres;
+			}*/
+			return nrows;	
+		}			
+		case 2: 
+		{
+			t2 = (s2*)res;
+			thrust::sort(thrust::omp::par, t2, t2 + rows, o2());
+			re2 = thrust::unique(thrust::omp::par, t2, t2 + rows, p2());
+			nrows = thrust::distance(t2, re2);
+			/*if(nrows < rows / 2)
+			{
+				size = nrows * tipo * sizeof(int);
+				nres = (int *)malloc(size);
+				memcpy(nres, res, size);
+				free(*ret);
+				*ret = nres;
+			}*/
+			return nrows;
+		}
+		case 3: 
+		{
+			t3 = (s3*)res;
+			thrust::sort(thrust::omp::par, t3, t3 + rows, o3());
+			re3 = thrust::unique(thrust::omp::par, t3, t3 + rows, p3());
+			nrows = thrust::distance(t3, re3);
+			/*if(nrows < rows / 2)
+			{
+				size = nrows * tipo * sizeof(int);
+				nres = (int *)malloc(size);
+				memcpy(nres, res, size);
+				free(*ret);
+				*ret = nres;
+			}*/
+			return nrows;
+		}
+	}
+	return 0;
+}