#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/scan.h>
#include <iostream>
#include <cstdio>
#include "lista.h"
#include "bpreds.h"

#define WARP_SIZE		32
#define TREE_NODE_SIZE		WARP_SIZE
#define TREE_FANOUT		(TREE_NODE_SIZE + 1)

#define N_MULTI_P 		16
#define BLCK_PER_MP_create	256	// blocks per multiprocessor during tree creation
#define BLCK_PER_MP_search	512	// blocks per multiprocessor during tree searching
#define WAPRS_PER_BLCK_join	8//16	// blocks per multiprocessor during tree creation
#define BLCK_PER_MP_join	512//256	// blocks per multiprocessor during tree searching

#define THRD_PER_BLCK_create	TREE_NODE_SIZE
#define BLCK_PER_GRID_create	(N_MULTI_P * BLCK_PER_MP_create)
#define THRD_PER_BLCK_search	TREE_NODE_SIZE
#define BLCK_PER_GRID_search	(N_MULTI_P * BLCK_PER_MP_search)
#define THRD_PER_GRID_search	(THRD_PER_BLCK_search * BLCK_PER_GRID_search)
#define THRD_PER_BLCK_join	(WARP_SIZE * WAPRS_PER_BLCK_join)
#define BLCK_PER_GRID_join	(N_MULTI_P * BLCK_PER_MP_join)
#define THRD_PER_GRID_join	(THRD_PER_BLCK_join * BLCK_PER_GRID_join)

#define TEST_MAX		100

typedef int IKeyType;
typedef int Record;

typedef struct {
	int keys[TREE_NODE_SIZE];
} IDirectoryNode;

typedef struct {
	Record records[TREE_NODE_SIZE];
} IDataNode;

typedef struct {
	IDataNode* data;
	unsigned int nDataNodes;
	IDirectoryNode* dir;
	unsigned int nDirNodes;
} CUDA_CSSTree;

struct to_neg
{
	__host__ __device__
	bool operator()(const int &r1)
	{
		if(r1 < 0)
			return 1;
		return 0;
	}
};

__host__ __device__ unsigned int uintCeilingLog(unsigned int base, unsigned int num)
{
	unsigned int result = 0;

	for(unsigned int temp = 1; temp < num; temp *= base)
		result++;

	return result;
}

__host__ __device__ unsigned int uintCeilingDiv(unsigned int dividend, unsigned int divisor)
{
	return (dividend + divisor - 1) / divisor;
}

__host__ __device__ unsigned int uintPower(unsigned int base, unsigned int pow)
{
	unsigned int result = 1;

	for(; pow; pow--)
		result *= base;

	return result;
}

__device__ int getRightMostDescIdx(int tree_size, int nodeIdx)
{
	int tmp = nodeIdx * TREE_NODE_SIZE + TREE_FANOUT;
	int n = uintCeilingLog(TREE_FANOUT, uintCeilingDiv(TREE_NODE_SIZE * tree_size + TREE_FANOUT, tmp)) - 1;

	int result = (tmp * uintPower(TREE_FANOUT, n) - TREE_FANOUT) / TREE_NODE_SIZE;
    	return result; 
}

__device__ int getDataArrayIdx(int dirSize, int tree_size, int bottom_start, int treeIdx)
{
	int idx;
	if(treeIdx < dirSize) {
		idx = tree_size - bottom_start - 1;
	}
	else if( treeIdx < bottom_start ) {
		idx = tree_size - bottom_start + treeIdx - dirSize;
	}
	else {
		idx = treeIdx - bottom_start;
	}
	return idx;
}

// Binary Search
__device__ int firstMatchingKeyInDirNode1(int keys[], int key)
{
	int min = 0;
	int max = TREE_NODE_SIZE;
	int mid;
	int cut;
	while(max - min > 1) {
		mid = (min + max) / 2;
		cut = keys[mid];

		if(key > cut)
			min = mid;
		else
			max = mid;
	}

	if(keys[min] >= key)
		return min;

	return max;

}

// Binary Search
__device__ int firstMatchingKeyInDataNode2(Record records[], IKeyType key)
{
	int min = 0;
	int max = TREE_NODE_SIZE;
	int mid;
	int cut;
	while(max - min > 1) {
		mid = (min + max) / 2;
		cut = records[mid];

		if(key > cut)
			min = mid;
		else
			max = mid;
	}

	if(records[min] == key)
		return min;

	if(max < TREE_NODE_SIZE && records[max] == key)
		return max;

	return -1;
}

__global__ void gCreateIndex(IDataNode data[], IDirectoryNode dir[], int dirSize, int tree_size, int bottom_start, int nNodesPerBlock)
{
        int startIdx = blockIdx.x * nNodesPerBlock;
        int endIdx = startIdx + nNodesPerBlock;
        if(endIdx > dirSize)
                endIdx = dirSize;
        int keyIdx = threadIdx.x;

        // Proceed only when in internal nodes
        for(int nodeIdx = startIdx; nodeIdx < endIdx; nodeIdx++)
        {
                int childIdx = nodeIdx * TREE_FANOUT + keyIdx + 1;        // One step down to the left
                // Then look for the right most descendent
                int rightMostDesIdx;
                // Common cases
                if(childIdx < tree_size) {
                        rightMostDesIdx = getRightMostDescIdx(tree_size, childIdx);
                }
                // versus the unusual case when the tree is incomplete and the node does not have the full set of children
                else {
                        // pick the last node in the tree (largest element of the array)
                        rightMostDesIdx = tree_size - 1;
                }

                int dataArrayIdx = getDataArrayIdx(dirSize, tree_size, bottom_start, rightMostDesIdx);
	        dir[nodeIdx].keys[keyIdx] = data[dataArrayIdx].records[TREE_NODE_SIZE - 1];
        }
}

__global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir, int nDirNodes, int lvlDir, Record* arr, int locations[], int nSearchKeys, int nKeysPerThread, int tree_size, int bottom_start)
{
	// Bringing the root node (visited by every tuple) to the faster shared memory
	__shared__ IKeyType RootNodeKeys[TREE_NODE_SIZE];
	RootNodeKeys[threadIdx.x] = dir->keys[threadIdx.x];

	__syncthreads();

	int OverallThreadIdx = blockIdx.x * THRD_PER_BLCK_search + threadIdx.x;

	for(int keyIdx = OverallThreadIdx; keyIdx < nSearchKeys; keyIdx += THRD_PER_GRID_search)
	{
		IKeyType val = arr[keyIdx];
		int loc = firstMatchingKeyInDirNode1(RootNodeKeys, val) + 1;
		for(int i = 1; i < lvlDir && loc < nDirNodes; i++) {
			int kid = firstMatchingKeyInDirNode1(dir[loc].keys, val);
			loc = loc * TREE_FANOUT + kid + 1;
		}

		if(loc >= tree_size)
			loc = nDataNodes - 1;
		else
			loc = getDataArrayIdx(nDirNodes, tree_size, bottom_start, loc);

		int offset = firstMatchingKeyInDataNode2(data[loc].records, val);
		locations[keyIdx] = (offset <0)?-1:(loc * TREE_NODE_SIZE + offset);
	}
}

/*Counts the number of times a row in 'S' is to be joined to a row in 'R'.*/
__global__ void gIndexJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[])
{
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;

	if(s_cur < sLen) 
	{
		int count = 1;
		int r_cur = g_locations[s_cur];
		int s_key;
		if(r_cur >= 0)
		{
			s_key = S[s_cur];
			r_cur++;
			while(s_key == R[r_cur]) 
			{
				count++;
				r_cur++;
			}
			g_ResNums[s_cur] = count;
		}
	}
}

/*Corrects 'gSearchTree' results when dealing with a negative multijoin. Uses the values found in 'g_locations' which indicate, for each row in 'R', if its going
to be joined (positive number) or not (-1). Works by checking the additional columns to be joined (i.e. all except the two used by 'gSearchTree') and changing to -1 
in 'g_locations' those rows that have equal values in the checked columns.*/
__global__ void gIndexMultiJoinNegative(int *R, int *S, int g_locations[], int rLen, int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
{
	extern __shared__ int shared[];
	int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
	int posr, poss, x;

	if(threadIdx.x < wj)
		shared[threadIdx.x] = muljoin[threadIdx.x];
	__syncthreads();

	if(r_cur < rLen) 
	{
		int s_cur = g_locations[r_cur];
		int r_key;
		if(s_cur >= 0)
		{
			r_key = R[r_cur];
			if(mloc == NULL)
				posr = r_cur * of1;
			else
				posr = mloc[r_cur] * of1;
			while(r_key == S[s_cur])
			{
				poss = sloc[s_cur] * of2;
				for(x = 0; x < wj; x += 2)
				{
					if(p1[posr + shared[x]] != p2[poss + shared[x+1]])
						break;
				}
				if(x >= wj)
					return;
				s_cur++;
			}
			g_locations[r_cur] = -1;
		}
	}
}

/*Corrects 'gSearchTree' results when dealing with a multijoin. Uses the values found in 'g_locations' which indicate, for each row in 'S', if its going
to be joined (positive number) or not (-1). Works by checking the additional columns to be joined (i.e. all except the two used by 'gSearchTree') and counting the number of 
times a row in 'S' is to be joined to its corresponding row in 'R', storing the new result in 'g_locations'.*/
__global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[], int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
{
	extern __shared__ int shared[];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
	int posr, poss, x;

	if(threadIdx.x < wj)
		shared[threadIdx.x] = muljoin[threadIdx.x];
	__syncthreads();

	if(s_cur < sLen) 
	{
		int count = 0;
		int r_cur = g_locations[s_cur];
		int s_key;
		if(r_cur >= 0)
		{
			s_key = S[s_cur];
			if(sloc == NULL)
				poss = s_cur * of2;
			else
				poss = sloc[s_cur] * of2;
			while(s_key == R[r_cur]) 
			{
				posr = mloc[r_cur] * of1;
				for(x = 0; x < wj; x += 2)
				{
					if(p1[posr + shared[x]] != p2[poss + shared[x+1]])
						break;
				}
				if(x >= wj)
					count++;
				r_cur++;
			}
			if(count > 0)
				g_ResNums[s_cur] = count;
		}
	}
}

/*Writes the result of the join and projects the necessary columns as defined by 'rule'. The difference between this function and 'gJoinWithWrite' is the comparison of the additional join
columns.*/
__global__ void multiJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc, int wj)
{
	extern __shared__ int shared[];
	int *extjoins = &shared[lenrul];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;

	if(threadIdx.x < (lenrul + wj))
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();

	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, posr, poss;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			
			int tmp1, tmp2;

			if(sloc == NULL)	
				poss = s_cur * of2;
			else
				poss = sloc[s_cur] * of2;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1;
				for(y = 0; y < wj; y += 2) /*Additional comparison*/
				{
					tmp1 = p1[pos + extjoins[y]];
					tmp2 = p2[poss + extjoins[y+1]];
					if(tmp1 != tmp2)
						break;
				}
				if(y < wj)
				{
					x--;
					continue;
				}
				posr = x * lenrul;
				for(y = 0; y < halfrul; y++)
					g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
				for(; y < lenrul; y++)
					g_joinResultBuffers[posr + y] = p2[poss + shared[y]];
			}
		}
	}
}

/*Writes the result of the join and projects the necessary columns as defined by 'rule'. The difference between this function and 'gJoinWithWrite2' is the comparison of the additional join
columns.*/
__global__ void multiJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc, int wj)
{
	extern __shared__ int shared[];
	int *extjoins = &shared[cols];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;

	if(threadIdx.x < (cols + wj))
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();

	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, pos2, posr, cond;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			if(sloc == NULL)
				pos2 = s_cur * of2 - 1;
			else
				pos2 = sloc[s_cur] * of2 - 1;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1 - 1;
				for(y = 0; y < wj; y += 2) /*Additional comparison*/
				{
					if(p1[pos + extjoins[y] + 1] != p2[pos2 + extjoins[y+1] + 1])
						break;
				}
				if(y < wj)
				{
					x--;
					continue;
				}
				posr = x * cols;
				for(y = 0; y < cols; y++)
				{
					cond = shared[y];
					if(cond > 0)
						g_joinResultBuffers[posr + y] = p1[pos + cond];
					else
						g_joinResultBuffers[posr + y] = p2[pos2 - cond];
				}
			}
		}
	}
}

/*Writes the result of the join and projects the necessary columns as defined by 'rule'. The difference between this function and 'gJoinWithWrite2' is that only the columns in the positve
predicate are projected.*/
__global__ void gJoinWithWriteNegative(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int halfrul, int *mloc)
{
	extern __shared__ int shared[];
	int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
	int posr;

	if(threadIdx.x < halfrul)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();

	if(r_cur < rLen)
	{
		posr = g_locations[r_cur];
		if(g_locations[r_cur+1] != posr)
		{
			int y, pos;
			if(mloc == NULL)	
				pos = r_cur * of1;
			else
				pos = mloc[r_cur] * of1;
			posr *= halfrul;
			for(y = 0; y < halfrul; y++)
				g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
		}
	}
}

/*Writes the result of the join and projects the necessary columns as defined by 'rule'. The difference between this function and 'gJoinWithWrite' is that only the columns in the positve
predicate are projected.*/
__global__ void gJoinWithWriteNegative2(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int cols, int *mloc)
{
	extern __shared__ int shared[];
	int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
	int posr;

	if(threadIdx.x < cols)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();

	if(r_cur < rLen)
	{
		posr = g_locations[r_cur];
		if(g_locations[r_cur+1] != posr)
		{
			int y, pos;
			if(mloc == NULL)
				pos = r_cur * of1 - 1;
			else
				pos = mloc[r_cur] * of1 - 1;
			posr *= cols;
			for(y = 0; y < cols; y++)
				g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
		}
	}
}

/*Writes the result of the join and projects the necessary columns as defined by 'rule'.*/
__global__ void gJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc)
{
	extern __shared__ int shared[];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;

	if(threadIdx.x < lenrul)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();

	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, posr, poss;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			if(sloc == NULL)	
				poss = s_cur * of2;
			else
				poss = sloc[s_cur] * of2;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1;
				posr = x * lenrul;
				for(y = 0; y < halfrul; y++)
					g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
				for(; y < lenrul; y++)
					g_joinResultBuffers[posr + y] = p2[poss + shared[y]];
			}
		}
	}
}

/*Writes the result of the join and projects the necessary columns as defined by 'rule'. This version is used when performing the final join of the rule and its only difference is the 
projection, which is performed based on the variables in the head of the rule.*/
__global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc)
{
	extern __shared__ int shared[];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;

	if(threadIdx.x < cols)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();

	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, pos2, posr, cond;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			if(sloc == NULL)
				pos2 = s_cur * of2 - 1;
			else
				pos2 = sloc[s_cur] * of2 - 1;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1 - 1;
				posr = x * cols;
				for(y = 0; y < cols; y++)
				{
					cond = shared[y];
					if(cond > 0)
						g_joinResultBuffers[posr + y] = p1[pos + cond];
					else
						g_joinResultBuffers[posr + y] = p2[pos2 - cond];
				}
			}
		}
	}
}

/*Load part of column 'wj' of 'p' in 'R'. Which values are loaded is defined by the prefix sum results in 'pos'.*/
__global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *ids)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int cond;
	if(id < len)
	{
		cond = pos[id];
		if(pos[id+1] != cond)
		{
			R[cond] = p[id * of + wj];
			ids[cond] = id;
		}
	}
}

/*Load an entire column from 'p' into 'R'.*/
__global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	if(id < len)
		R[id] = p[id * of + wj];
}

__global__ void projectfinal(int *res, int rows, int cols, int *rule, int *out)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;

	if(threadIdx.x < cols)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();
	
	if(id < rows)
	{
		id *= cols;
		for(int y = 0; y < cols; y++)
			out[id + y] = res[id + shared[y]];
	}
}

void project(int *res, int resrows, int numcols1, int numcols2, int *proj, int **ret, int type)
{
	int z, *dcons, *d_Rout;
	int numthreads = 1024;
	//numthreads = 32;
	int blockllen = resrows / numthreads + 1;
	int sizepro = numcols2 * sizeof(int);
	reservar(&dcons, sizepro);
	if(type)
	{
		int *pt = (int *)malloc(sizepro);
		for(z = 0; z < numcols2; z++)
			pt[z] = proj[z] - 1;
		cudaMemcpy(dcons, pt, sizepro, cudaMemcpyHostToDevice); 
		//cudaDeviceSynchronize(); //Small cudaMemcpys are asynchronous, uncomment this line if the pointer is being liberated before it is copied.
		free(pt);
	}
	else
		cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
	reservar(&d_Rout, resrows * sizepro);
	projectfinal<<<blockllen, numthreads, sizepro>>>(res, resrows, numcols1, dcons, d_Rout);
	cudaFree(dcons);
	cudaFree(*ret);
	*ret = d_Rout;
}

__global__ void projectadd(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, int *dhead, int hsize, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int pos2, posr, x, y, cond;
	if(threadIdx.x < hsize)
		shared[threadIdx.x] = dhead[threadIdx.x];
	__syncthreads();
	if(id < rows2)
	{
		posr = id * hsize * rows1;
		pos2 = id * cols2 - 1;
		for(x = 0; x < rows1; x++)
		{
			for(y = 0; y < hsize; y++)
			{
				cond = shared[y];
				if(cond > 0)
					res[posr + y] = dop1[cond-1];
				else
					res[posr + y] = dop2[pos2 - cond];
			}
			posr += hsize;
		}
	}
}

void juntar(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, int *proj, int pcols, int **ret)
{
	int sizepro, *dcons, *d_Rout;
	int numthreads = 1024;
	//numthreads = 32;
	int blockllen = rows2 / numthreads + 1;
	sizepro = pcols * sizeof(int);
	reservar(&dcons, sizepro);
	cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
	reservar(&d_Rout, rows1 * rows2 * sizepro);
	projectadd<<<blockllen, numthreads, sizepro>>>(dop1, dop2, rows1, rows2, cols1, cols2, dcons, pcols, d_Rout);
	cudaFree(dcons);
	*ret = d_Rout;
}

/*Joins two predicates. Starts by performing all preliminary operations (selections, selfjoins, comparisons) on both predicates. Then a column pair is used to construct 
a CSS-Tree and that tree is searched for join positions. The positions are used in a prefix sum and its result allows us to write the result. Multijoins and negative 
predicates follow roughly the same process, but use different kernels.*/
int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>::iterator rule, int pos, int bothops, int **ret, int ANDlogic)
{
	int pos2 = pos + 1;
	int *sel1 = NULL, nsel1 = 0;
	int *sel2 = rule->select[pos2];
	int nsel2 = rule->numsel[pos2];
	int *proj = rule->project[pos];
	int2 projp = rule->projpos[pos];
	int *sjoin1 = NULL, nsj1 = 0;
	int *sjoin2 = rule->selfjoin[pos2];
	int nsj2 = rule->numselfj[pos2];
	int *pred1 = NULL;
	int2 npred1 = make_int2(0,0);
	int *pred2 = rule->preds[pos2];
	int2 npred2 = rule->numpreds[pos2];
	int npred2tot = npred2.x + npred2.y;
	int *wherej = rule->wherejoin[pos];
	int numj = rule->numjoin[pos];
	int negative = rule->negatives[pos2+1];
	int flag;

	#ifdef ROCKIT
		ANDlogic = 0;
	#endif
	if(negative)
		ANDlogic = 1;

#if TIMER
	cuda_stats.joins++;
#endif

	int size, sizet, sizet2;
	if(bothops)
	{
		sel1 = rule->select[pos];
		nsel1 = rule->numsel[pos];
		sjoin1 = rule->selfjoin[pos];
		nsj1 = rule->numselfj[pos];
		pred1 = rule->preds[pos];
		npred1 = rule->numpreds[pos];
		sizet = maximo(10, of1, of2, nsel1, nsel2, projp.y + numj - 2, nsj1, nsj2, numj, npred1.x, npred2tot) * sizeof(int);
	}
	else
		sizet = maximo(7, of1, of2, nsel2, projp.y + numj - 2, nsj2, numj, npred2tot) * sizeof(int);
	
	int *dcons, *temp, *temp2 = NULL;
	int *d_R, *d_S;
	int blockllen, numthreads;
	
	int extraspace = TREE_NODE_SIZE - rLen % TREE_NODE_SIZE;
	int m32rLen = rLen + extraspace;
	int extraspaceS = TREE_NODE_SIZE - sLen % TREE_NODE_SIZE;
	int m32sLen = sLen + extraspaceS;
	if(m32rLen > m32sLen)
		sizet2 = (m32rLen + 1) * sizeof(int);
	else
		sizet2 = (m32sLen + 1) * sizeof(int);

	reservar(&dcons, sizet);
	reservar(&temp, sizet2);
	thrust::device_ptr<int> res = thrust::device_pointer_cast(temp);

	numthreads = 1024;
	//numthreads = 32;
	blockllen = sLen / numthreads + 1;
	int memSizeS, newLen = 0;
	int *posR = NULL, *posS = NULL;
	int sizem32S = 0, sizextra;

	#ifdef TIMER
	//cout << "INICIO" << endl;
	cudaEvent_t start, stop;
	float time;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	#endif

	if(npred2.x > 0 || npred2.y > 0 || nsel2 > 0 || nsj2 > 0)
	{
		newLen = sLen + 1;
		cudaMemsetAsync(temp, 0, newLen * sizeof(int));
	}

	if(npred2.x > 0 || npred2.y > 0)
	{
		size = npred2tot * sizeof(int);
		cudaMemcpy(dcons, pred2, size, cudaMemcpyHostToDevice);

		if(npred2.y > 0) /*Fix case when a(X,Y),b(Y,Z),Z > Y*/
		{
			reservar(&temp2, sizet2);
			cudaMemsetAsync(temp2, 0, newLen * sizeof(int));
			//res = thrust::device_pointer_cast(temp2);
			bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, temp2 + 1);
		}
		else
		{
			if(negative)
				bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
			else
				bpredsOR<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
		}

		if(nsel2 > 0)
		{
			size = nsel2 * sizeof(int);
			cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
			marcar<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
		}

		if(nsj2 > 0)
		{
			size = nsj2 * sizeof(int);
			cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
			samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
		}
	}
	else
	{
		if(nsel2 > 0)
		{
			size = nsel2 * sizeof(int);
			cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
			marcar2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);

			if(nsj2 > 0)
			{
				size = nsj2 * sizeof(int);
				cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
				samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
			}
		}
		else
		{
			if(nsj2 > 0)
			{
				size = nsj2 * sizeof(int);
				cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
				samejoin2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);	
			}
			else
			{
				sizem32S = m32sLen * sizeof(int);
				reservar(&d_S, sizem32S);
				cudaMemsetAsync(d_S + sLen, 0x7f, extraspaceS * sizeof(int));
				llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]);
			}
		}
	}
	
	if(npred2.x > 0 || npred2.y > 0 || nsel2 > 0 || nsj2 > 0)
	{
		flag = 0;
		while(flag != 1)
		{
			try
			{
				thrust::inclusive_scan(res + 1, res + newLen, res + 1);		
				flag = 1;
			}
			catch(std::bad_alloc &e)
			{
				limpiar("inclusive scan in join", 0);
			}				
		}	
		newLen = res[sLen];
	
		if(newLen == 0) // && !negative) ARREGLAR
		{
			cudaFree(temp);
			cudaFree(dcons);
			return 0;
		}

		extraspaceS = TREE_NODE_SIZE - newLen % TREE_NODE_SIZE;
		sizextra = extraspaceS * sizeof(int);
		m32sLen = newLen + extraspaceS;
		sizem32S = m32sLen * sizeof(int);

		reservar(&d_S, sizem32S);
		reservar(&posS, sizem32S);
		cudaMemsetAsync(d_S + newLen, 0x7f, sizextra);
		cudaMemsetAsync(posS + newLen, 0x7f, sizextra);
		llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
		sLen = newLen;
	}

	#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Select1 = " << time << endl;
	cuda_stats.select1_time += time;

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	#endif

	blockllen = rLen / numthreads + 1;
	int sizem32;
	if(bothops)
	{
		if(temp2 != NULL)
		{
			cudaFree(temp);
			temp = temp2;
			res = thrust::device_pointer_cast(temp);
			newLen = rLen + 1;
			if(nsel1 > 0)
			{
				size = nsel1 * sizeof(int);
				cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
				marcar<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
			}
			if(nsj1 > 0)
			{
				size = nsj1 * sizeof(int);
				cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
				samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
			}
			if(npred1.x > 0)
			{
				size = npred1.x * sizeof(int);
				cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
				if(ANDlogic)
					bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
				else
					bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
			}
		}
		else
		{
			if(npred1.x > 0 || nsel1 > 0 || nsj1 > 0)
			{
				newLen = rLen + 1;
				cudaMemsetAsync(temp, 0, newLen * sizeof(int));
			}

			if(nsel1 > 0)
			{
				size = nsel1 * sizeof(int);
				cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
				marcar2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);

				if(nsj1 > 0)
				{
					size = nsj1 * sizeof(int);
					cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
					samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
				}

				if(npred1.x > 0)
				{
					size = npred1.x * sizeof(int);
					cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
					if(ANDlogic)
						bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
					else
						bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);		
				}
			}
			else
			{
				if(nsj1 > 0)
				{
					size = nsj1 * sizeof(int);
					cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
					samejoin2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);

					if(npred1.x > 0)
					{
						size = npred1.x * sizeof(int);
						cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
						if(ANDlogic)
							bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
						else
							bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
					}
				}
				else
				{
					if(npred1.x > 0)
					{
						size = npred1.x * sizeof(int);
						cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
						if(ANDlogic)
							bpredsnormal2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
						else
							bpredsorlogic2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);	
					}
				}
			}
		}

		if(temp2 != NULL || npred1.x > 0 || nsel1 > 0 || nsj1 > 0)
		{
			thrust::inclusive_scan(res + 1, res + newLen, res + 1);
			newLen = res[rLen];
			if(newLen == 0)
			{
				cudaFree(temp);
				cudaFree(dcons);
				cudaFree(d_S);
				if(posS != NULL)
					cudaFree(posS);
				return 0;
			}	

			extraspace = TREE_NODE_SIZE - newLen % TREE_NODE_SIZE;
			sizextra = extraspace * sizeof(int);
			m32rLen = newLen + extraspace;
			sizem32 = m32rLen * sizeof(int);

			reservar(&d_R, sizem32);
			reservar(&posR, sizem32);
			cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
			cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
			llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
			rLen = newLen;
		}
		else
		{
			sizem32 = m32rLen * sizeof(int);
			reservar(&d_R, sizem32);
			cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
			llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
		}
	}
	else
	{
		sizem32 = m32rLen * sizeof(int);
		reservar(&d_R, sizem32);
		cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
		llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
	}

	#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Select2 = " << time << endl;
	cuda_stats.select2_time += time;
	#endif

	#ifdef TIMER
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	#endif

	thrust::device_ptr<Record> dvp1;
	thrust::device_ptr<Record> permutation;
	if(negative)
	{
		dvp1 = thrust::device_pointer_cast(d_S);
		if(posS == NULL)
		{
			reservar(&posS, sizem32S);
			permutation = thrust::device_pointer_cast(posS);
			thrust::sequence(permutation, permutation + m32sLen);
		}
		else
			permutation = thrust::device_pointer_cast(posS);

		flag = 0;
		while(flag != 1)
		{
			try
			{
				thrust::stable_sort_by_key(dvp1, dvp1 + m32sLen, permutation);
				flag = 1;
			}
			catch(std::bad_alloc &e)
			{
				limpiar("inclusive scan in join", 0);
			}
		}
	}
	else
	{
		dvp1 = thrust::device_pointer_cast(d_R);
		if(posR == NULL)
		{
			reservar(&posR, sizem32);
			permutation = thrust::device_pointer_cast(posR);
			thrust::sequence(permutation, permutation + m32rLen);
		}
		else
			permutation = thrust::device_pointer_cast(posR);

		flag = 0;
		while(flag != 1)
		{
			try
			{
				thrust::stable_sort_by_key(dvp1, dvp1 + m32rLen, permutation);
				flag = 1;
			}
			catch(std::bad_alloc &e)
			{
				limpiar("inclusive scan in join", 0);
			}
		}
	}

	#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Sort = " << time << endl;
	cuda_stats.sort_time += time;
	
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	#endif

	IDataNode* d_data;
	IDirectoryNode* d_dir;
	unsigned int nDataNodes;

	if(negative)
	{
		nDataNodes = uintCeilingDiv(sLen, TREE_NODE_SIZE);
		d_data=(IDataNode *)d_S;
	}
	else
	{
		nDataNodes = uintCeilingDiv(rLen, TREE_NODE_SIZE);
		d_data=(IDataNode *)d_R;
	}

	unsigned int lvlDir = uintCeilingLog(TREE_FANOUT, nDataNodes);
	unsigned int nDirNodes = uintCeilingDiv(nDataNodes - 1, TREE_NODE_SIZE);
	unsigned int tree_size = nDirNodes + nDataNodes;
	unsigned int bottom_start = (uintPower(TREE_FANOUT, lvlDir) - 1) / TREE_NODE_SIZE;
	d_dir = (IDirectoryNode *)temp;

	unsigned int nNodesPerBlock = uintCeilingDiv(nDirNodes, BLCK_PER_GRID_create);

	dim3 Dbc(THRD_PER_BLCK_create, 1, 1);
	dim3 Dgc(BLCK_PER_GRID_create, 1, 1);

	gCreateIndex <<<Dgc, Dbc>>> (d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);

	int *d_locations;
	int memSizeR;
	unsigned int nSearchKeys;
	if(negative)
	{
		memSizeR = (rLen + 1) * sizeof(int);
		reservar(&d_locations, memSizeR);
		cudaMemsetAsync(d_locations, 0, sizeof(int));
		nSearchKeys = rLen;
	}
	else
	{
		memSizeS = sLen * sizeof(int);
		reservar(&d_locations, memSizeS);
		nSearchKeys = sLen;
	}
	dim3 Dbs(THRD_PER_BLCK_search, 1, 1);
	dim3 Dgs(BLCK_PER_GRID_search, 1, 1);
	unsigned int nKeysPerThread = uintCeilingDiv(nSearchKeys, THRD_PER_GRID_search);
	if(negative)
	{
		gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_R, d_locations + 1, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
		cudaMemsetAsync(temp, 0, memSizeR);
	}
	else
	{
		gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
		cudaMemsetAsync(temp, 0, memSizeS);
	}

	int muljoin = 0, muljoinsize = 0, sum;
	int *d_Rout;
	int resSize, sizepro;
	if(negative)
	{
		blockllen = rLen / numthreads + 1;
		if(numj > 2)
		{
			muljoin = numj - 2;
			muljoinsize = muljoin * sizeof(int);
			cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
			gIndexMultiJoinNegative<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations + 1, rLen, p1, p2, of1, of2, posR, posS, dcons, muljoin);
		}

		res = thrust::device_pointer_cast(d_locations);	
		thrust::transform(res + 1, res + rLen + 1, res + 1, to_neg());
		thrust::inclusive_scan(res + 1, res + rLen + 1, res + 1);
		sum = res[rLen];

		if(pos == (rule->num_rows - 3))
		{
			sizepro = rule->num_columns * sizeof(int);
			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
			resSize = sum * sizepro;
			reservar(&d_Rout, resSize);
			gJoinWithWriteNegative2<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, rule->num_columns, posR);
		}
		else
		{	
			sizepro = projp.x * sizeof(int);
			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
			resSize = sum * sizepro;
			reservar(&d_Rout, resSize);
			gJoinWithWriteNegative<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, projp.x, posR);
		}
		cudaFree(d_R);
		cudaFree(d_S);
	}
	else
	{
		blockllen = sLen / numthreads + 1;
		if(numj > 2)
		{
			muljoin = numj - 2;
			muljoinsize = muljoin * sizeof(int);
			cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
			gIndexMultiJoin<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
		}
		else
			gIndexJoin<<<blockllen, numthreads>>> (d_R, d_S, d_locations, sLen, temp);
		cudaFree(d_R);
		cudaFree(d_S);

		sum = res[sLen-1];
		thrust::exclusive_scan(res, res + sLen, res);
		sum += res[sLen-1];
		if(sum == 0)
		{
			cudaFree(dcons);
			cudaFree(d_locations);
			cudaFree(temp);
			if(posS != NULL)
				cudaFree(posS);
			if(posR != NULL)
				cudaFree(posR);
			return 0;
		}
		res[sLen] = sum;

		if(pos == (rule->num_rows - 3))
		{
			sizepro = rule->num_columns * sizeof(int);
			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
			resSize = sum * sizepro;
			reservar(&d_Rout, resSize);
			if(numj > 2)
			{
				cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
				multiJoinWithWrite2<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
			}
			else
				gJoinWithWrite2<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
		}
		else
		{
			sizepro = projp.y * sizeof(int);
			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
			resSize = sum * sizepro;
			reservar(&d_Rout, resSize);
			if(numj > 2)
			{
				cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
				multiJoinWithWrite<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
			}
			else
				gJoinWithWrite<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
		}
	}

	cudaFree(dcons);
	cudaFree(d_locations);
	cudaFree(temp);
	if(posS != NULL)
		cudaFree(posS);
	if(posR != NULL)
		cudaFree(posR);
	
	if(*ret != NULL)
		cudaFree(*ret);
	*ret = d_Rout;

	#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Join = " << time << endl;
	//cout << "FIN" << endl;
	cuda_stats.join_time += time;
	#endif

	return sum;
}