#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/scan.h>
#include <iostream>
#include <cstdarg>
#include <cstdio>
#include "lista.h"
//#include "scanImpl.cu"

#define WARP_SIZE		32
#define TREE_NODE_SIZE		WARP_SIZE
#define TREE_FANOUT		(TREE_NODE_SIZE + 1)

#define N_MULTI_P 		16
#define BLCK_PER_MP_create	256	// blocks per multiprocessor during tree creation
#define BLCK_PER_MP_search	512	// blocks per multiprocessor during tree searching
#define WAPRS_PER_BLCK_join	8//16	// blocks per multiprocessor during tree creation
#define BLCK_PER_MP_join	512//256	// blocks per multiprocessor during tree searching

#define THRD_PER_BLCK_create	TREE_NODE_SIZE
#define BLCK_PER_GRID_create	(N_MULTI_P * BLCK_PER_MP_create)
#define THRD_PER_BLCK_search	TREE_NODE_SIZE
#define BLCK_PER_GRID_search	(N_MULTI_P * BLCK_PER_MP_search)
#define THRD_PER_GRID_search	(THRD_PER_BLCK_search * BLCK_PER_GRID_search)
#define THRD_PER_BLCK_join	(WARP_SIZE * WAPRS_PER_BLCK_join)
#define BLCK_PER_GRID_join	(N_MULTI_P * BLCK_PER_MP_join)
#define THRD_PER_GRID_join	(THRD_PER_BLCK_join * BLCK_PER_GRID_join)

#define TEST_MAX		100

typedef int IKeyType;
typedef int Record;

typedef struct {
	int keys[TREE_NODE_SIZE];
} IDirectoryNode;

typedef struct {
	Record records[TREE_NODE_SIZE];
} IDataNode;

typedef struct {
	IDataNode* data;
	unsigned int nDataNodes;
	IDirectoryNode* dir;
	unsigned int nDirNodes;
} CUDA_CSSTree;

__host__ __device__ unsigned int uintCeilingLog(unsigned int base, unsigned int num)
{
	unsigned int result = 0;

	for(unsigned int temp = 1; temp < num; temp *= base)
		result++;

	return result;
}

__host__ __device__ unsigned int uintCeilingDiv(unsigned int dividend, unsigned int divisor)
{
	return (dividend + divisor - 1) / divisor;
}

__host__ __device__ unsigned int uintPower(unsigned int base, unsigned int pow)
{
	unsigned int result = 1;

	for(; pow; pow--)
		result *= base;

	return result;
}

__device__ int getRightMostDescIdx(int tree_size, int nodeIdx)
{
	int tmp = nodeIdx * TREE_NODE_SIZE + TREE_FANOUT;
	int n = uintCeilingLog(TREE_FANOUT, uintCeilingDiv(TREE_NODE_SIZE * tree_size + TREE_FANOUT, tmp)) - 1;

	int result = (tmp * uintPower(TREE_FANOUT, n) - TREE_FANOUT) / TREE_NODE_SIZE;
    	return result; 
}

__device__ int getDataArrayIdx(int dirSize, int tree_size, int bottom_start, int treeIdx)
{
	int idx;
	if(treeIdx < dirSize) {
		idx = tree_size - bottom_start - 1;
	}
	else if( treeIdx < bottom_start ) {
		idx = tree_size - bottom_start + treeIdx - dirSize;
	}
	else {
		idx = treeIdx - bottom_start;
	}
	return idx;
}

// Binary Search
__device__ int firstMatchingKeyInDirNode1(int keys[], int key)
{
	int min = 0;
	int max = TREE_NODE_SIZE;
	int mid;
	int cut;
	while(max - min > 1) {
		mid = (min + max) / 2;
		cut = keys[mid];

		if(key > cut)
			min = mid;
		else
			max = mid;
	}

	if(keys[min] >= key)
		return min;

	return max;

}

// Binary Search
__device__ int firstMatchingKeyInDataNode2(Record records[], IKeyType key)
{
	int min = 0;
	int max = TREE_NODE_SIZE;
	int mid;
	int cut;
	while(max - min > 1) {
		mid = (min + max) / 2;
		cut = records[mid];

		if(key > cut)
			min = mid;
		else
			max = mid;
	}

	if(records[min] == key)
		return min;

	if(max < TREE_NODE_SIZE && records[max] == key)
		return max;

	return -1;
}

__global__ void gCreateIndex(IDataNode data[], IDirectoryNode dir[], int dirSize, int tree_size, int bottom_start, int nNodesPerBlock)
{
        int startIdx = blockIdx.x * nNodesPerBlock;
        int endIdx = startIdx + nNodesPerBlock;
        if(endIdx > dirSize)
                endIdx = dirSize;
        int keyIdx = threadIdx.x;

        // Proceed only when in internal nodes
        for(int nodeIdx = startIdx; nodeIdx < endIdx; nodeIdx++)
        {
                int childIdx = nodeIdx * TREE_FANOUT + keyIdx + 1;        // One step down to the left
                // Then look for the right most descendent
                int rightMostDesIdx;
                // Common cases
                if(childIdx < tree_size) {
                        rightMostDesIdx = getRightMostDescIdx(tree_size, childIdx);
                }
                // versus the unusual case when the tree is incomplete and the node does not have the full set of children
                else {
                        // pick the last node in the tree (largest element of the array)
                        rightMostDesIdx = tree_size - 1;
                }

                int dataArrayIdx = getDataArrayIdx(dirSize, tree_size, bottom_start, rightMostDesIdx);
	        dir[nodeIdx].keys[keyIdx] = data[dataArrayIdx].records[TREE_NODE_SIZE - 1];
        }
}

__global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir, int nDirNodes, int lvlDir, Record* arr, int locations[], int nSearchKeys, int nKeysPerThread, int tree_size, int bottom_start)
{
	// Bringing the root node (visited by every tuple) to the faster shared memory
	__shared__ IKeyType RootNodeKeys[TREE_NODE_SIZE];
	RootNodeKeys[threadIdx.x] = dir->keys[threadIdx.x];

	__syncthreads();

	int OverallThreadIdx = blockIdx.x * THRD_PER_BLCK_search + threadIdx.x;

	for(int keyIdx = OverallThreadIdx; keyIdx < nSearchKeys; keyIdx += THRD_PER_GRID_search)
	{
		IKeyType val = arr[keyIdx];
		int loc = firstMatchingKeyInDirNode1(RootNodeKeys, val) + 1;
		for(int i = 1; i < lvlDir && loc < nDirNodes; i++) {
			int kid = firstMatchingKeyInDirNode1(dir[loc].keys, val);
			loc = loc * TREE_FANOUT + kid + 1;
		}

		if(loc >= tree_size)
			loc = nDataNodes - 1;
		else
			loc = getDataArrayIdx(nDirNodes, tree_size, bottom_start, loc);

		int offset = firstMatchingKeyInDataNode2(data[loc].records, val);
		locations[keyIdx] = (offset <0)?-1:(loc * TREE_NODE_SIZE + offset);
	}
}

__global__ void gIndexJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[])
{
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;

	if(s_cur < sLen) 
	{
		int count = 1;
		int r_cur = g_locations[s_cur];
		int s_key;
		if(r_cur >= 0) /*&& r_cur < rLen) Tal vez la segunda parte no sea necesaria*/
		{
			s_key = S[s_cur];
			r_cur++;
			while(s_key == R[r_cur]) 
			{
				count++;
				r_cur++;
			}
			g_ResNums[s_cur] = count;
		}
		
	}
}

__global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[], int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
{
	extern __shared__ int shared[];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
	int posr, poss, x, y, ini;

	if(threadIdx.x < wj)
		shared[threadIdx.x] = muljoin[threadIdx.x];
	__syncthreads();

	if(s_cur < sLen) 
	{
		int count = 1;
		int r_cur = g_locations[s_cur];
		int s_key;
		if(r_cur >= 0) /*&& r_cur < rLen) Tal vez la segunda parte no sea necesaria*/
		{
			s_key = S[s_cur];				
			r_cur++;
			while(s_key == R[r_cur]) 
			{
				count++;
				r_cur++;
			}
			if(sloc == NULL)
				poss = s_cur * of2;
			else
				poss = sloc[s_cur] * of2;
			ini = r_cur - count;	
			for(y = ini; y < r_cur; y++)
			{
				posr = mloc[y] * of1;
				for(x = 0; x < wj; x += 2)
				{
					if(p1[posr + shared[x]] != p2[poss + shared[x+1]])
					{
						count--;
						break;
					}
				}
			}
			if(count > 0)
				g_ResNums[s_cur] = count;
		}
		
	}
}

__global__ void multiJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc, int wj)
{
	extern __shared__ int shared[];
	int *extjoins = &shared[lenrul];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;

	if(threadIdx.x < (lenrul + wj))
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();

	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, posr, poss;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			
			int tmp1, tmp2;

			if(sloc == NULL)	
				poss = s_cur * of2;
			else
				poss = sloc[s_cur] * of2;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1;
				for(y = 0; y < wj; y += 2)
				{
					tmp1 = p1[pos + extjoins[y]];
					tmp2 = p2[poss + extjoins[y+1]];
					if(tmp1 != tmp2)
						break;
				}
				if(y < wj)
				{
					x--;
					continue;
				}
				posr = x * lenrul;
				for(y = 0; y < halfrul; y++)
					g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
				for(; y < lenrul; y++)
					g_joinResultBuffers[posr + y] = p2[poss + shared[y]];
			}
		}
	}
}

__global__ void multiJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc, int wj)
{
	extern __shared__ int shared[];
	int *extjoins = &shared[cols];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;

	if(threadIdx.x < (cols + wj))
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();

	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, pos2, posr, cond;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			if(sloc == NULL)
				pos2 = s_cur * of2 - 1;
			else
				pos2 = sloc[s_cur] * of2 - 1;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1 - 1;
				for(y = 0; y < wj; y += 2)
				{
					if(p1[pos + extjoins[y] + 1] != p2[pos2 + extjoins[y+1] + 1])
						break;
				}
				if(y < wj)
				{
					x--;
					continue;
				}
				posr = x * cols;
				for(y = 0; y < cols; y++)
				{
					cond = shared[y];
					if(cond > 0)
						g_joinResultBuffers[posr + y] = p1[pos + cond];
					else
						g_joinResultBuffers[posr + y] = p2[pos2 - cond];
				}
			}
		}
	}
}

__global__ void gJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc)
{
	extern __shared__ int shared[];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;

	if(threadIdx.x < lenrul)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();

	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, posr, poss;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			if(sloc == NULL)	
				poss = s_cur * of2;
			else
				poss = sloc[s_cur] * of2;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1;
				posr = x * lenrul;
				for(y = 0; y < halfrul; y++)
					g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
				for(; y < lenrul; y++)
					g_joinResultBuffers[posr + y] = p2[poss + shared[y]];
			}
		}
	}
}

__global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc)
{
	extern __shared__ int shared[];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;

	if(threadIdx.x < cols)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();

	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, pos2, posr, cond;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			if(sloc == NULL)
				pos2 = s_cur * of2 - 1;
			else
				pos2 = sloc[s_cur] * of2 - 1;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1 - 1;
				posr = x * cols;
				for(y = 0; y < cols; y++)
				{
					cond = shared[y];
					if(cond > 0)
						g_joinResultBuffers[posr + y] = p1[pos + cond];
					else
						g_joinResultBuffers[posr + y] = p2[pos2 - cond];
				}
			}
		}
	}
}

int compare2 (const void * a, const void * b)
{
  return ( ((int2*)a)->y - ((int2*)b)->y );
}

/*void generateSort(Record *R, int maxmax, int rLen, int seed)
{
	int i=0;
	const int offset=(1<<15)-1;
	srand(seed);
	for(i=0;i<rLen;i++)
	{
		R[i].y=((((rand()& offset)<<15)+(rand()&1))+(rand()<<1)+(rand()&1))%maxmax;
		
	}
	qsort(R,rLen,sizeof(Record),compare);
	for(i=0;i<rLen;i++)
	R[i].x=i;

}

void generateRand(Record *R, int maxmax, int rLen, int seed)
{
	int i=0;
	const int offset=(1<<15)-1;
	srand(seed);
	for(i=0;i<rLen;i++)
	{
		R[i].y=((((rand()& offset)<<15)+(rand()&1))+(rand()<<1)+(rand()&1))%maxmax;
		//R[i].x=i+1;
		R[i].x=i;
	}
}*/

__global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *ids)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int cond;
	if(id < len)
	{
		cond = pos[id+1];
		if(pos[id] != cond && cond > 0)
		{
			R[cond-1] = p[id * of + wj];
			ids[cond-1] = id;
		}
	}
}

__global__ void llenar2(int *p, int *R, int len, int of, int wj, int *pos)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int cond;
	if(id < len)
	{
		cond = pos[id+1];
		if(pos[id] != cond && cond > 0)
			R[cond-1] = p[id * of + wj];
	}
}

__global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	if(id < len)
		R[id] = p[id * of + wj];
}

/*__global__ smalljoinc(int *p1, int *p2, int rLen, int sLen, int2 wj, int *r)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	if(id < rlen)
	{
		int comp = p1[id * of1 + wj.x];
		int x, cnt = 0;
		for(x = 0; x < sLen; x++)
		{
			if(comp == p2[x * of2 ])
				cnt++;
		}
		r[id] = cnt;
	}
}

__global__ smalljoinw(int *p1, int *p2, int wj, int *r)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	if(id < rlen)
	{
		int comp = p1[id];
		int x;
		for(x = 0; x < sLen; x++)
		{
			if(comp == p2[x])
				cnt++;
		}
		r[id] = cnt;
	}
}*/

int buscarunion(int *tmprule, int tmplen, int *rule, int pos2, int tam2, int *joins)
{
	int x, y;
	int cont = 0;
	for(x = 0; x < tmplen; x++)
	{
		for(y = pos2; y < (pos2 + tam2); y++)
		{
			if(tmprule[x] == rule[y])
			{
				joins[cont] = x;
				cont++;
				joins[cont] = y - pos2;
				cont++;
			}
		}
	}
	return cont;
}

int not_in(int *rule, int len, int bus)
{
	int x;
	for(x = 0; x < len; x++)
	{
		if(rule[x] == bus)
			return 0;
	}
	return 1;
}

int posiciones(int ini, int of1, int *firstpart, int *hpos, int *hcons, int *rule, int *temprule, int *tmprulpos, int *lenrul, int pos2, int of2, int posf)
{
	int cont = 0, x, y;
	for(y = ini; y < of1; y++)
	{
		if(firstpart[y] < 0)
		{
			hpos[cont] = y - ini;
			hcons[cont] = -firstpart[y];
			cont++;
		}
		else
		{
			x = 1;
			while(rule[x] != 0)
			{
				if(firstpart[y] == rule[x])
				{
					if(not_in(temprule, *lenrul, firstpart[y]))
					{
						temprule[*lenrul] = firstpart[y];
						tmprulpos[*lenrul] = y - ini;
						*lenrul = *lenrul + 1;
					}
					break;
				}
				x++;
			}
			if(rule[x] != 0)
				continue;
			for(x = (pos2 + of2 + 1); x < posf; x++)
			{
				if(rule[x] == 0)
				{
					x++;
					continue;
				}
				if(firstpart[y] == rule[x])
				{
					if(not_in(temprule, *lenrul, firstpart[y]))
					{
						temprule[*lenrul] = firstpart[y];
						tmprulpos[*lenrul] = y - ini;
						*lenrul = *lenrul + 1;
					}
					break;
				}
			}
		}
	}	
	return cont;
}

void join_final(int cols, int of1, int of2, int *rule, int *firstpart, int *secondpart, int *tmprulpos)
{
	int x, y;
	for(y = 0; y < cols; y++)
	{
		for(x = 0; x < of1; x++)
		{
			if(rule[y] == firstpart[x])
			{
				tmprulpos[y] = x + 1;
				break;
			}
		}
		if(x != of1)
			continue;
		for(x = 0; x < of2; x++)
		{
			if(rule[y] == secondpart[x])
			{
				tmprulpos[y] = -x - 1;
				break;
			}
		}
	}
}

int select_pos(int of1, int *firstpart, int *hpos, int *hcons)
{
	int cont = 0, y;
	for(y = 0; y < of1; y++)
	{
		if(firstpart[y] < 0)
		{
			hpos[cont] = y;
			hcons[cont] = -firstpart[y];
			cont++;
		}
	}
	return cont;
}

int checkquery(int *hpos, int *hcons, int cont, int *pred, int of1, int *rule, int *query, int cols)
{
	int x, y;
	if(rule[0] != query[0])
		return cont;
	for(x = 1; x <= cols; x++)
	{
		if(query[x] < 0)
		{
			for(y = 0; y < of1; y++)
			{
				if(pred[y] == rule[x])
				{
					hpos[cont] = y;
					hcons[cont] = -query[x];
					cont++;
				}
			}
		}
	}
	return cont;
}

int maximo(int count, ...)
{
	va_list ap;
    	int j, temp, mx = 0;
    	va_start(ap, count);

	for(j = 0; j < count; j++)
	{
		temp = va_arg(ap, int);
		if(temp > mx)
			mx = temp;
	}

    	va_end(ap);
    	return mx;
}

template <typename KeyVector, typename PermutationVector, typename TempVector>
void update_permutation(KeyVector& keys, PermutationVector& permutation, TempVector& temporary, int rows)
{
    // permute the keys with the current reordering
	thrust::gather(permutation, permutation + rows, keys, temporary);

    // stable_sort the permuted keys and update the permutation
	thrust::stable_sort_by_key(temporary, temporary + rows, permutation);
}


template <typename KeyVector, typename PermutationVector, typename TempVector>
void apply_permutation(KeyVector& keys, PermutationVector& permutation, TempVector& temporary, int rows)
{
    // permute the keys
    thrust::gather(permutation, permutation + rows, temporary, keys);
}

int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>::iterator rule, int pos, int bothops, int **ret)
{
	int pos2 = pos + 1;
	int *sel1, nsel1;
	int *sel2 = rule->select[pos2];
	int nsel2 = rule->numsel[pos2];
	int *proj = rule->project[pos];
	int2 projp = rule->projpos[pos];
	int *sjoin1, nsj1;
	int *sjoin2 = rule->selfjoin[pos2];
	int nsj2 = rule->numselfj[pos2];
	int *wherej = rule->wherejoin[pos];
	int numj = rule->numjoin[pos];
	int flag;
#if TIMER
	cuda_stats.joins++;
#endif

	int porLiberar = rLen * of1 * sizeof(int);
	int size, sizet, sizet2;
	if(bothops)
	{
		sel1 = rule->select[pos];
		nsel1 = rule->numsel[pos];
		sjoin1 = rule->selfjoin[pos];
		nsj1 = rule->numselfj[pos];
		sizet = maximo(7, of1, of2, nsel1, nsel2, projp.y + numj - 2, nsj1, nsj2) * sizeof(int);
	}
	else
		sizet = maximo(6, of1, of2, nsel2, projp.y + numj - 2, nsj2, numj) * sizeof(int);
	
	int *dcons, *temp;
	int *d_R, *d_S;
	int blockllen, numthreads;
	
	//int por_liberar = rLen * sizeof(int);
	int extraspace = TREE_NODE_SIZE - rLen % TREE_NODE_SIZE;
	int m32rLen = rLen + extraspace;
	if(m32rLen > sLen)
		sizet2 = (m32rLen + 1) * sizeof(int);
	else
		sizet2 = (sLen + 1) * sizeof(int);

	/*hcons = (int *)malloc(sizet);
	hpos = (int *)malloc(sizet);
	int dconsize = sizet * 2;*/

	reservar(&dcons, sizet);
#ifdef DEBUG_MEM
	cerr << "+ " << dcons << " dcons tree  " << sizet << endl;
#endif
	reservar(&temp, sizet2);
#ifdef DEBUG_MEM
	cerr << "+ " << temp << " temp tree " << sizet2 << endl;
#endif
	thrust::device_ptr<int> res = thrust::device_pointer_cast(temp);

	numthreads = 1024;
	blockllen = sLen / numthreads + 1;
	int memSizeS, newLen;
	int *posR = NULL, *posS = NULL;

	#ifdef TIMER
	//cout << "INICIO" << endl;
	cudaEvent_t start, stop;
	float time;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	#endif

	//cout << "sLen y rLen = " << sLen << " " << rLen << endl;

	if(nsel2 > 0)
	{
		size = nsel2 * sizeof(int);
		newLen = sLen + 1;
		cudaMemsetAsync(temp, 0, newLen * sizeof(int));
		cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
		marcar<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);

		/*int y;
		int *htemp = (int *)malloc(newLen * sizeof(int));
		cout << "temp =" << endl;
		cudaMemcpy(htemp, temp, newLen * sizeof(int), cudaMemcpyDeviceToHost);
		for(y = 0; y < newLen; y++)
			cout << htemp[y] << " ";
		cout << endl;
		free(htemp);*/

		if(nsj2 > 0)
		{
			size = nsj2 * sizeof(int);
			cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
			samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
		}

		/*htemp = (int *)malloc(newLen * sizeof(int));
		cout << "temp =" << endl;
		cudaMemcpy(htemp, temp, newLen * sizeof(int), cudaMemcpyDeviceToHost);
		for(y = 0; y < newLen; y++)
			cout << res[y] << " ";
		cout << endl;
		free(htemp);*/

		/*cout << "Despues de marcar" << endl;
		cout << "newLen = " << newLen << endl;*/

		flag = 0;
		while(flag != 1)
		{
			try
			{
				thrust::inclusive_scan(res + 1, res + newLen, res + 1);		
				flag = 1;
			}
			catch(std::bad_alloc &e)
			{
				limpiar("inclusive scan in join", 0);
			}				
		}
		//thrust::inclusive_scan(res + 1, res + newLen, res + 1);	
		newLen = res[sLen];
	
		if(newLen == 0)
			return 0;

		memSizeS = newLen * sizeof(int);
		reservar(&d_S, memSizeS);
#if DEBUG_MEM
		cerr << "+ " << d_S << " d_S  " << memSizeS << endl;
#endif
		reservar(&posS, memSizeS);
#if DEBUG_MEM
		cerr << "+ " << posS << " posS  " << memSizeS << endl;
#endif
		llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
		sLen = newLen;
	}
	else
	{
		if(nsj2 > 0)
		{
			size = nsj2 * sizeof(int);
			newLen = sLen + 1;
			cudaMemsetAsync(temp, 0, newLen * sizeof(int));
			cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
			samejoin2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);

			thrust::inclusive_scan(res + 1, res + newLen, res + 1);
			newLen = res[sLen];
			if(newLen == 0)
			  return 0;

			memSizeS = newLen * sizeof(int);
			reservar(&d_S, memSizeS);
#ifdef DEBUG_MEM
			cerr << "+ " << d_S << " d_S m " << memSizeS << endl;
#endif
			reservar(&posS, memSizeS);
#ifdef DEBUG_MEM
			cerr << "+ " << posS << " posS m " << memSizeS << endl;
#endif
			llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
			sLen = newLen;
		}
		else
		{
			memSizeS = sLen * sizeof(int);
			reservar(&d_S, memSizeS);
#ifdef DEBUG_MEM
			cerr << "+ " << d_S << " d_S n " << memSizeS << endl;
#endif
			llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]);
		}
	}

	#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Select1 = " << time << endl;
	cuda_stats.select1_time += time;

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	#endif

	blockllen = rLen / numthreads + 1;
	int sizem32, sizextra;
	if(bothops)
	{
		if(nsel1 > 0)
		{
			size = nsel1 * sizeof(int);
			newLen = rLen + 1;
			cudaMemsetAsync(temp, 0, newLen * sizeof(int));
			cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
			marcar<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);

			if(nsj1 > 0)
			{
				size = nsj1 * sizeof(int);
				cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
				samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
			}

			thrust::inclusive_scan(res + 1, res + newLen, res + 1);
			newLen = res[rLen];
			if(newLen == 0)
				return 0;		

			extraspace = TREE_NODE_SIZE - newLen % TREE_NODE_SIZE;
			sizextra = extraspace * sizeof(int);
			m32rLen = newLen + extraspace;
			sizem32 = m32rLen * sizeof(int);
			reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
			cerr << "+ " << d_R << " d_R m " << sizem32 << endl;
#endif
			reservar(&posR, sizem32);
#ifdef DEBUG_MEM
			cerr << "+ " << posR << " posR m " << sizem32 << endl;
#endif
			cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
			cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
			llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
			rLen = newLen;
		}
		else
		{
			if(nsj1 > 0)
			{
				size = nsj1 * sizeof(int);
				newLen = rLen + 1;
				cudaMemsetAsync(temp, 0, newLen * sizeof(int));
				cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
				samejoin2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);

				thrust::inclusive_scan(res + 1, res + newLen, res + 1);
				newLen = res[rLen];
				if(newLen == 0)
					return 0;

				extraspace = TREE_NODE_SIZE - newLen % TREE_NODE_SIZE;
				sizextra = extraspace * sizeof(int);
				m32rLen = newLen + extraspace;
				sizem32 = m32rLen * sizeof(int);
				reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
				cerr << "+ " << d_R << " d_R n " << sizem32 << endl;
#endif
				reservar(&posR, sizem32);
#ifdef DEBUG_MEM
				cerr << "+ " << posR << " posR n " << sizem32 << endl;
#endif
				cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
				cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
				llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
				rLen = newLen;
			}
			else
			{
				sizem32 = m32rLen * sizeof(int);
				reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
				cerr << "+ " << d_R << " d_R sizem32 " << sizem32 << endl;
#endif
				cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
				llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
			}
			
		}
	}
	else
	{
		sizem32 = m32rLen * sizeof(int);
		reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
		cerr << "+ " << d_R << " d_R sz " << sizem32 << endl;
#endif
		cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
		llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
	}

	#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Select2 = " << time << endl;
	cuda_stats.select2_time += time;
	#endif
	
	/*free(hcons);
	free(hpos);

	h_R = (int *)malloc(sizem32);
	cudaMemcpy(h_R, d_S, memSizeS, cudaMemcpyDeviceToHost);
	cout << "H_S " << "cont " << cont << " sLen " << sLen << endl;
	for(x = 0; x < sLen; x++)
		cout << h_R[x] << endl;
	free(h_R);*/

	#ifdef TIMER
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	#endif

	thrust::device_ptr<Record> dvp1 = thrust::device_pointer_cast(d_R);
	thrust::device_ptr<Record> permutation;
	if(posR == NULL)
	{
		reservar(&posR, sizem32);
#ifdef DEBUG_MEM
		cerr << "+ " << posR << " posR m32 " << sizem32 << endl;
#endif
		permutation = thrust::device_pointer_cast(posR);
		thrust::sequence(permutation, permutation + m32rLen);
	}
	else
		permutation = thrust::device_pointer_cast(posR);

	flag = 0;
	while(flag != 1)
	{
		try
		{
			thrust::stable_sort_by_key(dvp1, dvp1 + m32rLen, permutation);
			flag = 1;
		}
		catch(std::bad_alloc &e)
		{
			limpiar("inclusive scan in join", 0);
		}
	}

	#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Sort = " << time << endl;
	cuda_stats.sort_time += time;
	
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	#endif

	IDataNode* d_data;
	IDirectoryNode* d_dir;
	unsigned int nDataNodes;

	nDataNodes = uintCeilingDiv(rLen, TREE_NODE_SIZE);
	d_data=(IDataNode *)d_R;

	unsigned int lvlDir = uintCeilingLog(TREE_FANOUT, nDataNodes);
	unsigned int nDirNodes = uintCeilingDiv(nDataNodes - 1, TREE_NODE_SIZE);
	unsigned int tree_size = nDirNodes + nDataNodes;
	unsigned int bottom_start = (uintPower(TREE_FANOUT, lvlDir) - 1) / TREE_NODE_SIZE;
	d_dir = (IDirectoryNode *)temp;

	unsigned int nNodesPerBlock = uintCeilingDiv(nDirNodes, BLCK_PER_GRID_create);

	dim3 Dbc(THRD_PER_BLCK_create, 1, 1);
	dim3 Dgc(BLCK_PER_GRID_create, 1, 1);

	gCreateIndex <<<Dgc, Dbc>>> (d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);

	/*int y;
	IDirectoryNode *h_dir = (IDirectoryNode*)malloc(sizeof(IDirectoryNode) * nDirNodes);
	cudaMemcpy(h_dir, d_dir, sizeof(IDirectoryNode) * nDirNodes, cudaMemcpyDeviceToHost);
	for(x = 0; x < nDirNodes; x++)
	{
		for(y = 0; y < TREE_NODE_SIZE; y++)
			printf("%d ", h_dir[x].keys[y]);
		printf("\n");
	}
	free(h_dir);*/

	int *d_locations;
	reservar(&d_locations, memSizeS);
#ifdef DEBUG_MEM
	cerr << "+ " << d_locations << " d_locs n " << memSizeS << endl;
#endif

	dim3 Dbs(THRD_PER_BLCK_search, 1, 1);
	dim3 Dgs(BLCK_PER_GRID_search, 1, 1);

	unsigned int nSearchKeys = sLen;
	unsigned int nKeysPerThread = uintCeilingDiv(nSearchKeys, THRD_PER_GRID_search);

	gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
	cudaMemsetAsync(temp, 0, memSizeS);

	blockllen = sLen / numthreads + 1;
	int muljoin = 0, muljoinsize = 0;
	if(numj > 2)
	{
		muljoin = numj - 2;
		muljoinsize = muljoin * sizeof(int);
		cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
		gIndexMultiJoin<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
	}
	else
		gIndexJoin<<<blockllen, numthreads>>> (d_R, d_S, d_locations, sLen, temp);
	liberar(d_R, sizem32);
	liberar(d_S, memSizeS);

	int sum = res[sLen-1];
	thrust::exclusive_scan(res, res + sLen, res);
	sum += res[sLen-1];
	if(sum == 0)
		return 0;	
	res[sLen] = sum;

	int *d_Rout;
	int resSize, sizepro;
	if(pos == (rule->num_rows - 3) && rule->num_bpreds.x == 0)
	{
		sizepro = rule->num_columns * sizeof(int);
		cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
		resSize = sum * sizepro;
		reservar(&d_Rout, resSize);
#ifdef DEBUG_MEM
		cerr << "+ " << d_Rout << " d_Rout n " << resSize << endl;
#endif
		if(numj > 2)
		{
			cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
			multiJoinWithWrite2<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
		}
		else
			gJoinWithWrite2<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
	}
	else
	{
		sizepro = projp.y * sizeof(int);
		cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
		resSize = sum * sizepro;
		reservar(&d_Rout, resSize);
#ifdef DEBUG_MEM
		cerr << "+ " << d_Rout << " d_Rout 2 " << resSize << endl;
#endif
		if(numj > 2)
		{
			cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
			multiJoinWithWrite<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
		}
		else
			gJoinWithWrite<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
	}

	liberar(dcons, sizet);
	liberar(d_locations, memSizeS);
	liberar(temp, sizet2);
	liberar(posR, sizem32);
	if(posS != NULL)
		liberar(posS, memSizeS);
	
	/*if(posS != NULL)
		liberar(posS, memSizeS);
	liberar(dtmprulpos, sizerul);
	if(*ret != NULL)
		liberar(*ret, por_liberar);
	free(tmprulpos);
	if(final_cond != posf)
	{
		free(*newrule);
		*newrule = (int *)malloc(sizerul);
		memcpy(*newrule, temprule, sizerul);
		*newrullen = lenrul;
	}*/
	
	if(*ret != NULL)
		liberar(*ret, porLiberar);
	*ret = d_Rout;

	#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Join = " << time << endl;
	//cout << "FIN" << endl;
	cuda_stats.join_time += time;
	#endif

	return sum;
}