#include <thrust/host_vector.h> #include <thrust/device_vector.h> #include <thrust/sequence.h> #include <thrust/sort.h> #include <thrust/gather.h> #include <thrust/scan.h> #include <iostream> #include <cstdio> #include "lista.h" #include "bpreds.h" #define WARP_SIZE 32 #define TREE_NODE_SIZE WARP_SIZE #define TREE_FANOUT (TREE_NODE_SIZE + 1) #define N_MULTI_P 16 #define BLCK_PER_MP_create 256 // blocks per multiprocessor during tree creation #define BLCK_PER_MP_search 512 // blocks per multiprocessor during tree searching #define WAPRS_PER_BLCK_join 8//16 // blocks per multiprocessor during tree creation #define BLCK_PER_MP_join 512//256 // blocks per multiprocessor during tree searching #define THRD_PER_BLCK_create TREE_NODE_SIZE #define BLCK_PER_GRID_create (N_MULTI_P * BLCK_PER_MP_create) #define THRD_PER_BLCK_search TREE_NODE_SIZE #define BLCK_PER_GRID_search (N_MULTI_P * BLCK_PER_MP_search) #define THRD_PER_GRID_search (THRD_PER_BLCK_search * BLCK_PER_GRID_search) #define THRD_PER_BLCK_join (WARP_SIZE * WAPRS_PER_BLCK_join) #define BLCK_PER_GRID_join (N_MULTI_P * BLCK_PER_MP_join) #define THRD_PER_GRID_join (THRD_PER_BLCK_join * BLCK_PER_GRID_join) #define TEST_MAX 100 typedef int IKeyType; typedef int Record; typedef struct { int keys[TREE_NODE_SIZE]; } IDirectoryNode; typedef struct { Record records[TREE_NODE_SIZE]; } IDataNode; typedef struct { IDataNode* data; unsigned int nDataNodes; IDirectoryNode* dir; unsigned int nDirNodes; } CUDA_CSSTree; struct to_neg { __host__ __device__ bool operator()(const int &r1) { if(r1 < 0) return 1; return 0; } }; __host__ __device__ unsigned int uintCeilingLog(unsigned int base, unsigned int num) { unsigned int result = 0; for(unsigned int temp = 1; temp < num; temp *= base) result++; return result; } __host__ __device__ unsigned int uintCeilingDiv(unsigned int dividend, unsigned int divisor) { return (dividend + divisor - 1) / divisor; } __host__ __device__ unsigned int uintPower(unsigned int base, unsigned int pow) { unsigned int result = 1; for(; pow; pow--) result *= base; return result; } __device__ int getRightMostDescIdx(int tree_size, int nodeIdx) { int tmp = nodeIdx * TREE_NODE_SIZE + TREE_FANOUT; int n = uintCeilingLog(TREE_FANOUT, uintCeilingDiv(TREE_NODE_SIZE * tree_size + TREE_FANOUT, tmp)) - 1; int result = (tmp * uintPower(TREE_FANOUT, n) - TREE_FANOUT) / TREE_NODE_SIZE; return result; } __device__ int getDataArrayIdx(int dirSize, int tree_size, int bottom_start, int treeIdx) { int idx; if(treeIdx < dirSize) { idx = tree_size - bottom_start - 1; } else if( treeIdx < bottom_start ) { idx = tree_size - bottom_start + treeIdx - dirSize; } else { idx = treeIdx - bottom_start; } return idx; } // Binary Search __device__ int firstMatchingKeyInDirNode1(int keys[], int key) { int min = 0; int max = TREE_NODE_SIZE; int mid; int cut; while(max - min > 1) { mid = (min + max) / 2; cut = keys[mid]; if(key > cut) min = mid; else max = mid; } if(keys[min] >= key) return min; return max; } // Binary Search __device__ int firstMatchingKeyInDataNode2(Record records[], IKeyType key) { int min = 0; int max = TREE_NODE_SIZE; int mid; int cut; while(max - min > 1) { mid = (min + max) / 2; cut = records[mid]; if(key > cut) min = mid; else max = mid; } if(records[min] == key) return min; if(max < TREE_NODE_SIZE && records[max] == key) return max; return -1; } __global__ void gCreateIndex(IDataNode data[], IDirectoryNode dir[], int dirSize, int tree_size, int bottom_start, int nNodesPerBlock) { int startIdx = blockIdx.x * nNodesPerBlock; int endIdx = startIdx + nNodesPerBlock; if(endIdx > dirSize) endIdx = dirSize; int keyIdx = threadIdx.x; // Proceed only when in internal nodes for(int nodeIdx = startIdx; nodeIdx < endIdx; nodeIdx++) { int childIdx = nodeIdx * TREE_FANOUT + keyIdx + 1; // One step down to the left // Then look for the right most descendent int rightMostDesIdx; // Common cases if(childIdx < tree_size) { rightMostDesIdx = getRightMostDescIdx(tree_size, childIdx); } // versus the unusual case when the tree is incomplete and the node does not have the full set of children else { // pick the last node in the tree (largest element of the array) rightMostDesIdx = tree_size - 1; } int dataArrayIdx = getDataArrayIdx(dirSize, tree_size, bottom_start, rightMostDesIdx); dir[nodeIdx].keys[keyIdx] = data[dataArrayIdx].records[TREE_NODE_SIZE - 1]; } } __global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir, int nDirNodes, int lvlDir, Record* arr, int locations[], int nSearchKeys, int nKeysPerThread, int tree_size, int bottom_start) { // Bringing the root node (visited by every tuple) to the faster shared memory __shared__ IKeyType RootNodeKeys[TREE_NODE_SIZE]; RootNodeKeys[threadIdx.x] = dir->keys[threadIdx.x]; __syncthreads(); int OverallThreadIdx = blockIdx.x * THRD_PER_BLCK_search + threadIdx.x; for(int keyIdx = OverallThreadIdx; keyIdx < nSearchKeys; keyIdx += THRD_PER_GRID_search) { IKeyType val = arr[keyIdx]; int loc = firstMatchingKeyInDirNode1(RootNodeKeys, val) + 1; for(int i = 1; i < lvlDir && loc < nDirNodes; i++) { int kid = firstMatchingKeyInDirNode1(dir[loc].keys, val); loc = loc * TREE_FANOUT + kid + 1; } if(loc >= tree_size) loc = nDataNodes - 1; else loc = getDataArrayIdx(nDirNodes, tree_size, bottom_start, loc); int offset = firstMatchingKeyInDataNode2(data[loc].records, val); locations[keyIdx] = (offset <0)?-1:(loc * TREE_NODE_SIZE + offset); } } /*Counts the number of times a row in 'S' is to be joined to a row in 'R'.*/ __global__ void gIndexJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[]) { int s_cur = blockIdx.x * blockDim.x + threadIdx.x; if(s_cur < sLen) { int count = 1; int r_cur = g_locations[s_cur]; int s_key; if(r_cur >= 0) { s_key = S[s_cur]; r_cur++; while(s_key == R[r_cur]) { count++; r_cur++; } g_ResNums[s_cur] = count; } } } /*Corrects 'gSearchTree' results when dealing with a negative multijoin. Uses the values found in 'g_locations' which indicate, for each row in 'R', if its going to be joined (positive number) or not (-1). Works by checking the additional columns to be joined (i.e. all except the two used by 'gSearchTree') and changing to -1 in 'g_locations' those rows that have equal values in the checked columns.*/ __global__ void gIndexMultiJoinNegative(int *R, int *S, int g_locations[], int rLen, int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj) { extern __shared__ int shared[]; int r_cur = blockIdx.x * blockDim.x + threadIdx.x; int posr, poss, x; if(threadIdx.x < wj) shared[threadIdx.x] = muljoin[threadIdx.x]; __syncthreads(); if(r_cur < rLen) { int s_cur = g_locations[r_cur]; int r_key; if(s_cur >= 0) { r_key = R[r_cur]; if(mloc == NULL) posr = r_cur * of1; else posr = mloc[r_cur] * of1; while(r_key == S[s_cur]) { poss = sloc[s_cur] * of2; for(x = 0; x < wj; x += 2) { if(p1[posr + shared[x]] != p2[poss + shared[x+1]]) break; } if(x >= wj) return; s_cur++; } g_locations[r_cur] = -1; } } } /*Corrects 'gSearchTree' results when dealing with a multijoin. Uses the values found in 'g_locations' which indicate, for each row in 'S', if its going to be joined (positive number) or not (-1). Works by checking the additional columns to be joined (i.e. all except the two used by 'gSearchTree') and counting the number of times a row in 'S' is to be joined to its corresponding row in 'R', storing the new result in 'g_locations'.*/ __global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[], int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj) { extern __shared__ int shared[]; int s_cur = blockIdx.x * blockDim.x + threadIdx.x; int posr, poss, x; if(threadIdx.x < wj) shared[threadIdx.x] = muljoin[threadIdx.x]; __syncthreads(); if(s_cur < sLen) { int count = 0; int r_cur = g_locations[s_cur]; int s_key; if(r_cur >= 0) { s_key = S[s_cur]; if(sloc == NULL) poss = s_cur * of2; else poss = sloc[s_cur] * of2; while(s_key == R[r_cur]) { posr = mloc[r_cur] * of1; for(x = 0; x < wj; x += 2) { if(p1[posr + shared[x]] != p2[poss + shared[x+1]]) break; } if(x >= wj) count++; r_cur++; } if(count > 0) g_ResNums[s_cur] = count; } } } /*Writes the result of the join and projects the necessary columns as defined by 'rule'. The difference between this function and 'gJoinWithWrite' is the comparison of the additional join columns.*/ __global__ void multiJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc, int wj) { extern __shared__ int shared[]; int *extjoins = &shared[lenrul]; int s_cur = blockIdx.x * blockDim.x + threadIdx.x; if(threadIdx.x < (lenrul + wj)) shared[threadIdx.x] = rule[threadIdx.x]; __syncthreads(); if(s_cur < sLen) { int r_cur = g_locations[s_cur]; if(r_cur >= 0) { int x, y, pos, posr, poss; int num1 = g_PrefixSums[s_cur]; int num2 = g_PrefixSums[s_cur+1]; int tmp1, tmp2; if(sloc == NULL) poss = s_cur * of2; else poss = sloc[s_cur] * of2; for(x = num1; x < num2; x++, r_cur++) { pos = mloc[r_cur] * of1; for(y = 0; y < wj; y += 2) /*Additional comparison*/ { tmp1 = p1[pos + extjoins[y]]; tmp2 = p2[poss + extjoins[y+1]]; if(tmp1 != tmp2) break; } if(y < wj) { x--; continue; } posr = x * lenrul; for(y = 0; y < halfrul; y++) g_joinResultBuffers[posr + y] = p1[pos + shared[y]]; for(; y < lenrul; y++) g_joinResultBuffers[posr + y] = p2[poss + shared[y]]; } } } } /*Writes the result of the join and projects the necessary columns as defined by 'rule'. The difference between this function and 'gJoinWithWrite2' is the comparison of the additional join columns.*/ __global__ void multiJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc, int wj) { extern __shared__ int shared[]; int *extjoins = &shared[cols]; int s_cur = blockIdx.x * blockDim.x + threadIdx.x; if(threadIdx.x < (cols + wj)) shared[threadIdx.x] = rule[threadIdx.x]; __syncthreads(); if(s_cur < sLen) { int r_cur = g_locations[s_cur]; if(r_cur >= 0) { int x, y, pos, pos2, posr, cond; int num1 = g_PrefixSums[s_cur]; int num2 = g_PrefixSums[s_cur+1]; if(sloc == NULL) pos2 = s_cur * of2 - 1; else pos2 = sloc[s_cur] * of2 - 1; for(x = num1; x < num2; x++, r_cur++) { pos = mloc[r_cur] * of1 - 1; for(y = 0; y < wj; y += 2) /*Additional comparison*/ { if(p1[pos + extjoins[y] + 1] != p2[pos2 + extjoins[y+1] + 1]) break; } if(y < wj) { x--; continue; } posr = x * cols; for(y = 0; y < cols; y++) { cond = shared[y]; if(cond > 0) g_joinResultBuffers[posr + y] = p1[pos + cond]; else g_joinResultBuffers[posr + y] = p2[pos2 - cond]; } } } } } /*Writes the result of the join and projects the necessary columns as defined by 'rule'. The difference between this function and 'gJoinWithWrite2' is that only the columns in the positve predicate are projected.*/ __global__ void gJoinWithWriteNegative(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int halfrul, int *mloc) { extern __shared__ int shared[]; int r_cur = blockIdx.x * blockDim.x + threadIdx.x; int posr; if(threadIdx.x < halfrul) shared[threadIdx.x] = rule[threadIdx.x]; __syncthreads(); if(r_cur < rLen) { posr = g_locations[r_cur]; if(g_locations[r_cur+1] != posr) { int y, pos; if(mloc == NULL) pos = r_cur * of1; else pos = mloc[r_cur] * of1; posr *= halfrul; for(y = 0; y < halfrul; y++) g_joinResultBuffers[posr + y] = p1[pos + shared[y]]; } } } /*Writes the result of the join and projects the necessary columns as defined by 'rule'. The difference between this function and 'gJoinWithWrite' is that only the columns in the positve predicate are projected.*/ __global__ void gJoinWithWriteNegative2(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int cols, int *mloc) { extern __shared__ int shared[]; int r_cur = blockIdx.x * blockDim.x + threadIdx.x; int posr; if(threadIdx.x < cols) shared[threadIdx.x] = rule[threadIdx.x]; __syncthreads(); if(r_cur < rLen) { posr = g_locations[r_cur]; if(g_locations[r_cur+1] != posr) { int y, pos; if(mloc == NULL) pos = r_cur * of1 - 1; else pos = mloc[r_cur] * of1 - 1; posr *= cols; for(y = 0; y < cols; y++) g_joinResultBuffers[posr + y] = p1[pos + shared[y]]; } } } /*Writes the result of the join and projects the necessary columns as defined by 'rule'.*/ __global__ void gJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc) { extern __shared__ int shared[]; int s_cur = blockIdx.x * blockDim.x + threadIdx.x; if(threadIdx.x < lenrul) shared[threadIdx.x] = rule[threadIdx.x]; __syncthreads(); if(s_cur < sLen) { int r_cur = g_locations[s_cur]; if(r_cur >= 0) { int x, y, pos, posr, poss; int num1 = g_PrefixSums[s_cur]; int num2 = g_PrefixSums[s_cur+1]; if(sloc == NULL) poss = s_cur * of2; else poss = sloc[s_cur] * of2; for(x = num1; x < num2; x++, r_cur++) { pos = mloc[r_cur] * of1; posr = x * lenrul; for(y = 0; y < halfrul; y++) g_joinResultBuffers[posr + y] = p1[pos + shared[y]]; for(; y < lenrul; y++) g_joinResultBuffers[posr + y] = p2[poss + shared[y]]; } } } } /*Writes the result of the join and projects the necessary columns as defined by 'rule'. This version is used when performing the final join of the rule and its only difference is the projection, which is performed based on the variables in the head of the rule.*/ __global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc) { extern __shared__ int shared[]; int s_cur = blockIdx.x * blockDim.x + threadIdx.x; if(threadIdx.x < cols) shared[threadIdx.x] = rule[threadIdx.x]; __syncthreads(); if(s_cur < sLen) { int r_cur = g_locations[s_cur]; if(r_cur >= 0) { int x, y, pos, pos2, posr, cond; int num1 = g_PrefixSums[s_cur]; int num2 = g_PrefixSums[s_cur+1]; if(sloc == NULL) pos2 = s_cur * of2 - 1; else pos2 = sloc[s_cur] * of2 - 1; for(x = num1; x < num2; x++, r_cur++) { pos = mloc[r_cur] * of1 - 1; posr = x * cols; for(y = 0; y < cols; y++) { cond = shared[y]; if(cond > 0) g_joinResultBuffers[posr + y] = p1[pos + cond]; else g_joinResultBuffers[posr + y] = p2[pos2 - cond]; } } } } } /*Load part of column 'wj' of 'p' in 'R'. Which values are loaded is defined by the prefix sum results in 'pos'.*/ __global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *ids) { int id = blockIdx.x * blockDim.x + threadIdx.x; int cond; if(id < len) { cond = pos[id]; if(pos[id+1] != cond) { R[cond] = p[id * of + wj]; ids[cond] = id; } } } /*Load an entire column from 'p' into 'R'.*/ __global__ void llenarnosel(int *p, int *R, int len, int of, int wj) { int id = blockIdx.x * blockDim.x + threadIdx.x; if(id < len) R[id] = p[id * of + wj]; } __global__ void projectfinal(int *res, int rows, int cols, int *rule, int *out) { extern __shared__ int shared[]; int id = blockIdx.x * blockDim.x + threadIdx.x; if(threadIdx.x < cols) shared[threadIdx.x] = rule[threadIdx.x]; __syncthreads(); if(id < rows) { id *= cols; for(int y = 0; y < cols; y++) out[id + y] = res[id + shared[y]]; } } void project(int *res, int resrows, int numcols1, int numcols2, int *proj, int **ret, int type) { int z, *dcons, *d_Rout; int numthreads = 1024; //numthreads = 32; int blockllen = resrows / numthreads + 1; int sizepro = numcols2 * sizeof(int); reservar(&dcons, sizepro); if(type) { int *pt = (int *)malloc(sizepro); for(z = 0; z < numcols2; z++) pt[z] = proj[z] - 1; cudaMemcpy(dcons, pt, sizepro, cudaMemcpyHostToDevice); //cudaDeviceSynchronize(); //Small cudaMemcpys are asynchronous, uncomment this line if the pointer is being liberated before it is copied. free(pt); } else cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice); reservar(&d_Rout, resrows * sizepro); projectfinal<<<blockllen, numthreads, sizepro>>>(res, resrows, numcols1, dcons, d_Rout); cudaFree(dcons); cudaFree(*ret); *ret = d_Rout; } __global__ void projectadd(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, int *dhead, int hsize, int *res) { extern __shared__ int shared[]; int id = blockIdx.x * blockDim.x + threadIdx.x; int pos2, posr, x, y, cond; if(threadIdx.x < hsize) shared[threadIdx.x] = dhead[threadIdx.x]; __syncthreads(); if(id < rows2) { posr = id * hsize * rows1; pos2 = id * cols2 - 1; for(x = 0; x < rows1; x++) { for(y = 0; y < hsize; y++) { cond = shared[y]; if(cond > 0) res[posr + y] = dop1[cond-1]; else res[posr + y] = dop2[pos2 - cond]; } posr += hsize; } } } void juntar(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, int *proj, int pcols, int **ret) { int sizepro, *dcons, *d_Rout; int numthreads = 1024; //numthreads = 32; int blockllen = rows2 / numthreads + 1; sizepro = pcols * sizeof(int); reservar(&dcons, sizepro); cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice); reservar(&d_Rout, rows1 * rows2 * sizepro); projectadd<<<blockllen, numthreads, sizepro>>>(dop1, dop2, rows1, rows2, cols1, cols2, dcons, pcols, d_Rout); cudaFree(dcons); *ret = d_Rout; } /*Joins two predicates. Starts by performing all preliminary operations (selections, selfjoins, comparisons) on both predicates. Then a column pair is used to construct a CSS-Tree and that tree is searched for join positions. The positions are used in a prefix sum and its result allows us to write the result. Multijoins and negative predicates follow roughly the same process, but use different kernels.*/ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>::iterator rule, int pos, int bothops, int **ret, int ANDlogic) { int pos2 = pos + 1; int *sel1 = NULL, nsel1 = 0; int *sel2 = rule->select[pos2]; int nsel2 = rule->numsel[pos2]; int *proj = rule->project[pos]; int2 projp = rule->projpos[pos]; int *sjoin1 = NULL, nsj1 = 0; int *sjoin2 = rule->selfjoin[pos2]; int nsj2 = rule->numselfj[pos2]; int *pred1 = NULL; int2 npred1 = make_int2(0,0); int *pred2 = rule->preds[pos2]; int2 npred2 = rule->numpreds[pos2]; int npred2tot = npred2.x + npred2.y; int *wherej = rule->wherejoin[pos]; int numj = rule->numjoin[pos]; int negative = rule->negatives[pos2+1]; int flag; #ifdef ROCKIT ANDlogic = 0; #endif if(negative) ANDlogic = 1; #if TIMER cuda_stats.joins++; #endif int size, sizet, sizet2; if(bothops) { sel1 = rule->select[pos]; nsel1 = rule->numsel[pos]; sjoin1 = rule->selfjoin[pos]; nsj1 = rule->numselfj[pos]; pred1 = rule->preds[pos]; npred1 = rule->numpreds[pos]; sizet = maximo(10, of1, of2, nsel1, nsel2, projp.y + numj - 2, nsj1, nsj2, numj, npred1.x, npred2tot) * sizeof(int); } else sizet = maximo(7, of1, of2, nsel2, projp.y + numj - 2, nsj2, numj, npred2tot) * sizeof(int); int *dcons, *temp, *temp2 = NULL; int *d_R, *d_S; int blockllen, numthreads; int extraspace = TREE_NODE_SIZE - rLen % TREE_NODE_SIZE; int m32rLen = rLen + extraspace; int extraspaceS = TREE_NODE_SIZE - sLen % TREE_NODE_SIZE; int m32sLen = sLen + extraspaceS; if(m32rLen > m32sLen) sizet2 = (m32rLen + 1) * sizeof(int); else sizet2 = (m32sLen + 1) * sizeof(int); reservar(&dcons, sizet); reservar(&temp, sizet2); thrust::device_ptr<int> res = thrust::device_pointer_cast(temp); numthreads = 1024; //numthreads = 32; blockllen = sLen / numthreads + 1; int memSizeS, newLen = 0; int *posR = NULL, *posS = NULL; int sizem32S = 0, sizextra; #ifdef TIMER //cout << "INICIO" << endl; cudaEvent_t start, stop; float time; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); #endif if(npred2.x > 0 || npred2.y > 0 || nsel2 > 0 || nsj2 > 0) { newLen = sLen + 1; cudaMemsetAsync(temp, 0, newLen * sizeof(int)); } if(npred2.x > 0 || npred2.y > 0) { size = npred2tot * sizeof(int); cudaMemcpy(dcons, pred2, size, cudaMemcpyHostToDevice); if(npred2.y > 0) /*Fix case when a(X,Y),b(Y,Z),Z > Y*/ { reservar(&temp2, sizet2); cudaMemsetAsync(temp2, 0, newLen * sizeof(int)); //res = thrust::device_pointer_cast(temp2); bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, temp2 + 1); } else { if(negative) bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL); else bpredsOR<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL); } if(nsel2 > 0) { size = nsel2 * sizeof(int); cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice); marcar<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1); } if(nsj2 > 0) { size = nsj2 * sizeof(int); cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice); samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1); } } else { if(nsel2 > 0) { size = nsel2 * sizeof(int); cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice); marcar2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1); if(nsj2 > 0) { size = nsj2 * sizeof(int); cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice); samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1); } } else { if(nsj2 > 0) { size = nsj2 * sizeof(int); cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice); samejoin2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1); } else { sizem32S = m32sLen * sizeof(int); reservar(&d_S, sizem32S); cudaMemsetAsync(d_S + sLen, 0x7f, extraspaceS * sizeof(int)); llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]); } } } if(npred2.x > 0 || npred2.y > 0 || nsel2 > 0 || nsj2 > 0) { flag = 0; while(flag != 1) { try { thrust::inclusive_scan(res + 1, res + newLen, res + 1); flag = 1; } catch(std::bad_alloc &e) { limpiar("inclusive scan in join", 0); } } newLen = res[sLen]; if(newLen == 0) // && !negative) ARREGLAR { cudaFree(temp); cudaFree(dcons); return 0; } extraspaceS = TREE_NODE_SIZE - newLen % TREE_NODE_SIZE; sizextra = extraspaceS * sizeof(int); m32sLen = newLen + extraspaceS; sizem32S = m32sLen * sizeof(int); reservar(&d_S, sizem32S); reservar(&posS, sizem32S); cudaMemsetAsync(d_S + newLen, 0x7f, sizextra); cudaMemsetAsync(posS + newLen, 0x7f, sizextra); llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS); sLen = newLen; } #ifdef TIMER cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); //cout << "Select1 = " << time << endl; cuda_stats.select1_time += time; cudaEventDestroy(start); cudaEventDestroy(stop); cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); #endif blockllen = rLen / numthreads + 1; int sizem32; if(bothops) { if(temp2 != NULL) { cudaFree(temp); temp = temp2; res = thrust::device_pointer_cast(temp); newLen = rLen + 1; if(nsel1 > 0) { size = nsel1 * sizeof(int); cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice); marcar<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1); } if(nsj1 > 0) { size = nsj1 * sizeof(int); cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice); samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1); } if(npred1.x > 0) { size = npred1.x * sizeof(int); cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice); if(ANDlogic) bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1); else bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1); } } else { if(npred1.x > 0 || nsel1 > 0 || nsj1 > 0) { newLen = rLen + 1; cudaMemsetAsync(temp, 0, newLen * sizeof(int)); } if(nsel1 > 0) { size = nsel1 * sizeof(int); cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice); marcar2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1); if(nsj1 > 0) { size = nsj1 * sizeof(int); cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice); samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1); } if(npred1.x > 0) { size = npred1.x * sizeof(int); cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice); if(ANDlogic) bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1); else bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1); } } else { if(nsj1 > 0) { size = nsj1 * sizeof(int); cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice); samejoin2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1); if(npred1.x > 0) { size = npred1.x * sizeof(int); cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice); if(ANDlogic) bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1); else bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1); } } else { if(npred1.x > 0) { size = npred1.x * sizeof(int); cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice); if(ANDlogic) bpredsnormal2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1); else bpredsorlogic2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1); } } } } if(temp2 != NULL || npred1.x > 0 || nsel1 > 0 || nsj1 > 0) { thrust::inclusive_scan(res + 1, res + newLen, res + 1); newLen = res[rLen]; if(newLen == 0) { cudaFree(temp); cudaFree(dcons); cudaFree(d_S); if(posS != NULL) cudaFree(posS); return 0; } extraspace = TREE_NODE_SIZE - newLen % TREE_NODE_SIZE; sizextra = extraspace * sizeof(int); m32rLen = newLen + extraspace; sizem32 = m32rLen * sizeof(int); reservar(&d_R, sizem32); reservar(&posR, sizem32); cudaMemsetAsync(d_R + newLen, 0x7f, sizextra); cudaMemsetAsync(posR + newLen, 0x7f, sizextra); llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR); rLen = newLen; } else { sizem32 = m32rLen * sizeof(int); reservar(&d_R, sizem32); cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int)); llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]); } } else { sizem32 = m32rLen * sizeof(int); reservar(&d_R, sizem32); cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int)); llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]); } #ifdef TIMER cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); //cout << "Select2 = " << time << endl; cuda_stats.select2_time += time; #endif #ifdef TIMER cudaEventDestroy(start); cudaEventDestroy(stop); cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); #endif thrust::device_ptr<Record> dvp1; thrust::device_ptr<Record> permutation; if(negative) { dvp1 = thrust::device_pointer_cast(d_S); if(posS == NULL) { reservar(&posS, sizem32S); permutation = thrust::device_pointer_cast(posS); thrust::sequence(permutation, permutation + m32sLen); } else permutation = thrust::device_pointer_cast(posS); flag = 0; while(flag != 1) { try { thrust::stable_sort_by_key(dvp1, dvp1 + m32sLen, permutation); flag = 1; } catch(std::bad_alloc &e) { limpiar("inclusive scan in join", 0); } } } else { dvp1 = thrust::device_pointer_cast(d_R); if(posR == NULL) { reservar(&posR, sizem32); permutation = thrust::device_pointer_cast(posR); thrust::sequence(permutation, permutation + m32rLen); } else permutation = thrust::device_pointer_cast(posR); flag = 0; while(flag != 1) { try { thrust::stable_sort_by_key(dvp1, dvp1 + m32rLen, permutation); flag = 1; } catch(std::bad_alloc &e) { limpiar("inclusive scan in join", 0); } } } #ifdef TIMER cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); //cout << "Sort = " << time << endl; cuda_stats.sort_time += time; cudaEventDestroy(start); cudaEventDestroy(stop); cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); #endif IDataNode* d_data; IDirectoryNode* d_dir; unsigned int nDataNodes; if(negative) { nDataNodes = uintCeilingDiv(sLen, TREE_NODE_SIZE); d_data=(IDataNode *)d_S; } else { nDataNodes = uintCeilingDiv(rLen, TREE_NODE_SIZE); d_data=(IDataNode *)d_R; } unsigned int lvlDir = uintCeilingLog(TREE_FANOUT, nDataNodes); unsigned int nDirNodes = uintCeilingDiv(nDataNodes - 1, TREE_NODE_SIZE); unsigned int tree_size = nDirNodes + nDataNodes; unsigned int bottom_start = (uintPower(TREE_FANOUT, lvlDir) - 1) / TREE_NODE_SIZE; d_dir = (IDirectoryNode *)temp; unsigned int nNodesPerBlock = uintCeilingDiv(nDirNodes, BLCK_PER_GRID_create); dim3 Dbc(THRD_PER_BLCK_create, 1, 1); dim3 Dgc(BLCK_PER_GRID_create, 1, 1); gCreateIndex <<<Dgc, Dbc>>> (d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock); int *d_locations; int memSizeR; unsigned int nSearchKeys; if(negative) { memSizeR = (rLen + 1) * sizeof(int); reservar(&d_locations, memSizeR); cudaMemsetAsync(d_locations, 0, sizeof(int)); nSearchKeys = rLen; } else { memSizeS = sLen * sizeof(int); reservar(&d_locations, memSizeS); nSearchKeys = sLen; } dim3 Dbs(THRD_PER_BLCK_search, 1, 1); dim3 Dgs(BLCK_PER_GRID_search, 1, 1); unsigned int nKeysPerThread = uintCeilingDiv(nSearchKeys, THRD_PER_GRID_search); if(negative) { gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_R, d_locations + 1, nSearchKeys, nKeysPerThread, tree_size, bottom_start); cudaMemsetAsync(temp, 0, memSizeR); } else { gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start); cudaMemsetAsync(temp, 0, memSizeS); } int muljoin = 0, muljoinsize = 0, sum; int *d_Rout; int resSize, sizepro; if(negative) { blockllen = rLen / numthreads + 1; if(numj > 2) { muljoin = numj - 2; muljoinsize = muljoin * sizeof(int); cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice); gIndexMultiJoinNegative<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations + 1, rLen, p1, p2, of1, of2, posR, posS, dcons, muljoin); } res = thrust::device_pointer_cast(d_locations); thrust::transform(res + 1, res + rLen + 1, res + 1, to_neg()); thrust::inclusive_scan(res + 1, res + rLen + 1, res + 1); sum = res[rLen]; if(pos == (rule->num_rows - 3)) { sizepro = rule->num_columns * sizeof(int); cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice); resSize = sum * sizepro; reservar(&d_Rout, resSize); gJoinWithWriteNegative2<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, rule->num_columns, posR); } else { sizepro = projp.x * sizeof(int); cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice); resSize = sum * sizepro; reservar(&d_Rout, resSize); gJoinWithWriteNegative<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, projp.x, posR); } cudaFree(d_R); cudaFree(d_S); } else { blockllen = sLen / numthreads + 1; if(numj > 2) { muljoin = numj - 2; muljoinsize = muljoin * sizeof(int); cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice); gIndexMultiJoin<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin); } else gIndexJoin<<<blockllen, numthreads>>> (d_R, d_S, d_locations, sLen, temp); cudaFree(d_R); cudaFree(d_S); sum = res[sLen-1]; thrust::exclusive_scan(res, res + sLen, res); sum += res[sLen-1]; if(sum == 0) { cudaFree(dcons); cudaFree(d_locations); cudaFree(temp); if(posS != NULL) cudaFree(posS); if(posR != NULL) cudaFree(posR); return 0; } res[sLen] = sum; if(pos == (rule->num_rows - 3)) { sizepro = rule->num_columns * sizeof(int); cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice); resSize = sum * sizepro; reservar(&d_Rout, resSize); if(numj > 2) { cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice); multiJoinWithWrite2<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin); } else gJoinWithWrite2<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS); } else { sizepro = projp.y * sizeof(int); cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice); resSize = sum * sizepro; reservar(&d_Rout, resSize); if(numj > 2) { cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice); multiJoinWithWrite<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin); } else gJoinWithWrite<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS); } } cudaFree(dcons); cudaFree(d_locations); cudaFree(temp); if(posS != NULL) cudaFree(posS); if(posR != NULL) cudaFree(posR); if(*ret != NULL) cudaFree(*ret); *ret = d_Rout; #ifdef TIMER cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); //cout << "Join = " << time << endl; //cout << "FIN" << endl; cuda_stats.join_time += time; #endif return sum; }