// yap-6.3/packages/cuda/treeb.cu
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/scan.h>
#include <iostream>
#include <cstdio>
#include "lista.h"
#include "bpreds.h"
#define WARP_SIZE 32
#define TREE_NODE_SIZE WARP_SIZE
#define TREE_FANOUT (TREE_NODE_SIZE + 1)
#define N_MULTI_P 16
#define BLCK_PER_MP_create 256 // blocks per multiprocessor during tree creation
#define BLCK_PER_MP_search 512 // blocks per multiprocessor during tree searching
#define WARPS_PER_BLCK_join 8 //16 // warps per block during join
#define BLCK_PER_MP_join 512 //256 // blocks per multiprocessor during join
#define THRD_PER_BLCK_create TREE_NODE_SIZE
#define BLCK_PER_GRID_create (N_MULTI_P * BLCK_PER_MP_create)
#define THRD_PER_BLCK_search TREE_NODE_SIZE
#define BLCK_PER_GRID_search (N_MULTI_P * BLCK_PER_MP_search)
#define THRD_PER_GRID_search (THRD_PER_BLCK_search * BLCK_PER_GRID_search)
#define THRD_PER_BLCK_join (WARP_SIZE * WARPS_PER_BLCK_join)
#define BLCK_PER_GRID_join (N_MULTI_P * BLCK_PER_MP_join)
#define THRD_PER_GRID_join (THRD_PER_BLCK_join * BLCK_PER_GRID_join)
#define TEST_MAX 100
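/*Derived launch dimensions, for reference: with the values above, tree creation runs
4096 blocks of 32 threads, tree search runs 8192 blocks of 32 threads (262144 threads
per grid), and joins run 8192 blocks of 256 threads.*/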
typedef int IKeyType;
typedef int Record;
typedef struct {
	int keys[TREE_NODE_SIZE];
} IDirectoryNode;
typedef struct {
	Record records[TREE_NODE_SIZE];
} IDataNode;
typedef struct {
	IDataNode* data;
	unsigned int nDataNodes;
	IDirectoryNode* dir;
	unsigned int nDirNodes;
} CUDA_CSSTree;
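/*The CSS-tree lives in two flat arrays: 'dir' holds the internal (directory) nodes and
'data' holds the sorted keys, TREE_NODE_SIZE per node. Child links are implicit index
arithmetic rather than pointers: child i of directory node n is node n * TREE_FANOUT + i + 1
(see gCreateIndex and gSearchTree below), which is what makes the layout GPU friendly.*/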
struct to_neg
{
	__host__ __device__
	bool operator()(const int &r1)
	{
		return r1 < 0;
	}
};
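/*Used with thrust::transform in join() below: maps unmatched search results (-1) to 1 so
that a subsequent inclusive_scan counts the rows kept by a negated predicate.*/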
__host__ __device__ unsigned int uintCeilingLog(unsigned int base, unsigned int num)
{
	unsigned int result = 0;
	for(unsigned int temp = 1; temp < num; temp *= base)
		result++;
	return result;
}
__host__ __device__ unsigned int uintCeilingDiv(unsigned int dividend, unsigned int divisor)
{
	return (dividend + divisor - 1) / divisor;
}
__host__ __device__ unsigned int uintPower(unsigned int base, unsigned int pow)
{
	unsigned int result = 1;
	for(; pow; pow--)
		result *= base;
	return result;
}
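/*Worked example of the sizing arithmetic used below (values purely illustrative): for
nDataNodes = 1000 leaf nodes and TREE_FANOUT = 33,
	lvlDir       = uintCeilingLog(33, 1000)    = 2 directory levels,
	nDirNodes    = uintCeilingDiv(999, 32)     = 32 directory nodes,
	tree_size    = 32 + 1000                   = 1032 nodes in total,
	bottom_start = (uintPower(33, 2) - 1) / 32 = 34,
the tree index at which the bottom level would start in a complete tree.*/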
__device__ int getRightMostDescIdx(int tree_size, int nodeIdx)
{
	int tmp = nodeIdx * TREE_NODE_SIZE + TREE_FANOUT;
	int n = uintCeilingLog(TREE_FANOUT, uintCeilingDiv(TREE_NODE_SIZE * tree_size + TREE_FANOUT, tmp)) - 1;
	int result = (tmp * uintPower(TREE_FANOUT, n) - TREE_FANOUT) / TREE_NODE_SIZE;
	return result;
}
__device__ int getDataArrayIdx(int dirSize, int tree_size, int bottom_start, int treeIdx)
{
	int idx;
	if(treeIdx < dirSize) {
		idx = tree_size - bottom_start - 1;
	}
	else if( treeIdx < bottom_start ) {
		idx = tree_size - bottom_start + treeIdx - dirSize;
	}
	else {
		idx = treeIdx - bottom_start;
	}
	return idx;
}
// Binary search inside a directory node: returns the index of the first key that is
// greater than or equal to 'key' (TREE_NODE_SIZE if all keys are smaller), i.e. the
// branch to descend into.
__device__ int firstMatchingKeyInDirNode1(int keys[], int key)
{
	int min = 0;
	int max = TREE_NODE_SIZE;
	int mid;
	int cut;
	while(max - min > 1) {
		mid = (min + max) / 2;
		cut = keys[mid];
		if(key > cut)
			min = mid;
		else
			max = mid;
	}
	if(keys[min] >= key)
		return min;
	return max;
}
// Binary search inside a data node: returns the slot of the first record equal to
// 'key', or -1 if the key is not present in the node.
__device__ int firstMatchingKeyInDataNode2(Record records[], IKeyType key)
{
	int min = 0;
	int max = TREE_NODE_SIZE;
	int mid;
	int cut;
	while(max - min > 1) {
		mid = (min + max) / 2;
		cut = records[mid];
		if(key > cut)
			min = mid;
		else
			max = mid;
	}
	if(records[min] == key)
		return min;
	if(max < TREE_NODE_SIZE && records[max] == key)
		return max;
	return -1;
}
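/*Example: for a node holding keys {10, 20, 20, 30, ...}, firstMatchingKeyInDirNode1
returns 1 for key = 20 and 3 for key = 25, while firstMatchingKeyInDataNode2 returns 1
for key = 20 and -1 for key = 25.*/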
__global__ void gCreateIndex(IDataNode data[], IDirectoryNode dir[], int dirSize, int tree_size, int bottom_start, int nNodesPerBlock)
{
	int startIdx = blockIdx.x * nNodesPerBlock;
	int endIdx = startIdx + nNodesPerBlock;
	if(endIdx > dirSize)
		endIdx = dirSize;
	int keyIdx = threadIdx.x;
	// Proceed only when in internal nodes
	for(int nodeIdx = startIdx; nodeIdx < endIdx; nodeIdx++)
	{
		int childIdx = nodeIdx * TREE_FANOUT + keyIdx + 1; // One step down to the left
		// Then look for the rightmost descendant
		int rightMostDesIdx;
		// Common case
		if(childIdx < tree_size) {
			rightMostDesIdx = getRightMostDescIdx(tree_size, childIdx);
		}
		// versus the unusual case when the tree is incomplete and the node does not have the full set of children
		else {
			// pick the last node in the tree (largest element of the array)
			rightMostDesIdx = tree_size - 1;
		}
		int dataArrayIdx = getDataArrayIdx(dirSize, tree_size, bottom_start, rightMostDesIdx);
		dir[nodeIdx].keys[keyIdx] = data[dataArrayIdx].records[TREE_NODE_SIZE - 1];
	}
}
__global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir, int nDirNodes, int lvlDir, Record* arr, int locations[], int nSearchKeys, int nKeysPerThread, int tree_size, int bottom_start)
{
	// Bring the root node (visited by every tuple) into the faster shared memory
	__shared__ IKeyType RootNodeKeys[TREE_NODE_SIZE];
	RootNodeKeys[threadIdx.x] = dir->keys[threadIdx.x];
	__syncthreads();
	int OverallThreadIdx = blockIdx.x * THRD_PER_BLCK_search + threadIdx.x;
	for(int keyIdx = OverallThreadIdx; keyIdx < nSearchKeys; keyIdx += THRD_PER_GRID_search)
	{
		IKeyType val = arr[keyIdx];
		int loc = firstMatchingKeyInDirNode1(RootNodeKeys, val) + 1;
		for(int i = 1; i < lvlDir && loc < nDirNodes; i++) {
			int kid = firstMatchingKeyInDirNode1(dir[loc].keys, val);
			loc = loc * TREE_FANOUT + kid + 1;
		}
		if(loc >= tree_size)
			loc = nDataNodes - 1;
		else
			loc = getDataArrayIdx(nDirNodes, tree_size, bottom_start, loc);
		int offset = firstMatchingKeyInDataNode2(data[loc].records, val);
		locations[keyIdx] = (offset < 0) ? -1 : (loc * TREE_NODE_SIZE + offset);
	}
}
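/*A minimal host-side sketch (not part of the original pipeline; the function name and
calling convention are illustrative only) of how the two kernels above cooperate:
gCreateIndex builds the directory over an already sorted key column padded with sentinels
to a multiple of TREE_NODE_SIZE, and gSearchTree then probes it, writing each probe key's
data-array position (or -1) into 'd_locations'. join() below does the same inline,
reusing its scratch buffers instead of allocating a directory of its own.*/
void exampleBuildAndSearch(int *d_keys, int nKeys, int *d_probe, int nProbe, int *d_locations)
{
	// Tree shape, computed exactly as in join() below.
	unsigned int nDataNodes = uintCeilingDiv(nKeys, TREE_NODE_SIZE);
	unsigned int lvlDir = uintCeilingLog(TREE_FANOUT, nDataNodes);
	unsigned int nDirNodes = uintCeilingDiv(nDataNodes - 1, TREE_NODE_SIZE);
	unsigned int tree_size = nDirNodes + nDataNodes;
	unsigned int bottom_start = (uintPower(TREE_FANOUT, lvlDir) - 1) / TREE_NODE_SIZE;
	IDirectoryNode *d_dir;
	cudaMalloc((void **)&d_dir, nDirNodes * sizeof(IDirectoryNode));
	// Build the directory, then search it for every probe key.
	gCreateIndex<<<BLCK_PER_GRID_create, THRD_PER_BLCK_create>>>((IDataNode *)d_keys, d_dir,
			nDirNodes, tree_size, bottom_start, uintCeilingDiv(nDirNodes, BLCK_PER_GRID_create));
	gSearchTree<<<BLCK_PER_GRID_search, THRD_PER_BLCK_search>>>((IDataNode *)d_keys, nDataNodes,
			d_dir, nDirNodes, lvlDir, d_probe, d_locations, nProbe,
			uintCeilingDiv(nProbe, THRD_PER_GRID_search), tree_size, bottom_start);
	cudaFree(d_dir);
}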
/*Counts the number of times a row in 'S' is to be joined to a row in 'R'.*/
__global__ void gIndexJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[])
{
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
	if(s_cur < sLen)
	{
		int count = 1;
		int r_cur = g_locations[s_cur];
		int s_key;
		if(r_cur >= 0)
		{
			s_key = S[s_cur];
			r_cur++;
			while(s_key == R[r_cur])
			{
				count++;
				r_cur++;
			}
			g_ResNums[s_cur] = count;
		}
	}
}
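/*Example: with sorted R = {3, 5, 5, 5, 9, ...} and S[s] = 5, g_locations[s] points at the
first 5, so the scan above walks the run of duplicates and stores g_ResNums[s] = 3. The
0x7f sentinel padding written by join() guarantees the run ends inside the array.*/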
/*Corrects 'gSearchTree' results when dealing with a negative multijoin. Uses the values
found in 'g_locations', which indicate, for each row in 'R', whether it is going to be
joined (non-negative position) or not (-1). Works by checking the additional join columns
(i.e., all except the two used by 'gSearchTree') and resetting to -1 those entries of
'g_locations' whose row has no complete match, so that exactly the rows without a match
survive the negation.*/
__global__ void gIndexMultiJoinNegative(int *R, int *S, int g_locations[], int rLen, int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
{
	extern __shared__ int shared[];
	int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
	int posr, poss, x;
	if(threadIdx.x < wj)
		shared[threadIdx.x] = muljoin[threadIdx.x];
	__syncthreads();
	if(r_cur < rLen)
	{
		int s_cur = g_locations[r_cur];
		int r_key;
		if(s_cur >= 0)
		{
			r_key = R[r_cur];
			if(mloc == NULL)
				posr = r_cur * of1;
			else
				posr = mloc[r_cur] * of1;
			while(r_key == S[s_cur])
			{
				poss = sloc[s_cur] * of2;
				for(x = 0; x < wj; x += 2)
				{
					if(p1[posr + shared[x]] != p2[poss + shared[x+1]])
						break;
				}
				if(x >= wj)
					return;
				s_cur++;
			}
			g_locations[r_cur] = -1;
		}
	}
}
/*Corrects 'gSearchTree' results when dealing with a multijoin. Uses the values found in
'g_locations', which indicate, for each row in 'S', whether it is going to be joined
(non-negative position) or not (-1). Works by checking the additional columns to be joined
(i.e., all except the two used by 'gSearchTree') and counting the number of times a row in
'S' is to be joined to its corresponding row in 'R', storing the count in 'g_ResNums'.*/
__global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[], int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
{
	extern __shared__ int shared[];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
	int posr, poss, x;
	if(threadIdx.x < wj)
		shared[threadIdx.x] = muljoin[threadIdx.x];
	__syncthreads();
	if(s_cur < sLen)
	{
		int count = 0;
		int r_cur = g_locations[s_cur];
		int s_key;
		if(r_cur >= 0)
		{
			s_key = S[s_cur];
			if(sloc == NULL)
				poss = s_cur * of2;
			else
				poss = sloc[s_cur] * of2;
			while(s_key == R[r_cur])
			{
				posr = mloc[r_cur] * of1;
				for(x = 0; x < wj; x += 2)
				{
					if(p1[posr + shared[x]] != p2[poss + shared[x+1]])
						break;
				}
				if(x >= wj)
					count++;
				r_cur++;
			}
			if(count > 0)
				g_ResNums[s_cur] = count;
		}
	}
}
/*Writes the result of the join and projects the necessary columns as defined by 'rule'.
The difference between this function and 'gJoinWithWrite' is the comparison of the
additional join columns.*/
__global__ void multiJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc, int wj)
{
	extern __shared__ int shared[];
	int *extjoins = &shared[lenrul];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
	if(threadIdx.x < (lenrul + wj))
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();
	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, posr, poss;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			int tmp1, tmp2;
			if(sloc == NULL)
				poss = s_cur * of2;
			else
				poss = sloc[s_cur] * of2;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1;
				for(y = 0; y < wj; y += 2) /*Additional comparison*/
				{
					tmp1 = p1[pos + extjoins[y]];
					tmp2 = p2[poss + extjoins[y+1]];
					if(tmp1 != tmp2)
						break;
				}
				if(y < wj)
				{
					x--;
					continue;
				}
				posr = x * lenrul;
				for(y = 0; y < halfrul; y++)
					g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
				for(; y < lenrul; y++)
					g_joinResultBuffers[posr + y] = p2[poss + shared[y]];
			}
		}
	}
}
/*Writes the result of the join and projects the necessary columns as defined by 'rule'.
The difference between this function and 'gJoinWithWrite2' is the comparison of the
additional join columns.*/
__global__ void multiJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc, int wj)
{
	extern __shared__ int shared[];
	int *extjoins = &shared[cols];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
	if(threadIdx.x < (cols + wj))
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();
	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, pos2, posr, cond;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			if(sloc == NULL)
				pos2 = s_cur * of2 - 1;
			else
				pos2 = sloc[s_cur] * of2 - 1;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1 - 1;
				for(y = 0; y < wj; y += 2) /*Additional comparison*/
				{
					if(p1[pos + extjoins[y] + 1] != p2[pos2 + extjoins[y+1] + 1])
						break;
				}
				if(y < wj)
				{
					x--;
					continue;
				}
				posr = x * cols;
				for(y = 0; y < cols; y++)
				{
					cond = shared[y];
					if(cond > 0)
						g_joinResultBuffers[posr + y] = p1[pos + cond];
					else
						g_joinResultBuffers[posr + y] = p2[pos2 - cond];
				}
			}
		}
	}
}
/*Writes the result of the join and projects the necessary columns as defined by 'rule'.
The difference between this function and 'gJoinWithWrite' is that only the columns of the
positive predicate are projected.*/
__global__ void gJoinWithWriteNegative(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int halfrul, int *mloc)
{
	extern __shared__ int shared[];
	int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
	int posr;
	if(threadIdx.x < halfrul)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();
	if(r_cur < rLen)
	{
		posr = g_locations[r_cur];
		if(g_locations[r_cur+1] != posr)
		{
			int y, pos;
			if(mloc == NULL)
				pos = r_cur * of1;
			else
				pos = mloc[r_cur] * of1;
			posr *= halfrul;
			for(y = 0; y < halfrul; y++)
				g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
		}
	}
}
/*Writes the result of the join and projects the necessary columns as defined by 'rule'.
The difference between this function and 'gJoinWithWrite2' is that only the columns of the
positive predicate are projected.*/
__global__ void gJoinWithWriteNegative2(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int cols, int *mloc)
{
	extern __shared__ int shared[];
	int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
	int posr;
	if(threadIdx.x < cols)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();
	if(r_cur < rLen)
	{
		posr = g_locations[r_cur];
		if(g_locations[r_cur+1] != posr)
		{
			int y, pos;
			if(mloc == NULL)
				pos = r_cur * of1 - 1;
			else
				pos = mloc[r_cur] * of1 - 1;
			posr *= cols;
			for(y = 0; y < cols; y++)
				g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
		}
	}
}
/*Writes the result of the join and projects the necessary columns as defined by 'rule'.*/
__global__ void gJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc)
{
	extern __shared__ int shared[];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
	if(threadIdx.x < lenrul)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();
	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, posr, poss;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			if(sloc == NULL)
				poss = s_cur * of2;
			else
				poss = sloc[s_cur] * of2;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1;
				posr = x * lenrul;
				for(y = 0; y < halfrul; y++)
					g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
				for(; y < lenrul; y++)
					g_joinResultBuffers[posr + y] = p2[poss + shared[y]];
			}
		}
	}
}
/*Writes the result of the join and projects the necessary columns as defined by 'rule'.
This version is used when performing the final join of the rule; its only difference is
the projection, which is based on the variables in the head of the rule.*/
__global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc)
{
	extern __shared__ int shared[];
	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
	if(threadIdx.x < cols)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();
	if(s_cur < sLen)
	{
		int r_cur = g_locations[s_cur];
		if(r_cur >= 0)
		{
			int x, y, pos, pos2, posr, cond;
			int num1 = g_PrefixSums[s_cur];
			int num2 = g_PrefixSums[s_cur+1];
			if(sloc == NULL)
				pos2 = s_cur * of2 - 1;
			else
				pos2 = sloc[s_cur] * of2 - 1;
			for(x = num1; x < num2; x++, r_cur++)
			{
				pos = mloc[r_cur] * of1 - 1;
				posr = x * cols;
				for(y = 0; y < cols; y++)
				{
					cond = shared[y];
					if(cond > 0)
						g_joinResultBuffers[posr + y] = p1[pos + cond];
					else
						g_joinResultBuffers[posr + y] = p2[pos2 - cond];
				}
			}
		}
	}
}
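/*Note on the projection encoding shared by gJoinWithWrite2 and multiJoinWithWrite2: each
entry of 'rule' is a signed, 1-based column reference. A positive value c selects column c
of the 'R' tuple (p1[pos + c], with pos pointing one element before the tuple), and a
negative value c selects column -c of the 'S' tuple (p2[pos2 - c]). For instance, rule =
{2, -1, 1} emits R-column 2, then S-column 1, then R-column 1.*/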
/*Loads part of column 'wj' of 'p' into 'R'. Which values are loaded is determined by the
prefix-sum results in 'pos'; 'ids' records the originating row of each loaded value.*/
__global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *ids)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int cond;
	if(id < len)
	{
		cond = pos[id];
		if(pos[id+1] != cond)
		{
			R[cond] = p[id * of + wj];
			ids[cond] = id;
		}
	}
}
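/*Example of the compaction pattern above: with selection flags {1, 0, 1} the inclusive
scan stored in 'pos' is {0, 1, 1, 2} (one leading zero), so row 0 is written to R[0]
(pos[1] != pos[0]), row 1 is skipped (pos[2] == pos[1]), and row 2 is written to R[1].*/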
/*Load an entire column from 'p' into 'R'.*/
__global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	if(id < len)
		R[id] = p[id * of + wj];
}
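/*Projects each row of 'res' into 'out', writing 'cols' values per row taken at the
(0-based) offsets listed in 'rule'.*/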
__global__ void projectfinal(int *res, int rows, int cols, int *rule, int *out)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	if(threadIdx.x < cols)
		shared[threadIdx.x] = rule[threadIdx.x];
	__syncthreads();
	if(id < rows)
	{
		id *= cols;
		for(int y = 0; y < cols; y++)
			out[id + y] = res[id + shared[y]];
	}
}
void project(int *res, int resrows, int numcols1, int numcols2, int *proj, int **ret, int type)
{
	int z, *dcons, *d_Rout;
	int numthreads = 1024;
	//numthreads = 32;
	int blockllen = resrows / numthreads + 1;
	int sizepro = numcols2 * sizeof(int);
	reservar(&dcons, sizepro);
	if(type)
	{
		int *pt = (int *)malloc(sizepro);
		for(z = 0; z < numcols2; z++)
			pt[z] = proj[z] - 1;
		cudaMemcpy(dcons, pt, sizepro, cudaMemcpyHostToDevice);
		//cudaDeviceSynchronize(); //Small cudaMemcpys are asynchronous; uncomment this line if the pointer is freed before it is copied.
		free(pt);
	}
	else
		cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
	reservar(&d_Rout, resrows * sizepro);
	projectfinal<<<blockllen, numthreads, sizepro>>>(res, resrows, numcols1, dcons, d_Rout);
	cudaFree(dcons);
	cudaFree(*ret);
	*ret = d_Rout;
}
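/*Builds a cross product of the tuples in 'dop1' and 'dop2': for each of the 'rows2'
tuples of 'dop2', 'rows1' output rows of 'hsize' columns are emitted, filled according to
the signed column references in 'dhead' (positive values select from 'dop1', the rest from
the current 'dop2' tuple). 'juntar' (Spanish for "join") is the host-side wrapper that
stages the projection on the device and allocates the result buffer.*/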
__global__ void projectadd(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, int *dhead, int hsize, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int pos2, posr, x, y, cond;
	if(threadIdx.x < hsize)
		shared[threadIdx.x] = dhead[threadIdx.x];
	__syncthreads();
	if(id < rows2)
	{
		posr = id * hsize * rows1;
		pos2 = id * cols2 - 1;
		for(x = 0; x < rows1; x++)
		{
			for(y = 0; y < hsize; y++)
			{
				cond = shared[y];
				if(cond > 0)
					res[posr + y] = dop1[cond-1];
				else
					res[posr + y] = dop2[pos2 - cond];
			}
			posr += hsize;
		}
	}
}
void juntar(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, int *proj, int pcols, int **ret)
{
	int sizepro, *dcons, *d_Rout;
	int numthreads = 1024;
	//numthreads = 32;
	int blockllen = rows2 / numthreads + 1;
	sizepro = pcols * sizeof(int);
	reservar(&dcons, sizepro);
	cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
	reservar(&d_Rout, rows1 * rows2 * sizepro);
	projectadd<<<blockllen, numthreads, sizepro>>>(dop1, dop2, rows1, rows2, cols1, cols2, dcons, pcols, d_Rout);
	cudaFree(dcons);
	*ret = d_Rout;
}
/*Joins two predicates. Starts by performing all preliminary operations (selections,
selfjoins, comparisons) on both predicates. Then a column pair is used to construct a
CSS-Tree, and that tree is searched for join positions. The positions feed a prefix sum
whose result tells each thread where to write its output. Multijoins and negative
predicates follow roughly the same process, but use different kernels.*/
int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>::iterator rule, int pos, int bothops, int **ret, int ANDlogic)
{
	int pos2 = pos + 1;
	int *sel1 = NULL, nsel1 = 0;
	int *sel2 = rule->select[pos2];
	int nsel2 = rule->numsel[pos2];
	int *proj = rule->project[pos];
	int2 projp = rule->projpos[pos];
	int *sjoin1 = NULL, nsj1 = 0;
	int *sjoin2 = rule->selfjoin[pos2];
	int nsj2 = rule->numselfj[pos2];
	int *pred1 = NULL;
	int2 npred1 = make_int2(0,0);
	int *pred2 = rule->preds[pos2];
	int2 npred2 = rule->numpreds[pos2];
	int npred2tot = npred2.x + npred2.y;
	int *wherej = rule->wherejoin[pos];
	int numj = rule->numjoin[pos];
	int negative = rule->negatives[pos2+1];
	int flag;
#ifdef ROCKIT
	ANDlogic = 0;
#endif
	if(negative)
		ANDlogic = 1;
#ifdef TIMER
	cuda_stats.joins++;
#endif
	int size, sizet, sizet2;
	if(bothops)
	{
		sel1 = rule->select[pos];
		nsel1 = rule->numsel[pos];
		sjoin1 = rule->selfjoin[pos];
		nsj1 = rule->numselfj[pos];
		pred1 = rule->preds[pos];
		npred1 = rule->numpreds[pos];
		sizet = maximo(10, of1, of2, nsel1, nsel2, projp.y + numj - 2, nsj1, nsj2, numj, npred1.x, npred2tot) * sizeof(int);
	}
	else
		sizet = maximo(7, of1, of2, nsel2, projp.y + numj - 2, nsj2, numj, npred2tot) * sizeof(int);
	int *dcons, *temp, *temp2 = NULL;
	int *d_R, *d_S;
	int blockllen, numthreads;
	int extraspace = TREE_NODE_SIZE - rLen % TREE_NODE_SIZE;
	int m32rLen = rLen + extraspace;
	int extraspaceS = TREE_NODE_SIZE - sLen % TREE_NODE_SIZE;
	int m32sLen = sLen + extraspaceS;
	if(m32rLen > m32sLen)
		sizet2 = (m32rLen + 1) * sizeof(int);
	else
		sizet2 = (m32sLen + 1) * sizeof(int);
	reservar(&dcons, sizet);
	reservar(&temp, sizet2);
	thrust::device_ptr<int> res = thrust::device_pointer_cast(temp);
	numthreads = 1024;
	//numthreads = 32;
	blockllen = sLen / numthreads + 1;
	int memSizeS, newLen = 0;
	int *posR = NULL, *posS = NULL;
	int sizem32S = 0, sizextra;
#ifdef TIMER
	//cout << "INICIO" << endl;
	cudaEvent_t start, stop;
	float time;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
#endif
	if(npred2.x > 0 || npred2.y > 0 || nsel2 > 0 || nsj2 > 0)
	{
		newLen = sLen + 1;
		cudaMemsetAsync(temp, 0, newLen * sizeof(int));
	}
	if(npred2.x > 0 || npred2.y > 0)
	{
		size = npred2tot * sizeof(int);
		cudaMemcpy(dcons, pred2, size, cudaMemcpyHostToDevice);
		if(npred2.y > 0) /*Fix case when a(X,Y),b(Y,Z),Z > Y*/
		{
			reservar(&temp2, sizet2);
			cudaMemsetAsync(temp2, 0, newLen * sizeof(int));
			//res = thrust::device_pointer_cast(temp2);
			bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, temp2 + 1);
		}
		else
		{
			if(negative)
				bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
			else
				bpredsOR<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
		}
		if(nsel2 > 0)
		{
			size = nsel2 * sizeof(int);
			cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
			marcar<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
		}
		if(nsj2 > 0)
		{
			size = nsj2 * sizeof(int);
			cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
			samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
		}
	}
	else
	{
		if(nsel2 > 0)
		{
			size = nsel2 * sizeof(int);
			cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
			marcar2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
			if(nsj2 > 0)
			{
				size = nsj2 * sizeof(int);
				cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
				samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
			}
		}
		else
		{
			if(nsj2 > 0)
			{
				size = nsj2 * sizeof(int);
				cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
				samejoin2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
			}
			else
			{
				sizem32S = m32sLen * sizeof(int);
				reservar(&d_S, sizem32S);
				cudaMemsetAsync(d_S + sLen, 0x7f, extraspaceS * sizeof(int));
				llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]);
			}
		}
	}
	if(npred2.x > 0 || npred2.y > 0 || nsel2 > 0 || nsj2 > 0)
	{
		flag = 0;
		while(flag != 1)
		{
			try
			{
				thrust::inclusive_scan(res + 1, res + newLen, res + 1);
				flag = 1;
			}
			catch(std::bad_alloc &e)
			{
				limpiar("inclusive scan in join", 0);
			}
		}
		newLen = res[sLen];
		if(newLen == 0) // && !negative) FIXME
		{
			cudaFree(temp);
			cudaFree(dcons);
			return 0;
		}
		extraspaceS = TREE_NODE_SIZE - newLen % TREE_NODE_SIZE;
		sizextra = extraspaceS * sizeof(int);
		m32sLen = newLen + extraspaceS;
		sizem32S = m32sLen * sizeof(int);
		reservar(&d_S, sizem32S);
		reservar(&posS, sizem32S);
		cudaMemsetAsync(d_S + newLen, 0x7f, sizextra);
		cudaMemsetAsync(posS + newLen, 0x7f, sizextra);
		llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
		sLen = newLen;
	}
#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Select1 = " << time << endl;
	cuda_stats.select1_time += time;
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
#endif
	blockllen = rLen / numthreads + 1;
	int sizem32;
	if(bothops)
	{
		if(temp2 != NULL)
		{
			cudaFree(temp);
			temp = temp2;
			res = thrust::device_pointer_cast(temp);
			newLen = rLen + 1;
			if(nsel1 > 0)
			{
				size = nsel1 * sizeof(int);
				cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
				marcar<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
			}
			if(nsj1 > 0)
			{
				size = nsj1 * sizeof(int);
				cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
				samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
			}
			if(npred1.x > 0)
			{
				size = npred1.x * sizeof(int);
				cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
				if(ANDlogic)
					bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
				else
					bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
			}
		}
		else
		{
			if(npred1.x > 0 || nsel1 > 0 || nsj1 > 0)
			{
				newLen = rLen + 1;
				cudaMemsetAsync(temp, 0, newLen * sizeof(int));
			}
			if(nsel1 > 0)
			{
				size = nsel1 * sizeof(int);
				cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
				marcar2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
				if(nsj1 > 0)
				{
					size = nsj1 * sizeof(int);
					cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
					samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
				}
				if(npred1.x > 0)
				{
					size = npred1.x * sizeof(int);
					cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
					if(ANDlogic)
						bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
					else
						bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
				}
			}
			else
			{
				if(nsj1 > 0)
				{
					size = nsj1 * sizeof(int);
					cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
					samejoin2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
					if(npred1.x > 0)
					{
						size = npred1.x * sizeof(int);
						cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
						if(ANDlogic)
							bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
						else
							bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
					}
				}
				else
				{
					if(npred1.x > 0)
					{
						size = npred1.x * sizeof(int);
						cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
						if(ANDlogic)
							bpredsnormal2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
						else
							bpredsorlogic2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
					}
				}
			}
		}
		if(temp2 != NULL || npred1.x > 0 || nsel1 > 0 || nsj1 > 0)
		{
			thrust::inclusive_scan(res + 1, res + newLen, res + 1);
			newLen = res[rLen];
			if(newLen == 0)
			{
				cudaFree(temp);
				cudaFree(dcons);
				cudaFree(d_S);
				if(posS != NULL)
					cudaFree(posS);
				return 0;
			}
			extraspace = TREE_NODE_SIZE - newLen % TREE_NODE_SIZE;
			sizextra = extraspace * sizeof(int);
			m32rLen = newLen + extraspace;
			sizem32 = m32rLen * sizeof(int);
			reservar(&d_R, sizem32);
			reservar(&posR, sizem32);
			cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
			cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
			llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
			rLen = newLen;
		}
		else
		{
			sizem32 = m32rLen * sizeof(int);
			reservar(&d_R, sizem32);
			cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
			llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
		}
	}
	else
	{
		sizem32 = m32rLen * sizeof(int);
		reservar(&d_R, sizem32);
		cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
		llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
	}
#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Select2 = " << time << endl;
	cuda_stats.select2_time += time;
#endif
#ifdef TIMER
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
#endif
	thrust::device_ptr<Record> dvp1;
	thrust::device_ptr<Record> permutation;
	if(negative)
	{
		dvp1 = thrust::device_pointer_cast(d_S);
		if(posS == NULL)
		{
			reservar(&posS, sizem32S);
			permutation = thrust::device_pointer_cast(posS);
			thrust::sequence(permutation, permutation + m32sLen);
		}
		else
			permutation = thrust::device_pointer_cast(posS);
		flag = 0;
		while(flag != 1)
		{
			try
			{
				thrust::stable_sort_by_key(dvp1, dvp1 + m32sLen, permutation);
				flag = 1;
			}
			catch(std::bad_alloc &e)
			{
				limpiar("sort in join", 0);
			}
		}
	}
	else
	{
		dvp1 = thrust::device_pointer_cast(d_R);
		if(posR == NULL)
		{
			reservar(&posR, sizem32);
			permutation = thrust::device_pointer_cast(posR);
			thrust::sequence(permutation, permutation + m32rLen);
		}
		else
			permutation = thrust::device_pointer_cast(posR);
		flag = 0;
		while(flag != 1)
		{
			try
			{
				thrust::stable_sort_by_key(dvp1, dvp1 + m32rLen, permutation);
				flag = 1;
			}
			catch(std::bad_alloc &e)
			{
				limpiar("sort in join", 0);
			}
		}
	}
#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Sort = " << time << endl;
	cuda_stats.sort_time += time;
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
#endif
	IDataNode* d_data;
	IDirectoryNode* d_dir;
	unsigned int nDataNodes;
	if(negative)
	{
		nDataNodes = uintCeilingDiv(sLen, TREE_NODE_SIZE);
		d_data = (IDataNode *)d_S;
	}
	else
	{
		nDataNodes = uintCeilingDiv(rLen, TREE_NODE_SIZE);
		d_data = (IDataNode *)d_R;
	}
	unsigned int lvlDir = uintCeilingLog(TREE_FANOUT, nDataNodes);
	unsigned int nDirNodes = uintCeilingDiv(nDataNodes - 1, TREE_NODE_SIZE);
	unsigned int tree_size = nDirNodes + nDataNodes;
	unsigned int bottom_start = (uintPower(TREE_FANOUT, lvlDir) - 1) / TREE_NODE_SIZE;
	d_dir = (IDirectoryNode *)temp;
	unsigned int nNodesPerBlock = uintCeilingDiv(nDirNodes, BLCK_PER_GRID_create);
	dim3 Dbc(THRD_PER_BLCK_create, 1, 1);
	dim3 Dgc(BLCK_PER_GRID_create, 1, 1);
	gCreateIndex <<<Dgc, Dbc>>> (d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);
	int *d_locations;
	int memSizeR;
	unsigned int nSearchKeys;
	if(negative)
	{
		memSizeR = (rLen + 1) * sizeof(int);
		reservar(&d_locations, memSizeR);
		cudaMemsetAsync(d_locations, 0, sizeof(int));
		nSearchKeys = rLen;
	}
	else
	{
		memSizeS = sLen * sizeof(int);
		reservar(&d_locations, memSizeS);
		nSearchKeys = sLen;
	}
	dim3 Dbs(THRD_PER_BLCK_search, 1, 1);
	dim3 Dgs(BLCK_PER_GRID_search, 1, 1);
	unsigned int nKeysPerThread = uintCeilingDiv(nSearchKeys, THRD_PER_GRID_search);
	if(negative)
	{
		gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_R, d_locations + 1, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
		cudaMemsetAsync(temp, 0, memSizeR);
	}
	else
	{
		gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
		cudaMemsetAsync(temp, 0, memSizeS);
	}
	int muljoin = 0, muljoinsize = 0, sum;
	int *d_Rout;
	int resSize, sizepro;
	if(negative)
	{
		blockllen = rLen / numthreads + 1;
		if(numj > 2)
		{
			muljoin = numj - 2;
			muljoinsize = muljoin * sizeof(int);
			cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
			gIndexMultiJoinNegative<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations + 1, rLen, p1, p2, of1, of2, posR, posS, dcons, muljoin);
		}
		res = thrust::device_pointer_cast(d_locations);
		thrust::transform(res + 1, res + rLen + 1, res + 1, to_neg());
		thrust::inclusive_scan(res + 1, res + rLen + 1, res + 1);
		sum = res[rLen];
		if(pos == (rule->num_rows - 3))
		{
			sizepro = rule->num_columns * sizeof(int);
			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
			resSize = sum * sizepro;
			reservar(&d_Rout, resSize);
			gJoinWithWriteNegative2<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, rule->num_columns, posR);
		}
		else
		{
			sizepro = projp.x * sizeof(int);
			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
			resSize = sum * sizepro;
			reservar(&d_Rout, resSize);
			gJoinWithWriteNegative<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, projp.x, posR);
		}
		cudaFree(d_R);
		cudaFree(d_S);
	}
	else
	{
		blockllen = sLen / numthreads + 1;
		if(numj > 2)
		{
			muljoin = numj - 2;
			muljoinsize = muljoin * sizeof(int);
			cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
			gIndexMultiJoin<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
		}
		else
			gIndexJoin<<<blockllen, numthreads>>> (d_R, d_S, d_locations, sLen, temp);
		cudaFree(d_R);
		cudaFree(d_S);
		sum = res[sLen-1];
		thrust::exclusive_scan(res, res + sLen, res);
		sum += res[sLen-1];
		if(sum == 0)
		{
			cudaFree(dcons);
			cudaFree(d_locations);
			cudaFree(temp);
			if(posS != NULL)
				cudaFree(posS);
			if(posR != NULL)
				cudaFree(posR);
			return 0;
		}
		res[sLen] = sum;
		if(pos == (rule->num_rows - 3))
		{
			sizepro = rule->num_columns * sizeof(int);
			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
			resSize = sum * sizepro;
			reservar(&d_Rout, resSize);
			if(numj > 2)
			{
				cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
				multiJoinWithWrite2<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
			}
			else
				gJoinWithWrite2<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
		}
		else
		{
			sizepro = projp.y * sizeof(int);
			cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
			resSize = sum * sizepro;
			reservar(&d_Rout, resSize);
			if(numj > 2)
			{
				cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
				multiJoinWithWrite<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
			}
			else
				gJoinWithWrite<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
		}
	}
	cudaFree(dcons);
	cudaFree(d_locations);
	cudaFree(temp);
	if(posS != NULL)
		cudaFree(posS);
	if(posR != NULL)
		cudaFree(posR);
	if(*ret != NULL)
		cudaFree(*ret);
	*ret = d_Rout;
#ifdef TIMER
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);
	//cout << "Join = " << time << endl;
	//cout << "FIN" << endl;
	cuda_stats.join_time += time;
#endif
	return sum;
}