This repository has been archived on 2023-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
yap-6.3/packages/cuda/treeb.cu
2013-11-03 14:13:08 +00:00

1231 lines
30 KiB
Plaintext
Executable File

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/scan.h>
#include <iostream>
#include <cstdarg>
#include <cstdio>
#include "lista.h"
//#include "scanImpl.cu"
#define WARP_SIZE 32
#define TREE_NODE_SIZE WARP_SIZE
#define TREE_FANOUT (TREE_NODE_SIZE + 1)
#define N_MULTI_P 16
#define BLCK_PER_MP_create 256 // blocks per multiprocessor during tree creation
#define BLCK_PER_MP_search 512 // blocks per multiprocessor during tree searching
#define WAPRS_PER_BLCK_join 8//16 // blocks per multiprocessor during tree creation
#define BLCK_PER_MP_join 512//256 // blocks per multiprocessor during tree searching
#define THRD_PER_BLCK_create TREE_NODE_SIZE
#define BLCK_PER_GRID_create (N_MULTI_P * BLCK_PER_MP_create)
#define THRD_PER_BLCK_search TREE_NODE_SIZE
#define BLCK_PER_GRID_search (N_MULTI_P * BLCK_PER_MP_search)
#define THRD_PER_GRID_search (THRD_PER_BLCK_search * BLCK_PER_GRID_search)
#define THRD_PER_BLCK_join (WARP_SIZE * WAPRS_PER_BLCK_join)
#define BLCK_PER_GRID_join (N_MULTI_P * BLCK_PER_MP_join)
#define THRD_PER_GRID_join (THRD_PER_BLCK_join * BLCK_PER_GRID_join)
#define TEST_MAX 100
typedef int IKeyType;
typedef int Record;
typedef struct {
int keys[TREE_NODE_SIZE];
} IDirectoryNode;
typedef struct {
Record records[TREE_NODE_SIZE];
} IDataNode;
typedef struct {
IDataNode* data;
unsigned int nDataNodes;
IDirectoryNode* dir;
unsigned int nDirNodes;
} CUDA_CSSTree;
__host__ __device__ unsigned int uintCeilingLog(unsigned int base, unsigned int num)
{
unsigned int result = 0;
for(unsigned int temp = 1; temp < num; temp *= base)
result++;
return result;
}
__host__ __device__ unsigned int uintCeilingDiv(unsigned int dividend, unsigned int divisor)
{
return (dividend + divisor - 1) / divisor;
}
__host__ __device__ unsigned int uintPower(unsigned int base, unsigned int pow)
{
unsigned int result = 1;
for(; pow; pow--)
result *= base;
return result;
}
__device__ int getRightMostDescIdx(int tree_size, int nodeIdx)
{
int tmp = nodeIdx * TREE_NODE_SIZE + TREE_FANOUT;
int n = uintCeilingLog(TREE_FANOUT, uintCeilingDiv(TREE_NODE_SIZE * tree_size + TREE_FANOUT, tmp)) - 1;
int result = (tmp * uintPower(TREE_FANOUT, n) - TREE_FANOUT) / TREE_NODE_SIZE;
return result;
}
__device__ int getDataArrayIdx(int dirSize, int tree_size, int bottom_start, int treeIdx)
{
int idx;
if(treeIdx < dirSize) {
idx = tree_size - bottom_start - 1;
}
else if( treeIdx < bottom_start ) {
idx = tree_size - bottom_start + treeIdx - dirSize;
}
else {
idx = treeIdx - bottom_start;
}
return idx;
}
// Binary Search
__device__ int firstMatchingKeyInDirNode1(int keys[], int key)
{
int min = 0;
int max = TREE_NODE_SIZE;
int mid;
int cut;
while(max - min > 1) {
mid = (min + max) / 2;
cut = keys[mid];
if(key > cut)
min = mid;
else
max = mid;
}
if(keys[min] >= key)
return min;
return max;
}
// Binary Search
__device__ int firstMatchingKeyInDataNode2(Record records[], IKeyType key)
{
int min = 0;
int max = TREE_NODE_SIZE;
int mid;
int cut;
while(max - min > 1) {
mid = (min + max) / 2;
cut = records[mid];
if(key > cut)
min = mid;
else
max = mid;
}
if(records[min] == key)
return min;
if(max < TREE_NODE_SIZE && records[max] == key)
return max;
return -1;
}
__global__ void gCreateIndex(IDataNode data[], IDirectoryNode dir[], int dirSize, int tree_size, int bottom_start, int nNodesPerBlock)
{
int startIdx = blockIdx.x * nNodesPerBlock;
int endIdx = startIdx + nNodesPerBlock;
if(endIdx > dirSize)
endIdx = dirSize;
int keyIdx = threadIdx.x;
// Proceed only when in internal nodes
for(int nodeIdx = startIdx; nodeIdx < endIdx; nodeIdx++)
{
int childIdx = nodeIdx * TREE_FANOUT + keyIdx + 1; // One step down to the left
// Then look for the right most descendent
int rightMostDesIdx;
// Common cases
if(childIdx < tree_size) {
rightMostDesIdx = getRightMostDescIdx(tree_size, childIdx);
}
// versus the unusual case when the tree is incomplete and the node does not have the full set of children
else {
// pick the last node in the tree (largest element of the array)
rightMostDesIdx = tree_size - 1;
}
int dataArrayIdx = getDataArrayIdx(dirSize, tree_size, bottom_start, rightMostDesIdx);
dir[nodeIdx].keys[keyIdx] = data[dataArrayIdx].records[TREE_NODE_SIZE - 1];
}
}
__global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir, int nDirNodes, int lvlDir, Record* arr, int locations[], int nSearchKeys, int nKeysPerThread, int tree_size, int bottom_start)
{
// Bringing the root node (visited by every tuple) to the faster shared memory
__shared__ IKeyType RootNodeKeys[TREE_NODE_SIZE];
RootNodeKeys[threadIdx.x] = dir->keys[threadIdx.x];
__syncthreads();
int OverallThreadIdx = blockIdx.x * THRD_PER_BLCK_search + threadIdx.x;
for(int keyIdx = OverallThreadIdx; keyIdx < nSearchKeys; keyIdx += THRD_PER_GRID_search)
{
IKeyType val = arr[keyIdx];
int loc = firstMatchingKeyInDirNode1(RootNodeKeys, val) + 1;
for(int i = 1; i < lvlDir && loc < nDirNodes; i++) {
int kid = firstMatchingKeyInDirNode1(dir[loc].keys, val);
loc = loc * TREE_FANOUT + kid + 1;
}
if(loc >= tree_size)
loc = nDataNodes - 1;
else
loc = getDataArrayIdx(nDirNodes, tree_size, bottom_start, loc);
int offset = firstMatchingKeyInDataNode2(data[loc].records, val);
locations[keyIdx] = (offset <0)?-1:(loc * TREE_NODE_SIZE + offset);
}
}
__global__ void gIndexJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[])
{
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
if(s_cur < sLen)
{
int count = 1;
int r_cur = g_locations[s_cur];
int s_key;
if(r_cur >= 0) /*&& r_cur < rLen) Tal vez la segunda parte no sea necesaria*/
{
s_key = S[s_cur];
r_cur++;
while(s_key == R[r_cur])
{
count++;
r_cur++;
}
g_ResNums[s_cur] = count;
}
}
}
__global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[], int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
{
extern __shared__ int shared[];
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
int posr, poss, x, y, ini;
if(threadIdx.x < wj)
shared[threadIdx.x] = muljoin[threadIdx.x];
__syncthreads();
if(s_cur < sLen)
{
int count = 1;
int r_cur = g_locations[s_cur];
int s_key;
if(r_cur >= 0) /*&& r_cur < rLen) Tal vez la segunda parte no sea necesaria*/
{
s_key = S[s_cur];
r_cur++;
while(s_key == R[r_cur])
{
count++;
r_cur++;
}
if(sloc == NULL)
poss = s_cur * of2;
else
poss = sloc[s_cur] * of2;
ini = r_cur - count;
for(y = ini; y < r_cur; y++)
{
posr = mloc[y] * of1;
for(x = 0; x < wj; x += 2)
{
if(p1[posr + shared[x]] != p2[poss + shared[x+1]])
{
count--;
break;
}
}
}
if(count > 0)
g_ResNums[s_cur] = count;
}
}
}
__global__ void multiJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc, int wj)
{
extern __shared__ int shared[];
int *extjoins = &shared[lenrul];
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
if(threadIdx.x < (lenrul + wj))
shared[threadIdx.x] = rule[threadIdx.x];
__syncthreads();
if(s_cur < sLen)
{
int r_cur = g_locations[s_cur];
if(r_cur >= 0)
{
int x, y, pos, posr, poss;
int num1 = g_PrefixSums[s_cur];
int num2 = g_PrefixSums[s_cur+1];
int tmp1, tmp2;
if(sloc == NULL)
poss = s_cur * of2;
else
poss = sloc[s_cur] * of2;
for(x = num1; x < num2; x++, r_cur++)
{
pos = mloc[r_cur] * of1;
for(y = 0; y < wj; y += 2)
{
tmp1 = p1[pos + extjoins[y]];
tmp2 = p2[poss + extjoins[y+1]];
if(tmp1 != tmp2)
break;
}
if(y < wj)
{
x--;
continue;
}
posr = x * lenrul;
for(y = 0; y < halfrul; y++)
g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
for(; y < lenrul; y++)
g_joinResultBuffers[posr + y] = p2[poss + shared[y]];
}
}
}
}
__global__ void multiJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc, int wj)
{
extern __shared__ int shared[];
int *extjoins = &shared[cols];
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
if(threadIdx.x < (cols + wj))
shared[threadIdx.x] = rule[threadIdx.x];
__syncthreads();
if(s_cur < sLen)
{
int r_cur = g_locations[s_cur];
if(r_cur >= 0)
{
int x, y, pos, pos2, posr, cond;
int num1 = g_PrefixSums[s_cur];
int num2 = g_PrefixSums[s_cur+1];
if(sloc == NULL)
pos2 = s_cur * of2 - 1;
else
pos2 = sloc[s_cur] * of2 - 1;
for(x = num1; x < num2; x++, r_cur++)
{
pos = mloc[r_cur] * of1 - 1;
for(y = 0; y < wj; y += 2)
{
if(p1[pos + extjoins[y] + 1] != p2[pos2 + extjoins[y+1] + 1])
break;
}
if(y < wj)
{
x--;
continue;
}
posr = x * cols;
for(y = 0; y < cols; y++)
{
cond = shared[y];
if(cond > 0)
g_joinResultBuffers[posr + y] = p1[pos + cond];
else
g_joinResultBuffers[posr + y] = p2[pos2 - cond];
}
}
}
}
}
__global__ void gJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc)
{
extern __shared__ int shared[];
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
if(threadIdx.x < lenrul)
shared[threadIdx.x] = rule[threadIdx.x];
__syncthreads();
if(s_cur < sLen)
{
int r_cur = g_locations[s_cur];
if(r_cur >= 0)
{
int x, y, pos, posr, poss;
int num1 = g_PrefixSums[s_cur];
int num2 = g_PrefixSums[s_cur+1];
if(sloc == NULL)
poss = s_cur * of2;
else
poss = sloc[s_cur] * of2;
for(x = num1; x < num2; x++, r_cur++)
{
pos = mloc[r_cur] * of1;
posr = x * lenrul;
for(y = 0; y < halfrul; y++)
g_joinResultBuffers[posr + y] = p1[pos + shared[y]];
for(; y < lenrul; y++)
g_joinResultBuffers[posr + y] = p2[poss + shared[y]];
}
}
}
}
__global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc)
{
extern __shared__ int shared[];
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
if(threadIdx.x < cols)
shared[threadIdx.x] = rule[threadIdx.x];
__syncthreads();
if(s_cur < sLen)
{
int r_cur = g_locations[s_cur];
if(r_cur >= 0)
{
int x, y, pos, pos2, posr, cond;
int num1 = g_PrefixSums[s_cur];
int num2 = g_PrefixSums[s_cur+1];
if(sloc == NULL)
pos2 = s_cur * of2 - 1;
else
pos2 = sloc[s_cur] * of2 - 1;
for(x = num1; x < num2; x++, r_cur++)
{
pos = mloc[r_cur] * of1 - 1;
posr = x * cols;
for(y = 0; y < cols; y++)
{
cond = shared[y];
if(cond > 0)
g_joinResultBuffers[posr + y] = p1[pos + cond];
else
g_joinResultBuffers[posr + y] = p2[pos2 - cond];
}
}
}
}
}
int compare2 (const void * a, const void * b)
{
return ( ((int2*)a)->y - ((int2*)b)->y );
}
/*void generateSort(Record *R, int maxmax, int rLen, int seed)
{
int i=0;
const int offset=(1<<15)-1;
srand(seed);
for(i=0;i<rLen;i++)
{
R[i].y=((((rand()& offset)<<15)+(rand()&1))+(rand()<<1)+(rand()&1))%maxmax;
}
qsort(R,rLen,sizeof(Record),compare);
for(i=0;i<rLen;i++)
R[i].x=i;
}
void generateRand(Record *R, int maxmax, int rLen, int seed)
{
int i=0;
const int offset=(1<<15)-1;
srand(seed);
for(i=0;i<rLen;i++)
{
R[i].y=((((rand()& offset)<<15)+(rand()&1))+(rand()<<1)+(rand()&1))%maxmax;
//R[i].x=i+1;
R[i].x=i;
}
}*/
__global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *ids)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
int cond;
if(id < len)
{
cond = pos[id+1];
if(pos[id] != cond && cond > 0)
{
R[cond-1] = p[id * of + wj];
ids[cond-1] = id;
}
}
}
__global__ void llenar2(int *p, int *R, int len, int of, int wj, int *pos)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
int cond;
if(id < len)
{
cond = pos[id+1];
if(pos[id] != cond && cond > 0)
R[cond-1] = p[id * of + wj];
}
}
__global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id < len)
R[id] = p[id * of + wj];
}
/*__global__ smalljoinc(int *p1, int *p2, int rLen, int sLen, int2 wj, int *r)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id < rlen)
{
int comp = p1[id * of1 + wj.x];
int x, cnt = 0;
for(x = 0; x < sLen; x++)
{
if(comp == p2[x * of2 ])
cnt++;
}
r[id] = cnt;
}
}
__global__ smalljoinw(int *p1, int *p2, int wj, int *r)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id < rlen)
{
int comp = p1[id];
int x;
for(x = 0; x < sLen; x++)
{
if(comp == p2[x])
cnt++;
}
r[id] = cnt;
}
}*/
int buscarunion(int *tmprule, int tmplen, int *rule, int pos2, int tam2, int *joins)
{
int x, y;
int cont = 0;
for(x = 0; x < tmplen; x++)
{
for(y = pos2; y < (pos2 + tam2); y++)
{
if(tmprule[x] == rule[y])
{
joins[cont] = x;
cont++;
joins[cont] = y - pos2;
cont++;
}
}
}
return cont;
}
int not_in(int *rule, int len, int bus)
{
int x;
for(x = 0; x < len; x++)
{
if(rule[x] == bus)
return 0;
}
return 1;
}
int posiciones(int ini, int of1, int *firstpart, int *hpos, int *hcons, int *rule, int *temprule, int *tmprulpos, int *lenrul, int pos2, int of2, int posf)
{
int cont = 0, x, y;
for(y = ini; y < of1; y++)
{
if(firstpart[y] < 0)
{
hpos[cont] = y - ini;
hcons[cont] = -firstpart[y];
cont++;
}
else
{
x = 1;
while(rule[x] != 0)
{
if(firstpart[y] == rule[x])
{
if(not_in(temprule, *lenrul, firstpart[y]))
{
temprule[*lenrul] = firstpart[y];
tmprulpos[*lenrul] = y - ini;
*lenrul = *lenrul + 1;
}
break;
}
x++;
}
if(rule[x] != 0)
continue;
for(x = (pos2 + of2 + 1); x < posf; x++)
{
if(rule[x] == 0)
{
x++;
continue;
}
if(firstpart[y] == rule[x])
{
if(not_in(temprule, *lenrul, firstpart[y]))
{
temprule[*lenrul] = firstpart[y];
tmprulpos[*lenrul] = y - ini;
*lenrul = *lenrul + 1;
}
break;
}
}
}
}
return cont;
}
void join_final(int cols, int of1, int of2, int *rule, int *firstpart, int *secondpart, int *tmprulpos)
{
int x, y;
for(y = 0; y < cols; y++)
{
for(x = 0; x < of1; x++)
{
if(rule[y] == firstpart[x])
{
tmprulpos[y] = x + 1;
break;
}
}
if(x != of1)
continue;
for(x = 0; x < of2; x++)
{
if(rule[y] == secondpart[x])
{
tmprulpos[y] = -x - 1;
break;
}
}
}
}
int select_pos(int of1, int *firstpart, int *hpos, int *hcons)
{
int cont = 0, y;
for(y = 0; y < of1; y++)
{
if(firstpart[y] < 0)
{
hpos[cont] = y;
hcons[cont] = -firstpart[y];
cont++;
}
}
return cont;
}
int checkquery(int *hpos, int *hcons, int cont, int *pred, int of1, int *rule, int *query, int cols)
{
int x, y;
if(rule[0] != query[0])
return cont;
for(x = 1; x <= cols; x++)
{
if(query[x] < 0)
{
for(y = 0; y < of1; y++)
{
if(pred[y] == rule[x])
{
hpos[cont] = y;
hcons[cont] = -query[x];
cont++;
}
}
}
}
return cont;
}
int maximo(int count, ...)
{
va_list ap;
int j, temp, mx = 0;
va_start(ap, count);
for(j = 0; j < count; j++)
{
temp = va_arg(ap, int);
if(temp > mx)
mx = temp;
}
va_end(ap);
return mx;
}
template <typename KeyVector, typename PermutationVector, typename TempVector>
void update_permutation(KeyVector& keys, PermutationVector& permutation, TempVector& temporary, int rows)
{
// permute the keys with the current reordering
thrust::gather(permutation, permutation + rows, keys, temporary);
// stable_sort the permuted keys and update the permutation
thrust::stable_sort_by_key(temporary, temporary + rows, permutation);
}
template <typename KeyVector, typename PermutationVector, typename TempVector>
void apply_permutation(KeyVector& keys, PermutationVector& permutation, TempVector& temporary, int rows)
{
// permute the keys
thrust::gather(permutation, permutation + rows, temporary, keys);
}
int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>::iterator rule, int pos, int bothops, int **ret)
{
int pos2 = pos + 1;
int *sel1, nsel1;
int *sel2 = rule->select[pos2];
int nsel2 = rule->numsel[pos2];
int *proj = rule->project[pos];
int2 projp = rule->projpos[pos];
int *sjoin1, nsj1;
int *sjoin2 = rule->selfjoin[pos2];
int nsj2 = rule->numselfj[pos2];
int *wherej = rule->wherejoin[pos];
int numj = rule->numjoin[pos];
int flag;
#if TIMER
cuda_stats.joins++;
#endif
int porLiberar = rLen * of1 * sizeof(int);
int size, sizet, sizet2;
if(bothops)
{
sel1 = rule->select[pos];
nsel1 = rule->numsel[pos];
sjoin1 = rule->selfjoin[pos];
nsj1 = rule->numselfj[pos];
sizet = maximo(7, of1, of2, nsel1, nsel2, projp.y + numj - 2, nsj1, nsj2) * sizeof(int);
}
else
sizet = maximo(6, of1, of2, nsel2, projp.y + numj - 2, nsj2, numj) * sizeof(int);
int *dcons, *temp;
int *d_R, *d_S;
int blockllen, numthreads;
//int por_liberar = rLen * sizeof(int);
int extraspace = TREE_NODE_SIZE - rLen % TREE_NODE_SIZE;
int m32rLen = rLen + extraspace;
if(m32rLen > sLen)
sizet2 = (m32rLen + 1) * sizeof(int);
else
sizet2 = (sLen + 1) * sizeof(int);
/*hcons = (int *)malloc(sizet);
hpos = (int *)malloc(sizet);
int dconsize = sizet * 2;*/
reservar(&dcons, sizet);
#ifdef DEBUG_MEM
cerr << "+ " << dcons << " dcons tree " << sizet << endl;
#endif
reservar(&temp, sizet2);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp tree " << sizet2 << endl;
#endif
thrust::device_ptr<int> res = thrust::device_pointer_cast(temp);
numthreads = 1024;
blockllen = sLen / numthreads + 1;
int memSizeS, newLen;
int *posR = NULL, *posS = NULL;
#ifdef TIMER
//cout << "INICIO" << endl;
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
#endif
//cout << "sLen y rLen = " << sLen << " " << rLen << endl;
if(nsel2 > 0)
{
size = nsel2 * sizeof(int);
newLen = sLen + 1;
cudaMemsetAsync(temp, 0, newLen * sizeof(int));
cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
marcar<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
/*int y;
int *htemp = (int *)malloc(newLen * sizeof(int));
cout << "temp =" << endl;
cudaMemcpy(htemp, temp, newLen * sizeof(int), cudaMemcpyDeviceToHost);
for(y = 0; y < newLen; y++)
cout << htemp[y] << " ";
cout << endl;
free(htemp);*/
if(nsj2 > 0)
{
size = nsj2 * sizeof(int);
cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
}
/*htemp = (int *)malloc(newLen * sizeof(int));
cout << "temp =" << endl;
cudaMemcpy(htemp, temp, newLen * sizeof(int), cudaMemcpyDeviceToHost);
for(y = 0; y < newLen; y++)
cout << res[y] << " ";
cout << endl;
free(htemp);*/
/*cout << "Despues de marcar" << endl;
cout << "newLen = " << newLen << endl;*/
flag = 0;
while(flag != 1)
{
try
{
thrust::inclusive_scan(res + 1, res + newLen, res + 1);
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("inclusive scan in join", 0);
}
}
//thrust::inclusive_scan(res + 1, res + newLen, res + 1);
newLen = res[sLen];
if(newLen == 0)
return 0;
memSizeS = newLen * sizeof(int);
reservar(&d_S, memSizeS);
#if DEBUG_MEM
cerr << "+ " << d_S << " d_S " << memSizeS << endl;
#endif
reservar(&posS, memSizeS);
#if DEBUG_MEM
cerr << "+ " << posS << " posS " << memSizeS << endl;
#endif
llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
sLen = newLen;
}
else
{
if(nsj2 > 0)
{
size = nsj2 * sizeof(int);
newLen = sLen + 1;
cudaMemsetAsync(temp, 0, newLen * sizeof(int));
cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
samejoin2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
thrust::inclusive_scan(res + 1, res + newLen, res + 1);
newLen = res[sLen];
if(newLen == 0)
return 0;
memSizeS = newLen * sizeof(int);
reservar(&d_S, memSizeS);
#ifdef DEBUG_MEM
cerr << "+ " << d_S << " d_S m " << memSizeS << endl;
#endif
reservar(&posS, memSizeS);
#ifdef DEBUG_MEM
cerr << "+ " << posS << " posS m " << memSizeS << endl;
#endif
llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
sLen = newLen;
}
else
{
memSizeS = sLen * sizeof(int);
reservar(&d_S, memSizeS);
#ifdef DEBUG_MEM
cerr << "+ " << d_S << " d_S n " << memSizeS << endl;
#endif
llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]);
}
}
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
//cout << "Select1 = " << time << endl;
cuda_stats.select1_time += time;
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
#endif
blockllen = rLen / numthreads + 1;
int sizem32, sizextra;
if(bothops)
{
if(nsel1 > 0)
{
size = nsel1 * sizeof(int);
newLen = rLen + 1;
cudaMemsetAsync(temp, 0, newLen * sizeof(int));
cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
marcar<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
if(nsj1 > 0)
{
size = nsj1 * sizeof(int);
cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
}
thrust::inclusive_scan(res + 1, res + newLen, res + 1);
newLen = res[rLen];
if(newLen == 0)
return 0;
extraspace = TREE_NODE_SIZE - newLen % TREE_NODE_SIZE;
sizextra = extraspace * sizeof(int);
m32rLen = newLen + extraspace;
sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << d_R << " d_R m " << sizem32 << endl;
#endif
reservar(&posR, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << posR << " posR m " << sizem32 << endl;
#endif
cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
rLen = newLen;
}
else
{
if(nsj1 > 0)
{
size = nsj1 * sizeof(int);
newLen = rLen + 1;
cudaMemsetAsync(temp, 0, newLen * sizeof(int));
cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
samejoin2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
thrust::inclusive_scan(res + 1, res + newLen, res + 1);
newLen = res[rLen];
if(newLen == 0)
return 0;
extraspace = TREE_NODE_SIZE - newLen % TREE_NODE_SIZE;
sizextra = extraspace * sizeof(int);
m32rLen = newLen + extraspace;
sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << d_R << " d_R n " << sizem32 << endl;
#endif
reservar(&posR, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << posR << " posR n " << sizem32 << endl;
#endif
cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
rLen = newLen;
}
else
{
sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << d_R << " d_R sizem32 " << sizem32 << endl;
#endif
cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
}
}
}
else
{
sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << d_R << " d_R sz " << sizem32 << endl;
#endif
cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
}
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
//cout << "Select2 = " << time << endl;
cuda_stats.select2_time += time;
#endif
/*free(hcons);
free(hpos);
h_R = (int *)malloc(sizem32);
cudaMemcpy(h_R, d_S, memSizeS, cudaMemcpyDeviceToHost);
cout << "H_S " << "cont " << cont << " sLen " << sLen << endl;
for(x = 0; x < sLen; x++)
cout << h_R[x] << endl;
free(h_R);*/
#ifdef TIMER
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
#endif
thrust::device_ptr<Record> dvp1 = thrust::device_pointer_cast(d_R);
thrust::device_ptr<Record> permutation;
if(posR == NULL)
{
reservar(&posR, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << posR << " posR m32 " << sizem32 << endl;
#endif
permutation = thrust::device_pointer_cast(posR);
thrust::sequence(permutation, permutation + m32rLen);
}
else
permutation = thrust::device_pointer_cast(posR);
flag = 0;
while(flag != 1)
{
try
{
thrust::stable_sort_by_key(dvp1, dvp1 + m32rLen, permutation);
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("inclusive scan in join", 0);
}
}
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
//cout << "Sort = " << time << endl;
cuda_stats.sort_time += time;
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
#endif
IDataNode* d_data;
IDirectoryNode* d_dir;
unsigned int nDataNodes;
nDataNodes = uintCeilingDiv(rLen, TREE_NODE_SIZE);
d_data=(IDataNode *)d_R;
unsigned int lvlDir = uintCeilingLog(TREE_FANOUT, nDataNodes);
unsigned int nDirNodes = uintCeilingDiv(nDataNodes - 1, TREE_NODE_SIZE);
unsigned int tree_size = nDirNodes + nDataNodes;
unsigned int bottom_start = (uintPower(TREE_FANOUT, lvlDir) - 1) / TREE_NODE_SIZE;
d_dir = (IDirectoryNode *)temp;
unsigned int nNodesPerBlock = uintCeilingDiv(nDirNodes, BLCK_PER_GRID_create);
dim3 Dbc(THRD_PER_BLCK_create, 1, 1);
dim3 Dgc(BLCK_PER_GRID_create, 1, 1);
gCreateIndex <<<Dgc, Dbc>>> (d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);
/*int y;
IDirectoryNode *h_dir = (IDirectoryNode*)malloc(sizeof(IDirectoryNode) * nDirNodes);
cudaMemcpy(h_dir, d_dir, sizeof(IDirectoryNode) * nDirNodes, cudaMemcpyDeviceToHost);
for(x = 0; x < nDirNodes; x++)
{
for(y = 0; y < TREE_NODE_SIZE; y++)
printf("%d ", h_dir[x].keys[y]);
printf("\n");
}
free(h_dir);*/
int *d_locations;
reservar(&d_locations, memSizeS);
#ifdef DEBUG_MEM
cerr << "+ " << d_locations << " d_locs n " << memSizeS << endl;
#endif
dim3 Dbs(THRD_PER_BLCK_search, 1, 1);
dim3 Dgs(BLCK_PER_GRID_search, 1, 1);
unsigned int nSearchKeys = sLen;
unsigned int nKeysPerThread = uintCeilingDiv(nSearchKeys, THRD_PER_GRID_search);
gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
cudaMemsetAsync(temp, 0, memSizeS);
blockllen = sLen / numthreads + 1;
int muljoin = 0, muljoinsize = 0;
if(numj > 2)
{
muljoin = numj - 2;
muljoinsize = muljoin * sizeof(int);
cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
gIndexMultiJoin<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
}
else
gIndexJoin<<<blockllen, numthreads>>> (d_R, d_S, d_locations, sLen, temp);
liberar(d_R, sizem32);
liberar(d_S, memSizeS);
int sum = res[sLen-1];
thrust::exclusive_scan(res, res + sLen, res);
sum += res[sLen-1];
if(sum == 0)
return 0;
res[sLen] = sum;
int *d_Rout;
int resSize, sizepro;
if(pos == (rule->num_rows - 3) && rule->num_bpreds.x == 0)
{
sizepro = rule->num_columns * sizeof(int);
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
resSize = sum * sizepro;
reservar(&d_Rout, resSize);
#ifdef DEBUG_MEM
cerr << "+ " << d_Rout << " d_Rout n " << resSize << endl;
#endif
if(numj > 2)
{
cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
multiJoinWithWrite2<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
}
else
gJoinWithWrite2<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
}
else
{
sizepro = projp.y * sizeof(int);
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
resSize = sum * sizepro;
reservar(&d_Rout, resSize);
#ifdef DEBUG_MEM
cerr << "+ " << d_Rout << " d_Rout 2 " << resSize << endl;
#endif
if(numj > 2)
{
cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
multiJoinWithWrite<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
}
else
gJoinWithWrite<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
}
liberar(dcons, sizet);
liberar(d_locations, memSizeS);
liberar(temp, sizet2);
liberar(posR, sizem32);
if(posS != NULL)
liberar(posS, memSizeS);
/*if(posS != NULL)
liberar(posS, memSizeS);
liberar(dtmprulpos, sizerul);
if(*ret != NULL)
liberar(*ret, por_liberar);
free(tmprulpos);
if(final_cond != posf)
{
free(*newrule);
*newrule = (int *)malloc(sizerul);
memcpy(*newrule, temprule, sizerul);
*newrullen = lenrul;
}*/
if(*ret != NULL)
liberar(*ret, porLiberar);
*ret = d_Rout;
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
//cout << "Join = " << time << endl;
//cout << "FIN" << endl;
cuda_stats.join_time += time;
#endif
return sum;
}