Merge branch 'master' of ssh://git.dcc.fc.up.pt/yap-6.3

This commit is contained in:
Vítor Santos Costa 2013-10-17 00:45:25 +01:00
commit 8921889002
11 changed files with 236 additions and 35 deletions

View File

@ -251,6 +251,7 @@ Yap_InitPreAllocCodeSpace(int wid)
char *ptr;
UInt sz = REMOTE_ScratchPad(wid).msz;
if (REMOTE_ScratchPad(wid).ptr == NULL) {
#if USE_DL_MALLOC
LOCK(DLMallocLock);
@ -261,7 +262,13 @@ Yap_InitPreAllocCodeSpace(int wid)
tmalloc += sz;
sz += sizeof(CELL);
#endif
while (!(ptr = my_malloc(sz))) {
while (!(ptr =
#ifdef YAPOR_COPY
malloc(sz)
#else
my_malloc(sz)
#endif
)) {
REMOTE_PrologMode(wid) &= ~MallocMode;
#if USE_DL_MALLOC
UNLOCK(DLMallocLock);
@ -290,6 +297,7 @@ Yap_InitPreAllocCodeSpace(int wid)
}
AuxBase = (ADDR)(ptr);
AuxSp = (CELL *)(AuxTop = AuxBase+REMOTE_ScratchPad(wid).sz);
printf("wid=%d %p %p %p--%p\n", wid, AuxBase, AuxSp, Yap_HeapBase, H);
return ptr;
}

View File

@ -52,8 +52,14 @@ int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)
int tmplen = rows + 1;
int size = tmplen * sizeof(int);
reservar(&temp, size);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp bpreds " << size << endl;
#endif
cudaMemset(temp, 0, size);
#if TIMER
cuda_stats.builtins++;
#endif
int *dhead;
int predn = numpreds.x * 3;
int spredn = predn * sizeof(int);
@ -64,6 +70,9 @@ int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)
else
hsize = sproj;
reservar(&dhead, hsize);
#ifdef DEBUG_MEM
cerr << "+ " << dhead << " dhead " << hsize << endl;
#endif
cudaMemcpy(dhead, bin, spredn, cudaMemcpyHostToDevice);
int blockllen = rows / 1024 + 1;
@ -108,6 +117,9 @@ int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)
int *fres;
reservar(&fres, num * sproj);
#ifdef DEBUG_MEM
cerr << "+ " << fres << " fres " << num * sproj << endl;
#endif
cudaMemcpy(dhead, bin + predn, sproj, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, sproj>>>(dop1, rows, numpreds.y, temp, dhead, numpreds.z, fres);

View File

@ -37,10 +37,13 @@ int32_t Cuda_Erase(predicate *pred);
void init_cuda( void );
//#define DEBUG_INTERFACE 1
#if DEBUG_INTERFACE
static void
dump_mat(int32_t mat[], int32_t nrows, int32_t ncols)
{
return;
int32_t i, j;
for ( i=0; i< nrows; i++) {
printf("%d", mat[i*ncols]);
@ -56,7 +59,7 @@ dump_vec(int32_t vec[], int32_t rows)
{
int32_t i = 1;
int32_t j = 0;
printf("%d", vec[0]);
for (j = 0; j < rows; j++) {
for ( ; vec[i]; i++ ) {
printf(", %d", vec[i]);
@ -392,6 +395,12 @@ static int cuda_count( void )
return YAP_Unify(YAP_ARG2, YAP_MkIntTerm(n));
}
/*
 * Prolog-callable wrapper with no arguments (registered by init_cuda as
 * "cuda_statistics" with arity 0): asks the GPU engine to print its
 * statistics through Cuda_Statistics() and always reports success.
 */
static int
cuda_statistics(void)
{
  Cuda_Statistics();
  return TRUE;
}
static int first_time = TRUE;
void
@ -414,5 +423,6 @@ init_cuda(void)
YAP_UserCPredicate("cuda_eval", cuda_eval, 2);
YAP_UserCPredicate("cuda_coverage", cuda_coverage, 4);
YAP_UserCPredicate("cuda_count", cuda_count, 2);
YAP_UserCPredicate("cuda_statistics", cuda_statistics, 0);
}

View File

@ -4,6 +4,7 @@
cuda_erase/1,
cuda_eval/2,
cuda_coverage/4,
cuda_statistics/0,
cuda_count/2]).
tell_warning :-
@ -47,9 +48,9 @@ body_to_list( (B1, B2), LF, L0, N0, NF) :- !,
body_to_list( B1, LF, LI, N0, N1),
body_to_list( B2, LI, L0, N1, NF).
body_to_list( true, L, L, N, N) :- !.
body_to_list( B, [NB|L], L, N0, N) :-
body_to_list( B, NL, L, N0, N) :-
inline( B, NB ), !,
N is N0+1.
body_to_list( NB, NL, L, N0, N).
body_to_list( B, [B|L], L, N0, N) :-
N is N0+1.

View File

@ -13,6 +13,10 @@ extern "C" {
#define MAXVALS 200
#if TIMER
statinfo cuda_stats;
#endif
bool compare(const gpunode &r1, const gpunode &r2)
{
return (r1.name > r2.name);
@ -851,6 +855,30 @@ void mostrareglas(list<rulenode> aux)
cout << endl;
}
/*
 * Print the accumulated GPU evaluation statistics to stderr.
 *
 * The "GPU Statistics" header is always written. The detailed report
 * (number of calls, total/longest/fastest call time, time per step and
 * operation counters, all read from the global cuda_stats) is compiled in
 * only when TIMER is enabled.
 *
 * Takes no arguments and returns nothing; exported with C linkage.
 */
extern "C"
void Cuda_Statistics(void)
{
  cerr << "GPU Statistics" << endl;
#if TIMER
  /* Keep a space between each value and its unit, matching the per-step
     lines below (previously printed as e.g. "5times." / "12.3msec."). */
  cerr << "Called " << cuda_stats.calls << " times." << endl;
  cerr << "GPU time " << cuda_stats.total_time << " msec." << endl;
  cerr << "Longest call " << cuda_stats.max_time << " msec." << endl;
  cerr << "Fastest call " << cuda_stats.min_time << " msec." << endl << endl;

  /* Time accumulated in each evaluation step. */
  cerr << "Steps" << endl;
  cerr << " Select First: " << cuda_stats.select1_time << " msec." << endl;
  cerr << " Select Second: " << cuda_stats.select2_time << " msec." << endl;
  cerr << " Sort: " << cuda_stats.sort_time << " msec." << endl;
  cerr << " Join: " << cuda_stats.join_time << " msec." << endl;
  cerr << " Union: " << cuda_stats.union_time << " msec." << endl;
  cerr << " Built-in: " << cuda_stats.pred_time << " msec." << endl << endl;

  /* How many times each kind of operation was executed. */
  cerr << "Operations" << endl;
  cerr << " Joins: " << cuda_stats.joins << "." << endl;
  cerr << " Selects/Projects: " << cuda_stats.selects << "." << endl;
  cerr << " Unions: " << cuda_stats.unions << "." << endl;
  cerr << " Built-ins: " << cuda_stats.builtins << "." << endl << endl;
#endif
}
extern "C"
int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr, predicate *inpquery, int **result)
{
@ -859,6 +887,9 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
int x, y;
int qsize, *query, qname;
#if TIMER
cuda_stats.calls++;
#endif
for(x = 0; x < ninpf; x++)
L.push_back(*inpfacts[x]);
for(x = 0; x < ninpr; x++)
@ -917,11 +948,13 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
vector<gpunode>::iterator qposf;
vector<rulenode>::iterator qposr;
#if TIMER
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
#endif
while(reglas.size()) /*Here's the main loop*/
{
@ -967,6 +1000,9 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
{
num_refs = rows1 * cols1 * sizeof(int);
reservar(&res, num_refs);
#ifdef DEBUG_MEM
cerr << "+ " << res << " Res " << num_refs << endl;
#endif
cudaMemcpyAsync(res, dop1, num_refs, cudaMemcpyDeviceToDevice);
registrar(rul_act->name, cols1, res, rows1, itr, 1);
rul_act->gen_ant = rul_act->gen_act;
@ -1103,7 +1139,8 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
cudaEventElapsedTime(&time, start3, stop3);
cudaEventDestroy(start3);
cudaEventDestroy(stop3);
cout << "Predicados = " << time << endl;
//cout << "Predicados = " << time << endl;
cuda_stats.pred_time += time;
#endif
}
@ -1124,7 +1161,8 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
cudaEventElapsedTime(&time, start2, stop2);
cudaEventDestroy(start2);
cudaEventDestroy(stop2);
cout << "Union = " << time << endl;
//cout << "Union = " << time << endl;
cuda_stats.union_time += time;
#endif
//cout << "despues de unir = " << res_rows << endl;
@ -1251,25 +1289,40 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
else
{
res_rows = selectproyect(dop1, rows1, cols1, tmprule.num_columns, tmprule.select[0], tmprule.numsel[0], tmprule.selfjoin[0], tmprule.numselfj[0], tmprule.project[0], &res);
if(qposr != fin && qposr->name == qname)
if(qposr != fin && qposr->name == qname) {
cudaFree(dop1);
#ifdef DEBUG_MEM
cerr << "- " << dop1 << " dop1" << endl;
#endif
}
}
cols1 = tmprule.num_columns;
tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo);
cudaMemcpy(hres, res, tipo, cudaMemcpyDeviceToHost);
if(res_rows > 0 && tmprule.numsel[0] != 0 && tmprule.numselfj[0] != 0)
if(res_rows > 0 /*&& tmprule.numsel[0] != 0 && tmprule.numselfj[0] != 0 */) {
cudaFree(res);
#ifdef DEBUG_MEM
cerr << "- " << res << " res" << endl;
#endif
}
}
else
res_rows = 0;
#if TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cuda_stats.total_time += time;
if (time > cuda_stats.max_time)
cuda_stats.max_time = time;
if (time < cuda_stats.min_time || cuda_stats.calls == 1)
cuda_stats.min_time = time;
cudaEventDestroy(start);
cudaEventDestroy(stop);
#endif
if(showr == 1)
{

View File

@ -105,14 +105,14 @@ int buscarpornombrecpu(int name, int itr, int *totalrows)
return x;
}
void limpiar(const char s[])
void limpiar(const char s[], size_t sz)
{
list<memnode>::iterator ini;
memnode temp;
if(GPUmem.size() == 0)
{
cerr << s << ": not enough GPU memory: have " << avmem << endl;
cerr << s << ": not enough GPU memory: have " << avmem << ", need " << sz << " bytes." << endl;
exit(1);
}
@ -176,6 +176,9 @@ void liberar(int *ptr, int size)
//cout << "L " << avmem << " " << size;
cudaFree(ptr);
#ifdef DEBUG_MEM
cerr << "- " << ptr << " " << size << endl;
#endif
avmem += size;
//cout << " " << avmem << endl;
@ -185,16 +188,16 @@ void reservar(int **ptr, int size)
{
//size_t free, total;
//cudaMemGetInfo( &free, &total );
// cerr << "R " << free << " " << size << endl;
// cerr << "? " << free << " " << size << endl;
if (size == 0) {
*ptr = NULL;
return;
}
while(avmem < size)
limpiar("not enough memory");
limpiar("not enough memory", size);
while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
limpiar("error in memory allocation");
limpiar("Error in memory allocation", size);
if (! *ptr ) {
size_t free, total;
cudaMemGetInfo( &free, &total );
@ -202,7 +205,6 @@ void reservar(int **ptr, int size)
cerr << "Exiting CUDA...." << endl;
exit(1);
}
// cerr << *ptr << " " << size;
avmem -= size;
// cout << " " << avmem << endl;
@ -277,6 +279,9 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
}
size = num_rows * num_columns * sizeof(int);
reservar(&temp, size);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp " << size << endl;
#endif
cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
registrar(name, num_columns, temp, num_rows, itr, 0);
*ptr = temp;
@ -296,6 +301,9 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
}
size = totalrows * num_columns * sizeof(int);
reservar(&temp, size);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp 2 " << size << endl;
#endif
for(x = 1; x < numgpu; x++)
{
cudaMemcpyAsync(temp + temp_storage[x-1].size, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
@ -338,6 +346,9 @@ int cargafinal(int name, int cols, int **ptr)
}
reservar(&temp, cont * cols * sizeof(int));
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp 3 " << cont * cols * sizeof(int) << endl;
#endif
ini = temp;
pos = gpu;
@ -460,6 +471,9 @@ void resultados(vector<rulenode>::iterator first, vector<rulenode>::iterator las
cout << endl;
}
cudaFree(gpu->dev_address);
#ifdef DEBUG_MEM
cerr << "- " << gpu->dev_address << " gpu->dev_address" << endl;
#endif
free(temp);
gpu++;
}
@ -491,6 +505,9 @@ void clear_memory()
{
if (ini->isrule) {
cudaFree(ini->dev_address);
#ifdef DEBUG_MEM
cerr << "- " << ini->dev_address << " ini->dev_address" << endl;
#endif
ini = GPUmem.erase(ini);
} else {
ini++;

View File

@ -11,7 +11,7 @@ using namespace std;
void calcular_mem(int);
void liberar(int*, int);
void limpiar(const char []);
void limpiar(const char [], size_t);
void limpiartodo(int*, int*);
int cargar(int, int, int, int, int*, int**, int);
int cargafinal(int, int, int**);

View File

@ -1,6 +1,8 @@
#ifndef _PRED_H_
#define _PRED_H_
// #define DEBUG_MEM 1
typedef struct Nodo{
int name;
int num_rows;
@ -11,6 +13,20 @@ typedef struct Nodo{
typedef gpunode predicate;
// #define TIMER 1
#if TIMER
typedef struct Stats{
size_t joins, selects, unions, builtins;
size_t calls;
double total_time;
float max_time, min_time;
float select1_time, select2_time, join_time, sort_time, union_time, pred_time;
}statinfo;
extern statinfo cuda_stats;
#endif
#define SBG_EQ (-1)
#define SBG_GT (-2)
#define SBG_LT (-3)
@ -19,5 +35,5 @@ typedef gpunode predicate;
#define SBG_DF (-6)
int Cuda_Eval(predicate**, int, predicate**, int, predicate*, int**);
void Cuda_Statistics( void );
#endif

View File

@ -206,8 +206,14 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
int size, size2, num;
thrust::device_ptr<int> res;
#if TIMER
cuda_stats.selects++;
#endif
int head_bytes = mayor(numselect, numselfj, head_size) * sizeof(int);
reservar(&dhead, head_bytes);
#ifdef DEBUG_MEM
cerr << "+ " << dhead << " dhead " << head_bytes << endl;
#endif
int blockllen = rows / 1024 + 1;
int numthreads = 1024;
@ -218,6 +224,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp select " << size2 << endl;
#endif
cudaMemset(temp, 0, size2);
size = numselect * sizeof(int);
@ -240,6 +249,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
size = head_size * sizeof(int);
reservar(&fres, num * size);
#ifdef DEBUG_MEM
cerr << "+ " << fres << " fres select " << num*size << endl;
#endif
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
liberar(dhead, head_bytes);
@ -254,6 +266,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp select " << size2 << endl;
#endif
cudaMemset(temp, 0, size2);
size = numselfj * sizeof(int);
@ -268,6 +283,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
size = head_size * sizeof(int);
reservar(&fres, num * size);
#ifdef DEBUG_MEM
cerr << "+ " << fres << " fres select again " << num*size << endl;
#endif
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
liberar(dhead, head_bytes);
@ -279,6 +297,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
{
size = head_size * sizeof(int);
reservar(&fres, rows * size);
#ifdef DEBUG_MEM
cerr << "+ " << fres << " fres select third " << rows*size << endl;
#endif
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
liberar(dhead, head_bytes);

View File

@ -234,7 +234,7 @@ __global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int
{
extern __shared__ int shared[];
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
int posr, poss, x, y, temp, ini;
int posr, poss, x, y, ini;
if(threadIdx.x < wj)
shared[threadIdx.x] = muljoin[threadIdx.x];
@ -258,15 +258,17 @@ __global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int
poss = s_cur * of2;
else
poss = sloc[s_cur] * of2;
ini = r_cur - count;
for(x = 0; x < wj; x += 2)
ini = r_cur - count;
for(y = ini; y < r_cur; y++)
{
posr = shared[x];
temp = p2[poss + shared[x+1]];
for(y = ini; y < r_cur; y++)
posr = mloc[y] * of1;
for(x = 0; x < wj; x += 2)
{
if(p1[mloc[y] * of1 + posr] != temp)
if(p1[posr + shared[x]] != p2[poss + shared[x+1]])
{
count--;
break;
}
}
}
if(count > 0)
@ -744,6 +746,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
int *wherej = rule->wherejoin[pos];
int numj = rule->numjoin[pos];
int flag;
#if TIMER
cuda_stats.joins++;
#endif
int porLiberar = rLen * of1 * sizeof(int);
int size, sizet, sizet2;
@ -775,7 +780,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
int dconsize = sizet * 2;*/
reservar(&dcons, sizet);
#ifdef DEBUG_MEM
cerr << "+ " << dcons << " dcons tree " << sizet << endl;
#endif
reservar(&temp, sizet2);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp tree " << sizet2 << endl;
#endif
thrust::device_ptr<int> res = thrust::device_pointer_cast(temp);
numthreads = 1024;
@ -784,7 +795,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
int *posR = NULL, *posS = NULL;
#ifdef TIMER
cout << "INICIO" << endl;
//cout << "INICIO" << endl;
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
@ -839,7 +850,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
}
catch(std::bad_alloc &e)
{
limpiar("inclusive scan in join");
limpiar("inclusive scan in join", 0);
}
}
//thrust::inclusive_scan(res + 1, res + newLen, res + 1);
@ -850,7 +861,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
memSizeS = newLen * sizeof(int);
reservar(&d_S, memSizeS);
#ifdef DEBUG_MEM
cerr << "+ " << d_S << " d_S " << memSizeS << endl;
#endif
reservar(&posS, memSizeS);
#ifdef DEBUG_MEM
cerr << "+ " << posS << " posS " << memSizeS << endl;
#endif
llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
sLen = newLen;
}
@ -867,11 +884,17 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
thrust::inclusive_scan(res + 1, res + newLen, res + 1);
newLen = res[sLen];
if(newLen == 0)
return 0;
return 0;
memSizeS = newLen * sizeof(int);
reservar(&d_S, memSizeS);
#ifdef DEBUG_MEM
cerr << "+ " << d_S << " d_S m " << memSizeS << endl;
#endif
reservar(&posS, memSizeS);
#ifdef DEBUG_MEM
cerr << "+ " << posS << " posS m " << memSizeS << endl;
#endif
llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
sLen = newLen;
}
@ -879,6 +902,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
{
memSizeS = sLen * sizeof(int);
reservar(&d_S, memSizeS);
#ifdef DEBUG_MEM
cerr << "+ " << d_S << " d_S n " << memSizeS << endl;
#endif
llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]);
}
}
@ -887,7 +913,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cout << "Select1 = " << time << endl;
//cout << "Select1 = " << time << endl;
cuda_stats.select1_time += time;
cudaEventDestroy(start);
cudaEventDestroy(stop);
@ -925,7 +952,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
m32rLen = newLen + extraspace;
sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << d_R << " d_R m " << sizem32 << endl;
#endif
reservar(&posR, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << posR << " posR m " << sizem32 << endl;
#endif
cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
@ -951,7 +984,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
m32rLen = newLen + extraspace;
sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << d_R << " d_R n " << sizem32 << endl;
#endif
reservar(&posR, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << posR << " posR n " << sizem32 << endl;
#endif
cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
@ -961,6 +1000,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
{
sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << d_R << " d_R sizem32 " << sizem32 << endl;
#endif
cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
}
@ -971,6 +1013,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
{
sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << d_R << " d_R sz " << sizem32 << endl;
#endif
cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
}
@ -979,7 +1024,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cout << "Select2 = " << time << endl;
//cout << "Select2 = " << time << endl;
cuda_stats.select2_time += time;
#endif
/*free(hcons);
@ -1005,6 +1051,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
if(posR == NULL)
{
reservar(&posR, sizem32);
#ifdef DEBUG_MEM
cerr << "+ " << posR << " posR m32 " << sizem32 << endl;
#endif
permutation = thrust::device_pointer_cast(posR);
thrust::sequence(permutation, permutation + m32rLen);
}
@ -1021,7 +1070,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
}
catch(std::bad_alloc &e)
{
limpiar("inclusive scan in join");
limpiar("inclusive scan in join", 0);
}
}
@ -1029,7 +1078,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cout << "Sort = " << time << endl;
//cout << "Sort = " << time << endl;
cuda_stats.sort_time += time;
cudaEventDestroy(start);
cudaEventDestroy(stop);
@ -1071,6 +1121,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
int *d_locations;
reservar(&d_locations, memSizeS);
#ifdef DEBUG_MEM
cerr << "+ " << d_locations << " d_locs n " << memSizeS << endl;
#endif
dim3 Dbs(THRD_PER_BLCK_search, 1, 1);
dim3 Dgs(BLCK_PER_GRID_search, 1, 1);
@ -1110,6 +1163,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
resSize = sum * sizepro;
reservar(&d_Rout, resSize);
#ifdef DEBUG_MEM
cerr << "+ " << d_Rout << " d_Rout n " << resSize << endl;
#endif
if(numj > 2)
{
cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
@ -1124,6 +1180,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
resSize = sum * sizepro;
reservar(&d_Rout, resSize);
#ifdef DEBUG_MEM
cerr << "+ " << d_Rout << " d_Rout 2 " << resSize << endl;
#endif
if(numj > 2)
{
cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
@ -1162,8 +1221,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cout << "Join = " << time << endl;
cout << "FIN" << endl;
//cout << "Join = " << time << endl;
//cout << "FIN" << endl;
cuda_stats.join_time += time;
#endif
return sum;

View File

@ -86,6 +86,9 @@ int unir(int *res, int rows, int tipo)
s3 *t3;
int flag, nrows;
#if TIMER
cuda_stats.unions++;
#endif
switch(tipo)
{
case 1:
@ -102,7 +105,7 @@ int unir(int *res, int rows, int tipo)
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir");
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt, re);
@ -140,7 +143,7 @@ int unir(int *res, int rows, int tipo)
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir");
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt2, re2);
@ -179,7 +182,7 @@ int unir(int *res, int rows, int tipo)
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir");
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt3, re3);