allow debugging of memory allocation

This commit is contained in:
Vítor Santos Costa 2013-10-16 14:52:54 +01:00
parent b50305a100
commit 94cb9b7563
6 changed files with 48 additions and 9 deletions

View File

@ -52,6 +52,7 @@ int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)
int tmplen = rows + 1; int tmplen = rows + 1;
int size = tmplen * sizeof(int); int size = tmplen * sizeof(int);
reservar(&temp, size); reservar(&temp, size);
// cerr << "+ " << temp << " temp bpreds " << size << endl;
cudaMemset(temp, 0, size); cudaMemset(temp, 0, size);
int *dhead; int *dhead;
@ -64,6 +65,7 @@ int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)
else else
hsize = sproj; hsize = sproj;
reservar(&dhead, hsize); reservar(&dhead, hsize);
// cerr << "+ " << dhead << " dhead " << hsize << endl;
cudaMemcpy(dhead, bin, spredn, cudaMemcpyHostToDevice); cudaMemcpy(dhead, bin, spredn, cudaMemcpyHostToDevice);
int blockllen = rows / 1024 + 1; int blockllen = rows / 1024 + 1;
@ -108,6 +110,7 @@ int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)
int *fres; int *fres;
reservar(&fres, num * sproj); reservar(&fres, num * sproj);
// cerr << "+ " << fres << " fres " << num * sproj << endl;
cudaMemcpy(dhead, bin + predn, sproj, cudaMemcpyHostToDevice); cudaMemcpy(dhead, bin + predn, sproj, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, sproj>>>(dop1, rows, numpreds.y, temp, dhead, numpreds.z, fres); llenarproyectar<<<blockllen, numthreads, sproj>>>(dop1, rows, numpreds.y, temp, dhead, numpreds.z, fres);

View File

@ -37,10 +37,13 @@ int32_t Cuda_Erase(predicate *pred);
void init_cuda( void ); void init_cuda( void );
//#define DEBUG_INTERFACE 1
#if DEBUG_INTERFACE #if DEBUG_INTERFACE
static void static void
dump_mat(int32_t mat[], int32_t nrows, int32_t ncols) dump_mat(int32_t mat[], int32_t nrows, int32_t ncols)
{ {
return;
int32_t i, j; int32_t i, j;
for ( i=0; i< nrows; i++) { for ( i=0; i< nrows; i++) {
printf("%d", mat[i*ncols]); printf("%d", mat[i*ncols]);
@ -56,7 +59,7 @@ dump_vec(int32_t vec[], int32_t rows)
{ {
int32_t i = 1; int32_t i = 1;
int32_t j = 0; int32_t j = 0;
printf("%d", vec[0]);
for (j = 0; j < rows; j++) { for (j = 0; j < rows; j++) {
for ( ; vec[i]; i++ ) { for ( ; vec[i]; i++ ) {
printf(", %d", vec[i]); printf(", %d", vec[i]);

View File

@ -967,6 +967,7 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
{ {
num_refs = rows1 * cols1 * sizeof(int); num_refs = rows1 * cols1 * sizeof(int);
reservar(&res, num_refs); reservar(&res, num_refs);
// cerr << "+ " << res << " Res " << num_refs << endl;
cudaMemcpyAsync(res, dop1, num_refs, cudaMemcpyDeviceToDevice); cudaMemcpyAsync(res, dop1, num_refs, cudaMemcpyDeviceToDevice);
registrar(rul_act->name, cols1, res, rows1, itr, 1); registrar(rul_act->name, cols1, res, rows1, itr, 1);
rul_act->gen_ant = rul_act->gen_act; rul_act->gen_ant = rul_act->gen_act;
@ -1251,16 +1252,20 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
else else
{ {
res_rows = selectproyect(dop1, rows1, cols1, tmprule.num_columns, tmprule.select[0], tmprule.numsel[0], tmprule.selfjoin[0], tmprule.numselfj[0], tmprule.project[0], &res); res_rows = selectproyect(dop1, rows1, cols1, tmprule.num_columns, tmprule.select[0], tmprule.numsel[0], tmprule.selfjoin[0], tmprule.numselfj[0], tmprule.project[0], &res);
if(qposr != fin && qposr->name == qname) if(qposr != fin && qposr->name == qname) {
cudaFree(dop1); cudaFree(dop1);
// cerr << "- " << dop1 << " dop1" << endl;
}
} }
cols1 = tmprule.num_columns; cols1 = tmprule.num_columns;
tipo = res_rows * cols1 * sizeof(int); tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo); hres = (int *)malloc(tipo);
cudaMemcpy(hres, res, tipo, cudaMemcpyDeviceToHost); cudaMemcpy(hres, res, tipo, cudaMemcpyDeviceToHost);
if(res_rows > 0 && tmprule.numsel[0] != 0 && tmprule.numselfj[0] != 0) if(res_rows > 0 && tmprule.numsel[0] != 0 && tmprule.numselfj[0] != 0) {
cudaFree(res); cudaFree(res);
// cerr << "- " << res << " res" << endl;
}
} }
else else
res_rows = 0; res_rows = 0;

View File

@ -112,7 +112,7 @@ void limpiar(const char s[])
if(GPUmem.size() == 0) if(GPUmem.size() == 0)
{ {
cerr << s << ": not enough GPU memory: have " << avmem << endl; // cerr << s << ": not enough GPU memory: have " << avmem << endl;
exit(1); exit(1);
} }
@ -176,6 +176,7 @@ void liberar(int *ptr, int size)
//cout << "L " << avmem << " " << size; //cout << "L " << avmem << " " << size;
cudaFree(ptr); cudaFree(ptr);
// cerr << "- " << ptr << " " << size << endl;
avmem += size; avmem += size;
//cout << " " << avmem << endl; //cout << " " << avmem << endl;
@ -185,7 +186,7 @@ void reservar(int **ptr, int size)
{ {
//size_t free, total; //size_t free, total;
//cudaMemGetInfo( &free, &total ); //cudaMemGetInfo( &free, &total );
// cerr << "R " << free << " " << size << endl; // // cerr << "- " << free << " " << size << endl;
if (size == 0) { if (size == 0) {
*ptr = NULL; *ptr = NULL;
@ -198,11 +199,10 @@ void reservar(int **ptr, int size)
if (! *ptr ) { if (! *ptr ) {
size_t free, total; size_t free, total;
cudaMemGetInfo( &free, &total ); cudaMemGetInfo( &free, &total );
cerr << "Could not allocate " << size << " bytes, only " << free << " avaliable from total of " << total << " !!!" << endl; // cerr << "Could not allocate " << size << " bytes, only " << free << " avaliable from total of " << total << " !!!" << endl;
cerr << "Exiting CUDA...." << endl; // cerr << "Exiting CUDA...." << endl;
exit(1); exit(1);
} }
// cerr << *ptr << " " << size;
avmem -= size; avmem -= size;
// cout << " " << avmem << endl; // cout << " " << avmem << endl;
@ -277,6 +277,7 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
} }
size = num_rows * num_columns * sizeof(int); size = num_rows * num_columns * sizeof(int);
reservar(&temp, size); reservar(&temp, size);
// cerr << "+ " << temp << " temp " << size << endl;
cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice); cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
registrar(name, num_columns, temp, num_rows, itr, 0); registrar(name, num_columns, temp, num_rows, itr, 0);
*ptr = temp; *ptr = temp;
@ -296,6 +297,7 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
} }
size = totalrows * num_columns * sizeof(int); size = totalrows * num_columns * sizeof(int);
reservar(&temp, size); reservar(&temp, size);
// cerr << "+ " << temp << " temp 2 " << size << endl;
for(x = 1; x < numgpu; x++) for(x = 1; x < numgpu; x++)
{ {
cudaMemcpyAsync(temp + temp_storage[x-1].size, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice); cudaMemcpyAsync(temp + temp_storage[x-1].size, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
@ -338,6 +340,7 @@ int cargafinal(int name, int cols, int **ptr)
} }
reservar(&temp, cont * cols * sizeof(int)); reservar(&temp, cont * cols * sizeof(int));
// cerr << "+ " << temp << " temp 3 " << cont * cols * sizeof(int) << endl;
ini = temp; ini = temp;
pos = gpu; pos = gpu;
@ -460,6 +463,7 @@ void resultados(vector<rulenode>::iterator first, vector<rulenode>::iterator las
cout << endl; cout << endl;
} }
cudaFree(gpu->dev_address); cudaFree(gpu->dev_address);
// cerr << "- " << gpu->dev_address << " gpu->dev_address" << endl;
free(temp); free(temp);
gpu++; gpu++;
} }
@ -491,6 +495,7 @@ void clear_memory()
{ {
if (ini->isrule) { if (ini->isrule) {
cudaFree(ini->dev_address); cudaFree(ini->dev_address);
// cerr << "- " << ini->dev_address << " ini->dev_address" << endl;
ini = GPUmem.erase(ini); ini = GPUmem.erase(ini);
} else { } else {
ini++; ini++;

View File

@ -208,6 +208,7 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
int head_bytes = mayor(numselect, numselfj, head_size) * sizeof(int); int head_bytes = mayor(numselect, numselfj, head_size) * sizeof(int);
reservar(&dhead, head_bytes); reservar(&dhead, head_bytes);
// cerr << "+ " << dhead << " dhead " << head_bytes << endl;
int blockllen = rows / 1024 + 1; int blockllen = rows / 1024 + 1;
int numthreads = 1024; int numthreads = 1024;
@ -218,6 +219,7 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
tmplen = rows + 1; tmplen = rows + 1;
size2 = tmplen * sizeof(int); size2 = tmplen * sizeof(int);
reservar(&temp, size2); reservar(&temp, size2);
// cerr << "+ " << temp << " temp select " << size2 << endl;
cudaMemset(temp, 0, size2); cudaMemset(temp, 0, size2);
size = numselect * sizeof(int); size = numselect * sizeof(int);
@ -240,6 +242,7 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
size = head_size * sizeof(int); size = head_size * sizeof(int);
reservar(&fres, num * size); reservar(&fres, num * size);
// cerr << "+ " << fres << " fres select " << num*size << endl;
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice); cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres); llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
liberar(dhead, head_bytes); liberar(dhead, head_bytes);
@ -254,6 +257,7 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
tmplen = rows + 1; tmplen = rows + 1;
size2 = tmplen * sizeof(int); size2 = tmplen * sizeof(int);
reservar(&temp, size2); reservar(&temp, size2);
// cerr << "+ " << temp << " temp select " << size2 << endl;
cudaMemset(temp, 0, size2); cudaMemset(temp, 0, size2);
size = numselfj * sizeof(int); size = numselfj * sizeof(int);
@ -268,6 +272,7 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
size = head_size * sizeof(int); size = head_size * sizeof(int);
reservar(&fres, num * size); reservar(&fres, num * size);
// cerr << "+ " << fres << " fres select again " << num*size << endl;
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice); cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres); llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
liberar(dhead, head_bytes); liberar(dhead, head_bytes);
@ -279,6 +284,7 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
{ {
size = head_size * sizeof(int); size = head_size * sizeof(int);
reservar(&fres, rows * size); reservar(&fres, rows * size);
// cerr << "+ " << fres << " fres select third " << rows*size << endl;
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice); cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres); proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
liberar(dhead, head_bytes); liberar(dhead, head_bytes);

View File

@ -777,7 +777,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
int dconsize = sizet * 2;*/ int dconsize = sizet * 2;*/
reservar(&dcons, sizet); reservar(&dcons, sizet);
// cerr << "+ " << dcons << " dcons tree " << sizet << endl;
reservar(&temp, sizet2); reservar(&temp, sizet2);
// cerr << "+ " << temp << " temp tree " << sizet2 << endl;
thrust::device_ptr<int> res = thrust::device_pointer_cast(temp); thrust::device_ptr<int> res = thrust::device_pointer_cast(temp);
numthreads = 1024; numthreads = 1024;
@ -852,7 +854,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
memSizeS = newLen * sizeof(int); memSizeS = newLen * sizeof(int);
reservar(&d_S, memSizeS); reservar(&d_S, memSizeS);
// cerr << "+ " << d_S << " d_S " << memSizeS << endl;
reservar(&posS, memSizeS); reservar(&posS, memSizeS);
// cerr << "+ " << posS << " posS " << memSizeS << endl;
llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS); llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
sLen = newLen; sLen = newLen;
} }
@ -869,11 +873,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
thrust::inclusive_scan(res + 1, res + newLen, res + 1); thrust::inclusive_scan(res + 1, res + newLen, res + 1);
newLen = res[sLen]; newLen = res[sLen];
if(newLen == 0) if(newLen == 0)
return 0; return 0;
memSizeS = newLen * sizeof(int); memSizeS = newLen * sizeof(int);
reservar(&d_S, memSizeS); reservar(&d_S, memSizeS);
// cerr << "+ " << d_S << " d_S m " << memSizeS << endl;
reservar(&posS, memSizeS); reservar(&posS, memSizeS);
// cerr << "+ " << posS << " posS m " << memSizeS << endl;
llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS); llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
sLen = newLen; sLen = newLen;
} }
@ -881,6 +887,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
{ {
memSizeS = sLen * sizeof(int); memSizeS = sLen * sizeof(int);
reservar(&d_S, memSizeS); reservar(&d_S, memSizeS);
// cerr << "+ " << d_S << " d_S n " << memSizeS << endl;
llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]); llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]);
} }
} }
@ -927,7 +934,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
m32rLen = newLen + extraspace; m32rLen = newLen + extraspace;
sizem32 = m32rLen * sizeof(int); sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32); reservar(&d_R, sizem32);
// cerr << "+ " << d_R << " d_R m " << sizem32 << endl;
reservar(&posR, sizem32); reservar(&posR, sizem32);
// cerr << "+ " << posR << " posR m " << sizem32 << endl;
cudaMemsetAsync(d_R + newLen, 0x7f, sizextra); cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
cudaMemsetAsync(posR + newLen, 0x7f, sizextra); cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR); llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
@ -953,7 +962,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
m32rLen = newLen + extraspace; m32rLen = newLen + extraspace;
sizem32 = m32rLen * sizeof(int); sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32); reservar(&d_R, sizem32);
// cerr << "+ " << d_R << " d_R n " << sizem32 << endl;
reservar(&posR, sizem32); reservar(&posR, sizem32);
// cerr << "+ " << posR << " posR n " << sizem32 << endl;
cudaMemsetAsync(d_R + newLen, 0x7f, sizextra); cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
cudaMemsetAsync(posR + newLen, 0x7f, sizextra); cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR); llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
@ -963,6 +974,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
{ {
sizem32 = m32rLen * sizeof(int); sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32); reservar(&d_R, sizem32);
// cerr << "+ " << d_R << " d_R sizem32 " << sizem32 << endl;
cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int)); cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]); llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
} }
@ -973,6 +985,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
{ {
sizem32 = m32rLen * sizeof(int); sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32); reservar(&d_R, sizem32);
// cerr << "+ " << d_R << " d_R sz " << sizem32 << endl;
cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int)); cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]); llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
} }
@ -1007,6 +1020,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
if(posR == NULL) if(posR == NULL)
{ {
reservar(&posR, sizem32); reservar(&posR, sizem32);
// cerr << "+ " << posR << " posR m32 " << sizem32 << endl;
permutation = thrust::device_pointer_cast(posR); permutation = thrust::device_pointer_cast(posR);
thrust::sequence(permutation, permutation + m32rLen); thrust::sequence(permutation, permutation + m32rLen);
} }
@ -1073,6 +1087,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
int *d_locations; int *d_locations;
reservar(&d_locations, memSizeS); reservar(&d_locations, memSizeS);
// cerr << "+ " << d_locations << " d_locs n " << memSizeS << endl;
dim3 Dbs(THRD_PER_BLCK_search, 1, 1); dim3 Dbs(THRD_PER_BLCK_search, 1, 1);
dim3 Dgs(BLCK_PER_GRID_search, 1, 1); dim3 Dgs(BLCK_PER_GRID_search, 1, 1);
@ -1112,6 +1127,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice); cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
resSize = sum * sizepro; resSize = sum * sizepro;
reservar(&d_Rout, resSize); reservar(&d_Rout, resSize);
// cerr << "+ " << d_Rout << " d_Rout n " << resSize << endl;
if(numj > 2) if(numj > 2)
{ {
cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice); cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
@ -1126,6 +1142,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice); cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
resSize = sum * sizepro; resSize = sum * sizepro;
reservar(&d_Rout, resSize); reservar(&d_Rout, resSize);
// cerr << "+ " << d_Rout << " d_Rout 2 " << resSize << endl;
if(numj > 2) if(numj > 2)
{ {
cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice); cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);