Merge branch 'master' of ssh://git.dcc.fc.up.pt/yap-6.3

2013-10-17 00:45:25 +01:00 · 2013-10-17 00:45:25 +01:00 · 8921889002
commit 8921889002
parent 8623131b31 2669240c4f
11 changed files with 236 additions and 35 deletions
--- a/C/alloc.c
+++ b/C/alloc.c
@ -251,6 +251,7 @@ Yap_InitPreAllocCodeSpace(int wid)
  char *ptr;
  UInt sz = REMOTE_ScratchPad(wid).msz;

+
  if (REMOTE_ScratchPad(wid).ptr == NULL) {
 #if USE_DL_MALLOC
    LOCK(DLMallocLock);
@ -261,7 +262,13 @@ Yap_InitPreAllocCodeSpace(int wid)
    tmalloc += sz;
    sz += sizeof(CELL);
 #endif
-    while (!(ptr = my_malloc(sz))) {
+    while (!(ptr = 
+#ifdef YAPOR_COPY
+	     malloc(sz)
+#else
+	     my_malloc(sz)
+#endif
+	     )) {
      REMOTE_PrologMode(wid) &= ~MallocMode;
 #if USE_DL_MALLOC
      UNLOCK(DLMallocLock);
@ -290,6 +297,7 @@ Yap_InitPreAllocCodeSpace(int wid)
  }
  AuxBase = (ADDR)(ptr);
  AuxSp = (CELL *)(AuxTop = AuxBase+REMOTE_ScratchPad(wid).sz);
+  printf("wid=%d %p %p %p--%p\n", wid, AuxBase, AuxSp, Yap_HeapBase, H);
  return ptr;
 }

--- a/packages/cuda/bpreds.cu
+++ b/packages/cuda/bpreds.cu
@ -52,8 +52,14 @@ int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)
 	int tmplen = rows + 1;
 	int size = tmplen * sizeof(int);
 	reservar(&temp, size);
+#ifdef DEBUG_MEM
+	 cerr << "+ " << temp << " temp bpreds " << size << endl;
+#endif
 	cudaMemset(temp, 0, size);

+#if TIMER
+	cuda_stats.builtins++;
+#endif
 	int *dhead;
 	int predn = numpreds.x * 3;
 	int spredn = predn * sizeof(int);
@ -64,6 +70,9 @@ int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)
 	else
 		hsize = sproj;
 	reservar(&dhead, hsize);
+#ifdef DEBUG_MEM
+	cerr << "+ " << dhead << " dhead  " << hsize << endl;
+#endif
 	cudaMemcpy(dhead, bin, spredn, cudaMemcpyHostToDevice);

 	int blockllen = rows / 1024 + 1;
@ -108,6 +117,9 @@ int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)

 	int *fres;
 	reservar(&fres, num * sproj);
+#ifdef DEBUG_MEM
+	cerr << "+ " << fres << " fres  " << num * sproj << endl;
+#endif
 	cudaMemcpy(dhead, bin + predn, sproj, cudaMemcpyHostToDevice);
 	llenarproyectar<<<blockllen, numthreads, sproj>>>(dop1, rows, numpreds.y, temp, dhead, numpreds.z, fres);

--- a/packages/cuda/cuda.c
+++ b/packages/cuda/cuda.c
@ -37,10 +37,13 @@ int32_t Cuda_Erase(predicate *pred);

 void init_cuda( void );

+//#define DEBUG_INTERFACE 1
+
 #if DEBUG_INTERFACE
 static void
 dump_mat(int32_t mat[], int32_t nrows, int32_t ncols)
 {
+  return;
  int32_t i, j;
  for ( i=0; i< nrows; i++) {
    printf("%d", mat[i*ncols]);
@ -56,7 +59,7 @@ dump_vec(int32_t vec[], int32_t rows)
 {
  int32_t i = 1;
  int32_t j = 0;
-  printf("%d", vec[0]);
+
  for (j = 0; j < rows; j++) {
    for ( ; vec[i]; i++ ) {
      printf(", %d", vec[i]);
@ -392,6 +395,12 @@ static int cuda_count( void )
  return YAP_Unify(YAP_ARG2, YAP_MkIntTerm(n));
 }

+static int cuda_statistics( void )	/* C side of the cuda_statistics/0 predicate registered in init_cuda */
+{
+  Cuda_Statistics();	/* delegates the whole report to the CUDA-side routine */
+  return TRUE;	/* always succeeds: nothing is unified and no error path exists here */
+}
+
 static int first_time = TRUE;

 void
@ -414,5 +423,6 @@ init_cuda(void)
  YAP_UserCPredicate("cuda_eval", cuda_eval, 2);
  YAP_UserCPredicate("cuda_coverage", cuda_coverage, 4);
  YAP_UserCPredicate("cuda_count", cuda_count, 2);
+  YAP_UserCPredicate("cuda_statistics", cuda_statistics, 0);
 }

--- a/packages/cuda/cuda.yap
+++ b/packages/cuda/cuda.yap
@ -4,6 +4,7 @@
 		 cuda_erase/1,
 		 cuda_eval/2,
 		 cuda_coverage/4,
+		 cuda_statistics/0,
 		 cuda_count/2]).

 tell_warning :-
@ -47,9 +48,9 @@ body_to_list( (B1, B2), LF, L0, N0, NF) :- !,
 	body_to_list( B1, LF, LI, N0, N1), 
 	body_to_list( B2, LI, L0, N1, NF). 
 body_to_list( true, L, L, N, N) :- !.
-body_to_list( B, [NB|L], L, N0, N) :-
+body_to_list( B, NL, L, N0, N) :-
 	inline( B, NB ), !,
-	N is N0+1.
+	body_to_list( NB, NL, L, N0, N).
 body_to_list( B, [B|L], L, N0, N) :-
 	N is N0+1.

--- a/packages/cuda/lista.cu
+++ b/packages/cuda/lista.cu
@ -13,6 +13,10 @@ extern "C" {

 #define MAXVALS 200

+#if TIMER
+statinfo cuda_stats;
+#endif
+
 bool compare(const gpunode &r1, const gpunode &r2)
 {
 	return (r1.name > r2.name); 
@ -851,6 +855,30 @@ void mostrareglas(list<rulenode> aux)
 	cout << endl;
 }

+/*
+ * Cuda_Statistics: write the accumulated GPU statistics to stderr.
+ *
+ * Always prints the "GPU Statistics" banner. When the file is built with
+ * TIMER set, it also prints the call count, the total/longest/fastest call
+ * times, the per-step accumulated times and the per-operation counters held
+ * in the global cuda_stats. Without TIMER only the banner is emitted.
+ * Takes no arguments and returns nothing; exported with C linkage so the
+ * C interface layer can call it.
+ */
+extern "C"
+ void Cuda_Statistics(void)
+{
+  cerr << "GPU Statistics" << endl;
+#if TIMER
+  /* Each label keeps a space on both sides of the streamed value so the */
+  /* number is not glued to the following unit or word.                  */
+  cerr << "Called " << cuda_stats.calls << " times." << endl;
+  cerr << "GPU time " << cuda_stats.total_time << " msec." << endl;
+  cerr << "Longest call " << cuda_stats.max_time << " msec." << endl;
+  cerr << "Fastest call " << cuda_stats.min_time << " msec." << endl << endl;
+  cerr << "Steps" << endl;
+  cerr << "    Select First: " << cuda_stats.select1_time << " msec." << endl;
+  cerr << "    Select Second: " << cuda_stats.select2_time << " msec." << endl;
+  cerr << "    Sort: " << cuda_stats.sort_time << " msec." << endl;
+  cerr << "    Join: " << cuda_stats.join_time << " msec." << endl;
+  cerr << "    Union: " << cuda_stats.union_time << " msec." << endl;
+  cerr << "    Built-in: " << cuda_stats.pred_time << " msec." << endl << endl;
+  cerr << "Operations" << endl;
+  cerr << "    Joins: " << cuda_stats.joins << "." << endl;
+  cerr << "    Selects/Projects: " << cuda_stats.selects << "." << endl;
+  cerr << "    Unions: " << cuda_stats.unions << "." << endl;
+  cerr << "    Built-ins: " << cuda_stats.builtins << "." << endl << endl;
+#endif
+}
+
 extern "C"
 int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr, predicate *inpquery, int **result)
 {
@ -859,6 +887,9 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 	int x, y;
 	int qsize, *query, qname;

+#if TIMER
+	cuda_stats.calls++;
+#endif
 	for(x = 0; x < ninpf; x++)
 		L.push_back(*inpfacts[x]);
 	for(x = 0; x < ninpr; x++)
@ -917,11 +948,13 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 	vector<gpunode>::iterator qposf;
 	vector<rulenode>::iterator qposr;

+#if TIMER
 	cudaEvent_t start, stop;
 	float time;
 	cudaEventCreate(&start);
 	cudaEventCreate(&stop);
 	cudaEventRecord(start, 0);
+#endif

 	while(reglas.size()) /*Here's the main loop*/
 	{
@ -967,6 +1000,9 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 				{
 					num_refs = rows1 * cols1 * sizeof(int);
 					reservar(&res, num_refs);
+#ifdef DEBUG_MEM
+					cerr << "+ " << res << " Res  " << num_refs << endl;
+#endif
 					cudaMemcpyAsync(res, dop1, num_refs, cudaMemcpyDeviceToDevice);
 					registrar(rul_act->name, cols1, res, rows1, itr, 1);
 					rul_act->gen_ant = rul_act->gen_act;
@ -1103,7 +1139,8 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 					cudaEventElapsedTime(&time, start3, stop3);
 					cudaEventDestroy(start3);
 					cudaEventDestroy(stop3);
-					cout << "Predicados = " << time << endl;
+					//cout << "Predicados = " << time << endl;
+					cuda_stats.pred_time += time;
 					#endif
 				}

@ -1124,7 +1161,8 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 				cudaEventElapsedTime(&time, start2, stop2);
 				cudaEventDestroy(start2);
 				cudaEventDestroy(stop2);
-				cout << "Union = " << time << endl;
+				//cout << "Union = " << time << endl;
+				cuda_stats.union_time += time;
 				#endif					
 	
 				//cout << "despues de unir = " << res_rows << endl;
@ -1251,25 +1289,40 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 		else
 		{		
 			res_rows = selectproyect(dop1, rows1, cols1, tmprule.num_columns, tmprule.select[0], tmprule.numsel[0], tmprule.selfjoin[0], tmprule.numselfj[0], tmprule.project[0], &res);
-			if(qposr != fin && qposr->name == qname)
+			if(qposr != fin && qposr->name == qname) {
 				cudaFree(dop1);
+#ifdef DEBUG_MEM
+				cerr << "- " << dop1 << " dop1" << endl;
+#endif
+			}
 		}

 		cols1 = tmprule.num_columns;
 		tipo = res_rows * cols1 * sizeof(int);
 		hres = (int *)malloc(tipo);
 		cudaMemcpy(hres, res, tipo, cudaMemcpyDeviceToHost);
-		if(res_rows > 0 && tmprule.numsel[0] != 0 && tmprule.numselfj[0] != 0)
+		if(res_rows > 0 /*&& tmprule.numsel[0] != 0 && tmprule.numselfj[0] != 0 */) {
 			cudaFree(res);
+#ifdef DEBUG_MEM
+			cerr << "- " << res << " res" << endl;
+#endif
+		}
 	}
 	else
 		res_rows = 0;

+#if TIMER
 	cudaEventRecord(stop, 0);
 	cudaEventSynchronize(stop);
 	cudaEventElapsedTime(&time, start, stop);
+	cuda_stats.total_time += time;
+	if (time > cuda_stats.max_time) 
+	  cuda_stats.max_time = time;
+	if (time < cuda_stats.min_time || cuda_stats.calls == 1) 
+	  cuda_stats.min_time = time;
 	cudaEventDestroy(start);
 	cudaEventDestroy(stop);
+#endif

 	if(showr == 1)
 	{
--- a/packages/cuda/memory.cu
+++ b/packages/cuda/memory.cu
@ -105,14 +105,14 @@ int buscarpornombrecpu(int name, int itr, int *totalrows)
 	return x;
 }

-void limpiar(const char s[])
+void limpiar(const char s[], size_t sz)
 {
 	list<memnode>::iterator ini;
 	memnode temp;

 	if(GPUmem.size() == 0)
 	{
-		cerr << s << ": not enough GPU memory: have " << avmem << endl;
+		cerr << s << ": not enough GPU memory: have " << avmem << ", need " << sz << " bytes." << endl;
 		exit(1);
 	}		

@ -176,6 +176,9 @@ void liberar(int *ptr, int size)
 	//cout << "L " << avmem << " " << size; 

 	cudaFree(ptr);
+#ifdef DEBUG_MEM
+	cerr << "- " << ptr << " " << size << endl;
+#endif
 	avmem += size;
 	
 	//cout << " " << avmem << endl;
@ -185,16 +188,16 @@ void reservar(int **ptr, int size)
 {
  //size_t free, total;
  //cudaMemGetInfo(      &free, &total	 );
-  // cerr << "R " << free << " " << size << endl;
+  //	cerr << "? " << free << " " << size << endl;

        if (size == 0) { 
                *ptr = NULL; 
                return;
        }
 	while(avmem < size)
-		limpiar("not enough memory");
+		limpiar("not enough memory", size);
 	while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
-		limpiar("error in memory allocation");
+		limpiar("Error in memory allocation", size);
 	if (! *ptr ) {
 	  size_t free, total;
 	  cudaMemGetInfo(      &free, &total	 );
@ -202,7 +205,6 @@ void reservar(int **ptr, int size)
 	  cerr << "Exiting CUDA...." << endl;
 	  exit(1);
 	}
-	// cerr << *ptr << " " << size;
 	avmem -= size;

 	// cout << " " << avmem << endl;
@ -277,6 +279,9 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
 		}
 		size = num_rows * num_columns * sizeof(int);
 		reservar(&temp, size);
+#ifdef DEBUG_MEM
+		cerr << "+ " << temp << " temp  " << size << endl;
+#endif
 		cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
 		registrar(name, num_columns, temp, num_rows, itr, 0);
 		*ptr = temp;
@ -296,6 +301,9 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
 		}
 		size = totalrows * num_columns * sizeof(int);
 		reservar(&temp, size);
+#ifdef DEBUG_MEM
+		cerr << "+ " << temp << " temp 2  " << size << endl;
+#endif
 		for(x = 1; x < numgpu; x++)
 		{
 			cudaMemcpyAsync(temp + temp_storage[x-1].size, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
@ -338,6 +346,9 @@ int cargafinal(int name, int cols, int **ptr)
 	}
 	
 	reservar(&temp, cont * cols * sizeof(int));
+#ifdef DEBUG_MEM
+	cerr << "+ " << temp << " temp 3 " << cont * cols * sizeof(int) << endl;
+#endif
 	ini = temp;	

 	pos = gpu;
@ -460,6 +471,9 @@ void resultados(vector<rulenode>::iterator first, vector<rulenode>::iterator las
 				cout << endl;
 			}
 			cudaFree(gpu->dev_address);
+#ifdef DEBUG_MEM
+			cerr << "- " << gpu->dev_address << " gpu->dev_address" << endl;
+#endif
 			free(temp);
 			gpu++;
 		}
@ -491,6 +505,9 @@ void clear_memory()
 	{
 	  if (ini->isrule) {
 	    cudaFree(ini->dev_address);
+#ifdef DEBUG_MEM
+	    cerr << "- " << ini->dev_address << " ini->dev_address" << endl;
+#endif
 	    ini = GPUmem.erase(ini);
 	  } else {
 	    ini++;
--- a/packages/cuda/memory.h
+++ b/packages/cuda/memory.h
@ -11,7 +11,7 @@ using namespace std;

 void calcular_mem(int);
 void liberar(int*, int);
-void limpiar(const char []);
+void limpiar(const char [], size_t);
 void limpiartodo(int*, int*);
 int cargar(int, int, int, int, int*, int**, int);
 int cargafinal(int, int, int**);
--- a/packages/cuda/pred.h
+++ b/packages/cuda/pred.h
@ -1,6 +1,8 @@
 #ifndef _PRED_H_
 #define _PRED_H_

+// #define DEBUG_MEM 1
+
 typedef struct Nodo{
        int name;
 	int num_rows;
@ -11,6 +13,20 @@ typedef struct Nodo{

 typedef gpunode predicate;

+// #define TIMER 1
+
+#if TIMER
+typedef struct Stats{	/* accumulated GPU statistics; only declared when TIMER is set */
+  size_t joins, selects, unions, builtins;	/* operation counters, incremented on entry to join / selectproyect / unir / bpreds */
+  size_t calls;	/* number of Cuda_Eval invocations */
+  double total_time;	/* sum of the per-call elapsed times measured in Cuda_Eval; reported as msec */
+  float max_time, min_time;	/* longest and fastest single Cuda_Eval call seen so far */
+  float select1_time, select2_time, join_time, sort_time, union_time, pred_time;	/* per-step elapsed times, accumulated across calls */
+}statinfo;
+
+extern statinfo cuda_stats;
+#endif
+
 #define SBG_EQ  (-1)
 #define SBG_GT  (-2)
 #define SBG_LT  (-3)
@ -19,5 +35,5 @@ typedef gpunode predicate;
 #define SBG_DF  (-6)

 int Cuda_Eval(predicate**, int, predicate**, int, predicate*, int**);
-
+void  Cuda_Statistics( void );
 #endif
--- a/packages/cuda/selectproyect.cu
+++ b/packages/cuda/selectproyect.cu
@ -206,8 +206,14 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 	int size, size2, num;
 	thrust::device_ptr<int> res;

+#if TIMER
+	cuda_stats.selects++;
+#endif
 	int head_bytes = mayor(numselect, numselfj, head_size) * sizeof(int);
 	reservar(&dhead, head_bytes);
+#ifdef DEBUG_MEM
+	cerr << "+ " << dhead << " dhead  " << head_bytes << endl;
+#endif

 	int blockllen = rows / 1024 + 1;
 	int numthreads = 1024;
@ -218,6 +224,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 		tmplen = rows + 1;
 		size2 = tmplen * sizeof(int);
 		reservar(&temp, size2);
+#ifdef DEBUG_MEM
+		cerr << "+ " << temp << " temp  select " << size2 << endl;
+#endif
 		cudaMemset(temp, 0, size2);

 		size = numselect * sizeof(int);
@ -240,6 +249,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int

 		size = head_size * sizeof(int);
 		reservar(&fres, num * size);
+#ifdef DEBUG_MEM
+		cerr << "+ " << fres << " fres select  " << num*size << endl;
+#endif
 		cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
 		llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
 		liberar(dhead, head_bytes);
@ -254,6 +266,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 			tmplen = rows + 1;
 			size2 = tmplen * sizeof(int);
 			reservar(&temp, size2);
+#ifdef DEBUG_MEM
+			cerr << "+ " << temp << " temp select  " << size2 << endl;
+#endif
 			cudaMemset(temp, 0, size2);
 			
 			size = numselfj * sizeof(int);
@ -268,6 +283,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int

 			size = head_size * sizeof(int);
 			reservar(&fres, num * size);
+#ifdef DEBUG_MEM
+			cerr << "+ " << fres << " fres select again  " << num*size << endl;
+#endif
 			cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
 			llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
 			liberar(dhead, head_bytes);
@ -279,6 +297,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
 		{
 			size = head_size * sizeof(int);
 			reservar(&fres, rows * size);
+#ifdef DEBUG_MEM
+			cerr << "+ " << fres << " fres select third  " << rows*size << endl;
+#endif
 			cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
 			proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
 			liberar(dhead, head_bytes);
--- a/packages/cuda/treeb.cu
+++ b/packages/cuda/treeb.cu
@ -234,7 +234,7 @@ __global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int
 {
 	extern __shared__ int shared[];
 	int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
-	int posr, poss, x, y, temp, ini;
+	int posr, poss, x, y, ini;

 	if(threadIdx.x < wj)
 		shared[threadIdx.x] = muljoin[threadIdx.x];
@ -258,15 +258,17 @@ __global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int
 				poss = s_cur * of2;
 			else
 				poss = sloc[s_cur] * of2;
-			ini = r_cur - count;			
-			for(x = 0; x < wj; x += 2)
+			ini = r_cur - count;	
+			for(y = ini; y < r_cur; y++)
 			{
-				posr = shared[x];
-				temp = p2[poss + shared[x+1]];
-				for(y = ini; y < r_cur; y++)
+				posr = mloc[y] * of1;
+				for(x = 0; x < wj; x += 2)
 				{
-					if(p1[mloc[y] * of1 + posr] != temp)
+					if(p1[posr + shared[x]] != p2[poss + shared[x+1]])
+					{
 						count--;
+						break;
+					}
 				}
 			}
 			if(count > 0)
@ -744,6 +746,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	int *wherej = rule->wherejoin[pos];
 	int numj = rule->numjoin[pos];
 	int flag;
+#if TIMER
+	cuda_stats.joins++;
+#endif

 	int porLiberar = rLen * of1 * sizeof(int);
 	int size, sizet, sizet2;
@ -775,7 +780,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	int dconsize = sizet * 2;*/

 	reservar(&dcons, sizet);
+#ifdef DEBUG_MEM
+	cerr << "+ " << dcons << " dcons tree  " << sizet << endl;
+#endif
 	reservar(&temp, sizet2);
+#ifdef DEBUG_MEM
+	cerr << "+ " << temp << " temp tree " << sizet2 << endl;
+#endif
 	thrust::device_ptr<int> res = thrust::device_pointer_cast(temp);

 	numthreads = 1024;
@ -784,7 +795,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	int *posR = NULL, *posS = NULL;

 	#ifdef TIMER
-	cout << "INICIO" << endl;
+	//cout << "INICIO" << endl;
 	cudaEvent_t start, stop;
 	float time;
 	cudaEventCreate(&start);
@ -839,7 +850,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 			}
 			catch(std::bad_alloc &e)
 			{
-				limpiar("inclusive scan in join");
+				limpiar("inclusive scan in join", 0);
 			}				
 		}
 		//thrust::inclusive_scan(res + 1, res + newLen, res + 1);	
@ -850,7 +861,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:

 		memSizeS = newLen * sizeof(int);
 		reservar(&d_S, memSizeS);
+#ifdef DEBUG_MEM
+		cerr << "+ " << d_S << " d_S  " << memSizeS << endl;
+#endif
 		reservar(&posS, memSizeS);
+#ifdef DEBUG_MEM
+		cerr << "+ " << posS << " posS  " << memSizeS << endl;
+#endif
 		llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
 		sLen = newLen;
 	}
@ -867,11 +884,17 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 			thrust::inclusive_scan(res + 1, res + newLen, res + 1);
 			newLen = res[sLen];
 			if(newLen == 0)
-				return 0;
+			  return 0;

 			memSizeS = newLen * sizeof(int);
 			reservar(&d_S, memSizeS);
+#ifdef DEBUG_MEM
+			cerr << "+ " << d_S << " d_S m " << memSizeS << endl;
+#endif
 			reservar(&posS, memSizeS);
+#ifdef DEBUG_MEM
+			cerr << "+ " << posS << " posS m " << memSizeS << endl;
+#endif
 			llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
 			sLen = newLen;
 		}
@ -879,6 +902,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 		{
 			memSizeS = sLen * sizeof(int);
 			reservar(&d_S, memSizeS);
+#ifdef DEBUG_MEM
+			cerr << "+ " << d_S << " d_S n " << memSizeS << endl;
+#endif
 			llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]);
 		}
 	}
@ -887,7 +913,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	cudaEventRecord(stop, 0);
 	cudaEventSynchronize(stop);
 	cudaEventElapsedTime(&time, start, stop);
-	cout << "Select1 = " << time << endl;
+	//cout << "Select1 = " << time << endl;
+	cuda_stats.select1_time += time;

 	cudaEventDestroy(start);
 	cudaEventDestroy(stop);
@ -925,7 +952,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 			m32rLen = newLen + extraspace;
 			sizem32 = m32rLen * sizeof(int);
 			reservar(&d_R, sizem32);
+#ifdef DEBUG_MEM
+			cerr << "+ " << d_R << " d_R m " << sizem32 << endl;
+#endif
 			reservar(&posR, sizem32);
+#ifdef DEBUG_MEM
+			cerr << "+ " << posR << " posR m " << sizem32 << endl;
+#endif
 			cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
 			cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
 			llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
@ -951,7 +984,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 				m32rLen = newLen + extraspace;
 				sizem32 = m32rLen * sizeof(int);
 				reservar(&d_R, sizem32);
+#ifdef DEBUG_MEM
+				cerr << "+ " << d_R << " d_R n " << sizem32 << endl;
+#endif
 				reservar(&posR, sizem32);
+#ifdef DEBUG_MEM
+				cerr << "+ " << posR << " posR n " << sizem32 << endl;
+#endif
 				cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
 				cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
 				llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
@ -961,6 +1000,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 			{
 				sizem32 = m32rLen * sizeof(int);
 				reservar(&d_R, sizem32);
+#ifdef DEBUG_MEM
+				cerr << "+ " << d_R << " d_R sizem32 " << sizem32 << endl;
+#endif
 				cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
 				llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
 			}
@ -971,6 +1013,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	{
 		sizem32 = m32rLen * sizeof(int);
 		reservar(&d_R, sizem32);
+#ifdef DEBUG_MEM
+		cerr << "+ " << d_R << " d_R sz " << sizem32 << endl;
+#endif
 		cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
 		llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
 	}
@ -979,7 +1024,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	cudaEventRecord(stop, 0);
 	cudaEventSynchronize(stop);
 	cudaEventElapsedTime(&time, start, stop);
-	cout << "Select2 = " << time << endl;
+	//cout << "Select2 = " << time << endl;
+	cuda_stats.select2_time += time;
 	#endif
 	
 	/*free(hcons);
@ -1005,6 +1051,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	if(posR == NULL)
 	{
 		reservar(&posR, sizem32);
+#ifdef DEBUG_MEM
+		cerr << "+ " << posR << " posR m32 " << sizem32 << endl;
+#endif
 		permutation = thrust::device_pointer_cast(posR);
 		thrust::sequence(permutation, permutation + m32rLen);
 	}
@ -1021,7 +1070,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 		}
 		catch(std::bad_alloc &e)
 		{
-			limpiar("inclusive scan in join");
+			limpiar("inclusive scan in join", 0);
 		}
 	}

@ -1029,7 +1078,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	cudaEventRecord(stop, 0);
 	cudaEventSynchronize(stop);
 	cudaEventElapsedTime(&time, start, stop);
-	cout << "Sort = " << time << endl;
+	//cout << "Sort = " << time << endl;
+	cuda_stats.sort_time += time;
 	
 	cudaEventDestroy(start);
 	cudaEventDestroy(stop);
@ -1071,6 +1121,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:

 	int *d_locations;
 	reservar(&d_locations, memSizeS);
+#ifdef DEBUG_MEM
+	cerr << "+ " << d_locations << " d_locs n " << memSizeS << endl;
+#endif

 	dim3 Dbs(THRD_PER_BLCK_search, 1, 1);
 	dim3 Dgs(BLCK_PER_GRID_search, 1, 1);
@ -1110,6 +1163,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 		cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
 		resSize = sum * sizepro;
 		reservar(&d_Rout, resSize);
+#ifdef DEBUG_MEM
+		cerr << "+ " << d_Rout << " d_Rout n " << resSize << endl;
+#endif
 		if(numj > 2)
 		{
 			cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
@ -1124,6 +1180,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 		cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
 		resSize = sum * sizepro;
 		reservar(&d_Rout, resSize);
+#ifdef DEBUG_MEM
+		cerr << "+ " << d_Rout << " d_Rout 2 " << resSize << endl;
+#endif
 		if(numj > 2)
 		{
 			cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
@ -1162,8 +1221,9 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 	cudaEventRecord(stop, 0);
 	cudaEventSynchronize(stop);
 	cudaEventElapsedTime(&time, start, stop);
-	cout << "Join = " << time << endl;
-	cout << "FIN" << endl;
+	//cout << "Join = " << time << endl;
+	//cout << "FIN" << endl;
+	cuda_stats.join_time += time;
 	#endif

 	return sum;
--- a/packages/cuda/union2.cu
+++ b/packages/cuda/union2.cu
@ -86,6 +86,9 @@ int unir(int *res, int rows, int tipo)
 	s3 *t3;
 	int flag, nrows;

+#if TIMER
+	cuda_stats.unions++;
+#endif
 	switch(tipo)
 	{
 		case 1: 
@ -102,7 +105,7 @@ int unir(int *res, int rows, int tipo)
 				}
 				catch(std::bad_alloc &e)
 				{
-					limpiar("sort/unique in unir");
+					limpiar("sort/unique in unir", 0);
 				}				
 			}
 			nrows = thrust::distance(pt, re);
@ -140,7 +143,7 @@ int unir(int *res, int rows, int tipo)
 				}
 				catch(std::bad_alloc &e)
 				{
-					limpiar("sort/unique in unir");
+					limpiar("sort/unique in unir", 0);
 				}				
 			}
 			nrows = thrust::distance(pt2, re2);
@ -179,7 +182,7 @@ int unir(int *res, int rows, int tipo)
 				}
 				catch(std::bad_alloc &e)
 				{
-					limpiar("sort/unique in unir");
+					limpiar("sort/unique in unir", 0);
 				}				
 			}
 			nrows = thrust::distance(pt3, re3);