cuda fixes by Carlos and other diverse (fact init, configure).

2013-10-12 12:46:01 +01:00
parent 0dc78f106b
commit 691e977a68
10 changed files with 124 additions and 24 deletions
--- a/4
+++ b/4
@@ -6969,12 +6969,12 @@ fi
  then
    ENABLE_CUDA="@# "
  else
-    ENABLE_CUDA="@# "
+    ENABLE_CUDA=""
   case "$target_os" in
     *darwin*)
       CUDA_LDFLAGS="$LDFLAGS"
    CUDA_CPPFLAGS="-shared -arch=sm_20  -Xcompiler -fPIC -O3 "
-    CUDA_SHLIB_LD="$NVCC  -Xcompiler -dynamiclib  -L../.. -lYAP "
+    CUDA_SHLIB_LD="$NVCC  -Xcompiler -dynamiclib  -L../.. -lYap "
     ;;
    **)
       CUDA_LDFLAGS="$LDFLAGS $LIBS"
--- a/configure.in
+++ b/configure.in
@@ -915,7 +915,7 @@ else
     *darwin*)
       CUDA_LDFLAGS="$LDFLAGS"
    CUDA_CPPFLAGS="-shared -arch=sm_20  -Xcompiler -fPIC -O3 "
-    CUDA_SHLIB_LD="$NVCC  -Xcompiler -dynamiclib  -L../.. -lYAP "
+    CUDA_SHLIB_LD="$NVCC  -Xcompiler -dynamiclib  -L../.. -lYap "
     ;;
    **)
       CUDA_LDFLAGS="$LDFLAGS $LIBS"
--- a/library/dialect/swi/fli/swi.h
+++ b/library/dialect/swi/fli/swi.h
@@ -85,7 +85,6 @@ in_hash(ADDR key)
 static inline atom_t
 AtomToSWIAtom(Atom at)
 {
  atom_t ats;
  TranslationEntry *p;
  if ((p = Yap_GetTranslationProp(at)) != NULL)
@@ -96,7 +95,7 @@ AtomToSWIAtom(Atom at)
 static inline Atom
 SWIAtomToAtom(atom_t at)
 {
-  if ((CELL)at < 2*N_SWI_ATOMS+1)
+  if ((CELL)at & 1)
    return SWI_Atoms[at/2];
  return (Atom)at;
 }
--- a/packages/cuda/bpreds.cu
+++ b/packages/cuda/bpreds.cu
@@ -46,7 +46,7 @@ __global__ void predicates(int *dop1, int rows, int cols, int *cons, int numc, i
 	}
 }
-int bpreds(int *dop1, int rows, int *bin, int3 numpreds, int **ret)
+int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)
 {
 	int *temp;
 	int tmplen = rows + 1;
@@ -124,6 +124,8 @@ int bpreds(int *dop1, int rows, int *bin, int3 numpreds, int **ret)
 	liberar(dhead, hsize);
 	liberar(temp, size);
 	liberar(dop1, rows * cols * sizeof(int));
 	*ret = fres;
 	return num;
 }
--- a/packages/cuda/cuda.c
+++ b/packages/cuda/cuda.c
@@ -158,6 +158,69 @@ load_facts( void ) {
  }
 }
 static int currentFact = 0;
 static predicate *currentPred = NULL;
 static int
 cuda_init_facts( void ) {
  int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
  int32_t ncols = YAP_IntOfTerm(YAP_ARG2), i = 0;
  int32_t *mat = (int32_t *)malloc(sizeof(int32_t)*nrows*ncols);
  int32_t pname = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG3));
  predicate *pred;
  if (!mat)
    return FALSE;
  if (YAP_IsVarTerm( YAP_ARG4)) {
    // new 
    pred = (predicate *)malloc(sizeof(predicate));
  } else {
    pred = (predicate *)YAP_IntOfTerm(YAP_ARG4);
    if (pred->address_host_table)
      free( pred->address_host_table );
 }
  pred->name = pname;
  pred->num_rows = nrows;
  pred->num_columns = ncols;
  pred->is_fact = TRUE;
  pred->address_host_table =  mat;
  currentPred = pred;
  currentFact = 0;
  if (YAP_IsVarTerm( YAP_ARG4)) {
    return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
  } else {
    return TRUE;
  }
 }
 static int
 cuda_load_fact( void ) {
  YAP_Term th = YAP_ARG1;
  int i, j;
  int ncols = currentPred->num_columns;
  int *mat = currentPred->address_host_table;
  i = currentFact;
  for (j = 0; j < ncols; j++) {
    YAP_Term ta = YAP_ArgOfTerm(j+1, th);
    if (YAP_IsAtomTerm(ta)) {
      mat[i*ncols+j] = YAP_AtomToInt(YAP_AtomOfTerm(ta));
    } else {
      mat[i*ncols+j] = YAP_IntOfTerm(ta);
    }
  }
  i++;
  if (i == currentPred->num_rows) {
    Cuda_NewFacts(currentPred);
    currentPred = NULL;
    currentFact = 0;
  } else {
    currentFact = i;
  }
 }
 static int
 load_rule( void ) {
  // maximum of 2K symbols per rule, should be enough for ILP
@@ -264,7 +327,8 @@ cuda_eval( void )
    }
    out = YAP_MkPairTerm(YAP_MkApplTerm( f, ncols, vec ), out);
  }
-  free( mat );
+  if (n > 0)
    free( mat );
  return YAP_Unify(YAP_ARG2, out);
 }
@@ -276,10 +340,16 @@ cuda_coverage( void )
  int32_t n = Cuda_Eval(facts, cf, rules, cr, ptr, & mat);
  int32_t ncols = ptr->num_columns;
  int32_t post = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG2));
-  int32_t i = n/2, min = 0, max = n-1, t0 = mat[0], t1 = mat[(n-1)*2];
+  int32_t i = n/2, min = 0, max = n-1;
  int32_t t0, t1;
  if (n < 0)
    return FALSE;
  if (n == 0) {
    return YAP_Unify(YAP_ARG4, YAP_MkIntTerm(0)) && 
      YAP_Unify(YAP_ARG3, YAP_MkIntTerm(0));
  }
  t0 = mat[0], t1 = mat[(n-1)*2];
  if (t0 == t1) { /* all sametype */
    free( mat );
    /* all pos */
@@ -337,6 +407,8 @@ init_cuda(void)
  AtomLe = YAP_LookupAtom("=<");
  AtomDf = YAP_LookupAtom("\\=");
  YAP_UserCPredicate("load_facts", load_facts, 4);
  YAP_UserCPredicate("cuda_init_facts", cuda_init_facts, 4);
  YAP_UserCPredicate("cuda_load_fact", cuda_load_fact, 1);
  YAP_UserCPredicate("load_rule", load_rule, 4);
  YAP_UserCPredicate("cuda_erase", cuda_erase, 1);
  YAP_UserCPredicate("cuda_eval", cuda_eval, 2);
--- a/packages/cuda/cuda.yap
+++ b/packages/cuda/cuda.yap
@@ -21,9 +21,21 @@ cuda_inline(P, Q) :-
 cuda_extensional( Call, IdFacts) :-
 	strip_module(Call, Mod, Name/Arity),
 	functor(S, Name, Arity),
-	findall( S, Mod:S, L),
+	count_answers( Mod:S, N),
-	length( L, N ),
+	% reserve space
-	load_facts( N, Arity, L, IdFacts ).
+	cuda_init_facts( N, Arity, Name, IdFacts ),
 	% fill it out
 	( Mod:S, cuda_load_fact(S), fail ; true ).
 count_answers(G, N) :-
        S = count(0),
        ( 
            G,
            arg(1, S, I0),
            I is I0+1,
            nb_setarg(1, S, I),
            fail ;
            S = count(N) ).
 cuda_rule((Head :- Body) , IdRules) :-
 	body_to_list( Body, L, [], 1, N),
--- a/packages/cuda/lista.cu
+++ b/packages/cuda/lista.cu
@@ -264,7 +264,7 @@ int2 columnsproject(int *first, int tam, int *rule, int ini, int fin, int sini,
 		temp = first[x];
 		for(y = 0; y < ini; y++)
 		{
-			if(temp == rule[y])
+			if(temp == rule[y] && temp > 0) /*added condition to avoid constants*/
 			{
 				if(notin(temp, pv, ret.y))
 				{
@@ -279,7 +279,7 @@ int2 columnsproject(int *first, int tam, int *rule, int ini, int fin, int sini,
 			continue;
 		for(y = sfin + 1; y < fin; y++)
 		{
-			if(temp == rule[y])
+			if(temp == rule[y] && temp > 0)
 			{
 				if(notin(temp, pv, ret.y))
 				{
@@ -298,7 +298,7 @@ int2 columnsproject(int *first, int tam, int *rule, int ini, int fin, int sini,
 		temp = rule[x];
 		for(y = 0; y < ini; y++)
 		{
-			if(temp == rule[y])
+			if(temp == rule[y] && temp > 0)
 			{
 				if(notin(temp, pv, ret.y))
 				{
@@ -313,7 +313,7 @@ int2 columnsproject(int *first, int tam, int *rule, int ini, int fin, int sini,
 			continue;
 		for(y = sfin + 1; y < fin; y++)
 		{
-			if(temp == rule[y])
+			if(temp == rule[y] && temp > 0)
 			{
 				if(notin(temp, pv, ret.y))
 				{
@@ -1083,6 +1083,7 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 				//cout << "resrows = " << res_rows << endl;
 			}
 			if(x == num_refs)
 			{
 				if(rul_act->num_bpreds.x > 0) /*Built-in predicates*/
@@ -1094,7 +1095,7 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 					cudaEventRecord(start3, 0);
 					#endif					
-					res_rows = bpreds(res, res_rows, rul_act->builtin, rul_act->num_bpreds, &res);
+					res_rows = bpreds(res, res_rows, rul_act->num_columns, rul_act->builtin, rul_act->num_bpreds, &res);
 					#ifdef TIMER
 					cudaEventRecord(stop3, 0);
@@ -1115,7 +1116,7 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 				cudaEventRecord(start2, 0);
 				#endif
-				res_rows = unir(res, res_rows, rul_act->num_columns, &res); /*Duplicate Elimination*/
+				res_rows = unir(res, res_rows, rul_act->num_columns); /*Duplicate Elimination*/
 				#ifdef TIMER
 				cudaEventRecord(stop2, 0);
@@ -1223,6 +1224,7 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 		itr++;
 	}
 	tmprule.name = qname;
 	qposr = lower_bound(rul_str, fin, tmprule, comparer);
 	if(qposr != fin && qposr->name == qname) 
@@ -1246,14 +1248,19 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
 			res = dop1;
 			res_rows = rows1;
 		}
-		else		
+		else
 		{		
 			res_rows = selectproyect(dop1, rows1, cols1, tmprule.num_columns, tmprule.select[0], tmprule.numsel[0], tmprule.selfjoin[0], tmprule.numselfj[0], tmprule.project[0], &res);
 			if(qposr != fin && qposr->name == qname)
 				cudaFree(dop1);
 		}
 		cols1 = tmprule.num_columns;
 		tipo = res_rows * cols1 * sizeof(int);
 		hres = (int *)malloc(tipo);
 		cudaMemcpy(hres, res, tipo, cudaMemcpyDeviceToHost);
-		cudaFree(res);
+		if(res_rows > 0 && tmprule.numsel[0] != 0 && tmprule.numselfj[0] != 0)
 			cudaFree(res);
 	}
 	else
 		res_rows = 0;
--- a/packages/cuda/memory.cu
+++ b/packages/cuda/memory.cu
@@ -187,13 +187,21 @@ void reservar(int **ptr, int size)
  //cudaMemGetInfo(      &free, &total	 );
  // cerr << "R " << free << " " << size << endl;
-
+        if (size == 0) { 
                *ptr = NULL; 
                return;
        }
 	while(avmem < size)
 		limpiar("not enough memory");
 	while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
 		limpiar("error in memory allocation");
-	if (! *ptr )
+	if (! *ptr ) {
-	  exit(0);
+	  size_t free, total;
 	  cudaMemGetInfo(      &free, &total	 );
 	  cerr << "Could not allocate " << size << " bytes, only " << free << " avaliable from total of " << total << " !!!" << endl;
 	  cerr << "Exiting CUDA...." << endl;
 	  exit(1);
 	}
 	// cerr << *ptr << " " << size;
 	avmem -= size;
--- a/packages/cuda/treeb.cu
+++ b/packages/cuda/treeb.cu
@@ -1126,7 +1126,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
 		reservar(&d_Rout, resSize);
 		if(numj > 2)
 		{
-			cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
+			cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
 			multiJoinWithWrite<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
 		}
 		else
--- a/packages/cuda/union2.cu
+++ b/packages/cuda/union2.cu
@@ -77,7 +77,7 @@ struct o3
    	}
 };
-int unir(int *res, int rows, int tipo, int **ret)
+int unir(int *res, int rows, int tipo)
 {
 	thrust::device_ptr<int> pt, re;
 	thrust::device_ptr<s2> pt2, re2;