cuda package from Carlos

Vítor Santos Costa 2013-10-07 12:20:00 +01:00
parent fcaabd2c0b
commit 726d7ca1cc
12 changed files with 3677 additions and 150 deletions


@ -1,4 +1,4 @@
#
# default base directory for YAP installation
# (EROOT for architecture-dependent files)
#
@ -702,6 +702,7 @@ all: startup.yss
@INSTALL_DLLS@ (cd library/random; $(MAKE))
@INSTALL_DLLS@ (cd library/regex; $(MAKE))
@INSTALL_DLLS@ (cd library/rltree; $(MAKE))
@INSTALL_DLLS@ (cd packages/yap-lbfgs; $(MAKE))
@ENABLE_WINCONSOLE@ (cd swi/console; $(MAKE))
@INSTALL_DLLS@ (cd library/system; $(MAKE))
@INSTALL_DLLS@ (cd library/tries; $(MAKE))
@ -729,6 +730,7 @@ all: startup.yss
@ENABLE_CUDD@ (cd packages/ProbLog/simplecudd_lfi; $(MAKE))
@ENABLE_JPL@ @INSTALL_DLLS@ (cd packages/jpl; $(MAKE))
@ENABLE_PYTHON@ @INSTALL_DLLS@ (cd packages/python; $(MAKE))
@ENABLE_CUDA@ @INSTALL_DLLS@ (cd packages/cuda; $(MAKE))
startup.yss: yap@EXEC_SUFFIX@ $(PL_SOURCES)
-rm -f startup.yss
@ -775,6 +777,7 @@ install_unix: startup.yss libYap.a
@INSTALL_DLLS@ (cd library/random; $(MAKE) install)
@INSTALL_DLLS@ (cd library/regex; $(MAKE) install)
@INSTALL_DLLS@ (cd library/rltree; $(MAKE) install)
@INSTALL_DLLS@ (cd packages/yap-lbfgs; $(MAKE) install)
@INSTALL_DLLS@ (cd library/system; $(MAKE) install)
@INSTALL_DLLS@ (cd library/tries; $(MAKE) install)
@ENABLE_GECODE@ @INSTALL_DLLS@ (cd library/gecode; $(MAKE) install)
@ -799,6 +802,7 @@ install_unix: startup.yss libYap.a
@ENABLE_REAL@ @INSTALL_DLLS@ (cd packages/real; $(MAKE) install)
@ENABLE_JPL@ @INSTALL_DLLS@ (cd packages/jpl; $(MAKE) install)
@ENABLE_PYTHON@ @INSTALL_DLLS@ (cd packages/python; $(MAKE) install)
@ENABLE_CUDA@ @INSTALL_DLLS@ (cd packages/cuda; $(MAKE) install)
mkdir -p $(DESTDIR)$(INCLUDEDIR)
mkdir -p $(DESTDIR)$(INCLUDEDIR)/src
$(INSTALL) $(HEADERS) $(DESTDIR)$(INCLUDEDIR)/src
@ -813,6 +817,7 @@ install_unix: startup.yss libYap.a
@ENABLE_BDDLIB@ @INSTALL_DLLS@ (cd packages/bdd; $(MAKE) install)
@ENABLE_CUDD@ (cd packages/ProbLog/simplecudd; $(MAKE) install)
@ENABLE_CUDD@ (cd packages/ProbLog/simplecudd_lfi; $(MAKE) install)
@ENABLE_CUDA@ @INSTALL_DLLS@ (cd packages/cuda; $(MAKE) install)
install_win32: startup.yss @ENABLE_WINCONSOLE@ yap-win@EXEC_SUFFIX@
@ -843,6 +848,7 @@ install_win32: startup.yss @ENABLE_WINCONSOLE@ yap-win@EXEC_SUFFIX@
(cd library/regex; $(MAKE) install)
(cd library/rltree; $(MAKE) install)
(cd library/system; $(MAKE) install)
@INSTALL_DLLS@ (cd packages/yap-lbfgs; $(MAKE) install)
@ENABLE_WINCONSOLE@ (cd swi/console; $(MAKE) install)
@INSTALL_MATLAB@ (cd library/matlab; $(MAKE) install)
@ENABLE_REAL@ (cd packages/real; $(MAKE) install)
@ -871,6 +877,7 @@ install_win32: startup.yss @ENABLE_WINCONSOLE@ yap-win@EXEC_SUFFIX@
@ENABLE_BDDLIB@ (cd packages/bdd; $(MAKE) install)
@ENABLE_CUDD@ (cd packages/ProbLog/simplecudd; $(MAKE) install)
@ENABLE_CUDD@ (cd packages/ProbLog/simplecudd_lfi; $(MAKE) install)
@ENABLE_CUDA@ @INSTALL_DLLS@ (cd packages/cuda; $(MAKE) install)
install_library: @YAPLIB@
mkdir -p $(DESTDIR)$(INCLUDEDIR)
@ -914,6 +921,7 @@ clean: clean_docs
@INSTALL_DLLS@ (cd library/random; $(MAKE) clean)
@INSTALL_DLLS@ (cd library/regex; $(MAKE) clean)
@INSTALL_DLLS@ (cd library/rltree; $(MAKE) clean)
@INSTALL_DLLS@ (cd packages/yap-lbfgs; $(MAKE) clean)
@ENABLE_WINCONSOLE@ (cd swi/console; $(MAKE) clean)
@INSTALL_DLLS@ (cd library/system; $(MAKE) clean)
@INSTALL_DLLS@ (cd library/tries; $(MAKE) clean)
@ -941,6 +949,7 @@ clean: clean_docs
@ENABLE_CUDD@ (cd packages/ProbLog/simplecudd_lfi; $(MAKE) clean)
@ENABLE_JPL@ @INSTALL_DLLS@ (cd packages/jpl; $(MAKE) clean)
@ENABLE_PYTHON@ @INSTALL_DLLS@ (cd packages/python; $(MAKE) clean)
@ENABLE_CUDA@ @INSTALL_DLLS@ (cd packages/cuda; $(MAKE) clean)

configure vendored

@ -705,6 +705,8 @@ ENABLE_CLIB
ENABLE_CHR
CUDA_LDFLAGS
CUDA_CPPFLAGS
CUDA_SHLIB_LD
ENABLE_CUDA
NO_BUILTIN_REGEXP
YAP_EXTRAS
SONAMEFLAG
@ -735,12 +737,12 @@ CPLINT_CFLAGS
CPLINT_LIBS
ENABLE_PRISM
ENABLE_GECODE
NVCC
PYTHON
REXE
INSTALL_INFO
MPI_CC
AR
INDENT
RANLIB
INSTALL_DATA
INSTALL_SCRIPT
@ -855,6 +857,7 @@ with_R
with_python
with_judy
with_minisat
with_cuda
with_cudd
enable_myddas
enable_myddas_stats
@ -1538,6 +1541,7 @@ Optional Packages:
--with-python=DIR interface to Python
--with-judy=DIR UDI needs judy library
--enable-minisat use minisat interface
--with-cuda=DIR use CUDA package in DIR
--with-cudd=DIR use CUDD package in DIR
--with-java=JAVA_HOME use Java installation in JAVA_HOME
--with-readline=DIR use GNU Readline Library in DIR
@ -4755,6 +4759,21 @@ fi
# Check whether --with-cuda was given.
if test "${with_cuda+set}" = set; then :
withval=$with_cuda; if test "$withval" = yes; then
yap_cv_cuda=/usr
elif test "$withval" = no; then
yap_cv_cuda=no
else
yap_cv_cuda="$withval"
fi
else
yap_cv_cuda=no
fi
# Check whether --with-cudd was given.
if test "${with_cudd+set}" = set; then :
withval=$with_cudd; yap_cv_cudd="$withval"
@ -5151,9 +5170,6 @@ else
ENABLE_CUDD=""
fi
CUDA_LDFLAGS=""
CUDA_CPPFLAGS=""
if test "$use_condor" = yes
@ -5459,98 +5475,6 @@ else
RANLIB="$ac_cv_prog_RANLIB"
fi
if test -n "$ac_tool_prefix"; then
# Extract the first word of "${ac_tool_prefix}indent", so it can be a program name with args.
set dummy ${ac_tool_prefix}indent; ac_word=$2
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
$as_echo_n "checking for $ac_word... " >&6; }
if ${ac_cv_prog_INDENT+:} false; then :
$as_echo_n "(cached) " >&6
else
if test -n "$INDENT"; then
ac_cv_prog_INDENT="$INDENT" # Let the user override the test.
else
as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
for as_dir in $PATH
do
IFS=$as_save_IFS
test -z "$as_dir" && as_dir=.
for ac_exec_ext in '' $ac_executable_extensions; do
if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
ac_cv_prog_INDENT="${ac_tool_prefix}indent"
$as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
break 2
fi
done
done
IFS=$as_save_IFS
fi
fi
INDENT=$ac_cv_prog_INDENT
if test -n "$INDENT"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $INDENT" >&5
$as_echo "$INDENT" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi
fi
if test -z "$ac_cv_prog_INDENT"; then
ac_ct_INDENT=$INDENT
# Extract the first word of "indent", so it can be a program name with args.
set dummy indent; ac_word=$2
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
$as_echo_n "checking for $ac_word... " >&6; }
if ${ac_cv_prog_ac_ct_INDENT+:} false; then :
$as_echo_n "(cached) " >&6
else
if test -n "$ac_ct_INDENT"; then
ac_cv_prog_ac_ct_INDENT="$ac_ct_INDENT" # Let the user override the test.
else
as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
for as_dir in $PATH
do
IFS=$as_save_IFS
test -z "$as_dir" && as_dir=.
for ac_exec_ext in '' $ac_executable_extensions; do
if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
ac_cv_prog_ac_ct_INDENT="indent"
$as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
break 2
fi
done
done
IFS=$as_save_IFS
fi
fi
ac_ct_INDENT=$ac_cv_prog_ac_ct_INDENT
if test -n "$ac_ct_INDENT"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_INDENT" >&5
$as_echo "$ac_ct_INDENT" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi
if test "x$ac_ct_INDENT" = x; then
INDENT=":"
else
case $cross_compiling:$ac_tool_warned in
yes:)
{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
ac_tool_warned=yes ;;
esac
INDENT=$ac_ct_INDENT
fi
else
INDENT="$ac_cv_prog_INDENT"
fi
if test -n "$ac_tool_prefix"; then
# Extract the first word of "${ac_tool_prefix}ar", so it can be a program name with args.
set dummy ${ac_tool_prefix}ar; ac_word=$2
@ -6994,6 +6918,68 @@ fi
fi
CUDA_LDFLAGS=""
CUDA_CPPFLAGS=""
if test "$yap_cv_cuda" = no
then
ENABLE_CUDA="@# "
else
# Extract the first word of "nvcc", so it can be a program name with args.
set dummy nvcc; ac_word=$2
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
$as_echo_n "checking for $ac_word... " >&6; }
if ${ac_cv_path_NVCC+:} false; then :
$as_echo_n "(cached) " >&6
else
case $NVCC in
[\\/]* | ?:[\\/]*)
ac_cv_path_NVCC="$NVCC" # Let the user override the test with a path.
;;
*)
as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
for as_dir in $yap_cv_cuda/bin
do
IFS=$as_save_IFS
test -z "$as_dir" && as_dir=.
for ac_exec_ext in '' $ac_executable_extensions; do
if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
ac_cv_path_NVCC="$as_dir/$ac_word$ac_exec_ext"
$as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
break 2
fi
done
done
IFS=$as_save_IFS
test -z "$ac_cv_path_NVCC" && ac_cv_path_NVCC="no"
;;
esac
fi
NVCC=$ac_cv_path_NVCC
if test -n "$NVCC"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $NVCC" >&5
$as_echo "$NVCC" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi
if test "$yap_cv_cuda" = no
then
ENABLE_CUDA="@# "
else
ENABLE_CUDA="@# "
case "$target_os" in
*darwin*)
CUDA_LDFLAGS="$LDFLAGS"
CUDA_CPPFLAGS="-shared -arch=sm_20 -Xcompiler -fPIC -O3 "
CUDA_SHLIB_LD="$NVCC -Xcompiler -dynamiclib"
;;
esac
fi
fi
if test "$yap_cv_myddas" != "no"
then
@ -9819,6 +9805,9 @@ CMDEXT=sh


@ -279,6 +279,17 @@ AC_ARG_WITH(minisat,
fi,
[yap_cv_minisat=yes])
AC_ARG_WITH(cuda,
[ --with-cuda[=DIR] use CUDA package in DIR],
if test "$withval" = yes; then
yap_cv_cuda=/usr
elif test "$withval" = no; then
yap_cv_cuda=no
else
yap_cv_cuda="$withval"
fi,
[yap_cv_cuda=no])
AC_ARG_WITH(cudd,
[ --with-cudd[=DIR] use CUDD package in DIR],
yap_cv_cudd="$withval",
@ -567,9 +578,6 @@ else
ENABLE_CUDD=""
fi
CUDA_LDFLAGS=""
CUDA_CPPFLAGS=""
dnl condor universe does not like dynamic linking on Linux, DEC, and HP-UX platforms.
@ -688,7 +696,6 @@ dnl Checks for programs.
AC_PROG_LN_S
AC_PROG_INSTALL
AC_PROG_RANLIB
AC_CHECK_TOOL(INDENT,indent,:)
AC_CHECK_TOOL(AR,ar,:)
AC_CHECK_TOOL(MPI_CC,mpicc,${CC})
AC_PATH_PROG(INSTALL_INFO,install-info,true,$PATH:/sbin:/usr/sbin:/usr/etc:/usr/local/sbin)
@ -892,6 +899,28 @@ if test "$yap_cv_judy" != "no"; then
AC_CHECK_LIB(Judy, Judy1Set,,[AC_MSG_RESULT([libJudy not found, UDI will only work with one Index at a time])])
fi
CUDA_LDFLAGS=""
CUDA_CPPFLAGS=""
if test "$yap_cv_cuda" = no
then
ENABLE_CUDA="@# "
else
AC_PATH_PROG(NVCC, [nvcc], [no], [$yap_cv_cuda/bin])
if test "$yap_cv_cuda" = no
then
ENABLE_CUDA="@# "
else
ENABLE_CUDA="@# "
case "$target_os" in
*darwin*)
CUDA_LDFLAGS="$LDFLAGS"
CUDA_CPPFLAGS="-shared -arch=sm_20 -Xcompiler -fPIC -O3 "
CUDA_SHLIB_LD="$NVCC -Xcompiler -dynamiclib"
;;
esac
fi
fi
dnl if test "$yap_cv_cudd" != "no"
dnl then
dnl AC_CHECK_LIB(cudd,Cudd_Init)
@ -1844,6 +1873,9 @@ AC_SUBST(INSTALL_INFO)
dnl let YAP_EXTRAS fall through configure, from the env into Makefile
AC_SUBST(YAP_EXTRAS)
AC_SUBST(NO_BUILTIN_REGEXP)
AC_SUBST(ENABLE_CUDA)
AC_SUBST(NVCC)
AC_SUBST(CUDA_SHLIB_LD)
AC_SUBST(CUDA_CPPFLAGS)
AC_SUBST(CUDA_LDFLAGS)
AC_SUBST(ENABLE_CHR)


@ -20,7 +20,9 @@ YAPLIBDIR=@libdir@/Yap
#
#
CC=@CC@
NVCC=@NVCC@
CFLAGS= @SHLIB_CFLAGS@ $(YAP_EXTRAS) $(DEFS) -I$(srcdir) -I../.. -I$(srcdir)/../../include @CUDA_CPPFLAGS@
NVCCFLAGS=@CUDA_CPPFLAGS@ -I$(srcdir) -I../.. -I$(srcdir)/../../include
LDFLAGS=@LDFLAGS@
#
#
@ -40,7 +42,7 @@ CWD=$(PWD)
BDD_PROLOG= \
$(srcdir)/cuda.yap
OBJS=cuda.o
OBJS=cuda.o memory.o lista.o
SOBJS=cuda.@SO@
#in some systems we just create a single object, in others we need to
@ -48,14 +50,17 @@ SOBJS=cuda.@SO@
all: $(SOBJS)
cuda.o: $(srcdir)/cuda.c
$(CC) -c $(CFLAGS) $(srcdir)/cuda.c -o cuda.o
cuda.o: $(srcdir)/cuda.c $(srcdir)/pred.h
$(NVCC) -c $(NVCCFLAGS) $(srcdir)/cuda.c -o cuda.o
@DO_SECOND_LD@%.@SO@: %.o
@DO_SECOND_LD@ @SHLIB_LD@ $(LDFLAGS) -o $@ $< @EXTRA_LIBS_FOR_DLLS@ @CUDA_LDFLAGS@
lista.o: $(srcdir)/lista.cu $(srcdir)/pred.h
$(NVCC) -c $(NVCCFLAGS) $(srcdir)/lista.cu -o lista.o
@DO_SECOND_LD@cuda.@SO@: cuda.o
@DO_SECOND_LD@ @SHLIB_LD@ $(LDFLAGS) -o cuda.@SO@ cuda.o @EXTRA_LIBS_FOR_DLLS@ @CUDA_LDFLAGS@
memory.o: $(srcdir)/memory.cu $(srcdir)/pred.h
$(NVCC) -c $(NVCCFLAGS) $(srcdir)/memory.cu -o memory.o
@DO_SECOND_LD@cuda.@SO@: $(OBJS)
@DO_SECOND_LD@ @CUDA_SHLIB_LD@ $(CUDA_LDFLAGS) -o cuda.@SO@ $(OBJS) -L../.. -lYAP
install: all
mkdir -p $(DESTDIR)$(SHAREDIR)

packages/cuda/bpreds.cu Normal file

@ -0,0 +1,131 @@
#define SBG_EQ (-1)
#define SBG_GT (-2)
#define SBG_LT (-3)
#define SBG_GE (-4)
#define SBG_LE (-5)
#define SBG_DF (-6)
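/* Each builtin comparison arrives in cons[] as a triple (op, lhs, rhs):
   op is one of the SBG_* codes above; a negative operand is an integer
   literal stored negated, a non-negative one is a column offset into
   the current row of dop1. */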
__global__ void predicates(int *dop1, int rows, int cols, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int x, rowact, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
__syncthreads();
if(id < rows)
{
rowact = id * cols;
for(x = 0; x < numc; x += 3)
{
op1 = shared[x+1];
if(op1 < 0)
op1 *= -1;
else
op1 = dop1[rowact + op1];
op2 = shared[x+2];
if(op2 < 0)
op2 *= -1;
else
op2 = dop1[rowact + op2];
switch(shared[x])
{
	case SBG_EQ: if(op1 != op2)
			return;
		break;
	case SBG_GT: if(op1 <= op2)
			return;
		break;
	case SBG_LT: if(op1 >= op2)
			return;
		break;
	case SBG_GE: if(op1 < op2)
			return;
		break;
	case SBG_LE: if(op1 > op2)
			return;
		break;
	case SBG_DF: if(op1 == op2)
			return;
		break;
}
}
res[id] = 1;
}
}
int bpreds(int *dop1, int rows, int *bin, int3 numpreds, int **ret)
{
int *temp;
int tmplen = rows + 1;
int size = tmplen * sizeof(int);
reservar(&temp, size);
cudaMemset(temp, 0, size);
int *dhead;
int predn = numpreds.x * 3;
int spredn = predn * sizeof(int);
int sproj = numpreds.z * sizeof(int);
int hsize;
if(predn > numpreds.z)
hsize = spredn;
else
hsize = sproj;
reservar(&dhead, hsize);
cudaMemcpy(dhead, bin, spredn, cudaMemcpyHostToDevice);
int blockllen = rows / 1024 + 1;
int numthreads = 1024;
/*int x;
cout << "arraypreds = ";
for(x = 0; x < predn; x++)
cout << bin[x] << " ";
cout << endl;
cout << "temptable = ";
for(x = 0; x < numpreds.z; x++)
cout << bin[x+predn] << " ";
cout << endl;
int y;
int *hop1 = (int *)malloc(numpreds.y * rows * sizeof(int));
cudaMemcpy(hop1, dop1, numpreds.y * rows * sizeof(int), cudaMemcpyDeviceToHost);
for(x = 0; x < rows; x++)
{
for(y = 0; y < numpreds.y; y++)
cout << hop1[x * numpreds.y + y] << " ";
cout << endl;
}
free(hop1);*/
predicates<<<blockllen, numthreads, spredn>>>(dop1, rows, numpreds.y, dhead, predn, temp + 1);
/*int y;
int *hop1 = (int *)malloc((rows + 1) * sizeof(int));
cudaMemcpy(hop1, temp, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost);
for(x = 0; x < (rows + 1); x++)
cout << hop1[x] << " ";
cout << endl;
free(hop1);*/
thrust::device_ptr<int> res;
res = thrust::device_pointer_cast(temp);
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
int num = res[rows];
if(num == 0)
{
	liberar(dhead, hsize);
	liberar(temp, size);
	return 0;
}
int *fres;
reservar(&fres, num * sproj);
cudaMemcpy(dhead, bin + predn, sproj, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, sproj>>>(dop1, rows, numpreds.y, temp, dhead, numpreds.z, fres);
/*int y;
int *hop1 = (int *)malloc(numpreds.z * num * sizeof(int));
cudaMemcpy(hop1, fres, numpreds.z * num * sizeof(int), cudaMemcpyDeviceToHost);
for(x = 0; x < num; x++)
{
for(y = 0; y < numpreds.z; y++)
cout << hop1[x * numpreds.z + y] << " ";
cout << endl;
}
free(hop1);*/
liberar(dhead, hsize);
liberar(temp, size);
*ret = fres;
return num;
}
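bpreds() above, like selectproyect() further down, follows the standard mark / prefix-sum / scatter recipe for stream compaction on the GPU: one kernel writes a 1 into temp[i+1] for every surviving row, thrust::inclusive_scan turns those flags into output positions, and llenarproyectar scatters the projected columns into a dense result. A minimal, self-contained sketch of that recipe (the names and data here are illustrative, not part of the commit):

#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/scan.h>
#include <iostream>

int main()
{
	// flags[i] == 1 when row i survives the predicate tests
	int flags[8] = {1, 0, 1, 1, 0, 0, 1, 0};
	thrust::device_vector<int> pos(9, 0);            // leading zero, like temp[0]
	thrust::copy(flags, flags + 8, pos.begin() + 1);

	// After the scan, pos[i] != pos[i+1] means row i was marked and
	// pos[i+1]-1 is its slot in the compacted output.
	thrust::inclusive_scan(pos.begin() + 1, pos.end(), pos.begin() + 1);

	int total = pos[8];                              // rows in the result
	std::cout << "surviving rows: " << total << std::endl;
	for(int i = 0; i < 8; i++)
	{
		int a = pos[i], b = pos[i + 1];
		if(a != b)
			std::cout << "row " << i << " -> slot " << b - 1 << std::endl;
	}
	return 0;
}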


@ -4,38 +4,36 @@
#include "YapInterface.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "pred.h"
typedef struct predicate_struct {
int name;
int num_rows;
int num_columns;
int is_fact;
int *address_host_table;
} predicate;
predicate *facts[100]; /*Temporary solution to maintain facts and rules*/
predicate *rules[100];
int32_t cf = 0, cr = 0;
// initialize CUDA system
void Cuda_Initialize( void );
// add/replace a set of facts for predicate pred
int Cuda_NewFacts(predicate *pred);
int32_t Cuda_NewFacts(predicate *pred);
// add/replace a rule for predicate pred
int Cuda_NewRule(predicate *pred);
int32_t Cuda_NewRule(predicate *pred);
// erase predicate pred
int Cuda_Erase(predicate *pred);
int32_t Cuda_Erase(predicate *pred);
// evaluate predicate pred, mat is bound to a vector of solutions, and
// output the count
int Cuda_Eval(predicate *pred, int **mat);
//int32_t Cuda_Eval(predicate *pred, int32_t **mat); this function's arguments were changed, see pred.h
void init_cuda( void );
static void
dump_mat(int mat[], int nrows, int ncols)
dump_mat(int32_t mat[], int32_t nrows, int32_t ncols)
{
int i, j;
int32_t i, j;
for ( i=0; i< nrows; i++) {
printf("%d", mat[i*ncols]);
for (j=1; j < ncols; j++) {
@ -46,10 +44,10 @@ dump_mat(int mat[], int nrows, int ncols)
}
static void
dump_vec(int vec[], int rows)
dump_vec(int32_t vec[], int32_t rows)
{
int i = 1;
int j = 0;
int32_t i = 1;
int32_t j = 0;
printf("%d", vec[0]);
for (j = 0; j < rows; j++) {
for ( ; vec[i]; i++ ) {
@ -67,19 +65,23 @@ void Cuda_Initialize( void )
{
}
int Cuda_NewFacts(predicate *pe)
int32_t Cuda_NewFacts(predicate *pe)
{
dump_mat( pe->address_host_table, pe->num_rows, pe->num_columns );
facts[cf] = pe;
cf++;
return TRUE;
}
int Cuda_NewRule(predicate *pe)
int32_t Cuda_NewRule(predicate *pe)
{
dump_vec( pe->address_host_table, pe->num_rows);
rules[cr] = pe;
cr++;
return TRUE;
}
int Cuda_Erase(predicate *pe)
int32_t Cuda_Erase(predicate *pe)
{
if (pe->address_host_table)
free( pe->address_host_table );
@ -90,15 +92,15 @@ int Cuda_Erase(predicate *pe)
static int
load_facts( void ) {
int nrows = YAP_IntOfTerm(YAP_ARG1);
int ncols = YAP_IntOfTerm(YAP_ARG2), i = 0;
int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
int32_t ncols = YAP_IntOfTerm(YAP_ARG2), i = 0;
YAP_Term t3 = YAP_ARG3;
int *mat = (int *)malloc(sizeof(int)*nrows*ncols);
int pname = YAP_AtomToInt(YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3))));
int32_t *mat = (int32_t *)malloc(sizeof(int32_t)*nrows*ncols);
int32_t pname = YAP_AtomToInt(YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3))));
predicate *pred;
while(YAP_IsPairTerm(t3)) {
int j = 0;
int32_t j = 0;
YAP_Term th = YAP_HeadOfTerm(t3);
for (j = 0; j < ncols; j++) {
@ -136,28 +138,28 @@ load_facts( void ) {
static int
load_rule( void ) {
// maximum of 2K symbols per rule, should be enough for ILP
int vec[2048], *ptr = vec, *nvec;
int32_t vec[2048], *ptr = vec, *nvec;
// up to 1K different variables;
YAP_Term vars[1024];
int nvars = 0;
int ngoals = YAP_IntOfTerm(YAP_ARG1); /* gives the number of goals */
int ncols = YAP_IntOfTerm(YAP_ARG2);
int32_t nvars = 0;
int32_t ngoals = YAP_IntOfTerm(YAP_ARG1); /* gives the number of goals */
int32_t ncols = YAP_IntOfTerm(YAP_ARG2);
YAP_Term t3 = YAP_ARG3;
int pname = YAP_AtomToInt(YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3))));
int32_t pname = YAP_AtomToInt(YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3))));
predicate *pred;
while(YAP_IsPairTerm(t3)) {
int j = 0;
int32_t j = 0;
YAP_Term th = YAP_HeadOfTerm(t3);
YAP_Functor f = YAP_FunctorOfTerm( th );
int n = YAP_ArityOfFunctor( f );
int32_t n = YAP_ArityOfFunctor( f );
*ptr++ = YAP_AtomToInt( YAP_NameOfFunctor( f ) );
for (j = 0; j < n; j++) {
YAP_Term ta = YAP_ArgOfTerm(j+1, th);
if (YAP_IsVarTerm(ta)) {
int k;
int32_t k;
for (k = 0; k< nvars; k++) {
if (vars[k] == ta) {
*ptr++ = k+1;
@ -190,8 +192,8 @@ load_rule( void ) {
pred->num_rows = ngoals;
pred->num_columns = ncols;
pred->is_fact = FALSE;
nvec = (int *)malloc(sizeof(int)*(ptr-vec));
memcpy(nvec, vec, sizeof(int)*(ptr-vec));
nvec = (int32_t *)malloc(sizeof(int32_t)*(ptr-vec));
memcpy(nvec, vec, sizeof(int32_t)*(ptr-vec));
pred->address_host_table = nvec;
Cuda_NewRule( pred );
return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
@ -207,19 +209,19 @@ cuda_erase( void )
static int
cuda_eval( void )
{
int *mat;
int32_t *mat;
predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
int n = Cuda_Eval( ptr, & mat);
int ncols = ptr->num_columns;
int32_t n = Cuda_Eval(facts, cf, rules, cr, ptr, & mat);
int32_t ncols = ptr->num_columns;
YAP_Term out = YAP_TermNil();
YAP_Functor f = YAP_MkFunctor(YAP_IntToAtom(ptr->name), ncols);
YAP_Term vec[256];
int i;
int32_t i;
if (n < 0)
return FALSE;
for (i=0; i<n; i++) {
int ni = ((n-1)-i)*ncols, j;
int32_t ni = ((n-1)-i)*ncols, j;
for (j=0; j<ncols; j++) {
vec[i] = YAP_MkIntTerm(mat[ni+j]);
}
@ -230,9 +232,9 @@ cuda_eval( void )
static int cuda_count( void )
{
int *mat;
int32_t *mat;
predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
int n = Cuda_Eval( ptr, & mat);
int32_t n = Cuda_Eval(facts, cf, rules, cr, ptr, & mat);
if (n < 0)
return FALSE;

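pred.h is new in this commit but its hunk is not shown on this page; judging only from the call sites above, the reworked entry point presumably has a shape like the following (an inferred sketch, not the actual header):

/* Inferred from Cuda_Eval(facts, cf, rules, cr, ptr, &mat) above; the
   authoritative prototype lives in pred.h. */
int32_t Cuda_Eval(predicate *facts[], int32_t num_facts,
                  predicate *rules[], int32_t num_rules,
                  predicate *query, int32_t **result);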
packages/cuda/lista.cu Normal file

File diff suppressed because it is too large.

packages/cuda/memory.cu Normal file

@ -0,0 +1,482 @@
#include <list>
#include <iostream>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <thrust/device_vector.h>
#include "lista.h"
#include "memory.h"
#define MAX_REC 200
#define HALF_REC (MAX_REC / 2)
#define MAX_FIX_POINTS 100
size_t avmem; /* device memory still available, in bytes */
memnode temp_storage[MAX_REC];
list<memnode> GPUmem;
list<memnode> CPUmem;
bool compareiteration(const memnode &r1, const memnode &r2)
{
return (r1.iteration < r2.iteration);
}
bool comparename(const memnode &r1, const memnode &r2)
{
return (r1.name > r2.name);
}
void calcular_mem(int dev)
{
cudaDeviceProp p;
cudaGetDeviceProperties(&p, dev);
avmem = p.totalGlobalMem;
temp_storage[0].dev_address = NULL;
temp_storage[0].size = 0;
temp_storage[HALF_REC].dev_address = NULL;
temp_storage[HALF_REC].size = 0;
//cout << "Initial memory available " << avmem << endl;
}
template<class InputIterator>
InputIterator buscarhecho(InputIterator first, InputIterator last, int name)
{
while(first!=last)
{
if(first->name == name) return first;
++first;
}
return last;
}
list<memnode>::iterator buscarpornombre(int name, int itr, int *totalrows, int *gpunum)
{
int x = 1, sum = 0;
memnode temp;
temp.name = name;
temp.iteration = itr;
pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
//cout << "itr = " << itr << " rec.first = " << rec.first->name << endl;
if(rec.first->name == name)
{
temp_storage[x] = *rec.first;
rec.first = GPUmem.erase(rec.first);
sum += temp_storage[x].rows;
x++;
}
else
rec.first++;
}
//if(x > 1)
rec.first = GPUmem.insert(rec.first, temp);
*totalrows = sum;
*gpunum = x;
return rec.first;
}
int buscarpornombrecpu(int name, int itr, int *totalrows)
{
int x = HALF_REC + 1, sum = 0;
memnode temp;
temp.iteration = itr;
pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
/*if(rec.first != rec.second)
cout << "bscnomcpu = " << rec.first->name << " " << rec.first->iteration << endl;*/
while(rec.first != rec.second)
{
if(rec.first->name == name)
{
temp_storage[x] = *rec.first;
rec.first = CPUmem.erase(rec.first);
sum += temp_storage[x].rows;
x++;
}
else
rec.first++;
}
*totalrows += sum;
return x;
}
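/* Eviction: drop the table at the front of GPUmem. Rule results are
   first spilled to a host copy (CPUmem) so cargar() can reload them in
   a later iteration; facts can always be re-uploaded from
   address_host_table. */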
void limpiar()
{
list<memnode>::iterator ini;
memnode temp;
if(GPUmem.size() == 0)
{
cerr << "Not enough GPU memory: have " << avmem << endl;
exit(1);
}
ini = GPUmem.begin();
if(ini->isrule)
{
temp = *ini;
temp.dev_address = (int *)malloc(ini->size);
cudaMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
CPUmem.push_back(temp);
}
liberar(ini->dev_address, ini->size);
GPUmem.erase(ini);
}
void limpiartodo(int *p1, int *p2)
{
list<memnode>::iterator ini;
memnode temp;
int cont = 0;
if(p1 != NULL)
cont++;
if(p2 != NULL)
cont++;
ini = GPUmem.begin();
/*cout << "ANTES" << endl;
mostrar_memoria();
mostrar_memcpu();
cout << "FIN ANTES" << endl;*/
//cout << "mem = " << GPUmem.size() << " " << avmem << endl;
while(GPUmem.size() > cont)
{
if(ini->dev_address == p1 || ini->dev_address == p2)
{
ini++;
continue;
}
if(ini->isrule)
{
temp = *ini;
temp.dev_address = (int *)malloc(ini->size);
cudaMemcpy(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
CPUmem.push_back(temp);
}
liberar(ini->dev_address, ini->size);
ini = GPUmem.erase(ini);
}
/*cout << "DESPUES" << endl;
mostrar_memoria();
mostrar_memcpu();
cout << "FIN DESPUES" << endl;*/
//cout << "memfinal = " << GPUmem.size() << " " << avmem << endl;
}
void liberar(int *ptr, int size)
{
//cout << "L " << avmem << " " << size;
cudaFree(ptr);
avmem += size;
//cout << " " << avmem << endl;
}
void reservar(int **ptr, int size)
{
//cout << "R " << avmem << " " << size;
while(avmem < size)
limpiar();
while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
limpiar();
avmem -= size;
//cout << " " << avmem << endl;
}
void registrar(int name, int num_columns, int *ptr, int rows, int itr, int rule)
{
memnode temp;
temp.name = name;
temp.dev_address = ptr;
temp.rows = rows;
temp.size = rows * num_columns * sizeof(int);
temp.iteration = itr;
temp.isrule = rule;
GPUmem.push_back(temp);
}
template<class InputIterator>
void actualizar(int num_columns, int *ptr, int rows, InputIterator i)
{
i->dev_address = ptr;
i->rows = rows;
i->size = rows * num_columns * sizeof(int);
}
int numrows(int name, int itr)
{
int sum = 0;
memnode temp;
temp.iteration = itr;
pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
if(rec.first->name == name)
sum += rec.first->rows;
rec.first++;
}
rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
if(rec.first->name == name)
sum += rec.first->rows;
rec.first++;
}
return sum;
}
int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_host_table, int **ptr, int itr)
{
int numgpu, numcpu, totalrows = 0;
int *temp, x;
int size, itrant;
list<memnode>::iterator i;
memnode fact;
if(is_fact)
{
i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
if(i != GPUmem.end())
{
fact = *i;
GPUmem.erase(i);
fact.iteration = itr;
*ptr = fact.dev_address;
GPUmem.push_back(fact);
return fact.rows;
}
size = num_rows * num_columns * sizeof(int);
reservar(&temp, size);
cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
registrar(name, num_columns, temp, num_rows, itr, 0);
*ptr = temp;
return num_rows;
}
if(itr > 0)
{
itrant = itr - 1;
i = buscarpornombre(name, itrant, &totalrows, &numgpu);
numcpu = buscarpornombrecpu(name, itrant, &totalrows);
if((numgpu == 2) && (numcpu == (HALF_REC + 1)))
{
actualizar(num_columns, temp_storage[1].dev_address, temp_storage[1].rows, i);
*ptr = temp_storage[1].dev_address;
return temp_storage[1].rows;
}
size = totalrows * num_columns * sizeof(int);
reservar(&temp, size);
int offset = 0;
for(x = 1; x < numgpu; x++)
{
	cudaMemcpyAsync(temp + offset, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
	liberar(temp_storage[x].dev_address, temp_storage[x].size);
	offset += temp_storage[x].size / sizeof(int);
}
for(x = HALF_REC + 1; x < numcpu; x++)
{
	cudaMemcpyAsync(temp + offset, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
	free(temp_storage[x].dev_address);
	offset += temp_storage[x].size / sizeof(int);
}
actualizar(num_columns, temp, totalrows, i);
*ptr = temp;
return totalrows;
}
return 0;
}
int cargafinal(int name, int cols, int **ptr)
{
int *temp, *ini, cont = 0;
memnode bus;
bus.name = name;
GPUmem.sort(comparename);
CPUmem.sort(comparename);
list<memnode>::iterator endg = GPUmem.end();
list<memnode>::iterator endc = CPUmem.end();
list<memnode>::iterator pos = lower_bound(GPUmem.begin(), endg, bus, comparename);
list<memnode>::iterator gpu = pos;
while(pos != endg && pos->name == name)
{
cont += pos->rows;
pos++;
}
pos = lower_bound(CPUmem.begin(), endc, bus, comparename);
list<memnode>::iterator cpu = pos;
while(pos != endc && pos->name == name)
{
cont += pos->rows;
pos++;
}
reservar(&temp, cont * cols * sizeof(int));
ini = temp;
pos = gpu;
while(pos != endg && pos->name == name)
{
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyDeviceToDevice);
temp += pos->size / sizeof(int);
pos++;
}
pos = cpu;
while(pos != endc && pos->name == name)
{
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
temp += pos->size / sizeof(int);
pos++;
}
/*int x, y;
int *hop1 = (int *)malloc(cont * cols * sizeof(int));
cudaMemcpy(hop1, ini, cont * cols * sizeof(int), cudaMemcpyDeviceToHost);
cout << "select finala" << endl;
for(x = 0; x < cont; x++)
{
for(y = 0; y < cols; y++)
cout << hop1[x * cols + y] << " ";
cout << endl;
}
cout << "select finala" << endl;*/
*ptr = ini;
return cont;
}
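/* Fixed-point test for the evaluation loop: if some earlier iteration
   produced the same number of rows for this predicate, reload both
   result sets and compare them element by element; evaluation can stop
   once an iteration derives nothing new. */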
bool generadas(int name, int filas, int cols, int itr)
{
int r1, r2, x, fin;
int *dop1, *dop2;
r2 = numrows(name, itr);
if(itr < MAX_FIX_POINTS)
fin = itr;
else
fin = MAX_FIX_POINTS;
for(x = 1; x <= fin; x++)
{
r1 = numrows(name, itr - x);
if(r1 == r2)
{
r2 = cargar(name, filas, cols, 0, NULL, &dop2, itr + 1);
thrust::device_ptr<int> pt2 = thrust::device_pointer_cast(dop2);
r1 = cargar(name, filas, cols, 0, NULL, &dop1, itr - x + 1);
thrust::device_ptr<int> pt1 = thrust::device_pointer_cast(dop1);
/*int y;
int *a = (int *)malloc(r1 * cols * sizeof(int));
cudaMemcpy(a, dop1, r1 * cols * sizeof(int), cudaMemcpyDeviceToHost);
for(x = 0; x < r1; x++)
{
for(y = 0; y < cols; y++)
cout << a[x * cols + y] << " ";
}
cout << endl;
cudaMemcpy(a, dop2, r1 * cols * sizeof(int), cudaMemcpyDeviceToHost);
for(x = 0; x < r1; x++)
{
for(y = 0; y < cols; y++)
cout << a[x * cols + y] << " ";
}
cout << endl;
free(a);*/
if(thrust::equal(pt1, pt1 + r1, pt2) == true)
return true;
}
}
return false;
}
void mostrar_memoria()
{
int x;
list<memnode>::iterator i = GPUmem.begin();
cout << "Memoria inicio GPU" << endl;
for(x = 0; x < GPUmem.size(); x++, i++)
cout << i->name << " " << i->iteration << " " << i->size << endl;
cout << "Memoria fin GPU" << endl;
}
void mostrar_memcpu()
{
int x;
list<memnode>::iterator i = CPUmem.begin();
cout << "Memoria inicio CPU" << endl;
for(x = 0; x < CPUmem.size(); x++, i++)
cout << i->name << " " << i->iteration << endl;
cout << "Memoria fin CPU" << endl;
}
void resultados(vector<rulenode>::iterator first, vector<rulenode>::iterator last)
{
GPUmem.sort(comparename);
CPUmem.sort(comparename);
list<memnode>::iterator gpu = GPUmem.begin();
list<memnode>::iterator cpu = CPUmem.begin();
int x, y, of, cols;
int *temp, cont = 0;
while(first != last)
{
while(gpu != GPUmem.end() && first->name == gpu->name)
{
temp = (int *)malloc(gpu->size);
cudaMemcpy(temp, gpu->dev_address, gpu->size, cudaMemcpyDeviceToHost);
cols = gpu->size / (gpu->rows * sizeof(int));
cont += gpu->rows;
for(x = 0, of = 0; x < gpu->rows; x++)
{
for(y = 0; y < cols; y++, of++)
cout << temp[of] << " ";
cout << endl;
}
cudaFree(gpu->dev_address);
free(temp);
gpu++;
}
while(cpu != CPUmem.end() && first->name == cpu->name)
{
cols = cpu->size / (cpu->rows * sizeof(int));
cont += cpu->rows;
for(x = 0, of = 0; x < cpu->rows; x++)
{
for(y = 0; y < cols; y++, of++)
cout << cpu->dev_address[of] << " ";
cout << endl;
}
free(cpu->dev_address);
cpu++;
}
first++;
}
cout << cont << endl;
}
/*device_vector<int> reservar_vector(int size)
{
limpiar(size * sizeof(int));
device_vector<int> ret(size);
return ret;
}*/
/*
void reservar_resultado(InputIterator req, int *ptr, int size)
{
limpiar(size);
memnode temp;
temp.name = req->name;
temp.size = size;
cudaMalloc(&temp.dev_address, size);
temp.in_use = 1;
GPUmem.push_back(temp);
avmem -= size;
ptr = temp.dev_address;
}*/
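Taken together, calcular_mem(), reservar() and liberar() form a small device-memory accountant: reservar() keeps evicting registered tables until cudaMalloc() succeeds, and liberar() returns the bytes to the pool. A hedged usage sketch (this driver is not part of the commit and assumes memory.cu is linked in):

#include <cuda_runtime.h>

// Declarations from this commit's memory.cu.
void calcular_mem(int dev);
void reservar(int **ptr, int size);
void liberar(int *ptr, int size);

int main()
{
	calcular_mem(0);                  // record how much device memory exists

	int *buf = NULL;
	int bytes = (int)(1024 * sizeof(int));
	reservar(&buf, bytes);            // may evict tables via limpiar() to fit
	cudaMemset(buf, 0, bytes);
	liberar(buf, bytes);              // free and return the bytes to the pool
	return 0;
}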


@ -0,0 +1,289 @@
#include <thrust/device_vector.h>
//#include <thrust/device_ptr.h>
#include <thrust/scan.h>
#include <stdlib.h>
#include "memory.h"
__global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res) /*a libreria*/
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int x, rowact, posact;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
__syncthreads();
if(id < rows)
{
rowact = id * cols;
for(x = 0; x < numc; x += 2)
{
posact = rowact + shared[x];
if(dop1[posact] != shared[x+1])
return;
}
res[id] = 1;
}
}
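/* Variant for rows that were already filtered once: res[] holds the
   current marks, and cons packs the constants in its first half and
   their column positions in the second (spos). A failed test clears
   the mark instead of setting it. */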
__global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res) /*a libreria*/
{
extern __shared__ int shared[];
int *spos = &shared[numc];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int x, rowact, posact;
if(threadIdx.x < (numc * 2))
shared[threadIdx.x] = cons[threadIdx.x];
__syncthreads();
if(id < rows)
{
if(res[id] == 0)
return;
rowact = id * cols;
for(x = 0; x < numc; x++)
{
posact = rowact + spos[x];
if(dop1[posact] != shared[x])
{
res[id] = 0;
return;
}
}
}
}
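/* Self-joins: dhead packs groups of column offsets whose values must
   all be equal (a goal that repeats a variable), each group terminated
   by -1. samejoin refines an existing mark vector; samejoin2 builds
   one from scratch. */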
__global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int temp, temp2, pos, x, y;
if(threadIdx.x < cont)
shared[threadIdx.x] = dhead[threadIdx.x];
__syncthreads();
if(id < rows)
{
if(res[id] == 0)
return;
pos = id * cols;
for(x = 0; x < cont; x++)
{
temp = shared[x];
y = x + 1;
temp2 = shared[y];
while(temp2 > -1)
{
if(dop1[temp+pos] != dop1[temp2+pos])
{
res[id] = 0;
return;
}
y++;
temp2 = shared[y];
}
x = y;
}
}
}
__global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int temp, temp2, pos, x, y;
if(threadIdx.x < cont)
shared[threadIdx.x] = dhead[threadIdx.x];
__syncthreads();
if(id < rows)
{
pos = id * cols;
for(x = 0; x < cont; x++)
{
temp = shared[x];
y = x + 1;
temp2 = shared[y];
while(temp2 > -1)
{
if(dop1[temp+pos] != dop1[temp2+pos])
return;
y++;
temp2 = shared[y];
}
x = y;
}
res[id] = 1;
}
}
__global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int pos, posr, x;
if(threadIdx.x < hsize)
shared[threadIdx.x] = dhead[threadIdx.x];
__syncthreads();
if(id < rows)
{
pos = id * cols;
posr = id * hsize;
for(x = 0; x < hsize; x++, posr++)
res[posr] = dop1[pos+shared[x]];
}
}
__global__ void llenarproyectar(int *dop1, int rows, int cols, int *temp, int *dhead, int hsize, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int pos, posr, x;
if(threadIdx.x < hsize)
shared[threadIdx.x] = dhead[threadIdx.x];
__syncthreads();
if(id < rows)
{
posr = temp[id+1];
if(temp[id] != posr && posr > 0)
{
pos = id * cols;
posr = (posr - 1) * hsize;
for(x = 0; x < hsize; x++, posr++)
res[posr] = dop1[pos+shared[x]];
}
}
}
/*__global__ void removedup()
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(threadIdx.x < cols)
shared[threadIdx.x] = dhead[threadIdx.x];
if(id < rows)
{
}
}*/
template<typename T> /*a libreria*/
struct suma : public binary_function<T,T,T>
{
__host__ __device__
T operator()(const T &r1, const T &r2)
{
if(r1 > -1)
{
if(r2 > 0)
return r1 + r2;
return -r1;
}
else
{
if(r2 > 0)
return abs(r1) + r2;
return r1;
}
}
};
int mayor(int a, int b, int c)
{
if(a > b)
{
if(a > c)
return a;
}
else
{
if(b > c)
return b;
}
return c;
}
int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int numselect, int *selfjoin, int numselfj, int *project, int **ret)
{
int *fres = NULL, *temp = NULL;
int *dhead = NULL, tmplen;
int size, size2, num;
thrust::device_ptr<int> res;
int head_bytes = mayor(numselect, numselfj, head_size) * sizeof(int);
reservar(&dhead, head_bytes);
int blockllen = rows / 1024 + 1;
int numthreads = 1024;
//removerep(dop1, rows, cols, dhead,)
if(numselect > 0)
{
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
cudaMemset(temp, 0, size2);
size = numselect * sizeof(int);
cudaMemcpy(dhead, select, size, cudaMemcpyHostToDevice);
marcar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);
if(numselfj > 0)
{
size = numselfj * sizeof(int);
cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
samejoin<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
}
res = thrust::device_pointer_cast(temp);
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
num = res[rows];
if(num == 0)
{
	liberar(dhead, head_bytes);
	liberar(temp, size2);
	return 0;
}
size = head_size * sizeof(int);
reservar(&fres, num * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
liberar(dhead, head_bytes);
liberar(temp, size2);
*ret = fres;
return num;
}
else
{
if(numselfj > 0)
{
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
cudaMemset(temp, 0, size2);
size = numselfj * sizeof(int);
cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
samejoin2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
res = thrust::device_pointer_cast(temp);
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
num = res[rows];
if(num == 0)
{
	liberar(dhead, head_bytes);
	liberar(temp, size2);
	return 0;
}
size = head_size * sizeof(int);
reservar(&fres, num * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
liberar(dhead, head_bytes);
liberar(temp, size2);
*ret = fres;
return num;
}
else
{
size = head_size * sizeof(int);
reservar(&fres, rows * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
liberar(dhead, head_bytes);
*ret = fres;
return rows;
}
}
}
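selectproyect() is the per-goal entry point: constant tests (marcar), repeated-variable tests (samejoin) and the final projection are all driven by small host arrays staged through dhead. A hypothetical driver, using the signatures in this commit (table contents and column choices invented for illustration):

#include <cuda_runtime.h>
#include <cstdio>

// Declarations from this commit (memory.cu and the code above).
void calcular_mem(int dev);
void reservar(int **ptr, int size);
int selectproyect(int *dop1, int rows, int cols, int head_size,
                  int *select, int numselect, int *selfjoin, int numselfj,
                  int *project, int **ret);

int main()
{
	calcular_mem(0);

	// three rows of a binary predicate, flattened row-major
	int host[6] = {1, 10, 2, 20, 1, 30};
	int *dev = NULL;
	reservar(&dev, (int)sizeof host);
	cudaMemcpy(dev, host, sizeof host, cudaMemcpyHostToDevice);

	int select[2] = {0, 1};           // (column, constant): keep rows with col 0 == 1
	int project[1] = {1};             // emit column 1 only
	int *out = NULL;
	int n = selectproyect(dev, 3, 2, 1, select, 2, NULL, 0, project, &out);

	int res[2];
	cudaMemcpy(res, out, n * sizeof(int), cudaMemcpyDeviceToHost);
	for(int i = 0; i < n; i++)
		std::printf("%d\n", res[i]);  // expect 10 and 30
	return 0;
}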


@ -5,7 +5,8 @@
main :-
cuda_extensional(db/2, _X),
cuda_rule((a(X, Y) :- db(Y, Z), db(X, Z), db(1, Z) ), _).
cuda_rule((a(X, Y) :- db(Y, Z), db(X, Z), db(1, Z) ), Q),
cuda_eval(Q, _).
db(1,a).
db(2,a).

packages/cuda/treeb.cu Executable file

File diff suppressed because it is too large.

packages/cuda/union2.cu Normal file

@ -0,0 +1,193 @@
#include <thrust/device_vector.h>
#include <thrust/unique.h>
#include <thrust/distance.h>
#include <iostream>
typedef struct n2
{
int v[2];
}s2;
typedef struct n3
{
int v[3];
}s3;
struct p2
{
__host__ __device__
bool operator()(const s2 &r1, const s2 &r2)
{
int x;
for(x = 0; x < 2; x++)
{
if(r1.v[x] != r2.v[x])
return false;
}
return true;
}
};
struct o2
{
__host__ __device__
bool operator()(const s2 &r1, const s2 &r2)
{
int x;
for(x = 0; x < 2; x++)
{
if(r1.v[x] > r2.v[x])
return true;
if(r1.v[x] < r2.v[x])
return false;
}
return false;
}
};
struct p3
{
__host__ __device__
bool operator()(const s3 &r1, const s3 &r2)
{
int x;
for(x = 0; x < 3; x++)
{
if(r1.v[x] != r2.v[x])
return false;
}
return true;
}
};
struct o3
{
__host__ __device__
bool operator()(const s3 &r1, const s3 &r2)
{
int x;
for(x = 0; x < 3; x++)
{
if(r1.v[x] > r2.v[x])
return true;
if(r1.v[x] < r2.v[x])
return false;
}
return false;
}
};
int unir(int *res, int rows, int tipo, int **ret)
{
thrust::device_ptr<int> pt, re;
thrust::device_ptr<s2> pt2, re2;
thrust::device_ptr<s3> pt3, re3;
s2 *t2;
s3 *t3;
int flag, nrows;
switch(tipo)
{
case 1:
{
pt = thrust::device_pointer_cast(res);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt, pt + rows);
re = thrust::unique(pt, pt + rows);
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar();
}
}
nrows = thrust::distance(pt, re);
thrust::device_vector<int> iVec(pt, pt + rows);
iVec.resize(nrows);
iVec.shrink_to_fit();
return nrows;
}
case 2:
{
t2 = (s2*)res;
/*int *a, x, y;
a = (int *)malloc(rows * 2 * sizeof(int));
cudaMemcpy(a, res, rows * 2 * sizeof(int), cudaMemcpyDeviceToHost);
cout << "INI" << endl;
for(x = 0; x < rows; x++)
{
for(y = 0; y < 2; y++)
cout << a[x * 2 + y] << " ";
cout << endl;
}
cout << "INI fin" << endl;
free(a);*/
pt2 = thrust::device_pointer_cast(t2);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt2, pt2 + rows, o2());
re2 = thrust::unique(pt2, pt2 + rows, p2());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar();
}
}
nrows = thrust::distance(pt2, re2);
thrust::device_vector<s2> iVec(pt2, pt2 + rows);
iVec.resize(nrows);
iVec.shrink_to_fit();
/*tam = (int)(re2.get() - pt2.get());
a = (int *)malloc(tam * 2 * sizeof(int));
cudaMemcpy(a, res, tam * 2 * sizeof(int), cudaMemcpyDeviceToHost);
cout << "FIN" << endl;
for(x = 0; x < tam; x++)
{
for(y = 0; y < 2; y++)
cout << a[x * 2 + y] << " ";
cout << endl;
}
cout << "FIN fin" << endl;
free(a);
cout << "antes = " << rows << " despues = " << thrust::distance(pt2, re2) << endl;*/
return nrows;
}
case 3:
{
t3 = (s3*)res;
pt3 = thrust::device_pointer_cast(t3);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt3, pt3 + rows, o3());
re3 = thrust::unique(pt3, pt3 + rows, p3());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar();
}
}
nrows = thrust::distance(pt3, re3);
thrust::device_vector<s3> iVec(pt3, pt3 + rows);
iVec.resize(nrows);
iVec.shrink_to_fit();
return nrows;
}
}
return 0;
}
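unir() deduplicates a result table by reinterpreting each fixed-width row as a struct (s2, s3) so that thrust can sort and unique whole tuples; note that the iVec copies above are separate temporary allocations destroyed on return, so the distinct rows simply remain at the front of res. A minimal sketch of the same sort-plus-unique idiom on single ints:

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/distance.h>
#include <iostream>

int main()
{
	int h[6] = {3, 1, 3, 2, 1, 2};
	thrust::device_vector<int> d(h, h + 6);
	thrust::sort(d.begin(), d.end());
	thrust::device_vector<int>::iterator e = thrust::unique(d.begin(), d.end());
	int nrows = thrust::distance(d.begin(), e);   // number of distinct values: 3
	for(int i = 0; i < nrows; i++)
	{
		int v = d[i];
		std::cout << v << " ";                // prints 1 2 3
	}
	std::cout << std::endl;
	return 0;
}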