This commit is contained in:
Vitor Santos Costa 2016-04-19 23:30:02 +01:00
parent 3d68f0e06b
commit cd41d373db
28 changed files with 3153 additions and 2229 deletions

View File

@ -1438,5 +1438,6 @@ void Yap_exit(int value) {
Yap_ShutdownLoadForeign();
}
Yap_CloseStreams(false);
Yap_CloseReadline();
exit(value);
}

View File

@ -185,7 +185,7 @@ available in experimental implementations.
*/
YAP_FLAG(FILE_NAME_VARIABLES_FLAG, "file_name_variables", true, booleanFlag,
"true", NULL),
YAP_FLAG(FLOAT_FORMAT_FLAG, "float_format", true, isatom, "%15e",
YAP_FLAG(FLOAT_FORMAT_FLAG, "float_format", true, isatom, "%15f",
NULL), /**< + `float_format `
C-library `printf()` format specification used by write/1 and

View File

@ -1,4 +1,4 @@
/*************************************************************************
/*************************************************************************
* *
* YAP Prolog %W% %G% *
* Yap Prolog was developed at NCCUP - Universidade do Porto *
@ -301,6 +301,7 @@ extern void Yap_DebugErrorPutc(int n);
extern void Yap_DebugErrorPuts(const char *s);
extern void Yap_DebugWriteIndicator(struct pred_entry *ap);
void Yap_PlWriteToStream(Term, int, int);
void Yap_CloseReadline(void);
/* depth_lim.c */
bool Yap_InitReadline(Term t);
void Yap_InitItDeepenPreds(void);

View File

@ -659,6 +659,19 @@ INLINE_ONLY inline EXTERN PropFlags IsPredProperty(int flags) {
return (PropFlags)((flags == PEProp));
}
INLINE_ONLY inline EXTERN Atom NameOfPred(PredEntry *pe);
INLINE_ONLY inline EXTERN Atom NameOfPred(PredEntry *pe) {
if (pe->ModuleOfPred == IDB_MODULE) {
return NULL;
} else if (pe->ArityOfPE == 0) {
return (Atom)pe->FunctorOfPred;
} else {
Functor f = pe->FunctorOfPred;
return NameOfFunctor(f);
}
}
/* Flags for code or dbase entry */
/* There are several flags for code and data base entries */
typedef enum {
@ -1322,7 +1335,6 @@ INLINE_ONLY inline EXTERN bool IsFlagProperty(PropFlags);
INLINE_ONLY inline EXTERN bool IsFlagProperty(PropFlags flags) {
return flags == FlagProperty;
}
/* Proto types */

View File

@ -474,6 +474,8 @@
#define LOCAL_search_atoms LOCAL->search_atoms_
#define REMOTE_search_atoms(wid) REMOTE(wid)->search_atoms_
#define LOCAL_SearchPreds LOCAL->SearchPreds_
#define REMOTE_SearchPreds(wid) REMOTE(wid)->SearchPreds_
#define LOCAL_CurSlot LOCAL->CurSlot_
#define REMOTE_CurSlot(wid) REMOTE(wid)->CurSlot_

View File

@ -268,6 +268,7 @@ const char* Error_Function_;
UInt exo_arg_;
// atom completion
struct scan_atoms* search_atoms_;
struct pred_entry* SearchPreds_;
// Slots
yhandle_t CurSlot_;
yhandle_t NSlots_;

View File

@ -269,6 +269,7 @@ static void InitWorker(int wid) {
REMOTE_CurSlot(wid) = 0;
REMOTE_NSlots(wid) = 0;
REMOTE_SlotBase(wid) = InitHandles(wid);

View File

@ -279,4 +279,5 @@ static void RestoreWorker(int wid USES_REGS) {
}

View File

@ -312,6 +312,7 @@ UInt exo_arg =0
// atom completion
struct scan_atoms* search_atoms void
struct pred_entry* SearchPreds void
// Slots
yhandle_t CurSlot =0

View File

@ -470,20 +470,13 @@ code with _C_.
*/
static Int get_byte(USES_REGS1) { /* '$get_byte'(Stream,-N) */
int sno = Yap_CheckStream(ARG1, Input_Stream_f, "get_byte/2");
int sno = Yap_CheckBinaryStream(ARG1, Input_Stream_f, "get_byte/2");
Int status;
Term out;
if (sno < 0)
return (FALSE);
status = GLOBAL_Stream[sno].status;
if (!(status & Binary_Stream_f)
//&& strictISOFlag()
) {
UNLOCK(GLOBAL_Stream[sno].streamlock);
Yap_Error(PERMISSION_ERROR_INPUT_STREAM, ARG1, "get_byte/2");
return (FALSE);
}
out = MkIntTerm(GLOBAL_Stream[sno].stream_getc(sno));
UNLOCK(GLOBAL_Stream[sno].streamlock);
return Yap_unify_constant(ARG2, out);
@ -812,16 +805,9 @@ static Int put_byte(USES_REGS1) { /* '$put_byte'(Stream,N) */
Yap_Error(DOMAIN_ERROR_OUT_OF_RANGE, t2, "put_code/1");
return FALSE;
}
int sno = Yap_CheckStream(ARG1, Output_Stream_f, "put/2");
int sno = Yap_CheckBinaryStream(ARG1, Output_Stream_f, "put/2");
if (sno < 0)
return (FALSE);
if (!(GLOBAL_Stream[sno].status & Binary_Stream_f)
// && strictISOFlag()
) {
UNLOCK(GLOBAL_Stream[sno].streamlock);
Yap_Error(PERMISSION_ERROR_OUTPUT_BINARY_STREAM, ARG1, NULL);
return false;
}
GLOBAL_Stream[sno].stream_putc(sno, ch);
/*
* if (!(GLOBAL_Stream[sno].status & Null_Stream_f))

View File

@ -1576,6 +1576,24 @@ int Yap_CheckTextStream__(const char *file, const char *f, int line, Term arg,
return sno;
}
int Yap_CheckBinaryStream__(const char *file, const char *f, int line, Term arg,
int kind, const char *msg) {
int sno;
if ((sno = CheckStream__(file, f, line, arg, kind, msg)) < 0)
return -1;
if ((GLOBAL_Stream[sno].status & Binary_Stream_f)) {
UNLOCK(GLOBAL_Stream[sno].streamlock);
if (kind == Input_Stream_f)
PlIOError__(file, f, line, PERMISSION_ERROR_INPUT_TEXT_STREAM, arg,
msg);
else
PlIOError__(file, f, line, PERMISSION_ERROR_OUTPUT_TEXT_STREAM, arg,
msg);
return -1;
}
return sno;
}
/* used from C-interface */
int Yap_GetFreeStreamDForReading(void) {
int sno = GetFreeStreamD();

View File

@ -45,6 +45,10 @@ extern int Yap_CheckStream__(const char *, const char *, int, Term, int,
Yap_CheckTextStream__(__FILE__, __FUNCTION__, __LINE__, arg, kind, msg)
extern int Yap_CheckTextStream__(const char *, const char *, int, Term, int,
const char *);
#define Yap_CheckBinaryStream(arg, kind, msg) \
Yap_CheckBinaryStream__(__FILE__, __FUNCTION__, __LINE__, arg, kind, msg)
extern int Yap_CheckBinaryStream__(const char *, const char *, int, Term, int,
const char *);
extern bool Yap_initStream(int sno, FILE *fd, const char *name, Term file_name,
encoding_t encoding, stream_flags_t flags,

View File

@ -149,7 +149,7 @@ static char *predicate_enumerate(const char *prefix, int state) {
} else {
Term cmod;
p = LOCAL_SearchPreds;
cmod = (p->ModuleOfPred != PROLOG_MODULE ? p->ModuleOfPred : TermProlog );
cmod = (p->ModuleOfPred != PROLOG_MODULE ? p->ModuleOfPred : TermProlog);
mod = Yap_GetModuleEntry(cmod);
}
while (mod) {
@ -168,8 +168,7 @@ static char *predicate_enumerate(const char *prefix, int state) {
p = mod->PredForME;
}
char *c = RepAtom(ap = NameOfPred(p))->StrOfAE;
if (strlen(c) > strlen(prefix) &&
strstr(c, prefix) == c &&
if (strlen(c) > strlen(prefix) && strstr(c, prefix) == c &&
!(p->PredFlags & HiddenPredFlag)) {
LOCAL_SearchPreds = p;
arity_t ar = p->ArityOfPE;
@ -177,19 +176,16 @@ static char *predicate_enumerate(const char *prefix, int state) {
if (Yap_IsPrefixOp(AbsAtom(ap), &l, &r) && ar == 1) {
return c;
}
size_t sz = strlen(c);
chain_t *el = (chain_t *)malloc(sizeof(chain_t)+sz);
strncpy(LOCAL_FileNameBuf, c, YAP_FILENAME_MAX);
strncat(LOCAL_FileNameBuf, "(", YAP_FILENAME_MAX);
return LOCAL_FileNameBuf;
}
}
LOCAL_SearchPreds = NULL;
return NULL;
}
static char *predicate_generator(const char *prefix, int state) {
static char *predicate_generator(const char *prefix, int state) {
char *s = predicate_enumerate(prefix, state);
if (s) {
@ -201,15 +197,15 @@ static char *predicate_enumerate(const char *prefix, int state) {
}
return s;
}
}
static char **prolog_completion(const char *text, int start, int end) {
static char **prolog_completion(const char *text, int start, int end) {
char **matches = NULL;
if (start == 0 && isalpha(text[0])) {
int i = 0;
while (i < end) {
if (isalnum(text[i]))
if (isalnum(text[i]) || text[i] == '_')
i++;
else
break;
@ -225,16 +221,20 @@ static char *predicate_enumerate(const char *prefix, int state) {
;
p = text + i;
if ((strstr(p,"[") == p) || (strstr(p,"compile(") == p) ||
(strstr(p,"consult(") == p) || (strstr(p,"load_files(") == p) ||
(strstr(p,"reconsult(") == p) || (strstr(p,"use_module(") == p))
if ((strstr(p, "[") == p) || (strstr(p, "compile(") == p) ||
(strstr(p, "consult(") == p) || (strstr(p, "load_files(") == p) ||
(strstr(p, "reconsult(") == p) || (strstr(p, "use_module(") == p) ||
(strstr(p, "cd(") == p))
matches = rl_completion_matches((char *)text, /* for pre-4.2 */
rl_filename_completion_function);
return matches;
}
int i = end, ch = '\0';
while (i > start) {
ch = text[-i];
ch = text[--i];
if (ch == '\'')
return rl_completion_matches((char *)text, /* for pre-4.2 */
rl_filename_completion_function);
if (isalnum(text[i]))
continue;
break;
@ -243,16 +243,16 @@ static char *predicate_enumerate(const char *prefix, int state) {
return rl_completion_matches((char *)text, atom_generator);
return NULL;
}
}
void Yap_ReadlineFlush(int sno) {
void Yap_ReadlineFlush(int sno) {
if (GLOBAL_Stream[sno].status & Tty_Stream_f &&
GLOBAL_Stream[sno].status & Output_Stream_f) {
rl_redisplay();
}
}
}
bool Yap_ReadlinePrompt(StreamDesc * s) {
bool Yap_ReadlinePrompt(StreamDesc *s) {
if (s->status & Tty_Stream_f) {
s->stream_getc = ReadlineGetc;
if (GLOBAL_Stream[0].status & Tty_Stream_f &&
@ -261,9 +261,9 @@ static char *predicate_enumerate(const char *prefix, int state) {
return true;
}
return false;
}
}
bool Yap_ReadlineOps(StreamDesc * s) {
bool Yap_ReadlineOps(StreamDesc *s) {
if (s->status & Tty_Stream_f) {
if (GLOBAL_Stream[0].status & Tty_Stream_f &&
is_same_tty(s->file, GLOBAL_Stream[0].file))
@ -273,9 +273,9 @@ static char *predicate_enumerate(const char *prefix, int state) {
return true;
}
return false;
}
}
static int prolog_complete(int ignore, int key) {
static int prolog_complete(int ignore, int key) {
if (rl_point > 0 && rl_line_buffer[rl_point - 1] != ' ') {
#if HAVE_DECL_RL_CATCH_SIGNALS_ /* actually version >= 1.2, or true readline \
*/
@ -292,9 +292,9 @@ static char *predicate_enumerate(const char *prefix, int state) {
rl_complete(ignore, key);
return 0;
}
}
bool Yap_InitReadline(Term enable) {
bool Yap_InitReadline(Term enable) {
// don't call readline within emacs
// if (getenv("ËMACS"))
// return;
@ -303,12 +303,13 @@ static char *predicate_enumerate(const char *prefix, int state) {
GLOBAL_Stream[StdInStream].u.irl.buf = NULL;
GLOBAL_Stream[StdInStream].u.irl.ptr = NULL;
GLOBAL_Stream[StdInStream].status |= Readline_Stream_f;
#if _MSC_VER || defined(__MINGW32__)
#if _WIN32
rl_instream = stdin;
#endif
rl_outstream = stderr;
using_history();
const char *s = Yap_AbsoluteFile("~/.YAP.history", NULL, true);
history_file = s;
if (read_history(s) != 0) {
FILE *f = fopen(s, "a");
if (f) {
@ -323,9 +324,9 @@ static char *predicate_enumerate(const char *prefix, int state) {
rl_add_defun("prolog-complete", (void *)prolog_complete, '\t');
#endif
return Yap_ReadlineOps(GLOBAL_Stream + StdInStream);
}
}
static bool getLine(int inp, int out) {
static bool getLine(int inp, int out) {
CACHE_REGS
rl_instream = GLOBAL_Stream[inp].file;
rl_outstream = GLOBAL_Stream[out].file;
@ -362,14 +363,13 @@ static char *predicate_enumerate(const char *prefix, int state) {
return false;
if (myrl_line[0] != '\0' && myrl_line[1] != '\0') {
add_history((char *)myrl_line);
write_history(history_file);
fflush(NULL);
}
s->u.irl.ptr = s->u.irl.buf = myrl_line;
return true;
}
}
static int ReadlinePutc(int sno, int ch) {
static int ReadlinePutc(int sno, int ch) {
CACHE_REGS
StreamDesc *s = &GLOBAL_Stream[sno];
#if MAC || _MSC_VER || defined(__MINGW32__)
@ -384,15 +384,15 @@ static char *predicate_enumerate(const char *prefix, int state) {
LOCAL_newline = true;
}
return ((int)ch);
}
}
/**
/**
@brief reading from the console is complicated because we need to
know whether to prompt and so on...
EOF must be handled by resetting the file.
*/
static int ReadlineGetc(int sno) {
*/
static int ReadlineGetc(int sno) {
StreamDesc *s = &GLOBAL_Stream[sno];
int ch;
bool fetch = (s->u.irl.buf == NULL);
@ -409,17 +409,17 @@ static char *predicate_enumerate(const char *prefix, int state) {
return EOF;
}
return console_post_process_read_char(ch, s);
}
}
/**
/**
@brief Yap_ReadlinePeekChar peeks the next char from the
readline buffer, but does not actually grab it.
The idea is to take advantage of the buffering. Special care must be taken
with EOF, though.
*/
Int Yap_ReadlinePeekChar(int sno) {
*/
Int Yap_ReadlinePeekChar(int sno) {
StreamDesc *s = &GLOBAL_Stream[sno];
int ch;
@ -446,9 +446,9 @@ static char *predicate_enumerate(const char *prefix, int state) {
return EOF;
}
return ch;
}
}
int Yap_ReadlineForSIGINT(void) {
int Yap_ReadlineForSIGINT(void) {
CACHE_REGS
int ch;
StreamDesc *s = &GLOBAL_Stream[StdInStream];
@ -471,20 +471,26 @@ static char *predicate_enumerate(const char *prefix, int state) {
return ch;
}
}
}
}
static Int has_readline(USES_REGS1) {
void Yap_CloseReadline(void) {
#if USE_READLINE
write_history(history_file);
#endif
}
static Int has_readline(USES_REGS1) {
#if USE_READLINE
return true;
#else
return false;
#endif
}
}
void Yap_InitReadlinePreds(void) {
void Yap_InitReadlinePreds(void) {
Yap_InitCPred("$has_readline", 0, has_readline,
SafePredFlag | HiddenPredFlag);
}
}
#else
bool Yap_InitReadline(Term enable) {

View File

@ -390,6 +390,8 @@ write1 ( USES_REGS1 )
if (output_stream == -1) output_stream = 1;
xarg * args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END );
if (args == NULL) {
if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
if (LOCAL_Error_TYPE)
Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
return false;
@ -415,6 +417,8 @@ write_canonical1 ( USES_REGS1 )
if (output_stream == -1) output_stream = 1;
xarg * args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END );
if (args == NULL) {
if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
if (LOCAL_Error_TYPE)
Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
return false;
@ -440,6 +444,8 @@ write_canonical ( USES_REGS1 )
we cannot make recursive Prolog calls */
xarg * args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END );
if (args == NULL) {
if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
if (LOCAL_Error_TYPE)
Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
return false;
@ -467,6 +473,8 @@ writeq1 ( USES_REGS1 )
we cannot make recursive Prolog calls */
xarg *args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END );
if (args == NULL) {
if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
if (LOCAL_Error_TYPE)
Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
return false;
@ -495,6 +503,8 @@ writeq ( USES_REGS1 )
we cannot make recursive Prolog calls */
xarg *args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END );
if (args == NULL) {
if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
if (LOCAL_Error_TYPE)
Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
return false;
@ -523,6 +533,8 @@ print1 ( USES_REGS1 )
we cannot make recursive Prolog calls */
xarg *args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END );
if (args == NULL) {
if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
if (LOCAL_Error_TYPE)
Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
return false;
@ -551,6 +563,8 @@ print ( USES_REGS1 )
we cannot make recursive Prolog calls */
xarg *args = Yap_ArgListToVector ( TermNil, write_defs, WRITE_END );
if (args == NULL) {
if (LOCAL_Error_TYPE == DOMAIN_ERROR_OUT_OF_RANGE)
LOCAL_Error_TYPE = DOMAIN_ERROR_WRITE_OPTION;
if (LOCAL_Error_TYPE)
Yap_Error(LOCAL_Error_TYPE, LOCAL_Error_Term, NULL);
return false;

View File

@ -54,11 +54,28 @@ if (CUDA_FOUND)
macro_optional_find_package (Thrust ON)
set (CUDA_SOURCES
lista.cu
memory.cu
cuda.c
CC_CSSTree.cu
bpreds.cu
dbio.cu
lista.cu
memory.cu
selectproyect.cu
treeb.cu
union2.cu
)
set (CXX_SOURCES
bpredscpu.cpp
joincpu.cpp
selectproyectcpu.cpp
unioncpu2.cpp
)
set (C_SOURCES
creator2.c
cuda.c
)
set (PL_SOURCES
cuda.yap
)

15
packages/cuda/Makefile.in Normal file → Executable file
View File

@ -23,7 +23,7 @@ CC=@CC@
NVCC=@NVCC@
CFLAGS= @SHLIB_CFLAGS@ $(YAP_EXTRAS) $(DEFS) -I$(srcdir) -I../.. -I$(srcdir)/../../include @CUDA_CPPFLAGS@
NVCCFLAGS=@CUDA_CPPFLAGS@ -I$(srcdir) -I../.. -I$(srcdir)/../../include
CUDA_LDFLAGS=@CUDA_LDFLAGS@
LDFLAGS=@LDFLAGS@
#
#
# You shouldn't need to change what follows.
@ -39,7 +39,7 @@ SO=@SO@
CWD=$(PWD)
#
CUDA_PROLOG= \
BDD_PROLOG= \
$(srcdir)/cuda.yap
OBJS=cuda.o memory.o lista.o
@ -62,16 +62,11 @@ memory.o: $(srcdir)/memory.cu $(srcdir)/pred.h
@DO_SECOND_LD@cuda.@SO@: $(OBJS)
@DO_SECOND_LD@ @CUDA_SHLIB_LD@ $(CUDA_LDFLAGS) -o cuda.@SO@ $(OBJS)
install: all install-examples
install: all
mkdir -p $(DESTDIR)$(SHAREDIR)
for h in $(CUDA_PROLOG); do $(INSTALL_DATA) $$h $(DESTDIR)$(SHAREDIR); done
for h in $(BDD_PROLOG); do $(INSTALL_DATA) $$h $(DESTDIR)$(SHAREDIR); done
$(INSTALL_PROGRAM) $(SOBJS) $(DESTDIR)$(YAPLIBDIR)
install-examples:
clean:
rm -f *.o *~ $(OBJS) *.BAK
distclean: clean
rm -f $(SOBJS) Makefile
rm -f *.o *~ $(OBJS) $(SOBJS) *.BAK

499
packages/cuda/bpreds.cu Normal file → Executable file
View File

@ -1,4 +1,113 @@
__global__ void predicates(int *dop1, int rows, int cols, int *cons, int numc, int *res)
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cstdarg>
#include "pred.h"
/*Determines the maximum from a set of values*/
int maximo(int count, ...)
{
va_list ap;
int j, temp, mx = 0;
va_start(ap, count);
for(j = 0; j < count; j++)
{
temp = va_arg(ap, int);
if(temp > mx)
mx = temp;
}
va_end(ap);
return mx;
}
__global__ void bpreds(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int x, rowact, rowact1, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
__syncthreads();
if(id < rows)
{
rowact1 = id * of1;
rowact = id * of2;
for(x = nx; x < numc; x += 3)
{
op1 = shared[x+1];
if(op1 < 0)
op1 = dop1[rowact1 - op1 - 1];
else
op1 = dop2[rowact + op1];
op2 = shared[x+2];
if(op2 < 0)
op2 = dop1[rowact1 - op2 - 1];
else
op2 = dop2[rowact + op2];
switch(shared[x] - BPOFFSET)
{
case SBG_EQ: if(op1 != op2)
return;
break;
case SBG_GT: if(op1 <= op2)
return;
break;
case SBG_LT: if(op1 >= op2)
return;
break;
case SBG_GE: if(op1 < op2)
return;
break;
case SBG_LE: if(op1 > op2)
return;
break;
case SBG_DF: if(op1 == op2)
return;
}
}
if(res2 != NULL)
res2[id] = 1;
for(x = 0; x < nx; x += 3)
{
op1 = shared[x+1];
if(op1 < 0)
op1 *= -1;
else
op1 = dop2[rowact + op1];
op2 = shared[x+2];
if(op2 < 0)
op2 *= -1;
else
op2 = dop2[rowact + op2];
switch(shared[x])
{
case SBG_EQ: if(op1 != op2)
return;
break;
case SBG_GT: if(op1 <= op2)
return;
break;
case SBG_LT: if(op1 >= op2)
return;
break;
case SBG_GE: if(op1 < op2)
return;
break;
case SBG_LE: if(op1 > op2)
return;
break;
case SBG_DF: if(op1 == op2)
return;
}
}
res[id] = 1;
}
}
/*Mark all rows that comply with the comparison predicates*/
__global__ void bpredsnormal2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
@ -8,7 +117,7 @@ __global__ void predicates(int *dop1, int rows, int cols, int *cons, int numc, i
__syncthreads();
if(id < rows)
{
rowact = id * cols;
rowact = id * of1;
for(x = 0; x < numc; x += 3)
{
op1 = shared[x+1];
@ -46,98 +155,306 @@ __global__ void predicates(int *dop1, int rows, int cols, int *cons, int numc, i
}
}
int bpreds(int *dop1, int rows, int cols, int *bin, int3 numpreds, int **ret)
/*Unmark all rows that do not comply with the comparison predicates*/
__global__ void bpredsnormal(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
int *temp;
int tmplen = rows + 1;
int size = tmplen * sizeof(int);
reservar(&temp, size);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp bpreds " << size << endl;
#endif
cudaMemset(temp, 0, size);
#if TIMER
cuda_stats.builtins++;
#endif
int *dhead;
int predn = numpreds.x * 3;
int spredn = predn * sizeof(int);
int sproj = numpreds.z * sizeof(int);
int hsize;
if(predn > numpreds.z)
hsize = spredn;
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int x, rowact, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
__syncthreads();
if(id < rows)
{
if(res[id] == 0)
return;
rowact = id * of1;
for(x = 0; x < numc; x += 3)
{
op1 = shared[x+1];
if(op1 < 0)
op1 *= -1;
else
hsize = sproj;
reservar(&dhead, hsize);
#ifdef DEBUG_MEM
cerr << "+ " << dhead << " dhead " << hsize << endl;
#endif
cudaMemcpy(dhead, bin, spredn, cudaMemcpyHostToDevice);
int blockllen = rows / 1024 + 1;
int numthreads = 1024;
/*int x;
cout << "arraypreds = ";
for(x = 0; x < predn; x++)
cout << bin[x] << " ";
cout << endl;
cout << "temptable = ";
for(x = 0; x < numpreds.z; x++)
cout << bin[x+predn] << " ";
cout << endl;
int y;
int *hop1 = (int *)malloc(numpreds.y * rows * sizeof(int));
cudaMemcpy(hop1, dop1, numpreds.y * rows * sizeof(int), cudaMemcpyDeviceToHost);
for(x = 0; x < rows; x++)
op1 = dop1[rowact + op1];
op2 = shared[x+2];
if(op2 < 0)
op2 *= -1;
else
op2 = dop1[rowact + op2];
switch(shared[x])
{
for(y = 0; y < numpreds.y; y++)
cout << hop1[x * numpreds.y + y] << " ";
cout << endl;
}
free(hop1);*/
predicates<<<blockllen, numthreads, spredn>>>(dop1, rows, numpreds.y, dhead, predn, temp + 1);
/*int y;
int *hop1 = (int *)malloc((rows + 1) * sizeof(int));
cudaMemcpy(hop1, temp, (rows + 1) * sizeof(int), cudaMemcpyDeviceToHost);
for(x = 0; x < (rows + 1); x++)
cout << hop1[x] << " ";
cout << endl;
free(hop1);*/
thrust::device_ptr<int> res;
res = thrust::device_pointer_cast(temp);
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
int num = res[rows];
if(num == 0)
return 0;
int *fres;
reservar(&fres, num * sproj);
#ifdef DEBUG_MEM
cerr << "+ " << fres << " fres " << num * sproj << endl;
#endif
cudaMemcpy(dhead, bin + predn, sproj, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, sproj>>>(dop1, rows, numpreds.y, temp, dhead, numpreds.z, fres);
/*int y;
int *hop1 = (int *)malloc(numpreds.z * num * sizeof(int));
cudaMemcpy(hop1, fres, numpreds.z * num * sizeof(int), cudaMemcpyDeviceToHost);
for(x = 0; x < num; x++)
case SBG_EQ: if(op1 != op2)
{
for(y = 0; y < numpreds.z; y++)
cout << hop1[x * numpreds.z + y] << " ";
cout << endl;
res[id] = 0;
return;
}
break;
case SBG_GT: if(op1 <= op2)
{
res[id] = 0;
return;
}
break;
case SBG_LT: if(op1 >= op2)
{
res[id] = 0;
return;
}
break;
case SBG_GE: if(op1 < op2)
{
res[id] = 0;
return;
}
break;
case SBG_LE: if(op1 > op2)
{
res[id] = 0;
return;
}
break;
case SBG_DF: if(op1 == op2)
{
res[id] = 0;
return;
}
}
}
}
free(hop1);*/
liberar(dhead, hsize);
liberar(temp, size);
liberar(dop1, rows * cols * sizeof(int));
*ret = fres;
return num;
}
__global__ void bpredsOR(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int x, rowact, rowact1, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
__syncthreads();
if(id < rows)
{
rowact1 = id * of1;
rowact = id * of2;
for(x = nx; x < numc; x += 3)
{
op1 = shared[x+1];
if(op1 < 0)
op1 = dop1[rowact1 - op1 - 1];
else
op1 = dop2[rowact + op1];
op2 = shared[x+2];
if(op2 < 0)
op2 = dop1[rowact1 - op2 - 1];
else
op2 = dop2[rowact + op2];
switch(shared[x] - BPOFFSET)
{
case SBG_EQ: if(op1 == op2)
{
res2[id] = 1;
x = numc;
}
break;
case SBG_GT: if(op1 > op2)
{
res2[id] = 1;
x = numc;
}
break;
case SBG_LT: if(op1 < op2)
{
res2[id] = 1;
x = numc;
}
break;
case SBG_GE: if(op1 >= op2)
{
res2[id] = 1;
x = numc;
}
break;
case SBG_LE: if(op1 <= op2)
{
res2[id] = 1;
x = numc;
}
break;
case SBG_DF: if(op1 != op2)
{
res2[id] = 1;
x = numc;
}
}
}
for(x = 0; x < nx; x += 3)
{
op1 = shared[x+1];
if(op1 < 0)
op1 *= -1;
else
op1 = dop2[rowact + op1];
op2 = shared[x+2];
if(op2 < 0)
op2 *= -1;
else
op2 = dop2[rowact + op2];
switch(shared[x])
{
case SBG_EQ: if(op1 == op2)
{
res[id] = 1;
return;
}
break;
case SBG_GT: if(op1 > op2)
{
res[id] = 1;
return;
}
break;
case SBG_LT: if(op1 < op2)
{
res[id] = 1;
return;
}
break;
case SBG_GE: if(op1 >= op2)
{
res[id] = 1;
return;
}
break;
case SBG_LE: if(op1 <= op2)
{
res[id] = 1;
return;
}
break;
case SBG_DF: if(op1 != op2)
{
res[id] = 1;
return;
}
}
}
}
}
/*Mark all rows that comply with the comparison predicates using disjunctions (i.e. a row is marked if it complies with at least one predicate)*/
__global__ void bpredsorlogic2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int x, rowact, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
__syncthreads();
if(id < rows)
{
rowact = id * of1;
for(x = 0; x < numc; x += 3)
{
op1 = shared[x+1];
if(op1 < 0)
op1 *= -1;
else
op1 = dop1[rowact + op1];
op2 = shared[x+2];
if(op2 < 0)
op2 *= -1;
else
op2 = dop1[rowact + op2];
switch(shared[x])
{
case SBG_EQ: if(op1 == op2)
{
res[id] = 1;
return;
}
break;
case SBG_GT: if(op1 > op2)
{
res[id] = 1;
return;
}
break;
case SBG_LT: if(op1 < op2)
{
res[id] = 1;
return;
}
break;
case SBG_GE: if(op1 >= op2)
{
res[id] = 1;
return;
}
break;
case SBG_LE: if(op1 <= op2)
{
res[id] = 1;
return;
}
break;
case SBG_DF: if(op1 != op2)
{
res[id] = 1;
return;
}
}
}
}
}
/*Unmark all rows that do not comply with the comparison predicates using disjunctions (i.e. a row is unmarked only if it complies with none of the predicates)*/
__global__ void bpredsorlogic(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int x, rowact, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
__syncthreads();
if(id < rows)
{
if(res[id] == 0)
return;
rowact = id * of1;
for(x = 0; x < numc; x += 3)
{
op1 = shared[x+1];
if(op1 < 0)
op1 *= -1;
else
op1 = dop1[rowact + op1];
op2 = shared[x+2];
if(op2 < 0)
op2 *= -1;
else
op2 = dop1[rowact + op2];
switch(shared[x])
{
case SBG_EQ: if(op1 == op2)
return;
break;
case SBG_GT: if(op1 > op2)
return;
break;
case SBG_LT: if(op1 < op2)
return;
break;
case SBG_GE: if(op1 >= op2)
return;
break;
case SBG_LE: if(op1 <= op2)
return;
break;
case SBG_DF: if(op1 != op2)
return;
}
}
res[id] = 0;
}
}

228
packages/cuda/cuda.c Normal file → Executable file
View File

@ -6,19 +6,25 @@
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>
#include "pred.h"
#define MAXARG 100
YAP_Atom AtomEq,
AtomGt,
AtomLt,
AtomGe,
AtomLe,
AtomDf;
AtomDf,
AtomNt;
predicate *facts[100]; /*Temporary solution to maintain facts and rules*/
predicate *rules[100];
predicate *facts[MAXARG]; /*Temporary solution to maintain facts and rules*/
predicate *rules[MAXARG];
int32_t cf = 0, cr = 0;
char names[1024];
// initialize CUDA system
void Cuda_Initialize( void );
@ -39,6 +45,19 @@ void init_cuda( void );
//#define DEBUG_INTERFACE 1
#ifdef ROCKIT
static int32_t query[100];
static int32_t qcont = 0;
static int cuda_init_query(void)
{
int32_t pname = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG1));
query[qcont] = pname;
qcont++;
query[qcont] = 0;
return TRUE;
}
#endif
#if DEBUG_INTERFACE
static void
dump_mat(int32_t mat[], int32_t nrows, int32_t ncols)
@ -83,8 +102,18 @@ int32_t Cuda_NewFacts(predicate *pe)
#if DEBUG_INTERFACE
dump_mat( pe->address_host_table, pe->num_rows, pe->num_columns );
#endif
#ifdef ROCKIT
if(cf >= 0)
{
facts[cf] = pe;
cf++;
}
#else
facts[cf] = pe;
cf++;
#endif
return TRUE;
}
@ -115,7 +144,7 @@ int32_t Cuda_Erase(predicate *pe)
return TRUE;
}
static YAP_Bool
static int
load_facts( void ) {
int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
@ -164,15 +193,18 @@ load_facts( void ) {
static int currentFact = 0;
static predicate *currentPred = NULL;
static YAP_Bool
static int
cuda_init_facts( void ) {
int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
int32_t ncols = YAP_IntOfTerm(YAP_ARG2), i = 0;
int32_t ncols = YAP_IntOfTerm(YAP_ARG2);
int32_t *mat = (int32_t *)malloc(sizeof(int32_t)*nrows*ncols);
int32_t pname = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG3));
predicate *pred;
strcat(names, YAP_AtomName(YAP_AtomOfTerm(YAP_ARG3)));
strcat(names, " ");
if (!mat)
return FALSE;
if (YAP_IsVarTerm( YAP_ARG4)) {
@ -198,14 +230,16 @@ cuda_init_facts( void ) {
}
}
static YAP_Bool
static int
cuda_load_fact( void ) {
YAP_Term th = YAP_ARG1;
int i, j;
int i = currentFact;
#if defined(DATALOG) || defined(TUFFY)
YAP_Term th = YAP_ARG1;
int ncols = currentPred->num_columns;
int j;
int *mat = currentPred->address_host_table;
i = currentFact;
for (j = 0; j < ncols; j++) {
YAP_Term ta = YAP_ArgOfTerm(j+1, th);
if (YAP_IsAtomTerm(ta)) {
@ -214,6 +248,8 @@ cuda_load_fact( void ) {
mat[i*ncols+j] = YAP_IntOfTerm(ta);
}
}
#endif
i++;
if (i == currentPred->num_rows) {
Cuda_NewFacts(currentPred);
@ -225,21 +261,26 @@ cuda_load_fact( void ) {
return TRUE;
}
static YAP_Bool
static int
load_rule( void ) {
// maximum of 2K symbols per rule, should be enough for ILP
int32_t vec[2048], *ptr = vec, *nvec;
int32_t vec[2048], *ptr = vec, *nvec, neg[2048];
// qK different variables;
YAP_Term vars[1024];
int32_t nvars = 0;
int32_t nvars = 0, x;
int32_t ngoals = YAP_IntOfTerm(YAP_ARG1); /* gives the number of goals */
int32_t ncols = YAP_IntOfTerm(YAP_ARG2);
YAP_Term t3 = YAP_ARG3;
int32_t pname = YAP_AtomToInt(YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3))));
YAP_Atom name = YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3)));
int32_t pname = YAP_AtomToInt(name);
const char *strname = YAP_AtomName(name);
predicate *pred;
int32_t cont = 0;
memset(neg, 0x0, 2048 * sizeof(int32_t));
while(YAP_IsPairTerm(t3)) {
int32_t j = 0;
int32_t j = 0, m;
YAP_Term th = YAP_HeadOfTerm(t3);
YAP_Functor f = YAP_FunctorOfTerm( th );
int32_t n = YAP_ArityOfFunctor( f );
@ -257,8 +298,17 @@ load_rule( void ) {
*ptr++ = SBG_LE;
else if (at == AtomDf)
*ptr++ = SBG_DF;
else if (at == AtomNt)
{
neg[cont] = 1;
cont++;
}
else
{
*ptr++ = YAP_AtomToInt( at );
cont++;
}
for (j = 0; j < n; j++) {
YAP_Term ta = YAP_ArgOfTerm(j+1, th);
@ -277,6 +327,34 @@ load_rule( void ) {
}
} else if (YAP_IsAtomTerm(ta)) {
*ptr++ = -YAP_AtomToInt(YAP_AtomOfTerm(ta));
} else if (YAP_IsApplTerm(ta)) {
f = YAP_FunctorOfTerm( ta );
at = YAP_NameOfFunctor( f );
m = YAP_ArityOfFunctor( f );
*ptr++ = YAP_AtomToInt( at );
for (x = 0; x < m; x++) {
YAP_Term ta2 = YAP_ArgOfTerm(x+1, ta);
if (YAP_IsVarTerm(ta2)) {
int32_t k;
for (k = 0; k < nvars; k++) {
if (vars[k] == ta2) {
*ptr++ = k+1;
break;
}
}
if (k == nvars) {
vars[k] = ta2;
*ptr++ = k+1;
nvars++;
}
} else if (YAP_IsAtomTerm(ta2)) {
*ptr++ = -YAP_AtomToInt(YAP_AtomOfTerm(ta));
} else {
*ptr++ = -YAP_IntOfTerm(ta);
}
}
} else {
*ptr++ = -YAP_IntOfTerm(ta);
}
@ -296,53 +374,136 @@ load_rule( void ) {
pred->num_rows = ngoals;
pred->num_columns = ncols;
pred->is_fact = FALSE;
x = (strlen(strname) + 1) * sizeof(char);
pred->predname = (char *)malloc(x);
memcpy(pred->predname, strname, x);
nvec = (int32_t *)malloc(sizeof(int32_t)*(ptr-vec));
memcpy(nvec, vec, sizeof(int32_t)*(ptr-vec));
pred->address_host_table = nvec;
pred->negatives = (int32_t *)malloc(sizeof(int32_t) * cont);
memcpy(pred->negatives, neg, sizeof(int32_t) * cont);
Cuda_NewRule( pred );
return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
}
static YAP_Bool
static int
cuda_erase( void )
{
predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
return Cuda_Erase( ptr );
}
static YAP_Bool
void setQuery(YAP_Term t1, int32_t **res)
{
int32_t *query = (int32_t *)malloc(MAXARG * sizeof(int32_t));
int32_t x, y = 0, *itr;
predicate *ptr = NULL;
if(YAP_IsPairTerm(t1))
{
while(YAP_IsPairTerm(t1))
{
ptr = (predicate *)YAP_IntOfTerm(YAP_HeadOfTerm(t1));
query[y] = ptr->name;
itr = ptr->address_host_table;
x = 2;
while(itr[x] != 0)
x++;
query[y+1] = itr[x+1];
t1 = YAP_TailOfTerm(t1);
y+=2;
}
}
else
{
ptr = (predicate *)YAP_IntOfTerm(t1);
query[y] = ptr->name;
itr = ptr->address_host_table;
x = 2;
while(itr[x] != 0)
x++;
query[y+1] = itr[x+1];
y += 2;
}
query[y] = -1;
query[y+1] = -1;
*res = query;
}
static int
cuda_eval( void )
{
int32_t *mat;
#if defined(DATALOG) || defined(TUFFY)
int32_t *query = NULL;
setQuery(YAP_ARG1, &query);
#endif
int32_t finalDR = YAP_IntOfTerm(YAP_ARG3);
int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, names, finalDR);
#ifdef TUFFY
cf = 0;
#endif
#ifdef ROCKIT
if(cf > 0)
cf *= -1;
#endif
#if defined(TUFFY) || defined(ROCKIT)
cr = 0;
names[0] = '\0';
return FALSE;
#else
int32_t i;
predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
int32_t n = Cuda_Eval(facts, cf, rules, cr, ptr, & mat);
int32_t ncols = ptr->num_columns;
YAP_Term out = YAP_TermNil();
YAP_Functor f = YAP_MkFunctor(YAP_IntToAtom(ptr->name), ncols);
YAP_Term vec[256];
int32_t i;
YAP_Atom at;
if (n < 0)
return FALSE;
for (i=0; i<n; i++) {
int32_t ni = ((n-1)-i)*ncols, j;
printf("%s(", YAP_AtomName(YAP_IntToAtom(ptr->name)));
for (j=0; j<ncols; j++) {
vec[j] = YAP_MkIntTerm(mat[ni+j]);
at = YAP_IntToAtom(mat[ni+j]);
if(at != NULL)
printf("%s", YAP_AtomName(at));
else
printf("%d", mat[ni+j]);
if(j < (ncols - 1))
printf(",");
}
out = YAP_MkPairTerm(YAP_MkApplTerm( f, ncols, vec ), out);
printf(")\n");
}
if (n > 0)
free( mat );
return YAP_Unify(YAP_ARG2, out);
#endif
}
static YAP_Bool
static int
cuda_coverage( void )
{
int32_t *mat;
predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
int32_t n = Cuda_Eval(facts, cf, rules, cr, ptr, & mat);
int32_t ncols = ptr->num_columns;
#if defined(DATALOG) || defined(TUFFY)
int32_t *query = NULL;
setQuery(YAP_ARG1, &query);
#endif
int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, 0, 0);
int32_t post = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG2));
int32_t i = n/2, min = 0, max = n-1;
int32_t t0, t1;
@ -384,11 +545,16 @@ cuda_coverage( void )
} while ( TRUE );
}
static YAP_Bool cuda_count( void )
static int cuda_count( void )
{
int32_t *mat;
predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
int32_t n = Cuda_Eval(facts, cf, rules, cr, ptr, & mat);
#if defined(DATALOG) || defined(TUFFY)
int32_t *query = NULL;
setQuery(YAP_ARG1, &query);
#endif
int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, 0, 0);
if (n < 0)
return FALSE;
@ -396,7 +562,7 @@ static YAP_Bool cuda_count( void )
return YAP_Unify(YAP_ARG2, YAP_MkIntTerm(n));
}
static YAP_Bool cuda_statistics( void )
static int cuda_statistics( void )
{
Cuda_Statistics();
return TRUE;
@ -416,14 +582,20 @@ init_cuda(void)
AtomGe = YAP_LookupAtom(">=");
AtomLe = YAP_LookupAtom("=<");
AtomDf = YAP_LookupAtom("\\=");
AtomNt = YAP_LookupAtom("not");
YAP_UserCPredicate("load_facts", load_facts, 4);
YAP_UserCPredicate("cuda_init_facts", cuda_init_facts, 4);
YAP_UserCPredicate("cuda_load_fact", cuda_load_fact, 1);
YAP_UserCPredicate("load_rule", load_rule, 4);
YAP_UserCPredicate("cuda_erase", cuda_erase, 1);
YAP_UserCPredicate("cuda_eval", cuda_eval, 2);
YAP_UserCPredicate("cuda_eval", cuda_eval, 3);
YAP_UserCPredicate("cuda_coverage", cuda_coverage, 4);
YAP_UserCPredicate("cuda_count", cuda_count, 2);
YAP_UserCPredicate("cuda_statistics", cuda_statistics, 0);
#ifdef ROCKIT
YAP_UserCPredicate("cuda_init_query", cuda_init_query, 1);
#endif
}

9
packages/cuda/cuda.yap Normal file → Executable file
View File

@ -2,10 +2,11 @@
cuda_inline/2,
cuda_rule/2,
cuda_erase/1,
cuda_eval/2,
cuda_eval/3,
cuda_coverage/4,
cuda_statistics/0,
cuda_count/2]).
cuda_count/2,
cuda_query/1]).
tell_warning :-
print_message(warning,functionality(cuda)).
@ -40,7 +41,7 @@ count_answers(G, N) :-
cuda_rule((Head :- Body) , IdRules) :-
body_to_list( Body, L, [], 1, N),
functor(Head, _Na, Ar),
functor(Head, Na, Ar),
load_rule( N, Ar, [Head|L], IdRules ).
@ -54,3 +55,5 @@ body_to_list( B, NL, L, N0, N) :-
body_to_list( B, [B|L], L, N0, N) :-
N is N0+1.
cuda_query(Call) :-
cuda_init_query(Call).

965
packages/cuda/lista.cu Normal file → Executable file

File diff suppressed because it is too large Load Diff

7
packages/cuda/lista.h Normal file → Executable file
View File

@ -25,8 +25,11 @@ typedef struct auxiliar{
int *numselfj;
int **wherejoin;
int *numjoin;
int3 num_bpreds;
int *builtin;
int totalpreds;
int **preds;
int2 *numpreds;
int *negatives;
char *rulename;
int gen_act;
int gen_ant;
}rulenode;

493
packages/cuda/memory.cu Normal file → Executable file
View File

@ -5,63 +5,101 @@
#include <thrust/device_vector.h>
#include "lista.h"
#include "memory.h"
#include "pred.h"
#define MAX_REC 200
#define HALF_REC (MAX_REC / 2)
#define MAX_FIX_POINTS 100
unsigned int avmem;
memnode temp_storage[MAX_REC];
/*List used to store information (address, size, etc.) about facts and rule results loaded in the GPU*/
list<memnode> GPUmem;
/*List used to store information about rule results offloaded from the GPU to the CPU*/
list<memnode> CPUmem;
/*Auxiliary function to sort rule list*/
bool comparer(const rulenode &r1, const rulenode &r2)
{
return (r1.name > r2.name);
}
/*Used in search functions to compare iterations*/
bool compareiteration(const memnode &r1, const memnode &r2)
{
return (r1.iteration < r2.iteration);
}
/*Used in search functions to compare names*/
bool comparename(const memnode &r1, const memnode &r2)
{
return (r1.name > r2.name);
}
void calcular_mem(int dev)
{
cudaDeviceProp p;
cudaGetDeviceProperties(&p, dev);
avmem = p.totalGlobalMem;
temp_storage[0].dev_address = NULL;
temp_storage[0].size = 0;
temp_storage[HALF_REC].dev_address = NULL;
temp_storage[HALF_REC].size = 0;
//cout << "Initial memory available " << avmem << endl;
}
/*Linear search of 'name' fact*/
template<class InputIterator>
InputIterator buscarhecho(InputIterator first, InputIterator last, int name)
{
while(first!=last)
{
if(first->name == name) return first;
if(first->name == name && first->isrule == 0) return first;
++first;
}
return last;
}
list<memnode>::iterator buscarpornombre(int name, int itr, int *totalrows, int *gpunum)
/*Finds all results of rule 'name' in iteration 'itr' in both CPU and GPU memory. Every result found is removed from its respective list*/
list<memnode>::iterator buscarpornombre(int name, int itr, int *totalrows, int *gpunum, int *cpunum)
{
int x = 1, sum = 0;
int x = 0, sum = 0;
memnode temp;
temp.name = name;
list<memnode>::iterator i;
temp.iteration = itr;
pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
if(rec.first->name == name && rec.first->isrule == 1)
{
temp_storage[x] = *rec.first;
rec.first = GPUmem.erase(rec.first);
sum += temp_storage[x].rows;
x++;
}
else
rec.first++;
}
*gpunum = x;
temp.name = name;
temp.isrule = 1;
i = GPUmem.insert(rec.first, temp);
rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
//cout << "itr = " << itr << " rec.first = " << rec.first->name << endl;
while(rec.first != rec.second)
{
if(rec.first->name == name && rec.first->isrule == 1)
{
temp_storage[x] = *rec.first;
rec.first = CPUmem.erase(rec.first);
sum += temp_storage[x].rows;
x++;
}
else
rec.first++;
}
*totalrows = sum;
*cpunum = x;
return i;
}
list<memnode>::iterator buscarpornombrecpu(int name, int itr, int *totalrows, int *gpunum, int *cpunum)
{
int x = 0, sum = 0;
memnode temp;
list<memnode>::iterator i;
temp.iteration = itr;
pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
if(rec.first->name == name)
{
temp_storage[x] = *rec.first;
@ -72,22 +110,11 @@ list<memnode>::iterator buscarpornombre(int name, int itr, int *totalrows, int *
else
rec.first++;
}
//if(x > 1)
rec.first = GPUmem.insert(rec.first, temp);
*totalrows = sum;
*gpunum = x;
return rec.first;
}
int buscarpornombrecpu(int name, int itr, int *totalrows)
{
int x = HALF_REC + 1, sum = 0;
memnode temp;
temp.iteration = itr;
pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
/*if(rec.first != rec.second)
cout << "bscnomcpu = " << rec.first->name << " " << rec.first->iteration << endl;*/
temp.name = name;
temp.isrule = 1;
rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
@ -101,18 +128,24 @@ int buscarpornombrecpu(int name, int itr, int *totalrows)
else
rec.first++;
}
*totalrows += sum;
return x;
i = CPUmem.insert(rec.first, temp);
*totalrows = sum;
*cpunum = x;
return i;
}
/*Removes the least recently used memory block from GPU memory, sending it to CPU memory if it's a rule result.
If there are no used memory blocks in the GPU and we still don't have enough memory, the program exits with error*/
void limpiar(const char s[], size_t sz)
{
list<memnode>::iterator ini;
memnode temp;
size_t free, total;
if(GPUmem.size() == 0)
{
cerr << s << ": not enough GPU memory: have " << avmem << ", need " << sz << " bytes." << endl;
cudaMemGetInfo(&free,&total);
cerr << s << ": not enough GPU memory: have " << free << " of " << total << ", need " << sz << " bytes." << endl;
exit(1);
}
@ -122,80 +155,32 @@ void limpiar(const char s[], size_t sz)
temp = *ini;
temp.dev_address = (int *)malloc(ini->size);
cudaMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
CPUmem.push_back(temp);
list<memnode>::iterator pos = lower_bound(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
CPUmem.insert(pos, temp);
}
liberar(ini->dev_address, ini->size);
cudaFree(ini->dev_address);
GPUmem.erase(ini);
}
void limpiartodo(int *p1, int *p2)
/*Allocs 'size' amount of bytes in GPU memory. If not enough memory is available, removes least recently used memory blocks until
enough space is available*/
void reservar(int **ptr, size_t size)
{
list<memnode>::iterator ini;
memnode temp;
int cont = 0;
if(p1 != NULL)
cont++;
if(p2 != NULL)
cont++;
ini = GPUmem.begin();
/*cout << "ANTES" << endl;
mostrar_memoria();
mostrar_memcpu();
cout << "FIN ANTES" << endl;*/
//cout << "mem = " << GPUmem.size() << " " << avmem << endl;
while(GPUmem.size() > cont)
{
if(ini->dev_address == p1 || ini->dev_address == p2)
{
ini++;
continue;
}
if(ini->isrule)
{
temp = *ini;
temp.dev_address = (int *)malloc(ini->size);
cudaMemcpy(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
CPUmem.push_back(temp);
}
liberar(ini->dev_address, temp.size);
ini = GPUmem.erase(ini);
}
/*cout << "DESPUES" << endl;
mostrar_memoria();
mostrar_memcpu();
cout << "FIN DESPUES" << endl;*/
//cout << "memfinal = " << GPUmem.size() << " " << avmem << endl;
}
void liberar(int *ptr, int size)
{
//cout << "L " << avmem << " " << size;
cudaFree(ptr);
#ifdef DEBUG_MEM
cerr << "- " << ptr << " " << size << endl;
#endif
avmem += size;
//cout << " " << avmem << endl;
}
void reservar(int **ptr, int size)
{
//size_t free, total;
//cudaMemGetInfo( &free, &total );
// cerr << "? " << free << " " << size << endl;
size_t free, total;
if (size == 0) {
*ptr = NULL;
return;
}
while(avmem < size)
cudaMemGetInfo(&free, &total);
while(free < size)
{
cout << "Se limpio memoria " << free << " " << total << endl;
limpiar("not enough memory", size);
cudaMemGetInfo(&free, &total);
}
while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
limpiar("Error in memory allocation", size);
if (! *ptr ) {
@ -205,11 +190,9 @@ void reservar(int **ptr, int size)
cerr << "Exiting CUDA...." << endl;
exit(1);
}
avmem -= size;
// cout << " " << avmem << endl;
}
/*Creates a new entry in the GPU memory list*/
void registrar(int name, int num_columns, int *ptr, int rows, int itr, int rule)
{
memnode temp;
@ -222,6 +205,19 @@ void registrar(int name, int num_columns, int *ptr, int rows, int itr, int rule)
GPUmem.push_back(temp);
}
void registrarcpu(int name, int num_columns, int *ptr, int rows, int itr, int rule)
{
memnode temp;
temp.name = name;
temp.dev_address = ptr;
temp.rows = rows;
temp.size = rows * num_columns * sizeof(int);
temp.iteration = itr;
temp.isrule = rule;
CPUmem.push_back(temp);
}
/*Updates the information of an element in a list*/
template<class InputIterator>
void actualizar(int num_columns, int *ptr, int rows, InputIterator i)
{
@ -230,6 +226,7 @@ void actualizar(int num_columns, int *ptr, int rows, InputIterator i)
i->size = rows * num_columns * sizeof(int);
}
/*Count the total number of rows generated by rule 'name' in iteration 'iter'*/
int numrows(int name, int itr)
{
int sum = 0;
@ -252,16 +249,17 @@ int numrows(int name, int itr)
return sum;
}
extern "C" void * YAP_IntToAtom(int);
extern "C" char * YAP_AtomName(void *);
/*Loads facts or rule results in GPU memory. If a fact is already in GPU memory, its pointer is simply returned. Otherwise,
memory is reserved and the fact is loaded. Rule results are loaded based on the current iteration 'itr' and both GPU and
CPU memories are searched for all instances of said results. The instances are combined into a single one in GPU memory.*/
int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_host_table, int **ptr, int itr)
{
int numgpu, numcpu, totalrows = 0;
int *temp, x;
int size, itrant;
int size, itrant, inc = 0;
list<memnode>::iterator i;
memnode fact;
@ -279,9 +277,6 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
}
size = num_rows * num_columns * sizeof(int);
reservar(&temp, size);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp " << size << endl;
#endif
cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
registrar(name, num_columns, temp, num_rows, itr, 0);
*ptr = temp;
@ -290,28 +285,25 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
if(itr > 0)
{
itrant = itr - 1;
i = buscarpornombre(name, itrant, &totalrows, &numgpu);
numcpu = buscarpornombrecpu(name, itrant, &totalrows);
if((numgpu == 2) && (numcpu == (HALF_REC + 1)))
i = buscarpornombre(name, itrant, &totalrows, &numgpu, &numcpu);
if((numgpu == 1) && (numcpu == 1))
{
actualizar(num_columns, temp_storage[1].dev_address, temp_storage[1].rows, i);
*ptr = temp_storage[1].dev_address;
return temp_storage[1].rows;
actualizar(num_columns, temp_storage[0].dev_address, temp_storage[0].rows, i);
*ptr = temp_storage[0].dev_address;
return temp_storage[0].rows;
}
size = totalrows * num_columns * sizeof(int);
reservar(&temp, size);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp 2 " << size << endl;
#endif
for(x = 1; x < numgpu; x++)
for(x = 0; x < numgpu; x++)
{
cudaMemcpyAsync(temp + temp_storage[x-1].size, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
liberar(temp_storage[x].dev_address, temp_storage[x].size);
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
inc += temp_storage[x].size / sizeof(int);
cudaFree(temp_storage[x].dev_address);
}
for(x = HALF_REC + 1; x < numcpu; x++)
for(; x < numcpu; x++)
{
cudaMemcpyAsync(temp + temp_storage[x-1].size, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
inc += temp_storage[x].size / sizeof(int);
free(temp_storage[x].dev_address);
}
actualizar(num_columns, temp, totalrows, i);
@ -321,9 +313,54 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
return 0;
}
int cargarcpu(int name, int num_rows, int num_columns, int is_fact, int *address_host_table, int **ptr, int itr)
{
int numgpu, numcpu, totalrows = 0;
int *temp, x;
int size, itrant, inc = 0;
list<memnode>::iterator i;
if(is_fact)
{
*ptr = address_host_table;
return num_rows;
}
if(itr > 0)
{
itrant = itr - 1;
i = buscarpornombrecpu(name, itrant, &totalrows, &numgpu, &numcpu);
if((numgpu == 0) && (numcpu == 1))
{
actualizar(num_columns, temp_storage[0].dev_address, temp_storage[0].rows, i);
*ptr = temp_storage[0].dev_address;
return temp_storage[0].rows;
}
size = totalrows * num_columns * sizeof(int);
temp = (int *)malloc(size);
for(x = 0; x < numgpu; x++)
{
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToHost);
inc += temp_storage[x].size / sizeof(int);
cudaFree(temp_storage[x].dev_address);
}
for(; x < numcpu; x++)
{
memcpy(temp + inc, temp_storage[x].dev_address, temp_storage[x].size);
inc += temp_storage[x].size / sizeof(int);
free(temp_storage[x].dev_address);
}
actualizar(num_columns, temp, totalrows, i);
*ptr = temp;
return totalrows;
}
return 0;
}
/*Loads all results of rule 'name' from both GPU and CPU memories into the GPU*/
int cargafinal(int name, int cols, int **ptr)
{
int *temp, *ini, cont = 0;
int *temp, *ini, cont = 0, numg = 0, numc = 0;
memnode bus;
bus.name = name;
GPUmem.sort(comparename);
@ -335,6 +372,7 @@ int cargafinal(int name, int cols, int **ptr)
while(pos != endg && pos->name == name)
{
cont += pos->rows;
numg++;
pos++;
}
pos = lower_bound(CPUmem.begin(), endc, bus, comparename);
@ -342,15 +380,41 @@ int cargafinal(int name, int cols, int **ptr)
while(pos != endc && pos->name == name)
{
cont += pos->rows;
numc++;
pos++;
}
reservar(&temp, cont * cols * sizeof(int));
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp 3 " << cont * cols * sizeof(int) << endl;
#endif
ini = temp;
if(numg == 0 && numc == 0)
return 0;
if(numg == 1 && numc == 0)
{
pos = gpu;
*ptr = pos->dev_address;
cont = pos->rows;
GPUmem.erase(pos);
#ifdef TUFFY
return -cont;
#else
return cont;
#endif
}
if(numg == 0 && numc == 1)
{
pos = cpu;
cont = pos->rows;
#ifdef TUFFY
reservar(&temp, pos->size);
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
*ptr = temp;
#else
*ptr = pos->dev_address;
#endif
CPUmem.erase(pos);
return -cont;
}
reservar(&temp, cont * cols * sizeof(int));
ini = temp;
pos = gpu;
while(pos != endg && pos->name == name)
{
@ -365,23 +429,13 @@ int cargafinal(int name, int cols, int **ptr)
temp += pos->size / sizeof(int);
pos++;
}
/*int x, y;
int *hop1 = (int *)malloc(cont * cols * sizeof(int));
cudaMemcpy(hop1, ini, cont * cols * sizeof(int), cudaMemcpyDeviceToHost);
cout << "select finala" << endl;
for(x = 0; x < cont; x++)
{
for(y = 0; y < cols; y++)
cout << hop1[x * cols + y] << " ";
cout << endl;
}
cout << "select finala" << endl;*/
*ptr = ini;
return cont;
}
/*Compares the results of the current iteration against the results of older iterations.
Used to avoid infinite computations when the result is not a single fixed-point, but an
orbit of points.*/
bool generadas(int name, int filas, int cols, int itr)
{
int r1, r2, x, fin;
@ -401,46 +455,26 @@ bool generadas(int name, int filas, int cols, int itr)
thrust::device_ptr<int> pt2 = thrust::device_pointer_cast(dop2);
r1 = cargar(name, filas, cols, 0, NULL, &dop1, itr - x + 1);
thrust::device_ptr<int> pt1 = thrust::device_pointer_cast(dop1);
/*int y;
int *a = (int *)malloc(r1 * cols * sizeof(int));
cudaMemcpy(a, dop1, r1 * cols * sizeof(int), cudaMemcpyDeviceToHost);
for(x = 0; x < r1; x++)
{
for(y = 0; y < cols; y++)
cout << a[x * cols + y] << " ";
}
cout << endl;
cudaMemcpy(a, dop2, r1 * cols * sizeof(int), cudaMemcpyDeviceToHost);
for(x = 0; x < r1; x++)
{
for(y = 0; y < cols; y++)
cout << a[x * cols + y] << " ";
}
cout << endl;
free(a);*/
if(thrust::equal(pt1, pt1 + r1, pt2) == true)
return true;
}
}
return false;
}
void mostrar_memoria()
{
int x;
unsigned int x;
list<memnode>::iterator i = GPUmem.begin();
cout << "Memoria inicio GPU" << endl;
for(x = 0; x < GPUmem.size(); x++, i++)
cout << i->name << " " << i->iteration << " " << i->size << endl;
cout << i->name << " " << i->iteration << " " << i->isrule << " " << i->rows << " " << i->size << endl;
cout << "Memoria fin GPU" << endl;
}
void mostrar_memcpu()
{
int x;
unsigned int x;
list<memnode>::iterator i = CPUmem.begin();
cout << "Memoria inicio CPU" << endl;
for(x = 0; x < CPUmem.size(); x++, i++)
@ -448,53 +482,7 @@ void mostrar_memcpu()
cout << "Memoria fin CPU" << endl;
}
void resultados(vector<rulenode>::iterator first, vector<rulenode>::iterator last)
{
GPUmem.sort(comparename);
CPUmem.sort(comparename);
list<memnode>::iterator gpu = GPUmem.begin();
list<memnode>::iterator cpu = CPUmem.begin();
int x, y, of, cols;
int *temp, cont = 0;
while(first != last)
{
while(first->name == gpu->name)
{
temp = (int *)malloc(gpu->size);
cudaMemcpy(temp, gpu->dev_address, gpu->size, cudaMemcpyDeviceToHost);
cols = gpu->size / (gpu->rows * sizeof(int));
cont += gpu->rows;
for(x = 0, of = 0; x < gpu->rows; x++)
{
for(y = 0; y < cols; y++, of++)
cout << temp[of] << " ";
cout << endl;
}
cudaFree(gpu->dev_address);
#ifdef DEBUG_MEM
cerr << "- " << gpu->dev_address << " gpu->dev_address" << endl;
#endif
free(temp);
gpu++;
}
while(first->name == cpu->name)
{
cols = cpu->size / (cpu->rows * sizeof(int));
cont += cpu->rows;
for(x = 0, of = 0; x < cpu->rows; x++)
{
for(y = 0; y < cols; y++, of++)
cout << cpu->dev_address[of] << " ";
cout << endl;
}
free(cpu->dev_address);
cpu++;
}
first++;
}
cout << cont << endl;
}
/*Clear all rule results from both GPU and CPU memory*/
void clear_memory()
{
list<memnode>::iterator ini;
@ -503,15 +491,13 @@ void clear_memory()
fin = GPUmem.end();
while(ini != fin)
{
if (ini->isrule) {
if(ini->isrule)
{
cudaFree(ini->dev_address);
#ifdef DEBUG_MEM
cerr << "- " << ini->dev_address << " ini->dev_address" << endl;
#endif
ini = GPUmem.erase(ini);
} else {
ini++;
}
else
ini++;
}
ini = CPUmem.begin();
fin = CPUmem.end();
@ -522,3 +508,68 @@ void clear_memory()
}
CPUmem.clear();
}
/*Clear everything from both GPU and CPU memory*/
void clear_memory_all()
{
list<memnode>::iterator ini;
list<memnode>::iterator fin;
ini = GPUmem.begin();
fin = GPUmem.end();
while(ini != fin)
{
cudaFree(ini->dev_address);
ini++;
}
GPUmem.clear();
ini = CPUmem.begin();
fin = CPUmem.end();
while(ini != fin)
{
free(ini->dev_address);
ini++;
}
CPUmem.clear();
}
/*Remove all instances of fact 'name' from both CPU and GPU memories*/
void liberar(int name)
{
list<memnode>::iterator i;
memnode fact;
i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
if(i != GPUmem.end())
{
fact = *i;
GPUmem.erase(i);
cudaFree(fact.dev_address);
}
i = buscarhecho(CPUmem.begin(), CPUmem.end(), name);
if(i != CPUmem.end())
{
fact = *i;
CPUmem.erase(i);
free(fact.dev_address);
}
}
/*Add all rows in 'dop1' to the fact 'name' by creating a new array capable of holding both.*/
void sumar(int name, int *dop1, int cols, int rows)
{
list<memnode>::iterator i;
memnode fact;
i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
int *res, newrows, offset;
if(i != GPUmem.end())
{
fact = *i;
newrows = rows + fact.rows;
reservar(&res, newrows * cols * sizeof(int));
offset = fact.rows * cols;
cudaMemcpyAsync(res, fact.dev_address, offset * sizeof(int), cudaMemcpyDeviceToDevice);
GPUmem.erase(i);
registrar(name, cols, res, newrows, 0, 0);
cudaMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), cudaMemcpyDeviceToDevice);
cudaFree(fact.dev_address);
}
}

13
packages/cuda/memory.h Normal file → Executable file
View File

@ -1,26 +1,27 @@
#ifndef _MEMORY_H_
#define _MEMORY_H_
//#include <thrust/device_vector.h>
#include <list>
#include <vector>
#include "lista.h"
using namespace std;
//using namespace thrust;
void calcular_mem(int);
void liberar(int*, int);
bool comparer(const rulenode&, const rulenode&);
void limpiar(const char [], size_t);
void limpiartodo(int*, int*);
int cargar(int, int, int, int, int*, int**, int);
int cargarcpu(int, int, int, int, int*, int**, int);
int cargafinal(int, int, int**);
void reservar(int**, int);
void reservar(int**, size_t);
void registrar(int, int, int*, int, int, int);
void registrarcpu(int, int, int*, int, int, int);
bool generadas(int, int, int, int);
void sumar(int, int*, int, int);
void liberar(int);
void mostrar_memoria(void);
void mostrar_memcpu(void);
void clear_memory(void);
void resultados(vector<rulenode>::iterator, vector<rulenode>::iterator);
void clear_memory_all(void);
#endif

12
packages/cuda/pred.h Normal file → Executable file
View File

@ -9,11 +9,17 @@ typedef struct Nodo{
int num_columns;
int is_fact;
int *address_host_table;
int *negatives;
char *predname;
double *weight;
}gpunode;
typedef gpunode predicate;
// #define TIMER 1
//#define TIMER 1
#define DATALOG 1
#define NUM_T 4
#define INISIZE 1000000
#if TIMER
typedef struct Stats{
@ -27,6 +33,8 @@ typedef struct Stats{
extern statinfo cuda_stats;
#endif
/*Constants used to mark comparison predicates*/
#define BPOFFSET (-6)
#define SBG_EQ (-1)
#define SBG_GT (-2)
#define SBG_LT (-3)
@ -34,6 +42,6 @@ extern statinfo cuda_stats;
#define SBG_LE (-5)
#define SBG_DF (-6)
int Cuda_Eval(predicate**, int, predicate**, int, predicate*, int**);
int Cuda_Eval(predicate**, int, predicate**, int, int*, int**, char*, int);
void Cuda_Statistics( void );
#endif

188
packages/cuda/selectproyect.cu Normal file → Executable file
View File

@ -1,10 +1,11 @@
#include <thrust/device_vector.h>
//#include <thrust/device_ptr.h>
#include <thrust/scan.h>
#include <stdlib.h>
#include "memory.h"
#include "bpreds.h"
__global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res) /*a libreria*/
/*Mark all rows that comply with the selections*/
__global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
@ -24,14 +25,14 @@ __global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *
res[id] = 1;
}
}
__global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res) /*a libreria*/
/*If we already have an array of marks (perhaps because the selfjoin was applied first),
we unmark any rows that do not comply with the selections*/
__global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int *spos = &shared[numc];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int x, rowact, posact;
if(threadIdx.x < (numc * 2))
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
__syncthreads();
if(id < rows)
@ -39,10 +40,10 @@ __global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int
if(res[id] == 0)
return;
rowact = id * cols;
for(x = 0; x < numc; x++)
for(x = 0; x < numc; x += 2)
{
posact = rowact + spos[x];
if(dop1[posact] != shared[x])
posact = rowact + shared[x];
if(dop1[posact] != shared[x+1])
{
res[id] = 0;
return;
@ -51,6 +52,7 @@ __global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int
}
}
/*Unmark all rows that do not comply with the selfjoins.*/
__global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
extern __shared__ int shared[];
@ -66,12 +68,12 @@ __global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, in
pos = id * cols;
for(x = 0; x < cont; x++)
{
temp = shared[x];
temp = dop1[pos+shared[x]];
y = x + 1;
temp2 = shared[y];
while(temp2 > -1)
{
if(dop1[temp+pos] != dop1[temp2+pos])
if(temp != dop1[temp2+pos])
{
res[id] = 0;
return;
@ -84,6 +86,7 @@ __global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, in
}
}
/*Mark all rows that comply with the selfjoins*/
__global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
extern __shared__ int shared[];
@ -97,12 +100,12 @@ __global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, i
pos = id * cols;
for(x = 0; x < cont; x++)
{
temp = shared[x];
temp = dop1[pos+shared[x]];
y = x + 1;
temp2 = shared[y];
while(temp2 > -1)
{
if(dop1[temp+pos] != dop1[temp2+pos])
if(temp != dop1[temp2+pos])
return;
y++;
temp2 = shared[y];
@ -113,6 +116,7 @@ __global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, i
}
}
/*Project all columns found in 'dhead' to a new array 'res'*/
__global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize, int *res)
{
extern __shared__ int shared[];
@ -130,76 +134,31 @@ __global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize,
}
}
/*Project all columns found in 'dhead' using only the rows marked as valid (i.e. those that complied with
selections, selfjoins, etc.). The array 'temp' holds the result of the prefix sum of said marks.*/
__global__ void llenarproyectar(int *dop1, int rows, int cols, int *temp, int *dhead, int hsize, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int pos, posr, x;
if(threadIdx.x < cols)
if(threadIdx.x < hsize)
shared[threadIdx.x] = dhead[threadIdx.x];
__syncthreads();
if(id < rows)
{
posr = temp[id+1];
if(temp[id] != posr && posr > 0)
posr = temp[id];
if(temp[id+1] != posr)
{
pos = id * cols;
posr = (posr - 1) * hsize;
posr *= hsize;
for(x = 0; x < hsize; x++, posr++)
res[posr] = dop1[pos+shared[x]];
}
}
}
/*__global__ void removedup()
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(threadIdx.x < cols)
shared[threadIdx.x] = dhead[threadIdx.x];
if(id < rows)
{
}
}*/
template<typename T> /*a libreria*/
struct suma : public binary_function<T,T,T>
{
__host__ __device__
T operator()(const T &r1, const T &r2)
{
if(r1 > -1)
{
if(r2 > 0)
return r1 + r2;
return -r1;
}
else
{
if(r2 > 0)
return abs(r1) + r2;
return r1;
}
}
};
int mayor(int a, int b, int c)
{
if(a > b)
{
if(a > c)
return a;
}
else
{
if(b > c)
return b;
}
return c;
}
int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int numselect, int *selfjoin, int numselfj, int *project, int **ret)
/*Performs selections, selfjoins and comparison predicates when the rule has a single normal predicate.*/
int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int numselect, int *selfjoin, int numselfj, int *preds, int numpreds, int *project, int **ret, int ANDlogic)
{
int *fres = NULL, *temp = NULL;
int *dhead = NULL, tmplen;
@ -209,30 +168,27 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
#if TIMER
cuda_stats.selects++;
#endif
int head_bytes = mayor(numselect, numselfj, head_size) * sizeof(int);
int head_bytes = maximo(4, numselect, numselfj, numpreds, head_size) * sizeof(int);
reservar(&dhead, head_bytes);
#ifdef DEBUG_MEM
cerr << "+ " << dhead << " dhead " << head_bytes << endl;
#endif
int blockllen = rows / 1024 + 1;
int numthreads = 1024;
//int numthreads = 32;
int blockllen = rows / numthreads + 1;
#ifdef ROCKIT
ANDlogic = 1;
#endif
//removerep(dop1, rows, cols, dhead,)
if(numselect > 0)
{
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp select " << size2 << endl;
#endif
cudaMemset(temp, 0, size2);
size = numselect * sizeof(int);
cudaMemcpy(dhead, select, size, cudaMemcpyHostToDevice);
marcar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);
marcar2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);
if(numselfj > 0)
{
@ -241,6 +197,16 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
samejoin<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
}
if(numpreds > 0)
{
size = numpreds * sizeof(int);
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
else
bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
}
res = thrust::device_pointer_cast(temp);
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
num = res[rows];
@ -249,13 +215,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
size = head_size * sizeof(int);
reservar(&fres, num * size);
#ifdef DEBUG_MEM
cerr << "+ " << fres << " fres select " << num*size << endl;
#endif
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
liberar(dhead, head_bytes);
liberar(temp, size2);
cudaFree(dhead);
cudaFree(temp);
*ret = fres;
return num;
}
@ -266,15 +229,22 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
#ifdef DEBUG_MEM
cerr << "+ " << temp << " temp select " << size2 << endl;
#endif
cudaMemset(temp, 0, size2);
size = numselfj * sizeof(int);
cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
samejoin2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
if(numpreds > 0)
{
size = numpreds * sizeof(int);
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
else
bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
}
res = thrust::device_pointer_cast(temp);
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
num = res[rows];
@ -283,13 +253,41 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
size = head_size * sizeof(int);
reservar(&fres, num * size);
#ifdef DEBUG_MEM
cerr << "+ " << fres << " fres select again " << num*size << endl;
#endif
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
liberar(dhead, head_bytes);
liberar(temp, size2);
cudaFree(dhead);
cudaFree(temp);
*ret = fres;
return num;
}
else
{
if(numpreds > 0)
{
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
cudaMemset(temp, 0, size2);
size = numpreds * sizeof(int);
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
else
bpredsorlogic2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
res = thrust::device_pointer_cast(temp);
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
num = res[rows];
if(num == 0)
return 0;
size = head_size * sizeof(int);
reservar(&fres, num * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
cudaFree(dhead);
cudaFree(temp);
*ret = fres;
return num;
}
@ -297,14 +295,12 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
{
size = head_size * sizeof(int);
reservar(&fres, rows * size);
#ifdef DEBUG_MEM
cerr << "+ " << fres << " fres select third " << rows*size << endl;
#endif
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
liberar(dhead, head_bytes);
cudaFree(dhead);
*ret = fres;
return rows;
}
}
}
}

File diff suppressed because it is too large Load Diff

889
packages/cuda/union2.cu Normal file → Executable file

File diff suppressed because it is too large Load Diff

View File

@ -2158,12 +2158,14 @@ static foreign_t init_python(void) {
char **argv;
term_t t = PL_new_term_ref();
YAP_Argv(&argv);
if (argv) {
#if PY_MAJOR_VERSION < 3
Py_SetProgramName(argv[0]);
#else
wchar_t *buf = Py_DecodeLocale(argv[0], NULL);
Py_SetProgramName(buf);
#endif
}
Py_Initialize();
py_Main = PyImport_AddModule("__main__");
py_Builtin = PyImport_AddModule("__builtin__");