new version of cuda interface

Vitor Santos Costa 2016-07-31 10:14:02 -05:00
parent c6d174841a
commit d3599da6dc
37 changed files with 7040 additions and 367 deletions
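Most of the diff below is a mechanical port of the package from the CUDA runtime API to HIP: each ported .cu file gains #include "hip/hip_runtime.h", host calls are renamed one-for-one (cudaMalloc/cudaMemcpy/cudaFree → hipMalloc/hipMemcpy/hipFree, cudaEvent* → hipEvent*, cudaMemGetInfo → hipMemGetInfo, cudaSetDevice → hipSetDevice), and the device built-ins blockIdx.x, blockDim.x and threadIdx.x become hipBlockIdx_x, hipBlockDim_x and hipThreadIdx_x. The short sketch below is an editorial illustration of that renaming pattern, not code from this commit; the kernel, launch configuration and sizes are invented.

#include "hip/hip_runtime.h"                      // added at the top of each ported .cu file

// Hypothetical kernel with the same indexing shape as the bpreds* kernels.
__global__ void scale(int *v, int rows)
{
    int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;   // was blockIdx.x * blockDim.x + threadIdx.x
    if (id < rows)
        v[id] *= 2;
}

int main()
{
    const int rows = 256;
    const size_t size = rows * sizeof(int);
    int host[256] = {0}, *dev;

    hipMalloc(&dev, size);                                      // was cudaMalloc
    hipMemcpy(dev, host, size, hipMemcpyHostToDevice);          // was cudaMemcpy(..., cudaMemcpyHostToDevice)

    hipEvent_t start, stop;                                     // was cudaEvent_t
    hipEventCreate(&start);
    hipEventCreate(&stop);
    hipEventRecord(start, 0);
    hipLaunchKernelGGL(scale, dim3(1), dim3(rows), 0, 0, dev, rows);
    hipEventRecord(stop, 0);
    hipEventSynchronize(stop);
    float ms;
    hipEventElapsedTime(&ms, start, stop);                      // was cudaEventElapsedTime
    hipEventDestroy(start);
    hipEventDestroy(stop);

    hipMemcpy(host, dev, size, hipMemcpyDeviceToHost);          // was cudaMemcpyDeviceToHost
    hipFree(dev);                                               // was cudaFree
    return 0;
}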

2
.gitignore vendored

@@ -179,3 +179,5 @@ packages/myddas/hh
packages/myddas/DaysInHospital_Y3.csv
packages/myddas/agile.csv
*.pyc

0
packages/cuda/CC_CSSTree.cu Executable file → Normal file

0
packages/cuda/CC_CSSTree.h Executable file → Normal file

0
packages/cuda/Makefile.in Executable file → Normal file

37
packages/cuda/bpreds.cu Executable file → Normal file

@@ -1,3 +1,4 @@
#include "hip/hip_runtime.h"
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cstdarg>
@@ -25,10 +26,10 @@ int maximo(int count, ...)
__global__ void bpreds(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int x, rowact, rowact1, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
if(hipThreadIdx_x < numc)
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{
@@ -110,10 +111,10 @@ __global__ void bpreds(int *dop1, int *dop2, int rows, int of1, int of2, int *co
__global__ void bpredsnormal2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int x, rowact, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
if(hipThreadIdx_x < numc)
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{
@@ -159,10 +160,10 @@ __global__ void bpredsnormal2(int *dop1, int rows, int of1, int *cons, int numc,
__global__ void bpredsnormal(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int x, rowact, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
if(hipThreadIdx_x < numc)
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{
@@ -226,10 +227,10 @@ __global__ void bpredsnormal(int *dop1, int rows, int of1, int *cons, int numc,
__global__ void bpredsOR(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int x, rowact, rowact1, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
if(hipThreadIdx_x < numc)
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{
@@ -344,10 +345,10 @@ __global__ void bpredsOR(int *dop1, int *dop2, int rows, int of1, int of2, int *
__global__ void bpredsorlogic2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int x, rowact, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
if(hipThreadIdx_x < numc)
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{
@@ -411,10 +412,10 @@ __global__ void bpredsorlogic2(int *dop1, int rows, int of1, int *cons, int numc
__global__ void bpredsorlogic(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int x, rowact, op1, op2;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
if(hipThreadIdx_x < numc)
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{

1
packages/cuda/bpreds.h Executable file → Normal file

@@ -1,3 +1,4 @@
#include "hip/hip_runtime.h"
#ifndef _BPREDS_H_
#define _BPREDS_H_

0
packages/cuda/bpredscpu.cpp Executable file → Normal file

52
packages/cuda/clamp.rb Normal file

@@ -0,0 +1,52 @@
require "formula"
# Documentation: https://github.com/Homebrew/homebrew/wiki/Formula-Cookbook
# /usr/local/Library/Contributions/example-formula.rb
# PLEASE REMOVE ALL GENERATED COMMENTS BEFORE SUBMITTING YOUR PULL REQUEST!
class Clamp < Formula
homepage "https://bitbucket.org/multicoreware/cppamp-driver-ng/wiki/Home"
version "0.0.1-3"
url "https://bitbucket.org/multicoreware/cppamp-driver-ng/get/milestone3.tar.bz2"
head "https://bitbucket.org/multicoreware/cppamp-driver-ng.git"
sha1 "b8b88306561a60942f8ecbd8ff20554661c4e5f9"
depends_on "cmake" => :build
depends_on "wget" => :build
depends_on "git" => :build
depends_on "hg" => :build
depends_on "subversion" => :build
# depends_on :x11 # if your formula requires any X11/XQuartz components
def install
# ENV.deparallelize # if your formula fails when building in parallel
# Remove unrecognized options if warned by configure
# system "./configure", "--disable-debug",
# "--disable-dependency-tracking",
# "--disable-silent-rules",
# "--prefix=#{prefix}"
mkdir "macbuild" do
args = std_cmake_args
args << "-DCLANG_URL=https://bitbucket.org/multicoreware/cppamp-ng.git"
args << "-DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=CBackend"
args << "-DGMAC_URL=https://bitbucket.org/multicoreware/gmac"
system 'cmake', "..", *args
system "make", "world"
system "cd libc++; make install"
system "make", "install" # if this fails, try separate make/make install steps
end
end
test do
# `test do` will create, run in and delete a temporary directory.
#
# This test will fail and we won't accept that! It's enough to just replace
# "false" with the main program this formula installs, but it'd be nice if you
# were more thorough. Run the test with `brew test milestone`.
#
# The installed folder is not in the path, so use the entire path to any
# executables being tested: `system "#{bin}/program", "do", "something"`.
system "make", "test"
end
end

4
packages/cuda/creator2.c Executable file → Normal file

@@ -66,7 +66,7 @@ int main(int argc, char *argv[])
fprintf(cuda, "\t\t\t{\n");
fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);\n");
fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
fprintf(cuda, "\t\t\t\t*ret = nres;\n");
fprintf(cuda, "\t\t\t}\n");
@@ -103,7 +103,7 @@ int main(int argc, char *argv[])
fprintf(cuda, "\t\t\t{\n");
fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);\n");
fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
fprintf(cuda, "\t\t\t\t*ret = nres;\n");
fprintf(cuda, "\t\t\t}\n");

0
packages/cuda/cuda.c Executable file → Normal file

0
packages/cuda/cuda.yap Executable file → Normal file

packages/cuda/dbio.cu

@@ -27,8 +27,8 @@ void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode
res_rows = unir(dop1, res_rows, cols1, &dop1, 0);
tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo);
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
cudaFree(dop1);
hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
hipFree(dop1);
*result = hres;
}
else
@@ -39,13 +39,13 @@ void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode
int *dop2;
tipo = res_rows * cols1 * sizeof(int);
reservar(&dop2, tipo);
cudaMemcpy(dop2, dop1, tipo, cudaMemcpyHostToDevice);
hipMemcpy(dop2, dop1, tipo, hipMemcpyHostToDevice);
free(dop1);
res_rows = unir(dop2, res_rows, cols1, &dop2, 0);
tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo);
cudaMemcpy(hres, dop2, tipo, cudaMemcpyDeviceToHost);
cudaFree(dop2);
hipMemcpy(hres, dop2, tipo, hipMemcpyDeviceToHost);
hipFree(dop2);
*result = hres;
}
else
@@ -315,8 +315,8 @@ void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str,
tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo);
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
cudaFree(dop1);
hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
hipFree(dop1);
w = z + 1;
strtok(qposr->rulename, "_");
@@ -353,8 +353,8 @@ void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str,
res_rows = abs(res_rows);
tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo);
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
cudaFree(dop1);
hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
hipFree(dop1);
char file[] = "/dev/shm/buffer.csv";
FILE *fp;
@@ -554,7 +554,7 @@ void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator f
sign = tmpfact.predname;
tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo);
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
hipMemcpy(hres, dop1, tipo, hipMemcpyDeviceToHost);
if(sign[0] == 'f' && sign[1] >= '0' && sign[1] <= '9')
sumar(tmpfact.name, dop1, cols1, res_rows);
}


62
packages/cuda/joincpu.cpp Executable file → Normal file

@@ -324,11 +324,11 @@ int joincpu(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenod
}
#ifdef TIMER
cudaEvent_t start, stop;
hipEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start, 0);
#endif
if(nsel1 > 0 || nsj1 > 0)
@@ -359,16 +359,16 @@
}
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&time, start, stop);
cuda_stats.select1_time += time;
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
hipEventDestroy(start);
hipEventDestroy(stop);
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start, 0);
#endif
if(nsel2 > 0 || nsj2 > 0)
@@ -381,16 +381,16 @@
Snl = sLen;
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&time, start, stop);
cuda_stats.select2_time += time;
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
hipEventDestroy(start);
hipEventDestroy(stop);
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start, 0);
#endif
//cout << "antes" << endl;
@@ -406,16 +406,16 @@
thrust::stable_sort_by_key(thrust::omp::par, Rres, Rres + Rnl, permutation);
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&time, start, stop);
cuda_stats.sort_time += time;
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
hipEventDestroy(start);
hipEventDestroy(stop);
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start, 0);
#endif
/*cout << "despues" << endl;
@@ -482,9 +482,9 @@
*ret = fres;
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&time, start, stop);
cuda_stats.join_time += time;
#endif

40
packages/cuda/lista.cu Executable file → Normal file

@@ -967,7 +967,7 @@ vector<gpunode> L;
extern "C"
int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr, int *inpquery, int **result, char *names, int finalDR)
{
cudaSetDevice(0);
hipSetDevice(0);
vector<rulenode> rules;
int x;
@@ -1029,11 +1029,11 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
vector<rulenode>::iterator qposr;
#if TIMER
cudaEvent_t start, stop;
hipEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start, 0);
#endif
while(reglas.size()) /*Here's the main loop*/
@@ -1084,7 +1084,7 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
{
num_refs = rows1 * cols1 * sizeof(int);
reservar(&res, num_refs);
cudaMemcpyAsync(res, dop1, num_refs, cudaMemcpyDeviceToDevice);
hipMemcpyAsync(res, dop1, num_refs, hipMemcpyDeviceToDevice);
registrar(rul_act->name, cols1, res, rows1, itr, 1);
genflag = 1;
rul_act->gen_ant = rul_act->gen_act;
@@ -1251,10 +1251,10 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
if(x == num_refs)
{
#ifdef TIMER
cudaEvent_t start2, stop2;
cudaEventCreate(&start2);
cudaEventCreate(&stop2);
cudaEventRecord(start2, 0);
hipEvent_t start2, stop2;
hipEventCreate(&start2);
hipEventCreate(&stop2);
hipEventRecord(start2, 0);
#endif
//cout << rul_act->name << " res_rows = " << res_rows << endl;
@@ -1263,11 +1263,11 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
res_rows = unir(res, res_rows, rul_act->num_columns, &res, 0);
#ifdef TIMER
cudaEventRecord(stop2, 0);
cudaEventSynchronize(stop2);
cudaEventElapsedTime(&time, start2, stop2);
cudaEventDestroy(start2);
cudaEventDestroy(stop2);
hipEventRecord(stop2, 0);
hipEventSynchronize(stop2);
hipEventElapsedTime(&time, start2, stop2);
hipEventDestroy(start2);
hipEventDestroy(stop2);
//cout << "Union = " << time << endl;
cuda_stats.union_time += time;
#endif
@@ -1319,16 +1319,16 @@ int Cuda_Eval(predicate **inpfacts, int ninpf, predicate **inprules, int ninpr,
#endif
#if TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&time, start, stop);
cuda_stats.total_time += time;
if (time > cuda_stats.max_time)
cuda_stats.max_time = time;
if (time < cuda_stats.min_time || cuda_stats.calls == 1)
cuda_stats.min_time = time;
cudaEventDestroy(start);
cudaEventDestroy(stop);
hipEventDestroy(start);
hipEventDestroy(stop);
Cuda_Statistics();
#endif

0
packages/cuda/lista.h Executable file → Normal file

44
packages/cuda/memory.cu Executable file → Normal file

@@ -144,7 +144,7 @@ void limpiar(const char s[], size_t sz)
if(GPUmem.size() == 0)
{
cudaMemGetInfo(&free,&total);
hipMemGetInfo(&free,&total);
cerr << s << ": not enough GPU memory: have " << free << " of " << total << ", need " << sz << " bytes." << endl;
exit(1);
}
@@ -154,11 +154,11 @@
{
temp = *ini;
temp.dev_address = (int *)malloc(ini->size);
cudaMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
hipMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, hipMemcpyDeviceToHost);
list<memnode>::iterator pos = lower_bound(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
CPUmem.insert(pos, temp);
}
cudaFree(ini->dev_address);
hipFree(ini->dev_address);
GPUmem.erase(ini);
}
@@ -173,19 +173,19 @@ void reservar(int **ptr, size_t size)
return;
}
cudaMemGetInfo(&free, &total);
hipMemGetInfo(&free, &total);
while(free < size)
{
cout << "Se limpio memoria " << free << " " << total << endl;
limpiar("not enough memory", size);
cudaMemGetInfo(&free, &total);
hipMemGetInfo(&free, &total);
}
while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
while(hipMalloc(ptr, size) == hipErrorMemoryAllocation)
limpiar("Error in memory allocation", size);
if (! *ptr ) {
size_t free, total;
cudaMemGetInfo( &free, &total );
hipMemGetInfo( &free, &total );
cerr << "Could not allocate " << size << " bytes, only " << free << " avaliable from total of " << total << " !!!" << endl;
cerr << "Exiting CUDA...." << endl;
exit(1);
@@ -277,7 +277,7 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
}
size = num_rows * num_columns * sizeof(int);
reservar(&temp, size);
cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
hipMemcpyAsync(temp, address_host_table, size, hipMemcpyHostToDevice);
registrar(name, num_columns, temp, num_rows, itr, 0);
*ptr = temp;
return num_rows;
@@ -296,13 +296,13 @@ int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_ho
reservar(&temp, size);
for(x = 0; x < numgpu; x++)
{
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyDeviceToDevice);
inc += temp_storage[x].size / sizeof(int);
cudaFree(temp_storage[x].dev_address);
hipFree(temp_storage[x].dev_address);
}
for(; x < numcpu; x++)
{
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyHostToDevice);
inc += temp_storage[x].size / sizeof(int);
free(temp_storage[x].dev_address);
}
@@ -340,9 +340,9 @@ int cargarcpu(int name, int num_rows, int num_columns, int is_fact, int *address
temp = (int *)malloc(size);
for(x = 0; x < numgpu; x++)
{
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToHost);
hipMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, hipMemcpyDeviceToHost);
inc += temp_storage[x].size / sizeof(int);
cudaFree(temp_storage[x].dev_address);
hipFree(temp_storage[x].dev_address);
}
for(; x < numcpu; x++)
{
@@ -404,7 +404,7 @@ int cargafinal(int name, int cols, int **ptr)
cont = pos->rows;
#ifdef TUFFY
reservar(&temp, pos->size);
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyHostToDevice);
*ptr = temp;
#else
*ptr = pos->dev_address;
@@ -418,14 +418,14 @@ int cargafinal(int name, int cols, int **ptr)
pos = gpu;
while(pos != endg && pos->name == name)
{
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyDeviceToDevice);
hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyDeviceToDevice);
temp += pos->size / sizeof(int);
pos++;
}
pos = cpu;
while(pos != endc && pos->name == name)
{
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
hipMemcpy(temp, pos->dev_address, pos->size, hipMemcpyHostToDevice);
temp += pos->size / sizeof(int);
pos++;
}
@@ -493,7 +493,7 @@ void clear_memory()
{
if(ini->isrule)
{
cudaFree(ini->dev_address);
hipFree(ini->dev_address);
ini = GPUmem.erase(ini);
}
else
@@ -518,7 +518,7 @@ void clear_memory_all()
fin = GPUmem.end();
while(ini != fin)
{
cudaFree(ini->dev_address);
hipFree(ini->dev_address);
ini++;
}
GPUmem.clear();
@@ -542,7 +542,7 @@ void liberar(int name)
{
fact = *i;
GPUmem.erase(i);
cudaFree(fact.dev_address);
hipFree(fact.dev_address);
}
i = buscarhecho(CPUmem.begin(), CPUmem.end(), name);
if(i != CPUmem.end())
@@ -566,10 +566,10 @@ void sumar(int name, int *dop1, int cols, int rows)
newrows = rows + fact.rows;
reservar(&res, newrows * cols * sizeof(int));
offset = fact.rows * cols;
cudaMemcpyAsync(res, fact.dev_address, offset * sizeof(int), cudaMemcpyDeviceToDevice);
hipMemcpyAsync(res, fact.dev_address, offset * sizeof(int), hipMemcpyDeviceToDevice);
GPUmem.erase(i);
registrar(name, cols, res, newrows, 0, 0);
cudaMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), cudaMemcpyDeviceToDevice);
cudaFree(fact.dev_address);
hipMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), hipMemcpyDeviceToDevice);
hipFree(fact.dev_address);
}
}

0
packages/cuda/memory.h Executable file → Normal file
View File

601
packages/cuda/old/cuda.c Executable file

@@ -0,0 +1,601 @@
// interface to CUDD Datalog evaluation
#include "config.h"
#include "YapInterface.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <inttypes.h>
#include "pred.h"
#define MAXARG 100
YAP_Atom AtomEq,
AtomGt,
AtomLt,
AtomGe,
AtomLe,
AtomDf,
AtomNt;
predicate *facts[MAXARG]; /*Temporary solution to maintain facts and rules*/
predicate *rules[MAXARG];
int32_t cf = 0, cr = 0;
char names[1024];
// initialize CUDA system
void Cuda_Initialize( void );
// add/replace a set of facts for predicate pred
int32_t Cuda_NewFacts(predicate *pred);
// add/replace a rule for predicate pred
int32_t Cuda_NewRule(predicate *pred);
// erase predicate pred
int32_t Cuda_Erase(predicate *pred);
// evaluate predicate pred, mat is bound to a vector of solutions, and
// output the count
//int32_t Cuda_Eval(predicate *pred, int32_t **mat); This function's arguments were changed, please see pred.h
void init_cuda( void );
//#define DEBUG_INTERFACE 1
#ifdef ROCKIT
static int32_t query[100];
static int32_t qcont = 0;
static int cuda_init_query(void)
{
int32_t pname = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG1));
query[qcont] = pname;
qcont++;
query[qcont] = 0;
return TRUE;
}
#endif
#if DEBUG_INTERFACE
static void
dump_mat(int32_t mat[], int32_t nrows, int32_t ncols)
{
return;
int32_t i, j;
for ( i=0; i< nrows; i++) {
printf("%d", mat[i*ncols]);
for (j=1; j < ncols; j++) {
printf(", %d", mat[i*ncols+j]);
}
printf("\n");
}
}
static void
dump_vec(int32_t vec[], int32_t rows)
{
int32_t i = 1;
int32_t j = 0;
for (j = 0; j < rows; j++) {
for ( ; vec[i]; i++ ) {
printf(", %d", vec[i]);
}
printf(", 0");
i++;
}
printf("\n");
}
#endif /* DEBUG_INTERFACE */
// stubs, will point at Carlos code.
void Cuda_Initialize( void )
{
}
int32_t Cuda_NewFacts(predicate *pe)
{
#if DEBUG_INTERFACE
dump_mat( pe->address_host_table, pe->num_rows, pe->num_columns );
#endif
#ifdef ROCKIT
if(cf >= 0)
{
facts[cf] = pe;
cf++;
}
#else
facts[cf] = pe;
cf++;
#endif
return TRUE;
}
int32_t Cuda_NewRule(predicate *pe)
{
#if DEBUG_INTERFACE
dump_vec( pe->address_host_table, pe->num_rows);
#endif
rules[cr] = pe;
cr++;
return TRUE;
}
int32_t Cuda_Erase(predicate *pe)
{
int i = 0;
while ( rules[i] != pe )
i++;
while (i < cr-1) {
rules[i] = rules[i+1];
i++;
}
rules[i] = NULL;
cr--;
if (pe->address_host_table)
free( pe->address_host_table );
free( pe );
return TRUE;
}
static int
load_facts( void ) {
int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
int32_t ncols = YAP_IntOfTerm(YAP_ARG2), i = 0;
YAP_Term t3 = YAP_ARG3;
int32_t *mat = (int32_t *)malloc(sizeof(int32_t)*nrows*ncols);
int32_t pname = YAP_AtomToInt(YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3))));
predicate *pred;
while(YAP_IsPairTerm(t3)) {
int32_t j = 0;
YAP_Term th = YAP_HeadOfTerm(t3);
for (j = 0; j < ncols; j++) {
YAP_Term ta = YAP_ArgOfTerm(j+1, th);
if (YAP_IsAtomTerm(ta)) {
mat[i*ncols+j] = YAP_AtomToInt(YAP_AtomOfTerm(ta));
} else {
mat[i*ncols+j] = YAP_IntOfTerm(ta);
}
}
t3 = YAP_TailOfTerm( t3 );
i++;
}
if (YAP_IsVarTerm( YAP_ARG4)) {
// new
pred = (predicate *)malloc(sizeof(predicate));
} else {
pred = (predicate *)YAP_IntOfTerm(YAP_ARG4);
if (pred->address_host_table)
free( pred->address_host_table );
}
pred->name = pname;
pred->num_rows = nrows;
pred->num_columns = ncols;
pred->is_fact = TRUE;
pred->address_host_table = mat;
Cuda_NewFacts(pred);
if (YAP_IsVarTerm( YAP_ARG4)) {
return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
} else {
return TRUE;
}
}
static int currentFact = 0;
static predicate *currentPred = NULL;
static int
cuda_init_facts( void ) {
int32_t nrows = YAP_IntOfTerm(YAP_ARG1);
int32_t ncols = YAP_IntOfTerm(YAP_ARG2);
int32_t *mat = (int32_t *)malloc(sizeof(int32_t)*nrows*ncols);
int32_t pname = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG3));
predicate *pred;
strcat(names, YAP_AtomName(YAP_AtomOfTerm(YAP_ARG3)));
strcat(names, " ");
if (!mat)
return FALSE;
if (YAP_IsVarTerm( YAP_ARG4)) {
// new
pred = (predicate *)malloc(sizeof(predicate));
} else {
pred = (predicate *)YAP_IntOfTerm(YAP_ARG4);
if (pred->address_host_table)
free( pred->address_host_table );
}
pred->name = pname;
pred->num_rows = nrows;
pred->num_columns = ncols;
pred->is_fact = TRUE;
pred->address_host_table = mat;
currentPred = pred;
currentFact = 0;
if (YAP_IsVarTerm( YAP_ARG4)) {
return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
} else {
return TRUE;
}
}
static int
cuda_load_fact( void ) {
int i = currentFact;
#if defined(DATALOG) || defined(TUFFY)
YAP_Term th = YAP_ARG1;
int ncols = currentPred->num_columns;
int j;
int *mat = currentPred->address_host_table;
for (j = 0; j < ncols; j++) {
YAP_Term ta = YAP_ArgOfTerm(j+1, th);
if (YAP_IsAtomTerm(ta)) {
mat[i*ncols+j] = YAP_AtomToInt(YAP_AtomOfTerm(ta));
} else {
mat[i*ncols+j] = YAP_IntOfTerm(ta);
}
}
#endif
i++;
if (i == currentPred->num_rows) {
Cuda_NewFacts(currentPred);
currentPred = NULL;
currentFact = 0;
} else {
currentFact = i;
}
return TRUE;
}
static int
load_rule( void ) {
// maximum of 2K symbols per rule, should be enough for ILP
int32_t vec[2048], *ptr = vec, *nvec, neg[2048];
// qK different variables;
YAP_Term vars[1024];
int32_t nvars = 0, x;
int32_t ngoals = YAP_IntOfTerm(YAP_ARG1); /* gives the number of goals */
int32_t ncols = YAP_IntOfTerm(YAP_ARG2);
YAP_Term t3 = YAP_ARG3;
YAP_Atom name = YAP_NameOfFunctor(YAP_FunctorOfTerm(YAP_HeadOfTerm(t3)));
int32_t pname = YAP_AtomToInt(name);
const char *strname = YAP_AtomName(name);
predicate *pred;
int32_t cont = 0;
memset(neg, 0x0, 2048 * sizeof(int32_t));
while(YAP_IsPairTerm(t3)) {
int32_t j = 0, m;
YAP_Term th = YAP_HeadOfTerm(t3);
YAP_Functor f = YAP_FunctorOfTerm( th );
int32_t n = YAP_ArityOfFunctor( f );
YAP_Atom at = YAP_NameOfFunctor( f );
if (at == AtomEq)
*ptr++ = SBG_EQ;
else if (at == AtomGt)
*ptr++ = SBG_GT;
else if (at == AtomLt)
*ptr++ = SBG_LT;
else if (at == AtomGe)
*ptr++ = SBG_GE;
else if (at == AtomLe)
*ptr++ = SBG_LE;
else if (at == AtomDf)
*ptr++ = SBG_DF;
else if (at == AtomNt)
{
neg[cont] = 1;
cont++;
}
else
{
*ptr++ = YAP_AtomToInt( at );
cont++;
}
for (j = 0; j < n; j++) {
YAP_Term ta = YAP_ArgOfTerm(j+1, th);
if (YAP_IsVarTerm(ta)) {
int32_t k;
for (k = 0; k< nvars; k++) {
if (vars[k] == ta) {
*ptr++ = k+1;
break;
}
}
if (k == nvars) {
vars[k] = ta;
*ptr++ = k+1;
nvars++;
}
} else if (YAP_IsAtomTerm(ta)) {
*ptr++ = -YAP_AtomToInt(YAP_AtomOfTerm(ta));
} else if (YAP_IsApplTerm(ta)) {
f = YAP_FunctorOfTerm( ta );
at = YAP_NameOfFunctor( f );
m = YAP_ArityOfFunctor( f );
*ptr++ = YAP_AtomToInt( at );
for (x = 0; x < m; x++) {
YAP_Term ta2 = YAP_ArgOfTerm(x+1, ta);
if (YAP_IsVarTerm(ta2)) {
int32_t k;
for (k = 0; k < nvars; k++) {
if (vars[k] == ta2) {
*ptr++ = k+1;
break;
}
}
if (k == nvars) {
vars[k] = ta2;
*ptr++ = k+1;
nvars++;
}
} else if (YAP_IsAtomTerm(ta2)) {
*ptr++ = -YAP_AtomToInt(YAP_AtomOfTerm(ta));
} else {
*ptr++ = -YAP_IntOfTerm(ta);
}
}
} else {
*ptr++ = -YAP_IntOfTerm(ta);
}
}
*ptr++ = 0;
t3 = YAP_TailOfTerm( t3 );
}
if (YAP_IsVarTerm( YAP_ARG4)) {
// new
pred = (predicate *)malloc(sizeof(predicate));
} else {
pred = (predicate *)YAP_IntOfTerm(YAP_ARG4);
if (pred->address_host_table)
free( pred->address_host_table );
}
pred->name = pname;
pred->num_rows = ngoals;
pred->num_columns = ncols;
pred->is_fact = FALSE;
x = (strlen(strname) + 1) * sizeof(char);
pred->predname = (char *)malloc(x);
memcpy(pred->predname, strname, x);
nvec = (int32_t *)malloc(sizeof(int32_t)*(ptr-vec));
memcpy(nvec, vec, sizeof(int32_t)*(ptr-vec));
pred->address_host_table = nvec;
pred->negatives = (int32_t *)malloc(sizeof(int32_t) * cont);
memcpy(pred->negatives, neg, sizeof(int32_t) * cont);
Cuda_NewRule( pred );
return YAP_Unify(YAP_ARG4, YAP_MkIntTerm((YAP_Int)pred));
}
static int
cuda_erase( void )
{
predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
return Cuda_Erase( ptr );
}
void setQuery(YAP_Term t1, int32_t **res)
{
int32_t *query = (int32_t *)malloc(MAXARG * sizeof(int32_t));
int32_t x, y = 0, *itr;
predicate *ptr = NULL;
if(YAP_IsPairTerm(t1))
{
while(YAP_IsPairTerm(t1))
{
ptr = (predicate *)YAP_IntOfTerm(YAP_HeadOfTerm(t1));
query[y] = ptr->name;
itr = ptr->address_host_table;
x = 2;
while(itr[x] != 0)
x++;
query[y+1] = itr[x+1];
t1 = YAP_TailOfTerm(t1);
y+=2;
}
}
else
{
ptr = (predicate *)YAP_IntOfTerm(t1);
query[y] = ptr->name;
itr = ptr->address_host_table;
x = 2;
while(itr[x] != 0)
x++;
query[y+1] = itr[x+1];
y += 2;
}
query[y] = -1;
query[y+1] = -1;
*res = query;
}
static int
cuda_eval( void )
{
int32_t *mat;
#if defined(DATALOG) || defined(TUFFY)
int32_t *query = NULL;
setQuery(YAP_ARG1, &query);
#endif
int32_t finalDR = YAP_IntOfTerm(YAP_ARG3);
int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, names, finalDR);
#ifdef TUFFY
cf = 0;
#endif
#ifdef ROCKIT
if(cf > 0)
cf *= -1;
#endif
#if defined(TUFFY) || defined(ROCKIT)
cr = 0;
names[0] = '\0';
return FALSE;
#else
int32_t i;
predicate *ptr = (predicate *)YAP_IntOfTerm(YAP_ARG1);
int32_t ncols = ptr->num_columns;
YAP_Term out = YAP_TermNil();
YAP_Functor f = YAP_MkFunctor(YAP_IntToAtom(ptr->name), ncols);
YAP_Term vec[256];
YAP_Atom at;
if (n < 0)
return FALSE;
for (i=0; i<n; i++) {
int32_t ni = ((n-1)-i)*ncols, j;
printf("%s(", YAP_AtomName(YAP_IntToAtom(ptr->name)));
for (j=0; j<ncols; j++) {
vec[j] = YAP_MkIntTerm(mat[ni+j]);
at = YAP_IntToAtom(mat[ni+j]);
if(at != NULL)
printf("%s", YAP_AtomName(at));
else
printf("%d", mat[ni+j]);
if(j < (ncols - 1))
printf(",");
}
out = YAP_MkPairTerm(YAP_MkApplTerm( f, ncols, vec ), out);
printf(")\n");
}
if (n > 0)
free( mat );
return YAP_Unify(YAP_ARG2, out);
#endif
}
static int
cuda_coverage( void )
{
int32_t *mat;
#if defined(DATALOG) || defined(TUFFY)
int32_t *query = NULL;
setQuery(YAP_ARG1, &query);
#endif
int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, 0, 0);
int32_t post = YAP_AtomToInt(YAP_AtomOfTerm(YAP_ARG2));
int32_t i = n/2, min = 0, max = n-1;
int32_t t0, t1;
if (n < 0)
return FALSE;
if (n == 0) {
return YAP_Unify(YAP_ARG4, YAP_MkIntTerm(0)) &&
YAP_Unify(YAP_ARG3, YAP_MkIntTerm(0));
}
t0 = mat[0], t1 = mat[(n-1)*2];
if (t0 == t1) { /* all sametype */
free( mat );
/* all pos */
if (t0 == post)
return YAP_Unify(YAP_ARG3, YAP_MkIntTerm(n)) &&
YAP_Unify(YAP_ARG4, YAP_MkIntTerm(0));
/* all neg */
return YAP_Unify(YAP_ARG4, YAP_MkIntTerm(n)) &&
YAP_Unify(YAP_ARG3, YAP_MkIntTerm(0));
}
do {
i = (min+max)/2;
if (i == min) i++;
if (mat[i*2] == t0) {
min = i;
} else {
max = i;
}
if (min+1 == max) {
free( mat );
if (t0 == post)
return YAP_Unify(YAP_ARG3, YAP_MkIntTerm(max)) &&
YAP_Unify(YAP_ARG4, YAP_MkIntTerm(n-max));
/* all neg */
return YAP_Unify(YAP_ARG4, YAP_MkIntTerm(max)) &&
YAP_Unify(YAP_ARG3, YAP_MkIntTerm(n-max));
}
} while ( TRUE );
}
static int cuda_count( void )
{
int32_t *mat;
#if defined(DATALOG) || defined(TUFFY)
int32_t *query = NULL;
setQuery(YAP_ARG1, &query);
#endif
int32_t n = Cuda_Eval(facts, cf, rules, cr, query, & mat, 0, 0);
if (n < 0)
return FALSE;
free( mat );
return YAP_Unify(YAP_ARG2, YAP_MkIntTerm(n));
}
static int cuda_statistics( void )
{
Cuda_Statistics();
return TRUE;
}
static int first_time = TRUE;
void
init_cuda(void)
{
if (first_time) Cuda_Initialize();
first_time = FALSE;
AtomEq = YAP_LookupAtom("=");
AtomGt = YAP_LookupAtom(">");
AtomLt = YAP_LookupAtom("<");
AtomGe = YAP_LookupAtom(">=");
AtomLe = YAP_LookupAtom("=<");
AtomDf = YAP_LookupAtom("\\=");
AtomNt = YAP_LookupAtom("not");
YAP_UserCPredicate("load_facts", load_facts, 4);
YAP_UserCPredicate("cuda_init_facts", cuda_init_facts, 4);
YAP_UserCPredicate("cuda_load_fact", cuda_load_fact, 1);
YAP_UserCPredicate("load_rule", load_rule, 4);
YAP_UserCPredicate("cuda_erase", cuda_erase, 1);
YAP_UserCPredicate("cuda_eval", cuda_eval, 3);
YAP_UserCPredicate("cuda_coverage", cuda_coverage, 4);
YAP_UserCPredicate("cuda_count", cuda_count, 2);
YAP_UserCPredicate("cuda_statistics", cuda_statistics, 0);
#ifdef ROCKIT
YAP_UserCPredicate("cuda_init_query", cuda_init_query, 1);
#endif
}

603
packages/cuda/old/dbio.cu Normal file

@@ -0,0 +1,603 @@
#include <iostream>
#include <algorithm>
#include <stdio.h>
#include "memory.h"
#include "union2.h"
#include "dbio.h"
#ifdef DATALOG
//template<class InputIterator>
//void datalogWrite(int query, InputIterator rul_str, InputIterator fin, int finalDR, int **result)
void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, int finalDR, int **result)
{
rulenode tmprule;
vector<rulenode>::iterator qposr;
int *dop1, *hres;
int cols1, res_rows, tipo;
tmprule.name = query;
qposr = lower_bound(rul_str, fin, tmprule, comparer);
cols1 = qposr->num_columns;
res_rows = cargafinal(query, cols1, &dop1);
if(res_rows != 0)
{
if(res_rows > 0)
{
if(finalDR)
res_rows = unir(dop1, res_rows, cols1, &dop1, 0);
tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo);
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
cudaFree(dop1);
*result = hres;
}
else
{
res_rows *= -1;
if(finalDR)
{
int *dop2;
tipo = res_rows * cols1 * sizeof(int);
reservar(&dop2, tipo);
cudaMemcpy(dop2, dop1, tipo, cudaMemcpyHostToDevice);
free(dop1);
res_rows = unir(dop2, res_rows, cols1, &dop2, 0);
tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo);
cudaMemcpy(hres, dop2, tipo, cudaMemcpyDeviceToHost);
cudaFree(dop2);
*result = hres;
}
else
*result = dop1;
}
}
}
#endif
#ifdef TUFFY
void postgresRead(PGconn **ret, vector<gpunode> *L, int *inpquery, char *names, int finalDR)
{
PGresult *pgr;
int x, y;
int *mat, *mat2;
char *tok, sel[1024], **qrs;
int w, z = 0, numt, numc, numc2, start = 0, start2, val;
PGconn *conn = PQconnectdb("host=localhost port=5432 dbname = prueba user=tuffer password=root");
if(PQstatus(conn) != CONNECTION_OK)
{
fprintf(stderr, "Connection to database failed: %s", PQerrorMessage(conn));
exit(1);
}
pgr = PQexec(conn, "Select nspname from pg_catalog.pg_namespace where oid = (select max(oid) from pg_catalog.pg_namespace)");
sprintf(sel, "SET search_path = %s", PQgetvalue(pgr, 0, 0));
PQclear(pgr);
PQexec(conn, sel);
tok = strtok(names, " ");
if(finalDR)
{
qrs = (char **)malloc(100 * sizeof(char *));
while(tok != NULL)
{
sprintf(sel, "Select * from %s limit 0", tok);
pgr = PQexec(conn, sel);
numc = L->at(z).num_columns;
if(tok[0] == 'c')
{
sprintf(sel, "Select ");
numt = numc + 1;
for(x = 1; x < numt; x++)
{
strcat(sel, PQfname(pgr, x));
strcat(sel, ", ");
}
sel[strlen(sel)-2] = '\0';
sprintf(sel, "%s from %s", sel, tok);
}
else
{
sprintf(sel, "Select id, Club, ");
numt = numc + 6;
for(x = 8; x < numt; x++)
{
strcat(sel, PQfname(pgr, x));
strcat(sel, ", ");
}
sel[strlen(sel)-2] = '\0';
sprintf(sel, "%s from %s", sel, tok);
}
PQclear(pgr);
pgr = PQexec(conn, sel);
numt = PQntuples(pgr);
mat = (int *)malloc(numt * numc * sizeof(int));
if(tok[0] == 'c')
{
for(x = 0; x < numt; x++)
{
start = x * numc;
for(y = 0; y < numc; y++)
mat[start + y] = atoi(PQgetvalue(pgr, x, y));
}
}
else
{
numc2 = numc - 2;
mat2 = (int *)malloc(numt * numc2 * sizeof(int));
start = 0;
start2 = 0;
for(x = 0; x < numt; x++)
{
w = atoi(PQgetvalue(pgr, x, 1));
if(w < 2)
{
mat[start] = atoi(PQgetvalue(pgr, x, 0));
start++;
mat[start] = w;
start++;
if(w > 0)
{
for(y = 2; y < numc; y++)
{
val = atoi(PQgetvalue(pgr, x, y));
mat[start] = val;
mat2[start2] = val;
start++;
start2++;
}
}
else
{
for(y = 2; y < numc; y++)
{
val = atoi(PQgetvalue(pgr, x, y));
mat[start] = val;
start++;
}
}
}
else
{
for(y = 2; y < numc; y++)
{
val = atoi(PQgetvalue(pgr, x, y));
mat2[start2] = val;
start2++;
}
}
}
L->at(z+1).address_host_table = mat2;
L->at(z+1).num_rows = start2 / numc2;
}
L->at(z).address_host_table = mat;
L->at(z).num_rows = start / numc;
PQclear(pgr);
x = 1;
while(inpquery[x] != -1)
{
if(L->at(z).name == inpquery[x])
{
numt = (strlen(tok) + 1) * sizeof(char);
qrs[x] = (char *)malloc(numt);
memcpy(qrs[x], tok, numt);
}
x += 2;
}
if(tok[0] == 'c')
{
tok = strtok(NULL, " ");
z++;
}
else
{
strtok(NULL, " ");
tok = strtok(NULL, " ");
z += 2;
}
}
}
else
{
while(tok != NULL)
{
sprintf(sel, "Select * from %s limit 0", tok);
pgr = PQexec(conn, sel);
numc = L->at(z).num_columns;
if(tok[0] == 'c')
{
sprintf(sel, "Select weight, myid, ");
start = 1;
numt = numc + 1;
}
else
{
sprintf(sel, "Select truth, Club, atomID, ");
start = 8;
numt = numc + 5;
}
for(x = start; x < numt; x++)
{
strcat(sel, PQfname(pgr, x));
strcat(sel, ", ");
}
sel[strlen(sel)-2] = '\0';
sprintf(sel, "%s from %s", sel, tok);
PQclear(pgr);
pgr = PQexec(conn, sel);
numt = PQntuples(pgr);
mat = (int *)malloc(numt * numc * sizeof(int));
L->at(z).weight = (double *)malloc(numt * sizeof(double));
L->at(z).num_rows = numt;
for(x = 0; x < numt; x++)
{
start = x * numc;
for(y = 1; y < numc; y++)
mat[start + y] = atoi(PQgetvalue(pgr, x, y));
}
numt *= numc;
double flo;
if(tok[0] == 'c')
{
for(x = 0, y = 0; x < numt; x+=numc, y++)
{
flo = atof(PQgetvalue(pgr, y, 0));
L->at(z).weight[y] = flo;
if(flo > 0)
mat[x] = y + 1;
else
mat[x] = -y - 1;
}
}
else
{
for(x = 0, y = 0; x < numt; x+=numc, y++)
{
if(PQgetvalue(pgr, y, 0)[0] == 't')
mat[x] = 2;
else
mat[x] = 1;
}
}
L->at(z).address_host_table = mat;
numc = (strlen(tok) + 1) * sizeof(char);
L->at(z).predname = (char *)malloc(numc);
memcpy(L->at(z).predname, tok, numc);
PQclear(pgr);
tok = strtok(NULL, " ");
z++;
}
}
*ret = conn;
}
void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, PGconn *conn, int finalDR)
{
char sel[1024];
double *matw = NULL;
int qname, cols1, res_rows, tipo, *dop1;
int x, w, z, y, *hres;
rulenode tmprule;
vector<rulenode>::iterator qposr;
if(finalDR)
{
char file[] = "/dev/shm/mln0_atoms.csv";
z = 0;
int seqid = 1;
FILE *fp;
fp = fopen(file, "w");
if(fp == NULL)
{
cerr << "Failed to create main memory temporary file, attempting to use hardrive" << endl;
sprintf(file, "./temp/mln0_atoms.csv");
fp = fopen(file, "w");
if(fp == NULL)
{
cerr << "Failed to create main memory temporary file" << endl;
exit(1);
}
}
while((qname = inpquery[z]) != -1)
{
tmprule.name = qname;
qposr = lower_bound(rul_str, fin, tmprule, comparer);
cols1 = qposr->num_columns;
res_rows = cargafinal(qname, cols1, &dop1);
if(res_rows != 0)
{
if(res_rows < 0)
res_rows = unir(dop1, -res_rows, cols1, &dop1, 0); /*duplicate elimination on result*/
else
res_rows = unir(dop1, res_rows, cols1, &dop1, finalDR);
tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo);
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
cudaFree(dop1);
w = z + 1;
strtok(qposr->rulename, "_");
strtok(NULL, "_");
int prid = atoi(strtok(NULL, "_"));
for(x = 0, w = 0; x < res_rows; x++, w+=2)
{
if(hres[w+1])
fprintf(fp, "%d,%d,%d,true\n", seqid, hres[w], prid);
else
fprintf(fp, "%d,%d,%d,false\n", seqid, hres[w], prid);
seqid++;
}
free(hres);
}
z += 2;
}
fclose(fp);
sprintf(sel, "Copy mln0_atoms(atomid,tupleID,predID,isquery) from '%s' CSV", file);
PQexec(conn, sel);
}
else
{
while(rul_str != fin)
{
cols1 = rul_str->num_columns;
res_rows = cargafinal(rul_str->name, cols1, &dop1);
if(res_rows == 0)
{
rul_str++;
continue;
}
res_rows = abs(res_rows);
tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo);
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
cudaFree(dop1);
char file[] = "/dev/shm/buffer.csv";
FILE *fp;
fp = fopen(file, "w");
if(fp == NULL)
{
cerr << "Failed to create main memory temporary file, attempting to use hardrive" << endl;
sprintf(file, "./temp/buffer.csv");
fp = fopen(file, "w");
if(fp == NULL)
{
cerr << "Failed to create main memory temporary file" << endl;
exit(1);
}
}
if(rul_str->rulename[0] == 'z')
{
char *name = rul_str->rulename + 1;
for(x = 0; x < ninpf; x++)
{
if(strncmp(L->at(x).predname, name, strlen(name)) == 0)
{
matw = L->at(x).weight;
break;
}
}
cols1 -= 3;
for(x = 0, z = 0; x < res_rows; x++, z+=3)
{
for(y = 0; y < cols1; y++, z++)
fprintf(fp, "%d,", hres[z]);
fprintf(fp, "%d,%lf,%d\n", hres[z], matw[abs(hres[z+1])-1], hres[z+2]);
}
fclose(fp);
sprintf(sel, "Copy %s from '%s' CSV", name, file);
PQexec(conn, sel);
}
else
{
cols1--;
for(x = 0, z = 0; x < res_rows; x++, z++)
{
for(y = 0; y < cols1; y++, z++)
fprintf(fp, "%d,", hres[z]);
fprintf(fp, "%d\n", hres[z]);
}
fclose(fp);
sprintf(sel, "Copy %s from '%s' CSV", rul_str->rulename, file);
PQexec(conn, sel);
}
free(hres);
rul_str++;
}
}
PQfinish(conn);
if(finalDR)
clear_memory_all();
}
#endif
#ifdef ROCKIT
void mysqlRead(MYSQL **ret, int *qrs, vector<gpunode> *L, int ninpf, char *names, int finalDR)
{
char *tok, sel[1024];
int w, x, y, z = 0, numt, numc;
int *mat;
MYSQL *con = mysql_init(NULL);
if(con == NULL)
{
fprintf(stderr, "mysql_init() failed\n");
exit(1);
}
mysql_options(con, MYSQL_OPT_LOCAL_INFILE, NULL);
mysql_real_connect(con, "localhost", "root", "root", "rockit", 0, NULL, 0);
if(finalDR)
{
y = 0;
while(qrs[y] != 0)
{
for(z = 0; z < ninpf; z++)
{
if(qrs[y] == L->at(z).name)
{
MYSQL_ROW row;
sprintf(sel, "Select count(*) from %s", L->at(z).predname);
mysql_query(con, sel);
MYSQL_RES *result = mysql_store_result(con);
row = mysql_fetch_row(result);
numt = atoi(row[0]);
mysql_free_result(result);
if(numt != L->at(z).num_rows)
{
liberar(L->at(z).name);
numc = L->at(z).num_columns;
sprintf(sel, "Select * from %s", L->at(z).predname);
mysql_query(con, sel);
MYSQL_RES *result = mysql_store_result(con);
mat = (int *)malloc(numt * numc * sizeof(int));
w = 0;
while ((row = mysql_fetch_row(result)))
{
for(x = 0; x < numc; x++, w++)
mat[w] = atoi(row[x]);
}
mysql_free_result(result);
if(L->at(z).address_host_table != NULL)
free(L->at(z).address_host_table);
L->at(z).address_host_table = mat;
L->at(z).num_rows = numt;
}
}
}
y++;
}
}
else
{
tok = strtok(names, " ");
while(tok != NULL)
{
numc = L->at(z).num_columns;
sprintf(sel, "Select * from %s", tok);
mysql_query(con, sel);
MYSQL_RES *result = mysql_store_result(con);
numt = mysql_num_rows(result);
MYSQL_ROW row;
mat = (int *)malloc(numt * numc * sizeof(int));
w = 0;
if(tok[0] == 'f' && tok[1] >= '0' && tok[1] <= '9')
{
while ((row = mysql_fetch_row(result)))
{
for(x = 1; x <= numc; x++, w++)
mat[w] = atoi(row[x]);
}
}
else
{
while ((row = mysql_fetch_row(result)))
{
for(x = 0; x < numc; x++, w++)
mat[w] = atoi(row[x]);
}
}
mysql_free_result(result);
L->at(z).address_host_table = mat;
L->at(z).num_rows = numt;
numc = (strlen(tok) + 1) * sizeof(char);
L->at(z).predname = (char *)malloc(numc);
strcpy(L->at(z).predname, tok);
tok = strtok(NULL, " ");
z++;
}
}
*ret = con;
}
void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, MYSQL *con)
{
int x, y, z, cols1, cols2, res_rows, tipo;
int *hres, *dop1;
char *id, *sign, *q1, *q2;
char sel[1024], weight[1024];
gpunode tmpfact;
while(rul_str != fin)
{
cols1 = rul_str->num_columns;
res_rows = cargafinal(rul_str->name, cols1, &dop1);
id = strtok(rul_str->rulename, "_");
sprintf(sel, "create table if not exists %s(weight double, ", id);
for(x = 0; x < cols1; x++)
{
sprintf(weight, "a%d char(10), ", x);
strcat(sel, weight);
}
sel[strlen(sel)-2] = ')';
strcat(sel, "ENGINE = MEMORY DEFAULT CHARSET=latin1");
mysql_query(con, sel);
sprintf(sel, "truncate %s", id);
mysql_query(con, sel);
if(res_rows == 0)
{
rul_str++;
continue;
}
if(res_rows > 0)
{
tmpfact = L->at(-rul_str->referencias[rul_str->num_rows - 2] - 1);
sign = tmpfact.predname;
tipo = res_rows * cols1 * sizeof(int);
hres = (int *)malloc(tipo);
cudaMemcpy(hres, dop1, tipo, cudaMemcpyDeviceToHost);
if(sign[0] == 'f' && sign[1] >= '0' && sign[1] <= '9')
sumar(tmpfact.name, dop1, cols1, res_rows);
}
else
{
hres = dop1;
res_rows = -res_rows;
}
sign = strtok(NULL, "_");
q1 = strtok(NULL, "_");
q2 = strtok(NULL, "_");
if(sign[0] == '0')
sprintf(weight, "%s.%s", q1, q2);
else
sprintf(weight, "-%s.%s", q1, q2);
FILE *fp;
char file[512];
sprintf(file, "/dev/shm/%s.tsv", id);
fp = fopen(file, "w");
if(fp == NULL)
{
cerr << "Failed to create main memory temporary file, attempting to use hardrive" << endl;
sprintf(file, "./temp/%s.tsv", id);
fp = fopen(file, "w");
}
cols2 = cols1 - 1;
for(x = 0, z = 0; x < res_rows; x++, z++)
{
fprintf(fp, "%s\t", weight);
for(y = 0; y < cols2; y++, z++)
fprintf(fp, "%d\t", hres[z]);
fprintf(fp, "%d\n", hres[z]);
}
fclose(fp);
sprintf(sel, "LOAD DATA LOCAL INFILE '%s' INTO TABLE %s", file, id);
mysql_query(con, sel);
rul_str++;
}
mysql_close(con);
}
#endif

28
packages/cuda/old/dbio.h Normal file

@@ -0,0 +1,28 @@
#ifndef _DBIO_H_
#define _DBIO_H_
#include "pred.h"
#ifdef TUFFY
#include <libpq-fe.h>
#endif
#ifdef ROCKIT
#include <mysql/mysql.h>
#endif
#include <vector>
#include "lista.h"
using namespace std;
#ifdef TUFFY
void postgresRead(PGconn **ret, vector<gpunode> *L, int *inpquery, char *names, int finalDR);
void postgresWrite(int *inpquery, int ninpf, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, PGconn *conn, int finalDR);
#endif
#ifdef ROCKIT
void mysqlRead(MYSQL **ret, int *qrs, vector<gpunode> *L, int ninpf, char *names, int finalDR);
void mysqlWrite(vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, vector<gpunode> *L, MYSQL *con);
#endif
#ifdef DATALOG
void datalogWrite(int query, vector<rulenode>::iterator rul_str, vector<rulenode>::iterator fin, int finalDR, int **result);
#endif
#endif

1337
packages/cuda/old/lista.cu Executable file

File diff suppressed because it is too large

44
packages/cuda/old/lista.h Executable file

@@ -0,0 +1,44 @@
#ifndef _LISTA_H_
#define _LISTA_H_
typedef struct Node{
int name;
int *dev_address;
int rows;
int size;
int iteration;
int isrule;
}memnode;
typedef struct auxiliar{
int name;
int num_rows;
int num_columns;
int *address_host_table;
int *rule_names;
int *referencias;
int **select;
int *numsel;
int **project;
int2 *projpos;
int **selfjoin;
int *numselfj;
int **wherejoin;
int *numjoin;
int totalpreds;
int **preds;
int2 *numpreds;
int *negatives;
char *rulename;
int gen_act;
int gen_ant;
}rulenode;
typedef struct completed{
int name;
int numrules;
int reduce;
int reset;
}compnode;
#endif

575
packages/cuda/old/memory.cu Executable file

@@ -0,0 +1,575 @@
#include <list>
#include <iostream>
#include <stdlib.h>
#include <algorithm>
#include <thrust/device_vector.h>
#include "lista.h"
#include "memory.h"
#include "pred.h"
#define MAX_REC 200
#define MAX_FIX_POINTS 100
memnode temp_storage[MAX_REC];
/*List used to store information (address, size, etc.) about facts and rule results loaded in the GPU*/
list<memnode> GPUmem;
/*List used to store information about rule results offloaded from the GPU to the CPU*/
list<memnode> CPUmem;
/*Auxiliary function to sort rule list*/
bool comparer(const rulenode &r1, const rulenode &r2)
{
return (r1.name > r2.name);
}
/*Used in search functions to compare iterations*/
bool compareiteration(const memnode &r1, const memnode &r2)
{
return (r1.iteration < r2.iteration);
}
/*Used in search functions to compare names*/
bool comparename(const memnode &r1, const memnode &r2)
{
return (r1.name > r2.name);
}
/*Linear search of 'name' fact*/
template<class InputIterator>
InputIterator buscarhecho(InputIterator first, InputIterator last, int name)
{
while(first!=last)
{
if(first->name == name && first->isrule == 0) return first;
++first;
}
return last;
}
/*Finds all results of rule 'name' in iteration 'itr' in both CPU and GPU memory. Every result found is removed from its respective list*/
list<memnode>::iterator buscarpornombre(int name, int itr, int *totalrows, int *gpunum, int *cpunum)
{
int x = 0, sum = 0;
memnode temp;
list<memnode>::iterator i;
temp.iteration = itr;
pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
if(rec.first->name == name && rec.first->isrule == 1)
{
temp_storage[x] = *rec.first;
rec.first = GPUmem.erase(rec.first);
sum += temp_storage[x].rows;
x++;
}
else
rec.first++;
}
*gpunum = x;
temp.name = name;
temp.isrule = 1;
i = GPUmem.insert(rec.first, temp);
rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
if(rec.first->name == name && rec.first->isrule == 1)
{
temp_storage[x] = *rec.first;
rec.first = CPUmem.erase(rec.first);
sum += temp_storage[x].rows;
x++;
}
else
rec.first++;
}
*totalrows = sum;
*cpunum = x;
return i;
}
list<memnode>::iterator buscarpornombrecpu(int name, int itr, int *totalrows, int *gpunum, int *cpunum)
{
int x = 0, sum = 0;
memnode temp;
list<memnode>::iterator i;
temp.iteration = itr;
pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
if(rec.first->name == name)
{
temp_storage[x] = *rec.first;
rec.first = GPUmem.erase(rec.first);
sum += temp_storage[x].rows;
x++;
}
else
rec.first++;
}
*gpunum = x;
temp.name = name;
temp.isrule = 1;
rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
if(rec.first->name == name)
{
temp_storage[x] = *rec.first;
rec.first = CPUmem.erase(rec.first);
sum += temp_storage[x].rows;
x++;
}
else
rec.first++;
}
i = CPUmem.insert(rec.first, temp);
*totalrows = sum;
*cpunum = x;
return i;
}
/*Removes the least recently used memory block from GPU memory, sending it to CPU memory if it's a rule result.
If there are no used memory blocks in the GPU and we still don't have enough memory, the program exits with error*/
void limpiar(const char s[], size_t sz)
{
list<memnode>::iterator ini;
memnode temp;
size_t free, total;
if(GPUmem.size() == 0)
{
cudaMemGetInfo(&free,&total);
cerr << s << ": not enough GPU memory: have " << free << " of " << total << ", need " << sz << " bytes." << endl;
exit(1);
}
ini = GPUmem.begin();
if(ini->isrule)
{
temp = *ini;
temp.dev_address = (int *)malloc(ini->size);
cudaMemcpyAsync(temp.dev_address, ini->dev_address, temp.size, cudaMemcpyDeviceToHost);
list<memnode>::iterator pos = lower_bound(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
CPUmem.insert(pos, temp);
}
cudaFree(ini->dev_address);
GPUmem.erase(ini);
}
/*Allocs 'size' amount of bytes in GPU memory. If not enough memory is available, removes least recently used memory blocks until
enough space is available*/
void reservar(int **ptr, size_t size)
{
size_t free, total;
if (size == 0) {
*ptr = NULL;
return;
}
cudaMemGetInfo(&free, &total);
while(free < size)
{
cout << "Se limpio memoria " << free << " " << total << endl;
limpiar("not enough memory", size);
cudaMemGetInfo(&free, &total);
}
while(cudaMalloc(ptr, size) == cudaErrorMemoryAllocation)
limpiar("Error in memory allocation", size);
if (! *ptr ) {
size_t free, total;
cudaMemGetInfo( &free, &total );
cerr << "Could not allocate " << size << " bytes, only " << free << " avaliable from total of " << total << " !!!" << endl;
cerr << "Exiting CUDA...." << endl;
exit(1);
}
}
/*Creates a new entry in the GPU memory list*/
void registrar(int name, int num_columns, int *ptr, int rows, int itr, int rule)
{
memnode temp;
temp.name = name;
temp.dev_address = ptr;
temp.rows = rows;
temp.size = rows * num_columns * sizeof(int);
temp.iteration = itr;
temp.isrule = rule;
GPUmem.push_back(temp);
}
void registrarcpu(int name, int num_columns, int *ptr, int rows, int itr, int rule)
{
memnode temp;
temp.name = name;
temp.dev_address = ptr;
temp.rows = rows;
temp.size = rows * num_columns * sizeof(int);
temp.iteration = itr;
temp.isrule = rule;
CPUmem.push_back(temp);
}
/*Updates the information of an element in a list*/
template<class InputIterator>
void actualizar(int num_columns, int *ptr, int rows, InputIterator i)
{
i->dev_address = ptr;
i->rows = rows;
i->size = rows * num_columns * sizeof(int);
}
/*Count the total number of rows generated by rule 'name' in iteration 'iter'*/
int numrows(int name, int itr)
{
int sum = 0;
memnode temp;
temp.iteration = itr;
pair<list<memnode>::iterator, list<memnode>::iterator> rec = equal_range(GPUmem.begin(), GPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
if(rec.first->name == name)
sum += rec.first->rows;
rec.first++;
}
rec = equal_range(CPUmem.begin(), CPUmem.end(), temp, compareiteration);
while(rec.first != rec.second)
{
if(rec.first->name == name)
sum += rec.first->rows;
rec.first++;
}
return sum;
}
extern "C" void * YAP_IntToAtom(int);
extern "C" char * YAP_AtomName(void *);
/*Loads facts or rule results in GPU memory. If a fact is already in GPU memory, its pointer is simply returned. Otherwise,
memory is reserved and the fact is loaded. Rule results are loaded based on the current iteration 'itr' and both GPU and
CPU memories are searched for all instances of said results. The instances are combined into a single one in GPU memory.*/
int cargar(int name, int num_rows, int num_columns, int is_fact, int *address_host_table, int **ptr, int itr)
{
int numgpu, numcpu, totalrows = 0;
int *temp, x;
int size, itrant, inc = 0;
list<memnode>::iterator i;
memnode fact;
if(is_fact)
{
i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
if(i != GPUmem.end())
{
fact = *i;
GPUmem.erase(i);
fact.iteration = itr;
*ptr = fact.dev_address;
GPUmem.push_back(fact);
return fact.rows;
}
size = num_rows * num_columns * sizeof(int);
reservar(&temp, size);
cudaMemcpyAsync(temp, address_host_table, size, cudaMemcpyHostToDevice);
registrar(name, num_columns, temp, num_rows, itr, 0);
*ptr = temp;
return num_rows;
}
if(itr > 0)
{
itrant = itr - 1;
i = buscarpornombre(name, itrant, &totalrows, &numgpu, &numcpu);
if((numgpu == 1) && (numcpu == 1))
{
actualizar(num_columns, temp_storage[0].dev_address, temp_storage[0].rows, i);
*ptr = temp_storage[0].dev_address;
return temp_storage[0].rows;
}
size = totalrows * num_columns * sizeof(int);
reservar(&temp, size);
for(x = 0; x < numgpu; x++)
{
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToDevice);
inc += temp_storage[x].size / sizeof(int);
cudaFree(temp_storage[x].dev_address);
}
for(; x < numcpu; x++)
{
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyHostToDevice);
inc += temp_storage[x].size / sizeof(int);
free(temp_storage[x].dev_address);
}
actualizar(num_columns, temp, totalrows, i);
*ptr = temp;
return totalrows;
}
return 0;
}
int cargarcpu(int name, int num_rows, int num_columns, int is_fact, int *address_host_table, int **ptr, int itr)
{
int numgpu, numcpu, totalrows = 0;
int *temp, x;
int size, itrant, inc = 0;
list<memnode>::iterator i;
if(is_fact)
{
*ptr = address_host_table;
return num_rows;
}
if(itr > 0)
{
itrant = itr - 1;
i = buscarpornombrecpu(name, itrant, &totalrows, &numgpu, &numcpu);
if((numgpu == 0) && (numcpu == 1))
{
actualizar(num_columns, temp_storage[0].dev_address, temp_storage[0].rows, i);
*ptr = temp_storage[0].dev_address;
return temp_storage[0].rows;
}
size = totalrows * num_columns * sizeof(int);
temp = (int *)malloc(size);
for(x = 0; x < numgpu; x++)
{
cudaMemcpyAsync(temp + inc, temp_storage[x].dev_address, temp_storage[x].size, cudaMemcpyDeviceToHost);
inc += temp_storage[x].size / sizeof(int);
cudaFree(temp_storage[x].dev_address);
}
for(; x < numcpu; x++)
{
memcpy(temp + inc, temp_storage[x].dev_address, temp_storage[x].size);
inc += temp_storage[x].size / sizeof(int);
free(temp_storage[x].dev_address);
}
actualizar(num_columns, temp, totalrows, i);
*ptr = temp;
return totalrows;
}
return 0;
}
/*Loads all results of rule 'name' from both GPU and CPU memories into the GPU*/
int cargafinal(int name, int cols, int **ptr)
{
int *temp, *ini, cont = 0, numg = 0, numc = 0;
memnode bus;
bus.name = name;
GPUmem.sort(comparename);
CPUmem.sort(comparename);
list<memnode>::iterator endg = GPUmem.end();
list<memnode>::iterator endc = CPUmem.end();
list<memnode>::iterator pos = lower_bound(GPUmem.begin(), endg, bus, comparename);
list<memnode>::iterator gpu = pos;
while(pos != endg && pos->name == name)
{
cont += pos->rows;
numg++;
pos++;
}
pos = lower_bound(CPUmem.begin(), endc, bus, comparename);
list<memnode>::iterator cpu = pos;
while(pos != endc && pos->name == name)
{
cont += pos->rows;
numc++;
pos++;
}
if(numg == 0 && numc == 0)
return 0;
if(numg == 1 && numc == 0)
{
pos = gpu;
*ptr = pos->dev_address;
cont = pos->rows;
GPUmem.erase(pos);
#ifdef TUFFY
return -cont;
#else
return cont;
#endif
}
if(numg == 0 && numc == 1)
{
pos = cpu;
cont = pos->rows;
#ifdef TUFFY
reservar(&temp, pos->size);
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
*ptr = temp;
#else
*ptr = pos->dev_address;
#endif
CPUmem.erase(pos);
return -cont;
}
reservar(&temp, cont * cols * sizeof(int));
ini = temp;
pos = gpu;
while(pos != endg && pos->name == name)
{
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyDeviceToDevice);
temp += pos->size / sizeof(int);
pos++;
}
pos = cpu;
while(pos != endc && pos->name == name)
{
cudaMemcpy(temp, pos->dev_address, pos->size, cudaMemcpyHostToDevice);
temp += pos->size / sizeof(int);
pos++;
}
*ptr = ini;
return cont;
}
/*Compares the results of the current iteration against the results of older iterations.
Used to avoid infinite computations when the result is not a single fixed-point, but an
orbit of points.*/
bool generadas(int name, int filas, int cols, int itr)
{
int r1, r2, x, fin;
int *dop1, *dop2;
r2 = numrows(name, itr);
if(itr < MAX_FIX_POINTS)
fin = itr;
else
fin = MAX_FIX_POINTS;
for(x = 1; x <= fin; x++)
{
r1 = numrows(name, itr - x);
if(r1 == r2)
{
r2 = cargar(name, filas, cols, 0, NULL, &dop2, itr + 1);
thrust::device_ptr<int> pt2 = thrust::device_pointer_cast(dop2);
r1 = cargar(name, filas, cols, 0, NULL, &dop1, itr - x + 1);
thrust::device_ptr<int> pt1 = thrust::device_pointer_cast(dop1);
if(thrust::equal(pt1, pt1 + r1, pt2) == true)
return true;
}
}
return false;
}
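/* A minimal sketch of the content comparison used by generadas() above,
   assuming two hypothetical device buffers d_a and d_b holding n ints each
   (not taken from the original file):

     thrust::device_ptr<int> a = thrust::device_pointer_cast(d_a);
     thrust::device_ptr<int> b = thrust::device_pointer_cast(d_b);
     bool same = thrust::equal(a, a + n, b);  // true iff all n elements match

   Combined with the row-count check, this is how an orbit is detected: if,
   say, iteration 5 reproduces iteration 3 exactly, evaluation can stop. */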
void mostrar_memoria()
{
unsigned int x;
list<memnode>::iterator i = GPUmem.begin();
cout << "Memoria inicio GPU" << endl;
for(x = 0; x < GPUmem.size(); x++, i++)
cout << i->name << " " << i->iteration << " " << i->isrule << " " << i->rows << " " << i->size << endl;
cout << "Memoria fin GPU" << endl;
}
void mostrar_memcpu()
{
unsigned int x;
list<memnode>::iterator i = CPUmem.begin();
cout << "Memoria inicio CPU" << endl;
for(x = 0; x < CPUmem.size(); x++, i++)
cout << i->name << " " << i->iteration << endl;
cout << "Memoria fin CPU" << endl;
}
/*Clear all rule results from both GPU and CPU memory*/
void clear_memory()
{
list<memnode>::iterator ini;
list<memnode>::iterator fin;
ini = GPUmem.begin();
fin = GPUmem.end();
while(ini != fin)
{
if(ini->isrule)
{
cudaFree(ini->dev_address);
ini = GPUmem.erase(ini);
}
else
ini++;
}
ini = CPUmem.begin();
fin = CPUmem.end();
while(ini != fin)
{
free(ini->dev_address);
ini++;
}
CPUmem.clear();
}
/*Clear everything from both GPU and CPU memory*/
void clear_memory_all()
{
list<memnode>::iterator ini;
list<memnode>::iterator fin;
ini = GPUmem.begin();
fin = GPUmem.end();
while(ini != fin)
{
cudaFree(ini->dev_address);
ini++;
}
GPUmem.clear();
ini = CPUmem.begin();
fin = CPUmem.end();
while(ini != fin)
{
free(ini->dev_address);
ini++;
}
CPUmem.clear();
}
/*Remove all instances of fact 'name' from both CPU and GPU memories*/
void liberar(int name)
{
list<memnode>::iterator i;
memnode fact;
i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
if(i != GPUmem.end())
{
fact = *i;
GPUmem.erase(i);
cudaFree(fact.dev_address);
}
i = buscarhecho(CPUmem.begin(), CPUmem.end(), name);
if(i != CPUmem.end())
{
fact = *i;
CPUmem.erase(i);
free(fact.dev_address);
}
}
/*Add all rows in 'dop1' to the fact 'name' by creating a new array capable of holding both.*/
void sumar(int name, int *dop1, int cols, int rows)
{
list<memnode>::iterator i;
memnode fact;
i = buscarhecho(GPUmem.begin(), GPUmem.end(), name);
int *res, newrows, offset;
if(i != GPUmem.end())
{
fact = *i;
newrows = rows + fact.rows;
reservar(&res, newrows * cols * sizeof(int));
offset = fact.rows * cols;
cudaMemcpyAsync(res, fact.dev_address, offset * sizeof(int), cudaMemcpyDeviceToDevice);
GPUmem.erase(i);
registrar(name, cols, res, newrows, 0, 0);
cudaMemcpyAsync(res + offset, dop1, rows * cols * sizeof(int), cudaMemcpyDeviceToDevice);
cudaFree(fact.dev_address);
}
}
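/* Glossary of the Spanish identifiers used in this module (translations only,
   added for readability): reservar = allocate, registrar = register,
   cargar = load, cargafinal = final load, liberar = free, sumar = add/append,
   buscarhecho = find fact, buscarpornombre = find by name,
   generadas = (already) generated, mostrar_memoria = show memory,
   limpiar = clean up, unir = merge/deduplicate, llenar = fill,
   marcar = mark, proyectar = project, juntar = join. */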

27
packages/cuda/old/memory.h Executable file

@ -0,0 +1,27 @@
#ifndef _MEMORY_H_
#define _MEMORY_H_
#include <list>
#include <vector>
#include "lista.h"
using namespace std;
bool comparer(const rulenode&, const rulenode&);
void limpiar(const char [], size_t);
void limpiartodo(int*, int*);
int cargar(int, int, int, int, int*, int**, int);
int cargarcpu(int, int, int, int, int*, int**, int);
int cargafinal(int, int, int**);
void reservar(int**, size_t);
void registrar(int, int, int*, int, int, int);
void registrarcpu(int, int, int*, int, int, int);
bool generadas(int, int, int, int);
void sumar(int, int*, int, int);
void liberar(int);
void mostrar_memoria(void);
void mostrar_memcpu(void);
void clear_memory(void);
void clear_memory_all(void);
#endif

47
packages/cuda/old/pred.h Executable file

@ -0,0 +1,47 @@
#ifndef _PRED_H_
#define _PRED_H_
// #define DEBUG_MEM 1
typedef struct Nodo{
int name;
int num_rows;
int num_columns;
int is_fact;
int *address_host_table;
int *negatives;
char *predname;
double *weight;
}gpunode;
typedef gpunode predicate;
//#define TIMER 1
#define DATALOG 1
#define NUM_T 4
#define INISIZE 1000000
#if TIMER
typedef struct Stats{
size_t joins, selects, unions, builtins;
size_t calls;
double total_time;
float max_time, min_time;
float select1_time, select2_time, join_time, sort_time, union_time, pred_time;
}statinfo;
extern statinfo cuda_stats;
#endif
/*Constants used to mark comparison predicates*/
#define BPOFFSET (-6)
#define SBG_EQ (-1)
#define SBG_GT (-2)
#define SBG_LT (-3)
#define SBG_GE (-4)
#define SBG_LE (-5)
#define SBG_DF (-6)
int Cuda_Eval(predicate**, int, predicate**, int, int*, int**, char*, int);
void Cuda_Statistics( void );
#endif
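/* Illustrative note (an assumption, not from the original header): the
   comparison markers above are all negative, so they can presumably be told
   apart from ordinary, non-negative column indices by sign alone.  A check
   built only from the constants defined above:

     static inline int is_builtin_cmp(int v) { return v <= SBG_EQ && v >= SBG_DF; }  // v in [-6,-1]
*/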

306
packages/cuda/old/selectproyect.cu Executable file

@ -0,0 +1,306 @@
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <stdlib.h>
#include "memory.h"
#include "bpreds.h"
/*Mark all rows that comply with the selections*/
__global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int x, rowact, posact;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
__syncthreads();
if(id < rows)
{
rowact = id * cols;
for(x = 0; x < numc; x += 2)
{
posact = rowact + shared[x];
if(dop1[posact] != shared[x+1])
return;
}
res[id] = 1;
}
}
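/* Illustrative note (not from the original file): the loop above reads 'cons'
   as (column, constant) pairs, numc values in total.  A hypothetical selection
   "column 0 == 5 AND column 2 == 7" over a 3-column relation would therefore
   be encoded as:

     int select[] = {0, 5, 2, 7};   // numc = 4
*/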
/*If we already have an array of marks (perhaps because the selfjoin was applied first),
we unmark any rows that do not comply with the selections*/
__global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int x, rowact, posact;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
__syncthreads();
if(id < rows)
{
if(res[id] == 0)
return;
rowact = id * cols;
for(x = 0; x < numc; x += 2)
{
posact = rowact + shared[x];
if(dop1[posact] != shared[x+1])
{
res[id] = 0;
return;
}
}
}
}
/*Unmark all rows that do not comply with the selfjoins.*/
__global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int temp, temp2, pos, x, y;
if(threadIdx.x < cont)
shared[threadIdx.x] = dhead[threadIdx.x];
__syncthreads();
if(id < rows)
{
if(res[id] == 0)
return;
pos = id * cols;
for(x = 0; x < cont; x++)
{
temp = dop1[pos+shared[x]];
y = x + 1;
temp2 = shared[y];
while(temp2 > -1)
{
if(temp != dop1[temp2+pos])
{
res[id] = 0;
return;
}
y++;
temp2 = shared[y];
}
x = y;
}
}
}
/*Mark all rows that comply with the selfjoins*/
__global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int temp, temp2, pos, x, y;
if(threadIdx.x < cont)
shared[threadIdx.x] = dhead[threadIdx.x];
__syncthreads();
if(id < rows)
{
pos = id * cols;
for(x = 0; x < cont; x++)
{
temp = dop1[pos+shared[x]];
y = x + 1;
temp2 = shared[y];
while(temp2 > -1)
{
if(temp != dop1[temp2+pos])
return;
y++;
temp2 = shared[y];
}
x = y;
}
res[id] = 1;
}
}
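/* Illustrative note (not from the original file): in samejoin/samejoin2 the
   array 'dhead' is read as groups of column indices that must all carry the
   same value, each group ended by a negative sentinel.  Hypothetical examples:

     int sj1[] = {0, 2, -1};            // column 0 == column 2
     int sj2[] = {0, 3, -1, 1, 2, -1};  // column 0 == column 3 and column 1 == column 2
*/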
/*Project all columns found in 'dhead' to a new array 'res'*/
__global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int pos, posr, x;
if(threadIdx.x < hsize)
shared[threadIdx.x] = dhead[threadIdx.x];
__syncthreads();
if(id < rows)
{
pos = id * cols;
posr = id * hsize;
for(x = 0; x < hsize; x++, posr++)
res[posr] = dop1[pos+shared[x]];
}
}
/*Project all columns found in 'dhead' using only the rows marked as valid (i.e. those that complied with
selections, selfjoins, etc.). The array 'temp' holds the result of the prefix sum of said marks.*/
__global__ void llenarproyectar(int *dop1, int rows, int cols, int *temp, int *dhead, int hsize, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int pos, posr, x;
if(threadIdx.x < hsize)
shared[threadIdx.x] = dhead[threadIdx.x];
__syncthreads();
if(id < rows)
{
posr = temp[id];
if(temp[id+1] != posr)
{
pos = id * cols;
posr *= hsize;
for(x = 0; x < hsize; x++, posr++)
res[posr] = dop1[pos+shared[x]];
}
}
}
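/* Worked example of the mark/scan/compact scheme used above (illustrative
   only): the selection kernels write 0/1 marks into temp[1..rows] and
   selectproyect() then runs an inclusive scan over that range, so temp[id]
   ends up holding the number of marked rows before row id and temp[rows] the
   total.  With rows = 5:

     marks, temp[1..5] : 1 0 1 1 0
     after scan, temp  : 0 1 1 2 3 3

   Row 0 writes to output slot 0, row 2 to slot 1, row 3 to slot 2, and
   temp[rows] = 3 is the size of the compacted result. */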
/*Performs selections, selfjoins and comparison predicates when the rule has a single normal predicate.*/
int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int numselect, int *selfjoin, int numselfj, int *preds, int numpreds, int *project, int **ret, int ANDlogic)
{
int *fres = NULL, *temp = NULL;
int *dhead = NULL, tmplen;
int size, size2, num;
thrust::device_ptr<int> res;
#if TIMER
cuda_stats.selects++;
#endif
int head_bytes = maximo(4, numselect, numselfj, numpreds, head_size) * sizeof(int);
reservar(&dhead, head_bytes);
int numthreads = 1024;
//int numthreads = 32;
int blockllen = rows / numthreads + 1;
#ifdef ROCKIT
ANDlogic = 1;
#endif
if(numselect > 0)
{
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
cudaMemset(temp, 0, size2);
size = numselect * sizeof(int);
cudaMemcpy(dhead, select, size, cudaMemcpyHostToDevice);
marcar2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);
if(numselfj > 0)
{
size = numselfj * sizeof(int);
cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
samejoin<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
}
if(numpreds > 0)
{
size = numpreds * sizeof(int);
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
else
bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
}
res = thrust::device_pointer_cast(temp);
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
num = res[rows];
if(num == 0)
return 0;
size = head_size * sizeof(int);
reservar(&fres, num * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
cudaFree(dhead);
cudaFree(temp);
*ret = fres;
return num;
}
else
{
if(numselfj > 0)
{
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
cudaMemset(temp, 0, size2);
size = numselfj * sizeof(int);
cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
samejoin2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
if(numpreds > 0)
{
size = numpreds * sizeof(int);
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
else
bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
}
res = thrust::device_pointer_cast(temp);
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
num = res[rows];
if(num == 0)
return 0;
size = head_size * sizeof(int);
reservar(&fres, num * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
cudaFree(dhead);
cudaFree(temp);
*ret = fres;
return num;
}
else
{
if(numpreds > 0)
{
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
cudaMemset(temp, 0, size2);
size = numpreds * sizeof(int);
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
else
bpredsorlogic2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
res = thrust::device_pointer_cast(temp);
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
num = res[rows];
if(num == 0)
return 0;
size = head_size * sizeof(int);
reservar(&fres, num * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
cudaFree(dhead);
cudaFree(temp);
*ret = fres;
return num;
}
else
{
size = head_size * sizeof(int);
reservar(&fres, rows * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
cudaFree(dhead);
*ret = fres;
return rows;
}
}
}
}
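/* A usage sketch of selectproyect() with hypothetical values (not taken from
   the original file), combining the encodings illustrated above on a
   3-column input relation d_in with nrows rows:

     int select[]   = {0, 5};       // keep rows whose column 0 equals 5
     int selfjoin[] = {1, 2, -1};   // and whose columns 1 and 2 are equal
     int project[]  = {0, 2};       // output columns 0 and 2
     int *out;
     int n = selectproyect(d_in, nrows, 3, 2, select, 2, selfjoin, 3,
                           NULL, 0, project, &out, 1);
     // on success, 'out' points to a freshly reserved buffer of n rows x 2 ints
*/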

1279
packages/cuda/old/treeb.cu Executable file

File diff suppressed because it is too large.

763
packages/cuda/old/union2.cu Executable file

@ -0,0 +1,763 @@
/*Computer-generated file to remove duplicates. Since Thrust's unique and sort, unlike their std counterparts, provide no way to specify the size of each element in
the array, comparing pairs, triplets and other tuple widths is not possible without defining a new pointer type and all related operations for each width. If you have a
better idea of how to do this, please don't hesitate to email us.*/
#include <thrust/device_vector.h>
#include <thrust/unique.h>
#include <thrust/distance.h>
#include <thrust/sort.h>
#include <iostream>
#include "memory.h"
#include "union2.h"
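/* A sketch of the tuple types this file relies on (an assumption — union2.h
   is suppressed in this diff).  The sN structs and oN/pN functors are presumed
   to follow the usual fixed-width-tuple pattern for Thrust, roughly:

     struct s2 { int v[2]; };
     struct o2 {                       // ordering used by thrust::sort
       __host__ __device__ bool operator()(const s2 &a, const s2 &b) const {
         if(a.v[0] != b.v[0]) return a.v[0] < b.v[0];
         return a.v[1] < b.v[1];
       }
     };
     struct p2 {                       // equality used by thrust::unique
       __host__ __device__ bool operator()(const s2 &a, const s2 &b) const {
         return a.v[0] == b.v[0] && a.v[1] == b.v[1];
       }
     };

   The qN predicates applied only in the 'final' pass are generated in the same
   style but with a different notion of equality; their exact definition lives
   in the suppressed union2.h. */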
int unir(int *res, int rows, int tipo, int **ret, int final)
{
thrust::device_ptr<int> pt, re;
thrust::device_ptr<s2> pt2, re2;
thrust::device_ptr<s3> pt3, re3;
thrust::device_ptr<s4> pt4, re4;
thrust::device_ptr<s5> pt5, re5;
thrust::device_ptr<s6> pt6, re6;
thrust::device_ptr<s7> pt7, re7;
thrust::device_ptr<s8> pt8, re8;
thrust::device_ptr<s9> pt9, re9;
thrust::device_ptr<s10> pt10, re10;
thrust::device_ptr<s11> pt11, re11;
thrust::device_ptr<s12> pt12, re12;
thrust::device_ptr<s13> pt13, re13;
thrust::device_ptr<s14> pt14, re14;
thrust::device_ptr<s15> pt15, re15;
thrust::device_ptr<s16> pt16, re16;
thrust::device_ptr<s17> pt17, re17;
thrust::device_ptr<s18> pt18, re18;
thrust::device_ptr<s19> pt19, re19;
thrust::device_ptr<s20> pt20, re20;
s2 *t2;
s3 *t3;
s4 *t4;
s5 *t5;
s6 *t6;
s7 *t7;
s8 *t8;
s9 *t9;
s10 *t10;
s11 *t11;
s12 *t12;
s13 *t13;
s14 *t14;
s15 *t15;
s16 *t16;
s17 *t17;
s18 *t18;
s19 *t19;
s20 *t20;
int flag, nrows, *nres, size;
#if TIMER
cuda_stats.unions++;
#endif
switch(tipo)
{
case 1:
{
pt = thrust::device_pointer_cast(res);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt, pt + rows);
if(final)
{
re = thrust::unique(pt, pt + rows, q1());
re = thrust::unique(pt, re);
}
else
re = thrust::unique(pt, pt + rows);
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt, re);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 2:
{
t2 = (s2*)res;
pt2 = thrust::device_pointer_cast(t2);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt2, pt2 + rows, o2());
if(final)
{
re2 = thrust::unique(pt2, pt2 + rows, q2());
re2 = thrust::unique(pt2, re2, p2());
}
else
re2 = thrust::unique(pt2, pt2 + rows, p2());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt2, re2);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 3:
{
t3 = (s3*)res;
pt3 = thrust::device_pointer_cast(t3);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt3, pt3 + rows, o3());
if(final)
{
re3 = thrust::unique(pt3, pt3 + rows, q3());
re3 = thrust::unique(pt3, re3, p3());
}
else
re3 = thrust::unique(pt3, pt3 + rows, p3());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt3, re3);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 4:
{
t4 = (s4*)res;
pt4 = thrust::device_pointer_cast(t4);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt4, pt4 + rows, o4());
if(final)
{
re4 = thrust::unique(pt4, pt4 + rows, q4());
re4 = thrust::unique(pt4, re4, p4());
}
else
re4 = thrust::unique(pt4, pt4 + rows, p4());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt4, re4);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 5:
{
t5 = (s5*)res;
pt5 = thrust::device_pointer_cast(t5);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt5, pt5 + rows, o5());
if(final)
{
re5 = thrust::unique(pt5, pt5 + rows, q5());
re5 = thrust::unique(pt5, re5, p5());
}
else
re5 = thrust::unique(pt5, pt5 + rows, p5());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt5, re5);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 6:
{
t6 = (s6*)res;
pt6 = thrust::device_pointer_cast(t6);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt6, pt6 + rows, o6());
if(final)
{
re6 = thrust::unique(pt6, pt6 + rows, q6());
re6 = thrust::unique(pt6, re6, p6());
}
else
re6 = thrust::unique(pt6, pt6 + rows, p6());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt6, re6);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 7:
{
t7 = (s7*)res;
pt7 = thrust::device_pointer_cast(t7);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt7, pt7 + rows, o7());
if(final)
{
re7 = thrust::unique(pt7, pt7 + rows, q7());
re7 = thrust::unique(pt7, re7, p7());
}
else
re7 = thrust::unique(pt7, pt7 + rows, p7());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt7, re7);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 8:
{
t8 = (s8*)res;
pt8 = thrust::device_pointer_cast(t8);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt8, pt8 + rows, o8());
if(final)
{
re8 = thrust::unique(pt8, pt8 + rows, q8());
re8 = thrust::unique(pt8, re8, p8());
}
else
re8 = thrust::unique(pt8, pt8 + rows, p8());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt8, re8);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 9:
{
t9 = (s9*)res;
pt9 = thrust::device_pointer_cast(t9);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt9, pt9 + rows, o9());
if(final)
{
re9 = thrust::unique(pt9, pt9 + rows, q9());
re9 = thrust::unique(pt9, re9, p9());
}
else
re9 = thrust::unique(pt9, pt9 + rows, p9());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt9, re9);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 10:
{
t10 = (s10*)res;
pt10 = thrust::device_pointer_cast(t10);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt10, pt10 + rows, o10());
if(final)
{
re10 = thrust::unique(pt10, pt10 + rows, q10());
re10 = thrust::unique(pt10, re10, p10());
}
else
re10 = thrust::unique(pt10, pt10 + rows, p10());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt10, re10);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 11:
{
t11 = (s11*)res;
pt11 = thrust::device_pointer_cast(t11);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt11, pt11 + rows, o11());
if(final)
{
re11 = thrust::unique(pt11, pt11 + rows, q11());
re11 = thrust::unique(pt11, re11, p11());
}
else
re11 = thrust::unique(pt11, pt11 + rows, p11());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt11, re11);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 12:
{
t12 = (s12*)res;
pt12 = thrust::device_pointer_cast(t12);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt12, pt12 + rows, o12());
if(final)
{
re12 = thrust::unique(pt12, pt12 + rows, q12());
re12 = thrust::unique(pt12, re12, p12());
}
else
re12 = thrust::unique(pt12, pt12 + rows, p12());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt12, re12);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 13:
{
t13 = (s13*)res;
pt13 = thrust::device_pointer_cast(t13);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt13, pt13 + rows, o13());
if(final)
{
re13 = thrust::unique(pt13, pt13 + rows, q13());
re13 = thrust::unique(pt13, re13, p13());
}
else
re13 = thrust::unique(pt13, pt13 + rows, p13());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt13, re13);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 14:
{
t14 = (s14*)res;
pt14 = thrust::device_pointer_cast(t14);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt14, pt14 + rows, o14());
if(final)
{
re14 = thrust::unique(pt14, pt14 + rows, q14());
re14 = thrust::unique(pt14, re14, p14());
}
else
re14 = thrust::unique(pt14, pt14 + rows, p14());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt14, re14);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 15:
{
t15 = (s15*)res;
pt15 = thrust::device_pointer_cast(t15);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt15, pt15 + rows, o15());
if(final)
{
re15 = thrust::unique(pt15, pt15 + rows, q15());
re15 = thrust::unique(pt15, re15, p15());
}
else
re15 = thrust::unique(pt15, pt15 + rows, p15());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt15, re15);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 16:
{
t16 = (s16*)res;
pt16 = thrust::device_pointer_cast(t16);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt16, pt16 + rows, o16());
if(final)
{
re16 = thrust::unique(pt16, pt16 + rows, q16());
re16 = thrust::unique(pt16, re16, p16());
}
else
re16 = thrust::unique(pt16, pt16 + rows, p16());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt16, re16);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 17:
{
t17 = (s17*)res;
pt17 = thrust::device_pointer_cast(t17);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt17, pt17 + rows, o17());
if(final)
{
re17 = thrust::unique(pt17, pt17 + rows, q17());
re17 = thrust::unique(pt17, re17, p17());
}
else
re17 = thrust::unique(pt17, pt17 + rows, p17());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt17, re17);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 18:
{
t18 = (s18*)res;
pt18 = thrust::device_pointer_cast(t18);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt18, pt18 + rows, o18());
if(final)
{
re18 = thrust::unique(pt18, pt18 + rows, q18());
re18 = thrust::unique(pt18, re18, p18());
}
else
re18 = thrust::unique(pt18, pt18 + rows, p18());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt18, re18);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 19:
{
t19 = (s19*)res;
pt19 = thrust::device_pointer_cast(t19);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt19, pt19 + rows, o19());
if(final)
{
re19 = thrust::unique(pt19, pt19 + rows, q19());
re19 = thrust::unique(pt19, re19, p19());
}
else
re19 = thrust::unique(pt19, pt19 + rows, p19());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt19, re19);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
case 20:
{
t20 = (s20*)res;
pt20 = thrust::device_pointer_cast(t20);
flag = 0;
while(flag != 1)
{
try
{
thrust::sort(pt20, pt20 + rows, o20());
if(final)
{
re20 = thrust::unique(pt20, pt20 + rows, q20());
re20 = thrust::unique(pt20, re20, p20());
}
else
re20 = thrust::unique(pt20, pt20 + rows, p20());
flag = 1;
}
catch(std::bad_alloc &e)
{
limpiar("sort/unique in unir", 0);
}
}
nrows = thrust::distance(pt20, re20);
if(nrows < rows / 2)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
*ret = nres;
}
return nrows;
}
}
return 0;
}
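/* Note on the pattern repeated in every case above (descriptive, not from the
   original file): sort, deduplicate, and, if deduplication removed more than
   half of the rows, copy the survivors into a smaller buffer so the oversized
   one can be released.  The try/catch loop retries the sort/unique whenever
   Thrust throws std::bad_alloc, after limpiar() has attempted to free GPU
   memory. */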

1005
packages/cuda/old/union2.h Executable file

File diff suppressed because it is too large.

0
packages/cuda/pred.h Executable file → Normal file

103
packages/cuda/selectproyect.cu Executable file → Normal file

@ -1,3 +1,4 @@
#include "hip/hip_runtime.h"
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <stdlib.h>
@ -8,10 +9,10 @@
__global__ void marcar2(int *dop1, int rows, int cols, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int x, rowact, posact;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
if(hipThreadIdx_x < numc)
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{
@ -30,10 +31,10 @@ we unmark any rows that do not comply with the selections*/
__global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int x, rowact, posact;
if(threadIdx.x < numc)
shared[threadIdx.x] = cons[threadIdx.x];
if(hipThreadIdx_x < numc)
shared[hipThreadIdx_x] = cons[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{
@ -56,10 +57,10 @@ __global__ void marcar(int *dop1, int rows, int cols, int *cons, int numc, int *
__global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int temp, temp2, pos, x, y;
if(threadIdx.x < cont)
shared[threadIdx.x] = dhead[threadIdx.x];
if(hipThreadIdx_x < cont)
shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{
@ -90,10 +91,10 @@ __global__ void samejoin(int *dop1, int rows, int cols, int *dhead, int cont, in
__global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int temp, temp2, pos, x, y;
if(threadIdx.x < cont)
shared[threadIdx.x] = dhead[threadIdx.x];
if(hipThreadIdx_x < cont)
shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{
@ -120,10 +121,10 @@ __global__ void samejoin2(int *dop1, int rows, int cols, int *dhead, int cont, i
__global__ void proyectar(int *dop1, int rows, int cols, int *dhead, int hsize, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int pos, posr, x;
if(threadIdx.x < hsize)
shared[threadIdx.x] = dhead[threadIdx.x];
if(hipThreadIdx_x < hsize)
shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{
@ -139,10 +140,10 @@ selections, selfjoins, etc.). The array 'temp' holds the result of the prefix su
__global__ void llenarproyectar(int *dop1, int rows, int cols, int *temp, int *dhead, int hsize, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int pos, posr, x;
if(threadIdx.x < hsize)
shared[threadIdx.x] = dhead[threadIdx.x];
if(hipThreadIdx_x < hsize)
shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
__syncthreads();
if(id < rows)
{
@ -184,27 +185,27 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
cudaMemset(temp, 0, size2);
hipMemset(temp, 0, size2);
size = numselect * sizeof(int);
cudaMemcpy(dhead, select, size, cudaMemcpyHostToDevice);
hipMemcpy(dhead, select, size, hipMemcpyHostToDevice);
marcar2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselect, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselect, temp + 1);
if(numselfj > 0)
{
size = numselfj * sizeof(int);
cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
samejoin<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
hipMemcpy(dhead, selfjoin, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselfj, temp + 1);
}
if(numpreds > 0)
{
size = numpreds * sizeof(int);
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
else
bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
}
res = thrust::device_pointer_cast(temp);
@ -215,10 +216,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
size = head_size * sizeof(int);
reservar(&fres, num * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
cudaFree(dhead);
cudaFree(temp);
hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
hipFree(dhead);
hipFree(temp);
*ret = fres;
return num;
}
@ -229,19 +230,19 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
cudaMemset(temp, 0, size2);
hipMemset(temp, 0, size2);
size = numselfj * sizeof(int);
cudaMemcpy(dhead, selfjoin, size, cudaMemcpyHostToDevice);
samejoin2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numselfj, temp + 1);
hipMemcpy(dhead, selfjoin, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numselfj, temp + 1);
if(numpreds > 0)
{
size = numpreds * sizeof(int);
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
else
bpredsorlogic<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
}
@ -253,10 +254,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
size = head_size * sizeof(int);
reservar(&fres, num * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
cudaFree(dhead);
cudaFree(temp);
hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
hipFree(dhead);
hipFree(temp);
*ret = fres;
return num;
}
@ -267,14 +268,14 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
tmplen = rows + 1;
size2 = tmplen * sizeof(int);
reservar(&temp, size2);
cudaMemset(temp, 0, size2);
hipMemset(temp, 0, size2);
size = numpreds * sizeof(int);
cudaMemcpy(dhead, preds, size, cudaMemcpyHostToDevice);
hipMemcpy(dhead, preds, size, hipMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
else
bpredsorlogic2<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, numpreds, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic2), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, numpreds, temp + 1);
res = thrust::device_pointer_cast(temp);
thrust::inclusive_scan(res + 1, res + tmplen, res + 1);
num = res[rows];
@ -284,10 +285,10 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
size = head_size * sizeof(int);
reservar(&fres, num * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
llenarproyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, temp, dhead, head_size, fres);
cudaFree(dhead);
cudaFree(temp);
hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(llenarproyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, temp, dhead, head_size, fres);
hipFree(dhead);
hipFree(temp);
*ret = fres;
return num;
}
@ -295,9 +296,9 @@ int selectproyect(int *dop1, int rows, int cols, int head_size, int *select, int
{
size = head_size * sizeof(int);
reservar(&fres, rows * size);
cudaMemcpy(dhead, project, size, cudaMemcpyHostToDevice);
proyectar<<<blockllen, numthreads, size>>>(dop1, rows, cols, dhead, head_size, fres);
cudaFree(dhead);
hipMemcpy(dhead, project, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(proyectar), dim3(blockllen), dim3(numthreads), size, 0, dop1, rows, cols, dhead, head_size, fres);
hipFree(dhead);
*ret = fres;
return rows;
}
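The hunks above show the mechanical CUDA-to-HIP translation applied throughout this commit: cudaMemcpy/cudaMemset/cudaFree become hipMemcpy/hipMemset/hipFree, threadIdx.x/blockIdx.x/blockDim.x become hipThreadIdx_x/hipBlockIdx_x/hipBlockDim_x, and each triple-chevron launch such as

    kernel<<<blocks, threads, shmem>>>(args);

is rewritten with the hipify-style launch macro

    hipLaunchKernel(HIP_KERNEL_NAME(kernel), dim3(blocks), dim3(threads), shmem, 0, args);

where the added 0 is the stream argument.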

0
packages/cuda/selectproyectcpu.cpp Executable file → Normal file

347
packages/cuda/treeb.cu Executable file → Normal file

@ -1,3 +1,4 @@
#include "hip/hip_runtime.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
@ -160,11 +161,11 @@ __device__ int firstMatchingKeyInDataNode2(Record records[], IKeyType key)
__global__ void gCreateIndex(IDataNode data[], IDirectoryNode dir[], int dirSize, int tree_size, int bottom_start, int nNodesPerBlock)
{
int startIdx = blockIdx.x * nNodesPerBlock;
int startIdx = hipBlockIdx_x * nNodesPerBlock;
int endIdx = startIdx + nNodesPerBlock;
if(endIdx > dirSize)
endIdx = dirSize;
int keyIdx = threadIdx.x;
int keyIdx = hipThreadIdx_x;
// Proceed only when in internal nodes
for(int nodeIdx = startIdx; nodeIdx < endIdx; nodeIdx++)
@ -191,11 +192,11 @@ __global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir
{
// Bringing the root node (visited by every tuple) to the faster shared memory
__shared__ IKeyType RootNodeKeys[TREE_NODE_SIZE];
RootNodeKeys[threadIdx.x] = dir->keys[threadIdx.x];
RootNodeKeys[hipThreadIdx_x] = dir->keys[hipThreadIdx_x];
__syncthreads();
int OverallThreadIdx = blockIdx.x * THRD_PER_BLCK_search + threadIdx.x;
int OverallThreadIdx = hipBlockIdx_x * THRD_PER_BLCK_search + hipThreadIdx_x;
for(int keyIdx = OverallThreadIdx; keyIdx < nSearchKeys; keyIdx += THRD_PER_GRID_search)
{
@ -219,7 +220,7 @@ __global__ void gSearchTree(IDataNode* data, int nDataNodes, IDirectoryNode* dir
/*Counts the number of times a row in 'S' is to be joined to a row in 'R'.*/
__global__ void gIndexJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[])
{
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if(s_cur < sLen)
{
@ -246,11 +247,11 @@ in 'g_locations' those rows that have equal values in the checked columns.*/
__global__ void gIndexMultiJoinNegative(int *R, int *S, int g_locations[], int rLen, int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
{
extern __shared__ int shared[];
int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int posr, poss, x;
if(threadIdx.x < wj)
shared[threadIdx.x] = muljoin[threadIdx.x];
if(hipThreadIdx_x < wj)
shared[hipThreadIdx_x] = muljoin[hipThreadIdx_x];
__syncthreads();
if(r_cur < rLen)
@ -287,11 +288,11 @@ times a row in 'S' is to be joined to its corresponding row in 'R', storing the
__global__ void gIndexMultiJoin(int *R, int *S, int g_locations[], int sLen, int g_ResNums[], int *p1, int *p2, int of1, int of2, int *mloc, int *sloc, int *muljoin, int wj)
{
extern __shared__ int shared[];
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int posr, poss, x;
if(threadIdx.x < wj)
shared[threadIdx.x] = muljoin[threadIdx.x];
if(hipThreadIdx_x < wj)
shared[hipThreadIdx_x] = muljoin[hipThreadIdx_x];
__syncthreads();
if(s_cur < sLen)
@ -330,10 +331,10 @@ __global__ void multiJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums
{
extern __shared__ int shared[];
int *extjoins = &shared[lenrul];
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if(threadIdx.x < (lenrul + wj))
shared[threadIdx.x] = rule[threadIdx.x];
if(hipThreadIdx_x < (lenrul + wj))
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
__syncthreads();
if(s_cur < sLen)
@ -382,10 +383,10 @@ __global__ void multiJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSum
{
extern __shared__ int shared[];
int *extjoins = &shared[cols];
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if(threadIdx.x < (cols + wj))
shared[threadIdx.x] = rule[threadIdx.x];
if(hipThreadIdx_x < (cols + wj))
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
__syncthreads();
if(s_cur < sLen)
@ -432,11 +433,11 @@ predicate are projected.*/
__global__ void gJoinWithWriteNegative(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int halfrul, int *mloc)
{
extern __shared__ int shared[];
int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int posr;
if(threadIdx.x < halfrul)
shared[threadIdx.x] = rule[threadIdx.x];
if(hipThreadIdx_x < halfrul)
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
__syncthreads();
if(r_cur < rLen)
@ -461,11 +462,11 @@ predicate are projected.*/
__global__ void gJoinWithWriteNegative2(int g_locations[], int rLen, int g_joinResultBuffers[], int *p1, int of1, int *rule, int cols, int *mloc)
{
extern __shared__ int shared[];
int r_cur = blockIdx.x * blockDim.x + threadIdx.x;
int r_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int posr;
if(threadIdx.x < cols)
shared[threadIdx.x] = rule[threadIdx.x];
if(hipThreadIdx_x < cols)
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
__syncthreads();
if(r_cur < rLen)
@ -489,10 +490,10 @@ __global__ void gJoinWithWriteNegative2(int g_locations[], int rLen, int g_joinR
__global__ void gJoinWithWrite(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int halfrul, int lenrul, int *mloc, int *sloc)
{
extern __shared__ int shared[];
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if(threadIdx.x < lenrul)
shared[threadIdx.x] = rule[threadIdx.x];
if(hipThreadIdx_x < lenrul)
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
__syncthreads();
if(s_cur < sLen)
@ -525,10 +526,10 @@ projection, which is performed based on the variables in the head of the rule.*/
__global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[], int g_joinResultBuffers[], int *p1, int *p2, int of1, int of2, int *rule, int cols, int *mloc, int *sloc)
{
extern __shared__ int shared[];
int s_cur = blockIdx.x * blockDim.x + threadIdx.x;
int s_cur = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if(threadIdx.x < cols)
shared[threadIdx.x] = rule[threadIdx.x];
if(hipThreadIdx_x < cols)
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
__syncthreads();
if(s_cur < sLen)
@ -563,7 +564,7 @@ __global__ void gJoinWithWrite2(int g_locations[], int sLen, int g_PrefixSums[],
/*Load part of column 'wj' of 'p' in 'R'. Which values are loaded is defined by the prefix sum results in 'pos'.*/
__global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *ids)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int cond;
if(id < len)
{
@ -579,7 +580,7 @@ __global__ void llenar(int *p, int *R, int len, int of, int wj, int *pos, int *i
/*Load an entire column from 'p' into 'R'.*/
__global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if(id < len)
R[id] = p[id * of + wj];
}
@ -587,10 +588,10 @@ __global__ void llenarnosel(int *p, int *R, int len, int of, int wj)
__global__ void projectfinal(int *res, int rows, int cols, int *rule, int *out)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if(threadIdx.x < cols)
shared[threadIdx.x] = rule[threadIdx.x];
if(hipThreadIdx_x < cols)
shared[hipThreadIdx_x] = rule[hipThreadIdx_x];
__syncthreads();
if(id < rows)
@ -614,26 +615,26 @@ void project(int *res, int resrows, int numcols1, int numcols2, int *proj, int *
int *pt = (int *)malloc(sizepro);
for(z = 0; z < numcols2; z++)
pt[z] = proj[z] - 1;
cudaMemcpy(dcons, pt, sizepro, cudaMemcpyHostToDevice);
//cudaDeviceSynchronize(); //Small cudaMemcpys are asynchronous, uncomment this line if the pointer is being liberated before it is copied.
hipMemcpy(dcons, pt, sizepro, hipMemcpyHostToDevice);
//hipDeviceSynchronize(); //Small hipMemcpy calls are asynchronous; uncomment this line if the pointer is being freed before it is copied.
free(pt);
}
else
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
reservar(&d_Rout, resrows * sizepro);
projectfinal<<<blockllen, numthreads, sizepro>>>(res, resrows, numcols1, dcons, d_Rout);
cudaFree(dcons);
cudaFree(*ret);
hipLaunchKernel(HIP_KERNEL_NAME(projectfinal), dim3(blockllen), dim3(numthreads), sizepro, 0, res, resrows, numcols1, dcons, d_Rout);
hipFree(dcons);
hipFree(*ret);
*ret = d_Rout;
}
__global__ void projectadd(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, int *dhead, int hsize, int *res)
{
extern __shared__ int shared[];
int id = blockIdx.x * blockDim.x + threadIdx.x;
int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int pos2, posr, x, y, cond;
if(threadIdx.x < hsize)
shared[threadIdx.x] = dhead[threadIdx.x];
if(hipThreadIdx_x < hsize)
shared[hipThreadIdx_x] = dhead[hipThreadIdx_x];
__syncthreads();
if(id < rows2)
{
@ -662,10 +663,10 @@ void juntar(int *dop1, int *dop2, int rows1, int rows2, int cols1, int cols2, in
int blockllen = rows2 / numthreads + 1;
sizepro = pcols * sizeof(int);
reservar(&dcons, sizepro);
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
reservar(&d_Rout, rows1 * rows2 * sizepro);
projectadd<<<blockllen, numthreads, sizepro>>>(dop1, dop2, rows1, rows2, cols1, cols2, dcons, pcols, d_Rout);
cudaFree(dcons);
hipLaunchKernel(HIP_KERNEL_NAME(projectadd), dim3(blockllen), dim3(numthreads), sizepro, 0, dop1, dop2, rows1, rows2, cols1, cols2, dcons, pcols, d_Rout);
hipFree(dcons);
*ret = d_Rout;
}
@ -743,51 +744,51 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
#ifdef TIMER
//cout << "INICIO" << endl;
cudaEvent_t start, stop;
hipEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start, 0);
#endif
if(npred2.x > 0 || npred2.y > 0 || nsel2 > 0 || nsj2 > 0)
{
newLen = sLen + 1;
cudaMemsetAsync(temp, 0, newLen * sizeof(int));
hipMemsetAsync(temp, 0, newLen * sizeof(int));
}
if(npred2.x > 0 || npred2.y > 0)
{
size = npred2tot * sizeof(int);
cudaMemcpy(dcons, pred2, size, cudaMemcpyHostToDevice);
hipMemcpy(dcons, pred2, size, hipMemcpyHostToDevice);
if(npred2.y > 0) /*Fix case when a(X,Y),b(Y,Z),Z > Y*/
{
reservar(&temp2, sizet2);
cudaMemsetAsync(temp2, 0, newLen * sizeof(int));
hipMemsetAsync(temp2, 0, newLen * sizeof(int));
//res = thrust::device_pointer_cast(temp2);
bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, temp2 + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpreds), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, temp2 + 1);
}
else
{
if(negative)
bpreds<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
hipLaunchKernel(HIP_KERNEL_NAME(bpreds), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
else
bpredsOR<<<blockllen, numthreads, size>>>(p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsOR), dim3(blockllen), dim3(numthreads), size, 0, p1, p2, sLen, of1, of2, dcons, npred2tot, npred2.x, temp + 1, NULL);
}
if(nsel2 > 0)
{
size = nsel2 * sizeof(int);
cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
marcar<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
hipMemcpy(dcons, sel2, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(marcar), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsel2, temp + 1);
}
if(nsj2 > 0)
{
size = nsj2 * sizeof(int);
cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);
}
}
else
@ -795,14 +796,14 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
if(nsel2 > 0)
{
size = nsel2 * sizeof(int);
cudaMemcpy(dcons, sel2, size, cudaMemcpyHostToDevice);
marcar2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsel2, temp + 1);
hipMemcpy(dcons, sel2, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsel2, temp + 1);
if(nsj2 > 0)
{
size = nsj2 * sizeof(int);
cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
samejoin<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);
}
}
else
@ -810,15 +811,15 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
if(nsj2 > 0)
{
size = nsj2 * sizeof(int);
cudaMemcpy(dcons, sjoin2, size, cudaMemcpyHostToDevice);
samejoin2<<<blockllen, numthreads, size>>>(p2, sLen, of2, dcons, nsj2, temp + 1);
hipMemcpy(dcons, sjoin2, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, p2, sLen, of2, dcons, nsj2, temp + 1);
}
else
{
sizem32S = m32sLen * sizeof(int);
reservar(&d_S, sizem32S);
cudaMemsetAsync(d_S + sLen, 0x7f, extraspaceS * sizeof(int));
llenarnosel<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1]);
hipMemsetAsync(d_S + sLen, 0x7f, extraspaceS * sizeof(int));
hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p2, d_S, sLen, of2, wherej[1]);
}
}
}
@ -842,8 +843,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
if(newLen == 0) // && !negative) ARREGLAR
{
cudaFree(temp);
cudaFree(dcons);
hipFree(temp);
hipFree(dcons);
return 0;
}
@ -854,24 +855,24 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
reservar(&d_S, sizem32S);
reservar(&posS, sizem32S);
cudaMemsetAsync(d_S + newLen, 0x7f, sizextra);
cudaMemsetAsync(posS + newLen, 0x7f, sizextra);
llenar<<<blockllen, numthreads>>>(p2, d_S, sLen, of2, wherej[1], temp, posS);
hipMemsetAsync(d_S + newLen, 0x7f, sizextra);
hipMemsetAsync(posS + newLen, 0x7f, sizextra);
hipLaunchKernel(HIP_KERNEL_NAME(llenar), dim3(blockllen), dim3(numthreads), 0, 0, p2, d_S, sLen, of2, wherej[1], temp, posS);
sLen = newLen;
}
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&time, start, stop);
//cout << "Select1 = " << time << endl;
cuda_stats.select1_time += time;
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
hipEventDestroy(start);
hipEventDestroy(stop);
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start, 0);
#endif
blockllen = rLen / numthreads + 1;
@ -880,30 +881,30 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
{
if(temp2 != NULL)
{
cudaFree(temp);
hipFree(temp);
temp = temp2;
res = thrust::device_pointer_cast(temp);
newLen = rLen + 1;
if(nsel1 > 0)
{
size = nsel1 * sizeof(int);
cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
marcar<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
hipMemcpy(dcons, sel1, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(marcar), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsel1, temp + 1);
}
if(nsj1 > 0)
{
size = nsj1 * sizeof(int);
cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);
}
if(npred1.x > 0)
{
size = npred1.x * sizeof(int);
cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
else
bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
}
}
else
@ -911,30 +912,30 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
if(npred1.x > 0 || nsel1 > 0 || nsj1 > 0)
{
newLen = rLen + 1;
cudaMemsetAsync(temp, 0, newLen * sizeof(int));
hipMemsetAsync(temp, 0, newLen * sizeof(int));
}
if(nsel1 > 0)
{
size = nsel1 * sizeof(int);
cudaMemcpy(dcons, sel1, size, cudaMemcpyHostToDevice);
marcar2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsel1, temp + 1);
hipMemcpy(dcons, sel1, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(marcar2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsel1, temp + 1);
if(nsj1 > 0)
{
size = nsj1 * sizeof(int);
cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
samejoin<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(samejoin), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);
}
if(npred1.x > 0)
{
size = npred1.x * sizeof(int);
cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
else
bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
}
}
else
@ -942,17 +943,17 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
if(nsj1 > 0)
{
size = nsj1 * sizeof(int);
cudaMemcpy(dcons, sjoin1, size, cudaMemcpyHostToDevice);
samejoin2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, nsj1, temp + 1);
hipMemcpy(dcons, sjoin1, size, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(samejoin2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, nsj1, temp + 1);
if(npred1.x > 0)
{
size = npred1.x * sizeof(int);
cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
else
bpredsorlogic<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
}
}
else
@ -960,11 +961,11 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
if(npred1.x > 0)
{
size = npred1.x * sizeof(int);
cudaMemcpy(dcons, pred1, size, cudaMemcpyHostToDevice);
hipMemcpy(dcons, pred1, size, hipMemcpyHostToDevice);
if(ANDlogic)
bpredsnormal2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsnormal2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
else
bpredsorlogic2<<<blockllen, numthreads, size>>>(p1, rLen, of1, dcons, npred1.x, temp + 1);
hipLaunchKernel(HIP_KERNEL_NAME(bpredsorlogic2), dim3(blockllen), dim3(numthreads), size, 0, p1, rLen, of1, dcons, npred1.x, temp + 1);
}
}
}
@ -976,11 +977,11 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
newLen = res[rLen];
if(newLen == 0)
{
cudaFree(temp);
cudaFree(dcons);
cudaFree(d_S);
hipFree(temp);
hipFree(dcons);
hipFree(d_S);
if(posS != NULL)
cudaFree(posS);
hipFree(posS);
return 0;
}
@ -991,41 +992,41 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
reservar(&d_R, sizem32);
reservar(&posR, sizem32);
cudaMemsetAsync(d_R + newLen, 0x7f, sizextra);
cudaMemsetAsync(posR + newLen, 0x7f, sizextra);
llenar<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0], temp, posR);
hipMemsetAsync(d_R + newLen, 0x7f, sizextra);
hipMemsetAsync(posR + newLen, 0x7f, sizextra);
hipLaunchKernel(HIP_KERNEL_NAME(llenar), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0], temp, posR);
rLen = newLen;
}
else
{
sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32);
cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
hipMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0]);
}
}
else
{
sizem32 = m32rLen * sizeof(int);
reservar(&d_R, sizem32);
cudaMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
llenarnosel<<<blockllen, numthreads>>>(p1, d_R, rLen, of1, wherej[0]);
hipMemsetAsync(d_R + rLen, 0x7f, extraspace * sizeof(int));
hipLaunchKernel(HIP_KERNEL_NAME(llenarnosel), dim3(blockllen), dim3(numthreads), 0, 0, p1, d_R, rLen, of1, wherej[0]);
}
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&time, start, stop);
//cout << "Select2 = " << time << endl;
cuda_stats.select2_time += time;
#endif
#ifdef TIMER
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
hipEventDestroy(start);
hipEventDestroy(stop);
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start, 0);
#endif
thrust::device_ptr<Record> dvp1;
@@ -1084,17 +1085,17 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
}
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&time, start, stop);
//cout << "Sort = " << time << endl;
cuda_stats.sort_time += time;
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
hipEventDestroy(start);
hipEventDestroy(stop);
hipEventCreate(&start);
hipEventCreate(&stop);
hipEventRecord(start, 0);
#endif
IDataNode* d_data;
@@ -1123,7 +1124,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
dim3 Dbc(THRD_PER_BLCK_create, 1, 1);
dim3 Dgc(BLCK_PER_GRID_create, 1, 1);
gCreateIndex <<<Dgc, Dbc>>> (d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);
hipLaunchKernel(HIP_KERNEL_NAME(gCreateIndex), dim3(Dgc), dim3(Dbc), 0, 0, d_data, d_dir, nDirNodes, tree_size, bottom_start, nNodesPerBlock);
int *d_locations;
int memSizeR;
@@ -1132,7 +1133,7 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
{
memSizeR = (rLen + 1) * sizeof(int);
reservar(&d_locations, memSizeR);
cudaMemsetAsync(d_locations, 0, sizeof(int));
hipMemsetAsync(d_locations, 0, sizeof(int));
nSearchKeys = rLen;
}
else
@@ -1146,13 +1147,13 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
unsigned int nKeysPerThread = uintCeilingDiv(nSearchKeys, THRD_PER_GRID_search);
if(negative)
{
gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_R, d_locations + 1, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
cudaMemsetAsync(temp, 0, memSizeR);
hipLaunchKernel(HIP_KERNEL_NAME(gSearchTree), dim3(Dgs), dim3(Dbs), 0, 0, d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_R, d_locations + 1, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
hipMemsetAsync(temp, 0, memSizeR);
}
else
{
gSearchTree <<<Dgs, Dbs>>> (d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
cudaMemsetAsync(temp, 0, memSizeS);
hipLaunchKernel(HIP_KERNEL_NAME(gSearchTree), dim3(Dgs), dim3(Dbs), 0, 0, d_data, nDataNodes, d_dir, nDirNodes, lvlDir, d_S, d_locations, nSearchKeys, nKeysPerThread, tree_size, bottom_start);
hipMemsetAsync(temp, 0, memSizeS);
}
int muljoin = 0, muljoinsize = 0, sum;
@@ -1165,8 +1166,8 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
{
muljoin = numj - 2;
muljoinsize = muljoin * sizeof(int);
cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
gIndexMultiJoinNegative<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations + 1, rLen, p1, p2, of1, of2, posR, posS, dcons, muljoin);
hipMemcpy(dcons, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(gIndexMultiJoinNegative), dim3(blockllen), dim3(numthreads), muljoinsize, 0, d_R, d_S, d_locations + 1, rLen, p1, p2, of1, of2, posR, posS, dcons, muljoin);
}
res = thrust::device_pointer_cast(d_locations);
@@ -1177,21 +1178,21 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
if(pos == (rule->num_rows - 3))
{
sizepro = rule->num_columns * sizeof(int);
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
resSize = sum * sizepro;
reservar(&d_Rout, resSize);
gJoinWithWriteNegative2<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, rule->num_columns, posR);
hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWriteNegative2), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, rLen, d_Rout, p1, of1, dcons, rule->num_columns, posR);
}
else
{
sizepro = projp.x * sizeof(int);
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
resSize = sum * sizepro;
reservar(&d_Rout, resSize);
gJoinWithWriteNegative<<<blockllen, numthreads, sizepro>>> (d_locations, rLen, d_Rout, p1, of1, dcons, projp.x, posR);
hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWriteNegative), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, rLen, d_Rout, p1, of1, dcons, projp.x, posR);
}
cudaFree(d_R);
cudaFree(d_S);
hipFree(d_R);
hipFree(d_S);
}
else
{
@@ -1200,26 +1201,26 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
{
muljoin = numj - 2;
muljoinsize = muljoin * sizeof(int);
cudaMemcpy(dcons, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
gIndexMultiJoin<<<blockllen, numthreads, muljoinsize>>> (d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
hipMemcpy(dcons, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(gIndexMultiJoin), dim3(blockllen), dim3(numthreads), muljoinsize, 0, d_R, d_S, d_locations, sLen, temp, p1, p2, of1, of2, posR, posS, dcons, muljoin);
}
else
gIndexJoin<<<blockllen, numthreads>>> (d_R, d_S, d_locations, sLen, temp);
cudaFree(d_R);
cudaFree(d_S);
hipLaunchKernel(HIP_KERNEL_NAME(gIndexJoin), dim3(blockllen), dim3(numthreads), 0, 0, d_R, d_S, d_locations, sLen, temp);
hipFree(d_R);
hipFree(d_S);
sum = res[sLen-1];
thrust::exclusive_scan(res, res + sLen, res);
sum += res[sLen-1];
if(sum == 0)
{
cudaFree(dcons);
cudaFree(d_locations);
cudaFree(temp);
hipFree(dcons);
hipFree(d_locations);
hipFree(temp);
if(posS != NULL)
cudaFree(posS);
hipFree(posS);
if(posR != NULL)
cudaFree(posR);
hipFree(posR);
return 0;
}
res[sLen] = sum;
@@ -1227,49 +1228,49 @@ int join(int *p1, int *p2, int rLen, int sLen, int of1, int of2, list<rulenode>:
if(pos == (rule->num_rows - 3))
{
sizepro = rule->num_columns * sizeof(int);
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
resSize = sum * sizepro;
reservar(&d_Rout, resSize);
if(numj > 2)
{
cudaMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
multiJoinWithWrite2<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
hipMemcpy(dcons + rule->num_columns, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(multiJoinWithWrite2), dim3(blockllen), dim3(numthreads), sizepro + muljoinsize, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS, muljoin);
}
else
gJoinWithWrite2<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWrite2), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, rule->num_columns, posR, posS);
}
else
{
sizepro = projp.y * sizeof(int);
cudaMemcpy(dcons, proj, sizepro, cudaMemcpyHostToDevice);
hipMemcpy(dcons, proj, sizepro, hipMemcpyHostToDevice);
resSize = sum * sizepro;
reservar(&d_Rout, resSize);
if(numj > 2)
{
cudaMemcpy(dcons + projp.y, wherej + 2, muljoinsize, cudaMemcpyHostToDevice);
multiJoinWithWrite<<<blockllen, numthreads, sizepro + muljoinsize>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
hipMemcpy(dcons + projp.y, wherej + 2, muljoinsize, hipMemcpyHostToDevice);
hipLaunchKernel(HIP_KERNEL_NAME(multiJoinWithWrite), dim3(blockllen), dim3(numthreads), sizepro + muljoinsize, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS, muljoin);
}
else
gJoinWithWrite<<<blockllen, numthreads, sizepro>>> (d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
hipLaunchKernel(HIP_KERNEL_NAME(gJoinWithWrite), dim3(blockllen), dim3(numthreads), sizepro, 0, d_locations, sLen, temp, d_Rout, p1, p2, of1, of2, dcons, projp.x, projp.y, posR, posS);
}
}
cudaFree(dcons);
cudaFree(d_locations);
cudaFree(temp);
hipFree(dcons);
hipFree(d_locations);
hipFree(temp);
if(posS != NULL)
cudaFree(posS);
hipFree(posS);
if(posR != NULL)
cudaFree(posR);
hipFree(posR);
if(*ret != NULL)
cudaFree(*ret);
hipFree(*ret);
*ret = d_Rout;
#ifdef TIMER
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&time, start, stop);
//cout << "Join = " << time << endl;
//cout << "FIN" << endl;
cuda_stats.join_time += time;
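Note: every launch rewrite in the hunks above follows the same mechanical mapping, from the CUDA triple-chevron form kernel<<<grid, block, shared, stream>>>(args...) to hipLaunchKernel(HIP_KERNEL_NAME(kernel), dim3(grid), dim3(block), shared, stream, args...). A minimal sketch of that mapping, using a hypothetical scaleExample kernel that is not part of this package:

#include "hip/hip_runtime.h"

/* Toy kernel for illustration only; it is not part of this commit. */
__global__ void scaleExample(int *v, int rows)
{
	int id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
	if(id < rows)
		v[id] *= 2;
}

/* Host-side launch showing the translated form.
   CUDA form replaced by this commit:
       scaleExample<<<blocks, threads>>>(d_v, rows);
   HIP form introduced by this commit: */
void launchScaleExample(int *d_v, int rows, int blocks, int threads)
{
	hipLaunchKernel(HIP_KERNEL_NAME(scaleExample), dim3(blocks), dim3(threads), 0, 0, d_v, rows);
}

The sketch mirrors the hipLaunchKernel/HIP_KERNEL_NAME macro exactly as it is used throughout this diff; later HIP releases expose the same launch as hipLaunchKernelGGL.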

80
packages/cuda/union2.cu Executable file → Normal file
View File

@@ -87,8 +87,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -122,8 +122,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -157,8 +157,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -192,8 +192,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -227,8 +227,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -262,8 +262,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -297,8 +297,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -332,8 +332,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -367,8 +367,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -402,8 +402,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -437,8 +437,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -472,8 +472,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -507,8 +507,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -542,8 +542,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -577,8 +577,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -612,8 +612,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -647,8 +647,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -682,8 +682,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -717,8 +717,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
@@ -752,8 +752,8 @@ int unir(int *res, int rows, int tipo, int **ret, int final)
{
size = nrows * tipo * sizeof(int);
reservar(&nres, size);
cudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);
cudaFree(*ret);
hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);
hipFree(*ret);
*ret = nres;
}
return nrows;
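Note: the union2.cu hunks above all repeat one pattern: allocate a fresh device buffer, copy the compacted result device-to-device, free the previous result, and swap the pointer, with cudaMemcpyAsync and cudaFree replaced by their HIP counterparts. A hedged sketch of that pattern as a standalone helper; the helper name is illustrative, and plain hipMalloc stands in for the package's reservar() wrapper used in the diff:

#include "hip/hip_runtime.h"

/* Illustrative helper, not part of this commit: replaces *ret with a
   tight copy of the first nrows tuples (tipo columns each) of res. */
static void replaceResult(int *res, int nrows, int tipo, int **ret)
{
	size_t size = (size_t)nrows * tipo * sizeof(int);
	int *nres;
	hipMalloc((void **)&nres, size);                           /* the commit calls reservar() here */
	hipMemcpyAsync(nres, res, size, hipMemcpyDeviceToDevice);  /* was cudaMemcpyAsync */
	hipFree(*ret);                                             /* was cudaFree */
	*ret = nres;
}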

0
packages/cuda/union2.h Executable file → Normal file
View File

0
packages/cuda/unioncpu2.cpp Executable file → Normal file
View File