This commit is contained in:
Vitor Santos Costa
2016-08-01 21:45:42 -05:00
72 changed files with 8053 additions and 54 deletions

30
packages/cuda/old/CC_CSSTree.cu Executable file
View File

@@ -0,0 +1,30 @@
#include "CC_CSSTree.h"
//return the start position of searching the key.
int CC_CSSTree::search(int key)
{
int i=0;
int curIndex=0;
int curNode=0;
int j=0;
//search
for(i=0;i<level;i++)
{
for(j=0;j<blockSize;j++)
{
if(ntree[curIndex+j]==-1)
break;
if(key<=ntree[curIndex+j])
break;
}
curNode=(fanout*(curNode)+j+1);
curIndex=curNode*blockSize;
//#ifdef DEBUG
// cout<<curNode<<", "<<j<<", "<<ntree[curIndex]<<"; ";
//#endif
}
curIndex=(curNode-numNode)*blockSize;
if(curIndex>numRecord) curIndex=numRecord-1;
//cout<<"I: "<<curIndex<<", ";//cout<<endl;
return curIndex;
}

146
packages/cuda/old/CC_CSSTree.h Executable file
View File

@@ -0,0 +1,146 @@
#ifndef CSSTREE_H
#define CSSTREE_H
#include <iostream>
#include <math.h>
using namespace std;
#define divRoundUp(n,s) (((n) / (s)) + ((((n) % (s)) > 0) ? 1 : 0))
#define CSS_TREE_FANOUT 33
//we use implicit pointer to perform the addressing.
typedef int Record;

// Abstract base for the search trees in this file: it stores the record
// array handed in by the caller plus the bookkeeping fields that concrete
// trees fill in, and declares the search() entry point they must implement.
class CC_GenericTree
{
public:
	int numRecord;   // number of entries in data
	Record *data;    // caller-supplied records; never freed by this class
	//we use the BFS layout as the default layout.
	int numNode;     // set by the derived tree; 0 until then
	int level;       // set by the derived tree; 0 until then
	int gResult;

	// Every member is given a defined value so that an object that has not
	// been populated yet never exposes indeterminate fields.
	CC_GenericTree()
		: numRecord(0), data(0), numNode(0), level(0), gResult(0)
	{
	}

	//we assume that numR=2^i. Otherwise, we pad the array with -1 from the beginning.
	//we also assume that the record are sorted by the key.
	// d:    record array (stored as-is, not copied)
	// numR: number of records in d
	CC_GenericTree(Record *d, int numR)
		: numRecord(numR), data(d), numNode(0), level(0), gResult(0)
	{
	}

	virtual ~CC_GenericTree()
	{
	}

	// Implemented by concrete trees; takes the key to look up.
	virtual int search(int key)=0;
};
// Directory built on top of a sorted record array. The nodes are stored level
// by level in ntree; every node holds blockSize (= fanout - 1) keys and the
// children of a node are found by index arithmetic instead of stored pointers.
//
// NOTE(review): the class owns three arrays released in the destructor but
// declares no copy constructor / assignment operator, so copying an instance
// would release them twice.
class CC_CSSTree:public CC_GenericTree
{
public:
	int *ntree;      // numKey directory keys; -1 marks a slot that holds no key
	int fanout;      // children per node
	int blockSize;   // keys per node (fanout - 1)
	int *vStart;     // `level` ints, allocated here; not written by the code in this class
	int *vG;//vG[0] is used in computing the position for level 1.
	int numKey;      // numNode * blockSize

	// Builds the directory over the numR sorted records in d.
	// d:    record array (kept by pointer, not copied)
	// numR: number of records
	// f:    fanout; it has to be at least 2 because f - 1 is used as a divisor
	CC_CSSTree(Record *d, int numR, int f):CC_GenericTree(d,numR)
	{
		fanout=f;
		blockSize=fanout-1;

		// Number of directory levels needed to reach every block of records.
		int numLeaf=divRoundUp(numR,blockSize);
		level=1;
		int temp=numLeaf;
		while(temp>1)
		{
			temp=divRoundUp(temp, fanout);
			level++;
		}

		// Nodes of a complete tree: 1 + fanout + ... + fanout^(level-1).
		// Summed with integers so that no floating-point rounding is involved.
		numNode=0;
		int width=1;
		for(int l=0;l<level;l++)
		{
			numNode+=width;
			width*=fanout;
		}
		numKey=numNode*blockSize;

		ntree=new int[numKey];
		vStart=new int[level];
		vG=new int[level];
#ifdef DEBUG
		std::cout<<numLeaf<<","<<level<<", "<<numNode<<std::endl;
#endif

		// search() stops at -1 entries, so every key slot (numKey of them,
		// not just the first numNode) has to start out as -1.
		for(int s=0;s<numKey;s++)
			ntree[s]=-1;

		// With no records there is nothing to copy; the directory stays empty.
		if(numRecord<=0)
			return;

		// Records covered by one key step on the current level, divided by
		// blockSize: fanout^(level-1) at the root, 1 on the last level.
		long long stride=1;
		for(int l=1;l<level;l++)
			stride*=fanout;

		int startNode=0;   // first node index of the current level
		width=1;           // number of nodes on the current level
		for(int i=0;i<level;i++)//level
		{
			const int endNode=startNode+width;
			for(int j=startNode;j<endNode;j++)//which node
			{
				const int startKey=j*blockSize;
				const int endKey=startKey+blockSize;
				for(int k=startKey;k<endKey;k++)
				{
					// Position of the record whose value becomes key k. It is
					// range-checked in 64 bits before being used as an index.
					long long pos=(long long)blockSize*stride*(k+1-startNode*blockSize+(j-startNode))-1;
					if(pos>=(long long)numRecord+blockSize)
						break;   // the remaining keys of this node stay -1
					if(pos>=numRecord)
						pos=numRecord-1;   // overshoot into the last partial block
					ntree[k]=data[pos];
				}
			}
			startNode=endNode;
			width*=fanout;
			stride/=fanout;
		}
	}

	~CC_CSSTree()
	{
		delete [] ntree;
		delete [] vStart;
		delete [] vG;
	}

	// Returns the position in the record array at which to start looking for key.
	virtual int search(int key);

	// Dumps the directory level by level, then the records with a "*" after
	// every blockSize-th one.
	void print()
	{
		int startNode=0;
		int width=1;
		for(int i=0;i<level;i++)//level
		{
			std::cout<<"Level, "<<i<<std::endl;
			const int endNode=startNode+width;
			for(int j=startNode;j<endNode;j++)//which node
			{
				std::cout<<"Level, "<<i<<", Node, "<<j<<": ";
				const int startKey=j*blockSize;
				const int endKey=startKey+blockSize;
				for(int k=startKey;k<endKey;k++)
				{
					std::cout<<ntree[k]<<", ";
				}
				std::cout<<std::endl;
			}
			startNode=endNode;
			width*=fanout;
		}
		for(int i=0;i<numRecord;i++)
		{
			std::cout<<data[i]<<", ";
			if(i%(fanout-1)==(fanout-2))
				std::cout<<"*"<<std::endl;
		}
	}
};
#endif

460
packages/cuda/old/bpreds.cu Executable file
View File

@@ -0,0 +1,460 @@
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cstdarg>
#include "pred.h"
/*Returns the largest of the `count` int arguments that follow. The running maximum
  starts at 0, so the result is never negative and is 0 when no values are given.*/
int maximo(int count, ...)
{
	va_list args;
	int best = 0;
	int remaining = count;

	va_start(args, count);
	while(remaining-- > 0)
	{
		const int candidate = va_arg(args, int);
		best = (candidate > best) ? candidate : best;
	}
	va_end(args);

	return best;
}
/*Marks the rows that satisfy every comparison encoded in cons.
  cons is read as consecutive triples (operator, operand1, operand2):
    - triples in [nx, numc): the operator is stored with BPOFFSET added. A negative operand v
      selects dop1[id * of1 - v - 1]; a non-negative one selects dop2[id * of2 + v].
    - triples in [0, nx): a negative operand v stands for the constant -v; a non-negative one
      selects dop2[id * of2 + v].
  A row that fails a comparison is left untouched. A row that passes the whole first group gets
  res2[id] = 1 (only when res2 is not NULL); if it also passes the second group it gets res[id] = 1.
  Launch: 1D grid with one thread per row, and dynamic shared memory large enough for the numc
  ints of cons.*/
__global__ void bpreds(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int x, rowact, rowact1, op1, op2;

	/*Block-stride copy: every entry of cons is staged even when numc is larger than the
	  number of threads in the block.*/
	for(x = threadIdx.x; x < numc; x += blockDim.x)
		shared[x] = cons[x];
	__syncthreads();

	if(id >= rows)
		return;

	rowact1 = id * of1;
	rowact = id * of2;

	/*First group: operands may come from either table.*/
	for(x = nx; x < numc; x += 3)
	{
		op1 = shared[x+1];
		if(op1 < 0)
			op1 = dop1[rowact1 - op1 - 1];
		else
			op1 = dop2[rowact + op1];
		op2 = shared[x+2];
		if(op2 < 0)
			op2 = dop1[rowact1 - op2 - 1];
		else
			op2 = dop2[rowact + op2];
		switch(shared[x] - BPOFFSET)
		{
			case SBG_EQ:	if(op1 != op2)
						return;
					break;
			case SBG_GT:	if(op1 <= op2)
						return;
					break;
			case SBG_LT:	if(op1 >= op2)
						return;
					break;
			case SBG_GE:	if(op1 < op2)
						return;
					break;
			case SBG_LE:	if(op1 > op2)
						return;
					break;
			case SBG_DF:	if(op1 == op2)
						return;
		}
	}
	if(res2 != NULL)
		res2[id] = 1;

	/*Second group: operands are constants or columns of dop2.*/
	for(x = 0; x < nx; x += 3)
	{
		op1 = shared[x+1];
		if(op1 < 0)
			op1 *= -1;
		else
			op1 = dop2[rowact + op1];
		op2 = shared[x+2];
		if(op2 < 0)
			op2 *= -1;
		else
			op2 = dop2[rowact + op2];
		switch(shared[x])
		{
			case SBG_EQ:	if(op1 != op2)
						return;
					break;
			case SBG_GT:	if(op1 <= op2)
						return;
					break;
			case SBG_LT:	if(op1 >= op2)
						return;
					break;
			case SBG_GE:	if(op1 < op2)
						return;
					break;
			case SBG_LE:	if(op1 > op2)
						return;
					break;
			case SBG_DF:	if(op1 == op2)
						return;
		}
	}
	res[id] = 1;
}
/*Mark all rows that comply with the comparison predicates.
  cons is read as numc ints forming triples (operator, operand1, operand2). A negative operand v
  stands for the constant -v; a non-negative one selects dop1[id * of1 + v]. res[id] is set to 1
  only when every comparison holds; otherwise it is not written.
  Launch: 1D grid with one thread per row, and dynamic shared memory large enough for the numc
  ints of cons.*/
__global__ void bpredsnormal2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int x, rowact, op1, op2;

	/*Block-stride copy: every entry of cons is staged even when numc is larger than the
	  number of threads in the block.*/
	for(x = threadIdx.x; x < numc; x += blockDim.x)
		shared[x] = cons[x];
	__syncthreads();

	if(id >= rows)
		return;

	rowact = id * of1;
	for(x = 0; x < numc; x += 3)
	{
		op1 = shared[x+1];
		if(op1 < 0)
			op1 *= -1;
		else
			op1 = dop1[rowact + op1];
		op2 = shared[x+2];
		if(op2 < 0)
			op2 *= -1;
		else
			op2 = dop1[rowact + op2];
		switch(shared[x])
		{
			case SBG_EQ:	if(op1 != op2)
						return;
					break;
			case SBG_GT:	if(op1 <= op2)
						return;
					break;
			case SBG_LT:	if(op1 >= op2)
						return;
					break;
			case SBG_GE:	if(op1 < op2)
						return;
					break;
			case SBG_LE:	if(op1 > op2)
						return;
					break;
			case SBG_DF:	if(op1 == op2)
						return;
		}
	}
	res[id] = 1;
}
/*Unmark all rows that do not comply with the comparison predicates.
  Rows whose res[id] is already 0 are skipped. For the others, cons is read as numc ints forming
  triples (operator, operand1, operand2). A negative operand v stands for the constant -v; a
  non-negative one selects dop1[id * of1 + v]. The first comparison that fails sets res[id] to 0.
  Launch: 1D grid with one thread per row, and dynamic shared memory large enough for the numc
  ints of cons.*/
__global__ void bpredsnormal(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int x, rowact, op1, op2;

	/*Block-stride copy: every entry of cons is staged even when numc is larger than the
	  number of threads in the block.*/
	for(x = threadIdx.x; x < numc; x += blockDim.x)
		shared[x] = cons[x];
	__syncthreads();

	if(id >= rows)
		return;
	if(res[id] == 0)
		return;

	rowact = id * of1;
	for(x = 0; x < numc; x += 3)
	{
		op1 = shared[x+1];
		if(op1 < 0)
			op1 *= -1;
		else
			op1 = dop1[rowact + op1];
		op2 = shared[x+2];
		if(op2 < 0)
			op2 *= -1;
		else
			op2 = dop1[rowact + op2];
		switch(shared[x])
		{
			case SBG_EQ:	if(op1 != op2)
					{
						res[id] = 0;
						return;
					}
					break;
			case SBG_GT:	if(op1 <= op2)
					{
						res[id] = 0;
						return;
					}
					break;
			case SBG_LT:	if(op1 >= op2)
					{
						res[id] = 0;
						return;
					}
					break;
			case SBG_GE:	if(op1 < op2)
					{
						res[id] = 0;
						return;
					}
					break;
			case SBG_LE:	if(op1 > op2)
					{
						res[id] = 0;
						return;
					}
					break;
			case SBG_DF:	if(op1 == op2)
					{
						res[id] = 0;
						return;
					}
		}
	}
}
/*Disjunctive counterpart of bpreds: a row is marked as soon as one comparison of a group holds.
  cons is read as consecutive triples (operator, operand1, operand2):
    - triples in [nx, numc): the operator is stored with BPOFFSET added. A negative operand v
      selects dop1[id * of1 - v - 1]; a non-negative one selects dop2[id * of2 + v]. The first
      comparison that holds sets res2[id] = 1 and ends this group.
    - triples in [0, nx): a negative operand v stands for the constant -v; a non-negative one
      selects dop2[id * of2 + v]. The first comparison that holds sets res[id] = 1.
  The second group is evaluated whatever the outcome of the first one.
  NOTE(review): unlike bpreds, res2 is written without a NULL check -- confirm callers always
  supply it when nx < numc.
  Launch: 1D grid with one thread per row, and dynamic shared memory large enough for the numc
  ints of cons.*/
__global__ void bpredsOR(int *dop1, int *dop2, int rows, int of1, int of2, int *cons, int numc, int nx, int *res, int *res2)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int x, rowact, rowact1, op1, op2;

	/*Block-stride copy: every entry of cons is staged even when numc is larger than the
	  number of threads in the block.*/
	for(x = threadIdx.x; x < numc; x += blockDim.x)
		shared[x] = cons[x];
	__syncthreads();

	if(id >= rows)
		return;

	rowact1 = id * of1;
	rowact = id * of2;

	/*First group. Setting x to numc is how the loop is left from inside the switch.*/
	for(x = nx; x < numc; x += 3)
	{
		op1 = shared[x+1];
		if(op1 < 0)
			op1 = dop1[rowact1 - op1 - 1];
		else
			op1 = dop2[rowact + op1];
		op2 = shared[x+2];
		if(op2 < 0)
			op2 = dop1[rowact1 - op2 - 1];
		else
			op2 = dop2[rowact + op2];
		switch(shared[x] - BPOFFSET)
		{
			case SBG_EQ:	if(op1 == op2)
					{
						res2[id] = 1;
						x = numc;
					}
					break;
			case SBG_GT:	if(op1 > op2)
					{
						res2[id] = 1;
						x = numc;
					}
					break;
			case SBG_LT:	if(op1 < op2)
					{
						res2[id] = 1;
						x = numc;
					}
					break;
			case SBG_GE:	if(op1 >= op2)
					{
						res2[id] = 1;
						x = numc;
					}
					break;
			case SBG_LE:	if(op1 <= op2)
					{
						res2[id] = 1;
						x = numc;
					}
					break;
			case SBG_DF:	if(op1 != op2)
					{
						res2[id] = 1;
						x = numc;
					}
		}
	}

	/*Second group: operands are constants or columns of dop2.*/
	for(x = 0; x < nx; x += 3)
	{
		op1 = shared[x+1];
		if(op1 < 0)
			op1 *= -1;
		else
			op1 = dop2[rowact + op1];
		op2 = shared[x+2];
		if(op2 < 0)
			op2 *= -1;
		else
			op2 = dop2[rowact + op2];
		switch(shared[x])
		{
			case SBG_EQ:	if(op1 == op2)
					{
						res[id] = 1;
						return;
					}
					break;
			case SBG_GT:	if(op1 > op2)
					{
						res[id] = 1;
						return;
					}
					break;
			case SBG_LT:	if(op1 < op2)
					{
						res[id] = 1;
						return;
					}
					break;
			case SBG_GE:	if(op1 >= op2)
					{
						res[id] = 1;
						return;
					}
					break;
			case SBG_LE:	if(op1 <= op2)
					{
						res[id] = 1;
						return;
					}
					break;
			case SBG_DF:	if(op1 != op2)
					{
						res[id] = 1;
						return;
					}
		}
	}
}
/*Mark all rows that comply with the comparison predicates using disjunctions (i.e. a row is
  marked if it complies with at least one predicate).
  cons is read as numc ints forming triples (operator, operand1, operand2). A negative operand v
  stands for the constant -v; a non-negative one selects dop1[id * of1 + v]. The first comparison
  that holds sets res[id] = 1; when none holds res[id] is not written.
  Launch: 1D grid with one thread per row, and dynamic shared memory large enough for the numc
  ints of cons.*/
__global__ void bpredsorlogic2(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int x, rowact, op1, op2;

	/*Block-stride copy: every entry of cons is staged even when numc is larger than the
	  number of threads in the block.*/
	for(x = threadIdx.x; x < numc; x += blockDim.x)
		shared[x] = cons[x];
	__syncthreads();

	if(id >= rows)
		return;

	rowact = id * of1;
	for(x = 0; x < numc; x += 3)
	{
		op1 = shared[x+1];
		if(op1 < 0)
			op1 *= -1;
		else
			op1 = dop1[rowact + op1];
		op2 = shared[x+2];
		if(op2 < 0)
			op2 *= -1;
		else
			op2 = dop1[rowact + op2];
		switch(shared[x])
		{
			case SBG_EQ:	if(op1 == op2)
					{
						res[id] = 1;
						return;
					}
					break;
			case SBG_GT:	if(op1 > op2)
					{
						res[id] = 1;
						return;
					}
					break;
			case SBG_LT:	if(op1 < op2)
					{
						res[id] = 1;
						return;
					}
					break;
			case SBG_GE:	if(op1 >= op2)
					{
						res[id] = 1;
						return;
					}
					break;
			case SBG_LE:	if(op1 <= op2)
					{
						res[id] = 1;
						return;
					}
					break;
			case SBG_DF:	if(op1 != op2)
					{
						res[id] = 1;
						return;
					}
		}
	}
}
/*Unmark all rows that do not comply with the comparison predicates using disjunctions (i.e. a
  row is unmarked only if it complies with none of the predicates).
  Rows whose res[id] is already 0 are skipped. For the others, cons is read as numc ints forming
  triples (operator, operand1, operand2). A negative operand v stands for the constant -v; a
  non-negative one selects dop1[id * of1 + v]. The row keeps its mark as soon as one comparison
  holds; if none does, res[id] is set to 0.
  Launch: 1D grid with one thread per row, and dynamic shared memory large enough for the numc
  ints of cons.*/
__global__ void bpredsorlogic(int *dop1, int rows, int of1, int *cons, int numc, int *res)
{
	extern __shared__ int shared[];
	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int x, rowact, op1, op2;

	/*Block-stride copy: every entry of cons is staged even when numc is larger than the
	  number of threads in the block.*/
	for(x = threadIdx.x; x < numc; x += blockDim.x)
		shared[x] = cons[x];
	__syncthreads();

	if(id >= rows)
		return;
	if(res[id] == 0)
		return;

	rowact = id * of1;
	for(x = 0; x < numc; x += 3)
	{
		op1 = shared[x+1];
		if(op1 < 0)
			op1 *= -1;
		else
			op1 = dop1[rowact + op1];
		op2 = shared[x+2];
		if(op2 < 0)
			op2 *= -1;
		else
			op2 = dop1[rowact + op2];
		switch(shared[x])
		{
			case SBG_EQ:	if(op1 == op2)
						return;
					break;
			case SBG_GT:	if(op1 > op2)
						return;
					break;
			case SBG_LT:	if(op1 < op2)
						return;
					break;
			case SBG_GE:	if(op1 >= op2)
						return;
					break;
			case SBG_LE:	if(op1 <= op2)
						return;
					break;
			case SBG_DF:	if(op1 != op2)
						return;
		}
	}
	res[id] = 0;
}

12
packages/cuda/old/bpreds.h Executable file
View File

@@ -0,0 +1,12 @@
/*Declarations of the comparison-predicate kernels and the host helper defined in bpreds.cu.*/
#ifndef _BPREDS_H_
#define _BPREDS_H_
/*Host helper: largest of the ints that follow the count argument (never below 0).*/
int maximo(int, ...);
/*Conjunctive check over two tables: marks rows that pass every predicate.*/
__global__ void bpreds(int*, int*, int, int, int, int*, int, int, int*, int*);
/*Conjunctive check over one table: clears the mark of rows that fail a predicate.*/
__global__ void bpredsnormal(int*, int, int, int*, int, int*);
/*Conjunctive check over one table: sets the mark of rows that pass every predicate.*/
__global__ void bpredsnormal2(int*, int, int, int*, int, int*);
/*Disjunctive check over two tables: marks rows that pass at least one predicate of a group.*/
__global__ void bpredsOR(int*, int*, int, int, int, int*, int, int, int*, int*);
/*Disjunctive check over one table: clears the mark of rows that pass none of the predicates.*/
__global__ void bpredsorlogic(int*, int, int, int*, int, int*);
/*Disjunctive check over one table: sets the mark of rows that pass at least one predicate.*/
__global__ void bpredsorlogic2(int*, int, int, int*, int, int*);
#endif

197
packages/cuda/old/creator2.c Executable file
View File

@@ -0,0 +1,197 @@
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
/*Program used to generate union2.cu and union2.h. A new pointer and all its operations are generated for each set (pairs, triplets, etc.).
Arguments are the cardinality of the biggest set and the name of the cuda file. For example, executing "creator2 20 union2" will generate
all pointers and operations for all sets from 1 to 20 in the files union2.cu and union2.h.*/

/*Writes name to cuda with every character upper-cased (used to build the include guard).*/
static void creator2_write_upper(FILE *cuda, const char *name)
{
	size_t i;

	/*toupper requires a value representable as unsigned char, hence the cast.*/
	for(i = 0; name[i] != '\0'; i++)
		fprintf(cuda, "%c", toupper((unsigned char)name[i]));
}

/*Emits the body of <name>.cu: the unir() function with one switch case per set size 1..num.*/
static void creator2_write_source(FILE *cuda, const char *name, int num)
{
	int x;

	fprintf(cuda, "/*Computer generated file to remove duplicates. Since Thrust's unique and sort, unlike their std's counterparts, don't have a way to specify the size of each element in\n");
	fprintf(cuda, "the array, comparing pairs, triplets and other sets is not possible without defining a new pointer and all related operations for each set. If you have a better idea to do\n");
	fprintf(cuda, "this, please don't hesitate to email us.*/\n\n");
	fprintf(cuda, "#include <thrust/device_vector.h>\n");
	fprintf(cuda, "#include <thrust/unique.h>\n");
	fprintf(cuda, "#include <thrust/distance.h>\n");
	fprintf(cuda, "#include <thrust/sort.h>\n");
	fprintf(cuda, "#include <iostream>\n");
	fprintf(cuda, "#include \"memory.h\"\n");
	fprintf(cuda, "#include \"%s.h\"\n\n", name);
	fprintf(cuda, "int unir(int *res, int rows, int tipo, int **ret, int final)\n");
	fprintf(cuda, "{\n");
	fprintf(cuda, "\tthrust::device_ptr<int> pt, re;\n");
	for(x = 2; x <= num; x++)
		fprintf(cuda, "\tthrust::device_ptr<s%d> pt%d, re%d;\n", x, x ,x);
	for(x = 2; x <= num; x++)
		fprintf(cuda, "\ts%d *t%d;\n", x, x);
	fprintf(cuda, "\tint flag, nrows, *nres, size;\n\n");
	fprintf(cuda, "#if TIMER\n");
	fprintf(cuda, "\tcuda_stats.unions++;\n");
	fprintf(cuda, "#endif\n\n");
	fprintf(cuda, "\tswitch(tipo)\n");
	fprintf(cuda, "\t{\n");

	/*Case 1 works directly on ints and needs no generated struct.*/
	fprintf(cuda, "\t\tcase 1:\n");
	fprintf(cuda, "\t\t{\n");
	fprintf(cuda, "\t\t\tpt = thrust::device_pointer_cast(res);\n");
	fprintf(cuda, "\t\t\tflag = 0;\n");
	fprintf(cuda, "\t\t\twhile(flag != 1)\n");
	fprintf(cuda, "\t\t\t{\n");
	fprintf(cuda, "\t\t\t\ttry\n");
	fprintf(cuda, "\t\t\t\t{\n");
	fprintf(cuda, "\t\t\t\t\tthrust::sort(pt, pt + rows);\n");
	fprintf(cuda, "\t\t\t\t\tif(final)\n");
	fprintf(cuda, "\t\t\t\t\t{\n");
	fprintf(cuda, "\t\t\t\t\t\tre = thrust::unique(pt, pt + rows, q1());\n");
	fprintf(cuda, "\t\t\t\t\t\tre = thrust::unique(pt, re);\n");
	fprintf(cuda, "\t\t\t\t\t}\n");
	fprintf(cuda, "\t\t\t\t\telse\n");
	fprintf(cuda, "\t\t\t\t\t\tre = thrust::unique(pt, pt + rows);\n");
	fprintf(cuda, "\t\t\t\t\tflag = 1;\n");
	fprintf(cuda, "\t\t\t\t}\n");
	fprintf(cuda, "\t\t\t\tcatch(std::bad_alloc &e)\n");
	fprintf(cuda, "\t\t\t\t{\n");
	fprintf(cuda, "\t\t\t\t\tlimpiar(\"sort/unique in unir\", 0);\n");
	fprintf(cuda, "\t\t\t\t}\n");
	fprintf(cuda, "\t\t\t}\n");
	fprintf(cuda, "\t\t\tnrows = thrust::distance(pt, re);\n");
	fprintf(cuda, "\t\t\tif(nrows < rows / 2)\n");
	fprintf(cuda, "\t\t\t{\n");
	fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
	fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
	fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
	fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
	fprintf(cuda, "\t\t\t\t*ret = nres;\n");
	fprintf(cuda, "\t\t\t}\n");
	fprintf(cuda, "\t\t\treturn nrows;\n");
	fprintf(cuda, "\t\t}\n");

	/*Cases 2..num reinterpret the buffer as an array of the generated s<x> structs.*/
	for(x = 2; x <= num; x++)
	{
		fprintf(cuda, "\t\tcase %d:\n", x);
		fprintf(cuda, "\t\t{\n");
		fprintf(cuda, "\t\t\tt%d = (s%d*)res;\n", x, x);
		fprintf(cuda, "\t\t\tpt%d = thrust::device_pointer_cast(t%d);\n", x, x);
		fprintf(cuda, "\t\t\tflag = 0;\n");
		fprintf(cuda, "\t\t\twhile(flag != 1)\n");
		fprintf(cuda, "\t\t\t{\n");
		fprintf(cuda, "\t\t\t\ttry\n");
		fprintf(cuda, "\t\t\t\t{\n");
		fprintf(cuda, "\t\t\t\t\tthrust::sort(pt%d, pt%d + rows, o%d());\n", x, x, x);
		fprintf(cuda, "\t\t\t\t\tif(final)\n");
		fprintf(cuda, "\t\t\t\t\t{\n");
		fprintf(cuda, "\t\t\t\t\t\tre%d = thrust::unique(pt%d, pt%d + rows, q%d());\n", x, x, x, x);
		fprintf(cuda, "\t\t\t\t\t\tre%d = thrust::unique(pt%d, re%d, p%d());\n", x, x, x, x);
		fprintf(cuda, "\t\t\t\t\t}\n");
		fprintf(cuda, "\t\t\t\t\telse\n");
		fprintf(cuda, "\t\t\t\t\t\tre%d = thrust::unique(pt%d, pt%d + rows, p%d());\n", x, x, x, x);
		fprintf(cuda, "\t\t\t\t\tflag = 1;\n");
		fprintf(cuda, "\t\t\t\t}\n");
		fprintf(cuda, "\t\t\t\tcatch(std::bad_alloc &e)\n");
		fprintf(cuda, "\t\t\t\t{\n");
		fprintf(cuda, "\t\t\t\t\tlimpiar(\"sort/unique in unir\", 0);\n");
		fprintf(cuda, "\t\t\t\t}\n");
		fprintf(cuda, "\t\t\t}\n");
		fprintf(cuda, "\t\t\tnrows = thrust::distance(pt%d, re%d);\n", x, x);
		fprintf(cuda, "\t\t\tif(nrows < rows / 2)\n");
		fprintf(cuda, "\t\t\t{\n");
		fprintf(cuda, "\t\t\t\tsize = nrows * tipo * sizeof(int);\n");
		fprintf(cuda, "\t\t\t\treservar(&nres, size);\n");
		fprintf(cuda, "\t\t\t\tcudaMemcpyAsync(nres, res, size, cudaMemcpyDeviceToDevice);\n");
		fprintf(cuda, "\t\t\t\tcudaFree(*ret);\n");
		fprintf(cuda, "\t\t\t\t*ret = nres;\n");
		fprintf(cuda, "\t\t\t}\n");
		fprintf(cuda, "\t\t\treturn nrows;\n");
		fprintf(cuda, "\t\t}\n");
	}
	fprintf(cuda, "\t}\n");
	fprintf(cuda, "\treturn 0;\n");
	fprintf(cuda, "}\n");
}

/*Emits the body of <name>.h: include guard, unir() prototype, the s<x> structs and the
  p<x>/q<x>/o<x> functors for every set size 2..num (plus q1 for plain ints).*/
static void creator2_write_header(FILE *cuda, const char *name, int num)
{
	int x;

	fprintf(cuda, "#ifndef _");
	creator2_write_upper(cuda, name);
	fprintf(cuda, "_H_\n");
	fprintf(cuda, "#define _");
	creator2_write_upper(cuda, name);
	fprintf(cuda, "_H_\n\n");
	fprintf(cuda, "int unir(int *res, int rows, int tipo, int **ret, int final);\n\n");
	for(x = 2; x <= num; x++)
	{
		fprintf(cuda, "typedef struct n%d\n", x);
		fprintf(cuda, "{\n");
		fprintf(cuda, "\tint v[%d];\n", x);
		fprintf(cuda, "}s%d;\n\n", x);
	}
	fprintf(cuda, "struct q1\n");
	fprintf(cuda, "{\n");
	fprintf(cuda, "\t__host__ __device__\n");
	fprintf(cuda, "\tbool operator()(const int &r1, const int &r2)\n");
	fprintf(cuda, "\t{\n");
	fprintf(cuda, "\t\tif(r1 != r2)\n");
	fprintf(cuda, "\t\t\treturn true;\n");
	fprintf(cuda, "\t\treturn false;\n");
	fprintf(cuda, "\t}\n");
	fprintf(cuda, "};\n\n");
	for(x = 2; x <= num; x++)
	{
		/*p<x>: true when all components are equal.*/
		fprintf(cuda, "struct p%d\n", x);
		fprintf(cuda, "{\n");
		fprintf(cuda, "\t__host__ __device__\n");
		fprintf(cuda, "\tbool operator()(const s%d &r1, const s%d &r2)\n", x, x);
		fprintf(cuda, "\t{\n");
		fprintf(cuda, "\t\tint x;\n");
		fprintf(cuda, "\t\tfor(x = 0; x < %d; x++)\n", x);
		fprintf(cuda, "\t\t{\n");
		fprintf(cuda, "\t\t\tif(r1.v[x] != r2.v[x])\n");
		fprintf(cuda, "\t\t\t\treturn false;\n");
		fprintf(cuda, "\t\t}\n");
		fprintf(cuda, "\t\treturn true;\n");
		fprintf(cuda, "\t}\n");
		fprintf(cuda, "};\n\n");
		/*q<x>: true when at least one component differs.*/
		fprintf(cuda, "struct q%d\n", x);
		fprintf(cuda, "{\n");
		fprintf(cuda, "\t__host__ __device__\n");
		fprintf(cuda, "\tbool operator()(const s%d &r1, const s%d &r2)\n", x, x);
		fprintf(cuda, "\t{\n");
		fprintf(cuda, "\t\tint x;\n");
		fprintf(cuda, "\t\tfor(x = 0; x < %d; x++)\n", x);
		fprintf(cuda, "\t\t{\n");
		fprintf(cuda, "\t\t\tif(r1.v[x] != r2.v[x])\n");
		fprintf(cuda, "\t\t\t\treturn true;\n");
		fprintf(cuda, "\t\t}\n");
		fprintf(cuda, "\t\treturn false;\n");
		fprintf(cuda, "\t}\n");
		fprintf(cuda, "};\n\n");
		/*o<x>: component-wise "greater than" ordering used for sorting.*/
		fprintf(cuda, "struct o%d\n", x);
		fprintf(cuda, "{\n");
		fprintf(cuda, "\t__host__ __device__\n");
		fprintf(cuda, "\tbool operator()(const s%d &r1, const s%d &r2)\n", x, x);
		fprintf(cuda, "\t{\n");
		fprintf(cuda, "\t\tint x;\n");
		fprintf(cuda, "\t\tfor(x = 0; x < %d; x++)\n", x);
		fprintf(cuda, "\t\t{\n");
		fprintf(cuda, "\t\t\tif(r1.v[x] > r2.v[x])\n");
		fprintf(cuda, "\t\t\t\treturn true;\n");
		fprintf(cuda, "\t\t\tif(r1.v[x] < r2.v[x])\n");
		fprintf(cuda, "\t\t\t\treturn false;\n");
		fprintf(cuda, "\t\t}\n");
		fprintf(cuda, "\t\treturn false;\n");
		fprintf(cuda, "\t}\n");
		fprintf(cuda, "};\n\n");
	}
	fprintf(cuda, "#endif\n");
}

/*Entry point. argv[1] is the largest set cardinality and argv[2] the base name of the files to
  create (<name>.cu and <name>.h). Returns EXIT_SUCCESS when both files were written and
  EXIT_FAILURE on a usage error, an allocation failure or a file that cannot be opened.*/
int main(int argc, char *argv[])
{
	int num;
	char *str;
	FILE *cuda;

	if(argc < 3)
	{
		fprintf(stderr, "usage: %s <max set cardinality> <output base name>\n", argc > 0 ? argv[0] : "creator2");
		return EXIT_FAILURE;
	}
	num = atoi(argv[1]);

	/*Room for the base name, the longest suffix (".cu") and the terminator.*/
	str = (char *)malloc((strlen(argv[2]) + 4) * sizeof(char));
	if(str == NULL)
	{
		fprintf(stderr, "creator2: out of memory\n");
		return EXIT_FAILURE;
	}

	sprintf(str, "%s.cu", argv[2]);
	cuda = fopen(str, "w");
	if(cuda == NULL)
	{
		perror(str);
		free(str);
		return EXIT_FAILURE;
	}
	creator2_write_source(cuda, argv[2], num);
	fclose(cuda);

	sprintf(str, "%s.h", argv[2]);
	cuda = fopen(str, "w");
	if(cuda == NULL)
	{
		perror(str);
		free(str);
		return EXIT_FAILURE;
	}
	creator2_write_header(cuda, argv[2], num);
	fclose(cuda);

	free(str);
	return EXIT_SUCCESS;
}