#if defined(GPU_GAUGE_ALG) && defined(MULTI_GPU)

struct BorderIdArg {
  int X[4];      // grid dimensions
  int border[4]; // border widths
  BorderIdArg(int X_[4], int border_[4]) {
    for ( int dir = 0; dir < 4; ++dir ) border[dir] = border_[dir];
    for ( int dir = 0; dir < 4; ++dir ) X[dir] = X_[dir];
  }
};
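/**
 * Compute, for every thread on a given face (faceid = 0..3 for the x, y, z
 * and t faces), the full lattice index of the corresponding border site.
 * The first half of the threads covers the lower wall (coordinate 0), the
 * second half the upper wall (coordinate X[faceid] - 1).
 */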
__global__ void ComputeBorderPointsActiveFaceIndex(BorderIdArg arg, int *faceindices,
                                                   int facesize, int faceid, int parity)
{
  int idd = blockDim.x * blockIdx.x + threadIdx.x;
  if ( idd < facesize ) {
    int borderid = 0;
    int idx = idd;
    if ( idx >= facesize / 2 ) {
      // the second half of the threads handles the opposite wall of the face
      borderid = arg.X[faceid] - 1;
      idx -= facesize / 2;
    }
    int X[4];
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
    int x[4];
    int za, xodd;
    switch ( faceid ) {
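    // Each case decomposes the checkerboarded face index idx into the three
    // coordinates orthogonal to faceid, fixes x[faceid] to the wall position,
    // and uses xodd to restore the fastest-running coordinate for the
    // requested parity.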
    case 0: // x face
      za = idx / (X[1] / 2);
      x[3] = za / X[2];
      x[2] = za - x[3] * X[2];
      x[0] = borderid;
      xodd = (borderid + x[2] + x[3] + parity) & 1;
      x[1] = (2 * idx + xodd) - za * X[1];
      break;
    case 1: // y face
      za = idx / (X[0] / 2);
      x[3] = za / X[2];
      x[2] = za - x[3] * X[2];
      x[1] = borderid;
      xodd = (borderid + x[2] + x[3] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    case 2: // z face
      za = idx / (X[0] / 2);
      x[3] = za / X[1];
      x[1] = za - x[3] * X[1];
      x[2] = borderid;
      xodd = (borderid + x[1] + x[3] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    case 3: // t face
      za = idx / (X[0] / 2);
      x[2] = za / X[1];
      x[1] = za - x[2] * X[1];
      x[3] = borderid;
      xodd = (borderid + x[1] + x[2] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    }
    idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]);
    faceindices[idd] = idx;
  }
}
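/**
 * Pre-calculate the per-parity lists of distinct border-site lattice indices
 * on the partitioned faces; on success, threads receives the common length
 * of the two lists.
 */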
void PreCalculateLatticeIndices(size_t faceVolume[4], size_t faceVolumeCB[4],
                                int X[4], int border[4],
                                int &threads, int *borderpoints[2])
{
  BorderIdArg arg(X, border);
  int nlinksfaces = 0; // total number of face sites over all partitioned directions
  for ( int dir = 0; dir < 4; ++dir )
    if ( comm_dim_partitioned(dir) ) nlinksfaces += faceVolume[dir];
  thrust::device_ptr<int> array_faceT[2];
  thrust::device_ptr<int> array_interiorT[2];
  for ( int i = 0; i < 2; i++ ) { // even and odd parities
    // allocate from QUDA's device memory pool
    borderpoints[i] = static_cast<int *>(pool_device_malloc(nlinksfaces * sizeof(int)));
    cudaMemset(borderpoints[i], 0, nlinksfaces * sizeof(int));
    array_faceT[i] = thrust::device_pointer_cast(borderpoints[i]);
  }
  dim3 nthreads(128, 1, 1);
  int start = 0;
  for ( int dir = 0; dir < 4; ++dir ) {
    if ( comm_dim_partitioned(dir) ) {
      dim3 blocks((faceVolume[dir] + nthreads.x - 1) / nthreads.x, 1, 1);
      for ( int oddbit = 0; oddbit < 2; oddbit++ )
        ComputeBorderPointsActiveFaceIndex<<< blocks, nthreads >>>(
          arg, borderpoints[oddbit] + start, faceVolume[dir], dir, oddbit);
      start += faceVolume[dir];
    }
  }
  int size[2];
  for ( int i = 0; i < 2; i++ ) {
    // sort and remove duplicated lattice indices
    thrust_allocator alloc; // QUDA's caching allocator for thrust temporaries
    thrust::sort(thrust::cuda::par(alloc), array_faceT[i], array_faceT[i] + nlinksfaces);
    thrust::device_ptr<int> new_end = thrust::unique(array_faceT[i], array_faceT[i] + nlinksfaces);
    size[i] = thrust::raw_pointer_cast(new_end) - thrust::raw_pointer_cast(array_faceT[i]);
  }
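  // A single thread count drives both parities in the downstream kernels,
  // so the even and odd lists must come out the same length.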
  if ( size[0] == size[1] ) threads = size[0];
  else errorQuda("BORDER: even and odd sizes do not match (%d:%d), not supported", size[0], size[1]);
}
#endif // GPU_GAUGE_ALG && MULTI_GPU
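/*
 * Usage sketch (illustrative only; the driver-side variables below are
 * assumptions, not part of this file). A multi-GPU gauge-fixing setup would
 * build the border lists once, before the relaxation iterations:
 *
 *   int threads = 0;
 *   int *borderpoints[2] = { nullptr, nullptr };
 *   PreCalculateLatticeIndices(faceVolume, faceVolumeCB, X, border,
 *                              threads, borderpoints);
 *   // borderpoints[parity][0 .. threads) now holds the sorted, de-duplicated
 *   // lattice indices of the active border sites for that parity.
 */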