13 LatticeField(param), bytes(0), norm_bytes(0), nColor(3), nSpin(4)
47 odd = (
char*)clover +
bytes/2;
52 loadCPUField(clover, norm, h_clov, cpu_prec, cpu_order);
62 oddInv = (
char*)cloverInv +
bytes/2;
64 evenInvNorm = invNorm;
69 loadCPUField(cloverInv, invNorm, h_clov_inv, cpu_prec, cpu_order);
80 evenNorm = evenInvNorm;
85 #ifdef USE_TEXTURE_OBJECTS
86 createTexObject(evenTex, evenNormTex, even, evenNorm);
87 createTexObject(oddTex, oddNormTex, odd, oddNorm);
88 createTexObject(evenInvTex, evenInvNormTex, evenInv, evenInvNorm);
89 createTexObject(oddInvTex, oddInvNormTex, oddInv, oddInvNorm);
94 #ifdef USE_TEXTURE_OBJECTS
95 void cudaCloverField::createTexObject(cudaTextureObject_t &tex, cudaTextureObject_t &texNorm,
96 void *field,
void *
norm) {
100 cudaChannelFormatDesc desc;
101 memset(&desc, 0,
sizeof(cudaChannelFormatDesc));
103 else desc.f = cudaChannelFormatKindSigned;
111 cudaResourceDesc resDesc;
112 memset(&resDesc, 0,
sizeof(resDesc));
113 resDesc.resType = cudaResourceTypeLinear;
114 resDesc.res.linear.devPtr = field;
115 resDesc.res.linear.desc = desc;
116 resDesc.res.linear.sizeInBytes =
bytes/2;
118 cudaTextureDesc texDesc;
119 memset(&texDesc, 0,
sizeof(texDesc));
121 else texDesc.readMode = cudaReadModeElementType;
123 cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
128 cudaChannelFormatDesc desc;
129 memset(&desc, 0,
sizeof(cudaChannelFormatDesc));
130 desc.f = cudaChannelFormatKindFloat;
133 cudaResourceDesc resDesc;
134 memset(&resDesc, 0,
sizeof(resDesc));
135 resDesc.resType = cudaResourceTypeLinear;
136 resDesc.res.linear.devPtr = norm;
137 resDesc.res.linear.desc = desc;
138 resDesc.res.linear.sizeInBytes =
norm_bytes/2;
140 cudaTextureDesc texDesc;
141 memset(&texDesc, 0,
sizeof(texDesc));
142 texDesc.readMode = cudaReadModeElementType;
144 cudaCreateTextureObject(&texNorm, &resDesc, &texDesc, NULL);
150 void cudaCloverField::destroyTexObject() {
151 cudaDestroyTextureObject(evenTex);
152 cudaDestroyTextureObject(oddTex);
153 cudaDestroyTextureObject(evenInvTex);
154 cudaDestroyTextureObject(oddInvTex);
156 cudaDestroyTextureObject(evenNormTex);
157 cudaDestroyTextureObject(evenNormTex);
158 cudaDestroyTextureObject(evenInvNormTex);
159 cudaDestroyTextureObject(evenInvNormTex);
167 #ifdef USE_TEXTURE_OBJECTS
171 if (clover != cloverInv) {
181 template <
bool bqcd,
typename Float>
182 static inline void packCloverMatrix(float4* a,
Float *b,
int Vh)
184 const Float half = bqcd ? 1.0 : 0.5;
186 for (
int i=0; i<18; i++) {
187 a[i*
Vh].x = half * b[4*i+0];
188 a[i*
Vh].y = half * b[4*i+1];
189 a[i*
Vh].z = half * b[4*i+2];
190 a[i*
Vh].w = half * b[4*i+3];
194 template <
bool bqcd,
typename Float>
195 static inline void packCloverMatrix(double2* a,
Float *b,
int Vh)
197 const Float half = bqcd ? 1.0 : 0.5;
199 for (
int i=0; i<36; i++) {
200 a[i*
Vh].x = half * b[2*i+0];
201 a[i*
Vh].y = half * b[2*i+1];
213 template <
typename Float>
214 static inline void reorderBQCD(
Float *quda,
Float *bqcd) {
223 int bq[36] = { 21, 32, 33, 0, 1, 20,
224 28, 29, 30, 31, 6, 7, 14, 15, 22, 23,
225 34, 35, 8, 9, 16, 17, 24, 25,
226 10, 11, 18, 19, 26, 27,
232 for (
int i=0; i<6; i++) sign[i] = 1;
233 for (
int i=6; i<36; i+=2) {
234 if ( (i >= 10 && i<= 15) || (i >= 18 && i <= 29) ) {
235 sign[i] = -1; sign[i+1] = -1;
237 sign[i] = 1; sign[i+1] = -1;
242 for (
int i=0; i<36; i++) quda[i] = sign[i] * bqcd[bq[i]];
245 for (
int i=0; i<36; i++) quda[i+36] = sign[i] * bqcd[bq[i]+36];
248 template <
typename Float,
typename FloatN>
249 static void packParityClover(
FloatN *res,
Float *clover,
int Vh,
int pad,
253 for (
int i = 0; i <
Vh; i++) {
254 packCloverMatrix<false>(res+i, clover+72*i, Vh+pad);
257 for (
int i = 0; i <
Vh; i++) {
259 reorderBQCD(tmp, clover+72*i);
260 packCloverMatrix<true>(res+i,
tmp, Vh+pad);
265 template <
typename Float,
typename FloatN>
268 int Vh = X[0]*X[1]*X[2]*X[3]/2;
270 for (
int i=0; i<
Vh; i++) {
272 int boundaryCrossings = i/X[0] + i/(X[1]*X[0]) + i/(X[2]*X[1]*X[0]);
275 int k = 2*i + boundaryCrossings%2;
276 packCloverMatrix<false>(even+i, clover+72*k, Vh+pad);
280 int k = 2*i + (boundaryCrossings+1)%2;
281 packCloverMatrix<false>(odd+i, clover+72*k, Vh+pad);
286 template<
bool bqcd,
typename Float>
287 static inline void packCloverMatrixHalf(short4 *res,
float *norm,
Float *clover,
int Vh)
289 const Float half = bqcd ? 1.0 : 0.5;
293 for (
int chi=0; chi<2; chi++) {
294 max = fabs(clover[0]);
295 for (
int i=1; i<36; i++) {
296 if ((a = fabs(clover[i])) > max) max = a;
299 for (
int i=0; i<9; i++) {
300 res[i*
Vh].x = (short) (c * clover[4*i+0]);
301 res[i*
Vh].y = (short) (c * clover[4*i+1]);
302 res[i*
Vh].z = (short) (c * clover[4*i+2]);
303 res[i*
Vh].w = (short) (c * clover[4*i+3]);
305 norm[chi*
Vh] = half*max;
311 template <
typename Float>
312 static void packParityCloverHalf(short4 *res,
float *norm,
Float *clover,
316 for (
int i = 0; i <
Vh; i++) {
317 packCloverMatrixHalf<false>(res+i, norm+i, clover+72*i, Vh+pad);
320 for (
int i = 0; i <
Vh; i++) {
322 reorderBQCD(tmp, clover+72*i);
323 packCloverMatrixHalf<true>(res+i, norm+i,
tmp, Vh+pad);
328 template <
typename Float>
329 static void packFullCloverHalf(short4 *even,
float *evenNorm, short4 *odd,
float *oddNorm,
330 Float *clover,
int *X,
int pad)
332 int Vh = X[0]*X[1]*X[2]*X[3]/2;
334 for (
int i=0; i<
Vh; i++) {
336 int boundaryCrossings = i/X[0] + i/(X[1]*X[0]) + i/(X[2]*X[1]*X[0]);
339 int k = 2*i + boundaryCrossings%2;
340 packCloverMatrixHalf<false>(even+i, evenNorm+i, clover+72*k, Vh+pad);
344 int k = 2*i + (boundaryCrossings+1)%2;
345 packCloverMatrixHalf<false>(odd+i, oddNorm+i, clover+72*k, Vh+pad);
350 void cudaCloverField::loadCPUField(
void *clover,
void *norm,
const void *h_clover,
353 void *h_clover_odd = (
char*)h_clover + cpu_prec*
real_length/2;
356 loadFullField(clover, norm, (
char*)clover+
bytes/2, (
char*)norm+
norm_bytes/2, h_clover, cpu_prec, cpu_order);
358 loadParityField(clover, norm, h_clover, cpu_prec, cpu_order);
359 loadParityField((
char*)clover+
bytes/2, (
char*)norm+
norm_bytes/2, h_clover_odd, cpu_prec, cpu_order);
361 cudaMemcpy(clover, h_clover,
total_bytes, cudaMemcpyHostToDevice);
367 void cudaCloverField::loadParityField(
void *clover,
void *cloverNorm,
const void *h_clover,
371 void *packedClover, *packedCloverNorm=0;
374 errorQuda(
"Cannot have CUDA double precision without CPU double precision");
377 errorQuda(
"Invalid clover order %d", cpu_order);
384 packParityClover((double2 *)packedClover, (
double *)h_clover,
volumeCB, pad, cpu_order);
387 packParityClover((float4 *)packedClover, (
double *)h_clover,
volumeCB, pad, cpu_order);
389 packParityClover((float4 *)packedClover, (
float *)h_clover,
volumeCB, pad, cpu_order);
393 packParityCloverHalf((short4 *)packedClover, (
float *)packedCloverNorm,
394 (
double *)h_clover,
volumeCB, pad, cpu_order);
396 packParityCloverHalf((short4 *)packedClover, (
float *)packedCloverNorm,
397 (
float *)h_clover,
volumeCB, pad, cpu_order);
401 cudaMemcpy(clover, packedClover,
bytes/2, cudaMemcpyHostToDevice);
403 cudaMemcpy(cloverNorm, packedCloverNorm,
norm_bytes/2, cudaMemcpyHostToDevice);
406 void cudaCloverField::loadFullField(
void *even,
void *evenNorm,
void *odd,
void *oddNorm,
411 void *packedEven, *packedOdd, *packedEvenNorm=0, *packedOddNorm=0;
414 errorQuda(
"Cannot have CUDA double precision without CPU double precision");
430 packFullClover((double2 *)packedEven, (double2 *)packedOdd, (
double *)clover,
x, pad);
433 packFullClover((float4 *)packedEven, (float4 *)packedOdd, (
double *)clover,
x, pad);
435 packFullClover((float4 *)packedEven, (float4 *)packedOdd, (
float *)clover,
x, pad);
439 packFullCloverHalf((short4 *)packedEven, (
float *)packedEvenNorm, (short4 *)packedOdd,
440 (
float *) packedOddNorm, (
double *)clover,
x, pad);
442 packFullCloverHalf((short4 *)packedEven, (
float *)packedEvenNorm, (short4 *)packedOdd,
443 (
float * )packedOddNorm, (
float *)clover,
x, pad);
447 cudaMemcpy(even, packedEven,
bytes/2, cudaMemcpyHostToDevice);
448 cudaMemcpy(odd, packedOdd,
bytes/2, cudaMemcpyHostToDevice);
450 cudaMemcpy(evenNorm, packedEvenNorm,
norm_bytes/2, cudaMemcpyHostToDevice);
451 cudaMemcpy(oddNorm, packedOddNorm,
norm_bytes/2, cudaMemcpyHostToDevice);
459 void cudaCloverField::compute(
const cudaGaugeField &
gauge) {
462 errorQuda(
"Gauge and clover precisions must match");