20 cudaMemset(gauge, 0,
bytes);
23 odd = (
char*)gauge +
bytes/2;
25 #ifdef USE_TEXTURE_OBJECTS
26 createTexObject(evenTex, even);
27 createTexObject(oddTex, odd);
32 #ifdef USE_TEXTURE_OBJECTS
33 void cudaGaugeField::createTexObject(cudaTextureObject_t &tex,
void *field) {
37 cudaChannelFormatDesc desc;
38 memset(&desc, 0,
sizeof(cudaChannelFormatDesc));
40 else desc.f = cudaChannelFormatKindSigned;
44 desc.x = 8*
sizeof(int);
45 desc.y = 8*
sizeof(int);
46 desc.z = 8*
sizeof(int);
47 desc.w = 8*
sizeof(int);
55 cudaResourceDesc resDesc;
56 memset(&resDesc, 0,
sizeof(resDesc));
57 resDesc.resType = cudaResourceTypeLinear;
58 resDesc.res.linear.devPtr = field;
59 resDesc.res.linear.desc = desc;
60 resDesc.res.linear.sizeInBytes =
bytes/2;
62 cudaTextureDesc texDesc;
63 memset(&texDesc, 0,
sizeof(texDesc));
65 else texDesc.readMode = cudaReadModeElementType;
67 cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
71 void cudaGaugeField::destroyTexObject() {
72 cudaDestroyTextureObject(evenTex);
73 cudaDestroyTextureObject(oddTex);
80 #ifdef USE_TEXTURE_OBJECTS
88 static double anisotropy_;
89 static double fat_link_max_;
95 template <
typename Float,
typename FloatN>
98 size_t bytes,
int volumeCB,
int *surfaceCB,
int pad,
int nFace,
105 if( ! packedEven )
errorQuda(
"packedEven is borked\n");
106 if( ! packedOdd )
errorQuda(
"packedOdd is borked\n");
107 if( ! even )
errorQuda(
"even is borked\n");
108 if( ! odd )
errorQuda(
"odd is borked\n");
109 if( ! cpuGauge )
errorQuda(
"cpuGauge is borked\n");
114 errorQuda(
"Gauge order %d is not supported for multi-gpu\n", cpu_order);
125 if( tmp[
dir][i] > fat_link_max ){
126 fat_link_max = tmp[
dir][i];
132 if(cpuGauge[i] > fat_link_max){ fat_link_max = cpuGauge[i]; }
135 errorQuda(
"Gauge ordering scheme not supported\n");
139 double fat_link_max_double = fat_link_max;
144 fat_link_max = fat_link_max_double;
146 int voxels[] = {volumeCB, volumeCB, volumeCB, volumeCB};
149 fat_link_max_ = fat_link_max;
153 packQDPGaugeField(packedEven, (
Float**)cpuGauge, 0, reconstruct, volumeCB,
154 voxels, pad, 0, nFaceLocal, type);
155 packQDPGaugeField(packedOdd, (
Float**)cpuGauge, 1, reconstruct, volumeCB,
156 voxels, pad, 0, nFaceLocal, type);
158 packCPSGaugeField(packedEven, (
Float*)cpuGauge, 0, reconstruct, volumeCB, pad);
159 packCPSGaugeField(packedOdd, (
Float*)cpuGauge, 1, reconstruct, volumeCB, pad);
167 errorQuda(
"Invalid gauge_order %d", cpu_order);
173 packQDPGaugeField(packedEven, (
Float**)cpuGhost, 0, reconstruct, volumeCB,
174 surfaceCB, pad, volumeCB, nFace, type);
175 packQDPGaugeField(packedOdd, (
Float**)cpuGhost, 1, reconstruct, volumeCB,
176 surfaceCB, pad, volumeCB, nFace, type);
181 cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
186 template <
typename Float,
typename Float2>
189 Float2 *packedEven = (Float2*)buffer;
190 Float2 *packedOdd = (Float2*)((
char*)buffer + bytes/2);
195 cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice);
196 cudaMemcpy(odd, packedOdd, bytes/2, cudaMemcpyHostToDevice);
227 loadGaugeField((double2*)(even), (double2*)(odd), (
double*)cpu.gauge, (
double**)cpu.ghost,
231 loadGaugeField((double2*)(even), (double2*)(odd), (
float*)cpu.gauge, (
float**)cpu.ghost,
232 cpu.
order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace,
link_type,
240 loadGaugeField((float2*)(even), (float2*)(odd), (
double*)cpu.gauge, (
double**)cpu.ghost,
241 cpu.
order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace,
link_type,
244 loadGaugeField((float4*)(even), (float4*)(odd), (
double*)cpu.gauge, (
double**)cpu.ghost,
245 cpu.
order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace,
link_type,
250 loadGaugeField((float2*)(even), (float2*)(odd), (
float*)cpu.gauge, (
float**)cpu.ghost,
251 cpu.
order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace,
link_type,
254 loadGaugeField((float4*)(even), (float4*)(odd), (
float*)cpu.gauge, (
float**)cpu.ghost,
255 cpu.
order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace,
link_type,
264 loadGaugeField((short2*)(even), (short2*)(odd), (
double*)cpu.gauge, (
double**)cpu.ghost,
265 cpu.
order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace,
link_type,
268 loadGaugeField((short4*)(even), (short4*)(odd), (
double*)cpu.gauge, (
double**)cpu.ghost,
269 cpu.
order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace,
link_type,
274 loadGaugeField((short2*)(even), (short2*)(odd), (
float*)cpu.gauge, (
float**)cpu.ghost,
275 cpu.
order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace,
link_type,
278 loadGaugeField((short4*)(even), (short4*)(odd), (
float*)cpu.gauge, (
float**)(cpu.ghost),
279 cpu.
order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace,
link_type,
287 loadMomField((double2*)(even), (double2*)(odd), (
double*)cpu.gauge, bytes,
290 loadMomField((double2*)(even), (double2*)(odd), (
float*)cpu.gauge, bytes,
295 loadMomField((float2*)(even), (float2*)(odd), (
double*)cpu.gauge, bytes,
298 loadMomField((float2*)(even), (float2*)(odd), (
float*)cpu.gauge, bytes,
304 errorQuda(
"Invalid pack location %d", pack_location);
314 template <
typename Float,
typename FloatN>
322 cudaMemcpy(buffer, gauge, bytes, cudaMemcpyDeviceToHost);
325 unpackQDPGaugeField((
Float**)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
326 unpackQDPGaugeField((
Float**)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
328 unpackCPSGaugeField((
Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
329 unpackCPSGaugeField((
Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
331 unpackMILCGaugeField((
Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
332 unpackMILCGaugeField((
Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
334 unpackBQCDGaugeField((
Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
335 unpackBQCDGaugeField((
Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
352 template<
typename FloatN,
typename Float>
353 static void storeGaugeField(
Float* cpuGauge,
FloatN *gauge,
int bytes,
int volumeCB,
357 for (
int i=0; i<2; i++) cudaStreamCreate(&streams[i]);
364 void *unpackedEven = unpacked;
365 void *unpackedOdd = (
char*)unpacked + datalen/2;
370 cudaMemcpyAsync(cpuGauge, unpackedEven, datalen/2, cudaMemcpyDeviceToHost, streams[0]);
372 cudaMemcpy(cpuGauge, unpackedEven, datalen/2, cudaMemcpyDeviceToHost);
378 cudaMemcpyAsync(cpuGauge + 4*volumeCB*
gaugeSiteSize, unpackedOdd, datalen/2, cudaMemcpyDeviceToHost, streams[1]);
379 for(
int i=0; i<2; i++) cudaStreamSynchronize(streams[i]);
381 cudaMemcpy(cpuGauge + 4*volumeCB*
gaugeSiteSize, unpackedOdd, datalen/2, cudaMemcpyDeviceToHost);
385 for(
int i=0; i<2; i++) cudaStreamDestroy(streams[i]);
388 template <
typename Float,
typename Float2>
391 Float2 *packedEven = (Float2*)buffer;
392 Float2 *packedOdd = (Float2*)((
char*)buffer + bytes/2);
394 cudaMemcpy(packedEven, even, bytes/2, cudaMemcpyDeviceToHost);
395 cudaMemcpy(packedOdd, odd, bytes/2, cudaMemcpyDeviceToHost);
409 errorQuda(
"cpu precision %d and cuda precision %d must be the same",
413 errorQuda(
"Only no reconstruction supported");
416 errorQuda(
"Only QUDA_FLOAT2_GAUGE_ORDER supported");
419 errorQuda(
"Only QUDA_MILC_GAUGE_ORDER supported");
422 storeGaugeField((
double*)cpu.gauge, (double2*)gauge, bytes, volumeCB, stride,
precision);
424 storeGaugeField((
float*)cpu.gauge, (float2*)gauge, bytes, volumeCB, stride,
precision);
426 errorQuda(
"Half precision not supported");
433 fat_link_max_ = fat_link_max;
443 storeGaugeField((
double*)cpu.gauge, (double2*)(gauge),
446 storeGaugeField((
float*)cpu.gauge, (double2*)(gauge),
454 storeGaugeField((
double*)cpu.gauge, (float2*)(gauge),
457 storeGaugeField((
double*)cpu.gauge, (float4*)(gauge),
462 storeGaugeField((
float*)cpu.gauge, (float2*)(gauge),
465 storeGaugeField((
float*)cpu.gauge, (float4*)(gauge),
474 storeGaugeField((
double*)cpu.gauge, (short2*)(gauge),
477 storeGaugeField((
double*)cpu.gauge, (short4*)(gauge),
482 storeGaugeField((
float*)cpu.gauge, (short2*)(gauge),
485 storeGaugeField((
float*)cpu.gauge, (short4*)(gauge),
493 errorQuda(
"cpu and gpu precison has to be the same at this moment");
496 errorQuda(
"half precision is not supported at this moment");
499 errorQuda(
"Only MILC gauge order supported in momentum unpack, not %d", cpu.
order);
508 errorQuda(
"Invalid pack location %d", pack_location);
516 cudaMemcpy(
backup_h, gauge, bytes, cudaMemcpyDeviceToHost);
523 cudaMemcpy(gauge,
backup_h, bytes, cudaMemcpyHostToDevice);
548 errorQuda(
"Casting a cudaGaugeField into cudaColorSpinorField not possible in half precision");
553 spinor_param.
nDim = spin;
554 for (
int d=0; d<a.
Ndim(); d++) spinor_param.
x[d] = a.
X()[d];
556 spinor_param.
pad = a.
Pad();
562 spinor_param.
v = (
void*)a.
Gauge_p();