// QUDA v0.3.2: a library for QCD on GPUs
// File: quda/lib/blas_param.h
//
// Auto-tuned blas CUDA parameters, generated by blas_test
//

// Per-kernel tuning table: 24 rows (one per blas kernel, named in the row
// comments) by 3 columns. The values are tuner output and are reproduced
// unchanged; do not hand-edit them without re-running the tuner.
// NOTE(review): by its name this holds CUDA thread counts, and the three
// columns presumably select a precision -- confirm against the code that
// indexes this table.
// Intentionally left non-const, as in the original declaration, so any code
// that assigns to an entry keeps compiling.
static int blas_threads[24][3] = {
  {  64,   32,   32},  // Kernel  0: copyCuda (high source precision)
  {  32,   32,   32},  // Kernel  1: copyCuda (low source precision)
  {  64,   64,   64},  // Kernel  2: axpbyCuda
  {  64,   64,   64},  // Kernel  3: xpyCuda
  {  64,   64,   64},  // Kernel  4: axpyCuda
  {  64,   64,   64},  // Kernel  5: xpayCuda
  {  64,   64,   64},  // Kernel  6: mxpyCuda
  {  64,   64,   64},  // Kernel  7: axCuda
  {  64,   64,   64},  // Kernel  8: caxpyCuda
  {  64,   64,   64},  // Kernel  9: caxpbyCuda
  {  64,   64,   64},  // Kernel 10: cxpaypbzCuda
  {  64,   32,  480},  // Kernel 11: axpyBzpcxCuda
  { 192,   32,  480},  // Kernel 12: axpyZpbxCuda
  {  64,   64,  128},  // Kernel 13: caxpbypzYmbwCuda
  {  64,  128,   64},  // Kernel 14: sumCuda
  {  64,  128,   64},  // Kernel 15: normCuda
  {  64,   64,   64},  // Kernel 16: reDotProductCuda
  {  64,  256,   64},  // Kernel 17: axpyNormCuda
  {  64,  128,   64},  // Kernel 18: xmyNormCuda
  {  64,  128,   32},  // Kernel 19: cDotProductCuda
  {  64,   64,   32},  // Kernel 20: xpaycDotzyCuda
  {  64,   64,   64},  // Kernel 21: cDotProductNormACuda
  {  64,   64,   64},  // Kernel 22: cDotProductNormBCuda
  {  64,   64,   64}   // Kernel 23: caxpbypzYmbwcDotProductWYNormYCuda
};
00031 
00032 static int blas_blocks[24][3] = {
00033   {  256, 65536, 65536},  // Kernel  0: copyCuda (high source precision)
00034   {32768,  2048, 16384},  // Kernel  1: copyCuda (low source precision)
00035   { 4096,   128,   128},  // Kernel  2: axpbyCuda
00036   { 4096,   128,   128},  // Kernel  3: xpyCuda
00037   { 4096,   128,   128},  // Kernel  4: axpyCuda
00038   { 4096,   128,   128},  // Kernel  5: xpayCuda
00039   { 4096,   128,   128},  // Kernel  6: mxpyCuda
00040   { 4096,   128,   128},  // Kernel  7: axCuda
00041   { 1024, 65536, 32768},  // Kernel  8: caxpyCuda
00042   { 1024, 65536, 65536},  // Kernel  9: caxpbyCuda
00043   { 4096,   128, 65536},  // Kernel 10: cxpaypbzCuda
00044   {  256,   128,  8192},  // Kernel 11: axpyBzpcxCuda
00045   { 2048,   128,  8192},  // Kernel 12: axpyZpbxCuda
00046   { 4096,   128, 32768},  // Kernel 13: caxpbypzYmbwCuda
00047   {  128,   128,   128},  // Kernel 14: sumCuda
00048   {  128,   128,   128},  // Kernel 15: normCuda
00049   {  128,   128,  2048},  // Kernel 16: reDotProductCuda
00050   { 1024,   512,   128},  // Kernel 17: axpyNormCuda
00051   { 1024,  2048,   128},  // Kernel 18: xmyNormCuda
00052   {  256,   256,   128},  // Kernel 19: cDotProductCuda
00053   {  256,   128,   128},  // Kernel 20: xpaycDotzyCuda
00054   {  256,  1024,  1024},  // Kernel 21: cDotProductNormACuda
00055   {  256,  1024,  1024},  // Kernel 22: cDotProductNormBCuda
00056   {  256,  1024,  1024}   // Kernel 23: caxpbypzYmbwcDotProductWYNormYCuda
00057 };