// QUDA v0.3.2: A library for QCD on GPUs
//
// Auto-tuned blas CUDA parameters, generated by blas_test
//
// Two parallel lookup tables, both indexed as [kernel][column]:
//   - the row index is the kernel number given in each row's comment (0..23);
//   - NOTE(review): the three columns presumably select a precision variant
//     of the kernel -- the column meaning is not visible in this file, so
//     confirm against the code that reads these tables.
//
// The arrays are intentionally left non-const: code outside this file may
// overwrite entries (e.g. while re-tuning) -- TODO confirm before adding const.
//

// Per-kernel "threads" tuning values (presumably the CUDA thread-block size).
static int blas_threads[24][3] = {
  {  64,   32,   32},  // Kernel  0: copyCuda (high source precision)
  {  32,   32,   32},  // Kernel  1: copyCuda (low source precision)
  {  64,   64,   64},  // Kernel  2: axpbyCuda
  {  64,   64,   64},  // Kernel  3: xpyCuda
  {  64,   64,   64},  // Kernel  4: axpyCuda
  {  64,   64,   64},  // Kernel  5: xpayCuda
  {  64,   64,   64},  // Kernel  6: mxpyCuda
  {  64,   64,   64},  // Kernel  7: axCuda
  {  64,   64,   64},  // Kernel  8: caxpyCuda
  {  64,   64,   64},  // Kernel  9: caxpbyCuda
  {  64,   64,   64},  // Kernel 10: cxpaypbzCuda
  {  64,   32,  480},  // Kernel 11: axpyBzpcxCuda
  { 192,   32,  480},  // Kernel 12: axpyZpbxCuda
  {  64,   64,  128},  // Kernel 13: caxpbypzYmbwCuda
  {  64,  128,   64},  // Kernel 14: sumCuda
  {  64,  128,   64},  // Kernel 15: normCuda
  {  64,   64,   64},  // Kernel 16: reDotProductCuda
  {  64,  256,   64},  // Kernel 17: axpyNormCuda
  {  64,  128,   64},  // Kernel 18: xmyNormCuda
  {  64,  128,   32},  // Kernel 19: cDotProductCuda
  {  64,   64,   32},  // Kernel 20: xpaycDotzyCuda
  {  64,   64,   64},  // Kernel 21: cDotProductNormACuda
  {  64,   64,   64},  // Kernel 22: cDotProductNormBCuda
  {  64,   64,   64}   // Kernel 23: caxpbypzYmbwcDotProductWYNormYCuda
};

// Per-kernel "blocks" tuning values (presumably the CUDA grid size); rows and
// columns line up one-to-one with blas_threads above.
static int blas_blocks[24][3] = {
  {  256, 65536, 65536},  // Kernel  0: copyCuda (high source precision)
  {32768,  2048, 16384},  // Kernel  1: copyCuda (low source precision)
  { 4096,   128,   128},  // Kernel  2: axpbyCuda
  { 4096,   128,   128},  // Kernel  3: xpyCuda
  { 4096,   128,   128},  // Kernel  4: axpyCuda
  { 4096,   128,   128},  // Kernel  5: xpayCuda
  { 4096,   128,   128},  // Kernel  6: mxpyCuda
  { 4096,   128,   128},  // Kernel  7: axCuda
  { 1024, 65536, 32768},  // Kernel  8: caxpyCuda
  { 1024, 65536, 65536},  // Kernel  9: caxpbyCuda
  { 4096,   128, 65536},  // Kernel 10: cxpaypbzCuda
  {  256,   128,  8192},  // Kernel 11: axpyBzpcxCuda
  { 2048,   128,  8192},  // Kernel 12: axpyZpbxCuda
  { 4096,   128, 32768},  // Kernel 13: caxpbypzYmbwCuda
  {  128,   128,   128},  // Kernel 14: sumCuda
  {  128,   128,   128},  // Kernel 15: normCuda
  {  128,   128,  2048},  // Kernel 16: reDotProductCuda
  { 1024,   512,   128},  // Kernel 17: axpyNormCuda
  { 1024,  2048,   128},  // Kernel 18: xmyNormCuda
  {  256,   256,   128},  // Kernel 19: cDotProductCuda
  {  256,   128,   128},  // Kernel 20: xpaycDotzyCuda
  {  256,  1024,  1024},  // Kernel 21: cDotProductNormACuda
  {  256,  1024,  1024},  // Kernel 22: cDotProductNormBCuda
  {  256,  1024,  1024}   // Kernel 23: caxpbypzYmbwcDotProductWYNormYCuda
};
1.7.3