// QUDA v0.4.0
// A library for QCD on GPUs
#pragma once

/**
 * Number of scalar components in a (vector) type.
 *
 * The primary template returns 0 for any type without a specialization below.
 *
 * The explicit specializations are marked inline: unlike templates, fully
 * specialized function templates are ordinary functions, and defining them
 * non-inline in a header breaks the one-definition rule as soon as the header
 * is included from more than one translation unit.
 */
template <typename type> int vecLength() { return 0; }

template<> inline int vecLength<short>() { return 1; }
template<> inline int vecLength<float>() { return 1; }
template<> inline int vecLength<double>() { return 1; }

template<> inline int vecLength<short2>() { return 2; }
template<> inline int vecLength<float2>() { return 2; }
template<> inline int vecLength<double2>() { return 2; }

template<> inline int vecLength<short4>() { return 4; }
template<> inline int vecLength<float4>() { return 4; }
template<> inline int vecLength<double4>() { return 4; }

/**
 * Component-wise copy of b into a for vectors of equal length.
 *
 * Overloads cover float2/double2 and float4/double4 in every same-precision
 * and mixed-precision combination; mixed-precision overloads convert each
 * component implicitly when it is passed to make_floatN / make_doubleN.
 */
// same precision
__device__ inline void copyFloatN(float2 &a, const float2 &b) { a = b; }
__device__ inline void copyFloatN(double2 &a, const double2 &b) { a = make_double2(b.x, b.y); }

__device__ inline void copyFloatN(float4 &a, const float4 &b) { a = make_float4(b.x, b.y, b.z, b.w); }
__device__ inline void copyFloatN(double4 &a, const double4 &b) { a = make_double4(b.x, b.y, b.z, b.w); }

// mixed precision
__device__ inline void copyFloatN(float2 &a, const double2 &b) { a = make_float2(b.x, b.y); }
__device__ inline void copyFloatN(double2 &a, const float2 &b) { a = make_double2(b.x, b.y); }
__device__ inline void copyFloatN(float4 &a, const double4 &b) { a = make_float4(b.x, b.y, b.z, b.w); }
__device__ inline void copyFloatN(double4 &a, const float4 &b) { a = make_double4(b.x, b.y, b.z, b.w); }

/**
 * Convert the array y into the array x, changing precision and/or vector
 * length as selected by the template arguments.
 *
 * In every variant below the loop bounds are expressed in terms of the
 * output array: N is the number of OutputType elements in x.
 *  - Equal vector lengths:  x[j] is produced from y[j] for j in [0, N).
 *  - 2 -> 4 (packing):      x[j] is produced from y[2*j] and y[2*j+1],
 *                           so 2*N input elements are read.
 *  - 4 -> 2 (unpacking):    y[j] produces x[2*j] and x[2*j+1] for
 *                           j in [0, N/2); if N is odd, x[N-1] is not written.
 *
 * Conversions to or from the short types are plain implicit numeric
 * conversions of each component; no scaling is applied in this function.
 *
 * @param x Output array
 * @param y Input array
 * @param N Number of elements of x (see above)
 */
template<typename OutputType, typename InputType>
__device__ inline void convert(OutputType x[], InputType y[], const int N) {
  // default is one-2-one conversion, e.g., matching vector lengths and precisions
#pragma unroll
  for (int j=0; j<N; j++) copyFloatN(x[j], y[j]);
}

// short -> float with matching vector length

template<> __device__ inline void convert<float2,short2>(float2 x[], short2 y[], const int N) {
#pragma unroll
  for (int j=0; j<N; j++) x[j] = make_float2(y[j].x, y[j].y);
}

template<> __device__ inline void convert<float4,short4>(float4 x[], short4 y[], const int N) {
#pragma unroll
  for (int j=0; j<N; j++) x[j] = make_float4(y[j].x, y[j].y, y[j].z, y[j].w);
}

// 4 <-> 2 vector conversion

template<> __device__ inline void convert<double4,double2>(double4 x[], double2 y[], const int N) {
#pragma unroll
  for (int j=0; j<N; j++) x[j] = make_double4(y[2*j].x, y[2*j].y, y[2*j+1].x, y[2*j+1].y);
}

template<> __device__ inline void convert<double2,double4>(double2 x[], double4 y[], const int N) {
#pragma unroll
  for (int j=0; j<N/2; j++) {
    x[2*j] = make_double2(y[j].x, y[j].y);
    x[2*j+1] = make_double2(y[j].z, y[j].w);
  }
}

template<> __device__ inline void convert<float4,float2>(float4 x[], float2 y[], const int N) {
#pragma unroll
  for (int j=0; j<N; j++) x[j] = make_float4(y[2*j].x, y[2*j].y, y[2*j+1].x, y[2*j+1].y);
}

template<> __device__ inline void convert<float2,float4>(float2 x[], float4 y[], const int N) {
#pragma unroll
  for (int j=0; j<N/2; j++) {
    x[2*j] = make_float2(y[j].x, y[j].y);
    x[2*j+1] = make_float2(y[j].z, y[j].w);
  }
}

template<> __device__ inline void convert<short4,float2>(short4 x[], float2 y[], const int N) {
#pragma unroll
  for (int j=0; j<N; j++) x[j] = make_short4(y[2*j].x, y[2*j].y, y[2*j+1].x, y[2*j+1].y);
}

template<> __device__ inline void convert<float2,short4>(float2 x[], short4 y[], const int N) {
#pragma unroll
  for (int j=0; j<N/2; j++) {
    x[2*j] = make_float2(y[j].x, y[j].y);
    x[2*j+1] = make_float2(y[j].z, y[j].w);
  }
}

template<> __device__ inline void convert<float4,short2>(float4 x[], short2 y[], const int N) {
#pragma unroll
  for (int j=0; j<N; j++) x[j] = make_float4(y[2*j].x, y[2*j].y, y[2*j+1].x, y[2*j+1].y);
}

template<> __device__ inline void convert<short2,float4>(short2 x[], float4 y[], const int N) {
#pragma unroll
  for (int j=0; j<N/2; j++) {
    x[2*j] = make_short2(y[j].x, y[j].y);
    x[2*j+1] = make_short2(y[j].z, y[j].w);
  }
}

template<> __device__ inline void convert<short4,double2>(short4 x[], double2 y[], const int N) {
#pragma unroll
  for (int j=0; j<N; j++) x[j] = make_short4(y[2*j].x, y[2*j].y, y[2*j+1].x, y[2*j+1].y);
}

template<> __device__ inline void convert<double2,short4>(double2 x[], short4 y[], const int N) {
#pragma unroll
  for (int j=0; j<N/2; j++) {
    x[2*j] = make_double2(y[j].x, y[j].y);
    x[2*j+1] = make_double2(y[j].z, y[j].w);
  }
}

template<> __device__ inline void convert<double4,short2>(double4 x[], short2 y[], const int N) {
#pragma unroll
  for (int j=0; j<N; j++) x[j] = make_double4(y[2*j].x, y[2*j].y, y[2*j+1].x, y[2*j+1].y);
}

template<> __device__ inline void convert<short2,double4>(short2 x[], double4 y[], const int N) {
#pragma unroll
  for (int j=0; j<N/2; j++) {
    x[2*j] = make_short2(y[j].x, y[j].y);
    x[2*j+1] = make_short2(y[j].z, y[j].w);
  }
}

template<> __device__ inline void convert<float4,double2>(float4 x[], double2 y[], const int N) {
#pragma unroll
  for (int j=0; j<N; j++) x[j] = make_float4(y[2*j].x, y[2*j].y, y[2*j+1].x, y[2*j+1].y);
}

template<> __device__ inline void convert<double2,float4>(double2 x[], float4 y[], const int N) {
#pragma unroll
  for (int j=0; j<N/2; j++) {
    x[2*j] = make_double2(y[j].x, y[j].y);
    x[2*j+1] = make_double2(y[j].z, y[j].w);
  }
}

template<> __device__ inline void convert<double4,float2>(double4 x[], float2 y[], const int N) {
#pragma unroll
  for (int j=0; j<N; j++) x[j] = make_double4(y[2*j].x, y[2*j].y, y[2*j+1].x, y[2*j+1].y);
}

/**
 * Specialization of convert() that unpacks double4 input into float2 output.
 *
 * Each input element y[j] yields two outputs: its (x, y) components go to
 * x[2*j] and its (z, w) components go to x[2*j+1].  Components are narrowed
 * from double to float by the implicit conversion at the make_float2 call.
 *
 * The loop runs N/2 times, so it reads y[0 .. N/2-1] and writes
 * x[0 .. 2*(N/2)-1]; if N is odd, x[N-1] is left untouched.
 *
 * @param x Output array of float2
 * @param y Input array of double4
 * @param N Number of elements of x to fill (expected to be even -- see above)
 */
template<> __device__ inline void convert<float2,double4>(float2 x[], double4 y[], const int N) {
#pragma unroll
  for (int j=0; j<N/2; j++) {
    x[2*j] = make_float2(y[j].x, y[j].y);
    x[2*j+1] = make_float2(y[j].z, y[j].w);
  }
}