QUDA v0.4.0
A library for QCD on GPUs
|
00001 /* 00002 Inline ptx instructions for low-level control of code generation. 00003 Primarily these are for doing stores avoiding L1 cache and minimal 00004 impact on L2 (streaming through L2). 00005 */ 00006 00007 #if (__COMPUTE_CAPABILITY__ >= 200) 00008 00009 #if (POINTER_SIZE==8) // 64-bit pointers 00010 00011 __device__ inline void load_streaming_float4(float4 &a, const float4* addr) 00012 { 00013 float x, y, z, w; 00014 asm("ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : "l"(addr)); 00015 a.x = x; a.y = y; a.z = z; a.w = w; 00016 } 00017 00018 __device__ inline void load_global_float4(float4 &a, const float4* addr) 00019 { 00020 float x, y, z, w; 00021 asm("ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : "l"(addr)); 00022 a.x = x; a.y = y; a.z = z; a.w = w; 00023 } 00024 00025 __device__ inline void store_streaming_float4(float4* addr, float x, float y, float z, float w) 00026 { 00027 asm("st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};" :: "l"(addr), "f"(x), "f"(y), "f"(z), "f"(w)); 00028 } 00029 00030 __device__ inline void store_streaming_short4(short4* addr, short x, short y, short z, short w) 00031 { 00032 asm("st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};" :: "l"(addr), "h"(x), "h"(y), "h"(z), "h"(w)); 00033 } 00034 00035 __device__ inline void store_streaming_double2(double2* addr, double x, double y) 00036 { 00037 asm("st.cs.global.v2.f64 [%0+0], {%1, %2};" :: "l"(addr), "d"(x), "d"(y)); 00038 } 00039 00040 __device__ inline void store_streaming_float2(float2* addr, float x, float y) 00041 { 00042 asm("st.cs.global.v2.f32 [%0+0], {%1, %2};" :: "l"(addr), "f"(x), "f"(y)); 00043 } 00044 00045 __device__ inline void store_streaming_short2(short2* addr, short x, short y) 00046 { 00047 asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: "l"(addr), "h"(x), "h"(y)); 00048 } 00049 00050 #else // 32-bit pointers 00051 00052 __device__ inline void load_streaming_float4(float4 &a, const float4* addr) 00053 { 00054 float x, y, z, w; 00055 asm("ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : "r"(addr)); 00056 a.x = x; a.y = y; a.z = z; a.w = w; 00057 } 00058 00059 __device__ inline void load_global_float4(float4 &a, const float4* addr) 00060 { 00061 float x, y, z, w; 00062 asm("ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : "r"(addr)); 00063 a.x = x; a.y = y; a.z = z; a.w = w; 00064 } 00065 00066 __device__ inline void store_streaming_float4(float4* addr, float x, float y, float z, float w) 00067 { 00068 asm("st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};" :: "r"(addr), "f"(x), "f"(y), "f"(z), "f"(w)); 00069 } 00070 00071 __device__ inline void store_streaming_short4(short4* addr, short x, short y, short z, short w) 00072 { 00073 asm("st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};" :: "r"(addr), "h"(x), "h"(y), "h"(z), "h"(w)); 00074 } 00075 00076 __device__ inline void store_streaming_double2(double2* addr, double x, double y) 00077 { 00078 asm("st.cs.global.v2.f64 [%0+0], {%1, %2};" :: "r"(addr), "d"(x), "d"(y)); 00079 } 00080 00081 __device__ inline void store_streaming_float2(float2* addr, float x, float y) 00082 { 00083 asm("st.cs.global.v2.f32 [%0+0], {%1, %2};" :: "r"(addr), "f"(x), "f"(y)); 00084 } 00085 00086 __device__ inline void store_streaming_short2(short2* addr, short x, short y) 00087 { 00088 asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: "r"(addr), "h"(x), "h"(y)); 00089 } 00090 00091 #endif // POINTER_SIZE 00092 00093 #endif // COMPUTE_CAPABILITY 00094 00095