QUDA: quda/include/inline_ptx.h Source File

QUDA v0.4.0
A library for QCD on GPUs
00001 /*
00002   Inline ptx instructions for low-level control of code generation.
00003   Primarily these are for doing stores avoiding L1 cache and minimal
00004   impact on L2 (streaming through L2).
00005 */
00006 
00007 #if (__COMPUTE_CAPABILITY__ >= 200)
00008 
00009 #if (POINTER_SIZE==8) // 64-bit pointers
00010 
00011 __device__ inline void load_streaming_float4(float4 &a, const float4* addr)
00012 {
00013   float x, y, z, w;
00014   asm("ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : "l"(addr));
00015   a.x = x; a.y = y; a.z = z; a.w = w;
00016 }
00017 
00018 __device__ inline void load_global_float4(float4 &a, const float4* addr)
00019 {
00020   float x, y, z, w;
00021   asm("ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : "l"(addr));
00022   a.x = x; a.y = y; a.z = z; a.w = w;
00023 }
00024 
00025 __device__ inline void store_streaming_float4(float4* addr, float x, float y, float z, float w)
00026 {
00027   asm("st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};" :: "l"(addr), "f"(x), "f"(y), "f"(z), "f"(w));
00028 }
00029 
00030 __device__ inline void store_streaming_short4(short4* addr, short x, short y, short z, short w)
00031 {
00032   asm("st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};" :: "l"(addr), "h"(x), "h"(y), "h"(z), "h"(w));
00033 }
00034 
00035 __device__ inline void store_streaming_double2(double2* addr, double x, double y)
00036 {
00037   asm("st.cs.global.v2.f64 [%0+0], {%1, %2};" :: "l"(addr), "d"(x), "d"(y));
00038 }
00039 
00040 __device__ inline void store_streaming_float2(float2* addr, float x, float y)
00041 {
00042   asm("st.cs.global.v2.f32 [%0+0], {%1, %2};" :: "l"(addr), "f"(x), "f"(y));
00043 }
00044 
00045 __device__ inline void store_streaming_short2(short2* addr, short x, short y)
00046 {
00047   asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: "l"(addr), "h"(x), "h"(y));
00048 }
00049 
00050 #else // 32-bit pointers
00051 
00052 __device__ inline void load_streaming_float4(float4 &a, const float4* addr)
00053 {
00054   float x, y, z, w;
00055   asm("ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : "r"(addr));
00056   a.x = x; a.y = y; a.z = z; a.w = w;
00057 }
00058 
00059 __device__ inline void load_global_float4(float4 &a, const float4* addr)
00060 {
00061   float x, y, z, w;
00062   asm("ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : "r"(addr));
00063   a.x = x; a.y = y; a.z = z; a.w = w;
00064 }
00065 
00066 __device__ inline void store_streaming_float4(float4* addr, float x, float y, float z, float w)
00067 {
00068   asm("st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};" :: "r"(addr), "f"(x), "f"(y), "f"(z), "f"(w));
00069 }
00070 
00071 __device__ inline void store_streaming_short4(short4* addr, short x, short y, short z, short w)
00072 {
00073   asm("st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};" :: "r"(addr), "h"(x), "h"(y), "h"(z), "h"(w));
00074 }
00075 
00076 __device__ inline void store_streaming_double2(double2* addr, double x, double y)
00077 {
00078   asm("st.cs.global.v2.f64 [%0+0], {%1, %2};" :: "r"(addr), "d"(x), "d"(y));
00079 }
00080 
00081 __device__ inline void store_streaming_float2(float2* addr, float x, float y)
00082 {
00083   asm("st.cs.global.v2.f32 [%0+0], {%1, %2};" :: "r"(addr), "f"(x), "f"(y));
00084 }
00085 
00086 __device__ inline void store_streaming_short2(short2* addr, short x, short y)
00087 {
00088   asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: "r"(addr), "h"(x), "h"(y));
00089 }
00090 
00091 #endif // POINTER_SIZE
00092 
00093 #endif // COMPUTE_CAPABILITY
00094 
00095