QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
inline_ptx.h
Go to the documentation of this file.
1 /*
2  Inline ptx instructions for low-level control of code generation.
3  Primarily these are for doing stores avoiding L1 cache and minimal
4  impact on L2 (streaming through L2).
5 */
6 
7 // Define a different pointer storage size for 64 and 32 bit
8 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
9 #define __PTR "l"
10 #else
11 #define __PTR "r"
12 #endif
13 
14 #if (__COMPUTE_CAPABILITY__ >= 200)
15 
16 namespace quda {
17 
18  __device__ inline void load_streaming_double2(double2 &a, const double2* addr)
19  {
20  double x, y;
21  asm("ld.cs.global.v2.f64 {%0, %1}, [%2+0];" : "=d"(x), "=d"(y) : __PTR(addr));
22  a.x = x; a.y = y;
23  }
24 
25  __device__ inline void load_streaming_float4(float4 &a, const float4* addr)
26  {
27  float x, y, z, w;
28  asm("ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
29  a.x = x; a.y = y; a.z = z; a.w = w;
30  }
31 
32  __device__ inline void load_global_float4(float4 &a, const float4* addr)
33  {
34  float x, y, z, w;
35  asm("ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
36  a.x = x; a.y = y; a.z = z; a.w = w;
37  }
38 
39  __device__ inline void store_streaming_float4(float4* addr, float x, float y, float z, float w)
40  {
41  asm("st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "f"(x), "f"(y), "f"(z), "f"(w));
42  }
43 
44  __device__ inline void store_streaming_short4(short4* addr, short x, short y, short z, short w)
45  {
46  asm("st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "h"(x), "h"(y), "h"(z), "h"(w));
47  }
48 
49  __device__ inline void store_streaming_double2(double2* addr, double x, double y)
50  {
51  asm("st.cs.global.v2.f64 [%0+0], {%1, %2};" :: __PTR(addr), "d"(x), "d"(y));
52  }
53 
54  __device__ inline void store_streaming_float2(float2* addr, float x, float y)
55  {
56  asm("st.cs.global.v2.f32 [%0+0], {%1, %2};" :: __PTR(addr), "f"(x), "f"(y));
57  }
58 
59  __device__ inline void store_streaming_short2(short2* addr, short x, short y)
60  {
61  asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: __PTR(addr), "h"(x), "h"(y));
62  }
63 
64 } // namespace quda
65 
66 #endif // COMPUTE_CAPABILITY
67 
68 
int y[4]
int x[4]
#define __PTR
Definition: inline_ptx.h:11