QUDA  0.9.0
inline_ptx.h
Go to the documentation of this file.
1 #pragma once
2 
3 /*
4  Inline ptx instructions for low-level control of code generation.
5  Primarily these are for doing stores avoiding L1 cache and minimal
6  impact on L2 (streaming through L2).
7 */
8 
9 // Define a different pointer storage size for 64 and 32 bit
10 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
11 #define __PTR "l"
12 #else
13 #define __PTR "r"
14 #endif
15 
16 namespace quda {
17 
18  __device__ inline void load_streaming_double2(double2 &a, const double2* addr)
19  {
20  double x, y;
21  asm("ld.cs.global.v2.f64 {%0, %1}, [%2+0];" : "=d"(x), "=d"(y) : __PTR(addr));
22  a.x = x; a.y = y;
23  }
24 
25  __device__ inline void load_streaming_float4(float4 &a, const float4* addr)
26  {
27  float x, y, z, w;
28  asm("ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
29  a.x = x; a.y = y; a.z = z; a.w = w;
30  }
31 
32  __device__ inline void load_global_float4(float4 &a, const float4* addr)
33  {
34  float x, y, z, w;
35  asm("ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
36  a.x = x; a.y = y; a.z = z; a.w = w;
37  }
38 
39  __device__ inline void store_streaming_float4(float4* addr, float x, float y, float z, float w)
40  {
41  asm("st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "f"(x), "f"(y), "f"(z), "f"(w));
42  }
43 
44  __device__ inline void store_streaming_short4(short4* addr, short x, short y, short z, short w)
45  {
46  asm("st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "h"(x), "h"(y), "h"(z), "h"(w));
47  }
48 
49  __device__ inline void store_streaming_double2(double2* addr, double x, double y)
50  {
51  asm("st.cs.global.v2.f64 [%0+0], {%1, %2};" :: __PTR(addr), "d"(x), "d"(y));
52  }
53 
54  __device__ inline void store_streaming_float2(float2* addr, float x, float y)
55  {
56  asm("st.cs.global.v2.f32 [%0+0], {%1, %2};" :: __PTR(addr), "f"(x), "f"(y));
57  }
58 
59  __device__ inline void store_streaming_short2(short2* addr, short x, short y)
60  {
61  asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: __PTR(addr), "h"(x), "h"(y));
62  }
63 
64 } // namespace quda
__device__ void load_global_float4(float4 &a, const float4 *addr)
Definition: inline_ptx.h:32
int int int w
__device__ void store_streaming_double2(double2 *addr, double x, double y)
Definition: inline_ptx.h:49
__device__ void store_streaming_float4(float4 *addr, float x, float y, float z, float w)
Definition: inline_ptx.h:39
__device__ void load_streaming_double2(double2 &a, const double2 *addr)
Definition: inline_ptx.h:18
__device__ void store_streaming_float2(float2 *addr, float x, float y)
Definition: inline_ptx.h:54
__device__ void store_streaming_short2(short2 *addr, short x, short y)
Definition: inline_ptx.h:59
#define __PTR
Definition: inline_ptx.h:13
__device__ void store_streaming_short4(short4 *addr, short x, short y, short z, short w)
Definition: inline_ptx.h:44
#define a
__device__ void load_streaming_float4(float4 &a, const float4 *addr)
Definition: inline_ptx.h:25