QUDA  v1.1.0
A library for QCD on GPUs
inline_ptx.h
Go to the documentation of this file.
1 #pragma once
2 
3 /*
4  Inline ptx instructions for low-level control of code generation.
5  Primarily these are for doing stores avoiding L1 cache and minimal
6  impact on L2 (streaming through L2).
7 */
8 
9 // Define a different pointer storage size for 64 and 32 bit
10 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
11 #define __PTR "l"
12 #else
13 #define __PTR "r"
14 #endif
15 
16 namespace quda {
17 
18  // If you're bored...
19  // http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st
20 
21  __device__ inline void load_streaming_double2(double2 &a, const double2* addr)
22  {
23  double x, y;
24  asm("ld.cs.global.v2.f64 {%0, %1}, [%2+0];" : "=d"(x), "=d"(y) : __PTR(addr));
25  a.x = x; a.y = y;
26  }
27 
28  __device__ inline void load_streaming_float4(float4 &a, const float4* addr)
29  {
30  float x, y, z, w;
31  asm("ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
32  a.x = x; a.y = y; a.z = z; a.w = w;
33  }
34 
35  __device__ inline void load_cached_short4(short4 &a, const short4 *addr)
36  {
37  short x, y, z, w;
38  asm("ld.ca.global.v4.s16 {%0, %1, %2, %3}, [%4+0];" : "=h"(x), "=h"(y), "=h"(z), "=h"(w) : __PTR(addr));
39  a.x = x;
40  a.y = y;
41  a.z = z;
42  a.w = w;
43  }
44 
45  __device__ inline void load_cached_short2(short2 &a, const short2 *addr)
46  {
47  short x, y;
48  asm("ld.ca.global.v2.s16 {%0, %1}, [%2+0];" : "=h"(x), "=h"(y) : __PTR(addr));
49  a.x = x;
50  a.y = y;
51  }
52 
53  __device__ inline void load_global_short4(short4 &a, const short4 *addr)
54  {
55  short x, y, z, w;
56  asm("ld.cg.global.v4.s16 {%0, %1, %2, %3}, [%4+0];" : "=h"(x), "=h"(y), "=h"(z), "=h"(w) : __PTR(addr));
57  a.x = x;
58  a.y = y;
59  a.z = z;
60  a.w = w;
61  }
62 
63  __device__ inline void load_global_short2(short2 &a, const short2 *addr)
64  {
65  short x, y;
66  asm("ld.cg.global.v2.s16 {%0, %1}, [%2+0];" : "=h"(x), "=h"(y) : __PTR(addr));
67  a.x = x;
68  a.y = y;
69  }
70 
71  __device__ inline void load_global_float4(float4 &a, const float4* addr)
72  {
73  float x, y, z, w;
74  asm("ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
75  a.x = x; a.y = y; a.z = z; a.w = w;
76  }
77 
78  __device__ inline void store_streaming_float4(float4* addr, float x, float y, float z, float w)
79  {
80  asm("st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "f"(x), "f"(y), "f"(z), "f"(w));
81  }
82 
83  __device__ inline void store_streaming_short4(short4* addr, short x, short y, short z, short w)
84  {
85  asm("st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), "h"(x), "h"(y), "h"(z), "h"(w));
86  }
87 
88  __device__ inline void store_streaming_double2(double2* addr, double x, double y)
89  {
90  asm("st.cs.global.v2.f64 [%0+0], {%1, %2};" :: __PTR(addr), "d"(x), "d"(y));
91  }
92 
93  __device__ inline void store_streaming_float2(float2* addr, float x, float y)
94  {
95  asm("st.cs.global.v2.f32 [%0+0], {%1, %2};" :: __PTR(addr), "f"(x), "f"(y));
96  }
97 
98  __device__ inline void store_streaming_short2(short2* addr, short x, short y)
99  {
100  asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: __PTR(addr), "h"(x), "h"(y));
101  }
102 
103 } // namespace quda
#define __PTR
Definition: inline_ptx.h:13
__device__ void load_cached_short4(short4 &a, const short4 *addr)
Definition: inline_ptx.h:35
__device__ void store_streaming_float2(float2 *addr, float x, float y)
Definition: inline_ptx.h:93
__device__ void store_streaming_float4(float4 *addr, float x, float y, float z, float w)
Definition: inline_ptx.h:78
__device__ void store_streaming_double2(double2 *addr, double x, double y)
Definition: inline_ptx.h:88
__device__ void load_streaming_double2(double2 &a, const double2 *addr)
Definition: inline_ptx.h:21
__device__ void load_global_short2(short2 &a, const short2 *addr)
Definition: inline_ptx.h:63
__device__ void load_global_short4(short4 &a, const short4 *addr)
Definition: inline_ptx.h:53
__device__ void store_streaming_short4(short4 *addr, short x, short y, short z, short w)
Definition: inline_ptx.h:83
__device__ void store_streaming_short2(short2 *addr, short x, short y)
Definition: inline_ptx.h:98
__device__ void load_streaming_float4(float4 &a, const float4 *addr)
Definition: inline_ptx.h:28
__device__ void load_global_float4(float4 &a, const float4 *addr)
Definition: inline_ptx.h:71
__device__ void load_cached_short2(short2 &a, const short2 *addr)
Definition: inline_ptx.h:45