quda-ref/v1.1.0/inline__ptx_8h_source.html

 #pragma once


 /*
   Inline PTX instructions for low-level control of code generation.
   Primarily these are for doing stores that avoid the L1 cache and have
   minimal impact on L2 (streaming through L2).
 */


 // Define a different pointer storage size for 64 and 32 bit.
 // __PTR is the inline-asm constraint string used for pointer operands:
 // "l" when building with MSVC for _WIN64 or when __LP64__ is defined,
 // "r" otherwise. The asm statements in this file write __PTR(addr), which
 // the preprocessor expands to "l"(addr) or "r"(addr).

 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)

 #define __PTR   "l"

 #else

 #define __PTR   "r"

 #endif


 namespace quda {


   // If you're bored...

   // http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st


   /**
    * @brief Load a double2 from global memory with a single
    *        `ld.cs.global.v2.f64` instruction (PTX .cs cache operator).
    *
    * @param[out] a    Receives the two loaded components (x, then y)
    * @param[in]  addr Address operand of the load
    *
    * NOTE(review): the asm is not volatile and has no "memory" clobber, so
    * the compiler is not told that this statement reads memory - confirm
    * callers do not depend on ordering against earlier stores to addr.
    */
   __device__ inline void load_streaming_double2(double2 &a, const double2* addr)
   {
     // The asm outputs are bound straight to the destination's components.
     asm("ld.cs.global.v2.f64 {%0, %1}, [%2+0];"
         : "=d"(a.x), "=d"(a.y)
         : __PTR(addr));
   }


   /**
    * @brief Load a float4 from global memory with a single
    *        `ld.cs.global.v4.f32` instruction (PTX .cs cache operator).
    *
    * @param[out] a    Receives the four loaded components (x, y, z, w)
    * @param[in]  addr Address operand of the load
    *
    * NOTE(review): the asm is not volatile and has no "memory" clobber, so
    * the compiler is not told that this statement reads memory - confirm
    * callers do not depend on ordering against earlier stores to addr.
    */
   __device__ inline void load_streaming_float4(float4 &a, const float4* addr)
   {
     // The asm outputs are bound straight to the destination's components.
     asm("ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];"
         : "=f"(a.x), "=f"(a.y), "=f"(a.z), "=f"(a.w)
         : __PTR(addr));
   }


   /**
    * @brief Load a short4 from global memory with a single
    *        `ld.ca.global.v4.s16` instruction (PTX .ca cache operator).
    *
    * @param[out] a    Receives the four loaded components (x, y, z, w)
    * @param[in]  addr Address operand of the load
    *
    * NOTE(review): the asm is not volatile and has no "memory" clobber, so
    * the compiler is not told that this statement reads memory - confirm
    * callers do not depend on ordering against earlier stores to addr.
    */
   __device__ inline void load_cached_short4(short4 &a, const short4 *addr)
   {
     // The asm outputs are bound straight to the destination's components.
     asm("ld.ca.global.v4.s16 {%0, %1, %2, %3}, [%4+0];"
         : "=h"(a.x), "=h"(a.y), "=h"(a.z), "=h"(a.w)
         : __PTR(addr));
   }


   /**
    * @brief Load a short2 from global memory with a single
    *        `ld.ca.global.v2.s16` instruction (PTX .ca cache operator).
    *
    * @param[out] a    Receives the two loaded components (x, then y)
    * @param[in]  addr Address operand of the load
    *
    * NOTE(review): the asm is not volatile and has no "memory" clobber, so
    * the compiler is not told that this statement reads memory - confirm
    * callers do not depend on ordering against earlier stores to addr.
    */
   __device__ inline void load_cached_short2(short2 &a, const short2 *addr)
   {
     // The asm outputs are bound straight to the destination's components.
     asm("ld.ca.global.v2.s16 {%0, %1}, [%2+0];"
         : "=h"(a.x), "=h"(a.y)
         : __PTR(addr));
   }


   /**
    * @brief Load a short4 from global memory with a single
    *        `ld.cg.global.v4.s16` instruction (PTX .cg cache operator).
    *
    * @param[out] a    Receives the four loaded components (x, y, z, w)
    * @param[in]  addr Address operand of the load
    *
    * NOTE(review): the asm is not volatile and has no "memory" clobber, so
    * the compiler is not told that this statement reads memory - confirm
    * callers do not depend on ordering against earlier stores to addr.
    */
   __device__ inline void load_global_short4(short4 &a, const short4 *addr)
   {
     // The asm outputs are bound straight to the destination's components.
     asm("ld.cg.global.v4.s16 {%0, %1, %2, %3}, [%4+0];"
         : "=h"(a.x), "=h"(a.y), "=h"(a.z), "=h"(a.w)
         : __PTR(addr));
   }


   /**
    * @brief Load a short2 from global memory with a single
    *        `ld.cg.global.v2.s16` instruction (PTX .cg cache operator).
    *
    * @param[out] a    Receives the two loaded components (x, then y)
    * @param[in]  addr Address operand of the load
    *
    * NOTE(review): the asm is not volatile and has no "memory" clobber, so
    * the compiler is not told that this statement reads memory - confirm
    * callers do not depend on ordering against earlier stores to addr.
    */
   __device__ inline void load_global_short2(short2 &a, const short2 *addr)
   {
     // The asm outputs are bound straight to the destination's components.
     asm("ld.cg.global.v2.s16 {%0, %1}, [%2+0];"
         : "=h"(a.x), "=h"(a.y)
         : __PTR(addr));
   }


   /**
    * @brief Load a float4 from global memory with a single
    *        `ld.cg.global.v4.f32` instruction (PTX .cg cache operator).
    *
    * @param[out] a    Receives the four loaded components (x, y, z, w)
    * @param[in]  addr Address operand of the load
    *
    * NOTE(review): the asm is not volatile and has no "memory" clobber, so
    * the compiler is not told that this statement reads memory - confirm
    * callers do not depend on ordering against earlier stores to addr.
    */
   __device__ inline void load_global_float4(float4 &a, const float4* addr)
   {
     // The asm outputs are bound straight to the destination's components.
     asm("ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];"
         : "=f"(a.x), "=f"(a.y), "=f"(a.z), "=f"(a.w)
         : __PTR(addr));
   }


   /**
    * @brief Store four floats to global memory with a single
    *        `st.cs.global.v4.f32` instruction (PTX .cs cache operator).
    *
    * @param[out] addr Address operand of the store.
    *                  NOTE(review): a v4.f32 access presumably needs addr to
    *                  be 16-byte aligned - confirm against callers.
    * @param[in]  x    First component written
    * @param[in]  y    Second component written
    * @param[in]  z    Third component written
    * @param[in]  w    Fourth component written
    */
   __device__ inline void store_streaming_float4(float4* addr, float x, float y, float z, float w)
   {
     // The statement has no output operands yet writes through addr.
     // `volatile` keeps it from being deleted or moved, and the "memory"
     // clobber tells the compiler that memory changed, so surrounding
     // loads/stores are neither reordered across it nor served from stale
     // register copies.
     asm volatile("st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};"
                  :
                  : __PTR(addr), "f"(x), "f"(y), "f"(z), "f"(w)
                  : "memory");
   }


   /**
    * @brief Store four shorts to global memory with a single
    *        `st.cs.global.v4.s16` instruction (PTX .cs cache operator).
    *
    * @param[out] addr Address operand of the store.
    *                  NOTE(review): a v4.s16 access presumably needs addr to
    *                  be 8-byte aligned - confirm against callers.
    * @param[in]  x    First component written
    * @param[in]  y    Second component written
    * @param[in]  z    Third component written
    * @param[in]  w    Fourth component written
    */
   __device__ inline void store_streaming_short4(short4* addr, short x, short y, short z, short w)
   {
     // The statement has no output operands yet writes through addr.
     // `volatile` keeps it from being deleted or moved, and the "memory"
     // clobber tells the compiler that memory changed, so surrounding
     // loads/stores are neither reordered across it nor served from stale
     // register copies.
     asm volatile("st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};"
                  :
                  : __PTR(addr), "h"(x), "h"(y), "h"(z), "h"(w)
                  : "memory");
   }


   /**
    * @brief Store two doubles to global memory with a single
    *        `st.cs.global.v2.f64` instruction (PTX .cs cache operator).
    *
    * @param[out] addr Address operand of the store.
    *                  NOTE(review): a v2.f64 access presumably needs addr to
    *                  be 16-byte aligned - confirm against callers.
    * @param[in]  x    First component written
    * @param[in]  y    Second component written
    */
   __device__ inline void store_streaming_double2(double2* addr, double x, double y)
   {
     // The statement has no output operands yet writes through addr.
     // `volatile` keeps it from being deleted or moved, and the "memory"
     // clobber tells the compiler that memory changed, so surrounding
     // loads/stores are neither reordered across it nor served from stale
     // register copies.
     asm volatile("st.cs.global.v2.f64 [%0+0], {%1, %2};"
                  :
                  : __PTR(addr), "d"(x), "d"(y)
                  : "memory");
   }


   /**
    * @brief Store two floats to global memory with a single
    *        `st.cs.global.v2.f32` instruction (PTX .cs cache operator).
    *
    * @param[out] addr Address operand of the store.
    *                  NOTE(review): a v2.f32 access presumably needs addr to
    *                  be 8-byte aligned - confirm against callers.
    * @param[in]  x    First component written
    * @param[in]  y    Second component written
    */
   __device__ inline void store_streaming_float2(float2* addr, float x, float y)
   {
     // The statement has no output operands yet writes through addr.
     // `volatile` keeps it from being deleted or moved, and the "memory"
     // clobber tells the compiler that memory changed, so surrounding
     // loads/stores are neither reordered across it nor served from stale
     // register copies.
     asm volatile("st.cs.global.v2.f32 [%0+0], {%1, %2};"
                  :
                  : __PTR(addr), "f"(x), "f"(y)
                  : "memory");
   }


   /**
    * @brief Store two shorts to global memory with a single
    *        `st.cs.global.v2.s16` instruction (PTX .cs cache operator).
    *
    * @param[out] addr Address operand of the store.
    *                  NOTE(review): a v2.s16 access presumably needs addr to
    *                  be 4-byte aligned - confirm against callers.
    * @param[in]  x    First component written
    * @param[in]  y    Second component written
    */
   __device__ inline void store_streaming_short2(short2* addr, short x, short y)
   {
     // The statement has no output operands yet writes through addr.
     // `volatile` keeps it from being deleted or moved, and the "memory"
     // clobber tells the compiler that memory changed, so surrounding
     // loads/stores are neither reordered across it nor served from stale
     // register copies.
     asm volatile("st.cs.global.v2.s16 [%0+0], {%1, %2};"
                  :
                  : __PTR(addr), "h"(x), "h"(y)
                  : "memory");
   }


 } // namespace quda

__PTR
#define __PTR
Definition: inline_ptx.h:13

quda
Definition: blas_lapack.h:24

quda::load_cached_short4
__device__ void load_cached_short4(short4 &a, const short4 *addr)
Definition: inline_ptx.h:35

quda::store_streaming_float2
__device__ void store_streaming_float2(float2 *addr, float x, float y)
Definition: inline_ptx.h:93

quda::store_streaming_float4
__device__ void store_streaming_float4(float4 *addr, float x, float y, float z, float w)
Definition: inline_ptx.h:78

quda::store_streaming_double2
__device__ void store_streaming_double2(double2 *addr, double x, double y)
Definition: inline_ptx.h:88

quda::load_streaming_double2
__device__ void load_streaming_double2(double2 &a, const double2 *addr)
Definition: inline_ptx.h:21

quda::load_global_short2
__device__ void load_global_short2(short2 &a, const short2 *addr)
Definition: inline_ptx.h:63

quda::load_global_short4
__device__ void load_global_short4(short4 &a, const short4 *addr)
Definition: inline_ptx.h:53

quda::store_streaming_short4
__device__ void store_streaming_short4(short4 *addr, short x, short y, short z, short w)
Definition: inline_ptx.h:83

quda::store_streaming_short2
__device__ void store_streaming_short2(short2 *addr, short x, short y)
Definition: inline_ptx.h:98

quda::load_streaming_float4
__device__ void load_streaming_float4(float4 &a, const float4 *addr)
Definition: inline_ptx.h:28

quda::load_global_float4
__device__ void load_global_float4(float4 &a, const float4 *addr)
Definition: inline_ptx.h:71

quda::load_cached_short2
__device__ void load_cached_short2(short2 &a, const short2 *addr)
Definition: inline_ptx.h:45