10 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
// Inline PTX: vector load of two f64 values from the global state space at
// the address in operand %2, using the .cs cache operator (cache-streaming
// in the PTX ISA: data expected to be accessed once).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (16 bytes here); nothing in this statement checks that.
// NOTE(review): this is plain asm, not asm volatile, and it lists no memory
// clobber - confirm the compiler cannot reorder or merge it with
// surrounding memory accesses.
24 asm(
"ld.cs.global.v2.f64 {%0, %1}, [%2+0];" :
"=d"(x), // %0: first f64 element, written to x
"=d"(y) : // %1: second f64 element, written to y
__PTR(addr)); // %2: source address; __PTR is defined outside this view (presumably the pointer-width register constraint - confirm)
// Inline PTX: vector load of four f32 values from the global state space at
// the address in operand %4, using the .cs cache operator (cache-streaming
// in the PTX ISA: data expected to be accessed once).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (16 bytes here); nothing in this statement checks that.
// NOTE(review): plain asm without volatile and without a memory clobber -
// confirm the compiler cannot reorder or merge it with nearby accesses.
31 asm(
"ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" :
"=f"(x), // %0: element 0
"=f"(y), // %1: element 1
"=f"(z), // %2: element 2
"=f"(w) : // %3: element 3
__PTR(addr)); // %4: source address; __PTR is defined outside this view (presumably the pointer-width register constraint - confirm)
// Copy the four loaded scalars into the matching fields of a.
32 a.x = x; a.y = y; a.z = z; a.w = w;
// Inline PTX: vector load of four s16 values from the global state space at
// the address in operand %4, using the .ca cache operator (cache at all
// levels in the PTX ISA).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (8 bytes here); nothing in this statement checks that.
// NOTE(review): plain asm without volatile and without a memory clobber -
// confirm the compiler cannot reorder or merge it with nearby accesses.
38 asm(
"ld.ca.global.v4.s16 {%0, %1, %2, %3}, [%4+0];" :
"=h"(x), // %0: element 0, bound to a 16-bit register
"=h"(y), // %1: element 1
"=h"(z), // %2: element 2
"=h"(w) : // %3: element 3
__PTR(addr)); // %4: source address; __PTR is defined outside this view (presumably the pointer-width register constraint - confirm)
// Inline PTX: vector load of two s16 values from the global state space at
// the address in operand %2, using the .ca cache operator (cache at all
// levels in the PTX ISA).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (4 bytes here); nothing in this statement checks that.
// NOTE(review): plain asm without volatile and without a memory clobber -
// confirm the compiler cannot reorder or merge it with nearby accesses.
48 asm(
"ld.ca.global.v2.s16 {%0, %1}, [%2+0];" :
"=h"(x), // %0: element 0, bound to a 16-bit register
"=h"(y) : // %1: element 1
__PTR(addr)); // %2: source address; __PTR is defined outside this view (presumably the pointer-width register constraint - confirm)
// Inline PTX: vector load of four s16 values from the global state space at
// the address in operand %4, using the .cg cache operator (cache at global
// level in the PTX ISA, i.e. L2 and below rather than L1).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (8 bytes here); nothing in this statement checks that.
// NOTE(review): plain asm without volatile and without a memory clobber -
// confirm the compiler cannot reorder or merge it with nearby accesses.
56 asm(
"ld.cg.global.v4.s16 {%0, %1, %2, %3}, [%4+0];" :
"=h"(x), // %0: element 0, bound to a 16-bit register
"=h"(y), // %1: element 1
"=h"(z), // %2: element 2
"=h"(w) : // %3: element 3
__PTR(addr)); // %4: source address; __PTR is defined outside this view (presumably the pointer-width register constraint - confirm)
// Inline PTX: vector load of two s16 values from the global state space at
// the address in operand %2, using the .cg cache operator (cache at global
// level in the PTX ISA, i.e. L2 and below rather than L1).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (4 bytes here); nothing in this statement checks that.
// NOTE(review): plain asm without volatile and without a memory clobber -
// confirm the compiler cannot reorder or merge it with nearby accesses.
66 asm(
"ld.cg.global.v2.s16 {%0, %1}, [%2+0];" :
"=h"(x), // %0: element 0, bound to a 16-bit register
"=h"(y) : // %1: element 1
__PTR(addr)); // %2: source address; __PTR is defined outside this view (presumably the pointer-width register constraint - confirm)
// Inline PTX: vector load of four f32 values from the global state space at
// the address in operand %4, using the .cg cache operator (cache at global
// level in the PTX ISA, i.e. L2 and below rather than L1).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (16 bytes here); nothing in this statement checks that.
// NOTE(review): plain asm without volatile and without a memory clobber -
// confirm the compiler cannot reorder or merge it with nearby accesses.
74 asm(
"ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" :
"=f"(x), // %0: element 0
"=f"(y), // %1: element 1
"=f"(z), // %2: element 2
"=f"(w) : // %3: element 3
__PTR(addr)); // %4: source address; __PTR is defined outside this view (presumably the pointer-width register constraint - confirm)
// Copy the four loaded scalars into the matching fields of a.
75 a.x = x; a.y = y; a.z = z; a.w = w;
// Inline PTX: vector store of four f32 values (x, y, z, w) to the global
// state space at the address in operand %0, using the .cs cache operator
// (cache-streaming in the PTX ISA: data expected to be accessed once).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (16 bytes here); nothing in this statement checks that.
// NOTE(review): the statement has no output operands, is not asm volatile,
// and lists no memory clobber. NVIDIA's inline PTX guide says the compiler
// assumes an asm statement has no side effects beyond its outputs - confirm
// this store cannot be dropped or moved.
80 asm(
"st.cs.global.v4.f32 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), // %0: destination address; __PTR is defined outside this view
"f"(x), // %1: element 0
"f"(y), // %2: element 1
"f"(z), // %3: element 2
"f"(w)); // %4: element 3
// Inline PTX: vector store of four s16 values (x, y, z, w) to the global
// state space at the address in operand %0, using the .cs cache operator
// (cache-streaming in the PTX ISA: data expected to be accessed once).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (8 bytes here); nothing in this statement checks that.
// NOTE(review): the statement has no output operands, is not asm volatile,
// and lists no memory clobber. NVIDIA's inline PTX guide says the compiler
// assumes an asm statement has no side effects beyond its outputs - confirm
// this store cannot be dropped or moved.
85 asm(
"st.cs.global.v4.s16 [%0+0], {%1, %2, %3, %4};" :: __PTR(addr), // %0: destination address; __PTR is defined outside this view
"h"(x), // %1: element 0, passed in a 16-bit register
"h"(y), // %2: element 1
"h"(z), // %3: element 2
"h"(w)); // %4: element 3
// Inline PTX: vector store of two f64 values (x, y) to the global state
// space at the address in operand %0, using the .cs cache operator
// (cache-streaming in the PTX ISA: data expected to be accessed once).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (16 bytes here); nothing in this statement checks that.
// NOTE(review): the statement has no output operands, is not asm volatile,
// and lists no memory clobber. NVIDIA's inline PTX guide says the compiler
// assumes an asm statement has no side effects beyond its outputs - confirm
// this store cannot be dropped or moved.
90 asm(
"st.cs.global.v2.f64 [%0+0], {%1, %2};" :: __PTR(addr), // %0: destination address; __PTR is defined outside this view
"d"(x), // %1: element 0
"d"(y)); // %2: element 1
// Inline PTX: vector store of two f32 values (x, y) to the global state
// space at the address in operand %0, using the .cs cache operator
// (cache-streaming in the PTX ISA: data expected to be accessed once).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (8 bytes here); nothing in this statement checks that.
// NOTE(review): the statement has no output operands, is not asm volatile,
// and lists no memory clobber. NVIDIA's inline PTX guide says the compiler
// assumes an asm statement has no side effects beyond its outputs - confirm
// this store cannot be dropped or moved.
95 asm(
"st.cs.global.v2.f32 [%0+0], {%1, %2};" :: __PTR(addr), // %0: destination address; __PTR is defined outside this view
"f"(x), // %1: element 0
"f"(y)); // %2: element 1
// Inline PTX: vector store of two s16 values (x, y) to the global state
// space at the address in operand %0, using the .cs cache operator
// (cache-streaming in the PTX ISA: data expected to be accessed once).
// Per the PTX ISA a vector access must be aligned to the whole vector size
// (4 bytes here); nothing in this statement checks that.
// NOTE(review): the statement has no output operands, is not asm volatile,
// and lists no memory clobber. NVIDIA's inline PTX guide says the compiler
// assumes an asm statement has no side effects beyond its outputs - confirm
// this store cannot be dropped or moved.
100 asm(
"st.cs.global.v2.s16 [%0+0], {%1, %2};" :: __PTR(addr), // %0: destination address; __PTR is defined outside this view
"h"(x), // %1: element 0, passed in a 16-bit register
"h"(y)); // %2: element 1
__device__ void load_cached_short4(short4 &a, const short4 *addr)
__device__ void store_streaming_float2(float2 *addr, float x, float y)
__device__ void store_streaming_float4(float4 *addr, float x, float y, float z, float w)
__device__ void store_streaming_double2(double2 *addr, double x, double y)
__device__ void load_streaming_double2(double2 &a, const double2 *addr)
__device__ void load_global_short2(short2 &a, const short2 *addr)
__device__ void load_global_short4(short4 &a, const short4 *addr)
__device__ void store_streaming_short4(short4 *addr, short x, short y, short z, short w)
__device__ void store_streaming_short2(short2 *addr, short x, short y)
__device__ void load_streaming_float4(float4 &a, const float4 *addr)
__device__ void load_global_float4(float4 &a, const float4 *addr)
__device__ void load_cached_short2(short2 &a, const short2 *addr)