v0.9.0/doc/random_8cu_source.html

 #include <stdio.h>
 #include <string.h>
 #include <iostream>
 #include <random_quda.h>
 #include <cuda.h>
 #include <quda_internal.h>

 #include <comm_quda.h>
 #include <index_helper.cuh>


 namespace quda {

 // Integer division of a by b rounded up, written as (a + b - 1) / b.
 // Both arguments are fully parenthesised, but each is evaluated more than
 // once, so do not pass expressions with side effects.
 #define BLOCKSDIVUP(a, b)  (((a)+(b)-1)/(b))


 /**
    @brief Build a one-dimensional grid large enough to cover `size` work
    items when each block holds `threads` threads.
    @param threads Number of threads per block
    @param size    Total number of work items
    @return dim3 whose x component is size/threads rounded up; y and z are 1
  */
 dim3 GetBlockDim(size_t threads, size_t size){
     // rounded-up quotient, stored as int exactly like the dim3 argument expects
     const int numBlocksX = BLOCKSDIVUP(size, threads);
     return dim3(numBlocksX, 1, 1);
 }


 // Evaluates a CUDA runtime call exactly once. If the call does not return
 // cudaSuccess, prints the file, line and CUDA error string to stderr and
 // terminates the process with EXIT_FAILURE.
 //
 // The body is wrapped in do { } while (0) so that the macro expands to a
 // single statement: `if (c) CUDA_SAFE_CALL(x); else ...` now parses as
 // intended. The temporary has a macro-specific name so that a `call`
 // expression mentioning a caller variable named `err` is not captured.
 #  define CUDA_SAFE_CALL_NO_SYNC( call) do {                            \
         cudaError_t cuda_safe_call_err_ = (call);                       \
         if( cudaSuccess != cuda_safe_call_err_) {                       \
             fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__,                                 \
                     cudaGetErrorString( cuda_safe_call_err_) );         \
             exit(EXIT_FAILURE);                                         \
         } } while (0)

 // Same check as CUDA_SAFE_CALL_NO_SYNC. The caller writes the trailing ';'.
 #  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call)

 /**
    @brief Kernel that initializes one curand state per thread.

    The thread index is blockIdx.x * blockDim.x + threadIdx.x; threads whose
    index is not below rng_size do nothing.

    @param state       Array of at least rng_size states to initialize
    @param seed        Seed passed unchanged to every curand_init call
    @param rng_size    Number of states to initialize
    @param node_offset Value added to the thread index to form the curand
                       sequence number
  */
 __global__ void
 kernel_random(cuRNGState *state, int seed, int rng_size, int node_offset ){
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     if(tid >= rng_size) return;   // tail threads of the last block have no state

     // Same seed for all threads, sequence number tid + node_offset, offset 0.
     curand_init(seed, tid + node_offset, 0, &state[tid]);
 }

 /**
    Plain-data argument bundle handed by value to the five-argument
    kernel_random overload. launch_kernel_random fills every entry.
  */
 struct rngArg {
     int comm_dim[4];     // comm_dim(i) for i = 0..3
     int comm_coord[4];   // comm_coord(i) for i = 0..3
     int X[4];            // copy of the X[4] array given to launch_kernel_random
 };


 /**
    @brief Kernel that initializes one curand state per thread, with a
    sequence number derived from process-grid information when MULTI_GPU is
    defined.

    Without MULTI_GPU the behaviour matches the four-argument overload:
    sequence number = thread index + node_offset.

    With MULTI_GPU, node_offset is not used. The thread index is converted to
    a four-component coordinate with getCoords (parity argument 0), each
    component is shifted by comm_coord[i] * X[i], and the shifted coordinate
    is linearised over the extents comm_dim[i] * X[i] and halved. The result
    is the curand sequence number.
    NOTE(review): presumably X holds the local lattice extents, so the result
    is a global checkerboard site index - confirm against the callers.

    The linearised index is accumulated in 64 bits: the product of the four
    global extents can exceed INT_MAX, and curand_init accepts a 64-bit
    sequence number. Results are unchanged wherever the 32-bit computation
    did not overflow.

    @param state       Array of at least rng_size states to initialize
    @param seed        Seed passed unchanged to every curand_init call
    @param rng_size    Number of states to initialize
    @param node_offset Sequence offset (single-GPU builds only)
    @param arg         Process-grid dimensions, coordinates and extents
  */
 __global__ void
 kernel_random(cuRNGState *state, int seed, int rng_size, int node_offset, rngArg arg ){
     int id = blockIdx.x * blockDim.x + threadIdx.x;
     if(id < rng_size){
     #ifndef MULTI_GPU
         /* Each thread gets same seed, a different sequence number, no offset */
         curand_init(seed, id + node_offset, 0, &state[id]);
     #else
         int x[4];
         getCoords(x, id, arg.X, 0);
         for(int i=0; i<4;i++) x[i] += arg.comm_coord[i] * arg.X[i];

         // Extents over the whole process grid, widened before multiplying.
         const long long G0 = (long long)arg.comm_dim[0] * arg.X[0];
         const long long G1 = (long long)arg.comm_dim[1] * arg.X[1];
         const long long G2 = (long long)arg.comm_dim[2] * arg.X[2];

         // Lexicographic index with x[0] running fastest, then halved.
         const long long idd = (((x[3] * G2 + x[2]) * G1 + x[1]) * G0 + x[0]) >> 1;
         curand_init(seed, idd, 0, &state[id]);
     #endif
     }
 }

 /**
    @brief Launch kernel_random over rng_size states and wait for completion.

    Uses 128 threads per block and enough blocks to cover rng_size. Without
    MULTI_GPU the four-argument kernel is launched with node_offset. With
    MULTI_GPU an rngArg is filled from comm_dim(i), comm_coord(i) and X[i],
    and the five-argument kernel is launched with a node offset of 0.

    Launch-configuration errors (cudaGetLastError) and the status returned by
    qudaDeviceSynchronize are checked with CUDA_SAFE_CALL, so a failed
    initialization terminates instead of leaving unseeded states behind.

    @param state       Device array of rng_size states
    @param seed        Seed forwarded to the kernel
    @param rng_size    Number of states; nothing is launched if <= 0
    @param node_offset Sequence offset (used only without MULTI_GPU)
    @param X           Four extents copied into rngArg (used only with MULTI_GPU)
  */
 void launch_kernel_random(cuRNGState *state, int seed, int rng_size, int node_offset, int X[4]){
     // A grid with zero blocks is an invalid launch configuration.
     if(rng_size <= 0) return;

     dim3 nthreads(128,1,1);
     dim3 nblocks = GetBlockDim(nthreads.x, rng_size);
     //CUDA_SAFE_CALL(cudaFuncSetCacheConfig( kernel_random, cudaFuncCachePreferL1));
     #ifndef MULTI_GPU
     kernel_random<<<nblocks,nthreads>>>(state, seed, rng_size, node_offset);
     #else
     rngArg arg;
     for(int i=0; i < 4; i++){
         arg.comm_dim[i] = comm_dim(i);
         arg.comm_coord[i] = comm_coord(i);
         arg.X[i] = X[i];
     }
     kernel_random<<<nblocks,nthreads>>>(state, seed, rng_size, 0, arg);
     #endif
     // Kernel launches return no status; pick up configuration errors here.
     CUDA_SAFE_CALL(cudaGetLastError());
     // Errors raised while the kernel runs surface at the synchronization.
     CUDA_SAFE_CALL(qudaDeviceSynchronize());
 }

 /**
    @brief Construct an RNG descriptor without allocating device memory.

    Stores the state count and seed, and sets both state and backup_state to
    NULL so that later calls can tell that nothing has been allocated.
    X is set to zero in every build. With MULTI_GPU, node_offset is
    comm_rank() * rng_sizes; otherwise it is 0.
    NOTE(review): with MULTI_GPU the all-zero X is later forwarded to
    getCoords by launch_kernel_random - confirm that this constructor is not
    used together with Init() in multi-GPU runs.

    @param rng_sizes Number of curand states
    @param seedin    Seed used when the states are initialized
  */
 RNG::RNG(int rng_sizes, int seedin){
     rng_size = rng_sizes;
     seed = seedin;
     state = NULL;
     backup_state = NULL;   // no host backup exists until backup() is called
     node_offset = 0;
     for(int i=0; i<4;i++) X[i]=0;
     #ifdef MULTI_GPU
     node_offset = comm_rank() * rng_sizes;
     #endif
 #if defined(XORWOW)
     printfQuda("Using curandStateXORWOW\n");
 #else
     printfQuda("Using curandStateMRG32k3a\n");
 #endif
 }


 /**
    @brief Construct an RNG descriptor with explicit extents, without
    allocating device memory.

    Stores the state count and seed, and sets both state and backup_state to
    NULL. With MULTI_GPU, X is copied from XX and node_offset is
    comm_rank() * rng_sizes. Without MULTI_GPU, XX is not read, X is zeroed
    and node_offset is 0.

    @param rng_sizes Number of curand states
    @param seedin    Seed used when the states are initialized
    @param XX        Four extents copied into X (MULTI_GPU builds only)
  */
 RNG::RNG(int rng_sizes, int seedin, const int XX[4]){
     rng_size = rng_sizes;
     seed = seedin;
     state = NULL;
     backup_state = NULL;   // no host backup exists until backup() is called
     node_offset = 0;
     #ifdef MULTI_GPU
     for(int i=0; i<4;i++) X[i]=XX[i];
     node_offset = comm_rank() * rng_sizes;
     #else
     // XX is deliberately left untouched here, as before; just give X a
     // defined value.
     for(int i=0; i<4;i++) X[i]=0;
     #endif
 #if defined(XORWOW)
     printfQuda("Using curandStateXORWOW\n");
 #else
     printfQuda("Using curandStateMRG32k3a\n");
 #endif
 }


 /**
    @brief Allocate the device state array and initialize every state.

    Calls AllocateRNG() and then launch_kernel_random() with the stored
    seed, state count, node offset and extents.
  */
 void RNG::Init() {
   // Step 1: obtain the device array that `state` will point to.
   AllocateRNG();

   // Step 2: run the initialization kernel over all rng_size entries.
   launch_kernel_random(state, seed, rng_size, node_offset, X);
 }


 /**
    @brief Allocate and zero the device array of rng_size curand states.

    Reports an error through errorQuda when rng_size is not positive or when
    `state` is already non-NULL; otherwise allocates with device_malloc,
    clears the memory with cudaMemset and prints the size in MB.
  */
 void RNG::AllocateRNG(){
     // Reject both "nothing to allocate" and "already allocated".
     if(rng_size <= 0 || state != NULL){
         errorQuda("Array of random numbers not allocated, array size: %d !\nExiting...\n",rng_size);
         return;
     }

     const size_t bytes = rng_size * sizeof(cuRNGState);
     state = (cuRNGState*)device_malloc(bytes);
     CUDA_SAFE_CALL(cudaMemset( state , 0 , bytes ));
     printfQuda("Allocated array of random numbers with rng_size: %.2f MB\n", bytes/(float)(1048576));
 }
 /**
    @brief Free the device state array, if there is one.

    Does nothing unless rng_size is positive and `state` is non-NULL.
    After freeing, prints the released size in MB and resets rng_size to 0
    and `state` to NULL.
  */
 void RNG::Release(){
     if(rng_size <= 0 || state == NULL) return;   // nothing to free

     const size_t bytes = rng_size * sizeof(cuRNGState);
     device_free(state);
     printfQuda("Free array of random numbers with rng_size: %.2f MB\n", bytes/(float)(1048576));
     rng_size = 0;
     state = NULL;
 }


 /**
    @brief Copy the host backup made by backup() into the device state array
    and release the host buffer.

    Reports an error if no backup buffer is present. The host buffer is freed
    exactly once, whether or not the copy succeeded, and backup_state is then
    set to NULL so that a repeated restore() cannot reuse or free it again.
    A failed copy is reported through printfQuda followed by errorQuda.
  */
 void RNG::restore(){
   if (backup_state == NULL) {
     errorQuda("No backup of curand rng states available to restore");
     return;
   }

   cudaError_t err = cudaMemcpy(state, backup_state, rng_size * sizeof(cuRNGState), cudaMemcpyHostToDevice);

   // The buffer is not needed after the copy attempt, successful or not.
   host_free(backup_state);
   backup_state = NULL;

   if (err != cudaSuccess) {
     printfQuda("ERROR: Failed to restore curand rng states array\n");
     errorQuda("Aborting");
   }
 }
 /**
    @brief Copy the device state array into a newly allocated host buffer.

    Allocates rng_size states with safe_malloc, stores the pointer in
    backup_state and copies the device states into it. If the copy fails the
    buffer is freed and the failure is reported through printfQuda followed
    by errorQuda.
  */
 void RNG::backup(){
   const size_t bytes = rng_size * sizeof(cuRNGState);
   backup_state = (cuRNGState*) safe_malloc(bytes);

   const cudaError_t status = cudaMemcpy(backup_state, state, bytes, cudaMemcpyDeviceToHost);
   if (status == cudaSuccess) return;

   // Copy failed: release the buffer before reporting.
   host_free(backup_state);
   printfQuda("ERROR: Failed to backup curand rng states array\n");
   errorQuda("Aborting");
 }


 }
quda::RNG::AllocateRNG
void AllocateRNG()
allocate curand rng states array in device memory
Definition: random.cu:155

comm_rank
int comm_rank(void)
Definition: comm_mpi.cpp:120

quda::RNG::Init
void Init()
Initialize CURAND RNG states.
Definition: random.cu:146

quda::rngArg::comm_coord
int comm_coord[4]
Definition: random.cu:55

blockDim
dim3 dim3 blockDim
Definition: CMakeCUDACompilerId.cpp1.ii:2471

quda::kernel_random
__global__ void kernel_random(cuRNGState *state, int seed, int rng_size, int node_offset)
CUDA kernel to initialize CURAND RNG states.
Definition: random.cu:45

quda::cuRNGState
struct curandStateMRG32k3a cuRNGState
Definition: random_quda.h:17

quda::rngArg::comm_dim
int comm_dim[4]
Definition: random.cu:54

quda::RNG::RNG
RNG(int rng_sizes, int seedin, const int XX[4])
Definition: random.cu:122

errorQuda
#define errorQuda(...)
Definition: util_quda.h:90

host_free
#define host_free(ptr)
Definition: malloc_quda.h:59

comm_dim
int comm_dim(int dim)
Definition: comm_common.cpp:404

comm_coord
int comm_coord(int dim)
Definition: comm_common.cpp:411

quda::rngArg::X
int X[4]
Definition: random.cu:56

quda::RNG::backup
void backup()
Backup CURAND array states initialization.
Definition: random.cu:189

quda::rngArg
Definition: random.cu:53

BLOCKSDIVUP
#define BLOCKSDIVUP(a, b)
Definition: random.cu:15

quda
Definition: blas_cublas.h:6

quda::RNG::state
cuRNGState * state
Definition: random_quda.h:42

comm_quda.h

x
p x
Definition: CMakeCUDACompilerId.cpp1.ii:3011

quda::RNG::seed
int seed
Definition: random_quda.h:46

quda::RNG::Release
void Release()
Release Device memory for CURAND RNG states.
Definition: random.cu:168

quda::RNG::restore
void restore()
Restore CURAND array states initialization.
Definition: random.cu:179

fused_exterior_ndeg_tm_dslash_cuda_gen.i
int i
start here
Definition: fused_exterior_ndeg_tm_dslash_cuda_gen.py:816

quda::RNG::rng_size
int rng_size
number of curand states
Definition: random_quda.h:48

quda::launch_kernel_random
void launch_kernel_random(cuRNGState *state, int seed, int rng_size, int node_offset, int X[4])
Call CUDA kernel to initialize CURAND RNG states.
Definition: random.cu:85

CUDA_SAFE_CALL
#define CUDA_SAFE_CALL(call)
Definition: random.cu:35

err
cudaError_t err
Definition: CMakeCUDACompilerId.cpp1.ii:15938

quda::RNG::backup_state
cuRNGState * backup_state
Definition: random_quda.h:44

quda::qudaDeviceSynchronize
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
Definition: quda_cuda_api.cpp:277

safe_malloc
#define safe_malloc(size)
Definition: malloc_quda.h:54

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

index_helper.cuh

printfQuda
#define printfQuda(...)
Definition: util_quda.h:84

size
size_t size
Definition: CMakeCUDACompilerId.cpp1.ii:2289

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880

device_malloc
#define device_malloc(size)
Definition: malloc_quda.h:52

random_quda.h

quda::RNG::node_offset
int node_offset
offset in the index, in case of multigpus
Definition: random_quda.h:50

quda::GetBlockDim
dim3 GetBlockDim(size_t threads, size_t size)
Definition: random.cu:18

quda::RNG::X
int X[4]
Definition: random_quda.h:51

quda_internal.h

device_free
#define device_free(ptr)
Definition: malloc_quda.h:57

quda::getCoords
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)
Definition: index_helper.cuh:129