QUDA  0.9.0
random.cu
Go to the documentation of this file.
1 
2 #include <stdio.h>
3 #include <string.h>
4 #include <iostream>
5 #include <random_quda.h>
6 #include <cuda.h>
7 #include <quda_internal.h>
8 
9 #include <comm_quda.h>
10 #include <index_helper.cuh>
11 
12 
13 namespace quda {
14 
15 #define BLOCKSDIVUP(a, b) (((a)+(b)-1)/(b))
16 
17 
18 dim3 GetBlockDim(size_t threads, size_t size){
19  int blockx = BLOCKSDIVUP(size, threads);
20  dim3 blocks(blockx,1,1);
21  return blocks;
22 }
23 
24 
25 
26 
27 # define CUDA_SAFE_CALL_NO_SYNC( call) { \
28  cudaError err = call; \
29  if( cudaSuccess != err) { \
30  fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
31  __FILE__, __LINE__, cudaGetErrorString( err) ); \
32  exit(EXIT_FAILURE); \
33  } }
34 
35 # define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
36 
44 __global__ void
45 kernel_random(cuRNGState *state, int seed, int rng_size, int node_offset ){
46  int id = blockIdx.x * blockDim.x + threadIdx.x;
47  if(id < rng_size){
48  /* Each thread gets same seed, a different sequence number, no offset */
49  curand_init(seed, id + node_offset, 0, &state[id]);
50  }
51 }
52 
53 struct rngArg{
54  int comm_dim[4];
55  int comm_coord[4];
56  int X[4];
57 };
58 
59 
60 __global__ void
61 kernel_random(cuRNGState *state, int seed, int rng_size, int node_offset, rngArg arg ){
62  int id = blockIdx.x * blockDim.x + threadIdx.x;
63  if(id < rng_size){
64  /* Each thread gets same seed, a different sequence number, no offset */
65  #ifndef MULTI_GPU
66  curand_init(seed, id + node_offset, 0, &state[id]);
67  #else
68 
69  int x[4];
70  getCoords(x, id, arg.X, 0);
71  for(int i=0; i<4;i++) x[i] += arg.comm_coord[i] * arg.X[i];
72  int idd = ((((x[3] * arg.comm_dim[2] * arg.X[2] + x[2]) * arg.comm_dim[1] * arg.X[1]) + x[1] ) * arg.comm_dim[0] * arg.X[0] + x[0]) >> 1 ;
73  curand_init(seed, idd, 0, &state[id]);
74  #endif
75  }
76 }
77 
85 void launch_kernel_random(cuRNGState *state, int seed, int rng_size, int node_offset, int X[4]){
86  dim3 nthreads(128,1,1);
87  dim3 nblocks = GetBlockDim(nthreads.x, rng_size);
88  //CUDA_SAFE_CALL(cudaFuncSetCacheConfig( kernel_random, cudaFuncCachePreferL1));
89  #ifndef MULTI_GPU
90  kernel_random<<<nblocks,nthreads>>>(state, seed, rng_size, node_offset);
91  #else
92  rngArg arg;
93  for(int i=0; i < 4; i++){
94  arg.comm_dim[i] = comm_dim(i);
95  arg.comm_coord[i] = comm_coord(i);
96  arg.X[i] = X[i];
97  }
98  kernel_random<<<nblocks,nthreads>>>(state, seed, rng_size, 0, arg);
99  #endif
101 }
102 
103 RNG::RNG(int rng_sizes, int seedin){
104  rng_size = rng_sizes;
105  seed = seedin;
106  state = NULL;
107  node_offset = 0;
108  #ifdef MULTI_GPU
109  for(int i=0; i<4;i++) X[i]=0;
110  node_offset = comm_rank() * rng_sizes;
111  #endif
112 #if defined(XORWOW)
113  printfQuda("Using curandStateXORWOW\n");
114 #elif defined(RG32k3a)
115  printfQuda("Using curandStateMRG32k3a\n");
116 #else
117  printfQuda("Using curandStateMRG32k3a\n");
118 #endif
119 }
120 
121 
122 RNG::RNG(int rng_sizes, int seedin, const int XX[4]){
123  rng_size = rng_sizes;
124  seed = seedin;
125  state = NULL;
126  node_offset = 0;
127  #ifdef MULTI_GPU
128  for(int i=0; i<4;i++) X[i]=XX[i];
129  node_offset = comm_rank() * rng_sizes;
130  #endif
131 #if defined(XORWOW)
132  printfQuda("Using curandStateXORWOW\n");
133 #elif defined(RG32k3a)
134  printfQuda("Using curandStateMRG32k3a\n");
135 #else
136  printfQuda("Using curandStateMRG32k3a\n");
137 #endif
138 }
139 
140 
141 
142 
146 void RNG::Init(){
147  AllocateRNG();
149 }
150 
151 
156  if(rng_size>0 && state == NULL){
158  CUDA_SAFE_CALL(cudaMemset( state , 0 , rng_size * sizeof(cuRNGState) ));
159  printfQuda("Allocated array of random numbers with rng_size: %.2f MB\n", rng_size * sizeof(cuRNGState)/(float)(1048576));
160  }
161  else{
162  errorQuda("Array of random numbers not allocated, array size: %d !\nExiting...\n",rng_size);
163  }
164 }
169  if(rng_size>0 && state != NULL){
171  printfQuda("Free array of random numbers with rng_size: %.2f MB\n", rng_size * sizeof(cuRNGState)/(float)(1048576));
172  rng_size = 0;
173  state = NULL;
174  }
175 }
176 
177 
180  cudaError_t err = cudaMemcpy(state, backup_state, rng_size * sizeof(cuRNGState), cudaMemcpyHostToDevice);
181  if (err != cudaSuccess) {
183  printfQuda("ERROR: Failed to restore curand rng states array\n");
184  errorQuda("Aborting");
185  }
187 }
189 void RNG::backup(){
191  cudaError_t err = cudaMemcpy(backup_state, state, rng_size * sizeof(cuRNGState), cudaMemcpyDeviceToHost);
192  if (err != cudaSuccess) {
194  printfQuda("ERROR: Failed to backup curand rng states array\n");
195  errorQuda("Aborting");
196  }
197 }
198 
199 
200 }
void AllocateRNG()
allocate curand rng states array in device memory
Definition: random.cu:155
int comm_rank(void)
Definition: comm_mpi.cpp:120
void Init()
Initialize CURAND RNG states.
Definition: random.cu:146
int comm_coord[4]
Definition: random.cu:55
dim3 dim3 blockDim
__global__ void kernel_random(cuRNGState *state, int seed, int rng_size, int node_offset)
CUDA kernel to initialize CURAND RNG states.
Definition: random.cu:45
struct curandStateMRG32k3a cuRNGState
Definition: random_quda.h:17
int comm_dim[4]
Definition: random.cu:54
RNG(int rng_sizes, int seedin, const int XX[4])
Definition: random.cu:122
#define errorQuda(...)
Definition: util_quda.h:90
#define host_free(ptr)
Definition: malloc_quda.h:59
int comm_dim(int dim)
int comm_coord(int dim)
int X[4]
Definition: random.cu:56
void backup()
Backup CURAND array states initialization.
Definition: random.cu:189
#define BLOCKSDIVUP(a, b)
Definition: random.cu:15
cuRNGState * state
Definition: random_quda.h:42
void Release()
Release Device memory for CURAND RNG states.
Definition: random.cu:168
void restore()
Restore CURAND array states initialization.
Definition: random.cu:179
int rng_size
number of curand states
Definition: random_quda.h:48
void launch_kernel_random(cuRNGState *state, int seed, int rng_size, int node_offset, int X[4])
Call CUDA kernel to initialize CURAND RNG states.
Definition: random.cu:85
#define CUDA_SAFE_CALL(call)
Definition: random.cu:35
cudaError_t err
cuRNGState * backup_state
Definition: random_quda.h:44
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
#define safe_malloc(size)
Definition: malloc_quda.h:54
#define printfQuda(...)
Definition: util_quda.h:84
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
#define device_malloc(size)
Definition: malloc_quda.h:52
int node_offset
offset in the index, in case of multigpus
Definition: random_quda.h:50
dim3 GetBlockDim(size_t threads, size_t size)
Definition: random.cu:18
int X[4]
Definition: random_quda.h:51
#define device_free(ptr)
Definition: malloc_quda.h:57
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)