5 #include <quda_internal.h>
7 #include <random_quda.h>
9 #include <index_helper.cuh>
11 #define BLOCKSDIVUP(a, b) (((a)+(b)-1)/(b))
15 dim3 GetGridDim(size_t threads, size_t size) {
16 int blockx = BLOCKSDIVUP(size, threads);
17 dim3 blocks(blockx,1,1);
22 size_t commDim[QUDA_MAX_DIM];
23 int commCoord[QUDA_MAX_DIM];
25 rngArg(const int X_[]) {
26 for (int i=0; i<4; i++) {
27 commDim[i] = comm_dim(i);
28 commCoord[i] = comm_coord(i);
35 @brief CUDA kernel to initialize CURAND RNG states
36 @param state CURAND RNG state array
37 @param seed initial seed for RNG
38 @param size size of the CURAND RNG state array
39 @param arg Metadata needed for computing multi-gpu offsets
41 __global__ void kernel_random(cuRNGState *state, unsigned long long seed, int size_cb, rngArg arg)
43 int id = blockIdx.x * blockDim.x + threadIdx.x;
44 int parity = blockIdx.y * blockDim.y + threadIdx.y;
46 // Each thread gets same seed, a different sequence number, no offset
48 getCoords(x, id, arg.X, parity);
49 for (int i = 0; i < 4; i++) x[i] += arg.commCoord[i] * arg.X[i];
51 = (((x[3] * arg.commDim[2] * arg.X[2] + x[2]) * arg.commDim[1] * arg.X[1]) + x[1]) * arg.commDim[0] * arg.X[0]
53 curand_init(seed, idd, 0, &state[parity * size_cb + id]);
58 @brief Call CUDA kernel to initialize CURAND RNG states
59 @param state CURAND RNG state array
60 @param seed initial seed for RNG
61 @param size_cb Checkerboarded size of the CURAND RNG state array
62 @param n_parity Number of parities (1 or 2)
63 @param X array of lattice dimensions
65 void launch_kernel_random(cuRNGState *state, unsigned long long seed, int size_cb, int n_parity, int X[4])
68 tp.block = dim3(128, 1, 1);
69 tp.grid = GetGridDim(tp.block.x, size_cb);
71 tp.block.y = n_parity;
72 qudaLaunchKernel(kernel_random, tp, 0, state, seed, size_cb, arg);
73 qudaDeviceSynchronize();
76 RNG::RNG(const LatticeField &meta, unsigned long long seedin) :
79 size_cb(meta.VolumeCB())
82 for (int i = 0; i < 4; i++) X[i] = meta.X()[i];
84 if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Using curandStateXORWOW\n");
85 #elif defined(RG32k3a)
86 if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Using curandStateMRG32k3a\n");
88 if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Using curandStateMRG32k3a\n");
92 RNG::RNG(const LatticeFieldParam ¶m, unsigned long long seedin) : seed(seedin), size(1), size_cb(1)
95 for (int i = 0; i < 4; i++) {
99 size_cb = size / param.siteSubset;
102 if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Using curandStateXORWOW\n");
103 #elif defined(RG32k3a)
104 if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Using curandStateMRG32k3a\n");
106 if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Using curandStateMRG32k3a\n");
111 @brief Initialize CURAND RNG states
115 launch_kernel_random(state, seed, size_cb, size / size_cb, X);
119 @brief Allocate Device memory for CURAND RNG states
121 void RNG::AllocateRNG() {
122 if (size > 0 && state == nullptr) {
123 state = (cuRNGState *)device_malloc(size * sizeof(cuRNGState));
124 qudaMemset(state, 0, size * sizeof(cuRNGState));
125 if (getVerbosity() >= QUDA_DEBUG_VERBOSE)
126 printfQuda("Allocated array of random numbers with size: %.2f MB\n",
127 size * sizeof(cuRNGState) / (float)(1048576));
129 errorQuda("Array of random numbers not allocated, array size: %d !\nExiting...\n", size);
134 @brief Release Device memory for CURAND RNG states
136 void RNG::Release() {
137 if (size > 0 && state != nullptr) {
139 if (getVerbosity() >= QUDA_DEBUG_VERBOSE)
140 printfQuda("Free array of random numbers with size: %.2f MB\n", size * sizeof(cuRNGState) / (float)(1048576));
146 /*! @brief Backup CURAND array states initialization */
149 backup_state = (cuRNGState *)safe_malloc(size * sizeof(cuRNGState));
150 qudaMemcpy(backup_state, state, size * sizeof(cuRNGState), cudaMemcpyDeviceToHost);
153 /*! @brief Restore CURAND array states initialization */
156 qudaMemcpy(state, backup_state, size * sizeof(cuRNGState), cudaMemcpyHostToDevice);
157 host_free(backup_state);