QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
restrictor.cuh
Go to the documentation of this file.
2 #include <cub_helper.cuh>
3 #include <multigrid_helper.cuh>
4 #include <fast_intdiv.h>
5 
6 // enabling CTA swizzling improves spatial locality of MG blocks reducing cache line wastage
7 #ifndef SWIZZLE
8 #define SWIZZLE
9 #endif
10 
11 namespace quda {
12 
13  using namespace quda::colorspinor;
14 
18  template <typename Float, typename vFloat, int fineSpin, int fineColor,
19  int coarseSpin, int coarseColor, QudaFieldOrder order>
20  struct RestrictArg {
21 
25  const int *fine_to_coarse;
26  const int *coarse_to_fine;
28  const int parity; // the parity of the input field (if single parity)
29  const int nParity; // number of parities of input fine field
30  int_fastdiv swizzle; // swizzle factor for transposing blockIdx.x mapping to coarse grid coordinate
31 
33  const int *fine_to_coarse, const int *coarse_to_fine, int parity)
34  : out(out), in(in), V(V), fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine),
35  spin_map(), parity(parity), nParity(in.SiteSubset()), swizzle(1)
36  { }
37 
39  out(arg.out), in(arg.in), V(arg.V),
40  fine_to_coarse(arg.fine_to_coarse), coarse_to_fine(arg.coarse_to_fine), spin_map(),
41  parity(arg.parity), nParity(arg.nParity), swizzle(arg.swizzle)
42  { }
43  };
44 
48  template <typename Float, int fineSpin, int fineColor, int coarseColor, int coarse_colors_per_thread,
49  class FineColor, class Rotator>
50  __device__ __host__ inline void rotateCoarseColor(complex<Float> out[fineSpin*coarse_colors_per_thread],
51  const FineColor &in, const Rotator &V,
52  int parity, int nParity, int x_cb, int coarse_color_block) {
53  const int spinor_parity = (nParity == 2) ? parity : 0;
54  const int v_parity = (V.Nparity() == 2) ? parity : 0;
55 
56 #pragma unroll
57  for (int s=0; s<fineSpin; s++)
58 #pragma unroll
59  for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
60  out[s*coarse_colors_per_thread+coarse_color_local] = 0.0;
61  }
62 
63 #pragma unroll
64  for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
65  int i = coarse_color_block + coarse_color_local;
66 #pragma unroll
67  for (int s=0; s<fineSpin; s++) {
68 
69  constexpr int color_unroll = fineColor == 3 ? 3 : 2;
70 
71  complex<Float> partial[color_unroll];
72 #pragma unroll
73  for (int k=0; k<color_unroll; k++) partial[k] = 0.0;
74 
75 #pragma unroll
76  for (int j=0; j<fineColor; j+=color_unroll) {
77 #pragma unroll
78  for (int k=0; k<color_unroll; k++)
79  partial[k] += conj(V(v_parity, x_cb, s, j+k, i)) * in(spinor_parity, x_cb, s, j+k);
80  }
81 
82 #pragma unroll
83  for (int k=0; k<color_unroll; k++) out[s*coarse_colors_per_thread + coarse_color_local] += partial[k];
84  }
85  }
86 
87  }
88 
89  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, int coarse_colors_per_thread, typename Arg>
90  void Restrict(Arg arg) {
91  for (int parity_coarse=0; parity_coarse<2; parity_coarse++)
92  for (int x_coarse_cb=0; x_coarse_cb<arg.out.VolumeCB(); x_coarse_cb++)
93  for (int s=0; s<coarseSpin; s++)
94  for (int c=0; c<coarseColor; c++)
95  arg.out(parity_coarse, x_coarse_cb, s, c) = 0.0;
96 
97  // loop over fine degrees of freedom
98  for (int parity=0; parity<arg.nParity; parity++) {
99  parity = (arg.nParity == 2) ? parity : arg.parity;
100 
101  for (int x_cb=0; x_cb<arg.in.VolumeCB(); x_cb++) {
102 
103  int x = parity*arg.in.VolumeCB() + x_cb;
104  int x_coarse = arg.fine_to_coarse[x];
105  int parity_coarse = (x_coarse >= arg.out.VolumeCB()) ? 1 : 0;
106  int x_coarse_cb = x_coarse - parity_coarse*arg.out.VolumeCB();
107 
108  for (int coarse_color_block=0; coarse_color_block<coarseColor; coarse_color_block+=coarse_colors_per_thread) {
109  complex<Float> tmp[fineSpin*coarse_colors_per_thread];
110  rotateCoarseColor<Float,fineSpin,fineColor,coarseColor,coarse_colors_per_thread>
111  (tmp, arg.in, arg.V, parity, arg.nParity, x_cb, coarse_color_block);
112 
113  for (int s=0; s<fineSpin; s++) {
114  for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
115  int c = coarse_color_block + coarse_color_local;
116  arg.out(parity_coarse,x_coarse_cb,arg.spin_map(s,parity),c) += tmp[s*coarse_colors_per_thread+coarse_color_local];
117  }
118  }
119 
120  }
121  }
122  }
123 
124  }
125 
134  template <int block_size, typename Float, int fineSpin, int fineColor, int coarseSpin,
135  int coarseColor, int coarse_colors_per_thread, typename Arg>
136  __global__ void RestrictKernel(Arg arg) {
137 
138 #ifdef SWIZZLE
139  // the portion of the grid that is exactly divisible by the number of SMs
140  const int gridp = gridDim.x - gridDim.x % arg.swizzle;
141 
142  int x_coarse = blockIdx.x;
143  if (blockIdx.x < gridp) {
144  // this is the portion of the block that we are going to transpose
145  const int i = blockIdx.x % arg.swizzle;
146  const int j = blockIdx.x / arg.swizzle;
147 
148  // tranpose the coordinates
149  x_coarse = i * (gridp / arg.swizzle) + j;
150  }
151 #else
152  int x_coarse = blockIdx.x;
153 #endif
154 
155  int parity_coarse = x_coarse >= arg.out.VolumeCB() ? 1 : 0;
156  int x_coarse_cb = x_coarse - parity_coarse*arg.out.VolumeCB();
157 
158  // obtain fine index from this look up table
159  // since both parities map to the same block, each thread block must do both parities
160 
161  // threadIdx.x - fine checkboard offset
162  // threadIdx.y - fine parity offset
163  // blockIdx.x - which coarse block are we working on (swizzled to improve cache efficiency)
164  // assume that coarse_to_fine look up map is ordered as (coarse-block-id + fine-point-id)
165  // and that fine-point-id is parity ordered
166  int parity = arg.nParity == 2 ? threadIdx.y : arg.parity;
167  int x_fine = arg.coarse_to_fine[ (x_coarse*2 + parity) * blockDim.x + threadIdx.x];
168  int x_fine_cb = x_fine - parity*arg.in.VolumeCB();
169 
170  int coarse_color_block = (blockDim.z*blockIdx.z + threadIdx.z) * coarse_colors_per_thread;
171  if (coarse_color_block >= coarseColor) return;
172 
173  complex<Float> tmp[fineSpin*coarse_colors_per_thread];
174  rotateCoarseColor<Float,fineSpin,fineColor,coarseColor,coarse_colors_per_thread>
175  (tmp, arg.in, arg.V, parity, arg.nParity, x_fine_cb, coarse_color_block);
176 
177  typedef vector_type<complex<Float>, coarseSpin*coarse_colors_per_thread> vector;
178  vector reduced;
179 
180  // first lets coarsen spin locally
181  for (int s=0; s<fineSpin; s++) {
182  for (int v=0; v<coarse_colors_per_thread; v++) {
183  reduced[arg.spin_map(s,parity)*coarse_colors_per_thread+v] += tmp[s*coarse_colors_per_thread+v];
184  }
185  }
186 
187  // now lets coarsen geometry across threads
188  if (arg.nParity == 2) {
189  typedef cub::BlockReduce<vector, block_size, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 2> BlockReduce;
190  __shared__ typename BlockReduce::TempStorage temp_storage;
191 
192  // note this is not safe for blockDim.z > 1
193  reduced = BlockReduce(temp_storage).Sum(reduced);
194  } else {
195  typedef cub::BlockReduce<vector, block_size, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduce;
196  __shared__ typename BlockReduce::TempStorage temp_storage;
197 
198  // note this is not safe for blockDim.z > 1
199  reduced = BlockReduce(temp_storage).Sum(reduced);
200  }
201 
202  if (threadIdx.x==0 && threadIdx.y == 0) {
203  for (int s=0; s<coarseSpin; s++) {
204  for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
205  int v = coarse_color_block + coarse_color_local;
206  arg.out(parity_coarse, x_coarse_cb, s, v) = reduced[s*coarse_colors_per_thread+coarse_color_local];
207  }
208  }
209  }
210  }
211 
212 }
int_fastdiv swizzle
Definition: restrictor.cuh:30
FieldOrderCB< Float, coarseSpin, coarseColor, 1, order > out
Definition: restrictor.cuh:22
__global__ void RestrictKernel(Arg arg)
Definition: restrictor.cuh:136
enum QudaFieldOrder_s QudaFieldOrder
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
RestrictArg(const RestrictArg< Float, vFloat, fineSpin, fineColor, coarseSpin, coarseColor, order > &arg)
Definition: restrictor.cuh:38
const int * coarse_to_fine
Definition: restrictor.cuh:26
const int nParity
Definition: restrictor.cuh:29
const int parity
Definition: restrictor.cuh:28
const FieldOrderCB< Float, fineSpin, fineColor, 1, order > in
Definition: restrictor.cuh:23
cpuColorSpinorField * in
const FieldOrderCB< Float, fineSpin, fineColor, coarseColor, order, vFloat > V
Definition: restrictor.cuh:24
int V
Definition: test_util.cpp:27
void Restrict(Arg arg)
Definition: restrictor.cuh:90
__device__ __host__ void rotateCoarseColor(complex< Float > out[fineSpin *coarse_colors_per_thread], const FineColor &in, const Rotator &V, int parity, int nParity, int x_cb, int coarse_color_block)
Definition: restrictor.cuh:50
const int * fine_to_coarse
Definition: restrictor.cuh:25
cpuColorSpinorField * out
const int nParity
Definition: spinor_noise.cu:25
__shared__ float s[]
const spin_mapper< fineSpin, coarseSpin > spin_map
Definition: restrictor.cuh:27
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
RestrictArg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &V, const int *fine_to_coarse, const int *coarse_to_fine, int parity)
Definition: restrictor.cuh:32
colorspinor::FieldOrderCB< real, Ns, Nc, 1, order > V
Definition: spinor_noise.cu:23
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:130
QudaParity parity
Definition: covdev_test.cpp:54