Actual source code: ex1.raja.cxx

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/COPYRIGHT file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include <cstdlib>
#include <cstdio>
#include <cstring>

#include <iostream>
#include <cmath>

#include "RAJA/RAJA.hpp"

#include "memoryManager.hpp"

/*
 * Jacobi Example
 *
 * ----[Details]--------------------
 * This code uses a five-point finite difference stencil
 * to discretize the following boundary value problem
 *
 * U_xx + U_yy = f on [0,1] x [0,1].
 *
 * The right-hand side is chosen to be
 * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
 *
 * A structured grid is used to discretize the domain
 * [0,1] x [0,1]. Values inside the domain are computed
 * using the Jacobi method to solve the associated
 * linear system. The scheme is iterated until the l_2
 * norm of the difference between subsequent iterates
 * falls below a tolerance.
 *
 * The scheme is implemented by allocating two arrays
 * (I, Iold), which are initialized to zero. The first
 * set of nested for loops applies one iteration of the
 * Jacobi scheme. The scheme is only applied to the
 * interior nodes.
 *
 * The second set of nested for loops is used to
 * update Iold and compute the l_2 norm of the
 * difference of the iterates.
 *
 * Computing the l_2 norm requires a reduction operation.
 * To simplify the reduction procedure, the RAJA API
 * provides thread-safe reduction variables.
 *
 * ----[RAJA Concepts]---------------
 * - RAJA::kernel nested loops
 * - RAJA reductions
 *
 */
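
/*
 * For reference, the discrete update applied in the loops below: with grid
 * spacing h = 1/(N+1) and interior index id = n*(N+2) + m, one Jacobi sweep
 * computes
 *
 *   I[id] = ( Iold[id-1] + Iold[id+1] + Iold[id-(N+2)] + Iold[id+(N+2)]
 *             - h*h*f(x_m, y_n) ) / 4,
 *
 * i.e. the standard five-point update U_C = (U_W + U_E + U_S + U_N - h^2*f)/4.
 */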

/*
 *  ----[Constant Values]-----
 * CUDA_BLOCK_SIZE - Number of threads per CUDA thread block used by the
 *                   CUDA forall that computes the residual
 *
 * HIP_BLOCK_SIZE  - Number of threads per HIP thread block used by the
 *                   HIP forall that computes the residual
 */
#if defined(RAJA_ENABLE_CUDA)
const int CUDA_BLOCK_SIZE = 256;
#endif

#if defined(RAJA_ENABLE_HIP)
const int HIP_BLOCK_SIZE = 256;
#endif

//
//  Struct to hold grid info
//  o - Origin in a Cartesian dimension
//  h - Spacing between grid points
//  n - Number of grid points
//
struct grid_s {
  double o, h;
  int    n;
};
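
//
// Index convention used throughout (noted here for reference): a grid point
// (m, n), with 0 <= m, n <= N+1, maps to the 1D index id = n * (N + 2) + m.
// Hence id - 1 and id + 1 are the west/east neighbors, id - (N + 2) and
// id + (N + 2) are the south/north neighbors, and the physical coordinates
// are x = o + m*h, y = o + n*h.
//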

//
// ----[Functions]---------
// solution   - Function for the analytic solution
// computeErr - Displays the maximum error in the solution
//
double solution(double x, double y);
void   computeErr(double *I, grid_s grid);

int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
{
  std::cout << "Jacobi Example" << std::endl;

  /*
   * ----[Solver Parameters]------------
   * tol       - Method terminates once the norm is less than tol
   * N         - Number of unknown gridpoints per Cartesian dimension
   * NN        - Total number of gridpoints on the grid
   * maxIter   - Maximum number of iterations to be taken
   *
   * resI2     - Squared l_2 norm of the difference between iterates
   * iteration - Iteration number
   * grid_s    - Struct with grid information for a Cartesian dimension
   */
  double tol = 1e-10;

  int N       = 50;
  int NN      = (N + 2) * (N + 2);
  int maxIter = 100000;

  double resI2;
  int    iteration;

  grid_s gridx;
  gridx.o = 0.0;
  gridx.h = 1.0 / (N + 1.0);
  gridx.n = N + 2;

  //
  // I, Iold - Hold the iterates of the Jacobi method
  //
  double *I    = memoryManager::allocate<double>(NN);
  double *Iold = memoryManager::allocate<double>(NN);

  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  printf("Standard C++ Loop \n");
  resI2     = 1;
  iteration = 0;

  while (resI2 > tol * tol) {
    //
    // Jacobi Iteration
    //
    for (int n = 1; n <= N; ++n) {
      for (int m = 1; m <= N; ++m) {
        double x = gridx.o + m * gridx.h;
        double y = gridx.o + n * gridx.h;

        double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

        int id = n * (N + 2) + m;
        I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
      }
    }

    //
    // Compute residual and update Iold
    //
    resI2 = 0.0;
    for (int k = 0; k < NN; k++) {
      resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
      Iold[k] = I[k];
    }

    if (iteration > maxIter) {
      printf("Standard C++ Loop - Maxed out on iterations \n");
      exit(-1);
    }

    iteration++;
  }
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);

  //
  // RAJA loop calls may be shortened by predefining policies
  //
  RAJA::RangeSegment gridRange(0, NN);
  RAJA::RangeSegment jacobiRange(1, (N + 1));

  using jacobiSeqNestedPolicy =
      RAJA::KernelPolicy<
        RAJA::statement::For<1, RAJA::seq_exec,
          RAJA::statement::For<0, RAJA::seq_exec,
            RAJA::statement::Lambda<0>>>>;
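
  // Reading the policy: statement::For<1, ...> is the outer loop over the
  // second range in the tuple (the n index), statement::For<0, ...> is the
  // inner loop over the first range (the m index), and statement::Lambda<0>
  // invokes the first lambda passed to RAJA::kernel. Both loops use seq_exec,
  // so this variant runs sequentially.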

  printf("RAJA: Sequential Policy - Nested kernel \n");
  resI2     = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  /*
   *  Sequential Jacobi Iteration.
   *
   *  Note that a RAJA ReduceSum object is used to accumulate the sum
   *  for the residual. Since the loop is run sequentially, this is
   *  not strictly necessary. It is done here for consistency and
   *  comparison with other RAJA variants in this example.
   */
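  //
  // Note for reference: RAJA_resI2 below is captured by value in the lambda;
  // assigning it to resI2 after the forall completes returns the reduced sum
  // (equivalent to calling its get() method).
  //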
  while (resI2 > tol * tol) {
    RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
      double x = gridx.o + m * gridx.h;
      double y = gridx.o + n * gridx.h;

      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

      int id = n * (N + 2) + m;
      I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
    });

    RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
    RAJA::forall<RAJA::seq_exec>(gridRange, [=](RAJA::Index_type k) {
      RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
      Iold[k] = I[k];
    });

    resI2 = RAJA_resI2;
    if (iteration > maxIter) {
      printf("Jacobi: Sequential - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);

#if defined(RAJA_ENABLE_OPENMP)
  printf("RAJA: OpenMP Policy - Nested kernel \n");
  resI2     = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  /*
   *  OpenMP parallel Jacobi Iteration.
   *
   *  ----[RAJA Policies]-----------
   *  RAJA::omp_parallel_for_exec -
   *  executes the outer loop in parallel with OpenMP
   *
   *  Note that the OpenMP RAJA ReduceSum object performs the reduction
   *  operation for the residual in a thread-safe manner.
   */

  using jacobiOmpNestedPolicy =
      RAJA::KernelPolicy<
        RAJA::statement::For<1, RAJA::omp_parallel_for_exec,
          RAJA::statement::For<0, RAJA::seq_exec,
            RAJA::statement::Lambda<0>>>>;
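
  // In this policy the outer loop (tuple index 1, the n index) is distributed
  // across OpenMP threads via omp_parallel_for_exec, while the inner loop
  // (tuple index 0, the m index) runs sequentially within each thread.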

  while (resI2 > tol * tol) {
    RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
      double x = gridx.o + m * gridx.h;
      double y = gridx.o + n * gridx.h;

      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

      int id = n * (N + 2) + m;
      I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
    });

    RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);

    RAJA::forall<RAJA::omp_parallel_for_exec>(gridRange, [=](RAJA::Index_type k) {
      RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
      Iold[k] = I[k];
    });

    resI2 = RAJA_resI2;
    if (iteration > maxIter) {
      printf("Jacobi: OpenMP - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);
#endif

#if defined(RAJA_ENABLE_CUDA)
  /*
   *  CUDA Jacobi Iteration.
   *
   *  ----[RAJA Policies]-----------
   *  RAJA::statement::CudaKernel, RAJA::cuda_block_{x,y}_loop,
   *  RAJA::cuda_thread_{x,y}_direct -
   *  define the mapping of loop iterations to GPU thread blocks and threads
   *
   *  Note that the CUDA RAJA ReduceSum object performs the reduction
   *  operation for the residual in a thread-safe manner on the GPU.
   */

  printf("RAJA: CUDA Policy - Nested kernel \n");

  using jacobiCUDANestedPolicy =
      RAJA::KernelPolicy<
        RAJA::statement::CudaKernel<
          RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop,
            RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop,
              RAJA::statement::For<1, RAJA::cuda_thread_y_direct,
                RAJA::statement::For<0, RAJA::cuda_thread_x_direct,
                  RAJA::statement::Lambda<0>>>>>>>;
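
  // Reading the policy: CudaKernel launches the nested loops as a single CUDA
  // kernel. The two Tile statements break the (m, n) iteration space into
  // 32 x 32 tiles, tiles are mapped to CUDA blocks (cuda_block_x_loop /
  // cuda_block_y_loop), and the entries of each tile are mapped directly to
  // threads (cuda_thread_x_direct / cuda_thread_y_direct).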

  resI2     = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  while (resI2 > tol * tol) {
    //
    // Jacobi Iteration
    //
    RAJA::kernel<jacobiCUDANestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
      double x = gridx.o + m * gridx.h;
      double y = gridx.o + n * gridx.h;

      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

      int id = n * (N + 2) + m;
      I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
    });

    //
    // Compute residual and update Iold
    //
    RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0);
    RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
      RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
      Iold[k] = I[k];
    });

    resI2 = RAJA_resI2;

    if (iteration > maxIter) {
      printf("RAJA: CUDA - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  cudaDeviceSynchronize();
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);
#endif

#if defined(RAJA_ENABLE_HIP)
  /*
   *  HIP Jacobi Iteration.
   *
   *  ----[RAJA Policies]-----------
   *  RAJA::statement::HipKernel, RAJA::hip_block_{x,y}_loop,
   *  RAJA::hip_thread_{x,y}_direct -
   *  define the mapping of loop iterations to GPU thread blocks and threads
   *
   *  Note that the HIP RAJA ReduceSum object performs the reduction
   *  operation for the residual in a thread-safe manner on the GPU.
   */

  printf("RAJA: HIP Policy - Nested kernel \n");

  using jacobiHIPNestedPolicy =
      RAJA::KernelPolicy<
        RAJA::statement::HipKernel<
          RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop,
            RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop,
              RAJA::statement::For<1, RAJA::hip_thread_y_direct,
                RAJA::statement::For<0, RAJA::hip_thread_x_direct,
                  RAJA::statement::Lambda<0>>>>>>>;
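
  // The HIP policy mirrors the CUDA one above: HipKernel launches the loops as
  // a single HIP kernel, the Tile statements map 32 x 32 tiles to HIP blocks
  // (hip_block_x_loop / hip_block_y_loop), and tile entries map directly to
  // threads (hip_thread_x_direct / hip_thread_y_direct).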

  resI2     = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  double *d_I    = memoryManager::allocate_gpu<double>(NN);
  double *d_Iold = memoryManager::allocate_gpu<double>(NN);
  hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice));
  hipErrchk(hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice));

  while (resI2 > tol * tol) {
    //
    // Jacobi Iteration
    //
    RAJA::kernel<jacobiHIPNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
      double x = gridx.o + m * gridx.h;
      double y = gridx.o + n * gridx.h;

      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

      int id  = n * (N + 2) + m;
      d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] + d_Iold[id + 1]);
    });

    //
    // Compute residual and update Iold
    //
    RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0);
    RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
      RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]);
      d_Iold[k] = d_I[k];
    });

    resI2 = RAJA_resI2;

    if (iteration > maxIter) {
      printf("RAJA: HIP - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  hipDeviceSynchronize();
  hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost));
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);

  memoryManager::deallocate_gpu(d_I);
  memoryManager::deallocate_gpu(d_Iold);
#endif

  memoryManager::deallocate(I);
  memoryManager::deallocate(Iold);

  return 0;
}

//
// Function for the analytic solution
//
double solution(double x, double y)
{
  return x * y * exp(x - y) * (1 - x) * (1 - y);
}

//
// Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf}
//
void computeErr(double *I, grid_s grid)
{
  RAJA::RangeSegment                        gridRange(0, grid.n);
  RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0);

  using jacobiSeqNestedPolicy =
      RAJA::KernelPolicy<
        RAJA::statement::For<1, RAJA::seq_exec,
          RAJA::statement::For<0, RAJA::seq_exec,
            RAJA::statement::Lambda<0>>>>;

  RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange, gridRange), [=](RAJA::Index_type ty, RAJA::Index_type tx) {
    int    id    = tx + grid.n * ty;
    double x     = grid.o + tx * grid.h;
    double y     = grid.o + ty * grid.h;
    double myErr = std::abs(I[id] - solution(x, y));
    tMax.max(myErr);
  });

  double maxErr = tMax;
  printf("Max error = %lg, h = %f \n", maxErr, grid.h);
}

/*TEST

    test:
      requires: raja !cuda

    test:
      suffix: 2
      requires: raja cuda

TEST*/