Actual source code: ex2k.kokkos.cxx

  1: static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n";

  3: /*
  4:   On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos:
  5:            Time (sec.), on Mar. 1, 2022
  6:   ------------------------------------------
  7:   n     PETSc          C          Kokkos
  8:   ------------------------------------------
  9:   32    4.6464E-05  4.7451E-05  1.6880E-04
 10:   64    2.5654E-04  2.5164E-04  5.6780E-04
 11:   128   1.9383E-03  1.8878E-03  4.7938E-03
 12:   256   1.4450E-02  1.3619E-02  3.7778E-02
 13:   512   1.1580E-01  1.1551E-01  2.8428E-01
 14:   1024  1.4179E+00  1.3772E+00  2.2873E+00

 16:   Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc
 17: */

 19: #include <petscdmda_kokkos.hpp>
 20: #include <petscdm.h>
 21: #include <petscdmda.h>
 22: #include <Kokkos_DualView.hpp>

 24: using Kokkos::Iterate;
 25: using Kokkos::MDRangePolicy;
 26: using Kokkos::Rank;
 27: using HostMirrorMemorySpace              = Kokkos::DualView<PetscScalar *>::host_mirror_space::memory_space;
 28: using PetscScalarKokkosOffsetView3D      = Kokkos::Experimental::OffsetView<PetscScalar ***, Kokkos::LayoutRight, HostMirrorMemorySpace>;
 29: using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar ***, Kokkos::LayoutRight, HostMirrorMemorySpace>;

 31: /* PETSc multi-dimensional array access */
 32: static PetscErrorCode Update1(DM da, const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
 33: {
 34:   PetscInt       it, i, j, k;
 35:   PetscLogDouble tstart = 0.0, tend;
 36:   PetscInt       xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;

 38:   PetscFunctionBegin;
 39:   PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
 40:   PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
 41:   for (it = 0; it < nwarm + nloop; it++) {
 42:     if (it == nwarm) PetscCall(PetscTime(&tstart));
 43:     for (k = zs; k < zs + zm; k++) {
 44:       for (j = ys; j < ys + ym; j++) {
 45:         for (i = xs; i < xs + xm; i++) y1[k][j][i] = 6 * x1[k][j][i] - x1[k - 1][j][i] - x1[k][j - 1][i] - x1[k][j][i - 1] - x1[k + 1][j][i] - x1[k][j + 1][i] - x1[k][j][i + 1];
 46:       }
 47:     }
 48:   }
 49:   PetscCall(PetscTime(&tend));
 50:   *avgTime = (tend - tstart) / nloop;
 51:   PetscFunctionReturn(PETSC_SUCCESS);
 52: }

 54: /* C multi-dimensional array access */
 55: static PetscErrorCode Update2(DM da, const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
 56: {
 57:   PetscInt       it, i, j, k;
 58:   PetscLogDouble tstart = 0.0, tend;
 59:   PetscInt       xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;

 61:   PetscFunctionBegin;
 62:   PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
 63:   PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
 64: #define X2(k, j, i) x2[(k - gzs) * gym * gxm + (j - gys) * gxm + (i - gxs)]
 65: #define Y2(k, j, i) y2[(k - zs) * ym * xm + (j - ys) * xm + (i - xs)]
 66:   for (it = 0; it < nwarm + nloop; it++) {
 67:     if (it == nwarm) PetscCall(PetscTime(&tstart));
 68:     for (k = zs; k < zs + zm; k++) {
 69:       for (j = ys; j < ys + ym; j++) {
 70:         for (i = xs; i < xs + xm; i++) Y2(k, j, i) = 6 * X2(k, j, i) - X2(k - 1, j, i) - X2(k, j - 1, i) - X2(k, j, i - 1) - X2(k + 1, j, i) - X2(k, j + 1, i) - X2(k, j, i + 1);
 71:       }
 72:     }
 73:   }
 74:   PetscCall(PetscTime(&tend));
 75:   *avgTime = (tend - tstart) / nloop;
 76: #undef X2
 77: #undef Y2
 78:   PetscFunctionReturn(PETSC_SUCCESS);
 79: }

 81: int main(int argc, char **argv)
 82: {
 83:   DM                                 da;
 84:   PetscInt                           xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
 85:   PetscInt                           dof = 1, sw = 1;
 86:   DMBoundaryType                     bx = DM_BOUNDARY_PERIODIC, by = DM_BOUNDARY_PERIODIC, bz = DM_BOUNDARY_PERIODIC;
 87:   DMDAStencilType                    st = DMDA_STENCIL_STAR;
 88:   Vec                                x, y; /* local/global vectors of the da */
 89:   PetscRandom                        rctx;
 90:   const PetscScalar               ***x1;
 91:   PetscScalar                     ***y1; /* arrays of g, ll */
 92:   const PetscScalar                 *x2;
 93:   PetscScalar                       *y2;
 94:   ConstPetscScalarKokkosOffsetView3D x3;
 95:   PetscScalarKokkosOffsetView3D      y3;
 96:   PetscLogDouble                     tstart = 0.0, tend = 0.0, avgTime = 0.0;
 97:   PetscInt                           nwarm = 2, nloop = 10;
 98:   PetscInt                           min = 32, max = 32 * 8; /* min and max sizes of the grids to sample */

100:   PetscFunctionBeginUser;
101:   PetscCall(PetscInitialize(&argc, &argv, nullptr, help));
102:   PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
103:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-min", &min, NULL));
104:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-max", &max, NULL));

106:   for (PetscInt len = min; len <= max; len = len * 2) {
107:     PetscCall(DMDACreate3d(PETSC_COMM_WORLD, bx, by, bz, st, len, len, len, PETSC_DECIDE, PETSC_DECIDE, PETSC_DECIDE, dof, sw, 0, 0, 0, &da));
108:     PetscCall(DMSetFromOptions(da));
109:     PetscCall(DMSetUp(da));

111:     PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
112:     PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
113:     PetscCall(DMCreateLocalVector(da, &x)); /* Create local x and global y */
114:     PetscCall(DMCreateGlobalVector(da, &y));

116:     /* Access with PETSc multi-dimensional arrays */
117:     PetscCall(VecSetRandom(x, rctx));
118:     PetscCall(VecSet(y, 0.0));
119:     PetscCall(DMDAVecGetArrayRead(da, x, &x1));
120:     PetscCall(DMDAVecGetArray(da, y, &y1));
121:     PetscCall(Update1(da, x1, y1, nwarm, nloop, &avgTime));
122:     PetscCall(DMDAVecRestoreArrayRead(da, x, &x1));
123:     PetscCall(DMDAVecRestoreArray(da, y, &y1));
124:     PetscCall(PetscTime(&tend));
125:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- PETSc average time  = %e\n", static_cast<int>(len), avgTime));

127:     /* Access with C multi-dimensional arrays */
128:     PetscCall(VecSetRandom(x, rctx));
129:     PetscCall(VecSet(y, 0.0));
130:     PetscCall(VecGetArrayRead(x, &x2));
131:     PetscCall(VecGetArray(y, &y2));
132:     PetscCall(Update2(da, x2, y2, nwarm, nloop, &avgTime));
133:     PetscCall(VecRestoreArrayRead(x, &x2));
134:     PetscCall(VecRestoreArray(y, &y2));
135:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- C average time      = %e\n", static_cast<int>(len), avgTime));

137:     /* Access with Kokkos multi-dimensional OffsetViews */
138:     PetscCall(VecSet(y, 0.0));
139:     PetscCall(VecSetRandom(x, rctx));
140:     PetscCall(DMDAVecGetKokkosOffsetView(da, x, &x3));
141:     PetscCall(DMDAVecGetKokkosOffsetView(da, y, &y3));

143:     for (PetscInt it = 0; it < nwarm + nloop; it++) {
144:       if (it == nwarm) PetscCall(PetscTime(&tstart));
145:       Kokkos::parallel_for(
146:         "stencil", MDRangePolicy<Kokkos::DefaultHostExecutionSpace, Rank<3, Iterate::Right, Iterate::Right>>({zs, ys, xs}, {zs + zm, ys + ym, xs + xm}),
147:         KOKKOS_LAMBDA(PetscInt k, PetscInt j, PetscInt i) { y3(k, j, i) = 6 * x3(k, j, i) - x3(k - 1, j, i) - x3(k, j - 1, i) - x3(k, j, i - 1) - x3(k + 1, j, i) - x3(k, j + 1, i) - x3(k, j, i + 1); });
148:     }
149:     PetscCall(PetscTime(&tend));
150:     PetscCall(DMDAVecRestoreKokkosOffsetView(da, x, &x3));
151:     PetscCall(DMDAVecRestoreKokkosOffsetView(da, y, &y3));
152:     avgTime = (tend - tstart) / nloop;
153:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- Kokkos average time = %e\n", static_cast<int>(len), avgTime));

155:     PetscCall(VecDestroy(&x));
156:     PetscCall(VecDestroy(&y));
157:     PetscCall(DMDestroy(&da));
158:   }
159:   PetscCall(PetscRandomDestroy(&rctx));
160:   PetscCall(PetscFinalize());
161:   return 0;
162: }

164: /*TEST
165:   build:
166:     requires: kokkos_kernels

168:   test:
169:     suffix: 1
170:     requires: kokkos_kernels
171:     args: -min 32 -max 32 -dm_vec_type kokkos
172:     filter: grep -v "time"

174: TEST*/