Actual source code: ex2k.kokkos.cxx
static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n";

/*
  On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos:
  Time (sec.), on Mar. 1, 2022
  ------------------------------------------
  n      PETSc         C             Kokkos
  ------------------------------------------
  32     4.6464E-05    4.7451E-05    1.6880E-04
  64     2.5654E-04    2.5164E-04    5.6780E-04
  128    1.9383E-03    1.8878E-03    4.7938E-03
  256    1.4450E-02    1.3619E-02    3.7778E-02
  512    1.1580E-01    1.1551E-01    2.8428E-01
  1024   1.4179E+00    1.3772E+00    2.2873E+00

  Overall, C is -2% ~ 5% faster than PETSc, but Kokkos is 1.6x ~ 3.6x slower than PETSc.
*/

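/* petscdmda_kokkos.hpp declares the DMDAVecGetKokkosOffsetView()/DMDAVecRestoreKokkosOffsetView()
   accessors used below; petscdm.h and petscdmda.h provide the generic DM and DMDA interfaces used
   by the PETSc and plain-C variants */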
#include <petscdmda_kokkos.hpp>
#include <petscdm.h>
#include <petscdmda.h>

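/* A Kokkos OffsetView is a View whose index ranges may start at nonzero values, so the views below
   can be addressed directly with the DMDA's global (k,j,i) indices. LayoutRight matches the row-major
   ordering PETSc uses for DMDA arrays, and HostSpace is used because this benchmark runs on the host only. */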
using Kokkos::Iterate;
using Kokkos::MDRangePolicy;
using Kokkos::Rank;
using PetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;
using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;

/* PETSc multi-dimensional array access */
static PetscErrorCode Update1(DM da, const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
{
  PetscInt it, i, j, k;
  PetscLogDouble tstart = 0.0, tend;
  PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;

  PetscFunctionBegin;
  PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
  PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
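  /* Run nwarm untimed warm-up sweeps, then time nloop sweeps of a 7-point star-stencil update:
     each owned entry of y1 is 6 times the center value of x1 minus its six face neighbors. The
     periodic DMDA with stencil width 1 guarantees the neighbor values (k-1, k+1, etc.) are present
     as ghost points in the local array x1. */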
  for (it = 0; it < nwarm + nloop; it++) {
    if (it == nwarm) PetscCall(PetscTime(&tstart));
    for (k = zs; k < zs + zm; k++) {
      for (j = ys; j < ys + ym; j++) {
        for (i = xs; i < xs + xm; i++) y1[k][j][i] = 6 * x1[k][j][i] - x1[k - 1][j][i] - x1[k][j - 1][i] - x1[k][j][i - 1] - x1[k + 1][j][i] - x1[k][j + 1][i] - x1[k][j][i + 1];
      }
    }
  }
  PetscCall(PetscTime(&tend));
  *avgTime = (tend - tstart) / nloop;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* C multi-dimensional array access */
static PetscErrorCode Update2(DM da, const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
{
  PetscInt it, i, j, k;
  PetscLogDouble tstart = 0.0, tend;
  PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;

  PetscFunctionBegin;
  PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
  PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
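  /* x2 comes from the ghosted local vector, so it is addressed relative to the ghost corners
     (gxs, gys, gzs) with ghosted extents (gxm, gym, gzm); y2 comes from the global vector, so it
     is addressed relative to the owned corners (xs, ys, zs) with owned extents (xm, ym, zm) */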
#define X2(k, j, i) x2[(k - gzs) * gym * gxm + (j - gys) * gxm + (i - gxs)]
#define Y2(k, j, i) y2[(k - zs) * ym * xm + (j - ys) * xm + (i - xs)]
  for (it = 0; it < nwarm + nloop; it++) {
    if (it == nwarm) PetscCall(PetscTime(&tstart));
    for (k = zs; k < zs + zm; k++) {
      for (j = ys; j < ys + ym; j++) {
        for (i = xs; i < xs + xm; i++) Y2(k, j, i) = 6 * X2(k, j, i) - X2(k - 1, j, i) - X2(k, j - 1, i) - X2(k, j, i - 1) - X2(k + 1, j, i) - X2(k, j + 1, i) - X2(k, j, i + 1);
      }
    }
  }
  PetscCall(PetscTime(&tend));
  *avgTime = (tend - tstart) / nloop;
#undef X2
#undef Y2
  PetscFunctionReturn(PETSC_SUCCESS);
}

int main(int argc, char **argv)
{
  DM da;
  PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
  PetscInt dof = 1, sw = 1;
  DMBoundaryType bx = DM_BOUNDARY_PERIODIC, by = DM_BOUNDARY_PERIODIC, bz = DM_BOUNDARY_PERIODIC;
  DMDAStencilType st = DMDA_STENCIL_STAR;
  Vec x, y; /* local/global vectors of the da */
  PetscRandom rctx;
  const PetscScalar ***x1;
  PetscScalar ***y1; /* arrays of x and y */
  const PetscScalar *x2;
  PetscScalar *y2;
  ConstPetscScalarKokkosOffsetView3D x3;
  PetscScalarKokkosOffsetView3D y3;
  PetscLogDouble tstart = 0.0, tend = 0.0, avgTime = 0.0;
  PetscInt nwarm = 2, nloop = 10;
  PetscInt min = 32, max = 32 * 8; /* min and max sizes of the grids to sample */

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
  PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-min", &min, NULL));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-max", &max, NULL));

  for (PetscInt len = min; len <= max; len = len * 2) {
    PetscCall(DMDACreate3d(PETSC_COMM_WORLD, bx, by, bz, st, len, len, len, PETSC_DECIDE, PETSC_DECIDE, PETSC_DECIDE, dof, sw, 0, 0, 0, &da));
    PetscCall(DMSetFromOptions(da));
    PetscCall(DMSetUp(da));

    PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
    PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
    PetscCall(DMCreateLocalVector(da, &x)); /* Create local x and global y */
    PetscCall(DMCreateGlobalVector(da, &y));
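    /* x is a local (ghosted) vector because the stencil reads neighbor values across the owned
       boundary; y is a global vector because only owned entries are written */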

    /* Access with PETSc multi-dimensional arrays */
    PetscCall(VecSetRandom(x, rctx));
    PetscCall(VecSet(y, 0.0));
    PetscCall(DMDAVecGetArrayRead(da, x, &x1));
    PetscCall(DMDAVecGetArray(da, y, &y1));
    PetscCall(Update1(da, x1, y1, nwarm, nloop, &avgTime));
    PetscCall(DMDAVecRestoreArrayRead(da, x, &x1));
    PetscCall(DMDAVecRestoreArray(da, y, &y1));
    PetscCall(PetscTime(&tend));
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4" PetscInt_FMT "^3 -- PETSc average time = %e\n", len, avgTime));

    /* Access with C multi-dimensional arrays */
    PetscCall(VecSetRandom(x, rctx));
    PetscCall(VecSet(y, 0.0));
    PetscCall(VecGetArrayRead(x, &x2));
    PetscCall(VecGetArray(y, &y2));
    PetscCall(Update2(da, x2, y2, nwarm, nloop, &avgTime));
    PetscCall(VecRestoreArrayRead(x, &x2));
    PetscCall(VecRestoreArray(y, &y2));
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4" PetscInt_FMT "^3 -- C average time = %e\n", len, avgTime));

    /* Access with Kokkos multi-dimensional OffsetViews */
    PetscCall(VecSet(y, 0.0));
    PetscCall(VecSetRandom(x, rctx));
    PetscCall(DMDAVecGetKokkosOffsetView(da, x, &x3));
    PetscCall(DMDAVecGetKokkosOffsetView(da, y, &y3));
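
    /* Same warm-up/timing scheme as in Update1/Update2, but the sweep is an MDRangePolicy over the
       owned index box. Rank<3, Iterate::Right, Iterate::Right> iterates in the same k-j-i (row-major)
       order as the hand-written loops, on the default host execution space, and the OffsetViews are
       indexed with the same global (k,j,i) indices as the PETSc arrays. */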
    for (PetscInt it = 0; it < nwarm + nloop; it++) {
      if (it == nwarm) PetscCall(PetscTime(&tstart));
      Kokkos::parallel_for(
        "stencil", MDRangePolicy<Kokkos::DefaultHostExecutionSpace, Rank<3, Iterate::Right, Iterate::Right>>({zs, ys, xs}, {zs + zm, ys + ym, xs + xm}),
        KOKKOS_LAMBDA(PetscInt k, PetscInt j, PetscInt i) { y3(k, j, i) = 6 * x3(k, j, i) - x3(k - 1, j, i) - x3(k, j - 1, i) - x3(k, j, i - 1) - x3(k + 1, j, i) - x3(k, j + 1, i) - x3(k, j, i + 1); });
    }
    PetscCall(PetscTime(&tend));
    PetscCall(DMDAVecRestoreKokkosOffsetView(da, x, &x3));
    PetscCall(DMDAVecRestoreKokkosOffsetView(da, y, &y3));
    avgTime = (tend - tstart) / nloop;
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4" PetscInt_FMT "^3 -- Kokkos average time = %e\n", len, avgTime));

    PetscCall(VecDestroy(&x));
    PetscCall(VecDestroy(&y));
    PetscCall(DMDestroy(&da));
  }
  PetscCall(PetscRandomDestroy(&rctx));
  PetscCall(PetscFinalize());
  return 0;
}

/*TEST
  build:
    requires: kokkos_kernels

  test:
    suffix: 1
    requires: kokkos_kernels
    args: -min 32 -max 32 -dm_vec_type kokkos
    filter: grep -v "time"

TEST*/