ex2k.c
static char help[] = "Benchmarking VecMDot() or VecMAXPY()\n";
/*
  Usage:
   mpirun -n <np> ./ex2k -vec_type <vector type>
     -n <n>  # number of vector sizes to test, doubling from 128 (i.e., 128, 256, 512, ...); maximum and default is 23
     -m <m>  # run each VecMDot() m times to get the average time; default is 100
     -test_name <VecMDot or VecMAXPY> # test to run; default is VecMDot
     -output_bw <bool> # output bandwidth instead of time

  Example:

  Running on Frontier at OLCF:
   # run with 1 MPI rank (-n1) and 32 CPU cores (-c32)
   $ srun -n1 -c32 --gpus-per-node=8 --gpu-bind=closest ./ex2k -vec_type kokkos
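
  Running on a machine with NVIDIA GPUs (assuming PETSc was configured with CUDA support):
   $ mpirun -n 1 ./ex2k -vec_type cuda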
*/
#include <petscvec.h>
#include <petscdevice.h>
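
// petscdevice.h supplies PetscDeviceContext, used below to ensure asynchronous
// device (GPU) work has finished before the timers are read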

int main(int argc, char **argv)
{
  PetscInt           i, j, k, M, N, mcount, its = 100, nsamples, ncount, maxN;
  PetscLogDouble     tstart, tend, times[8], fom; // figure of merit
  Vec                x, *ys;
  PetscScalar       *vals;
  PetscMPIInt        size;
  PetscDeviceContext dctx;
  char               testName[64] = "VecMDot"; // By default, test VecMDot
  PetscBool          testMDot, testMAXPY;
  PetscBool          outputBW = PETSC_FALSE; // output bandwidth instead of time
  PetscRandom        rnd;
  PetscLogStage      stage1;
  // clang-format off
  // Try vectors of these (local) sizes; the largest is 2^29
  PetscInt Ms[] = {128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
                   65536, 131072, 262144, 524288, 1048576, 2097152, 4194304,
                   8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912};
  PetscInt Ns[] = {1, 3, 8, 30}; // try this many y vectors in VecMDot()/VecMAXPY()
  // clang-format on

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
  PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rnd));

  mcount = sizeof(Ms) / sizeof(Ms[0]); // length of Ms[]
  ncount = sizeof(Ns) / sizeof(Ns[0]); // length of Ns[]
  maxN   = Ns[ncount - 1];             // at most this many y vectors

  nsamples = mcount;
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &mcount, NULL)); // up to vectors of local size 2^{mcount+6}
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-m", &its, NULL));    // run each VecMDot() its times
  PetscCall(PetscOptionsGetString(NULL, NULL, "-test_name", testName, sizeof(testName), NULL));
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-output_bw", &outputBW, NULL));
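  // Decide which routine to benchmark by matching the -test_name string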
  PetscCall(PetscStrncmp(testName, "VecMDot", sizeof(testName), &testMDot));
  PetscCall(PetscStrncmp(testName, "VecMAXPY", sizeof(testName), &testMAXPY));
  PetscCheck(testMDot || testMAXPY, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Unsupported test name: %s", testName);
  PetscCall(PetscDeviceContextGetCurrentContext(&dctx));
  PetscCall(PetscMalloc1(maxN, &vals));
  for (j = 0; j < maxN; j++) vals[j] = 3.14 + j; // same across all processes

  PetscCall(PetscLogStageRegister("Profiling", &stage1));
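  // Print the table header: one column of results per value of N in Ns[]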
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Vector(N) "));
  for (j = 0; j < ncount; j++) PetscCall(PetscPrintf(PETSC_COMM_WORLD, " %s-%" PetscInt_FMT " ", testName, Ns[j]));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, outputBW ? " (GB/s)\n" : " (us)\n"));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "--------------------------------------------------------------------------\n"));

  nsamples = PetscMin(nsamples, mcount);
  for (k = 0; k < nsamples; k++) { // for each vector (local) size M
    M = Ms[k];
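    // Create the x vector and maxN y vectors of local size M; -vec_type selects the backend (e.g., standard, kokkos, cuda)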
    PetscCall(VecCreate(PETSC_COMM_WORLD, &x));
    PetscCall(VecSetFromOptions(x));
    PetscCall(VecSetSizes(x, M, PETSC_DECIDE));
    PetscCall(VecSetUp(x));
    PetscCall(VecDuplicateVecs(x, maxN, &ys));
    PetscCall(VecSetRandom(x, rnd));
    for (i = 0; i < maxN; i++) PetscCall(VecSetRandom(ys[i], rnd));

    for (j = 0; j < ncount; j++) { // try N y vectors
      // Warm up to populate caches and trigger any one-time device setup before timing
      N = Ns[j];
      for (i = 0; i < 2; i++) {
        if (testMDot) PetscCall(VecMDot(x, N, ys, vals));
        else if (testMAXPY) PetscCall(VecMAXPY(x, N, vals, ys));
      }
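      // Drain outstanding device work and align all ranks so the timed region starts together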
      PetscCall(PetscDeviceContextSynchronize(dctx));
      PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));

      PetscCall(PetscLogStagePush(stage1)); // use a log stage so the -log_view output is easier to read
      PetscCall(PetscTime(&tstart));
      for (i = 0; i < its; i++) {
        if (testMDot) PetscCall(VecMDot(x, N, ys, vals));
        else if (testMAXPY) PetscCall(VecMAXPY(x, N, vals, ys));
      }
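      // Synchronize again so the timer only stops after all device work has completed on every rank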
      PetscCall(PetscDeviceContextSynchronize(dctx));
      PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
      PetscCall(PetscTime(&tend));
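      // Average wall time per call, in microseconds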
      times[j] = (tend - tstart) * 1e6 / its;
      PetscCall(PetscLogStagePop());
    }

    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%12" PetscInt_FMT, M));
    for (j = 0; j < ncount; j++) {
      N = Ns[j];
      if (outputBW) {
        // Traffic model: read N y vectors and the x vector, each of size M, then write vals[] of size N.
        // This matches VecMDot; note it slightly undercounts VecMAXPY, which also writes x back.
        PetscLogDouble bytes = (M * (N + 1.0) + N) * sizeof(PetscScalar);
        fom = (bytes / times[j]) * 1e-3; // bytes per microsecond, scaled by 1e-3 to give GB/s
      } else {
        fom = times[j];
      }
      PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%12.1f ", fom));
    }
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\n"));

    PetscCall(VecDestroy(&x));
    PetscCall(VecDestroyVecs(maxN, &ys));
  }

  PetscCall(PetscRandomDestroy(&rnd));
  PetscCall(PetscFree(vals));
  PetscCall(PetscFinalize());
  return 0;
}

/*TEST
  testset:
    args: -n 2 -m 2 -test_name {{VecMDot VecMAXPY}}
    output_file: output/empty.out
    filter: grep "DOES_NOT_EXIST"

    test:
      suffix: standard

    test:
      requires: kokkos_kernels
      suffix: kok
      args: -vec_type kokkos

TEST*/