Actual source code: ex2k.c

  1: static char help[] = "Benchmarking VecMDot() or VecMAXPY()\n";
  2: /*
  3:   Usage:
  4:    mpirun -n <np> ./ex2k -vec_type <vector type>
  5:      -n  <n>  # number of data points of vector sizes from 128, 256, 512 and up. Maxima and default is 23.
  6:      -m  <m>  # run each VecMDot() m times to get the average time, default is 100.
  7:      -test_name <VecMDot or VecMAXPY>  # test to run, by default it is VecMDot
  8:      -output_bw <bool> # output bandwidth instead of time

 10:   Example:

 12:   Running on Frontier at OLCF:
 13:   # run with 1 mpi rank (-n1), 32 CPUs (-c32)
 14:   $ srun -n1 -c32 --gpus-per-node=8 --gpu-bind=closest ./ex2k -vec_type kokkos
 15: */

 17: #include <petscvec.h>
 18: #include <petscdevice.h>

 20: int main(int argc, char **argv)
 21: {
 22:   PetscInt           i, j, k, M, N, mcount, its = 100, nsamples, ncount, maxN;
 23:   PetscLogDouble     tstart, tend, times[8], fom; // figure of merit
 24:   Vec                x, *ys;
 25:   PetscScalar       *vals;
 26:   PetscMPIInt        size;
 27:   PetscDeviceContext dctx;
 28:   char               testName[64] = "VecMDot"; // By default, test VecMDot
 29:   PetscBool          testMDot, testMAXPY;
 30:   PetscBool          outputBW = PETSC_FALSE; // output bandwidth instead of time
 31:   PetscRandom        rnd;
 32:   PetscLogStage      stage1;
 33:   // clang-format off
 34:   // Try vectors of these (local) sizes. The max is very close to 2^31
 35:   PetscInt  Ms[]  = {128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
 36:                      65536, 131072, 262144, 524288, 1048576, 2097152, 4194304,
 37:                      8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912};
 38:   PetscInt  Ns[] = {1, 3, 8, 30}; // try this number of y vectors in VecMDot
 39:   // clang-format on

 41:   PetscFunctionBeginUser;
 42:   PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
 43:   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
 44:   PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rnd));

 46:   mcount = sizeof(Ms) / sizeof(Ms[0]); // length of Ms[]
 47:   ncount = sizeof(Ns) / sizeof(Ns[0]); // length of Ns[]
 48:   maxN   = Ns[ncount - 1];             // at most this many y vectors

 50:   nsamples = mcount;
 51:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &mcount, NULL)); // Up to vectors of local size 2^{mcount+6}
 52:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-m", &its, NULL));    // Run each VecMDot() its times
 53:   PetscCall(PetscOptionsGetString(NULL, NULL, "-test_name", testName, sizeof(testName), NULL));
 54:   PetscCall(PetscOptionsGetBool(NULL, NULL, "-output_bw", &outputBW, NULL));
 55:   PetscCall(PetscStrncmp(testName, "VecMDot", sizeof(testName), &testMDot));
 56:   PetscCall(PetscStrncmp(testName, "VecMAXPY", sizeof(testName), &testMAXPY));
 57:   PetscCheck(testMDot || testMAXPY, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Unsupported test name: %s", testName);
 58:   PetscCall(PetscDeviceContextGetCurrentContext(&dctx));
 59:   PetscCall(PetscMalloc1(maxN, &vals));
 60:   for (j = 0; j < maxN; j++) vals[j] = 3.14 + j; // same across all processes

 62:   PetscCall(PetscLogStageRegister("Profiling", &stage1));
 63:   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Vector(N)   "));
 64:   for (j = 0; j < ncount; j++) PetscCall(PetscPrintf(PETSC_COMM_WORLD, "   %s-%" PetscInt_FMT " ", testName, Ns[j]));
 65:   PetscCall(PetscPrintf(PETSC_COMM_WORLD, outputBW ? " (GB/s)\n" : " (us)\n"));
 66:   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "--------------------------------------------------------------------------\n"));

 68:   nsamples = PetscMin(nsamples, mcount);
 69:   for (k = 0; k < nsamples; k++) { // for vector (local) size M
 70:     M = Ms[k];
 71:     PetscCall(VecCreate(PETSC_COMM_WORLD, &x));
 72:     PetscCall(VecSetFromOptions(x));
 73:     PetscCall(VecSetSizes(x, M, PETSC_DECIDE));
 74:     PetscCall(VecSetUp(x));
 75:     PetscCall(VecDuplicateVecs(x, maxN, &ys));
 76:     PetscCall(VecSetRandom(x, rnd));
 77:     for (i = 0; i < maxN; i++) PetscCall(VecSetRandom(ys[i], rnd));

 79:     for (j = 0; j < ncount; j++) { // try N y vectors
 80:       // Warm-up
 81:       N = Ns[j];
 82:       for (i = 0; i < 2; i++) {
 83:         if (testMDot) PetscCall(VecMDot(x, N, ys, vals));
 84:         else if (testMAXPY) PetscCall(VecMAXPY(x, N, vals, ys));
 85:       }
 86:       PetscCall(PetscDeviceContextSynchronize(dctx));
 87:       PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));

 89:       PetscCall(PetscLogStagePush(stage1)); // use LogStage so that -log_view result will be clearer
 90:       PetscCall(PetscTime(&tstart));
 91:       for (i = 0; i < its; i++) {
 92:         if (testMDot) PetscCall(VecMDot(x, N, ys, vals));
 93:         else if (testMAXPY) PetscCall(VecMAXPY(x, N, vals, ys));
 94:       }
 95:       PetscCall(PetscDeviceContextSynchronize(dctx));
 96:       PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
 97:       PetscCall(PetscTime(&tend));
 98:       times[j] = (tend - tstart) * 1e6 / its;
 99:       PetscCall(PetscLogStagePop());
100:     }

102:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%12" PetscInt_FMT, M));
103:     for (j = 0; j < ncount; j++) {
104:       N = Ns[j];
105:       if (outputBW) {
106:         // Read N y vectors and x vector of size M, and then write vals[] of size N
107:         PetscLogDouble bytes = (M * (N + 1.0) + N) * sizeof(PetscScalar);
108:         fom                  = (bytes / times[j]) * 1e-3;
109:       } else {
110:         fom = times[j];
111:       }
112:       PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%12.1f ", fom));
113:     }
114:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\n"));

116:     PetscCall(VecDestroy(&x));
117:     PetscCall(VecDestroyVecs(maxN, &ys));
118:   }

120:   PetscCall(PetscRandomDestroy(&rnd));
121:   PetscCall(PetscFree(vals));
122:   PetscCall(PetscFinalize());
123:   return 0;
124: }

126: /*TEST
127:   testset:
128:     args: -n 2 -m 2 -test_name {{VecMDot  VecMAXPY}}
129:     output_file: output/empty.out
130:     filter: grep "DOES_NOT_EXIST"

132:     test:
133:       suffix: standard

135:     test:
136:       requires: kokkos_kernels
137:       suffix: kok
138:       args: -vec_type kokkos

140: TEST*/