Actual source code: ex1k.kokkos.cxx

  1: static char help[] = "Benchmarking MatMult() with AIJ and its subclass matrix types\n";

  3: /*
  4: Usage:
  5:   mpirun -n <np> ./ex1k
  6:     -f <file>        : input PETSc matrix binary file; one can convert a file from MatrixMarket using mat/tests/ex72.c
  7:     -mat_type <type> : aij or its subclass. Default is aij.
  8:     -n <num>         : run MatMult() this many times and report average time. Default is 500.

 10: Notes:
 11:   It uses the host (CPU) timer, PetscTime(), to measure the time.

 13: Examples:
 14:   On OLCF Summit (with GPU-aware MPI)
 15:     # 6 MPI ranks:
 16:     # 6 resource sets (-n 6), 1 MPI rank per RS (-a 1), 7 CPU cores per RS (-c 7), 1 GPU per RS (-g 1), and 6 RSs per node (-r 6)
 17:     jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 ./ex1k -f 1138_bus.aij -mat_type aijcusparse

 19:     # 1 MPI rank
 20:     jsrun --smpiargs "-gpu" -n 1 -a 1 -c 7 -g 1 -r 1 ./ex1k -f 1138_bus.aij -mat_type aijcusparse

 22:   On OLCF Crusher:
 23:     # 1 MPI rank
 24:     # run with 1 node (-N1), 1 MPI rank (-n1), 2 hardware threads per rank (-c2)
 25:     srun -N1 -n1 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex1k -f HV15R.aij -mat_type aijkokkos

 27:     # 8 MPI ranks
 28:     srun -N1 -n8 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex1k -f HV15R.aij -mat_type aijkokkos
 29: */
 30: #include <petscmat.h>
 31: #include <petscdevice.h>

 33: #if defined(PETSC_HAVE_CUDA)
 34:   #include <petscdevice_cuda.h>
 35:   #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
 36: #elif defined(PETSC_HAVE_HIP)
 37:   #include <petscdevice_hip.h>
 38:   #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
 39: #elif defined(PETSC_HAVE_KOKKOS)
 40:   #include <Kokkos_Core.hpp>
 41:   #define SyncDevice() Kokkos::fence()
 42: #else
 43:   #define SyncDevice()
 44: #endif
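     /* SyncDevice() blocks the host until all previously launched device work has
        finished.  MatMult() with a GPU matrix type may return before its kernels
        complete, so the benchmark synchronizes before reading the host timer to
        make sure the measured interval includes the device work. */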

 46: int main(int argc, char **args)
 47: {
 48:   Mat            A, A2;
 49:   Vec            x, y, x2, y2;
 50:   PetscViewer    fd;
 51:   char           matfile[PETSC_MAX_PATH_LEN];
 52:   char           mattype[64];
 53:   PetscBool      flg;
 54:   PetscLogStage  stage;
 55:   PetscInt       i, n = 500, nskip = 5, M, N;
 56:   MatInfo        info;
 57:   PetscLogDouble tstart = 0, tend = 0, avgTime;
 58:   PetscRandom    rctx;
 59:   PetscReal      norm;
 60:   PetscMPIInt    size;

 62:   PetscFunctionBeginUser;
 63:   PetscCall(PetscInitialize(&argc, &args, (char *)0, help));
 64:   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));

 66:   /* Read options -n */
 67:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));

 69:   /* Load the matrix from a binary file */
 70:   PetscCall(PetscOptionsGetString(NULL, NULL, "-f", matfile, PETSC_MAX_PATH_LEN, &flg));
 71:   PetscCheck(flg, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must indicate a petsc matrix binary file with the -f option");
 72:   PetscCall(PetscOptionsGetString(NULL, NULL, "-mat_type", mattype, sizeof(mattype), &flg));
 73:   if (!flg) PetscCall(PetscStrncpy(mattype, MATAIJ, sizeof(mattype)));

 75:   /* Read the matrix file to A2 */
 76:   PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, matfile, FILE_MODE_READ, &fd));
 77:   PetscCall(MatCreate(PETSC_COMM_WORLD, &A2));
 78:   PetscCall(MatSetType(A2, MATAIJ));
 79:   PetscCall(MatLoad(A2, fd));
 80:   PetscCall(MatCreateVecs(A2, &x2, &y2));
 81:   PetscCall(PetscViewerDestroy(&fd));
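       /* A2 stays in the default MATAIJ (host) format; it is duplicated below to
          build the matrix under test and also provides the reference result used
          to validate MatMult(). */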

 83:   PetscCall(MatGetSize(A2, &M, &N));
 84:   PetscCall(MatGetInfo(A2, MAT_GLOBAL_SUM, &info));
 85:   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix %s: %" PetscInt_FMT " x %" PetscInt_FMT "; %lld nonzeros; %.1f per row\n", matfile, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));

 87:   /* Copy A2 to A and convert A to the specified type */
 88:   PetscCall(MatDuplicate(A2, MAT_COPY_VALUES, &A));
 89:   PetscCall(MatConvert(A, mattype, MAT_INPLACE_MATRIX, &A));
 90:   PetscCall(MatCreateVecs(A, &x, &y));
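       /* MatCreateVecs() creates vectors whose type is compatible with A, so when
          a GPU matrix type such as aijkokkos or aijcusparse is selected, x and y
          are device-capable vectors and MatMult(A, x, y) can run on the GPU. */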

 92:   /* Init x, x2 with the same value */
 93:   PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
 94:   PetscCall(VecSetRandom(x2, rctx));
 95:   PetscCall(PetscRandomDestroy(&rctx));
 96:   PetscCall(VecCopy(x2, x));

 98:   /* Compute the reference y2 = A2 x2 */
 99:   PetscCall(MatMult(A2, x2, y2));

101:   /* Measure y = Ax */
102:   PetscCall(PetscLogStageRegister("MatMult", &stage));
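       /* The first nskip iterations are untimed warm-up; they absorb one-time costs
          such as host-to-device copies before the clock starts at i == nskip. */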
103:   for (i = 0; i < n + nskip; i++) {
104:     if (i == nskip) {
105:       SyncDevice();
106:       PetscCall(PetscLogStagePush(stage));
107:       PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
108:       PetscCall(PetscTime(&tstart));
109:     }
110:     PetscCall(MatMult(A, x, y));
111:   }
112:   SyncDevice();
113:   PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
114:   PetscCall(PetscTime(&tend));
115:   avgTime = (tend - tstart) * 1e6 / n; /* microseconds */
116:   PetscCall(PetscLogStagePop());

118:   /* Validate y against y2 */
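       /* y comes from the converted matrix type and y2 from plain AIJ on the host;
          a 2-norm difference above 1e-6 indicates a broken MatMult(). */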
119:   PetscCall(VecAYPX(y2, -1, y));
120:   PetscCall(VecNorm(y2, NORM_2, &norm));
121:   PetscCheck(norm < 1e-6, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "MatMult() error with norm %g", (double)norm);
122:   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "MatMult() average time (us) with %d MPI ranks = %8.2f\n", size, avgTime));

124:   PetscCall(MatDestroy(&A));
125:   PetscCall(VecDestroy(&x));
126:   PetscCall(VecDestroy(&y));
127:   PetscCall(MatDestroy(&A2));
128:   PetscCall(VecDestroy(&x2));
129:   PetscCall(VecDestroy(&y2));
130:   PetscCall(PetscFinalize());
131:   return 0;
132: }

134: /*TEST

136:   testset:
137:     args: -n 2 -f ${DATAFILESPATH}/matrices/small
138:     nsize: 1
139:     filter: grep "DOES_NOT_EXIST"
140:     output_file: output/empty.out
141:     requires: datafilespath !complex double !single kokkos_kernels

143:     test:
144:       suffix: 1
145:       requires: cuda
146:       args: -mat_type aijcusparse

148:     test:
149:       suffix: 2
150:       args: -mat_type aijkokkos

152:     test:
153:       suffix: 3
154:       requires: hip
155:       args: -mat_type aijhipsparse

157: TEST*/