Actual source code: ex1k.kokkos.cxx
static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";
/*
  This is a simple test to measure the latency of MPI communication.
  The test is run with two processes. The first process sends a message
  to the second process, and after having received the message, the second
  process sends one message back to the first. This is repeated a number
  of times. The latency is defined as half of the round-trip time.

  It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).

  Usage: mpirun -n 2 ./ex1k -mtype <type>
  Other arguments have default values matching those used in osu_latency.

  Examples:

  On Summit at OLCF:
    jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex1k -mtype kokkos

  On Crusher at OLCF:
    srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex1k -mtype kokkos
*/
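
/*
  For reference, the communication pattern being timed is the classic MPI ping-pong.
  A minimal sketch of the equivalent raw-MPI inner loop (illustrative only; the
  benchmark below uses PetscSF Bcast/Reduce rather than MPI_Send/MPI_Recv):

    if (rank == 0) {
      MPI_Send(buf, n, MPIU_SCALAR, 1, 0, comm);
      MPI_Recv(buf, n, MPIU_SCALAR, 1, 0, comm, MPI_STATUS_IGNORE);
    } else {
      MPI_Recv(buf, n, MPIU_SCALAR, 0, 0, comm, MPI_STATUS_IGNORE);
      MPI_Send(buf, n, MPIU_SCALAR, 0, 0, comm);
    }
    latency (us) = elapsed * 1e6 / (2 * niter)
*/
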
#include <petscsf.h>
#include <Kokkos_Core.hpp>

/* Same values as OSU microbenchmarks */
#define LAT_LOOP_SMALL     10000
#define LAT_SKIP_SMALL     100
#define LAT_LOOP_LARGE     1000
#define LAT_SKIP_LARGE     10
#define LARGE_MESSAGE_SIZE 8192

int main(int argc, char **argv)
{
  PetscSF        sf[64];
  PetscLogDouble t_start = 0, t_end = 0, time[64];
  PetscInt       i, j, n, nroots, nleaves, niter = 100, nskip = 10;
  PetscInt       maxn = 512 * 1024; /* max message size: 512K PetscScalars, i.e., 4M bytes */
  PetscSFNode   *iremote;
  PetscMPIInt    rank, size;
  PetscScalar   *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
  size_t         msgsize;
  PetscMemType   mtype = PETSC_MEMTYPE_HOST;
  char           mstring[16] = {0};
  PetscBool      set;
  PetscInt       skipSmall = -1, loopSmall = -1;
  MPI_Op         op = MPI_REPLACE;
50: Kokkos::initialize(argc, argv); // Test initializing kokkos before petsc
51: PetscCall(PetscInitialize(&argc, &argv, NULL, help));
52: PetscCall(PetscKokkosInitializeCheck());
54: PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
55: PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
56: PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes");

  PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL));

  PetscCall(PetscMalloc1(maxn, &iremote));
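
  /* -mtype chooses where rootdata/leafdata live: "host" for host memory, "kokkos" for Kokkos (device) memory */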
  PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set));
  if (set) {
    PetscBool isHost, isKokkos;
    PetscCall(PetscStrcasecmp(mstring, "host", &isHost));
    PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos));
    if (isHost) mtype = PETSC_MEMTYPE_HOST;
    else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS;
    else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
  }
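
  /* Allocate root/leaf buffers in the requested memory; pbuf/ebuf are host-side reference patterns */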
  if (mtype == PETSC_MEMTYPE_HOST) {
    PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata));
  } else {
    PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
    PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
  }
  PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf));
  for (i = 0; i < maxn; i++) {
    pbuf[i] = 123.0;
    ebuf[i] = 456.0;
  }
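
  /* For each message size n, build a star forest whose graph has rank 1's n leaves all
     referencing rank 0's n roots. A Bcast on this SF moves rootdata (rank 0) to leafdata
     (rank 1), and a Reduce with MPI_REPLACE moves it back, so one Bcast/Reduce pair in the
     timed loop below amounts to one ping-pong round trip. */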
  for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
    PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i]));
    PetscCall(PetscSFSetFromOptions(sf[i]));
    if (rank == 0) {
      nroots  = n;
      nleaves = 0;
    } else {
      nroots  = 0;
      nleaves = n;
      for (j = 0; j < nleaves; j++) {
        iremote[j].rank  = 0;
        iremote[j].index = j;
      }
    }
    PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
    PetscCall(PetscSFSetUp(sf[i]));
  }

  if (loopSmall > 0) {
    nskip = skipSmall;
    niter = loopSmall;
  } else {
    nskip = LAT_SKIP_SMALL;
    niter = LAT_LOOP_SMALL;
  }
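
  /* Sweep message sizes in powers of two, reinitializing the buffers for each size */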
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    msgsize = sizeof(PetscScalar) * n;
    if (mtype == PETSC_MEMTYPE_HOST) {
      PetscCall(PetscArraycpy(rootdata, pbuf, n));
      PetscCall(PetscArraycpy(leafdata, ebuf, n));
    } else {
      Kokkos::View<PetscScalar *>                          dst1((PetscScalar *)rootdata, n);
      Kokkos::View<PetscScalar *>                          dst2((PetscScalar *)leafdata, n);
      Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n);
      Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n);
      PetscCallCXX(Kokkos::deep_copy(dst1, src1));
      PetscCallCXX(Kokkos::deep_copy(dst2, src2));
    }

    if (msgsize > LARGE_MESSAGE_SIZE) {
      nskip = LAT_SKIP_LARGE;
      niter = LAT_LOOP_LARGE;
    }
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
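
    /* The first nskip iterations are untimed warm-up; the timer starts once i reaches nskip */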
    for (i = 0; i < niter + nskip; i++) {
      if (i == nskip) {
        PetscCallCXX(Kokkos::fence());
        PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
        t_start = MPI_Wtime();
      }
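      /* Ping: Bcast moves rootdata on rank 0 into leafdata on rank 1; pong: Reduce with
         MPI_REPLACE moves leafdata back into rootdata */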
      PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op));
      PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op));
      PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op));
      PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op));
    }
    PetscCallCXX(Kokkos::fence());
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
    t_end = MPI_Wtime();
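    /* Each timed iteration is one round trip (two messages), so the one-way latency in
       microseconds is elapsed * 1e6 / (niter * 2) */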
    time[j] = (t_end - t_start) * 1e6 / (niter * 2);
  }

  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t## PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device"));
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    PetscCall(PetscSFDestroy(&sf[j]));
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]));
  }
  PetscCall(PetscFree2(pbuf, ebuf));
  if (mtype == PETSC_MEMTYPE_HOST) {
    PetscCall(PetscFree2(rootdata, leafdata));
  } else {
    PetscCallCXX(Kokkos::kokkos_free(rootdata));
    PetscCallCXX(Kokkos::kokkos_free(leafdata));
  }
  PetscCall(PetscFree(iremote));
  PetscCall(PetscFinalize());
  Kokkos::finalize();
  return 0;
}

/*TEST
  testset:
    requires: kokkos
    # use small numbers to make the test cheap
    args: -maxn 4 -skipSmall 1 -loopSmall 1
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    nsize: 2

    test:
      args: -mtype {{host kokkos}}

    test:
      requires: mpix_stream
      args: -mtype kokkos -sf_use_stream_aware_mpi 1

TEST*/