Actual source code: ex2.c
static char help[] = "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When the destination data is on the GPU, SF performs
  these operations in the default stream and does not synchronize them, since it assumes the routines
  that consume the destination data are also on the default stream. However, when the destination data
  is on the CPU, SF must guarantee the data is ready to use on the CPU after PetscSFXxxEnd().
*/
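
/*
  A minimal sketch of the synchronization contract this test exercises, assuming the backend uses
  plain CUDA calls (hbuf, dbuf, bytes, and stream are hypothetical names, not SF internals):

    cudaMemcpyAsync(hbuf, dbuf, bytes, cudaMemcpyDeviceToHost, stream); // truly asynchronous only when hbuf is pinned
    cudaStreamSynchronize(stream);                                      // required before the CPU may read hbuf

  When the destination data is on the CPU, PetscSFXxxEnd() must perform the equivalent of that
  final synchronization.
*/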
#include <petscvec.h>
int main(int argc, char **argv)
{
  PetscInt           i, n = 100000; /* Big enough to make the asynchronous copy meaningful */
  PetscScalar       *val;
  const PetscScalar *yval;
  Vec                x, y;
  PetscMPIInt        size;
  IS                 ix, iy;
  VecScatter         vscat;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
  PetscCheck(size == 1, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "This is a uni-processor test");

  /* Create two CUDA vectors x and y. Though we only care about y's memory on the host, we make y a
     CUDA vector as well, since we want y's host memory to be pinned (i.e., non-pageable), to really
     trigger an asynchronous cudaMemcpyDeviceToHost.
  */
  PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &x));
  PetscCall(VecSetFromOptions(x));
  PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &y));
  PetscCall(VecSetFromOptions(y));
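  /* Note: with -vec_type cuda (or -vec_type hip) supplied by the test harness below, VecSetFromOptions()
     turns x and y into device vectors, and -vec_pinned_memory_min 0 forces their host buffers to be pinned */
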
  /* Initialize x and y, and push them to the GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
  PetscCall(VecGetArray(x, &val));
  for (i = 0; i < n; i++) val[i] = i / 2.0;
  PetscCall(VecRestoreArray(x, &val));
  PetscCall(VecScale(x, 2.0));
  PetscCall(VecSet(y, 314));

  /* Pull y to the CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
  PetscCall(VecGetArray(y, &val));
  PetscCall(VecRestoreArray(y, &val));
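  /* VecGetArray() on a device vector copies the data to the host and marks the host copy as the
     valid one; the immediate VecRestoreArray() leaves it there unchanged */
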
  /* The vscat is simply a vector copy */
  PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &ix));
  PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iy));
  PetscCall(VecScatterCreate(x, ix, y, iy, &vscat));
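  /* Both index sets are the identity map 0..n-1, so this scatter amounts to VecCopy(x, y), but it
     is routed through SF and thus exercises the device-to-host communication path under test */
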
  /* Do a device-to-host vecscatter and then immediately use y on the host. VecScatter/SF may use
     asynchronous cudaMemcpy calls or kernels, but it must guarantee y is ready to use on the host
     afterwards. Otherwise, wrong data will be displayed.
  */
  PetscCall(VecScatterBegin(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecGetArrayRead(y, &yval));
  /* Display the first and the last entries of y to see if it is valid on the host */
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n", (float)PetscRealPart(yval[0]), n - 1, (float)PetscRealPart(yval[n - 1])));
  PetscCall(VecRestoreArrayRead(y, &yval));
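  /* Since x[i] = i after the scaling above, a correctly synchronized scatter prints
     y[0]=0, y[99999] = 99999; if SF failed to synchronize, y could still show the stale value 314 */
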
  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&y));
  PetscCall(ISDestroy(&ix));
  PetscCall(ISDestroy(&iy));
  PetscCall(VecScatterDestroy(&vscat));
  PetscCall(PetscFinalize());
  return 0;
}

/*TEST

  test:
    requires: cuda
    diff_args: -j
    # make sure the host memory is pinned
    # sf_backend cuda is not needed if compiling only with cuda
    args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

  test:
    suffix: hip
    requires: hip
    diff_args: -j
    output_file: output/ex2_1.out
    # make sure the host memory is pinned
    # sf_backend hip is not needed if compiling only with hip
    args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/
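
/*
  A hypothetical manual run (build targets and paths depend on the local PETSc installation):
    ./ex2 -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0
  which should print: y[0]=0, y[99999] = 99999
*/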