Actual source code: hpddm.cu
1: #define HPDDM_MIXED_PRECISION 1
2: #include <petsc/private/petschpddm.h>
3: #include <petscdevice_cuda.h>
4: #include <thrust/device_ptr.h>
6: PetscErrorCode KSPSolve_HPDDM_CUDA_Private(KSP_HPDDM *data, const PetscScalar *b, PetscScalar *x, PetscInt n, MPI_Comm comm)
7: {
8: const PetscInt N = data->op->getDof() * n;
9: #if PetscDefined(USE_REAL_DOUBLE)
10: typedef HPDDM::downscaled_type<PetscScalar> K;
11: #endif
12: #if PetscDefined(USE_REAL_SINGLE)
13: typedef HPDDM::upscaled_type<PetscScalar> K;
14: #endif
16: PetscFunctionBegin; // TODO: remove all cudaMemcpy() once HPDDM::IterativeMethod::solve() handles device pointers
17: if (data->precision != PETSC_KSPHPDDM_DEFAULT_PRECISION) {
18: const thrust::device_ptr<const PetscScalar> db = thrust::device_pointer_cast(b);
19: const thrust::device_ptr<PetscScalar> dx = thrust::device_pointer_cast(x);
20: K *ptr, *host_ptr;
21: thrust::device_ptr<K> dptr[2];
23: PetscCall(PetscMalloc1(2 * N, &host_ptr));
24: PetscCallCUDA(cudaMalloc((void **)&ptr, 2 * N * sizeof(K)));
25: dptr[0] = thrust::device_pointer_cast(ptr);
26: dptr[1] = thrust::device_pointer_cast(ptr + N);
27: thrust::copy_n(thrust::cuda::par.on(PetscDefaultCudaStream), db, N, dptr[0]);
28: thrust::copy_n(thrust::cuda::par.on(PetscDefaultCudaStream), dx, N, dptr[1]);
29: PetscCallCUDA(cudaMemcpy(host_ptr, ptr, 2 * N * sizeof(K), cudaMemcpyDeviceToHost));
30: PetscCall(HPDDM::IterativeMethod::solve(*data->op, host_ptr, host_ptr + N, n, comm));
31: PetscCallCUDA(cudaMemcpy(ptr + N, host_ptr + N, N * sizeof(K), cudaMemcpyHostToDevice));
32: thrust::copy_n(thrust::cuda::par.on(PetscDefaultCudaStream), dptr[1], N, dx);
33: PetscCallCUDA(cudaFree(ptr));
34: PetscCall(PetscFree(host_ptr));
35: PetscCall(PetscLogGpuToCpu(2 * N * sizeof(K)));
36: PetscCall(PetscLogCpuToGpu(N * sizeof(K)));
37: } else {
38: PetscScalar *host_ptr;
40: PetscCall(PetscMalloc1(2 * N, &host_ptr));
41: PetscCallCUDA(cudaMemcpy(host_ptr, b, N * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
42: PetscCallCUDA(cudaMemcpy(host_ptr + N, x, N * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
43: PetscCall(HPDDM::IterativeMethod::solve(*data->op, host_ptr, host_ptr + N, n, comm));
44: PetscCallCUDA(cudaMemcpy(x, host_ptr + N, N * sizeof(PetscScalar), cudaMemcpyHostToDevice));
45: PetscCall(PetscFree(host_ptr));
46: PetscCall(PetscLogGpuToCpu(2 * N * sizeof(PetscScalar)));
47: PetscCall(PetscLogCpuToGpu(N * sizeof(PetscScalar)));
48: }
49: PetscFunctionReturn(PETSC_SUCCESS);
50: }