Actual source code: pbjacobi_cuda.cu

#include <petscdevice_cuda.h>
#include <petsc/private/petsclegacycupmblas.h>
#include <../src/ksp/pc/impls/pbjacobi/pbjacobi.h>

#if PETSC_PKG_CUDA_VERSION_LT(11, 7, 0)
__global__ static void MatMultBatched(PetscInt bs, PetscInt mbs, const PetscScalar *A, const PetscScalar *x, PetscScalar *y, PetscBool transpose)
{
  const PetscInt gridSize = gridDim.x * blockDim.x;
  PetscInt       row      = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscInt bs2      = bs * bs;

  /* One row per thread. The blocks are stored in column-major order */
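  /* For example, with bs = 2 the k-th block occupies A[4k..4k+3] in column-major order,
     so row i of that block starts at A[4k + i] with stride bs = 2 (or, for the transpose,
     at A[4k + 2i] with stride 1) */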
  for (; row < bs * mbs; row += gridSize) {
    const PetscScalar *Ap, *xp;
    PetscScalar       *yp;
    PetscInt           i, j, k;

    k  = row / bs;                               /* k-th block */
    i  = row % bs;                               /* this thread deals with i-th row of the block */
    Ap = &A[bs2 * k + i * (transpose ? bs : 1)]; /* Ap points to the first entry of i-th row */
    xp = &x[bs * k];
    yp = &y[bs * k];
    /* multiply i-th row (column) with x */
    yp[i] = 0.0;
    for (j = 0; j < bs; j++) {
      yp[i] += Ap[0] * xp[j];
      Ap += (transpose ? 1 : bs); /* block is in column major order */
    }
  }
}
#endif

static PetscErrorCode PCApplyOrTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y, cublasOperation_t op)
{
  const PetscScalar *xx;
  PetscScalar       *yy;
  cublasHandle_t     handle;
  PC_PBJacobi       *jac = (PC_PBJacobi *)pc->data;
  const PetscScalar *A   = (const PetscScalar *)jac->spptr;
  const PetscInt     bs = jac->bs, mbs = jac->mbs;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayRead(x, &xx));
  PetscCall(VecCUDAGetArrayWrite(y, &yy));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCallCUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); /* alpha, beta are on host */

#if PETSC_PKG_CUDA_VERSION_GE(11, 7, 0)
  /* y = alpha op(A) x + beta y */
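  /* Each block is a bs x bs column-major matrix stored contiguously (lda = bs, stride = bs*bs);
     x and y are consumed/produced in contiguous chunks of length bs (inc = 1, stride = bs);
     the batch contains mbs such gemvs */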
  const PetscScalar alpha = 1.0, beta = 0.0;
  PetscCallCUBLAS(cublasXgemvStridedBatched(handle, op, bs, bs, &alpha, A, bs, bs * bs, xx, 1, bs, &beta, yy, 1, bs, mbs));
#else
  PetscInt gridSize = PetscMin((bs * mbs + 255) / 256, 2147483647); /* <= 2^31-1 */
  MatMultBatched<<<gridSize, 256>>>(bs, mbs, A, xx, yy, (op == CUBLAS_OP_T ? PETSC_TRUE : PETSC_FALSE));
  PetscCallCUDA(cudaGetLastError());
#endif
  PetscCall(VecCUDARestoreArrayRead(x, &xx));
  PetscCall(VecCUDARestoreArrayWrite(y, &yy));
  PetscCall(PetscLogGpuFlops(bs * bs * mbs * 2));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCApply_PBJacobi_CUDA(PC pc, Vec x, Vec y)
{
  PetscFunctionBegin;
  PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_N)); // No transpose
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCApplyTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y)
{
  PetscFunctionBegin;
  PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_T)); // Transpose
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PCDestroy_PBJacobi_CUDA(PC pc)
{
  PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(jac->spptr));
  PetscCall(PCDestroy_PBJacobi(pc));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PCSetUp_PBJacobi_CUDA(PC pc)
{
  PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
  size_t       size;

  PetscFunctionBegin;
  PetscCall(PCSetUp_PBJacobi_Host(pc)); /* Compute the inverse on the host for now. It might be worth doing it directly on the device */
  size = sizeof(PetscScalar) * jac->bs * jac->bs * jac->mbs;

  /* PBJacobi_CUDA is simple enough that we use jac->spptr directly as the device copy of the diagonal (diag_d) */
  if (!jac->spptr) PetscCallCUDA(cudaMalloc(&jac->spptr, size));
  PetscCallCUDA(cudaMemcpy(jac->spptr, jac->diag, size, cudaMemcpyHostToDevice));
  PetscCall(PetscLogCpuToGpu(size));

  pc->ops->apply          = PCApply_PBJacobi_CUDA;
  pc->ops->applytranspose = PCApplyTranspose_PBJacobi_CUDA;
  pc->ops->destroy        = PCDestroy_PBJacobi_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
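
For context, a minimal usage sketch of how this preconditioner is typically selected from user code is shown below. It is not part of pbjacobi_cuda.cu, and the helper name SolveWithPBJacobiCUDA is purely illustrative; it assumes A has already been assembled with a CUDA matrix type (e.g. MATAIJCUSPARSE) and has its block size set, so that PCSetUp() dispatches to the CUDA code paths above.

#include <petscksp.h>

/* Hypothetical helper: solve A x = b with a KSP preconditioned by point-block Jacobi */
PetscErrorCode SolveWithPBJacobiCUDA(Mat A, Vec b, Vec x)
{
  KSP ksp;
  PC  pc;

  PetscFunctionBeginUser;
  PetscCall(KSPCreate(PetscObjectComm((PetscObject)A), &ksp));
  PetscCall(KSPSetOperators(ksp, A, A));
  PetscCall(KSPGetPC(ksp, &pc));
  PetscCall(PCSetType(pc, PCPBJACOBI)); /* equivalent to the runtime option -pc_type pbjacobi */
  PetscCall(KSPSetFromOptions(ksp));
  PetscCall(KSPSolve(ksp, b, x));
  PetscCall(KSPDestroy(&ksp));
  PetscFunctionReturn(PETSC_SUCCESS);
}

The same selection can be made entirely at runtime with -pc_type pbjacobi together with a CUDA matrix type such as -mat_type aijcusparse.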