Actual source code: matseqdensecupm.hpp

  1: #pragma once

  3: #include <petsc/private/matdensecupmimpl.h>
  4: #include <../src/mat/impls/dense/seq/dense.h>

  6: #include <petsc/private/deviceimpl.h>
  7: #include <petsc/private/randomimpl.h>
  8: #include <petsc/private/vecimpl.h>
  9: #include <petsc/private/cupmobject.hpp>
 10: #include <petsc/private/cupmsolverinterface.hpp>

 12: #include <petsc/private/cpp/type_traits.hpp>
 13: #include <petsc/private/cpp/utility.hpp>

 15: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>

 17: namespace Petsc
 18: {

 20: namespace mat
 21: {

 23: namespace cupm
 24: {

 26: namespace impl
 27: {

 29: template <device::cupm::DeviceType T>
 30: class MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
 31: public:
 32:   MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);

 34: private:
 35:   struct Mat_SeqDenseCUPM {
 36:     PetscScalar *d_v;           // pointer to the matrix on the GPU
 37:     PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
 38:     bool         d_user_alloc;
 39:     bool         d_unplaced_user_alloc;
 40:     // factorization support
 41:     cupmBlasInt_t *d_fact_ipiv;  // device pivots
 42:     cupmScalar_t  *d_fact_tau;   // device QR tau vector
 43:     cupmBlasInt_t *d_fact_info;  // device info
 44:     cupmScalar_t  *d_fact_work;  // device workspace
 45:     cupmBlasInt_t  d_fact_lwork; // size of device workspace
 46:     // workspace
 47:     Vec workvec;
 48:   };

 50:   static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;

 52:   static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
 53:   static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;

 55:   static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;

 57:   template <typename Derived>
 58:   struct SolveCommon;
 59:   struct SolveQR;
 60:   struct SolveCholesky;
 61:   struct SolveLU;

 63:   template <typename Solver, bool transpose>
 64:   static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
 65:   template <typename Solver, bool transpose>
 66:   static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
 67:   template <bool transpose>
 68:   static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;

 70:   template <bool to_host>
 71:   static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;

 73:   PETSC_NODISCARD static constexpr MatType       MATIMPLCUPM_() noexcept;
 74:   PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;

 76: public:
 77:   PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;

 79:   // define these by hand since they don't fit the above mold
 80:   PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
 81:   PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;

 83:   static PetscErrorCode Create(Mat) noexcept;
 84:   static PetscErrorCode Destroy(Mat) noexcept;
 85:   static PetscErrorCode SetUp(Mat) noexcept;
 86:   static PetscErrorCode Reset(Mat) noexcept;

 88:   static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
 89:   static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
 90:   static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;

 92:   template <PetscMemType, PetscMemoryAccessMode>
 93:   static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
 94:   template <PetscMemType, PetscMemoryAccessMode>
 95:   static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
 96:   template <PetscMemoryAccessMode>
 97:   static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
 98:   template <PetscMemoryAccessMode>
 99:   static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;

101: private:
102:   template <PetscMemType mtype, PetscMemoryAccessMode mode>
103:   static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
104:   {
105:     PetscDeviceContext dctx;

107:     PetscFunctionBegin;
108:     PetscCall(GetHandles_(&dctx));
109:     PetscCall(GetArray<mtype, mode>(m, p, dctx));
110:     PetscFunctionReturn(PETSC_SUCCESS);
111:   }

113:   template <PetscMemType mtype, PetscMemoryAccessMode mode>
114:   static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
115:   {
116:     PetscDeviceContext dctx;

118:     PetscFunctionBegin;
119:     PetscCall(GetHandles_(&dctx));
120:     PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
121:     PetscFunctionReturn(PETSC_SUCCESS);
122:   }

124:   template <PetscMemoryAccessMode mode>
125:   static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
126:   {
127:     PetscDeviceContext dctx;

129:     PetscFunctionBegin;
130:     PetscCall(GetHandles_(&dctx));
131:     PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
132:     PetscFunctionReturn(PETSC_SUCCESS);
133:   }

135:   template <PetscMemoryAccessMode mode>
136:   static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
137:   {
138:     PetscDeviceContext dctx;

140:     PetscFunctionBegin;
141:     PetscCall(GetHandles_(&dctx));
142:     PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
143:     PetscFunctionReturn(PETSC_SUCCESS);
144:   }

146: public:
147:   static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
148:   static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
149:   static PetscErrorCode ResetArray(Mat) noexcept;

151:   template <bool transpose_A, bool transpose_B>
152:   static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
153:   static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
154:   static PetscErrorCode ZeroEntries(Mat) noexcept;
155:   static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
156:   static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
157:   static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
158:   static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;

160:   static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
161:   template <PetscMemoryAccessMode>
162:   static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
163:   template <PetscMemoryAccessMode>
164:   static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;

166:   static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
167:   static PetscErrorCode InvertFactors(Mat) noexcept;

169:   static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
170:   static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
171: };

173: } // namespace impl

175: namespace
176: {

178: // Declare this here so that the functions below can make use of it
179: template <device::cupm::DeviceType T>
180: inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
181: {
182:   PetscFunctionBegin;
183:   PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
184:   PetscFunctionReturn(PETSC_SUCCESS);
185: }

187: } // anonymous namespace

189: namespace impl
190: {

192: // ==========================================================================================
193: // MatDense_Seq_CUPM - Private API - Utility
194: // ==========================================================================================

196: template <device::cupm::DeviceType T>
197: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
198: {
199:   const auto   mcu   = MatCUPMCast(m);
200:   const auto   nrows = m->rmap->n;
201:   const auto   ncols = m->cmap->n;
202:   auto        &lda   = MatIMPLCast(m)->lda;
203:   cupmStream_t stream;

205:   PetscFunctionBegin;
206:   PetscCheckTypeName(m, MATSEQDENSECUPM());
208:   PetscCall(checkCupmBlasIntCast(nrows));
209:   PetscCall(checkCupmBlasIntCast(ncols));
210:   PetscCall(GetHandlesFrom_(dctx, &stream));
211:   if (lda <= 0) lda = nrows;
212:   if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
213:   if (user_device_array) {
214:     mcu->d_user_alloc = PETSC_TRUE;
215:     mcu->d_v          = user_device_array;
216:   } else {
217:     PetscInt size;

219:     mcu->d_user_alloc = PETSC_FALSE;
220:     PetscCall(PetscIntMultError(lda, ncols, &size));
221:     PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
222:     PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
223:   }
224:   m->offloadmask = PETSC_OFFLOAD_GPU;
225:   PetscFunctionReturn(PETSC_SUCCESS);
226: }

228: template <device::cupm::DeviceType T>
229: inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
230: {
231:   const auto nrows = m->rmap->n;
232:   const auto ncols = m->cmap->n;
233:   const auto copy  = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;

235:   PetscFunctionBegin;
236:   PetscCheckTypeName(m, MATSEQDENSECUPM());
237:   if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
238:   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
239:   if (copy) {
240:     const auto   mcu = MatCUPMCast(m);
241:     cupmStream_t stream;

243:     // Allocate GPU memory if not present
244:     if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
245:     PetscCall(GetHandlesFrom_(dctx, &stream));
246:     PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
247:     {
248:       const auto mimpl = MatIMPLCast(m);
249:       const auto lda   = mimpl->lda;
250:       const auto src   = mimpl->v;
251:       const auto dest  = mcu->d_v;

253:       if (lda > nrows) {
254:         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
255:       } else {
256:         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
257:       }
258:     }
259:     PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
260:     // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
261:     m->offloadmask = PETSC_OFFLOAD_BOTH;
262:   }
263:   PetscFunctionReturn(PETSC_SUCCESS);
264: }

266: template <device::cupm::DeviceType T>
267: inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
268: {
269:   const auto nrows = m->rmap->n;
270:   const auto ncols = m->cmap->n;
271:   const auto copy  = m->offloadmask == PETSC_OFFLOAD_GPU;

273:   PetscFunctionBegin;
274:   PetscCheckTypeName(m, MATSEQDENSECUPM());
275:   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
276:   if (copy) {
277:     const auto   mimpl = MatIMPLCast(m);
278:     cupmStream_t stream;

280:     // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
281:     if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
282:     PetscCall(GetHandlesFrom_(dctx, &stream));
283:     PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
284:     {
285:       const auto lda  = mimpl->lda;
286:       const auto dest = mimpl->v;
287:       const auto src  = MatCUPMCast(m)->d_v;

289:       if (lda > nrows) {
290:         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
291:       } else {
292:         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
293:       }
294:     }
295:     PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
296:     // order is important, MatSeqDenseSetPreallocation() might set offloadmask
297:     m->offloadmask = PETSC_OFFLOAD_BOTH;
298:   }
299:   PetscFunctionReturn(PETSC_SUCCESS);
300: }

302: template <device::cupm::DeviceType T>
303: inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
304: {
305:   PetscFunctionBegin;
306:   if (PetscDefined(USE_DEBUG)) {
307:     cupmBlasInt_t info = 0;

309:     PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
310:     if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
311:     static_assert(std::is_same<decltype(info), int>::value, "");
312:     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
313:     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
314:   }
315:   PetscFunctionReturn(PETSC_SUCCESS);
316: }

318: // ==========================================================================================
319: // MatDense_Seq_CUPM - Private API - Solver Dispatch
320: // ==========================================================================================

322: // specific solvers called through the dispatch_() family of functions
323: template <device::cupm::DeviceType T>
324: template <typename Derived>
325: struct MatDense_Seq_CUPM<T>::SolveCommon {
326:   using derived_type = Derived;

328:   template <typename F>
329:   static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
330:   {
331:     cupmBlasInt_t lwork;

333:     PetscFunctionBegin;
334:     PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
335:     if (lwork > mcu->d_fact_lwork) {
336:       mcu->d_fact_lwork = lwork;
337:       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
338:       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
339:     }
340:     PetscFunctionReturn(PETSC_SUCCESS);
341:   }

343:   static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
344:   {
345:     const auto mcu = MatCUPMCast(A);

347:     PetscFunctionBegin;
348:     PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
349:     A->factortype             = derived_type::MATFACTORTYPE();
350:     A->ops->solve             = MatSolve_Factored_Dispatch_<derived_type, false>;
351:     A->ops->solvetranspose    = MatSolve_Factored_Dispatch_<derived_type, true>;
352:     A->ops->matsolve          = MatMatSolve_Factored_Dispatch_<derived_type, false>;
353:     A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;

355:     PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
356:     if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
357:     PetscFunctionReturn(PETSC_SUCCESS);
358:   }
359: };

361: template <device::cupm::DeviceType T>
362: struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
363:   using base_type = SolveCommon<SolveLU>;

365:   static constexpr const char   *NAME() noexcept { return "LU"; }
366:   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }

368:   static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
369:   {
370:     const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
371:     const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
372:     cupmStream_t       stream;
373:     cupmSolverHandle_t handle;
374:     PetscDeviceContext dctx;

376:     PetscFunctionBegin;
377:     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
378:     PetscCall(GetHandles_(&dctx, &handle, &stream));
379:     PetscCall(base_type::FactorPrepare(A, stream));
380:     {
381:       const auto mcu = MatCUPMCast(A);
382:       const auto da  = DeviceArrayReadWrite(dctx, A);
383:       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

385:       // clang-format off
386:       PetscCall(
387:         base_type::ResizeFactLwork(
388:           mcu, stream,
389:           [&](cupmBlasInt_t *fact_lwork)
390:           {
391:             return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
392:           }
393:         )
394:       );
395:       // clang-format on
396:       if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));

398:       PetscCall(PetscLogGpuTimeBegin());
399:       PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
400:       PetscCall(PetscLogGpuTimeEnd());
401:       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
402:     }
403:     PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
404:     PetscFunctionReturn(PETSC_SUCCESS);
405:   }

407:   template <bool transpose>
408:   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
409:   {
410:     const auto         mcu       = MatCUPMCast(A);
411:     const auto         fact_info = mcu->d_fact_info;
412:     const auto         fact_ipiv = mcu->d_fact_ipiv;
413:     cupmSolverHandle_t handle;

415:     PetscFunctionBegin;
416:     PetscCall(GetHandlesFrom_(dctx, &handle));
417:     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
418:     PetscCall(PetscLogGpuTimeBegin());
419:     {
420:       constexpr auto op  = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
421:       const auto     da  = DeviceArrayRead(dctx, A);
422:       const auto     lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

424:       // clang-format off
425:       PetscCall(
426:         base_type::ResizeFactLwork(
427:           mcu, stream,
428:           [&](cupmBlasInt_t *lwork)
429:           {
430:             return cupmSolverXgetrs_bufferSize(
431:               handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
432:             );
433:           }
434:         )
435:       );
436:       // clang-format on
437:       PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
438:       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
439:     }
440:     PetscCall(PetscLogGpuTimeEnd());
441:     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
442:     PetscFunctionReturn(PETSC_SUCCESS);
443:   }
444: };

446: template <device::cupm::DeviceType T>
447: struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
448:   using base_type = SolveCommon<SolveCholesky>;

450:   static constexpr const char   *NAME() noexcept { return "Cholesky"; }
451:   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }

453:   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
454:   {
455:     const auto         n = static_cast<cupmBlasInt_t>(A->rmap->n);
456:     PetscDeviceContext dctx;
457:     cupmSolverHandle_t handle;
458:     cupmStream_t       stream;

460:     PetscFunctionBegin;
461:     if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
462:     PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
463:     PetscCall(GetHandles_(&dctx, &handle, &stream));
464:     PetscCall(base_type::FactorPrepare(A, stream));
465:     {
466:       const auto mcu = MatCUPMCast(A);
467:       const auto da  = DeviceArrayReadWrite(dctx, A);
468:       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

470:       // clang-format off
471:       PetscCall(
472:         base_type::ResizeFactLwork(
473:           mcu, stream,
474:           [&](cupmBlasInt_t *fact_lwork)
475:           {
476:             return cupmSolverXpotrf_bufferSize(
477:               handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
478:             );
479:           }
480:         )
481:       );
482:       // clang-format on
483:       PetscCall(PetscLogGpuTimeBegin());
484:       PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
485:       PetscCall(PetscLogGpuTimeEnd());
486:       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
487:     }
488:     PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

490: #if 0
491:     // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
492:     // and *hetr* routines. The code below should work, and it can be activated when *sytrs
493:     // routines will be available
494:     if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
495:     if (!mcu->d_fact_lwork) {
496:       PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
497:       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
498:     }
499:     if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
500:     PetscCall(PetscLogGpuTimeBegin());
501:     PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
502:     PetscCall(PetscLogGpuTimeEnd());
503: #endif
504:     PetscFunctionReturn(PETSC_SUCCESS);
505:   }

507:   template <bool transpose>
508:   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
509:   {
510:     const auto         mcu       = MatCUPMCast(A);
511:     const auto         fact_info = mcu->d_fact_info;
512:     cupmSolverHandle_t handle;

514:     PetscFunctionBegin;
515:     PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
516:     PetscCall(GetHandlesFrom_(dctx, &handle));
517:     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
518:     PetscCall(PetscLogGpuTimeBegin());
519:     {
520:       const auto da  = DeviceArrayRead(dctx, A);
521:       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

523:       // clang-format off
524:       PetscCall(
525:         base_type::ResizeFactLwork(
526:           mcu, stream,
527:           [&](cupmBlasInt_t *lwork)
528:           {
529:             return cupmSolverXpotrs_bufferSize(
530:               handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
531:             );
532:           }
533:         )
534:       );
535:       // clang-format on
536:       PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
537:       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
538:     }
539:     PetscCall(PetscLogGpuTimeEnd());
540:     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
541:     PetscFunctionReturn(PETSC_SUCCESS);
542:   }
543: };

545: template <device::cupm::DeviceType T>
546: struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
547:   using base_type = SolveCommon<SolveQR>;

549:   static constexpr const char   *NAME() noexcept { return "QR"; }
550:   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }

552:   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
553:   {
554:     const auto         m     = static_cast<cupmBlasInt_t>(A->rmap->n);
555:     const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
556:     const auto         min   = std::min(m, n);
557:     const auto         mimpl = MatIMPLCast(A);
558:     cupmStream_t       stream;
559:     cupmSolverHandle_t handle;
560:     PetscDeviceContext dctx;

562:     PetscFunctionBegin;
563:     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
564:     PetscCall(GetHandles_(&dctx, &handle, &stream));
565:     PetscCall(base_type::FactorPrepare(A, stream));
566:     mimpl->rank = min;
567:     {
568:       const auto mcu = MatCUPMCast(A);
569:       const auto da  = DeviceArrayReadWrite(dctx, A);
570:       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

572:       if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
573:       if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
574:       // clang-format off
575:       PetscCall(
576:         base_type::ResizeFactLwork(
577:           mcu, stream,
578:           [&](cupmBlasInt_t *fact_lwork)
579:           {
580:             return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
581:           }
582:         )
583:       );
584:       // clang-format on
585:       PetscCall(PetscLogGpuTimeBegin());
586:       PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
587:       PetscCall(PetscLogGpuTimeEnd());
588:       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
589:     }
590:     PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
591:     PetscFunctionReturn(PETSC_SUCCESS);
592:   }

594:   template <bool transpose>
595:   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
596:   {
597:     const auto         mimpl      = MatIMPLCast(A);
598:     const auto         rank       = static_cast<cupmBlasInt_t>(mimpl->rank);
599:     const auto         mcu        = MatCUPMCast(A);
600:     const auto         fact_info  = mcu->d_fact_info;
601:     const auto         fact_tau   = mcu->d_fact_tau;
602:     const auto         fact_work  = mcu->d_fact_work;
603:     const auto         fact_lwork = mcu->d_fact_lwork;
604:     cupmSolverHandle_t solver_handle;
605:     cupmBlasHandle_t   blas_handle;

607:     PetscFunctionBegin;
608:     PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
609:     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
610:     PetscCall(PetscLogGpuTimeBegin());
611:     {
612:       const auto da  = DeviceArrayRead(dctx, A);
613:       const auto one = cupmScalarCast(1.0);
614:       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

616:       if (transpose) {
617:         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
618:         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
619:         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
620:       } else {
621:         constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;

623:         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
624:         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
625:         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
626:       }
627:     }
628:     PetscCall(PetscLogGpuTimeEnd());
629:     PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
630:     PetscFunctionReturn(PETSC_SUCCESS);
631:   }
632: };

634: template <device::cupm::DeviceType T>
635: template <typename Solver, bool transpose>
636: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
637: {
638:   using namespace vec::cupm;
639:   const auto         pobj_A  = PetscObjectCast(A);
640:   const auto         m       = static_cast<cupmBlasInt_t>(A->rmap->n);
641:   const auto         k       = static_cast<cupmBlasInt_t>(A->cmap->n);
642:   auto              &workvec = MatCUPMCast(A)->workvec;
643:   PetscScalar       *y_array = nullptr;
644:   PetscDeviceContext dctx;
645:   PetscBool          xiscupm, yiscupm, aiscupm;
646:   bool               use_y_array_directly;
647:   cupmStream_t       stream;

649:   PetscFunctionBegin;
650:   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
651:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
652:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
653:   PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
654:   PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
655:   PetscCall(GetHandles_(&dctx, &stream));
656:   use_y_array_directly = yiscupm && (k >= m);
657:   {
658:     const PetscScalar *x_array;
659:     const auto         xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
660:     const auto         copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;

662:     if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
663:     // The logic here is to try to minimize the amount of memory copying:
664:     //
665:     // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
666:     // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
667:     // data in order to copy it into the y array. So the array x will be wherever the data
668:     // already is so that only one memcpy is performed
669:     if (xisdevice) {
670:       PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
671:     } else {
672:       PetscCall(VecGetArrayRead(x, &x_array));
673:     }
674:     PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
675:     PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
676:     if (xisdevice) {
677:       PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
678:     } else {
679:       PetscCall(VecRestoreArrayRead(x, &x_array));
680:     }
681:   }

683:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
684:   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
685:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));

687:   if (use_y_array_directly) {
688:     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
689:   } else {
690:     const auto   copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
691:     PetscScalar *yv;

693:     // The logic here is that the data is not yet in either y's GPU array or its CPU array.
694:     // There is nothing in the interface to say where the user would like it to end up. So we
695:     // choose the GPU, because it is the faster option
696:     if (yiscupm) {
697:       PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
698:     } else {
699:       PetscCall(VecGetArray(y, &yv));
700:     }
701:     PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
702:     if (yiscupm) {
703:       PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
704:     } else {
705:       PetscCall(VecRestoreArray(y, &yv));
706:     }
707:     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
708:   }
709:   PetscFunctionReturn(PETSC_SUCCESS);
710: }

712: template <device::cupm::DeviceType T>
713: template <typename Solver, bool transpose>
714: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
715: {
716:   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
717:   const auto         k = static_cast<cupmBlasInt_t>(A->cmap->n);
718:   cupmBlasInt_t      nrhs, ldb, ldx, ldy;
719:   PetscScalar       *y;
720:   PetscBool          biscupm, xiscupm, aiscupm;
721:   PetscDeviceContext dctx;
722:   cupmStream_t       stream;

724:   PetscFunctionBegin;
725:   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
726:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
727:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
728:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
729:   PetscCall(GetHandles_(&dctx, &stream));
730:   {
731:     PetscInt n;

733:     PetscCall(MatGetSize(B, nullptr, &n));
734:     PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
735:     PetscCall(MatDenseGetLDA(B, &n));
736:     PetscCall(PetscCUPMBlasIntCast(n, &ldb));
737:     PetscCall(MatDenseGetLDA(X, &n));
738:     PetscCall(PetscCUPMBlasIntCast(n, &ldx));
739:   }
740:   {
741:     // The logic here is to try to minimize the amount of memory copying:
742:     //
743:     // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
744:     // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
745:     // get the data in order to copy it into the y array. So the array b will be wherever the
746:     // data already is so that only one memcpy is performed
747:     const auto         bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
748:     const auto         copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
749:     const PetscScalar *b;

751:     if (bisdevice) {
752:       b = DeviceArrayRead(dctx, B);
753:     } else if (biscupm) {
754:       b = HostArrayRead(dctx, B);
755:     } else {
756:       PetscCall(MatDenseGetArrayRead(B, &b));
757:     }

759:     if (ldx < m || !xiscupm) {
760:       // X's array cannot serve as the array (too small or not on device), B's array cannot
761:       // serve as the array (const), so allocate a new array
762:       ldy = m;
763:       PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
764:     } else {
765:       // X's array should serve as the array
766:       ldy = ldx;
767:       y   = DeviceArrayWrite(dctx, X);
768:     }
769:     PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
770:     if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
771:   }

773:   // convert to CUPM twice??????????????????????????????????
774:   // but A should already be CUPM??????????????????????????????????????
775:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
776:   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
777:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));

779:   if (ldx < m || !xiscupm) {
780:     const auto   copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
781:     PetscScalar *x;

783:     // The logic here is that the data is not yet in either X's GPU array or its CPU
784:     // array. There is nothing in the interface to say where the user would like it to end up.
785:     // So we choose the GPU, because it is the faster option
786:     if (xiscupm) {
787:       x = DeviceArrayWrite(dctx, X);
788:     } else {
789:       PetscCall(MatDenseGetArray(X, &x));
790:     }
791:     PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
792:     if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
793:     PetscCallCUPM(cupmFreeAsync(y, stream));
794:   }
795:   PetscFunctionReturn(PETSC_SUCCESS);
796: }

798: template <device::cupm::DeviceType T>
799: template <bool transpose>
800: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
801: {
802:   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
803:   const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
804:   cupmBlasHandle_t   handle;
805:   PetscDeviceContext dctx;

807:   PetscFunctionBegin;
808:   if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add
809:   if (!m || !n) {
810:     // mult only
811:     if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0));
812:     PetscFunctionReturn(PETSC_SUCCESS);
813:   }
814:   PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
815:   PetscCall(GetHandles_(&dctx, &handle));
816:   {
817:     constexpr auto op   = transpose ? CUPMBLAS_OP_T : CUPMBLAS_OP_N;
818:     const auto     one  = cupmScalarCast(1.0);
819:     const auto     zero = cupmScalarCast(0.0);
820:     const auto     da   = DeviceArrayRead(dctx, A);
821:     const auto     dxx  = VecSeq_CUPM::DeviceArrayRead(dctx, xx);
822:     const auto     dzz  = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz);

824:     PetscCall(PetscLogGpuTimeBegin());
825:     PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata(), static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda), dxx.cupmdata(), 1, (yy ? &one : &zero), dzz.cupmdata(), 1));
826:     PetscCall(PetscLogGpuTimeEnd());
827:   }
828:   PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
829:   PetscFunctionReturn(PETSC_SUCCESS);
830: }

832: // ==========================================================================================
833: // MatDense_Seq_CUPM - Private API - Conversion Dispatch
834: // ==========================================================================================

836: template <device::cupm::DeviceType T>
837: template <bool to_host>
838: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
839: {
840:   PetscFunctionBegin;
841:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
842:     // TODO these cases should be optimized
843:     PetscCall(MatConvert_Basic(M, type, reuse, newmat));
844:   } else {
845:     const auto B    = *newmat;
846:     const auto pobj = PetscObjectCast(B);

848:     if (to_host) {
849:       PetscCall(BindToCPU(B, PETSC_TRUE));
850:       PetscCall(Reset(B));
851:     } else {
852:       PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
853:     }

855:     PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
856:     PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
857:     // cvec might be the wrong VecType, destroy and rebuild it if necessary
858:     // REVIEW ME: this is possibly very inefficient
859:     PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));

861:     MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
862:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
863:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
864:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
865:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
866:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
867:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
868:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
869:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
870:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
871:     MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
872:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);

874:     if (to_host) {
875:       B->offloadmask = PETSC_OFFLOAD_CPU;
876:     } else {
877:       Mat_SeqDenseCUPM *mcu;

879:       PetscCall(PetscNew(&mcu));
880:       B->spptr       = mcu;
881:       B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
882:       PetscCall(BindToCPU(B, PETSC_FALSE));
883:     }

885:     MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
886:     MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
887:   }
888:   PetscFunctionReturn(PETSC_SUCCESS);
889: }

891: // ==========================================================================================
892: // MatDense_Seq_CUPM - Public API
893: // ==========================================================================================

895: template <device::cupm::DeviceType T>
896: inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
897: {
898:   return MATSEQDENSECUPM();
899: }

901: template <device::cupm::DeviceType T>
902: inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
903: {
904:   return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
905: }

907: template <device::cupm::DeviceType T>
908: inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
909: {
910:   return static_cast<Mat_SeqDense *>(m->data);
911: }

913: template <device::cupm::DeviceType T>
914: inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
915: {
916:   return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
917: }

919: template <device::cupm::DeviceType T>
920: inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
921: {
922:   return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
923: }

925: // ==========================================================================================

927: // MatCreate_SeqDenseCUPM()
928: template <device::cupm::DeviceType T>
929: inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
930: {
931:   PetscFunctionBegin;
932:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
933:   PetscCall(MatCreate_SeqDense(A));
934:   PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
935:   PetscFunctionReturn(PETSC_SUCCESS);
936: }

938: template <device::cupm::DeviceType T>
939: inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
940: {
941:   PetscFunctionBegin;
942:   // prevent copying back data if we own the data pointer
943:   if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
944:   PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
945:   PetscCall(MatDestroy_SeqDense(A));
946:   PetscFunctionReturn(PETSC_SUCCESS);
947: }

949: // obj->ops->setup()
950: template <device::cupm::DeviceType T>
951: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
952: {
953:   PetscFunctionBegin;
954:   PetscCall(PetscLayoutSetUp(A->rmap));
955:   PetscCall(PetscLayoutSetUp(A->cmap));
956:   if (!A->preallocated) {
957:     PetscDeviceContext dctx;

959:     PetscCall(GetHandles_(&dctx));
960:     PetscCall(SetPreallocation(A, dctx, nullptr));
961:   }
962:   PetscFunctionReturn(PETSC_SUCCESS);
963: }

965: template <device::cupm::DeviceType T>
966: inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
967: {
968:   PetscFunctionBegin;
969:   if (const auto mcu = MatCUPMCast(A)) {
970:     cupmStream_t stream;

972:     PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
973:     PetscCall(GetHandles_(&stream));
974:     if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
975:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
976:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
977:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
978:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
979:     PetscCall(VecDestroy(&mcu->workvec));
980:     PetscCall(PetscFree(A->spptr /* mcu */));
981:   }
982:   PetscFunctionReturn(PETSC_SUCCESS);
983: }

985: // ==========================================================================================

987: template <device::cupm::DeviceType T>
988: inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
989: {
990:   const auto mimpl = MatIMPLCast(A);
991:   const auto pobj  = PetscObjectCast(A);

993:   PetscFunctionBegin;
994:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
995:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
996:   A->boundtocpu = to_host;
997:   PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
998:   if (to_host) {
999:     PetscDeviceContext dctx;

1001:     // make sure we have an up-to-date copy on the CPU
1002:     PetscCall(GetHandles_(&dctx));
1003:     PetscCall(DeviceToHost_(A, dctx));
1004:   } else {
1005:     PetscBool iscupm;

1007:     if (auto &cvec = mimpl->cvec) {
1008:       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
1009:       if (!iscupm) PetscCall(VecDestroy(&cvec));
1010:     }
1011:     if (auto &cmat = mimpl->cmat) {
1012:       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
1013:       if (!iscupm) PetscCall(MatDestroy(&cmat));
1014:     }
1015:   }

1017:   // ============================================================
1018:   // Composed ops
1019:   // ============================================================
1020:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1021:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
1022:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1023:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1024:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1025:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1026:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1027:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1028:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1029:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1030:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1031:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
1032:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
1033:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1034:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1035:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
1036:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
1037:   MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
1038:   // always the same
1039:   PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));

1041:   // ============================================================
1042:   // Function pointer ops
1043:   // ============================================================
1044:   MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
1045:   MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false>(A, xx, nullptr, yy); });
1046:   MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true>(A, xx, nullptr, yy); });
1047:   MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false>);
1048:   MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true>);
1049:   MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
1050:   MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
1051:   MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
1052:   MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
1053:   MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
1054:   MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
1055:   MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
1056:   MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
1057:   MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
1058:   MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
1059:   MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
1060:   MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
1061:   MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
1062:   MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
1063:   // seemingly always the same
1064:   A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;

1066:   if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
1067:   PetscFunctionReturn(PETSC_SUCCESS);
1068: }

1070: template <device::cupm::DeviceType T>
1071: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1072: {
1073:   PetscFunctionBegin;
1074:   PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
1075:   PetscFunctionReturn(PETSC_SUCCESS);
1076: }

1078: template <device::cupm::DeviceType T>
1079: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1080: {
1081:   PetscFunctionBegin;
1082:   PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
1083:   PetscFunctionReturn(PETSC_SUCCESS);
1084: }

1086: // ==========================================================================================

1088: template <device::cupm::DeviceType T>
1089: template <PetscMemType mtype, PetscMemoryAccessMode access>
1090: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1091: {
1092:   constexpr auto hostmem     = PetscMemTypeHost(mtype);
1093:   constexpr auto read_access = PetscMemoryAccessRead(access);

1095:   PetscFunctionBegin;
1096:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1097:   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
1098:   if (hostmem) {
1099:     if (read_access) {
1100:       PetscCall(DeviceToHost_(m, dctx));
1101:     } else if (!MatIMPLCast(m)->v) {
1102:       // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
1103:       PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
1104:     }
1105:     *array = MatIMPLCast(m)->v;
1106:   } else {
1107:     if (read_access) {
1108:       PetscCall(HostToDevice_(m, dctx));
1109:     } else if (!MatCUPMCast(m)->d_v) {
1110:       // write-only
1111:       PetscCall(SetPreallocation(m, dctx, nullptr));
1112:     }
1113:     *array = MatCUPMCast(m)->d_v;
1114:   }
1115:   if (PetscMemoryAccessWrite(access)) {
1116:     m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1117:     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1118:   }
1119:   PetscFunctionReturn(PETSC_SUCCESS);
1120: }

1122: template <device::cupm::DeviceType T>
1123: template <PetscMemType mtype, PetscMemoryAccessMode access>
1124: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1125: {
1126:   PetscFunctionBegin;
1127:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1128:   if (PetscMemoryAccessWrite(access)) {
1129:     // WRITE or READ_WRITE
1130:     m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1131:     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1132:   }
1133:   if (array) {
1134:     PetscCall(CheckPointerMatchesMemType_(*array, mtype));
1135:     *array = nullptr;
1136:   }
1137:   PetscFunctionReturn(PETSC_SUCCESS);
1138: }

1140: template <device::cupm::DeviceType T>
1141: template <PetscMemoryAccessMode access>
1142: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
1143: {
1144:   PetscFunctionBegin;
1145:   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1146:   if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
1147:   PetscFunctionReturn(PETSC_SUCCESS);
1148: }

1150: template <device::cupm::DeviceType T>
1151: template <PetscMemoryAccessMode access>
1152: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1153: {
1154:   PetscFunctionBegin;
1155:   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1156:   PetscFunctionReturn(PETSC_SUCCESS);
1157: }

1159: // ==========================================================================================

1161: template <device::cupm::DeviceType T>
1162: inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
1163: {
1164:   const auto mimpl = MatIMPLCast(A);
1165:   const auto mcu   = MatCUPMCast(A);

1167:   PetscFunctionBegin;
1168:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1169:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1170:   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1171:   if (mimpl->v) {
1172:     PetscDeviceContext dctx;

1174:     PetscCall(GetHandles_(&dctx));
1175:     PetscCall(HostToDevice_(A, dctx));
1176:   }
1177:   mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
1178:   mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
1179:   PetscFunctionReturn(PETSC_SUCCESS);
1180: }

1182: template <device::cupm::DeviceType T>
1183: inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
1184: {
1185:   const auto mimpl = MatIMPLCast(A);
1186:   const auto mcu   = MatCUPMCast(A);

1188:   PetscFunctionBegin;
1189:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1190:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1191:   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1192:   if (!mcu->d_user_alloc) {
1193:     cupmStream_t stream;

1195:     PetscCall(GetHandles_(&stream));
1196:     PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1197:   }
1198:   mcu->d_v          = const_cast<PetscScalar *>(array);
1199:   mcu->d_user_alloc = PETSC_FALSE;
1200:   PetscFunctionReturn(PETSC_SUCCESS);
1201: }

1203: template <device::cupm::DeviceType T>
1204: inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
1205: {
1206:   const auto mimpl = MatIMPLCast(A);
1207:   const auto mcu   = MatCUPMCast(A);

1209:   PetscFunctionBegin;
1210:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1211:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1212:   if (mimpl->v) {
1213:     PetscDeviceContext dctx;

1215:     PetscCall(GetHandles_(&dctx));
1216:     PetscCall(HostToDevice_(A, dctx));
1217:   }
1218:   mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
1219:   mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
1220:   PetscFunctionReturn(PETSC_SUCCESS);
1221: }

1223: // ==========================================================================================

1225: template <device::cupm::DeviceType T>
1226: template <bool transpose_A, bool transpose_B>
1227: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
1228: {
1229:   cupmBlasInt_t      m, n, k;
1230:   PetscBool          Aiscupm, Biscupm;
1231:   PetscDeviceContext dctx;
1232:   cupmBlasHandle_t   handle;

1234:   PetscFunctionBegin;
1235:   PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
1236:   PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
1237:   PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
1238:   if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);

1240:   // we may end up with SEQDENSE as one of the arguments
1241:   // REVIEW ME: how? and why is it not B and C????????
1242:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
1243:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
1244:   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
1245:   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
1246:   PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
1247:   PetscCall(GetHandles_(&dctx, &handle));

1249:   PetscCall(PetscLogGpuTimeBegin());
1250:   {
1251:     const auto one  = cupmScalarCast(1.0);
1252:     const auto zero = cupmScalarCast(0.0);
1253:     const auto da   = DeviceArrayRead(dctx, A);
1254:     const auto db   = DeviceArrayRead(dctx, B);
1255:     const auto dc   = DeviceArrayWrite(dctx, C);
1256:     PetscInt   alda, blda, clda;

1258:     PetscCall(MatDenseGetLDA(A, &alda));
1259:     PetscCall(MatDenseGetLDA(B, &blda));
1260:     PetscCall(MatDenseGetLDA(C, &clda));
1261:     PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
1262:   }
1263:   PetscCall(PetscLogGpuTimeEnd());

1265:   PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
1266:   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1267:   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1268:   PetscFunctionReturn(PETSC_SUCCESS);
1269: }

1271: template <device::cupm::DeviceType T>
1272: inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
1273: {
1274:   const auto m = A->rmap->n;
1275:   const auto n = A->cmap->n;

1277:   PetscFunctionBegin;
1278:   PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
1279:   // The two matrices must have the same copy implementation to be eligible for fast copy
1280:   if (A->ops->copy == B->ops->copy) {
1281:     PetscDeviceContext dctx;
1282:     cupmStream_t       stream;

1284:     PetscCall(GetHandles_(&dctx, &stream));
1285:     PetscCall(PetscLogGpuTimeBegin());
1286:     {
1287:       const auto va = DeviceArrayRead(dctx, A);
1288:       const auto vb = DeviceArrayWrite(dctx, B);
1289:       // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
1290:       // lda!
1291:       const auto lda_a = MatIMPLCast(A)->lda;
1292:       const auto lda_b = MatIMPLCast(B)->lda;

1294:       if (lda_a > m || lda_b > m) {
1295:         PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
1296:         PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
1297:         PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
1298:       } else {
1299:         PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
1300:       }
1301:     }
1302:     PetscCall(PetscLogGpuTimeEnd());
1303:   } else {
1304:     PetscCall(MatCopy_Basic(A, B, str));
1305:   }
1306:   PetscFunctionReturn(PETSC_SUCCESS);
1307: }

1309: template <device::cupm::DeviceType T>
1310: inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1311: {
1312:   PetscDeviceContext dctx;
1313:   cupmStream_t       stream;

1315:   PetscFunctionBegin;
1316:   PetscCall(GetHandles_(&dctx, &stream));
1317:   PetscCall(PetscLogGpuTimeBegin());
1318:   {
1319:     const auto va  = DeviceArrayWrite(dctx, m);
1320:     const auto lda = MatIMPLCast(m)->lda;
1321:     const auto ma  = m->rmap->n;
1322:     const auto na  = m->cmap->n;

1324:     if (lda > ma) {
1325:       PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1326:     } else {
1327:       PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1328:     }
1329:   }
1330:   PetscCall(PetscLogGpuTimeEnd());
1331:   PetscFunctionReturn(PETSC_SUCCESS);
1332: }

1334: namespace detail
1335: {

1337: // ==========================================================================================
1338: // SubMatIndexFunctor
1339: //
1340: // Iterator which permutes a linear index range into matrix indices for am nrows x ncols
1341: // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1342: // the i'th sequential entry in the matrix.
1343: // ==========================================================================================
1344: template <typename T>
1345: struct SubMatIndexFunctor {
1346:   PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }

1348:   PetscInt nrows;
1349:   PetscInt ncols;
1350:   PetscInt lda;
1351: };

1353: template <typename Iterator>
1354: struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> {
1355:   using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>;

1357:   using iterator = typename base_type::iterator;

1359:   constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
1360:     base_type{
1361:       std::move(first), std::move(last), {nrows, ncols, lda}
1362:   }
1363:   {
1364:   }

1366:   PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
1367: };

1369: namespace
1370: {

1372: template <typename T>
1373: PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1374: {
1375:   const auto nrows = rend - rstart;
1376:   const auto ncols = cend - cstart;
1377:   const auto dptr  = thrust::device_pointer_cast(ptr);

1379:   return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1380: }

1382: } // namespace

1384: } // namespace detail

1386: template <device::cupm::DeviceType T>
1387: inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
1388: {
1389:   const auto         m = A->rmap->n;
1390:   const auto         n = A->cmap->n;
1391:   const auto         N = m * n;
1392:   PetscDeviceContext dctx;

1394:   PetscFunctionBegin;
1395:   PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1396:   PetscCall(GetHandles_(&dctx));
1397:   {
1398:     const auto da  = DeviceArrayReadWrite(dctx, A);
1399:     const auto lda = MatIMPLCast(A)->lda;

1401:     if (lda > m) {
1402:       cupmStream_t stream;

1404:       PetscCall(GetHandlesFrom_(dctx, &stream));
1405:       // clang-format off
1406:       PetscCallThrust(
1407:         const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());

1409:         THRUST_CALL(
1410:           thrust::transform,
1411:           stream,
1412:           sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1413:           device::cupm::functors::make_times_equals(alpha)
1414:         )
1415:       );
1416:       // clang-format on
1417:     } else {
1418:       const auto       cu_alpha = cupmScalarCast(alpha);
1419:       cupmBlasHandle_t handle;

1421:       PetscCall(GetHandlesFrom_(dctx, &handle));
1422:       PetscCall(PetscLogGpuTimeBegin());
1423:       PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
1424:       PetscCall(PetscLogGpuTimeEnd());
1425:     }
1426:   }
1427:   PetscCall(PetscLogGpuFlops(N));
1428:   PetscFunctionReturn(PETSC_SUCCESS);
1429: }

1431: template <device::cupm::DeviceType T>
1432: inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1433: {
1434:   const auto         m_x = X->rmap->n, m_y = Y->rmap->n;
1435:   const auto         n_x = X->cmap->n, n_y = Y->cmap->n;
1436:   const auto         N = m_x * n_x;
1437:   PetscDeviceContext dctx;

1439:   PetscFunctionBegin;
1440:   if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
1441:   PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1442:   PetscCall(GetHandles_(&dctx));
1443:   {
1444:     const auto dx    = DeviceArrayRead(dctx, X);
1445:     const auto dy    = DeviceArrayReadWrite(dctx, Y);
1446:     const auto lda_x = MatIMPLCast(X)->lda;
1447:     const auto lda_y = MatIMPLCast(Y)->lda;

1449:     if (lda_x > m_x || lda_y > m_x) {
1450:       cupmStream_t stream;

1452:       PetscCall(GetHandlesFrom_(dctx, &stream));
1453:       // clang-format off
1454:       PetscCallThrust(
1455:         const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
1456:         const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());

1458:         THRUST_CALL(
1459:           thrust::transform,
1460:           stream,
1461:           sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
1462:           device::cupm::functors::make_axpy(alpha)
1463:         );
1464:       );
1465:       // clang-format on
1466:     } else {
1467:       const auto       cu_alpha = cupmScalarCast(alpha);
1468:       cupmBlasHandle_t handle;

1470:       PetscCall(GetHandlesFrom_(dctx, &handle));
1471:       PetscCall(PetscLogGpuTimeBegin());
1472:       PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
1473:       PetscCall(PetscLogGpuTimeEnd());
1474:     }
1475:   }
1476:   PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1477:   PetscFunctionReturn(PETSC_SUCCESS);
1478: }

1480: template <device::cupm::DeviceType T>
1481: inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1482: {
1483:   const auto         hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1484:   PetscDeviceContext dctx;

1486:   PetscFunctionBegin;
1487:   PetscCall(GetHandles_(&dctx));
1488:   // do not call SetPreallocation() yet, we call it afterwards??
1489:   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1490:   PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1491:   if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1492:   // allocate memory if needed
1493:   if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
1494:   PetscFunctionReturn(PETSC_SUCCESS);
1495: }

1497: template <device::cupm::DeviceType T>
1498: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1499: {
1500:   PetscBool device;

1502:   PetscFunctionBegin;
1503:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1504:   if (device) {
1505:     const auto         m = A->rmap->n;
1506:     const auto         n = A->cmap->n;
1507:     PetscDeviceContext dctx;

1509:     PetscCall(GetHandles_(&dctx));
1510:     {
1511:       const auto a = DeviceArrayWrite(dctx, A);
1512:       PetscInt   lda;

1514:       PetscCall(MatDenseGetLDA(A, &lda));
1515:       if (lda > m) {
1516:         for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1517:       } else {
1518:         PetscInt mn;

1520:         PetscCall(PetscIntMultError(m, n, &mn));
1521:         PetscCall(PetscRandomGetValues(rng, mn, a));
1522:       }
1523:     }
1524:   } else {
1525:     PetscCall(MatSetRandom_SeqDense(A, rng));
1526:   }
1527:   PetscFunctionReturn(PETSC_SUCCESS);
1528: }

1530: // ==========================================================================================

1532: template <device::cupm::DeviceType T>
1533: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
1534: {
1535:   const auto         offloadmask = A->offloadmask;
1536:   const auto         n           = A->rmap->n;
1537:   const auto         col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
1538:   PetscBool          viscupm;
1539:   PetscDeviceContext dctx;
1540:   cupmStream_t       stream;

1542:   PetscFunctionBegin;
1543:   PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
1544:   PetscCall(GetHandles_(&dctx, &stream));
1545:   if (viscupm && !v->boundtocpu) {
1546:     const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);

1548:     // update device data
1549:     if (PetscOffloadDevice(offloadmask)) {
1550:       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
1551:     } else {
1552:       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
1553:     }
1554:   } else {
1555:     PetscScalar *x;

1557:     // update host data
1558:     PetscCall(VecGetArrayWrite(v, &x));
1559:     if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
1560:       PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
1561:     } else if (PetscOffloadDevice(offloadmask)) {
1562:       PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
1563:     }
1564:     PetscCall(VecRestoreArrayWrite(v, &x));
1565:   }
1566:   PetscFunctionReturn(PETSC_SUCCESS);
1567: }

1569: template <device::cupm::DeviceType T>
1570: template <PetscMemoryAccessMode access>
1571: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
1572: {
1573:   using namespace vec::cupm;
1574:   const auto         mimpl = MatIMPLCast(A);
1575:   PetscDeviceContext dctx;

1577:   PetscFunctionBegin;
1578:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1579:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1580:   mimpl->vecinuse = col + 1;
1581:   if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
1582:   PetscCall(GetHandles_(&dctx));
1583:   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1584:   PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
1585:   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
1586:   *v = mimpl->cvec;
1587:   PetscFunctionReturn(PETSC_SUCCESS);
1588: }

1590: template <device::cupm::DeviceType T>
1591: template <PetscMemoryAccessMode access>
1592: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
1593: {
1594:   using namespace vec::cupm;
1595:   const auto         mimpl = MatIMPLCast(A);
1596:   const auto         cvec  = mimpl->cvec;
1597:   PetscDeviceContext dctx;

1599:   PetscFunctionBegin;
1600:   PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1601:   PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1602:   mimpl->vecinuse = 0;
1603:   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
1604:   PetscCall(VecCUPMResetArrayAsync<T>(cvec));
1605:   PetscCall(GetHandles_(&dctx));
1606:   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1607:   if (v) *v = nullptr;
1608:   PetscFunctionReturn(PETSC_SUCCESS);
1609: }

1611: // ==========================================================================================

1613: template <device::cupm::DeviceType T>
1614: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1615: {
1616:   Mat                fact = nullptr;
1617:   PetscDeviceContext dctx;

1619:   PetscFunctionBegin;
1620:   PetscCall(GetHandles_(&dctx));
1621:   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1622:   fact->factortype = ftype;
1623:   switch (ftype) {
1624:   case MAT_FACTOR_LU:
1625:   case MAT_FACTOR_ILU: // fall-through
1626:     fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
1627:     fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1628:     break;
1629:   case MAT_FACTOR_CHOLESKY:
1630:   case MAT_FACTOR_ICC: // fall-through
1631:     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1632:     break;
1633:   case MAT_FACTOR_QR: {
1634:     const auto pobj = PetscObjectCast(fact);

1636:     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1637:     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1638:   } break;
1639:   case MAT_FACTOR_NONE:
1640:   case MAT_FACTOR_ILUDT:     // fall-through
1641:   case MAT_FACTOR_NUM_TYPES: // fall-through
1642:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1643:   }
1644:   PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1645:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1646:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1647:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1648:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1649:   *fact_out = fact;
1650:   PetscFunctionReturn(PETSC_SUCCESS);
1651: }

1653: template <device::cupm::DeviceType T>
1654: inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
1655: {
1656:   const auto         mimpl = MatIMPLCast(A);
1657:   const auto         mcu   = MatCUPMCast(A);
1658:   const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
1659:   cupmSolverHandle_t handle;
1660:   PetscDeviceContext dctx;
1661:   cupmStream_t       stream;

1663:   PetscFunctionBegin;
1664: #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
1665:   // HIP appears to have this by default??
1666:   PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
1667: #endif
1668:   if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1669:   PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
1670:   // spd
1671:   PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());

1673:   PetscCall(GetHandles_(&dctx, &handle, &stream));
1674:   {
1675:     const auto    da  = DeviceArrayReadWrite(dctx, A);
1676:     const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
1677:     cupmBlasInt_t il;

1679:     PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
1680:     if (il > mcu->d_fact_lwork) {
1681:       mcu->d_fact_lwork = il;
1682:       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1683:       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
1684:     }
1685:     PetscCall(PetscLogGpuTimeBegin());
1686:     PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
1687:     PetscCall(PetscLogGpuTimeEnd());
1688:   }
1689:   PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
1690:   // TODO (write cuda kernel)
1691:   PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
1692:   PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

1694:   A->ops->solve          = nullptr;
1695:   A->ops->solvetranspose = nullptr;
1696:   A->ops->matsolve       = nullptr;
1697:   A->factortype          = MAT_FACTOR_NONE;

1699:   PetscCall(PetscFree(A->solvertype));
1700:   PetscFunctionReturn(PETSC_SUCCESS);
1701: }

1703: // ==========================================================================================

1705: template <device::cupm::DeviceType T>
1706: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
1707: {
1708:   const auto         mimpl        = MatIMPLCast(A);
1709:   const auto         array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
1710:   const auto         n            = rend - rbegin;
1711:   const auto         m            = cend - cbegin;
1712:   auto              &cmat         = mimpl->cmat;
1713:   PetscDeviceContext dctx;

1715:   PetscFunctionBegin;
1716:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1717:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1718:   mimpl->matinuse = cbegin + 1;

1720:   PetscCall(GetHandles_(&dctx));
1721:   PetscCall(HostToDevice_(A, dctx));

1723:   if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
1724:   {
1725:     const auto device_array = array_offset(MatCUPMCast(A)->d_v);

1727:     if (cmat) {
1728:       PetscCall(PlaceArray(cmat, device_array));
1729:     } else {
1730:       PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
1731:     }
1732:   }
1733:   PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
1734:   // place CPU array if present but do not copy any data
1735:   if (const auto host_array = mimpl->v) {
1736:     cmat->offloadmask = PETSC_OFFLOAD_GPU;
1737:     PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
1738:   }

1740:   cmat->offloadmask = A->offloadmask;
1741:   *mat              = cmat;
1742:   PetscFunctionReturn(PETSC_SUCCESS);
1743: }

1745: template <device::cupm::DeviceType T>
1746: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
1747: {
1748:   const auto mimpl = MatIMPLCast(A);
1749:   const auto cmat  = mimpl->cmat;
1750:   const auto reset = static_cast<bool>(mimpl->v);
1751:   bool       copy, was_offload_host;

1753:   PetscFunctionBegin;
1754:   PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1755:   PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1756:   PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1757:   mimpl->matinuse = 0;

1759:   // calls to ResetArray may change it, so save it here
1760:   was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
1761:   if (was_offload_host && !reset) {
1762:     copy = true;
1763:     PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
1764:   } else {
1765:     copy = false;
1766:   }

1768:   PetscCall(ResetArray(cmat));
1769:   if (reset) PetscCall(MatDenseResetArray(cmat));
1770:   if (copy) {
1771:     PetscDeviceContext dctx;

1773:     PetscCall(GetHandles_(&dctx));
1774:     PetscCall(DeviceToHost_(A, dctx));
1775:   } else {
1776:     A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1777:   }

1779:   cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1780:   *m                = nullptr;
1781:   PetscFunctionReturn(PETSC_SUCCESS);
1782: }

1784: // ==========================================================================================

1786: namespace
1787: {

1789: template <device::cupm::DeviceType T>
1790: inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1791: {
1792:   PetscFunctionBegin;
1793:   if (TA) {
1794:     if (TB) {
1795:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1796:     } else {
1797:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1798:     }
1799:   } else {
1800:     if (TB) {
1801:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1802:     } else {
1803:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1804:     }
1805:   }
1806:   PetscFunctionReturn(PETSC_SUCCESS);
1807: }

1809: template <device::cupm::DeviceType T>
1810: inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1811: {
1812:   PetscFunctionBegin;
1813:   for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1814:     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1815:     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1816:   }
1817:   PetscFunctionReturn(PETSC_SUCCESS);
1818: }

1820: } // anonymous namespace

1822: } // namespace impl

1824: } // namespace cupm

1826: } // namespace mat

1828: } // namespace Petsc