Actual source code: matseqdensecupm.hpp
1: #pragma once
3: #include <petsc/private/matdensecupmimpl.h>
4: #include <../src/mat/impls/dense/seq/dense.h>
6: #include <petsc/private/deviceimpl.h>
7: #include <petsc/private/randomimpl.h>
8: #include <petsc/private/vecimpl.h>
9: #include <petsc/private/cupmobject.hpp>
10: #include <petsc/private/cupmsolverinterface.hpp>
12: #include <petsc/private/cpp/type_traits.hpp>
13: #include <petsc/private/cpp/utility.hpp>
15: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>
17: namespace Petsc
18: {
20: namespace mat
21: {
23: namespace cupm
24: {
26: namespace impl
27: {
29: template <device::cupm::DeviceType T>
30: class MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
31: public:
32: MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);
34: private:
35: struct Mat_SeqDenseCUPM {
36: PetscScalar *d_v; // pointer to the matrix on the GPU
37: PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
38: bool d_user_alloc;
39: bool d_unplaced_user_alloc;
40: // factorization support
41: cupmBlasInt_t *d_fact_ipiv; // device pivots
42: cupmScalar_t *d_fact_tau; // device QR tau vector
43: cupmBlasInt_t *d_fact_info; // device info
44: cupmScalar_t *d_fact_work; // device workspace
45: cupmBlasInt_t d_fact_lwork; // size of device workspace
46: // workspace
47: Vec workvec;
48: };
50: static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;
52: static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
53: static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;
55: static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;
57: template <typename Derived>
58: struct SolveCommon;
59: struct SolveQR;
60: struct SolveCholesky;
61: struct SolveLU;
63: template <typename Solver, bool transpose>
64: static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
65: template <typename Solver, bool transpose>
66: static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
67: template <bool transpose>
68: static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;
70: template <bool to_host>
71: static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;
73: PETSC_NODISCARD static constexpr MatType MATIMPLCUPM_() noexcept;
74: PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;
76: public:
77: PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;
79: // define these by hand since they don't fit the above mold
80: PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
81: PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;
83: static PetscErrorCode Create(Mat) noexcept;
84: static PetscErrorCode Destroy(Mat) noexcept;
85: static PetscErrorCode SetUp(Mat) noexcept;
86: static PetscErrorCode Reset(Mat) noexcept;
88: static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
89: static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
90: static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;
92: template <PetscMemType, PetscMemoryAccessMode>
93: static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
94: template <PetscMemType, PetscMemoryAccessMode>
95: static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
96: template <PetscMemoryAccessMode>
97: static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
98: template <PetscMemoryAccessMode>
99: static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;
101: private:
102: template <PetscMemType mtype, PetscMemoryAccessMode mode>
103: static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
104: {
105: PetscDeviceContext dctx;
107: PetscFunctionBegin;
108: PetscCall(GetHandles_(&dctx));
109: PetscCall(GetArray<mtype, mode>(m, p, dctx));
110: PetscFunctionReturn(PETSC_SUCCESS);
111: }
113: template <PetscMemType mtype, PetscMemoryAccessMode mode>
114: static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
115: {
116: PetscDeviceContext dctx;
118: PetscFunctionBegin;
119: PetscCall(GetHandles_(&dctx));
120: PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
121: PetscFunctionReturn(PETSC_SUCCESS);
122: }
124: template <PetscMemoryAccessMode mode>
125: static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
126: {
127: PetscDeviceContext dctx;
129: PetscFunctionBegin;
130: PetscCall(GetHandles_(&dctx));
131: PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
132: PetscFunctionReturn(PETSC_SUCCESS);
133: }
135: template <PetscMemoryAccessMode mode>
136: static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
137: {
138: PetscDeviceContext dctx;
140: PetscFunctionBegin;
141: PetscCall(GetHandles_(&dctx));
142: PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
143: PetscFunctionReturn(PETSC_SUCCESS);
144: }
146: public:
147: static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
148: static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
149: static PetscErrorCode ResetArray(Mat) noexcept;
151: template <bool transpose_A, bool transpose_B>
152: static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
153: static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
154: static PetscErrorCode ZeroEntries(Mat) noexcept;
155: static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
156: static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
157: static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
158: static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;
160: static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
161: template <PetscMemoryAccessMode>
162: static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
163: template <PetscMemoryAccessMode>
164: static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;
166: static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
167: static PetscErrorCode InvertFactors(Mat) noexcept;
169: static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
170: static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
171: };
173: } // namespace impl
175: namespace
176: {
178: // Declare this here so that the functions below can make use of it
179: template <device::cupm::DeviceType T>
180: inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
181: {
182: PetscFunctionBegin;
183: PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
184: PetscFunctionReturn(PETSC_SUCCESS);
185: }
187: } // anonymous namespace
189: namespace impl
190: {
192: // ==========================================================================================
193: // MatDense_Seq_CUPM - Private API - Utility
194: // ==========================================================================================
196: template <device::cupm::DeviceType T>
197: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
198: {
199: const auto mcu = MatCUPMCast(m);
200: const auto nrows = m->rmap->n;
201: const auto ncols = m->cmap->n;
202: auto &lda = MatIMPLCast(m)->lda;
203: cupmStream_t stream;
205: PetscFunctionBegin;
206: PetscCheckTypeName(m, MATSEQDENSECUPM());
208: PetscCall(checkCupmBlasIntCast(nrows));
209: PetscCall(checkCupmBlasIntCast(ncols));
210: PetscCall(GetHandlesFrom_(dctx, &stream));
211: if (lda <= 0) lda = nrows;
212: if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
213: if (user_device_array) {
214: mcu->d_user_alloc = PETSC_TRUE;
215: mcu->d_v = user_device_array;
216: } else {
217: PetscInt size;
219: mcu->d_user_alloc = PETSC_FALSE;
220: PetscCall(PetscIntMultError(lda, ncols, &size));
221: PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
222: PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
223: }
224: m->offloadmask = PETSC_OFFLOAD_GPU;
225: PetscFunctionReturn(PETSC_SUCCESS);
226: }
228: template <device::cupm::DeviceType T>
229: inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
230: {
231: const auto nrows = m->rmap->n;
232: const auto ncols = m->cmap->n;
233: const auto copy = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;
235: PetscFunctionBegin;
236: PetscCheckTypeName(m, MATSEQDENSECUPM());
237: if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
238: PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
239: if (copy) {
240: const auto mcu = MatCUPMCast(m);
241: cupmStream_t stream;
243: // Allocate GPU memory if not present
244: if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
245: PetscCall(GetHandlesFrom_(dctx, &stream));
246: PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
247: {
248: const auto mimpl = MatIMPLCast(m);
249: const auto lda = mimpl->lda;
250: const auto src = mimpl->v;
251: const auto dest = mcu->d_v;
253: if (lda > nrows) {
254: PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
255: } else {
256: PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
257: }
258: }
259: PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
260: // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
261: m->offloadmask = PETSC_OFFLOAD_BOTH;
262: }
263: PetscFunctionReturn(PETSC_SUCCESS);
264: }
266: template <device::cupm::DeviceType T>
267: inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
268: {
269: const auto nrows = m->rmap->n;
270: const auto ncols = m->cmap->n;
271: const auto copy = m->offloadmask == PETSC_OFFLOAD_GPU;
273: PetscFunctionBegin;
274: PetscCheckTypeName(m, MATSEQDENSECUPM());
275: PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
276: if (copy) {
277: const auto mimpl = MatIMPLCast(m);
278: cupmStream_t stream;
280: // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
281: if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
282: PetscCall(GetHandlesFrom_(dctx, &stream));
283: PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
284: {
285: const auto lda = mimpl->lda;
286: const auto dest = mimpl->v;
287: const auto src = MatCUPMCast(m)->d_v;
289: if (lda > nrows) {
290: PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
291: } else {
292: PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
293: }
294: }
295: PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
296: // order is important, MatSeqDenseSetPreallocation() might set offloadmask
297: m->offloadmask = PETSC_OFFLOAD_BOTH;
298: }
299: PetscFunctionReturn(PETSC_SUCCESS);
300: }
302: template <device::cupm::DeviceType T>
303: inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
304: {
305: PetscFunctionBegin;
306: if (PetscDefined(USE_DEBUG)) {
307: cupmBlasInt_t info = 0;
309: PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
310: if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
311: static_assert(std::is_same<decltype(info), int>::value, "");
312: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
313: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
314: }
315: PetscFunctionReturn(PETSC_SUCCESS);
316: }
318: // ==========================================================================================
319: // MatDense_Seq_CUPM - Private API - Solver Dispatch
320: // ==========================================================================================
322: // specific solvers called through the dispatch_() family of functions
323: template <device::cupm::DeviceType T>
324: template <typename Derived>
325: struct MatDense_Seq_CUPM<T>::SolveCommon {
326: using derived_type = Derived;
328: template <typename F>
329: static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
330: {
331: cupmBlasInt_t lwork;
333: PetscFunctionBegin;
334: PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
335: if (lwork > mcu->d_fact_lwork) {
336: mcu->d_fact_lwork = lwork;
337: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
338: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
339: }
340: PetscFunctionReturn(PETSC_SUCCESS);
341: }
343: static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
344: {
345: const auto mcu = MatCUPMCast(A);
347: PetscFunctionBegin;
348: PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
349: A->factortype = derived_type::MATFACTORTYPE();
350: A->ops->solve = MatSolve_Factored_Dispatch_<derived_type, false>;
351: A->ops->solvetranspose = MatSolve_Factored_Dispatch_<derived_type, true>;
352: A->ops->matsolve = MatMatSolve_Factored_Dispatch_<derived_type, false>;
353: A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;
355: PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
356: if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
357: PetscFunctionReturn(PETSC_SUCCESS);
358: }
359: };
361: template <device::cupm::DeviceType T>
362: struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
363: using base_type = SolveCommon<SolveLU>;
365: static constexpr const char *NAME() noexcept { return "LU"; }
366: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }
368: static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
369: {
370: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
371: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
372: cupmStream_t stream;
373: cupmSolverHandle_t handle;
374: PetscDeviceContext dctx;
376: PetscFunctionBegin;
377: if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
378: PetscCall(GetHandles_(&dctx, &handle, &stream));
379: PetscCall(base_type::FactorPrepare(A, stream));
380: {
381: const auto mcu = MatCUPMCast(A);
382: const auto da = DeviceArrayReadWrite(dctx, A);
383: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
385: // clang-format off
386: PetscCall(
387: base_type::ResizeFactLwork(
388: mcu, stream,
389: [&](cupmBlasInt_t *fact_lwork)
390: {
391: return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
392: }
393: )
394: );
395: // clang-format on
396: if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
398: PetscCall(PetscLogGpuTimeBegin());
399: PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
400: PetscCall(PetscLogGpuTimeEnd());
401: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
402: }
403: PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
404: PetscFunctionReturn(PETSC_SUCCESS);
405: }
407: template <bool transpose>
408: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
409: {
410: const auto mcu = MatCUPMCast(A);
411: const auto fact_info = mcu->d_fact_info;
412: const auto fact_ipiv = mcu->d_fact_ipiv;
413: cupmSolverHandle_t handle;
415: PetscFunctionBegin;
416: PetscCall(GetHandlesFrom_(dctx, &handle));
417: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
418: PetscCall(PetscLogGpuTimeBegin());
419: {
420: constexpr auto op = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
421: const auto da = DeviceArrayRead(dctx, A);
422: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
424: // clang-format off
425: PetscCall(
426: base_type::ResizeFactLwork(
427: mcu, stream,
428: [&](cupmBlasInt_t *lwork)
429: {
430: return cupmSolverXgetrs_bufferSize(
431: handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
432: );
433: }
434: )
435: );
436: // clang-format on
437: PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
438: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
439: }
440: PetscCall(PetscLogGpuTimeEnd());
441: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
442: PetscFunctionReturn(PETSC_SUCCESS);
443: }
444: };
446: template <device::cupm::DeviceType T>
447: struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
448: using base_type = SolveCommon<SolveCholesky>;
450: static constexpr const char *NAME() noexcept { return "Cholesky"; }
451: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }
453: static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
454: {
455: const auto n = static_cast<cupmBlasInt_t>(A->rmap->n);
456: PetscDeviceContext dctx;
457: cupmSolverHandle_t handle;
458: cupmStream_t stream;
460: PetscFunctionBegin;
461: if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
462: PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
463: PetscCall(GetHandles_(&dctx, &handle, &stream));
464: PetscCall(base_type::FactorPrepare(A, stream));
465: {
466: const auto mcu = MatCUPMCast(A);
467: const auto da = DeviceArrayReadWrite(dctx, A);
468: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
470: // clang-format off
471: PetscCall(
472: base_type::ResizeFactLwork(
473: mcu, stream,
474: [&](cupmBlasInt_t *fact_lwork)
475: {
476: return cupmSolverXpotrf_bufferSize(
477: handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
478: );
479: }
480: )
481: );
482: // clang-format on
483: PetscCall(PetscLogGpuTimeBegin());
484: PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
485: PetscCall(PetscLogGpuTimeEnd());
486: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
487: }
488: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
490: #if 0
491: // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
492: // and *hetr* routines. The code below should work, and it can be activated when *sytrs
493: // routines will be available
494: if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
495: if (!mcu->d_fact_lwork) {
496: PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
497: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
498: }
499: if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
500: PetscCall(PetscLogGpuTimeBegin());
501: PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
502: PetscCall(PetscLogGpuTimeEnd());
503: #endif
504: PetscFunctionReturn(PETSC_SUCCESS);
505: }
507: template <bool transpose>
508: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
509: {
510: const auto mcu = MatCUPMCast(A);
511: const auto fact_info = mcu->d_fact_info;
512: cupmSolverHandle_t handle;
514: PetscFunctionBegin;
515: PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
516: PetscCall(GetHandlesFrom_(dctx, &handle));
517: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
518: PetscCall(PetscLogGpuTimeBegin());
519: {
520: const auto da = DeviceArrayRead(dctx, A);
521: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
523: // clang-format off
524: PetscCall(
525: base_type::ResizeFactLwork(
526: mcu, stream,
527: [&](cupmBlasInt_t *lwork)
528: {
529: return cupmSolverXpotrs_bufferSize(
530: handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
531: );
532: }
533: )
534: );
535: // clang-format on
536: PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
537: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
538: }
539: PetscCall(PetscLogGpuTimeEnd());
540: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
541: PetscFunctionReturn(PETSC_SUCCESS);
542: }
543: };
545: template <device::cupm::DeviceType T>
546: struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
547: using base_type = SolveCommon<SolveQR>;
549: static constexpr const char *NAME() noexcept { return "QR"; }
550: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }
552: static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
553: {
554: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
555: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
556: const auto min = std::min(m, n);
557: const auto mimpl = MatIMPLCast(A);
558: cupmStream_t stream;
559: cupmSolverHandle_t handle;
560: PetscDeviceContext dctx;
562: PetscFunctionBegin;
563: if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
564: PetscCall(GetHandles_(&dctx, &handle, &stream));
565: PetscCall(base_type::FactorPrepare(A, stream));
566: mimpl->rank = min;
567: {
568: const auto mcu = MatCUPMCast(A);
569: const auto da = DeviceArrayReadWrite(dctx, A);
570: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
572: if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
573: if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
574: // clang-format off
575: PetscCall(
576: base_type::ResizeFactLwork(
577: mcu, stream,
578: [&](cupmBlasInt_t *fact_lwork)
579: {
580: return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
581: }
582: )
583: );
584: // clang-format on
585: PetscCall(PetscLogGpuTimeBegin());
586: PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
587: PetscCall(PetscLogGpuTimeEnd());
588: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
589: }
590: PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
591: PetscFunctionReturn(PETSC_SUCCESS);
592: }
594: template <bool transpose>
595: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
596: {
597: const auto mimpl = MatIMPLCast(A);
598: const auto rank = static_cast<cupmBlasInt_t>(mimpl->rank);
599: const auto mcu = MatCUPMCast(A);
600: const auto fact_info = mcu->d_fact_info;
601: const auto fact_tau = mcu->d_fact_tau;
602: const auto fact_work = mcu->d_fact_work;
603: const auto fact_lwork = mcu->d_fact_lwork;
604: cupmSolverHandle_t solver_handle;
605: cupmBlasHandle_t blas_handle;
607: PetscFunctionBegin;
608: PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
609: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
610: PetscCall(PetscLogGpuTimeBegin());
611: {
612: const auto da = DeviceArrayRead(dctx, A);
613: const auto one = cupmScalarCast(1.0);
614: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
616: if (transpose) {
617: PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
618: PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
619: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
620: } else {
621: constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;
623: PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
624: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
625: PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
626: }
627: }
628: PetscCall(PetscLogGpuTimeEnd());
629: PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
630: PetscFunctionReturn(PETSC_SUCCESS);
631: }
632: };
634: template <device::cupm::DeviceType T>
635: template <typename Solver, bool transpose>
636: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
637: {
638: using namespace vec::cupm;
639: const auto pobj_A = PetscObjectCast(A);
640: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
641: const auto k = static_cast<cupmBlasInt_t>(A->cmap->n);
642: auto &workvec = MatCUPMCast(A)->workvec;
643: PetscScalar *y_array = nullptr;
644: PetscDeviceContext dctx;
645: PetscBool xiscupm, yiscupm, aiscupm;
646: bool use_y_array_directly;
647: cupmStream_t stream;
649: PetscFunctionBegin;
650: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
651: PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
652: PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
653: PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
654: PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
655: PetscCall(GetHandles_(&dctx, &stream));
656: use_y_array_directly = yiscupm && (k >= m);
657: {
658: const PetscScalar *x_array;
659: const auto xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
660: const auto copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
662: if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
663: // The logic here is to try to minimize the amount of memory copying:
664: //
665: // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
666: // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
667: // data in order to copy it into the y array. So the array x will be wherever the data
668: // already is so that only one memcpy is performed
669: if (xisdevice) {
670: PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
671: } else {
672: PetscCall(VecGetArrayRead(x, &x_array));
673: }
674: PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
675: PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
676: if (xisdevice) {
677: PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
678: } else {
679: PetscCall(VecRestoreArrayRead(x, &x_array));
680: }
681: }
683: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
684: PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
685: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
687: if (use_y_array_directly) {
688: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
689: } else {
690: const auto copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
691: PetscScalar *yv;
693: // The logic here is that the data is not yet in either y's GPU array or its CPU array.
694: // There is nothing in the interface to say where the user would like it to end up. So we
695: // choose the GPU, because it is the faster option
696: if (yiscupm) {
697: PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
698: } else {
699: PetscCall(VecGetArray(y, &yv));
700: }
701: PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
702: if (yiscupm) {
703: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
704: } else {
705: PetscCall(VecRestoreArray(y, &yv));
706: }
707: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
708: }
709: PetscFunctionReturn(PETSC_SUCCESS);
710: }
712: template <device::cupm::DeviceType T>
713: template <typename Solver, bool transpose>
714: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
715: {
716: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
717: const auto k = static_cast<cupmBlasInt_t>(A->cmap->n);
718: cupmBlasInt_t nrhs, ldb, ldx, ldy;
719: PetscScalar *y;
720: PetscBool biscupm, xiscupm, aiscupm;
721: PetscDeviceContext dctx;
722: cupmStream_t stream;
724: PetscFunctionBegin;
725: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
726: PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
727: PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
728: PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
729: PetscCall(GetHandles_(&dctx, &stream));
730: {
731: PetscInt n;
733: PetscCall(MatGetSize(B, nullptr, &n));
734: PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
735: PetscCall(MatDenseGetLDA(B, &n));
736: PetscCall(PetscCUPMBlasIntCast(n, &ldb));
737: PetscCall(MatDenseGetLDA(X, &n));
738: PetscCall(PetscCUPMBlasIntCast(n, &ldx));
739: }
740: {
741: // The logic here is to try to minimize the amount of memory copying:
742: //
743: // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
744: // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
745: // get the data in order to copy it into the y array. So the array b will be wherever the
746: // data already is so that only one memcpy is performed
747: const auto bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
748: const auto copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
749: const PetscScalar *b;
751: if (bisdevice) {
752: b = DeviceArrayRead(dctx, B);
753: } else if (biscupm) {
754: b = HostArrayRead(dctx, B);
755: } else {
756: PetscCall(MatDenseGetArrayRead(B, &b));
757: }
759: if (ldx < m || !xiscupm) {
760: // X's array cannot serve as the array (too small or not on device), B's array cannot
761: // serve as the array (const), so allocate a new array
762: ldy = m;
763: PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
764: } else {
765: // X's array should serve as the array
766: ldy = ldx;
767: y = DeviceArrayWrite(dctx, X);
768: }
769: PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
770: if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
771: }
773: // convert to CUPM twice??????????????????????????????????
774: // but A should already be CUPM??????????????????????????????????????
775: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
776: PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
777: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
779: if (ldx < m || !xiscupm) {
780: const auto copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
781: PetscScalar *x;
783: // The logic here is that the data is not yet in either X's GPU array or its CPU
784: // array. There is nothing in the interface to say where the user would like it to end up.
785: // So we choose the GPU, because it is the faster option
786: if (xiscupm) {
787: x = DeviceArrayWrite(dctx, X);
788: } else {
789: PetscCall(MatDenseGetArray(X, &x));
790: }
791: PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
792: if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
793: PetscCallCUPM(cupmFreeAsync(y, stream));
794: }
795: PetscFunctionReturn(PETSC_SUCCESS);
796: }
798: template <device::cupm::DeviceType T>
799: template <bool transpose>
800: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
801: {
802: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
803: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
804: cupmBlasHandle_t handle;
805: PetscDeviceContext dctx;
807: PetscFunctionBegin;
808: if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add
809: if (!m || !n) {
810: // mult only
811: if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0));
812: PetscFunctionReturn(PETSC_SUCCESS);
813: }
814: PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
815: PetscCall(GetHandles_(&dctx, &handle));
816: {
817: constexpr auto op = transpose ? CUPMBLAS_OP_T : CUPMBLAS_OP_N;
818: const auto one = cupmScalarCast(1.0);
819: const auto zero = cupmScalarCast(0.0);
820: const auto da = DeviceArrayRead(dctx, A);
821: const auto dxx = VecSeq_CUPM::DeviceArrayRead(dctx, xx);
822: const auto dzz = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz);
824: PetscCall(PetscLogGpuTimeBegin());
825: PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata(), static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda), dxx.cupmdata(), 1, (yy ? &one : &zero), dzz.cupmdata(), 1));
826: PetscCall(PetscLogGpuTimeEnd());
827: }
828: PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
829: PetscFunctionReturn(PETSC_SUCCESS);
830: }
832: // ==========================================================================================
833: // MatDense_Seq_CUPM - Private API - Conversion Dispatch
834: // ==========================================================================================
836: template <device::cupm::DeviceType T>
837: template <bool to_host>
838: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
839: {
840: PetscFunctionBegin;
841: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
842: // TODO these cases should be optimized
843: PetscCall(MatConvert_Basic(M, type, reuse, newmat));
844: } else {
845: const auto B = *newmat;
846: const auto pobj = PetscObjectCast(B);
848: if (to_host) {
849: PetscCall(BindToCPU(B, PETSC_TRUE));
850: PetscCall(Reset(B));
851: } else {
852: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
853: }
855: PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
856: PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
857: // cvec might be the wrong VecType, destroy and rebuild it if necessary
858: // REVIEW ME: this is possibly very inefficient
859: PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));
861: MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
862: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
863: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
864: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
865: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
866: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
867: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
868: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
869: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
870: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
871: MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
872: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);
874: if (to_host) {
875: B->offloadmask = PETSC_OFFLOAD_CPU;
876: } else {
877: Mat_SeqDenseCUPM *mcu;
879: PetscCall(PetscNew(&mcu));
880: B->spptr = mcu;
881: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
882: PetscCall(BindToCPU(B, PETSC_FALSE));
883: }
885: MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
886: MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
887: }
888: PetscFunctionReturn(PETSC_SUCCESS);
889: }
891: // ==========================================================================================
892: // MatDense_Seq_CUPM - Public API
893: // ==========================================================================================
895: template <device::cupm::DeviceType T>
896: inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
897: {
898: return MATSEQDENSECUPM();
899: }
901: template <device::cupm::DeviceType T>
902: inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
903: {
904: return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
905: }
907: template <device::cupm::DeviceType T>
908: inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
909: {
910: return static_cast<Mat_SeqDense *>(m->data);
911: }
913: template <device::cupm::DeviceType T>
914: inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
915: {
916: return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
917: }
919: template <device::cupm::DeviceType T>
920: inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
921: {
922: return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
923: }
925: // ==========================================================================================
927: // MatCreate_SeqDenseCUPM()
928: template <device::cupm::DeviceType T>
929: inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
930: {
931: PetscFunctionBegin;
932: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
933: PetscCall(MatCreate_SeqDense(A));
934: PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
935: PetscFunctionReturn(PETSC_SUCCESS);
936: }
938: template <device::cupm::DeviceType T>
939: inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
940: {
941: PetscFunctionBegin;
942: // prevent copying back data if we own the data pointer
943: if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
944: PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
945: PetscCall(MatDestroy_SeqDense(A));
946: PetscFunctionReturn(PETSC_SUCCESS);
947: }
949: // obj->ops->setup()
950: template <device::cupm::DeviceType T>
951: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
952: {
953: PetscFunctionBegin;
954: PetscCall(PetscLayoutSetUp(A->rmap));
955: PetscCall(PetscLayoutSetUp(A->cmap));
956: if (!A->preallocated) {
957: PetscDeviceContext dctx;
959: PetscCall(GetHandles_(&dctx));
960: PetscCall(SetPreallocation(A, dctx, nullptr));
961: }
962: PetscFunctionReturn(PETSC_SUCCESS);
963: }
965: template <device::cupm::DeviceType T>
966: inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
967: {
968: PetscFunctionBegin;
969: if (const auto mcu = MatCUPMCast(A)) {
970: cupmStream_t stream;
972: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
973: PetscCall(GetHandles_(&stream));
974: if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
975: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
976: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
977: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
978: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
979: PetscCall(VecDestroy(&mcu->workvec));
980: PetscCall(PetscFree(A->spptr /* mcu */));
981: }
982: PetscFunctionReturn(PETSC_SUCCESS);
983: }
985: // ==========================================================================================
987: template <device::cupm::DeviceType T>
988: inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
989: {
990: const auto mimpl = MatIMPLCast(A);
991: const auto pobj = PetscObjectCast(A);
993: PetscFunctionBegin;
994: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
995: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
996: A->boundtocpu = to_host;
997: PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
998: if (to_host) {
999: PetscDeviceContext dctx;
1001: // make sure we have an up-to-date copy on the CPU
1002: PetscCall(GetHandles_(&dctx));
1003: PetscCall(DeviceToHost_(A, dctx));
1004: } else {
1005: PetscBool iscupm;
1007: if (auto &cvec = mimpl->cvec) {
1008: PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
1009: if (!iscupm) PetscCall(VecDestroy(&cvec));
1010: }
1011: if (auto &cmat = mimpl->cmat) {
1012: PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
1013: if (!iscupm) PetscCall(MatDestroy(&cmat));
1014: }
1015: }
1017: // ============================================================
1018: // Composed ops
1019: // ============================================================
1020: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1021: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
1022: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1023: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1024: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1025: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1026: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1027: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1028: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1029: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1030: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1031: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
1032: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
1033: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1034: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1035: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
1036: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
1037: MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
1038: // always the same
1039: PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1041: // ============================================================
1042: // Function pointer ops
1043: // ============================================================
1044: MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
1045: MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false>(A, xx, nullptr, yy); });
1046: MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true>(A, xx, nullptr, yy); });
1047: MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false>);
1048: MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true>);
1049: MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
1050: MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
1051: MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
1052: MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
1053: MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
1054: MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
1055: MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
1056: MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
1057: MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
1058: MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
1059: MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
1060: MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
1061: MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
1062: MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
1063: // seemingly always the same
1064: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1066: if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
1067: PetscFunctionReturn(PETSC_SUCCESS);
1068: }
1070: template <device::cupm::DeviceType T>
1071: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1072: {
1073: PetscFunctionBegin;
1074: PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
1075: PetscFunctionReturn(PETSC_SUCCESS);
1076: }
1078: template <device::cupm::DeviceType T>
1079: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1080: {
1081: PetscFunctionBegin;
1082: PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
1083: PetscFunctionReturn(PETSC_SUCCESS);
1084: }
1086: // ==========================================================================================
1088: template <device::cupm::DeviceType T>
1089: template <PetscMemType mtype, PetscMemoryAccessMode access>
1090: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1091: {
1092: constexpr auto hostmem = PetscMemTypeHost(mtype);
1093: constexpr auto read_access = PetscMemoryAccessRead(access);
1095: PetscFunctionBegin;
1096: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1097: PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
1098: if (hostmem) {
1099: if (read_access) {
1100: PetscCall(DeviceToHost_(m, dctx));
1101: } else if (!MatIMPLCast(m)->v) {
1102: // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
1103: PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
1104: }
1105: *array = MatIMPLCast(m)->v;
1106: } else {
1107: if (read_access) {
1108: PetscCall(HostToDevice_(m, dctx));
1109: } else if (!MatCUPMCast(m)->d_v) {
1110: // write-only
1111: PetscCall(SetPreallocation(m, dctx, nullptr));
1112: }
1113: *array = MatCUPMCast(m)->d_v;
1114: }
1115: if (PetscMemoryAccessWrite(access)) {
1116: m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1117: PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1118: }
1119: PetscFunctionReturn(PETSC_SUCCESS);
1120: }
1122: template <device::cupm::DeviceType T>
1123: template <PetscMemType mtype, PetscMemoryAccessMode access>
1124: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1125: {
1126: PetscFunctionBegin;
1127: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1128: if (PetscMemoryAccessWrite(access)) {
1129: // WRITE or READ_WRITE
1130: m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1131: PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1132: }
1133: if (array) {
1134: PetscCall(CheckPointerMatchesMemType_(*array, mtype));
1135: *array = nullptr;
1136: }
1137: PetscFunctionReturn(PETSC_SUCCESS);
1138: }
1140: template <device::cupm::DeviceType T>
1141: template <PetscMemoryAccessMode access>
1142: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
1143: {
1144: PetscFunctionBegin;
1145: PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1146: if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
1147: PetscFunctionReturn(PETSC_SUCCESS);
1148: }
1150: template <device::cupm::DeviceType T>
1151: template <PetscMemoryAccessMode access>
1152: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1153: {
1154: PetscFunctionBegin;
1155: PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1156: PetscFunctionReturn(PETSC_SUCCESS);
1157: }
1159: // ==========================================================================================
1161: template <device::cupm::DeviceType T>
1162: inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
1163: {
1164: const auto mimpl = MatIMPLCast(A);
1165: const auto mcu = MatCUPMCast(A);
1167: PetscFunctionBegin;
1168: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1169: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1170: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1171: if (mimpl->v) {
1172: PetscDeviceContext dctx;
1174: PetscCall(GetHandles_(&dctx));
1175: PetscCall(HostToDevice_(A, dctx));
1176: }
1177: mcu->unplacedarray = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
1178: mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
1179: PetscFunctionReturn(PETSC_SUCCESS);
1180: }
1182: template <device::cupm::DeviceType T>
1183: inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
1184: {
1185: const auto mimpl = MatIMPLCast(A);
1186: const auto mcu = MatCUPMCast(A);
1188: PetscFunctionBegin;
1189: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1190: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1191: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1192: if (!mcu->d_user_alloc) {
1193: cupmStream_t stream;
1195: PetscCall(GetHandles_(&stream));
1196: PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1197: }
1198: mcu->d_v = const_cast<PetscScalar *>(array);
1199: mcu->d_user_alloc = PETSC_FALSE;
1200: PetscFunctionReturn(PETSC_SUCCESS);
1201: }
1203: template <device::cupm::DeviceType T>
1204: inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
1205: {
1206: const auto mimpl = MatIMPLCast(A);
1207: const auto mcu = MatCUPMCast(A);
1209: PetscFunctionBegin;
1210: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1211: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1212: if (mimpl->v) {
1213: PetscDeviceContext dctx;
1215: PetscCall(GetHandles_(&dctx));
1216: PetscCall(HostToDevice_(A, dctx));
1217: }
1218: mcu->d_v = util::exchange(mcu->unplacedarray, nullptr);
1219: mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
1220: PetscFunctionReturn(PETSC_SUCCESS);
1221: }
1223: // ==========================================================================================
1225: template <device::cupm::DeviceType T>
1226: template <bool transpose_A, bool transpose_B>
1227: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
1228: {
1229: cupmBlasInt_t m, n, k;
1230: PetscBool Aiscupm, Biscupm;
1231: PetscDeviceContext dctx;
1232: cupmBlasHandle_t handle;
1234: PetscFunctionBegin;
1235: PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
1236: PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
1237: PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
1238: if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
1240: // we may end up with SEQDENSE as one of the arguments
1241: // REVIEW ME: how? and why is it not B and C????????
1242: PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
1243: PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
1244: if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
1245: if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
1246: PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
1247: PetscCall(GetHandles_(&dctx, &handle));
1249: PetscCall(PetscLogGpuTimeBegin());
1250: {
1251: const auto one = cupmScalarCast(1.0);
1252: const auto zero = cupmScalarCast(0.0);
1253: const auto da = DeviceArrayRead(dctx, A);
1254: const auto db = DeviceArrayRead(dctx, B);
1255: const auto dc = DeviceArrayWrite(dctx, C);
1256: PetscInt alda, blda, clda;
1258: PetscCall(MatDenseGetLDA(A, &alda));
1259: PetscCall(MatDenseGetLDA(B, &blda));
1260: PetscCall(MatDenseGetLDA(C, &clda));
1261: PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
1262: }
1263: PetscCall(PetscLogGpuTimeEnd());
1265: PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
1266: if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1267: if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1268: PetscFunctionReturn(PETSC_SUCCESS);
1269: }
1271: template <device::cupm::DeviceType T>
1272: inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
1273: {
1274: const auto m = A->rmap->n;
1275: const auto n = A->cmap->n;
1277: PetscFunctionBegin;
1278: PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
1279: // The two matrices must have the same copy implementation to be eligible for fast copy
1280: if (A->ops->copy == B->ops->copy) {
1281: PetscDeviceContext dctx;
1282: cupmStream_t stream;
1284: PetscCall(GetHandles_(&dctx, &stream));
1285: PetscCall(PetscLogGpuTimeBegin());
1286: {
1287: const auto va = DeviceArrayRead(dctx, A);
1288: const auto vb = DeviceArrayWrite(dctx, B);
1289: // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
1290: // lda!
1291: const auto lda_a = MatIMPLCast(A)->lda;
1292: const auto lda_b = MatIMPLCast(B)->lda;
1294: if (lda_a > m || lda_b > m) {
1295: PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
1296: PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
1297: PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
1298: } else {
1299: PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
1300: }
1301: }
1302: PetscCall(PetscLogGpuTimeEnd());
1303: } else {
1304: PetscCall(MatCopy_Basic(A, B, str));
1305: }
1306: PetscFunctionReturn(PETSC_SUCCESS);
1307: }
1309: template <device::cupm::DeviceType T>
1310: inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1311: {
1312: PetscDeviceContext dctx;
1313: cupmStream_t stream;
1315: PetscFunctionBegin;
1316: PetscCall(GetHandles_(&dctx, &stream));
1317: PetscCall(PetscLogGpuTimeBegin());
1318: {
1319: const auto va = DeviceArrayWrite(dctx, m);
1320: const auto lda = MatIMPLCast(m)->lda;
1321: const auto ma = m->rmap->n;
1322: const auto na = m->cmap->n;
1324: if (lda > ma) {
1325: PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1326: } else {
1327: PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1328: }
1329: }
1330: PetscCall(PetscLogGpuTimeEnd());
1331: PetscFunctionReturn(PETSC_SUCCESS);
1332: }
1334: namespace detail
1335: {
1337: // ==========================================================================================
1338: // SubMatIndexFunctor
1339: //
1340: // Iterator which permutes a linear index range into matrix indices for am nrows x ncols
1341: // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1342: // the i'th sequential entry in the matrix.
1343: // ==========================================================================================
1344: template <typename T>
1345: struct SubMatIndexFunctor {
1346: PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }
1348: PetscInt nrows;
1349: PetscInt ncols;
1350: PetscInt lda;
1351: };
1353: template <typename Iterator>
1354: struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> {
1355: using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>;
1357: using iterator = typename base_type::iterator;
1359: constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
1360: base_type{
1361: std::move(first), std::move(last), {nrows, ncols, lda}
1362: }
1363: {
1364: }
1366: PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
1367: };
1369: namespace
1370: {
1372: template <typename T>
1373: PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1374: {
1375: const auto nrows = rend - rstart;
1376: const auto ncols = cend - cstart;
1377: const auto dptr = thrust::device_pointer_cast(ptr);
1379: return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1380: }
1382: } // namespace
1384: } // namespace detail
1386: template <device::cupm::DeviceType T>
1387: inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
1388: {
1389: const auto m = A->rmap->n;
1390: const auto n = A->cmap->n;
1391: const auto N = m * n;
1392: PetscDeviceContext dctx;
1394: PetscFunctionBegin;
1395: PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1396: PetscCall(GetHandles_(&dctx));
1397: {
1398: const auto da = DeviceArrayReadWrite(dctx, A);
1399: const auto lda = MatIMPLCast(A)->lda;
1401: if (lda > m) {
1402: cupmStream_t stream;
1404: PetscCall(GetHandlesFrom_(dctx, &stream));
1405: // clang-format off
1406: PetscCallThrust(
1407: const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());
1409: THRUST_CALL(
1410: thrust::transform,
1411: stream,
1412: sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1413: device::cupm::functors::make_times_equals(alpha)
1414: )
1415: );
1416: // clang-format on
1417: } else {
1418: const auto cu_alpha = cupmScalarCast(alpha);
1419: cupmBlasHandle_t handle;
1421: PetscCall(GetHandlesFrom_(dctx, &handle));
1422: PetscCall(PetscLogGpuTimeBegin());
1423: PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
1424: PetscCall(PetscLogGpuTimeEnd());
1425: }
1426: }
1427: PetscCall(PetscLogGpuFlops(N));
1428: PetscFunctionReturn(PETSC_SUCCESS);
1429: }
1431: template <device::cupm::DeviceType T>
1432: inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1433: {
1434: const auto m_x = X->rmap->n, m_y = Y->rmap->n;
1435: const auto n_x = X->cmap->n, n_y = Y->cmap->n;
1436: const auto N = m_x * n_x;
1437: PetscDeviceContext dctx;
1439: PetscFunctionBegin;
1440: if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
1441: PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1442: PetscCall(GetHandles_(&dctx));
1443: {
1444: const auto dx = DeviceArrayRead(dctx, X);
1445: const auto dy = DeviceArrayReadWrite(dctx, Y);
1446: const auto lda_x = MatIMPLCast(X)->lda;
1447: const auto lda_y = MatIMPLCast(Y)->lda;
1449: if (lda_x > m_x || lda_y > m_x) {
1450: cupmStream_t stream;
1452: PetscCall(GetHandlesFrom_(dctx, &stream));
1453: // clang-format off
1454: PetscCallThrust(
1455: const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
1456: const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());
1458: THRUST_CALL(
1459: thrust::transform,
1460: stream,
1461: sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
1462: device::cupm::functors::make_axpy(alpha)
1463: );
1464: );
1465: // clang-format on
1466: } else {
1467: const auto cu_alpha = cupmScalarCast(alpha);
1468: cupmBlasHandle_t handle;
1470: PetscCall(GetHandlesFrom_(dctx, &handle));
1471: PetscCall(PetscLogGpuTimeBegin());
1472: PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
1473: PetscCall(PetscLogGpuTimeEnd());
1474: }
1475: }
1476: PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1477: PetscFunctionReturn(PETSC_SUCCESS);
1478: }
1480: template <device::cupm::DeviceType T>
1481: inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1482: {
1483: const auto hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1484: PetscDeviceContext dctx;
1486: PetscFunctionBegin;
1487: PetscCall(GetHandles_(&dctx));
1488: // do not call SetPreallocation() yet, we call it afterwards??
1489: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1490: PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1491: if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1492: // allocate memory if needed
1493: if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
1494: PetscFunctionReturn(PETSC_SUCCESS);
1495: }
1497: template <device::cupm::DeviceType T>
1498: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1499: {
1500: PetscBool device;
1502: PetscFunctionBegin;
1503: PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1504: if (device) {
1505: const auto m = A->rmap->n;
1506: const auto n = A->cmap->n;
1507: PetscDeviceContext dctx;
1509: PetscCall(GetHandles_(&dctx));
1510: {
1511: const auto a = DeviceArrayWrite(dctx, A);
1512: PetscInt lda;
1514: PetscCall(MatDenseGetLDA(A, &lda));
1515: if (lda > m) {
1516: for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1517: } else {
1518: PetscInt mn;
1520: PetscCall(PetscIntMultError(m, n, &mn));
1521: PetscCall(PetscRandomGetValues(rng, mn, a));
1522: }
1523: }
1524: } else {
1525: PetscCall(MatSetRandom_SeqDense(A, rng));
1526: }
1527: PetscFunctionReturn(PETSC_SUCCESS);
1528: }
1530: // ==========================================================================================
1532: template <device::cupm::DeviceType T>
1533: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
1534: {
1535: const auto offloadmask = A->offloadmask;
1536: const auto n = A->rmap->n;
1537: const auto col_offset = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
1538: PetscBool viscupm;
1539: PetscDeviceContext dctx;
1540: cupmStream_t stream;
1542: PetscFunctionBegin;
1543: PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
1544: PetscCall(GetHandles_(&dctx, &stream));
1545: if (viscupm && !v->boundtocpu) {
1546: const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
1548: // update device data
1549: if (PetscOffloadDevice(offloadmask)) {
1550: PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
1551: } else {
1552: PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
1553: }
1554: } else {
1555: PetscScalar *x;
1557: // update host data
1558: PetscCall(VecGetArrayWrite(v, &x));
1559: if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
1560: PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
1561: } else if (PetscOffloadDevice(offloadmask)) {
1562: PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
1563: }
1564: PetscCall(VecRestoreArrayWrite(v, &x));
1565: }
1566: PetscFunctionReturn(PETSC_SUCCESS);
1567: }
1569: template <device::cupm::DeviceType T>
1570: template <PetscMemoryAccessMode access>
1571: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
1572: {
1573: using namespace vec::cupm;
1574: const auto mimpl = MatIMPLCast(A);
1575: PetscDeviceContext dctx;
1577: PetscFunctionBegin;
1578: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1579: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1580: mimpl->vecinuse = col + 1;
1581: if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
1582: PetscCall(GetHandles_(&dctx));
1583: PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1584: PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
1585: if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
1586: *v = mimpl->cvec;
1587: PetscFunctionReturn(PETSC_SUCCESS);
1588: }
1590: template <device::cupm::DeviceType T>
1591: template <PetscMemoryAccessMode access>
1592: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
1593: {
1594: using namespace vec::cupm;
1595: const auto mimpl = MatIMPLCast(A);
1596: const auto cvec = mimpl->cvec;
1597: PetscDeviceContext dctx;
1599: PetscFunctionBegin;
1600: PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1601: PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1602: mimpl->vecinuse = 0;
1603: if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
1604: PetscCall(VecCUPMResetArrayAsync<T>(cvec));
1605: PetscCall(GetHandles_(&dctx));
1606: PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1607: if (v) *v = nullptr;
1608: PetscFunctionReturn(PETSC_SUCCESS);
1609: }
1611: // ==========================================================================================
1613: template <device::cupm::DeviceType T>
1614: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1615: {
1616: Mat fact = nullptr;
1617: PetscDeviceContext dctx;
1619: PetscFunctionBegin;
1620: PetscCall(GetHandles_(&dctx));
1621: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1622: fact->factortype = ftype;
1623: switch (ftype) {
1624: case MAT_FACTOR_LU:
1625: case MAT_FACTOR_ILU: // fall-through
1626: fact->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1627: fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1628: break;
1629: case MAT_FACTOR_CHOLESKY:
1630: case MAT_FACTOR_ICC: // fall-through
1631: fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1632: break;
1633: case MAT_FACTOR_QR: {
1634: const auto pobj = PetscObjectCast(fact);
1636: PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1637: PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1638: } break;
1639: case MAT_FACTOR_NONE:
1640: case MAT_FACTOR_ILUDT: // fall-through
1641: case MAT_FACTOR_NUM_TYPES: // fall-through
1642: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1643: }
1644: PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1645: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1646: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1647: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1648: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1649: *fact_out = fact;
1650: PetscFunctionReturn(PETSC_SUCCESS);
1651: }
1653: template <device::cupm::DeviceType T>
1654: inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
1655: {
1656: const auto mimpl = MatIMPLCast(A);
1657: const auto mcu = MatCUPMCast(A);
1658: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
1659: cupmSolverHandle_t handle;
1660: PetscDeviceContext dctx;
1661: cupmStream_t stream;
1663: PetscFunctionBegin;
1664: #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
1665: // HIP appears to have this by default??
1666: PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
1667: #endif
1668: if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1669: PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
1670: // spd
1671: PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());
1673: PetscCall(GetHandles_(&dctx, &handle, &stream));
1674: {
1675: const auto da = DeviceArrayReadWrite(dctx, A);
1676: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
1677: cupmBlasInt_t il;
1679: PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
1680: if (il > mcu->d_fact_lwork) {
1681: mcu->d_fact_lwork = il;
1682: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1683: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
1684: }
1685: PetscCall(PetscLogGpuTimeBegin());
1686: PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
1687: PetscCall(PetscLogGpuTimeEnd());
1688: }
1689: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
1690: // TODO (write cuda kernel)
1691: PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
1692: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
1694: A->ops->solve = nullptr;
1695: A->ops->solvetranspose = nullptr;
1696: A->ops->matsolve = nullptr;
1697: A->factortype = MAT_FACTOR_NONE;
1699: PetscCall(PetscFree(A->solvertype));
1700: PetscFunctionReturn(PETSC_SUCCESS);
1701: }
1703: // ==========================================================================================
1705: template <device::cupm::DeviceType T>
1706: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
1707: {
1708: const auto mimpl = MatIMPLCast(A);
1709: const auto array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
1710: const auto n = rend - rbegin;
1711: const auto m = cend - cbegin;
1712: auto &cmat = mimpl->cmat;
1713: PetscDeviceContext dctx;
1715: PetscFunctionBegin;
1716: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1717: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1718: mimpl->matinuse = cbegin + 1;
1720: PetscCall(GetHandles_(&dctx));
1721: PetscCall(HostToDevice_(A, dctx));
1723: if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
1724: {
1725: const auto device_array = array_offset(MatCUPMCast(A)->d_v);
1727: if (cmat) {
1728: PetscCall(PlaceArray(cmat, device_array));
1729: } else {
1730: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
1731: }
1732: }
1733: PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
1734: // place CPU array if present but do not copy any data
1735: if (const auto host_array = mimpl->v) {
1736: cmat->offloadmask = PETSC_OFFLOAD_GPU;
1737: PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
1738: }
1740: cmat->offloadmask = A->offloadmask;
1741: *mat = cmat;
1742: PetscFunctionReturn(PETSC_SUCCESS);
1743: }
1745: template <device::cupm::DeviceType T>
1746: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
1747: {
1748: const auto mimpl = MatIMPLCast(A);
1749: const auto cmat = mimpl->cmat;
1750: const auto reset = static_cast<bool>(mimpl->v);
1751: bool copy, was_offload_host;
1753: PetscFunctionBegin;
1754: PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1755: PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1756: PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1757: mimpl->matinuse = 0;
1759: // calls to ResetArray may change it, so save it here
1760: was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
1761: if (was_offload_host && !reset) {
1762: copy = true;
1763: PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
1764: } else {
1765: copy = false;
1766: }
1768: PetscCall(ResetArray(cmat));
1769: if (reset) PetscCall(MatDenseResetArray(cmat));
1770: if (copy) {
1771: PetscDeviceContext dctx;
1773: PetscCall(GetHandles_(&dctx));
1774: PetscCall(DeviceToHost_(A, dctx));
1775: } else {
1776: A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1777: }
1779: cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1780: *m = nullptr;
1781: PetscFunctionReturn(PETSC_SUCCESS);
1782: }
1784: // ==========================================================================================
1786: namespace
1787: {
1789: template <device::cupm::DeviceType T>
1790: inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1791: {
1792: PetscFunctionBegin;
1793: if (TA) {
1794: if (TB) {
1795: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1796: } else {
1797: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1798: }
1799: } else {
1800: if (TB) {
1801: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1802: } else {
1803: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1804: }
1805: }
1806: PetscFunctionReturn(PETSC_SUCCESS);
1807: }
1809: template <device::cupm::DeviceType T>
1810: inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1811: {
1812: PetscFunctionBegin;
1813: for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1814: PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1815: PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1816: }
1817: PetscFunctionReturn(PETSC_SUCCESS);
1818: }
1820: } // anonymous namespace
1822: } // namespace impl
1824: } // namespace cupm
1826: } // namespace mat
1828: } // namespace Petsc