Actual source code: matseqdensecupm.hpp

  1: #ifndef PETSCMATSEQDENSECUPM_HPP
  2: #define PETSCMATSEQDENSECUPM_HPP

  4: #include <petsc/private/matdensecupmimpl.h>
  5: #include <../src/mat/impls/dense/seq/dense.h>

  7: #if defined(__cplusplus)
  8: #include <petsc/private/deviceimpl.h>
  9: #include <petsc/private/randomimpl.h>
 10: #include <petsc/private/vecimpl.h>
 11: #include <petsc/private/cupmobject.hpp>
 12: #include <petsc/private/cupmsolverinterface.hpp>

 14: #include <petsc/private/cpp/type_traits.hpp>
 15: #include <petsc/private/cpp/utility.hpp>

 17: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>

 19: namespace Petsc
 20: {

 22: namespace mat
 23: {

 25: namespace cupm
 26: {

 28: namespace impl
 29: {

 31: template <device::cupm::DeviceType T>
 32: class MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
 33: public:
 34:   MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);

 36: private:
 37:   struct Mat_SeqDenseCUPM {
 38:     PetscScalar *d_v;           // pointer to the matrix on the GPU
 39:     PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
 40:     bool         d_user_alloc;
 41:     bool         d_unplaced_user_alloc;
 42:     // factorization support
 43:     cupmBlasInt_t *d_fact_ipiv;  // device pivots
 44:     cupmScalar_t  *d_fact_tau;   // device QR tau vector
 45:     cupmBlasInt_t *d_fact_info;  // device info
 46:     cupmScalar_t  *d_fact_work;  // device workspace
 47:     cupmBlasInt_t  d_fact_lwork; // size of device workspace
 48:     // workspace
 49:     Vec workvec;
 50:   };

 52:   static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;

 54:   static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
 55:   static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;

 57:   static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;

 59:   template <typename Derived>
 60:   struct SolveCommon;
 61:   struct SolveQR;
 62:   struct SolveCholesky;
 63:   struct SolveLU;

 65:   template <typename Solver, bool transpose>
 66:   static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
 67:   template <typename Solver, bool transpose>
 68:   static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
 69:   template <bool transpose>
 70:   static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;

 72:   template <bool to_host>
 73:   static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;

 75:   PETSC_NODISCARD static constexpr MatType       MATIMPLCUPM_() noexcept;
 76:   PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;

 78: public:
 79:   PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;

 81:   // define these by hand since they don't fit the above mold
 82:   PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
 83:   PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;

 85:   static PetscErrorCode Create(Mat) noexcept;
 86:   static PetscErrorCode Destroy(Mat) noexcept;
 87:   static PetscErrorCode SetUp(Mat) noexcept;
 88:   static PetscErrorCode Reset(Mat) noexcept;

 90:   static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
 91:   static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
 92:   static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;

 94:   template <PetscMemType, PetscMemoryAccessMode>
 95:   static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
 96:   template <PetscMemType, PetscMemoryAccessMode>
 97:   static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
 98:   template <PetscMemoryAccessMode>
 99:   static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
100:   template <PetscMemoryAccessMode>
101:   static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;

103: private:
104:   template <PetscMemType mtype, PetscMemoryAccessMode mode>
105:   static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
106:   {
107:     PetscDeviceContext dctx;

109:     PetscFunctionBegin;
110:     PetscCall(GetHandles_(&dctx));
111:     PetscCall(GetArray<mtype, mode>(m, p, dctx));
112:     PetscFunctionReturn(PETSC_SUCCESS);
113:   }

115:   template <PetscMemType mtype, PetscMemoryAccessMode mode>
116:   static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
117:   {
118:     PetscDeviceContext dctx;

120:     PetscFunctionBegin;
121:     PetscCall(GetHandles_(&dctx));
122:     PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
123:     PetscFunctionReturn(PETSC_SUCCESS);
124:   }

126:   template <PetscMemoryAccessMode mode>
127:   static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
128:   {
129:     PetscDeviceContext dctx;

131:     PetscFunctionBegin;
132:     PetscCall(GetHandles_(&dctx));
133:     PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
134:     PetscFunctionReturn(PETSC_SUCCESS);
135:   }

137:   template <PetscMemoryAccessMode mode>
138:   static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
139:   {
140:     PetscDeviceContext dctx;

142:     PetscFunctionBegin;
143:     PetscCall(GetHandles_(&dctx));
144:     PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
145:     PetscFunctionReturn(PETSC_SUCCESS);
146:   }

148: public:
149:   static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
150:   static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
151:   static PetscErrorCode ResetArray(Mat) noexcept;

153:   template <bool transpose_A, bool transpose_B>
154:   static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
155:   static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
156:   static PetscErrorCode ZeroEntries(Mat) noexcept;
157:   static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
158:   static PetscErrorCode Shift(Mat, PetscScalar) noexcept;
159:   static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
160:   static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
161:   static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;

163:   static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
164:   template <PetscMemoryAccessMode>
165:   static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
166:   template <PetscMemoryAccessMode>
167:   static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;

169:   static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
170:   static PetscErrorCode InvertFactors(Mat) noexcept;

172:   static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
173:   static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
174: };

176: } // namespace impl

178: namespace
179: {

181: // Declare this here so that the functions below can make use of it
182: template <device::cupm::DeviceType T>
183: inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
184: {
185:   PetscFunctionBegin;
186:   PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
187:   PetscFunctionReturn(PETSC_SUCCESS);
188: }

190: } // anonymous namespace

192: namespace impl
193: {

195: // ==========================================================================================
196: // MatDense_Seq_CUPM - Private API - Utility
197: // ==========================================================================================

199: template <device::cupm::DeviceType T>
200: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
201: {
202:   const auto   mcu   = MatCUPMCast(m);
203:   const auto   nrows = m->rmap->n;
204:   const auto   ncols = m->cmap->n;
205:   auto        &lda   = MatIMPLCast(m)->lda;
206:   cupmStream_t stream;

208:   PetscFunctionBegin;
209:   PetscCheckTypeName(m, MATSEQDENSECUPM());
211:   PetscCall(checkCupmBlasIntCast(nrows));
212:   PetscCall(checkCupmBlasIntCast(ncols));
213:   PetscCall(GetHandlesFrom_(dctx, &stream));
214:   if (lda <= 0) lda = nrows;
215:   if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
216:   if (user_device_array) {
217:     mcu->d_user_alloc = PETSC_TRUE;
218:     mcu->d_v          = user_device_array;
219:   } else {
220:     PetscInt size;

222:     mcu->d_user_alloc = PETSC_FALSE;
223:     PetscCall(PetscIntMultError(lda, ncols, &size));
224:     PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
225:     PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
226:   }
227:   m->offloadmask = PETSC_OFFLOAD_GPU;
228:   PetscFunctionReturn(PETSC_SUCCESS);
229: }

231: template <device::cupm::DeviceType T>
232: inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
233: {
234:   const auto nrows = m->rmap->n;
235:   const auto ncols = m->cmap->n;
236:   const auto copy  = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;

238:   PetscFunctionBegin;
239:   PetscCheckTypeName(m, MATSEQDENSECUPM());
240:   if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
241:   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
242:   if (copy) {
243:     const auto   mcu = MatCUPMCast(m);
244:     cupmStream_t stream;

246:     // Allocate GPU memory if not present
247:     if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx));
248:     PetscCall(GetHandlesFrom_(dctx, &stream));
249:     PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
250:     {
251:       const auto mimpl = MatIMPLCast(m);
252:       const auto lda   = mimpl->lda;
253:       const auto src   = mimpl->v;
254:       const auto dest  = mcu->d_v;

256:       if (lda > nrows) {
257:         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
258:       } else {
259:         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
260:       }
261:     }
262:     PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
263:     // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
264:     m->offloadmask = PETSC_OFFLOAD_BOTH;
265:   }
266:   PetscFunctionReturn(PETSC_SUCCESS);
267: }

269: template <device::cupm::DeviceType T>
270: inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
271: {
272:   const auto nrows = m->rmap->n;
273:   const auto ncols = m->cmap->n;
274:   const auto copy  = m->offloadmask == PETSC_OFFLOAD_GPU;

276:   PetscFunctionBegin;
277:   PetscCheckTypeName(m, MATSEQDENSECUPM());
278:   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
279:   if (copy) {
280:     const auto   mimpl = MatIMPLCast(m);
281:     cupmStream_t stream;

283:     // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
284:     if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
285:     PetscCall(GetHandlesFrom_(dctx, &stream));
286:     PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
287:     {
288:       const auto lda  = mimpl->lda;
289:       const auto dest = mimpl->v;
290:       const auto src  = MatCUPMCast(m)->d_v;

292:       if (lda > nrows) {
293:         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
294:       } else {
295:         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
296:       }
297:     }
298:     PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
299:     // order is important, MatSeqDenseSetPreallocation() might set offloadmask
300:     m->offloadmask = PETSC_OFFLOAD_BOTH;
301:   }
302:   PetscFunctionReturn(PETSC_SUCCESS);
303: }

305: template <device::cupm::DeviceType T>
306: inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
307: {
308:   PetscFunctionBegin;
309:   if (PetscDefined(USE_DEBUG)) {
310:     cupmBlasInt_t info = 0;

312:     PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
313:     if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
314:     static_assert(std::is_same<decltype(info), int>::value, "");
315:     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
316:     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
317:   }
318:   PetscFunctionReturn(PETSC_SUCCESS);
319: }

321: // ==========================================================================================
322: // MatDense_Seq_CUPM - Private API - Solver Dispatch
323: // ==========================================================================================

325: // specific solvers called through the dispatch_() family of functions
326: template <device::cupm::DeviceType T>
327: template <typename Derived>
328: struct MatDense_Seq_CUPM<T>::SolveCommon {
329:   using derived_type = Derived;

331:   template <typename F>
332:   static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
333:   {
334:     cupmBlasInt_t lwork;

336:     PetscFunctionBegin;
337:     PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
338:     if (lwork > mcu->d_fact_lwork) {
339:       mcu->d_fact_lwork = lwork;
340:       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
341:       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
342:     }
343:     PetscFunctionReturn(PETSC_SUCCESS);
344:   }

346:   static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
347:   {
348:     const auto mcu = MatCUPMCast(A);

350:     PetscFunctionBegin;
351:     PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
352:     A->factortype             = derived_type::MATFACTORTYPE();
353:     A->ops->solve             = MatSolve_Factored_Dispatch_<derived_type, false>;
354:     A->ops->solvetranspose    = MatSolve_Factored_Dispatch_<derived_type, true>;
355:     A->ops->matsolve          = MatMatSolve_Factored_Dispatch_<derived_type, false>;
356:     A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;

358:     PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
359:     if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
360:     PetscFunctionReturn(PETSC_SUCCESS);
361:   }
362: };

364: template <device::cupm::DeviceType T>
365: struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
366:   using base_type = SolveCommon<SolveLU>;

368:   static constexpr const char   *NAME() noexcept { return "LU"; }
369:   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }

371:   static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
372:   {
373:     const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
374:     const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
375:     cupmStream_t       stream;
376:     cupmSolverHandle_t handle;
377:     PetscDeviceContext dctx;

379:     PetscFunctionBegin;
380:     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
381:     PetscCall(GetHandles_(&dctx, &handle, &stream));
382:     PetscCall(base_type::FactorPrepare(A, stream));
383:     {
384:       const auto mcu = MatCUPMCast(A);
385:       const auto da  = DeviceArrayReadWrite(dctx, A);
386:       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

388:       // clang-format off
389:       PetscCall(
390:         base_type::ResizeFactLwork(
391:           mcu, stream,
392:           [&](cupmBlasInt_t *fact_lwork)
393:           {
394:             return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
395:           }
396:         )
397:       );
398:       // clang-format on
399:       if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));

401:       PetscCall(PetscLogGpuTimeBegin());
402:       PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
403:       PetscCall(PetscLogGpuTimeEnd());
404:       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
405:     }
406:     PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
407:     PetscFunctionReturn(PETSC_SUCCESS);
408:   }

410:   template <bool transpose>
411:   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
412:   {
413:     const auto         mcu       = MatCUPMCast(A);
414:     const auto         fact_info = mcu->d_fact_info;
415:     const auto         fact_ipiv = mcu->d_fact_ipiv;
416:     cupmSolverHandle_t handle;

418:     PetscFunctionBegin;
419:     PetscCall(GetHandlesFrom_(dctx, &handle));
420:     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
421:     PetscCall(PetscLogGpuTimeBegin());
422:     {
423:       constexpr auto op  = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
424:       const auto     da  = DeviceArrayRead(dctx, A);
425:       const auto     lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

427:       // clang-format off
428:       PetscCall(
429:         base_type::ResizeFactLwork(
430:           mcu, stream,
431:           [&](cupmBlasInt_t *lwork)
432:           {
433:             return cupmSolverXgetrs_bufferSize(
434:               handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
435:             );
436:           }
437:         )
438:       );
439:       // clang-format on
440:       PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
441:       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
442:     }
443:     PetscCall(PetscLogGpuTimeEnd());
444:     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
445:     PetscFunctionReturn(PETSC_SUCCESS);
446:   }
447: };

449: template <device::cupm::DeviceType T>
450: struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
451:   using base_type = SolveCommon<SolveCholesky>;

453:   static constexpr const char   *NAME() noexcept { return "Cholesky"; }
454:   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }

456:   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
457:   {
458:     const auto         n = static_cast<cupmBlasInt_t>(A->rmap->n);
459:     PetscDeviceContext dctx;
460:     cupmSolverHandle_t handle;
461:     cupmStream_t       stream;

463:     PetscFunctionBegin;
464:     if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
465:     PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
466:     PetscCall(GetHandles_(&dctx, &handle, &stream));
467:     PetscCall(base_type::FactorPrepare(A, stream));
468:     {
469:       const auto mcu = MatCUPMCast(A);
470:       const auto da  = DeviceArrayReadWrite(dctx, A);
471:       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

473:       // clang-format off
474:       PetscCall(
475:         base_type::ResizeFactLwork(
476:           mcu, stream,
477:           [&](cupmBlasInt_t *fact_lwork)
478:           {
479:             return cupmSolverXpotrf_bufferSize(
480:               handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
481:             );
482:           }
483:         )
484:       );
485:       // clang-format on
486:       PetscCall(PetscLogGpuTimeBegin());
487:       PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
488:       PetscCall(PetscLogGpuTimeEnd());
489:       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
490:     }
491:     PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

493:   #if 0
494:     // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
495:     // and *hetr* routines. The code below should work, and it can be activated when *sytrs
496:     // routines will be available
497:     if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
498:     if (!mcu->d_fact_lwork) {
499:       PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
500:       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
501:     }
502:     if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
503:     PetscCall(PetscLogGpuTimeBegin());
504:     PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
505:     PetscCall(PetscLogGpuTimeEnd());
506:   #endif
507:     PetscFunctionReturn(PETSC_SUCCESS);
508:   }

510:   template <bool transpose>
511:   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
512:   {
513:     const auto         mcu       = MatCUPMCast(A);
514:     const auto         fact_info = mcu->d_fact_info;
515:     cupmSolverHandle_t handle;

517:     PetscFunctionBegin;
518:     PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
519:     PetscCall(GetHandlesFrom_(dctx, &handle));
520:     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
521:     PetscCall(PetscLogGpuTimeBegin());
522:     {
523:       const auto da  = DeviceArrayRead(dctx, A);
524:       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

526:       // clang-format off
527:       PetscCall(
528:         base_type::ResizeFactLwork(
529:           mcu, stream,
530:           [&](cupmBlasInt_t *lwork)
531:           {
532:             return cupmSolverXpotrs_bufferSize(
533:               handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
534:             );
535:           }
536:         )
537:       );
538:       // clang-format on
539:       PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
540:       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
541:     }
542:     PetscCall(PetscLogGpuTimeEnd());
543:     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
544:     PetscFunctionReturn(PETSC_SUCCESS);
545:   }
546: };

548: template <device::cupm::DeviceType T>
549: struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
550:   using base_type = SolveCommon<SolveQR>;

552:   static constexpr const char   *NAME() noexcept { return "QR"; }
553:   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }

555:   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
556:   {
557:     const auto         m     = static_cast<cupmBlasInt_t>(A->rmap->n);
558:     const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
559:     const auto         min   = std::min(m, n);
560:     const auto         mimpl = MatIMPLCast(A);
561:     cupmStream_t       stream;
562:     cupmSolverHandle_t handle;
563:     PetscDeviceContext dctx;

565:     PetscFunctionBegin;
566:     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
567:     PetscCall(GetHandles_(&dctx, &handle, &stream));
568:     PetscCall(base_type::FactorPrepare(A, stream));
569:     mimpl->rank = min;
570:     {
571:       const auto mcu = MatCUPMCast(A);
572:       const auto da  = DeviceArrayReadWrite(dctx, A);
573:       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

575:       if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
576:       if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
577:       // clang-format off
578:       PetscCall(
579:         base_type::ResizeFactLwork(
580:           mcu, stream,
581:           [&](cupmBlasInt_t *fact_lwork)
582:           {
583:             return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
584:           }
585:         )
586:       );
587:       // clang-format on
588:       PetscCall(PetscLogGpuTimeBegin());
589:       PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
590:       PetscCall(PetscLogGpuTimeEnd());
591:       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
592:     }
593:     PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
594:     PetscFunctionReturn(PETSC_SUCCESS);
595:   }

597:   template <bool transpose>
598:   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
599:   {
600:     const auto         mimpl      = MatIMPLCast(A);
601:     const auto         rank       = static_cast<cupmBlasInt_t>(mimpl->rank);
602:     const auto         mcu        = MatCUPMCast(A);
603:     const auto         fact_info  = mcu->d_fact_info;
604:     const auto         fact_tau   = mcu->d_fact_tau;
605:     const auto         fact_work  = mcu->d_fact_work;
606:     const auto         fact_lwork = mcu->d_fact_lwork;
607:     cupmSolverHandle_t solver_handle;
608:     cupmBlasHandle_t   blas_handle;

610:     PetscFunctionBegin;
611:     PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
612:     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
613:     PetscCall(PetscLogGpuTimeBegin());
614:     {
615:       const auto da  = DeviceArrayRead(dctx, A);
616:       const auto one = cupmScalarCast(1.0);
617:       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

619:       if (transpose) {
620:         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
621:         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
622:         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
623:       } else {
624:         constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;

626:         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
627:         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
628:         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
629:       }
630:     }
631:     PetscCall(PetscLogGpuTimeEnd());
632:     PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
633:     PetscFunctionReturn(PETSC_SUCCESS);
634:   }
635: };

637: template <device::cupm::DeviceType T>
638: template <typename Solver, bool transpose>
639: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
640: {
641:   using namespace vec::cupm;
642:   const auto         pobj_A  = PetscObjectCast(A);
643:   const auto         m       = static_cast<cupmBlasInt_t>(A->rmap->n);
644:   const auto         k       = static_cast<cupmBlasInt_t>(A->cmap->n);
645:   auto              &workvec = MatCUPMCast(A)->workvec;
646:   PetscScalar       *y_array = nullptr;
647:   PetscDeviceContext dctx;
648:   PetscBool          xiscupm, yiscupm, aiscupm;
649:   bool               use_y_array_directly;
650:   cupmStream_t       stream;

652:   PetscFunctionBegin;
653:   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
654:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
655:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
656:   PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
657:   PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
658:   PetscCall(GetHandles_(&dctx, &stream));
659:   use_y_array_directly = yiscupm && (k >= m);
660:   {
661:     const PetscScalar *x_array;
662:     const auto         xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
663:     const auto         copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;

665:     if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
666:     // The logic here is to try to minimize the amount of memory copying:
667:     //
668:     // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
669:     // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
670:     // data in order to copy it into the y array. So the array x will be wherever the data
671:     // already is so that only one memcpy is performed
672:     if (xisdevice) {
673:       PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
674:     } else {
675:       PetscCall(VecGetArrayRead(x, &x_array));
676:     }
677:     PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
678:     PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
679:     if (xisdevice) {
680:       PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
681:     } else {
682:       PetscCall(VecRestoreArrayRead(x, &x_array));
683:     }
684:   }

686:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
687:   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
688:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));

690:   if (use_y_array_directly) {
691:     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
692:   } else {
693:     const auto   copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
694:     PetscScalar *yv;

696:     // The logic here is that the data is not yet in either y's GPU array or its CPU array.
697:     // There is nothing in the interface to say where the user would like it to end up. So we
698:     // choose the GPU, because it is the faster option
699:     if (yiscupm) {
700:       PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
701:     } else {
702:       PetscCall(VecGetArray(y, &yv));
703:     }
704:     PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
705:     if (yiscupm) {
706:       PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
707:     } else {
708:       PetscCall(VecRestoreArray(y, &yv));
709:     }
710:     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
711:   }
712:   PetscFunctionReturn(PETSC_SUCCESS);
713: }

715: template <device::cupm::DeviceType T>
716: template <typename Solver, bool transpose>
717: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
718: {
719:   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
720:   const auto         k = static_cast<cupmBlasInt_t>(A->cmap->n);
721:   cupmBlasInt_t      nrhs, ldb, ldx, ldy;
722:   PetscScalar       *y;
723:   PetscBool          biscupm, xiscupm, aiscupm;
724:   PetscDeviceContext dctx;
725:   cupmStream_t       stream;

727:   PetscFunctionBegin;
728:   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
729:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
730:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
731:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
732:   PetscCall(GetHandles_(&dctx, &stream));
733:   {
734:     PetscInt n;

736:     PetscCall(MatGetSize(B, nullptr, &n));
737:     PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
738:     PetscCall(MatDenseGetLDA(B, &n));
739:     PetscCall(PetscCUPMBlasIntCast(n, &ldb));
740:     PetscCall(MatDenseGetLDA(X, &n));
741:     PetscCall(PetscCUPMBlasIntCast(n, &ldx));
742:   }
743:   {
744:     // The logic here is to try to minimize the amount of memory copying:
745:     //
746:     // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
747:     // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
748:     // get the data in order to copy it into the y array. So the array b will be wherever the
749:     // data already is so that only one memcpy is performed
750:     const auto         bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
751:     const auto         copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
752:     const PetscScalar *b;

754:     if (bisdevice) {
755:       b = DeviceArrayRead(dctx, B);
756:     } else if (biscupm) {
757:       b = HostArrayRead(dctx, B);
758:     } else {
759:       PetscCall(MatDenseGetArrayRead(B, &b));
760:     }

762:     if (ldx < m || !xiscupm) {
763:       // X's array cannot serve as the array (too small or not on device), B's array cannot
764:       // serve as the array (const), so allocate a new array
765:       ldy = m;
766:       PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
767:     } else {
768:       // X's array should serve as the array
769:       ldy = ldx;
770:       y   = DeviceArrayWrite(dctx, X);
771:     }
772:     PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
773:     if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
774:   }

776:   // convert to CUPM twice??????????????????????????????????
777:   // but A should already be CUPM??????????????????????????????????????
778:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
779:   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
780:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));

782:   if (ldx < m || !xiscupm) {
783:     const auto   copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
784:     PetscScalar *x;

786:     // The logic here is that the data is not yet in either X's GPU array or its CPU
787:     // array. There is nothing in the interface to say where the user would like it to end up.
788:     // So we choose the GPU, because it is the faster option
789:     if (xiscupm) {
790:       x = DeviceArrayWrite(dctx, X);
791:     } else {
792:       PetscCall(MatDenseGetArray(X, &x));
793:     }
794:     PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
795:     if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
796:     PetscCallCUPM(cupmFreeAsync(y, stream));
797:   }
798:   PetscFunctionReturn(PETSC_SUCCESS);
799: }

801: template <device::cupm::DeviceType T>
802: template <bool transpose>
803: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
804: {
805:   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
806:   const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
807:   cupmBlasHandle_t   handle;
808:   PetscDeviceContext dctx;

810:   PetscFunctionBegin;
811:   if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add
812:   if (!m || !n) {
813:     // mult only
814:     if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0));
815:     PetscFunctionReturn(PETSC_SUCCESS);
816:   }
817:   PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
818:   PetscCall(GetHandles_(&dctx, &handle));
819:   {
820:     constexpr auto op   = transpose ? CUPMBLAS_OP_T : CUPMBLAS_OP_N;
821:     const auto     one  = cupmScalarCast(1.0);
822:     const auto     zero = cupmScalarCast(0.0);
823:     const auto     da   = DeviceArrayRead(dctx, A);
824:     const auto     dxx  = VecSeq_CUPM::DeviceArrayRead(dctx, xx);
825:     const auto     dzz  = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz);

827:     PetscCall(PetscLogGpuTimeBegin());
828:     PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata(), static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda), dxx.cupmdata(), 1, (yy ? &one : &zero), dzz.cupmdata(), 1));
829:     PetscCall(PetscLogGpuTimeEnd());
830:   }
831:   PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
832:   PetscFunctionReturn(PETSC_SUCCESS);
833: }

835: // ==========================================================================================
836: // MatDense_Seq_CUPM - Private API - Conversion Dispatch
837: // ==========================================================================================

839: template <device::cupm::DeviceType T>
840: template <bool to_host>
841: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
842: {
843:   PetscFunctionBegin;
844:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
845:     // TODO these cases should be optimized
846:     PetscCall(MatConvert_Basic(M, type, reuse, newmat));
847:   } else {
848:     const auto B    = *newmat;
849:     const auto pobj = PetscObjectCast(B);

851:     if (to_host) {
852:       PetscCall(BindToCPU(B, PETSC_TRUE));
853:       PetscCall(Reset(B));
854:     } else {
855:       PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
856:     }

858:     PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
859:     PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
860:     // cvec might be the wrong VecType, destroy and rebuild it if necessary
861:     // REVIEW ME: this is possibly very inefficient
862:     PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));

864:     MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
865:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
866:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
867:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
868:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
869:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
870:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
871:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
872:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
873:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
874:     MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);

876:     if (to_host) {
877:       B->offloadmask = PETSC_OFFLOAD_CPU;
878:     } else {
879:       Mat_SeqDenseCUPM *mcu;

881:       PetscCall(PetscNew(&mcu));
882:       B->spptr       = mcu;
883:       B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
884:       PetscCall(BindToCPU(B, PETSC_FALSE));
885:     }

887:     MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
888:     MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
889:   }
890:   PetscFunctionReturn(PETSC_SUCCESS);
891: }

893: // ==========================================================================================
894: // MatDense_Seq_CUPM - Public API
895: // ==========================================================================================

897: template <device::cupm::DeviceType T>
898: inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
899: {
900:   return MATSEQDENSECUPM();
901: }

903: template <device::cupm::DeviceType T>
904: inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
905: {
906:   return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
907: }

909: template <device::cupm::DeviceType T>
910: inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
911: {
912:   return static_cast<Mat_SeqDense *>(m->data);
913: }

915: template <device::cupm::DeviceType T>
916: inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
917: {
918:   return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
919: }

921: template <device::cupm::DeviceType T>
922: inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
923: {
924:   return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
925: }

927: // ==========================================================================================

929: // MatCreate_SeqDenseCUPM()
930: template <device::cupm::DeviceType T>
931: inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
932: {
933:   PetscFunctionBegin;
934:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
935:   PetscCall(MatCreate_SeqDense(A));
936:   PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
937:   PetscFunctionReturn(PETSC_SUCCESS);
938: }

940: template <device::cupm::DeviceType T>
941: inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
942: {
943:   PetscFunctionBegin;
944:   // prevent copying back data if we own the data pointer
945:   if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
946:   PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
947:   PetscCall(MatDestroy_SeqDense(A));
948:   PetscFunctionReturn(PETSC_SUCCESS);
949: }

951: // obj->ops->setup()
952: template <device::cupm::DeviceType T>
953: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
954: {
955:   PetscFunctionBegin;
956:   PetscCall(PetscLayoutSetUp(A->rmap));
957:   PetscCall(PetscLayoutSetUp(A->cmap));
958:   if (!A->preallocated) {
959:     PetscDeviceContext dctx;

961:     PetscCall(GetHandles_(&dctx));
962:     PetscCall(SetPreallocation(A, dctx));
963:   }
964:   PetscFunctionReturn(PETSC_SUCCESS);
965: }

967: template <device::cupm::DeviceType T>
968: inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
969: {
970:   PetscFunctionBegin;
971:   if (const auto mcu = MatCUPMCast(A)) {
972:     cupmStream_t stream;

974:     PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
975:     PetscCall(GetHandles_(&stream));
976:     if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
977:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
978:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
979:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
980:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
981:     PetscCall(VecDestroy(&mcu->workvec));
982:     PetscCall(PetscFree(A->spptr /* mcu */));
983:   }
984:   PetscFunctionReturn(PETSC_SUCCESS);
985: }

987: // ==========================================================================================

989: template <device::cupm::DeviceType T>
990: inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
991: {
992:   const auto mimpl = MatIMPLCast(A);
993:   const auto pobj  = PetscObjectCast(A);

995:   PetscFunctionBegin;
996:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
997:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
998:   A->boundtocpu = to_host;
999:   PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
1000:   if (to_host) {
1001:     PetscDeviceContext dctx;

1003:     // make sure we have an up-to-date copy on the CPU
1004:     PetscCall(GetHandles_(&dctx));
1005:     PetscCall(DeviceToHost_(A, dctx));
1006:   } else {
1007:     PetscBool iscupm;

1009:     if (auto &cvec = mimpl->cvec) {
1010:       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
1011:       if (!iscupm) PetscCall(VecDestroy(&cvec));
1012:     }
1013:     if (auto &cmat = mimpl->cmat) {
1014:       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
1015:       if (!iscupm) PetscCall(MatDestroy(&cmat));
1016:     }
1017:   }

1019:   // ============================================================
1020:   // Composed ops
1021:   // ============================================================
1022:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1023:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
1024:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1025:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1026:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1027:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1028:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1029:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1030:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1031:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1032:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1033:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
1034:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
1035:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1036:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1037:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
1038:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
1039:   MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
1040:   // always the same
1041:   PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));

1043:   // ============================================================
1044:   // Function pointer ops
1045:   // ============================================================
1046:   MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
1047:   MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false>(A, xx, nullptr, yy); });
1048:   MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true>(A, xx, nullptr, yy); });
1049:   MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false>);
1050:   MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true>);
1051:   MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
1052:   MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
1053:   MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
1054:   MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
1055:   MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
1056:   MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
1057:   MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
1058:   MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
1059:   MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
1060:   MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
1061:   MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
1062:   MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
1063:   MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
1064:   // seemingly always the same
1065:   A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;

1067:   if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
1068:   PetscFunctionReturn(PETSC_SUCCESS);
1069: }

1071: template <device::cupm::DeviceType T>
1072: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1073: {
1074:   PetscFunctionBegin;
1075:   PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
1076:   PetscFunctionReturn(PETSC_SUCCESS);
1077: }

1079: template <device::cupm::DeviceType T>
1080: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1081: {
1082:   PetscFunctionBegin;
1083:   PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
1084:   PetscFunctionReturn(PETSC_SUCCESS);
1085: }

1087: // ==========================================================================================

1089: template <device::cupm::DeviceType T>
1090: template <PetscMemType mtype, PetscMemoryAccessMode access>
1091: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1092: {
1093:   constexpr auto hostmem     = PetscMemTypeHost(mtype);
1094:   constexpr auto read_access = PetscMemoryAccessRead(access);

1096:   PetscFunctionBegin;
1097:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1098:   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
1099:   if (hostmem) {
1100:     if (read_access) {
1101:       PetscCall(DeviceToHost_(m, dctx));
1102:     } else if (!MatIMPLCast(m)->v) {
1103:       // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
1104:       PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
1105:     }
1106:     *array = MatIMPLCast(m)->v;
1107:   } else {
1108:     if (read_access) {
1109:       PetscCall(HostToDevice_(m, dctx));
1110:     } else if (!MatCUPMCast(m)->d_v) {
1111:       // write-only
1112:       PetscCall(SetPreallocation(m, dctx, nullptr));
1113:     }
1114:     *array = MatCUPMCast(m)->d_v;
1115:   }
1116:   if (PetscMemoryAccessWrite(access)) {
1117:     m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1118:     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1119:   }
1120:   PetscFunctionReturn(PETSC_SUCCESS);
1121: }

1123: template <device::cupm::DeviceType T>
1124: template <PetscMemType mtype, PetscMemoryAccessMode access>
1125: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1126: {
1127:   PetscFunctionBegin;
1128:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1129:   if (PetscMemoryAccessWrite(access)) {
1130:     // WRITE or READ_WRITE
1131:     m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1132:     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1133:   }
1134:   if (array) {
1135:     PetscCall(CheckPointerMatchesMemType_(*array, mtype));
1136:     *array = nullptr;
1137:   }
1138:   PetscFunctionReturn(PETSC_SUCCESS);
1139: }

1141: template <device::cupm::DeviceType T>
1142: template <PetscMemoryAccessMode access>
1143: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
1144: {
1145:   PetscFunctionBegin;
1146:   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1147:   if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
1148:   PetscFunctionReturn(PETSC_SUCCESS);
1149: }

1151: template <device::cupm::DeviceType T>
1152: template <PetscMemoryAccessMode access>
1153: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1154: {
1155:   PetscFunctionBegin;
1156:   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1157:   PetscFunctionReturn(PETSC_SUCCESS);
1158: }

1160: // ==========================================================================================

1162: template <device::cupm::DeviceType T>
1163: inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
1164: {
1165:   const auto mimpl = MatIMPLCast(A);
1166:   const auto mcu   = MatCUPMCast(A);

1168:   PetscFunctionBegin;
1169:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1170:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1171:   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1172:   if (mimpl->v) {
1173:     PetscDeviceContext dctx;

1175:     PetscCall(GetHandles_(&dctx));
1176:     PetscCall(HostToDevice_(A, dctx));
1177:   }
1178:   mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
1179:   mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
1180:   PetscFunctionReturn(PETSC_SUCCESS);
1181: }

1183: template <device::cupm::DeviceType T>
1184: inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
1185: {
1186:   const auto mimpl = MatIMPLCast(A);
1187:   const auto mcu   = MatCUPMCast(A);

1189:   PetscFunctionBegin;
1190:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1191:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1192:   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1193:   if (!mcu->d_user_alloc) {
1194:     cupmStream_t stream;

1196:     PetscCall(GetHandles_(&stream));
1197:     PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1198:   }
1199:   mcu->d_v          = const_cast<PetscScalar *>(array);
1200:   mcu->d_user_alloc = PETSC_FALSE;
1201:   PetscFunctionReturn(PETSC_SUCCESS);
1202: }

1204: template <device::cupm::DeviceType T>
1205: inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
1206: {
1207:   const auto mimpl = MatIMPLCast(A);
1208:   const auto mcu   = MatCUPMCast(A);

1210:   PetscFunctionBegin;
1211:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1212:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1213:   if (mimpl->v) {
1214:     PetscDeviceContext dctx;

1216:     PetscCall(GetHandles_(&dctx));
1217:     PetscCall(HostToDevice_(A, dctx));
1218:   }
1219:   mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
1220:   mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
1221:   PetscFunctionReturn(PETSC_SUCCESS);
1222: }

1224: // ==========================================================================================

1226: template <device::cupm::DeviceType T>
1227: template <bool transpose_A, bool transpose_B>
1228: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
1229: {
1230:   cupmBlasInt_t      m, n, k;
1231:   PetscBool          Aiscupm, Biscupm;
1232:   PetscDeviceContext dctx;
1233:   cupmBlasHandle_t   handle;

1235:   PetscFunctionBegin;
1236:   PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
1237:   PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
1238:   PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
1239:   if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);

1241:   // we may end up with SEQDENSE as one of the arguments
1242:   // REVIEW ME: how? and why is it not B and C????????
1243:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
1244:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
1245:   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
1246:   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
1247:   PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
1248:   PetscCall(GetHandles_(&dctx, &handle));

1250:   PetscCall(PetscLogGpuTimeBegin());
1251:   {
1252:     const auto one  = cupmScalarCast(1.0);
1253:     const auto zero = cupmScalarCast(0.0);
1254:     const auto da   = DeviceArrayRead(dctx, A);
1255:     const auto db   = DeviceArrayRead(dctx, B);
1256:     const auto dc   = DeviceArrayWrite(dctx, C);
1257:     PetscInt   alda, blda, clda;

1259:     PetscCall(MatDenseGetLDA(A, &alda));
1260:     PetscCall(MatDenseGetLDA(B, &blda));
1261:     PetscCall(MatDenseGetLDA(C, &clda));
1262:     PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
1263:   }
1264:   PetscCall(PetscLogGpuTimeEnd());

1266:   PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
1267:   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1268:   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1269:   PetscFunctionReturn(PETSC_SUCCESS);
1270: }

1272: template <device::cupm::DeviceType T>
1273: inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
1274: {
1275:   const auto m = A->rmap->n;
1276:   const auto n = A->cmap->n;

1278:   PetscFunctionBegin;
1279:   PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
1280:   // The two matrices must have the same copy implementation to be eligible for fast copy
1281:   if (A->ops->copy == B->ops->copy) {
1282:     PetscDeviceContext dctx;
1283:     cupmStream_t       stream;

1285:     PetscCall(GetHandles_(&dctx, &stream));
1286:     PetscCall(PetscLogGpuTimeBegin());
1287:     {
1288:       const auto va = DeviceArrayRead(dctx, A);
1289:       const auto vb = DeviceArrayWrite(dctx, B);
1290:       // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
1291:       // lda!
1292:       const auto lda_a = MatIMPLCast(A)->lda;
1293:       const auto lda_b = MatIMPLCast(B)->lda;

1295:       if (lda_a > m || lda_b > m) {
1296:         PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
1297:         PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
1298:         PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
1299:       } else {
1300:         PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
1301:       }
1302:     }
1303:     PetscCall(PetscLogGpuTimeEnd());
1304:   } else {
1305:     PetscCall(MatCopy_Basic(A, B, str));
1306:   }
1307:   PetscFunctionReturn(PETSC_SUCCESS);
1308: }

1310: template <device::cupm::DeviceType T>
1311: inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1312: {
1313:   PetscDeviceContext dctx;
1314:   cupmStream_t       stream;

1316:   PetscFunctionBegin;
1317:   PetscCall(GetHandles_(&dctx, &stream));
1318:   PetscCall(PetscLogGpuTimeBegin());
1319:   {
1320:     const auto va  = DeviceArrayWrite(dctx, m);
1321:     const auto lda = MatIMPLCast(m)->lda;
1322:     const auto ma  = m->rmap->n;
1323:     const auto na  = m->cmap->n;

1325:     if (lda > ma) {
1326:       PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1327:     } else {
1328:       PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1329:     }
1330:   }
1331:   PetscCall(PetscLogGpuTimeEnd());
1332:   PetscFunctionReturn(PETSC_SUCCESS);
1333: }

1335: namespace detail
1336: {

1338: // ==========================================================================================
1339: // SubMatIndexFunctor
1340: //
1341: // Iterator which permutes a linear index range into matrix indices for am nrows x ncols
1342: // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1343: // the i'th sequential entry in the matrix.
1344: // ==========================================================================================
1345: template <typename T>
1346: struct SubMatIndexFunctor {
1347:   PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }

1349:   PetscInt nrows;
1350:   PetscInt ncols;
1351:   PetscInt lda;
1352: };

1354: template <typename Iterator>
1355: struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> {
1356:   using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>;

1358:   using iterator = typename base_type::iterator;

1360:   constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
1361:     base_type{
1362:       std::move(first), std::move(last), {nrows, ncols, lda}
1363:   }
1364:   {
1365:   }

1367:   PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
1368: };

1370: namespace
1371: {

1373: template <typename T>
1374: PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1375: {
1376:   const auto nrows = rend - rstart;
1377:   const auto ncols = cend - cstart;
1378:   const auto dptr  = thrust::device_pointer_cast(ptr);

1380:   return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1381: }

1383: } // namespace

1385: } // namespace detail

1387: template <device::cupm::DeviceType T>
1388: inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
1389: {
1390:   const auto         m = A->rmap->n;
1391:   const auto         n = A->cmap->n;
1392:   const auto         N = m * n;
1393:   PetscDeviceContext dctx;

1395:   PetscFunctionBegin;
1396:   PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1397:   PetscCall(GetHandles_(&dctx));
1398:   {
1399:     const auto da  = DeviceArrayReadWrite(dctx, A);
1400:     const auto lda = MatIMPLCast(A)->lda;

1402:     if (lda > m) {
1403:       cupmStream_t stream;

1405:       PetscCall(GetHandlesFrom_(dctx, &stream));
1406:       // clang-format off
1407:       PetscCallThrust(
1408:         const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());

1410:         THRUST_CALL(
1411:           thrust::transform,
1412:           stream,
1413:           sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1414:           device::cupm::functors::make_times_equals(alpha)
1415:         )
1416:       );
1417:       // clang-format on
1418:     } else {
1419:       const auto       cu_alpha = cupmScalarCast(alpha);
1420:       cupmBlasHandle_t handle;

1422:       PetscCall(GetHandlesFrom_(dctx, &handle));
1423:       PetscCall(PetscLogGpuTimeBegin());
1424:       PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
1425:       PetscCall(PetscLogGpuTimeEnd());
1426:     }
1427:   }
1428:   PetscCall(PetscLogGpuFlops(N));
1429:   PetscFunctionReturn(PETSC_SUCCESS);
1430: }

1432: template <device::cupm::DeviceType T>
1433: inline PetscErrorCode MatDense_Seq_CUPM<T>::Shift(Mat A, PetscScalar alpha) noexcept
1434: {
1435:   const auto         m = A->rmap->n;
1436:   const auto         n = A->cmap->n;
1437:   PetscDeviceContext dctx;

1439:   PetscFunctionBegin;
1440:   PetscCall(GetHandles_(&dctx));
1441:   PetscCall(PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1442:   PetscCall(DiagonalUnaryTransform(A, 0, m, n, dctx, device::cupm::functors::make_plus_equals(alpha)));
1443:   PetscFunctionReturn(PETSC_SUCCESS);
1444: }

1446: template <device::cupm::DeviceType T>
1447: inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1448: {
1449:   const auto         m_x = X->rmap->n, m_y = Y->rmap->n;
1450:   const auto         n_x = X->cmap->n, n_y = Y->cmap->n;
1451:   const auto         N = m_x * n_x;
1452:   PetscDeviceContext dctx;

1454:   PetscFunctionBegin;
1455:   if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
1456:   PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1457:   PetscCall(GetHandles_(&dctx));
1458:   {
1459:     const auto dx    = DeviceArrayRead(dctx, X);
1460:     const auto dy    = DeviceArrayReadWrite(dctx, Y);
1461:     const auto lda_x = MatIMPLCast(X)->lda;
1462:     const auto lda_y = MatIMPLCast(Y)->lda;

1464:     if (lda_x > m_x || lda_y > m_x) {
1465:       cupmStream_t stream;

1467:       PetscCall(GetHandlesFrom_(dctx, &stream));
1468:       // clang-format off
1469:       PetscCallThrust(
1470:         const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
1471:         const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());

1473:         THRUST_CALL(
1474:           thrust::transform,
1475:           stream,
1476:           sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
1477:           device::cupm::functors::make_axpy(alpha)
1478:         );
1479:       );
1480:       // clang-format on
1481:     } else {
1482:       const auto       cu_alpha = cupmScalarCast(alpha);
1483:       cupmBlasHandle_t handle;

1485:       PetscCall(GetHandlesFrom_(dctx, &handle));
1486:       PetscCall(PetscLogGpuTimeBegin());
1487:       PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
1488:       PetscCall(PetscLogGpuTimeEnd());
1489:     }
1490:   }
1491:   PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1492:   PetscFunctionReturn(PETSC_SUCCESS);
1493: }

1495: template <device::cupm::DeviceType T>
1496: inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1497: {
1498:   const auto         hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1499:   PetscDeviceContext dctx;

1501:   PetscFunctionBegin;
1502:   PetscCall(GetHandles_(&dctx));
1503:   // do not call SetPreallocation() yet, we call it afterwards??
1504:   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1505:   PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1506:   if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1507:   // allocate memory if needed
1508:   if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx));
1509:   PetscFunctionReturn(PETSC_SUCCESS);
1510: }

1512: template <device::cupm::DeviceType T>
1513: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1514: {
1515:   PetscBool device;

1517:   PetscFunctionBegin;
1518:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1519:   if (device) {
1520:     const auto         m = A->rmap->n;
1521:     const auto         n = A->cmap->n;
1522:     PetscDeviceContext dctx;

1524:     PetscCall(GetHandles_(&dctx));
1525:     {
1526:       const auto a = DeviceArrayWrite(dctx, A);
1527:       PetscInt   lda;

1529:       PetscCall(MatDenseGetLDA(A, &lda));
1530:       if (lda > m) {
1531:         for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1532:       } else {
1533:         PetscInt mn;

1535:         PetscCall(PetscIntMultError(m, n, &mn));
1536:         PetscCall(PetscRandomGetValues(rng, mn, a));
1537:       }
1538:     }
1539:   } else {
1540:     PetscCall(MatSetRandom_SeqDense(A, rng));
1541:   }
1542:   PetscFunctionReturn(PETSC_SUCCESS);
1543: }

1545: // ==========================================================================================

1547: template <device::cupm::DeviceType T>
1548: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
1549: {
1550:   const auto         offloadmask = A->offloadmask;
1551:   const auto         n           = A->rmap->n;
1552:   const auto         col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
1553:   PetscBool          viscupm;
1554:   PetscDeviceContext dctx;
1555:   cupmStream_t       stream;

1557:   PetscFunctionBegin;
1558:   PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
1559:   PetscCall(GetHandles_(&dctx, &stream));
1560:   if (viscupm && !v->boundtocpu) {
1561:     const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);

1563:     // update device data
1564:     if (PetscOffloadDevice(offloadmask)) {
1565:       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
1566:     } else {
1567:       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
1568:     }
1569:   } else {
1570:     PetscScalar *x;

1572:     // update host data
1573:     PetscCall(VecGetArrayWrite(v, &x));
1574:     if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
1575:       PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
1576:     } else if (PetscOffloadDevice(offloadmask)) {
1577:       PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
1578:     }
1579:     PetscCall(VecRestoreArrayWrite(v, &x));
1580:   }
1581:   PetscFunctionReturn(PETSC_SUCCESS);
1582: }

1584: template <device::cupm::DeviceType T>
1585: template <PetscMemoryAccessMode access>
1586: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
1587: {
1588:   using namespace vec::cupm;
1589:   const auto         mimpl = MatIMPLCast(A);
1590:   PetscDeviceContext dctx;

1592:   PetscFunctionBegin;
1593:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1594:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1595:   mimpl->vecinuse = col + 1;
1596:   PetscCall(GetHandles_(&dctx));
1597:   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1598:   if (!mimpl->cvec) {
1599:     // we pass the data of A, to prevent allocating needless GPU memory the first time
1600:     // VecCUPMPlaceArray is called
1601:     PetscCall(VecCreateSeqCUPMWithArraysAsync<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->bs, A->rmap->n, nullptr, mimpl->ptrinuse, &mimpl->cvec));
1602:   }
1603:   PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
1604:   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
1605:   *v = mimpl->cvec;
1606:   PetscFunctionReturn(PETSC_SUCCESS);
1607: }

1609: template <device::cupm::DeviceType T>
1610: template <PetscMemoryAccessMode access>
1611: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
1612: {
1613:   using namespace vec::cupm;
1614:   const auto         mimpl = MatIMPLCast(A);
1615:   const auto         cvec  = mimpl->cvec;
1616:   PetscDeviceContext dctx;

1618:   PetscFunctionBegin;
1619:   PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1620:   PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1621:   mimpl->vecinuse = 0;
1622:   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
1623:   PetscCall(VecCUPMResetArrayAsync<T>(cvec));
1624:   PetscCall(GetHandles_(&dctx));
1625:   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1626:   if (v) *v = nullptr;
1627:   PetscFunctionReturn(PETSC_SUCCESS);
1628: }

1630: // ==========================================================================================

1632: template <device::cupm::DeviceType T>
1633: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1634: {
1635:   Mat                fact = nullptr;
1636:   PetscDeviceContext dctx;

1638:   PetscFunctionBegin;
1639:   PetscCall(GetHandles_(&dctx));
1640:   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1641:   fact->factortype = ftype;
1642:   switch (ftype) {
1643:   case MAT_FACTOR_LU:
1644:   case MAT_FACTOR_ILU: // fall-through
1645:     fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
1646:     fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1647:     break;
1648:   case MAT_FACTOR_CHOLESKY:
1649:   case MAT_FACTOR_ICC: // fall-through
1650:     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1651:     break;
1652:   case MAT_FACTOR_QR: {
1653:     const auto pobj = PetscObjectCast(fact);

1655:     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1656:     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1657:   } break;
1658:   case MAT_FACTOR_NONE:
1659:   case MAT_FACTOR_ILUDT:     // fall-through
1660:   case MAT_FACTOR_NUM_TYPES: // fall-through
1661:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1662:   }
1663:   PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1664:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1665:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1666:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1667:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1668:   *fact_out = fact;
1669:   PetscFunctionReturn(PETSC_SUCCESS);
1670: }

1672: template <device::cupm::DeviceType T>
1673: inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
1674: {
1675:   const auto         mimpl = MatIMPLCast(A);
1676:   const auto         mcu   = MatCUPMCast(A);
1677:   const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
1678:   cupmSolverHandle_t handle;
1679:   PetscDeviceContext dctx;
1680:   cupmStream_t       stream;

1682:   PetscFunctionBegin;
1683:   #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
1684:   // HIP appears to have this by default??
1685:   PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
1686:   #endif
1687:   if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1688:   PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
1689:   // spd
1690:   PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());

1692:   PetscCall(GetHandles_(&dctx, &handle, &stream));
1693:   {
1694:     const auto    da  = DeviceArrayReadWrite(dctx, A);
1695:     const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
1696:     cupmBlasInt_t il;

1698:     PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
1699:     if (il > mcu->d_fact_lwork) {
1700:       mcu->d_fact_lwork = il;
1701:       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1702:       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
1703:     }
1704:     PetscCall(PetscLogGpuTimeBegin());
1705:     PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
1706:     PetscCall(PetscLogGpuTimeEnd());
1707:   }
1708:   PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
1709:   // TODO (write cuda kernel)
1710:   PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
1711:   PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

1713:   A->ops->solve          = nullptr;
1714:   A->ops->solvetranspose = nullptr;
1715:   A->ops->matsolve       = nullptr;
1716:   A->factortype          = MAT_FACTOR_NONE;

1718:   PetscCall(PetscFree(A->solvertype));
1719:   PetscFunctionReturn(PETSC_SUCCESS);
1720: }

1722: // ==========================================================================================

1724: template <device::cupm::DeviceType T>
1725: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
1726: {
1727:   const auto         mimpl        = MatIMPLCast(A);
1728:   const auto         array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
1729:   const auto         n            = rend - rbegin;
1730:   const auto         m            = cend - cbegin;
1731:   auto              &cmat         = mimpl->cmat;
1732:   PetscDeviceContext dctx;

1734:   PetscFunctionBegin;
1735:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1736:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1737:   mimpl->matinuse = cbegin + 1;

1739:   PetscCall(GetHandles_(&dctx));
1740:   PetscCall(HostToDevice_(A, dctx));

1742:   if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
1743:   {
1744:     const auto device_array = array_offset(MatCUPMCast(A)->d_v);

1746:     if (cmat) {
1747:       PetscCall(PlaceArray(cmat, device_array));
1748:     } else {
1749:       PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
1750:     }
1751:   }
1752:   PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
1753:   // place CPU array if present but do not copy any data
1754:   if (const auto host_array = mimpl->v) {
1755:     cmat->offloadmask = PETSC_OFFLOAD_GPU;
1756:     PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
1757:   }

1759:   cmat->offloadmask = A->offloadmask;
1760:   *mat              = cmat;
1761:   PetscFunctionReturn(PETSC_SUCCESS);
1762: }

1764: template <device::cupm::DeviceType T>
1765: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
1766: {
1767:   const auto mimpl = MatIMPLCast(A);
1768:   const auto cmat  = mimpl->cmat;
1769:   const auto reset = static_cast<bool>(mimpl->v);
1770:   bool       copy, was_offload_host;

1772:   PetscFunctionBegin;
1773:   PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1774:   PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1775:   PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1776:   mimpl->matinuse = 0;

1778:   // calls to ResetArray may change it, so save it here
1779:   was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
1780:   if (was_offload_host && !reset) {
1781:     copy = true;
1782:     PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
1783:   } else {
1784:     copy = false;
1785:   }

1787:   PetscCall(ResetArray(cmat));
1788:   if (reset) PetscCall(MatDenseResetArray(cmat));
1789:   if (copy) {
1790:     PetscDeviceContext dctx;

1792:     PetscCall(GetHandles_(&dctx));
1793:     PetscCall(DeviceToHost_(A, dctx));
1794:   } else {
1795:     A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1796:   }

1798:   cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1799:   *m                = nullptr;
1800:   PetscFunctionReturn(PETSC_SUCCESS);
1801: }

1803: // ==========================================================================================

1805: namespace
1806: {

1808: template <device::cupm::DeviceType T>
1809: inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1810: {
1811:   PetscFunctionBegin;
1812:   if (TA) {
1813:     if (TB) {
1814:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1815:     } else {
1816:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1817:     }
1818:   } else {
1819:     if (TB) {
1820:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1821:     } else {
1822:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1823:     }
1824:   }
1825:   PetscFunctionReturn(PETSC_SUCCESS);
1826: }

1828: template <device::cupm::DeviceType T>
1829: inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1830: {
1831:   PetscFunctionBegin;
1832:   for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1833:     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1834:     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1835:   }
1836:   PetscFunctionReturn(PETSC_SUCCESS);
1837: }

1839: } // anonymous namespace

1841: } // namespace impl

1843: } // namespace cupm

1845: } // namespace mat

1847: } // namespace Petsc

1849: #endif // __cplusplus

1851: #endif // PETSCMATSEQDENSECUPM_HPP