Actual source code: matseqdensecupm.hpp
1: #ifndef PETSCMATSEQDENSECUPM_HPP
2: #define PETSCMATSEQDENSECUPM_HPP
4: #include <petsc/private/matdensecupmimpl.h>
5: #include <../src/mat/impls/dense/seq/dense.h>
7: #if defined(__cplusplus)
8: #include <petsc/private/deviceimpl.h>
9: #include <petsc/private/randomimpl.h>
10: #include <petsc/private/vecimpl.h>
11: #include <petsc/private/cupmobject.hpp>
12: #include <petsc/private/cupmsolverinterface.hpp>
14: #include <petsc/private/cpp/type_traits.hpp>
15: #include <petsc/private/cpp/utility.hpp>
17: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>
19: namespace Petsc
20: {
22: namespace mat
23: {
25: namespace cupm
26: {
28: namespace impl
29: {
31: template <device::cupm::DeviceType T>
32: class MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
33: public:
34: MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);
36: private:
37: struct Mat_SeqDenseCUPM {
38: PetscScalar *d_v; // pointer to the matrix on the GPU
39: PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
40: bool d_user_alloc;
41: bool d_unplaced_user_alloc;
42: // factorization support
43: cupmBlasInt_t *d_fact_ipiv; // device pivots
44: cupmScalar_t *d_fact_tau; // device QR tau vector
45: cupmBlasInt_t *d_fact_info; // device info
46: cupmScalar_t *d_fact_work; // device workspace
47: cupmBlasInt_t d_fact_lwork; // size of device workspace
48: // workspace
49: Vec workvec;
50: };
52: static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;
54: static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
55: static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;
57: static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;
59: template <typename Derived>
60: struct SolveCommon;
61: struct SolveQR;
62: struct SolveCholesky;
63: struct SolveLU;
65: template <typename Solver, bool transpose>
66: static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
67: template <typename Solver, bool transpose>
68: static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
69: template <bool transpose>
70: static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;
72: template <bool to_host>
73: static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;
75: PETSC_NODISCARD static constexpr MatType MATIMPLCUPM_() noexcept;
76: PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;
78: public:
79: PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;
81: // define these by hand since they don't fit the above mold
82: PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
83: PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;
85: static PetscErrorCode Create(Mat) noexcept;
86: static PetscErrorCode Destroy(Mat) noexcept;
87: static PetscErrorCode SetUp(Mat) noexcept;
88: static PetscErrorCode Reset(Mat) noexcept;
90: static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
91: static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
92: static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;
94: template <PetscMemType, PetscMemoryAccessMode>
95: static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
96: template <PetscMemType, PetscMemoryAccessMode>
97: static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
98: template <PetscMemoryAccessMode>
99: static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
100: template <PetscMemoryAccessMode>
101: static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;
103: private:
104: template <PetscMemType mtype, PetscMemoryAccessMode mode>
105: static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
106: {
107: PetscDeviceContext dctx;
109: PetscFunctionBegin;
110: PetscCall(GetHandles_(&dctx));
111: PetscCall(GetArray<mtype, mode>(m, p, dctx));
112: PetscFunctionReturn(PETSC_SUCCESS);
113: }
115: template <PetscMemType mtype, PetscMemoryAccessMode mode>
116: static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
117: {
118: PetscDeviceContext dctx;
120: PetscFunctionBegin;
121: PetscCall(GetHandles_(&dctx));
122: PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
123: PetscFunctionReturn(PETSC_SUCCESS);
124: }
126: template <PetscMemoryAccessMode mode>
127: static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
128: {
129: PetscDeviceContext dctx;
131: PetscFunctionBegin;
132: PetscCall(GetHandles_(&dctx));
133: PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
134: PetscFunctionReturn(PETSC_SUCCESS);
135: }
137: template <PetscMemoryAccessMode mode>
138: static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
139: {
140: PetscDeviceContext dctx;
142: PetscFunctionBegin;
143: PetscCall(GetHandles_(&dctx));
144: PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
145: PetscFunctionReturn(PETSC_SUCCESS);
146: }
148: public:
149: static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
150: static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
151: static PetscErrorCode ResetArray(Mat) noexcept;
153: template <bool transpose_A, bool transpose_B>
154: static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
155: static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
156: static PetscErrorCode ZeroEntries(Mat) noexcept;
157: static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
158: static PetscErrorCode Shift(Mat, PetscScalar) noexcept;
159: static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
160: static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
161: static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;
163: static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
164: template <PetscMemoryAccessMode>
165: static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
166: template <PetscMemoryAccessMode>
167: static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;
169: static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
170: static PetscErrorCode InvertFactors(Mat) noexcept;
172: static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
173: static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
174: };
176: } // namespace impl
178: namespace
179: {
181: // Declare this here so that the functions below can make use of it
182: template <device::cupm::DeviceType T>
183: inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
184: {
185: PetscFunctionBegin;
186: PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
187: PetscFunctionReturn(PETSC_SUCCESS);
188: }
190: } // anonymous namespace
192: namespace impl
193: {
195: // ==========================================================================================
196: // MatDense_Seq_CUPM - Private API - Utility
197: // ==========================================================================================
199: template <device::cupm::DeviceType T>
200: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
201: {
202: const auto mcu = MatCUPMCast(m);
203: const auto nrows = m->rmap->n;
204: const auto ncols = m->cmap->n;
205: auto &lda = MatIMPLCast(m)->lda;
206: cupmStream_t stream;
208: PetscFunctionBegin;
209: PetscCheckTypeName(m, MATSEQDENSECUPM());
211: PetscCall(checkCupmBlasIntCast(nrows));
212: PetscCall(checkCupmBlasIntCast(ncols));
213: PetscCall(GetHandlesFrom_(dctx, &stream));
214: if (lda <= 0) lda = nrows;
215: if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
216: if (user_device_array) {
217: mcu->d_user_alloc = PETSC_TRUE;
218: mcu->d_v = user_device_array;
219: } else {
220: PetscInt size;
222: mcu->d_user_alloc = PETSC_FALSE;
223: PetscCall(PetscIntMultError(lda, ncols, &size));
224: PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
225: PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
226: }
227: m->offloadmask = PETSC_OFFLOAD_GPU;
228: PetscFunctionReturn(PETSC_SUCCESS);
229: }
231: template <device::cupm::DeviceType T>
232: inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
233: {
234: const auto nrows = m->rmap->n;
235: const auto ncols = m->cmap->n;
236: const auto copy = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;
238: PetscFunctionBegin;
239: PetscCheckTypeName(m, MATSEQDENSECUPM());
240: if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
241: PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
242: if (copy) {
243: const auto mcu = MatCUPMCast(m);
244: cupmStream_t stream;
246: // Allocate GPU memory if not present
247: if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx));
248: PetscCall(GetHandlesFrom_(dctx, &stream));
249: PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
250: {
251: const auto mimpl = MatIMPLCast(m);
252: const auto lda = mimpl->lda;
253: const auto src = mimpl->v;
254: const auto dest = mcu->d_v;
256: if (lda > nrows) {
257: PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
258: } else {
259: PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
260: }
261: }
262: PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
263: // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
264: m->offloadmask = PETSC_OFFLOAD_BOTH;
265: }
266: PetscFunctionReturn(PETSC_SUCCESS);
267: }
269: template <device::cupm::DeviceType T>
270: inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
271: {
272: const auto nrows = m->rmap->n;
273: const auto ncols = m->cmap->n;
274: const auto copy = m->offloadmask == PETSC_OFFLOAD_GPU;
276: PetscFunctionBegin;
277: PetscCheckTypeName(m, MATSEQDENSECUPM());
278: PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
279: if (copy) {
280: const auto mimpl = MatIMPLCast(m);
281: cupmStream_t stream;
283: // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
284: if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
285: PetscCall(GetHandlesFrom_(dctx, &stream));
286: PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
287: {
288: const auto lda = mimpl->lda;
289: const auto dest = mimpl->v;
290: const auto src = MatCUPMCast(m)->d_v;
292: if (lda > nrows) {
293: PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
294: } else {
295: PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
296: }
297: }
298: PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
299: // order is important, MatSeqDenseSetPreallocation() might set offloadmask
300: m->offloadmask = PETSC_OFFLOAD_BOTH;
301: }
302: PetscFunctionReturn(PETSC_SUCCESS);
303: }
305: template <device::cupm::DeviceType T>
306: inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
307: {
308: PetscFunctionBegin;
309: if (PetscDefined(USE_DEBUG)) {
310: cupmBlasInt_t info = 0;
312: PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
313: if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
314: static_assert(std::is_same<decltype(info), int>::value, "");
315: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
316: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
317: }
318: PetscFunctionReturn(PETSC_SUCCESS);
319: }
321: // ==========================================================================================
322: // MatDense_Seq_CUPM - Private API - Solver Dispatch
323: // ==========================================================================================
325: // specific solvers called through the dispatch_() family of functions
326: template <device::cupm::DeviceType T>
327: template <typename Derived>
328: struct MatDense_Seq_CUPM<T>::SolveCommon {
329: using derived_type = Derived;
331: template <typename F>
332: static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
333: {
334: cupmBlasInt_t lwork;
336: PetscFunctionBegin;
337: PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
338: if (lwork > mcu->d_fact_lwork) {
339: mcu->d_fact_lwork = lwork;
340: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
341: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
342: }
343: PetscFunctionReturn(PETSC_SUCCESS);
344: }
346: static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
347: {
348: const auto mcu = MatCUPMCast(A);
350: PetscFunctionBegin;
351: PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
352: A->factortype = derived_type::MATFACTORTYPE();
353: A->ops->solve = MatSolve_Factored_Dispatch_<derived_type, false>;
354: A->ops->solvetranspose = MatSolve_Factored_Dispatch_<derived_type, true>;
355: A->ops->matsolve = MatMatSolve_Factored_Dispatch_<derived_type, false>;
356: A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;
358: PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
359: if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
360: PetscFunctionReturn(PETSC_SUCCESS);
361: }
362: };
364: template <device::cupm::DeviceType T>
365: struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
366: using base_type = SolveCommon<SolveLU>;
368: static constexpr const char *NAME() noexcept { return "LU"; }
369: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }
371: static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
372: {
373: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
374: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
375: cupmStream_t stream;
376: cupmSolverHandle_t handle;
377: PetscDeviceContext dctx;
379: PetscFunctionBegin;
380: if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
381: PetscCall(GetHandles_(&dctx, &handle, &stream));
382: PetscCall(base_type::FactorPrepare(A, stream));
383: {
384: const auto mcu = MatCUPMCast(A);
385: const auto da = DeviceArrayReadWrite(dctx, A);
386: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
388: // clang-format off
389: PetscCall(
390: base_type::ResizeFactLwork(
391: mcu, stream,
392: [&](cupmBlasInt_t *fact_lwork)
393: {
394: return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
395: }
396: )
397: );
398: // clang-format on
399: if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
401: PetscCall(PetscLogGpuTimeBegin());
402: PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
403: PetscCall(PetscLogGpuTimeEnd());
404: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
405: }
406: PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
407: PetscFunctionReturn(PETSC_SUCCESS);
408: }
410: template <bool transpose>
411: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
412: {
413: const auto mcu = MatCUPMCast(A);
414: const auto fact_info = mcu->d_fact_info;
415: const auto fact_ipiv = mcu->d_fact_ipiv;
416: cupmSolverHandle_t handle;
418: PetscFunctionBegin;
419: PetscCall(GetHandlesFrom_(dctx, &handle));
420: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
421: PetscCall(PetscLogGpuTimeBegin());
422: {
423: constexpr auto op = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
424: const auto da = DeviceArrayRead(dctx, A);
425: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
427: // clang-format off
428: PetscCall(
429: base_type::ResizeFactLwork(
430: mcu, stream,
431: [&](cupmBlasInt_t *lwork)
432: {
433: return cupmSolverXgetrs_bufferSize(
434: handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
435: );
436: }
437: )
438: );
439: // clang-format on
440: PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
441: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
442: }
443: PetscCall(PetscLogGpuTimeEnd());
444: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
445: PetscFunctionReturn(PETSC_SUCCESS);
446: }
447: };
449: template <device::cupm::DeviceType T>
450: struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
451: using base_type = SolveCommon<SolveCholesky>;
453: static constexpr const char *NAME() noexcept { return "Cholesky"; }
454: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }
456: static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
457: {
458: const auto n = static_cast<cupmBlasInt_t>(A->rmap->n);
459: PetscDeviceContext dctx;
460: cupmSolverHandle_t handle;
461: cupmStream_t stream;
463: PetscFunctionBegin;
464: if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
465: PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
466: PetscCall(GetHandles_(&dctx, &handle, &stream));
467: PetscCall(base_type::FactorPrepare(A, stream));
468: {
469: const auto mcu = MatCUPMCast(A);
470: const auto da = DeviceArrayReadWrite(dctx, A);
471: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
473: // clang-format off
474: PetscCall(
475: base_type::ResizeFactLwork(
476: mcu, stream,
477: [&](cupmBlasInt_t *fact_lwork)
478: {
479: return cupmSolverXpotrf_bufferSize(
480: handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
481: );
482: }
483: )
484: );
485: // clang-format on
486: PetscCall(PetscLogGpuTimeBegin());
487: PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
488: PetscCall(PetscLogGpuTimeEnd());
489: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
490: }
491: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
493: #if 0
494: // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
495: // and *hetr* routines. The code below should work, and it can be activated when *sytrs
496: // routines will be available
497: if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
498: if (!mcu->d_fact_lwork) {
499: PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
500: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
501: }
502: if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
503: PetscCall(PetscLogGpuTimeBegin());
504: PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
505: PetscCall(PetscLogGpuTimeEnd());
506: #endif
507: PetscFunctionReturn(PETSC_SUCCESS);
508: }
510: template <bool transpose>
511: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
512: {
513: const auto mcu = MatCUPMCast(A);
514: const auto fact_info = mcu->d_fact_info;
515: cupmSolverHandle_t handle;
517: PetscFunctionBegin;
518: PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
519: PetscCall(GetHandlesFrom_(dctx, &handle));
520: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
521: PetscCall(PetscLogGpuTimeBegin());
522: {
523: const auto da = DeviceArrayRead(dctx, A);
524: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
526: // clang-format off
527: PetscCall(
528: base_type::ResizeFactLwork(
529: mcu, stream,
530: [&](cupmBlasInt_t *lwork)
531: {
532: return cupmSolverXpotrs_bufferSize(
533: handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
534: );
535: }
536: )
537: );
538: // clang-format on
539: PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
540: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
541: }
542: PetscCall(PetscLogGpuTimeEnd());
543: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
544: PetscFunctionReturn(PETSC_SUCCESS);
545: }
546: };
548: template <device::cupm::DeviceType T>
549: struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
550: using base_type = SolveCommon<SolveQR>;
552: static constexpr const char *NAME() noexcept { return "QR"; }
553: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }
555: static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
556: {
557: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
558: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
559: const auto min = std::min(m, n);
560: const auto mimpl = MatIMPLCast(A);
561: cupmStream_t stream;
562: cupmSolverHandle_t handle;
563: PetscDeviceContext dctx;
565: PetscFunctionBegin;
566: if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
567: PetscCall(GetHandles_(&dctx, &handle, &stream));
568: PetscCall(base_type::FactorPrepare(A, stream));
569: mimpl->rank = min;
570: {
571: const auto mcu = MatCUPMCast(A);
572: const auto da = DeviceArrayReadWrite(dctx, A);
573: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
575: if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
576: if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
577: // clang-format off
578: PetscCall(
579: base_type::ResizeFactLwork(
580: mcu, stream,
581: [&](cupmBlasInt_t *fact_lwork)
582: {
583: return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
584: }
585: )
586: );
587: // clang-format on
588: PetscCall(PetscLogGpuTimeBegin());
589: PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
590: PetscCall(PetscLogGpuTimeEnd());
591: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
592: }
593: PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
594: PetscFunctionReturn(PETSC_SUCCESS);
595: }
597: template <bool transpose>
598: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
599: {
600: const auto mimpl = MatIMPLCast(A);
601: const auto rank = static_cast<cupmBlasInt_t>(mimpl->rank);
602: const auto mcu = MatCUPMCast(A);
603: const auto fact_info = mcu->d_fact_info;
604: const auto fact_tau = mcu->d_fact_tau;
605: const auto fact_work = mcu->d_fact_work;
606: const auto fact_lwork = mcu->d_fact_lwork;
607: cupmSolverHandle_t solver_handle;
608: cupmBlasHandle_t blas_handle;
610: PetscFunctionBegin;
611: PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
612: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
613: PetscCall(PetscLogGpuTimeBegin());
614: {
615: const auto da = DeviceArrayRead(dctx, A);
616: const auto one = cupmScalarCast(1.0);
617: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
619: if (transpose) {
620: PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
621: PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
622: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
623: } else {
624: constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;
626: PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
627: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
628: PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
629: }
630: }
631: PetscCall(PetscLogGpuTimeEnd());
632: PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
633: PetscFunctionReturn(PETSC_SUCCESS);
634: }
635: };
637: template <device::cupm::DeviceType T>
638: template <typename Solver, bool transpose>
639: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
640: {
641: using namespace vec::cupm;
642: const auto pobj_A = PetscObjectCast(A);
643: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
644: const auto k = static_cast<cupmBlasInt_t>(A->cmap->n);
645: auto &workvec = MatCUPMCast(A)->workvec;
646: PetscScalar *y_array = nullptr;
647: PetscDeviceContext dctx;
648: PetscBool xiscupm, yiscupm, aiscupm;
649: bool use_y_array_directly;
650: cupmStream_t stream;
652: PetscFunctionBegin;
653: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
654: PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
655: PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
656: PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
657: PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
658: PetscCall(GetHandles_(&dctx, &stream));
659: use_y_array_directly = yiscupm && (k >= m);
660: {
661: const PetscScalar *x_array;
662: const auto xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
663: const auto copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
665: if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
666: // The logic here is to try to minimize the amount of memory copying:
667: //
668: // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
669: // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
670: // data in order to copy it into the y array. So the array x will be wherever the data
671: // already is so that only one memcpy is performed
672: if (xisdevice) {
673: PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
674: } else {
675: PetscCall(VecGetArrayRead(x, &x_array));
676: }
677: PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
678: PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
679: if (xisdevice) {
680: PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
681: } else {
682: PetscCall(VecRestoreArrayRead(x, &x_array));
683: }
684: }
686: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
687: PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
688: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
690: if (use_y_array_directly) {
691: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
692: } else {
693: const auto copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
694: PetscScalar *yv;
696: // The logic here is that the data is not yet in either y's GPU array or its CPU array.
697: // There is nothing in the interface to say where the user would like it to end up. So we
698: // choose the GPU, because it is the faster option
699: if (yiscupm) {
700: PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
701: } else {
702: PetscCall(VecGetArray(y, &yv));
703: }
704: PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
705: if (yiscupm) {
706: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
707: } else {
708: PetscCall(VecRestoreArray(y, &yv));
709: }
710: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
711: }
712: PetscFunctionReturn(PETSC_SUCCESS);
713: }
715: template <device::cupm::DeviceType T>
716: template <typename Solver, bool transpose>
717: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
718: {
719: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
720: const auto k = static_cast<cupmBlasInt_t>(A->cmap->n);
721: cupmBlasInt_t nrhs, ldb, ldx, ldy;
722: PetscScalar *y;
723: PetscBool biscupm, xiscupm, aiscupm;
724: PetscDeviceContext dctx;
725: cupmStream_t stream;
727: PetscFunctionBegin;
728: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
729: PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
730: PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
731: PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
732: PetscCall(GetHandles_(&dctx, &stream));
733: {
734: PetscInt n;
736: PetscCall(MatGetSize(B, nullptr, &n));
737: PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
738: PetscCall(MatDenseGetLDA(B, &n));
739: PetscCall(PetscCUPMBlasIntCast(n, &ldb));
740: PetscCall(MatDenseGetLDA(X, &n));
741: PetscCall(PetscCUPMBlasIntCast(n, &ldx));
742: }
743: {
744: // The logic here is to try to minimize the amount of memory copying:
745: //
746: // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
747: // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
748: // get the data in order to copy it into the y array. So the array b will be wherever the
749: // data already is so that only one memcpy is performed
750: const auto bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
751: const auto copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
752: const PetscScalar *b;
754: if (bisdevice) {
755: b = DeviceArrayRead(dctx, B);
756: } else if (biscupm) {
757: b = HostArrayRead(dctx, B);
758: } else {
759: PetscCall(MatDenseGetArrayRead(B, &b));
760: }
762: if (ldx < m || !xiscupm) {
763: // X's array cannot serve as the array (too small or not on device), B's array cannot
764: // serve as the array (const), so allocate a new array
765: ldy = m;
766: PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
767: } else {
768: // X's array should serve as the array
769: ldy = ldx;
770: y = DeviceArrayWrite(dctx, X);
771: }
772: PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
773: if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
774: }
776: // convert to CUPM twice??????????????????????????????????
777: // but A should already be CUPM??????????????????????????????????????
778: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
779: PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
780: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
782: if (ldx < m || !xiscupm) {
783: const auto copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
784: PetscScalar *x;
786: // The logic here is that the data is not yet in either X's GPU array or its CPU
787: // array. There is nothing in the interface to say where the user would like it to end up.
788: // So we choose the GPU, because it is the faster option
789: if (xiscupm) {
790: x = DeviceArrayWrite(dctx, X);
791: } else {
792: PetscCall(MatDenseGetArray(X, &x));
793: }
794: PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
795: if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
796: PetscCallCUPM(cupmFreeAsync(y, stream));
797: }
798: PetscFunctionReturn(PETSC_SUCCESS);
799: }
801: template <device::cupm::DeviceType T>
802: template <bool transpose>
803: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
804: {
805: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
806: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
807: cupmBlasHandle_t handle;
808: PetscDeviceContext dctx;
810: PetscFunctionBegin;
811: if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add
812: if (!m || !n) {
813: // mult only
814: if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0));
815: PetscFunctionReturn(PETSC_SUCCESS);
816: }
817: PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
818: PetscCall(GetHandles_(&dctx, &handle));
819: {
820: constexpr auto op = transpose ? CUPMBLAS_OP_T : CUPMBLAS_OP_N;
821: const auto one = cupmScalarCast(1.0);
822: const auto zero = cupmScalarCast(0.0);
823: const auto da = DeviceArrayRead(dctx, A);
824: const auto dxx = VecSeq_CUPM::DeviceArrayRead(dctx, xx);
825: const auto dzz = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz);
827: PetscCall(PetscLogGpuTimeBegin());
828: PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata(), static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda), dxx.cupmdata(), 1, (yy ? &one : &zero), dzz.cupmdata(), 1));
829: PetscCall(PetscLogGpuTimeEnd());
830: }
831: PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
832: PetscFunctionReturn(PETSC_SUCCESS);
833: }
835: // ==========================================================================================
836: // MatDense_Seq_CUPM - Private API - Conversion Dispatch
837: // ==========================================================================================
839: template <device::cupm::DeviceType T>
840: template <bool to_host>
841: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
842: {
843: PetscFunctionBegin;
844: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
845: // TODO these cases should be optimized
846: PetscCall(MatConvert_Basic(M, type, reuse, newmat));
847: } else {
848: const auto B = *newmat;
849: const auto pobj = PetscObjectCast(B);
851: if (to_host) {
852: PetscCall(BindToCPU(B, PETSC_TRUE));
853: PetscCall(Reset(B));
854: } else {
855: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
856: }
858: PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
859: PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
860: // cvec might be the wrong VecType, destroy and rebuild it if necessary
861: // REVIEW ME: this is possibly very inefficient
862: PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));
864: MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
865: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
866: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
867: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
868: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
869: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
870: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
871: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
872: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
873: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
874: MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
876: if (to_host) {
877: B->offloadmask = PETSC_OFFLOAD_CPU;
878: } else {
879: Mat_SeqDenseCUPM *mcu;
881: PetscCall(PetscNew(&mcu));
882: B->spptr = mcu;
883: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
884: PetscCall(BindToCPU(B, PETSC_FALSE));
885: }
887: MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
888: MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
889: }
890: PetscFunctionReturn(PETSC_SUCCESS);
891: }
893: // ==========================================================================================
894: // MatDense_Seq_CUPM - Public API
895: // ==========================================================================================
897: template <device::cupm::DeviceType T>
898: inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
899: {
900: return MATSEQDENSECUPM();
901: }
903: template <device::cupm::DeviceType T>
904: inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
905: {
906: return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
907: }
909: template <device::cupm::DeviceType T>
910: inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
911: {
912: return static_cast<Mat_SeqDense *>(m->data);
913: }
915: template <device::cupm::DeviceType T>
916: inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
917: {
918: return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
919: }
921: template <device::cupm::DeviceType T>
922: inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
923: {
924: return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
925: }
927: // ==========================================================================================
929: // MatCreate_SeqDenseCUPM()
930: template <device::cupm::DeviceType T>
931: inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
932: {
933: PetscFunctionBegin;
934: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
935: PetscCall(MatCreate_SeqDense(A));
936: PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
937: PetscFunctionReturn(PETSC_SUCCESS);
938: }
940: template <device::cupm::DeviceType T>
941: inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
942: {
943: PetscFunctionBegin;
944: // prevent copying back data if we own the data pointer
945: if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
946: PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
947: PetscCall(MatDestroy_SeqDense(A));
948: PetscFunctionReturn(PETSC_SUCCESS);
949: }
951: // obj->ops->setup()
952: template <device::cupm::DeviceType T>
953: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
954: {
955: PetscFunctionBegin;
956: PetscCall(PetscLayoutSetUp(A->rmap));
957: PetscCall(PetscLayoutSetUp(A->cmap));
958: if (!A->preallocated) {
959: PetscDeviceContext dctx;
961: PetscCall(GetHandles_(&dctx));
962: PetscCall(SetPreallocation(A, dctx));
963: }
964: PetscFunctionReturn(PETSC_SUCCESS);
965: }
967: template <device::cupm::DeviceType T>
968: inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
969: {
970: PetscFunctionBegin;
971: if (const auto mcu = MatCUPMCast(A)) {
972: cupmStream_t stream;
974: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
975: PetscCall(GetHandles_(&stream));
976: if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
977: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
978: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
979: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
980: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
981: PetscCall(VecDestroy(&mcu->workvec));
982: PetscCall(PetscFree(A->spptr /* mcu */));
983: }
984: PetscFunctionReturn(PETSC_SUCCESS);
985: }
987: // ==========================================================================================
989: template <device::cupm::DeviceType T>
990: inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
991: {
992: const auto mimpl = MatIMPLCast(A);
993: const auto pobj = PetscObjectCast(A);
995: PetscFunctionBegin;
996: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
997: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
998: A->boundtocpu = to_host;
999: PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
1000: if (to_host) {
1001: PetscDeviceContext dctx;
1003: // make sure we have an up-to-date copy on the CPU
1004: PetscCall(GetHandles_(&dctx));
1005: PetscCall(DeviceToHost_(A, dctx));
1006: } else {
1007: PetscBool iscupm;
1009: if (auto &cvec = mimpl->cvec) {
1010: PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
1011: if (!iscupm) PetscCall(VecDestroy(&cvec));
1012: }
1013: if (auto &cmat = mimpl->cmat) {
1014: PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
1015: if (!iscupm) PetscCall(MatDestroy(&cmat));
1016: }
1017: }
1019: // ============================================================
1020: // Composed ops
1021: // ============================================================
1022: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1023: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
1024: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1025: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1026: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1027: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1028: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1029: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1030: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1031: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1032: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1033: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
1034: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
1035: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1036: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1037: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
1038: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
1039: MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
1040: // always the same
1041: PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1043: // ============================================================
1044: // Function pointer ops
1045: // ============================================================
1046: MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
1047: MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false>(A, xx, nullptr, yy); });
1048: MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true>(A, xx, nullptr, yy); });
1049: MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false>);
1050: MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true>);
1051: MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
1052: MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
1053: MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
1054: MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
1055: MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
1056: MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
1057: MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
1058: MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
1059: MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
1060: MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
1061: MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
1062: MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
1063: MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
1064: // seemingly always the same
1065: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1067: if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
1068: PetscFunctionReturn(PETSC_SUCCESS);
1069: }
1071: template <device::cupm::DeviceType T>
1072: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1073: {
1074: PetscFunctionBegin;
1075: PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
1076: PetscFunctionReturn(PETSC_SUCCESS);
1077: }
1079: template <device::cupm::DeviceType T>
1080: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1081: {
1082: PetscFunctionBegin;
1083: PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
1084: PetscFunctionReturn(PETSC_SUCCESS);
1085: }
1087: // ==========================================================================================
1089: template <device::cupm::DeviceType T>
1090: template <PetscMemType mtype, PetscMemoryAccessMode access>
1091: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1092: {
1093: constexpr auto hostmem = PetscMemTypeHost(mtype);
1094: constexpr auto read_access = PetscMemoryAccessRead(access);
1096: PetscFunctionBegin;
1097: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1098: PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
1099: if (hostmem) {
1100: if (read_access) {
1101: PetscCall(DeviceToHost_(m, dctx));
1102: } else if (!MatIMPLCast(m)->v) {
1103: // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
1104: PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
1105: }
1106: *array = MatIMPLCast(m)->v;
1107: } else {
1108: if (read_access) {
1109: PetscCall(HostToDevice_(m, dctx));
1110: } else if (!MatCUPMCast(m)->d_v) {
1111: // write-only
1112: PetscCall(SetPreallocation(m, dctx, nullptr));
1113: }
1114: *array = MatCUPMCast(m)->d_v;
1115: }
1116: if (PetscMemoryAccessWrite(access)) {
1117: m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1118: PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1119: }
1120: PetscFunctionReturn(PETSC_SUCCESS);
1121: }
1123: template <device::cupm::DeviceType T>
1124: template <PetscMemType mtype, PetscMemoryAccessMode access>
1125: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1126: {
1127: PetscFunctionBegin;
1128: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1129: if (PetscMemoryAccessWrite(access)) {
1130: // WRITE or READ_WRITE
1131: m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1132: PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1133: }
1134: if (array) {
1135: PetscCall(CheckPointerMatchesMemType_(*array, mtype));
1136: *array = nullptr;
1137: }
1138: PetscFunctionReturn(PETSC_SUCCESS);
1139: }
1141: template <device::cupm::DeviceType T>
1142: template <PetscMemoryAccessMode access>
1143: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
1144: {
1145: PetscFunctionBegin;
1146: PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1147: if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
1148: PetscFunctionReturn(PETSC_SUCCESS);
1149: }
1151: template <device::cupm::DeviceType T>
1152: template <PetscMemoryAccessMode access>
1153: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1154: {
1155: PetscFunctionBegin;
1156: PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1157: PetscFunctionReturn(PETSC_SUCCESS);
1158: }
1160: // ==========================================================================================
1162: template <device::cupm::DeviceType T>
1163: inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
1164: {
1165: const auto mimpl = MatIMPLCast(A);
1166: const auto mcu = MatCUPMCast(A);
1168: PetscFunctionBegin;
1169: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1170: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1171: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1172: if (mimpl->v) {
1173: PetscDeviceContext dctx;
1175: PetscCall(GetHandles_(&dctx));
1176: PetscCall(HostToDevice_(A, dctx));
1177: }
1178: mcu->unplacedarray = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
1179: mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
1180: PetscFunctionReturn(PETSC_SUCCESS);
1181: }
1183: template <device::cupm::DeviceType T>
1184: inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
1185: {
1186: const auto mimpl = MatIMPLCast(A);
1187: const auto mcu = MatCUPMCast(A);
1189: PetscFunctionBegin;
1190: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1191: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1192: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1193: if (!mcu->d_user_alloc) {
1194: cupmStream_t stream;
1196: PetscCall(GetHandles_(&stream));
1197: PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1198: }
1199: mcu->d_v = const_cast<PetscScalar *>(array);
1200: mcu->d_user_alloc = PETSC_FALSE;
1201: PetscFunctionReturn(PETSC_SUCCESS);
1202: }
1204: template <device::cupm::DeviceType T>
1205: inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
1206: {
1207: const auto mimpl = MatIMPLCast(A);
1208: const auto mcu = MatCUPMCast(A);
1210: PetscFunctionBegin;
1211: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1212: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1213: if (mimpl->v) {
1214: PetscDeviceContext dctx;
1216: PetscCall(GetHandles_(&dctx));
1217: PetscCall(HostToDevice_(A, dctx));
1218: }
1219: mcu->d_v = util::exchange(mcu->unplacedarray, nullptr);
1220: mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
1221: PetscFunctionReturn(PETSC_SUCCESS);
1222: }
1224: // ==========================================================================================
1226: template <device::cupm::DeviceType T>
1227: template <bool transpose_A, bool transpose_B>
1228: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
1229: {
1230: cupmBlasInt_t m, n, k;
1231: PetscBool Aiscupm, Biscupm;
1232: PetscDeviceContext dctx;
1233: cupmBlasHandle_t handle;
1235: PetscFunctionBegin;
1236: PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
1237: PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
1238: PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
1239: if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
1241: // we may end up with SEQDENSE as one of the arguments
1242: // REVIEW ME: how? and why is it not B and C????????
1243: PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
1244: PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
1245: if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
1246: if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
1247: PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
1248: PetscCall(GetHandles_(&dctx, &handle));
1250: PetscCall(PetscLogGpuTimeBegin());
1251: {
1252: const auto one = cupmScalarCast(1.0);
1253: const auto zero = cupmScalarCast(0.0);
1254: const auto da = DeviceArrayRead(dctx, A);
1255: const auto db = DeviceArrayRead(dctx, B);
1256: const auto dc = DeviceArrayWrite(dctx, C);
1257: PetscInt alda, blda, clda;
1259: PetscCall(MatDenseGetLDA(A, &alda));
1260: PetscCall(MatDenseGetLDA(B, &blda));
1261: PetscCall(MatDenseGetLDA(C, &clda));
1262: PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
1263: }
1264: PetscCall(PetscLogGpuTimeEnd());
1266: PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
1267: if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1268: if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1269: PetscFunctionReturn(PETSC_SUCCESS);
1270: }
1272: template <device::cupm::DeviceType T>
1273: inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
1274: {
1275: const auto m = A->rmap->n;
1276: const auto n = A->cmap->n;
1278: PetscFunctionBegin;
1279: PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
1280: // The two matrices must have the same copy implementation to be eligible for fast copy
1281: if (A->ops->copy == B->ops->copy) {
1282: PetscDeviceContext dctx;
1283: cupmStream_t stream;
1285: PetscCall(GetHandles_(&dctx, &stream));
1286: PetscCall(PetscLogGpuTimeBegin());
1287: {
1288: const auto va = DeviceArrayRead(dctx, A);
1289: const auto vb = DeviceArrayWrite(dctx, B);
1290: // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
1291: // lda!
1292: const auto lda_a = MatIMPLCast(A)->lda;
1293: const auto lda_b = MatIMPLCast(B)->lda;
1295: if (lda_a > m || lda_b > m) {
1296: PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
1297: PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
1298: PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
1299: } else {
1300: PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
1301: }
1302: }
1303: PetscCall(PetscLogGpuTimeEnd());
1304: } else {
1305: PetscCall(MatCopy_Basic(A, B, str));
1306: }
1307: PetscFunctionReturn(PETSC_SUCCESS);
1308: }
1310: template <device::cupm::DeviceType T>
1311: inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1312: {
1313: PetscDeviceContext dctx;
1314: cupmStream_t stream;
1316: PetscFunctionBegin;
1317: PetscCall(GetHandles_(&dctx, &stream));
1318: PetscCall(PetscLogGpuTimeBegin());
1319: {
1320: const auto va = DeviceArrayWrite(dctx, m);
1321: const auto lda = MatIMPLCast(m)->lda;
1322: const auto ma = m->rmap->n;
1323: const auto na = m->cmap->n;
1325: if (lda > ma) {
1326: PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1327: } else {
1328: PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1329: }
1330: }
1331: PetscCall(PetscLogGpuTimeEnd());
1332: PetscFunctionReturn(PETSC_SUCCESS);
1333: }
1335: namespace detail
1336: {
1338: // ==========================================================================================
1339: // SubMatIndexFunctor
1340: //
1341: // Iterator which permutes a linear index range into matrix indices for am nrows x ncols
1342: // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1343: // the i'th sequential entry in the matrix.
1344: // ==========================================================================================
1345: template <typename T>
1346: struct SubMatIndexFunctor {
1347: PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }
1349: PetscInt nrows;
1350: PetscInt ncols;
1351: PetscInt lda;
1352: };
1354: template <typename Iterator>
1355: struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> {
1356: using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>;
1358: using iterator = typename base_type::iterator;
1360: constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
1361: base_type{
1362: std::move(first), std::move(last), {nrows, ncols, lda}
1363: }
1364: {
1365: }
1367: PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
1368: };
1370: namespace
1371: {
1373: template <typename T>
1374: PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1375: {
1376: const auto nrows = rend - rstart;
1377: const auto ncols = cend - cstart;
1378: const auto dptr = thrust::device_pointer_cast(ptr);
1380: return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1381: }
1383: } // namespace
1385: } // namespace detail
1387: template <device::cupm::DeviceType T>
1388: inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
1389: {
1390: const auto m = A->rmap->n;
1391: const auto n = A->cmap->n;
1392: const auto N = m * n;
1393: PetscDeviceContext dctx;
1395: PetscFunctionBegin;
1396: PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1397: PetscCall(GetHandles_(&dctx));
1398: {
1399: const auto da = DeviceArrayReadWrite(dctx, A);
1400: const auto lda = MatIMPLCast(A)->lda;
1402: if (lda > m) {
1403: cupmStream_t stream;
1405: PetscCall(GetHandlesFrom_(dctx, &stream));
1406: // clang-format off
1407: PetscCallThrust(
1408: const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());
1410: THRUST_CALL(
1411: thrust::transform,
1412: stream,
1413: sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1414: device::cupm::functors::make_times_equals(alpha)
1415: )
1416: );
1417: // clang-format on
1418: } else {
1419: const auto cu_alpha = cupmScalarCast(alpha);
1420: cupmBlasHandle_t handle;
1422: PetscCall(GetHandlesFrom_(dctx, &handle));
1423: PetscCall(PetscLogGpuTimeBegin());
1424: PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
1425: PetscCall(PetscLogGpuTimeEnd());
1426: }
1427: }
1428: PetscCall(PetscLogGpuFlops(N));
1429: PetscFunctionReturn(PETSC_SUCCESS);
1430: }
1432: template <device::cupm::DeviceType T>
1433: inline PetscErrorCode MatDense_Seq_CUPM<T>::Shift(Mat A, PetscScalar alpha) noexcept
1434: {
1435: const auto m = A->rmap->n;
1436: const auto n = A->cmap->n;
1437: PetscDeviceContext dctx;
1439: PetscFunctionBegin;
1440: PetscCall(GetHandles_(&dctx));
1441: PetscCall(PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1442: PetscCall(DiagonalUnaryTransform(A, 0, m, n, dctx, device::cupm::functors::make_plus_equals(alpha)));
1443: PetscFunctionReturn(PETSC_SUCCESS);
1444: }
1446: template <device::cupm::DeviceType T>
1447: inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1448: {
1449: const auto m_x = X->rmap->n, m_y = Y->rmap->n;
1450: const auto n_x = X->cmap->n, n_y = Y->cmap->n;
1451: const auto N = m_x * n_x;
1452: PetscDeviceContext dctx;
1454: PetscFunctionBegin;
1455: if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
1456: PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1457: PetscCall(GetHandles_(&dctx));
1458: {
1459: const auto dx = DeviceArrayRead(dctx, X);
1460: const auto dy = DeviceArrayReadWrite(dctx, Y);
1461: const auto lda_x = MatIMPLCast(X)->lda;
1462: const auto lda_y = MatIMPLCast(Y)->lda;
1464: if (lda_x > m_x || lda_y > m_x) {
1465: cupmStream_t stream;
1467: PetscCall(GetHandlesFrom_(dctx, &stream));
1468: // clang-format off
1469: PetscCallThrust(
1470: const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
1471: const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());
1473: THRUST_CALL(
1474: thrust::transform,
1475: stream,
1476: sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
1477: device::cupm::functors::make_axpy(alpha)
1478: );
1479: );
1480: // clang-format on
1481: } else {
1482: const auto cu_alpha = cupmScalarCast(alpha);
1483: cupmBlasHandle_t handle;
1485: PetscCall(GetHandlesFrom_(dctx, &handle));
1486: PetscCall(PetscLogGpuTimeBegin());
1487: PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
1488: PetscCall(PetscLogGpuTimeEnd());
1489: }
1490: }
1491: PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1492: PetscFunctionReturn(PETSC_SUCCESS);
1493: }
1495: template <device::cupm::DeviceType T>
1496: inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1497: {
1498: const auto hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1499: PetscDeviceContext dctx;
1501: PetscFunctionBegin;
1502: PetscCall(GetHandles_(&dctx));
1503: // do not call SetPreallocation() yet, we call it afterwards??
1504: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1505: PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1506: if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1507: // allocate memory if needed
1508: if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx));
1509: PetscFunctionReturn(PETSC_SUCCESS);
1510: }
1512: template <device::cupm::DeviceType T>
1513: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1514: {
1515: PetscBool device;
1517: PetscFunctionBegin;
1518: PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1519: if (device) {
1520: const auto m = A->rmap->n;
1521: const auto n = A->cmap->n;
1522: PetscDeviceContext dctx;
1524: PetscCall(GetHandles_(&dctx));
1525: {
1526: const auto a = DeviceArrayWrite(dctx, A);
1527: PetscInt lda;
1529: PetscCall(MatDenseGetLDA(A, &lda));
1530: if (lda > m) {
1531: for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1532: } else {
1533: PetscInt mn;
1535: PetscCall(PetscIntMultError(m, n, &mn));
1536: PetscCall(PetscRandomGetValues(rng, mn, a));
1537: }
1538: }
1539: } else {
1540: PetscCall(MatSetRandom_SeqDense(A, rng));
1541: }
1542: PetscFunctionReturn(PETSC_SUCCESS);
1543: }
1545: // ==========================================================================================
1547: template <device::cupm::DeviceType T>
1548: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
1549: {
1550: const auto offloadmask = A->offloadmask;
1551: const auto n = A->rmap->n;
1552: const auto col_offset = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
1553: PetscBool viscupm;
1554: PetscDeviceContext dctx;
1555: cupmStream_t stream;
1557: PetscFunctionBegin;
1558: PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
1559: PetscCall(GetHandles_(&dctx, &stream));
1560: if (viscupm && !v->boundtocpu) {
1561: const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
1563: // update device data
1564: if (PetscOffloadDevice(offloadmask)) {
1565: PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
1566: } else {
1567: PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
1568: }
1569: } else {
1570: PetscScalar *x;
1572: // update host data
1573: PetscCall(VecGetArrayWrite(v, &x));
1574: if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
1575: PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
1576: } else if (PetscOffloadDevice(offloadmask)) {
1577: PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
1578: }
1579: PetscCall(VecRestoreArrayWrite(v, &x));
1580: }
1581: PetscFunctionReturn(PETSC_SUCCESS);
1582: }
1584: template <device::cupm::DeviceType T>
1585: template <PetscMemoryAccessMode access>
1586: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
1587: {
1588: using namespace vec::cupm;
1589: const auto mimpl = MatIMPLCast(A);
1590: PetscDeviceContext dctx;
1592: PetscFunctionBegin;
1593: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1594: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1595: mimpl->vecinuse = col + 1;
1596: PetscCall(GetHandles_(&dctx));
1597: PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1598: if (!mimpl->cvec) {
1599: // we pass the data of A, to prevent allocating needless GPU memory the first time
1600: // VecCUPMPlaceArray is called
1601: PetscCall(VecCreateSeqCUPMWithArraysAsync<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->bs, A->rmap->n, nullptr, mimpl->ptrinuse, &mimpl->cvec));
1602: }
1603: PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
1604: if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
1605: *v = mimpl->cvec;
1606: PetscFunctionReturn(PETSC_SUCCESS);
1607: }
1609: template <device::cupm::DeviceType T>
1610: template <PetscMemoryAccessMode access>
1611: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
1612: {
1613: using namespace vec::cupm;
1614: const auto mimpl = MatIMPLCast(A);
1615: const auto cvec = mimpl->cvec;
1616: PetscDeviceContext dctx;
1618: PetscFunctionBegin;
1619: PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1620: PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1621: mimpl->vecinuse = 0;
1622: if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
1623: PetscCall(VecCUPMResetArrayAsync<T>(cvec));
1624: PetscCall(GetHandles_(&dctx));
1625: PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1626: if (v) *v = nullptr;
1627: PetscFunctionReturn(PETSC_SUCCESS);
1628: }
1630: // ==========================================================================================
1632: template <device::cupm::DeviceType T>
1633: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1634: {
1635: Mat fact = nullptr;
1636: PetscDeviceContext dctx;
1638: PetscFunctionBegin;
1639: PetscCall(GetHandles_(&dctx));
1640: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1641: fact->factortype = ftype;
1642: switch (ftype) {
1643: case MAT_FACTOR_LU:
1644: case MAT_FACTOR_ILU: // fall-through
1645: fact->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1646: fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1647: break;
1648: case MAT_FACTOR_CHOLESKY:
1649: case MAT_FACTOR_ICC: // fall-through
1650: fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1651: break;
1652: case MAT_FACTOR_QR: {
1653: const auto pobj = PetscObjectCast(fact);
1655: PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1656: PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1657: } break;
1658: case MAT_FACTOR_NONE:
1659: case MAT_FACTOR_ILUDT: // fall-through
1660: case MAT_FACTOR_NUM_TYPES: // fall-through
1661: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1662: }
1663: PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1664: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1665: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1666: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1667: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1668: *fact_out = fact;
1669: PetscFunctionReturn(PETSC_SUCCESS);
1670: }
1672: template <device::cupm::DeviceType T>
1673: inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
1674: {
1675: const auto mimpl = MatIMPLCast(A);
1676: const auto mcu = MatCUPMCast(A);
1677: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
1678: cupmSolverHandle_t handle;
1679: PetscDeviceContext dctx;
1680: cupmStream_t stream;
1682: PetscFunctionBegin;
1683: #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
1684: // HIP appears to have this by default??
1685: PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
1686: #endif
1687: if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1688: PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
1689: // spd
1690: PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());
1692: PetscCall(GetHandles_(&dctx, &handle, &stream));
1693: {
1694: const auto da = DeviceArrayReadWrite(dctx, A);
1695: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
1696: cupmBlasInt_t il;
1698: PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
1699: if (il > mcu->d_fact_lwork) {
1700: mcu->d_fact_lwork = il;
1701: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1702: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
1703: }
1704: PetscCall(PetscLogGpuTimeBegin());
1705: PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
1706: PetscCall(PetscLogGpuTimeEnd());
1707: }
1708: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
1709: // TODO (write cuda kernel)
1710: PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
1711: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
1713: A->ops->solve = nullptr;
1714: A->ops->solvetranspose = nullptr;
1715: A->ops->matsolve = nullptr;
1716: A->factortype = MAT_FACTOR_NONE;
1718: PetscCall(PetscFree(A->solvertype));
1719: PetscFunctionReturn(PETSC_SUCCESS);
1720: }
1722: // ==========================================================================================
1724: template <device::cupm::DeviceType T>
1725: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
1726: {
1727: const auto mimpl = MatIMPLCast(A);
1728: const auto array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
1729: const auto n = rend - rbegin;
1730: const auto m = cend - cbegin;
1731: auto &cmat = mimpl->cmat;
1732: PetscDeviceContext dctx;
1734: PetscFunctionBegin;
1735: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1736: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1737: mimpl->matinuse = cbegin + 1;
1739: PetscCall(GetHandles_(&dctx));
1740: PetscCall(HostToDevice_(A, dctx));
1742: if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
1743: {
1744: const auto device_array = array_offset(MatCUPMCast(A)->d_v);
1746: if (cmat) {
1747: PetscCall(PlaceArray(cmat, device_array));
1748: } else {
1749: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
1750: }
1751: }
1752: PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
1753: // place CPU array if present but do not copy any data
1754: if (const auto host_array = mimpl->v) {
1755: cmat->offloadmask = PETSC_OFFLOAD_GPU;
1756: PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
1757: }
1759: cmat->offloadmask = A->offloadmask;
1760: *mat = cmat;
1761: PetscFunctionReturn(PETSC_SUCCESS);
1762: }
1764: template <device::cupm::DeviceType T>
1765: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
1766: {
1767: const auto mimpl = MatIMPLCast(A);
1768: const auto cmat = mimpl->cmat;
1769: const auto reset = static_cast<bool>(mimpl->v);
1770: bool copy, was_offload_host;
1772: PetscFunctionBegin;
1773: PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1774: PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1775: PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1776: mimpl->matinuse = 0;
1778: // calls to ResetArray may change it, so save it here
1779: was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
1780: if (was_offload_host && !reset) {
1781: copy = true;
1782: PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
1783: } else {
1784: copy = false;
1785: }
1787: PetscCall(ResetArray(cmat));
1788: if (reset) PetscCall(MatDenseResetArray(cmat));
1789: if (copy) {
1790: PetscDeviceContext dctx;
1792: PetscCall(GetHandles_(&dctx));
1793: PetscCall(DeviceToHost_(A, dctx));
1794: } else {
1795: A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1796: }
1798: cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1799: *m = nullptr;
1800: PetscFunctionReturn(PETSC_SUCCESS);
1801: }
1803: // ==========================================================================================
1805: namespace
1806: {
1808: template <device::cupm::DeviceType T>
1809: inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1810: {
1811: PetscFunctionBegin;
1812: if (TA) {
1813: if (TB) {
1814: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1815: } else {
1816: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1817: }
1818: } else {
1819: if (TB) {
1820: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1821: } else {
1822: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1823: }
1824: }
1825: PetscFunctionReturn(PETSC_SUCCESS);
1826: }
1828: template <device::cupm::DeviceType T>
1829: inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1830: {
1831: PetscFunctionBegin;
1832: for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1833: PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1834: PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1835: }
1836: PetscFunctionReturn(PETSC_SUCCESS);
1837: }
1839: } // anonymous namespace
1841: } // namespace impl
1843: } // namespace cupm
1845: } // namespace mat
1847: } // namespace Petsc
1849: #endif // __cplusplus
1851: #endif // PETSCMATSEQDENSECUPM_HPP