Actual source code: aijkok.kokkos.cxx

  1: #include <petscvec_kokkos.hpp>
  2: #include <petscpkg_version.h>
  3: #include <petsc/private/petscimpl.h>
  4: #include <petsc/private/sfimpl.h>
  5: #include <petscsystypes.h>
  6: #include <petscerror.h>

  8: #include <Kokkos_Core.hpp>
  9: #include <KokkosBlas.hpp>
 10: #include <KokkosSparse_CrsMatrix.hpp>
 11: #include <KokkosSparse_spmv.hpp>
 12: #include <KokkosSparse_spiluk.hpp>
 13: #include <KokkosSparse_sptrsv.hpp>
 14: #include <KokkosSparse_spgemm.hpp>
 15: #include <KokkosSparse_spadd.hpp>

 17: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

 19: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
 20:   #include <KokkosSparse_Utils.hpp>
 21: using KokkosSparse::sort_crs_matrix;
 22: using KokkosSparse::Impl::transpose_matrix;
 23: #else
 24:   #include <KokkosKernels_Sorting.hpp>
 25: using KokkosKernels::sort_crs_matrix;
 26: using KokkosKernels::Impl::transpose_matrix;
 27: #endif

 29: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

 31: /* MatAssemblyEnd_SeqAIJKokkos() is called once the nonzeros of the matrix are finalized, either after
 32:    we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
 33:    In the latter case, it is important to set a_dual's sync state correctly.
 34:  */
 35: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
 36: {
 37:   Mat_SeqAIJ       *aijseq;
 38:   Mat_SeqAIJKokkos *aijkok;

 40:   PetscFunctionBegin;
 41:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
 42:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

 44:   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
 45:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 47:   /* If aijkok does not exist, we just copy i, j to device.
 48:      If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
 49:      In both cases, we build a new aijkok structure.
 50:   */
 51:   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
 52:     delete aijkok;
 53:     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
 54:     A->spptr = aijkok;
 55:   }

 57:   if (aijkok->device_mat_d.data()) {
 58:     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
 59:   }
 60:   PetscFunctionReturn(PETSC_SUCCESS);
 61: }
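
/* A minimal host-assembly sketch (hypothetical driver code; the name ExampleAssembleOnHost and the
   sizes are made up): MatAssemblyEnd() on a MATSEQAIJKOKKOS lands in MatAssemblyEnd_SeqAIJKokkos()
   above, which mirrors the finalized CSR structure to device. */
static PetscErrorCode ExampleAssembleOnHost(Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(PETSC_COMM_SELF, A));
  PetscCall(MatSetSizes(*A, 10, 10, 10, 10));
  PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
  PetscCall(MatSeqAIJSetPreallocation(*A, 1, NULL)); /* one nonzero per row: a diagonal matrix */
  for (PetscInt i = 0; i < 10; i++) {
    PetscScalar v = 2.0;
    PetscCall(MatSetValues(*A, 1, &i, 1, &i, &v, INSERT_VALUES)); /* host-side insertion */
  }
  PetscCall(MatAssemblyBegin(*A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(*A, MAT_FINAL_ASSEMBLY)); /* dispatches to MatAssemblyEnd_SeqAIJKokkos() */
  PetscFunctionReturn(PETSC_SUCCESS);
}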

 63: /* Sync CSR data to device if not done yet */
 64: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
 65: {
 66:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 68:   PetscFunctionBegin;
 69:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
 70:   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
 71:   if (aijkok->a_dual.need_sync_device()) {
 72:     aijkok->a_dual.sync_device();
 73:     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
 74:     aijkok->hermitian_updated = PETSC_FALSE;
 75:   }
 76:   PetscFunctionReturn(PETSC_SUCCESS);
 77: }

 79: /* Mark the CSR data on device as modified */
 80: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
 81: {
 82:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 84:   PetscFunctionBegin;
 85:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
 86:   aijkok->a_dual.clear_sync_state();
 87:   aijkok->a_dual.modify_device();
 88:   aijkok->transpose_updated = PETSC_FALSE;
 89:   aijkok->hermitian_updated = PETSC_FALSE;
 90:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
 91:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
 92:   PetscFunctionReturn(PETSC_SUCCESS);
 93: }
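
/* A minimal sketch of the intended sync/modify protocol (hypothetical helper; assumes the caller
   wants to mutate the values on device): sync before reading, modify after writing. */
static PetscErrorCode ExampleScaleOnDevice(Mat A, PetscScalar alpha)
{
  Mat_SeqAIJKokkos *akok;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); /* make sure the device values are current */
  akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  auto aa = akok->a_dual.view_device();
  PetscCallCXX(Kokkos::parallel_for(
    aa.extent(0), KOKKOS_LAMBDA(const PetscInt i) { aa(i) *= alpha; }));
  PetscCall(MatSeqAIJKokkosModifyDevice(A)); /* mark device data newest; invalidates cached transpose/diagonal */
  PetscFunctionReturn(PETSC_SUCCESS);
}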

 95: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
 96: {
 97:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

 99:   PetscFunctionBegin;
100:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
 101:   /* We do not expect anyone to need factors on host */
102:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
103:   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
104:   aijkok->a_dual.sync_host();
105:   PetscFunctionReturn(PETSC_SUCCESS);
106: }

108: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
109: {
110:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

112:   PetscFunctionBegin;
 113:   /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
 114:     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
 115:     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
116:     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
117:   */
118:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
119:     aijkok->a_dual.sync_host();
120:     *array = aijkok->a_dual.view_host().data();
121:   } else { /* Happens when calling MatSetValues on a newly created matrix */
122:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
123:   }
124:   PetscFunctionReturn(PETSC_SUCCESS);
125: }

127: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
128: {
129:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

131:   PetscFunctionBegin;
132:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
133:   PetscFunctionReturn(PETSC_SUCCESS);
134: }

136: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
137: {
138:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

140:   PetscFunctionBegin;
141:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
142:     aijkok->a_dual.sync_host();
143:     *array = aijkok->a_dual.view_host().data();
144:   } else {
145:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
146:   }
147:   PetscFunctionReturn(PETSC_SUCCESS);
148: }

150: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
151: {
152:   PetscFunctionBegin;
153:   *array = NULL;
154:   PetscFunctionReturn(PETSC_SUCCESS);
155: }

157: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
158: {
159:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

161:   PetscFunctionBegin;
162:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163:     *array = aijkok->a_dual.view_host().data();
164:   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
165:     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
166:   }
167:   PetscFunctionReturn(PETSC_SUCCESS);
168: }

170: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
171: {
172:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

174:   PetscFunctionBegin;
175:   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
176:     aijkok->a_dual.clear_sync_state();
177:     aijkok->a_dual.modify_host();
178:   }
179:   PetscFunctionReturn(PETSC_SUCCESS);
180: }
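
/* Host access sketch (hypothetical caller): the Get/Restore pairs above back the public
   MatSeqAIJGetArray()/MatSeqAIJRestoreArray() interface, so host code can read and write values
   even when the latest copy lives on device. */
static PetscErrorCode ExampleTouchValuesOnHost(Mat A)
{
  PetscScalar *a;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJGetArray(A, &a));     /* syncs device -> host if the device copy is newer */
  a[0] += 1.0;                             /* modify the first stored nonzero on host */
  PetscCall(MatSeqAIJRestoreArray(A, &a)); /* marks the host copy as modified */
  PetscFunctionReturn(PETSC_SUCCESS);
}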

182: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
183: {
184:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

186:   PetscFunctionBegin;
187:   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

189:   if (i) *i = aijkok->i_device_data();
190:   if (j) *j = aijkok->j_device_data();
191:   if (a) {
192:     aijkok->a_dual.sync_device();
193:     *a = aijkok->a_device_data();
194:   }
195:   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
196:   PetscFunctionReturn(PETSC_SUCCESS);
197: }
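
/* A sketch of consuming the device CSR directly (hypothetical caller): with a device execution
   space the returned pointers are device pointers, as signaled by PETSC_MEMTYPE_KOKKOS. */
static PetscErrorCode ExampleInspectCSR(Mat A)
{
  const PetscInt *i, *j;
  PetscScalar    *a;
  PetscMemType    mtype;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype));
  PetscCheck(mtype == PETSC_MEMTYPE_KOKKOS, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected Kokkos memory");
  /* i, j, a can now be handed to another device library that expects raw CSR arrays */
  PetscFunctionReturn(PETSC_SUCCESS);
}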

 199: // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note: "deep_copy" here is really a shallow copy
200: PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
201: {
202:   Mat_SeqAIJKokkos                            *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203:   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);

205:   PetscFunctionBegin;
206:   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207:   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
208:   Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
209:   PetscFunctionReturn(PETSC_SUCCESS);
210: }

 212: // MatSeqAIJKokkosGetDeviceMat gets the device matrix if it exists; otherwise it creates a place for it and returns NULL
213: PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
214: {
215:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

217:   PetscFunctionBegin;
218:   if (aijkok && aijkok->device_mat_d.data()) {
219:     *d_mat = aijkok->device_mat_d.data();
220:   } else {
221:     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
222:     *d_mat = NULL;
223:   }
224:   PetscFunctionReturn(PETSC_SUCCESS);
225: }

227: /*
228:   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

230:   Input Parameter:
231: .  A       - the MATSEQAIJKOKKOS matrix

233:   Output Parameters:
 234: +  perm_d - the permutation array on device, such that Ta(i) = Aa(perm(i))
235: -  T_d    - the transpose on device, whose value array is allocated but not initialized
236: */
237: static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
238: {
239:   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
240:   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
241:   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
242:   MatRowMapKokkosViewHost Ti_h("Ti", n + 1);
243:   MatRowMapType          *Ti = Ti_h.data();
244:   MatColIdxKokkosViewHost Tj_h("Tj", nz);
245:   MatRowMapKokkosViewHost perm_h("permutation", nz);
246:   PetscInt               *Tj   = Tj_h.data();
247:   PetscInt               *perm = perm_h.data();
248:   PetscInt               *offset;

250:   PetscFunctionBegin;
251:   // Populate Ti
252:   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
253:   Ti++;
254:   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
255:   Ti--;
256:   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

258:   // Populate Tj and the permutation array
259:   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
260:   for (PetscInt i = 0; i < m; i++) {
261:     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
262:       PetscInt r    = Aj[j];                       // row r of T
263:       PetscInt disp = Ti[r] + offset[r];

265:       Tj[disp]   = i; // col i of T
266:       perm[disp] = j;
267:       offset[r]++;
268:     }
269:   }
270:   PetscCall(PetscFree(offset));

272:   // Sort each row of T, along with the permutation array
273:   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

275:   // Output perm and T on device
276:   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
277:   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
278:   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
279:   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
280:   PetscFunctionReturn(PETSC_SUCCESS);
281: }
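
/* A worked trace of the counting sort above, with made-up data: let A be 2x3 with
   Ai = {0,2,4}, Aj = {0,2,1,2} (row 0 holds columns {0,2}, row 1 holds columns {1,2}).
     Counting pass: Ti = {0,1,1,2}   (Ti[c+1] counts the nonzeros in column c of A)
     Prefix sum:    Ti = {0,1,2,4}   (row offsets of T)
     Fill pass:     Tj = {0,1,0,1}, perm = {0,2,1,3}
   so Ta(i) = Aa(perm(i)) lays A's values out in the row order of T = A^T. */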

283: // Generate the transpose on device and cache it internally
 284: // Note: KK transpose_matrix() does not support separate symbolic/numeric transpose phases, so we do it on our own
285: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
286: {
287:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
288:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
289:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
290:   KokkosCsrMatrix  &T = akok->csrmatT;

292:   PetscFunctionBegin;
293:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
 294:   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

296:   const auto &Aa = akok->a_dual.view_device();

298:   if (A->symmetric == PETSC_BOOL3_TRUE) {
299:     *csrmatT = akok->csrmat;
300:   } else {
301:     // See if we already have a cached transpose and its value is up to date
302:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
303:       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
304:         const auto &perm = akok->transpose_perm; // get the permutation array
305:         auto       &Ta   = T.values;

307:         PetscCallCXX(Kokkos::parallel_for(
308:           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
309:       }
310:     } else { // Generate T of size n x m for the first time
311:       MatRowMapKokkosView perm;

313:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
314:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
315:       PetscCallCXX(Kokkos::parallel_for(
316:         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
317:     }
318:     akok->transpose_updated = PETSC_TRUE;
319:     *csrmatT                = akok->csrmatT;
320:   }
321:   PetscFunctionReturn(PETSC_SUCCESS);
322: }

324: // Generate the Hermitian on device and cache it internally
325: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
326: {
327:   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
328:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
329:   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
330:   KokkosCsrMatrix  &T = akok->csrmatH;

332:   PetscFunctionBegin;
333:   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
334:   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

336:   const auto &Aa = akok->a_dual.view_device();

338:   if (A->hermitian == PETSC_BOOL3_TRUE) {
339:     *csrmatH = akok->csrmat;
340:   } else {
 341:     // See if we already have a cached Hermitian and its values are up to date
 342:     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatH had been generated before, otherwise T has 0 rows/cols after construction
343:       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
344:         const auto &perm = akok->transpose_perm; // get the permutation array
345:         auto       &Ta   = T.values;

347:         PetscCallCXX(Kokkos::parallel_for(
348:           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
349:       }
350:     } else { // Generate T of size n x m for the first time
351:       MatRowMapKokkosView perm;

353:       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
354:       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
355:       PetscCallCXX(Kokkos::parallel_for(
356:         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
357:     }
358:     akok->hermitian_updated = PETSC_TRUE;
359:     *csrmatH                = akok->csrmatH;
360:   }
361:   PetscFunctionReturn(PETSC_SUCCESS);
362: }

364: /* y = A x */
365: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
366: {
367:   Mat_SeqAIJKokkos          *aijkok;
368:   ConstPetscScalarKokkosView xv;
369:   PetscScalarKokkosView      yv;

371:   PetscFunctionBegin;
372:   PetscCall(PetscLogGpuTimeBegin());
373:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
374:   PetscCall(VecGetKokkosView(xx, &xv));
375:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
376:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
377:   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
378:   PetscCall(VecRestoreKokkosView(xx, &xv));
379:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
 380:   /* 2.0*nnz - numRows would be more accurate, but that assumes there are no zero rows, so we are a little sloppy here. */
381:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
382:   PetscCall(PetscLogGpuTimeEnd());
383:   PetscFunctionReturn(PETSC_SUCCESS);
384: }
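
/* Usage sketch (hypothetical caller): vectors created from A inherit A->defaultvectype, i.e.
   VECKOKKOS, so the whole y = A*x pipeline stays on device. */
static PetscErrorCode ExampleMatVec(Mat A)
{
  Vec x, y;

  PetscFunctionBegin;
  PetscCall(MatCreateVecs(A, &x, &y)); /* x, y are VECKOKKOS */
  PetscCall(VecSet(x, 1.0));
  PetscCall(MatMult(A, x, y)); /* dispatches to MatMult_SeqAIJKokkos() above */
  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&y));
  PetscFunctionReturn(PETSC_SUCCESS);
}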

386: /* y = A^T x */
387: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
388: {
389:   Mat_SeqAIJKokkos          *aijkok;
390:   const char                *mode;
391:   ConstPetscScalarKokkosView xv;
392:   PetscScalarKokkosView      yv;
393:   KokkosCsrMatrix            csrmat;

395:   PetscFunctionBegin;
396:   PetscCall(PetscLogGpuTimeBegin());
397:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
398:   PetscCall(VecGetKokkosView(xx, &xv));
399:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
400:   if (A->form_explicit_transpose) {
401:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
402:     mode = "N";
403:   } else {
404:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
405:     csrmat = aijkok->csrmat;
406:     mode   = "T";
407:   }
408:   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
409:   PetscCall(VecRestoreKokkosView(xx, &xv));
410:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
411:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
412:   PetscCall(PetscLogGpuTimeEnd());
413:   PetscFunctionReturn(PETSC_SUCCESS);
414: }
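
/* A sketch of the two code paths above (hypothetical caller): with MAT_FORM_EXPLICIT_TRANSPOSE set,
   an explicit transpose is built and cached once and spmv("N") is used; otherwise every call runs
   spmv("T") on A itself. */
static PetscErrorCode ExampleTransposeMult(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  PetscCall(MatSetOption(A, MAT_FORM_EXPLICIT_TRANSPOSE, PETSC_TRUE));
  PetscCall(MatMultTranspose(A, x, y)); /* first call builds and caches the transpose on device */
  PetscCall(MatMultTranspose(A, x, y)); /* later calls reuse it, guarded by transpose_updated */
  PetscFunctionReturn(PETSC_SUCCESS);
}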

416: /* y = A^H x */
417: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
418: {
419:   Mat_SeqAIJKokkos          *aijkok;
420:   const char                *mode;
421:   ConstPetscScalarKokkosView xv;
422:   PetscScalarKokkosView      yv;
423:   KokkosCsrMatrix            csrmat;

425:   PetscFunctionBegin;
426:   PetscCall(PetscLogGpuTimeBegin());
427:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
428:   PetscCall(VecGetKokkosView(xx, &xv));
429:   PetscCall(VecGetKokkosViewWrite(yy, &yv));
430:   if (A->form_explicit_transpose) {
431:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
432:     mode = "N";
433:   } else {
434:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
435:     csrmat = aijkok->csrmat;
436:     mode   = "C";
437:   }
438:   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
439:   PetscCall(VecRestoreKokkosView(xx, &xv));
440:   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
441:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
442:   PetscCall(PetscLogGpuTimeEnd());
443:   PetscFunctionReturn(PETSC_SUCCESS);
444: }

446: /* z = A x + y */
447: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
448: {
449:   Mat_SeqAIJKokkos          *aijkok;
450:   ConstPetscScalarKokkosView xv, yv;
451:   PetscScalarKokkosView      zv;

453:   PetscFunctionBegin;
454:   PetscCall(PetscLogGpuTimeBegin());
455:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
456:   PetscCall(VecGetKokkosView(xx, &xv));
457:   PetscCall(VecGetKokkosView(yy, &yv));
458:   PetscCall(VecGetKokkosViewWrite(zz, &zv));
459:   if (zz != yy) Kokkos::deep_copy(zv, yv);
460:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
461:   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
462:   PetscCall(VecRestoreKokkosView(xx, &xv));
463:   PetscCall(VecRestoreKokkosView(yy, &yv));
464:   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
465:   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
466:   PetscCall(PetscLogGpuTimeEnd());
467:   PetscFunctionReturn(PETSC_SUCCESS);
468: }

470: /* z = A^T x + y */
471: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
472: {
473:   Mat_SeqAIJKokkos          *aijkok;
474:   const char                *mode;
475:   ConstPetscScalarKokkosView xv, yv;
476:   PetscScalarKokkosView      zv;
477:   KokkosCsrMatrix            csrmat;

479:   PetscFunctionBegin;
480:   PetscCall(PetscLogGpuTimeBegin());
481:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
482:   PetscCall(VecGetKokkosView(xx, &xv));
483:   PetscCall(VecGetKokkosView(yy, &yv));
484:   PetscCall(VecGetKokkosViewWrite(zz, &zv));
485:   if (zz != yy) Kokkos::deep_copy(zv, yv);
486:   if (A->form_explicit_transpose) {
487:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
488:     mode = "N";
489:   } else {
490:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
491:     csrmat = aijkok->csrmat;
492:     mode   = "T";
493:   }
494:   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
495:   PetscCall(VecRestoreKokkosView(xx, &xv));
496:   PetscCall(VecRestoreKokkosView(yy, &yv));
497:   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
498:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
499:   PetscCall(PetscLogGpuTimeEnd());
500:   PetscFunctionReturn(PETSC_SUCCESS);
501: }

503: /* z = A^H x + y */
504: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
505: {
506:   Mat_SeqAIJKokkos          *aijkok;
507:   const char                *mode;
508:   ConstPetscScalarKokkosView xv, yv;
509:   PetscScalarKokkosView      zv;
510:   KokkosCsrMatrix            csrmat;

512:   PetscFunctionBegin;
513:   PetscCall(PetscLogGpuTimeBegin());
514:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
515:   PetscCall(VecGetKokkosView(xx, &xv));
516:   PetscCall(VecGetKokkosView(yy, &yv));
517:   PetscCall(VecGetKokkosViewWrite(zz, &zv));
518:   if (zz != yy) Kokkos::deep_copy(zv, yv);
519:   if (A->form_explicit_transpose) {
520:     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
521:     mode = "N";
522:   } else {
523:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
524:     csrmat = aijkok->csrmat;
525:     mode   = "C";
526:   }
527:   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
528:   PetscCall(VecRestoreKokkosView(xx, &xv));
529:   PetscCall(VecRestoreKokkosView(yy, &yv));
530:   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
531:   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
532:   PetscCall(PetscLogGpuTimeEnd());
533:   PetscFunctionReturn(PETSC_SUCCESS);
534: }

536: PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
537: {
538:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

540:   PetscFunctionBegin;
541:   switch (op) {
542:   case MAT_FORM_EXPLICIT_TRANSPOSE:
 543:     /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
544:     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
545:     A->form_explicit_transpose = flg;
546:     break;
547:   default:
548:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
549:     break;
550:   }
551:   PetscFunctionReturn(PETSC_SUCCESS);
552: }

554: /* Depending on reuse, either build a new mat, or use the existing mat */
555: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
556: {
557:   Mat_SeqAIJ *aseq;

559:   PetscFunctionBegin;
560:   PetscCall(PetscKokkosInitializeCheck());
561:   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
562:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
563:   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
564:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
565:   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
566:     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
567:     PetscCall(PetscFree(A->defaultvectype));
568:     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
569:     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
570:     PetscCall(MatSetOps_SeqAIJKokkos(A));
571:     aseq = static_cast<Mat_SeqAIJ *>(A->data);
572:     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
573:       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
574:       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
575:     }
576:   }
577:   PetscFunctionReturn(PETSC_SUCCESS);
578: }
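
/* Conversion sketch (hypothetical caller) exercising two of the reuse modes handled above: */
static PetscErrorCode ExampleConvert(Mat A, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, B));  /* B is a new Kokkos copy of A */
  PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); /* A itself is retyped in place */
  PetscFunctionReturn(PETSC_SUCCESS);
}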

580: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
 581:    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
582:  */
583: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
584: {
585:   Mat_SeqAIJ       *bseq;
586:   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
587:   Mat               mat;

589:   PetscFunctionBegin;
 590:   /* Do not copy values on host as A's latest values might be on device. We don't want to sync blindly */
591:   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
592:   mat = *B;
593:   if (A->assembled) {
594:     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
595:     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
596:     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
597:     /* Now copy values to B if needed */
598:     if (dupOption == MAT_COPY_VALUES) {
599:       if (akok->a_dual.need_sync_device()) {
600:         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
601:         bkok->a_dual.modify_host();
602:       } else { /* If device has the latest data, we only copy data on device */
603:         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
604:         bkok->a_dual.modify_device();
605:       }
606:     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
607:       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
608:       bkok->a_dual.modify_host();
609:     }
610:     mat->spptr = bkok;
611:   }

613:   PetscCall(PetscFree(mat->defaultvectype));
614:   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
615:   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
616:   PetscCall(MatSetOps_SeqAIJKokkos(mat));
617:   PetscFunctionReturn(PETSC_SUCCESS);
618: }

620: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
621: {
622:   Mat               At;
623:   KokkosCsrMatrix   internT;
624:   Mat_SeqAIJKokkos *atkok, *bkok;

626:   PetscFunctionBegin;
627:   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
628:   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
629:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
630:     /* Deep copy internT, as we want to isolate the internal transpose */
631:     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
632:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
633:     if (reuse == MAT_INITIAL_MATRIX) *B = At;
634:     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
635:   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
636:     if ((*B)->assembled) {
637:       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
638:       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
639:       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
640:     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
641:       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
642:       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
643:       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
644:       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
645:       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
646:     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
647:   }
648:   PetscFunctionReturn(PETSC_SUCCESS);
649: }

651: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
652: {
653:   Mat_SeqAIJKokkos *aijkok;

655:   PetscFunctionBegin;
656:   if (A->factortype == MAT_FACTOR_NONE) {
657:     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
658:     delete aijkok;
659:   } else {
660:     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
661:   }
662:   A->spptr = NULL;
663:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
664:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
665:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
666:   PetscCall(MatDestroy_SeqAIJ(A));
667:   PetscFunctionReturn(PETSC_SUCCESS);
668: }

670: /*MC
671:    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

 673:    A matrix type using the Kokkos-Kernels CrsMatrix type for portability across different device types

675:    Options Database Key:
676: .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

678:   Level: beginner

680: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
681: M*/
682: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
683: {
684:   PetscFunctionBegin;
685:   PetscCall(PetscKokkosInitializeCheck());
686:   PetscCall(MatCreate_SeqAIJ(A));
687:   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
688:   PetscFunctionReturn(PETSC_SUCCESS);
689: }

 691: /* Merge A, B into a matrix C. A is put before B. C's size is A->rmap->n by (A->cmap->n + B->cmap->n) */
692: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
693: {
694:   Mat_SeqAIJ         *a, *b;
695:   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
696:   MatScalarKokkosView aa, ba, ca;
697:   MatRowMapKokkosView ai, bi, ci;
698:   MatColIdxKokkosView aj, bj, cj;
699:   PetscInt            m, n, nnz, aN;

701:   PetscFunctionBegin;
705:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
706:   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
 707:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
708:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

710:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
711:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
712:   a    = static_cast<Mat_SeqAIJ *>(A->data);
713:   b    = static_cast<Mat_SeqAIJ *>(B->data);
714:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
715:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
716:   aa   = akok->a_dual.view_device();
717:   ai   = akok->i_dual.view_device();
718:   ba   = bkok->a_dual.view_device();
719:   bi   = bkok->i_dual.view_device();
720:   m    = A->rmap->n; /* M, N and nnz of C */
721:   n    = A->cmap->n + B->cmap->n;
722:   nnz  = a->nz + b->nz;
723:   aN   = A->cmap->n; /* N of A */
724:   if (reuse == MAT_INITIAL_MATRIX) {
725:     aj           = akok->j_dual.view_device();
726:     bj           = bkok->j_dual.view_device();
727:     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
728:     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
729:     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
730:     ca           = ca_dual.view_device();
731:     ci           = ci_dual.view_device();
732:     cj           = cj_dual.view_device();

734:     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
735:     Kokkos::parallel_for(
736:       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
737:         PetscInt i       = t.league_rank(); /* row i */
738:         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

740:         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
741:                                                    ci(i) = coffset;
742:                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
743:         });

745:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
746:           if (k < alen) {
747:             ca(coffset + k) = aa(ai(i) + k);
748:             cj(coffset + k) = aj(ai(i) + k);
749:           } else {
750:             ca(coffset + k) = ba(bi(i) + k - alen);
751:             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
752:           }
753:         });
754:       });
755:     ca_dual.modify_device();
756:     ci_dual.modify_device();
757:     cj_dual.modify_device();
758:     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
759:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
760:   } else if (reuse == MAT_REUSE_MATRIX) {
762:     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
763:     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
764:     ca   = ckok->a_dual.view_device();
765:     ci   = ckok->i_dual.view_device();

767:     Kokkos::parallel_for(
768:       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
769:         PetscInt i    = t.league_rank(); /* row i */
770:         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
771:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
772:           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
773:           else ca(ci(i) + k) = ba(bi(i) + k - alen);
774:         });
775:       });
776:     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
777:   }
778:   PetscFunctionReturn(PETSC_SUCCESS);
779: }

781: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
782: {
783:   PetscFunctionBegin;
784:   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
785:   PetscFunctionReturn(PETSC_SUCCESS);
786: }

788: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
789: {
790:   Mat_Product                 *product = C->product;
791:   Mat                          A, B;
792:   bool                         transA, transB; /* use bool, since KK needs this type */
793:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
794:   Mat_SeqAIJ                  *c;
795:   MatProductData_SeqAIJKokkos *pdata;
796:   KokkosCsrMatrix              csrmatA, csrmatB;

798:   PetscFunctionBegin;
799:   MatCheckProduct(C, 1);
800:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
801:   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

803:   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
804:   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
805:   // we still do numeric.
806:   if (pdata->reusesym) { // numeric reuses results from symbolic
807:     pdata->reusesym = PETSC_FALSE;
808:     PetscFunctionReturn(PETSC_SUCCESS);
809:   }

811:   switch (product->type) {
812:   case MATPRODUCT_AB:
813:     transA = false;
814:     transB = false;
815:     break;
816:   case MATPRODUCT_AtB:
817:     transA = true;
818:     transB = false;
819:     break;
820:   case MATPRODUCT_ABt:
821:     transA = false;
822:     transB = true;
823:     break;
824:   default:
825:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
826:   }

828:   A = product->A;
829:   B = product->B;
830:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
831:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
832:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
833:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
834:   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

836:   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

838:   csrmatA = akok->csrmat;
839:   csrmatB = bkok->csrmat;

841:   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
842:   if (transA) {
843:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
844:     transA = false;
845:   }

847:   if (transB) {
848:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
849:     transB = false;
850:   }
851:   PetscCall(PetscLogGpuTimeBegin());
852:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
853: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
854:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
855:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
856: #endif

858:   PetscCall(PetscLogGpuTimeEnd());
859:   PetscCall(MatSeqAIJKokkosModifyDevice(C));
860:   /* shorter version of MatAssemblyEnd_SeqAIJ */
861:   c = (Mat_SeqAIJ *)C->data;
862:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
863:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
864:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
865:   c->reallocs         = 0;
866:   C->info.mallocs     = 0;
867:   C->info.nz_unneeded = 0;
868:   C->assembled = C->was_assembled = PETSC_TRUE;
869:   C->num_ass++;
870:   PetscFunctionReturn(PETSC_SUCCESS);
871: }

873: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
874: {
875:   Mat_Product                 *product = C->product;
876:   MatProductType               ptype;
877:   Mat                          A, B;
878:   bool                         transA, transB;
879:   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
880:   MatProductData_SeqAIJKokkos *pdata;
881:   MPI_Comm                     comm;
882:   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;

884:   PetscFunctionBegin;
885:   MatCheckProduct(C, 1);
886:   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
887:   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
888:   A = product->A;
889:   B = product->B;
890:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
891:   PetscCall(MatSeqAIJKokkosSyncDevice(B));
892:   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
893:   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
894:   csrmatA = akok->csrmat;
895:   csrmatB = bkok->csrmat;

897:   ptype = product->type;
898:   // Take advantage of the symmetry if true
899:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
900:     ptype                                          = MATPRODUCT_AB;
901:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
902:   }
903:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
904:     ptype                                          = MATPRODUCT_AB;
905:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
906:   }

908:   switch (ptype) {
909:   case MATPRODUCT_AB:
910:     transA = false;
911:     transB = false;
912:     break;
913:   case MATPRODUCT_AtB:
914:     transA = true;
915:     transB = false;
916:     break;
917:   case MATPRODUCT_ABt:
918:     transA = false;
919:     transB = true;
920:     break;
921:   default:
922:     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
923:   }
924:   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
925:   pdata->reusesym = product->api_user;

927:   /* TODO: add command line options to select spgemm algorithms */
928:   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

930:   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
931: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
932:   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
933:   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
934:   #endif
935: #endif
936:   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

938:   PetscCall(PetscLogGpuTimeBegin());
939:   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
940:   if (transA) {
941:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
942:     transA = false;
943:   }

945:   if (transB) {
946:     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
947:     transB = false;
948:   }

950:   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
951:   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
 952:     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
 953:     calling new Mat_SeqAIJKokkos().
 954:     TODO: Remove the fake spgemm_numeric() after KK fixes this problem.
955:   */
956:   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
957: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
958:   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
959:   auto spgemmHandle = pdata->kh.get_spgemm_handle();
960:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
961: #endif
962:   PetscCall(PetscLogGpuTimeEnd());

964:   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
965:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
966:   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
967:   PetscFunctionReturn(PETSC_SUCCESS);
968: }
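
/* User-level sketch driving the symbolic/numeric split above (hypothetical caller): the
   MAT_INITIAL_MATRIX call runs both phases (hence pdata->reusesym), while the MAT_REUSE_MATRIX
   call reruns only spgemm_numeric() with the cached handle. */
static PetscErrorCode ExampleMatMat(Mat A, Mat B, Mat *C)
{
  PetscFunctionBegin;
  PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, C)); /* symbolic + numeric */
  PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, C));   /* numeric only */
  PetscFunctionReturn(PETSC_SUCCESS);
}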

 970: /* handles sparse matrix-matrix ops */
971: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
972: {
973:   Mat_Product *product = mat->product;
974:   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

976:   PetscFunctionBegin;
977:   MatCheckProduct(mat, 1);
978:   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
979:   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
980:   if (Biskok && Ciskok) {
981:     switch (product->type) {
982:     case MATPRODUCT_AB:
983:     case MATPRODUCT_AtB:
984:     case MATPRODUCT_ABt:
985:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
986:       break;
987:     case MATPRODUCT_PtAP:
988:     case MATPRODUCT_RARt:
989:     case MATPRODUCT_ABC:
990:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
991:       break;
992:     default:
993:       break;
994:     }
995:   } else { /* fallback for AIJ */
996:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
997:   }
998:   PetscFunctionReturn(PETSC_SUCCESS);
999: }

1001: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1002: {
1003:   Mat_SeqAIJKokkos *aijkok;

1005:   PetscFunctionBegin;
1006:   PetscCall(PetscLogGpuTimeBegin());
1007:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1008:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1009:   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1010:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1011:   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1012:   PetscCall(PetscLogGpuTimeEnd());
1013:   PetscFunctionReturn(PETSC_SUCCESS);
1014: }

1016: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1017: {
1018:   Mat_SeqAIJKokkos *aijkok;

1020:   PetscFunctionBegin;
1021:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1022:   if (aijkok) { /* Only zero the device if data is already there */
1023:     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
1024:     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1025:   } else { /* Might be preallocated but not assembled */
1026:     PetscCall(MatZeroEntries_SeqAIJ(A));
1027:   }
1028:   PetscFunctionReturn(PETSC_SUCCESS);
1029: }

1031: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1032: {
1033:   Mat_SeqAIJ           *aijseq;
1034:   Mat_SeqAIJKokkos     *aijkok;
1035:   PetscInt              n;
1036:   PetscScalarKokkosView xv;

1038:   PetscFunctionBegin;
1039:   PetscCall(VecGetLocalSize(x, &n));
1040:   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1041:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

1043:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1044:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

1046:   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
1047:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1048:     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1049:     aijkok->SetDiagonal(aijseq->diag);
1050:   }

1052:   const auto &Aa    = aijkok->a_dual.view_device();
1053:   const auto &Ai    = aijkok->i_dual.view_device();
1054:   const auto &Adiag = aijkok->diag_dual.view_device();

1056:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1057:   Kokkos::parallel_for(
1058:     n, KOKKOS_LAMBDA(const PetscInt i) {
1059:       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1060:       else xv(i) = 0;
1061:     });
1062:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1063:   PetscFunctionReturn(PETSC_SUCCESS);
1064: }

1066: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1067: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1068: {
1069:   Mat_SeqAIJKokkos *aijkok;

1071:   PetscFunctionBegin;
1074:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1075:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1076:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1077:   *kv    = aijkok->a_dual.view_device();
1078:   PetscFunctionReturn(PETSC_SUCCESS);
1079: }

1081: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1082: {
1083:   PetscFunctionBegin;
1086:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1087:   PetscFunctionReturn(PETSC_SUCCESS);
1088: }

1090: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1091: {
1092:   Mat_SeqAIJKokkos *aijkok;

1094:   PetscFunctionBegin;
1097:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1098:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1099:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1100:   *kv    = aijkok->a_dual.view_device();
1101:   PetscFunctionReturn(PETSC_SUCCESS);
1102: }

1104: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1105: {
1106:   PetscFunctionBegin;
1109:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1110:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1111:   PetscFunctionReturn(PETSC_SUCCESS);
1112: }

1114: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1115: {
1116:   Mat_SeqAIJKokkos *aijkok;

1118:   PetscFunctionBegin;
1121:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1122:   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1123:   *kv    = aijkok->a_dual.view_device();
1124:   PetscFunctionReturn(PETSC_SUCCESS);
1125: }

1127: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1128: {
1129:   PetscFunctionBegin;
1132:   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1133:   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1134:   PetscFunctionReturn(PETSC_SUCCESS);
1135: }
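
/* Device-side read sketch (hypothetical helper; assumes Kokkos >= 3.7 for Kokkos::ArithTraits,
   which this file already uses elsewhere): compute the entrywise 1-norm from the device view
   without touching the host. */
static PetscErrorCode ExampleEntrywiseNorm(Mat A, PetscReal *nrm)
{
  ConstMatScalarKokkosView aa;
  PetscReal                sum = 0.0;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJGetKokkosView(A, &aa)); /* read-only device view; syncs device first */
  PetscCallCXX(Kokkos::parallel_reduce(
    aa.extent(0), KOKKOS_LAMBDA(const PetscInt i, PetscReal &lsum) { lsum += Kokkos::ArithTraits<PetscScalar>::abs(aa(i)); }, sum));
  PetscCall(MatSeqAIJRestoreKokkosView(A, &aa));
  *nrm = sum;
  PetscFunctionReturn(PETSC_SUCCESS);
}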

1137: /* Computes Y += alpha X */
1138: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1139: {
1140:   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1141:   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1142:   ConstMatScalarKokkosView Xa;
1143:   MatScalarKokkosView      Ya;

1145:   PetscFunctionBegin;
1146:   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1147:   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1148:   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1149:   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1150:   PetscCall(PetscLogGpuTimeBegin());

1152:   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1153:     PetscBool e;
1154:     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1155:     if (e) {
1156:       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1157:       if (e) pattern = SAME_NONZERO_PATTERN;
1158:     }
1159:   }

1161:   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew the sparsity pattern of C, one could skip
1162:     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1163:     If X is a SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1164:     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, while MatAXPY does not.
1165:   */
1166:   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1167:   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1168:   Xa   = xkok->a_dual.view_device();
1169:   Ya   = ykok->a_dual.view_device();

1171:   if (pattern == SAME_NONZERO_PATTERN) {
1172:     KokkosBlas::axpy(alpha, Xa, Ya);
1173:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1174:   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1175:     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1176:     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

1178:     Kokkos::parallel_for(
1179:       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1180:         PetscInt i = t.league_rank(); // row i
1181:         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1182:           // Only one thread works in a team
1183:           PetscInt p, q = Yi(i);
1184:           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1185:             while (q < Yi(i + 1) && Xj(p) != Yj(q)) q++; // find the matching nonzero on row i of Y (check bounds first to avoid reading past the row)
1186:             if (q < Yi(i + 1) && Xj(p) == Yj(q)) {       // Found it
1187:               Ya(q) += alpha * Xa(p);
1188:               q++;
1189:             } else {
1190:             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1191:             // Just insert a NaN at the beginning of row i (if the row is not empty) so the wrong result is easy to detect.
1192: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1193:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1194: #else
1195:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1196: #endif
1197:             }
1198:           }
1199:         });
1200:       });
1201:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1202:   } else { // different nonzero patterns
1203:     Mat             Z;
1204:     KokkosCsrMatrix zcsr;
1205:     KernelHandle    kh;
1206:     kh.create_spadd_handle(true); // X, Y are sorted
1207:     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1208:     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1209:     zkok = new Mat_SeqAIJKokkos(zcsr);
1210:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1211:     PetscCall(MatHeaderReplace(Y, &Z));
1212:     kh.destroy_spadd_handle();
1213:   }
1214:   PetscCall(PetscLogGpuTimeEnd());
1215:   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1216:   PetscFunctionReturn(PETSC_SUCCESS);
1217: }
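
/* Usage sketch of the three branches above (hypothetical caller): */
static PetscErrorCode ExampleAXPY(Mat Y, Mat X)
{
  PetscFunctionBegin;
  PetscCall(MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN));      /* one KokkosBlas::axpy() */
  PetscCall(MatAXPY(Y, 2.0, X, SUBSET_NONZERO_PATTERN));    /* row-wise merge kernel */
  PetscCall(MatAXPY(Y, 2.0, X, DIFFERENT_NONZERO_PATTERN)); /* KokkosSparse::spadd(); Y is replaced */
  PetscFunctionReturn(PETSC_SUCCESS);
}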

1219: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1220: {
1221:   Mat_SeqAIJKokkos *akok;
1222:   Mat_SeqAIJ       *aseq;

1224:   PetscFunctionBegin;
1225:   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1226:   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1227:   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1228:   delete akok;
1229:   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1230:   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1231:   akok->SetUpCOO(aseq);
1232:   PetscFunctionReturn(PETSC_SUCCESS);
1233: }

1235: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1236: {
1237:   Mat_SeqAIJ                 *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1238:   Mat_SeqAIJKokkos           *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1239:   PetscCount                  Annz = aseq->nz;
1240:   const PetscCountKokkosView &jmap = akok->jmap_d;
1241:   const PetscCountKokkosView &perm = akok->perm_d;
1242:   MatScalarKokkosView         Aa;
1243:   ConstMatScalarKokkosView    kv;
1244:   PetscMemType                memtype;

1246:   PetscFunctionBegin;
1247:   PetscCall(PetscGetMemType(v, &memtype));
 1248:   if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on host, we need to copy it to device */
1249:     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1250:   } else {
1251:     kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1252:   }

1254:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1255:   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

1257:   Kokkos::parallel_for(
1258:     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1259:       PetscScalar sum = 0.0;
1260:       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1261:       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1262:     });

1264:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1265:   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1266:   PetscFunctionReturn(PETSC_SUCCESS);
1267: }
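
/* COO assembly sketch (hypothetical caller, data made up): repeated (row,col) pairs are combined
   by the jmap/perm gather in MatSetValuesCOO_SeqAIJKokkos() above. */
static PetscErrorCode ExampleCOO(Mat A)
{
  PetscInt    i[3] = {0, 0, 1}, j[3] = {0, 0, 1}; /* the entry (0,0) appears twice */
  PetscScalar v[3] = {1.0, 2.0, 3.0};             /* may be a host or a device pointer */

  PetscFunctionBegin;
  PetscCall(MatSetPreallocationCOO(A, 3, i, j));   /* builds aseq->{i,j} plus jmap_d/perm_d */
  PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES)); /* A(0,0) = 1+2 = 3, A(1,1) = 3 */
  PetscFunctionReturn(PETSC_SUCCESS);
}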

1269: PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1270: {
1271:   Mat_SeqAIJKokkos          *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1272:   MatScalarKokkosView        Aa;
1273:   const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1274:   PetscInt                   m  = A->rmap->n;
1275:   ConstMatRowMapKokkosView   Adiag(diag, m); /* diag is a device pointer */

1277:   PetscFunctionBegin;
1278:   PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
1279:   Kokkos::parallel_for(
1280:     m, KOKKOS_LAMBDA(const PetscInt i) {
1281:       PetscScalar tmp;
1282:       if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1283:         tmp          = Aa(Ai(i));
1284:         Aa(Ai(i))    = Aa(Adiag(i));
1285:         Aa(Adiag(i)) = tmp;
1286:       }
1287:     });
1288:   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1289:   PetscFunctionReturn(PETSC_SUCCESS);
1290: }
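/* Note: the kernel above swaps values only; the column indices in Aj are untouched. It is a
   plain swap, so the value formerly at the head of row i ends up in the diagonal's old slot,
   and only consumers that expect the diagonal at the start of each row (presumably custom
   device kernels) can interpret the result. */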

1292: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1293: {
1294:   PetscFunctionBegin;
1295:   PetscCall(MatSeqAIJKokkosSyncHost(A));
1296:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1297:   B->offloadmask = PETSC_OFFLOAD_CPU;
1298:   PetscFunctionReturn(PETSC_SUCCESS);
1299: }

1301: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1302: {
1303:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

1305:   PetscFunctionBegin;
1306:   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1307:   A->boundtocpu  = PETSC_FALSE;

1309:   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1310:   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1311:   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1312:   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1313:   A->ops->scale                     = MatScale_SeqAIJKokkos;
1314:   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1315:   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1316:   A->ops->mult                      = MatMult_SeqAIJKokkos;
1317:   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1318:   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1319:   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1320:   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1321:   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1322:   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1323:   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1324:   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1325:   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1326:   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1327:   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1328:   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1329:   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1330:   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1331:   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1332:   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

1334:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1335:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1336:   PetscFunctionReturn(PETSC_SUCCESS);
1337: }

1339: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1340: {
1341:   Mat_SeqAIJ *aseq;
1342:   PetscInt    i, m, n;

1344:   PetscFunctionBegin;
1345:   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");

1347:   m = akok->nrows();
1348:   n = akok->ncols();
1349:   PetscCall(MatSetSizes(A, m, n, m, n));
1350:   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));

1352:   /* Set up data structures of A as a MATSEQAIJ */
1353:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1354:   aseq = (Mat_SeqAIJ *)(A)->data;

1356:   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1357:   akok->j_dual.sync_host();

1359:   aseq->i            = akok->i_host_data();
1360:   aseq->j            = akok->j_host_data();
1361:   aseq->a            = akok->a_host_data();
1362:   aseq->nonew        = -1; /* inserting a value that would create a new nonzero location is an error */
1363:   aseq->singlemalloc = PETSC_FALSE;
1364:   aseq->free_a       = PETSC_FALSE;
1365:   aseq->free_ij      = PETSC_FALSE;
1366:   aseq->nz           = akok->nnz();
1367:   aseq->maxnz        = aseq->nz;

1369:   PetscCall(PetscMalloc1(m, &aseq->imax));
1370:   PetscCall(PetscMalloc1(m, &aseq->ilen));
1371:   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];

1373:   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1374:   akok->nonzerostate = A->nonzerostate;
1375:   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1376:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1377:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1378:   PetscFunctionReturn(PETSC_SUCCESS);
1379: }

1381: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1382: {
1383:   PetscFunctionBegin;
1384:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1385:   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1386:   PetscFunctionReturn(PETSC_SUCCESS);
1387: }
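/* Hedged sketch (illustrative, not part of the original file): once the raw KokkosCsrMatrix is
   exposed, it can be handed straight to Kokkos Kernels, e.g. y = A*x via KokkosSparse::spmv().
   ExampleRawSpMV and its views are hypothetical. */
#if 0 /* example only */
static PetscErrorCode ExampleRawSpMV(Mat A)
{
  KokkosCsrMatrix csr;
  PetscScalar     one = 1.0, zero = 0.0;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJKokkosGetKokkosCsrMatrix(A, &csr));
  PetscScalarKokkosView x("x", csr.numCols()), y("y", csr.numRows());
  Kokkos::deep_copy(x, one);
  PetscCallCXX(KokkosSparse::spmv("N", one, csr, x, zero, y)); /* y = A*x */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif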

1389: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1390: {
1391:   Mat_SeqAIJKokkos *akok;
1392:   PetscFunctionBegin;
1393:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1394:   PetscCall(MatCreate(comm, A));
1395:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1396:   PetscFunctionReturn(PETSC_SUCCESS);
1397: }
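/* Hedged sketch (illustrative, not part of the original file): the reverse direction, wrapping
   existing device CSR views in a PETSc Mat. The KokkosSparse::CrsMatrix constructor used here
   takes (label, nrows, ncols, nnz, values, row offsets, column indices); ExampleWrapDeviceCSR
   and its parameters are hypothetical. */
#if 0 /* example only */
static PetscErrorCode ExampleWrapDeviceCSR(PetscInt m, PetscInt n, MatRowMapKokkosView i_d, MatColIdxKokkosView j_d, MatScalarKokkosView a_d, Mat *A)
{
  PetscFunctionBegin;
  KokkosCsrMatrix csr("csr", m, n, a_d.extent(0), a_d, i_d, j_d);
  PetscCall(MatCreateSeqAIJKokkosWithKokkosCsrMatrix(PETSC_COMM_SELF, csr, A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif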

1399: /* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure

1401:    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1402:  */
1403: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1404: {
1405:   PetscFunctionBegin;
1406:   PetscCall(MatCreate(comm, A));
1407:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1408:   PetscFunctionReturn(PETSC_SUCCESS);
1409: }

1411: /*@C
1412:    MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1413:    (the default sequential PETSc format). This matrix will ultimately be handled by
1414:    Kokkos for calculations.

1416:    Collective

1418:    Input Parameters:
1419: +  comm - MPI communicator, set to `PETSC_COMM_SELF`
1420: .  m - number of rows
1421: .  n - number of columns
1422: .  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1423: -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

1425:    Output Parameter:
1426: .  A - the matrix

1428:    Level: intermediate

1430:    Notes:
1431:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1432:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1433:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

1435:    The AIJ format, also called
1436:    compressed row storage, is fully compatible with standard Fortran
1437:    storage.  That is, the stored row and column indices can begin at
1438:    either one (as in Fortran) or zero.

1440:    Specify the preallocated storage with either `nz` or `nnz` (not both).
1441:    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1442:    allocation.

1444: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1445: @*/
1446: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1447: {
1448:   PetscFunctionBegin;
1449:   PetscCall(PetscKokkosInitializeCheck());
1450:   PetscCall(MatCreate(comm, A));
1451:   PetscCall(MatSetSizes(*A, m, n, m, n));
1452:   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1453:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1454:   PetscFunctionReturn(PETSC_SUCCESS);
1455: }
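/* Hedged usage sketch (illustrative, not part of the original file): creating and filling a
   small tridiagonal MATSEQAIJKOKKOS matrix through the standard MatSetValues() path; the final
   MatAssemblyEnd() mirrors the data to device via MatAssemblyEnd_SeqAIJKokkos() above.
   ExampleCreateTridiag is hypothetical. */
#if 0 /* example only */
static PetscErrorCode ExampleCreateTridiag(PetscInt n, Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreateSeqAIJKokkos(PETSC_COMM_SELF, n, n, 3, NULL, A));
  for (PetscInt i = 0; i < n; i++) {
    if (i > 0) PetscCall(MatSetValue(*A, i, i - 1, -1.0, INSERT_VALUES));
    PetscCall(MatSetValue(*A, i, i, 2.0, INSERT_VALUES));
    if (i < n - 1) PetscCall(MatSetValue(*A, i, i + 1, -1.0, INSERT_VALUES));
  }
  PetscCall(MatAssemblyBegin(*A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(*A, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif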

1457: typedef Kokkos::TeamPolicy<>::member_type team_member;
1458: //
1459: // This factorization exploits block-diagonal matrices with "Nf" blocks (Nf is currently unused).
1460: // Use -pc_factor_mat_ordering_type rcm to order/decouple blocks of size N/Nf for this optimization
1461: //
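/* Storage layout assumed by the kernels below, as inferred from their indexing: for row k of
   the factor B, the strictly-lower part L(k,:) occupies ba[bi[k] .. bi[k+1]-1], the diagonal
   entry sits at ba[bdiag[k]], and the strictly-upper part U(k,k+1:end) occupies
   ba[bdiag[k+1]+1 .. bdiag[k]-1] (bdiag[] decreases with k). This matches PETSc's factored
   SeqAIJ format produced by MatLUFactorSymbolic_SeqAIJ(). */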
1462: static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info)
1463: {
1464:   Mat_SeqAIJ       *b      = (Mat_SeqAIJ *)B->data;
1465:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1466:   IS                isrow = b->row, isicol = b->icol;
1467:   const PetscInt   *r_h, *ic_h;
1468:   const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1469:   const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1470:   PetscScalar       *ba_d = baijkok->a_dual.view_device().data();
1471:   PetscBool          row_identity, col_identity;
1472:   PetscInt           nc, Nf = 1, nVec = 32; // nVec should be a parameter; Nf is the batch size (currently unused)

1474:   PetscFunctionBegin;
1475:   PetscCheck(A->rmap->n == A->cmap->n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1476:   PetscCall(MatIsStructurallySymmetric(A, &row_identity));
1477:   PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported");
1478:   PetscCall(ISGetIndices(isrow, &r_h));
1479:   PetscCall(ISGetIndices(isicol, &ic_h));
1480:   PetscCall(ISGetSize(isicol, &nc));
1481:   PetscCall(PetscLogGpuTimeBegin());
1482:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1483:   {
1484: #define KOKKOS_SHARED_LEVEL 1
1485:     using scr_mem_t    = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1486:     using sizet_scr_t  = Kokkos::View<size_t, scr_mem_t>;
1487:     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1488:     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
1489:     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_r_k("r", n);
1490:     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
1491:     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_ic_k("ic", nc);
1492:     size_t                                                                                                               flops_h = 0;
1493:     Kokkos::View<size_t, Kokkos::HostSpace>                                                                              h_flops_k(&flops_h);
1494:     Kokkos::View<size_t>                                                                                                 d_flops_k("flops");
1495:     const int                                                                                                            conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 16*32 = 512
1496:     const int                                                                                                            nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1;
1497:     Kokkos::deep_copy(d_flops_k, h_flops_k);
1498:     Kokkos::deep_copy(d_r_k, h_r_k);
1499:     Kokkos::deep_copy(d_ic_k, h_ic_k);
1500:     // Fill A --> fact
1501:     Kokkos::parallel_for(
1502:       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1503:         const PetscInt  field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
1504:         const PetscInt  nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
1505:         const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1506:         // zero rows of B
1507:         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1508:           PetscInt     nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
1509:           PetscScalar *baL = ba_d + bi_d[rowb];
1510:           PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
1511:           /* zero (unfactored row) */
1512:           for (int j = 0; j < nzbL; j++) baL[j] = 0;
1513:           for (int j = 0; j < nzbU; j++) baU[j] = 0;
1514:         });
1515:         // copy A into B
1516:         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1517:           PetscInt           rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
1518:           const PetscScalar *av    = aa_d + ai_d[rowa];
1519:           const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1520:           /* load in initial (unfactored row) */
1521:           for (int j = 0; j < nza; j++) {
1522:             PetscInt    colb = ic[ajtmp[j]];
1523:             PetscScalar vala = av[j];
1524:             if (colb == rowb) {
1525:               *(ba_d + bdiag_d[rowb]) = vala;
1526:             } else {
1527:               const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1528:               PetscScalar    *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1529:               PetscInt        nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
1530:               for (int j = 0; j < nz; j++) {
1531:                 if (pbj[j] == colb) {
1532:                   pba[j] = vala;
1533:                   set++;
1534:                   break;
1535:                 }
1536:               }
1537: #if !defined(PETSC_HAVE_SYCL)
1538:               if (set != 1) printf("\t\t\t ERROR: failed to insert value from A into B\n");
1539: #else
1540:               (void)set;
1541: #endif
1542:             }
1543:           }
1544:         });
1545:       });
1546:     Kokkos::fence();

1548:     Kokkos::parallel_for(
1549:       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
1550:         sizet_scr_t    colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1551:         scalar_scr_t   L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1552:         sizet_scr_t    flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1553:         const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
1554:         const PetscInt start = field * nloc, end = start + nloc;
1555:         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1556:         // A22 panel update for each row A(1,:) and col A(:,1)
1557:         for (int ii = start; ii < end - 1; ii++) {
1558:           const PetscInt    *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
1559:           const PetscScalar *baUi    = ba_d + bdiag_d[ii + 1] + 1;                                          // vector of data  U(i,i+1:end)
1560:           const PetscInt     nUi_its = nzUi / Ni + !!(nzUi % Ni);
1561:           const PetscScalar  Bii     = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1562:           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
1563:             PetscInt kIdx = j * Ni + field_block_idx;
1564:             if (kIdx >= nzUi) /* void */
1565:               ;
1566:             else {
1567:               const PetscInt  myk = bjUi[kIdx];                // assume symmetric structure; in general we would need transposed meta-data here
1568:               const PetscInt *pjL = bj_d + bi_d[myk];          // look for L(myk,ii) in start of row
1569:               const PetscInt  nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
1570:               size_t          st_idx;
1571:               // find and do L(k,i) = A(k,i) / A(i,i)
1572:               Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1573:               // get column, there has got to be a better way
1574:               Kokkos::parallel_reduce(
1575:                 Kokkos::ThreadVectorRange(team, nzL),
1576:                 [&](const int &j, size_t &idx) {
1577:                   if (pjL[j] == ii) {
1578:                     PetscScalar *pLki = ba_d + bi_d[myk] + j;
1579:                     idx               = j;           // output
1580:                     *pLki             = *pLki / Bii; // column scaling: L(k,i) = A(k,i) / A(i,i)
1581:                   }
1582:                 },
1583:                 st_idx);
1584:               Kokkos::single(Kokkos::PerThread(team), [=]() {
1585:                 colkIdx() = st_idx;
1586:                 L_ki()    = *(ba_d + bi_d[myk] + st_idx);
1587:               });
1588: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1589:               if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
1590: #endif
1591:               // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1592:               // U(i+1,:end)
1593:               Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
1594:                 PetscScalar Uij = baUi[uiIdx];
1595:                 PetscInt    col = bjUi[uiIdx];
1596:                 if (col == myk) {
1597:                   // A_kk = A_kk - L_ki * U_ij(k)
1598:                   PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1599:                   *Akkv             = *Akkv - L_ki() * Uij;  // UiK
1600:                 } else {
1601:                   PetscScalar    *start, *end, *pAkjv = NULL;
1602:                   PetscInt        high, low;
1603:                   const PetscInt *startj;
1604:                   if (col < myk) { // L
1605:                     PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1606:                     PetscInt     idx  = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
1607:                     start             = pLki + 1;                        // start at pLki+1, A22(myk,1)
1608:                     startj            = bj_d + bi_d[myk] + idx;
1609:                     end               = ba_d + bi_d[myk + 1];
1610:                   } else {
1611:                     PetscInt idx = bdiag_d[myk + 1] + 1;
1612:                     start        = ba_d + idx;
1613:                     startj       = bj_d + idx;
1614:                     end          = ba_d + bdiag_d[myk];
1615:                   }
1616:                   // search for 'col': bisect down to a small window, then scan linearly (TODO: full bisection)
1617:                   low  = 0;
1618:                   high = (PetscInt)(end - start);
1619:                   while (high - low > 5) {
1620:                     int t = (low + high) / 2;
1621:                     if (startj[t] > col) high = t;
1622:                     else low = t;
1623:                   }
1624:                   for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
1625:                     if (startj[pAkjv - start] == col) break;
1626:                   }
1627: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1628:                   if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
1629: #endif
1630:                   *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1631:                 }
1632:               });
1633:             }
1634:           });
1635:           team.team_barrier(); // this needs to be a league barrier to use more than one SM per block
1636:           if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
1637:         } /* end of for (ii = start; ii < end - 1; ii++) */
1638:         Kokkos::single(Kokkos::PerTeam(team), [=]() {
1639:           Kokkos::atomic_add(&d_flops_k(), flops());
1640:           flops() = 0;
1641:         });
1642:       });
1643:     Kokkos::fence();
1644:     Kokkos::deep_copy(h_flops_k, d_flops_k);
1645:     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1646:     Kokkos::parallel_for(
1647:       Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
1648:         const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni;                            //, field_offset = lg_rank%Ni;
1649:         const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
1650:         /* Invert diagonal for simpler triangular solves */
1651:         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
1652:           int i = start + outer_index * Ni + lg_rank % Ni;
1653:           if (i < end) {
1654:             PetscScalar *pv = ba_d + bdiag_d[i];
1655:             *pv             = 1.0 / (*pv);
1656:           }
1657:         });
1658:       });
1659:   }
1660:   PetscCall(PetscLogGpuTimeEnd());
1661:   PetscCall(ISRestoreIndices(isicol, &ic_h));
1662:   PetscCall(ISRestoreIndices(isrow, &r_h));

1664:   PetscCall(ISIdentity(isrow, &row_identity));
1665:   PetscCall(ISIdentity(isicol, &col_identity));
1666:   if (b->inode.size) {
1667:     B->ops->solve = MatSolve_SeqAIJ_Inode;
1668:   } else if (row_identity && col_identity) {
1669:     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1670:   } else {
1671:     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1672:   }
1673:   B->offloadmask = PETSC_OFFLOAD_GPU;
1674:   PetscCall(MatSeqAIJKokkosSyncHost(B));          // solve on CPU
1675:   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1676:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1677:   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1678:   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1679:   B->assembled              = PETSC_TRUE;
1680:   B->preallocated           = PETSC_TRUE;

1682:   PetscFunctionReturn(PETSC_SUCCESS);
1683: }

1685: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1686: {
1687:   PetscFunctionBegin;
1688:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1689:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1690:   PetscFunctionReturn(PETSC_SUCCESS);
1691: }

1693: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1694: {
1695:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1697:   PetscFunctionBegin;
1698:   if (!factors->sptrsv_symbolic_completed) {
1699:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1700:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1701:     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1702:   }
1703:   PetscFunctionReturn(PETSC_SUCCESS);
1704: }

1706: /* Check if we need to update factors etc for transpose solve */
1707: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1708: {
1709:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1710:   MatColIdxType               n       = A->rmap->n;

1712:   PetscFunctionBegin;
1713:   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1714:     /* Update L^T and do sptrsv symbolic */
1715:     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1716:     Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires a zero-initialized row map */
1717:     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1718:     factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));

1720:     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1721:                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);

1723:     /* TODO: KK transpose_matrix() does not sort column indices, but cuSPARSE requires sorted indices.
1724:        We have to sort the indices ourselves until KK provides finer control options.
1725:     */
1726:     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);

1728:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);

1730:     /* Update U^T and do sptrsv symbolic */
1731:     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1732:     Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires a zero-initialized row map */
1733:     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1734:     factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));

1736:     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1737:                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);

1739:     /* Sort indices. See comments above */
1740:     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);

1742:     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1743:     factors->transpose_updated = PETSC_TRUE;
1744:   }
1745:   PetscFunctionReturn(PETSC_SUCCESS);
1746: }

1748: /* Solve Ax = b, with A = LU */
1749: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1750: {
1751:   ConstPetscScalarKokkosView  bv;
1752:   PetscScalarKokkosView       xv;
1753:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1755:   PetscFunctionBegin;
1756:   PetscCall(PetscLogGpuTimeBegin());
1757:   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1758:   PetscCall(VecGetKokkosView(b, &bv));
1759:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1760:   /* Solve L tmpv = b */
1761:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1762:   /* Solve Ux = tmpv */
1763:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1764:   PetscCall(VecRestoreKokkosView(b, &bv));
1765:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1766:   PetscCall(PetscLogGpuTimeEnd());
1767:   PetscFunctionReturn(PETSC_SUCCESS);
1768: }

1770: /* Solve A^T x = b, where A^T = U^T L^T */
1771: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1772: {
1773:   ConstPetscScalarKokkosView  bv;
1774:   PetscScalarKokkosView       xv;
1775:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;

1777:   PetscFunctionBegin;
1778:   PetscCall(PetscLogGpuTimeBegin());
1779:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1780:   PetscCall(VecGetKokkosView(b, &bv));
1781:   PetscCall(VecGetKokkosViewWrite(x, &xv));
1782:   /* Solve U^T tmpv = b */
1783:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector));

1785:   /* Solve L^T x = tmpv */
1786:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv));
1787:   PetscCall(VecRestoreKokkosView(b, &bv));
1788:   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1789:   PetscCall(PetscLogGpuTimeEnd());
1790:   PetscFunctionReturn(PETSC_SUCCESS);
1791: }

1793: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1794: {
1795:   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1796:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1797:   PetscInt                    fill_lev = info->levels;

1799:   PetscFunctionBegin;
1800:   PetscCall(PetscLogGpuTimeBegin());
1801:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

1803:   auto a_d = aijkok->a_dual.view_device();
1804:   auto i_d = aijkok->i_dual.view_device();
1805:   auto j_d = aijkok->j_dual.view_device();

1807:   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);

1809:   B->assembled              = PETSC_TRUE;
1810:   B->preallocated           = PETSC_TRUE;
1811:   B->ops->solve             = MatSolve_SeqAIJKokkos;
1812:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1813:   B->ops->matsolve          = NULL;
1814:   B->ops->matsolvetranspose = NULL;
1815:   B->offloadmask            = PETSC_OFFLOAD_GPU;

1817:   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1818:   factors->transpose_updated         = PETSC_FALSE;
1819:   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1820:   /* TODO: log flops, but how do we count them? */
1821:   PetscCall(PetscLogGpuTimeEnd());
1822:   PetscFunctionReturn(PETSC_SUCCESS);
1823: }

1825: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1826: {
1827:   Mat_SeqAIJKokkos           *aijkok;
1828:   Mat_SeqAIJ                 *b;
1829:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1830:   PetscInt                    fill_lev = info->levels;
1831:   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1832:   PetscInt                    n        = A->rmap->n;

1834:   PetscFunctionBegin;
1835:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1836:   /* Rebuild factors */
1837:   if (factors) {
1838:     factors->Destroy();
1839:   } /* Destroy the old if it exists */
1840:   else {
1841:     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1842:   }

1844:   /* Create a spiluk handle and then do symbolic factorization */
1845:   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1846:   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

1848:   auto spiluk_handle = factors->kh.get_spiluk_handle();

1850:   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1851:   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1852:   Kokkos::realloc(factors->iU_d, n + 1);
1853:   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

1855:   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1856:   auto i_d = aijkok->i_dual.view_device();
1857:   auto j_d = aijkok->j_dual.view_device();
1858:   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1859:   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

1861:   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1862:   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1863:   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1864:   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

1866:   /* TODO: add options to select sptrsv algorithms */
1867:   /* Create sptrsv handles for L, U and their transpose */
1868: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1869:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1870: #else
1871:   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1872: #endif

1874:   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1875:   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1876:   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1877:   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

1879:   /* Fill fields of the factor matrix B */
1880:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1881:   b     = (Mat_SeqAIJ *)B->data;
1882:   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1883:   B->info.fill_ratio_given  = info->fill;
1884:   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;

1886:   B->offloadmask          = PETSC_OFFLOAD_GPU;
1887:   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1888:   PetscFunctionReturn(PETSC_SUCCESS);
1889: }

1891: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1892: {
1893:   Mat_SeqAIJ    *b     = (Mat_SeqAIJ *)B->data;
1894:   const PetscInt nrows = A->rmap->n;

1896:   PetscFunctionBegin;
1897:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1898:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1899:   // move B data into Kokkos
1900:   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1901:   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1902:   {
1903:     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1904:     if (!baijkok->diag_d.extent(0)) {
1905:       const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1906:       baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1907:       Kokkos::deep_copy(baijkok->diag_d, h_diag);
1908:     }
1909:   }
1910:   PetscFunctionReturn(PETSC_SUCCESS);
1911: }

1913: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1914: {
1915:   PetscFunctionBegin;
1916:   *type = MATSOLVERKOKKOS;
1917:   PetscFunctionReturn(PETSC_SUCCESS);
1918: }

1920: static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type)
1921: {
1922:   PetscFunctionBegin;
1923:   *type = MATSOLVERKOKKOSDEVICE;
1924:   PetscFunctionReturn(PETSC_SUCCESS);
1925: }

1927: /*MC
1928:   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1929:   of type `MATSEQAIJKOKKOS` or `MATAIJKOKKOS` on a single GPU.

1931:   Level: beginner

1933: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1934: M*/
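/* Hedged usage sketch (illustrative, not part of the original file): selecting this solver for
   an ILU preconditioner, either programmatically as below or on the command line with
   -pc_type ilu -pc_factor_mat_solver_type kokkos. ExampleUseKokkosFactor is hypothetical and
   assumes petscksp.h is included. */
#if 0 /* example only */
static PetscErrorCode ExampleUseKokkosFactor(KSP ksp)
{
  PC pc;

  PetscFunctionBegin;
  PetscCall(KSPGetPC(ksp, &pc));
  PetscCall(PCSetType(pc, PCILU));
  PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERKOKKOS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif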
1935: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1936: {
1937:   PetscInt n = A->rmap->n;

1939:   PetscFunctionBegin;
1940:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1941:   PetscCall(MatSetSizes(*B, n, n, n, n));
1942:   (*B)->factortype = ftype;
1943:   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1944:   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));

1946:   if (ftype == MAT_FACTOR_LU) {
1947:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1948:     (*B)->canuseordering        = PETSC_TRUE;
1949:     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1950:   } else if (ftype == MAT_FACTOR_ILU) {
1951:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1952:     (*B)->canuseordering         = PETSC_FALSE;
1953:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1954:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);

1956:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1957:   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1958:   PetscFunctionReturn(PETSC_SUCCESS);
1959: }

1961: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B)
1962: {
1963:   PetscInt n = A->rmap->n;

1965:   PetscFunctionBegin;
1966:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1967:   PetscCall(MatSetSizes(*B, n, n, n, n));
1968:   (*B)->factortype     = ftype;
1969:   (*B)->canuseordering = PETSC_TRUE;
1970:   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1971:   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));

1973:   if (ftype == MAT_FACTOR_LU) {
1974:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1975:     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1976:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types");

1978:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1979:   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device));
1980:   PetscFunctionReturn(PETSC_SUCCESS);
1981: }

1983: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1984: {
1985:   PetscFunctionBegin;
1986:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1987:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1988:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device));
1989:   PetscFunctionReturn(PETSC_SUCCESS);
1990: }

1992: /* Utility to print out a KokkosCsrMatrix for debugging */
1993: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1994: {
1995:   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1996:   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1997:   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1998:   const PetscInt    *i  = iv.data();
1999:   const PetscInt    *j  = jv.data();
2000:   const PetscScalar *a  = av.data();
2001:   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();

2003:   PetscFunctionBegin;
2004:   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2005:   for (PetscInt k = 0; k < m; k++) {
2006:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2007:     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2008:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2009:   }
2010:   PetscFunctionReturn(PETSC_SUCCESS);
2011: }