Actual source code: aijkok.kokkos.cxx
1: #include <petscvec_kokkos.hpp>
2: #include <petscpkg_version.h>
3: #include <petsc/private/petscimpl.h>
4: #include <petsc/private/sfimpl.h>
5: #include <petscsystypes.h>
6: #include <petscerror.h>
8: #include <Kokkos_Core.hpp>
9: #include <KokkosBlas.hpp>
10: #include <KokkosSparse_CrsMatrix.hpp>
11: #include <KokkosSparse_spmv.hpp>
12: #include <KokkosSparse_spiluk.hpp>
13: #include <KokkosSparse_sptrsv.hpp>
14: #include <KokkosSparse_spgemm.hpp>
15: #include <KokkosSparse_spadd.hpp>
17: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
20: #include <KokkosSparse_Utils.hpp>
21: using KokkosSparse::sort_crs_matrix;
22: using KokkosSparse::Impl::transpose_matrix;
23: #else
24: #include <KokkosKernels_Sorting.hpp>
25: using KokkosKernels::sort_crs_matrix;
26: using KokkosKernels::Impl::transpose_matrix;
27: #endif
29: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
31: /* MatAssemblyEnd_SeqAIJKokkos() is called when we have finalized the nonzeros of the matrix, either after
32: we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
33: In the latter case, it is important to set a_dual's sync state correctly.
34: */
35: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
36: {
37: Mat_SeqAIJ *aijseq;
38: Mat_SeqAIJKokkos *aijkok;
40: PetscFunctionBegin;
41: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
42: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
44: aijseq = static_cast<Mat_SeqAIJ *>(A->data);
45: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
47: /* If aijkok does not exist, we just copy i, j to device.
48: If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
49: In both cases, we build a new aijkok structure.
50: */
51: if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52: delete aijkok;
53: aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
54: A->spptr = aijkok;
55: }
57: if (aijkok->device_mat_d.data()) {
58: A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
59: }
60: PetscFunctionReturn(PETSC_SUCCESS);
61: }
63: /* Sync CSR data to device if not yet */
64: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
65: {
66: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
68: PetscFunctionBegin;
69: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
70: PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
71: if (aijkok->a_dual.need_sync_device()) {
72: aijkok->a_dual.sync_device();
73: aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
74: aijkok->hermitian_updated = PETSC_FALSE;
75: }
76: PetscFunctionReturn(PETSC_SUCCESS);
77: }
79: /* Mark the CSR data on device as modified */
80: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
81: {
82: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
84: PetscFunctionBegin;
85: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
86: aijkok->a_dual.clear_sync_state();
87: aijkok->a_dual.modify_device();
88: aijkok->transpose_updated = PETSC_FALSE;
89: aijkok->hermitian_updated = PETSC_FALSE;
90: PetscCall(MatSeqAIJInvalidateDiagonal(A));
91: PetscCall(PetscObjectStateIncrease((PetscObject)A));
92: PetscFunctionReturn(PETSC_SUCCESS);
93: }
95: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
96: {
97: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
99: PetscFunctionBegin;
100: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
101: /* We do not expect one needs factors on host */
102: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
103: PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
104: aijkok->a_dual.sync_host();
105: PetscFunctionReturn(PETSC_SUCCESS);
106: }
108: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
109: {
110: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
112: PetscFunctionBegin;
113: /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
114: Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
115: reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
116: must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
117: */
118: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
119: aijkok->a_dual.sync_host();
120: *array = aijkok->a_dual.view_host().data();
121: } else { /* Happens when calling MatSetValues on a newly created matrix */
122: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
123: }
124: PetscFunctionReturn(PETSC_SUCCESS);
125: }
127: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
128: {
129: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
131: PetscFunctionBegin;
132: if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
133: PetscFunctionReturn(PETSC_SUCCESS);
134: }
136: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
137: {
138: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
140: PetscFunctionBegin;
141: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
142: aijkok->a_dual.sync_host();
143: *array = aijkok->a_dual.view_host().data();
144: } else {
145: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
146: }
147: PetscFunctionReturn(PETSC_SUCCESS);
148: }
150: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
151: {
152: PetscFunctionBegin;
153: *array = NULL;
154: PetscFunctionReturn(PETSC_SUCCESS);
155: }
157: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
158: {
159: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
161: PetscFunctionBegin;
162: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163: *array = aijkok->a_dual.view_host().data();
164: } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
165: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
166: }
167: PetscFunctionReturn(PETSC_SUCCESS);
168: }
170: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
171: {
172: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
174: PetscFunctionBegin;
175: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
176: aijkok->a_dual.clear_sync_state();
177: aijkok->a_dual.modify_host();
178: }
179: PetscFunctionReturn(PETSC_SUCCESS);
180: }
182: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
183: {
184: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
186: PetscFunctionBegin;
187: PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
189: if (i) *i = aijkok->i_device_data();
190: if (j) *j = aijkok->j_device_data();
191: if (a) {
192: aijkok->a_dual.sync_device();
193: *a = aijkok->a_device_data();
194: }
195: if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
196: PetscFunctionReturn(PETSC_SUCCESS);
197: }
199: // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
200: PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
201: {
202: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203: Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
205: PetscFunctionBegin;
206: PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207: aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
208: Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
209: PetscFunctionReturn(PETSC_SUCCESS);
210: }
212: // MatSeqAIJKokkosGetDeviceMat gets the device matrix if it exists; otherwise it creates a place for it and returns NULL
213: PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
214: {
215: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
217: PetscFunctionBegin;
218: if (aijkok && aijkok->device_mat_d.data()) {
219: *d_mat = aijkok->device_mat_d.data();
220: } else {
221: PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
222: *d_mat = NULL;
223: }
224: PetscFunctionReturn(PETSC_SUCCESS);
225: }
227: /*
228: Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
230: Input Parameter:
231: . A - the MATSEQAIJKOKKOS matrix
233: Output Parameters:
234: + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
235: - T_d - the transpose on device, whose value array is allocated but not initialized
236: */
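/* Illustrative example (not part of the PETSc source) of the arrays built by the routine below:
   for a 2x3 matrix A stored as Ai = {0,2,3}, Aj = {0,2,1}, Aa = {a00,a02,a11} (nz = 3),
   the 3x2 transpose T gets Ti = {0,1,2,3}, Tj = {0,1,0} and perm = {0,2,1},
   so that Ta(k) = Aa(perm(k)) gives Ta = {a00,a11,a02}.
*/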
237: static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
238: {
239: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
240: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
241: const PetscInt *Ai = aseq->i, *Aj = aseq->j;
242: MatRowMapKokkosViewHost Ti_h("Ti", n + 1);
243: MatRowMapType *Ti = Ti_h.data();
244: MatColIdxKokkosViewHost Tj_h("Tj", nz);
245: MatRowMapKokkosViewHost perm_h("permutation", nz);
246: PetscInt *Tj = Tj_h.data();
247: PetscInt *perm = perm_h.data();
248: PetscInt *offset;
250: PetscFunctionBegin;
251: // Populate Ti
252: PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
253: Ti++;
254: for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
255: Ti--;
256: for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
258: // Populate Tj and the permutation array
259: PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
260: for (PetscInt i = 0; i < m; i++) {
261: for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
262: PetscInt r = Aj[j]; // row r of T
263: PetscInt disp = Ti[r] + offset[r];
265: Tj[disp] = i; // col i of T
266: perm[disp] = j;
267: offset[r]++;
268: }
269: }
270: PetscCall(PetscFree(offset));
272: // Sort each row of T, along with the permutation array
273: for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
275: // Output perm and T on device
276: auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
277: auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
278: PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
279: PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
280: PetscFunctionReturn(PETSC_SUCCESS);
281: }
283: // Generate the transpose on device and cache it internally
284: // Note: KK transpose_matrix() does not support separate symbolic/numeric transpose phases, so we do it on our own
285: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
286: {
287: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
288: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
289: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
290: KokkosCsrMatrix &T = akok->csrmatT;
292: PetscFunctionBegin;
293: PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
294: PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
296: const auto &Aa = akok->a_dual.view_device();
298: if (A->symmetric == PETSC_BOOL3_TRUE) {
299: *csrmatT = akok->csrmat;
300: } else {
301: // See if we already have a cached transpose and its value is up to date
302: if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
303: if (!akok->transpose_updated) { // if the value is out of date, update the cached version
304: const auto &perm = akok->transpose_perm; // get the permutation array
305: auto &Ta = T.values;
307: PetscCallCXX(Kokkos::parallel_for(
308: nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
309: }
310: } else { // Generate T of size n x m for the first time
311: MatRowMapKokkosView perm;
313: PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
314: akok->transpose_perm = perm; // cache the perm in this matrix for reuse
315: PetscCallCXX(Kokkos::parallel_for(
316: nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
317: }
318: akok->transpose_updated = PETSC_TRUE;
319: *csrmatT = akok->csrmatT;
320: }
321: PetscFunctionReturn(PETSC_SUCCESS);
322: }
324: // Generate the Hermitian on device and cache it internally
325: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
326: {
327: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
328: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
329: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
330: KokkosCsrMatrix &T = akok->csrmatH;
332: PetscFunctionBegin;
333: PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
334: PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
336: const auto &Aa = akok->a_dual.view_device();
338: if (A->hermitian == PETSC_BOOL3_TRUE) {
339: *csrmatH = akok->csrmat;
340: } else {
341: // See if we already have a cached hermitian and its value is up to date
342: if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatH had been generated before, otherwise T has 0 rows/cols after construction
343: if (!akok->hermitian_updated) { // if the value is out of date, update the cached version
344: const auto &perm = akok->transpose_perm; // get the permutation array
345: auto &Ta = T.values;
347: PetscCallCXX(Kokkos::parallel_for(
348: nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
349: }
350: } else { // Generate T of size n x m for the first time
351: MatRowMapKokkosView perm;
353: PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
354: akok->transpose_perm = perm; // cache the perm in this matrix for reuse
355: PetscCallCXX(Kokkos::parallel_for(
356: nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
357: }
358: akok->hermitian_updated = PETSC_TRUE;
359: *csrmatH = akok->csrmatH;
360: }
361: PetscFunctionReturn(PETSC_SUCCESS);
362: }
364: /* y = A x */
365: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
366: {
367: Mat_SeqAIJKokkos *aijkok;
368: ConstPetscScalarKokkosView xv;
369: PetscScalarKokkosView yv;
371: PetscFunctionBegin;
372: PetscCall(PetscLogGpuTimeBegin());
373: PetscCall(MatSeqAIJKokkosSyncDevice(A));
374: PetscCall(VecGetKokkosView(xx, &xv));
375: PetscCall(VecGetKokkosViewWrite(yy, &yv));
376: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
377: PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
378: PetscCall(VecRestoreKokkosView(xx, &xv));
379: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
380: /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
381: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
382: PetscCall(PetscLogGpuTimeEnd());
383: PetscFunctionReturn(PETSC_SUCCESS);
384: }
386: /* y = A^T x */
387: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
388: {
389: Mat_SeqAIJKokkos *aijkok;
390: const char *mode;
391: ConstPetscScalarKokkosView xv;
392: PetscScalarKokkosView yv;
393: KokkosCsrMatrix csrmat;
395: PetscFunctionBegin;
396: PetscCall(PetscLogGpuTimeBegin());
397: PetscCall(MatSeqAIJKokkosSyncDevice(A));
398: PetscCall(VecGetKokkosView(xx, &xv));
399: PetscCall(VecGetKokkosViewWrite(yy, &yv));
400: if (A->form_explicit_transpose) {
401: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
402: mode = "N";
403: } else {
404: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
405: csrmat = aijkok->csrmat;
406: mode = "T";
407: }
408: PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
409: PetscCall(VecRestoreKokkosView(xx, &xv));
410: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
411: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
412: PetscCall(PetscLogGpuTimeEnd());
413: PetscFunctionReturn(PETSC_SUCCESS);
414: }
416: /* y = A^H x */
417: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
418: {
419: Mat_SeqAIJKokkos *aijkok;
420: const char *mode;
421: ConstPetscScalarKokkosView xv;
422: PetscScalarKokkosView yv;
423: KokkosCsrMatrix csrmat;
425: PetscFunctionBegin;
426: PetscCall(PetscLogGpuTimeBegin());
427: PetscCall(MatSeqAIJKokkosSyncDevice(A));
428: PetscCall(VecGetKokkosView(xx, &xv));
429: PetscCall(VecGetKokkosViewWrite(yy, &yv));
430: if (A->form_explicit_transpose) {
431: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
432: mode = "N";
433: } else {
434: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
435: csrmat = aijkok->csrmat;
436: mode = "C";
437: }
438: PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
439: PetscCall(VecRestoreKokkosView(xx, &xv));
440: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
441: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
442: PetscCall(PetscLogGpuTimeEnd());
443: PetscFunctionReturn(PETSC_SUCCESS);
444: }
446: /* z = A x + y */
447: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
448: {
449: Mat_SeqAIJKokkos *aijkok;
450: ConstPetscScalarKokkosView xv, yv;
451: PetscScalarKokkosView zv;
453: PetscFunctionBegin;
454: PetscCall(PetscLogGpuTimeBegin());
455: PetscCall(MatSeqAIJKokkosSyncDevice(A));
456: PetscCall(VecGetKokkosView(xx, &xv));
457: PetscCall(VecGetKokkosView(yy, &yv));
458: PetscCall(VecGetKokkosViewWrite(zz, &zv));
459: if (zz != yy) Kokkos::deep_copy(zv, yv);
460: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
461: PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
462: PetscCall(VecRestoreKokkosView(xx, &xv));
463: PetscCall(VecRestoreKokkosView(yy, &yv));
464: PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
465: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
466: PetscCall(PetscLogGpuTimeEnd());
467: PetscFunctionReturn(PETSC_SUCCESS);
468: }
470: /* z = A^T x + y */
471: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
472: {
473: Mat_SeqAIJKokkos *aijkok;
474: const char *mode;
475: ConstPetscScalarKokkosView xv, yv;
476: PetscScalarKokkosView zv;
477: KokkosCsrMatrix csrmat;
479: PetscFunctionBegin;
480: PetscCall(PetscLogGpuTimeBegin());
481: PetscCall(MatSeqAIJKokkosSyncDevice(A));
482: PetscCall(VecGetKokkosView(xx, &xv));
483: PetscCall(VecGetKokkosView(yy, &yv));
484: PetscCall(VecGetKokkosViewWrite(zz, &zv));
485: if (zz != yy) Kokkos::deep_copy(zv, yv);
486: if (A->form_explicit_transpose) {
487: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
488: mode = "N";
489: } else {
490: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
491: csrmat = aijkok->csrmat;
492: mode = "T";
493: }
494: PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
495: PetscCall(VecRestoreKokkosView(xx, &xv));
496: PetscCall(VecRestoreKokkosView(yy, &yv));
497: PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
498: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
499: PetscCall(PetscLogGpuTimeEnd());
500: PetscFunctionReturn(PETSC_SUCCESS);
501: }
503: /* z = A^H x + y */
504: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
505: {
506: Mat_SeqAIJKokkos *aijkok;
507: const char *mode;
508: ConstPetscScalarKokkosView xv, yv;
509: PetscScalarKokkosView zv;
510: KokkosCsrMatrix csrmat;
512: PetscFunctionBegin;
513: PetscCall(PetscLogGpuTimeBegin());
514: PetscCall(MatSeqAIJKokkosSyncDevice(A));
515: PetscCall(VecGetKokkosView(xx, &xv));
516: PetscCall(VecGetKokkosView(yy, &yv));
517: PetscCall(VecGetKokkosViewWrite(zz, &zv));
518: if (zz != yy) Kokkos::deep_copy(zv, yv);
519: if (A->form_explicit_transpose) {
520: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
521: mode = "N";
522: } else {
523: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
524: csrmat = aijkok->csrmat;
525: mode = "C";
526: }
527: PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
528: PetscCall(VecRestoreKokkosView(xx, &xv));
529: PetscCall(VecRestoreKokkosView(yy, &yv));
530: PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
531: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
532: PetscCall(PetscLogGpuTimeEnd());
533: PetscFunctionReturn(PETSC_SUCCESS);
534: }
536: PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
537: {
538: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
540: PetscFunctionBegin;
541: switch (op) {
542: case MAT_FORM_EXPLICIT_TRANSPOSE:
543: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
544: if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
545: A->form_explicit_transpose = flg;
546: break;
547: default:
548: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
549: break;
550: }
551: PetscFunctionReturn(PETSC_SUCCESS);
552: }
554: /* Depending on reuse, either build a new mat, or use the existing mat */
555: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
556: {
557: Mat_SeqAIJ *aseq;
559: PetscFunctionBegin;
560: PetscCall(PetscKokkosInitializeCheck());
561: if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
562: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */
563: } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
564: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
565: } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
566: PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
567: PetscCall(PetscFree(A->defaultvectype));
568: PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
569: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
570: PetscCall(MatSetOps_SeqAIJKokkos(A));
571: aseq = static_cast<Mat_SeqAIJ *>(A->data);
572: if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
573: PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
574: A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
575: }
576: }
577: PetscFunctionReturn(PETSC_SUCCESS);
578: }
580: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
581: an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
582: */
583: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
584: {
585: Mat_SeqAIJ *bseq;
586: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
587: Mat mat;
589: PetscFunctionBegin;
590: /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
591: PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
592: mat = *B;
593: if (A->assembled) {
594: bseq = static_cast<Mat_SeqAIJ *>(mat->data);
595: bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
596: bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
597: /* Now copy values to B if needed */
598: if (dupOption == MAT_COPY_VALUES) {
599: if (akok->a_dual.need_sync_device()) {
600: Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
601: bkok->a_dual.modify_host();
602: } else { /* If device has the latest data, we only copy data on device */
603: Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
604: bkok->a_dual.modify_device();
605: }
606: } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
607: /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
608: bkok->a_dual.modify_host();
609: }
610: mat->spptr = bkok;
611: }
613: PetscCall(PetscFree(mat->defaultvectype));
614: PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
615: PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
616: PetscCall(MatSetOps_SeqAIJKokkos(mat));
617: PetscFunctionReturn(PETSC_SUCCESS);
618: }
620: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
621: {
622: Mat At;
623: KokkosCsrMatrix internT;
624: Mat_SeqAIJKokkos *atkok, *bkok;
626: PetscFunctionBegin;
627: if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
628: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
629: if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
630: /* Deep copy internT, as we want to isolate the internal transpose */
631: PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
632: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
633: if (reuse == MAT_INITIAL_MATRIX) *B = At;
634: else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
635: } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
636: if ((*B)->assembled) {
637: bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
638: PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
639: PetscCall(MatSeqAIJKokkosModifyDevice(*B));
640: } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
641: Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
642: MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
643: MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
644: PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
645: PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
646: } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
647: }
648: PetscFunctionReturn(PETSC_SUCCESS);
649: }
651: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
652: {
653: Mat_SeqAIJKokkos *aijkok;
655: PetscFunctionBegin;
656: if (A->factortype == MAT_FACTOR_NONE) {
657: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
658: delete aijkok;
659: } else {
660: delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
661: }
662: A->spptr = NULL;
663: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
664: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
665: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
666: PetscCall(MatDestroy_SeqAIJ(A));
667: PetscFunctionReturn(PETSC_SUCCESS);
668: }
670: /*MC
671: MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
673: A matrix type using the Kokkos-Kernels CrsMatrix type for portability across different device types
675: Options Database Key:
676: . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
678: Level: beginner
680: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
681: M*/
682: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
683: {
684: PetscFunctionBegin;
685: PetscCall(PetscKokkosInitializeCheck());
686: PetscCall(MatCreate_SeqAIJ(A));
687: PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
688: PetscFunctionReturn(PETSC_SUCCESS);
689: }
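/* Illustrative usage sketch (not part of this file): the type is normally selected through the
   generic Mat API rather than by calling MatCreate_SeqAIJKokkos() directly; error checking omitted.

     Mat A;
     MatCreate(PETSC_COMM_SELF, &A);
     MatSetSizes(A, 10, 10, 10, 10);
     MatSetType(A, MATSEQAIJKOKKOS);        // or -mat_type aijkokkos with MatSetFromOptions()
     MatSeqAIJSetPreallocation(A, 3, NULL);
     // ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd() as with any AIJ matrix ...
*/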
691: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
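/* Illustration (not from the source): if A is 2x2 with rows {a00,a01} and {a11}, and B is 2x1 with
   rows {b00} and {b10}, then C is 2x3 where row i of C is row i of A followed by row i of B, with
   B's column indices shifted by A->cmap->n (here 2): row 0 = {a00,a01,b00}, row 1 = {a11,b10}.
*/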
692: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
693: {
694: Mat_SeqAIJ *a, *b;
695: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
696: MatScalarKokkosView aa, ba, ca;
697: MatRowMapKokkosView ai, bi, ci;
698: MatColIdxKokkosView aj, bj, cj;
699: PetscInt m, n, nnz, aN;
701: PetscFunctionBegin;
705: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
706: PetscCheckTypeName(B, MATSEQAIJKOKKOS);
707: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
708: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
710: PetscCall(MatSeqAIJKokkosSyncDevice(A));
711: PetscCall(MatSeqAIJKokkosSyncDevice(B));
712: a = static_cast<Mat_SeqAIJ *>(A->data);
713: b = static_cast<Mat_SeqAIJ *>(B->data);
714: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
715: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
716: aa = akok->a_dual.view_device();
717: ai = akok->i_dual.view_device();
718: ba = bkok->a_dual.view_device();
719: bi = bkok->i_dual.view_device();
720: m = A->rmap->n; /* M, N and nnz of C */
721: n = A->cmap->n + B->cmap->n;
722: nnz = a->nz + b->nz;
723: aN = A->cmap->n; /* N of A */
724: if (reuse == MAT_INITIAL_MATRIX) {
725: aj = akok->j_dual.view_device();
726: bj = bkok->j_dual.view_device();
727: auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
728: auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
729: auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
730: ca = ca_dual.view_device();
731: ci = ci_dual.view_device();
732: cj = cj_dual.view_device();
734: /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
735: Kokkos::parallel_for(
736: Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
737: PetscInt i = t.league_rank(); /* row i */
738: PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
740: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
741: ci(i) = coffset;
742: if (i == m - 1) ci(m) = ai(m) + bi(m);
743: });
745: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
746: if (k < alen) {
747: ca(coffset + k) = aa(ai(i) + k);
748: cj(coffset + k) = aj(ai(i) + k);
749: } else {
750: ca(coffset + k) = ba(bi(i) + k - alen);
751: cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
752: }
753: });
754: });
755: ca_dual.modify_device();
756: ci_dual.modify_device();
757: cj_dual.modify_device();
758: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
759: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
760: } else if (reuse == MAT_REUSE_MATRIX) {
762: PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
763: ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
764: ca = ckok->a_dual.view_device();
765: ci = ckok->i_dual.view_device();
767: Kokkos::parallel_for(
768: Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
769: PetscInt i = t.league_rank(); /* row i */
770: PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
771: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
772: if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
773: else ca(ci(i) + k) = ba(bi(i) + k - alen);
774: });
775: });
776: PetscCall(MatSeqAIJKokkosModifyDevice(*C));
777: }
778: PetscFunctionReturn(PETSC_SUCCESS);
779: }
781: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
782: {
783: PetscFunctionBegin;
784: delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
785: PetscFunctionReturn(PETSC_SUCCESS);
786: }
788: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
789: {
790: Mat_Product *product = C->product;
791: Mat A, B;
792: bool transA, transB; /* use bool, since KK needs this type */
793: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
794: Mat_SeqAIJ *c;
795: MatProductData_SeqAIJKokkos *pdata;
796: KokkosCsrMatrix csrmatA, csrmatB;
798: PetscFunctionBegin;
799: MatCheckProduct(C, 1);
800: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
801: pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
803: // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
805: // If yes, skip the numeric, but reset the flag so that the next time the user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
805: // we still do numeric.
806: if (pdata->reusesym) { // numeric reuses results from symbolic
807: pdata->reusesym = PETSC_FALSE;
808: PetscFunctionReturn(PETSC_SUCCESS);
809: }
811: switch (product->type) {
812: case MATPRODUCT_AB:
813: transA = false;
814: transB = false;
815: break;
816: case MATPRODUCT_AtB:
817: transA = true;
818: transB = false;
819: break;
820: case MATPRODUCT_ABt:
821: transA = false;
822: transB = true;
823: break;
824: default:
825: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
826: }
828: A = product->A;
829: B = product->B;
830: PetscCall(MatSeqAIJKokkosSyncDevice(A));
831: PetscCall(MatSeqAIJKokkosSyncDevice(B));
832: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
833: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
834: ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
836: PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
838: csrmatA = akok->csrmat;
839: csrmatB = bkok->csrmat;
841: /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
842: if (transA) {
843: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
844: transA = false;
845: }
847: if (transB) {
848: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
849: transB = false;
850: }
851: PetscCall(PetscLogGpuTimeBegin());
852: PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
853: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
854: auto spgemmHandle = pdata->kh.get_spgemm_handle();
855: if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
856: #endif
858: PetscCall(PetscLogGpuTimeEnd());
859: PetscCall(MatSeqAIJKokkosModifyDevice(C));
860: /* shorter version of MatAssemblyEnd_SeqAIJ */
861: c = (Mat_SeqAIJ *)C->data;
862: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
863: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
864: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
865: c->reallocs = 0;
866: C->info.mallocs = 0;
867: C->info.nz_unneeded = 0;
868: C->assembled = C->was_assembled = PETSC_TRUE;
869: C->num_ass++;
870: PetscFunctionReturn(PETSC_SUCCESS);
871: }
873: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
874: {
875: Mat_Product *product = C->product;
876: MatProductType ptype;
877: Mat A, B;
878: bool transA, transB;
879: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
880: MatProductData_SeqAIJKokkos *pdata;
881: MPI_Comm comm;
882: KokkosCsrMatrix csrmatA, csrmatB, csrmatC;
884: PetscFunctionBegin;
885: MatCheckProduct(C, 1);
886: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
887: PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
888: A = product->A;
889: B = product->B;
890: PetscCall(MatSeqAIJKokkosSyncDevice(A));
891: PetscCall(MatSeqAIJKokkosSyncDevice(B));
892: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
893: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
894: csrmatA = akok->csrmat;
895: csrmatB = bkok->csrmat;
897: ptype = product->type;
898: // Take advantage of the symmetry if true
899: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
900: ptype = MATPRODUCT_AB;
901: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
902: }
903: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
904: ptype = MATPRODUCT_AB;
905: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
906: }
908: switch (ptype) {
909: case MATPRODUCT_AB:
910: transA = false;
911: transB = false;
912: break;
913: case MATPRODUCT_AtB:
914: transA = true;
915: transB = false;
916: break;
917: case MATPRODUCT_ABt:
918: transA = false;
919: transB = true;
920: break;
921: default:
922: SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
923: }
924: PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
925: pdata->reusesym = product->api_user;
927: /* TODO: add command line options to select spgemm algorithms */
928: auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
930: /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
931: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
932: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
933: spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
934: #endif
935: #endif
936: PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
938: PetscCall(PetscLogGpuTimeBegin());
939: /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
940: if (transA) {
941: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
942: transA = false;
943: }
945: if (transB) {
946: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
947: transB = false;
948: }
950: PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
951: /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
952: So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
953: calling new Mat_SeqAIJKokkos().
954: TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
955: */
956: PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
957: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
958: /* Query if KK outputs a sorted matrix. If not, we need to sort it */
959: auto spgemmHandle = pdata->kh.get_spgemm_handle();
960: if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
961: #endif
962: PetscCall(PetscLogGpuTimeEnd());
964: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
965: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
966: C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
967: PetscFunctionReturn(PETSC_SUCCESS);
968: }
970: /* handles sparse matrix matrix ops */
971: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
972: {
973: Mat_Product *product = mat->product;
974: PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
976: PetscFunctionBegin;
977: MatCheckProduct(mat, 1);
978: PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
979: if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
980: if (Biskok && Ciskok) {
981: switch (product->type) {
982: case MATPRODUCT_AB:
983: case MATPRODUCT_AtB:
984: case MATPRODUCT_ABt:
985: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
986: break;
987: case MATPRODUCT_PtAP:
988: case MATPRODUCT_RARt:
989: case MATPRODUCT_ABC:
990: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
991: break;
992: default:
993: break;
994: }
995: } else { /* fallback for AIJ */
996: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
997: }
998: PetscFunctionReturn(PETSC_SUCCESS);
999: }
1001: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1002: {
1003: Mat_SeqAIJKokkos *aijkok;
1005: PetscFunctionBegin;
1006: PetscCall(PetscLogGpuTimeBegin());
1007: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1008: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1009: KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1010: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1011: PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1012: PetscCall(PetscLogGpuTimeEnd());
1013: PetscFunctionReturn(PETSC_SUCCESS);
1014: }
1016: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1017: {
1018: Mat_SeqAIJKokkos *aijkok;
1020: PetscFunctionBegin;
1021: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1022: if (aijkok) { /* Only zero the device if data is already there */
1023: KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
1024: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1025: } else { /* Might be preallocated but not assembled */
1026: PetscCall(MatZeroEntries_SeqAIJ(A));
1027: }
1028: PetscFunctionReturn(PETSC_SUCCESS);
1029: }
1031: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1032: {
1033: Mat_SeqAIJ *aijseq;
1034: Mat_SeqAIJKokkos *aijkok;
1035: PetscInt n;
1036: PetscScalarKokkosView xv;
1038: PetscFunctionBegin;
1039: PetscCall(VecGetLocalSize(x, &n));
1040: PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1041: PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1043: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1044: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1046: if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
1047: PetscCall(MatMarkDiagonal_SeqAIJ(A));
1048: aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1049: aijkok->SetDiagonal(aijseq->diag);
1050: }
1052: const auto &Aa = aijkok->a_dual.view_device();
1053: const auto &Ai = aijkok->i_dual.view_device();
1054: const auto &Adiag = aijkok->diag_dual.view_device();
1056: PetscCall(VecGetKokkosViewWrite(x, &xv));
1057: Kokkos::parallel_for(
1058: n, KOKKOS_LAMBDA(const PetscInt i) {
1059: if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1060: else xv(i) = 0;
1061: });
1062: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1063: PetscFunctionReturn(PETSC_SUCCESS);
1064: }
1066: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1067: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1068: {
1069: Mat_SeqAIJKokkos *aijkok;
1071: PetscFunctionBegin;
1074: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1075: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1076: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1077: *kv = aijkok->a_dual.view_device();
1078: PetscFunctionReturn(PETSC_SUCCESS);
1079: }
1081: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1082: {
1083: PetscFunctionBegin;
1086: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1087: PetscFunctionReturn(PETSC_SUCCESS);
1088: }
1090: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1091: {
1092: Mat_SeqAIJKokkos *aijkok;
1094: PetscFunctionBegin;
1097: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1098: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1099: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1100: *kv = aijkok->a_dual.view_device();
1101: PetscFunctionReturn(PETSC_SUCCESS);
1102: }
1104: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1105: {
1106: PetscFunctionBegin;
1109: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1110: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1111: PetscFunctionReturn(PETSC_SUCCESS);
1112: }
1114: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1115: {
1116: Mat_SeqAIJKokkos *aijkok;
1118: PetscFunctionBegin;
1121: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1122: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1123: *kv = aijkok->a_dual.view_device();
1124: PetscFunctionReturn(PETSC_SUCCESS);
1125: }
1127: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1128: {
1129: PetscFunctionBegin;
1132: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1133: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1134: PetscFunctionReturn(PETSC_SUCCESS);
1135: }
1137: /* Computes Y += alpha X */
1138: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1139: {
1140: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1141: Mat_SeqAIJKokkos *xkok, *ykok, *zkok;
1142: ConstMatScalarKokkosView Xa;
1143: MatScalarKokkosView Ya;
1145: PetscFunctionBegin;
1146: PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1147: PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1148: PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1149: PetscCall(MatSeqAIJKokkosSyncDevice(X));
1150: PetscCall(PetscLogGpuTimeBegin());
1152: if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1153: PetscBool e;
1154: PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1155: if (e) {
1156: PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1157: if (e) pattern = SAME_NONZERO_PATTERN;
1158: }
1159: }
1161: /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew the sparsity pattern of C, one could skip
1162: cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz() and directly call cusparseScsrgeam2().
1163: If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1164: KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1165: */
1166: ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1167: xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1168: Xa = xkok->a_dual.view_device();
1169: Ya = ykok->a_dual.view_device();
1171: if (pattern == SAME_NONZERO_PATTERN) {
1172: KokkosBlas::axpy(alpha, Xa, Ya);
1173: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1174: } else if (pattern == SUBSET_NONZERO_PATTERN) {
1175: MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1176: MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1178: Kokkos::parallel_for(
1179: Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1180: PetscInt i = t.league_rank(); // row i
1181: Kokkos::single(Kokkos::PerTeam(t), [=]() {
1182: // Only one thread works in a team
1183: PetscInt p, q = Yi(i);
1184: for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X,
1185: while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1186: if (Xj(p) == Yj(q)) { // Found it
1187: Ya(q) += alpha * Xa(p);
1188: q++;
1189: } else {
1190: // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1191: // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1192: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1193: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1194: #else
1195: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1196: #endif
1197: }
1198: }
1199: });
1200: });
1201: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1202: } else { // different nonzero patterns
1203: Mat Z;
1204: KokkosCsrMatrix zcsr;
1205: KernelHandle kh;
1206: kh.create_spadd_handle(true); // X, Y are sorted
1207: KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1208: KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1209: zkok = new Mat_SeqAIJKokkos(zcsr);
1210: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1211: PetscCall(MatHeaderReplace(Y, &Z));
1212: kh.destroy_spadd_handle();
1213: }
1214: PetscCall(PetscLogGpuTimeEnd());
1215: PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1216: PetscFunctionReturn(PETSC_SUCCESS);
1217: }
1219: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1220: {
1221: Mat_SeqAIJKokkos *akok;
1222: Mat_SeqAIJ *aseq;
1224: PetscFunctionBegin;
1225: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1226: aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1227: akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1228: delete akok;
1229: mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1230: PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1231: akok->SetUpCOO(aseq);
1232: PetscFunctionReturn(PETSC_SUCCESS);
1233: }
1235: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1236: {
1237: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1238: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1239: PetscCount Annz = aseq->nz;
1240: const PetscCountKokkosView &jmap = akok->jmap_d;
1241: const PetscCountKokkosView &perm = akok->perm_d;
1242: MatScalarKokkosView Aa;
1243: ConstMatScalarKokkosView kv;
1244: PetscMemType memtype;
1246: PetscFunctionBegin;
1247: PetscCall(PetscGetMemType(v, &memtype));
1248: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on host, copy it to device if needed */
1249: kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1250: } else {
1251: kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1252: }
1254: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1255: else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */
1257: Kokkos::parallel_for(
1258: Annz, KOKKOS_LAMBDA(const PetscCount i) {
1259: PetscScalar sum = 0.0;
1260: for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1261: Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1262: });
1264: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1265: else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1266: PetscFunctionReturn(PETSC_SUCCESS);
1267: }
1269: PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1270: {
1271: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1272: MatScalarKokkosView Aa;
1273: const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1274: PetscInt m = A->rmap->n;
1275: ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */
1277: PetscFunctionBegin;
1278: PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
1279: Kokkos::parallel_for(
1280: m, KOKKOS_LAMBDA(const PetscInt i) {
1281: PetscScalar tmp;
1282: if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1283: tmp = Aa(Ai(i));
1284: Aa(Ai(i)) = Aa(Adiag(i));
1285: Aa(Adiag(i)) = tmp;
1286: }
1287: });
1288: PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1289: PetscFunctionReturn(PETSC_SUCCESS);
1290: }
1292: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1293: {
1294: PetscFunctionBegin;
1295: PetscCall(MatSeqAIJKokkosSyncHost(A));
1296: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1297: B->offloadmask = PETSC_OFFLOAD_CPU;
1298: PetscFunctionReturn(PETSC_SUCCESS);
1299: }
1301: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1302: {
1303: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1305: PetscFunctionBegin;
1306: A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1307: A->boundtocpu = PETSC_FALSE;
1309: A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos;
1310: A->ops->destroy = MatDestroy_SeqAIJKokkos;
1311: A->ops->duplicate = MatDuplicate_SeqAIJKokkos;
1312: A->ops->axpy = MatAXPY_SeqAIJKokkos;
1313: A->ops->scale = MatScale_SeqAIJKokkos;
1314: A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos;
1315: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos;
1316: A->ops->mult = MatMult_SeqAIJKokkos;
1317: A->ops->multadd = MatMultAdd_SeqAIJKokkos;
1318: A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos;
1319: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos;
1320: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos;
1321: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1322: A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1323: A->ops->transpose = MatTranspose_SeqAIJKokkos;
1324: A->ops->setoption = MatSetOption_SeqAIJKokkos;
1325: A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos;
1326: a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos;
1327: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos;
1328: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1329: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1330: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1331: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1332: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1334: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1335: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1336: PetscFunctionReturn(PETSC_SUCCESS);
1337: }
1339: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1340: {
1341: Mat_SeqAIJ *aseq;
1342: PetscInt i, m, n;
1344: PetscFunctionBegin;
1345: PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1347: m = akok->nrows();
1348: n = akok->ncols();
1349: PetscCall(MatSetSizes(A, m, n, m, n));
1350: PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1352: /* Set up data structures of A as a MATSEQAIJ */
1353: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1354: aseq = (Mat_SeqAIJ *)(A)->data;
1356: akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1357: akok->j_dual.sync_host();
1359: aseq->i = akok->i_host_data();
1360: aseq->j = akok->j_host_data();
1361: aseq->a = akok->a_host_data();
1362: aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1363: aseq->singlemalloc = PETSC_FALSE;
1364: aseq->free_a = PETSC_FALSE;
1365: aseq->free_ij = PETSC_FALSE;
1366: aseq->nz = akok->nnz();
1367: aseq->maxnz = aseq->nz;
1369: PetscCall(PetscMalloc1(m, &aseq->imax));
1370: PetscCall(PetscMalloc1(m, &aseq->ilen));
1371: for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1373: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1374: akok->nonzerostate = A->nonzerostate;
1375: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1376: PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1377: PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1378: PetscFunctionReturn(PETSC_SUCCESS);
1379: }
1381: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1382: {
1383: PetscFunctionBegin;
1384: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1385: *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1386: PetscFunctionReturn(PETSC_SUCCESS);
1387: }
1389: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1390: {
1391: Mat_SeqAIJKokkos *akok;
1392: PetscFunctionBegin;
1393: PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1394: PetscCall(MatCreate(comm, A));
1395: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1396: PetscFunctionReturn(PETSC_SUCCESS);
1397: }
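/* A minimal usage sketch (not part of this file), assuming `csr` is a KokkosCsrMatrix that has already been
   populated on the device (for example by a KokkosKernels routine): wrap it in a PETSc matrix with the helper above.

     Mat B;
     PetscCall(MatCreateSeqAIJKokkosWithKokkosCsrMatrix(PETSC_COMM_SELF, csr, &B));
     // ... use B through the usual Mat interface (MatMult() etc.) ...
     PetscCall(MatDestroy(&B));
*/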
1399: /* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1401: Note: since names like MatSeqAIJSetPreallocationCSR already exist, this routine keeps the capitalized CSR
1402: */
1403: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1404: {
1405: PetscFunctionBegin;
1406: PetscCall(MatCreate(comm, A));
1407: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1408: PetscFunctionReturn(PETSC_SUCCESS);
1409: }
1411: /*@C
1412: MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1413: (the default sequential PETSc format). This matrix will ultimately be handled by
1414: Kokkos for calculations.
1416: Collective
1418: Input Parameters:
1419: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1420: . m - number of rows
1421: . n - number of columns
1422: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1423: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1425: Output Parameter:
1426: . A - the matrix
1428: Level: intermediate
1430: Notes:
1431: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1432: MatXXXXSetPreallocation() paradigm instead of this routine directly.
1433: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1435: The AIJ format, also called
1436: compressed row storage, is fully compatible with standard Fortran
1437: storage. That is, the stored row and column indices can begin at
1438: either one (as in Fortran) or zero.
1440: Specify the preallocated storage with either `nz` or `nnz` (not both).
1441: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1442: allocation.
1444: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1445: @*/
1446: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1447: {
1448: PetscFunctionBegin;
1449: PetscCall(PetscKokkosInitializeCheck());
1450: PetscCall(MatCreate(comm, A));
1451: PetscCall(MatSetSizes(*A, m, n, m, n));
1452: PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1453: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1454: PetscFunctionReturn(PETSC_SUCCESS);
1455: }
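/* A minimal calling sketch (not part of this file) for MatCreateSeqAIJKokkos() above; the sizes and the
   per-row nonzero estimate are arbitrary illustration values.

     Mat A;
     PetscCall(MatCreateSeqAIJKokkos(PETSC_COMM_SELF, 10, 10, 3, NULL, &A));
     PetscCall(MatSetValue(A, 0, 0, 1.0, INSERT_VALUES));
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatDestroy(&A));
*/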
1457: typedef Kokkos::TeamPolicy<>::member_type team_member;
1458: //
1459: // This factorization is designed to exploit block-diagonal matrices with "Nf" blocks (Nf is currently unused).
1460: // Use -pc_factor_mat_ordering_type rcm to order and decouple blocks of size N/Nf for this optimization.
1461: //
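// A hedged options sketch (not part of this file), assuming a KSP `ksp` has been created and given a
// MATSEQAIJKOKKOS operator: select an LU factorization with the RCM ordering mentioned above, either
// programmatically,
//
//   PC pc;
//   PetscCall(KSPGetPC(ksp, &pc));
//   PetscCall(PCSetType(pc, PCLU));
//   PetscCall(PCFactorSetMatOrderingType(pc, MATORDERINGRCM));
//
// or on the command line with -pc_type lu -pc_factor_mat_ordering_type rcm, combined with the
// -pc_factor_mat_solver_type value of the solver registered at the bottom of this file.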
1462: static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info)
1463: {
1464: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1465: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1466: IS isrow = b->row, isicol = b->icol;
1467: const PetscInt *r_h, *ic_h;
1468: const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1469: const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1470: PetscScalar *ba_d = baijkok->a_dual.view_device().data();
1471: PetscBool row_identity, col_identity;
1472: PetscInt nc, Nf = 1, nVec = 32; // nVec should be a parameter; Nf is the batch size (currently unused)
1474: PetscFunctionBegin;
1475: PetscCheck(A->cmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->cmap->n, n);
1476: PetscCall(MatIsStructurallySymmetric(A, &row_identity));
1477: PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported");
1478: PetscCall(ISGetIndices(isrow, &r_h));
1479: PetscCall(ISGetIndices(isicol, &ic_h));
1480: PetscCall(ISGetSize(isicol, &nc));
1481: PetscCall(PetscLogGpuTimeBegin());
1482: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1483: {
1484: #define KOKKOS_SHARED_LEVEL 1
1485: using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1486: using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1487: using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1488: const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
1489: Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n);
1490: const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
1491: Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc);
1492: size_t flops_h = 0;
1493: Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h);
1494: Kokkos::View<size_t> d_flops_k("flops");
1495: const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // team_size*nVec = 16*32 = 512
1496: const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1;
1497: Kokkos::deep_copy(d_flops_k, h_flops_k);
1498: Kokkos::deep_copy(d_r_k, h_r_k);
1499: Kokkos::deep_copy(d_ic_k, h_ic_k);
1500: // Fill A --> fact
1501: Kokkos::parallel_for(
1502: Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1503: const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
1504: const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
1505: const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1506: // zero rows of B
1507: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1508: PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
1509: PetscScalar *baL = ba_d + bi_d[rowb];
1510: PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
1511: /* zero (unfactored row) */
1512: for (int j = 0; j < nzbL; j++) baL[j] = 0;
1513: for (int j = 0; j < nzbU; j++) baU[j] = 0;
1514: });
1515: // copy A into B
1516: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1517: PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
1518: const PetscScalar *av = aa_d + ai_d[rowa];
1519: const PetscInt *ajtmp = aj_d + ai_d[rowa];
1520: /* load in initial (unfactored row) */
1521: for (int j = 0; j < nza; j++) {
1522: PetscInt colb = ic[ajtmp[j]];
1523: PetscScalar vala = av[j];
1524: if (colb == rowb) {
1525: *(ba_d + bdiag_d[rowb]) = vala;
1526: } else {
1527: const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1528: PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1529: PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
1530: for (int j = 0; j < nz; j++) {
1531: if (pbj[j] == colb) {
1532: pba[j] = vala;
1533: set++;
1534: break;
1535: }
1536: }
1537: #if !defined(PETSC_HAVE_SYCL)
1538: if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1539: #else
1540: (void)set;
1541: #endif
1542: }
1543: }
1544: });
1545: });
1546: Kokkos::fence();
1548: Kokkos::parallel_for(
1549: Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
1550: sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1551: scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1552: sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1553: const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
1554: const PetscInt start = field * nloc, end = start + nloc;
1555: Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1556: // A22 panel update for each row A(1,:) and col A(:,1)
1557: for (int ii = start; ii < end - 1; ii++) {
1558: const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
1559: const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end)
1560: const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni);
1561: const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1562: Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
1563: PetscInt kIdx = j * Ni + field_block_idx;
1564: if (kIdx >= nzUi) /* void */
1565: ;
1566: else {
1567: const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1568: const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1569: const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
1570: size_t st_idx;
1571: // find and do L(k,i) = A(:k,i) / A(i,i)
1572: Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1573: // get column, there has got to be a better way
1574: Kokkos::parallel_reduce(
1575: Kokkos::ThreadVectorRange(team, nzL),
1576: [&](const int &j, size_t &idx) {
1577: if (pjL[j] == ii) {
1578: PetscScalar *pLki = ba_d + bi_d[myk] + j;
1579: idx = j; // output
1580: *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i)
1581: }
1582: },
1583: st_idx);
1584: Kokkos::single(Kokkos::PerThread(team), [=]() {
1585: colkIdx() = st_idx;
1586: L_ki() = *(ba_d + bi_d[myk] + st_idx);
1587: });
1588: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1589: if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
1590: #endif
1591: // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i
1592: // U(i+1,:end)
1593: Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
1594: PetscScalar Uij = baUi[uiIdx];
1595: PetscInt col = bjUi[uiIdx];
1596: if (col == myk) {
1597: // A_kk = A_kk - L_ki * U_ij(k)
1598: PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1599: *Akkv = *Akkv - L_ki() * Uij; // UiK
1600: } else {
1601: PetscScalar *start, *end, *pAkjv = NULL;
1602: PetscInt high, low;
1603: const PetscInt *startj;
1604: if (col < myk) { // L
1605: PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1606: PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
1607: start = pLki + 1; // start at pLki+1, A22(myk,1)
1608: startj = bj_d + bi_d[myk] + idx;
1609: end = ba_d + bi_d[myk + 1];
1610: } else {
1611: PetscInt idx = bdiag_d[myk + 1] + 1;
1612: start = ba_d + idx;
1613: startj = bj_d + idx;
1614: end = ba_d + bdiag_d[myk];
1615: }
1616: // search for 'col', use bisection search - TODO
1617: low = 0;
1618: high = (PetscInt)(end - start);
1619: while (high - low > 5) {
1620: int t = (low + high) / 2;
1621: if (startj[t] > col) high = t;
1622: else low = t;
1623: }
1624: for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
1625: if (startj[pAkjv - start] == col) break;
1626: }
1627: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1628: if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
1629: #endif
1630: *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1631: }
1632: });
1633: }
1634: });
1635: team.team_barrier(); // this needs to be a league barrier to use more than one SM per block
1636: if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
1637: } /* end of for (ii = start; ii < end - 1; ii++) */
1638: Kokkos::single(Kokkos::PerTeam(team), [=]() {
1639: Kokkos::atomic_add(&d_flops_k(), flops());
1640: flops() = 0;
1641: });
1642: });
1643: Kokkos::fence();
1644: Kokkos::deep_copy(h_flops_k, d_flops_k);
1645: PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1646: Kokkos::parallel_for(
1647: Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
1648: const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni;
1649: const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
1650: /* Invert diagonal for simpler triangular solves */
1651: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
1652: int i = start + outer_index * Ni + lg_rank % Ni;
1653: if (i < end) {
1654: PetscScalar *pv = ba_d + bdiag_d[i];
1655: *pv = 1.0 / (*pv);
1656: }
1657: });
1658: });
1659: }
1660: PetscCall(PetscLogGpuTimeEnd());
1661: PetscCall(ISRestoreIndices(isicol, &ic_h));
1662: PetscCall(ISRestoreIndices(isrow, &r_h));
1664: PetscCall(ISIdentity(isrow, &row_identity));
1665: PetscCall(ISIdentity(isicol, &col_identity));
1666: if (b->inode.size) {
1667: B->ops->solve = MatSolve_SeqAIJ_Inode;
1668: } else if (row_identity && col_identity) {
1669: B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1670: } else {
1671: B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1672: }
1673: B->offloadmask = PETSC_OFFLOAD_GPU;
1674: PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1675: B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this
1676: B->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1677: B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1678: B->ops->matsolve = MatMatSolve_SeqAIJ;
1679: B->assembled = PETSC_TRUE;
1680: B->preallocated = PETSC_TRUE;
1682: PetscFunctionReturn(PETSC_SUCCESS);
1683: }
1685: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1686: {
1687: PetscFunctionBegin;
1688: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1689: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1690: PetscFunctionReturn(PETSC_SUCCESS);
1691: }
1693: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1694: {
1695: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1697: PetscFunctionBegin;
1698: if (!factors->sptrsv_symbolic_completed) {
1699: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1700: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1701: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1702: }
1703: PetscFunctionReturn(PETSC_SUCCESS);
1704: }
1706: /* Check if we need to update factors etc for transpose solve */
1707: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1708: {
1709: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1710: MatColIdxType n = A->rmap->n;
1712: PetscFunctionBegin;
1713: if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1714: /* Update L^T and do sptrsv symbolic */
1715: factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1716: Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */
1717: factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1718: factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));
1720: transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1721: factors->iLt_d, factors->jLt_d, factors->aLt_d);
1723: /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1724: We have to sort the indices, until KK provides finer control options.
1725: */
1726: sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1728: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1730: /* Update U^T and do sptrsv symbolic */
1731: factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1732: Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */
1733: factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1734: factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));
1736: transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1737: factors->iUt_d, factors->jUt_d, factors->aUt_d);
1739: /* Sort indices. See comments above */
1740: sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1742: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1743: factors->transpose_updated = PETSC_TRUE;
1744: }
1745: PetscFunctionReturn(PETSC_SUCCESS);
1746: }
1748: /* Solve Ax = b, with A = LU */
1749: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1750: {
1751: ConstPetscScalarKokkosView bv;
1752: PetscScalarKokkosView xv;
1753: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1755: PetscFunctionBegin;
1756: PetscCall(PetscLogGpuTimeBegin());
1757: PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1758: PetscCall(VecGetKokkosView(b, &bv));
1759: PetscCall(VecGetKokkosViewWrite(x, &xv));
1760: /* Solve L tmpv = b */
1761: PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1762: /* Solve Ux = tmpv */
1763: PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1764: PetscCall(VecRestoreKokkosView(b, &bv));
1765: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1766: PetscCall(PetscLogGpuTimeEnd());
1767: PetscFunctionReturn(PETSC_SUCCESS);
1768: }
1770: /* Solve A^T x = b, where A^T = U^T L^T */
1771: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1772: {
1773: ConstPetscScalarKokkosView bv;
1774: PetscScalarKokkosView xv;
1775: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1777: PetscFunctionBegin;
1778: PetscCall(PetscLogGpuTimeBegin());
1779: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1780: PetscCall(VecGetKokkosView(b, &bv));
1781: PetscCall(VecGetKokkosViewWrite(x, &xv));
1782: /* Solve U^T tmpv = b */
1783: PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector));
1785: /* Solve L^T x = tmpv */
1786: PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv));
1787: PetscCall(VecRestoreKokkosView(b, &bv));
1788: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1789: PetscCall(PetscLogGpuTimeEnd());
1790: PetscFunctionReturn(PETSC_SUCCESS);
1791: }
1793: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1794: {
1795: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
1796: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1797: PetscInt fill_lev = info->levels;
1799: PetscFunctionBegin;
1800: PetscCall(PetscLogGpuTimeBegin());
1801: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1803: auto a_d = aijkok->a_dual.view_device();
1804: auto i_d = aijkok->i_dual.view_device();
1805: auto j_d = aijkok->j_dual.view_device();
1807: KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1809: B->assembled = PETSC_TRUE;
1810: B->preallocated = PETSC_TRUE;
1811: B->ops->solve = MatSolve_SeqAIJKokkos;
1812: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos;
1813: B->ops->matsolve = NULL;
1814: B->ops->matsolvetranspose = NULL;
1815: B->offloadmask = PETSC_OFFLOAD_GPU;
1817: /* Once the factors' values have changed, we must update their transposes and sptrsv handles */
1818: factors->transpose_updated = PETSC_FALSE;
1819: factors->sptrsv_symbolic_completed = PETSC_FALSE;
1820: /* TODO: log flops, but how to know that? */
1821: PetscCall(PetscLogGpuTimeEnd());
1822: PetscFunctionReturn(PETSC_SUCCESS);
1823: }
1825: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1826: {
1827: Mat_SeqAIJKokkos *aijkok;
1828: Mat_SeqAIJ *b;
1829: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1830: PetscInt fill_lev = info->levels;
1831: PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1832: PetscInt n = A->rmap->n;
1834: PetscFunctionBegin;
1835: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1836: /* Rebuild factors */
1837: if (factors) {
1838: factors->Destroy();
1839: } /* Destroy the old if it exists */
1840: else {
1841: B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1842: }
1844: /* Create a spiluk handle and then do symbolic factorization */
1845: nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1846: factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1848: auto spiluk_handle = factors->kh.get_spiluk_handle();
1850: Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1851: Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1852: Kokkos::realloc(factors->iU_d, n + 1);
1853: Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1855: aijkok = (Mat_SeqAIJKokkos *)A->spptr;
1856: auto i_d = aijkok->i_dual.view_device();
1857: auto j_d = aijkok->j_dual.view_device();
1858: KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1859: /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1861: Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1862: Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1863: Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1864: Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1866: /* TODO: add options to select sptrsv algorithms */
1867: /* Create sptrsv handles for L, U and their transpose */
1868: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1869: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1870: #else
1871: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1872: #endif
1874: factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1875: factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1876: factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1877: factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1879: /* Fill fields of the factor matrix B */
1880: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1881: b = (Mat_SeqAIJ *)B->data;
1882: b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1883: B->info.fill_ratio_given = info->fill;
1884: B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1886: B->offloadmask = PETSC_OFFLOAD_GPU;
1887: B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1888: PetscFunctionReturn(PETSC_SUCCESS);
1889: }
1891: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1892: {
1893: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1894: const PetscInt nrows = A->rmap->n;
1896: PetscFunctionBegin;
1897: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1898: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1899: // move B data into Kokkos
1900: PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1901: PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1902: {
1903: Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1904: if (!baijkok->diag_d.extent(0)) {
1905: const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1906: baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1907: Kokkos::deep_copy(baijkok->diag_d, h_diag);
1908: }
1909: }
1910: PetscFunctionReturn(PETSC_SUCCESS);
1911: }
1913: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1914: {
1915: PetscFunctionBegin;
1916: *type = MATSOLVERKOKKOS;
1917: PetscFunctionReturn(PETSC_SUCCESS);
1918: }
1920: static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type)
1921: {
1922: PetscFunctionBegin;
1923: *type = MATSOLVERKOKKOSDEVICE;
1924: PetscFunctionReturn(PETSC_SUCCESS);
1925: }
1927: /*MC
1928: MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1929: of type `MATSEQAIJKOKKOS` (`MATAIJKOKKOS`) on a single GPU.
1931: Level: beginner
1933: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1934: M*/
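/* A minimal selection sketch (not part of this file), assuming a PC `pc` obtained from a KSP whose operator
   is a MATSEQAIJKOKKOS matrix: request an ILU factorization performed by this solver type.

     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERKOKKOS));

   The equivalent command-line request is -pc_type ilu together with the -pc_factor_mat_solver_type value
   corresponding to MATSOLVERKOKKOS.
*/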
1935: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1936: {
1937: PetscInt n = A->rmap->n;
1939: PetscFunctionBegin;
1940: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1941: PetscCall(MatSetSizes(*B, n, n, n, n));
1942: (*B)->factortype = ftype;
1943: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1944: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1946: if (ftype == MAT_FACTOR_LU) {
1947: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1948: (*B)->canuseordering = PETSC_TRUE;
1949: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1950: } else if (ftype == MAT_FACTOR_ILU) {
1951: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1952: (*B)->canuseordering = PETSC_FALSE;
1953: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1954: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1956: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1957: PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1958: PetscFunctionReturn(PETSC_SUCCESS);
1959: }
1961: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B)
1962: {
1963: PetscInt n = A->rmap->n;
1965: PetscFunctionBegin;
1966: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1967: PetscCall(MatSetSizes(*B, n, n, n, n));
1968: (*B)->factortype = ftype;
1969: (*B)->canuseordering = PETSC_TRUE;
1970: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1971: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1973: if (ftype == MAT_FACTOR_LU) {
1974: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1975: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1976: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types");
1978: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1979: PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device));
1980: PetscFunctionReturn(PETSC_SUCCESS);
1981: }
1983: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1984: {
1985: PetscFunctionBegin;
1986: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1987: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1988: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device));
1989: PetscFunctionReturn(PETSC_SUCCESS);
1990: }
1992: /* Utility to print out a KokkosCsrMatrix for debugging */
1993: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1994: {
1995: const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1996: const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1997: const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1998: const PetscInt *i = iv.data();
1999: const PetscInt *j = jv.data();
2000: const PetscScalar *a = av.data();
2001: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2003: PetscFunctionBegin;
2004: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2005: for (PetscInt k = 0; k < m; k++) {
2006: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2007: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2008: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2009: }
2010: PetscFunctionReturn(PETSC_SUCCESS);
2011: }
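/* A hedged debugging sketch (not part of this file), assuming `A` is an assembled MATSEQAIJKOKKOS matrix
   and the PETSC_INTERN symbols above are visible at the call site:

     KokkosCsrMatrix csr;
     PetscCall(MatSeqAIJKokkosGetKokkosCsrMatrix(A, &csr));
     PetscCall(PrintCsrMatrix(csr));
*/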