Actual source code: vpbjacobi_kok.kokkos.cxx
1: #include <petscvec_kokkos.hpp>
2: #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp>
3: #include <petscdevice.h>
4: #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h>
6: /* A class that manages helper arrays assisting parallel PCApply() with Kokkos */
7: struct PC_VPBJacobi_Kokkos {
8: /* Cache the old sizes to check if we need realloc */
9: PetscInt n; /* number of rows of the local matrix */
10: PetscInt nblocks; /* number of point blocks */
11: PetscInt nsize; /* sum of sizes of the point blocks */
13: /* Helper arrays that are pre-computed on host and then copied to device.
14: bs: [nblocks+1], "csr" version of bsizes[]
15: bs2: [nblocks+1], "csr" version of squares of bsizes[]
16: matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block
17: */
18: PetscIntKokkosDualView bs_dual, bs2_dual, matIdx_dual;
19: PetscScalarKokkosDualView diag_dual;
21: PC_VPBJacobi_Kokkos(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes, MatScalar *diag_ptr_h) :
22: n(n), nblocks(nblocks), nsize(nsize), bs_dual("bs_dual", nblocks + 1), bs2_dual("bs2_dual", nblocks + 1), matIdx_dual("matIdx_dual", n)
23: {
24: PetscScalarKokkosViewHost diag_h(diag_ptr_h, nsize);
26: auto diag_d = Kokkos::create_mirror_view(DefaultMemorySpace(), diag_h);
27: diag_dual = PetscScalarKokkosDualView(diag_d, diag_h);
28: PetscCallVoid(UpdateOffsetsOnDevice(bsizes, diag_ptr_h));
29: }
31: PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes, MatScalar *diag_ptr_h)
32: {
33: PetscFunctionBegin;
34: PetscCheck(diag_dual.view_host().data() == diag_ptr_h, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Host pointer has changed since last call");
35: PetscCall(ComputeOffsetsOnHost(bsizes));
37: PetscCallCXX(bs_dual.modify_host());
38: PetscCallCXX(bs2_dual.modify_host());
39: PetscCallCXX(matIdx_dual.modify_host());
40: PetscCallCXX(diag_dual.modify_host());
42: PetscCallCXX(bs_dual.sync_device());
43: PetscCallCXX(bs2_dual.sync_device());
44: PetscCallCXX(matIdx_dual.sync_device());
45: PetscCallCXX(diag_dual.sync_device());
46: PetscCall(PetscLogCpuToGpu(sizeof(PetscInt) * (2 * nblocks + 2 + n) + sizeof(MatScalar) * nsize));
47: PetscFunctionReturn(PETSC_SUCCESS);
48: }
50: private:
51: PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes)
52: {
53: PetscInt *bs_h = bs_dual.view_host().data();
54: PetscInt *bs2_h = bs2_dual.view_host().data();
55: PetscInt *matIdx_h = matIdx_dual.view_host().data();
57: PetscFunctionBegin;
58: bs_h[0] = bs2_h[0] = 0;
59: for (PetscInt i = 0; i < nblocks; i++) {
60: bs_h[i + 1] = bs_h[i] + bsizes[i];
61: bs2_h[i + 1] = bs2_h[i] + bsizes[i] * bsizes[i];
62: for (PetscInt j = 0; j < bsizes[i]; j++) matIdx_h[bs_h[i] + j] = i;
63: }
64: PetscFunctionReturn(PETSC_SUCCESS);
65: }
66: };
68: template <PetscBool transpose>
69: static PetscErrorCode PCApplyOrTranspose_VPBJacobi_Kokkos(PC pc, Vec x, Vec y)
70: {
71: PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;
72: PC_VPBJacobi_Kokkos *pckok = static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr);
73: ConstPetscScalarKokkosView xv;
74: PetscScalarKokkosView yv;
75: PetscScalarKokkosView diag = pckok->diag_dual.view_device();
76: PetscIntKokkosView bs = pckok->bs_dual.view_device();
77: PetscIntKokkosView bs2 = pckok->bs2_dual.view_device();
78: PetscIntKokkosView matIdx = pckok->matIdx_dual.view_device();
79: const char *label = transpose ? "PCApplyTranspose_VPBJacobi_Kokkos" : "PCApply_VPBJacobi_Kokkos";
81: PetscFunctionBegin;
82: PetscCall(PetscLogGpuTimeBegin());
83: VecErrorIfNotKokkos(x);
84: VecErrorIfNotKokkos(y);
85: PetscCall(VecGetKokkosView(x, &xv));
86: PetscCall(VecGetKokkosViewWrite(y, &yv));
87: PetscCallCXX(Kokkos::parallel_for(
88: label, pckok->n, KOKKOS_LAMBDA(PetscInt row) {
89: const PetscScalar *Ap, *xp;
90: PetscScalar *yp;
91: PetscInt i, j, k, m;
93: k = matIdx(row); /* k-th block/matrix */
94: m = bs(k + 1) - bs(k); /* block size of the k-th block */
95: i = row - bs(k); /* i-th row of the block */
96: Ap = &diag(bs2(k) + i * (transpose ? m : 1)); /* Ap points to the first entry of i-th row/column */
97: xp = &xv(bs(k));
98: yp = &yv(bs(k));
100: yp[i] = 0.0;
101: for (j = 0; j < m; j++) {
102: yp[i] += Ap[0] * xp[j];
103: Ap += transpose ? 1 : m;
104: }
105: }));
106: PetscCall(VecRestoreKokkosView(x, &xv));
107: PetscCall(VecRestoreKokkosViewWrite(y, &yv));
108: PetscCall(PetscLogGpuFlops(pckok->nsize * 2)); /* FMA on entries in all blocks */
109: PetscCall(PetscLogGpuTimeEnd());
110: PetscFunctionReturn(PETSC_SUCCESS);
111: }
113: static PetscErrorCode PCDestroy_VPBJacobi_Kokkos(PC pc)
114: {
115: PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;
117: PetscFunctionBegin;
118: PetscCallCXX(delete static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr));
119: PetscCall(PCDestroy_VPBJacobi(pc));
120: PetscFunctionReturn(PETSC_SUCCESS);
121: }
123: PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_Kokkos(PC pc)
124: {
125: PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;
126: PC_VPBJacobi_Kokkos *pckok = static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr);
127: PetscInt i, n, nblocks, nsize = 0;
128: const PetscInt *bsizes;
130: PetscFunctionBegin;
131: PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */
132: PetscCall(MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes));
133: for (i = 0; i < nblocks; i++) nsize += bsizes[i] * bsizes[i];
134: PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));
136: /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */
137: if (pckok && (pckok->n != n || pckok->nblocks != nblocks || pckok->nsize != nsize)) {
138: PetscCallCXX(delete pckok);
139: pckok = nullptr;
140: }
142: if (!pckok) { /* allocate the struct along with the helper arrays from the scratch */
143: PetscCallCXX(jac->spptr = new PC_VPBJacobi_Kokkos(n, nblocks, nsize, bsizes, jac->diag));
144: } else { /* update the value only */
145: PetscCall(pckok->UpdateOffsetsOnDevice(bsizes, jac->diag));
146: }
148: pc->ops->apply = PCApplyOrTranspose_VPBJacobi_Kokkos<PETSC_FALSE>;
149: pc->ops->applytranspose = PCApplyOrTranspose_VPBJacobi_Kokkos<PETSC_TRUE>;
150: pc->ops->destroy = PCDestroy_VPBJacobi_Kokkos;
151: PetscFunctionReturn(PETSC_SUCCESS);
152: }