Actual source code: vpbjacobi_kok.kokkos.cxx

  1: #include <petscvec_kokkos.hpp>
  2: #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp>
  3: #include <petscdevice.h>
  4: #include <../src/ksp/pc/impls/vpbjacobi/vpbjacobi.h>

  6: /* A class that manages helper arrays assisting parallel PCApply() with Kokkos */
  7: struct PC_VPBJacobi_Kokkos {
  8:   /* Cache the old sizes to check if we need realloc */
  9:   PetscInt n;       /* number of rows of the local matrix */
 10:   PetscInt nblocks; /* number of point blocks */
 11:   PetscInt nsize;   /* sum of sizes of the point blocks */

 13:   /* Helper arrays that are pre-computed on host and then copied to device.
 14:     bs:     [nblocks+1], "csr" version of bsizes[]
 15:     bs2:    [nblocks+1], "csr" version of squares of bsizes[]
 16:     matIdx: [n], row i of the local matrix belongs to the matIdx_d[i] block
 17:   */
 18:   PetscIntKokkosDualView    bs_dual, bs2_dual, matIdx_dual;
 19:   PetscScalarKokkosDualView diag_dual;

 21:   PC_VPBJacobi_Kokkos(PetscInt n, PetscInt nblocks, PetscInt nsize, const PetscInt *bsizes, MatScalar *diag_ptr_h) :
 22:     n(n), nblocks(nblocks), nsize(nsize), bs_dual("bs_dual", nblocks + 1), bs2_dual("bs2_dual", nblocks + 1), matIdx_dual("matIdx_dual", n)
 23:   {
 24:     PetscScalarKokkosViewHost diag_h(diag_ptr_h, nsize);

 26:     auto diag_d = Kokkos::create_mirror_view(DefaultMemorySpace(), diag_h);
 27:     diag_dual   = PetscScalarKokkosDualView(diag_d, diag_h);
 28:     PetscCallVoid(UpdateOffsetsOnDevice(bsizes, diag_ptr_h));
 29:   }

 31:   PetscErrorCode UpdateOffsetsOnDevice(const PetscInt *bsizes, MatScalar *diag_ptr_h)
 32:   {
 33:     PetscFunctionBegin;
 34:     PetscCheck(diag_dual.view_host().data() == diag_ptr_h, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Host pointer has changed since last call");
 35:     PetscCall(ComputeOffsetsOnHost(bsizes));

 37:     PetscCallCXX(bs_dual.modify_host());
 38:     PetscCallCXX(bs2_dual.modify_host());
 39:     PetscCallCXX(matIdx_dual.modify_host());
 40:     PetscCallCXX(diag_dual.modify_host());

 42:     PetscCallCXX(bs_dual.sync_device());
 43:     PetscCallCXX(bs2_dual.sync_device());
 44:     PetscCallCXX(matIdx_dual.sync_device());
 45:     PetscCallCXX(diag_dual.sync_device());
 46:     PetscCall(PetscLogCpuToGpu(sizeof(PetscInt) * (2 * nblocks + 2 + n) + sizeof(MatScalar) * nsize));
 47:     PetscFunctionReturn(PETSC_SUCCESS);
 48:   }

 50: private:
 51:   PetscErrorCode ComputeOffsetsOnHost(const PetscInt *bsizes)
 52:   {
 53:     PetscInt *bs_h     = bs_dual.view_host().data();
 54:     PetscInt *bs2_h    = bs2_dual.view_host().data();
 55:     PetscInt *matIdx_h = matIdx_dual.view_host().data();

 57:     PetscFunctionBegin;
 58:     bs_h[0] = bs2_h[0] = 0;
 59:     for (PetscInt i = 0; i < nblocks; i++) {
 60:       bs_h[i + 1]  = bs_h[i] + bsizes[i];
 61:       bs2_h[i + 1] = bs2_h[i] + bsizes[i] * bsizes[i];
 62:       for (PetscInt j = 0; j < bsizes[i]; j++) matIdx_h[bs_h[i] + j] = i;
 63:     }
 64:     PetscFunctionReturn(PETSC_SUCCESS);
 65:   }
 66: };

 68: template <PetscBool transpose>
 69: static PetscErrorCode PCApplyOrTranspose_VPBJacobi_Kokkos(PC pc, Vec x, Vec y)
 70: {
 71:   PC_VPBJacobi              *jac   = (PC_VPBJacobi *)pc->data;
 72:   PC_VPBJacobi_Kokkos       *pckok = static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr);
 73:   ConstPetscScalarKokkosView xv;
 74:   PetscScalarKokkosView      yv;
 75:   PetscScalarKokkosView      diag   = pckok->diag_dual.view_device();
 76:   PetscIntKokkosView         bs     = pckok->bs_dual.view_device();
 77:   PetscIntKokkosView         bs2    = pckok->bs2_dual.view_device();
 78:   PetscIntKokkosView         matIdx = pckok->matIdx_dual.view_device();
 79:   const char                *label  = transpose ? "PCApplyTranspose_VPBJacobi_Kokkos" : "PCApply_VPBJacobi_Kokkos";

 81:   PetscFunctionBegin;
 82:   PetscCall(PetscLogGpuTimeBegin());
 83:   VecErrorIfNotKokkos(x);
 84:   VecErrorIfNotKokkos(y);
 85:   PetscCall(VecGetKokkosView(x, &xv));
 86:   PetscCall(VecGetKokkosViewWrite(y, &yv));
 87:   PetscCallCXX(Kokkos::parallel_for(
 88:     label, pckok->n, KOKKOS_LAMBDA(PetscInt row) {
 89:       const PetscScalar *Ap, *xp;
 90:       PetscScalar       *yp;
 91:       PetscInt           i, j, k, m;

 93:       k  = matIdx(row);                             /* k-th block/matrix */
 94:       m  = bs(k + 1) - bs(k);                       /* block size of the k-th block */
 95:       i  = row - bs(k);                             /* i-th row of the block */
 96:       Ap = &diag(bs2(k) + i * (transpose ? m : 1)); /* Ap points to the first entry of i-th row/column */
 97:       xp = &xv(bs(k));
 98:       yp = &yv(bs(k));

100:       yp[i] = 0.0;
101:       for (j = 0; j < m; j++) {
102:         yp[i] += Ap[0] * xp[j];
103:         Ap += transpose ? 1 : m;
104:       }
105:     }));
106:   PetscCall(VecRestoreKokkosView(x, &xv));
107:   PetscCall(VecRestoreKokkosViewWrite(y, &yv));
108:   PetscCall(PetscLogGpuFlops(pckok->nsize * 2)); /* FMA on entries in all blocks */
109:   PetscCall(PetscLogGpuTimeEnd());
110:   PetscFunctionReturn(PETSC_SUCCESS);
111: }

113: static PetscErrorCode PCDestroy_VPBJacobi_Kokkos(PC pc)
114: {
115:   PC_VPBJacobi *jac = (PC_VPBJacobi *)pc->data;

117:   PetscFunctionBegin;
118:   PetscCallCXX(delete static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr));
119:   PetscCall(PCDestroy_VPBJacobi(pc));
120:   PetscFunctionReturn(PETSC_SUCCESS);
121: }

123: PETSC_INTERN PetscErrorCode PCSetUp_VPBJacobi_Kokkos(PC pc)
124: {
125:   PC_VPBJacobi        *jac   = (PC_VPBJacobi *)pc->data;
126:   PC_VPBJacobi_Kokkos *pckok = static_cast<PC_VPBJacobi_Kokkos *>(jac->spptr);
127:   PetscInt             i, n, nblocks, nsize = 0;
128:   const PetscInt      *bsizes;

130:   PetscFunctionBegin;
131:   PetscCall(PCSetUp_VPBJacobi_Host(pc)); /* Compute the inverse on host now. Might worth doing it on device directly */
132:   PetscCall(MatGetVariableBlockSizes(pc->pmat, &nblocks, &bsizes));
133:   for (i = 0; i < nblocks; i++) nsize += bsizes[i] * bsizes[i];
134:   PetscCall(MatGetLocalSize(pc->pmat, &n, NULL));

136:   /* If one calls MatSetVariableBlockSizes() multiple times and sizes have been changed (is it allowed?), we delete the old and rebuild anyway */
137:   if (pckok && (pckok->n != n || pckok->nblocks != nblocks || pckok->nsize != nsize)) {
138:     PetscCallCXX(delete pckok);
139:     pckok = nullptr;
140:   }

142:   if (!pckok) { /* allocate the struct along with the helper arrays from the scratch */
143:     PetscCallCXX(jac->spptr = new PC_VPBJacobi_Kokkos(n, nblocks, nsize, bsizes, jac->diag));
144:   } else { /* update the value only */
145:     PetscCall(pckok->UpdateOffsetsOnDevice(bsizes, jac->diag));
146:   }

148:   pc->ops->apply          = PCApplyOrTranspose_VPBJacobi_Kokkos<PETSC_FALSE>;
149:   pc->ops->applytranspose = PCApplyOrTranspose_VPBJacobi_Kokkos<PETSC_TRUE>;
150:   pc->ops->destroy        = PCDestroy_VPBJacobi_Kokkos;
151:   PetscFunctionReturn(PETSC_SUCCESS);
152: }