Actual source code: ex2k.kokkos.cxx

  1: static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n";

  3: /*
  4:   On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos:
  5:            Time (sec.), on Mar. 1, 2022
  6:   ------------------------------------------
  7:   n     PETSc          C          Kokkos
  8:   ------------------------------------------
  9:   32    4.6464E-05  4.7451E-05  1.6880E-04
 10:   64    2.5654E-04  2.5164E-04  5.6780E-04
 11:   128   1.9383E-03  1.8878E-03  4.7938E-03
 12:   256   1.4450E-02  1.3619E-02  3.7778E-02
 13:   512   1.1580E-01  1.1551E-01  2.8428E-01
 14:   1024  1.4179E+00  1.3772E+00  2.2873E+00

 16:   Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc
 17: */

 19: #include <petscdmda_kokkos.hpp>
 20: #include <petscdm.h>
 21: #include <petscdmda.h>

 23: using Kokkos::Iterate;
 24: using Kokkos::MDRangePolicy;
 25: using Kokkos::Rank;
 26: using PetscScalarKokkosOffsetView3D      = Kokkos::Experimental::OffsetView<PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;
 27: using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;

 29: /* PETSc multi-dimensional array access */
 30: static PetscErrorCode Update1(DM da, const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
 31: {
 32:   PetscInt       it, i, j, k;
 33:   PetscLogDouble tstart = 0.0, tend;
 34:   PetscInt       xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;

 36:   PetscFunctionBegin;
 37:   PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
 38:   PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
 39:   for (it = 0; it < nwarm + nloop; it++) {
 40:     if (it == nwarm) PetscCall(PetscTime(&tstart));
 41:     for (k = zs; k < zs + zm; k++) {
 42:       for (j = ys; j < ys + ym; j++) {
 43:         for (i = xs; i < xs + xm; i++) y1[k][j][i] = 6 * x1[k][j][i] - x1[k - 1][j][i] - x1[k][j - 1][i] - x1[k][j][i - 1] - x1[k + 1][j][i] - x1[k][j + 1][i] - x1[k][j][i + 1];
 44:       }
 45:     }
 46:   }
 47:   PetscCall(PetscTime(&tend));
 48:   *avgTime = (tend - tstart) / nloop;
 49:   PetscFunctionReturn(PETSC_SUCCESS);
 50: }

 52: /* C multi-dimensional array access */
 53: static PetscErrorCode Update2(DM da, const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
 54: {
 55:   PetscInt       it, i, j, k;
 56:   PetscLogDouble tstart = 0.0, tend;
 57:   PetscInt       xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;

 59:   PetscFunctionBegin;
 60:   PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
 61:   PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
 62: #define X2(k, j, i) x2[(k - gzs) * gym * gxm + (j - gys) * gxm + (i - gxs)]
 63: #define Y2(k, j, i) y2[(k - zs) * ym * xm + (j - ys) * xm + (i - xs)]
 64:   for (it = 0; it < nwarm + nloop; it++) {
 65:     if (it == nwarm) PetscCall(PetscTime(&tstart));
 66:     for (k = zs; k < zs + zm; k++) {
 67:       for (j = ys; j < ys + ym; j++) {
 68:         for (i = xs; i < xs + xm; i++) Y2(k, j, i) = 6 * X2(k, j, i) - X2(k - 1, j, i) - X2(k, j - 1, i) - X2(k, j, i - 1) - X2(k + 1, j, i) - X2(k, j + 1, i) - X2(k, j, i + 1);
 69:       }
 70:     }
 71:   }
 72:   PetscCall(PetscTime(&tend));
 73:   *avgTime = (tend - tstart) / nloop;
 74: #undef X2
 75: #undef Y2
 76:   PetscFunctionReturn(PETSC_SUCCESS);
 77: }

 79: int main(int argc, char **argv)
 80: {
 81:   DM                                 da;
 82:   PetscInt                           xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
 83:   PetscInt                           dof = 1, sw = 1;
 84:   DMBoundaryType                     bx = DM_BOUNDARY_PERIODIC, by = DM_BOUNDARY_PERIODIC, bz = DM_BOUNDARY_PERIODIC;
 85:   DMDAStencilType                    st = DMDA_STENCIL_STAR;
 86:   Vec                                x, y; /* local/global vectors of the da */
 87:   PetscRandom                        rctx;
 88:   const PetscScalar               ***x1;
 89:   PetscScalar                     ***y1; /* arrays of g, ll */
 90:   const PetscScalar                 *x2;
 91:   PetscScalar                       *y2;
 92:   ConstPetscScalarKokkosOffsetView3D x3;
 93:   PetscScalarKokkosOffsetView3D      y3;
 94:   PetscLogDouble                     tstart = 0.0, tend = 0.0, avgTime = 0.0;
 95:   PetscInt                           nwarm = 2, nloop = 10;
 96:   PetscInt                           min = 32, max = 32 * 8; /* min and max sizes of the grids to sample */

 98:   PetscFunctionBeginUser;
 99:   PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
100:   PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
101:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-min", &min, NULL));
102:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-max", &max, NULL));

104:   for (PetscInt len = min; len <= max; len = len * 2) {
105:     PetscCall(DMDACreate3d(PETSC_COMM_WORLD, bx, by, bz, st, len, len, len, PETSC_DECIDE, PETSC_DECIDE, PETSC_DECIDE, dof, sw, 0, 0, 0, &da));
106:     PetscCall(DMSetFromOptions(da));
107:     PetscCall(DMSetUp(da));

109:     PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
110:     PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
111:     PetscCall(DMCreateLocalVector(da, &x)); /* Create local x and global y */
112:     PetscCall(DMCreateGlobalVector(da, &y));

114:     /* Access with petsc multi-dimensional arrays */
115:     PetscCall(VecSetRandom(x, rctx));
116:     PetscCall(VecSet(y, 0.0));
117:     PetscCall(DMDAVecGetArrayRead(da, x, &x1));
118:     PetscCall(DMDAVecGetArray(da, y, &y1));
119:     PetscCall(Update1(da, x1, y1, nwarm, nloop, &avgTime));
120:     PetscCall(DMDAVecRestoreArrayRead(da, x, &x1));
121:     PetscCall(DMDAVecRestoreArray(da, y, &y1));
122:     PetscCall(PetscTime(&tend));
123:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- PETSc average time  = %e\n", len, avgTime));

125:     /* Access with C multi-dimensional arrays */
126:     PetscCall(VecSetRandom(x, rctx));
127:     PetscCall(VecSet(y, 0.0));
128:     PetscCall(VecGetArrayRead(x, &x2));
129:     PetscCall(VecGetArray(y, &y2));
130:     PetscCall(Update2(da, x2, y2, nwarm, nloop, &avgTime));
131:     PetscCall(VecRestoreArrayRead(x, &x2));
132:     PetscCall(VecRestoreArray(y, &y2));
133:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- C average time      = %e\n", len, avgTime));

135:     /* Access with Kokkos multi-dimensional OffsetViews */
136:     PetscCall(VecSet(y, 0.0));
137:     PetscCall(VecSetRandom(x, rctx));
138:     PetscCall(DMDAVecGetKokkosOffsetView(da, x, &x3));
139:     PetscCall(DMDAVecGetKokkosOffsetView(da, y, &y3));

141:     for (PetscInt it = 0; it < nwarm + nloop; it++) {
142:       if (it == nwarm) PetscCall(PetscTime(&tstart));
143:       Kokkos::parallel_for(
144:         "stencil", MDRangePolicy<Kokkos::DefaultHostExecutionSpace, Rank<3, Iterate::Right, Iterate::Right>>({zs, ys, xs}, {zs + zm, ys + ym, xs + xm}),
145:         KOKKOS_LAMBDA(PetscInt k, PetscInt j, PetscInt i) { y3(k, j, i) = 6 * x3(k, j, i) - x3(k - 1, j, i) - x3(k, j - 1, i) - x3(k, j, i - 1) - x3(k + 1, j, i) - x3(k, j + 1, i) - x3(k, j, i + 1); });
146:     }
147:     PetscCall(PetscTime(&tend));
148:     PetscCall(DMDAVecRestoreKokkosOffsetView(da, x, &x3));
149:     PetscCall(DMDAVecRestoreKokkosOffsetView(da, y, &y3));
150:     avgTime = (tend - tstart) / nloop;
151:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- Kokkos average time = %e\n", len, avgTime));

153:     PetscCall(VecDestroy(&x));
154:     PetscCall(VecDestroy(&y));
155:     PetscCall(DMDestroy(&da));
156:   }
157:   PetscCall(PetscRandomDestroy(&rctx));
158:   PetscCall(PetscFinalize());
159:   return 0;
160: }

162: /*TEST
163:   build:
164:     requires: kokkos_kernels

166:   test:
167:     suffix: 1
168:     requires: kokkos_kernels
169:     args: -min 32 -max 32 -dm_vec_type kokkos
170:     filter: grep -v "time"

172: TEST*/