Actual source code: ex5k.kokkos.cxx

  1: static char help[] = "Test of Kokkos matrix assemble with 1D Laplacian. Kokkos version of ex5cu \n\n";

  3: #include <petscconf.h>
  4: #include <petscmat.h>

  6: /*
  7:     Include Kokkos files
  8: */
  9: #include <Kokkos_Core.hpp>
 10: #include <Kokkos_OffsetView.hpp>

 12: #include <petscaijdevice.h>

 14: int main(int argc, char **argv)
 15: {
 16:   Mat                        A;
 17:   PetscInt                   N = 11, nz = 3, Istart, Iend, num_threads = 128;
 18:   PetscSplitCSRDataStructure d_mat;
 19:   PetscLogEvent              event;
 20:   Vec                        x, y;
 21:   PetscMPIInt                rank;

 23:   PetscFunctionBeginUser;
 24:   PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
 25: #if defined(PETSC_HAVE_KOKKOS_KERNELS)
 26:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &N, NULL));
 27:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-nz_row", &nz, NULL)); // for debugging, will be wrong if nz<3
 28:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &N, NULL));
 29:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-num_threads", &num_threads, NULL));
 30:   if (nz > N + 1) {
 31:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "warning decreasing nz\n"));
 32:     nz = N + 1;
 33:   }
 34:   PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));

 36:   PetscCall(PetscLogEventRegister("GPU operator", MAT_CLASSID, &event));
 37:   PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
 38:   PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, N, N));
 39:   PetscCall(MatSetType(A, MATAIJKOKKOS));
 40:   PetscCall(MatSeqAIJSetPreallocation(A, nz, NULL));
 41:   PetscCall(MatMPIAIJSetPreallocation(A, nz, NULL, nz - 1, NULL));
 42:   PetscCall(MatSetFromOptions(A));
 43:   PetscCall(MatSetOption(A, MAT_IGNORE_OFF_PROC_ENTRIES, PETSC_TRUE));
 44:   PetscCall(MatCreateVecs(A, &x, &y));
 45:   PetscCall(MatGetOwnershipRange(A, &Istart, &Iend));

 47:   // assemble end on CPU. We are not assembling redundant here, and ignoring off proc entries, but we could
 48:   for (int i = Istart; i < Iend + 1; i++) {
 49:     PetscScalar values[] = {1, 1, 1, 1};
 50:     PetscInt    js[] = {i - 1, i}, nn = (i == N) ? 1 : 2; // negative indices are ignored but >= N are not, so clip end
 51:     PetscCall(MatSetValues(A, nn, js, nn, js, values, ADD_VALUES));
 52:   }
 53:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
 54:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));

 56:   // test Kokkos
 57:   PetscCall(VecSet(x, 1.0));
 58:   PetscCall(MatMult(A, x, y));
 59:   PetscCall(VecViewFromOptions(y, NULL, "-ex5_vec_view"));

 61:   // assemble on GPU
 62:   if (Iend < N) Iend++; // elements, ignore off processor entries so do redundant
 63:   PetscCall(PetscLogEventBegin(event, 0, 0, 0, 0));
 64:   PetscCall(MatKokkosGetDeviceMatWrite(A, &d_mat));
 65:   Kokkos::fence();
 66:   Kokkos::parallel_for(
 67:     Kokkos::RangePolicy<>(Istart, Iend + 1), KOKKOS_LAMBDA(int i) {
 68:       PetscScalar values[] = {1, 1, 1, 1};
 69:       PetscInt    js[] = {i - 1, i}, nn = (i == N) ? 1 : 2;
 70:       static_cast<void>(MatSetValuesDevice(d_mat, nn, js, nn, js, values, ADD_VALUES));
 71:     });
 72:   Kokkos::fence();
 73:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
 74:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));

 76:   PetscCall(VecSet(x, 1.0));
 77:   PetscCall(MatMult(A, x, y));
 78:   PetscCall(VecViewFromOptions(y, NULL, "-ex5_vec_view"));
 79:   PetscCall(PetscLogEventEnd(event, 0, 0, 0, 0));

 81:   PetscCall(MatDestroy(&A));
 82:   PetscCall(VecDestroy(&x));
 83:   PetscCall(VecDestroy(&y));
 84: #else
 85:   SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_COR, "Kokkos kernels required");
 86: #endif
 87:   PetscCall(PetscFinalize());
 88:   return 0;
 89: }

 91: /*TEST

 93:    build:
 94:      requires: kokkos_kernels !defined(PETSC_HAVE_CUDA_CLANG)

 96:    test:
 97:      suffix: 0
 98:      requires: kokkos_kernels double !complex !single
 99:      args: -n 11 -ex5_vec_view
100:      nsize:  2

102: TEST*/