Actual source code: ex1k.kokkos.cxx
static char help[] = "Benchmarking memory bandwidth with VecAXPY() on parallel vectors\n";
/*
  Usage:
   mpirun -n <np> ./ex1k -vec_type <device vector type>
     -n <n>  # number of data points of vector sizes, from 128, 256, 512 and up; the maximum and default are 23
     -m <m>  # run each VecAXPY() m times to get the average time; the default is 1000

  Example:
    Running on Crusher at OLCF:
      # run with 1 MPI rank (-n1), 32 CPUs (-c32), and map the process to CPU 0 and GPU 0
      $ srun -n1 -c32 --cpu-bind=map_cpu:0 --gpus-per-node=8 --gpu-bind=map_gpu:0 ./ex1k -vec_type kokkos
*/
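/*
  A generic launch for comparison (illustrative only; assumes a CUDA-enabled PETSc build):
    $ mpirun -n 2 ./ex1k -vec_type cuda -m 100
*/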
#include <petscvec.h>
#include <petscdevice.h>
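/* SyncDevice() blocks the host until all previously launched device work has finished (a no-op in
   host-only builds), so the host-side timers below measure completed VecAXPY() kernels rather than
   just their launches */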
#if defined(PETSC_HAVE_CUDA)
  #include <petscdevice_cuda.h>
  #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
#elif defined(PETSC_HAVE_HIP)
  #include <petscdevice_hip.h>
  #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
#elif defined(PETSC_HAVE_KOKKOS)
  #include <Kokkos_Core.hpp>
  #define SyncDevice() Kokkos::fence()
#else
  #define SyncDevice() 0
#endif
int main(int argc, char **argv)
{
  PetscInt       i, k, N, n, m = 1000, nsamples;
  PetscLogDouble tstart, tend, time;
  Vec            x, y;
  PetscScalar    alpha = 3.14;
  PetscLogDouble bandwidth;
  PetscMPIInt    size;
  PetscInt       Ns[] = {/* Use explicit sizes so that one can add sizes very close to 2^31 */
                         128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912};

  n = nsamples = sizeof(Ns) / sizeof(Ns[0]);
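  /* Entries of Ns[] are local (per-rank) lengths: VecSetSizes() below passes N as the local size,
     so with 'size' ranks each timed VecAXPY() touches size*N entries per vector */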
  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL)); /* Up to vectors of local size 2^{n+6} */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-m", &m, NULL)); /* Run each VecAXPY() m times */

  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));

  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Vector size (N) Time (us) Bandwidth (GB/s)\n"));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "----------------------------------------------\n"));

  nsamples = PetscMin(nsamples, n);
  for (k = 0; k < nsamples; k++) {
    N = Ns[k];
    PetscCall(VecCreate(PETSC_COMM_WORLD, &x));
    PetscCall(VecSetFromOptions(x));
    PetscCall(VecSetSizes(x, N, PETSC_DECIDE));
    PetscCall(VecSetUp(x));
    PetscCall(VecDuplicate(x, &y));
    PetscCall(VecSet(x, 2.5));
    PetscCall(VecSet(y, 4.0));

    /* Warm-up */
    for (i = 0; i < 4; i++) PetscCall(VecAXPY(x, alpha, y));
    SyncDevice();
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));

    PetscCall(PetscTime(&tstart));
    for (i = 0; i < m; i++) PetscCall(VecAXPY(x, alpha, y));
    SyncDevice();
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
    PetscCall(PetscTime(&tend));
    time = (tend - tstart) * 1e6 / m;
    bandwidth = 3 * N * size * sizeof(PetscScalar) / time * 1e-3; /* read x, y and write x */
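    /* Units: 'time' above is microseconds per call, so bytes/time is MB/s and the 1e-3 factor converts to GB/s */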
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%12" PetscInt_FMT ", %12.4f, %8.2f\n", N, time, bandwidth));
    PetscCall(VecDestroy(&x));
    PetscCall(VecDestroy(&y));
  }
  PetscCall(PetscFinalize());
  return 0;
}
/*TEST

  build:
    requires: kokkos_kernels

  test:
    args: -n 2 -m 2 -vec_type kokkos
    output_file: output/empty.out
    filter: grep "DOES_NOT_EXIST"

TEST*/