Actual source code: ex1k.kokkos.cxx

static char help[] = "Benchmarking MatMult() with AIJ and its subclass matrix types\n";

/*
  Usage:
    mpirun -n <np> ./ex1k
      -f <file>        : input petsc matrix binary file; one can convert a file from MatrixMarket using mat/tests/ex72.c
      -mat_type <type> : aij or its subclass. Default is aij.
      -n <num>         : run MatMult() this many times and report average time. Default is 500.
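
    For example, with a Kokkos-enabled build (HV15R.aij is the same input file used in the Crusher examples below):
      mpirun -n 2 ./ex1k -f HV15R.aij -mat_type aijkokkos -n 100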

  Notes:
    It uses a CPU timer (PetscTime()) to measure the time.

  Examples:
    On OLCF Summit (with GPU-aware MPI)
      # 6 MPI ranks:
      # 6 resource sets (-n 6), 1 MPI rank per RS (-a 1), 7 CPU cores per RS (-c 7), 1 GPU per RS (-g 1), and 6 RSs per node (-r 6)
      jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 ./ex1k -f 1138_bus.aij -mat_type aijcusparse

      # 1 MPI rank
      jsrun --smpiargs "-gpu" -n 1 -a 1 -c 7 -g 1 -r 1 ./ex1k -f 1138_bus.aij -mat_type aijcusparse

    On OLCF Crusher:
      # 1 MPI rank
      # run with 1 node (-N1), 1 MPI rank (-n1), 2 hardware threads per rank (-c2)
      srun -N1 -n1 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex1k -f HV15R.aij -mat_type aijkokkos

      # 8 MPI ranks
      srun -N1 -n8 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex1k -f HV15R.aij -mat_type aijkokkos
*/
#include <petscmat.h>
#include <petscdevice.h>

#if defined(PETSC_HAVE_CUDA)
  #include <petscdevice_cuda.h>
  #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
#elif defined(PETSC_HAVE_HIP)
  #include <petscdevice_hip.h>
  #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
#elif defined(PETSC_HAVE_KOKKOS)
  #include <Kokkos_Core.hpp>
  #define SyncDevice() Kokkos::fence()
#else
  #define SyncDevice()
#endif
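
/* SyncDevice() blocks the host until all previously launched device work has finished,
   so the CPU timestamps below bracket the MatMult() kernels themselves rather than their
   asynchronous launches. In builds without CUDA, HIP, or Kokkos it expands to nothing. */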

int main(int argc, char **args)
{
  Mat            A, A2;
  Vec            x, y, x2, y2;
  PetscViewer    fd;
  char           matfile[PETSC_MAX_PATH_LEN];
  char           mattype[64];
  PetscBool      flg;
  PetscLogStage  stage;
  PetscInt       i, n = 500, nskip = 5, M, N;
  MatInfo        info;
  PetscLogDouble tstart = 0, tend = 0, avgTime;
  PetscRandom    rctx;
  PetscReal      norm;
  PetscMPIInt    size;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &args, (char *)0, help));
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));

  /* Read options -n */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));

  /* Load the matrix from a binary file */
  PetscCall(PetscOptionsGetString(NULL, NULL, "-f", matfile, PETSC_MAX_PATH_LEN, &flg));
  PetscCheck(flg, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must indicate a petsc matrix binary file with the -f option");
  PetscCall(PetscOptionsGetString(NULL, NULL, "-mat_type", mattype, sizeof(mattype), &flg));
  if (!flg) PetscCall(PetscStrncpy(mattype, MATAIJ, sizeof(mattype)));

  /* Read the matrix file to A2 */
  PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, matfile, FILE_MODE_READ, &fd));
  PetscCall(MatCreate(PETSC_COMM_WORLD, &A2));
  PetscCall(MatSetType(A2, MATAIJ));
  PetscCall(MatLoad(A2, fd));
  PetscCall(MatCreateVecs(A2, &x2, &y2));
  PetscCall(PetscViewerDestroy(&fd));

  PetscCall(MatGetSize(A2, &M, &N));
  PetscCall(MatGetInfo(A2, MAT_GLOBAL_SUM, &info));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix %s: %" PetscInt_FMT " x %" PetscInt_FMT "; %lld nonzeros; %.1f per row\n", matfile, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));

  /* Copy A2 to A and convert A to the specified type */
  PetscCall(MatDuplicate(A2, MAT_COPY_VALUES, &A));
  PetscCall(MatConvert(A, mattype, MAT_INPLACE_MATRIX, &A));
  PetscCall(MatCreateVecs(A, &x, &y));

  /* Initialize x and x2 with the same random values */
  PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
  PetscCall(VecSetRandom(x2, rctx));
  PetscCall(PetscRandomDestroy(&rctx));
  PetscCall(VecCopy(x2, x));

  /* Compute the reference y2 = A2 x2 */
  PetscCall(MatMult(A2, x2, y2));

  /* Measure y = Ax */
  PetscCall(PetscLogStageRegister("MatMult", &stage));
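  /* Run n + nskip iterations; the first nskip are warm-up (they may include one-time setup
     such as host-to-device copies) and are excluded from the timed region, which starts at
     i == nskip after a device sync and a barrier */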
  for (i = 0; i < n + nskip; i++) {
    if (i == nskip) {
      SyncDevice();
      PetscCall(PetscLogStagePush(stage));
      PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
      PetscCall(PetscTime(&tstart));
    }
    PetscCall(MatMult(A, x, y));
  }
  SyncDevice();
  PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
  PetscCall(PetscTime(&tend));
  avgTime = (tend - tstart) * 1e6 / n; /* microseconds */
  PetscCall(PetscLogStagePop());

  /* Validate y against y2 */
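  /* y2 was computed above with the plain MATAIJ matrix A2, so this checks that the
     converted matrix type produces the same MatMult() result up to a small tolerance */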
  PetscCall(VecAYPX(y2, -1, y));
  PetscCall(VecNorm(y2, NORM_2, &norm));
  PetscCheck(norm < 1e-6, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "MatMult() error with norm %g", (double)norm);
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "MatMult() average time (us) with %d MPI ranks = %8.2f\n", size, avgTime));

  PetscCall(MatDestroy(&A));
  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&y));
  PetscCall(MatDestroy(&A2));
  PetscCall(VecDestroy(&x2));
  PetscCall(VecDestroy(&y2));
  PetscCall(PetscFinalize());
  return 0;
}
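
/* The PETSc test harness block below exercises the benchmark with different matrix types.
   All output is filtered away (the grep pattern "DOES_NOT_EXIST" never matches, so the result
   is compared against an empty file), presumably because the reported timings vary between
   runs and machines; the tests therefore only verify that the benchmark runs to completion. */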

/*TEST

  testset:
    args: -n 2 -f ${DATAFILESPATH}/matrices/small
    nsize: 1
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    requires: datafilespath !complex double !single kokkos_kernels

    test:
      suffix: 1
      requires: cuda
      args: -mat_type aijcusparse

    test:
      suffix: 2
      args: -mat_type aijkokkos

    test:
      suffix: 3
      requires: hip
      args: -mat_type aijhipsparse

TEST*/