Actual source code: ex2k.kokkos.cxx
static char help[] = "Benchmarking MatProduct with AIJ and its subclass matrix types\n";

/*
  Usage:
    mpirun -n <np> ./ex2k
      -A <filename>    : input petsc binary file for matrix A; one can convert a file from MatrixMarket using mat/tests/ex72.c
      -P <filename>    : input petsc binary file for matrix P; optional, if not given, P = A
      -mat_type <str>  : aij or one of its subclasses (e.g., aijcusparse, aijhipsparse, aijkokkos). Default is aij.
      -prod_type <str> : AP, AtP, APt, PtAP or PAPt. Default is AP.
      -n <num>         : run MatProductNumeric() this many times and report the average time. Default is 100.
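
    An example invocation combining the options above (A.aij is a placeholder file name, not shipped with PETSc):
      mpirun -n 2 ./ex2k -A A.aij -prod_type PtAP -mat_type aijkokkos -n 50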
  Notes:
    It uses the CPU (host) timer PetscTime() to measure the time; the device is synchronized before each measurement.

  Examples:
    On OLCF Summit (with GPU-aware MPI):
      # 6 MPI ranks:
      # 6 resource sets (-n 6), 1 MPI rank per RS (-a 1), 7 CPU cores per RS (-c 7), 1 GPU per RS (-g 1), and 6 RSs per node (-r 6)
      jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 ./ex2k -A cage12.aij -mat_type aijcusparse

      # 1 MPI rank
      jsrun --smpiargs "-gpu" -n 1 -a 1 -c 7 -g 1 -r 1 ./ex2k -A cage12.aij -mat_type aijcusparse

    On OLCF Crusher:
      # 1 MPI rank
      # run with 1 node (-N1), 1 MPI rank (-n1), 2 hardware threads per rank (-c2)
      srun -N1 -n1 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex2k -A HV15R.aij -mat_type aijkokkos

      # 8 MPI ranks
      srun -N1 -n8 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex2k -A HV15R.aij -mat_type aijkokkos
*/
#include <petscmat.h>
#include <petscdevice.h>
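
/* SyncDevice() blocks the host until all outstanding device work has completed, so that
   the host-side timers below do not under-measure asynchronous GPU kernels */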
#if defined(PETSC_HAVE_CUDA)
  #include <petscdevice_cuda.h>
  #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
#elif defined(PETSC_HAVE_HIP)
  #include <petscdevice_hip.h>
  #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
#elif defined(PETSC_HAVE_KOKKOS)
  #include <Kokkos_Core.hpp>
  #define SyncDevice() Kokkos::fence()
#else
  #define SyncDevice()
#endif

int main(int argc, char **args)
{
  Mat            A, P, C;
  Mat            A2, P2, C2; /* MATAIJ shadows of A, P, C, used for initialization and validation */
  char           matTypeStr[64], prodTypeStr[32];
  char           fileA[PETSC_MAX_PATH_LEN], fileP[PETSC_MAX_PATH_LEN];
  PetscViewer    fdA, fdP;
  PetscBool      flg, flgA, flgP, equal = PETSC_FALSE;
  PetscLogStage  stage;
  PetscInt       i, n = 100, nskip = 2, M, N;
  MatInfo        info;
  PetscLogDouble tstart = 0, tend = 0, avgTime;
  PetscMPIInt    size;
  MatProductType prodType;
  PetscBool      isAP, isAtP, isAPt, isPtAP, isPAPt;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &args, (char *)0, help));
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));

  /* Read option -n */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));

  /* Get the file names of matrices A and (optionally) P */
  PetscCall(PetscOptionsGetString(NULL, NULL, "-A", fileA, PETSC_MAX_PATH_LEN, &flgA));
  PetscCall(PetscOptionsGetString(NULL, NULL, "-P", fileP, PETSC_MAX_PATH_LEN, &flgP));
  PetscCheck(flgA, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must give a petsc matrix binary file with the -A option");

  PetscCall(PetscOptionsGetString(NULL, NULL, "-mat_type", matTypeStr, sizeof(matTypeStr), &flg));
  if (!flg) PetscCall(PetscStrncpy(matTypeStr, MATAIJ, sizeof(matTypeStr))); /* Use the default if not provided */

  PetscCall(PetscOptionsGetString(NULL, NULL, "-prod_type", prodTypeStr, sizeof(prodTypeStr), &flg));
  if (!flg) PetscCall(PetscStrncpy(prodTypeStr, "AP", sizeof(prodTypeStr))); /* Use the default if not provided */

  PetscCall(PetscStrcmp(prodTypeStr, "AP", &isAP));
  PetscCall(PetscStrcmp(prodTypeStr, "AtP", &isAtP));
  PetscCall(PetscStrcmp(prodTypeStr, "APt", &isAPt));
  PetscCall(PetscStrcmp(prodTypeStr, "PtAP", &isPtAP));
  PetscCall(PetscStrcmp(prodTypeStr, "PAPt", &isPAPt));
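
  /* Map the user-facing product name to the corresponding MatProductType;
     note that PAPt maps to MATPRODUCT_RARt with R = P */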
  if (isAP) prodType = MATPRODUCT_AB;
  else if (isAtP) prodType = MATPRODUCT_AtB;
  else if (isAPt) prodType = MATPRODUCT_ABt;
  else if (isPtAP) prodType = MATPRODUCT_PtAP;
  else if (isPAPt) prodType = MATPRODUCT_RARt;
  else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_USER, "Unsupported product type %s", prodTypeStr);

  /* Read the matrix file into A2 */
  PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, fileA, FILE_MODE_READ, &fdA));
  PetscCall(MatCreate(PETSC_COMM_WORLD, &A2));
  PetscCall(MatSetType(A2, MATAIJ));
  PetscCall(MatLoad(A2, fdA));
  PetscCall(PetscViewerDestroy(&fdA));

  PetscCall(MatGetSize(A2, &M, &N));
  PetscCall(MatGetInfo(A2, MAT_GLOBAL_SUM, &info));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix A: %s, %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", fileA, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));

  /* Copy A2 to A and convert A to the specified type */
  PetscCall(MatDuplicate(A2, MAT_COPY_VALUES, &A));
  PetscCall(MatConvert(A, matTypeStr, MAT_INPLACE_MATRIX, &A));

  /* Initialize P and P2 similarly */
  if (flgP) { /* If the user provided P */
    PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, fileP, FILE_MODE_READ, &fdP));
    PetscCall(MatCreate(PETSC_COMM_WORLD, &P2));
    PetscCall(MatSetType(P2, MATAIJ));
    PetscCall(MatLoad(P2, fdP));
    PetscCall(PetscViewerDestroy(&fdP));

    PetscCall(MatDuplicate(P2, MAT_COPY_VALUES, &P));
    PetscCall(MatConvert(P, matTypeStr, MAT_INPLACE_MATRIX, &P));

    PetscCall(MatGetSize(P2, &M, &N));
    PetscCall(MatGetInfo(P2, MAT_GLOBAL_SUM, &info));
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix P: %s, %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", fileP, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));
  } else { /* otherwise just let P = A */
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix P = A\n"));
    P2 = A2;
    P  = A;
  }

  /* Compute the reference result C2 with the plain AIJ matrices */
  PetscCall(MatProductCreate(A2, P2, NULL, &C2));
  PetscCall(MatProductSetType(C2, prodType));
  PetscCall(MatProductSetFill(C2, PETSC_DEFAULT));
  PetscCall(MatProductSetFromOptions(C2));
  PetscCall(MatProductSymbolic(C2));
  PetscCall(MatProductNumeric(C2));
  PetscCall(MatGetSize(C2, &M, &N));
  PetscCall(MatGetInfo(C2, MAT_GLOBAL_SUM, &info));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Mat product C = %s: %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", prodTypeStr, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));

  /* Compute C */
  PetscCall(MatProductCreate(A, P, NULL, &C));
  PetscCall(MatProductSetType(C, prodType));
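  /* Let the backend library (e.g., cuSPARSE, hipSPARSE or Kokkos Kernels) provide the product algorithm */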
  PetscCall(MatProductSetAlgorithm(C, MATPRODUCTALGORITHMBACKEND));
  PetscCall(MatProductSetFill(C, PETSC_DEFAULT));
  PetscCall(MatProductSetFromOptions(C));

  /* Measure MatProductSymbolic */
  PetscCall(PetscLogStageRegister("MatProductSymbolic", &stage));
  PetscCall(PetscLogStagePush(stage));
  SyncDevice();
  PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
  PetscCall(PetscTime(&tstart));
  PetscCall(MatProductSymbolic(C));
  SyncDevice();
  PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
  PetscCall(PetscTime(&tend));
  avgTime = (tend - tstart) * 1e6; /* microseconds */
  PetscCall(PetscLogStagePop());
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\nMatProductSymbolic() time (us) with %d MPI ranks = %8.2f\n", size, avgTime));

  /* Measure MatProductNumeric */
  PetscCall(PetscLogStageRegister("MatProductNumeric", &stage));
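  /* The first nskip iterations are warm-up and are excluded from the timing, which starts once i reaches nskip */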
  for (i = 0; i < n + nskip; i++) {
    if (i == nskip) {
      SyncDevice();
      PetscCall(PetscLogStagePush(stage));
      PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
      PetscCall(PetscTime(&tstart));
    }
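    /* Re-set A and P on C, presumably so that MatProductNumeric() below performs a full recomputation each iteration */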
    PetscCall(MatProductReplaceMats(A, P, NULL, C));
    PetscCall(MatProductNumeric(C));
  }
  SyncDevice();
  PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
  PetscCall(PetscTime(&tend));
  avgTime = (tend - tstart) * 1e6 / n; /* microseconds */
  PetscCall(PetscLogStagePop());

  PetscCall(MatMultEqual(C, C2, 8, &equal)); /* Use MatMultEqual() instead of MatEqual() since C and C2 are not necessarily bitwise equal */

  PetscCheck(equal, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Error in matrix product");
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "MatProductNumeric() average time (us) with %d MPI ranks = %8.2f\n", size, avgTime));
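
  /* When -P was not given, P and P2 alias A and A2, so destroy them only if they were loaded separately */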
  PetscCall(MatDestroy(&A));
  if (flgP) PetscCall(MatDestroy(&P));
  PetscCall(MatDestroy(&C));

  PetscCall(MatDestroy(&A2));
  if (flgP) PetscCall(MatDestroy(&P2));
  PetscCall(MatDestroy(&C2));

  PetscCall(PetscFinalize());
  return 0;
}

/*TEST

  testset:
    args: -n 2 -A ${DATAFILESPATH}/matrices/small
    nsize: 1
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    requires: datafilespath !complex double !single kokkos_kernels

    test:
      suffix: 1
      requires: cuda
      args: -mat_type aijcusparse

    test:
      suffix: 2
      args: -mat_type aijkokkos

    test:
      suffix: 3
      requires: hip
      args: -mat_type aijhipsparse

TEST*/