Actual source code: ex1k.kokkos.cxx
1: static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";
3: /*
4: This is a simple test to measure the latency of MPI communication.
5: The test is run with two processes. The first process sends a message
6: to the second process, and after having received the message, the second
7: process sends a message back to the first process once. The is repeated
8: a number of times. The latency is defined as half time of the round-trip.
10: It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).
12: Usage: mpirun -n 2 ./ex1k -mtype <type>
13: Other arguments have a default value that is also used in osu_latency.
15: Examples:
17: On Summit at OLCF:
18: jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex1k -mtype kokkos
20: On Crusher at OLCF:
21: srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex1k -mtype kokkos
22: */
23: #include <petscsf.h>
24: #include <Kokkos_Core.hpp>
26: /* Same values as OSU microbenchmarks */
27: #define LAT_LOOP_SMALL 10000
28: #define LAT_SKIP_SMALL 100
29: #define LAT_LOOP_LARGE 1000
30: #define LAT_SKIP_LARGE 10
31: #define LARGE_MESSAGE_SIZE 8192
33: int main(int argc, char **argv)
34: {
35: PetscSF sf[64];
36: PetscLogDouble t_start = 0, t_end = 0, time[64];
37: PetscInt i, j, n, nroots, nleaves, niter = 100, nskip = 10;
38: PetscInt maxn = 512 * 1024; /* max 4M bytes messages */
39: PetscSFNode *iremote;
40: PetscMPIInt rank, size;
41: PetscScalar *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
42: size_t msgsize;
43: PetscMemType mtype = PETSC_MEMTYPE_HOST;
44: char mstring[16] = {0};
45: PetscBool set;
46: PetscInt skipSmall = -1, loopSmall = -1;
47: MPI_Op op = MPI_REPLACE;
49: PetscFunctionBeginUser;
50: PetscCall(PetscInitialize(&argc, &argv, NULL, help));
51: PetscCall(PetscKokkosInitializeCheck());
53: PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
54: PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
55: PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes");
57: PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */
58: PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL));
59: PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL));
61: PetscCall(PetscMalloc1(maxn, &iremote));
62: PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set));
63: if (set) {
64: PetscBool isHost, isKokkos;
65: PetscCall(PetscStrcasecmp(mstring, "host", &isHost));
66: PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos));
67: if (isHost) mtype = PETSC_MEMTYPE_HOST;
68: else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS;
69: else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
70: }
72: if (mtype == PETSC_MEMTYPE_HOST) {
73: PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata));
74: } else {
75: PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
76: PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
77: }
78: PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf));
79: for (i = 0; i < maxn; i++) {
80: pbuf[i] = 123.0;
81: ebuf[i] = 456.0;
82: }
84: for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
85: PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i]));
86: PetscCall(PetscSFSetFromOptions(sf[i]));
87: if (rank == 0) {
88: nroots = n;
89: nleaves = 0;
90: } else {
91: nroots = 0;
92: nleaves = n;
93: for (j = 0; j < nleaves; j++) {
94: iremote[j].rank = 0;
95: iremote[j].index = j;
96: }
97: }
98: PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
99: PetscCall(PetscSFSetUp(sf[i]));
100: }
102: if (loopSmall > 0) {
103: nskip = skipSmall;
104: niter = loopSmall;
105: } else {
106: nskip = LAT_SKIP_SMALL;
107: niter = LAT_LOOP_SMALL;
108: }
110: for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
111: msgsize = sizeof(PetscScalar) * n;
112: if (mtype == PETSC_MEMTYPE_HOST) {
113: PetscCall(PetscArraycpy(rootdata, pbuf, n));
114: PetscCall(PetscArraycpy(leafdata, ebuf, n));
115: } else {
116: Kokkos::View<PetscScalar *> dst1((PetscScalar *)rootdata, n);
117: Kokkos::View<PetscScalar *> dst2((PetscScalar *)leafdata, n);
118: Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n);
119: Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n);
120: PetscCallCXX(Kokkos::deep_copy(dst1, src1));
121: PetscCallCXX(Kokkos::deep_copy(dst2, src2));
122: }
124: if (msgsize > LARGE_MESSAGE_SIZE) {
125: nskip = LAT_SKIP_LARGE;
126: niter = LAT_LOOP_LARGE;
127: }
128: PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
130: for (i = 0; i < niter + nskip; i++) {
131: if (i == nskip) {
132: PetscCallCXX(Kokkos::fence());
133: PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
134: t_start = MPI_Wtime();
135: }
136: PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op));
137: PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op));
138: PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op));
139: PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op));
140: }
141: PetscCallCXX(Kokkos::fence());
142: PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
143: t_end = MPI_Wtime();
144: time[j] = (t_end - t_start) * 1e6 / (niter * 2);
145: }
147: PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t## PetscSF Ping-pong test on %s ##\n Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device"));
148: for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
149: PetscCall(PetscSFDestroy(&sf[j]));
150: PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]));
151: }
152: PetscCall(PetscFree2(pbuf, ebuf));
153: if (mtype == PETSC_MEMTYPE_HOST) {
154: PetscCall(PetscFree2(rootdata, leafdata));
155: } else {
156: PetscCallCXX(Kokkos::kokkos_free(rootdata));
157: PetscCallCXX(Kokkos::kokkos_free(leafdata));
158: }
159: PetscCall(PetscFree(iremote));
160: PetscCall(PetscFinalize());
161: return 0;
162: }
164: /*TEST
165: testset:
166: requires: kokkos
167: # use small numbers to make the test cheap
168: args: -maxn 4 -skipSmall 1 -loopSmall 1
169: filter: grep "DOES_NOT_EXIST"
170: output_file: output/empty.out
171: nsize: 2
173: test:
174: args: -mtype {{host kokkos}}
176: test:
177: requires: mpix_stream
178: args: -mtype kokkos -sf_use_stream_aware_mpi 1
180: TEST*/