Actual source code: ex1k.kokkos.cxx

static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";

/*
  This is a simple test to measure the latency of MPI communication.
  The test is run with two processes.  The first process sends a message
  to the second process, and after having received the message, the second
  process sends a message back to the first process.  This is repeated
  a number of times.  The latency is defined as half of the round-trip time.

  It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).

  Usage: mpirun -n 2 ./ex1k -mtype <type>
  Other arguments have a default value that is also used in osu_latency.

  Examples:

  On Summit at OLCF:
    jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex1k -mtype kokkos

  On Crusher at OLCF:
    srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex1k -mtype kokkos
*/
#include <petscsf.h>
#include <Kokkos_Core.hpp>

/* Same values as OSU microbenchmarks */
#define LAT_LOOP_SMALL     10000
#define LAT_SKIP_SMALL     100
#define LAT_LOOP_LARGE     1000
#define LAT_SKIP_LARGE     10
#define LARGE_MESSAGE_SIZE 8192

int main(int argc, char **argv)
{
  PetscSF        sf[64];
  PetscLogDouble t_start = 0, t_end = 0, time[64];
  PetscInt       i, j, n, nroots, nleaves, niter = 100, nskip = 10;
  PetscInt       maxn = 512 * 1024; /* max message size: 512K PetscScalars, i.e. 4M bytes with 8-byte scalars */
  PetscSFNode   *iremote;
  PetscMPIInt    rank, size;
  PetscScalar   *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
  size_t         msgsize;
  PetscMemType   mtype       = PETSC_MEMTYPE_HOST;
  char           mstring[16] = {0};
  PetscBool      set;
  PetscInt       skipSmall = -1, loopSmall = -1;
  MPI_Op         op = MPI_REPLACE;
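  /* With op = MPI_REPLACE, the broadcasts and reductions below perform plain copies (no
     arithmetic combination), so each timed iteration is pure point-to-point data movement */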

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, NULL, help));
  PetscCall(PetscKokkosInitializeCheck());

  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
  PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
  PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes");

  PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL));

  PetscCall(PetscMalloc1(maxn, &iremote));
  PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set));
  if (set) {
    PetscBool isHost, isKokkos;
    PetscCall(PetscStrcasecmp(mstring, "host", &isHost));
    PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos));
    if (isHost) mtype = PETSC_MEMTYPE_HOST;
    else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS;
    else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
  }

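  /* Allocate root/leaf buffers either on the host or with Kokkos, in which case they live in
     Kokkos' default memory space (typically device memory when a GPU backend is enabled) */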
  if (mtype == PETSC_MEMTYPE_HOST) {
    PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata));
  } else {
    PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
    PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
  }
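  /* pbuf/ebuf are host buffers holding known values; they are copied into rootdata/leafdata
     before timing each message size */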
  PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf));
  for (i = 0; i < maxn; i++) {
    pbuf[i] = 123.0;
    ebuf[i] = 456.0;
  }

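  /* For each power-of-two message size n, build an SF whose n leaves on rank 1 point one-to-one
     to the n roots on rank 0, so a Bcast moves n scalars from rank 0 to rank 1 and a Reduce
     moves them back */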
  for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
    PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i]));
    PetscCall(PetscSFSetFromOptions(sf[i]));
    if (rank == 0) {
      nroots  = n;
      nleaves = 0;
    } else {
      nroots  = 0;
      nleaves = n;
      for (j = 0; j < nleaves; j++) {
        iremote[j].rank  = 0;
        iremote[j].index = j;
      }
    }
    PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
    PetscCall(PetscSFSetUp(sf[i]));
  }

  if (loopSmall > 0) {
    nskip = skipSmall;
    niter = loopSmall;
  } else {
    nskip = LAT_SKIP_SMALL;
    niter = LAT_LOOP_SMALL;
  }

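  /* Loop over the message sizes: (re)fill the root/leaf buffers, then time niter ping-pong
     rounds after nskip untimed warm-up rounds */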
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    msgsize = sizeof(PetscScalar) * n;
    if (mtype == PETSC_MEMTYPE_HOST) {
      PetscCall(PetscArraycpy(rootdata, pbuf, n));
      PetscCall(PetscArraycpy(leafdata, ebuf, n));
    } else {
      Kokkos::View<PetscScalar *>                          dst1((PetscScalar *)rootdata, n);
      Kokkos::View<PetscScalar *>                          dst2((PetscScalar *)leafdata, n);
      Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n);
      Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n);
      PetscCallCXX(Kokkos::deep_copy(dst1, src1));
      PetscCallCXX(Kokkos::deep_copy(dst2, src2));
    }

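    /* Beyond LARGE_MESSAGE_SIZE bytes, switch to the shorter loop/skip counts that osu_latency
       uses for large messages */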
    if (msgsize > LARGE_MESSAGE_SIZE) {
      nskip = LAT_SKIP_LARGE;
      niter = LAT_LOOP_LARGE;
    }
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));

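    /* The first nskip iterations are untimed warm-up; the timer starts only after a Kokkos fence
       and an MPI barrier so that both ranks (and any outstanding device work) are synchronized */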
    for (i = 0; i < niter + nskip; i++) {
      if (i == nskip) {
        PetscCallCXX(Kokkos::fence());
        PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
        t_start = MPI_Wtime();
      }
      PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op));
      PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op));
      PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op));
      PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op));
    }
    PetscCallCXX(Kokkos::fence());
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
    t_end   = MPI_Wtime();
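    /* Each timed iteration does one Bcast (rank 0 -> rank 1) and one Reduce (rank 1 -> rank 0),
       i.e. one round trip, so the one-way latency in microseconds is half the average round-trip time */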
    time[j] = (t_end - t_start) * 1e6 / (niter * 2);
  }

  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t##  PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device"));
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    PetscCall(PetscSFDestroy(&sf[j]));
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]));
  }
  PetscCall(PetscFree2(pbuf, ebuf));
  if (mtype == PETSC_MEMTYPE_HOST) {
    PetscCall(PetscFree2(rootdata, leafdata));
  } else {
    PetscCallCXX(Kokkos::kokkos_free(rootdata));
    PetscCallCXX(Kokkos::kokkos_free(leafdata));
  }
  PetscCall(PetscFree(iremote));
  PetscCall(PetscFinalize());
  return 0;
}

/*TEST
  testset:
    requires: kokkos
    # use small numbers to make the test cheap
    args: -maxn 4 -skipSmall 1 -loopSmall 1
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    nsize: 2

    test:
      args: -mtype {{host kokkos}}

    test:
      requires: mpix_stream
      args: -mtype kokkos -sf_use_stream_aware_mpi 1

TEST*/