Actual source code: mpits.c

#include <petscsys.h>
#include <petsc/private/petscimpl.h>

PetscLogEvent PETSC_BuildTwoSided;
PetscLogEvent PETSC_BuildTwoSidedF;

const char *const PetscBuildTwoSidedTypes[] = {"ALLREDUCE", "IBARRIER", "REDSCATTER", "PetscBuildTwoSidedType", "PETSC_BUILDTWOSIDED_", NULL};

static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;

/*@
   PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

   Logically Collective

   Input Parameters:
+  comm - communicator; the setting is currently global, so `PETSC_COMM_WORLD` should be used
-  twosided - algorithm to use in subsequent calls to `PetscCommBuildTwoSided()`

   Level: developer

   Note:
   This option is currently global, but could be made per-communicator.

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedGetType()`, `PetscBuildTwoSidedType`
@*/
PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm, PetscBuildTwoSidedType twosided)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) { /* check that the same enum value was passed on every rank */
    PetscMPIInt b1[2], b2[2];
    b1[0] = -(PetscMPIInt)twosided;
    b1[1] = (PetscMPIInt)twosided;
    PetscCall(MPIU_Allreduce(b1, b2, 2, MPI_INT, MPI_MAX, comm));
    PetscCheck(-b2[0] == b2[1], comm, PETSC_ERR_ARG_WRONG, "Enum value must be same on all processes");
  }
  _twosided_type = twosided;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   PetscCommBuildTwoSidedGetType - get algorithm used when building two-sided communication

   Logically Collective

   Input Parameter:
.  comm - communicator on which to query the algorithm

   Output Parameter:
.  twosided - algorithm used by `PetscCommBuildTwoSided()`

   Level: developer

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedSetType()`, `PetscBuildTwoSidedType`
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm, PetscBuildTwoSidedType *twosided)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
    PetscCallMPI(MPI_Comm_size(comm, &size));
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
#endif
    PetscCall(PetscOptionsGetEnum(NULL, NULL, "-build_twosided", PetscBuildTwoSidedTypes, (PetscEnum *)&_twosided_type, NULL));
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(PETSC_SUCCESS);
}
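
/* A minimal usage sketch (illustrative only, not part of mpits.c): select the MPI_Ibarrier-based
 * algorithm globally before the first rendezvous, then confirm which algorithm is in effect.
 * The helper name ExampleSelectTwoSidedAlgorithm is hypothetical. */
#if 0
static PetscErrorCode ExampleSelectTwoSidedAlgorithm(void)
{
  PetscBuildTwoSidedType type;

  PetscFunctionBegin;
  /* The setting is global, so PETSC_COMM_WORLD is the natural communicator to pass */
  PetscCall(PetscCommBuildTwoSidedSetType(PETSC_COMM_WORLD, PETSC_BUILDTWOSIDED_IBARRIER));
  PetscCall(PetscCommBuildTwoSidedGetType(PETSC_COMM_WORLD, &type));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "two-sided algorithm: %s\n", PetscBuildTwoSidedTypes[type]));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif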

#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
{
  PetscMPIInt    nrecvs, tag, done, i;
  MPI_Aint       lb, unitbytes;
  char          *tdata;
  MPI_Request   *sendreqs, barrier;
  PetscSegBuffer segrank, segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  tdata = (char *)todata;
  PetscCall(PetscMalloc1(nto, &sendreqs));
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank));
  PetscCall(PetscSegBufferCreate(unitbytes, 4 * count, &segdata));

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done = 0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status));
    if (flag) { /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank, 1, &recvrank));
      PetscCall(PetscSegBufferGet(segdata, count, &buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent, nsends;
      PetscCall(PetscMPIIntCast(nto, &nsends));
      PetscCallMPI(MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE));
      if (sent) {
        PetscCallMPI(MPI_Ibarrier(comm, &barrier));
        barrier_started = PETSC_TRUE;
        PetscCall(PetscFree(sendreqs));
      }
    } else {
      PetscCallMPI(MPI_Test(&barrier, &done, MPI_STATUS_IGNORE));
    }
  }
  *nfrom = nrecvs;
  PetscCall(PetscSegBufferExtractAlloc(segrank, fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata, fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
{
  PetscMPIInt       size, rank, *iflags, nrecvs, tag, *franks, i, flg;
  MPI_Aint          lb, unitbytes;
  char             *tdata, *fdata;
  MPI_Request      *reqs, *sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inner PETSc communicator does not have its tag/name counter attribute set");
  if (!counter->iflags) {
    PetscCall(PetscCalloc1(size, &counter->iflags));
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscCall(PetscArrayzero(iflags, size));
  }
  for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
  PetscCall(MPIU_Allreduce(MPI_IN_PLACE, iflags, size, MPI_INT, MPI_SUM, comm));
  nrecvs = iflags[rank];
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  PetscCall(PetscMalloc(nrecvs * count * unitbytes, &fdata));
  tdata = (char *)todata;
  PetscCall(PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses));
  sendreqs = reqs + nrecvs;
  for (i = 0; i < nrecvs; i++) PetscCallMPI(MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i));
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  PetscCallMPI(MPI_Waitall(nto + nrecvs, reqs, statuses));
  PetscCall(PetscMalloc1(nrecvs, &franks));
  for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscCall(PetscFree2(reqs, statuses));
  PetscCall(PetscCommDestroy(&comm));

  *nfrom             = nrecvs;
  *fromranks         = franks;
  *(void **)fromdata = fdata;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
{
  PetscMPIInt       size, *iflags, nrecvs, tag, *franks, i, flg;
  MPI_Aint          lb, unitbytes;
  char             *tdata, *fdata;
  MPI_Request      *reqs, *sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCallMPI(MPI_Comm_get_attr(comm, Petsc_Counter_keyval, &counter, &flg));
  PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inner PETSc communicator does not have its tag/name counter attribute set");
  if (!counter->iflags) {
    PetscCall(PetscCalloc1(size, &counter->iflags));
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    PetscCall(PetscArrayzero(iflags, size));
  }
  for (i = 0; i < nto; i++) iflags[toranks[i]] = 1;
  PetscCallMPI(MPI_Reduce_scatter_block(iflags, &nrecvs, 1, MPI_INT, MPI_SUM, comm));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  PetscCall(PetscMalloc(nrecvs * count * unitbytes, &fdata));
  tdata = (char *)todata;
  PetscCall(PetscMalloc2(nto + nrecvs, &reqs, nto + nrecvs, &statuses));
  sendreqs = reqs + nrecvs;
  for (i = 0; i < nrecvs; i++) PetscCallMPI(MPI_Irecv((void *)(fdata + count * unitbytes * i), count, dtype, MPI_ANY_SOURCE, tag, comm, reqs + i));
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Isend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  PetscCallMPI(MPI_Waitall(nto + nrecvs, reqs, statuses));
  PetscCall(PetscMalloc1(nrecvs, &franks));
  for (i = 0; i < nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  PetscCall(PetscFree2(reqs, statuses));
  PetscCall(PetscCommDestroy(&comm));

  *nfrom             = nrecvs;
  *fromranks         = franks;
  *(void **)fromdata = fdata;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*@C
   PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

   Collective

   Input Parameters:
+  comm - communicator
.  count - number of entries to send/receive (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length `nto`)
-  todata - data to send to each rank (packed)

   Output Parameters:
+  nfrom - number of ranks from which messages will be received
.  fromranks - ranks from which messages will be received (length `nfrom`; caller should `PetscFree()`)
-  fromdata - packed data from each rank, each with `count` entries of type `dtype` (length `nfrom`; caller responsible for `PetscFree()`)

   Options Database Key:
.  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication. Default is allreduce for communicators with <= 1024 ranks, otherwise ibarrier.

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
   `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other constant-size data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   References:
.  * - The `MPI_Ibarrier()`-based implementation uses the algorithm in Hoefler, Siebert and Lumsdaine,
   "Scalable communication protocols for dynamic sparse data exchange", 2010.

.seealso: `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`, `PetscCommBuildTwoSidedSetType()`, `PetscBuildTwoSidedType`
@*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata)
{
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscFunctionBegin;
  PetscCall(PetscSysInitializePackage());
  PetscCall(PetscLogEventSync(PETSC_BuildTwoSided, comm));
  PetscCall(PetscLogEventBegin(PETSC_BuildTwoSided, 0, 0, 0, 0));
  PetscCall(PetscCommBuildTwoSidedGetType(comm, &buildtype));
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    PetscCall(PetscCommBuildTwoSided_Ibarrier(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
#else
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
    PetscCall(PetscCommBuildTwoSided_Allreduce(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    PetscCall(PetscCommBuildTwoSided_RedScatter(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));
    break;
#else
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
  }
  PetscCall(PetscLogEventEnd(PETSC_BuildTwoSided, 0, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
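
/* A minimal usage sketch (illustrative only, not part of mpits.c): each rank tells rank
 * (rank+1)%size how many entries it will send later, and discovers which ranks targeted it.
 * The helper name ExampleDiscoverSenders is hypothetical. */
#if 0
static PetscErrorCode ExampleDiscoverSenders(MPI_Comm comm)
{
  PetscMPIInt rank, size, to, nfrom, *fromranks;
  PetscMPIInt todata, *fromdata;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  to     = (rank + 1) % size; /* single destination rank */
  todata = 100 + rank;        /* constant-size payload, e.g. a message length */
  PetscCall(PetscCommBuildTwoSided(comm, 1, MPI_INT, 1, &to, &todata, &nfrom, &fromranks, &fromdata));
  /* nfrom is the number of ranks that targeted this process; fromranks[i]/fromdata[i] identify them */
  PetscCall(PetscFree(fromranks));
  PetscCall(PetscFree(fromdata));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif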

static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
{
  PetscMPIInt  i, *tag;
  MPI_Aint     lb, unitbytes;
  MPI_Request *sendreq, *recvreq;

  PetscFunctionBegin;
  PetscCall(PetscMalloc1(ntags, &tag));
  if (ntags > 0) PetscCall(PetscCommDuplicate(comm, &comm, &tag[0]));
  for (i = 1; i < ntags; i++) PetscCall(PetscCommGetNewTag(comm, &tag[i]));

  /* Perform complete initial rendezvous */
  PetscCall(PetscCommBuildTwoSided(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata));

  PetscCall(PetscMalloc1(nto * ntags, &sendreq));
  PetscCall(PetscMalloc1(*nfrom * ntags, &recvreq));

  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  for (i = 0; i < nto; i++) {
    PetscMPIInt k;
    for (k = 0; k < ntags; k++) sendreq[i * ntags + k] = MPI_REQUEST_NULL;
    PetscCall((*send)(comm, tag, i, toranks[i], ((char *)todata) + count * unitbytes * i, sendreq + i * ntags, ctx));
  }
  for (i = 0; i < *nfrom; i++) {
    void       *header = (*(char **)fromdata) + count * unitbytes * i;
    PetscMPIInt k;
    for (k = 0; k < ntags; k++) recvreq[i * ntags + k] = MPI_REQUEST_NULL;
    PetscCall((*recv)(comm, tag, (*fromranks)[i], header, recvreq + i * ntags, ctx));
  }
  PetscCall(PetscFree(tag));
  PetscCall(PetscCommDestroy(&comm));
  *toreqs   = sendreq;
  *fromreqs = recvreq;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)

static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
{
  PetscMPIInt    nrecvs, tag, *tags, done, i;
  MPI_Aint       lb, unitbytes;
  char          *tdata;
  MPI_Request   *sendreqs, *usendreqs, *req, barrier;
  PetscSegBuffer segrank, segdata, segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  PetscCall(PetscCommDuplicate(comm, &comm, &tag));
  PetscCall(PetscMalloc1(ntags, &tags));
  for (i = 0; i < ntags; i++) PetscCall(PetscCommGetNewTag(comm, &tags[i]));
  PetscCallMPI(MPI_Type_get_extent(dtype, &lb, &unitbytes));
  PetscCheck(lb == 0, comm, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
  tdata = (char *)todata;
  PetscCall(PetscMalloc1(nto, &sendreqs));
  PetscCall(PetscMalloc1(nto * ntags, &usendreqs));
  /* Post synchronous sends */
  for (i = 0; i < nto; i++) PetscCallMPI(MPI_Issend((void *)(tdata + count * unitbytes * i), count, dtype, toranks[i], tag, comm, sendreqs + i));
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i = 0; i < nto; i++) {
    PetscMPIInt k;
    for (k = 0; k < ntags; k++) usendreqs[i * ntags + k] = MPI_REQUEST_NULL;
    PetscCall((*send)(comm, tags, i, toranks[i], tdata + count * unitbytes * i, usendreqs + i * ntags, ctx));
  }

  PetscCall(PetscSegBufferCreate(sizeof(PetscMPIInt), 4, &segrank));
  PetscCall(PetscSegBufferCreate(unitbytes, 4 * count, &segdata));
  PetscCall(PetscSegBufferCreate(sizeof(MPI_Request), 4, &segreq));

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done = 0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &flag, &status));
    if (flag) { /* incoming message */
      PetscMPIInt *recvrank, k;
      void        *buf;
      PetscCall(PetscSegBufferGet(segrank, 1, &recvrank));
      PetscCall(PetscSegBufferGet(segdata, count, &buf));
      *recvrank = status.MPI_SOURCE;
      PetscCallMPI(MPI_Recv(buf, count, dtype, status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE));
      PetscCall(PetscSegBufferGet(segreq, ntags, &req));
      for (k = 0; k < ntags; k++) req[k] = MPI_REQUEST_NULL;
      PetscCall((*recv)(comm, tags, status.MPI_SOURCE, buf, req, ctx));
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent, nsends;
      PetscCall(PetscMPIIntCast(nto, &nsends));
      PetscCallMPI(MPI_Testall(nsends, sendreqs, &sent, MPI_STATUSES_IGNORE));
      if (sent) {
        PetscCallMPI(MPI_Ibarrier(comm, &barrier));
        barrier_started = PETSC_TRUE;
      }
    } else {
      PetscCallMPI(MPI_Test(&barrier, &done, MPI_STATUS_IGNORE));
    }
  }
  *nfrom = nrecvs;
  PetscCall(PetscSegBufferExtractAlloc(segrank, fromranks));
  PetscCall(PetscSegBufferDestroy(&segrank));
  PetscCall(PetscSegBufferExtractAlloc(segdata, fromdata));
  PetscCall(PetscSegBufferDestroy(&segdata));
  *toreqs = usendreqs;
  PetscCall(PetscSegBufferExtractAlloc(segreq, fromreqs));
  PetscCall(PetscSegBufferDestroy(&segreq));
  PetscCall(PetscFree(sendreqs));
  PetscCall(PetscFree(tags));
  PetscCall(PetscCommDestroy(&comm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*@C
   PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous

   Collective

   Input Parameters:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length `nto`)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Parameters:
+  nfrom - number of ranks from which messages will be received
.  fromranks - ranks from which messages will be received (length `nfrom`; caller should `PetscFree()`)
-  fromdata - packed data from each rank, each with `count` entries of type `dtype` (length `nfrom`; caller responsible for `PetscFree()`)

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
   `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   References:
.  * - The `MPI_Ibarrier()`-based implementation uses the algorithm in Hoefler, Siebert and Lumsdaine,
   "Scalable communication protocols for dynamic sparse data exchange", 2010.

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedFReq()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
@*/
PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
{
  MPI_Request *toreqs, *fromreqs;

  PetscFunctionBegin;
  PetscCall(PetscCommBuildTwoSidedFReq(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, &toreqs, &fromreqs, send, recv, ctx));
  PetscCallMPI(MPI_Waitall(nto * ntags, toreqs, MPI_STATUSES_IGNORE));
  PetscCallMPI(MPI_Waitall(*nfrom * ntags, fromreqs, MPI_STATUSES_IGNORE));
  PetscCall(PetscFree(toreqs));
  PetscCall(PetscFree(fromreqs));
  PetscFunctionReturn(PETSC_SUCCESS);
}
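
/* A minimal callback sketch (illustrative only, not part of mpits.c): the rendezvous payload carries
 * a message length (count = 1, dtype = MPI_INT) and the callbacks post the variable-length payload
 * exchange.  ExampleCtx, ExampleSend and ExampleRecv are hypothetical names; a caller would pass
 * ExampleSend/ExampleRecv, ntags = 1, and a pointer to an ExampleCtx as ctx to PetscCommBuildTwoSidedF(). */
#if 0
typedef struct {
  PetscScalar **sendbufs; /* sendbufs[i] has tolens[i] entries, indexed like toranks */
  PetscMPIInt  *tolens;   /* also passed as todata so the receiver learns the lengths */
} ExampleCtx;

static PetscErrorCode ExampleSend(MPI_Comm comm, const PetscMPIInt tags[], PetscMPIInt i, PetscMPIInt rank, void *header, MPI_Request req[], void *ctx)
{
  ExampleCtx *ex = (ExampleCtx *)ctx;

  PetscFunctionBegin;
  /* header points at this destination's packed rendezvous entry (here: the advertised length) */
  PetscCallMPI(MPI_Isend(ex->sendbufs[i], ex->tolens[i], MPIU_SCALAR, rank, tags[0], comm, &req[0]));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode ExampleRecv(MPI_Comm comm, const PetscMPIInt tags[], PetscMPIInt rank, void *header, MPI_Request req[], void *ctx)
{
  PetscMPIInt  len = *(PetscMPIInt *)header; /* length delivered by the rendezvous */
  PetscScalar *buf;

  PetscFunctionBegin;
  PetscCall(PetscMalloc1(len, &buf)); /* a real caller would record buf in ctx so it can be used and freed later */
  PetscCallMPI(MPI_Irecv(buf, len, MPIU_SCALAR, rank, tags[0], comm, &req[0]));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif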

/*@C
   PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests

   Collective

   Input Parameters:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length `nto`)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Parameters:
+  nfrom - number of ranks from which messages will be received
.  fromranks - ranks from which messages will be received (length `nfrom`; caller should `PetscFree()`)
.  fromdata - packed data from each rank, each with `count` entries of type `dtype` (length `nfrom`; caller responsible for `PetscFree()`)
.  toreqs - array of `nto`*`ntags` sender requests (caller must wait on these, then `PetscFree()`)
-  fromreqs - array of `nfrom`*`ntags` receiver requests (caller must wait on these, then `PetscFree()`)

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling `PetscGatherNumberOfMessages()` and
   `PetscGatherMessageLengths()`, possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   References:
.  * - The `MPI_Ibarrier()`-based implementation uses the algorithm in Hoefler, Siebert and Lumsdaine,
   "Scalable communication protocols for dynamic sparse data exchange", 2010.

.seealso: `PetscCommBuildTwoSided()`, `PetscCommBuildTwoSidedF()`, `PetscGatherNumberOfMessages()`, `PetscGatherMessageLengths()`
@*/
PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm, PetscMPIInt count, MPI_Datatype dtype, PetscMPIInt nto, const PetscMPIInt *toranks, const void *todata, PetscMPIInt *nfrom, PetscMPIInt **fromranks, void *fromdata, PetscMPIInt ntags, MPI_Request **toreqs, MPI_Request **fromreqs, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx)
{
  PetscErrorCode (*f)(MPI_Comm, PetscMPIInt, MPI_Datatype, PetscMPIInt, const PetscMPIInt[], const void *, PetscMPIInt *, PetscMPIInt **, void *, PetscMPIInt, MPI_Request **, MPI_Request **, PetscErrorCode (*send)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *), PetscErrorCode (*recv)(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *), void *ctx);
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
  PetscMPIInt            i, size;

  PetscFunctionBegin;
  PetscCall(PetscSysInitializePackage());
  PetscCallMPI(MPI_Comm_size(comm, &size));
  for (i = 0; i < nto; i++) PetscCheck(toranks[i] >= 0 && size > toranks[i], comm, PETSC_ERR_ARG_OUTOFRANGE, "toranks[%d] %d not in comm size %d", i, toranks[i], size);
  PetscCall(PetscLogEventSync(PETSC_BuildTwoSidedF, comm));
  PetscCall(PetscLogEventBegin(PETSC_BuildTwoSidedF, 0, 0, 0, 0));
  PetscCall(PetscCommBuildTwoSidedGetType(comm, &buildtype));
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
    f = PetscCommBuildTwoSidedFReq_Ibarrier;
    break;
#else
    SETERRQ(comm, PETSC_ERR_PLIB, "MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
  case PETSC_BUILDTWOSIDED_REDSCATTER:
    f = PetscCommBuildTwoSidedFReq_Reference;
    break;
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unknown method for building two-sided communication");
  }
  PetscCall((*f)(comm, count, dtype, nto, toranks, todata, nfrom, fromranks, fromdata, ntags, toreqs, fromreqs, send, recv, ctx));
  PetscCall(PetscLogEventEnd(PETSC_BuildTwoSidedF, 0, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
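
/* A minimal caller sketch (illustrative only, not part of mpits.c) for the request-returning variant:
 * identical to PetscCommBuildTwoSidedF() except the caller waits on the returned requests itself, which
 * allows overlapping the payload exchange with other work.  ExampleSend/ExampleRecv are hypothetical
 * callbacks with the documented signatures (see the sketch above); ExampleOverlappedExchange is also
 * hypothetical. */
#if 0
extern PetscErrorCode ExampleSend(MPI_Comm, const PetscMPIInt[], PetscMPIInt, PetscMPIInt, void *, MPI_Request[], void *);
extern PetscErrorCode ExampleRecv(MPI_Comm, const PetscMPIInt[], PetscMPIInt, void *, MPI_Request[], void *);

static PetscErrorCode ExampleOverlappedExchange(MPI_Comm comm, PetscMPIInt nto, const PetscMPIInt *toranks, const PetscMPIInt *tolens, void *ctx)
{
  PetscMPIInt  nfrom, *fromranks, *fromlens;
  MPI_Request *toreqs, *fromreqs;

  PetscFunctionBegin;
  /* count = 1, dtype = MPI_INT, ntags = 1: the rendezvous carries one length per destination */
  PetscCall(PetscCommBuildTwoSidedFReq(comm, 1, MPI_INT, nto, toranks, tolens, &nfrom, &fromranks, &fromlens, 1, &toreqs, &fromreqs, ExampleSend, ExampleRecv, ctx));
  /* ...unrelated computation can proceed here while the payload messages are in flight... */
  PetscCallMPI(MPI_Waitall(nto, toreqs, MPI_STATUSES_IGNORE));
  PetscCallMPI(MPI_Waitall(nfrom, fromreqs, MPI_STATUSES_IGNORE));
  PetscCall(PetscFree(toreqs));
  PetscCall(PetscFree(fromreqs));
  PetscCall(PetscFree(fromranks));
  PetscCall(PetscFree(fromlens));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif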