Actual source code: sfpack.c

  1: #include "petsc/private/sfimpl.h"
  2: #include <../src/vec/is/sf/impls/basic/sfpack.h>
  3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>

  5: /* This file contains packing facilities, with dispatch to the device if enabled. */

  7: /*
  8:  * MPI_Reduce_local is not really useful here because it cannot handle sparse data and it vectorizes "in the wrong direction";
  9:  * therefore we pack data types manually. This file defines packing routines for the standard data types.
 10:  */

 12: #define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d

 14: /* Operations working like s += t */
 15: #define OP_BINARY(op, s, t) \
 16:   do { \
 17:     (s) = (s)op(t); \
 18:   } while (0) /* binary ops in the middle such as +, *, && etc. */
 19: #define OP_FUNCTION(op, s, t) \
 20:   do { \
 21:     (s) = op((s), (t)); \
 22:   } while (0) /* ops like a function, such as PetscMax, PetscMin */
 23: #define OP_LXOR(op, s, t) \
 24:   do { \
 25:     (s) = (!(s)) != (!(t)); \
 26:   } while (0) /* logical exclusive OR */
 27: #define OP_ASSIGN(op, s, t) \
 28:   do { \
 29:     (s) = (t); \
 30:   } while (0)
 31: /* Modeled on MPI MAXLOC/MINLOC */
 32: #define OP_XLOC(op, s, t) \
 33:   do { \
 34:     if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \
 35:     else if (!((s).u op(t).u)) s = t; \
 36:   } while (0)
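/* Illustration (not part of the build): with the definitions above, OP_BINARY(+, s, t) expands to
   "do { (s) = (s) + (t); } while (0)" and OP_FUNCTION(PetscMax, s, t) expands to
   "do { (s) = PetscMax((s), (t)); } while (0)". The DEF_* macros below substitute one of these
   OpApply macros into a common loop body, so the same skeleton yields the insert, add, min/max,
   logical and bitwise variants. */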

 38: /* DEF_PackFunc - macro defining a Pack routine

 40:    Arguments of the macro:
 41:    +Type      Type of the basic data in an entry, e.g., int, PetscInt, PetscReal etc. It is not the type of an entry.
 42:    .BS        Block size for vectorization. It is a factor of bs.
 43:    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

 45:    Arguments of the Pack routine:
 46:    +count     Number of indices in idx[].
 47:    .start     When opt and idx are NULL, indices are contiguous and start is the first index; otherwise, not used.
 48:    .opt       Per-pack optimization plan. NULL means no such plan.
 49:    .idx       Indices of entries to be packed.
 50:    .link      Provides a context for the current call, such as link->bs, the number of basic types in an entry. For example, if unit is MPI_2INT, then bs=2 and the basic type is int.
 51:    .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
 52:    -packed    Address of the packed data.
 53:  */
 54: #define DEF_PackFunc(Type, BS, EQ) \
 55:   static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) \
 56:   { \
 57:     const Type    *u = (const Type *)unpacked, *u2; \
 58:     Type          *p = (Type *)packed, *p2; \
 59:     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
 60:     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
 61:     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
 62:     PetscFunctionBegin; \
 63:     if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \
 64:     else if (opt) { /* has optimizations available */ p2 = p; \
 65:       for (r = 0; r < opt->n; r++) { \
 66:         u2 = u + opt->start[r] * MBS; \
 67:         X  = opt->X[r]; \
 68:         Y  = opt->Y[r]; \
 69:         for (k = 0; k < opt->dz[r]; k++) \
 70:           for (j = 0; j < opt->dy[r]; j++) { \
 71:             PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \
 72:             p2 += opt->dx[r] * MBS; \
 73:           } \
 74:       } \
 75:     } else { \
 76:       for (i = 0; i < count; i++) \
 77:         for (j = 0; j < M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
 78:           for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
 79:             p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \
 80:     } \
 81:     PetscFunctionReturn(PETSC_SUCCESS); \
 82:   }
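/* Illustration: DEF_PackFunc(PetscInt, 2, 1) defines a routine named Pack_PetscInt_2_1
   (via CPPJoin4). Because EQ=1, M and MBS are the compile-time constants 1 and 2, so the
   innermost loops of the general branch collapse to

     for (k = 0; k < 2; k++) p[i * 2 + k] = u[idx[i] * 2 + k];

   which the compiler can unroll or vectorize. The EQ=0 variants keep the extra j-loop to
   handle units whose block size is only a multiple of BS. */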

 84: /* DEF_UnpackFunc - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
 85:                 and inserts it into a sparse array.

 87:    Arguments:
 88:   +Type       Type of the data
 89:   .BS         Block size for vectorization
 90:   -EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.

 92:   Notes:
 93:    This macro is not combined with DEF_UnpackAndOp because we want to use memcpy (PetscArraycpy) in this macro.
 94:  */
 95: #define DEF_UnpackFunc(Type, BS, EQ) \
 96:   static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
 97:   { \
 98:     Type          *u = (Type *)unpacked, *u2; \
 99:     const Type    *p = (const Type *)packed; \
100:     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
101:     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
102:     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
103:     PetscFunctionBegin; \
104:     if (!idx) { \
105:       u += start * MBS; \
106:       if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); \
107:     } else if (opt) { /* has optimizations available */ \
108:       for (r = 0; r < opt->n; r++) { \
109:         u2 = u + opt->start[r] * MBS; \
110:         X  = opt->X[r]; \
111:         Y  = opt->Y[r]; \
112:         for (k = 0; k < opt->dz[r]; k++) \
113:           for (j = 0; j < opt->dy[r]; j++) { \
114:             PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \
115:             p += opt->dx[r] * MBS; \
116:           } \
117:       } \
118:     } else { \
119:       for (i = 0; i < count; i++) \
120:         for (j = 0; j < M; j++) \
121:           for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \
122:     } \
123:     PetscFunctionReturn(PETSC_SUCCESS); \
124:   }

126: /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert

128:    Arguments:
129:   +Type       Type of the data
130:   .BS         Block size for vectorization
131:   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
132:   .Opname     Name of the Op, such as Add, Mult, LAND, etc.
133:   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
134:   -OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, or OP_LXOR
135:  */
136: #define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \
137:   static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
138:   { \
139:     Type          *u = (Type *)unpacked, *u2; \
140:     const Type    *p = (const Type *)packed; \
141:     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
142:     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
143:     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
144:     PetscFunctionBegin; \
145:     if (!idx) { \
146:       u += start * MBS; \
147:       for (i = 0; i < count; i++) \
148:         for (j = 0; j < M; j++) \
149:           for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
150:     } else if (opt) { /* idx[] has patterns */ \
151:       for (r = 0; r < opt->n; r++) { \
152:         u2 = u + opt->start[r] * MBS; \
153:         X  = opt->X[r]; \
154:         Y  = opt->Y[r]; \
155:         for (k = 0; k < opt->dz[r]; k++) \
156:           for (j = 0; j < opt->dy[r]; j++) { \
157:             for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \
158:             p += opt->dx[r] * MBS; \
159:           } \
160:       } \
161:     } else { \
162:       for (i = 0; i < count; i++) \
163:         for (j = 0; j < M; j++) \
164:           for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
165:     } \
166:     PetscFunctionReturn(PETSC_SUCCESS); \
167:   }
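/* Illustration: DEF_UnpackAndOp(PetscInt, 1, 1, Add, +, OP_BINARY) generates
   UnpackAndAdd_PetscInt_1_1, whose general (indexed) branch reduces to

     for (i = 0; i < count; i++) u[idx[i]] += p[i];

   i.e., a sparse scatter-add from the contiguous pack buffer into the user array. */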

169: #define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \
170:   static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) \
171:   { \
172:     Type          *u = (Type *)unpacked, *p = (Type *)packed, tmp; \
173:     PetscInt       i, j, k, r, l, bs = link->bs; \
174:     const PetscInt M   = (EQ) ? 1 : bs / BS; \
175:     const PetscInt MBS = M * BS; \
176:     PetscFunctionBegin; \
177:     for (i = 0; i < count; i++) { \
178:       r = (!idx ? start + i : idx[i]) * MBS; \
179:       l = i * MBS; \
180:       for (j = 0; j < M; j++) \
181:         for (k = 0; k < BS; k++) { \
182:           tmp = u[r + j * BS + k]; \
183:           OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \
184:           p[l + j * BS + k] = tmp; \
185:         } \
186:     } \
187:     PetscFunctionReturn(PETSC_SUCCESS); \
188:   }

190: #define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \
191:   static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) \
192:   { \
193:     const Type    *u = (const Type *)src; \
194:     Type          *v = (Type *)dst; \
195:     PetscInt       i, j, k, s, t, X, Y, bs = link->bs; \
196:     const PetscInt M   = (EQ) ? 1 : bs / BS; \
197:     const PetscInt MBS = M * BS; \
198:     PetscFunctionBegin; \
199:     if (!srcIdx) { /* src is contiguous */ \
200:       u += srcStart * MBS; \
201:       PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); \
202:     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
203:       u += srcOpt->start[0] * MBS; \
204:       v += dstStart * MBS; \
205:       X = srcOpt->X[0]; \
206:       Y = srcOpt->Y[0]; \
207:       for (k = 0; k < srcOpt->dz[0]; k++) \
208:         for (j = 0; j < srcOpt->dy[0]; j++) { \
209:           for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \
210:           v += srcOpt->dx[0] * MBS; \
211:         } \
212:     } else { /* all other cases */ \
213:       for (i = 0; i < count; i++) { \
214:         s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \
215:         t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \
216:         for (j = 0; j < M; j++) \
217:           for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \
218:       } \
219:     } \
220:     PetscFunctionReturn(PETSC_SUCCESS); \
221:   }
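/* The ScatterAnd* routines move data directly between two (possibly indexed) arrays on the
   same process, bypassing the pack/unpack buffers; they serve the local (self-to-self) part
   of an SF communication. For instance, the general branch of ScatterAndAdd_PetscInt_1_1
   amounts to v[dstIdx[i]] += u[srcIdx[i]] for each i. */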

223: #define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \
224:   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) \
225:   { \
226:     Type          *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \
227:     const Type    *ldata = (const Type *)leafdata; \
228:     PetscInt       i, j, k, r, l, bs = link->bs; \
229:     const PetscInt M   = (EQ) ? 1 : bs / BS; \
230:     const PetscInt MBS = M * BS; \
231:     PetscFunctionBegin; \
232:     for (i = 0; i < count; i++) { \
233:       r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \
234:       l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \
235:       for (j = 0; j < M; j++) \
236:         for (k = 0; k < BS; k++) { \
237:           lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \
238:           OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \
239:         } \
240:     } \
241:     PetscFunctionReturn(PETSC_SUCCESS); \
242:   }
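/* Both Fetch variants fetch the old root value and then apply the op. With Add,
   FetchAndAdd_* writes the old value of u[root] into the pack buffer and then does
   u[root] += incoming value; FetchAndAddLocal_* does the same but reads the contributions
   from leafdata and returns the old root values in leafupdate. For example, if u[r] = 10
   and the incoming value is 3, afterwards u[r] = 13 and the fetched value is 10. */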

244: /* Pack, Unpack/Fetch ops */
245: #define DEF_Pack(Type, BS, EQ) \
246:   DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) \
247:   { \
248:     link->h_Pack             = CPPJoin4(Pack, Type, BS, EQ); \
249:     link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \
250:     link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \
251:   }

253: /* Add, Mult ops */
254: #define DEF_Add(Type, BS, EQ) \
255:   DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) \
256:   { \
257:     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \
258:     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult, Type, BS, EQ); \
259:     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd, Type, BS, EQ); \
260:     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \
261:     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult, Type, BS, EQ); \
262:     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \
263:   }

265: /* Max, Min ops */
266: #define DEF_Cmp(Type, BS, EQ) \
267:   DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) \
268:   { \
269:     link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
270:     link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
271:     link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
272:     link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
273:   }

275: /* Logical ops.
276:   The operator argument passed to OP_LXOR is unused and could be empty, but we pass || to avoid
277:   the compilation warning "empty macro arguments are undefined in ISO C90"
278:  */
279: #define DEF_Log(Type, BS, EQ) \
280:   DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) \
281:   { \
282:     link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \
283:     link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \
284:     link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \
285:     link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \
286:     link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \
287:     link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \
288:   }

290: /* Bitwise ops */
291: #define DEF_Bit(Type, BS, EQ) \
292:   DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) \
293:   { \
294:     link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \
295:     link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \
296:     link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \
297:     link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \
298:     link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \
299:     link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \
300:   }

302: /* Maxloc, Minloc ops */
303: #define DEF_Xloc(Type, BS, EQ) \
304:   DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) \
305:   { \
306:     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
307:     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
308:     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
309:     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
310:   }
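/* OP_XLOC mirrors the MPI MAXLOC/MINLOC rules on (u, i) pairs: with op '>', the pair with
   the larger u wins, and on a tie the smaller index i is kept. For example, combining
   s = {u=5, i=7} with t = {u=5, i=2} yields {u=5, i=2}, while combining s = {u=5, i=7}
   with t = {u=9, i=4} replaces s by t entirely. */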

312: #define DEF_IntegerType(Type, BS, EQ) \
313:   DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) \
314:   { \
315:     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
316:     CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
317:     CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
318:     CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \
319:     CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \
320:   }

322: #define DEF_RealType(Type, BS, EQ) \
323:   DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) \
324:   { \
325:     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
326:     CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
327:     CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
328:   }

330: #if defined(PETSC_HAVE_COMPLEX)
331:   #define DEF_ComplexType(Type, BS, EQ) \
332:     DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) \
333:     { \
334:       CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
335:       CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
336:     }
337: #endif

339: #define DEF_DumbType(Type, BS, EQ) \
340:   DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) \
341:   { \
342:     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
343:   }

345: /* Maxloc, Minloc */
346: #define DEF_PairType(Type, BS, EQ) \
347:   DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) \
348:   { \
349:     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
350:     CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \
351:   }

353: DEF_IntegerType(PetscInt, 1, 1)   /* unit = 1 MPIU_INT  */
354:   DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */
355:   DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */
356:   DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */
357:   DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */
358:   DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */
359:   DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */
360:   DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
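/* Example of how these instantiations are dispatched in PetscSFLinkSetUp_Host() below:
   a unit of 3 contiguous MPIU_INTs matches none of the BS = 8/4/2 cases, so
   PackInit_IntegerType_PetscInt_1_0 is chosen (BS=1, EQ=0, hence M = bs/BS = 3 at run time),
   whereas a unit of exactly 4 MPIU_INTs gets the fully constant-folded
   PackInit_IntegerType_PetscInt_4_1. */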

362: #if defined(PETSC_USE_64BIT_INDICES) /* No need (though it would be harmless) to generate redundant functions when PetscInt is just int */
363:   DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0)
364: #endif

 366:   /* The typedefs are used to get single-word typenames (no spaces) that CPPJoin4 can handle */
367:   typedef signed char SignedChar;
368: DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0)

370:   typedef unsigned char UnsignedChar;
371: DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0)

373:   DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0)
374: #if defined(PETSC_HAVE_COMPLEX)
375:     DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0)
376: #endif

378: #define PairType(Type1, Type2) Type1##_##Type2
379:       typedef struct {
380:   int u;
381:   int i;
382: } PairType(int, int);
383: typedef struct {
384:   PetscInt u;
385:   PetscInt i;
386: } PairType(PetscInt, PetscInt);
387: DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1)

389:   /* If we don't know the basic type, we treat it as a stream of chars or ints */
390:   DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0)

392:     typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
393: DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0)

395:   PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link)
396: {
397:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
398:   PetscInt       i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8;

400:   PetscFunctionBegin;
401:   /* Destroy device-specific fields */
402:   if (link->deviceinited) PetscCall((*link->Destroy)(sf, link));

404:   /* Destroy host related fields */
405:   if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit));
406:   if (!link->use_nvshmem) {
407:     for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */
408:       if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i]));
409:     }
410:     PetscCall(PetscFree(link->reqs));
411:     for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
412:       PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]));
413:       PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]));
414:     }
415:   }
416:   PetscCall(PetscFree(link));
417:   PetscFunctionReturn(PETSC_SUCCESS);
418: }

420: PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
421: {
422:   PetscFunctionBegin;
423:   PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata));
424: #if defined(PETSC_HAVE_NVSHMEM)
425:   {
426:     PetscBool use_nvshmem;
427:     PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem));
428:     if (use_nvshmem) {
429:       PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
430:       PetscFunctionReturn(PETSC_SUCCESS);
431:     }
432:   }
433: #endif
434:   PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
435:   PetscFunctionReturn(PETSC_SUCCESS);
436: }

438: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
439:    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
440: */
441: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs)
442: {
443:   PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
444:   PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
445:   const PetscInt    *rootoffset, *leafoffset;
446:   MPI_Aint           disp;
447:   MPI_Comm           comm          = PetscObjectComm((PetscObject)sf);
448:   MPI_Datatype       unit          = link->unit;
449:   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
450:   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

452:   PetscFunctionBegin;
 453:   /* Init persistent MPI requests if not done already. Currently only SFBasic uses persistent MPI requests */
454:   if (sf->persistent) {
455:     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
456:       PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
 457:       if (direction == PETSCSF_LEAF2ROOT) {
458:         for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
459:           disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
460:           cnt  = rootoffset[i + 1] - rootoffset[i];
461:           PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
462:         }
 463:       } else { /* PETSCSF_ROOT2LEAF */
464:         for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
465:           disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
466:           cnt  = rootoffset[i + 1] - rootoffset[i];
467:           PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
468:         }
469:       }
470:       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
471:     }

473:     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
474:       PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
 475:       if (direction == PETSCSF_LEAF2ROOT) {
476:         for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
477:           disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
478:           cnt  = leafoffset[i + 1] - leafoffset[i];
479:           PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
480:         }
 481:       } else { /* PETSCSF_ROOT2LEAF */
482:         for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
483:           disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
484:           cnt  = leafoffset[i + 1] - leafoffset[i];
485:           PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
486:         }
487:       }
488:       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
489:     }
490:   }
491:   if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
492:   if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
493:   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
494:   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
495:   PetscFunctionReturn(PETSC_SUCCESS);
496: }
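/* A minimal usage sketch (the variable names and exact call sequence here are placeholders,
   not the actual sfbasic.c code) of how a Begin routine could drive the persistent requests
   returned above for a root-to-leaf broadcast:

     void        *rootbuf, *leafbuf;
     MPI_Request *rootreqs, *leafreqs;
     PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, PETSCSF_ROOT2LEAF, &rootbuf, &leafbuf, &rootreqs, &leafreqs));
     PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_REMOTE, rootdata)); // fill rootbuf
     PetscCallMPI(MPI_Startall(nRemoteLeafRanks, leafreqs));                 // start persistent recvs into leafbuf
     PetscCallMPI(MPI_Startall(nRemoteRootRanks, rootreqs));                 // start persistent sends of rootbuf

   The matching End routine would MPI_Waitall() on these requests and then unpack leafbuf into
   leafdata with one of the UnpackAndOp routines defined in this file. */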

498: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink)
499: {
500:   PetscSFLink    link, *p;
501:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

503:   PetscFunctionBegin;
504:   /* Look for types in cache */
505:   for (p = &bas->inuse; (link = *p); p = &link->next) {
506:     PetscBool match;
507:     PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
508:     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
509:       switch (cmode) {
510:       case PETSC_OWN_POINTER:
511:         *p = link->next;
512:         break; /* Remove from inuse list */
513:       case PETSC_USE_POINTER:
514:         break;
515:       default:
516:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode");
517:       }
518:       *mylink = link;
519:       PetscFunctionReturn(PETSC_SUCCESS);
520:     }
521:   }
522:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack");
523: }

525: PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink)
526: {
527:   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
528:   PetscSFLink    link = *mylink;

530:   PetscFunctionBegin;
531:   link->rootdata = NULL;
532:   link->leafdata = NULL;
533:   link->next     = bas->avail;
534:   bas->avail     = link;
535:   *mylink        = NULL;
536:   PetscFunctionReturn(PETSC_SUCCESS);
537: }

539: /* Error out on unsupported overlapped communications */
540: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
541: {
542:   PetscSFLink    link, *p;
543:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
544:   PetscBool      match;

546:   PetscFunctionBegin;
547:   if (PetscDefined(USE_DEBUG)) {
 548:     /* Look up the links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
 549:        the potential overlap, since this process does not participate in the communication and the overlap is harmless.
550:     */
551:     if (rootdata || leafdata) {
552:       for (p = &bas->inuse; (link = *p); p = &link->next) {
553:         PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
554:         PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata);
555:       }
556:     }
557:   }
558:   PetscFunctionReturn(PETSC_SUCCESS);
559: }

561: static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
562: {
563:   PetscFunctionBegin;
564:   if (n) PetscCall(PetscMemcpy(dst, src, n));
565:   PetscFunctionReturn(PETSC_SUCCESS);
566: }

568: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
569: {
570:   PetscInt    nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
571:   PetscBool   is2Int, is2PetscInt;
572:   PetscMPIInt ni, na, nd, combiner;
573: #if defined(PETSC_HAVE_COMPLEX)
574:   PetscInt nPetscComplex = 0;
575: #endif

577:   PetscFunctionBegin;
578:   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
579:   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
 580:   /* MPI_CHAR is treated below as a dumb type that, per the MPI standard, does not support reductions */
581:   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
582:   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
583:   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
584: #if defined(PETSC_HAVE_COMPLEX)
585:   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
586: #endif
587:   PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
588:   PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
 589:   /* TODO: shall we also handle Fortran MPI_2REAL? */
590:   PetscCallMPI(MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner));
591:   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
592:   link->bs        = 1;                                                           /* default */

594:   if (is2Int) {
595:     PackInit_PairType_int_int_1_1(link);
596:     link->bs        = 1;
597:     link->unitbytes = 2 * sizeof(int);
 598:     link->isbuiltin = PETSC_TRUE; /* MPI_2INT is an MPI builtin */
599:     link->basicunit = MPI_2INT;
600:     link->unit      = MPI_2INT;
601:   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
602:     PackInit_PairType_PetscInt_PetscInt_1_1(link);
603:     link->bs        = 1;
604:     link->unitbytes = 2 * sizeof(PetscInt);
605:     link->basicunit = MPIU_2INT;
606:     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
607:     link->unit      = MPIU_2INT;
608:   } else if (nPetscReal) {
609:     if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link);
610:     else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link);
611:     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link);
612:     else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link);
613:     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link);
614:     else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link);
615:     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link);
616:     else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link);
617:     link->bs        = nPetscReal;
618:     link->unitbytes = nPetscReal * sizeof(PetscReal);
619:     link->basicunit = MPIU_REAL;
620:     if (link->bs == 1) {
621:       link->isbuiltin = PETSC_TRUE;
622:       link->unit      = MPIU_REAL;
623:     }
624:   } else if (nPetscInt) {
625:     if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link);
626:     else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
627:     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link);
628:     else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
629:     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link);
630:     else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
631:     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link);
632:     else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
633:     link->bs        = nPetscInt;
634:     link->unitbytes = nPetscInt * sizeof(PetscInt);
635:     link->basicunit = MPIU_INT;
636:     if (link->bs == 1) {
637:       link->isbuiltin = PETSC_TRUE;
638:       link->unit      = MPIU_INT;
639:     }
640: #if defined(PETSC_USE_64BIT_INDICES)
641:   } else if (nInt) {
642:     if (nInt == 8) PackInit_IntegerType_int_8_1(link);
643:     else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link);
644:     else if (nInt == 4) PackInit_IntegerType_int_4_1(link);
645:     else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link);
646:     else if (nInt == 2) PackInit_IntegerType_int_2_1(link);
647:     else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link);
648:     else if (nInt == 1) PackInit_IntegerType_int_1_1(link);
649:     else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link);
650:     link->bs        = nInt;
651:     link->unitbytes = nInt * sizeof(int);
652:     link->basicunit = MPI_INT;
653:     if (link->bs == 1) {
654:       link->isbuiltin = PETSC_TRUE;
655:       link->unit      = MPI_INT;
656:     }
657: #endif
658:   } else if (nSignedChar) {
659:     if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link);
660:     else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
661:     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link);
662:     else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
663:     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link);
664:     else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
665:     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link);
666:     else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
667:     link->bs        = nSignedChar;
668:     link->unitbytes = nSignedChar * sizeof(SignedChar);
669:     link->basicunit = MPI_SIGNED_CHAR;
670:     if (link->bs == 1) {
671:       link->isbuiltin = PETSC_TRUE;
672:       link->unit      = MPI_SIGNED_CHAR;
673:     }
674:   } else if (nUnsignedChar) {
675:     if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link);
676:     else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
677:     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link);
678:     else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
679:     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link);
680:     else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
681:     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link);
682:     else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
683:     link->bs        = nUnsignedChar;
684:     link->unitbytes = nUnsignedChar * sizeof(UnsignedChar);
685:     link->basicunit = MPI_UNSIGNED_CHAR;
686:     if (link->bs == 1) {
687:       link->isbuiltin = PETSC_TRUE;
688:       link->unit      = MPI_UNSIGNED_CHAR;
689:     }
690: #if defined(PETSC_HAVE_COMPLEX)
691:   } else if (nPetscComplex) {
692:     if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link);
693:     else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
694:     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link);
695:     else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
696:     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link);
697:     else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
698:     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link);
699:     else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
700:     link->bs        = nPetscComplex;
701:     link->unitbytes = nPetscComplex * sizeof(PetscComplex);
702:     link->basicunit = MPIU_COMPLEX;
703:     if (link->bs == 1) {
704:       link->isbuiltin = PETSC_TRUE;
705:       link->unit      = MPIU_COMPLEX;
706:     }
707: #endif
708:   } else {
709:     MPI_Aint lb, nbyte;
710:     PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
711:     PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
 712:     if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
713:       if (nbyte == 4) PackInit_DumbType_char_4_1(link);
714:       else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
715:       else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
716:       else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
717:       else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
718:       else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link);
719:       link->bs        = nbyte;
720:       link->unitbytes = nbyte;
721:       link->basicunit = MPI_BYTE;
722:     } else {
723:       nInt = nbyte / sizeof(int);
724:       if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
725:       else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
726:       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
727:       else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
728:       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
729:       else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
730:       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
731:       else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link);
732:       link->bs        = nInt;
733:       link->unitbytes = nbyte;
734:       link->basicunit = MPI_INT;
735:     }
736:     if (link->isbuiltin) link->unit = unit;
737:   }

739:   if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit));

741:   link->Memcpy = PetscSFLinkMemcpy_Host;
742:   PetscFunctionReturn(PETSC_SUCCESS);
743: }
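/* Example of the fallback path above: for a user-defined contiguous datatype of 12 bytes
   (assuming sizeof(int) == 4), nbyte % sizeof(int) == 0, so nInt = 3 and
   PackInit_DumbType_DumbInt_1_0 is selected, with link->bs = 3, link->unitbytes = 12 and
   link->basicunit = MPI_INT. Dumb types only set up Pack, UnpackAndInsert and
   ScatterAndInsert; reductions on such a unit fall back to the MPI_Reduce_local path
   further below. */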

745: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *))
746: {
747:   PetscFunctionBegin;
748:   *UnpackAndOp = NULL;
749:   if (PetscMemTypeHost(mtype)) {
750:     if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
751:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
752:     else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
753:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
754:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
755:     else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
756:     else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
757:     else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
758:     else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
759:     else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
760:     else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
761:     else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
762:     else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
763:   }
764: #if defined(PETSC_HAVE_DEVICE)
765:   else if (PetscMemTypeDevice(mtype) && !atomic) {
766:     if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
767:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
768:     else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
769:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
770:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
771:     else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
772:     else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
773:     else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
774:     else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
775:     else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
776:     else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
777:     else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
778:     else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
779:   } else if (PetscMemTypeDevice(mtype) && atomic) {
780:     if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
781:     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
782:     else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
783:     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
784:     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
785:     else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
786:     else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
787:     else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
788:     else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
789:     else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
790:     else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
791:     else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
792:     else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
793:   }
794: #endif
795:   PetscFunctionReturn(PETSC_SUCCESS);
796: }

798: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *))
799: {
800:   PetscFunctionBegin;
801:   *ScatterAndOp = NULL;
802:   if (PetscMemTypeHost(mtype)) {
803:     if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
804:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
805:     else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
806:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
807:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
808:     else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
809:     else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
810:     else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
811:     else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
812:     else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
813:     else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
814:     else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
815:     else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
816:   }
817: #if defined(PETSC_HAVE_DEVICE)
818:   else if (PetscMemTypeDevice(mtype) && !atomic) {
819:     if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
820:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
821:     else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
822:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
823:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
824:     else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
825:     else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
826:     else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
827:     else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
828:     else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
829:     else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
830:     else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
831:     else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
832:   } else if (PetscMemTypeDevice(mtype) && atomic) {
833:     if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
834:     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
835:     else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
836:     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
837:     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
838:     else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
839:     else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
840:     else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
841:     else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
842:     else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
843:     else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
844:     else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
845:     else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
846:   }
847: #endif
848:   PetscFunctionReturn(PETSC_SUCCESS);
849: }

851: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *))
852: {
853:   PetscFunctionBegin;
854:   *FetchAndOp = NULL;
855:   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
856:   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
857: #if defined(PETSC_HAVE_DEVICE)
858:   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
859:   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
860: #endif
861:   PetscFunctionReturn(PETSC_SUCCESS);
862: }

864: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
865: {
866:   PetscFunctionBegin;
867:   *FetchAndOpLocal = NULL;
868:   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
869:   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
870: #if defined(PETSC_HAVE_DEVICE)
871:   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
872:   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
873: #endif
874:   PetscFunctionReturn(PETSC_SUCCESS);
875: }

877: static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
878: {
879:   PetscLogDouble flops;
880:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

882:   PetscFunctionBegin;
883:   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
884:     flops = bas->rootbuflen[scope] * link->bs;               /* # of roots in buffer x # of scalars in unit */
885: #if defined(PETSC_HAVE_DEVICE)
886:     if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops));
887:     else
888: #endif
889:       PetscCall(PetscLogFlops(flops));
890:   }
891:   PetscFunctionReturn(PETSC_SUCCESS);
892: }

894: static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
895: {
896:   PetscLogDouble flops;

898:   PetscFunctionBegin;
899:   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
 900:     flops = sf->leafbuflen[scope] * link->bs;                /* # of leaves in buffer x # of scalars in unit */
901: #if defined(PETSC_HAVE_DEVICE)
902:     if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops));
903:     else
904: #endif
905:       PetscCall(PetscLogFlops(flops));
906:   }
907:   PetscFunctionReturn(PETSC_SUCCESS);
908: }

 910: /* When SF cannot find a proper UnpackAndOp() in the link, it falls back to MPI_Reduce_local.
911:   Input Parameters:
912:   +sf      - The StarForest
913:   .link    - The link
914:   .count   - Number of entries to unpack
 915:   .start   - The first index, significant when indices=NULL
916:   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
917:   .buf     - A contiguous buffer to unpack from
918:   -op      - Operation after unpack

920:   Output Parameters:
921:   .data    - The data to unpack to
922: */
923: static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op)
924: {
925:   PetscFunctionBegin;
926: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
927:   {
928:     PetscInt i;
929:     if (indices) {
930:       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
 931:          the basic units of a root/leaf element-wise. Otherwise, it is meant to operate on a whole root/leaf.
932:       */
933:       for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op));
934:     } else {
935:       PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op));
936:     }
937:   }
938:   PetscFunctionReturn(PETSC_SUCCESS);
939: #else
940:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
941: #endif
942: }

944: static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op)
945: {
946:   PetscFunctionBegin;
947: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
948:   {
949:     PetscInt i, disp;
950:     if (!srcIdx) {
951:       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op));
952:     } else {
953:       for (i = 0; i < count; i++) {
954:         disp = dstIdx ? dstIdx[i] : dstStart + i;
955:         PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op));
956:       }
957:     }
958:   }
959:   PetscFunctionReturn(PETSC_SUCCESS);
960: #else
961:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
962: #endif
963: }

965: /*=============================================================================
966:               Pack/Unpack/Fetch/Scatter routines
967:  ============================================================================*/

969: /* Pack rootdata to rootbuf
970:   Input Parameters:
971:   + sf       - The SF this packing works on.
972:   . link     - It gives the memtype of the roots and also provides root buffer.
973:   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
974:   - rootdata - Where to read the roots.

976:   Notes:
 977:   When rootdata can be used directly as the root buffer, the routine is almost a no-op. After the call, the root data is
 978:   in a place that the underlying MPI can access (whether use_gpu_aware_mpi is set or not).
979:  */
980: PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
981: {
982:   const PetscInt *rootindices = NULL;
983:   PetscInt        count, start;
984:   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
985:   PetscMemType   rootmtype                                                                                        = link->rootmtype;
986:   PetscSFPackOpt opt                                                                                              = NULL;

988:   PetscFunctionBegin;
989:   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
990:   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
991:     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
992:     PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack));
993:     PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
994:   }
995:   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
996:   PetscFunctionReturn(PETSC_SUCCESS);
997: }

999: /* Pack leafdata to leafbuf */
1000: PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
1001: {
1002:   const PetscInt *leafindices = NULL;
1003:   PetscInt        count, start;
1004:   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1005:   PetscMemType   leafmtype                                                                                        = link->leafmtype;
1006:   PetscSFPackOpt opt                                                                                              = NULL;

1008:   PetscFunctionBegin;
1009:   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
1010:   if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip packing */
1011:     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
1012:     PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack));
1013:     PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
1014:   }
1015:   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
1016:   PetscFunctionReturn(PETSC_SUCCESS);
1017: }

1019: /* Pack rootdata to rootbuf, which are in the same memory space */
1020: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
1021: {
1022:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

1024:   PetscFunctionBegin;
1025:   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
1026:     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1027:     if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
1028:   }
1029:   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
1030:   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata));
1031:   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
1032:   PetscFunctionReturn(PETSC_SUCCESS);
1033: }
1034: /* Pack leafdata to leafbuf, which are in the same memory space */
1035: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
1036: {
1037:   PetscFunctionBegin;
1038:   if (scope == PETSCSF_REMOTE) {
1039:     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1040:     if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
1041:   }
1042:   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
1043:   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata));
1044:   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
1045:   PetscFunctionReturn(PETSC_SUCCESS);
1046: }

1048: PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
1049: {
1050:   const PetscInt *rootindices = NULL;
1051:   PetscInt        count, start;
1052:   PetscSF_Basic  *bas                                                                                                    = (PetscSF_Basic *)sf->data;
1053:   PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1054:   PetscMemType   rootmtype                                                                                               = link->rootmtype;
1055:   PetscSFPackOpt opt                                                                                                     = NULL;

1057:   PetscFunctionBegin;
1058:   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1059:     PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp));
1060:     if (UnpackAndOp) {
1061:       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
1062:       PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
1063:     } else {
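      /* No built-in UnpackAndOp kernel for this op (e.g., a user-defined MPI_Op): fall back to
         MPIU_Reduce_local, which runs on the host, hence the host-memtype indices below */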
1064:       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices));
1065:       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op));
1066:     }
1067:   }
1068:   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op));
1069:   PetscFunctionReturn(PETSC_SUCCESS);
1070: }

1072: PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1073: {
1074:   const PetscInt *leafindices = NULL;
1075:   PetscInt        count, start;
1076:   PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1077:   PetscMemType   leafmtype                                                                                               = link->leafmtype;
1078:   PetscSFPackOpt opt                                                                                                     = NULL;

1080:   PetscFunctionBegin;
1081:   if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1082:     PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp));
1083:     if (UnpackAndOp) {
1084:       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
1085:       PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
1086:     } else {
1087:       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices));
1088:       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op));
1089:     }
1090:   }
1091:   PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op));
1092:   PetscFunctionReturn(PETSC_SUCCESS);
1093: }
1094: /* Unpack rootbuf to rootdata, which are in the same memory space */
1095: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
1096: {
1097:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;

1099:   PetscFunctionBegin;
1100:   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
1101:   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op));
1102:   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
1103:   if (scope == PETSCSF_REMOTE) {
1104:     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
1105:     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1106:   }
1107:   PetscFunctionReturn(PETSC_SUCCESS);
1108: }

1110: /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1111: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1112: {
1113:   PetscFunctionBegin;
1114:   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
1115:   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op));
1116:   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
1117:   if (scope == PETSCSF_REMOTE) {
1118:     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
1119:     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1120:   }
1121:   PetscFunctionReturn(PETSC_SUCCESS);
1122: }

1124: /* FetchAndOp rootdata with rootbuf: a kind of Unpack on rootdata, except that it also updates rootbuf */
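/* Schematically, with op = MPI_SUM the remote fetch-and-op below amounts to

     for (i = 0; i < count; i++) {
       old                      = rootdata[rootindices[i]];
       rootdata[rootindices[i]] = old + rootbuf[i];
       rootbuf[i]               = old;
     }

   so the pre-update root values end up in rootbuf, ready to be returned to the owning leaves;
   the names follow the variables used in the function body below. */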
1125: PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op)
1126: {
1127:   const PetscInt *rootindices = NULL;
1128:   PetscInt        count, start;
1129:   PetscSF_Basic  *bas                                                                                             = (PetscSF_Basic *)sf->data;
1130:   PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL;
1131:   PetscMemType   rootmtype                                                                                        = link->rootmtype;
1132:   PetscSFPackOpt opt                                                                                              = NULL;

1134:   PetscFunctionBegin;
1135:   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
1136:   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1137:     /* Do FetchAndOp on rootdata with rootbuf */
1138:     PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp));
1139:     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices));
1140:     PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype]));
1141:   }
1142:   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op));
1143:   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
1144:   PetscFunctionReturn(PETSC_SUCCESS);
1145: }

1147: PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op)
1148: {
1149:   const PetscInt *rootindices = NULL, *leafindices = NULL;
1150:   PetscInt        count, rootstart, leafstart;
1151:   PetscSF_Basic  *bas                                                                                                                                                 = (PetscSF_Basic *)sf->data;
1152:   PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL;
1153:   PetscMemType   rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype;
1154:   PetscSFPackOpt leafopt = NULL, rootopt = NULL;
1155:   PetscInt       buflen = sf->leafbuflen[PETSCSF_LOCAL];
1156:   char          *srcbuf = NULL, *dstbuf = NULL;
1157:   PetscBool      dstdups;

1159:   PetscFunctionBegin;
1160:   if (!buflen) PetscFunctionReturn(PETSC_SUCCESS);
1161:   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
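    /* e.g., roots on the device and leaves on the host: pack the source data into its buffer on
       the source side, Memcpy across memory spaces, then unpack into the destination with op */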
1162:     if (direction == PETSCSF_ROOT2LEAF) {
1163:       PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata));
1164:       srcmtype = rootmtype;
1165:       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1166:       dstmtype = leafmtype;
1167:       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1168:     } else {
1169:       PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata));
1170:       srcmtype = leafmtype;
1171:       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1172:       dstmtype = rootmtype;
1173:       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1174:     }
1175:     PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes));
1176:     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
1177:     if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link));
1178:     if (direction == PETSCSF_ROOT2LEAF) {
1179:       PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op));
1180:     } else {
1181:       PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op));
1182:     }
1183:   } else {
1184:     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1185:     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1186:     PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp));
1187:     if (ScatterAndOp) {
1188:       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
1189:       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
1190:       if (direction == PETSCSF_ROOT2LEAF) {
1191:         PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata));
1192:       } else {
1193:         PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata));
1194:       }
1195:     } else {
1196:       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
1197:       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
1198:       if (direction == PETSCSF_ROOT2LEAF) {
1199:         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op));
1200:       } else {
1201:         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op));
1202:       }
1203:     }
1204:   }
1205:   PetscFunctionReturn(PETSC_SUCCESS);
1206: }

1208: /* Fetch rootdata into leafupdate and reduce leafdata into rootdata with op, for the local part of the graph */
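/* Schematically, with op = MPI_SUM the local fetch-and-op does, for each local edge e,

     leafupdate[leafidx[e]]  = rootdata[rootidx[e]];
     rootdata[rootidx[e]]   += leafdata[leafidx[e]];

   where leafidx/rootidx stand for the local leaf/root indices obtained in the function below. */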
1209: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1210: {
1211:   const PetscInt *rootindices = NULL, *leafindices = NULL;
1212:   PetscInt        count, rootstart, leafstart;
1213:   PetscSF_Basic  *bas                                                                                                                                                            = (PetscSF_Basic *)sf->data;
1214:   PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1215:   const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
1216:   PetscSFPackOpt     leafopt = NULL, rootopt = NULL;

1218:   PetscFunctionBegin;
1219:   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(PETSC_SUCCESS);
1220:   if (rootmtype != leafmtype) {
1221:     /* The local communication has to go through pack and unpack */
1222:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1223:   } else {
1224:     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
1225:     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
1226:     PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal));
1227:     PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate));
1228:   }
1229:   PetscFunctionReturn(PETSC_SUCCESS);
1230: }

1232: /*
1233:   Create per-rank pack/unpack optimizations based on index patterns

1235:    Input Parameters:
1236:   +  n       - Number of destination ranks
1237:   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1238:   -  idx     - [*]   Array storing indices

1240:    Output Parameters:
1241:   .  out     - Pack optimizations; NULL if the indices do not admit an optimization.
1242: */
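/* For example, with n = 1, offset = {0, 12} and

     idx = {100,101,102, 110,111,112, 130,131,132, 140,141,142}

   the analysis below finds start = 100, dx = 3, dy = 2, dz = 2, X = 10, Y = 3: the indices form a
   3x2x2 sub-block of an array with row length X = 10 and plane size X*Y = 30, starting at entry 100,
   and opt->offset becomes {0, 12}. */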
1243: PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out)
1244: {
1245:   PetscInt       r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
1246:   PetscBool      optimizable = PETSC_TRUE;
1247:   PetscSFPackOpt opt;

1249:   PetscFunctionBegin;
1250:   PetscCall(PetscMalloc1(1, &opt));
1251:   PetscCall(PetscMalloc1(7 * n + 2, &opt->array));
1252:   opt->n = opt->array[0] = n;
1253:   opt->offset            = opt->array + 1;
1254:   opt->start             = opt->array + n + 2;
1255:   opt->dx                = opt->array + 2 * n + 2;
1256:   opt->dy                = opt->array + 3 * n + 2;
1257:   opt->dz                = opt->array + 4 * n + 2;
1258:   opt->X                 = opt->array + 5 * n + 2;
1259:   opt->Y                 = opt->array + 6 * n + 2;

1261:   for (r = 0; r < n; r++) {            /* For each destination rank */
1262:     m     = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1263:     p     = offset[r];
1264:     start = idx[p]; /* First index for this rank */
1265:     p++;

1267:     /* Search in X dimension */
1268:     for (dx = 1; dx < m; dx++, p++) {
1269:       if (start + dx != idx[p]) break;
1270:     }

1272:     dydz = m / dx;
1273:     X    = dydz > 1 ? (idx[p] - start) : dx;
1274:     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1275:     if (m % dx || X <= 0) {
1276:       optimizable = PETSC_FALSE;
1277:       goto finish;
1278:     }
1279:     for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */
1280:       for (i = 0; i < dx; i++, p++) {
1281:         if (start + X * dy + i != idx[p]) {
1282:           if (i) {
1283:             optimizable = PETSC_FALSE;
1284:             goto finish;
1285:           } /* The pattern is violated in the middle of an x-walk */
1286:           else
1287:             goto Z_dimension;
1288:         }
1289:       }
1290:     }

1292:   Z_dimension:
1293:     dz = m / (dx * dy);
1294:     Y  = dz > 1 ? (idx[p] - start) / X : dy;
1295:     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1296:     if (m % (dx * dy) || Y <= 0) {
1297:       optimizable = PETSC_FALSE;
1298:       goto finish;
1299:     }
1300:     for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1301:       for (j = 0; j < dy; j++) {
1302:         for (i = 0; i < dx; i++, p++) {
1303:           if (start + X * Y * k + X * j + i != idx[p]) {
1304:             optimizable = PETSC_FALSE;
1305:             goto finish;
1306:           }
1307:         }
1308:       }
1309:     }
1310:     opt->start[r] = start;
1311:     opt->dx[r]    = dx;
1312:     opt->dy[r]    = dy;
1313:     opt->dz[r]    = dz;
1314:     opt->X[r]     = X;
1315:     opt->Y[r]     = Y;
1316:   }

1318: finish:
1319:   /* If not optimizable, free arrays to save memory */
1320:   if (!n || !optimizable) {
1321:     PetscCall(PetscFree(opt->array));
1322:     PetscCall(PetscFree(opt));
1323:     *out = NULL;
1324:   } else {
1325:     opt->offset[0] = 0;
1326:     for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
1327:     *out = opt;
1328:   }
1329:   PetscFunctionReturn(PETSC_SUCCESS);
1330: }

1332: static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out)
1333: {
1334:   PetscSFPackOpt opt = *out;

1336:   PetscFunctionBegin;
1337:   if (opt) {
1338:     PetscCall(PetscSFFree(sf, mtype, opt->array));
1339:     PetscCall(PetscFree(opt));
1340:     *out = NULL;
1341:   }
1342:   PetscFunctionReturn(PETSC_SUCCESS);
1343: }

1345: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1346: {
1347:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1348:   PetscInt       i, j;

1350:   PetscFunctionBegin;
1351:   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1352:   for (i = 0; i < 2; i++) { /* Set defaults */
1353:     sf->leafstart[i]   = 0;
1354:     sf->leafcontig[i]  = PETSC_TRUE;
1355:     sf->leafdups[i]    = PETSC_FALSE;
1356:     bas->rootstart[i]  = 0;
1357:     bas->rootcontig[i] = PETSC_TRUE;
1358:     bas->rootdups[i]   = PETSC_FALSE;
1359:   }

1361:   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1362:   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];

1364:   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1365:   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

1367:   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
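  /* e.g., rmine = {5,6,7,8} for the self part is contiguous (leafstart[0] = 5 and the whole range
     can be copied in one shot), while rmine = {5,7,6,8} is not */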
1368:   for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
1369:     if (sf->rmine[i] != sf->leafstart[0] + i) {
1370:       sf->leafcontig[0] = PETSC_FALSE;
1371:       break;
1372:     }
1373:   }
1374:   for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
1375:     if (sf->rmine[i] != sf->leafstart[1] + j) {
1376:       sf->leafcontig[1] = PETSC_FALSE;
1377:       break;
1378:     }
1379:   }

1381:   /* If not, see if we can have per-rank optimizations by doing index analysis */
1382:   if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]));
1383:   if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]));

1385:   /* Are root indices for self and remote contiguous? */
1386:   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1387:   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];

1389:   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1390:   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];

1392:   for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) {
1393:     if (bas->irootloc[i] != bas->rootstart[0] + i) {
1394:       bas->rootcontig[0] = PETSC_FALSE;
1395:       break;
1396:     }
1397:   }
1398:   for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) {
1399:     if (bas->irootloc[i] != bas->rootstart[1] + j) {
1400:       bas->rootcontig[1] = PETSC_FALSE;
1401:       break;
1402:     }
1403:   }

1405:   if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]));
1406:   if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]));

1408:   /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
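  /* e.g., if two leaves reduce into the same root entry, an MPI_SUM unpack on the GPU must use
     atomic adds; when the indices are known to be duplicate-free, plain loads and stores suffice */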
1409:   if (PetscDefined(HAVE_DEVICE)) {
1410:     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1411:     if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]));
1412:     if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1]));
1413:     if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]));
1414:     if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1]));
1415:   }
1416:   PetscFunctionReturn(PETSC_SUCCESS);
1417: }

1419: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1420: {
1421:   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1422:   PetscInt       i;

1424:   PetscFunctionBegin;
1425:   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
1426:     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i]));
1427:     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i]));
1428: #if defined(PETSC_HAVE_DEVICE)
1429:     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i]));
1430:     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i]));
1431: #endif
1432:   }
1433:   PetscFunctionReturn(PETSC_SUCCESS);
1434: }