Actual source code: mpiptap.c


  2: /*
  3:   Defines projective product routines where A is an MPIAIJ matrix
  4:           C = P^T * A * P
  5: */

  7: #include <../src/mat/impls/aij/seq/aij.h>
  8: #include <../src/mat/utils/freespace.h>
  9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
 10: #include <petscbt.h>
 11: #include <petsctime.h>
 12: #include <petsc/private/hashmapiv.h>
 13: #include <petsc/private/hashseti.h>
 14: #include <petscsf.h>

 16: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
 17: {
 18:   PetscBool         iascii;
 19:   PetscViewerFormat format;
 20:   Mat_APMPI        *ptap;

 22:   PetscFunctionBegin;
 23:   MatCheckProduct(A, 1);
 24:   ptap = (Mat_APMPI *)A->product->data;
 25:   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
 26:   if (iascii) {
 27:     PetscCall(PetscViewerGetFormat(viewer, &format));
 28:     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
 29:       if (ptap->algType == 0) {
 30:         PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
 31:       } else if (ptap->algType == 1) {
 32:         PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
 33:       } else if (ptap->algType == 2) {
 34:         PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
 35:       } else if (ptap->algType == 3) {
 36:         PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
 37:       }
 38:     }
 39:   }
 40:   PetscFunctionReturn(PETSC_SUCCESS);
 41: }
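
/* Illustrative usage sketch, not part of the original mpiptap.c: the messages printed by
   MatView_MPIAIJ_PtAP() above surface when the product matrix is viewed with an info
   format.  The helper name below is hypothetical; A and P are assumed to be assembled
   MATMPIAIJ matrices. */
PETSC_UNUSED static PetscErrorCode SketchViewPtAPAlgorithm(Mat A, Mat P)
{
  Mat C;

  PetscFunctionBegin;
  PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); /* symbolic + numeric C = P^T*A*P */
  PetscCall(PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD, PETSC_VIEWER_ASCII_INFO));
  PetscCall(MatView(C, PETSC_VIEWER_STDOUT_WORLD)); /* an info-format view lets the product report which PtAP algorithm was used */
  PetscCall(PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD));
  PetscCall(MatDestroy(&C));
  PetscFunctionReturn(PETSC_SUCCESS);
}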

 43: PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
 44: {
 45:   Mat_APMPI           *ptap = (Mat_APMPI *)data;
 46:   Mat_Merge_SeqsToMPI *merge;

 48:   PetscFunctionBegin;
 49:   PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
 50:   PetscCall(PetscFree(ptap->bufa));
 51:   PetscCall(MatDestroy(&ptap->P_loc));
 52:   PetscCall(MatDestroy(&ptap->P_oth));
 53:   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
 54:   PetscCall(MatDestroy(&ptap->Rd));
 55:   PetscCall(MatDestroy(&ptap->Ro));
 56:   if (ptap->AP_loc) { /* used by alg_rap */
 57:     Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
 58:     PetscCall(PetscFree(ap->i));
 59:     PetscCall(PetscFree2(ap->j, ap->a));
 60:     PetscCall(MatDestroy(&ptap->AP_loc));
 61:   } else { /* used by alg_ptap */
 62:     PetscCall(PetscFree(ptap->api));
 63:     PetscCall(PetscFree(ptap->apj));
 64:   }
 65:   PetscCall(MatDestroy(&ptap->C_loc));
 66:   PetscCall(MatDestroy(&ptap->C_oth));
 67:   if (ptap->apa) PetscCall(PetscFree(ptap->apa));

 69:   PetscCall(MatDestroy(&ptap->Pt));

 71:   merge = ptap->merge;
 72:   if (merge) { /* used by alg_ptap */
 73:     PetscCall(PetscFree(merge->id_r));
 74:     PetscCall(PetscFree(merge->len_s));
 75:     PetscCall(PetscFree(merge->len_r));
 76:     PetscCall(PetscFree(merge->bi));
 77:     PetscCall(PetscFree(merge->bj));
 78:     PetscCall(PetscFree(merge->buf_ri[0]));
 79:     PetscCall(PetscFree(merge->buf_ri));
 80:     PetscCall(PetscFree(merge->buf_rj[0]));
 81:     PetscCall(PetscFree(merge->buf_rj));
 82:     PetscCall(PetscFree(merge->coi));
 83:     PetscCall(PetscFree(merge->coj));
 84:     PetscCall(PetscFree(merge->owners_co));
 85:     PetscCall(PetscLayoutDestroy(&merge->rowmap));
 86:     PetscCall(PetscFree(ptap->merge));
 87:   }
 88:   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));

 90:   PetscCall(PetscSFDestroy(&ptap->sf));
 91:   PetscCall(PetscFree(ptap->c_othi));
 92:   PetscCall(PetscFree(ptap->c_rmti));
 93:   PetscCall(PetscFree(ptap));
 94:   PetscFunctionReturn(PETSC_SUCCESS);
 95: }

 97: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
 98: {
 99:   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
100:   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
101:   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
102:   Mat_APMPI         *ptap;
103:   Mat                AP_loc, C_loc, C_oth;
104:   PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
105:   PetscScalar       *apa;
106:   const PetscInt    *cols;
107:   const PetscScalar *vals;

109:   PetscFunctionBegin;
110:   MatCheckProduct(C, 3);
111:   ptap = (Mat_APMPI *)C->product->data;
112:   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
113:   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

115:   PetscCall(MatZeroEntries(C));

117:   /* 1) get Rd = Pd^T, Ro = Po^T */
118:   if (ptap->reuse == MAT_REUSE_MATRIX) {
119:     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
120:     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
121:   }

123:   /* 2) get AP_loc */
124:   AP_loc = ptap->AP_loc;
125:   ap     = (Mat_SeqAIJ *)AP_loc->data;

127:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
128:   if (ptap->reuse == MAT_REUSE_MATRIX) {
129:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
130:     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
131:     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
132:   }

134:   /* 2-2) compute numeric A_loc*P - dominating part */
135:   /* get data from symbolic products */
136:   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
137:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;

139:   api = ap->i;
140:   apj = ap->j;
141:   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
142:   for (i = 0; i < am; i++) {
143:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
144:     apnz = api[i + 1] - api[i];
145:     apa  = ap->a + api[i];
146:     PetscCall(PetscArrayzero(apa, apnz));
147:     AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
148:   }
149:   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
150:   PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);

152:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
153:   /* Always use scalable version since we are in the MPI scalable version */
154:   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
155:   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));

157:   C_loc = ptap->C_loc;
158:   C_oth = ptap->C_oth;

160:   /* add C_loc and C_oth to C */
161:   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

163:   /* C_loc -> C */
164:   cm    = C_loc->rmap->N;
165:   c_seq = (Mat_SeqAIJ *)C_loc->data;
166:   cols  = c_seq->j;
167:   vals  = c_seq->a;
168:   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));

170:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
171:   /* when there are no off-processor parts. */
172:   /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
173:   /* is no longer valid. The more general MatSetValues_MPIAIJ() must then be used; it looks the column index up in a table and */
174:   /* performs additional bookkeeping. */
175:   if (C->assembled) {
176:     C->was_assembled = PETSC_TRUE;
177:     C->assembled     = PETSC_FALSE;
178:   }
179:   if (C->was_assembled) {
180:     for (i = 0; i < cm; i++) {
181:       ncols = c_seq->i[i + 1] - c_seq->i[i];
182:       row   = rstart + i;
183:       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
184:       cols += ncols;
185:       vals += ncols;
186:     }
187:   } else {
188:     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
189:   }
190:   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
191:   PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);

193:   /* Co -> C, off-processor part */
194:   cm    = C_oth->rmap->N;
195:   c_seq = (Mat_SeqAIJ *)C_oth->data;
196:   cols  = c_seq->j;
197:   vals  = c_seq->a;
198:   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
199:   for (i = 0; i < cm; i++) {
200:     ncols = c_seq->i[i + 1] - c_seq->i[i];
201:     row   = p->garray[i];
202:     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
203:     cols += ncols;
204:     vals += ncols;
205:   }
206:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
207:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

209:   ptap->reuse = MAT_REUSE_MATRIX;

211:   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
212:   PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_oth->rmap->n], nout);
213:   PetscFunctionReturn(PETSC_SUCCESS);
214: }
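
/* Illustrative sketch, not part of the original mpiptap.c: the numeric routine above is
   what runs when MatPtAP() is called with MAT_REUSE_MATRIX after an initial call has
   fixed the nonzero pattern and attached the supporting Mat_APMPI data to C->product.
   The helper name is hypothetical; A and P are assumed to be assembled MATMPIAIJ matrices
   whose values, but not sparsity patterns, change between calls. */
PETSC_UNUSED static PetscErrorCode SketchPtAPInitialThenReuse(Mat A, Mat P, Mat *C)
{
  PetscFunctionBegin;
  /* first call: symbolic product plus numeric values (AP_loc, Rd, Ro, C_loc, C_oth are created) */
  PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, 2.0, C));
  /* ... update the numerical values of A here, keeping its sparsity pattern ... */
  /* subsequent call: only MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable() is executed, reusing the stored data */
  PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, 2.0, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}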

216: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
217: {
218:   Mat_APMPI               *ptap;
219:   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
220:   MPI_Comm                 comm;
221:   PetscMPIInt              size, rank;
222:   Mat                      P_loc, P_oth;
223:   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
224:   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
225:   PetscInt                *lnk, i, k, pnz, row, nsend;
226:   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
227:   PETSC_UNUSED PetscMPIInt icompleted = 0;
228:   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
229:   const PetscInt          *owners;
230:   PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
231:   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
232:   MPI_Request             *swaits, *rwaits;
233:   MPI_Status              *sstatus, rstatus;
234:   PetscLayout              rowmap;
235:   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
236:   PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
237:   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
238:   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
239:   PetscScalar             *apv;
240:   PetscHMapI               ta;
241:   MatType                  mtype;
242:   const char              *prefix;
243: #if defined(PETSC_USE_INFO)
244:   PetscReal apfill;
245: #endif

247:   PetscFunctionBegin;
248:   MatCheckProduct(Cmpi, 4);
249:   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
250:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
251:   PetscCallMPI(MPI_Comm_size(comm, &size));
252:   PetscCallMPI(MPI_Comm_rank(comm, &rank));

254:   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;

256:   /* create symbolic parallel matrix Cmpi */
257:   PetscCall(MatGetType(A, &mtype));
258:   PetscCall(MatSetType(Cmpi, mtype));

260:   /* create struct Mat_APMPI and attach it to C later */
261:   PetscCall(PetscNew(&ptap));
262:   ptap->reuse   = MAT_INITIAL_MATRIX;
263:   ptap->algType = 0;

265:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
266:   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
267:   /* get P_loc by taking all local rows of P */
268:   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));

270:   ptap->P_loc = P_loc;
271:   ptap->P_oth = P_oth;

273:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
274:   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
275:   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

277:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
278:   p_loc = (Mat_SeqAIJ *)P_loc->data;
279:   if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;

281:   /* create and initialize a linked list */
282:   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for computing AP_loc and Cmpi */
283:   MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
284:   MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
285:   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */

287:   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));

289:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
290:   if (ao) {
291:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
292:   } else {
293:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
294:   }
295:   current_space = free_space;
296:   nspacedouble  = 0;

298:   PetscCall(PetscMalloc1(am + 1, &api));
299:   api[0] = 0;
300:   for (i = 0; i < am; i++) {
301:     /* diagonal portion: Ad[i,:]*P */
302:     ai  = ad->i;
303:     pi  = p_loc->i;
304:     nzi = ai[i + 1] - ai[i];
305:     aj  = ad->j + ai[i];
306:     for (j = 0; j < nzi; j++) {
307:       row  = aj[j];
308:       pnz  = pi[row + 1] - pi[row];
309:       Jptr = p_loc->j + pi[row];
310:       /* add non-zero cols of P into the sorted linked list lnk */
311:       PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
312:     }
313:     /* off-diagonal portion: Ao[i,:]*P */
314:     if (ao) {
315:       ai  = ao->i;
316:       pi  = p_oth->i;
317:       nzi = ai[i + 1] - ai[i];
318:       aj  = ao->j + ai[i];
319:       for (j = 0; j < nzi; j++) {
320:         row  = aj[j];
321:         pnz  = pi[row + 1] - pi[row];
322:         Jptr = p_oth->j + pi[row];
323:         PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
324:       }
325:     }
326:     apnz       = lnk[0];
327:     api[i + 1] = api[i] + apnz;

329:     /* if free space is not available, double the total space in the list */
330:     if (current_space->local_remaining < apnz) {
331:       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
332:       nspacedouble++;
333:     }

335:     /* Copy data into free space, then initialize lnk */
336:     PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));

338:     current_space->array += apnz;
339:     current_space->local_used += apnz;
340:     current_space->local_remaining -= apnz;
341:   }
342:   /* Allocate space for apj and apv, initialize apj, and */
343:   /* destroy list of free space and other temporary array(s) */
344:   PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
345:   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
346:   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));

348:   /* Create AP_loc for reuse */
349:   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
350:   PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));

352: #if defined(PETSC_USE_INFO)
353:   if (ao) {
354:     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
355:   } else {
356:     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
357:   }
358:   ptap->AP_loc->info.mallocs           = nspacedouble;
359:   ptap->AP_loc->info.fill_ratio_given  = fill;
360:   ptap->AP_loc->info.fill_ratio_needed = apfill;

362:   if (api[am]) {
363:     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
364:     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
365:   } else {
366:     PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty\n"));
367:   }
368: #endif

370:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
371:   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
372:   PetscCall(MatGetOptionsPrefix(A, &prefix));
373:   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
374:   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));

376:   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
377:   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
378:   PetscCall(MatProductSetFill(ptap->C_oth, fill));
379:   PetscCall(MatProductSetFromOptions(ptap->C_oth));
380:   PetscCall(MatProductSymbolic(ptap->C_oth));

382:   /* (3) send coj of C_oth to other processors  */
383:   /* determine row ownership */
384:   PetscCall(PetscLayoutCreate(comm, &rowmap));
385:   PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
386:   PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
387:   PetscCall(PetscLayoutSetUp(rowmap));
388:   PetscCall(PetscLayoutGetRanges(rowmap, &owners));

390:   /* determine the number of messages to send, their lengths */
391:   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
392:   PetscCall(PetscArrayzero(len_s, size));
393:   PetscCall(PetscArrayzero(len_si, size));

395:   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
396:   coi   = c_oth->i;
397:   coj   = c_oth->j;
398:   con   = ptap->C_oth->rmap->n;
399:   proc  = 0;
400:   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
401:   for (i = 0; i < con; i++) {
402:     while (prmap[i] >= owners[proc + 1]) proc++;
403:     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
404:     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
405:   }

407:   len          = 0; /* max length of buf_si[], see (4) */
408:   owners_co[0] = 0;
409:   nsend        = 0;
410:   for (proc = 0; proc < size; proc++) {
411:     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
412:     if (len_s[proc]) {
413:       nsend++;
414:       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
415:       len += len_si[proc];
416:     }
417:   }

419:   /* determine the number and length of messages to receive for coi and coj  */
420:   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
421:   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

423:   /* post the Irecv and Isend of coj */
424:   PetscCall(PetscCommGetNewTag(comm, &tagj));
425:   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
426:   PetscCall(PetscMalloc1(nsend + 1, &swaits));
427:   for (proc = 0, k = 0; proc < size; proc++) {
428:     if (!len_s[proc]) continue;
429:     i = owners_co[proc];
430:     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
431:     k++;
432:   }

434:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
435:   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
436:   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
437:   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
438:   PetscCall(MatProductSetFill(ptap->C_loc, fill));

440:   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
441:   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));

443:   PetscCall(MatProductSetFromOptions(ptap->C_loc));
444:   PetscCall(MatProductSymbolic(ptap->C_loc));

446:   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
447:   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));

449:   /* wait until the receives of coj are complete */
450:   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
451:   PetscCall(PetscFree(rwaits));
452:   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

454:   /* add received column indices into ta to update Crmax */
455:   for (k = 0; k < nrecv; k++) { /* k-th received message */
456:     Jptr = buf_rj[k];
457:     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
458:   }
459:   PetscCall(PetscHMapIGetSize(ta, &Crmax));
460:   PetscCall(PetscHMapIDestroy(&ta));

462:   /* (4) send and recv coi */
463:   PetscCall(PetscCommGetNewTag(comm, &tagi));
464:   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
465:   PetscCall(PetscMalloc1(len + 1, &buf_s));
466:   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
467:   for (proc = 0, k = 0; proc < size; proc++) {
468:     if (!len_s[proc]) continue;
469:     /* form outgoing message for i-structure:
470:          buf_si[0]:                 nrows to be sent
471:                [1:nrows]:           row index (global)
472:                [nrows+1:2*nrows+1]: i-structure index
473:     */
474:     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
475:     buf_si_i    = buf_si + nrows + 1;
476:     buf_si[0]   = nrows;
477:     buf_si_i[0] = 0;
478:     nrows       = 0;
479:     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
480:       nzi                 = coi[i + 1] - coi[i];
481:       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
482:       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
483:       nrows++;
484:     }
485:     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
486:     k++;
487:     buf_si += len_si[proc];
488:   }
489:   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
490:   PetscCall(PetscFree(rwaits));
491:   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

493:   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
494:   PetscCall(PetscFree(len_ri));
495:   PetscCall(PetscFree(swaits));
496:   PetscCall(PetscFree(buf_s));

498:   /* (5) compute the local portion of Cmpi      */
499:   /* set initial free space to Crmax, sufficient to hold the nonzeros in each row of Cmpi */
500:   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
501:   current_space = free_space;

503:   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
504:   for (k = 0; k < nrecv; k++) {
505:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
506:     nrows       = *buf_ri_k[k];
507:     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th received i-structure */
508:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
509:   }

511:   MatPreallocateBegin(comm, pn, pn, dnz, onz);
512:   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
513:   for (i = 0; i < pn; i++) {
514:     /* add C_loc into Cmpi */
515:     nzi  = c_loc->i[i + 1] - c_loc->i[i];
516:     Jptr = c_loc->j + c_loc->i[i];
517:     PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));

519:     /* add received col data into lnk */
520:     for (k = 0; k < nrecv; k++) { /* k-th received message */
521:       if (i == *nextrow[k]) {     /* i-th row */
522:         nzi  = *(nextci[k] + 1) - *nextci[k];
523:         Jptr = buf_rj[k] + *nextci[k];
524:         PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
525:         nextrow[k]++;
526:         nextci[k]++;
527:       }
528:     }
529:     nzi = lnk[0];

531:     /* copy data into free space, then initialize lnk */
532:     PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
533:     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
534:   }
535:   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
536:   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
537:   PetscCall(PetscFreeSpaceDestroy(free_space));

539:   /* local sizes and preallocation */
540:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
541:   if (P->cmap->bs > 0) {
542:     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
543:     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
544:   }
545:   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
546:   MatPreallocateEnd(dnz, onz);

548:   /* members in merge */
549:   PetscCall(PetscFree(id_r));
550:   PetscCall(PetscFree(len_r));
551:   PetscCall(PetscFree(buf_ri[0]));
552:   PetscCall(PetscFree(buf_ri));
553:   PetscCall(PetscFree(buf_rj[0]));
554:   PetscCall(PetscFree(buf_rj));
555:   PetscCall(PetscLayoutDestroy(&rowmap));

557:   nout = 0;
558:   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
559:   PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
560:   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
561:   PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);

563:   /* attach the supporting struct to Cmpi for reuse */
564:   Cmpi->product->data    = ptap;
565:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
566:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

568:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
569:   Cmpi->assembled        = PETSC_FALSE;
570:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
571:   PetscFunctionReturn(PETSC_SUCCESS);
572: }
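
/* Simplified sketch, not part of the original mpiptap.c, of the symbolic row merge done
   in step (1) above: the nonzero count of row i of A*P is obtained by merging the rows of
   P selected by the nonzero columns of A[i,:].  A dense marker array of length pN stands
   in for the condensed linked list (PetscLLCondensed) used by the real code, so this is
   O(pN) storage per call and is meant only to clarify the algorithm.  The function name
   and CSR arguments are illustrative. */
PETSC_UNUSED static PetscErrorCode SketchSymbolicAPRowCount(const PetscInt *ai, const PetscInt *aj, const PetscInt *pi, const PetscInt *pj, PetscInt i, PetscInt pN, PetscInt *apnz)
{
  PetscInt   j, k, cnt = 0;
  PetscBool *marked;

  PetscFunctionBegin;
  PetscCall(PetscCalloc1(pN, &marked));
  for (j = ai[i]; j < ai[i + 1]; j++) { /* each nonzero column of A[i,:] selects one row of P */
    const PetscInt prow = aj[j];
    for (k = pi[prow]; k < pi[prow + 1]; k++) { /* merge the column indices of P[prow,:] */
      if (!marked[pj[k]]) {
        marked[pj[k]] = PETSC_TRUE;
        cnt++;
      }
    }
  }
  *apnz = cnt;
  PetscCall(PetscFree(marked));
  PetscFunctionReturn(PETSC_SUCCESS);
}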

574: static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
575: {
576:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
577:   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
578:   PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
579:   PetscInt    pcstart, pcend, column, offset;

581:   PetscFunctionBegin;
582:   pcstart = P->cmap->rstart;
583:   pcstart *= dof;
584:   pcend = P->cmap->rend;
585:   pcend *= dof;
586:   /* diagonal portion: Ad[i,:]*P */
587:   ai  = ad->i;
588:   nzi = ai[i + 1] - ai[i];
589:   aj  = ad->j + ai[i];
590:   for (j = 0; j < nzi; j++) {
591:     row    = aj[j];
592:     offset = row % dof;
593:     row /= dof;
594:     nzpi = pd->i[row + 1] - pd->i[row];
595:     pj   = pd->j + pd->i[row];
596:     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
597:   }
598:   /* off-diagonal block of P: Ad[i,:]*Po */
599:   for (j = 0; j < nzi; j++) {
600:     row    = aj[j];
601:     offset = row % dof;
602:     row /= dof;
603:     nzpi = po->i[row + 1] - po->i[row];
604:     pj   = po->j + po->i[row];
605:     for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
606:   }

608:   /* off diagonal part: Ao[i, :]*P_oth */
609:   if (ao) {
610:     ai  = ao->i;
611:     pi  = p_oth->i;
612:     nzi = ai[i + 1] - ai[i];
613:     aj  = ao->j + ai[i];
614:     for (j = 0; j < nzi; j++) {
615:       row       = aj[j];
616:       offset    = a->garray[row] % dof;
617:       row       = map[row];
618:       pnz       = pi[row + 1] - pi[row];
619:       p_othcols = p_oth->j + pi[row];
620:       for (col = 0; col < pnz; col++) {
621:         column = p_othcols[col] * dof + offset;
622:         if (column >= pcstart && column < pcend) {
623:           PetscCall(PetscHSetIAdd(dht, column));
624:         } else {
625:           PetscCall(PetscHSetIAdd(oht, column));
626:         }
627:       }
628:     }
629:   } /* end if (ao) */
630:   PetscFunctionReturn(PETSC_SUCCESS);
631: }
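
/* Sketch, not part of the original mpiptap.c, of the dof interleaving used above: row i
   of A corresponds to component offset = i % dof of grid point ii = i / dof, and a local
   P column pcol for that point expands to the global column pcol*dof + offset (shifted by
   the column ownership start pcstart for the diagonal block).  The function name is
   illustrative. */
PETSC_UNUSED static inline PetscInt SketchInterleavedColumn(PetscInt pcol, PetscInt dof, PetscInt offset, PetscInt pcstart)
{
  return pcol * dof + offset + pcstart;
}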

633: static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
634: {
635:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
636:   Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
637:   PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
638:   PetscScalar ra, *aa, *pa;

640:   PetscFunctionBegin;
641:   pcstart = P->cmap->rstart;
642:   pcstart *= dof;

644:   /* diagonal portion: Ad[i,:]*P */
645:   ai  = ad->i;
646:   nzi = ai[i + 1] - ai[i];
647:   aj  = ad->j + ai[i];
648:   aa  = ad->a + ai[i];
649:   for (j = 0; j < nzi; j++) {
650:     ra     = aa[j];
651:     row    = aj[j];
652:     offset = row % dof;
653:     row /= dof;
654:     nzpi = pd->i[row + 1] - pd->i[row];
655:     pj   = pd->j + pd->i[row];
656:     pa   = pd->a + pd->i[row];
657:     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
658:     PetscCall(PetscLogFlops(2.0 * nzpi));
659:   }
660:   for (j = 0; j < nzi; j++) {
661:     ra     = aa[j];
662:     row    = aj[j];
663:     offset = row % dof;
664:     row /= dof;
665:     nzpi = po->i[row + 1] - po->i[row];
666:     pj   = po->j + po->i[row];
667:     pa   = po->a + po->i[row];
668:     for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
669:     PetscCall(PetscLogFlops(2.0 * nzpi));
670:   }

672:   /* off diagonal part: Ao[i, :]*P_oth */
673:   if (ao) {
674:     ai  = ao->i;
675:     pi  = p_oth->i;
676:     nzi = ai[i + 1] - ai[i];
677:     aj  = ao->j + ai[i];
678:     aa  = ao->a + ai[i];
679:     for (j = 0; j < nzi; j++) {
680:       row       = aj[j];
681:       offset    = a->garray[row] % dof;
682:       row       = map[row];
683:       ra        = aa[j];
684:       pnz       = pi[row + 1] - pi[row];
685:       p_othcols = p_oth->j + pi[row];
686:       pa        = p_oth->a + pi[row];
687:       for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
688:       PetscCall(PetscLogFlops(2.0 * pnz));
689:     }
690:   } /* end if (ao) */

692:   PetscFunctionReturn(PETSC_SUCCESS);
693: }
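
/* Simplified sketch, not part of the original mpiptap.c, of the row update performed
   above: AP[i,:] += A[i,j]*P[j,:], accumulated over every nonzero j of A[i,:].  A dense
   work array of length pN (zeroed by the caller) stands in for the PetscHMapIV hash map
   used by the real code; the name and CSR arguments are illustrative. */
PETSC_UNUSED static PetscErrorCode SketchNumericAPRow(const PetscInt *ai, const PetscInt *aj, const PetscScalar *aa, const PetscInt *pi, const PetscInt *pj, const PetscScalar *pa, PetscInt i, PetscScalar *apdense)
{
  PetscInt       j, k;
  PetscLogDouble flops = 0.0;

  PetscFunctionBegin;
  for (j = ai[i]; j < ai[i + 1]; j++) {
    const PetscScalar ra   = aa[j]; /* A[i, aj[j]] */
    const PetscInt    prow = aj[j];
    for (k = pi[prow]; k < pi[prow + 1]; k++) apdense[pj[k]] += ra * pa[k]; /* scatter-add ra*P[prow,:] */
    flops += 2.0 * (pi[prow + 1] - pi[prow]);
  }
  PetscCall(PetscLogFlops(flops));
  PetscFunctionReturn(PETSC_SUCCESS);
}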

695: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);

697: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
698: {
699:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
700:   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
701:   Mat_APMPI      *ptap;
702:   PetscHMapIV     hmap;
703:   PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
704:   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
705:   PetscInt        offset, ii, pocol;
706:   const PetscInt *mappingindices;
707:   IS              map;

709:   PetscFunctionBegin;
710:   MatCheckProduct(C, 4);
711:   ptap = (Mat_APMPI *)C->product->data;
712:   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
713:   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

715:   PetscCall(MatZeroEntries(C));

717:   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
718:   if (ptap->reuse == MAT_REUSE_MATRIX) {
719:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
720:     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
721:   }
722:   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));

724:   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
725:   pon *= dof;
726:   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
727:   PetscCall(MatGetLocalSize(A, &am, NULL));
728:   cmaxr = 0;
729:   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
730:   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
731:   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
732:   PetscCall(ISGetIndices(map, &mappingindices));
733:   for (i = 0; i < am && pon; i++) {
734:     PetscCall(PetscHMapIVClear(hmap));
735:     offset = i % dof;
736:     ii     = i / dof;
737:     nzi    = po->i[ii + 1] - po->i[ii];
738:     if (!nzi) continue;
739:     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
740:     voff = 0;
741:     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
742:     if (!voff) continue;

744:     /* Form C(ii, :) */
745:     poj = po->j + po->i[ii];
746:     poa = po->a + po->i[ii];
747:     for (j = 0; j < nzi; j++) {
748:       pocol   = poj[j] * dof + offset;
749:       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
750:       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
751:       for (jj = 0; jj < voff; jj++) {
752:         apvaluestmp[jj] = apvalues[jj] * poa[j];
753:         /* If the row is empty */
754:         if (!c_rmtc[pocol]) {
755:           c_rmtjj[jj] = apindices[jj];
756:           c_rmtaa[jj] = apvaluestmp[jj];
757:           c_rmtc[pocol]++;
758:         } else {
759:           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
760:           if (loc >= 0) { /* hit */
761:             c_rmtaa[loc] += apvaluestmp[jj];
762:             PetscCall(PetscLogFlops(1.0));
763:           } else { /* new element */
764:             loc = -(loc + 1);
765:             /* Shift the tail to open a slot at loc */
766:             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
767:               c_rmtjj[kk] = c_rmtjj[kk - 1];
768:               c_rmtaa[kk] = c_rmtaa[kk - 1];
769:             } /* End kk */
770:             c_rmtjj[loc] = apindices[jj];
771:             c_rmtaa[loc] = apvaluestmp[jj];
772:             c_rmtc[pocol]++;
773:           }
774:         }
775:         PetscCall(PetscLogFlops(voff));
776:       } /* End jj */
777:     }   /* End j */
778:   }     /* End i */

780:   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
781:   PetscCall(PetscHMapIVDestroy(&hmap));

783:   PetscCall(MatGetLocalSize(P, NULL, &pn));
784:   pn *= dof;
785:   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));

787:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
788:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
789:   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
790:   pcstart = pcstart * dof;
791:   pcend   = pcend * dof;
792:   cd      = (Mat_SeqAIJ *)(c->A)->data;
793:   co      = (Mat_SeqAIJ *)(c->B)->data;

795:   cmaxr = 0;
796:   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
797:   PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
798:   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
799:   for (i = 0; i < am && pn; i++) {
800:     PetscCall(PetscHMapIVClear(hmap));
801:     offset = i % dof;
802:     ii     = i / dof;
803:     nzi    = pd->i[ii + 1] - pd->i[ii];
804:     if (!nzi) continue;
805:     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
806:     voff = 0;
807:     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
808:     if (!voff) continue;
809:     /* Form C(ii, :) */
810:     pdj = pd->j + pd->i[ii];
811:     pda = pd->a + pd->i[ii];
812:     for (j = 0; j < nzi; j++) {
813:       row = pcstart + pdj[j] * dof + offset;
814:       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
815:       PetscCall(PetscLogFlops(voff));
816:       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
817:     }
818:   }
819:   PetscCall(ISRestoreIndices(map, &mappingindices));
820:   PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
821:   PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
822:   PetscCall(PetscHMapIVDestroy(&hmap));
823:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
824:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
825:   PetscCall(PetscFree2(c_rmtj, c_rmta));

827:   /* Add contributions from remote */
828:   for (i = 0; i < pn; i++) {
829:     row = i + pcstart;
830:     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
831:   }
832:   PetscCall(PetscFree2(c_othj, c_otha));

834:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
835:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

837:   ptap->reuse = MAT_REUSE_MATRIX;
838:   PetscFunctionReturn(PETSC_SUCCESS);
839: }

841: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
842: {
843:   PetscFunctionBegin;

845:   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
846:   PetscFunctionReturn(PETSC_SUCCESS);
847: }
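
/* Sketch, not part of the original mpiptap.c, of the sorted insertion used above to
   accumulate one (column, value) contribution into a remote-row buffer kept sorted by
   column index: PetscFindInt() either hits an existing entry (accumulate) or returns the
   insertion point encoded as -(loc+1) (shift the tail and insert).  The function name and
   arguments are illustrative; rowcols/rowvals must have room for one more entry. */
PETSC_UNUSED static PetscErrorCode SketchSortedAccumulate(PetscInt col, PetscScalar val, PetscInt *rowcols, PetscScalar *rowvals, PetscInt *nused)
{
  PetscInt loc, kk;

  PetscFunctionBegin;
  PetscCall(PetscFindInt(col, *nused, rowcols, &loc));
  if (loc >= 0) { /* column already present: accumulate */
    rowvals[loc] += val;
  } else { /* new column: open a slot at the insertion point */
    loc = -(loc + 1);
    for (kk = *nused; kk > loc; kk--) {
      rowcols[kk] = rowcols[kk - 1];
      rowvals[kk] = rowvals[kk - 1];
    }
    rowcols[loc] = col;
    rowvals[loc] = val;
    (*nused)++;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}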

849: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
850: {
851:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
852:   Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
853:   Mat_APMPI      *ptap;
854:   PetscHMapIV     hmap;
855:   PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
856:   PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
857:   PetscInt        offset, ii, pocol;
858:   const PetscInt *mappingindices;
859:   IS              map;

861:   PetscFunctionBegin;
862:   MatCheckProduct(C, 4);
863:   ptap = (Mat_APMPI *)C->product->data;
864:   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
865:   PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

867:   PetscCall(MatZeroEntries(C));

869:   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
870:   if (ptap->reuse == MAT_REUSE_MATRIX) {
871:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
872:     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
873:   }
874:   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
875:   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
876:   pon *= dof;
877:   PetscCall(MatGetLocalSize(P, NULL, &pn));
878:   pn *= dof;

880:   PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
881:   PetscCall(MatGetLocalSize(A, &am, NULL));
882:   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
883:   pcstart *= dof;
884:   pcend *= dof;
885:   cmaxr = 0;
886:   for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
887:   cd = (Mat_SeqAIJ *)(c->A)->data;
888:   co = (Mat_SeqAIJ *)(c->B)->data;
889:   for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
890:   PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
891:   PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
892:   PetscCall(ISGetIndices(map, &mappingindices));
893:   for (i = 0; i < am && (pon || pn); i++) {
894:     PetscCall(PetscHMapIVClear(hmap));
895:     offset = i % dof;
896:     ii     = i / dof;
897:     nzi    = po->i[ii + 1] - po->i[ii];
898:     dnzi   = pd->i[ii + 1] - pd->i[ii];
899:     if (!nzi && !dnzi) continue;
900:     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
901:     voff = 0;
902:     PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
903:     if (!voff) continue;

905:     /* Form remote C(ii, :) */
906:     poj = po->j + po->i[ii];
907:     poa = po->a + po->i[ii];
908:     for (j = 0; j < nzi; j++) {
909:       pocol   = poj[j] * dof + offset;
910:       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
911:       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
912:       for (jj = 0; jj < voff; jj++) {
913:         apvaluestmp[jj] = apvalues[jj] * poa[j];
914:         /* If the row is empty */
915:         if (!c_rmtc[pocol]) {
916:           c_rmtjj[jj] = apindices[jj];
917:           c_rmtaa[jj] = apvaluestmp[jj];
918:           c_rmtc[pocol]++;
919:         } else {
920:           PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
921:           if (loc >= 0) { /* hit */
922:             c_rmtaa[loc] += apvaluestmp[jj];
923:             PetscCall(PetscLogFlops(1.0));
924:           } else { /* new element */
925:             loc = -(loc + 1);
926:             /* Shift the tail to open a slot at loc */
927:             for (kk = c_rmtc[pocol]; kk > loc; kk--) {
928:               c_rmtjj[kk] = c_rmtjj[kk - 1];
929:               c_rmtaa[kk] = c_rmtaa[kk - 1];
930:             } /* End kk */
931:             c_rmtjj[loc] = apindices[jj];
932:             c_rmtaa[loc] = apvaluestmp[jj];
933:             c_rmtc[pocol]++;
934:           }
935:         }
936:       } /* End jj */
937:       PetscCall(PetscLogFlops(voff));
938:     } /* End j */

940:     /* Form local C(ii, :) */
941:     pdj = pd->j + pd->i[ii];
942:     pda = pd->a + pd->i[ii];
943:     for (j = 0; j < dnzi; j++) {
944:       row = pcstart + pdj[j] * dof + offset;
945:       for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
946:       PetscCall(PetscLogFlops(voff));
947:       PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
948:     } /* End j */
949:   }   /* End i */

951:   PetscCall(ISRestoreIndices(map, &mappingindices));
952:   PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
953:   PetscCall(PetscHMapIVDestroy(&hmap));
954:   PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));

956:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
957:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
958:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
959:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
960:   PetscCall(PetscFree2(c_rmtj, c_rmta));

962:   /* Add contributions from remote */
963:   for (i = 0; i < pn; i++) {
964:     row = i + pcstart;
965:     PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
966:   }
967:   PetscCall(PetscFree2(c_othj, c_otha));

969:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
970:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

972:   ptap->reuse = MAT_REUSE_MATRIX;
973:   PetscFunctionReturn(PETSC_SUCCESS);
974: }

976: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
977: {
978:   PetscFunctionBegin;

980:   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
981:   PetscFunctionReturn(PETSC_SUCCESS);
982: }
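
/* Sketch, not part of the original mpiptap.c, of the PetscSF pattern the allatonce
   routines above use to move remotely computed row data (the c_rmt* leaf arrays) onto the
   owning ranks (the c_oth* root arrays).  Each leaf maps to exactly one root slot, so
   MPI_REPLACE turns the reduction into a one-to-one copy.  The helper name and arguments
   are illustrative; iremote is consumed because the SF takes ownership of it. */
PETSC_UNUSED static PetscErrorCode SketchLeafToRootCopy(MPI_Comm comm, PetscInt nroots, PetscInt nleaves, PetscSFNode *iremote, const PetscInt *leafdata, PetscInt *rootdata)
{
  PetscSF sf;

  PetscFunctionBegin;
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFReduceBegin(sf, MPIU_INT, leafdata, rootdata, MPI_REPLACE));
  /* ... independent local work can overlap the communication here ... */
  PetscCall(PetscSFReduceEnd(sf, MPIU_INT, leafdata, rootdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscFunctionReturn(PETSC_SUCCESS);
}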

984: /* TODO: move algorithm selection to MatProductSetFromOptions */
985: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
986: {
987:   Mat_APMPI      *ptap;
988:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
989:   MPI_Comm        comm;
990:   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
991:   MatType         mtype;
992:   PetscSF         sf;
993:   PetscSFNode    *iremote;
994:   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
995:   const PetscInt *rootdegrees;
996:   PetscHSetI      ht, oht, *hta, *hto;
997:   PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
998:   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
999:   PetscInt        nalg = 2, alg = 0, offset, ii;
1000:   PetscMPIInt     owner;
1001:   const PetscInt *mappingindices;
1002:   PetscBool       flg;
1003:   const char     *algTypes[2] = {"overlapping", "merged"};
1004:   IS              map;

1006:   PetscFunctionBegin;
1007:   MatCheckProduct(Cmpi, 5);
1008:   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1009:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

1011:   /* Create symbolic parallel matrix Cmpi */
1012:   PetscCall(MatGetLocalSize(P, NULL, &pn));
1013:   pn *= dof;
1014:   PetscCall(MatGetType(A, &mtype));
1015:   PetscCall(MatSetType(Cmpi, mtype));
1016:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));

1018:   PetscCall(PetscNew(&ptap));
1019:   ptap->reuse   = MAT_INITIAL_MATRIX;
1020:   ptap->algType = 2;

1022:   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1023:   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1024:   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1025:   /* This equals the number of off-diagonal columns in P */
1026:   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1027:   pon *= dof;
1028:   /* offsets */
1029:   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1030:   /* The number of columns we will send to remote ranks */
1031:   PetscCall(PetscMalloc1(pon, &c_rmtc));
1032:   PetscCall(PetscMalloc1(pon, &hta));
1033:   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1034:   PetscCall(MatGetLocalSize(A, &am, NULL));
1035:   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1036:   /* Create hash table to merge all columns for C(i, :) */
1037:   PetscCall(PetscHSetICreate(&ht));

1039:   PetscCall(ISGetIndices(map, &mappingindices));
1040:   ptap->c_rmti[0] = 0;
1041:   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processes) */
1042:   for (i = 0; i < am && pon; i++) {
1043:     /* Form one row of AP */
1044:     PetscCall(PetscHSetIClear(ht));
1045:     offset = i % dof;
1046:     ii     = i / dof;
1047:     /* If the off-diagonal part is empty, skip the calculation */
1048:     nzi = po->i[ii + 1] - po->i[ii];
1049:     if (!nzi) continue;

1051:     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
1052:     PetscCall(PetscHSetIGetSize(ht, &htsize));
1053:     /* If AP is empty, just continue */
1054:     if (!htsize) continue;
1055:     /* Form C(ii, :) */
1056:     poj = po->j + po->i[ii];
1057:     for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1058:   }

1060:   for (i = 0; i < pon; i++) {
1061:     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1062:     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1063:     c_rmtc[i]           = htsize;
1064:   }

1066:   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));

1068:   for (i = 0; i < pon; i++) {
1069:     off = 0;
1070:     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1071:     PetscCall(PetscHSetIDestroy(&hta[i]));
1072:   }
1073:   PetscCall(PetscFree(hta));

1075:   PetscCall(PetscMalloc1(pon, &iremote));
1076:   for (i = 0; i < pon; i++) {
1077:     owner  = 0;
1078:     lidx   = 0;
1079:     offset = i % dof;
1080:     ii     = i / dof;
1081:     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1082:     iremote[i].index = lidx * dof + offset;
1083:     iremote[i].rank  = owner;
1084:   }

1086:   PetscCall(PetscSFCreate(comm, &sf));
1087:   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1088:   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1089:   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1090:   PetscCall(PetscSFSetFromOptions(sf));
1091:   PetscCall(PetscSFSetUp(sf));
1092:   /* How many neighbors have contributions to my rows? */
1093:   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1094:   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1095:   rootspacesize = 0;
1096:   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1097:   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1098:   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1099:   /* Get information from the leaves:
1100:    * the number of columns other ranks contribute to my rows
1101:    */
1102:   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1103:   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1104:   PetscCall(PetscFree(c_rmtc));
1105:   PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
1106:   /* The number of columns received for each row */
1107:   ptap->c_othi[0]     = 0;
1108:   rootspacesize       = 0;
1109:   rootspaceoffsets[0] = 0;
1110:   for (i = 0; i < pn; i++) {
1111:     rcvncols = 0;
1112:     for (j = 0; j < rootdegrees[i]; j++) {
1113:       rcvncols += rootspace[rootspacesize];
1114:       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1115:       rootspacesize++;
1116:     }
1117:     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1118:   }
1119:   PetscCall(PetscFree(rootspace));

1121:   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1122:   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1123:   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1124:   PetscCall(PetscSFDestroy(&sf));
1125:   PetscCall(PetscFree(rootspaceoffsets));

1127:   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1128:   nleaves = 0;
1129:   for (i = 0; i < pon; i++) {
1130:     owner = 0;
1131:     ii    = i / dof;
1132:     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1133:     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1134:     for (j = 0; j < sendncols; j++) {
1135:       iremote[nleaves].rank    = owner;
1136:       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1137:     }
1138:   }
1139:   PetscCall(PetscFree(c_rmtoffsets));
1140:   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));

1142:   PetscCall(PetscSFCreate(comm, &ptap->sf));
1143:   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1144:   PetscCall(PetscSFSetFromOptions(ptap->sf));
1145:   /* One to one map */
1146:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));

1148:   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1149:   PetscCall(PetscHSetICreate(&oht));
1150:   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1151:   pcstart *= dof;
1152:   pcend *= dof;
1153:   PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
1154:   for (i = 0; i < pn; i++) {
1155:     PetscCall(PetscHSetICreate(&hta[i]));
1156:     PetscCall(PetscHSetICreate(&hto[i]));
1157:   }
1158:   /* Work on local part */
1159:   /* 4) Pass 1: Estimate memory for C_loc */
1160:   for (i = 0; i < am && pn; i++) {
1161:     PetscCall(PetscHSetIClear(ht));
1162:     PetscCall(PetscHSetIClear(oht));
1163:     offset = i % dof;
1164:     ii     = i / dof;
1165:     nzi    = pd->i[ii + 1] - pd->i[ii];
1166:     if (!nzi) continue;

1168:     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1169:     PetscCall(PetscHSetIGetSize(ht, &htsize));
1170:     PetscCall(PetscHSetIGetSize(oht, &htosize));
1171:     if (!(htsize + htosize)) continue;
1172:     /* Form C(ii, :) */
1173:     pdj = pd->j + pd->i[ii];
1174:     for (j = 0; j < nzi; j++) {
1175:       PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
1176:       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1177:     }
1178:   }

1180:   PetscCall(ISRestoreIndices(map, &mappingindices));

1182:   PetscCall(PetscHSetIDestroy(&ht));
1183:   PetscCall(PetscHSetIDestroy(&oht));

1185:   /* Get remote data */
1186:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1187:   PetscCall(PetscFree(c_rmtj));

1189:   for (i = 0; i < pn; i++) {
1190:     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1191:     rdj = c_othj + ptap->c_othi[i];
1192:     for (j = 0; j < nzi; j++) {
1193:       col = rdj[j];
1194:       /* diag part */
1195:       if (col >= pcstart && col < pcend) {
1196:         PetscCall(PetscHSetIAdd(hta[i], col));
1197:       } else { /* off diag */
1198:         PetscCall(PetscHSetIAdd(hto[i], col));
1199:       }
1200:     }
1201:     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1202:     dnz[i] = htsize;
1203:     PetscCall(PetscHSetIDestroy(&hta[i]));
1204:     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1205:     onz[i] = htsize;
1206:     PetscCall(PetscHSetIDestroy(&hto[i]));
1207:   }

1209:   PetscCall(PetscFree2(hta, hto));
1210:   PetscCall(PetscFree(c_othj));

1212:   /* local sizes and preallocation */
1213:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1214:   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1215:   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1216:   PetscCall(MatSetUp(Cmpi));
1217:   PetscCall(PetscFree2(dnz, onz));

1219:   /* attach the supporting struct to Cmpi for reuse */
1220:   Cmpi->product->data    = ptap;
1221:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1222:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1224:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1225:   Cmpi->assembled = PETSC_FALSE;
1226:   /* pick an algorithm */
1227:   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1228:   alg = 0;
1229:   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1230:   PetscOptionsEnd();
1231:   switch (alg) {
1232:   case 0:
1233:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1234:     break;
1235:   case 1:
1236:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1237:     break;
1238:   default:
1239:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1240:   }
1241:   PetscFunctionReturn(PETSC_SUCCESS);
1242: }
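
/* Usage sketch, not part of the original mpiptap.c: the numeric kernel for the allatonce
   family is chosen through the -matptap_allatonce_via option registered above
   ("overlapping" selects MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce, "merged" the single-pass
   variant).  The helper name is hypothetical, and it assumes the allatonce symbolic path
   has already been selected for this product through the usual product options. */
PETSC_UNUSED static PetscErrorCode SketchSelectAllAtOnceKernel(Mat A, Mat P, Mat *C)
{
  PetscFunctionBegin;
  PetscCall(PetscOptionsSetValue(NULL, "-matptap_allatonce_via", "merged")); /* hypothetical choice */
  PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, PETSC_DEFAULT, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}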

1244: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1245: {
1246:   PetscFunctionBegin;
1247:   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
1248:   PetscFunctionReturn(PETSC_SUCCESS);
1249: }

1251: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1252: {
1253:   Mat_APMPI      *ptap;
1254:   Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
1255:   MPI_Comm        comm;
1256:   Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1257:   MatType         mtype;
1258:   PetscSF         sf;
1259:   PetscSFNode    *iremote;
1260:   PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1261:   const PetscInt *rootdegrees;
1262:   PetscHSetI      ht, oht, *hta, *hto, *htd;
1263:   PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1264:   PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1265:   PetscInt        nalg = 2, alg = 0, offset, ii;
1266:   PetscMPIInt     owner;
1267:   PetscBool       flg;
1268:   const char     *algTypes[2] = {"merged", "overlapping"};
1269:   const PetscInt *mappingindices;
1270:   IS              map;

1272:   PetscFunctionBegin;
1273:   MatCheckProduct(Cmpi, 5);
1274:   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1275:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

1277:   /* Create symbolic parallel matrix Cmpi */
1278:   PetscCall(MatGetLocalSize(P, NULL, &pn));
1279:   pn *= dof;
1280:   PetscCall(MatGetType(A, &mtype));
1281:   PetscCall(MatSetType(Cmpi, mtype));
1282:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));

1284:   PetscCall(PetscNew(&ptap));
1285:   ptap->reuse   = MAT_INITIAL_MATRIX;
1286:   ptap->algType = 3;

1288:   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1289:   PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1290:   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
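  /* P_oth contains the rows of P needed by the off-diagonal part of the local A; the composed
     index set ("aoffdiagtopothmapping") translates off-diagonal column indices of A into row
     indices of P_oth and is used below when one row of A*P is formed. */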

1292:   /* This equals the number of off-diagonal columns in P */
1293:   PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1294:   pon *= dof;
1295:   /* offsets */
1296:   PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1297:   /* The number of columns we will send to remote ranks */
1298:   PetscCall(PetscMalloc1(pon, &c_rmtc));
1299:   PetscCall(PetscMalloc1(pon, &hta));
1300:   for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1301:   PetscCall(MatGetLocalSize(A, &am, NULL));
1302:   PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1303:   /* Create hash table to merge all columns for C(i, :) */
1304:   PetscCall(PetscHSetICreate(&ht));
1305:   PetscCall(PetscHSetICreate(&oht));
1306:   PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
1307:   for (i = 0; i < pn; i++) {
1308:     PetscCall(PetscHSetICreate(&htd[i]));
1309:     PetscCall(PetscHSetICreate(&hto[i]));
1310:   }
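  /* hta[] collects, for each off-diagonal column of P (i.e. each row of C owned by another rank),
     all column indices this rank contributes to that row; htd[]/hto[] collect the diagonal and
     off-diagonal column sets of the locally owned rows of C. */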

1312:   PetscCall(ISGetIndices(map, &mappingindices));
1313:   ptap->c_rmti[0] = 0;
1314:   /* 2) Pass 1: calculate the size of C_rmt (the matrix that needs to be sent to other processors) */
1315:   for (i = 0; i < am && (pon || pn); i++) {
1316:     /* Form one row of AP */
1317:     PetscCall(PetscHSetIClear(ht));
1318:     PetscCall(PetscHSetIClear(oht));
1319:     offset = i % dof;
1320:     ii     = i / dof;
1321:     /* If both the diagonal and off-diagonal parts of this row of P are empty, skip it */
1322:     nzi  = po->i[ii + 1] - po->i[ii];
1323:     dnzi = pd->i[ii + 1] - pd->i[ii];
1324:     if (!nzi && !dnzi) continue;

1326:     PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1327:     PetscCall(PetscHSetIGetSize(ht, &htsize));
1328:     PetscCall(PetscHSetIGetSize(oht, &htosize));
1329:     /* If this row of AP is empty, just continue */
1330:     if (!(htsize + htosize)) continue;

1332:     /* Form remote C(ii, :) */
1333:     poj = po->j + po->i[ii];
1334:     for (j = 0; j < nzi; j++) {
1335:       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1336:       PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
1337:     }

1339:     /* Form local C(ii, :) */
1340:     pdj = pd->j + pd->i[ii];
1341:     for (j = 0; j < dnzi; j++) {
1342:       PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
1343:       PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1344:     }
1345:   }

1347:   PetscCall(ISRestoreIndices(map, &mappingindices));

1349:   PetscCall(PetscHSetIDestroy(&ht));
1350:   PetscCall(PetscHSetIDestroy(&oht));
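  /* Flatten the per-remote-row hash sets into a CSR-like send structure: c_rmti[] holds the row
     offsets and c_rmtj[] the (unsorted) column indices destined for other ranks. */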

1352:   for (i = 0; i < pon; i++) {
1353:     PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1354:     ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1355:     c_rmtc[i]           = htsize;
1356:   }

1358:   PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));

1360:   for (i = 0; i < pon; i++) {
1361:     off = 0;
1362:     PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1363:     PetscCall(PetscHSetIDestroy(&hta[i]));
1364:   }
1365:   PetscCall(PetscFree(hta));

1367:   PetscCall(PetscMalloc1(pon, &iremote));
1368:   for (i = 0; i < pon; i++) {
1369:     owner  = 0;
1370:     lidx   = 0;
1371:     offset = i % dof;
1372:     ii     = i / dof;
1373:     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1374:     iremote[i].index = lidx * dof + offset;
1375:     iremote[i].rank  = owner;
1376:   }
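  /* Build a temporary SF whose roots are the pn locally owned rows of C and whose leaves are the
     pon remote-row contributions; gathering c_rmtc over it tells this rank how many columns each
     neighbor contributes to each of its rows. */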

1378:   PetscCall(PetscSFCreate(comm, &sf));
1379:   PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1380:   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1381:   PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1382:   PetscCall(PetscSFSetFromOptions(sf));
1383:   PetscCall(PetscSFSetUp(sf));
1384:   /* How many neighbors have contributions to my rows? */
1385:   PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1386:   PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1387:   rootspacesize = 0;
1388:   for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1389:   PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1390:   PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1391:   /* Get information from leaves:
1392:    * the number of columns other ranks contribute to my rows
1393:    */
1394:   PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1395:   PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1396:   PetscCall(PetscFree(c_rmtc));
1397:   PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
1398:   /* Compute the number of columns to be received for each row */
1399:   ptap->c_othi[0]     = 0;
1400:   rootspacesize       = 0;
1401:   rootspaceoffsets[0] = 0;
1402:   for (i = 0; i < pn; i++) {
1403:     rcvncols = 0;
1404:     for (j = 0; j < rootdegrees[i]; j++) {
1405:       rcvncols += rootspace[rootspacesize];
1406:       rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1407:       rootspacesize++;
1408:     }
1409:     ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1410:   }
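  /* c_othi[] now gives CSR-style offsets of the columns to be received for each locally owned row
     of C, while rootspaceoffsets[] assigns every incoming contribution a slot in that receive
     buffer; the slots are scattered back to the contributing ranks below as c_rmtoffsets[]. */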
1411:   PetscCall(PetscFree(rootspace));

1413:   PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1414:   PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1415:   PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1416:   PetscCall(PetscSFDestroy(&sf));
1417:   PetscCall(PetscFree(rootspaceoffsets));

1419:   PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1420:   nleaves = 0;
1421:   for (i = 0; i < pon; i++) {
1422:     owner = 0;
1423:     ii    = i / dof;
1424:     PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1425:     sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1426:     for (j = 0; j < sendncols; j++) {
1427:       iremote[nleaves].rank    = owner;
1428:       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1429:     }
1430:   }
1431:   PetscCall(PetscFree(c_rmtoffsets));
1432:   PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
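  /* Create the persistent SF kept in ptap: its roots are the c_othi[pn] receive slots and its
     leaves are the column indices to be sent (one leaf per entry of c_rmtj[]), so a reduce with
     MPI_REPLACE delivers every sent index into its precomputed slot. The same SF is reused by the
     numeric phase. */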

1434:   PetscCall(PetscSFCreate(comm, &ptap->sf));
1435:   PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1436:   PetscCall(PetscSFSetFromOptions(ptap->sf));
1437:   /* One to one map */
1438:   PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1439:   /* Get remote data */
1440:   PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1441:   PetscCall(PetscFree(c_rmtj));
1442:   PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1443:   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1444:   pcstart *= dof;
1445:   pcend *= dof;
1446:   for (i = 0; i < pn; i++) {
1447:     nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1448:     rdj = c_othj + ptap->c_othi[i];
1449:     for (j = 0; j < nzi; j++) {
1450:       col = rdj[j];
1451:       /* diag part */
1452:       if (col >= pcstart && col < pcend) {
1453:         PetscCall(PetscHSetIAdd(htd[i], col));
1454:       } else { /* off diag */
1455:         PetscCall(PetscHSetIAdd(hto[i], col));
1456:       }
1457:     }
1458:     PetscCall(PetscHSetIGetSize(htd[i], &htsize));
1459:     dnz[i] = htsize;
1460:     PetscCall(PetscHSetIDestroy(&htd[i]));
1461:     PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1462:     onz[i] = htsize;
1463:     PetscCall(PetscHSetIDestroy(&hto[i]));
1464:   }

1466:   PetscCall(PetscFree2(htd, hto));
1467:   PetscCall(PetscFree(c_othj));

1469:   /* local sizes and preallocation */
1470:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1471:   PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1472:   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1473:   PetscCall(PetscFree2(dnz, onz));

1475:   /* attach the supporting struct to Cmpi for reuse */
1476:   Cmpi->product->data    = ptap;
1477:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1478:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1480:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1481:   Cmpi->assembled = PETSC_FALSE;
1482:   /* pick an algorithm */
1483:   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1484:   alg = 0;
1485:   PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1486:   PetscOptionsEnd();
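  /* Here the default (case 0) is the merged numeric kernel; selecting "overlapping" falls back to
     the non-merged all-at-once kernel. */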
1487:   switch (alg) {
1488:   case 0:
1489:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1490:     break;
1491:   case 1:
1492:     Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1493:     break;
1494:   default:
1495:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1496:   }
1497:   PetscFunctionReturn(PETSC_SUCCESS);
1498: }

1500: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
1501: {
1502:   PetscFunctionBegin;
1503:   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
1504:   PetscFunctionReturn(PETSC_SUCCESS);
1505: }

1507: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
1508: {
1509:   Mat_APMPI               *ptap;
1510:   Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1511:   MPI_Comm                 comm;
1512:   PetscMPIInt              size, rank;
1513:   PetscFreeSpaceList       free_space = NULL, current_space = NULL;
1514:   PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1515:   PetscInt                *lnk, i, k, pnz, row, nsend;
1516:   PetscBT                  lnkbt;
1517:   PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1518:   PETSC_UNUSED PetscMPIInt icompleted = 0;
1519:   PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
1520:   PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
1521:   PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1522:   MPI_Request             *swaits, *rwaits;
1523:   MPI_Status              *sstatus, rstatus;
1524:   PetscLayout              rowmap;
1525:   PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1526:   PetscMPIInt             *len_r, *id_r;          /* lengths and source ranks of the messages to be received */
1527:   PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1528:   Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
1529:   PetscScalar             *apv;
1530:   PetscHMapI               ta;
1531:   MatType                  mtype;
1532:   const char              *prefix;
1533: #if defined(PETSC_USE_INFO)
1534:   PetscReal apfill;
1535: #endif

1537:   PetscFunctionBegin;
1538:   MatCheckProduct(Cmpi, 4);
1539:   PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1540:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1541:   PetscCallMPI(MPI_Comm_size(comm, &size));
1542:   PetscCallMPI(MPI_Comm_rank(comm, &rank));

1544:   if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;

1546:   /* create symbolic parallel matrix Cmpi */
1547:   PetscCall(MatGetType(A, &mtype));
1548:   PetscCall(MatSetType(Cmpi, mtype));

1550:   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1551:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

1553:   /* create struct Mat_APMPI and attach it to C later */
1554:   PetscCall(PetscNew(&ptap));
1555:   ptap->reuse   = MAT_INITIAL_MATRIX;
1556:   ptap->algType = 1;

1558:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1559:   PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1560:   /* get P_loc by taking all local rows of P */
1561:   PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));

1563:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
1564:   PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
1565:   PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

1567:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1568:   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1569:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;

1571:   /* create a hash map (ta) used to compute Crmax below */
1572:   PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* used while computing AP_loc and Cmpi */
1573:   MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
1574:   MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
1575:   PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
1576:   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

1578:   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
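  /* lnk/lnkbt form a condensed sorted linked list over the global column space (size pN); it is
     used below to merge the column indices of the P_loc/P_oth rows selected by each row of A,
     producing the nonzero pattern of AP one row at a time. */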

1580:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1581:   if (ao) {
1582:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
1583:   } else {
1584:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
1585:   }
1586:   current_space = free_space;
1587:   nspacedouble  = 0;

1589:   PetscCall(PetscMalloc1(am + 1, &api));
1590:   api[0] = 0;
1591:   for (i = 0; i < am; i++) {
1592:     /* diagonal portion: Ad[i,:]*P */
1593:     ai  = ad->i;
1594:     pi  = p_loc->i;
1595:     nzi = ai[i + 1] - ai[i];
1596:     aj  = ad->j + ai[i];
1597:     for (j = 0; j < nzi; j++) {
1598:       row  = aj[j];
1599:       pnz  = pi[row + 1] - pi[row];
1600:       Jptr = p_loc->j + pi[row];
1601:       /* add non-zero cols of P into the sorted linked list lnk */
1602:       PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1603:     }
1604:     /* off-diagonal portion: Ao[i,:]*P */
1605:     if (ao) {
1606:       ai  = ao->i;
1607:       pi  = p_oth->i;
1608:       nzi = ai[i + 1] - ai[i];
1609:       aj  = ao->j + ai[i];
1610:       for (j = 0; j < nzi; j++) {
1611:         row  = aj[j];
1612:         pnz  = pi[row + 1] - pi[row];
1613:         Jptr = p_oth->j + pi[row];
1614:         PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1615:       }
1616:     }
1617:     apnz       = lnk[0];
1618:     api[i + 1] = api[i] + apnz;
1619:     if (ap_rmax < apnz) ap_rmax = apnz;

1621:     /* if free space is not available, double the total space in the list */
1622:     if (current_space->local_remaining < apnz) {
1623:       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
1624:       nspacedouble++;
1625:     }

1627:     /* Copy data into free space, then initialize lnk */
1628:     PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));

1630:     current_space->array += apnz;
1631:     current_space->local_used += apnz;
1632:     current_space->local_remaining -= apnz;
1633:   }
1634:   /* Allocate space for apj and apv, initialize apj, and */
1635:   /* destroy list of free space and other temporary array(s) */
1636:   PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
1637:   PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
1638:   PetscCall(PetscLLDestroy(lnk, lnkbt));

1640:   /* Create AP_loc for reuse */
1641:   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
1642:   PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
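  /* MatCreateSeqAIJWithArrays() does not copy api/apj/apv; AP_loc references them directly, so the
     numeric phase can fill apv (the values of AP_loc) in place. */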
1643: #if defined(PETSC_USE_INFO)
1644:   if (ao) {
1645:     apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1646:   } else {
1647:     apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1648:   }
1649:   ptap->AP_loc->info.mallocs           = nspacedouble;
1650:   ptap->AP_loc->info.fill_ratio_given  = fill;
1651:   ptap->AP_loc->info.fill_ratio_needed = apfill;

1653:   if (api[am]) {
1654:     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
1655:     PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
1656:   } else {
1657:     PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty\n"));
1658:   }
1659: #endif

1661:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1662:   PetscCall(MatGetOptionsPrefix(A, &prefix));
1663:   PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
1664:   PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
1665:   PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
1666:   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1667:   PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
1668:   PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
1669:   PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
1670:   PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
1671:   PetscCall(MatProductSetFill(ptap->C_oth, fill));
1672:   PetscCall(MatProductSetFromOptions(ptap->C_oth));
1673:   PetscCall(MatProductSymbolic(ptap->C_oth));

1675:   /* (3) send coj of C_oth to other processors  */
1676:   /* determine row ownership */
1677:   PetscCall(PetscLayoutCreate(comm, &rowmap));
1678:   rowmap->n  = pn;
1679:   rowmap->bs = 1;
1680:   PetscCall(PetscLayoutSetUp(rowmap));
1681:   owners = rowmap->range;

1683:   /* determine the number of messages to send, their lengths */
1684:   PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
1685:   PetscCall(PetscArrayzero(len_s, size));
1686:   PetscCall(PetscArrayzero(len_si, size));

1688:   c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
1689:   coi   = c_oth->i;
1690:   coj   = c_oth->j;
1691:   con   = ptap->C_oth->rmap->n;
1692:   proc  = 0;
1693:   for (i = 0; i < con; i++) {
1694:     while (prmap[i] >= owners[proc + 1]) proc++;
1695:     len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1696:     len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1697:   }

1699:   len          = 0; /* total length of buf_s[] (sum of len_si[]), see (4) */
1700:   owners_co[0] = 0;
1701:   nsend        = 0;
1702:   for (proc = 0; proc < size; proc++) {
1703:     owners_co[proc + 1] = owners_co[proc] + len_si[proc];
1704:     if (len_s[proc]) {
1705:       nsend++;
1706:       len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1707:       len += len_si[proc];
1708:     }
1709:   }

1711:   /* determine the number and length of messages to receive for coi and coj  */
1712:   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
1713:   PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

1715:   /* post the Irecv and Isend of coj */
1716:   PetscCall(PetscCommGetNewTag(comm, &tagj));
1717:   PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
1718:   PetscCall(PetscMalloc1(nsend + 1, &swaits));
1719:   for (proc = 0, k = 0; proc < size; proc++) {
1720:     if (!len_s[proc]) continue;
1721:     i = owners_co[proc];
1722:     PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
1723:     k++;
1724:   }

1726:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1727:   PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
1728:   PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
1729:   PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
1730:   PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1731:   PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
1732:   PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
1733:   PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
1734:   PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
1735:   PetscCall(MatProductSetFill(ptap->C_loc, fill));
1736:   PetscCall(MatProductSetFromOptions(ptap->C_loc));
1737:   PetscCall(MatProductSymbolic(ptap->C_loc));

1739:   c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;

1741:   /* wait until the receives of coj are complete */
1742:   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1743:   PetscCall(PetscFree(rwaits));
1744:   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

1746:   /* add received column indices into ta to update Crmax */
1747:   for (k = 0; k < nrecv; k++) { /* k-th received message */
1748:     Jptr = buf_rj[k];
1749:     for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
1750:   }
1751:   PetscCall(PetscHMapIGetSize(ta, &Crmax));
1752:   PetscCall(PetscHMapIDestroy(&ta));

1754:   /* (4) send and recv coi */
1755:   PetscCall(PetscCommGetNewTag(comm, &tagi));
1756:   PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
1757:   PetscCall(PetscMalloc1(len + 1, &buf_s));
1758:   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1759:   for (proc = 0, k = 0; proc < size; proc++) {
1760:     if (!len_s[proc]) continue;
1761:     /* form outgoing message for i-structure:
1762:          buf_si[0]:                 nrows to be sent
1763:                [1:nrows]:           row index (global)
1764:                [nrows+1:2*nrows+1]: i-structure index
1765:     */
1766:     nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
1767:     buf_si_i    = buf_si + nrows + 1;
1768:     buf_si[0]   = nrows;
1769:     buf_si_i[0] = 0;
1770:     nrows       = 0;
1771:     for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
1772:       nzi                 = coi[i + 1] - coi[i];
1773:       buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
1774:       buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
1775:       nrows++;
1776:     }
1777:     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
1778:     k++;
1779:     buf_si += len_si[proc];
1780:   }
1781:   for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1782:   PetscCall(PetscFree(rwaits));
1783:   if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

1785:   PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
1786:   PetscCall(PetscFree(len_ri));
1787:   PetscCall(PetscFree(swaits));
1788:   PetscCall(PetscFree(buf_s));

1790:   /* (5) compute the local portion of Cmpi      */
1791:   /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
1792:   PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
1793:   current_space = free_space;

1795:   PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
1796:   for (k = 0; k < nrecv; k++) {
1797:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1798:     nrows       = *buf_ri_k[k];
1799:     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th received i-structure */
1800:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1801:   }

1803:   MatPreallocateBegin(comm, pn, pn, dnz, onz);
1804:   PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1805:   for (i = 0; i < pn; i++) {
1806:     /* add C_loc into Cmpi */
1807:     nzi  = c_loc->i[i + 1] - c_loc->i[i];
1808:     Jptr = c_loc->j + c_loc->i[i];
1809:     PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));

1811:     /* add received col data into lnk */
1812:     for (k = 0; k < nrecv; k++) { /* k-th received message */
1813:       if (i == *nextrow[k]) {     /* i-th row */
1814:         nzi  = *(nextci[k] + 1) - *nextci[k];
1815:         Jptr = buf_rj[k] + *nextci[k];
1816:         PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1817:         nextrow[k]++;
1818:         nextci[k]++;
1819:       }
1820:     }
1821:     nzi = lnk[0];

1823:     /* copy data into free space, then initialize lnk */
1824:     PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
1825:     PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
1826:   }
1827:   PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
1828:   PetscCall(PetscLLDestroy(lnk, lnkbt));
1829:   PetscCall(PetscFreeSpaceDestroy(free_space));

1831:   /* local sizes and preallocation */
1832:   PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1833:   if (P->cmap->bs > 0) {
1834:     PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
1835:     PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
1836:   }
1837:   PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1838:   MatPreallocateEnd(dnz, onz);

1840:   /* free communication buffers and the row layout */
1841:   PetscCall(PetscFree(id_r));
1842:   PetscCall(PetscFree(len_r));
1843:   PetscCall(PetscFree(buf_ri[0]));
1844:   PetscCall(PetscFree(buf_ri));
1845:   PetscCall(PetscFree(buf_rj[0]));
1846:   PetscCall(PetscFree(buf_rj));
1847:   PetscCall(PetscLayoutDestroy(&rowmap));

1849:   PetscCall(PetscCalloc1(pN, &ptap->apa));

1851:   /* attach the supporting struct to Cmpi for reuse */
1852:   Cmpi->product->data    = ptap;
1853:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1854:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1856:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1857:   Cmpi->assembled = PETSC_FALSE;
1858:   PetscFunctionReturn(PETSC_SUCCESS);
1859: }

1861: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
1862: {
1863:   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1864:   Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
1865:   Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
1866:   Mat_APMPI         *ptap;
1867:   Mat                AP_loc, C_loc, C_oth;
1868:   PetscInt           i, rstart, rend, cm, ncols, row;
1869:   PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
1870:   PetscScalar       *apa;
1871:   const PetscInt    *cols;
1872:   const PetscScalar *vals;

1874:   PetscFunctionBegin;
1875:   MatCheckProduct(C, 3);
1876:   ptap = (Mat_APMPI *)C->product->data;
1877:   PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
1878:   PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

1880:   PetscCall(MatZeroEntries(C));
1881:   /* 1) get Rd = Pd^T, Ro = Po^T */
1882:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1883:     PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
1884:     PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
1885:   }

1887:   /* 2) get AP_loc */
1888:   AP_loc = ptap->AP_loc;
1889:   ap     = (Mat_SeqAIJ *)AP_loc->data;

1891:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1892:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1893:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1894:     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1895:     PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
1896:   }

1898:   /* 2-2) compute numeric A_loc*P - dominating part */
1899:   /* get data from symbolic products */
1900:   p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1901:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1902:   apa = ptap->apa;
1903:   api = ap->i;
1904:   apj = ap->j;
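  /* apa is a dense work array of length pN allocated in the symbolic phase: each row of AP is
     accumulated into it, gathered into the sparse storage ap->a through the precomputed pattern
     apj, and the touched entries are zeroed again for the next row. */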
1905:   for (i = 0; i < am; i++) {
1906:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1907:     AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1908:     apnz = api[i + 1] - api[i];
1909:     for (j = 0; j < apnz; j++) {
1910:       col                 = apj[j + api[i]];
1911:       ap->a[j + ap->i[i]] = apa[col];
1912:       apa[col]            = 0.0;
1913:     }
1914:   }
1915:   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
1916:   PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));

1918:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1919:   PetscCall(MatProductNumeric(ptap->C_loc));
1920:   PetscCall(MatProductNumeric(ptap->C_oth));
1921:   C_loc = ptap->C_loc;
1922:   C_oth = ptap->C_oth;

1924:   /* add C_loc and Co to C */
1925:   PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

1927:   /* C_loc -> C */
1928:   cm    = C_loc->rmap->N;
1929:   c_seq = (Mat_SeqAIJ *)C_loc->data;
1930:   cols  = c_seq->j;
1931:   vals  = c_seq->a;

1933:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1934:   /* when there are no off-processor parts.  */
1935:   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1936:   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1937:   /* a table, and additional bookkeeping has to be done. */
1938:   if (C->assembled) {
1939:     C->was_assembled = PETSC_TRUE;
1940:     C->assembled     = PETSC_FALSE;
1941:   }
1942:   if (C->was_assembled) {
1943:     for (i = 0; i < cm; i++) {
1944:       ncols = c_seq->i[i + 1] - c_seq->i[i];
1945:       row   = rstart + i;
1946:       PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1947:       cols += ncols;
1948:       vals += ncols;
1949:     }
1950:   } else {
1951:     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
1952:   }

1954:   /* Co -> C, off-processor part */
1955:   cm    = C_oth->rmap->N;
1956:   c_seq = (Mat_SeqAIJ *)C_oth->data;
1957:   cols  = c_seq->j;
1958:   vals  = c_seq->a;
1959:   for (i = 0; i < cm; i++) {
1960:     ncols = c_seq->i[i + 1] - c_seq->i[i];
1961:     row   = p->garray[i];
1962:     PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1963:     cols += ncols;
1964:     vals += ncols;
1965:   }

1967:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1968:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

1970:   ptap->reuse = MAT_REUSE_MATRIX;
1971:   PetscFunctionReturn(PETSC_SUCCESS);
1972: }

1974: PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
1975: {
1976:   Mat_Product        *product = C->product;
1977:   Mat                 A = product->A, P = product->B;
1978:   MatProductAlgorithm alg  = product->alg;
1979:   PetscReal           fill = product->fill;
1980:   PetscBool           flg;

1982:   PetscFunctionBegin;
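  /* product->alg is the algorithm name set with MatProductSetAlgorithm() (or from the options
     database); each recognized name installs the matching symbolic routine, and every branch that
     jumps to 'next' drives the numeric phase through MatProductNumeric_PtAP(). */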
1983:   /* scalable: do R=P^T locally, then C=R*A*P */
1984:   PetscCall(PetscStrcmp(alg, "scalable", &flg));
1985:   if (flg) {
1986:     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
1987:     C->ops->productnumeric = MatProductNumeric_PtAP;
1988:     goto next;
1989:   }

1991:   /* nonscalable: do R=P^T locally, then C=R*A*P, accumulating each row of A*P in a dense work array of length pN (see MatPtAPSymbolic_MPIAIJ_MPIAIJ above) */
1992:   PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
1993:   if (flg) {
1994:     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
1995:     goto next;
1996:   }

1998:   /* allatonce */
1999:   PetscCall(PetscStrcmp(alg, "allatonce", &flg));
2000:   if (flg) {
2001:     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
2002:     goto next;
2003:   }

2005:   /* allatonce_merged */
2006:   PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
2007:   if (flg) {
2008:     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
2009:     goto next;
2010:   }

2012:   /* backend general code */
2013:   PetscCall(PetscStrcmp(alg, "backend", &flg));
2014:   if (flg) {
2015:     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
2016:     PetscFunctionReturn(PETSC_SUCCESS);
2017:   }

2019:   /* hypre */
2020: #if defined(PETSC_HAVE_HYPRE)
2021:   PetscCall(PetscStrcmp(alg, "hypre", &flg));
2022:   if (flg) {
2023:     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
2024:     C->ops->productnumeric = MatProductNumeric_PtAP;
2025:     PetscFunctionReturn(PETSC_SUCCESS);
2026:   }
2027: #endif
2028:   SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");

2030: next:
2031:   C->ops->productnumeric = MatProductNumeric_PtAP;
2032:   PetscFunctionReturn(PETSC_SUCCESS);
2033: }