Actual source code: matmatmult.c


  2: /*
  3:   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
  4:           C = A * B
  5: */

  7: #include <../src/mat/impls/aij/seq/aij.h>
  8: #include <../src/mat/utils/freespace.h>
  9: #include <petscbt.h>
 10: #include <petsc/private/isimpl.h>
 11: #include <../src/mat/impls/dense/seq/dense.h>

 13: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
 14: {
 15:   PetscFunctionBegin;
 16:   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
 17:   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
 18:   PetscFunctionReturn(PETSC_SUCCESS);
 19: }

 21: /* Modified from MatCreateSeqAIJWithArrays() */
 22: PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
 23: {
 24:   PetscInt    ii;
 25:   Mat_SeqAIJ *aij;
 26:   PetscBool   isseqaij, osingle, ofree_a, ofree_ij;

 28:   PetscFunctionBegin;
 29:   PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
 30:   PetscCall(MatSetSizes(mat, m, n, m, n));

 32:   if (!mtype) {
 33:     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
 34:     if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
 35:   } else {
 36:     PetscCall(MatSetType(mat, mtype));
 37:   }

 39:   aij      = (Mat_SeqAIJ *)(mat)->data;
 40:   osingle  = aij->singlemalloc;
 41:   ofree_a  = aij->free_a;
 42:   ofree_ij = aij->free_ij;
 43:   /* changes the free flags */
 44:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));

 46:   PetscCall(PetscFree(aij->ilen));
 47:   PetscCall(PetscFree(aij->imax));
 48:   PetscCall(PetscMalloc1(m, &aij->imax));
 49:   PetscCall(PetscMalloc1(m, &aij->ilen));
 50:   for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
 51:     const PetscInt rnz = i[ii + 1] - i[ii];
 52:     aij->nonzerorowcnt += !!rnz;
 53:     aij->rmax     = PetscMax(aij->rmax, rnz);
 54:     aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
 55:   }
 56:   aij->maxnz = i[m];
 57:   aij->nz    = i[m];

 59:   if (osingle) {
 60:     PetscCall(PetscFree3(aij->a, aij->j, aij->i));
 61:   } else {
 62:     if (ofree_a) PetscCall(PetscFree(aij->a));
 63:     if (ofree_ij) PetscCall(PetscFree(aij->j));
 64:     if (ofree_ij) PetscCall(PetscFree(aij->i));
 65:   }
 66:   aij->i     = i;
 67:   aij->j     = j;
 68:   aij->a     = a;
 69:   aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
 70:   /* default to not retain ownership */
 71:   aij->singlemalloc = PETSC_FALSE;
 72:   aij->free_a       = PETSC_FALSE;
 73:   aij->free_ij      = PETSC_FALSE;
 74:   PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
 75:   PetscFunctionReturn(PETSC_SUCCESS);
 76: }

 78: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
 79: {
 80:   Mat_Product        *product = C->product;
 81:   MatProductAlgorithm alg;
 82:   PetscBool           flg;

 84:   PetscFunctionBegin;
 85:   if (product) {
 86:     alg = product->alg;
 87:   } else {
 88:     alg = "sorted";
 89:   }
 90:   /* sorted */
 91:   PetscCall(PetscStrcmp(alg, "sorted", &flg));
 92:   if (flg) {
 93:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
 94:     PetscFunctionReturn(PETSC_SUCCESS);
 95:   }

 97:   /* scalable */
 98:   PetscCall(PetscStrcmp(alg, "scalable", &flg));
 99:   if (flg) {
100:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
101:     PetscFunctionReturn(PETSC_SUCCESS);
102:   }

104:   /* scalable_fast */
105:   PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
106:   if (flg) {
107:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
108:     PetscFunctionReturn(PETSC_SUCCESS);
109:   }

111:   /* heap */
112:   PetscCall(PetscStrcmp(alg, "heap", &flg));
113:   if (flg) {
114:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
115:     PetscFunctionReturn(PETSC_SUCCESS);
116:   }

118:   /* btheap */
119:   PetscCall(PetscStrcmp(alg, "btheap", &flg));
120:   if (flg) {
121:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
122:     PetscFunctionReturn(PETSC_SUCCESS);
123:   }

125:   /* llcondensed */
126:   PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
127:   if (flg) {
128:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
129:     PetscFunctionReturn(PETSC_SUCCESS);
130:   }

132:   /* rowmerge */
133:   PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
134:   if (flg) {
135:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
136:     PetscFunctionReturn(PETSC_SUCCESS);
137:   }

139: #if defined(PETSC_HAVE_HYPRE)
140:   PetscCall(PetscStrcmp(alg, "hypre", &flg));
141:   if (flg) {
142:     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
143:     PetscFunctionReturn(PETSC_SUCCESS);
144:   }
145: #endif

147:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
148: }

150: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
151: {
152:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
153:   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
154:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
155:   PetscReal          afill;
156:   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
157:   PetscHMapI         ta;
158:   PetscBT            lnkbt;
159:   PetscFreeSpaceList free_space = NULL, current_space = NULL;

161:   PetscFunctionBegin;
162:   /* Get ci and cj */
163:   /* Allocate ci array, arrays for fill computation and */
164:   /* free space for accumulating nonzero column info */
165:   PetscCall(PetscMalloc1(am + 2, &ci));
166:   ci[0] = 0;

168:   /* create and initialize a linked list */
169:   PetscCall(PetscHMapICreateWithSize(bn, &ta));
170:   MatRowMergeMax_SeqAIJ(b, bm, ta);
171:   PetscCall(PetscHMapIGetSize(ta, &Crmax));
172:   PetscCall(PetscHMapIDestroy(&ta));

174:   PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));

176:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
177:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));

179:   current_space = free_space;

181:   /* Determine ci and cj */
182:   for (i = 0; i < am; i++) {
183:     anzi = ai[i + 1] - ai[i];
184:     aj   = a->j + ai[i];
185:     for (j = 0; j < anzi; j++) {
186:       brow = aj[j];
187:       bnzj = bi[brow + 1] - bi[brow];
188:       bj   = b->j + bi[brow];
189:       /* add non-zero cols of B into the sorted linked list lnk */
190:       PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
191:     }
192:     /* add possible missing diagonal entry */
193:     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
194:     cnzi = lnk[0];

196:     /* If free space is not available, make more free space */
197:     /* Double the amount of total space in the list */
198:     if (current_space->local_remaining < cnzi) {
199:       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
200:       ndouble++;
201:     }

203:     /* Copy data into free space, then initialize lnk */
204:     PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));

206:     current_space->array += cnzi;
207:     current_space->local_used += cnzi;
208:     current_space->local_remaining -= cnzi;

210:     ci[i + 1] = ci[i] + cnzi;
211:   }

213:   /* Column indices are in the list of free space */
214:   /* Allocate space for cj, initialize cj, and */
215:   /* destroy list of free space and other temporary array(s) */
216:   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
217:   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
218:   PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));

220:   /* put together the new symbolic matrix */
221:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
222:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

224:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
225:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
226:   c          = (Mat_SeqAIJ *)(C->data);
227:   c->free_a  = PETSC_FALSE;
228:   c->free_ij = PETSC_TRUE;
229:   c->nonew   = 0;

231:   /* fast, needs non-scalable O(bn) array 'abdense' */
232:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

234:   /* set MatInfo */
235:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
236:   if (afill < 1.0) afill = 1.0;
237:   C->info.mallocs           = ndouble;
238:   C->info.fill_ratio_given  = fill;
239:   C->info.fill_ratio_needed = afill;

241: #if defined(PETSC_USE_INFO)
242:   if (ci[am]) {
243:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
244:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
245:   } else {
246:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
247:   }
248: #endif
249:   PetscFunctionReturn(PETSC_SUCCESS);
250: }

252: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
253: {
254:   PetscLogDouble     flops = 0.0;
255:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
256:   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
257:   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
258:   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
259:   PetscInt           am = A->rmap->n, cm = C->rmap->n;
260:   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
261:   PetscScalar       *ca, valtmp;
262:   PetscScalar       *ab_dense;
263:   PetscContainer     cab_dense;
264:   const PetscScalar *aa, *ba, *baj;

266:   PetscFunctionBegin;
267:   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
268:   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
269:   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
270:     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
271:     c->a      = ca;
272:     c->free_a = PETSC_TRUE;
273:   } else ca = c->a;

275:   /* TODO this should be done in the symbolic phase */
276:   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
277:      that is hard to eradicate) */
278:   PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
279:   if (!cab_dense) {
280:     PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
281:     PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense));
282:     PetscCall(PetscContainerSetPointer(cab_dense, ab_dense));
283:     PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault));
284:     PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense));
285:     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
286:   }
287:   PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense));
288:   PetscCall(PetscArrayzero(ab_dense, B->cmap->N));

290:   /* clean old values in C */
291:   PetscCall(PetscArrayzero(ca, ci[cm]));
292:   /* Traverse A row-wise. */
293:   /* Build the ith row in C by summing over nonzero columns in A, */
294:   /* the rows of B corresponding to nonzeros of A. */
295:   for (i = 0; i < am; i++) {
296:     anzi = ai[i + 1] - ai[i];
297:     for (j = 0; j < anzi; j++) {
298:       brow = aj[j];
299:       bnzi = bi[brow + 1] - bi[brow];
300:       bjj  = bj + bi[brow];
301:       baj  = ba + bi[brow];
302:       /* perform dense axpy */
303:       valtmp = aa[j];
304:       for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
305:       flops += 2 * bnzi;
306:     }
307:     aj += anzi;
308:     aa += anzi;

310:     cnzi = ci[i + 1] - ci[i];
311:     for (k = 0; k < cnzi; k++) {
312:       ca[k] += ab_dense[cj[k]];
313:       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
314:     }
315:     flops += cnzi;
316:     cj += cnzi;
317:     ca += cnzi;
318:   }
319: #if defined(PETSC_HAVE_DEVICE)
320:   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
321: #endif
322:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
323:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
324:   PetscCall(PetscLogFlops(flops));
325:   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
326:   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
327:   PetscFunctionReturn(PETSC_SUCCESS);
328: }

330: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
331: {
332:   PetscLogDouble     flops = 0.0;
333:   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
334:   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
335:   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
336:   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
337:   PetscInt           am = A->rmap->N, cm = C->rmap->N;
338:   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
339:   PetscScalar       *ca = c->a, valtmp;
340:   const PetscScalar *aa, *ba, *baj;
341:   PetscInt           nextb;

343:   PetscFunctionBegin;
344:   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
345:   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
346:   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
347:     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
348:     c->a      = ca;
349:     c->free_a = PETSC_TRUE;
350:   }

352:   /* clean old values in C */
353:   PetscCall(PetscArrayzero(ca, ci[cm]));
354:   /* Traverse A row-wise. */
355:   /* Build the ith row in C by summing over nonzero columns in A, */
356:   /* the rows of B corresponding to nonzeros of A. */
357:   for (i = 0; i < am; i++) {
358:     anzi = ai[i + 1] - ai[i];
359:     cnzi = ci[i + 1] - ci[i];
360:     for (j = 0; j < anzi; j++) {
361:       brow = aj[j];
362:       bnzi = bi[brow + 1] - bi[brow];
363:       bjj  = bj + bi[brow];
364:       baj  = ba + bi[brow];
365:       /* perform sparse axpy */
366:       valtmp = aa[j];
367:       nextb  = 0;
368:       for (k = 0; nextb < bnzi; k++) {
369:         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
370:           ca[k] += valtmp * baj[nextb++];
371:         }
372:       }
373:       flops += 2 * bnzi;
374:     }
375:     aj += anzi;
376:     aa += anzi;
377:     cj += cnzi;
378:     ca += cnzi;
379:   }
380: #if defined(PETSC_HAVE_DEVICE)
381:   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
382: #endif
383:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
384:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
385:   PetscCall(PetscLogFlops(flops));
386:   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
387:   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
388:   PetscFunctionReturn(PETSC_SUCCESS);
389: }

391: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
392: {
393:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
394:   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
395:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
396:   MatScalar         *ca;
397:   PetscReal          afill;
398:   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
399:   PetscHMapI         ta;
400:   PetscFreeSpaceList free_space = NULL, current_space = NULL;

402:   PetscFunctionBegin;
403:   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
404:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
405:   PetscCall(PetscMalloc1(am + 2, &ci));
406:   ci[0] = 0;

408:   /* create and initialize a linked list */
409:   PetscCall(PetscHMapICreateWithSize(bn, &ta));
410:   MatRowMergeMax_SeqAIJ(b, bm, ta);
411:   PetscCall(PetscHMapIGetSize(ta, &Crmax));
412:   PetscCall(PetscHMapIDestroy(&ta));

414:   PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));

416:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
417:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
418:   current_space = free_space;

420:   /* Determine ci and cj */
421:   for (i = 0; i < am; i++) {
422:     anzi = ai[i + 1] - ai[i];
423:     aj   = a->j + ai[i];
424:     for (j = 0; j < anzi; j++) {
425:       brow = aj[j];
426:       bnzj = bi[brow + 1] - bi[brow];
427:       bj   = b->j + bi[brow];
428:       /* add non-zero cols of B into the sorted linked list lnk */
429:       PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
430:     }
431:     /* add possible missing diagonal entry */
432:     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
433:     cnzi = lnk[1];

435:     /* If free space is not available, make more free space */
436:     /* Double the amount of total space in the list */
437:     if (current_space->local_remaining < cnzi) {
438:       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
439:       ndouble++;
440:     }

442:     /* Copy data into free space, then initialize lnk */
443:     PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));

445:     current_space->array += cnzi;
446:     current_space->local_used += cnzi;
447:     current_space->local_remaining -= cnzi;

449:     ci[i + 1] = ci[i] + cnzi;
450:   }

452:   /* Column indices are in the list of free space */
453:   /* Allocate space for cj, initialize cj, and */
454:   /* destroy list of free space and other temporary array(s) */
455:   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
456:   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
457:   PetscCall(PetscLLCondensedDestroy_fast(lnk));

459:   /* Allocate space for ca */
460:   PetscCall(PetscCalloc1(ci[am] + 1, &ca));

462:   /* put together the new symbolic matrix */
463:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
464:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

466:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
467:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
468:   c          = (Mat_SeqAIJ *)(C->data);
469:   c->free_a  = PETSC_TRUE;
470:   c->free_ij = PETSC_TRUE;
471:   c->nonew   = 0;

473:   /* slower, less memory */
474:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;

476:   /* set MatInfo */
477:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
478:   if (afill < 1.0) afill = 1.0;
479:   C->info.mallocs           = ndouble;
480:   C->info.fill_ratio_given  = fill;
481:   C->info.fill_ratio_needed = afill;

483: #if defined(PETSC_USE_INFO)
484:   if (ci[am]) {
485:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
486:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
487:   } else {
488:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
489:   }
490: #endif
491:   PetscFunctionReturn(PETSC_SUCCESS);
492: }

494: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
495: {
496:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
497:   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
498:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
499:   MatScalar         *ca;
500:   PetscReal          afill;
501:   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
502:   PetscHMapI         ta;
503:   PetscFreeSpaceList free_space = NULL, current_space = NULL;

505:   PetscFunctionBegin;
506:   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
507:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
508:   PetscCall(PetscMalloc1(am + 2, &ci));
509:   ci[0] = 0;

511:   /* create and initialize a linked list */
512:   PetscCall(PetscHMapICreateWithSize(bn, &ta));
513:   MatRowMergeMax_SeqAIJ(b, bm, ta);
514:   PetscCall(PetscHMapIGetSize(ta, &Crmax));
515:   PetscCall(PetscHMapIDestroy(&ta));
516:   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));

518:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
519:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
520:   current_space = free_space;

522:   /* Determine ci and cj */
523:   for (i = 0; i < am; i++) {
524:     anzi = ai[i + 1] - ai[i];
525:     aj   = a->j + ai[i];
526:     for (j = 0; j < anzi; j++) {
527:       brow = aj[j];
528:       bnzj = bi[brow + 1] - bi[brow];
529:       bj   = b->j + bi[brow];
530:       /* add non-zero cols of B into the sorted linked list lnk */
531:       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
532:     }
533:     /* add possible missing diagonal entry */
534:     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));

536:     cnzi = lnk[0];

538:     /* If free space is not available, make more free space */
539:     /* Double the amount of total space in the list */
540:     if (current_space->local_remaining < cnzi) {
541:       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
542:       ndouble++;
543:     }

545:     /* Copy data into free space, then initialize lnk */
546:     PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));

548:     current_space->array += cnzi;
549:     current_space->local_used += cnzi;
550:     current_space->local_remaining -= cnzi;

552:     ci[i + 1] = ci[i] + cnzi;
553:   }

555:   /* Column indices are in the list of free space */
556:   /* Allocate space for cj, initialize cj, and */
557:   /* destroy list of free space and other temporary array(s) */
558:   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
559:   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
560:   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));

562:   /* Allocate space for ca */
563:   PetscCall(PetscCalloc1(ci[am] + 1, &ca));

565:   /* put together the new symbolic matrix */
566:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
567:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

569:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
570:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
571:   c          = (Mat_SeqAIJ *)(C->data);
572:   c->free_a  = PETSC_TRUE;
573:   c->free_ij = PETSC_TRUE;
574:   c->nonew   = 0;

576:   /* slower, less memory */
577:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;

579:   /* set MatInfo */
580:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
581:   if (afill < 1.0) afill = 1.0;
582:   C->info.mallocs           = ndouble;
583:   C->info.fill_ratio_given  = fill;
584:   C->info.fill_ratio_needed = afill;

586: #if defined(PETSC_USE_INFO)
587:   if (ci[am]) {
588:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
589:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
590:   } else {
591:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
592:   }
593: #endif
594:   PetscFunctionReturn(PETSC_SUCCESS);
595: }

597: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
598: {
599:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
600:   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
601:   PetscInt          *ci, *cj, *bb;
602:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
603:   PetscReal          afill;
604:   PetscInt           i, j, col, ndouble = 0;
605:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
606:   PetscHeap          h;

608:   PetscFunctionBegin;
609:   /* Get ci and cj - by merging sorted rows using a heap */
610:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
611:   PetscCall(PetscMalloc1(am + 2, &ci));
612:   ci[0] = 0;

614:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
615:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
616:   current_space = free_space;

618:   PetscCall(PetscHeapCreate(a->rmax, &h));
619:   PetscCall(PetscMalloc1(a->rmax, &bb));

621:   /* Determine ci and cj */
622:   for (i = 0; i < am; i++) {
623:     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
624:     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
625:     ci[i + 1]            = ci[i];
626:     /* Populate the min heap */
627:     for (j = 0; j < anzi; j++) {
628:       bb[j] = bi[acol[j]];           /* bb points at the start of the row */
629:       if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
630:         PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
631:       }
632:     }
633:     /* Pick off the min element, adding it to free space */
634:     PetscCall(PetscHeapPop(h, &j, &col));
635:     while (j >= 0) {
636:       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
637:         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
638:         ndouble++;
639:       }
640:       *(current_space->array++) = col;
641:       current_space->local_used++;
642:       current_space->local_remaining--;
643:       ci[i + 1]++;

645:       /* stash if anything else remains in this row of B */
646:       if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
647:       while (1) { /* pop and stash any other rows of B that also had an entry in this column */
648:         PetscInt j2, col2;
649:         PetscCall(PetscHeapPeek(h, &j2, &col2));
650:         if (col2 != col) break;
651:         PetscCall(PetscHeapPop(h, &j2, &col2));
652:         if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
653:       }
654:       /* Put any stashed elements back into the min heap */
655:       PetscCall(PetscHeapUnstash(h));
656:       PetscCall(PetscHeapPop(h, &j, &col));
657:     }
658:   }
659:   PetscCall(PetscFree(bb));
660:   PetscCall(PetscHeapDestroy(&h));

662:   /* Column indices are in the list of free space */
663:   /* Allocate space for cj, initialize cj, and */
664:   /* destroy list of free space and other temporary array(s) */
665:   PetscCall(PetscMalloc1(ci[am], &cj));
666:   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));

668:   /* put together the new symbolic matrix */
669:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
670:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

672:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
673:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
674:   c          = (Mat_SeqAIJ *)(C->data);
675:   c->free_a  = PETSC_TRUE;
676:   c->free_ij = PETSC_TRUE;
677:   c->nonew   = 0;

679:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

681:   /* set MatInfo */
682:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
683:   if (afill < 1.0) afill = 1.0;
684:   C->info.mallocs           = ndouble;
685:   C->info.fill_ratio_given  = fill;
686:   C->info.fill_ratio_needed = afill;

688: #if defined(PETSC_USE_INFO)
689:   if (ci[am]) {
690:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
691:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
692:   } else {
693:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
694:   }
695: #endif
696:   PetscFunctionReturn(PETSC_SUCCESS);
697: }

699: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
700: {
701:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
702:   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
703:   PetscInt          *ci, *cj, *bb;
704:   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
705:   PetscReal          afill;
706:   PetscInt           i, j, col, ndouble = 0;
707:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
708:   PetscHeap          h;
709:   PetscBT            bt;

711:   PetscFunctionBegin;
712:   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
713:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
714:   PetscCall(PetscMalloc1(am + 2, &ci));
715:   ci[0] = 0;

717:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
718:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));

720:   current_space = free_space;

722:   PetscCall(PetscHeapCreate(a->rmax, &h));
723:   PetscCall(PetscMalloc1(a->rmax, &bb));
724:   PetscCall(PetscBTCreate(bn, &bt));

726:   /* Determine ci and cj */
727:   for (i = 0; i < am; i++) {
728:     const PetscInt  anzi = ai[i + 1] - ai[i];    /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
729:     const PetscInt *acol = aj + ai[i];           /* column indices of nonzero entries in this row */
730:     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
731:     ci[i + 1]            = ci[i];
732:     /* Populate the min heap */
733:     for (j = 0; j < anzi; j++) {
734:       PetscInt brow = acol[j];
735:       for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
736:         PetscInt bcol = bj[bb[j]];
737:         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
738:           PetscCall(PetscHeapAdd(h, j, bcol));
739:           bb[j]++;
740:           break;
741:         }
742:       }
743:     }
744:     /* Pick off the min element, adding it to free space */
745:     PetscCall(PetscHeapPop(h, &j, &col));
746:     while (j >= 0) {
747:       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
748:         fptr = NULL;                            /* need PetscBTMemzero */
749:         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
750:         ndouble++;
751:       }
752:       *(current_space->array++) = col;
753:       current_space->local_used++;
754:       current_space->local_remaining--;
755:       ci[i + 1]++;

757:       /* stash if anything else remains in this row of B */
758:       for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
759:         PetscInt bcol = bj[bb[j]];
760:         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
761:           PetscCall(PetscHeapAdd(h, j, bcol));
762:           bb[j]++;
763:           break;
764:         }
765:       }
766:       PetscCall(PetscHeapPop(h, &j, &col));
767:     }
768:     if (fptr) { /* Clear the bits for this row */
769:       for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
770:     } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
771:       PetscCall(PetscBTMemzero(bn, bt));
772:     }
773:   }
774:   PetscCall(PetscFree(bb));
775:   PetscCall(PetscHeapDestroy(&h));
776:   PetscCall(PetscBTDestroy(&bt));

778:   /* Column indices are in the list of free space */
779:   /* Allocate space for cj, initialize cj, and */
780:   /* destroy list of free space and other temporary array(s) */
781:   PetscCall(PetscMalloc1(ci[am], &cj));
782:   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));

784:   /* put together the new symbolic matrix */
785:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
786:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

788:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
789:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
790:   c          = (Mat_SeqAIJ *)(C->data);
791:   c->free_a  = PETSC_TRUE;
792:   c->free_ij = PETSC_TRUE;
793:   c->nonew   = 0;

795:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

797:   /* set MatInfo */
798:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
799:   if (afill < 1.0) afill = 1.0;
800:   C->info.mallocs           = ndouble;
801:   C->info.fill_ratio_given  = fill;
802:   C->info.fill_ratio_needed = afill;

804: #if defined(PETSC_USE_INFO)
805:   if (ci[am]) {
806:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
807:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
808:   } else {
809:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
810:   }
811: #endif
812:   PetscFunctionReturn(PETSC_SUCCESS);
813: }

815: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
816: {
817:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
818:   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
819:   PetscInt       *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
820:   PetscInt        c_maxmem, a_maxrownnz = 0, a_rownnz;
821:   const PetscInt  workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
822:   const PetscInt  am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
823:   const PetscInt *brow_ptr[8], *brow_end[8];
824:   PetscInt        window[8];
825:   PetscInt        window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
826:   PetscInt        i, k, ndouble = 0, L1_rowsleft, rowsleft;
827:   PetscReal       afill;
828:   PetscInt       *workj_L1, *workj_L2, *workj_L3;
829:   PetscInt        L1_nnz, L2_nnz;

831:   /* Step 1: Get upper bound on memory required for allocation.
832:              Because of the way virtual memory works,
833:              only the memory pages that are actually needed will be physically allocated. */
834:   PetscFunctionBegin;
835:   PetscCall(PetscMalloc1(am + 1, &ci));
836:   for (i = 0; i < am; i++) {
837:     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
838:     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
839:     a_rownnz             = 0;
840:     for (k = 0; k < anzi; ++k) {
841:       a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
842:       if (a_rownnz > bn) {
843:         a_rownnz = bn;
844:         break;
845:       }
846:     }
847:     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
848:   }
849:   /* temporary work areas for merging rows */
850:   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
851:   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
852:   PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));

854:   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
855:   c_maxmem = 8 * (ai[am] + bi[bm]);
856:   /* Step 2: Populate pattern for C */
857:   PetscCall(PetscMalloc1(c_maxmem, &cj));

859:   ci_nnz      = 0;
860:   ci[0]       = 0;
861:   worki_L1[0] = 0;
862:   worki_L2[0] = 0;
863:   for (i = 0; i < am; i++) {
864:     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
865:     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
866:     rowsleft             = anzi;
867:     inputcol_L1          = acol;
868:     L2_nnz               = 0;
869:     L2_nrows             = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
870:     worki_L2[1]          = 0;
871:     outputi_nnz          = 0;

873:     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
874:     while (ci_nnz + a_maxrownnz > c_maxmem) {
875:       c_maxmem *= 2;
876:       ndouble++;
877:       PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
878:     }

880:     while (rowsleft) {
881:       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
882:       L1_nrows    = 0;
883:       L1_nnz      = 0;
884:       inputcol    = inputcol_L1;
885:       inputi      = bi;
886:       inputj      = bj;

888:       /* The following macro is used to specialize for small rows in A.
889:          This helps with compiler unrolling, improving performance substantially.
890:           Input:  inputj   inputi  inputcol  bn
891:           Output: outputj  outputi_nnz                       */
892: #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
893:   window_min  = bn; \
894:   outputi_nnz = 0; \
895:   for (k = 0; k < ANNZ; ++k) { \
896:     brow_ptr[k] = inputj + inputi[inputcol[k]]; \
897:     brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
898:     window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
899:     window_min  = PetscMin(window[k], window_min); \
900:   } \
901:   while (window_min < bn) { \
902:     outputj[outputi_nnz++] = window_min; \
903:     /* advance front and compute new minimum */ \
904:     old_window_min = window_min; \
905:     window_min     = bn; \
906:     for (k = 0; k < ANNZ; ++k) { \
907:       if (window[k] == old_window_min) { \
908:         brow_ptr[k]++; \
909:         window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
910:       } \
911:       window_min = PetscMin(window[k], window_min); \
912:     } \
913:   }

915:       /************** L E V E L  1 ***************/
916:       /* Merge up to 8 rows of B to L1 work array*/
917:       while (L1_rowsleft) {
918:         outputi_nnz = 0;
919:         if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
920:         else outputj = cj + ci_nnz;                /* Merge directly to C */

922:         switch (L1_rowsleft) {
923:         case 1:
924:           brow_ptr[0] = inputj + inputi[inputcol[0]];
925:           brow_end[0] = inputj + inputi[inputcol[0] + 1];
926:           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
927:           inputcol += L1_rowsleft;
928:           rowsleft -= L1_rowsleft;
929:           L1_rowsleft = 0;
930:           break;
931:         case 2:
932:           MatMatMultSymbolic_RowMergeMacro(2);
933:           inputcol += L1_rowsleft;
934:           rowsleft -= L1_rowsleft;
935:           L1_rowsleft = 0;
936:           break;
937:         case 3:
938:           MatMatMultSymbolic_RowMergeMacro(3);
939:           inputcol += L1_rowsleft;
940:           rowsleft -= L1_rowsleft;
941:           L1_rowsleft = 0;
942:           break;
943:         case 4:
944:           MatMatMultSymbolic_RowMergeMacro(4);
945:           inputcol += L1_rowsleft;
946:           rowsleft -= L1_rowsleft;
947:           L1_rowsleft = 0;
948:           break;
949:         case 5:
950:           MatMatMultSymbolic_RowMergeMacro(5);
951:           inputcol += L1_rowsleft;
952:           rowsleft -= L1_rowsleft;
953:           L1_rowsleft = 0;
954:           break;
955:         case 6:
956:           MatMatMultSymbolic_RowMergeMacro(6);
957:           inputcol += L1_rowsleft;
958:           rowsleft -= L1_rowsleft;
959:           L1_rowsleft = 0;
960:           break;
961:         case 7:
962:           MatMatMultSymbolic_RowMergeMacro(7);
963:           inputcol += L1_rowsleft;
964:           rowsleft -= L1_rowsleft;
965:           L1_rowsleft = 0;
966:           break;
967:         default:
968:           MatMatMultSymbolic_RowMergeMacro(8);
969:           inputcol += 8;
970:           rowsleft -= 8;
971:           L1_rowsleft -= 8;
972:           break;
973:         }
974:         inputcol_L1 = inputcol;
975:         L1_nnz += outputi_nnz;
976:         worki_L1[++L1_nrows] = L1_nnz;
977:       }

979:       /********************** L E V E L  2 ************************/
980:       /* Merge from L1 work array to either C or to L2 work array */
981:       if (anzi > 8) {
982:         inputi      = worki_L1;
983:         inputj      = workj_L1;
984:         inputcol    = workcol;
985:         outputi_nnz = 0;

987:         if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
988:         else outputj = workj_L2 + L2_nnz;      /* Merge from L1 work array to L2 work array */

990:         switch (L1_nrows) {
991:         case 1:
992:           brow_ptr[0] = inputj + inputi[inputcol[0]];
993:           brow_end[0] = inputj + inputi[inputcol[0] + 1];
994:           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
995:           break;
996:         case 2:
997:           MatMatMultSymbolic_RowMergeMacro(2);
998:           break;
999:         case 3:
1000:           MatMatMultSymbolic_RowMergeMacro(3);
1001:           break;
1002:         case 4:
1003:           MatMatMultSymbolic_RowMergeMacro(4);
1004:           break;
1005:         case 5:
1006:           MatMatMultSymbolic_RowMergeMacro(5);
1007:           break;
1008:         case 6:
1009:           MatMatMultSymbolic_RowMergeMacro(6);
1010:           break;
1011:         case 7:
1012:           MatMatMultSymbolic_RowMergeMacro(7);
1013:           break;
1014:         case 8:
1015:           MatMatMultSymbolic_RowMergeMacro(8);
1016:           break;
1017:         default:
1018:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1019:         }
1020:         L2_nnz += outputi_nnz;
1021:         worki_L2[++L2_nrows] = L2_nnz;

1023:         /************************ L E V E L  3 **********************/
1024:         /* Merge from L2 work array to either C or to L2 work array */
1025:         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1026:           inputi      = worki_L2;
1027:           inputj      = workj_L2;
1028:           inputcol    = workcol;
1029:           outputi_nnz = 0;
1030:           if (rowsleft) outputj = workj_L3;
1031:           else outputj = cj + ci_nnz;
1032:           switch (L2_nrows) {
1033:           case 1:
1034:             brow_ptr[0] = inputj + inputi[inputcol[0]];
1035:             brow_end[0] = inputj + inputi[inputcol[0] + 1];
1036:             for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1037:             break;
1038:           case 2:
1039:             MatMatMultSymbolic_RowMergeMacro(2);
1040:             break;
1041:           case 3:
1042:             MatMatMultSymbolic_RowMergeMacro(3);
1043:             break;
1044:           case 4:
1045:             MatMatMultSymbolic_RowMergeMacro(4);
1046:             break;
1047:           case 5:
1048:             MatMatMultSymbolic_RowMergeMacro(5);
1049:             break;
1050:           case 6:
1051:             MatMatMultSymbolic_RowMergeMacro(6);
1052:             break;
1053:           case 7:
1054:             MatMatMultSymbolic_RowMergeMacro(7);
1055:             break;
1056:           case 8:
1057:             MatMatMultSymbolic_RowMergeMacro(8);
1058:             break;
1059:           default:
1060:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1061:           }
1062:           L2_nrows    = 1;
1063:           L2_nnz      = outputi_nnz;
1064:           worki_L2[1] = outputi_nnz;
1065:           /* Copy to workj_L2 */
1066:           if (rowsleft) {
1067:             for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1068:           }
1069:         }
1070:       }
1071:     } /* while (rowsleft) */
1072: #undef MatMatMultSymbolic_RowMergeMacro

1074:     /* terminate current row */
1075:     ci_nnz += outputi_nnz;
1076:     ci[i + 1] = ci_nnz;
1077:   }

1079:   /* Step 3: Create the new symbolic matrix */
1080:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1081:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

1083:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1084:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1085:   c          = (Mat_SeqAIJ *)(C->data);
1086:   c->free_a  = PETSC_TRUE;
1087:   c->free_ij = PETSC_TRUE;
1088:   c->nonew   = 0;

1090:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

1092:   /* set MatInfo */
1093:   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1094:   if (afill < 1.0) afill = 1.0;
1095:   C->info.mallocs           = ndouble;
1096:   C->info.fill_ratio_given  = fill;
1097:   C->info.fill_ratio_needed = afill;

1099: #if defined(PETSC_USE_INFO)
1100:   if (ci[am]) {
1101:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1102:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1103:   } else {
1104:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1105:   }
1106: #endif

1108:   /* Step 4: Free temporary work areas */
1109:   PetscCall(PetscFree(workj_L1));
1110:   PetscCall(PetscFree(workj_L2));
1111:   PetscCall(PetscFree(workj_L3));
1112:   PetscFunctionReturn(PETSC_SUCCESS);
1113: }

1115: /* concatenate unique entries and then sort */
1116: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1117: {
1118:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1119:   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
1120:   PetscInt       *ci, *cj, bcol;
1121:   PetscInt        am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1122:   PetscReal       afill;
1123:   PetscInt        i, j, ndouble = 0;
1124:   PetscSegBuffer  seg, segrow;
1125:   char           *seen;

1127:   PetscFunctionBegin;
1128:   PetscCall(PetscMalloc1(am + 1, &ci));
1129:   ci[0] = 0;

1131:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1132:   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
1133:   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
1134:   PetscCall(PetscCalloc1(bn, &seen));

1136:   /* Determine ci and cj */
1137:   for (i = 0; i < am; i++) {
1138:     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1139:     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
1140:     PetscInt packlen     = 0, *PETSC_RESTRICT crow;

1142:     /* Pack segrow */
1143:     for (j = 0; j < anzi; j++) {
1144:       PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1145:       for (k = bjstart; k < bjend; k++) {
1146:         bcol = bj[k];
1147:         if (!seen[bcol]) { /* new entry */
1148:           PetscInt *PETSC_RESTRICT slot;
1149:           PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1150:           *slot      = bcol;
1151:           seen[bcol] = 1;
1152:           packlen++;
1153:         }
1154:       }
1155:     }

1157:     /* Check i-th diagonal entry */
1158:     if (C->force_diagonals && !seen[i]) {
1159:       PetscInt *PETSC_RESTRICT slot;
1160:       PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1161:       *slot   = i;
1162:       seen[i] = 1;
1163:       packlen++;
1164:     }

1166:     PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
1167:     PetscCall(PetscSegBufferExtractTo(segrow, crow));
1168:     PetscCall(PetscSortInt(packlen, crow));
1169:     ci[i + 1] = ci[i] + packlen;
1170:     for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1171:   }
1172:   PetscCall(PetscSegBufferDestroy(&segrow));
1173:   PetscCall(PetscFree(seen));

1175:   /* Column indices are in the segmented buffer */
1176:   PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
1177:   PetscCall(PetscSegBufferDestroy(&seg));

1179:   /* put together the new symbolic matrix */
1180:   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1181:   PetscCall(MatSetBlockSizesFromMats(C, A, B));

1183:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1184:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1185:   c          = (Mat_SeqAIJ *)(C->data);
1186:   c->free_a  = PETSC_TRUE;
1187:   c->free_ij = PETSC_TRUE;
1188:   c->nonew   = 0;

1190:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

1192:   /* set MatInfo */
1193:   afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1194:   if (afill < 1.0) afill = 1.0;
1195:   C->info.mallocs           = ndouble;
1196:   C->info.fill_ratio_given  = fill;
1197:   C->info.fill_ratio_needed = afill;

1199: #if defined(PETSC_USE_INFO)
1200:   if (ci[am]) {
1201:     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1202:     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1203:   } else {
1204:     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1205:   }
1206: #endif
1207:   PetscFunctionReturn(PETSC_SUCCESS);
1208: }

1210: PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1211: {
1212:   Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data;

1214:   PetscFunctionBegin;
1215:   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1216:   PetscCall(MatDestroy(&abt->Bt_den));
1217:   PetscCall(MatDestroy(&abt->ABt_den));
1218:   PetscCall(PetscFree(abt));
1219:   PetscFunctionReturn(PETSC_SUCCESS);
1220: }

1222: PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1223: {
1224:   Mat                  Bt;
1225:   Mat_MatMatTransMult *abt;
1226:   Mat_Product         *product = C->product;
1227:   char                *alg;

1229:   PetscFunctionBegin;
1230:   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1231:   PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");

1233:   /* create symbolic Bt */
1234:   PetscCall(MatTransposeSymbolic(B, &Bt));
1235:   PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
1236:   PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));

1238:   /* get symbolic C=A*Bt */
1239:   PetscCall(PetscStrallocpy(product->alg, &alg));
1240:   PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
1241:   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
1242:   PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
1243:   PetscCall(PetscFree(alg));

1245:   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1246:   PetscCall(PetscNew(&abt));

1248:   product->data    = abt;
1249:   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;

1251:   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;

1253:   abt->usecoloring = PETSC_FALSE;
1254:   PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
1255:   if (abt->usecoloring) {
1256:     /* Create MatTransposeColoring from symbolic C=A*B^T */
1257:     MatTransposeColoring matcoloring;
1258:     MatColoring          coloring;
1259:     ISColoring           iscoloring;
1260:     Mat                  Bt_dense, C_dense;

1262:     /* inode causes memory problem */
1263:     PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));

1265:     PetscCall(MatColoringCreate(C, &coloring));
1266:     PetscCall(MatColoringSetDistance(coloring, 2));
1267:     PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
1268:     PetscCall(MatColoringSetFromOptions(coloring));
1269:     PetscCall(MatColoringApply(coloring, &iscoloring));
1270:     PetscCall(MatColoringDestroy(&coloring));
1271:     PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));

1273:     abt->matcoloring = matcoloring;

1275:     PetscCall(ISColoringDestroy(&iscoloring));

1277:     /* Create Bt_dense and C_dense = A*Bt_dense */
1278:     PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
1279:     PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
1280:     PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
1281:     PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));

1283:     Bt_dense->assembled = PETSC_TRUE;
1284:     abt->Bt_den         = Bt_dense;

1286:     PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
1287:     PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
1288:     PetscCall(MatSetType(C_dense, MATSEQDENSE));
1289:     PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));

1291:     Bt_dense->assembled = PETSC_TRUE;
1292:     abt->ABt_den        = C_dense;

1294: #if defined(PETSC_USE_INFO)
1295:     {
1296:       Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
1297:       PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
1298:                           Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
1299:     }
1300: #endif
1301:   }
1302:   /* clean up */
1303:   PetscCall(MatDestroy(&Bt));
1304:   PetscFunctionReturn(PETSC_SUCCESS);
1305: }

1307: PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1308: {
1309:   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1310:   PetscInt            *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
1311:   PetscInt             cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
1312:   PetscLogDouble       flops = 0.0;
1313:   MatScalar           *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
1314:   Mat_MatMatTransMult *abt;
1315:   Mat_Product         *product = C->product;

1317:   PetscFunctionBegin;
1318:   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1319:   abt = (Mat_MatMatTransMult *)product->data;
1320:   PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1321:   /* clear old values in C */
1322:   if (!c->a) {
1323:     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
1324:     c->a      = ca;
1325:     c->free_a = PETSC_TRUE;
1326:   } else {
1327:     ca = c->a;
1328:     PetscCall(PetscArrayzero(ca, ci[cm] + 1));
1329:   }

1331:   if (abt->usecoloring) {
1332:     MatTransposeColoring matcoloring = abt->matcoloring;
1333:     Mat                  Bt_dense, C_dense = abt->ABt_den;

1335:     /* Get Bt_dense by Apply MatTransposeColoring to B */
1336:     Bt_dense = abt->Bt_den;
1337:     PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));

1339:     /* C_dense = A*Bt_dense */
1340:     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));

1342:     /* Recover C from C_dense */
1343:     PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
1344:     PetscFunctionReturn(PETSC_SUCCESS);
1345:   }

1347:   for (i = 0; i < cm; i++) {
1348:     anzi = ai[i + 1] - ai[i];
1349:     acol = aj + ai[i];
1350:     aval = aa + ai[i];
1351:     cnzi = ci[i + 1] - ci[i];
1352:     ccol = cj + ci[i];
1353:     cval = ca + ci[i];
1354:     for (j = 0; j < cnzi; j++) {
1355:       brow = ccol[j];
1356:       bnzj = bi[brow + 1] - bi[brow];
1357:       bcol = bj + bi[brow];
1358:       bval = ba + bi[brow];

1360:       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1361:       nexta = 0;
1362:       nextb = 0;
1363:       while (nexta < anzi && nextb < bnzj) {
1364:         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1365:         if (nexta == anzi) break;
1366:         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1367:         if (nextb == bnzj) break;
1368:         if (acol[nexta] == bcol[nextb]) {
1369:           cval[j] += aval[nexta] * bval[nextb];
1370:           nexta++;
1371:           nextb++;
1372:           flops += 2;
1373:         }
1374:       }
1375:     }
1376:   }
1377:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1378:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1379:   PetscCall(PetscLogFlops(flops));
1380:   PetscFunctionReturn(PETSC_SUCCESS);
1381: }

1383: PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1384: {
1385:   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data;

1387:   PetscFunctionBegin;
1388:   PetscCall(MatDestroy(&atb->At));
1389:   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
1390:   PetscCall(PetscFree(atb));
1391:   PetscFunctionReturn(PETSC_SUCCESS);
1392: }

1394: PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1395: {
1396:   Mat          At      = NULL;
1397:   Mat_Product *product = C->product;
1398:   PetscBool    flg, def, square;

1400:   PetscFunctionBegin;
1401:   MatCheckProduct(C, 4);
1402:   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
1403:   /* outerproduct */
1404:   PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
1405:   if (flg) {
1406:     /* create symbolic At */
1407:     if (!square) {
1408:       PetscCall(MatTransposeSymbolic(A, &At));
1409:       PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
1410:       PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1411:     }
1412:     /* get symbolic C=At*B */
1413:     PetscCall(MatProductSetAlgorithm(C, "sorted"));
1414:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));

1416:     /* clean up */
1417:     if (!square) PetscCall(MatDestroy(&At));

1419:     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1420:     PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
1421:     PetscFunctionReturn(PETSC_SUCCESS);
1422:   }

1424:   /* matmatmult */
1425:   PetscCall(PetscStrcmp(product->alg, "default", &def));
1426:   PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1427:   if (flg || def) {
1428:     Mat_MatTransMatMult *atb;

1430:     PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
1431:     PetscCall(PetscNew(&atb));
1432:     if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
1433:     PetscCall(MatProductSetAlgorithm(C, "sorted"));
1434:     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1435:     PetscCall(MatProductSetAlgorithm(C, "at*b"));
1436:     product->data    = atb;
1437:     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1438:     atb->At          = At;

1440:     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1441:     PetscFunctionReturn(PETSC_SUCCESS);
1442:   }

1444:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1445: }

1447: PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1448: {
1449:   Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1450:   PetscInt       am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1451:   PetscInt       cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1452:   PetscLogDouble flops = 0.0;
1453:   MatScalar     *aa    = a->a, *ba, *ca, *caj;

1455:   PetscFunctionBegin;
1456:   if (!c->a) {
1457:     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));

1459:     c->a      = ca;
1460:     c->free_a = PETSC_TRUE;
1461:   } else {
1462:     ca = c->a;
1463:     PetscCall(PetscArrayzero(ca, ci[cm]));
1464:   }

1466:   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1467:   for (i = 0; i < am; i++) {
1468:     bj   = b->j + bi[i];
1469:     ba   = b->a + bi[i];
1470:     bnzi = bi[i + 1] - bi[i];
1471:     anzi = ai[i + 1] - ai[i];
1472:     for (j = 0; j < anzi; j++) {
1473:       nextb = 0;
1474:       crow  = *aj++;
1475:       cjj   = cj + ci[crow];
1476:       caj   = ca + ci[crow];
1477:       /* perform sparse axpy operation.  Note cjj includes bj. */
1478:       for (k = 0; nextb < bnzi; k++) {
1479:         if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
1480:           caj[k] += (*aa) * (*(ba + nextb));
1481:           nextb++;
1482:         }
1483:       }
1484:       flops += 2 * bnzi;
1485:       aa++;
1486:     }
1487:   }

1489:   /* Assemble the final matrix and clean up */
1490:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1491:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1492:   PetscCall(PetscLogFlops(flops));
1493:   PetscFunctionReturn(PETSC_SUCCESS);
1494: }

1496: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1497: {
1498:   PetscFunctionBegin;
1499:   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1500:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1501:   PetscFunctionReturn(PETSC_SUCCESS);
1502: }

1504: PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1505: {
1506:   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
1507:   PetscScalar       *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1508:   const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1509:   const PetscInt    *aj;
1510:   PetscInt           cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
1511:   PetscInt           clda;
1512:   PetscInt           am4, bm4, col, i, j, n;

1514:   PetscFunctionBegin;
1515:   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1516:   PetscCall(MatSeqAIJGetArrayRead(A, &av));
1517:   if (add) {
1518:     PetscCall(MatDenseGetArray(C, &c));
1519:   } else {
1520:     PetscCall(MatDenseGetArrayWrite(C, &c));
1521:   }
1522:   PetscCall(MatDenseGetArrayRead(B, &b));
1523:   PetscCall(MatDenseGetLDA(B, &bm));
1524:   PetscCall(MatDenseGetLDA(C, &clda));
1525:   am4 = 4 * clda;
1526:   bm4 = 4 * bm;
1527:   b1  = b;
1528:   b2  = b1 + bm;
1529:   b3  = b2 + bm;
1530:   b4  = b3 + bm;
1531:   c1  = c;
1532:   c2  = c1 + clda;
1533:   c3  = c2 + clda;
1534:   c4  = c3 + clda;
1535:   for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
1536:     for (i = 0; i < am; i++) {                  /* over rows of A in those columns */
1537:       r1 = r2 = r3 = r4 = 0.0;
1538:       n                 = a->i[i + 1] - a->i[i];
1539:       aj                = a->j + a->i[i];
1540:       aa                = av + a->i[i];
1541:       for (j = 0; j < n; j++) {
1542:         const PetscScalar aatmp = aa[j];
1543:         const PetscInt    ajtmp = aj[j];
1544:         r1 += aatmp * b1[ajtmp];
1545:         r2 += aatmp * b2[ajtmp];
1546:         r3 += aatmp * b3[ajtmp];
1547:         r4 += aatmp * b4[ajtmp];
1548:       }
1549:       if (add) {
1550:         c1[i] += r1;
1551:         c2[i] += r2;
1552:         c3[i] += r3;
1553:         c4[i] += r4;
1554:       } else {
1555:         c1[i] = r1;
1556:         c2[i] = r2;
1557:         c3[i] = r3;
1558:         c4[i] = r4;
1559:       }
1560:     }
1561:     b1 += bm4;
1562:     b2 += bm4;
1563:     b3 += bm4;
1564:     b4 += bm4;
1565:     c1 += am4;
1566:     c2 += am4;
1567:     c3 += am4;
1568:     c4 += am4;
1569:   }
1570:   /* process remaining columns */
1571:   if (col != cn) {
1572:     PetscInt rc = cn - col;

1574:     if (rc == 1) {
1575:       for (i = 0; i < am; i++) {
1576:         r1 = 0.0;
1577:         n  = a->i[i + 1] - a->i[i];
1578:         aj = a->j + a->i[i];
1579:         aa = av + a->i[i];
1580:         for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
1581:         if (add) c1[i] += r1;
1582:         else c1[i] = r1;
1583:       }
1584:     } else if (rc == 2) {
1585:       for (i = 0; i < am; i++) {
1586:         r1 = r2 = 0.0;
1587:         n       = a->i[i + 1] - a->i[i];
1588:         aj      = a->j + a->i[i];
1589:         aa      = av + a->i[i];
1590:         for (j = 0; j < n; j++) {
1591:           const PetscScalar aatmp = aa[j];
1592:           const PetscInt    ajtmp = aj[j];
1593:           r1 += aatmp * b1[ajtmp];
1594:           r2 += aatmp * b2[ajtmp];
1595:         }
1596:         if (add) {
1597:           c1[i] += r1;
1598:           c2[i] += r2;
1599:         } else {
1600:           c1[i] = r1;
1601:           c2[i] = r2;
1602:         }
1603:       }
1604:     } else {
1605:       for (i = 0; i < am; i++) {
1606:         r1 = r2 = r3 = 0.0;
1607:         n            = a->i[i + 1] - a->i[i];
1608:         aj           = a->j + a->i[i];
1609:         aa           = av + a->i[i];
1610:         for (j = 0; j < n; j++) {
1611:           const PetscScalar aatmp = aa[j];
1612:           const PetscInt    ajtmp = aj[j];
1613:           r1 += aatmp * b1[ajtmp];
1614:           r2 += aatmp * b2[ajtmp];
1615:           r3 += aatmp * b3[ajtmp];
1616:         }
1617:         if (add) {
1618:           c1[i] += r1;
1619:           c2[i] += r2;
1620:           c3[i] += r3;
1621:         } else {
1622:           c1[i] = r1;
1623:           c2[i] = r2;
1624:           c3[i] = r3;
1625:         }
1626:       }
1627:     }
1628:   }
1629:   PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
1630:   if (add) {
1631:     PetscCall(MatDenseRestoreArray(C, &c));
1632:   } else {
1633:     PetscCall(MatDenseRestoreArrayWrite(C, &c));
1634:   }
1635:   PetscCall(MatDenseRestoreArrayRead(B, &b));
1636:   PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
1637:   PetscFunctionReturn(PETSC_SUCCESS);
1638: }

1640: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1641: {
1642:   PetscFunctionBegin;
1643:   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1644:   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1645:   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);

1647:   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
1648:   PetscFunctionReturn(PETSC_SUCCESS);
1649: }

1651: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1652: {
1653:   PetscFunctionBegin;
1654:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1655:   C->ops->productsymbolic = MatProductSymbolic_AB;
1656:   PetscFunctionReturn(PETSC_SUCCESS);
1657: }

1659: PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);

1661: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1662: {
1663:   PetscFunctionBegin;
1664:   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1665:   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1666:   PetscFunctionReturn(PETSC_SUCCESS);
1667: }

1669: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1670: {
1671:   PetscFunctionBegin;
1672:   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1673:   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1674:   PetscFunctionReturn(PETSC_SUCCESS);
1675: }

1677: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1678: {
1679:   Mat_Product *product = C->product;

1681:   PetscFunctionBegin;
1682:   switch (product->type) {
1683:   case MATPRODUCT_AB:
1684:     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1685:     break;
1686:   case MATPRODUCT_AtB:
1687:     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1688:     break;
1689:   case MATPRODUCT_ABt:
1690:     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1691:     break;
1692:   default:
1693:     break;
1694:   }
1695:   PetscFunctionReturn(PETSC_SUCCESS);
1696: }

1698: static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1699: {
1700:   Mat_Product *product = C->product;
1701:   Mat          A       = product->A;
1702:   PetscBool    baij;

1704:   PetscFunctionBegin;
1705:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
1706:   if (!baij) { /* A is seqsbaij */
1707:     PetscBool sbaij;
1708:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
1709:     PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");

1711:     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1712:   } else { /* A is seqbaij */
1713:     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1714:   }

1716:   C->ops->productsymbolic = MatProductSymbolic_AB;
1717:   PetscFunctionReturn(PETSC_SUCCESS);
1718: }

1720: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1721: {
1722:   Mat_Product *product = C->product;

1724:   PetscFunctionBegin;
1725:   MatCheckProduct(C, 1);
1726:   PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1727:   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1728:   PetscFunctionReturn(PETSC_SUCCESS);
1729: }

1731: static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1732: {
1733:   PetscFunctionBegin;
1734:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1735:   C->ops->productsymbolic = MatProductSymbolic_AB;
1736:   PetscFunctionReturn(PETSC_SUCCESS);
1737: }

1739: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1740: {
1741:   Mat_Product *product = C->product;

1743:   PetscFunctionBegin;
1744:   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1745:   PetscFunctionReturn(PETSC_SUCCESS);
1746: }

1748: PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1749: {
1750:   Mat_SeqAIJ   *b       = (Mat_SeqAIJ *)B->data;
1751:   Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
1752:   PetscInt     *bi = b->i, *bj = b->j;
1753:   PetscInt      m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
1754:   MatScalar    *btval, *btval_den, *ba = b->a;
1755:   PetscInt     *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;

1757:   PetscFunctionBegin;
1758:   btval_den = btdense->v;
1759:   PetscCall(PetscArrayzero(btval_den, m * n));
1760:   for (k = 0; k < ncolors; k++) {
1761:     ncolumns = coloring->ncolumns[k];
1762:     for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1763:       col   = *(columns + colorforcol[k] + l);
1764:       btcol = bj + bi[col];
1765:       btval = ba + bi[col];
1766:       anz   = bi[col + 1] - bi[col];
1767:       for (j = 0; j < anz; j++) {
1768:         brow            = btcol[j];
1769:         btval_den[brow] = btval[j];
1770:       }
1771:     }
1772:     btval_den += m;
1773:   }
1774:   PetscFunctionReturn(PETSC_SUCCESS);
1775: }

1777: PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1778: {
1779:   Mat_SeqAIJ        *csp = (Mat_SeqAIJ *)Csp->data;
1780:   const PetscScalar *ca_den, *ca_den_ptr;
1781:   PetscScalar       *ca = csp->a;
1782:   PetscInt           k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1783:   PetscInt           brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1784:   PetscInt           nrows, *row, *idx;
1785:   PetscInt          *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;

1787:   PetscFunctionBegin;
1788:   PetscCall(MatDenseGetArrayRead(Cden, &ca_den));

1790:   if (brows > 0) {
1791:     PetscInt *lstart, row_end, row_start;
1792:     lstart = matcoloring->lstart;
1793:     PetscCall(PetscArrayzero(lstart, ncolors));

1795:     row_end = brows;
1796:     if (row_end > m) row_end = m;
1797:     for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1798:       ca_den_ptr = ca_den;
1799:       for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1800:         nrows = matcoloring->nrows[k];
1801:         row   = rows + colorforrow[k];
1802:         idx   = den2sp + colorforrow[k];
1803:         for (l = lstart[k]; l < nrows; l++) {
1804:           if (row[l] >= row_end) {
1805:             lstart[k] = l;
1806:             break;
1807:           } else {
1808:             ca[idx[l]] = ca_den_ptr[row[l]];
1809:           }
1810:         }
1811:         ca_den_ptr += m;
1812:       }
1813:       row_end += brows;
1814:       if (row_end > m) row_end = m;
1815:     }
1816:   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1817:     ca_den_ptr = ca_den;
1818:     for (k = 0; k < ncolors; k++) {
1819:       nrows = matcoloring->nrows[k];
1820:       row   = rows + colorforrow[k];
1821:       idx   = den2sp + colorforrow[k];
1822:       for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1823:       ca_den_ptr += m;
1824:     }
1825:   }

1827:   PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1828: #if defined(PETSC_USE_INFO)
1829:   if (matcoloring->brows > 0) {
1830:     PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1831:   } else {
1832:     PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1833:   }
1834: #endif
1835:   PetscFunctionReturn(PETSC_SUCCESS);
1836: }

1838: PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1839: {
1840:   PetscInt        i, n, nrows, Nbs, j, k, m, ncols, col, cm;
1841:   const PetscInt *is, *ci, *cj, *row_idx;
1842:   PetscInt        nis = iscoloring->n, *rowhit, bs = 1;
1843:   IS             *isa;
1844:   Mat_SeqAIJ     *csp = (Mat_SeqAIJ *)mat->data;
1845:   PetscInt       *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1846:   PetscInt       *colorforcol, *columns, *columns_i, brows;
1847:   PetscBool       flg;

1849:   PetscFunctionBegin;
1850:   PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));

1852:   /* bs >1 is not being tested yet! */
1853:   Nbs       = mat->cmap->N / bs;
1854:   c->M      = mat->rmap->N / bs; /* set total rows, columns and local rows */
1855:   c->N      = Nbs;
1856:   c->m      = c->M;
1857:   c->rstart = 0;
1858:   c->brows  = 100;

1860:   c->ncolors = nis;
1861:   PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
1862:   PetscCall(PetscMalloc1(csp->nz + 1, &rows));
1863:   PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));

1865:   brows = c->brows;
1866:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1867:   if (flg) c->brows = brows;
1868:   if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));

1870:   colorforrow[0] = 0;
1871:   rows_i         = rows;
1872:   den2sp_i       = den2sp;

1874:   PetscCall(PetscMalloc1(nis + 1, &colorforcol));
1875:   PetscCall(PetscMalloc1(Nbs + 1, &columns));

1877:   colorforcol[0] = 0;
1878:   columns_i      = columns;

1880:   /* get column-wise storage of mat */
1881:   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));

1883:   cm = c->m;
1884:   PetscCall(PetscMalloc1(cm + 1, &rowhit));
1885:   PetscCall(PetscMalloc1(cm + 1, &idxhit));
1886:   for (i = 0; i < nis; i++) { /* loop over color */
1887:     PetscCall(ISGetLocalSize(isa[i], &n));
1888:     PetscCall(ISGetIndices(isa[i], &is));

1890:     c->ncolumns[i] = n;
1891:     if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1892:     colorforcol[i + 1] = colorforcol[i] + n;
1893:     columns_i += n;

1895:     /* fast, crude version requires O(N*N) work */
1896:     PetscCall(PetscArrayzero(rowhit, cm));

1898:     for (j = 0; j < n; j++) { /* loop over columns*/
1899:       col     = is[j];
1900:       row_idx = cj + ci[col];
1901:       m       = ci[col + 1] - ci[col];
1902:       for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1903:         idxhit[*row_idx]   = spidx[ci[col] + k];
1904:         rowhit[*row_idx++] = col + 1;
1905:       }
1906:     }
1907:     /* count the number of hits */
1908:     nrows = 0;
1909:     for (j = 0; j < cm; j++) {
1910:       if (rowhit[j]) nrows++;
1911:     }
1912:     c->nrows[i]        = nrows;
1913:     colorforrow[i + 1] = colorforrow[i] + nrows;

1915:     nrows = 0;
1916:     for (j = 0; j < cm; j++) { /* loop over rows */
1917:       if (rowhit[j]) {
1918:         rows_i[nrows]   = j;
1919:         den2sp_i[nrows] = idxhit[j];
1920:         nrows++;
1921:       }
1922:     }
1923:     den2sp_i += nrows;

1925:     PetscCall(ISRestoreIndices(isa[i], &is));
1926:     rows_i += nrows;
1927:   }
1928:   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1929:   PetscCall(PetscFree(rowhit));
1930:   PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
1931:   PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);

1933:   c->colorforrow = colorforrow;
1934:   c->rows        = rows;
1935:   c->den2sp      = den2sp;
1936:   c->colorforcol = colorforcol;
1937:   c->columns     = columns;

1939:   PetscCall(PetscFree(idxhit));
1940:   PetscFunctionReturn(PETSC_SUCCESS);
1941: }

1943: static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1944: {
1945:   Mat_Product *product = C->product;
1946:   Mat          A = product->A, B = product->B;

1948:   PetscFunctionBegin;
1949:   if (C->ops->mattransposemultnumeric) {
1950:     /* Alg: "outerproduct" */
1951:     PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
1952:   } else {
1953:     /* Alg: "matmatmult" -- C = At*B */
1954:     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;

1956:     PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1957:     if (atb->At) {
1958:       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
1959:          user may have called MatProductReplaceMats() to get this A=product->A */
1960:       PetscCall(MatTransposeSetPrecursor(A, atb->At));
1961:       PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
1962:     }
1963:     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
1964:   }
1965:   PetscFunctionReturn(PETSC_SUCCESS);
1966: }

1968: static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1969: {
1970:   Mat_Product *product = C->product;
1971:   Mat          A = product->A, B = product->B;
1972:   PetscReal    fill = product->fill;

1974:   PetscFunctionBegin;
1975:   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));

1977:   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1978:   PetscFunctionReturn(PETSC_SUCCESS);
1979: }

1981: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1982: {
1983:   Mat_Product *product = C->product;
1984:   PetscInt     alg     = 0; /* default algorithm */
1985:   PetscBool    flg     = PETSC_FALSE;
1986: #if !defined(PETSC_HAVE_HYPRE)
1987:   const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
1988:   PetscInt    nalg        = 7;
1989: #else
1990:   const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
1991:   PetscInt    nalg        = 8;
1992: #endif

1994:   PetscFunctionBegin;
1995:   /* Set default algorithm */
1996:   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
1997:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

1999:   /* Get runtime option */
2000:   if (product->api_user) {
2001:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
2002:     PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
2003:     PetscOptionsEnd();
2004:   } else {
2005:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
2006:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2007:     PetscOptionsEnd();
2008:   }
2009:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2011:   C->ops->productsymbolic = MatProductSymbolic_AB;
2012:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
2013:   PetscFunctionReturn(PETSC_SUCCESS);
2014: }

2016: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2017: {
2018:   Mat_Product *product     = C->product;
2019:   PetscInt     alg         = 0; /* default algorithm */
2020:   PetscBool    flg         = PETSC_FALSE;
2021:   const char  *algTypes[3] = {"default", "at*b", "outerproduct"};
2022:   PetscInt     nalg        = 3;

2024:   PetscFunctionBegin;
2025:   /* Get runtime option */
2026:   if (product->api_user) {
2027:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
2028:     PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2029:     PetscOptionsEnd();
2030:   } else {
2031:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
2032:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2033:     PetscOptionsEnd();
2034:   }
2035:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2037:   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2038:   PetscFunctionReturn(PETSC_SUCCESS);
2039: }

2041: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2042: {
2043:   Mat_Product *product     = C->product;
2044:   PetscInt     alg         = 0; /* default algorithm */
2045:   PetscBool    flg         = PETSC_FALSE;
2046:   const char  *algTypes[2] = {"default", "color"};
2047:   PetscInt     nalg        = 2;

2049:   PetscFunctionBegin;
2050:   /* Set default algorithm */
2051:   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2052:   if (!flg) {
2053:     alg = 1;
2054:     PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2055:   }

2057:   /* Get runtime option */
2058:   if (product->api_user) {
2059:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
2060:     PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2061:     PetscOptionsEnd();
2062:   } else {
2063:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
2064:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2065:     PetscOptionsEnd();
2066:   }
2067:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2069:   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2070:   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2071:   PetscFunctionReturn(PETSC_SUCCESS);
2072: }

2074: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2075: {
2076:   Mat_Product *product = C->product;
2077:   PetscBool    flg     = PETSC_FALSE;
2078:   PetscInt     alg     = 0; /* default algorithm -- alg=1 should be default!!! */
2079: #if !defined(PETSC_HAVE_HYPRE)
2080:   const char *algTypes[2] = {"scalable", "rap"};
2081:   PetscInt    nalg        = 2;
2082: #else
2083:   const char *algTypes[3] = {"scalable", "rap", "hypre"};
2084:   PetscInt    nalg        = 3;
2085: #endif

2087:   PetscFunctionBegin;
2088:   /* Set default algorithm */
2089:   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2090:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2092:   /* Get runtime option */
2093:   if (product->api_user) {
2094:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
2095:     PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2096:     PetscOptionsEnd();
2097:   } else {
2098:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
2099:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2100:     PetscOptionsEnd();
2101:   }
2102:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2104:   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2105:   PetscFunctionReturn(PETSC_SUCCESS);
2106: }

2108: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2109: {
2110:   Mat_Product *product     = C->product;
2111:   PetscBool    flg         = PETSC_FALSE;
2112:   PetscInt     alg         = 0; /* default algorithm */
2113:   const char  *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
2114:   PetscInt     nalg        = 3;

2116:   PetscFunctionBegin;
2117:   /* Set default algorithm */
2118:   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2119:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2121:   /* Get runtime option */
2122:   if (product->api_user) {
2123:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
2124:     PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2125:     PetscOptionsEnd();
2126:   } else {
2127:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
2128:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2129:     PetscOptionsEnd();
2130:   }
2131:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2133:   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2134:   PetscFunctionReturn(PETSC_SUCCESS);
2135: }

2137: /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2138: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2139: {
2140:   Mat_Product *product     = C->product;
2141:   PetscInt     alg         = 0; /* default algorithm */
2142:   PetscBool    flg         = PETSC_FALSE;
2143:   const char  *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
2144:   PetscInt     nalg        = 7;

2146:   PetscFunctionBegin;
2147:   /* Set default algorithm */
2148:   PetscCall(PetscStrcmp(product->alg, "default", &flg));
2149:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2151:   /* Get runtime option */
2152:   if (product->api_user) {
2153:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
2154:     PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2155:     PetscOptionsEnd();
2156:   } else {
2157:     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
2158:     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2159:     PetscOptionsEnd();
2160:   }
2161:   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

2163:   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2164:   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2165:   PetscFunctionReturn(PETSC_SUCCESS);
2166: }

2168: PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2169: {
2170:   Mat_Product *product = C->product;

2172:   PetscFunctionBegin;
2173:   switch (product->type) {
2174:   case MATPRODUCT_AB:
2175:     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2176:     break;
2177:   case MATPRODUCT_AtB:
2178:     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2179:     break;
2180:   case MATPRODUCT_ABt:
2181:     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2182:     break;
2183:   case MATPRODUCT_PtAP:
2184:     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2185:     break;
2186:   case MATPRODUCT_RARt:
2187:     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2188:     break;
2189:   case MATPRODUCT_ABC:
2190:     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2191:     break;
2192:   default:
2193:     break;
2194:   }
2195:   PetscFunctionReturn(PETSC_SUCCESS);
2196: }