Actual source code: baij2.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <../src/mat/impls/dense/seq/dense.h>
  3: #include <petsc/private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <petscblaslapack.h>

  7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  8:   #include <immintrin.h>
  9: #endif

 11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
 12: {
 13:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
 14:   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
 15:   const PetscInt *idx;
 16:   PetscInt        start, end, *ai, *aj, bs;
 17:   PetscBT         table;

 19:   PetscFunctionBegin;
 20:   m  = a->mbs;
 21:   ai = a->i;
 22:   aj = a->j;
 23:   bs = A->rmap->bs;

 25:   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");

 27:   PetscCall(PetscBTCreate(m, &table));
 28:   PetscCall(PetscMalloc1(m + 1, &nidx));

 30:   for (i = 0; i < is_max; i++) {
 31:     /* Initialise the two local arrays */
 32:     isz = 0;
 33:     PetscCall(PetscBTMemzero(m, table));

 35:     /* Extract the indices, assume there can be duplicate entries */
 36:     PetscCall(ISGetIndices(is[i], &idx));
 37:     PetscCall(ISGetLocalSize(is[i], &n));

 39:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 40:     for (j = 0; j < n; ++j) {
 41:       ival = idx[j] / bs; /* convert the indices into block indices */
 42:       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
 43:       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
 44:     }
 45:     PetscCall(ISRestoreIndices(is[i], &idx));
 46:     PetscCall(ISDestroy(&is[i]));

 48:     k = 0;
 49:     for (j = 0; j < ov; j++) { /* for each overlap*/
 50:       n = isz;
 51:       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
 52:         row   = nidx[k];
 53:         start = ai[row];
 54:         end   = ai[row + 1];
 55:         for (l = start; l < end; l++) {
 56:           val = aj[l];
 57:           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
 58:         }
 59:       }
 60:     }
 61:     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
 62:   }
 63:   PetscCall(PetscBTDestroy(&table));
 64:   PetscCall(PetscFree(nidx));
 65:   PetscFunctionReturn(PETSC_SUCCESS);
 66: }

 68: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
 69: {
 70:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
 71:   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
 72:   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
 73:   const PetscInt *irow, *icol;
 74:   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
 75:   PetscInt       *aj = a->j, *ai = a->i;
 76:   MatScalar      *mat_a;
 77:   Mat             C;
 78:   PetscBool       flag;

 80:   PetscFunctionBegin;
 81:   PetscCall(ISGetIndices(isrow, &irow));
 82:   PetscCall(ISGetIndices(iscol, &icol));
 83:   PetscCall(ISGetLocalSize(isrow, &nrows));
 84:   PetscCall(ISGetLocalSize(iscol, &ncols));

 86:   PetscCall(PetscCalloc1(1 + oldcols, &smap));
 87:   ssmap = smap;
 88:   PetscCall(PetscMalloc1(1 + nrows, &lens));
 89:   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
 90:   /* determine lens of each row */
 91:   for (i = 0; i < nrows; i++) {
 92:     kstart  = ai[irow[i]];
 93:     kend    = kstart + a->ilen[irow[i]];
 94:     lens[i] = 0;
 95:     for (k = kstart; k < kend; k++) {
 96:       if (ssmap[aj[k]]) lens[i]++;
 97:     }
 98:   }
 99:   /* Create and fill new matrix */
100:   if (scall == MAT_REUSE_MATRIX) {
101:     c = (Mat_SeqBAIJ *)((*B)->data);

103:     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
104:     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
105:     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
106:     PetscCall(PetscArrayzero(c->ilen, c->mbs));
107:     C = *B;
108:   } else {
109:     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
110:     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
111:     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
112:     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
113:   }
114:   c = (Mat_SeqBAIJ *)(C->data);
115:   for (i = 0; i < nrows; i++) {
116:     row      = irow[i];
117:     kstart   = ai[row];
118:     kend     = kstart + a->ilen[row];
119:     mat_i    = c->i[i];
120:     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
121:     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
122:     mat_ilen = c->ilen + i;
123:     for (k = kstart; k < kend; k++) {
124:       if ((tcol = ssmap[a->j[k]])) {
125:         *mat_j++ = tcol - 1;
126:         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
127:         mat_a += bs2;
128:         (*mat_ilen)++;
129:       }
130:     }
131:   }
132:   /* sort */
133:   if (c->j && c->a) {
134:     MatScalar *work;
135:     PetscCall(PetscMalloc1(bs2, &work));
136:     for (i = 0; i < nrows; i++) {
137:       PetscInt ilen;
138:       mat_i = c->i[i];
139:       mat_j = c->j + mat_i;
140:       mat_a = c->a + mat_i * bs2;
141:       ilen  = c->ilen[i];
142:       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
143:     }
144:     PetscCall(PetscFree(work));
145:   }

147:   /* Free work space */
148:   PetscCall(ISRestoreIndices(iscol, &icol));
149:   PetscCall(PetscFree(smap));
150:   PetscCall(PetscFree(lens));
151:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
152:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

154:   PetscCall(ISRestoreIndices(isrow, &irow));
155:   *B = C;
156:   PetscFunctionReturn(PETSC_SUCCESS);
157: }

159: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
160: {
161:   IS is1, is2;

163:   PetscFunctionBegin;
164:   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
165:   if (isrow == iscol) {
166:     is2 = is1;
167:     PetscCall(PetscObjectReference((PetscObject)is2));
168:   } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
169:   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
170:   PetscCall(ISDestroy(&is1));
171:   PetscCall(ISDestroy(&is2));
172:   PetscFunctionReturn(PETSC_SUCCESS);
173: }

175: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
176: {
177:   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
178:   Mat_SubSppt *submatj = c->submatis1;

180:   PetscFunctionBegin;
181:   PetscCall((*submatj->destroy)(C));
182:   PetscCall(MatDestroySubMatrix_Private(submatj));
183:   PetscFunctionReturn(PETSC_SUCCESS);
184: }

186: /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
187: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
188: {
189:   PetscInt     i;
190:   Mat          C;
191:   Mat_SeqBAIJ *c;
192:   Mat_SubSppt *submatj;

194:   PetscFunctionBegin;
195:   for (i = 0; i < n; i++) {
196:     C       = (*mat)[i];
197:     c       = (Mat_SeqBAIJ *)C->data;
198:     submatj = c->submatis1;
199:     if (submatj) {
200:       if (--((PetscObject)C)->refct <= 0) {
201:         PetscCall(PetscFree(C->factorprefix));
202:         PetscCall((*submatj->destroy)(C));
203:         PetscCall(MatDestroySubMatrix_Private(submatj));
204:         PetscCall(PetscFree(C->defaultvectype));
205:         PetscCall(PetscFree(C->defaultrandtype));
206:         PetscCall(PetscLayoutDestroy(&C->rmap));
207:         PetscCall(PetscLayoutDestroy(&C->cmap));
208:         PetscCall(PetscHeaderDestroy(&C));
209:       }
210:     } else {
211:       PetscCall(MatDestroy(&C));
212:     }
213:   }

215:   /* Destroy Dummy submatrices created for reuse */
216:   PetscCall(MatDestroySubMatrices_Dummy(n, mat));

218:   PetscCall(PetscFree(*mat));
219:   PetscFunctionReturn(PETSC_SUCCESS);
220: }

222: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
223: {
224:   PetscInt i;

226:   PetscFunctionBegin;
227:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));

229:   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
230:   PetscFunctionReturn(PETSC_SUCCESS);
231: }

233: /* Should check that shapes of vectors and matrices match */
234: PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
235: {
236:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
237:   PetscScalar       *z, sum;
238:   const PetscScalar *x;
239:   const MatScalar   *v;
240:   PetscInt           mbs, i, n;
241:   const PetscInt    *idx, *ii, *ridx = NULL;
242:   PetscBool          usecprow = a->compressedrow.use;

244:   PetscFunctionBegin;
245:   PetscCall(VecGetArrayRead(xx, &x));
246:   PetscCall(VecGetArrayWrite(zz, &z));

248:   if (usecprow) {
249:     mbs  = a->compressedrow.nrows;
250:     ii   = a->compressedrow.i;
251:     ridx = a->compressedrow.rindex;
252:     PetscCall(PetscArrayzero(z, a->mbs));
253:   } else {
254:     mbs = a->mbs;
255:     ii  = a->i;
256:   }

258:   for (i = 0; i < mbs; i++) {
259:     n   = ii[1] - ii[0];
260:     v   = a->a + ii[0];
261:     idx = a->j + ii[0];
262:     ii++;
263:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
264:     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
265:     sum = 0.0;
266:     PetscSparseDensePlusDot(sum, x, v, idx, n);
267:     if (usecprow) {
268:       z[ridx[i]] = sum;
269:     } else {
270:       z[i] = sum;
271:     }
272:   }
273:   PetscCall(VecRestoreArrayRead(xx, &x));
274:   PetscCall(VecRestoreArrayWrite(zz, &z));
275:   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
276:   PetscFunctionReturn(PETSC_SUCCESS);
277: }

279: PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
280: {
281:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
282:   PetscScalar       *z = NULL, sum1, sum2, *zarray;
283:   const PetscScalar *x, *xb;
284:   PetscScalar        x1, x2;
285:   const MatScalar   *v;
286:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
287:   PetscBool          usecprow = a->compressedrow.use;

289:   PetscFunctionBegin;
290:   PetscCall(VecGetArrayRead(xx, &x));
291:   PetscCall(VecGetArrayWrite(zz, &zarray));

293:   idx = a->j;
294:   v   = a->a;
295:   if (usecprow) {
296:     mbs  = a->compressedrow.nrows;
297:     ii   = a->compressedrow.i;
298:     ridx = a->compressedrow.rindex;
299:     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
300:   } else {
301:     mbs = a->mbs;
302:     ii  = a->i;
303:     z   = zarray;
304:   }

306:   for (i = 0; i < mbs; i++) {
307:     n = ii[1] - ii[0];
308:     ii++;
309:     sum1 = 0.0;
310:     sum2 = 0.0;
311:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
312:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
313:     for (j = 0; j < n; j++) {
314:       xb = x + 2 * (*idx++);
315:       x1 = xb[0];
316:       x2 = xb[1];
317:       sum1 += v[0] * x1 + v[2] * x2;
318:       sum2 += v[1] * x1 + v[3] * x2;
319:       v += 4;
320:     }
321:     if (usecprow) z = zarray + 2 * ridx[i];
322:     z[0] = sum1;
323:     z[1] = sum2;
324:     if (!usecprow) z += 2;
325:   }
326:   PetscCall(VecRestoreArrayRead(xx, &x));
327:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
328:   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
329:   PetscFunctionReturn(PETSC_SUCCESS);
330: }

332: PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
333: {
334:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
335:   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
336:   const PetscScalar *x, *xb;
337:   const MatScalar   *v;
338:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
339:   PetscBool          usecprow = a->compressedrow.use;

341: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
342:   #pragma disjoint(*v, *z, *xb)
343: #endif

345:   PetscFunctionBegin;
346:   PetscCall(VecGetArrayRead(xx, &x));
347:   PetscCall(VecGetArrayWrite(zz, &zarray));

349:   idx = a->j;
350:   v   = a->a;
351:   if (usecprow) {
352:     mbs  = a->compressedrow.nrows;
353:     ii   = a->compressedrow.i;
354:     ridx = a->compressedrow.rindex;
355:     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
356:   } else {
357:     mbs = a->mbs;
358:     ii  = a->i;
359:     z   = zarray;
360:   }

362:   for (i = 0; i < mbs; i++) {
363:     n = ii[1] - ii[0];
364:     ii++;
365:     sum1 = 0.0;
366:     sum2 = 0.0;
367:     sum3 = 0.0;
368:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
369:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
370:     for (j = 0; j < n; j++) {
371:       xb = x + 3 * (*idx++);
372:       x1 = xb[0];
373:       x2 = xb[1];
374:       x3 = xb[2];

376:       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
377:       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
378:       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
379:       v += 9;
380:     }
381:     if (usecprow) z = zarray + 3 * ridx[i];
382:     z[0] = sum1;
383:     z[1] = sum2;
384:     z[2] = sum3;
385:     if (!usecprow) z += 3;
386:   }
387:   PetscCall(VecRestoreArrayRead(xx, &x));
388:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
389:   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
390:   PetscFunctionReturn(PETSC_SUCCESS);
391: }

393: PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
394: {
395:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
396:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
397:   const PetscScalar *x, *xb;
398:   const MatScalar   *v;
399:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
400:   PetscBool          usecprow = a->compressedrow.use;

402:   PetscFunctionBegin;
403:   PetscCall(VecGetArrayRead(xx, &x));
404:   PetscCall(VecGetArrayWrite(zz, &zarray));

406:   idx = a->j;
407:   v   = a->a;
408:   if (usecprow) {
409:     mbs  = a->compressedrow.nrows;
410:     ii   = a->compressedrow.i;
411:     ridx = a->compressedrow.rindex;
412:     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
413:   } else {
414:     mbs = a->mbs;
415:     ii  = a->i;
416:     z   = zarray;
417:   }

419:   for (i = 0; i < mbs; i++) {
420:     n = ii[1] - ii[0];
421:     ii++;
422:     sum1 = 0.0;
423:     sum2 = 0.0;
424:     sum3 = 0.0;
425:     sum4 = 0.0;

427:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
428:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
429:     for (j = 0; j < n; j++) {
430:       xb = x + 4 * (*idx++);
431:       x1 = xb[0];
432:       x2 = xb[1];
433:       x3 = xb[2];
434:       x4 = xb[3];
435:       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
436:       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
437:       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
438:       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
439:       v += 16;
440:     }
441:     if (usecprow) z = zarray + 4 * ridx[i];
442:     z[0] = sum1;
443:     z[1] = sum2;
444:     z[2] = sum3;
445:     z[3] = sum4;
446:     if (!usecprow) z += 4;
447:   }
448:   PetscCall(VecRestoreArrayRead(xx, &x));
449:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
450:   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
451:   PetscFunctionReturn(PETSC_SUCCESS);
452: }

454: PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
455: {
456:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
457:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
458:   const PetscScalar *xb, *x;
459:   const MatScalar   *v;
460:   const PetscInt    *idx, *ii, *ridx = NULL;
461:   PetscInt           mbs, i, j, n;
462:   PetscBool          usecprow = a->compressedrow.use;

464:   PetscFunctionBegin;
465:   PetscCall(VecGetArrayRead(xx, &x));
466:   PetscCall(VecGetArrayWrite(zz, &zarray));

468:   idx = a->j;
469:   v   = a->a;
470:   if (usecprow) {
471:     mbs  = a->compressedrow.nrows;
472:     ii   = a->compressedrow.i;
473:     ridx = a->compressedrow.rindex;
474:     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
475:   } else {
476:     mbs = a->mbs;
477:     ii  = a->i;
478:     z   = zarray;
479:   }

481:   for (i = 0; i < mbs; i++) {
482:     n = ii[1] - ii[0];
483:     ii++;
484:     sum1 = 0.0;
485:     sum2 = 0.0;
486:     sum3 = 0.0;
487:     sum4 = 0.0;
488:     sum5 = 0.0;
489:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
490:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
491:     for (j = 0; j < n; j++) {
492:       xb = x + 5 * (*idx++);
493:       x1 = xb[0];
494:       x2 = xb[1];
495:       x3 = xb[2];
496:       x4 = xb[3];
497:       x5 = xb[4];
498:       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
499:       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
500:       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
501:       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
502:       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
503:       v += 25;
504:     }
505:     if (usecprow) z = zarray + 5 * ridx[i];
506:     z[0] = sum1;
507:     z[1] = sum2;
508:     z[2] = sum3;
509:     z[3] = sum4;
510:     z[4] = sum5;
511:     if (!usecprow) z += 5;
512:   }
513:   PetscCall(VecRestoreArrayRead(xx, &x));
514:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
515:   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
516:   PetscFunctionReturn(PETSC_SUCCESS);
517: }

519: PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
520: {
521:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
522:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
523:   const PetscScalar *x, *xb;
524:   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
525:   const MatScalar   *v;
526:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
527:   PetscBool          usecprow = a->compressedrow.use;

529:   PetscFunctionBegin;
530:   PetscCall(VecGetArrayRead(xx, &x));
531:   PetscCall(VecGetArrayWrite(zz, &zarray));

533:   idx = a->j;
534:   v   = a->a;
535:   if (usecprow) {
536:     mbs  = a->compressedrow.nrows;
537:     ii   = a->compressedrow.i;
538:     ridx = a->compressedrow.rindex;
539:     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
540:   } else {
541:     mbs = a->mbs;
542:     ii  = a->i;
543:     z   = zarray;
544:   }

546:   for (i = 0; i < mbs; i++) {
547:     n = ii[1] - ii[0];
548:     ii++;
549:     sum1 = 0.0;
550:     sum2 = 0.0;
551:     sum3 = 0.0;
552:     sum4 = 0.0;
553:     sum5 = 0.0;
554:     sum6 = 0.0;

556:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
557:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
558:     for (j = 0; j < n; j++) {
559:       xb = x + 6 * (*idx++);
560:       x1 = xb[0];
561:       x2 = xb[1];
562:       x3 = xb[2];
563:       x4 = xb[3];
564:       x5 = xb[4];
565:       x6 = xb[5];
566:       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
567:       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
568:       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
569:       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
570:       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
571:       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
572:       v += 36;
573:     }
574:     if (usecprow) z = zarray + 6 * ridx[i];
575:     z[0] = sum1;
576:     z[1] = sum2;
577:     z[2] = sum3;
578:     z[3] = sum4;
579:     z[4] = sum5;
580:     z[5] = sum6;
581:     if (!usecprow) z += 6;
582:   }

584:   PetscCall(VecRestoreArrayRead(xx, &x));
585:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
586:   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
587:   PetscFunctionReturn(PETSC_SUCCESS);
588: }

590: PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
591: {
592:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
593:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
594:   const PetscScalar *x, *xb;
595:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
596:   const MatScalar   *v;
597:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
598:   PetscBool          usecprow = a->compressedrow.use;

600:   PetscFunctionBegin;
601:   PetscCall(VecGetArrayRead(xx, &x));
602:   PetscCall(VecGetArrayWrite(zz, &zarray));

604:   idx = a->j;
605:   v   = a->a;
606:   if (usecprow) {
607:     mbs  = a->compressedrow.nrows;
608:     ii   = a->compressedrow.i;
609:     ridx = a->compressedrow.rindex;
610:     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
611:   } else {
612:     mbs = a->mbs;
613:     ii  = a->i;
614:     z   = zarray;
615:   }

617:   for (i = 0; i < mbs; i++) {
618:     n = ii[1] - ii[0];
619:     ii++;
620:     sum1 = 0.0;
621:     sum2 = 0.0;
622:     sum3 = 0.0;
623:     sum4 = 0.0;
624:     sum5 = 0.0;
625:     sum6 = 0.0;
626:     sum7 = 0.0;

628:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
629:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
630:     for (j = 0; j < n; j++) {
631:       xb = x + 7 * (*idx++);
632:       x1 = xb[0];
633:       x2 = xb[1];
634:       x3 = xb[2];
635:       x4 = xb[3];
636:       x5 = xb[4];
637:       x6 = xb[5];
638:       x7 = xb[6];
639:       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
640:       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
641:       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
642:       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
643:       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
644:       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
645:       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
646:       v += 49;
647:     }
648:     if (usecprow) z = zarray + 7 * ridx[i];
649:     z[0] = sum1;
650:     z[1] = sum2;
651:     z[2] = sum3;
652:     z[3] = sum4;
653:     z[4] = sum5;
654:     z[5] = sum6;
655:     z[6] = sum7;
656:     if (!usecprow) z += 7;
657:   }

659:   PetscCall(VecRestoreArrayRead(xx, &x));
660:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
661:   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
662:   PetscFunctionReturn(PETSC_SUCCESS);
663: }

665: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
666: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
667: {
668:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
669:   PetscScalar       *z = NULL, *work, *workt, *zarray;
670:   const PetscScalar *x, *xb;
671:   const MatScalar   *v;
672:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
673:   const PetscInt    *idx, *ii, *ridx = NULL;
674:   PetscInt           k;
675:   PetscBool          usecprow = a->compressedrow.use;

677:   __m256d a0, a1, a2, a3, a4, a5;
678:   __m256d w0, w1, w2, w3;
679:   __m256d z0, z1, z2;
680:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);

682:   PetscFunctionBegin;
683:   PetscCall(VecGetArrayRead(xx, &x));
684:   PetscCall(VecGetArrayWrite(zz, &zarray));

686:   idx = a->j;
687:   v   = a->a;
688:   if (usecprow) {
689:     mbs  = a->compressedrow.nrows;
690:     ii   = a->compressedrow.i;
691:     ridx = a->compressedrow.rindex;
692:     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
693:   } else {
694:     mbs = a->mbs;
695:     ii  = a->i;
696:     z   = zarray;
697:   }

699:   if (!a->mult_work) {
700:     k = PetscMax(A->rmap->n, A->cmap->n);
701:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
702:   }

704:   work = a->mult_work;
705:   for (i = 0; i < mbs; i++) {
706:     n = ii[1] - ii[0];
707:     ii++;
708:     workt = work;
709:     for (j = 0; j < n; j++) {
710:       xb = x + bs * (*idx++);
711:       for (k = 0; k < bs; k++) workt[k] = xb[k];
712:       workt += bs;
713:     }
714:     if (usecprow) z = zarray + bs * ridx[i];

716:     z0 = _mm256_setzero_pd();
717:     z1 = _mm256_setzero_pd();
718:     z2 = _mm256_setzero_pd();

720:     for (j = 0; j < n; j++) {
721:       /* first column of a */
722:       w0 = _mm256_set1_pd(work[j * 9]);
723:       a0 = _mm256_loadu_pd(&v[j * 81]);
724:       z0 = _mm256_fmadd_pd(a0, w0, z0);
725:       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
726:       z1 = _mm256_fmadd_pd(a1, w0, z1);
727:       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
728:       z2 = _mm256_fmadd_pd(a2, w0, z2);

730:       /* second column of a */
731:       w1 = _mm256_set1_pd(work[j * 9 + 1]);
732:       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
733:       z0 = _mm256_fmadd_pd(a0, w1, z0);
734:       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
735:       z1 = _mm256_fmadd_pd(a1, w1, z1);
736:       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
737:       z2 = _mm256_fmadd_pd(a2, w1, z2);

739:       /* third column of a */
740:       w2 = _mm256_set1_pd(work[j * 9 + 2]);
741:       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
742:       z0 = _mm256_fmadd_pd(a3, w2, z0);
743:       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
744:       z1 = _mm256_fmadd_pd(a4, w2, z1);
745:       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
746:       z2 = _mm256_fmadd_pd(a5, w2, z2);

748:       /* fourth column of a */
749:       w3 = _mm256_set1_pd(work[j * 9 + 3]);
750:       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
751:       z0 = _mm256_fmadd_pd(a0, w3, z0);
752:       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
753:       z1 = _mm256_fmadd_pd(a1, w3, z1);
754:       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
755:       z2 = _mm256_fmadd_pd(a2, w3, z2);

757:       /* fifth column of a */
758:       w0 = _mm256_set1_pd(work[j * 9 + 4]);
759:       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
760:       z0 = _mm256_fmadd_pd(a3, w0, z0);
761:       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
762:       z1 = _mm256_fmadd_pd(a4, w0, z1);
763:       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
764:       z2 = _mm256_fmadd_pd(a5, w0, z2);

766:       /* sixth column of a */
767:       w1 = _mm256_set1_pd(work[j * 9 + 5]);
768:       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
769:       z0 = _mm256_fmadd_pd(a0, w1, z0);
770:       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
771:       z1 = _mm256_fmadd_pd(a1, w1, z1);
772:       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
773:       z2 = _mm256_fmadd_pd(a2, w1, z2);

775:       /* seventh column of a */
776:       w2 = _mm256_set1_pd(work[j * 9 + 6]);
777:       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
778:       z0 = _mm256_fmadd_pd(a0, w2, z0);
779:       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
780:       z1 = _mm256_fmadd_pd(a1, w2, z1);
781:       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
782:       z2 = _mm256_fmadd_pd(a2, w2, z2);

784:       /* eighth column of a */
785:       w3 = _mm256_set1_pd(work[j * 9 + 7]);
786:       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
787:       z0 = _mm256_fmadd_pd(a3, w3, z0);
788:       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
789:       z1 = _mm256_fmadd_pd(a4, w3, z1);
790:       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
791:       z2 = _mm256_fmadd_pd(a5, w3, z2);

793:       /* ninth column of a */
794:       w0 = _mm256_set1_pd(work[j * 9 + 8]);
795:       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
796:       z0 = _mm256_fmadd_pd(a0, w0, z0);
797:       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
798:       z1 = _mm256_fmadd_pd(a1, w0, z1);
799:       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
800:       z2 = _mm256_fmadd_pd(a2, w0, z2);
801:     }

803:     _mm256_storeu_pd(&z[0], z0);
804:     _mm256_storeu_pd(&z[4], z1);
805:     _mm256_maskstore_pd(&z[8], mask1, z2);

807:     v += n * bs2;
808:     if (!usecprow) z += bs;
809:   }
810:   PetscCall(VecRestoreArrayRead(xx, &x));
811:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
812:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
813:   PetscFunctionReturn(PETSC_SUCCESS);
814: }
815: #endif

817: PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
818: {
819:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
820:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
821:   const PetscScalar *x, *xb;
822:   PetscScalar       *zarray, xv;
823:   const MatScalar   *v;
824:   const PetscInt    *ii, *ij = a->j, *idx;
825:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
826:   PetscBool          usecprow = a->compressedrow.use;

828:   PetscFunctionBegin;
829:   PetscCall(VecGetArrayRead(xx, &x));
830:   PetscCall(VecGetArrayWrite(zz, &zarray));

832:   v = a->a;
833:   if (usecprow) {
834:     mbs  = a->compressedrow.nrows;
835:     ii   = a->compressedrow.i;
836:     ridx = a->compressedrow.rindex;
837:     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
838:   } else {
839:     mbs = a->mbs;
840:     ii  = a->i;
841:     z   = zarray;
842:   }

844:   for (i = 0; i < mbs; i++) {
845:     n     = ii[i + 1] - ii[i];
846:     idx   = ij + ii[i];
847:     sum1  = 0.0;
848:     sum2  = 0.0;
849:     sum3  = 0.0;
850:     sum4  = 0.0;
851:     sum5  = 0.0;
852:     sum6  = 0.0;
853:     sum7  = 0.0;
854:     sum8  = 0.0;
855:     sum9  = 0.0;
856:     sum10 = 0.0;
857:     sum11 = 0.0;

859:     for (j = 0; j < n; j++) {
860:       xb = x + 11 * (idx[j]);

862:       for (k = 0; k < 11; k++) {
863:         xv = xb[k];
864:         sum1 += v[0] * xv;
865:         sum2 += v[1] * xv;
866:         sum3 += v[2] * xv;
867:         sum4 += v[3] * xv;
868:         sum5 += v[4] * xv;
869:         sum6 += v[5] * xv;
870:         sum7 += v[6] * xv;
871:         sum8 += v[7] * xv;
872:         sum9 += v[8] * xv;
873:         sum10 += v[9] * xv;
874:         sum11 += v[10] * xv;
875:         v += 11;
876:       }
877:     }
878:     if (usecprow) z = zarray + 11 * ridx[i];
879:     z[0]  = sum1;
880:     z[1]  = sum2;
881:     z[2]  = sum3;
882:     z[3]  = sum4;
883:     z[4]  = sum5;
884:     z[5]  = sum6;
885:     z[6]  = sum7;
886:     z[7]  = sum8;
887:     z[8]  = sum9;
888:     z[9]  = sum10;
889:     z[10] = sum11;

891:     if (!usecprow) z += 11;
892:   }

894:   PetscCall(VecRestoreArrayRead(xx, &x));
895:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
896:   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
897:   PetscFunctionReturn(PETSC_SUCCESS);
898: }

900: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
901: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
902: {
903:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
904:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
905:   const PetscScalar *x, *xb;
906:   PetscScalar       *zarray, xv;
907:   const MatScalar   *v;
908:   const PetscInt    *ii, *ij = a->j, *idx;
909:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
910:   PetscBool          usecprow = a->compressedrow.use;

912:   PetscFunctionBegin;
913:   PetscCall(VecGetArrayRead(xx, &x));
914:   PetscCall(VecGetArrayWrite(zz, &zarray));

916:   v = a->a;
917:   if (usecprow) {
918:     mbs  = a->compressedrow.nrows;
919:     ii   = a->compressedrow.i;
920:     ridx = a->compressedrow.rindex;
921:     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
922:   } else {
923:     mbs = a->mbs;
924:     ii  = a->i;
925:     z   = zarray;
926:   }

928:   for (i = 0; i < mbs; i++) {
929:     n     = ii[i + 1] - ii[i];
930:     idx   = ij + ii[i];
931:     sum1  = 0.0;
932:     sum2  = 0.0;
933:     sum3  = 0.0;
934:     sum4  = 0.0;
935:     sum5  = 0.0;
936:     sum6  = 0.0;
937:     sum7  = 0.0;
938:     sum8  = 0.0;
939:     sum9  = 0.0;
940:     sum10 = 0.0;
941:     sum11 = 0.0;
942:     sum12 = 0.0;

944:     for (j = 0; j < n; j++) {
945:       xb = x + 12 * (idx[j]);

947:       for (k = 0; k < 12; k++) {
948:         xv = xb[k];
949:         sum1 += v[0] * xv;
950:         sum2 += v[1] * xv;
951:         sum3 += v[2] * xv;
952:         sum4 += v[3] * xv;
953:         sum5 += v[4] * xv;
954:         sum6 += v[5] * xv;
955:         sum7 += v[6] * xv;
956:         sum8 += v[7] * xv;
957:         sum9 += v[8] * xv;
958:         sum10 += v[9] * xv;
959:         sum11 += v[10] * xv;
960:         sum12 += v[11] * xv;
961:         v += 12;
962:       }
963:     }
964:     if (usecprow) z = zarray + 12 * ridx[i];
965:     z[0]  = sum1;
966:     z[1]  = sum2;
967:     z[2]  = sum3;
968:     z[3]  = sum4;
969:     z[4]  = sum5;
970:     z[5]  = sum6;
971:     z[6]  = sum7;
972:     z[7]  = sum8;
973:     z[8]  = sum9;
974:     z[9]  = sum10;
975:     z[10] = sum11;
976:     z[11] = sum12;
977:     if (!usecprow) z += 12;
978:   }
979:   PetscCall(VecRestoreArrayRead(xx, &x));
980:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
981:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
982:   PetscFunctionReturn(PETSC_SUCCESS);
983: }

985: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
986: {
987:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
988:   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
989:   const PetscScalar *x, *xb;
990:   PetscScalar       *zarray, *yarray, xv;
991:   const MatScalar   *v;
992:   const PetscInt    *ii, *ij = a->j, *idx;
993:   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
994:   PetscBool          usecprow = a->compressedrow.use;

996:   PetscFunctionBegin;
997:   PetscCall(VecGetArrayRead(xx, &x));
998:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

1000:   v = a->a;
1001:   if (usecprow) {
1002:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1003:     mbs  = a->compressedrow.nrows;
1004:     ii   = a->compressedrow.i;
1005:     ridx = a->compressedrow.rindex;
1006:   } else {
1007:     ii = a->i;
1008:     y  = yarray;
1009:     z  = zarray;
1010:   }

1012:   for (i = 0; i < mbs; i++) {
1013:     n   = ii[i + 1] - ii[i];
1014:     idx = ij + ii[i];

1016:     if (usecprow) {
1017:       y = yarray + 12 * ridx[i];
1018:       z = zarray + 12 * ridx[i];
1019:     }
1020:     sum1  = y[0];
1021:     sum2  = y[1];
1022:     sum3  = y[2];
1023:     sum4  = y[3];
1024:     sum5  = y[4];
1025:     sum6  = y[5];
1026:     sum7  = y[6];
1027:     sum8  = y[7];
1028:     sum9  = y[8];
1029:     sum10 = y[9];
1030:     sum11 = y[10];
1031:     sum12 = y[11];

1033:     for (j = 0; j < n; j++) {
1034:       xb = x + 12 * (idx[j]);

1036:       for (k = 0; k < 12; k++) {
1037:         xv = xb[k];
1038:         sum1 += v[0] * xv;
1039:         sum2 += v[1] * xv;
1040:         sum3 += v[2] * xv;
1041:         sum4 += v[3] * xv;
1042:         sum5 += v[4] * xv;
1043:         sum6 += v[5] * xv;
1044:         sum7 += v[6] * xv;
1045:         sum8 += v[7] * xv;
1046:         sum9 += v[8] * xv;
1047:         sum10 += v[9] * xv;
1048:         sum11 += v[10] * xv;
1049:         sum12 += v[11] * xv;
1050:         v += 12;
1051:       }
1052:     }

1054:     z[0]  = sum1;
1055:     z[1]  = sum2;
1056:     z[2]  = sum3;
1057:     z[3]  = sum4;
1058:     z[4]  = sum5;
1059:     z[5]  = sum6;
1060:     z[6]  = sum7;
1061:     z[7]  = sum8;
1062:     z[8]  = sum9;
1063:     z[9]  = sum10;
1064:     z[10] = sum11;
1065:     z[11] = sum12;
1066:     if (!usecprow) {
1067:       y += 12;
1068:       z += 12;
1069:     }
1070:   }
1071:   PetscCall(VecRestoreArrayRead(xx, &x));
1072:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1073:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1074:   PetscFunctionReturn(PETSC_SUCCESS);
1075: }

1077: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1078: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1079: {
1080:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1081:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1082:   const PetscScalar *x, *xb;
1083:   PetscScalar        x1, x2, x3, x4, *zarray;
1084:   const MatScalar   *v;
1085:   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1086:   PetscInt           mbs, i, j, n;
1087:   PetscBool          usecprow = a->compressedrow.use;

1089:   PetscFunctionBegin;
1090:   PetscCall(VecGetArrayRead(xx, &x));
1091:   PetscCall(VecGetArrayWrite(zz, &zarray));

1093:   v = a->a;
1094:   if (usecprow) {
1095:     mbs  = a->compressedrow.nrows;
1096:     ii   = a->compressedrow.i;
1097:     ridx = a->compressedrow.rindex;
1098:     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1099:   } else {
1100:     mbs = a->mbs;
1101:     ii  = a->i;
1102:     z   = zarray;
1103:   }

1105:   for (i = 0; i < mbs; i++) {
1106:     n   = ii[i + 1] - ii[i];
1107:     idx = ij + ii[i];

1109:     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1110:     for (j = 0; j < n; j++) {
1111:       xb = x + 12 * (idx[j]);
1112:       x1 = xb[0];
1113:       x2 = xb[1];
1114:       x3 = xb[2];
1115:       x4 = xb[3];

1117:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1118:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1119:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1120:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1121:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1122:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1123:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1124:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1125:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1126:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1127:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1128:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1129:       v += 48;

1131:       x1 = xb[4];
1132:       x2 = xb[5];
1133:       x3 = xb[6];
1134:       x4 = xb[7];

1136:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1137:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1138:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1139:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1140:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1141:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1142:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1143:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1144:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1145:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1146:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1147:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1148:       v += 48;

1150:       x1 = xb[8];
1151:       x2 = xb[9];
1152:       x3 = xb[10];
1153:       x4 = xb[11];
1154:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1155:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1156:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1157:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1158:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1159:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1160:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1161:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1162:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1163:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1164:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1165:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1166:       v += 48;
1167:     }
1168:     if (usecprow) z = zarray + 12 * ridx[i];
1169:     z[0]  = sum1;
1170:     z[1]  = sum2;
1171:     z[2]  = sum3;
1172:     z[3]  = sum4;
1173:     z[4]  = sum5;
1174:     z[5]  = sum6;
1175:     z[6]  = sum7;
1176:     z[7]  = sum8;
1177:     z[8]  = sum9;
1178:     z[9]  = sum10;
1179:     z[10] = sum11;
1180:     z[11] = sum12;
1181:     if (!usecprow) z += 12;
1182:   }
1183:   PetscCall(VecRestoreArrayRead(xx, &x));
1184:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1185:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1186:   PetscFunctionReturn(PETSC_SUCCESS);
1187: }

1189: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1190: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1191: {
1192:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1193:   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1194:   const PetscScalar *x, *xb;
1195:   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1196:   const MatScalar   *v;
1197:   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1198:   PetscInt           mbs      = a->mbs, i, j, n;
1199:   PetscBool          usecprow = a->compressedrow.use;

1201:   PetscFunctionBegin;
1202:   PetscCall(VecGetArrayRead(xx, &x));
1203:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

1205:   v = a->a;
1206:   if (usecprow) {
1207:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1208:     mbs  = a->compressedrow.nrows;
1209:     ii   = a->compressedrow.i;
1210:     ridx = a->compressedrow.rindex;
1211:   } else {
1212:     ii = a->i;
1213:     y  = yarray;
1214:     z  = zarray;
1215:   }

1217:   for (i = 0; i < mbs; i++) {
1218:     n   = ii[i + 1] - ii[i];
1219:     idx = ij + ii[i];

1221:     if (usecprow) {
1222:       y = yarray + 12 * ridx[i];
1223:       z = zarray + 12 * ridx[i];
1224:     }
1225:     sum1  = y[0];
1226:     sum2  = y[1];
1227:     sum3  = y[2];
1228:     sum4  = y[3];
1229:     sum5  = y[4];
1230:     sum6  = y[5];
1231:     sum7  = y[6];
1232:     sum8  = y[7];
1233:     sum9  = y[8];
1234:     sum10 = y[9];
1235:     sum11 = y[10];
1236:     sum12 = y[11];

1238:     for (j = 0; j < n; j++) {
1239:       xb = x + 12 * (idx[j]);
1240:       x1 = xb[0];
1241:       x2 = xb[1];
1242:       x3 = xb[2];
1243:       x4 = xb[3];

1245:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1246:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1247:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1248:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1249:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1250:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1251:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1252:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1253:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1254:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1255:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1256:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1257:       v += 48;

1259:       x1 = xb[4];
1260:       x2 = xb[5];
1261:       x3 = xb[6];
1262:       x4 = xb[7];

1264:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1265:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1266:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1267:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1268:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1269:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1270:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1271:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1272:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1273:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1274:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1275:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1276:       v += 48;

1278:       x1 = xb[8];
1279:       x2 = xb[9];
1280:       x3 = xb[10];
1281:       x4 = xb[11];
1282:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1283:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1284:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1285:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1286:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1287:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1288:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1289:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1290:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1291:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1292:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1293:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1294:       v += 48;
1295:     }
1296:     z[0]  = sum1;
1297:     z[1]  = sum2;
1298:     z[2]  = sum3;
1299:     z[3]  = sum4;
1300:     z[4]  = sum5;
1301:     z[5]  = sum6;
1302:     z[6]  = sum7;
1303:     z[7]  = sum8;
1304:     z[8]  = sum9;
1305:     z[9]  = sum10;
1306:     z[10] = sum11;
1307:     z[11] = sum12;
1308:     if (!usecprow) {
1309:       y += 12;
1310:       z += 12;
1311:     }
1312:   }
1313:   PetscCall(VecRestoreArrayRead(xx, &x));
1314:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1315:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1316:   PetscFunctionReturn(PETSC_SUCCESS);
1317: }

1319: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1320: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1321: {
1322:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1323:   PetscScalar       *z = NULL, *zarray;
1324:   const PetscScalar *x, *work;
1325:   const MatScalar   *v = a->a;
1326:   PetscInt           mbs, i, j, n;
1327:   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1328:   PetscBool          usecprow = a->compressedrow.use;
1329:   const PetscInt     bs = 12, bs2 = 144;

1331:   __m256d a0, a1, a2, a3, a4, a5;
1332:   __m256d w0, w1, w2, w3;
1333:   __m256d z0, z1, z2;

1335:   PetscFunctionBegin;
1336:   PetscCall(VecGetArrayRead(xx, &x));
1337:   PetscCall(VecGetArrayWrite(zz, &zarray));

1339:   if (usecprow) {
1340:     mbs  = a->compressedrow.nrows;
1341:     ii   = a->compressedrow.i;
1342:     ridx = a->compressedrow.rindex;
1343:     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1344:   } else {
1345:     mbs = a->mbs;
1346:     ii  = a->i;
1347:     z   = zarray;
1348:   }

1350:   for (i = 0; i < mbs; i++) {
1351:     z0 = _mm256_setzero_pd();
1352:     z1 = _mm256_setzero_pd();
1353:     z2 = _mm256_setzero_pd();

1355:     n = ii[1] - ii[0];
1356:     ii++;
1357:     for (j = 0; j < n; j++) {
1358:       work = x + bs * (*idx++);

1360:       /* first column of a */
1361:       w0 = _mm256_set1_pd(work[0]);
1362:       a0 = _mm256_loadu_pd(v + 0);
1363:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1364:       a1 = _mm256_loadu_pd(v + 4);
1365:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1366:       a2 = _mm256_loadu_pd(v + 8);
1367:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1369:       /* second column of a */
1370:       w1 = _mm256_set1_pd(work[1]);
1371:       a3 = _mm256_loadu_pd(v + 12);
1372:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1373:       a4 = _mm256_loadu_pd(v + 16);
1374:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1375:       a5 = _mm256_loadu_pd(v + 20);
1376:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1378:       /* third column of a */
1379:       w2 = _mm256_set1_pd(work[2]);
1380:       a0 = _mm256_loadu_pd(v + 24);
1381:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1382:       a1 = _mm256_loadu_pd(v + 28);
1383:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1384:       a2 = _mm256_loadu_pd(v + 32);
1385:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1387:       /* fourth column of a */
1388:       w3 = _mm256_set1_pd(work[3]);
1389:       a3 = _mm256_loadu_pd(v + 36);
1390:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1391:       a4 = _mm256_loadu_pd(v + 40);
1392:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1393:       a5 = _mm256_loadu_pd(v + 44);
1394:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1396:       /* fifth column of a */
1397:       w0 = _mm256_set1_pd(work[4]);
1398:       a0 = _mm256_loadu_pd(v + 48);
1399:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1400:       a1 = _mm256_loadu_pd(v + 52);
1401:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1402:       a2 = _mm256_loadu_pd(v + 56);
1403:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1405:       /* sixth column of a */
1406:       w1 = _mm256_set1_pd(work[5]);
1407:       a3 = _mm256_loadu_pd(v + 60);
1408:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1409:       a4 = _mm256_loadu_pd(v + 64);
1410:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1411:       a5 = _mm256_loadu_pd(v + 68);
1412:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1414:       /* seventh column of a */
1415:       w2 = _mm256_set1_pd(work[6]);
1416:       a0 = _mm256_loadu_pd(v + 72);
1417:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1418:       a1 = _mm256_loadu_pd(v + 76);
1419:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1420:       a2 = _mm256_loadu_pd(v + 80);
1421:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1423:       /* eighth column of a */
1424:       w3 = _mm256_set1_pd(work[7]);
1425:       a3 = _mm256_loadu_pd(v + 84);
1426:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1427:       a4 = _mm256_loadu_pd(v + 88);
1428:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1429:       a5 = _mm256_loadu_pd(v + 92);
1430:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1432:       /* ninth column of a */
1433:       w0 = _mm256_set1_pd(work[8]);
1434:       a0 = _mm256_loadu_pd(v + 96);
1435:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1436:       a1 = _mm256_loadu_pd(v + 100);
1437:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1438:       a2 = _mm256_loadu_pd(v + 104);
1439:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1441:       /* tenth column of a */
1442:       w1 = _mm256_set1_pd(work[9]);
1443:       a3 = _mm256_loadu_pd(v + 108);
1444:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1445:       a4 = _mm256_loadu_pd(v + 112);
1446:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1447:       a5 = _mm256_loadu_pd(v + 116);
1448:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1450:       /* eleventh column of a */
1451:       w2 = _mm256_set1_pd(work[10]);
1452:       a0 = _mm256_loadu_pd(v + 120);
1453:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1454:       a1 = _mm256_loadu_pd(v + 124);
1455:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1456:       a2 = _mm256_loadu_pd(v + 128);
1457:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1459:       /* twelveth column of a */
1460:       w3 = _mm256_set1_pd(work[11]);
1461:       a3 = _mm256_loadu_pd(v + 132);
1462:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1463:       a4 = _mm256_loadu_pd(v + 136);
1464:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1465:       a5 = _mm256_loadu_pd(v + 140);
1466:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1468:       v += bs2;
1469:     }
1470:     if (usecprow) z = zarray + bs * ridx[i];
1471:     _mm256_storeu_pd(&z[0], z0);
1472:     _mm256_storeu_pd(&z[4], z1);
1473:     _mm256_storeu_pd(&z[8], z2);
1474:     if (!usecprow) z += bs;
1475:   }
1476:   PetscCall(VecRestoreArrayRead(xx, &x));
1477:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1478:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1479:   PetscFunctionReturn(PETSC_SUCCESS);
1480: }
1481: #endif

1483: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1484: /* Default MatMult for block size 15 */
1485: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1486: {
1487:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1488:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1489:   const PetscScalar *x, *xb;
1490:   PetscScalar       *zarray, xv;
1491:   const MatScalar   *v;
1492:   const PetscInt    *ii, *ij = a->j, *idx;
1493:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1494:   PetscBool          usecprow = a->compressedrow.use;

1496:   PetscFunctionBegin;
1497:   PetscCall(VecGetArrayRead(xx, &x));
1498:   PetscCall(VecGetArrayWrite(zz, &zarray));

1500:   v = a->a;
1501:   if (usecprow) {
1502:     mbs  = a->compressedrow.nrows;
1503:     ii   = a->compressedrow.i;
1504:     ridx = a->compressedrow.rindex;
1505:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1506:   } else {
1507:     mbs = a->mbs;
1508:     ii  = a->i;
1509:     z   = zarray;
1510:   }

1512:   for (i = 0; i < mbs; i++) {
1513:     n     = ii[i + 1] - ii[i];
1514:     idx   = ij + ii[i];
1515:     sum1  = 0.0;
1516:     sum2  = 0.0;
1517:     sum3  = 0.0;
1518:     sum4  = 0.0;
1519:     sum5  = 0.0;
1520:     sum6  = 0.0;
1521:     sum7  = 0.0;
1522:     sum8  = 0.0;
1523:     sum9  = 0.0;
1524:     sum10 = 0.0;
1525:     sum11 = 0.0;
1526:     sum12 = 0.0;
1527:     sum13 = 0.0;
1528:     sum14 = 0.0;
1529:     sum15 = 0.0;

1531:     for (j = 0; j < n; j++) {
1532:       xb = x + 15 * (idx[j]);

1534:       for (k = 0; k < 15; k++) {
1535:         xv = xb[k];
1536:         sum1 += v[0] * xv;
1537:         sum2 += v[1] * xv;
1538:         sum3 += v[2] * xv;
1539:         sum4 += v[3] * xv;
1540:         sum5 += v[4] * xv;
1541:         sum6 += v[5] * xv;
1542:         sum7 += v[6] * xv;
1543:         sum8 += v[7] * xv;
1544:         sum9 += v[8] * xv;
1545:         sum10 += v[9] * xv;
1546:         sum11 += v[10] * xv;
1547:         sum12 += v[11] * xv;
1548:         sum13 += v[12] * xv;
1549:         sum14 += v[13] * xv;
1550:         sum15 += v[14] * xv;
1551:         v += 15;
1552:       }
1553:     }
1554:     if (usecprow) z = zarray + 15 * ridx[i];
1555:     z[0]  = sum1;
1556:     z[1]  = sum2;
1557:     z[2]  = sum3;
1558:     z[3]  = sum4;
1559:     z[4]  = sum5;
1560:     z[5]  = sum6;
1561:     z[6]  = sum7;
1562:     z[7]  = sum8;
1563:     z[8]  = sum9;
1564:     z[9]  = sum10;
1565:     z[10] = sum11;
1566:     z[11] = sum12;
1567:     z[12] = sum13;
1568:     z[13] = sum14;
1569:     z[14] = sum15;

1571:     if (!usecprow) z += 15;
1572:   }

1574:   PetscCall(VecRestoreArrayRead(xx, &x));
1575:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1576:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1577:   PetscFunctionReturn(PETSC_SUCCESS);
1578: }

1580: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1581: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1582: {
1583:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1584:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1585:   const PetscScalar *x, *xb;
1586:   PetscScalar        x1, x2, x3, x4, *zarray;
1587:   const MatScalar   *v;
1588:   const PetscInt    *ii, *ij = a->j, *idx;
1589:   PetscInt           mbs, i, j, n, *ridx = NULL;
1590:   PetscBool          usecprow = a->compressedrow.use;

1592:   PetscFunctionBegin;
1593:   PetscCall(VecGetArrayRead(xx, &x));
1594:   PetscCall(VecGetArrayWrite(zz, &zarray));

1596:   v = a->a;
1597:   if (usecprow) {
1598:     mbs  = a->compressedrow.nrows;
1599:     ii   = a->compressedrow.i;
1600:     ridx = a->compressedrow.rindex;
1601:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1602:   } else {
1603:     mbs = a->mbs;
1604:     ii  = a->i;
1605:     z   = zarray;
1606:   }

1608:   for (i = 0; i < mbs; i++) {
1609:     n     = ii[i + 1] - ii[i];
1610:     idx   = ij + ii[i];
1611:     sum1  = 0.0;
1612:     sum2  = 0.0;
1613:     sum3  = 0.0;
1614:     sum4  = 0.0;
1615:     sum5  = 0.0;
1616:     sum6  = 0.0;
1617:     sum7  = 0.0;
1618:     sum8  = 0.0;
1619:     sum9  = 0.0;
1620:     sum10 = 0.0;
1621:     sum11 = 0.0;
1622:     sum12 = 0.0;
1623:     sum13 = 0.0;
1624:     sum14 = 0.0;
1625:     sum15 = 0.0;

1627:     for (j = 0; j < n; j++) {
1628:       xb = x + 15 * (idx[j]);
1629:       x1 = xb[0];
1630:       x2 = xb[1];
1631:       x3 = xb[2];
1632:       x4 = xb[3];

1634:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1635:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1636:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1637:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1638:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1639:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1640:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1641:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1642:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1643:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1644:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1645:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1646:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1647:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1648:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;

1650:       v += 60;

1652:       x1 = xb[4];
1653:       x2 = xb[5];
1654:       x3 = xb[6];
1655:       x4 = xb[7];

1657:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1658:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1659:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1660:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1661:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1662:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1663:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1664:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1665:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1666:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1667:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1668:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1669:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1670:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1671:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1672:       v += 60;

1674:       x1 = xb[8];
1675:       x2 = xb[9];
1676:       x3 = xb[10];
1677:       x4 = xb[11];
1678:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1679:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1680:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1681:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1682:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1683:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1684:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1685:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1686:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1687:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1688:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1689:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1690:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1691:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1692:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1693:       v += 60;

1695:       x1 = xb[12];
1696:       x2 = xb[13];
1697:       x3 = xb[14];
1698:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1699:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1700:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1701:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1702:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1703:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1704:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1705:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1706:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1707:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1708:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1709:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1710:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1711:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1712:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1713:       v += 45;
1714:     }
1715:     if (usecprow) z = zarray + 15 * ridx[i];
1716:     z[0]  = sum1;
1717:     z[1]  = sum2;
1718:     z[2]  = sum3;
1719:     z[3]  = sum4;
1720:     z[4]  = sum5;
1721:     z[5]  = sum6;
1722:     z[6]  = sum7;
1723:     z[7]  = sum8;
1724:     z[8]  = sum9;
1725:     z[9]  = sum10;
1726:     z[10] = sum11;
1727:     z[11] = sum12;
1728:     z[12] = sum13;
1729:     z[13] = sum14;
1730:     z[14] = sum15;

1732:     if (!usecprow) z += 15;
1733:   }

1735:   PetscCall(VecRestoreArrayRead(xx, &x));
1736:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1737:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1738:   PetscFunctionReturn(PETSC_SUCCESS);
1739: }

1741: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1742: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1743: {
1744:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1745:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1746:   const PetscScalar *x, *xb;
1747:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1748:   const MatScalar   *v;
1749:   const PetscInt    *ii, *ij = a->j, *idx;
1750:   PetscInt           mbs, i, j, n, *ridx = NULL;
1751:   PetscBool          usecprow = a->compressedrow.use;

1753:   PetscFunctionBegin;
1754:   PetscCall(VecGetArrayRead(xx, &x));
1755:   PetscCall(VecGetArrayWrite(zz, &zarray));

1757:   v = a->a;
1758:   if (usecprow) {
1759:     mbs  = a->compressedrow.nrows;
1760:     ii   = a->compressedrow.i;
1761:     ridx = a->compressedrow.rindex;
1762:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1763:   } else {
1764:     mbs = a->mbs;
1765:     ii  = a->i;
1766:     z   = zarray;
1767:   }

1769:   for (i = 0; i < mbs; i++) {
1770:     n     = ii[i + 1] - ii[i];
1771:     idx   = ij + ii[i];
1772:     sum1  = 0.0;
1773:     sum2  = 0.0;
1774:     sum3  = 0.0;
1775:     sum4  = 0.0;
1776:     sum5  = 0.0;
1777:     sum6  = 0.0;
1778:     sum7  = 0.0;
1779:     sum8  = 0.0;
1780:     sum9  = 0.0;
1781:     sum10 = 0.0;
1782:     sum11 = 0.0;
1783:     sum12 = 0.0;
1784:     sum13 = 0.0;
1785:     sum14 = 0.0;
1786:     sum15 = 0.0;

1788:     for (j = 0; j < n; j++) {
1789:       xb = x + 15 * (idx[j]);
1790:       x1 = xb[0];
1791:       x2 = xb[1];
1792:       x3 = xb[2];
1793:       x4 = xb[3];
1794:       x5 = xb[4];
1795:       x6 = xb[5];
1796:       x7 = xb[6];
1797:       x8 = xb[7];

1799:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1800:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1801:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1802:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1803:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1804:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1805:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1806:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1807:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1808:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1809:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1810:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1811:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1812:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1813:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1814:       v += 120;

1816:       x1 = xb[8];
1817:       x2 = xb[9];
1818:       x3 = xb[10];
1819:       x4 = xb[11];
1820:       x5 = xb[12];
1821:       x6 = xb[13];
1822:       x7 = xb[14];

1824:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1825:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1826:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1827:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1828:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1829:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1830:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1831:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1832:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1833:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1834:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1835:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1836:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1837:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1838:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1839:       v += 105;
1840:     }
1841:     if (usecprow) z = zarray + 15 * ridx[i];
1842:     z[0]  = sum1;
1843:     z[1]  = sum2;
1844:     z[2]  = sum3;
1845:     z[3]  = sum4;
1846:     z[4]  = sum5;
1847:     z[5]  = sum6;
1848:     z[6]  = sum7;
1849:     z[7]  = sum8;
1850:     z[8]  = sum9;
1851:     z[9]  = sum10;
1852:     z[10] = sum11;
1853:     z[11] = sum12;
1854:     z[12] = sum13;
1855:     z[13] = sum14;
1856:     z[14] = sum15;

1858:     if (!usecprow) z += 15;
1859:   }

1861:   PetscCall(VecRestoreArrayRead(xx, &x));
1862:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1863:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1864:   PetscFunctionReturn(PETSC_SUCCESS);
1865: }

1867: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1868: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1869: {
1870:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1871:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1872:   const PetscScalar *x, *xb;
1873:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1874:   const MatScalar   *v;
1875:   const PetscInt    *ii, *ij = a->j, *idx;
1876:   PetscInt           mbs, i, j, n, *ridx = NULL;
1877:   PetscBool          usecprow = a->compressedrow.use;

1879:   PetscFunctionBegin;
1880:   PetscCall(VecGetArrayRead(xx, &x));
1881:   PetscCall(VecGetArrayWrite(zz, &zarray));

1883:   v = a->a;
1884:   if (usecprow) {
1885:     mbs  = a->compressedrow.nrows;
1886:     ii   = a->compressedrow.i;
1887:     ridx = a->compressedrow.rindex;
1888:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1889:   } else {
1890:     mbs = a->mbs;
1891:     ii  = a->i;
1892:     z   = zarray;
1893:   }

1895:   for (i = 0; i < mbs; i++) {
1896:     n     = ii[i + 1] - ii[i];
1897:     idx   = ij + ii[i];
1898:     sum1  = 0.0;
1899:     sum2  = 0.0;
1900:     sum3  = 0.0;
1901:     sum4  = 0.0;
1902:     sum5  = 0.0;
1903:     sum6  = 0.0;
1904:     sum7  = 0.0;
1905:     sum8  = 0.0;
1906:     sum9  = 0.0;
1907:     sum10 = 0.0;
1908:     sum11 = 0.0;
1909:     sum12 = 0.0;
1910:     sum13 = 0.0;
1911:     sum14 = 0.0;
1912:     sum15 = 0.0;

1914:     for (j = 0; j < n; j++) {
1915:       xb  = x + 15 * (idx[j]);
1916:       x1  = xb[0];
1917:       x2  = xb[1];
1918:       x3  = xb[2];
1919:       x4  = xb[3];
1920:       x5  = xb[4];
1921:       x6  = xb[5];
1922:       x7  = xb[6];
1923:       x8  = xb[7];
1924:       x9  = xb[8];
1925:       x10 = xb[9];
1926:       x11 = xb[10];
1927:       x12 = xb[11];
1928:       x13 = xb[12];
1929:       x14 = xb[13];
1930:       x15 = xb[14];

1932:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1933:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1934:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1935:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1936:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1937:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1938:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1939:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1940:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1941:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1942:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1943:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1944:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1945:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1946:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1947:       v += 225;
1948:     }
1949:     if (usecprow) z = zarray + 15 * ridx[i];
1950:     z[0]  = sum1;
1951:     z[1]  = sum2;
1952:     z[2]  = sum3;
1953:     z[3]  = sum4;
1954:     z[4]  = sum5;
1955:     z[5]  = sum6;
1956:     z[6]  = sum7;
1957:     z[7]  = sum8;
1958:     z[8]  = sum9;
1959:     z[9]  = sum10;
1960:     z[10] = sum11;
1961:     z[11] = sum12;
1962:     z[12] = sum13;
1963:     z[13] = sum14;
1964:     z[14] = sum15;

1966:     if (!usecprow) z += 15;
1967:   }

1969:   PetscCall(VecRestoreArrayRead(xx, &x));
1970:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1971:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1972:   PetscFunctionReturn(PETSC_SUCCESS);
1973: }

1975: /*
1976:     This will not work with MatScalar == float because it calls the BLAS
1977: */
1978: PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
1979: {
1980:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1981:   PetscScalar       *z = NULL, *work, *workt, *zarray;
1982:   const PetscScalar *x, *xb;
1983:   const MatScalar   *v;
1984:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1985:   const PetscInt    *idx, *ii, *ridx = NULL;
1986:   PetscInt           ncols, k;
1987:   PetscBool          usecprow = a->compressedrow.use;

1989:   PetscFunctionBegin;
1990:   PetscCall(VecGetArrayRead(xx, &x));
1991:   PetscCall(VecGetArrayWrite(zz, &zarray));

1993:   idx = a->j;
1994:   v   = a->a;
1995:   if (usecprow) {
1996:     mbs  = a->compressedrow.nrows;
1997:     ii   = a->compressedrow.i;
1998:     ridx = a->compressedrow.rindex;
1999:     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2000:   } else {
2001:     mbs = a->mbs;
2002:     ii  = a->i;
2003:     z   = zarray;
2004:   }

2006:   if (!a->mult_work) {
2007:     k = PetscMax(A->rmap->n, A->cmap->n);
2008:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2009:   }
2010:   work = a->mult_work;
2011:   for (i = 0; i < mbs; i++) {
2012:     n = ii[1] - ii[0];
2013:     ii++;
2014:     ncols = n * bs;
2015:     workt = work;
2016:     for (j = 0; j < n; j++) {
2017:       xb = x + bs * (*idx++);
2018:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2019:       workt += bs;
2020:     }
2021:     if (usecprow) z = zarray + bs * ridx[i];
2022:     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2023:     v += n * bs2;
2024:     if (!usecprow) z += bs;
2025:   }
2026:   PetscCall(VecRestoreArrayRead(xx, &x));
2027:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2028:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2029:   PetscFunctionReturn(PETSC_SUCCESS);
2030: }

2032: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2033: {
2034:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2035:   const PetscScalar *x;
2036:   PetscScalar       *y, *z, sum;
2037:   const MatScalar   *v;
2038:   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2039:   const PetscInt    *idx, *ii;
2040:   PetscBool          usecprow = a->compressedrow.use;

2042:   PetscFunctionBegin;
2043:   PetscCall(VecGetArrayRead(xx, &x));
2044:   PetscCall(VecGetArrayPair(yy, zz, &y, &z));

2046:   idx = a->j;
2047:   v   = a->a;
2048:   if (usecprow) {
2049:     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2050:     mbs  = a->compressedrow.nrows;
2051:     ii   = a->compressedrow.i;
2052:     ridx = a->compressedrow.rindex;
2053:   } else {
2054:     ii = a->i;
2055:   }

2057:   for (i = 0; i < mbs; i++) {
2058:     n = ii[1] - ii[0];
2059:     ii++;
2060:     if (!usecprow) {
2061:       sum = y[i];
2062:     } else {
2063:       sum = y[ridx[i]];
2064:     }
2065:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2066:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2067:     PetscSparseDensePlusDot(sum, x, v, idx, n);
2068:     v += n;
2069:     idx += n;
2070:     if (usecprow) {
2071:       z[ridx[i]] = sum;
2072:     } else {
2073:       z[i] = sum;
2074:     }
2075:   }
2076:   PetscCall(VecRestoreArrayRead(xx, &x));
2077:   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2078:   PetscCall(PetscLogFlops(2.0 * a->nz));
2079:   PetscFunctionReturn(PETSC_SUCCESS);
2080: }

2082: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2083: {
2084:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2085:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2086:   const PetscScalar *x, *xb;
2087:   PetscScalar        x1, x2, *yarray, *zarray;
2088:   const MatScalar   *v;
2089:   PetscInt           mbs = a->mbs, i, n, j;
2090:   const PetscInt    *idx, *ii, *ridx = NULL;
2091:   PetscBool          usecprow = a->compressedrow.use;

2093:   PetscFunctionBegin;
2094:   PetscCall(VecGetArrayRead(xx, &x));
2095:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2097:   idx = a->j;
2098:   v   = a->a;
2099:   if (usecprow) {
2100:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2101:     mbs  = a->compressedrow.nrows;
2102:     ii   = a->compressedrow.i;
2103:     ridx = a->compressedrow.rindex;
2104:   } else {
2105:     ii = a->i;
2106:     y  = yarray;
2107:     z  = zarray;
2108:   }

2110:   for (i = 0; i < mbs; i++) {
2111:     n = ii[1] - ii[0];
2112:     ii++;
2113:     if (usecprow) {
2114:       z = zarray + 2 * ridx[i];
2115:       y = yarray + 2 * ridx[i];
2116:     }
2117:     sum1 = y[0];
2118:     sum2 = y[1];
2119:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2120:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2121:     for (j = 0; j < n; j++) {
2122:       xb = x + 2 * (*idx++);
2123:       x1 = xb[0];
2124:       x2 = xb[1];

2126:       sum1 += v[0] * x1 + v[2] * x2;
2127:       sum2 += v[1] * x1 + v[3] * x2;
2128:       v += 4;
2129:     }
2130:     z[0] = sum1;
2131:     z[1] = sum2;
2132:     if (!usecprow) {
2133:       z += 2;
2134:       y += 2;
2135:     }
2136:   }
2137:   PetscCall(VecRestoreArrayRead(xx, &x));
2138:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2139:   PetscCall(PetscLogFlops(4.0 * a->nz));
2140:   PetscFunctionReturn(PETSC_SUCCESS);
2141: }

2143: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2144: {
2145:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2146:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2147:   const PetscScalar *x, *xb;
2148:   const MatScalar   *v;
2149:   PetscInt           mbs = a->mbs, i, j, n;
2150:   const PetscInt    *idx, *ii, *ridx = NULL;
2151:   PetscBool          usecprow = a->compressedrow.use;

2153:   PetscFunctionBegin;
2154:   PetscCall(VecGetArrayRead(xx, &x));
2155:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2157:   idx = a->j;
2158:   v   = a->a;
2159:   if (usecprow) {
2160:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2161:     mbs  = a->compressedrow.nrows;
2162:     ii   = a->compressedrow.i;
2163:     ridx = a->compressedrow.rindex;
2164:   } else {
2165:     ii = a->i;
2166:     y  = yarray;
2167:     z  = zarray;
2168:   }

2170:   for (i = 0; i < mbs; i++) {
2171:     n = ii[1] - ii[0];
2172:     ii++;
2173:     if (usecprow) {
2174:       z = zarray + 3 * ridx[i];
2175:       y = yarray + 3 * ridx[i];
2176:     }
2177:     sum1 = y[0];
2178:     sum2 = y[1];
2179:     sum3 = y[2];
2180:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2181:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2182:     for (j = 0; j < n; j++) {
2183:       xb = x + 3 * (*idx++);
2184:       x1 = xb[0];
2185:       x2 = xb[1];
2186:       x3 = xb[2];
2187:       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2188:       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2189:       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2190:       v += 9;
2191:     }
2192:     z[0] = sum1;
2193:     z[1] = sum2;
2194:     z[2] = sum3;
2195:     if (!usecprow) {
2196:       z += 3;
2197:       y += 3;
2198:     }
2199:   }
2200:   PetscCall(VecRestoreArrayRead(xx, &x));
2201:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2202:   PetscCall(PetscLogFlops(18.0 * a->nz));
2203:   PetscFunctionReturn(PETSC_SUCCESS);
2204: }

2206: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2207: {
2208:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2209:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2210:   const PetscScalar *x, *xb;
2211:   const MatScalar   *v;
2212:   PetscInt           mbs = a->mbs, i, j, n;
2213:   const PetscInt    *idx, *ii, *ridx = NULL;
2214:   PetscBool          usecprow = a->compressedrow.use;

2216:   PetscFunctionBegin;
2217:   PetscCall(VecGetArrayRead(xx, &x));
2218:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2220:   idx = a->j;
2221:   v   = a->a;
2222:   if (usecprow) {
2223:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2224:     mbs  = a->compressedrow.nrows;
2225:     ii   = a->compressedrow.i;
2226:     ridx = a->compressedrow.rindex;
2227:   } else {
2228:     ii = a->i;
2229:     y  = yarray;
2230:     z  = zarray;
2231:   }

2233:   for (i = 0; i < mbs; i++) {
2234:     n = ii[1] - ii[0];
2235:     ii++;
2236:     if (usecprow) {
2237:       z = zarray + 4 * ridx[i];
2238:       y = yarray + 4 * ridx[i];
2239:     }
2240:     sum1 = y[0];
2241:     sum2 = y[1];
2242:     sum3 = y[2];
2243:     sum4 = y[3];
2244:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2245:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2246:     for (j = 0; j < n; j++) {
2247:       xb = x + 4 * (*idx++);
2248:       x1 = xb[0];
2249:       x2 = xb[1];
2250:       x3 = xb[2];
2251:       x4 = xb[3];
2252:       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2253:       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2254:       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2255:       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2256:       v += 16;
2257:     }
2258:     z[0] = sum1;
2259:     z[1] = sum2;
2260:     z[2] = sum3;
2261:     z[3] = sum4;
2262:     if (!usecprow) {
2263:       z += 4;
2264:       y += 4;
2265:     }
2266:   }
2267:   PetscCall(VecRestoreArrayRead(xx, &x));
2268:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2269:   PetscCall(PetscLogFlops(32.0 * a->nz));
2270:   PetscFunctionReturn(PETSC_SUCCESS);
2271: }

2273: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2274: {
2275:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2276:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2277:   const PetscScalar *x, *xb;
2278:   PetscScalar       *yarray, *zarray;
2279:   const MatScalar   *v;
2280:   PetscInt           mbs = a->mbs, i, j, n;
2281:   const PetscInt    *idx, *ii, *ridx = NULL;
2282:   PetscBool          usecprow = a->compressedrow.use;

2284:   PetscFunctionBegin;
2285:   PetscCall(VecGetArrayRead(xx, &x));
2286:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2288:   idx = a->j;
2289:   v   = a->a;
2290:   if (usecprow) {
2291:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2292:     mbs  = a->compressedrow.nrows;
2293:     ii   = a->compressedrow.i;
2294:     ridx = a->compressedrow.rindex;
2295:   } else {
2296:     ii = a->i;
2297:     y  = yarray;
2298:     z  = zarray;
2299:   }

2301:   for (i = 0; i < mbs; i++) {
2302:     n = ii[1] - ii[0];
2303:     ii++;
2304:     if (usecprow) {
2305:       z = zarray + 5 * ridx[i];
2306:       y = yarray + 5 * ridx[i];
2307:     }
2308:     sum1 = y[0];
2309:     sum2 = y[1];
2310:     sum3 = y[2];
2311:     sum4 = y[3];
2312:     sum5 = y[4];
2313:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2314:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2315:     for (j = 0; j < n; j++) {
2316:       xb = x + 5 * (*idx++);
2317:       x1 = xb[0];
2318:       x2 = xb[1];
2319:       x3 = xb[2];
2320:       x4 = xb[3];
2321:       x5 = xb[4];
2322:       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2323:       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2324:       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2325:       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2326:       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2327:       v += 25;
2328:     }
2329:     z[0] = sum1;
2330:     z[1] = sum2;
2331:     z[2] = sum3;
2332:     z[3] = sum4;
2333:     z[4] = sum5;
2334:     if (!usecprow) {
2335:       z += 5;
2336:       y += 5;
2337:     }
2338:   }
2339:   PetscCall(VecRestoreArrayRead(xx, &x));
2340:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2341:   PetscCall(PetscLogFlops(50.0 * a->nz));
2342:   PetscFunctionReturn(PETSC_SUCCESS);
2343: }

2345: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2346: {
2347:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2348:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2349:   const PetscScalar *x, *xb;
2350:   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2351:   const MatScalar   *v;
2352:   PetscInt           mbs = a->mbs, i, j, n;
2353:   const PetscInt    *idx, *ii, *ridx = NULL;
2354:   PetscBool          usecprow = a->compressedrow.use;

2356:   PetscFunctionBegin;
2357:   PetscCall(VecGetArrayRead(xx, &x));
2358:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2360:   idx = a->j;
2361:   v   = a->a;
2362:   if (usecprow) {
2363:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2364:     mbs  = a->compressedrow.nrows;
2365:     ii   = a->compressedrow.i;
2366:     ridx = a->compressedrow.rindex;
2367:   } else {
2368:     ii = a->i;
2369:     y  = yarray;
2370:     z  = zarray;
2371:   }

2373:   for (i = 0; i < mbs; i++) {
2374:     n = ii[1] - ii[0];
2375:     ii++;
2376:     if (usecprow) {
2377:       z = zarray + 6 * ridx[i];
2378:       y = yarray + 6 * ridx[i];
2379:     }
2380:     sum1 = y[0];
2381:     sum2 = y[1];
2382:     sum3 = y[2];
2383:     sum4 = y[3];
2384:     sum5 = y[4];
2385:     sum6 = y[5];
2386:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2387:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2388:     for (j = 0; j < n; j++) {
2389:       xb = x + 6 * (*idx++);
2390:       x1 = xb[0];
2391:       x2 = xb[1];
2392:       x3 = xb[2];
2393:       x4 = xb[3];
2394:       x5 = xb[4];
2395:       x6 = xb[5];
2396:       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2397:       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2398:       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2399:       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2400:       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2401:       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2402:       v += 36;
2403:     }
2404:     z[0] = sum1;
2405:     z[1] = sum2;
2406:     z[2] = sum3;
2407:     z[3] = sum4;
2408:     z[4] = sum5;
2409:     z[5] = sum6;
2410:     if (!usecprow) {
2411:       z += 6;
2412:       y += 6;
2413:     }
2414:   }
2415:   PetscCall(VecRestoreArrayRead(xx, &x));
2416:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2417:   PetscCall(PetscLogFlops(72.0 * a->nz));
2418:   PetscFunctionReturn(PETSC_SUCCESS);
2419: }

2421: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2422: {
2423:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2424:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2425:   const PetscScalar *x, *xb;
2426:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2427:   const MatScalar   *v;
2428:   PetscInt           mbs = a->mbs, i, j, n;
2429:   const PetscInt    *idx, *ii, *ridx = NULL;
2430:   PetscBool          usecprow = a->compressedrow.use;

2432:   PetscFunctionBegin;
2433:   PetscCall(VecGetArrayRead(xx, &x));
2434:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2436:   idx = a->j;
2437:   v   = a->a;
2438:   if (usecprow) {
2439:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2440:     mbs  = a->compressedrow.nrows;
2441:     ii   = a->compressedrow.i;
2442:     ridx = a->compressedrow.rindex;
2443:   } else {
2444:     ii = a->i;
2445:     y  = yarray;
2446:     z  = zarray;
2447:   }

2449:   for (i = 0; i < mbs; i++) {
2450:     n = ii[1] - ii[0];
2451:     ii++;
2452:     if (usecprow) {
2453:       z = zarray + 7 * ridx[i];
2454:       y = yarray + 7 * ridx[i];
2455:     }
2456:     sum1 = y[0];
2457:     sum2 = y[1];
2458:     sum3 = y[2];
2459:     sum4 = y[3];
2460:     sum5 = y[4];
2461:     sum6 = y[5];
2462:     sum7 = y[6];
2463:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2464:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2465:     for (j = 0; j < n; j++) {
2466:       xb = x + 7 * (*idx++);
2467:       x1 = xb[0];
2468:       x2 = xb[1];
2469:       x3 = xb[2];
2470:       x4 = xb[3];
2471:       x5 = xb[4];
2472:       x6 = xb[5];
2473:       x7 = xb[6];
2474:       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2475:       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2476:       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2477:       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2478:       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2479:       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2480:       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2481:       v += 49;
2482:     }
2483:     z[0] = sum1;
2484:     z[1] = sum2;
2485:     z[2] = sum3;
2486:     z[3] = sum4;
2487:     z[4] = sum5;
2488:     z[5] = sum6;
2489:     z[6] = sum7;
2490:     if (!usecprow) {
2491:       z += 7;
2492:       y += 7;
2493:     }
2494:   }
2495:   PetscCall(VecRestoreArrayRead(xx, &x));
2496:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2497:   PetscCall(PetscLogFlops(98.0 * a->nz));
2498:   PetscFunctionReturn(PETSC_SUCCESS);
2499: }

2501: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2502: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2503: {
2504:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2505:   PetscScalar       *z = NULL, *work, *workt, *zarray;
2506:   const PetscScalar *x, *xb;
2507:   const MatScalar   *v;
2508:   PetscInt           mbs, i, j, n;
2509:   PetscInt           k;
2510:   PetscBool          usecprow = a->compressedrow.use;
2511:   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;

2513:   __m256d a0, a1, a2, a3, a4, a5;
2514:   __m256d w0, w1, w2, w3;
2515:   __m256d z0, z1, z2;
2516:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);

2518:   PetscFunctionBegin;
2519:   PetscCall(VecCopy(yy, zz));
2520:   PetscCall(VecGetArrayRead(xx, &x));
2521:   PetscCall(VecGetArray(zz, &zarray));

2523:   idx = a->j;
2524:   v   = a->a;
2525:   if (usecprow) {
2526:     mbs  = a->compressedrow.nrows;
2527:     ii   = a->compressedrow.i;
2528:     ridx = a->compressedrow.rindex;
2529:   } else {
2530:     mbs = a->mbs;
2531:     ii  = a->i;
2532:     z   = zarray;
2533:   }

2535:   if (!a->mult_work) {
2536:     k = PetscMax(A->rmap->n, A->cmap->n);
2537:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2538:   }

2540:   work = a->mult_work;
2541:   for (i = 0; i < mbs; i++) {
2542:     n = ii[1] - ii[0];
2543:     ii++;
2544:     workt = work;
2545:     for (j = 0; j < n; j++) {
2546:       xb = x + bs * (*idx++);
2547:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2548:       workt += bs;
2549:     }
2550:     if (usecprow) z = zarray + bs * ridx[i];

2552:     z0 = _mm256_loadu_pd(&z[0]);
2553:     z1 = _mm256_loadu_pd(&z[4]);
2554:     z2 = _mm256_set1_pd(z[8]);

2556:     for (j = 0; j < n; j++) {
2557:       /* first column of a */
2558:       w0 = _mm256_set1_pd(work[j * 9]);
2559:       a0 = _mm256_loadu_pd(&v[j * 81]);
2560:       z0 = _mm256_fmadd_pd(a0, w0, z0);
2561:       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2562:       z1 = _mm256_fmadd_pd(a1, w0, z1);
2563:       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2564:       z2 = _mm256_fmadd_pd(a2, w0, z2);

2566:       /* second column of a */
2567:       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2568:       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2569:       z0 = _mm256_fmadd_pd(a0, w1, z0);
2570:       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2571:       z1 = _mm256_fmadd_pd(a1, w1, z1);
2572:       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2573:       z2 = _mm256_fmadd_pd(a2, w1, z2);

2575:       /* third column of a */
2576:       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2577:       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2578:       z0 = _mm256_fmadd_pd(a3, w2, z0);
2579:       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2580:       z1 = _mm256_fmadd_pd(a4, w2, z1);
2581:       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2582:       z2 = _mm256_fmadd_pd(a5, w2, z2);

2584:       /* fourth column of a */
2585:       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2586:       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2587:       z0 = _mm256_fmadd_pd(a0, w3, z0);
2588:       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2589:       z1 = _mm256_fmadd_pd(a1, w3, z1);
2590:       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2591:       z2 = _mm256_fmadd_pd(a2, w3, z2);

2593:       /* fifth column of a */
2594:       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2595:       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2596:       z0 = _mm256_fmadd_pd(a3, w0, z0);
2597:       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2598:       z1 = _mm256_fmadd_pd(a4, w0, z1);
2599:       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2600:       z2 = _mm256_fmadd_pd(a5, w0, z2);

2602:       /* sixth column of a */
2603:       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2604:       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2605:       z0 = _mm256_fmadd_pd(a0, w1, z0);
2606:       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2607:       z1 = _mm256_fmadd_pd(a1, w1, z1);
2608:       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2609:       z2 = _mm256_fmadd_pd(a2, w1, z2);

2611:       /* seventh column of a */
2612:       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2613:       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2614:       z0 = _mm256_fmadd_pd(a0, w2, z0);
2615:       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2616:       z1 = _mm256_fmadd_pd(a1, w2, z1);
2617:       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2618:       z2 = _mm256_fmadd_pd(a2, w2, z2);

2620:       /* eighth column of a */
2621:       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2622:       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2623:       z0 = _mm256_fmadd_pd(a3, w3, z0);
2624:       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2625:       z1 = _mm256_fmadd_pd(a4, w3, z1);
2626:       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2627:       z2 = _mm256_fmadd_pd(a5, w3, z2);

2629:       /* ninth column of a */
2630:       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2631:       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2632:       z0 = _mm256_fmadd_pd(a0, w0, z0);
2633:       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2634:       z1 = _mm256_fmadd_pd(a1, w0, z1);
2635:       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2636:       z2 = _mm256_fmadd_pd(a2, w0, z2);
2637:     }

2639:     _mm256_storeu_pd(&z[0], z0);
2640:     _mm256_storeu_pd(&z[4], z1);
2641:     _mm256_maskstore_pd(&z[8], mask1, z2);

2643:     v += n * bs2;
2644:     if (!usecprow) z += bs;
2645:   }
2646:   PetscCall(VecRestoreArrayRead(xx, &x));
2647:   PetscCall(VecRestoreArray(zz, &zarray));
2648:   PetscCall(PetscLogFlops(162.0 * a->nz));
2649:   PetscFunctionReturn(PETSC_SUCCESS);
2650: }
2651: #endif

2653: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2654: {
2655:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2656:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2657:   const PetscScalar *x, *xb;
2658:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2659:   const MatScalar   *v;
2660:   PetscInt           mbs = a->mbs, i, j, n;
2661:   const PetscInt    *idx, *ii, *ridx = NULL;
2662:   PetscBool          usecprow = a->compressedrow.use;

2664:   PetscFunctionBegin;
2665:   PetscCall(VecGetArrayRead(xx, &x));
2666:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2668:   idx = a->j;
2669:   v   = a->a;
2670:   if (usecprow) {
2671:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2672:     mbs  = a->compressedrow.nrows;
2673:     ii   = a->compressedrow.i;
2674:     ridx = a->compressedrow.rindex;
2675:   } else {
2676:     ii = a->i;
2677:     y  = yarray;
2678:     z  = zarray;
2679:   }

2681:   for (i = 0; i < mbs; i++) {
2682:     n = ii[1] - ii[0];
2683:     ii++;
2684:     if (usecprow) {
2685:       z = zarray + 11 * ridx[i];
2686:       y = yarray + 11 * ridx[i];
2687:     }
2688:     sum1  = y[0];
2689:     sum2  = y[1];
2690:     sum3  = y[2];
2691:     sum4  = y[3];
2692:     sum5  = y[4];
2693:     sum6  = y[5];
2694:     sum7  = y[6];
2695:     sum8  = y[7];
2696:     sum9  = y[8];
2697:     sum10 = y[9];
2698:     sum11 = y[10];
2699:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2700:     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2701:     for (j = 0; j < n; j++) {
2702:       xb  = x + 11 * (*idx++);
2703:       x1  = xb[0];
2704:       x2  = xb[1];
2705:       x3  = xb[2];
2706:       x4  = xb[3];
2707:       x5  = xb[4];
2708:       x6  = xb[5];
2709:       x7  = xb[6];
2710:       x8  = xb[7];
2711:       x9  = xb[8];
2712:       x10 = xb[9];
2713:       x11 = xb[10];
2714:       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2715:       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2716:       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2717:       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2718:       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2719:       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2720:       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2721:       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2722:       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2723:       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2724:       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2725:       v += 121;
2726:     }
2727:     z[0]  = sum1;
2728:     z[1]  = sum2;
2729:     z[2]  = sum3;
2730:     z[3]  = sum4;
2731:     z[4]  = sum5;
2732:     z[5]  = sum6;
2733:     z[6]  = sum7;
2734:     z[7]  = sum8;
2735:     z[8]  = sum9;
2736:     z[9]  = sum10;
2737:     z[10] = sum11;
2738:     if (!usecprow) {
2739:       z += 11;
2740:       y += 11;
2741:     }
2742:   }
2743:   PetscCall(VecRestoreArrayRead(xx, &x));
2744:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2745:   PetscCall(PetscLogFlops(242.0 * a->nz));
2746:   PetscFunctionReturn(PETSC_SUCCESS);
2747: }

2749: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2750: {
2751:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2752:   PetscScalar       *z = NULL, *work, *workt, *zarray;
2753:   const PetscScalar *x, *xb;
2754:   const MatScalar   *v;
2755:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2756:   PetscInt           ncols, k;
2757:   const PetscInt    *ridx     = NULL, *idx, *ii;
2758:   PetscBool          usecprow = a->compressedrow.use;

2760:   PetscFunctionBegin;
2761:   PetscCall(VecCopy(yy, zz));
2762:   PetscCall(VecGetArrayRead(xx, &x));
2763:   PetscCall(VecGetArray(zz, &zarray));

2765:   idx = a->j;
2766:   v   = a->a;
2767:   if (usecprow) {
2768:     mbs  = a->compressedrow.nrows;
2769:     ii   = a->compressedrow.i;
2770:     ridx = a->compressedrow.rindex;
2771:   } else {
2772:     mbs = a->mbs;
2773:     ii  = a->i;
2774:     z   = zarray;
2775:   }

2777:   if (!a->mult_work) {
2778:     k = PetscMax(A->rmap->n, A->cmap->n);
2779:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2780:   }
2781:   work = a->mult_work;
2782:   for (i = 0; i < mbs; i++) {
2783:     n = ii[1] - ii[0];
2784:     ii++;
2785:     ncols = n * bs;
2786:     workt = work;
2787:     for (j = 0; j < n; j++) {
2788:       xb = x + bs * (*idx++);
2789:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2790:       workt += bs;
2791:     }
2792:     if (usecprow) z = zarray + bs * ridx[i];
2793:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2794:     v += n * bs2;
2795:     if (!usecprow) z += bs;
2796:   }
2797:   PetscCall(VecRestoreArrayRead(xx, &x));
2798:   PetscCall(VecRestoreArray(zz, &zarray));
2799:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2800:   PetscFunctionReturn(PETSC_SUCCESS);
2801: }

2803: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2804: {
2805:   PetscScalar zero = 0.0;

2807:   PetscFunctionBegin;
2808:   PetscCall(VecSet(zz, zero));
2809:   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2810:   PetscFunctionReturn(PETSC_SUCCESS);
2811: }

2813: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2814: {
2815:   PetscScalar zero = 0.0;

2817:   PetscFunctionBegin;
2818:   PetscCall(VecSet(zz, zero));
2819:   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2820:   PetscFunctionReturn(PETSC_SUCCESS);
2821: }

2823: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2824: {
2825:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2826:   PetscScalar       *z, x1, x2, x3, x4, x5;
2827:   const PetscScalar *x, *xb = NULL;
2828:   const MatScalar   *v;
2829:   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2830:   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2831:   Mat_CompressedRow  cprow    = a->compressedrow;
2832:   PetscBool          usecprow = cprow.use;

2834:   PetscFunctionBegin;
2835:   if (yy != zz) PetscCall(VecCopy(yy, zz));
2836:   PetscCall(VecGetArrayRead(xx, &x));
2837:   PetscCall(VecGetArray(zz, &z));

2839:   idx = a->j;
2840:   v   = a->a;
2841:   if (usecprow) {
2842:     mbs  = cprow.nrows;
2843:     ii   = cprow.i;
2844:     ridx = cprow.rindex;
2845:   } else {
2846:     mbs = a->mbs;
2847:     ii  = a->i;
2848:     xb  = x;
2849:   }

2851:   switch (bs) {
2852:   case 1:
2853:     for (i = 0; i < mbs; i++) {
2854:       if (usecprow) xb = x + ridx[i];
2855:       x1 = xb[0];
2856:       ib = idx + ii[0];
2857:       n  = ii[1] - ii[0];
2858:       ii++;
2859:       for (j = 0; j < n; j++) {
2860:         rval = ib[j];
2861:         z[rval] += PetscConj(*v) * x1;
2862:         v++;
2863:       }
2864:       if (!usecprow) xb++;
2865:     }
2866:     break;
2867:   case 2:
2868:     for (i = 0; i < mbs; i++) {
2869:       if (usecprow) xb = x + 2 * ridx[i];
2870:       x1 = xb[0];
2871:       x2 = xb[1];
2872:       ib = idx + ii[0];
2873:       n  = ii[1] - ii[0];
2874:       ii++;
2875:       for (j = 0; j < n; j++) {
2876:         rval = ib[j] * 2;
2877:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2878:         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2879:         v += 4;
2880:       }
2881:       if (!usecprow) xb += 2;
2882:     }
2883:     break;
2884:   case 3:
2885:     for (i = 0; i < mbs; i++) {
2886:       if (usecprow) xb = x + 3 * ridx[i];
2887:       x1 = xb[0];
2888:       x2 = xb[1];
2889:       x3 = xb[2];
2890:       ib = idx + ii[0];
2891:       n  = ii[1] - ii[0];
2892:       ii++;
2893:       for (j = 0; j < n; j++) {
2894:         rval = ib[j] * 3;
2895:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2896:         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2897:         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2898:         v += 9;
2899:       }
2900:       if (!usecprow) xb += 3;
2901:     }
2902:     break;
2903:   case 4:
2904:     for (i = 0; i < mbs; i++) {
2905:       if (usecprow) xb = x + 4 * ridx[i];
2906:       x1 = xb[0];
2907:       x2 = xb[1];
2908:       x3 = xb[2];
2909:       x4 = xb[3];
2910:       ib = idx + ii[0];
2911:       n  = ii[1] - ii[0];
2912:       ii++;
2913:       for (j = 0; j < n; j++) {
2914:         rval = ib[j] * 4;
2915:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2916:         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2917:         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2918:         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2919:         v += 16;
2920:       }
2921:       if (!usecprow) xb += 4;
2922:     }
2923:     break;
2924:   case 5:
2925:     for (i = 0; i < mbs; i++) {
2926:       if (usecprow) xb = x + 5 * ridx[i];
2927:       x1 = xb[0];
2928:       x2 = xb[1];
2929:       x3 = xb[2];
2930:       x4 = xb[3];
2931:       x5 = xb[4];
2932:       ib = idx + ii[0];
2933:       n  = ii[1] - ii[0];
2934:       ii++;
2935:       for (j = 0; j < n; j++) {
2936:         rval = ib[j] * 5;
2937:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2938:         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2939:         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2940:         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2941:         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2942:         v += 25;
2943:       }
2944:       if (!usecprow) xb += 5;
2945:     }
2946:     break;
2947:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2948:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2949: #if 0
2950:     {
2951:       PetscInt          ncols,k,bs2=a->bs2;
2952:       PetscScalar       *work,*workt,zb;
2953:       const PetscScalar *xtmp;
2954:       if (!a->mult_work) {
2955:         k    = PetscMax(A->rmap->n,A->cmap->n);
2956:         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2957:       }
2958:       work = a->mult_work;
2959:       xtmp = x;
2960:       for (i=0; i<mbs; i++) {
2961:         n     = ii[1] - ii[0]; ii++;
2962:         ncols = n*bs;
2963:         PetscCall(PetscArrayzero(work,ncols));
2964:         if (usecprow) xtmp = x + bs*ridx[i];
2965:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2966:         v += n*bs2;
2967:         if (!usecprow) xtmp += bs;
2968:         workt = work;
2969:         for (j=0; j<n; j++) {
2970:           zb = z + bs*(*idx++);
2971:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2972:           workt += bs;
2973:         }
2974:       }
2975:     }
2976: #endif
2977:   }
2978:   PetscCall(VecRestoreArrayRead(xx, &x));
2979:   PetscCall(VecRestoreArray(zz, &z));
2980:   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2981:   PetscFunctionReturn(PETSC_SUCCESS);
2982: }

2984: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2985: {
2986:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2987:   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
2988:   const PetscScalar *x, *xb = NULL;
2989:   const MatScalar   *v;
2990:   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2991:   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2992:   Mat_CompressedRow  cprow    = a->compressedrow;
2993:   PetscBool          usecprow = cprow.use;

2995:   PetscFunctionBegin;
2996:   if (yy != zz) PetscCall(VecCopy(yy, zz));
2997:   PetscCall(VecGetArrayRead(xx, &x));
2998:   PetscCall(VecGetArray(zz, &z));

3000:   idx = a->j;
3001:   v   = a->a;
3002:   if (usecprow) {
3003:     mbs  = cprow.nrows;
3004:     ii   = cprow.i;
3005:     ridx = cprow.rindex;
3006:   } else {
3007:     mbs = a->mbs;
3008:     ii  = a->i;
3009:     xb  = x;
3010:   }

3012:   switch (bs) {
3013:   case 1:
3014:     for (i = 0; i < mbs; i++) {
3015:       if (usecprow) xb = x + ridx[i];
3016:       x1 = xb[0];
3017:       ib = idx + ii[0];
3018:       n  = ii[1] - ii[0];
3019:       ii++;
3020:       for (j = 0; j < n; j++) {
3021:         rval = ib[j];
3022:         z[rval] += *v * x1;
3023:         v++;
3024:       }
3025:       if (!usecprow) xb++;
3026:     }
3027:     break;
3028:   case 2:
3029:     for (i = 0; i < mbs; i++) {
3030:       if (usecprow) xb = x + 2 * ridx[i];
3031:       x1 = xb[0];
3032:       x2 = xb[1];
3033:       ib = idx + ii[0];
3034:       n  = ii[1] - ii[0];
3035:       ii++;
3036:       for (j = 0; j < n; j++) {
3037:         rval = ib[j] * 2;
3038:         z[rval++] += v[0] * x1 + v[1] * x2;
3039:         z[rval++] += v[2] * x1 + v[3] * x2;
3040:         v += 4;
3041:       }
3042:       if (!usecprow) xb += 2;
3043:     }
3044:     break;
3045:   case 3:
3046:     for (i = 0; i < mbs; i++) {
3047:       if (usecprow) xb = x + 3 * ridx[i];
3048:       x1 = xb[0];
3049:       x2 = xb[1];
3050:       x3 = xb[2];
3051:       ib = idx + ii[0];
3052:       n  = ii[1] - ii[0];
3053:       ii++;
3054:       for (j = 0; j < n; j++) {
3055:         rval = ib[j] * 3;
3056:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3057:         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3058:         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3059:         v += 9;
3060:       }
3061:       if (!usecprow) xb += 3;
3062:     }
3063:     break;
3064:   case 4:
3065:     for (i = 0; i < mbs; i++) {
3066:       if (usecprow) xb = x + 4 * ridx[i];
3067:       x1 = xb[0];
3068:       x2 = xb[1];
3069:       x3 = xb[2];
3070:       x4 = xb[3];
3071:       ib = idx + ii[0];
3072:       n  = ii[1] - ii[0];
3073:       ii++;
3074:       for (j = 0; j < n; j++) {
3075:         rval = ib[j] * 4;
3076:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3077:         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3078:         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3079:         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3080:         v += 16;
3081:       }
3082:       if (!usecprow) xb += 4;
3083:     }
3084:     break;
3085:   case 5:
3086:     for (i = 0; i < mbs; i++) {
3087:       if (usecprow) xb = x + 5 * ridx[i];
3088:       x1 = xb[0];
3089:       x2 = xb[1];
3090:       x3 = xb[2];
3091:       x4 = xb[3];
3092:       x5 = xb[4];
3093:       ib = idx + ii[0];
3094:       n  = ii[1] - ii[0];
3095:       ii++;
3096:       for (j = 0; j < n; j++) {
3097:         rval = ib[j] * 5;
3098:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3099:         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3100:         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3101:         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3102:         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3103:         v += 25;
3104:       }
3105:       if (!usecprow) xb += 5;
3106:     }
3107:     break;
3108:   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3109:     PetscInt           ncols, k;
3110:     PetscScalar       *work, *workt;
3111:     const PetscScalar *xtmp;
3112:     if (!a->mult_work) {
3113:       k = PetscMax(A->rmap->n, A->cmap->n);
3114:       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3115:     }
3116:     work = a->mult_work;
3117:     xtmp = x;
3118:     for (i = 0; i < mbs; i++) {
3119:       n = ii[1] - ii[0];
3120:       ii++;
3121:       ncols = n * bs;
3122:       PetscCall(PetscArrayzero(work, ncols));
3123:       if (usecprow) xtmp = x + bs * ridx[i];
3124:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3125:       v += n * bs2;
3126:       if (!usecprow) xtmp += bs;
3127:       workt = work;
3128:       for (j = 0; j < n; j++) {
3129:         zb = z + bs * (*idx++);
3130:         for (k = 0; k < bs; k++) zb[k] += workt[k];
3131:         workt += bs;
3132:       }
3133:     }
3134:   }
3135:   }
3136:   PetscCall(VecRestoreArrayRead(xx, &x));
3137:   PetscCall(VecRestoreArray(zz, &z));
3138:   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3139:   PetscFunctionReturn(PETSC_SUCCESS);
3140: }

3142: PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3143: {
3144:   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3145:   PetscInt     totalnz = a->bs2 * a->nz;
3146:   PetscScalar  oalpha  = alpha;
3147:   PetscBLASInt one     = 1, tnz;

3149:   PetscFunctionBegin;
3150:   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3151:   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3152:   PetscCall(PetscLogFlops(totalnz));
3153:   PetscFunctionReturn(PETSC_SUCCESS);
3154: }

3156: PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3157: {
3158:   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3159:   MatScalar   *v   = a->a;
3160:   PetscReal    sum = 0.0;
3161:   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;

3163:   PetscFunctionBegin;
3164:   if (type == NORM_FROBENIUS) {
3165: #if defined(PETSC_USE_REAL___FP16)
3166:     PetscBLASInt one = 1, cnt = bs2 * nz;
3167:     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3168: #else
3169:     for (i = 0; i < bs2 * nz; i++) {
3170:       sum += PetscRealPart(PetscConj(*v) * (*v));
3171:       v++;
3172:     }
3173: #endif
3174:     *norm = PetscSqrtReal(sum);
3175:     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3176:   } else if (type == NORM_1) { /* maximum column sum */
3177:     PetscReal *tmp;
3178:     PetscInt  *bcol = a->j;
3179:     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3180:     for (i = 0; i < nz; i++) {
3181:       for (j = 0; j < bs; j++) {
3182:         k1 = bs * (*bcol) + j; /* column index */
3183:         for (k = 0; k < bs; k++) {
3184:           tmp[k1] += PetscAbsScalar(*v);
3185:           v++;
3186:         }
3187:       }
3188:       bcol++;
3189:     }
3190:     *norm = 0.0;
3191:     for (j = 0; j < A->cmap->n; j++) {
3192:       if (tmp[j] > *norm) *norm = tmp[j];
3193:     }
3194:     PetscCall(PetscFree(tmp));
3195:     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3196:   } else if (type == NORM_INFINITY) { /* maximum row sum */
3197:     *norm = 0.0;
3198:     for (k = 0; k < bs; k++) {
3199:       for (j = 0; j < a->mbs; j++) {
3200:         v   = a->a + bs2 * a->i[j] + k;
3201:         sum = 0.0;
3202:         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3203:           for (k1 = 0; k1 < bs; k1++) {
3204:             sum += PetscAbsScalar(*v);
3205:             v += bs;
3206:           }
3207:         }
3208:         if (sum > *norm) *norm = sum;
3209:       }
3210:     }
3211:     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3212:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3213:   PetscFunctionReturn(PETSC_SUCCESS);
3214: }

3216: PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3217: {
3218:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;

3220:   PetscFunctionBegin;
3221:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3222:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3223:     *flg = PETSC_FALSE;
3224:     PetscFunctionReturn(PETSC_SUCCESS);
3225:   }

3227:   /* if the a->i are the same */
3228:   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3229:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

3231:   /* if a->j are the same */
3232:   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3233:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

3235:   /* if a->a are the same */
3236:   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3237:   PetscFunctionReturn(PETSC_SUCCESS);
3238: }

3240: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3241: {
3242:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3243:   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3244:   PetscScalar *x, zero = 0.0;
3245:   MatScalar   *aa, *aa_j;

3247:   PetscFunctionBegin;
3248:   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3249:   bs   = A->rmap->bs;
3250:   aa   = a->a;
3251:   ai   = a->i;
3252:   aj   = a->j;
3253:   ambs = a->mbs;
3254:   bs2  = a->bs2;

3256:   PetscCall(VecSet(v, zero));
3257:   PetscCall(VecGetArray(v, &x));
3258:   PetscCall(VecGetLocalSize(v, &n));
3259:   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3260:   for (i = 0; i < ambs; i++) {
3261:     for (j = ai[i]; j < ai[i + 1]; j++) {
3262:       if (aj[j] == i) {
3263:         row  = i * bs;
3264:         aa_j = aa + j * bs2;
3265:         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3266:         break;
3267:       }
3268:     }
3269:   }
3270:   PetscCall(VecRestoreArray(v, &x));
3271:   PetscFunctionReturn(PETSC_SUCCESS);
3272: }

3274: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3275: {
3276:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3277:   const PetscScalar *l, *r, *li, *ri;
3278:   PetscScalar        x;
3279:   MatScalar         *aa, *v;
3280:   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3281:   const PetscInt    *ai, *aj;

3283:   PetscFunctionBegin;
3284:   ai  = a->i;
3285:   aj  = a->j;
3286:   aa  = a->a;
3287:   m   = A->rmap->n;
3288:   n   = A->cmap->n;
3289:   bs  = A->rmap->bs;
3290:   mbs = a->mbs;
3291:   bs2 = a->bs2;
3292:   if (ll) {
3293:     PetscCall(VecGetArrayRead(ll, &l));
3294:     PetscCall(VecGetLocalSize(ll, &lm));
3295:     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3296:     for (i = 0; i < mbs; i++) { /* for each block row */
3297:       M  = ai[i + 1] - ai[i];
3298:       li = l + i * bs;
3299:       v  = aa + bs2 * ai[i];
3300:       for (j = 0; j < M; j++) { /* for each block */
3301:         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3302:       }
3303:     }
3304:     PetscCall(VecRestoreArrayRead(ll, &l));
3305:     PetscCall(PetscLogFlops(a->nz));
3306:   }

3308:   if (rr) {
3309:     PetscCall(VecGetArrayRead(rr, &r));
3310:     PetscCall(VecGetLocalSize(rr, &rn));
3311:     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3312:     for (i = 0; i < mbs; i++) { /* for each block row */
3313:       iai = ai[i];
3314:       M   = ai[i + 1] - iai;
3315:       v   = aa + bs2 * iai;
3316:       for (j = 0; j < M; j++) { /* for each block */
3317:         ri = r + bs * aj[iai + j];
3318:         for (k = 0; k < bs; k++) {
3319:           x = ri[k];
3320:           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3321:           v += bs;
3322:         }
3323:       }
3324:     }
3325:     PetscCall(VecRestoreArrayRead(rr, &r));
3326:     PetscCall(PetscLogFlops(a->nz));
3327:   }
3328:   PetscFunctionReturn(PETSC_SUCCESS);
3329: }

3331: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3332: {
3333:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

3335:   PetscFunctionBegin;
3336:   info->block_size   = a->bs2;
3337:   info->nz_allocated = a->bs2 * a->maxnz;
3338:   info->nz_used      = a->bs2 * a->nz;
3339:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3340:   info->assemblies   = A->num_ass;
3341:   info->mallocs      = A->info.mallocs;
3342:   info->memory       = 0; /* REVIEW ME */
3343:   if (A->factortype) {
3344:     info->fill_ratio_given  = A->info.fill_ratio_given;
3345:     info->fill_ratio_needed = A->info.fill_ratio_needed;
3346:     info->factor_mallocs    = A->info.factor_mallocs;
3347:   } else {
3348:     info->fill_ratio_given  = 0;
3349:     info->fill_ratio_needed = 0;
3350:     info->factor_mallocs    = 0;
3351:   }
3352:   PetscFunctionReturn(PETSC_SUCCESS);
3353: }

3355: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3356: {
3357:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

3359:   PetscFunctionBegin;
3360:   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3361:   PetscFunctionReturn(PETSC_SUCCESS);
3362: }

3364: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3365: {
3366:   PetscFunctionBegin;
3367:   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3368:   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3369:   PetscFunctionReturn(PETSC_SUCCESS);
3370: }

3372: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3373: {
3374:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3375:   PetscScalar       *z = NULL, sum1;
3376:   const PetscScalar *xb;
3377:   PetscScalar        x1;
3378:   const MatScalar   *v, *vv;
3379:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3380:   PetscBool          usecprow = a->compressedrow.use;

3382:   PetscFunctionBegin;
3383:   idx = a->j;
3384:   v   = a->a;
3385:   if (usecprow) {
3386:     mbs  = a->compressedrow.nrows;
3387:     ii   = a->compressedrow.i;
3388:     ridx = a->compressedrow.rindex;
3389:   } else {
3390:     mbs = a->mbs;
3391:     ii  = a->i;
3392:     z   = c;
3393:   }

3395:   for (i = 0; i < mbs; i++) {
3396:     n = ii[1] - ii[0];
3397:     ii++;
3398:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3399:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3400:     if (usecprow) z = c + ridx[i];
3401:     jj = idx;
3402:     vv = v;
3403:     for (k = 0; k < cn; k++) {
3404:       idx  = jj;
3405:       v    = vv;
3406:       sum1 = 0.0;
3407:       for (j = 0; j < n; j++) {
3408:         xb = b + (*idx++);
3409:         x1 = xb[0 + k * bm];
3410:         sum1 += v[0] * x1;
3411:         v += 1;
3412:       }
3413:       z[0 + k * cm] = sum1;
3414:     }
3415:     if (!usecprow) z += 1;
3416:   }
3417:   PetscFunctionReturn(PETSC_SUCCESS);
3418: }

3420: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3421: {
3422:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3423:   PetscScalar       *z = NULL, sum1, sum2;
3424:   const PetscScalar *xb;
3425:   PetscScalar        x1, x2;
3426:   const MatScalar   *v, *vv;
3427:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3428:   PetscBool          usecprow = a->compressedrow.use;

3430:   PetscFunctionBegin;
3431:   idx = a->j;
3432:   v   = a->a;
3433:   if (usecprow) {
3434:     mbs  = a->compressedrow.nrows;
3435:     ii   = a->compressedrow.i;
3436:     ridx = a->compressedrow.rindex;
3437:   } else {
3438:     mbs = a->mbs;
3439:     ii  = a->i;
3440:     z   = c;
3441:   }

3443:   for (i = 0; i < mbs; i++) {
3444:     n = ii[1] - ii[0];
3445:     ii++;
3446:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3447:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3448:     if (usecprow) z = c + 2 * ridx[i];
3449:     jj = idx;
3450:     vv = v;
3451:     for (k = 0; k < cn; k++) {
3452:       idx  = jj;
3453:       v    = vv;
3454:       sum1 = 0.0;
3455:       sum2 = 0.0;
3456:       for (j = 0; j < n; j++) {
3457:         xb = b + 2 * (*idx++);
3458:         x1 = xb[0 + k * bm];
3459:         x2 = xb[1 + k * bm];
3460:         sum1 += v[0] * x1 + v[2] * x2;
3461:         sum2 += v[1] * x1 + v[3] * x2;
3462:         v += 4;
3463:       }
3464:       z[0 + k * cm] = sum1;
3465:       z[1 + k * cm] = sum2;
3466:     }
3467:     if (!usecprow) z += 2;
3468:   }
3469:   PetscFunctionReturn(PETSC_SUCCESS);
3470: }

3472: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3473: {
3474:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3475:   PetscScalar       *z = NULL, sum1, sum2, sum3;
3476:   const PetscScalar *xb;
3477:   PetscScalar        x1, x2, x3;
3478:   const MatScalar   *v, *vv;
3479:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3480:   PetscBool          usecprow = a->compressedrow.use;

3482:   PetscFunctionBegin;
3483:   idx = a->j;
3484:   v   = a->a;
3485:   if (usecprow) {
3486:     mbs  = a->compressedrow.nrows;
3487:     ii   = a->compressedrow.i;
3488:     ridx = a->compressedrow.rindex;
3489:   } else {
3490:     mbs = a->mbs;
3491:     ii  = a->i;
3492:     z   = c;
3493:   }

3495:   for (i = 0; i < mbs; i++) {
3496:     n = ii[1] - ii[0];
3497:     ii++;
3498:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3499:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3500:     if (usecprow) z = c + 3 * ridx[i];
3501:     jj = idx;
3502:     vv = v;
3503:     for (k = 0; k < cn; k++) {
3504:       idx  = jj;
3505:       v    = vv;
3506:       sum1 = 0.0;
3507:       sum2 = 0.0;
3508:       sum3 = 0.0;
3509:       for (j = 0; j < n; j++) {
3510:         xb = b + 3 * (*idx++);
3511:         x1 = xb[0 + k * bm];
3512:         x2 = xb[1 + k * bm];
3513:         x3 = xb[2 + k * bm];
3514:         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3515:         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3516:         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3517:         v += 9;
3518:       }
3519:       z[0 + k * cm] = sum1;
3520:       z[1 + k * cm] = sum2;
3521:       z[2 + k * cm] = sum3;
3522:     }
3523:     if (!usecprow) z += 3;
3524:   }
3525:   PetscFunctionReturn(PETSC_SUCCESS);
3526: }

3528: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3529: {
3530:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3531:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3532:   const PetscScalar *xb;
3533:   PetscScalar        x1, x2, x3, x4;
3534:   const MatScalar   *v, *vv;
3535:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3536:   PetscBool          usecprow = a->compressedrow.use;

3538:   PetscFunctionBegin;
3539:   idx = a->j;
3540:   v   = a->a;
3541:   if (usecprow) {
3542:     mbs  = a->compressedrow.nrows;
3543:     ii   = a->compressedrow.i;
3544:     ridx = a->compressedrow.rindex;
3545:   } else {
3546:     mbs = a->mbs;
3547:     ii  = a->i;
3548:     z   = c;
3549:   }

3551:   for (i = 0; i < mbs; i++) {
3552:     n = ii[1] - ii[0];
3553:     ii++;
3554:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3555:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3556:     if (usecprow) z = c + 4 * ridx[i];
3557:     jj = idx;
3558:     vv = v;
3559:     for (k = 0; k < cn; k++) {
3560:       idx  = jj;
3561:       v    = vv;
3562:       sum1 = 0.0;
3563:       sum2 = 0.0;
3564:       sum3 = 0.0;
3565:       sum4 = 0.0;
3566:       for (j = 0; j < n; j++) {
3567:         xb = b + 4 * (*idx++);
3568:         x1 = xb[0 + k * bm];
3569:         x2 = xb[1 + k * bm];
3570:         x3 = xb[2 + k * bm];
3571:         x4 = xb[3 + k * bm];
3572:         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3573:         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3574:         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3575:         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3576:         v += 16;
3577:       }
3578:       z[0 + k * cm] = sum1;
3579:       z[1 + k * cm] = sum2;
3580:       z[2 + k * cm] = sum3;
3581:       z[3 + k * cm] = sum4;
3582:     }
3583:     if (!usecprow) z += 4;
3584:   }
3585:   PetscFunctionReturn(PETSC_SUCCESS);
3586: }

3588: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3589: {
3590:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3591:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3592:   const PetscScalar *xb;
3593:   PetscScalar        x1, x2, x3, x4, x5;
3594:   const MatScalar   *v, *vv;
3595:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3596:   PetscBool          usecprow = a->compressedrow.use;

3598:   PetscFunctionBegin;
3599:   idx = a->j;
3600:   v   = a->a;
3601:   if (usecprow) {
3602:     mbs  = a->compressedrow.nrows;
3603:     ii   = a->compressedrow.i;
3604:     ridx = a->compressedrow.rindex;
3605:   } else {
3606:     mbs = a->mbs;
3607:     ii  = a->i;
3608:     z   = c;
3609:   }

3611:   for (i = 0; i < mbs; i++) {
3612:     n = ii[1] - ii[0];
3613:     ii++;
3614:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3615:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3616:     if (usecprow) z = c + 5 * ridx[i];
3617:     jj = idx;
3618:     vv = v;
3619:     for (k = 0; k < cn; k++) {
3620:       idx  = jj;
3621:       v    = vv;
3622:       sum1 = 0.0;
3623:       sum2 = 0.0;
3624:       sum3 = 0.0;
3625:       sum4 = 0.0;
3626:       sum5 = 0.0;
3627:       for (j = 0; j < n; j++) {
3628:         xb = b + 5 * (*idx++);
3629:         x1 = xb[0 + k * bm];
3630:         x2 = xb[1 + k * bm];
3631:         x3 = xb[2 + k * bm];
3632:         x4 = xb[3 + k * bm];
3633:         x5 = xb[4 + k * bm];
3634:         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3635:         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3636:         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3637:         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3638:         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3639:         v += 25;
3640:       }
3641:       z[0 + k * cm] = sum1;
3642:       z[1 + k * cm] = sum2;
3643:       z[2 + k * cm] = sum3;
3644:       z[3 + k * cm] = sum4;
3645:       z[4 + k * cm] = sum5;
3646:     }
3647:     if (!usecprow) z += 5;
3648:   }
3649:   PetscFunctionReturn(PETSC_SUCCESS);
3650: }

3652: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3653: {
3654:   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3655:   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3656:   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3657:   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3658:   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3659:   PetscBLASInt     bbs, bcn, bbm, bcm;
3660:   PetscScalar     *z = NULL;
3661:   PetscScalar     *c, *b;
3662:   const MatScalar *v;
3663:   const PetscInt  *idx, *ii, *ridx = NULL;
3664:   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3665:   PetscBool        usecprow = a->compressedrow.use;

3667:   PetscFunctionBegin;
3668:   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3669:   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3670:   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3671:   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3672:   b = bd->v;
3673:   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3674:   PetscCall(MatDenseGetArray(C, &c));
3675:   switch (bs) {
3676:   case 1:
3677:     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3678:     break;
3679:   case 2:
3680:     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3681:     break;
3682:   case 3:
3683:     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3684:     break;
3685:   case 4:
3686:     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3687:     break;
3688:   case 5:
3689:     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3690:     break;
3691:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3692:     PetscCall(PetscBLASIntCast(bs, &bbs));
3693:     PetscCall(PetscBLASIntCast(cn, &bcn));
3694:     PetscCall(PetscBLASIntCast(bm, &bbm));
3695:     PetscCall(PetscBLASIntCast(cm, &bcm));
3696:     idx = a->j;
3697:     v   = a->a;
3698:     if (usecprow) {
3699:       mbs  = a->compressedrow.nrows;
3700:       ii   = a->compressedrow.i;
3701:       ridx = a->compressedrow.rindex;
3702:     } else {
3703:       mbs = a->mbs;
3704:       ii  = a->i;
3705:       z   = c;
3706:     }
3707:     for (i = 0; i < mbs; i++) {
3708:       n = ii[1] - ii[0];
3709:       ii++;
3710:       if (usecprow) z = c + bs * ridx[i];
3711:       if (n) {
3712:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3713:         v += bs2;
3714:       }
3715:       for (j = 1; j < n; j++) {
3716:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3717:         v += bs2;
3718:       }
3719:       if (!usecprow) z += bs;
3720:     }
3721:   }
3722:   PetscCall(MatDenseRestoreArray(C, &c));
3723:   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3724:   PetscFunctionReturn(PETSC_SUCCESS);
3725: }