Actual source code: sbaij2.c


  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <../src/mat/impls/dense/seq/dense.h>
  4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  5: #include <petsc/private/kernels/blockinvert.h>
  6: #include <petscbt.h>
  7: #include <petscblaslapack.h>

  9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
 10: {
 11:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
 12:   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
 13:   const PetscInt *idx;
 14:   PetscBT         table_out, table_in;

 16:   PetscFunctionBegin;
 17:   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
 18:   mbs = a->mbs;
 19:   ai  = a->i;
 20:   aj  = a->j;
 21:   bs  = A->rmap->bs;
 22:   PetscCall(PetscBTCreate(mbs, &table_out));
 23:   PetscCall(PetscMalloc1(mbs + 1, &nidx));
 24:   PetscCall(PetscBTCreate(mbs, &table_in));

 26:   for (i = 0; i < is_max; i++) { /* for each is */
 27:     isz = 0;
 28:     PetscCall(PetscBTMemzero(mbs, table_out));

 30:     /* Extract the indices, assume there can be duplicate entries */
 31:     PetscCall(ISGetIndices(is[i], &idx));
 32:     PetscCall(ISGetLocalSize(is[i], &n));

 34:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 35:     bcol_max = 0;
 36:     for (j = 0; j < n; ++j) {
 37:       brow = idx[j] / bs; /* convert the indices into block indices */
 38:       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
 39:       if (!PetscBTLookupSet(table_out, brow)) {
 40:         nidx[isz++] = brow;
 41:         if (bcol_max < brow) bcol_max = brow;
 42:       }
 43:     }
 44:     PetscCall(ISRestoreIndices(is[i], &idx));
 45:     PetscCall(ISDestroy(&is[i]));

 47:     k = 0;
 48:     for (j = 0; j < ov; j++) { /* for each overlap */
 49:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 50:       PetscCall(PetscBTMemzero(mbs, table_in));
 51:       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));

 53:       n = isz; /* length of the updated is[i] */
 54:       for (brow = 0; brow < mbs; brow++) {
 55:         start = ai[brow];
 56:         end   = ai[brow + 1];
 57:         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 58:           for (l = start; l < end; l++) {
 59:             bcol = aj[l];
 60:             if (!PetscBTLookupSet(table_out, bcol)) {
 61:               nidx[isz++] = bcol;
 62:               if (bcol_max < bcol) bcol_max = bcol;
 63:             }
 64:           }
 65:           k++;
 66:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 67:         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
 68:           for (l = start; l < end; l++) {
 69:             bcol = aj[l];
 70:             if (bcol > bcol_max) break;
 71:             if (PetscBTLookup(table_in, bcol)) {
 72:               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
 73:               break; /* for l = start; l<end ; l++) */
 74:             }
 75:           }
 76:         }
 77:       }
 78:     } /* for each overlap */
 79:     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
 80:   } /* for each is */
 81:   PetscCall(PetscBTDestroy(&table_out));
 82:   PetscCall(PetscFree(nidx));
 83:   PetscCall(PetscBTDestroy(&table_in));
 84:   PetscFunctionReturn(PETSC_SUCCESS);
 85: }

 87: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
 88:         Zero some ops' to avoid invalid use */
 89: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
 90: {
 91:   PetscFunctionBegin;
 92:   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
 93:   Bseq->ops->mult                   = NULL;
 94:   Bseq->ops->multadd                = NULL;
 95:   Bseq->ops->multtranspose          = NULL;
 96:   Bseq->ops->multtransposeadd       = NULL;
 97:   Bseq->ops->lufactor               = NULL;
 98:   Bseq->ops->choleskyfactor         = NULL;
 99:   Bseq->ops->lufactorsymbolic       = NULL;
100:   Bseq->ops->choleskyfactorsymbolic = NULL;
101:   Bseq->ops->getinertia             = NULL;
102:   PetscFunctionReturn(PETSC_SUCCESS);
103: }

105: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
106: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
107: {
108:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
109:   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110:   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111:   const PetscInt *irow, *icol;
112:   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113:   PetscInt       *aj = a->j, *ai = a->i;
114:   MatScalar      *mat_a;
115:   Mat             C;
116:   PetscBool       flag;

118:   PetscFunctionBegin;

120:   PetscCall(ISGetIndices(isrow, &irow));
121:   PetscCall(ISGetIndices(iscol, &icol));
122:   PetscCall(ISGetLocalSize(isrow, &nrows));
123:   PetscCall(ISGetLocalSize(iscol, &ncols));

125:   PetscCall(PetscCalloc1(1 + oldcols, &smap));
126:   ssmap = smap;
127:   PetscCall(PetscMalloc1(1 + nrows, &lens));
128:   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129:   /* determine lens of each row */
130:   for (i = 0; i < nrows; i++) {
131:     kstart  = ai[irow[i]];
132:     kend    = kstart + a->ilen[irow[i]];
133:     lens[i] = 0;
134:     for (k = kstart; k < kend; k++) {
135:       if (ssmap[aj[k]]) lens[i]++;
136:     }
137:   }
138:   /* Create and fill new matrix */
139:   if (scall == MAT_REUSE_MATRIX) {
140:     c = (Mat_SeqSBAIJ *)((*B)->data);

142:     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143:     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144:     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
145:     PetscCall(PetscArrayzero(c->ilen, c->mbs));
146:     C = *B;
147:   } else {
148:     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
149:     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
150:     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
151:     PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
152:   }
153:   c = (Mat_SeqSBAIJ *)(C->data);
154:   for (i = 0; i < nrows; i++) {
155:     row      = irow[i];
156:     kstart   = ai[row];
157:     kend     = kstart + a->ilen[row];
158:     mat_i    = c->i[i];
159:     mat_j    = c->j + mat_i;
160:     mat_a    = c->a + mat_i * bs2;
161:     mat_ilen = c->ilen + i;
162:     for (k = kstart; k < kend; k++) {
163:       if ((tcol = ssmap[a->j[k]])) {
164:         *mat_j++ = tcol - 1;
165:         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
166:         mat_a += bs2;
167:         (*mat_ilen)++;
168:       }
169:     }
170:   }
171:   /* sort */
172:   {
173:     MatScalar *work;

175:     PetscCall(PetscMalloc1(bs2, &work));
176:     for (i = 0; i < nrows; i++) {
177:       PetscInt ilen;
178:       mat_i = c->i[i];
179:       mat_j = c->j + mat_i;
180:       mat_a = c->a + mat_i * bs2;
181:       ilen  = c->ilen[i];
182:       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
183:     }
184:     PetscCall(PetscFree(work));
185:   }

187:   /* Free work space */
188:   PetscCall(ISRestoreIndices(iscol, &icol));
189:   PetscCall(PetscFree(smap));
190:   PetscCall(PetscFree(lens));
191:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
192:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

194:   PetscCall(ISRestoreIndices(isrow, &irow));
195:   *B = C;
196:   PetscFunctionReturn(PETSC_SUCCESS);
197: }

199: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
200: {
201:   IS is1, is2;

203:   PetscFunctionBegin;
204:   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
205:   if (isrow == iscol) {
206:     is2 = is1;
207:     PetscCall(PetscObjectReference((PetscObject)is2));
208:   } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
209:   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
210:   PetscCall(ISDestroy(&is1));
211:   PetscCall(ISDestroy(&is2));

213:   if (isrow != iscol) {
214:     PetscBool isequal;
215:     PetscCall(ISEqual(isrow, iscol, &isequal));
216:     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
217:   }
218:   PetscFunctionReturn(PETSC_SUCCESS);
219: }

221: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
222: {
223:   PetscInt i;

225:   PetscFunctionBegin;
226:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));

228:   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
229:   PetscFunctionReturn(PETSC_SUCCESS);
230: }

232: /* Should check that shapes of vectors and matrices match */
233: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
234: {
235:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
236:   PetscScalar       *z, x1, x2, zero = 0.0;
237:   const PetscScalar *x, *xb;
238:   const MatScalar   *v;
239:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
240:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
241:   PetscInt           nonzerorow = 0;

243:   PetscFunctionBegin;
244:   PetscCall(VecSet(zz, zero));
245:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
246:   PetscCall(VecGetArrayRead(xx, &x));
247:   PetscCall(VecGetArray(zz, &z));

249:   v  = a->a;
250:   xb = x;

252:   for (i = 0; i < mbs; i++) {
253:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
254:     x1   = xb[0];
255:     x2   = xb[1];
256:     ib   = aj + *ai;
257:     jmin = 0;
258:     nonzerorow += (n > 0);
259:     if (*ib == i) { /* (diag of A)*x */
260:       z[2 * i] += v[0] * x1 + v[2] * x2;
261:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
262:       v += 4;
263:       jmin++;
264:     }
265:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
266:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
267:     for (j = jmin; j < n; j++) {
268:       /* (strict lower triangular part of A)*x  */
269:       cval = ib[j] * 2;
270:       z[cval] += v[0] * x1 + v[1] * x2;
271:       z[cval + 1] += v[2] * x1 + v[3] * x2;
272:       /* (strict upper triangular part of A)*x  */
273:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
274:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
275:       v += 4;
276:     }
277:     xb += 2;
278:     ai++;
279:   }

281:   PetscCall(VecRestoreArrayRead(xx, &x));
282:   PetscCall(VecRestoreArray(zz, &z));
283:   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
284:   PetscFunctionReturn(PETSC_SUCCESS);
285: }

287: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
288: {
289:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
290:   PetscScalar       *z, x1, x2, x3, zero = 0.0;
291:   const PetscScalar *x, *xb;
292:   const MatScalar   *v;
293:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
294:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
295:   PetscInt           nonzerorow = 0;

297:   PetscFunctionBegin;
298:   PetscCall(VecSet(zz, zero));
299:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
300:   PetscCall(VecGetArrayRead(xx, &x));
301:   PetscCall(VecGetArray(zz, &z));

303:   v  = a->a;
304:   xb = x;

306:   for (i = 0; i < mbs; i++) {
307:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
308:     x1   = xb[0];
309:     x2   = xb[1];
310:     x3   = xb[2];
311:     ib   = aj + *ai;
312:     jmin = 0;
313:     nonzerorow += (n > 0);
314:     if (*ib == i) { /* (diag of A)*x */
315:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
316:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
317:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
318:       v += 9;
319:       jmin++;
320:     }
321:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
322:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
323:     for (j = jmin; j < n; j++) {
324:       /* (strict lower triangular part of A)*x  */
325:       cval = ib[j] * 3;
326:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
327:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
328:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
329:       /* (strict upper triangular part of A)*x  */
330:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
331:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
332:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
333:       v += 9;
334:     }
335:     xb += 3;
336:     ai++;
337:   }

339:   PetscCall(VecRestoreArrayRead(xx, &x));
340:   PetscCall(VecRestoreArray(zz, &z));
341:   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
342:   PetscFunctionReturn(PETSC_SUCCESS);
343: }

345: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
346: {
347:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
348:   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
349:   const PetscScalar *x, *xb;
350:   const MatScalar   *v;
351:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
352:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
353:   PetscInt           nonzerorow = 0;

355:   PetscFunctionBegin;
356:   PetscCall(VecSet(zz, zero));
357:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
358:   PetscCall(VecGetArrayRead(xx, &x));
359:   PetscCall(VecGetArray(zz, &z));

361:   v  = a->a;
362:   xb = x;

364:   for (i = 0; i < mbs; i++) {
365:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
366:     x1   = xb[0];
367:     x2   = xb[1];
368:     x3   = xb[2];
369:     x4   = xb[3];
370:     ib   = aj + *ai;
371:     jmin = 0;
372:     nonzerorow += (n > 0);
373:     if (*ib == i) { /* (diag of A)*x */
374:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
375:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
376:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
377:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
378:       v += 16;
379:       jmin++;
380:     }
381:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
382:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
383:     for (j = jmin; j < n; j++) {
384:       /* (strict lower triangular part of A)*x  */
385:       cval = ib[j] * 4;
386:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
387:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
388:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
389:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
390:       /* (strict upper triangular part of A)*x  */
391:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
392:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
393:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
394:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
395:       v += 16;
396:     }
397:     xb += 4;
398:     ai++;
399:   }

401:   PetscCall(VecRestoreArrayRead(xx, &x));
402:   PetscCall(VecRestoreArray(zz, &z));
403:   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
404:   PetscFunctionReturn(PETSC_SUCCESS);
405: }

407: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
408: {
409:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
410:   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
411:   const PetscScalar *x, *xb;
412:   const MatScalar   *v;
413:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
414:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
415:   PetscInt           nonzerorow = 0;

417:   PetscFunctionBegin;
418:   PetscCall(VecSet(zz, zero));
419:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
420:   PetscCall(VecGetArrayRead(xx, &x));
421:   PetscCall(VecGetArray(zz, &z));

423:   v  = a->a;
424:   xb = x;

426:   for (i = 0; i < mbs; i++) {
427:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
428:     x1   = xb[0];
429:     x2   = xb[1];
430:     x3   = xb[2];
431:     x4   = xb[3];
432:     x5   = xb[4];
433:     ib   = aj + *ai;
434:     jmin = 0;
435:     nonzerorow += (n > 0);
436:     if (*ib == i) { /* (diag of A)*x */
437:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
438:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
439:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
440:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
441:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
442:       v += 25;
443:       jmin++;
444:     }
445:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
446:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
447:     for (j = jmin; j < n; j++) {
448:       /* (strict lower triangular part of A)*x  */
449:       cval = ib[j] * 5;
450:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
451:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
452:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
453:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
454:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
455:       /* (strict upper triangular part of A)*x  */
456:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
457:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
458:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
459:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
460:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
461:       v += 25;
462:     }
463:     xb += 5;
464:     ai++;
465:   }

467:   PetscCall(VecRestoreArrayRead(xx, &x));
468:   PetscCall(VecRestoreArray(zz, &z));
469:   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
470:   PetscFunctionReturn(PETSC_SUCCESS);
471: }

473: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
474: {
475:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
476:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
477:   const PetscScalar *x, *xb;
478:   const MatScalar   *v;
479:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
480:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
481:   PetscInt           nonzerorow = 0;

483:   PetscFunctionBegin;
484:   PetscCall(VecSet(zz, zero));
485:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
486:   PetscCall(VecGetArrayRead(xx, &x));
487:   PetscCall(VecGetArray(zz, &z));

489:   v  = a->a;
490:   xb = x;

492:   for (i = 0; i < mbs; i++) {
493:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
494:     x1   = xb[0];
495:     x2   = xb[1];
496:     x3   = xb[2];
497:     x4   = xb[3];
498:     x5   = xb[4];
499:     x6   = xb[5];
500:     ib   = aj + *ai;
501:     jmin = 0;
502:     nonzerorow += (n > 0);
503:     if (*ib == i) { /* (diag of A)*x */
504:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
505:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
506:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
507:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
508:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
509:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
510:       v += 36;
511:       jmin++;
512:     }
513:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
514:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
515:     for (j = jmin; j < n; j++) {
516:       /* (strict lower triangular part of A)*x  */
517:       cval = ib[j] * 6;
518:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
519:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
520:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
521:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
522:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
523:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
524:       /* (strict upper triangular part of A)*x  */
525:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
526:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
527:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
528:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
529:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
530:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
531:       v += 36;
532:     }
533:     xb += 6;
534:     ai++;
535:   }

537:   PetscCall(VecRestoreArrayRead(xx, &x));
538:   PetscCall(VecRestoreArray(zz, &z));
539:   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
540:   PetscFunctionReturn(PETSC_SUCCESS);
541: }

543: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
544: {
545:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
546:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
547:   const PetscScalar *x, *xb;
548:   const MatScalar   *v;
549:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
550:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
551:   PetscInt           nonzerorow = 0;

553:   PetscFunctionBegin;
554:   PetscCall(VecSet(zz, zero));
555:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
556:   PetscCall(VecGetArrayRead(xx, &x));
557:   PetscCall(VecGetArray(zz, &z));

559:   v  = a->a;
560:   xb = x;

562:   for (i = 0; i < mbs; i++) {
563:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
564:     x1   = xb[0];
565:     x2   = xb[1];
566:     x3   = xb[2];
567:     x4   = xb[3];
568:     x5   = xb[4];
569:     x6   = xb[5];
570:     x7   = xb[6];
571:     ib   = aj + *ai;
572:     jmin = 0;
573:     nonzerorow += (n > 0);
574:     if (*ib == i) { /* (diag of A)*x */
575:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
576:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
577:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
578:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
579:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
580:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
581:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
582:       v += 49;
583:       jmin++;
584:     }
585:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
586:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
587:     for (j = jmin; j < n; j++) {
588:       /* (strict lower triangular part of A)*x  */
589:       cval = ib[j] * 7;
590:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
591:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
592:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
593:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
594:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
595:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
596:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
597:       /* (strict upper triangular part of A)*x  */
598:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
599:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
600:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
601:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
602:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
603:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
604:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
605:       v += 49;
606:     }
607:     xb += 7;
608:     ai++;
609:   }
610:   PetscCall(VecRestoreArrayRead(xx, &x));
611:   PetscCall(VecRestoreArray(zz, &z));
612:   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
613:   PetscFunctionReturn(PETSC_SUCCESS);
614: }

616: /*
617:     This will not work with MatScalar == float because it calls the BLAS
618: */
619: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
620: {
621:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
622:   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
623:   const PetscScalar *x, *x_ptr, *xb;
624:   const MatScalar   *v;
625:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
626:   const PetscInt    *idx, *aj, *ii;
627:   PetscInt           nonzerorow = 0;

629:   PetscFunctionBegin;
630:   PetscCall(VecSet(zz, zero));
631:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
632:   PetscCall(VecGetArrayRead(xx, &x));
633:   PetscCall(VecGetArray(zz, &z));

635:   x_ptr = x;
636:   z_ptr = z;

638:   aj = a->j;
639:   v  = a->a;
640:   ii = a->i;

642:   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
643:   work = a->mult_work;

645:   for (i = 0; i < mbs; i++) {
646:     n     = ii[1] - ii[0];
647:     ncols = n * bs;
648:     workt = work;
649:     idx   = aj + ii[0];
650:     nonzerorow += (n > 0);

652:     /* upper triangular part */
653:     for (j = 0; j < n; j++) {
654:       xb = x_ptr + bs * (*idx++);
655:       for (k = 0; k < bs; k++) workt[k] = xb[k];
656:       workt += bs;
657:     }
658:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
659:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

661:     /* strict lower triangular part */
662:     idx = aj + ii[0];
663:     if (n && *idx == i) {
664:       ncols -= bs;
665:       v += bs2;
666:       idx++;
667:       n--;
668:     }

670:     if (ncols > 0) {
671:       workt = work;
672:       PetscCall(PetscArrayzero(workt, ncols));
673:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
674:       for (j = 0; j < n; j++) {
675:         zb = z_ptr + bs * (*idx++);
676:         for (k = 0; k < bs; k++) zb[k] += workt[k];
677:         workt += bs;
678:       }
679:     }
680:     x += bs;
681:     v += n * bs2;
682:     z += bs;
683:     ii++;
684:   }

686:   PetscCall(VecRestoreArrayRead(xx, &x));
687:   PetscCall(VecRestoreArray(zz, &z));
688:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
689:   PetscFunctionReturn(PETSC_SUCCESS);
690: }

692: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
693: {
694:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
695:   PetscScalar       *z, x1;
696:   const PetscScalar *x, *xb;
697:   const MatScalar   *v;
698:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
699:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
700:   PetscInt           nonzerorow = 0;
701: #if defined(PETSC_USE_COMPLEX)
702:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
703: #else
704:   const int aconj = 0;
705: #endif

707:   PetscFunctionBegin;
708:   PetscCall(VecCopy(yy, zz));
709:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
710:   PetscCall(VecGetArrayRead(xx, &x));
711:   PetscCall(VecGetArray(zz, &z));
712:   v  = a->a;
713:   xb = x;

715:   for (i = 0; i < mbs; i++) {
716:     n    = ai[1] - ai[0]; /* length of i_th row of A */
717:     x1   = xb[0];
718:     ib   = aj + *ai;
719:     jmin = 0;
720:     nonzerorow += (n > 0);
721:     if (n && *ib == i) { /* (diag of A)*x */
722:       z[i] += *v++ * x[*ib++];
723:       jmin++;
724:     }
725:     if (aconj) {
726:       for (j = jmin; j < n; j++) {
727:         cval = *ib;
728:         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
729:         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
730:       }
731:     } else {
732:       for (j = jmin; j < n; j++) {
733:         cval = *ib;
734:         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
735:         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
736:       }
737:     }
738:     xb++;
739:     ai++;
740:   }

742:   PetscCall(VecRestoreArrayRead(xx, &x));
743:   PetscCall(VecRestoreArray(zz, &z));

745:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
746:   PetscFunctionReturn(PETSC_SUCCESS);
747: }

749: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
750: {
751:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
752:   PetscScalar       *z, x1, x2;
753:   const PetscScalar *x, *xb;
754:   const MatScalar   *v;
755:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
756:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
757:   PetscInt           nonzerorow = 0;

759:   PetscFunctionBegin;
760:   PetscCall(VecCopy(yy, zz));
761:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
762:   PetscCall(VecGetArrayRead(xx, &x));
763:   PetscCall(VecGetArray(zz, &z));

765:   v  = a->a;
766:   xb = x;

768:   for (i = 0; i < mbs; i++) {
769:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
770:     x1   = xb[0];
771:     x2   = xb[1];
772:     ib   = aj + *ai;
773:     jmin = 0;
774:     nonzerorow += (n > 0);
775:     if (n && *ib == i) { /* (diag of A)*x */
776:       z[2 * i] += v[0] * x1 + v[2] * x2;
777:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
778:       v += 4;
779:       jmin++;
780:     }
781:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
782:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
783:     for (j = jmin; j < n; j++) {
784:       /* (strict lower triangular part of A)*x  */
785:       cval = ib[j] * 2;
786:       z[cval] += v[0] * x1 + v[1] * x2;
787:       z[cval + 1] += v[2] * x1 + v[3] * x2;
788:       /* (strict upper triangular part of A)*x  */
789:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
790:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
791:       v += 4;
792:     }
793:     xb += 2;
794:     ai++;
795:   }
796:   PetscCall(VecRestoreArrayRead(xx, &x));
797:   PetscCall(VecRestoreArray(zz, &z));

799:   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
800:   PetscFunctionReturn(PETSC_SUCCESS);
801: }

803: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
804: {
805:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
806:   PetscScalar       *z, x1, x2, x3;
807:   const PetscScalar *x, *xb;
808:   const MatScalar   *v;
809:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
810:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
811:   PetscInt           nonzerorow = 0;

813:   PetscFunctionBegin;
814:   PetscCall(VecCopy(yy, zz));
815:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
816:   PetscCall(VecGetArrayRead(xx, &x));
817:   PetscCall(VecGetArray(zz, &z));

819:   v  = a->a;
820:   xb = x;

822:   for (i = 0; i < mbs; i++) {
823:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
824:     x1   = xb[0];
825:     x2   = xb[1];
826:     x3   = xb[2];
827:     ib   = aj + *ai;
828:     jmin = 0;
829:     nonzerorow += (n > 0);
830:     if (n && *ib == i) { /* (diag of A)*x */
831:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
832:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
833:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
834:       v += 9;
835:       jmin++;
836:     }
837:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
838:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
839:     for (j = jmin; j < n; j++) {
840:       /* (strict lower triangular part of A)*x  */
841:       cval = ib[j] * 3;
842:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
843:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
844:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
845:       /* (strict upper triangular part of A)*x  */
846:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
847:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
848:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
849:       v += 9;
850:     }
851:     xb += 3;
852:     ai++;
853:   }

855:   PetscCall(VecRestoreArrayRead(xx, &x));
856:   PetscCall(VecRestoreArray(zz, &z));

858:   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
859:   PetscFunctionReturn(PETSC_SUCCESS);
860: }

862: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
863: {
864:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
865:   PetscScalar       *z, x1, x2, x3, x4;
866:   const PetscScalar *x, *xb;
867:   const MatScalar   *v;
868:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
869:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
870:   PetscInt           nonzerorow = 0;

872:   PetscFunctionBegin;
873:   PetscCall(VecCopy(yy, zz));
874:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
875:   PetscCall(VecGetArrayRead(xx, &x));
876:   PetscCall(VecGetArray(zz, &z));

878:   v  = a->a;
879:   xb = x;

881:   for (i = 0; i < mbs; i++) {
882:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
883:     x1   = xb[0];
884:     x2   = xb[1];
885:     x3   = xb[2];
886:     x4   = xb[3];
887:     ib   = aj + *ai;
888:     jmin = 0;
889:     nonzerorow += (n > 0);
890:     if (n && *ib == i) { /* (diag of A)*x */
891:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
892:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
893:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
894:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
895:       v += 16;
896:       jmin++;
897:     }
898:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
899:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
900:     for (j = jmin; j < n; j++) {
901:       /* (strict lower triangular part of A)*x  */
902:       cval = ib[j] * 4;
903:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
904:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
905:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
906:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
907:       /* (strict upper triangular part of A)*x  */
908:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
909:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
910:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
911:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
912:       v += 16;
913:     }
914:     xb += 4;
915:     ai++;
916:   }

918:   PetscCall(VecRestoreArrayRead(xx, &x));
919:   PetscCall(VecRestoreArray(zz, &z));

921:   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
922:   PetscFunctionReturn(PETSC_SUCCESS);
923: }

925: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
926: {
927:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
928:   PetscScalar       *z, x1, x2, x3, x4, x5;
929:   const PetscScalar *x, *xb;
930:   const MatScalar   *v;
931:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
932:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
933:   PetscInt           nonzerorow = 0;

935:   PetscFunctionBegin;
936:   PetscCall(VecCopy(yy, zz));
937:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
938:   PetscCall(VecGetArrayRead(xx, &x));
939:   PetscCall(VecGetArray(zz, &z));

941:   v  = a->a;
942:   xb = x;

944:   for (i = 0; i < mbs; i++) {
945:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
946:     x1   = xb[0];
947:     x2   = xb[1];
948:     x3   = xb[2];
949:     x4   = xb[3];
950:     x5   = xb[4];
951:     ib   = aj + *ai;
952:     jmin = 0;
953:     nonzerorow += (n > 0);
954:     if (n && *ib == i) { /* (diag of A)*x */
955:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
956:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
957:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
958:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
959:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
960:       v += 25;
961:       jmin++;
962:     }
963:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
964:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
965:     for (j = jmin; j < n; j++) {
966:       /* (strict lower triangular part of A)*x  */
967:       cval = ib[j] * 5;
968:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
969:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
970:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
971:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
972:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
973:       /* (strict upper triangular part of A)*x  */
974:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
975:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
976:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
977:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
978:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
979:       v += 25;
980:     }
981:     xb += 5;
982:     ai++;
983:   }

985:   PetscCall(VecRestoreArrayRead(xx, &x));
986:   PetscCall(VecRestoreArray(zz, &z));

988:   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
989:   PetscFunctionReturn(PETSC_SUCCESS);
990: }

992: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
993: {
994:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
995:   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
996:   const PetscScalar *x, *xb;
997:   const MatScalar   *v;
998:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
999:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1000:   PetscInt           nonzerorow = 0;

1002:   PetscFunctionBegin;
1003:   PetscCall(VecCopy(yy, zz));
1004:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1005:   PetscCall(VecGetArrayRead(xx, &x));
1006:   PetscCall(VecGetArray(zz, &z));

1008:   v  = a->a;
1009:   xb = x;

1011:   for (i = 0; i < mbs; i++) {
1012:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1013:     x1   = xb[0];
1014:     x2   = xb[1];
1015:     x3   = xb[2];
1016:     x4   = xb[3];
1017:     x5   = xb[4];
1018:     x6   = xb[5];
1019:     ib   = aj + *ai;
1020:     jmin = 0;
1021:     nonzerorow += (n > 0);
1022:     if (n && *ib == i) { /* (diag of A)*x */
1023:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1024:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1025:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1026:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1027:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1028:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1029:       v += 36;
1030:       jmin++;
1031:     }
1032:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1033:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1034:     for (j = jmin; j < n; j++) {
1035:       /* (strict lower triangular part of A)*x  */
1036:       cval = ib[j] * 6;
1037:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1038:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1039:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1040:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1041:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1042:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1043:       /* (strict upper triangular part of A)*x  */
1044:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1045:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1046:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1047:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1048:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1049:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1050:       v += 36;
1051:     }
1052:     xb += 6;
1053:     ai++;
1054:   }

1056:   PetscCall(VecRestoreArrayRead(xx, &x));
1057:   PetscCall(VecRestoreArray(zz, &z));

1059:   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1060:   PetscFunctionReturn(PETSC_SUCCESS);
1061: }

1063: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1064: {
1065:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1066:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1067:   const PetscScalar *x, *xb;
1068:   const MatScalar   *v;
1069:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1070:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1071:   PetscInt           nonzerorow = 0;

1073:   PetscFunctionBegin;
1074:   PetscCall(VecCopy(yy, zz));
1075:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1076:   PetscCall(VecGetArrayRead(xx, &x));
1077:   PetscCall(VecGetArray(zz, &z));

1079:   v  = a->a;
1080:   xb = x;

1082:   for (i = 0; i < mbs; i++) {
1083:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1084:     x1   = xb[0];
1085:     x2   = xb[1];
1086:     x3   = xb[2];
1087:     x4   = xb[3];
1088:     x5   = xb[4];
1089:     x6   = xb[5];
1090:     x7   = xb[6];
1091:     ib   = aj + *ai;
1092:     jmin = 0;
1093:     nonzerorow += (n > 0);
1094:     if (n && *ib == i) { /* (diag of A)*x */
1095:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1096:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1097:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1098:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1099:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1100:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1101:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1102:       v += 49;
1103:       jmin++;
1104:     }
1105:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1106:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1107:     for (j = jmin; j < n; j++) {
1108:       /* (strict lower triangular part of A)*x  */
1109:       cval = ib[j] * 7;
1110:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1111:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1112:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1113:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1114:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1115:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1116:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1117:       /* (strict upper triangular part of A)*x  */
1118:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1119:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1120:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1121:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1122:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1123:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1124:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1125:       v += 49;
1126:     }
1127:     xb += 7;
1128:     ai++;
1129:   }

1131:   PetscCall(VecRestoreArrayRead(xx, &x));
1132:   PetscCall(VecRestoreArray(zz, &z));

1134:   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1135:   PetscFunctionReturn(PETSC_SUCCESS);
1136: }

1138: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1139: {
1140:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1141:   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1142:   const PetscScalar *x, *x_ptr, *xb;
1143:   const MatScalar   *v;
1144:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1145:   const PetscInt    *idx, *aj, *ii;
1146:   PetscInt           nonzerorow = 0;

1148:   PetscFunctionBegin;
1149:   PetscCall(VecCopy(yy, zz));
1150:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1151:   PetscCall(VecGetArrayRead(xx, &x));
1152:   x_ptr = x;
1153:   PetscCall(VecGetArray(zz, &z));
1154:   z_ptr = z;

1156:   aj = a->j;
1157:   v  = a->a;
1158:   ii = a->i;

1160:   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1161:   work = a->mult_work;

1163:   for (i = 0; i < mbs; i++) {
1164:     n     = ii[1] - ii[0];
1165:     ncols = n * bs;
1166:     workt = work;
1167:     idx   = aj + ii[0];
1168:     nonzerorow += (n > 0);

1170:     /* upper triangular part */
1171:     for (j = 0; j < n; j++) {
1172:       xb = x_ptr + bs * (*idx++);
1173:       for (k = 0; k < bs; k++) workt[k] = xb[k];
1174:       workt += bs;
1175:     }
1176:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1177:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

1179:     /* strict lower triangular part */
1180:     idx = aj + ii[0];
1181:     if (n && *idx == i) {
1182:       ncols -= bs;
1183:       v += bs2;
1184:       idx++;
1185:       n--;
1186:     }
1187:     if (ncols > 0) {
1188:       workt = work;
1189:       PetscCall(PetscArrayzero(workt, ncols));
1190:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1191:       for (j = 0; j < n; j++) {
1192:         zb = z_ptr + bs * (*idx++);
1193:         for (k = 0; k < bs; k++) zb[k] += workt[k];
1194:         workt += bs;
1195:       }
1196:     }

1198:     x += bs;
1199:     v += n * bs2;
1200:     z += bs;
1201:     ii++;
1202:   }

1204:   PetscCall(VecRestoreArrayRead(xx, &x));
1205:   PetscCall(VecRestoreArray(zz, &z));

1207:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1208:   PetscFunctionReturn(PETSC_SUCCESS);
1209: }

1211: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1212: {
1213:   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1214:   PetscScalar   oalpha = alpha;
1215:   PetscBLASInt  one    = 1, totalnz;

1217:   PetscFunctionBegin;
1218:   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1219:   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1220:   PetscCall(PetscLogFlops(totalnz));
1221:   PetscFunctionReturn(PETSC_SUCCESS);
1222: }

1224: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1225: {
1226:   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1227:   const MatScalar *v        = a->a;
1228:   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1229:   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1230:   const PetscInt  *aj = a->j, *col;

1232:   PetscFunctionBegin;
1233:   if (!a->nz) {
1234:     *norm = 0.0;
1235:     PetscFunctionReturn(PETSC_SUCCESS);
1236:   }
1237:   if (type == NORM_FROBENIUS) {
1238:     for (k = 0; k < mbs; k++) {
1239:       jmin = a->i[k];
1240:       jmax = a->i[k + 1];
1241:       col  = aj + jmin;
1242:       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1243:         for (i = 0; i < bs2; i++) {
1244:           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1245:           v++;
1246:         }
1247:         jmin++;
1248:       }
1249:       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1250:         for (i = 0; i < bs2; i++) {
1251:           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1252:           v++;
1253:         }
1254:       }
1255:     }
1256:     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1257:     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1258:   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1259:     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1260:     for (i = 0; i < mbs; i++) jl[i] = mbs;
1261:     il[0] = 0;

1263:     *norm = 0.0;
1264:     for (k = 0; k < mbs; k++) { /* k_th block row */
1265:       for (j = 0; j < bs; j++) sum[j] = 0.0;
1266:       /*-- col sum --*/
1267:       i = jl[k]; /* first |A(i,k)| to be added */
1268:       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1269:                   at step k */
1270:       while (i < mbs) {
1271:         nexti = jl[i]; /* next block row to be added */
1272:         ik    = il[i]; /* block index of A(i,k) in the array a */
1273:         for (j = 0; j < bs; j++) {
1274:           v = a->a + ik * bs2 + j * bs;
1275:           for (k1 = 0; k1 < bs; k1++) {
1276:             sum[j] += PetscAbsScalar(*v);
1277:             v++;
1278:           }
1279:         }
1280:         /* update il, jl */
1281:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1282:         jmax = a->i[i + 1];
1283:         if (jmin < jmax) {
1284:           il[i] = jmin;
1285:           j     = a->j[jmin];
1286:           jl[i] = jl[j];
1287:           jl[j] = i;
1288:         }
1289:         i = nexti;
1290:       }
1291:       /*-- row sum --*/
1292:       jmin = a->i[k];
1293:       jmax = a->i[k + 1];
1294:       for (i = jmin; i < jmax; i++) {
1295:         for (j = 0; j < bs; j++) {
1296:           v = a->a + i * bs2 + j;
1297:           for (k1 = 0; k1 < bs; k1++) {
1298:             sum[j] += PetscAbsScalar(*v);
1299:             v += bs;
1300:           }
1301:         }
1302:       }
1303:       /* add k_th block row to il, jl */
1304:       col = aj + jmin;
1305:       if (jmax - jmin > 0 && *col == k) jmin++;
1306:       if (jmin < jmax) {
1307:         il[k] = jmin;
1308:         j     = a->j[jmin];
1309:         jl[k] = jl[j];
1310:         jl[j] = k;
1311:       }
1312:       for (j = 0; j < bs; j++) {
1313:         if (sum[j] > *norm) *norm = sum[j];
1314:       }
1315:     }
1316:     PetscCall(PetscFree3(sum, il, jl));
1317:     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1318:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1319:   PetscFunctionReturn(PETSC_SUCCESS);
1320: }

1322: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1323: {
1324:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;

1326:   PetscFunctionBegin;
1327:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1328:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1329:     *flg = PETSC_FALSE;
1330:     PetscFunctionReturn(PETSC_SUCCESS);
1331:   }

1333:   /* if the a->i are the same */
1334:   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1335:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

1337:   /* if a->j are the same */
1338:   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1339:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

1341:   /* if a->a are the same */
1342:   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1343:   PetscFunctionReturn(PETSC_SUCCESS);
1344: }

1346: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1347: {
1348:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1349:   PetscInt         i, j, k, row, bs, ambs, bs2;
1350:   const PetscInt  *ai, *aj;
1351:   PetscScalar     *x, zero = 0.0;
1352:   const MatScalar *aa, *aa_j;

1354:   PetscFunctionBegin;
1355:   bs = A->rmap->bs;
1356:   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");

1358:   aa   = a->a;
1359:   ambs = a->mbs;

1361:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1362:     PetscInt *diag = a->diag;
1363:     aa             = a->a;
1364:     ambs           = a->mbs;
1365:     PetscCall(VecGetArray(v, &x));
1366:     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1367:     PetscCall(VecRestoreArray(v, &x));
1368:     PetscFunctionReturn(PETSC_SUCCESS);
1369:   }

1371:   ai  = a->i;
1372:   aj  = a->j;
1373:   bs2 = a->bs2;
1374:   PetscCall(VecSet(v, zero));
1375:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1376:   PetscCall(VecGetArray(v, &x));
1377:   for (i = 0; i < ambs; i++) {
1378:     j = ai[i];
1379:     if (aj[j] == i) { /* if this is a diagonal element */
1380:       row  = i * bs;
1381:       aa_j = aa + j * bs2;
1382:       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1383:     }
1384:   }
1385:   PetscCall(VecRestoreArray(v, &x));
1386:   PetscFunctionReturn(PETSC_SUCCESS);
1387: }

1389: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1390: {
1391:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1392:   PetscScalar        x;
1393:   const PetscScalar *l, *li, *ri;
1394:   MatScalar         *aa, *v;
1395:   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1396:   const PetscInt    *ai, *aj;
1397:   PetscBool          flg;

1399:   PetscFunctionBegin;
1400:   if (ll != rr) {
1401:     PetscCall(VecEqual(ll, rr, &flg));
1402:     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1403:   }
1404:   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1405:   ai  = a->i;
1406:   aj  = a->j;
1407:   aa  = a->a;
1408:   m   = A->rmap->N;
1409:   bs  = A->rmap->bs;
1410:   mbs = a->mbs;
1411:   bs2 = a->bs2;

1413:   PetscCall(VecGetArrayRead(ll, &l));
1414:   PetscCall(VecGetLocalSize(ll, &lm));
1415:   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1416:   for (i = 0; i < mbs; i++) { /* for each block row */
1417:     M  = ai[i + 1] - ai[i];
1418:     li = l + i * bs;
1419:     v  = aa + bs2 * ai[i];
1420:     for (j = 0; j < M; j++) { /* for each block */
1421:       ri = l + bs * aj[ai[i] + j];
1422:       for (k = 0; k < bs; k++) {
1423:         x = ri[k];
1424:         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1425:       }
1426:     }
1427:   }
1428:   PetscCall(VecRestoreArrayRead(ll, &l));
1429:   PetscCall(PetscLogFlops(2.0 * a->nz));
1430:   PetscFunctionReturn(PETSC_SUCCESS);
1431: }

1433: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1434: {
1435:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1437:   PetscFunctionBegin;
1438:   info->block_size   = a->bs2;
1439:   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1440:   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1441:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1442:   info->assemblies   = A->num_ass;
1443:   info->mallocs      = A->info.mallocs;
1444:   info->memory       = 0; /* REVIEW ME */
1445:   if (A->factortype) {
1446:     info->fill_ratio_given  = A->info.fill_ratio_given;
1447:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1448:     info->factor_mallocs    = A->info.factor_mallocs;
1449:   } else {
1450:     info->fill_ratio_given  = 0;
1451:     info->fill_ratio_needed = 0;
1452:     info->factor_mallocs    = 0;
1453:   }
1454:   PetscFunctionReturn(PETSC_SUCCESS);
1455: }

1457: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1458: {
1459:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1461:   PetscFunctionBegin;
1462:   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1463:   PetscFunctionReturn(PETSC_SUCCESS);
1464: }

1466: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1467: {
1468:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1469:   PetscInt         i, j, n, row, col, bs, mbs;
1470:   const PetscInt  *ai, *aj;
1471:   PetscReal        atmp;
1472:   const MatScalar *aa;
1473:   PetscScalar     *x;
1474:   PetscInt         ncols, brow, bcol, krow, kcol;

1476:   PetscFunctionBegin;
1477:   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1478:   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1479:   bs  = A->rmap->bs;
1480:   aa  = a->a;
1481:   ai  = a->i;
1482:   aj  = a->j;
1483:   mbs = a->mbs;

1485:   PetscCall(VecSet(v, 0.0));
1486:   PetscCall(VecGetArray(v, &x));
1487:   PetscCall(VecGetLocalSize(v, &n));
1488:   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1489:   for (i = 0; i < mbs; i++) {
1490:     ncols = ai[1] - ai[0];
1491:     ai++;
1492:     brow = bs * i;
1493:     for (j = 0; j < ncols; j++) {
1494:       bcol = bs * (*aj);
1495:       for (kcol = 0; kcol < bs; kcol++) {
1496:         col = bcol + kcol; /* col index */
1497:         for (krow = 0; krow < bs; krow++) {
1498:           atmp = PetscAbsScalar(*aa);
1499:           aa++;
1500:           row = brow + krow; /* row index */
1501:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1502:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1503:         }
1504:       }
1505:       aj++;
1506:     }
1507:   }
1508:   PetscCall(VecRestoreArray(v, &x));
1509:   PetscFunctionReturn(PETSC_SUCCESS);
1510: }

1512: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1513: {
1514:   PetscFunctionBegin;
1515:   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1516:   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1517:   PetscFunctionReturn(PETSC_SUCCESS);
1518: }

1520: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1521: {
1522:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1523:   PetscScalar       *z = c;
1524:   const PetscScalar *xb;
1525:   PetscScalar        x1;
1526:   const MatScalar   *v   = a->a, *vv;
1527:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1528: #if defined(PETSC_USE_COMPLEX)
1529:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1530: #else
1531:   const int aconj = 0;
1532: #endif

1534:   PetscFunctionBegin;
1535:   for (i = 0; i < mbs; i++) {
1536:     n = ii[1] - ii[0];
1537:     ii++;
1538:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1539:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1540:     jj = idx;
1541:     vv = v;
1542:     for (k = 0; k < cn; k++) {
1543:       idx = jj;
1544:       v   = vv;
1545:       for (j = 0; j < n; j++) {
1546:         xb = b + (*idx);
1547:         x1 = xb[0 + k * bm];
1548:         z[0 + k * cm] += v[0] * x1;
1549:         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1550:         v += 1;
1551:         ++idx;
1552:       }
1553:     }
1554:     z += 1;
1555:   }
1556:   PetscFunctionReturn(PETSC_SUCCESS);
1557: }

1559: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1560: {
1561:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1562:   PetscScalar       *z = c;
1563:   const PetscScalar *xb;
1564:   PetscScalar        x1, x2;
1565:   const MatScalar   *v   = a->a, *vv;
1566:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1568:   PetscFunctionBegin;
1569:   for (i = 0; i < mbs; i++) {
1570:     n = ii[1] - ii[0];
1571:     ii++;
1572:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1573:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1574:     jj = idx;
1575:     vv = v;
1576:     for (k = 0; k < cn; k++) {
1577:       idx = jj;
1578:       v   = vv;
1579:       for (j = 0; j < n; j++) {
1580:         xb = b + 2 * (*idx);
1581:         x1 = xb[0 + k * bm];
1582:         x2 = xb[1 + k * bm];
1583:         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1584:         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1585:         if (*idx != i) {
1586:           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1587:           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1588:         }
1589:         v += 4;
1590:         ++idx;
1591:       }
1592:     }
1593:     z += 2;
1594:   }
1595:   PetscFunctionReturn(PETSC_SUCCESS);
1596: }

1598: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1599: {
1600:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1601:   PetscScalar       *z = c;
1602:   const PetscScalar *xb;
1603:   PetscScalar        x1, x2, x3;
1604:   const MatScalar   *v   = a->a, *vv;
1605:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1607:   PetscFunctionBegin;
1608:   for (i = 0; i < mbs; i++) {
1609:     n = ii[1] - ii[0];
1610:     ii++;
1611:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1612:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1613:     jj = idx;
1614:     vv = v;
1615:     for (k = 0; k < cn; k++) {
1616:       idx = jj;
1617:       v   = vv;
1618:       for (j = 0; j < n; j++) {
1619:         xb = b + 3 * (*idx);
1620:         x1 = xb[0 + k * bm];
1621:         x2 = xb[1 + k * bm];
1622:         x3 = xb[2 + k * bm];
1623:         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1624:         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1625:         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1626:         if (*idx != i) {
1627:           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1628:           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1629:           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1630:         }
1631:         v += 9;
1632:         ++idx;
1633:       }
1634:     }
1635:     z += 3;
1636:   }
1637:   PetscFunctionReturn(PETSC_SUCCESS);
1638: }

1640: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1641: {
1642:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1643:   PetscScalar       *z = c;
1644:   const PetscScalar *xb;
1645:   PetscScalar        x1, x2, x3, x4;
1646:   const MatScalar   *v   = a->a, *vv;
1647:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1649:   PetscFunctionBegin;
1650:   for (i = 0; i < mbs; i++) {
1651:     n = ii[1] - ii[0];
1652:     ii++;
1653:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1654:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1655:     jj = idx;
1656:     vv = v;
1657:     for (k = 0; k < cn; k++) {
1658:       idx = jj;
1659:       v   = vv;
1660:       for (j = 0; j < n; j++) {
1661:         xb = b + 4 * (*idx);
1662:         x1 = xb[0 + k * bm];
1663:         x2 = xb[1 + k * bm];
1664:         x3 = xb[2 + k * bm];
1665:         x4 = xb[3 + k * bm];
1666:         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1667:         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1668:         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1669:         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1670:         if (*idx != i) {
1671:           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1672:           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1673:           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1674:           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1675:         }
1676:         v += 16;
1677:         ++idx;
1678:       }
1679:     }
1680:     z += 4;
1681:   }
1682:   PetscFunctionReturn(PETSC_SUCCESS);
1683: }

1685: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1686: {
1687:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1688:   PetscScalar       *z = c;
1689:   const PetscScalar *xb;
1690:   PetscScalar        x1, x2, x3, x4, x5;
1691:   const MatScalar   *v   = a->a, *vv;
1692:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1694:   PetscFunctionBegin;
1695:   for (i = 0; i < mbs; i++) {
1696:     n = ii[1] - ii[0];
1697:     ii++;
1698:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1699:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1700:     jj = idx;
1701:     vv = v;
1702:     for (k = 0; k < cn; k++) {
1703:       idx = jj;
1704:       v   = vv;
1705:       for (j = 0; j < n; j++) {
1706:         xb = b + 5 * (*idx);
1707:         x1 = xb[0 + k * bm];
1708:         x2 = xb[1 + k * bm];
1709:         x3 = xb[2 + k * bm];
1710:         x4 = xb[3 + k * bm];
1711:         x5 = xb[4 + k * cm];
1712:         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1713:         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1714:         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1715:         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1716:         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1717:         if (*idx != i) {
1718:           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1719:           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1720:           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1721:           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1722:           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1723:         }
1724:         v += 25;
1725:         ++idx;
1726:       }
1727:     }
1728:     z += 5;
1729:   }
1730:   PetscFunctionReturn(PETSC_SUCCESS);
1731: }

1733: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1734: {
1735:   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1736:   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1737:   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1738:   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1739:   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1740:   PetscBLASInt     bbs, bcn, bbm, bcm;
1741:   PetscScalar     *z = NULL;
1742:   PetscScalar     *c, *b;
1743:   const MatScalar *v;
1744:   const PetscInt  *idx, *ii;
1745:   PetscScalar      _DOne = 1.0;

1747:   PetscFunctionBegin;
1748:   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1749:   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1750:   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1751:   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1752:   b = bd->v;
1753:   PetscCall(MatZeroEntries(C));
1754:   PetscCall(MatDenseGetArray(C, &c));
1755:   switch (bs) {
1756:   case 1:
1757:     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1758:     break;
1759:   case 2:
1760:     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1761:     break;
1762:   case 3:
1763:     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1764:     break;
1765:   case 4:
1766:     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1767:     break;
1768:   case 5:
1769:     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1770:     break;
1771:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1772:     PetscCall(PetscBLASIntCast(bs, &bbs));
1773:     PetscCall(PetscBLASIntCast(cn, &bcn));
1774:     PetscCall(PetscBLASIntCast(bm, &bbm));
1775:     PetscCall(PetscBLASIntCast(cm, &bcm));
1776:     idx = a->j;
1777:     v   = a->a;
1778:     mbs = a->mbs;
1779:     ii  = a->i;
1780:     z   = c;
1781:     for (i = 0; i < mbs; i++) {
1782:       n = ii[1] - ii[0];
1783:       ii++;
1784:       for (j = 0; j < n; j++) {
1785:         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1786:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1787:         v += bs2;
1788:       }
1789:       z += bs;
1790:     }
1791:   }
1792:   PetscCall(MatDenseRestoreArray(C, &c));
1793:   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1794:   PetscFunctionReturn(PETSC_SUCCESS);
1795: }