Actual source code: sbaij2.c
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/dense/seq/dense.h>
4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
5: #include <petsc/private/kernels/blockinvert.h>
6: #include <petscbt.h>
7: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10: {
11: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
12: PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
13: const PetscInt *idx;
14: PetscBT table_out, table_in;
16: PetscFunctionBegin;
17: PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
18: mbs = a->mbs;
19: ai = a->i;
20: aj = a->j;
21: bs = A->rmap->bs;
22: PetscCall(PetscBTCreate(mbs, &table_out));
23: PetscCall(PetscMalloc1(mbs + 1, &nidx));
24: PetscCall(PetscBTCreate(mbs, &table_in));
26: for (i = 0; i < is_max; i++) { /* for each is */
27: isz = 0;
28: PetscCall(PetscBTMemzero(mbs, table_out));
30: /* Extract the indices, assume there can be duplicate entries */
31: PetscCall(ISGetIndices(is[i], &idx));
32: PetscCall(ISGetLocalSize(is[i], &n));
34: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35: bcol_max = 0;
36: for (j = 0; j < n; ++j) {
37: brow = idx[j] / bs; /* convert the indices into block indices */
38: PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
39: if (!PetscBTLookupSet(table_out, brow)) {
40: nidx[isz++] = brow;
41: if (bcol_max < brow) bcol_max = brow;
42: }
43: }
44: PetscCall(ISRestoreIndices(is[i], &idx));
45: PetscCall(ISDestroy(&is[i]));
47: k = 0;
48: for (j = 0; j < ov; j++) { /* for each overlap */
49: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
50: PetscCall(PetscBTMemzero(mbs, table_in));
51: for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
53: n = isz; /* length of the updated is[i] */
54: for (brow = 0; brow < mbs; brow++) {
55: start = ai[brow];
56: end = ai[brow + 1];
57: if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58: for (l = start; l < end; l++) {
59: bcol = aj[l];
60: if (!PetscBTLookupSet(table_out, bcol)) {
61: nidx[isz++] = bcol;
62: if (bcol_max < bcol) bcol_max = bcol;
63: }
64: }
65: k++;
66: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67: } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
68: for (l = start; l < end; l++) {
69: bcol = aj[l];
70: if (bcol > bcol_max) break;
71: if (PetscBTLookup(table_in, bcol)) {
72: if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
73: break; /* for l = start; l<end ; l++) */
74: }
75: }
76: }
77: }
78: } /* for each overlap */
79: PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
80: } /* for each is */
81: PetscCall(PetscBTDestroy(&table_out));
82: PetscCall(PetscFree(nidx));
83: PetscCall(PetscBTDestroy(&table_in));
84: PetscFunctionReturn(PETSC_SUCCESS);
85: }
87: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
88: Zero some ops' to avoid invalid use */
89: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
90: {
91: PetscFunctionBegin;
92: PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
93: Bseq->ops->mult = NULL;
94: Bseq->ops->multadd = NULL;
95: Bseq->ops->multtranspose = NULL;
96: Bseq->ops->multtransposeadd = NULL;
97: Bseq->ops->lufactor = NULL;
98: Bseq->ops->choleskyfactor = NULL;
99: Bseq->ops->lufactorsymbolic = NULL;
100: Bseq->ops->choleskyfactorsymbolic = NULL;
101: Bseq->ops->getinertia = NULL;
102: PetscFunctionReturn(PETSC_SUCCESS);
103: }
105: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
106: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
107: {
108: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c;
109: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
111: const PetscInt *irow, *icol;
112: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113: PetscInt *aj = a->j, *ai = a->i;
114: MatScalar *mat_a;
115: Mat C;
116: PetscBool flag;
118: PetscFunctionBegin;
120: PetscCall(ISGetIndices(isrow, &irow));
121: PetscCall(ISGetIndices(iscol, &icol));
122: PetscCall(ISGetLocalSize(isrow, &nrows));
123: PetscCall(ISGetLocalSize(iscol, &ncols));
125: PetscCall(PetscCalloc1(1 + oldcols, &smap));
126: ssmap = smap;
127: PetscCall(PetscMalloc1(1 + nrows, &lens));
128: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129: /* determine lens of each row */
130: for (i = 0; i < nrows; i++) {
131: kstart = ai[irow[i]];
132: kend = kstart + a->ilen[irow[i]];
133: lens[i] = 0;
134: for (k = kstart; k < kend; k++) {
135: if (ssmap[aj[k]]) lens[i]++;
136: }
137: }
138: /* Create and fill new matrix */
139: if (scall == MAT_REUSE_MATRIX) {
140: c = (Mat_SeqSBAIJ *)((*B)->data);
142: PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143: PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
145: PetscCall(PetscArrayzero(c->ilen, c->mbs));
146: C = *B;
147: } else {
148: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
149: PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
150: PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
151: PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
152: }
153: c = (Mat_SeqSBAIJ *)(C->data);
154: for (i = 0; i < nrows; i++) {
155: row = irow[i];
156: kstart = ai[row];
157: kend = kstart + a->ilen[row];
158: mat_i = c->i[i];
159: mat_j = c->j + mat_i;
160: mat_a = c->a + mat_i * bs2;
161: mat_ilen = c->ilen + i;
162: for (k = kstart; k < kend; k++) {
163: if ((tcol = ssmap[a->j[k]])) {
164: *mat_j++ = tcol - 1;
165: PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
166: mat_a += bs2;
167: (*mat_ilen)++;
168: }
169: }
170: }
171: /* sort */
172: {
173: MatScalar *work;
175: PetscCall(PetscMalloc1(bs2, &work));
176: for (i = 0; i < nrows; i++) {
177: PetscInt ilen;
178: mat_i = c->i[i];
179: mat_j = c->j + mat_i;
180: mat_a = c->a + mat_i * bs2;
181: ilen = c->ilen[i];
182: PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
183: }
184: PetscCall(PetscFree(work));
185: }
187: /* Free work space */
188: PetscCall(ISRestoreIndices(iscol, &icol));
189: PetscCall(PetscFree(smap));
190: PetscCall(PetscFree(lens));
191: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
192: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
194: PetscCall(ISRestoreIndices(isrow, &irow));
195: *B = C;
196: PetscFunctionReturn(PETSC_SUCCESS);
197: }
199: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
200: {
201: IS is1, is2;
203: PetscFunctionBegin;
204: PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
205: if (isrow == iscol) {
206: is2 = is1;
207: PetscCall(PetscObjectReference((PetscObject)is2));
208: } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
209: PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
210: PetscCall(ISDestroy(&is1));
211: PetscCall(ISDestroy(&is2));
213: if (isrow != iscol) {
214: PetscBool isequal;
215: PetscCall(ISEqual(isrow, iscol, &isequal));
216: if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
217: }
218: PetscFunctionReturn(PETSC_SUCCESS);
219: }
221: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
222: {
223: PetscInt i;
225: PetscFunctionBegin;
226: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
228: for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
229: PetscFunctionReturn(PETSC_SUCCESS);
230: }
232: /* Should check that shapes of vectors and matrices match */
233: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
234: {
235: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
236: PetscScalar *z, x1, x2, zero = 0.0;
237: const PetscScalar *x, *xb;
238: const MatScalar *v;
239: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
240: const PetscInt *aj = a->j, *ai = a->i, *ib;
241: PetscInt nonzerorow = 0;
243: PetscFunctionBegin;
244: PetscCall(VecSet(zz, zero));
245: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
246: PetscCall(VecGetArrayRead(xx, &x));
247: PetscCall(VecGetArray(zz, &z));
249: v = a->a;
250: xb = x;
252: for (i = 0; i < mbs; i++) {
253: n = ai[1] - ai[0]; /* length of i_th block row of A */
254: x1 = xb[0];
255: x2 = xb[1];
256: ib = aj + *ai;
257: jmin = 0;
258: nonzerorow += (n > 0);
259: if (*ib == i) { /* (diag of A)*x */
260: z[2 * i] += v[0] * x1 + v[2] * x2;
261: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
262: v += 4;
263: jmin++;
264: }
265: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
266: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
267: for (j = jmin; j < n; j++) {
268: /* (strict lower triangular part of A)*x */
269: cval = ib[j] * 2;
270: z[cval] += v[0] * x1 + v[1] * x2;
271: z[cval + 1] += v[2] * x1 + v[3] * x2;
272: /* (strict upper triangular part of A)*x */
273: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
274: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
275: v += 4;
276: }
277: xb += 2;
278: ai++;
279: }
281: PetscCall(VecRestoreArrayRead(xx, &x));
282: PetscCall(VecRestoreArray(zz, &z));
283: PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
284: PetscFunctionReturn(PETSC_SUCCESS);
285: }
287: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
288: {
289: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
290: PetscScalar *z, x1, x2, x3, zero = 0.0;
291: const PetscScalar *x, *xb;
292: const MatScalar *v;
293: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
294: const PetscInt *aj = a->j, *ai = a->i, *ib;
295: PetscInt nonzerorow = 0;
297: PetscFunctionBegin;
298: PetscCall(VecSet(zz, zero));
299: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
300: PetscCall(VecGetArrayRead(xx, &x));
301: PetscCall(VecGetArray(zz, &z));
303: v = a->a;
304: xb = x;
306: for (i = 0; i < mbs; i++) {
307: n = ai[1] - ai[0]; /* length of i_th block row of A */
308: x1 = xb[0];
309: x2 = xb[1];
310: x3 = xb[2];
311: ib = aj + *ai;
312: jmin = 0;
313: nonzerorow += (n > 0);
314: if (*ib == i) { /* (diag of A)*x */
315: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
316: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
317: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
318: v += 9;
319: jmin++;
320: }
321: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
322: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
323: for (j = jmin; j < n; j++) {
324: /* (strict lower triangular part of A)*x */
325: cval = ib[j] * 3;
326: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
327: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
328: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
329: /* (strict upper triangular part of A)*x */
330: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
331: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
332: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
333: v += 9;
334: }
335: xb += 3;
336: ai++;
337: }
339: PetscCall(VecRestoreArrayRead(xx, &x));
340: PetscCall(VecRestoreArray(zz, &z));
341: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
342: PetscFunctionReturn(PETSC_SUCCESS);
343: }
345: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
346: {
347: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
348: PetscScalar *z, x1, x2, x3, x4, zero = 0.0;
349: const PetscScalar *x, *xb;
350: const MatScalar *v;
351: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
352: const PetscInt *aj = a->j, *ai = a->i, *ib;
353: PetscInt nonzerorow = 0;
355: PetscFunctionBegin;
356: PetscCall(VecSet(zz, zero));
357: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
358: PetscCall(VecGetArrayRead(xx, &x));
359: PetscCall(VecGetArray(zz, &z));
361: v = a->a;
362: xb = x;
364: for (i = 0; i < mbs; i++) {
365: n = ai[1] - ai[0]; /* length of i_th block row of A */
366: x1 = xb[0];
367: x2 = xb[1];
368: x3 = xb[2];
369: x4 = xb[3];
370: ib = aj + *ai;
371: jmin = 0;
372: nonzerorow += (n > 0);
373: if (*ib == i) { /* (diag of A)*x */
374: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
375: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
376: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
377: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
378: v += 16;
379: jmin++;
380: }
381: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
382: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
383: for (j = jmin; j < n; j++) {
384: /* (strict lower triangular part of A)*x */
385: cval = ib[j] * 4;
386: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
387: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
388: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
389: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
390: /* (strict upper triangular part of A)*x */
391: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
392: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
393: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
394: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
395: v += 16;
396: }
397: xb += 4;
398: ai++;
399: }
401: PetscCall(VecRestoreArrayRead(xx, &x));
402: PetscCall(VecRestoreArray(zz, &z));
403: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
404: PetscFunctionReturn(PETSC_SUCCESS);
405: }
407: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
408: {
409: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
410: PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0;
411: const PetscScalar *x, *xb;
412: const MatScalar *v;
413: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
414: const PetscInt *aj = a->j, *ai = a->i, *ib;
415: PetscInt nonzerorow = 0;
417: PetscFunctionBegin;
418: PetscCall(VecSet(zz, zero));
419: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
420: PetscCall(VecGetArrayRead(xx, &x));
421: PetscCall(VecGetArray(zz, &z));
423: v = a->a;
424: xb = x;
426: for (i = 0; i < mbs; i++) {
427: n = ai[1] - ai[0]; /* length of i_th block row of A */
428: x1 = xb[0];
429: x2 = xb[1];
430: x3 = xb[2];
431: x4 = xb[3];
432: x5 = xb[4];
433: ib = aj + *ai;
434: jmin = 0;
435: nonzerorow += (n > 0);
436: if (*ib == i) { /* (diag of A)*x */
437: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
438: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
439: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
440: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
441: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
442: v += 25;
443: jmin++;
444: }
445: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
446: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
447: for (j = jmin; j < n; j++) {
448: /* (strict lower triangular part of A)*x */
449: cval = ib[j] * 5;
450: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
451: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
452: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
453: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
454: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
455: /* (strict upper triangular part of A)*x */
456: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
457: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
458: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
459: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
460: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
461: v += 25;
462: }
463: xb += 5;
464: ai++;
465: }
467: PetscCall(VecRestoreArrayRead(xx, &x));
468: PetscCall(VecRestoreArray(zz, &z));
469: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
470: PetscFunctionReturn(PETSC_SUCCESS);
471: }
473: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
474: {
475: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
476: PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
477: const PetscScalar *x, *xb;
478: const MatScalar *v;
479: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
480: const PetscInt *aj = a->j, *ai = a->i, *ib;
481: PetscInt nonzerorow = 0;
483: PetscFunctionBegin;
484: PetscCall(VecSet(zz, zero));
485: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
486: PetscCall(VecGetArrayRead(xx, &x));
487: PetscCall(VecGetArray(zz, &z));
489: v = a->a;
490: xb = x;
492: for (i = 0; i < mbs; i++) {
493: n = ai[1] - ai[0]; /* length of i_th block row of A */
494: x1 = xb[0];
495: x2 = xb[1];
496: x3 = xb[2];
497: x4 = xb[3];
498: x5 = xb[4];
499: x6 = xb[5];
500: ib = aj + *ai;
501: jmin = 0;
502: nonzerorow += (n > 0);
503: if (*ib == i) { /* (diag of A)*x */
504: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
505: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
506: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
507: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
508: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
509: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
510: v += 36;
511: jmin++;
512: }
513: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
514: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
515: for (j = jmin; j < n; j++) {
516: /* (strict lower triangular part of A)*x */
517: cval = ib[j] * 6;
518: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
519: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
520: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
521: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
522: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
523: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
524: /* (strict upper triangular part of A)*x */
525: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
526: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
527: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
528: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
529: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
530: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
531: v += 36;
532: }
533: xb += 6;
534: ai++;
535: }
537: PetscCall(VecRestoreArrayRead(xx, &x));
538: PetscCall(VecRestoreArray(zz, &z));
539: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
540: PetscFunctionReturn(PETSC_SUCCESS);
541: }
543: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
544: {
545: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
546: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
547: const PetscScalar *x, *xb;
548: const MatScalar *v;
549: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
550: const PetscInt *aj = a->j, *ai = a->i, *ib;
551: PetscInt nonzerorow = 0;
553: PetscFunctionBegin;
554: PetscCall(VecSet(zz, zero));
555: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
556: PetscCall(VecGetArrayRead(xx, &x));
557: PetscCall(VecGetArray(zz, &z));
559: v = a->a;
560: xb = x;
562: for (i = 0; i < mbs; i++) {
563: n = ai[1] - ai[0]; /* length of i_th block row of A */
564: x1 = xb[0];
565: x2 = xb[1];
566: x3 = xb[2];
567: x4 = xb[3];
568: x5 = xb[4];
569: x6 = xb[5];
570: x7 = xb[6];
571: ib = aj + *ai;
572: jmin = 0;
573: nonzerorow += (n > 0);
574: if (*ib == i) { /* (diag of A)*x */
575: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
576: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
577: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
578: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
579: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
580: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
581: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
582: v += 49;
583: jmin++;
584: }
585: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
586: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
587: for (j = jmin; j < n; j++) {
588: /* (strict lower triangular part of A)*x */
589: cval = ib[j] * 7;
590: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
591: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
592: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
593: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
594: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
595: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
596: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
597: /* (strict upper triangular part of A)*x */
598: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
599: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
600: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
601: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
602: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
603: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
604: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
605: v += 49;
606: }
607: xb += 7;
608: ai++;
609: }
610: PetscCall(VecRestoreArrayRead(xx, &x));
611: PetscCall(VecRestoreArray(zz, &z));
612: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
613: PetscFunctionReturn(PETSC_SUCCESS);
614: }
616: /*
617: This will not work with MatScalar == float because it calls the BLAS
618: */
619: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
620: {
621: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
622: PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
623: const PetscScalar *x, *x_ptr, *xb;
624: const MatScalar *v;
625: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
626: const PetscInt *idx, *aj, *ii;
627: PetscInt nonzerorow = 0;
629: PetscFunctionBegin;
630: PetscCall(VecSet(zz, zero));
631: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
632: PetscCall(VecGetArrayRead(xx, &x));
633: PetscCall(VecGetArray(zz, &z));
635: x_ptr = x;
636: z_ptr = z;
638: aj = a->j;
639: v = a->a;
640: ii = a->i;
642: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
643: work = a->mult_work;
645: for (i = 0; i < mbs; i++) {
646: n = ii[1] - ii[0];
647: ncols = n * bs;
648: workt = work;
649: idx = aj + ii[0];
650: nonzerorow += (n > 0);
652: /* upper triangular part */
653: for (j = 0; j < n; j++) {
654: xb = x_ptr + bs * (*idx++);
655: for (k = 0; k < bs; k++) workt[k] = xb[k];
656: workt += bs;
657: }
658: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
659: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
661: /* strict lower triangular part */
662: idx = aj + ii[0];
663: if (n && *idx == i) {
664: ncols -= bs;
665: v += bs2;
666: idx++;
667: n--;
668: }
670: if (ncols > 0) {
671: workt = work;
672: PetscCall(PetscArrayzero(workt, ncols));
673: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
674: for (j = 0; j < n; j++) {
675: zb = z_ptr + bs * (*idx++);
676: for (k = 0; k < bs; k++) zb[k] += workt[k];
677: workt += bs;
678: }
679: }
680: x += bs;
681: v += n * bs2;
682: z += bs;
683: ii++;
684: }
686: PetscCall(VecRestoreArrayRead(xx, &x));
687: PetscCall(VecRestoreArray(zz, &z));
688: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
689: PetscFunctionReturn(PETSC_SUCCESS);
690: }
692: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
693: {
694: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
695: PetscScalar *z, x1;
696: const PetscScalar *x, *xb;
697: const MatScalar *v;
698: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
699: const PetscInt *aj = a->j, *ai = a->i, *ib;
700: PetscInt nonzerorow = 0;
701: #if defined(PETSC_USE_COMPLEX)
702: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
703: #else
704: const int aconj = 0;
705: #endif
707: PetscFunctionBegin;
708: PetscCall(VecCopy(yy, zz));
709: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
710: PetscCall(VecGetArrayRead(xx, &x));
711: PetscCall(VecGetArray(zz, &z));
712: v = a->a;
713: xb = x;
715: for (i = 0; i < mbs; i++) {
716: n = ai[1] - ai[0]; /* length of i_th row of A */
717: x1 = xb[0];
718: ib = aj + *ai;
719: jmin = 0;
720: nonzerorow += (n > 0);
721: if (n && *ib == i) { /* (diag of A)*x */
722: z[i] += *v++ * x[*ib++];
723: jmin++;
724: }
725: if (aconj) {
726: for (j = jmin; j < n; j++) {
727: cval = *ib;
728: z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
729: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
730: }
731: } else {
732: for (j = jmin; j < n; j++) {
733: cval = *ib;
734: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
735: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
736: }
737: }
738: xb++;
739: ai++;
740: }
742: PetscCall(VecRestoreArrayRead(xx, &x));
743: PetscCall(VecRestoreArray(zz, &z));
745: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
746: PetscFunctionReturn(PETSC_SUCCESS);
747: }
749: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
750: {
751: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
752: PetscScalar *z, x1, x2;
753: const PetscScalar *x, *xb;
754: const MatScalar *v;
755: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
756: const PetscInt *aj = a->j, *ai = a->i, *ib;
757: PetscInt nonzerorow = 0;
759: PetscFunctionBegin;
760: PetscCall(VecCopy(yy, zz));
761: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
762: PetscCall(VecGetArrayRead(xx, &x));
763: PetscCall(VecGetArray(zz, &z));
765: v = a->a;
766: xb = x;
768: for (i = 0; i < mbs; i++) {
769: n = ai[1] - ai[0]; /* length of i_th block row of A */
770: x1 = xb[0];
771: x2 = xb[1];
772: ib = aj + *ai;
773: jmin = 0;
774: nonzerorow += (n > 0);
775: if (n && *ib == i) { /* (diag of A)*x */
776: z[2 * i] += v[0] * x1 + v[2] * x2;
777: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
778: v += 4;
779: jmin++;
780: }
781: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
782: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
783: for (j = jmin; j < n; j++) {
784: /* (strict lower triangular part of A)*x */
785: cval = ib[j] * 2;
786: z[cval] += v[0] * x1 + v[1] * x2;
787: z[cval + 1] += v[2] * x1 + v[3] * x2;
788: /* (strict upper triangular part of A)*x */
789: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
790: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
791: v += 4;
792: }
793: xb += 2;
794: ai++;
795: }
796: PetscCall(VecRestoreArrayRead(xx, &x));
797: PetscCall(VecRestoreArray(zz, &z));
799: PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
800: PetscFunctionReturn(PETSC_SUCCESS);
801: }
803: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
804: {
805: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
806: PetscScalar *z, x1, x2, x3;
807: const PetscScalar *x, *xb;
808: const MatScalar *v;
809: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
810: const PetscInt *aj = a->j, *ai = a->i, *ib;
811: PetscInt nonzerorow = 0;
813: PetscFunctionBegin;
814: PetscCall(VecCopy(yy, zz));
815: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
816: PetscCall(VecGetArrayRead(xx, &x));
817: PetscCall(VecGetArray(zz, &z));
819: v = a->a;
820: xb = x;
822: for (i = 0; i < mbs; i++) {
823: n = ai[1] - ai[0]; /* length of i_th block row of A */
824: x1 = xb[0];
825: x2 = xb[1];
826: x3 = xb[2];
827: ib = aj + *ai;
828: jmin = 0;
829: nonzerorow += (n > 0);
830: if (n && *ib == i) { /* (diag of A)*x */
831: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
832: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
833: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
834: v += 9;
835: jmin++;
836: }
837: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
838: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
839: for (j = jmin; j < n; j++) {
840: /* (strict lower triangular part of A)*x */
841: cval = ib[j] * 3;
842: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
843: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
844: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
845: /* (strict upper triangular part of A)*x */
846: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
847: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
848: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
849: v += 9;
850: }
851: xb += 3;
852: ai++;
853: }
855: PetscCall(VecRestoreArrayRead(xx, &x));
856: PetscCall(VecRestoreArray(zz, &z));
858: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
859: PetscFunctionReturn(PETSC_SUCCESS);
860: }
862: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
863: {
864: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
865: PetscScalar *z, x1, x2, x3, x4;
866: const PetscScalar *x, *xb;
867: const MatScalar *v;
868: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
869: const PetscInt *aj = a->j, *ai = a->i, *ib;
870: PetscInt nonzerorow = 0;
872: PetscFunctionBegin;
873: PetscCall(VecCopy(yy, zz));
874: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
875: PetscCall(VecGetArrayRead(xx, &x));
876: PetscCall(VecGetArray(zz, &z));
878: v = a->a;
879: xb = x;
881: for (i = 0; i < mbs; i++) {
882: n = ai[1] - ai[0]; /* length of i_th block row of A */
883: x1 = xb[0];
884: x2 = xb[1];
885: x3 = xb[2];
886: x4 = xb[3];
887: ib = aj + *ai;
888: jmin = 0;
889: nonzerorow += (n > 0);
890: if (n && *ib == i) { /* (diag of A)*x */
891: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
892: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
893: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
894: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
895: v += 16;
896: jmin++;
897: }
898: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
899: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
900: for (j = jmin; j < n; j++) {
901: /* (strict lower triangular part of A)*x */
902: cval = ib[j] * 4;
903: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
904: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
905: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
906: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
907: /* (strict upper triangular part of A)*x */
908: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
909: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
910: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
911: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
912: v += 16;
913: }
914: xb += 4;
915: ai++;
916: }
918: PetscCall(VecRestoreArrayRead(xx, &x));
919: PetscCall(VecRestoreArray(zz, &z));
921: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
922: PetscFunctionReturn(PETSC_SUCCESS);
923: }
925: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
926: {
927: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
928: PetscScalar *z, x1, x2, x3, x4, x5;
929: const PetscScalar *x, *xb;
930: const MatScalar *v;
931: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
932: const PetscInt *aj = a->j, *ai = a->i, *ib;
933: PetscInt nonzerorow = 0;
935: PetscFunctionBegin;
936: PetscCall(VecCopy(yy, zz));
937: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
938: PetscCall(VecGetArrayRead(xx, &x));
939: PetscCall(VecGetArray(zz, &z));
941: v = a->a;
942: xb = x;
944: for (i = 0; i < mbs; i++) {
945: n = ai[1] - ai[0]; /* length of i_th block row of A */
946: x1 = xb[0];
947: x2 = xb[1];
948: x3 = xb[2];
949: x4 = xb[3];
950: x5 = xb[4];
951: ib = aj + *ai;
952: jmin = 0;
953: nonzerorow += (n > 0);
954: if (n && *ib == i) { /* (diag of A)*x */
955: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
956: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
957: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
958: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
959: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
960: v += 25;
961: jmin++;
962: }
963: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
964: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
965: for (j = jmin; j < n; j++) {
966: /* (strict lower triangular part of A)*x */
967: cval = ib[j] * 5;
968: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
969: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
970: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
971: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
972: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
973: /* (strict upper triangular part of A)*x */
974: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
975: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
976: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
977: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
978: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
979: v += 25;
980: }
981: xb += 5;
982: ai++;
983: }
985: PetscCall(VecRestoreArrayRead(xx, &x));
986: PetscCall(VecRestoreArray(zz, &z));
988: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
989: PetscFunctionReturn(PETSC_SUCCESS);
990: }
992: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
993: {
994: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
995: PetscScalar *z, x1, x2, x3, x4, x5, x6;
996: const PetscScalar *x, *xb;
997: const MatScalar *v;
998: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
999: const PetscInt *aj = a->j, *ai = a->i, *ib;
1000: PetscInt nonzerorow = 0;
1002: PetscFunctionBegin;
1003: PetscCall(VecCopy(yy, zz));
1004: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1005: PetscCall(VecGetArrayRead(xx, &x));
1006: PetscCall(VecGetArray(zz, &z));
1008: v = a->a;
1009: xb = x;
1011: for (i = 0; i < mbs; i++) {
1012: n = ai[1] - ai[0]; /* length of i_th block row of A */
1013: x1 = xb[0];
1014: x2 = xb[1];
1015: x3 = xb[2];
1016: x4 = xb[3];
1017: x5 = xb[4];
1018: x6 = xb[5];
1019: ib = aj + *ai;
1020: jmin = 0;
1021: nonzerorow += (n > 0);
1022: if (n && *ib == i) { /* (diag of A)*x */
1023: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1024: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1025: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1026: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1027: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1028: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1029: v += 36;
1030: jmin++;
1031: }
1032: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1033: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1034: for (j = jmin; j < n; j++) {
1035: /* (strict lower triangular part of A)*x */
1036: cval = ib[j] * 6;
1037: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1038: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1039: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1040: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1041: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1042: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1043: /* (strict upper triangular part of A)*x */
1044: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1045: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1046: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1047: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1048: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1049: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1050: v += 36;
1051: }
1052: xb += 6;
1053: ai++;
1054: }
1056: PetscCall(VecRestoreArrayRead(xx, &x));
1057: PetscCall(VecRestoreArray(zz, &z));
1059: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1060: PetscFunctionReturn(PETSC_SUCCESS);
1061: }
1063: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1064: {
1065: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1066: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7;
1067: const PetscScalar *x, *xb;
1068: const MatScalar *v;
1069: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1070: const PetscInt *aj = a->j, *ai = a->i, *ib;
1071: PetscInt nonzerorow = 0;
1073: PetscFunctionBegin;
1074: PetscCall(VecCopy(yy, zz));
1075: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1076: PetscCall(VecGetArrayRead(xx, &x));
1077: PetscCall(VecGetArray(zz, &z));
1079: v = a->a;
1080: xb = x;
1082: for (i = 0; i < mbs; i++) {
1083: n = ai[1] - ai[0]; /* length of i_th block row of A */
1084: x1 = xb[0];
1085: x2 = xb[1];
1086: x3 = xb[2];
1087: x4 = xb[3];
1088: x5 = xb[4];
1089: x6 = xb[5];
1090: x7 = xb[6];
1091: ib = aj + *ai;
1092: jmin = 0;
1093: nonzerorow += (n > 0);
1094: if (n && *ib == i) { /* (diag of A)*x */
1095: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1096: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1097: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1098: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1099: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1100: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1101: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1102: v += 49;
1103: jmin++;
1104: }
1105: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1106: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1107: for (j = jmin; j < n; j++) {
1108: /* (strict lower triangular part of A)*x */
1109: cval = ib[j] * 7;
1110: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1111: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1112: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1113: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1114: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1115: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1116: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1117: /* (strict upper triangular part of A)*x */
1118: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1119: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1120: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1121: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1122: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1123: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1124: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1125: v += 49;
1126: }
1127: xb += 7;
1128: ai++;
1129: }
1131: PetscCall(VecRestoreArrayRead(xx, &x));
1132: PetscCall(VecRestoreArray(zz, &z));
1134: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1135: PetscFunctionReturn(PETSC_SUCCESS);
1136: }
1138: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1139: {
1140: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1141: PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt;
1142: const PetscScalar *x, *x_ptr, *xb;
1143: const MatScalar *v;
1144: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1145: const PetscInt *idx, *aj, *ii;
1146: PetscInt nonzerorow = 0;
1148: PetscFunctionBegin;
1149: PetscCall(VecCopy(yy, zz));
1150: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1151: PetscCall(VecGetArrayRead(xx, &x));
1152: x_ptr = x;
1153: PetscCall(VecGetArray(zz, &z));
1154: z_ptr = z;
1156: aj = a->j;
1157: v = a->a;
1158: ii = a->i;
1160: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1161: work = a->mult_work;
1163: for (i = 0; i < mbs; i++) {
1164: n = ii[1] - ii[0];
1165: ncols = n * bs;
1166: workt = work;
1167: idx = aj + ii[0];
1168: nonzerorow += (n > 0);
1170: /* upper triangular part */
1171: for (j = 0; j < n; j++) {
1172: xb = x_ptr + bs * (*idx++);
1173: for (k = 0; k < bs; k++) workt[k] = xb[k];
1174: workt += bs;
1175: }
1176: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1177: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1179: /* strict lower triangular part */
1180: idx = aj + ii[0];
1181: if (n && *idx == i) {
1182: ncols -= bs;
1183: v += bs2;
1184: idx++;
1185: n--;
1186: }
1187: if (ncols > 0) {
1188: workt = work;
1189: PetscCall(PetscArrayzero(workt, ncols));
1190: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1191: for (j = 0; j < n; j++) {
1192: zb = z_ptr + bs * (*idx++);
1193: for (k = 0; k < bs; k++) zb[k] += workt[k];
1194: workt += bs;
1195: }
1196: }
1198: x += bs;
1199: v += n * bs2;
1200: z += bs;
1201: ii++;
1202: }
1204: PetscCall(VecRestoreArrayRead(xx, &x));
1205: PetscCall(VecRestoreArray(zz, &z));
1207: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1208: PetscFunctionReturn(PETSC_SUCCESS);
1209: }
1211: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1212: {
1213: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data;
1214: PetscScalar oalpha = alpha;
1215: PetscBLASInt one = 1, totalnz;
1217: PetscFunctionBegin;
1218: PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1219: PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1220: PetscCall(PetscLogFlops(totalnz));
1221: PetscFunctionReturn(PETSC_SUCCESS);
1222: }
1224: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1225: {
1226: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1227: const MatScalar *v = a->a;
1228: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1229: PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1230: const PetscInt *aj = a->j, *col;
1232: PetscFunctionBegin;
1233: if (!a->nz) {
1234: *norm = 0.0;
1235: PetscFunctionReturn(PETSC_SUCCESS);
1236: }
1237: if (type == NORM_FROBENIUS) {
1238: for (k = 0; k < mbs; k++) {
1239: jmin = a->i[k];
1240: jmax = a->i[k + 1];
1241: col = aj + jmin;
1242: if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1243: for (i = 0; i < bs2; i++) {
1244: sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1245: v++;
1246: }
1247: jmin++;
1248: }
1249: for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1250: for (i = 0; i < bs2; i++) {
1251: sum_off += PetscRealPart(PetscConj(*v) * (*v));
1252: v++;
1253: }
1254: }
1255: }
1256: *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1257: PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1258: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1259: PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1260: for (i = 0; i < mbs; i++) jl[i] = mbs;
1261: il[0] = 0;
1263: *norm = 0.0;
1264: for (k = 0; k < mbs; k++) { /* k_th block row */
1265: for (j = 0; j < bs; j++) sum[j] = 0.0;
1266: /*-- col sum --*/
1267: i = jl[k]; /* first |A(i,k)| to be added */
1268: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1269: at step k */
1270: while (i < mbs) {
1271: nexti = jl[i]; /* next block row to be added */
1272: ik = il[i]; /* block index of A(i,k) in the array a */
1273: for (j = 0; j < bs; j++) {
1274: v = a->a + ik * bs2 + j * bs;
1275: for (k1 = 0; k1 < bs; k1++) {
1276: sum[j] += PetscAbsScalar(*v);
1277: v++;
1278: }
1279: }
1280: /* update il, jl */
1281: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1282: jmax = a->i[i + 1];
1283: if (jmin < jmax) {
1284: il[i] = jmin;
1285: j = a->j[jmin];
1286: jl[i] = jl[j];
1287: jl[j] = i;
1288: }
1289: i = nexti;
1290: }
1291: /*-- row sum --*/
1292: jmin = a->i[k];
1293: jmax = a->i[k + 1];
1294: for (i = jmin; i < jmax; i++) {
1295: for (j = 0; j < bs; j++) {
1296: v = a->a + i * bs2 + j;
1297: for (k1 = 0; k1 < bs; k1++) {
1298: sum[j] += PetscAbsScalar(*v);
1299: v += bs;
1300: }
1301: }
1302: }
1303: /* add k_th block row to il, jl */
1304: col = aj + jmin;
1305: if (jmax - jmin > 0 && *col == k) jmin++;
1306: if (jmin < jmax) {
1307: il[k] = jmin;
1308: j = a->j[jmin];
1309: jl[k] = jl[j];
1310: jl[j] = k;
1311: }
1312: for (j = 0; j < bs; j++) {
1313: if (sum[j] > *norm) *norm = sum[j];
1314: }
1315: }
1316: PetscCall(PetscFree3(sum, il, jl));
1317: PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1318: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1319: PetscFunctionReturn(PETSC_SUCCESS);
1320: }
1322: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1323: {
1324: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1326: PetscFunctionBegin;
1327: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1328: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1329: *flg = PETSC_FALSE;
1330: PetscFunctionReturn(PETSC_SUCCESS);
1331: }
1333: /* if the a->i are the same */
1334: PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1335: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1337: /* if a->j are the same */
1338: PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1339: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1341: /* if a->a are the same */
1342: PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1343: PetscFunctionReturn(PETSC_SUCCESS);
1344: }
1346: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1347: {
1348: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1349: PetscInt i, j, k, row, bs, ambs, bs2;
1350: const PetscInt *ai, *aj;
1351: PetscScalar *x, zero = 0.0;
1352: const MatScalar *aa, *aa_j;
1354: PetscFunctionBegin;
1355: bs = A->rmap->bs;
1356: PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1358: aa = a->a;
1359: ambs = a->mbs;
1361: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1362: PetscInt *diag = a->diag;
1363: aa = a->a;
1364: ambs = a->mbs;
1365: PetscCall(VecGetArray(v, &x));
1366: for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1367: PetscCall(VecRestoreArray(v, &x));
1368: PetscFunctionReturn(PETSC_SUCCESS);
1369: }
1371: ai = a->i;
1372: aj = a->j;
1373: bs2 = a->bs2;
1374: PetscCall(VecSet(v, zero));
1375: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1376: PetscCall(VecGetArray(v, &x));
1377: for (i = 0; i < ambs; i++) {
1378: j = ai[i];
1379: if (aj[j] == i) { /* if this is a diagonal element */
1380: row = i * bs;
1381: aa_j = aa + j * bs2;
1382: for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1383: }
1384: }
1385: PetscCall(VecRestoreArray(v, &x));
1386: PetscFunctionReturn(PETSC_SUCCESS);
1387: }
1389: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1390: {
1391: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1392: PetscScalar x;
1393: const PetscScalar *l, *li, *ri;
1394: MatScalar *aa, *v;
1395: PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1396: const PetscInt *ai, *aj;
1397: PetscBool flg;
1399: PetscFunctionBegin;
1400: if (ll != rr) {
1401: PetscCall(VecEqual(ll, rr, &flg));
1402: PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1403: }
1404: if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1405: ai = a->i;
1406: aj = a->j;
1407: aa = a->a;
1408: m = A->rmap->N;
1409: bs = A->rmap->bs;
1410: mbs = a->mbs;
1411: bs2 = a->bs2;
1413: PetscCall(VecGetArrayRead(ll, &l));
1414: PetscCall(VecGetLocalSize(ll, &lm));
1415: PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1416: for (i = 0; i < mbs; i++) { /* for each block row */
1417: M = ai[i + 1] - ai[i];
1418: li = l + i * bs;
1419: v = aa + bs2 * ai[i];
1420: for (j = 0; j < M; j++) { /* for each block */
1421: ri = l + bs * aj[ai[i] + j];
1422: for (k = 0; k < bs; k++) {
1423: x = ri[k];
1424: for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1425: }
1426: }
1427: }
1428: PetscCall(VecRestoreArrayRead(ll, &l));
1429: PetscCall(PetscLogFlops(2.0 * a->nz));
1430: PetscFunctionReturn(PETSC_SUCCESS);
1431: }
1433: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1434: {
1435: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1437: PetscFunctionBegin;
1438: info->block_size = a->bs2;
1439: info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1440: info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */
1441: info->nz_unneeded = info->nz_allocated - info->nz_used;
1442: info->assemblies = A->num_ass;
1443: info->mallocs = A->info.mallocs;
1444: info->memory = 0; /* REVIEW ME */
1445: if (A->factortype) {
1446: info->fill_ratio_given = A->info.fill_ratio_given;
1447: info->fill_ratio_needed = A->info.fill_ratio_needed;
1448: info->factor_mallocs = A->info.factor_mallocs;
1449: } else {
1450: info->fill_ratio_given = 0;
1451: info->fill_ratio_needed = 0;
1452: info->factor_mallocs = 0;
1453: }
1454: PetscFunctionReturn(PETSC_SUCCESS);
1455: }
1457: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1458: {
1459: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1461: PetscFunctionBegin;
1462: PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1463: PetscFunctionReturn(PETSC_SUCCESS);
1464: }
1466: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1467: {
1468: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1469: PetscInt i, j, n, row, col, bs, mbs;
1470: const PetscInt *ai, *aj;
1471: PetscReal atmp;
1472: const MatScalar *aa;
1473: PetscScalar *x;
1474: PetscInt ncols, brow, bcol, krow, kcol;
1476: PetscFunctionBegin;
1477: PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1478: PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1479: bs = A->rmap->bs;
1480: aa = a->a;
1481: ai = a->i;
1482: aj = a->j;
1483: mbs = a->mbs;
1485: PetscCall(VecSet(v, 0.0));
1486: PetscCall(VecGetArray(v, &x));
1487: PetscCall(VecGetLocalSize(v, &n));
1488: PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1489: for (i = 0; i < mbs; i++) {
1490: ncols = ai[1] - ai[0];
1491: ai++;
1492: brow = bs * i;
1493: for (j = 0; j < ncols; j++) {
1494: bcol = bs * (*aj);
1495: for (kcol = 0; kcol < bs; kcol++) {
1496: col = bcol + kcol; /* col index */
1497: for (krow = 0; krow < bs; krow++) {
1498: atmp = PetscAbsScalar(*aa);
1499: aa++;
1500: row = brow + krow; /* row index */
1501: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1502: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1503: }
1504: }
1505: aj++;
1506: }
1507: }
1508: PetscCall(VecRestoreArray(v, &x));
1509: PetscFunctionReturn(PETSC_SUCCESS);
1510: }
1512: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1513: {
1514: PetscFunctionBegin;
1515: PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1516: C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1517: PetscFunctionReturn(PETSC_SUCCESS);
1518: }
1520: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1521: {
1522: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1523: PetscScalar *z = c;
1524: const PetscScalar *xb;
1525: PetscScalar x1;
1526: const MatScalar *v = a->a, *vv;
1527: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1528: #if defined(PETSC_USE_COMPLEX)
1529: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1530: #else
1531: const int aconj = 0;
1532: #endif
1534: PetscFunctionBegin;
1535: for (i = 0; i < mbs; i++) {
1536: n = ii[1] - ii[0];
1537: ii++;
1538: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1539: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1540: jj = idx;
1541: vv = v;
1542: for (k = 0; k < cn; k++) {
1543: idx = jj;
1544: v = vv;
1545: for (j = 0; j < n; j++) {
1546: xb = b + (*idx);
1547: x1 = xb[0 + k * bm];
1548: z[0 + k * cm] += v[0] * x1;
1549: if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1550: v += 1;
1551: ++idx;
1552: }
1553: }
1554: z += 1;
1555: }
1556: PetscFunctionReturn(PETSC_SUCCESS);
1557: }
1559: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1560: {
1561: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1562: PetscScalar *z = c;
1563: const PetscScalar *xb;
1564: PetscScalar x1, x2;
1565: const MatScalar *v = a->a, *vv;
1566: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1568: PetscFunctionBegin;
1569: for (i = 0; i < mbs; i++) {
1570: n = ii[1] - ii[0];
1571: ii++;
1572: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1573: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1574: jj = idx;
1575: vv = v;
1576: for (k = 0; k < cn; k++) {
1577: idx = jj;
1578: v = vv;
1579: for (j = 0; j < n; j++) {
1580: xb = b + 2 * (*idx);
1581: x1 = xb[0 + k * bm];
1582: x2 = xb[1 + k * bm];
1583: z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1584: z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1585: if (*idx != i) {
1586: c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1587: c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1588: }
1589: v += 4;
1590: ++idx;
1591: }
1592: }
1593: z += 2;
1594: }
1595: PetscFunctionReturn(PETSC_SUCCESS);
1596: }
1598: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1599: {
1600: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1601: PetscScalar *z = c;
1602: const PetscScalar *xb;
1603: PetscScalar x1, x2, x3;
1604: const MatScalar *v = a->a, *vv;
1605: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1607: PetscFunctionBegin;
1608: for (i = 0; i < mbs; i++) {
1609: n = ii[1] - ii[0];
1610: ii++;
1611: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1612: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1613: jj = idx;
1614: vv = v;
1615: for (k = 0; k < cn; k++) {
1616: idx = jj;
1617: v = vv;
1618: for (j = 0; j < n; j++) {
1619: xb = b + 3 * (*idx);
1620: x1 = xb[0 + k * bm];
1621: x2 = xb[1 + k * bm];
1622: x3 = xb[2 + k * bm];
1623: z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1624: z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1625: z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1626: if (*idx != i) {
1627: c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1628: c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1629: c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1630: }
1631: v += 9;
1632: ++idx;
1633: }
1634: }
1635: z += 3;
1636: }
1637: PetscFunctionReturn(PETSC_SUCCESS);
1638: }
1640: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1641: {
1642: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1643: PetscScalar *z = c;
1644: const PetscScalar *xb;
1645: PetscScalar x1, x2, x3, x4;
1646: const MatScalar *v = a->a, *vv;
1647: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1649: PetscFunctionBegin;
1650: for (i = 0; i < mbs; i++) {
1651: n = ii[1] - ii[0];
1652: ii++;
1653: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1654: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1655: jj = idx;
1656: vv = v;
1657: for (k = 0; k < cn; k++) {
1658: idx = jj;
1659: v = vv;
1660: for (j = 0; j < n; j++) {
1661: xb = b + 4 * (*idx);
1662: x1 = xb[0 + k * bm];
1663: x2 = xb[1 + k * bm];
1664: x3 = xb[2 + k * bm];
1665: x4 = xb[3 + k * bm];
1666: z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1667: z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1668: z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1669: z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1670: if (*idx != i) {
1671: c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1672: c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1673: c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1674: c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1675: }
1676: v += 16;
1677: ++idx;
1678: }
1679: }
1680: z += 4;
1681: }
1682: PetscFunctionReturn(PETSC_SUCCESS);
1683: }
1685: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1686: {
1687: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1688: PetscScalar *z = c;
1689: const PetscScalar *xb;
1690: PetscScalar x1, x2, x3, x4, x5;
1691: const MatScalar *v = a->a, *vv;
1692: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1694: PetscFunctionBegin;
1695: for (i = 0; i < mbs; i++) {
1696: n = ii[1] - ii[0];
1697: ii++;
1698: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1699: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1700: jj = idx;
1701: vv = v;
1702: for (k = 0; k < cn; k++) {
1703: idx = jj;
1704: v = vv;
1705: for (j = 0; j < n; j++) {
1706: xb = b + 5 * (*idx);
1707: x1 = xb[0 + k * bm];
1708: x2 = xb[1 + k * bm];
1709: x3 = xb[2 + k * bm];
1710: x4 = xb[3 + k * bm];
1711: x5 = xb[4 + k * cm];
1712: z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1713: z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1714: z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1715: z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1716: z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1717: if (*idx != i) {
1718: c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1719: c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1720: c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1721: c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1722: c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1723: }
1724: v += 25;
1725: ++idx;
1726: }
1727: }
1728: z += 5;
1729: }
1730: PetscFunctionReturn(PETSC_SUCCESS);
1731: }
1733: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1734: {
1735: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1736: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
1737: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
1738: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1739: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1740: PetscBLASInt bbs, bcn, bbm, bcm;
1741: PetscScalar *z = NULL;
1742: PetscScalar *c, *b;
1743: const MatScalar *v;
1744: const PetscInt *idx, *ii;
1745: PetscScalar _DOne = 1.0;
1747: PetscFunctionBegin;
1748: if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1749: PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1750: PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1751: PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1752: b = bd->v;
1753: PetscCall(MatZeroEntries(C));
1754: PetscCall(MatDenseGetArray(C, &c));
1755: switch (bs) {
1756: case 1:
1757: PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1758: break;
1759: case 2:
1760: PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1761: break;
1762: case 3:
1763: PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1764: break;
1765: case 4:
1766: PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1767: break;
1768: case 5:
1769: PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1770: break;
1771: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1772: PetscCall(PetscBLASIntCast(bs, &bbs));
1773: PetscCall(PetscBLASIntCast(cn, &bcn));
1774: PetscCall(PetscBLASIntCast(bm, &bbm));
1775: PetscCall(PetscBLASIntCast(cm, &bcm));
1776: idx = a->j;
1777: v = a->a;
1778: mbs = a->mbs;
1779: ii = a->i;
1780: z = c;
1781: for (i = 0; i < mbs; i++) {
1782: n = ii[1] - ii[0];
1783: ii++;
1784: for (j = 0; j < n; j++) {
1785: if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1786: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1787: v += bs2;
1788: }
1789: z += bs;
1790: }
1791: }
1792: PetscCall(MatDenseRestoreArray(C, &c));
1793: PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1794: PetscFunctionReturn(PETSC_SUCCESS);
1795: }