Actual source code: baij2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <../src/mat/impls/dense/seq/dense.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8: #include <immintrin.h>
9: #endif
11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
12: {
13: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
14: PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival;
15: const PetscInt *idx;
16: PetscInt start, end, *ai, *aj, bs;
17: PetscBT table;
19: PetscFunctionBegin;
20: m = a->mbs;
21: ai = a->i;
22: aj = a->j;
23: bs = A->rmap->bs;
25: PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
27: PetscCall(PetscBTCreate(m, &table));
28: PetscCall(PetscMalloc1(m + 1, &nidx));
30: for (i = 0; i < is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscCall(PetscBTMemzero(m, table));
35: /* Extract the indices, assume there can be duplicate entries */
36: PetscCall(ISGetIndices(is[i], &idx));
37: PetscCall(ISGetLocalSize(is[i], &n));
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j = 0; j < n; ++j) {
41: ival = idx[j] / bs; /* convert the indices into block indices */
42: PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
43: if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44: }
45: PetscCall(ISRestoreIndices(is[i], &idx));
46: PetscCall(ISDestroy(&is[i]));
48: k = 0;
49: for (j = 0; j < ov; j++) { /* for each overlap*/
50: n = isz;
51: for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row + 1];
55: for (l = start; l < end; l++) {
56: val = aj[l];
57: if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58: }
59: }
60: }
61: PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
62: }
63: PetscCall(PetscBTDestroy(&table));
64: PetscCall(PetscFree(nidx));
65: PetscFunctionReturn(PETSC_SUCCESS);
66: }
68: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
69: {
70: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c;
71: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
72: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
73: const PetscInt *irow, *icol;
74: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
75: PetscInt *aj = a->j, *ai = a->i;
76: MatScalar *mat_a;
77: Mat C;
78: PetscBool flag;
80: PetscFunctionBegin;
81: PetscCall(ISGetIndices(isrow, &irow));
82: PetscCall(ISGetIndices(iscol, &icol));
83: PetscCall(ISGetLocalSize(isrow, &nrows));
84: PetscCall(ISGetLocalSize(iscol, &ncols));
86: PetscCall(PetscCalloc1(1 + oldcols, &smap));
87: ssmap = smap;
88: PetscCall(PetscMalloc1(1 + nrows, &lens));
89: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
90: /* determine lens of each row */
91: for (i = 0; i < nrows; i++) {
92: kstart = ai[irow[i]];
93: kend = kstart + a->ilen[irow[i]];
94: lens[i] = 0;
95: for (k = kstart; k < kend; k++) {
96: if (ssmap[aj[k]]) lens[i]++;
97: }
98: }
99: /* Create and fill new matrix */
100: if (scall == MAT_REUSE_MATRIX) {
101: c = (Mat_SeqBAIJ *)((*B)->data);
103: PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
104: PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
105: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
106: PetscCall(PetscArrayzero(c->ilen, c->mbs));
107: C = *B;
108: } else {
109: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
110: PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
111: PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
112: PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
113: }
114: c = (Mat_SeqBAIJ *)(C->data);
115: for (i = 0; i < nrows; i++) {
116: row = irow[i];
117: kstart = ai[row];
118: kend = kstart + a->ilen[row];
119: mat_i = c->i[i];
120: mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */
121: mat_a = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
122: mat_ilen = c->ilen + i;
123: for (k = kstart; k < kend; k++) {
124: if ((tcol = ssmap[a->j[k]])) {
125: *mat_j++ = tcol - 1;
126: PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
127: mat_a += bs2;
128: (*mat_ilen)++;
129: }
130: }
131: }
132: /* sort */
133: if (c->j && c->a) {
134: MatScalar *work;
135: PetscCall(PetscMalloc1(bs2, &work));
136: for (i = 0; i < nrows; i++) {
137: PetscInt ilen;
138: mat_i = c->i[i];
139: mat_j = c->j + mat_i;
140: mat_a = c->a + mat_i * bs2;
141: ilen = c->ilen[i];
142: PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
143: }
144: PetscCall(PetscFree(work));
145: }
147: /* Free work space */
148: PetscCall(ISRestoreIndices(iscol, &icol));
149: PetscCall(PetscFree(smap));
150: PetscCall(PetscFree(lens));
151: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
152: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
154: PetscCall(ISRestoreIndices(isrow, &irow));
155: *B = C;
156: PetscFunctionReturn(PETSC_SUCCESS);
157: }
159: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
160: {
161: IS is1, is2;
163: PetscFunctionBegin;
164: PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
165: if (isrow == iscol) {
166: is2 = is1;
167: PetscCall(PetscObjectReference((PetscObject)is2));
168: } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
169: PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
170: PetscCall(ISDestroy(&is1));
171: PetscCall(ISDestroy(&is2));
172: PetscFunctionReturn(PETSC_SUCCESS);
173: }
175: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
176: {
177: Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data;
178: Mat_SubSppt *submatj = c->submatis1;
180: PetscFunctionBegin;
181: PetscCall((*submatj->destroy)(C));
182: PetscCall(MatDestroySubMatrix_Private(submatj));
183: PetscFunctionReturn(PETSC_SUCCESS);
184: }
186: /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
187: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
188: {
189: PetscInt i;
190: Mat C;
191: Mat_SeqBAIJ *c;
192: Mat_SubSppt *submatj;
194: PetscFunctionBegin;
195: for (i = 0; i < n; i++) {
196: C = (*mat)[i];
197: c = (Mat_SeqBAIJ *)C->data;
198: submatj = c->submatis1;
199: if (submatj) {
200: if (--((PetscObject)C)->refct <= 0) {
201: PetscCall(PetscFree(C->factorprefix));
202: PetscCall((*submatj->destroy)(C));
203: PetscCall(MatDestroySubMatrix_Private(submatj));
204: PetscCall(PetscFree(C->defaultvectype));
205: PetscCall(PetscFree(C->defaultrandtype));
206: PetscCall(PetscLayoutDestroy(&C->rmap));
207: PetscCall(PetscLayoutDestroy(&C->cmap));
208: PetscCall(PetscHeaderDestroy(&C));
209: }
210: } else {
211: PetscCall(MatDestroy(&C));
212: }
213: }
215: /* Destroy Dummy submatrices created for reuse */
216: PetscCall(MatDestroySubMatrices_Dummy(n, mat));
218: PetscCall(PetscFree(*mat));
219: PetscFunctionReturn(PETSC_SUCCESS);
220: }
222: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
223: {
224: PetscInt i;
226: PetscFunctionBegin;
227: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
229: for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
230: PetscFunctionReturn(PETSC_SUCCESS);
231: }
233: /* Should check that shapes of vectors and matrices match */
234: PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
235: {
236: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
237: PetscScalar *z, sum;
238: const PetscScalar *x;
239: const MatScalar *v;
240: PetscInt mbs, i, n;
241: const PetscInt *idx, *ii, *ridx = NULL;
242: PetscBool usecprow = a->compressedrow.use;
244: PetscFunctionBegin;
245: PetscCall(VecGetArrayRead(xx, &x));
246: PetscCall(VecGetArrayWrite(zz, &z));
248: if (usecprow) {
249: mbs = a->compressedrow.nrows;
250: ii = a->compressedrow.i;
251: ridx = a->compressedrow.rindex;
252: PetscCall(PetscArrayzero(z, a->mbs));
253: } else {
254: mbs = a->mbs;
255: ii = a->i;
256: }
258: for (i = 0; i < mbs; i++) {
259: n = ii[1] - ii[0];
260: v = a->a + ii[0];
261: idx = a->j + ii[0];
262: ii++;
263: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
264: PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
265: sum = 0.0;
266: PetscSparseDensePlusDot(sum, x, v, idx, n);
267: if (usecprow) {
268: z[ridx[i]] = sum;
269: } else {
270: z[i] = sum;
271: }
272: }
273: PetscCall(VecRestoreArrayRead(xx, &x));
274: PetscCall(VecRestoreArrayWrite(zz, &z));
275: PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
276: PetscFunctionReturn(PETSC_SUCCESS);
277: }
279: PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
280: {
281: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
282: PetscScalar *z = NULL, sum1, sum2, *zarray;
283: const PetscScalar *x, *xb;
284: PetscScalar x1, x2;
285: const MatScalar *v;
286: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
287: PetscBool usecprow = a->compressedrow.use;
289: PetscFunctionBegin;
290: PetscCall(VecGetArrayRead(xx, &x));
291: PetscCall(VecGetArrayWrite(zz, &zarray));
293: idx = a->j;
294: v = a->a;
295: if (usecprow) {
296: mbs = a->compressedrow.nrows;
297: ii = a->compressedrow.i;
298: ridx = a->compressedrow.rindex;
299: PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
300: } else {
301: mbs = a->mbs;
302: ii = a->i;
303: z = zarray;
304: }
306: for (i = 0; i < mbs; i++) {
307: n = ii[1] - ii[0];
308: ii++;
309: sum1 = 0.0;
310: sum2 = 0.0;
311: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
312: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
313: for (j = 0; j < n; j++) {
314: xb = x + 2 * (*idx++);
315: x1 = xb[0];
316: x2 = xb[1];
317: sum1 += v[0] * x1 + v[2] * x2;
318: sum2 += v[1] * x1 + v[3] * x2;
319: v += 4;
320: }
321: if (usecprow) z = zarray + 2 * ridx[i];
322: z[0] = sum1;
323: z[1] = sum2;
324: if (!usecprow) z += 2;
325: }
326: PetscCall(VecRestoreArrayRead(xx, &x));
327: PetscCall(VecRestoreArrayWrite(zz, &zarray));
328: PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
329: PetscFunctionReturn(PETSC_SUCCESS);
330: }
332: PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
333: {
334: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
335: PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
336: const PetscScalar *x, *xb;
337: const MatScalar *v;
338: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
339: PetscBool usecprow = a->compressedrow.use;
341: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
342: #pragma disjoint(*v, *z, *xb)
343: #endif
345: PetscFunctionBegin;
346: PetscCall(VecGetArrayRead(xx, &x));
347: PetscCall(VecGetArrayWrite(zz, &zarray));
349: idx = a->j;
350: v = a->a;
351: if (usecprow) {
352: mbs = a->compressedrow.nrows;
353: ii = a->compressedrow.i;
354: ridx = a->compressedrow.rindex;
355: PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
356: } else {
357: mbs = a->mbs;
358: ii = a->i;
359: z = zarray;
360: }
362: for (i = 0; i < mbs; i++) {
363: n = ii[1] - ii[0];
364: ii++;
365: sum1 = 0.0;
366: sum2 = 0.0;
367: sum3 = 0.0;
368: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
369: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
370: for (j = 0; j < n; j++) {
371: xb = x + 3 * (*idx++);
372: x1 = xb[0];
373: x2 = xb[1];
374: x3 = xb[2];
376: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
377: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
378: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
379: v += 9;
380: }
381: if (usecprow) z = zarray + 3 * ridx[i];
382: z[0] = sum1;
383: z[1] = sum2;
384: z[2] = sum3;
385: if (!usecprow) z += 3;
386: }
387: PetscCall(VecRestoreArrayRead(xx, &x));
388: PetscCall(VecRestoreArrayWrite(zz, &zarray));
389: PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
390: PetscFunctionReturn(PETSC_SUCCESS);
391: }
393: PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
394: {
395: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
396: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
397: const PetscScalar *x, *xb;
398: const MatScalar *v;
399: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
400: PetscBool usecprow = a->compressedrow.use;
402: PetscFunctionBegin;
403: PetscCall(VecGetArrayRead(xx, &x));
404: PetscCall(VecGetArrayWrite(zz, &zarray));
406: idx = a->j;
407: v = a->a;
408: if (usecprow) {
409: mbs = a->compressedrow.nrows;
410: ii = a->compressedrow.i;
411: ridx = a->compressedrow.rindex;
412: PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
413: } else {
414: mbs = a->mbs;
415: ii = a->i;
416: z = zarray;
417: }
419: for (i = 0; i < mbs; i++) {
420: n = ii[1] - ii[0];
421: ii++;
422: sum1 = 0.0;
423: sum2 = 0.0;
424: sum3 = 0.0;
425: sum4 = 0.0;
427: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
428: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
429: for (j = 0; j < n; j++) {
430: xb = x + 4 * (*idx++);
431: x1 = xb[0];
432: x2 = xb[1];
433: x3 = xb[2];
434: x4 = xb[3];
435: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
436: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
437: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
438: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
439: v += 16;
440: }
441: if (usecprow) z = zarray + 4 * ridx[i];
442: z[0] = sum1;
443: z[1] = sum2;
444: z[2] = sum3;
445: z[3] = sum4;
446: if (!usecprow) z += 4;
447: }
448: PetscCall(VecRestoreArrayRead(xx, &x));
449: PetscCall(VecRestoreArrayWrite(zz, &zarray));
450: PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
451: PetscFunctionReturn(PETSC_SUCCESS);
452: }
454: PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
455: {
456: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
457: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
458: const PetscScalar *xb, *x;
459: const MatScalar *v;
460: const PetscInt *idx, *ii, *ridx = NULL;
461: PetscInt mbs, i, j, n;
462: PetscBool usecprow = a->compressedrow.use;
464: PetscFunctionBegin;
465: PetscCall(VecGetArrayRead(xx, &x));
466: PetscCall(VecGetArrayWrite(zz, &zarray));
468: idx = a->j;
469: v = a->a;
470: if (usecprow) {
471: mbs = a->compressedrow.nrows;
472: ii = a->compressedrow.i;
473: ridx = a->compressedrow.rindex;
474: PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
475: } else {
476: mbs = a->mbs;
477: ii = a->i;
478: z = zarray;
479: }
481: for (i = 0; i < mbs; i++) {
482: n = ii[1] - ii[0];
483: ii++;
484: sum1 = 0.0;
485: sum2 = 0.0;
486: sum3 = 0.0;
487: sum4 = 0.0;
488: sum5 = 0.0;
489: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
490: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
491: for (j = 0; j < n; j++) {
492: xb = x + 5 * (*idx++);
493: x1 = xb[0];
494: x2 = xb[1];
495: x3 = xb[2];
496: x4 = xb[3];
497: x5 = xb[4];
498: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
499: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
500: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
501: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
502: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
503: v += 25;
504: }
505: if (usecprow) z = zarray + 5 * ridx[i];
506: z[0] = sum1;
507: z[1] = sum2;
508: z[2] = sum3;
509: z[3] = sum4;
510: z[4] = sum5;
511: if (!usecprow) z += 5;
512: }
513: PetscCall(VecRestoreArrayRead(xx, &x));
514: PetscCall(VecRestoreArrayWrite(zz, &zarray));
515: PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
516: PetscFunctionReturn(PETSC_SUCCESS);
517: }
519: PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
520: {
521: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
522: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
523: const PetscScalar *x, *xb;
524: PetscScalar x1, x2, x3, x4, x5, x6, *zarray;
525: const MatScalar *v;
526: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
527: PetscBool usecprow = a->compressedrow.use;
529: PetscFunctionBegin;
530: PetscCall(VecGetArrayRead(xx, &x));
531: PetscCall(VecGetArrayWrite(zz, &zarray));
533: idx = a->j;
534: v = a->a;
535: if (usecprow) {
536: mbs = a->compressedrow.nrows;
537: ii = a->compressedrow.i;
538: ridx = a->compressedrow.rindex;
539: PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
540: } else {
541: mbs = a->mbs;
542: ii = a->i;
543: z = zarray;
544: }
546: for (i = 0; i < mbs; i++) {
547: n = ii[1] - ii[0];
548: ii++;
549: sum1 = 0.0;
550: sum2 = 0.0;
551: sum3 = 0.0;
552: sum4 = 0.0;
553: sum5 = 0.0;
554: sum6 = 0.0;
556: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
557: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
558: for (j = 0; j < n; j++) {
559: xb = x + 6 * (*idx++);
560: x1 = xb[0];
561: x2 = xb[1];
562: x3 = xb[2];
563: x4 = xb[3];
564: x5 = xb[4];
565: x6 = xb[5];
566: sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
567: sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
568: sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
569: sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
570: sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
571: sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
572: v += 36;
573: }
574: if (usecprow) z = zarray + 6 * ridx[i];
575: z[0] = sum1;
576: z[1] = sum2;
577: z[2] = sum3;
578: z[3] = sum4;
579: z[4] = sum5;
580: z[5] = sum6;
581: if (!usecprow) z += 6;
582: }
584: PetscCall(VecRestoreArrayRead(xx, &x));
585: PetscCall(VecRestoreArrayWrite(zz, &zarray));
586: PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
587: PetscFunctionReturn(PETSC_SUCCESS);
588: }
590: PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
591: {
592: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
593: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
594: const PetscScalar *x, *xb;
595: PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray;
596: const MatScalar *v;
597: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
598: PetscBool usecprow = a->compressedrow.use;
600: PetscFunctionBegin;
601: PetscCall(VecGetArrayRead(xx, &x));
602: PetscCall(VecGetArrayWrite(zz, &zarray));
604: idx = a->j;
605: v = a->a;
606: if (usecprow) {
607: mbs = a->compressedrow.nrows;
608: ii = a->compressedrow.i;
609: ridx = a->compressedrow.rindex;
610: PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
611: } else {
612: mbs = a->mbs;
613: ii = a->i;
614: z = zarray;
615: }
617: for (i = 0; i < mbs; i++) {
618: n = ii[1] - ii[0];
619: ii++;
620: sum1 = 0.0;
621: sum2 = 0.0;
622: sum3 = 0.0;
623: sum4 = 0.0;
624: sum5 = 0.0;
625: sum6 = 0.0;
626: sum7 = 0.0;
628: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
629: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
630: for (j = 0; j < n; j++) {
631: xb = x + 7 * (*idx++);
632: x1 = xb[0];
633: x2 = xb[1];
634: x3 = xb[2];
635: x4 = xb[3];
636: x5 = xb[4];
637: x6 = xb[5];
638: x7 = xb[6];
639: sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
640: sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
641: sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
642: sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
643: sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
644: sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
645: sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
646: v += 49;
647: }
648: if (usecprow) z = zarray + 7 * ridx[i];
649: z[0] = sum1;
650: z[1] = sum2;
651: z[2] = sum3;
652: z[3] = sum4;
653: z[4] = sum5;
654: z[5] = sum6;
655: z[6] = sum7;
656: if (!usecprow) z += 7;
657: }
659: PetscCall(VecRestoreArrayRead(xx, &x));
660: PetscCall(VecRestoreArrayWrite(zz, &zarray));
661: PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
662: PetscFunctionReturn(PETSC_SUCCESS);
663: }
665: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
666: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
667: {
668: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
669: PetscScalar *z = NULL, *work, *workt, *zarray;
670: const PetscScalar *x, *xb;
671: const MatScalar *v;
672: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
673: const PetscInt *idx, *ii, *ridx = NULL;
674: PetscInt k;
675: PetscBool usecprow = a->compressedrow.use;
677: __m256d a0, a1, a2, a3, a4, a5;
678: __m256d w0, w1, w2, w3;
679: __m256d z0, z1, z2;
680: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
682: PetscFunctionBegin;
683: PetscCall(VecGetArrayRead(xx, &x));
684: PetscCall(VecGetArrayWrite(zz, &zarray));
686: idx = a->j;
687: v = a->a;
688: if (usecprow) {
689: mbs = a->compressedrow.nrows;
690: ii = a->compressedrow.i;
691: ridx = a->compressedrow.rindex;
692: PetscCall(PetscArrayzero(zarray, bs * a->mbs));
693: } else {
694: mbs = a->mbs;
695: ii = a->i;
696: z = zarray;
697: }
699: if (!a->mult_work) {
700: k = PetscMax(A->rmap->n, A->cmap->n);
701: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
702: }
704: work = a->mult_work;
705: for (i = 0; i < mbs; i++) {
706: n = ii[1] - ii[0];
707: ii++;
708: workt = work;
709: for (j = 0; j < n; j++) {
710: xb = x + bs * (*idx++);
711: for (k = 0; k < bs; k++) workt[k] = xb[k];
712: workt += bs;
713: }
714: if (usecprow) z = zarray + bs * ridx[i];
716: z0 = _mm256_setzero_pd();
717: z1 = _mm256_setzero_pd();
718: z2 = _mm256_setzero_pd();
720: for (j = 0; j < n; j++) {
721: /* first column of a */
722: w0 = _mm256_set1_pd(work[j * 9]);
723: a0 = _mm256_loadu_pd(&v[j * 81]);
724: z0 = _mm256_fmadd_pd(a0, w0, z0);
725: a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
726: z1 = _mm256_fmadd_pd(a1, w0, z1);
727: a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
728: z2 = _mm256_fmadd_pd(a2, w0, z2);
730: /* second column of a */
731: w1 = _mm256_set1_pd(work[j * 9 + 1]);
732: a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
733: z0 = _mm256_fmadd_pd(a0, w1, z0);
734: a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
735: z1 = _mm256_fmadd_pd(a1, w1, z1);
736: a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
737: z2 = _mm256_fmadd_pd(a2, w1, z2);
739: /* third column of a */
740: w2 = _mm256_set1_pd(work[j * 9 + 2]);
741: a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
742: z0 = _mm256_fmadd_pd(a3, w2, z0);
743: a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
744: z1 = _mm256_fmadd_pd(a4, w2, z1);
745: a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
746: z2 = _mm256_fmadd_pd(a5, w2, z2);
748: /* fourth column of a */
749: w3 = _mm256_set1_pd(work[j * 9 + 3]);
750: a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
751: z0 = _mm256_fmadd_pd(a0, w3, z0);
752: a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
753: z1 = _mm256_fmadd_pd(a1, w3, z1);
754: a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
755: z2 = _mm256_fmadd_pd(a2, w3, z2);
757: /* fifth column of a */
758: w0 = _mm256_set1_pd(work[j * 9 + 4]);
759: a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
760: z0 = _mm256_fmadd_pd(a3, w0, z0);
761: a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
762: z1 = _mm256_fmadd_pd(a4, w0, z1);
763: a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
764: z2 = _mm256_fmadd_pd(a5, w0, z2);
766: /* sixth column of a */
767: w1 = _mm256_set1_pd(work[j * 9 + 5]);
768: a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
769: z0 = _mm256_fmadd_pd(a0, w1, z0);
770: a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
771: z1 = _mm256_fmadd_pd(a1, w1, z1);
772: a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
773: z2 = _mm256_fmadd_pd(a2, w1, z2);
775: /* seventh column of a */
776: w2 = _mm256_set1_pd(work[j * 9 + 6]);
777: a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
778: z0 = _mm256_fmadd_pd(a0, w2, z0);
779: a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
780: z1 = _mm256_fmadd_pd(a1, w2, z1);
781: a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
782: z2 = _mm256_fmadd_pd(a2, w2, z2);
784: /* eighth column of a */
785: w3 = _mm256_set1_pd(work[j * 9 + 7]);
786: a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
787: z0 = _mm256_fmadd_pd(a3, w3, z0);
788: a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
789: z1 = _mm256_fmadd_pd(a4, w3, z1);
790: a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
791: z2 = _mm256_fmadd_pd(a5, w3, z2);
793: /* ninth column of a */
794: w0 = _mm256_set1_pd(work[j * 9 + 8]);
795: a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
796: z0 = _mm256_fmadd_pd(a0, w0, z0);
797: a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
798: z1 = _mm256_fmadd_pd(a1, w0, z1);
799: a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
800: z2 = _mm256_fmadd_pd(a2, w0, z2);
801: }
803: _mm256_storeu_pd(&z[0], z0);
804: _mm256_storeu_pd(&z[4], z1);
805: _mm256_maskstore_pd(&z[8], mask1, z2);
807: v += n * bs2;
808: if (!usecprow) z += bs;
809: }
810: PetscCall(VecRestoreArrayRead(xx, &x));
811: PetscCall(VecRestoreArrayWrite(zz, &zarray));
812: PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
813: PetscFunctionReturn(PETSC_SUCCESS);
814: }
815: #endif
817: PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
818: {
819: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
820: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
821: const PetscScalar *x, *xb;
822: PetscScalar *zarray, xv;
823: const MatScalar *v;
824: const PetscInt *ii, *ij = a->j, *idx;
825: PetscInt mbs, i, j, k, n, *ridx = NULL;
826: PetscBool usecprow = a->compressedrow.use;
828: PetscFunctionBegin;
829: PetscCall(VecGetArrayRead(xx, &x));
830: PetscCall(VecGetArrayWrite(zz, &zarray));
832: v = a->a;
833: if (usecprow) {
834: mbs = a->compressedrow.nrows;
835: ii = a->compressedrow.i;
836: ridx = a->compressedrow.rindex;
837: PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
838: } else {
839: mbs = a->mbs;
840: ii = a->i;
841: z = zarray;
842: }
844: for (i = 0; i < mbs; i++) {
845: n = ii[i + 1] - ii[i];
846: idx = ij + ii[i];
847: sum1 = 0.0;
848: sum2 = 0.0;
849: sum3 = 0.0;
850: sum4 = 0.0;
851: sum5 = 0.0;
852: sum6 = 0.0;
853: sum7 = 0.0;
854: sum8 = 0.0;
855: sum9 = 0.0;
856: sum10 = 0.0;
857: sum11 = 0.0;
859: for (j = 0; j < n; j++) {
860: xb = x + 11 * (idx[j]);
862: for (k = 0; k < 11; k++) {
863: xv = xb[k];
864: sum1 += v[0] * xv;
865: sum2 += v[1] * xv;
866: sum3 += v[2] * xv;
867: sum4 += v[3] * xv;
868: sum5 += v[4] * xv;
869: sum6 += v[5] * xv;
870: sum7 += v[6] * xv;
871: sum8 += v[7] * xv;
872: sum9 += v[8] * xv;
873: sum10 += v[9] * xv;
874: sum11 += v[10] * xv;
875: v += 11;
876: }
877: }
878: if (usecprow) z = zarray + 11 * ridx[i];
879: z[0] = sum1;
880: z[1] = sum2;
881: z[2] = sum3;
882: z[3] = sum4;
883: z[4] = sum5;
884: z[5] = sum6;
885: z[6] = sum7;
886: z[7] = sum8;
887: z[8] = sum9;
888: z[9] = sum10;
889: z[10] = sum11;
891: if (!usecprow) z += 11;
892: }
894: PetscCall(VecRestoreArrayRead(xx, &x));
895: PetscCall(VecRestoreArrayWrite(zz, &zarray));
896: PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
897: PetscFunctionReturn(PETSC_SUCCESS);
898: }
900: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
901: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
902: {
903: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
904: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
905: const PetscScalar *x, *xb;
906: PetscScalar *zarray, xv;
907: const MatScalar *v;
908: const PetscInt *ii, *ij = a->j, *idx;
909: PetscInt mbs, i, j, k, n, *ridx = NULL;
910: PetscBool usecprow = a->compressedrow.use;
912: PetscFunctionBegin;
913: PetscCall(VecGetArrayRead(xx, &x));
914: PetscCall(VecGetArrayWrite(zz, &zarray));
916: v = a->a;
917: if (usecprow) {
918: mbs = a->compressedrow.nrows;
919: ii = a->compressedrow.i;
920: ridx = a->compressedrow.rindex;
921: PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
922: } else {
923: mbs = a->mbs;
924: ii = a->i;
925: z = zarray;
926: }
928: for (i = 0; i < mbs; i++) {
929: n = ii[i + 1] - ii[i];
930: idx = ij + ii[i];
931: sum1 = 0.0;
932: sum2 = 0.0;
933: sum3 = 0.0;
934: sum4 = 0.0;
935: sum5 = 0.0;
936: sum6 = 0.0;
937: sum7 = 0.0;
938: sum8 = 0.0;
939: sum9 = 0.0;
940: sum10 = 0.0;
941: sum11 = 0.0;
942: sum12 = 0.0;
944: for (j = 0; j < n; j++) {
945: xb = x + 12 * (idx[j]);
947: for (k = 0; k < 12; k++) {
948: xv = xb[k];
949: sum1 += v[0] * xv;
950: sum2 += v[1] * xv;
951: sum3 += v[2] * xv;
952: sum4 += v[3] * xv;
953: sum5 += v[4] * xv;
954: sum6 += v[5] * xv;
955: sum7 += v[6] * xv;
956: sum8 += v[7] * xv;
957: sum9 += v[8] * xv;
958: sum10 += v[9] * xv;
959: sum11 += v[10] * xv;
960: sum12 += v[11] * xv;
961: v += 12;
962: }
963: }
964: if (usecprow) z = zarray + 12 * ridx[i];
965: z[0] = sum1;
966: z[1] = sum2;
967: z[2] = sum3;
968: z[3] = sum4;
969: z[4] = sum5;
970: z[5] = sum6;
971: z[6] = sum7;
972: z[7] = sum8;
973: z[8] = sum9;
974: z[9] = sum10;
975: z[10] = sum11;
976: z[11] = sum12;
977: if (!usecprow) z += 12;
978: }
979: PetscCall(VecRestoreArrayRead(xx, &x));
980: PetscCall(VecRestoreArrayWrite(zz, &zarray));
981: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
982: PetscFunctionReturn(PETSC_SUCCESS);
983: }
985: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
986: {
987: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
988: PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
989: const PetscScalar *x, *xb;
990: PetscScalar *zarray, *yarray, xv;
991: const MatScalar *v;
992: const PetscInt *ii, *ij = a->j, *idx;
993: PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL;
994: PetscBool usecprow = a->compressedrow.use;
996: PetscFunctionBegin;
997: PetscCall(VecGetArrayRead(xx, &x));
998: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1000: v = a->a;
1001: if (usecprow) {
1002: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1003: mbs = a->compressedrow.nrows;
1004: ii = a->compressedrow.i;
1005: ridx = a->compressedrow.rindex;
1006: } else {
1007: ii = a->i;
1008: y = yarray;
1009: z = zarray;
1010: }
1012: for (i = 0; i < mbs; i++) {
1013: n = ii[i + 1] - ii[i];
1014: idx = ij + ii[i];
1016: if (usecprow) {
1017: y = yarray + 12 * ridx[i];
1018: z = zarray + 12 * ridx[i];
1019: }
1020: sum1 = y[0];
1021: sum2 = y[1];
1022: sum3 = y[2];
1023: sum4 = y[3];
1024: sum5 = y[4];
1025: sum6 = y[5];
1026: sum7 = y[6];
1027: sum8 = y[7];
1028: sum9 = y[8];
1029: sum10 = y[9];
1030: sum11 = y[10];
1031: sum12 = y[11];
1033: for (j = 0; j < n; j++) {
1034: xb = x + 12 * (idx[j]);
1036: for (k = 0; k < 12; k++) {
1037: xv = xb[k];
1038: sum1 += v[0] * xv;
1039: sum2 += v[1] * xv;
1040: sum3 += v[2] * xv;
1041: sum4 += v[3] * xv;
1042: sum5 += v[4] * xv;
1043: sum6 += v[5] * xv;
1044: sum7 += v[6] * xv;
1045: sum8 += v[7] * xv;
1046: sum9 += v[8] * xv;
1047: sum10 += v[9] * xv;
1048: sum11 += v[10] * xv;
1049: sum12 += v[11] * xv;
1050: v += 12;
1051: }
1052: }
1054: z[0] = sum1;
1055: z[1] = sum2;
1056: z[2] = sum3;
1057: z[3] = sum4;
1058: z[4] = sum5;
1059: z[5] = sum6;
1060: z[6] = sum7;
1061: z[7] = sum8;
1062: z[8] = sum9;
1063: z[9] = sum10;
1064: z[10] = sum11;
1065: z[11] = sum12;
1066: if (!usecprow) {
1067: y += 12;
1068: z += 12;
1069: }
1070: }
1071: PetscCall(VecRestoreArrayRead(xx, &x));
1072: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1073: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1074: PetscFunctionReturn(PETSC_SUCCESS);
1075: }
1077: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1078: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1079: {
1080: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1081: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1082: const PetscScalar *x, *xb;
1083: PetscScalar x1, x2, x3, x4, *zarray;
1084: const MatScalar *v;
1085: const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1086: PetscInt mbs, i, j, n;
1087: PetscBool usecprow = a->compressedrow.use;
1089: PetscFunctionBegin;
1090: PetscCall(VecGetArrayRead(xx, &x));
1091: PetscCall(VecGetArrayWrite(zz, &zarray));
1093: v = a->a;
1094: if (usecprow) {
1095: mbs = a->compressedrow.nrows;
1096: ii = a->compressedrow.i;
1097: ridx = a->compressedrow.rindex;
1098: PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1099: } else {
1100: mbs = a->mbs;
1101: ii = a->i;
1102: z = zarray;
1103: }
1105: for (i = 0; i < mbs; i++) {
1106: n = ii[i + 1] - ii[i];
1107: idx = ij + ii[i];
1109: sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1110: for (j = 0; j < n; j++) {
1111: xb = x + 12 * (idx[j]);
1112: x1 = xb[0];
1113: x2 = xb[1];
1114: x3 = xb[2];
1115: x4 = xb[3];
1117: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1118: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1119: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1120: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1121: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1122: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1123: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1124: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1125: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1126: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1127: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1128: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1129: v += 48;
1131: x1 = xb[4];
1132: x2 = xb[5];
1133: x3 = xb[6];
1134: x4 = xb[7];
1136: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1137: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1138: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1139: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1140: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1141: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1142: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1143: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1144: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1145: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1146: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1147: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1148: v += 48;
1150: x1 = xb[8];
1151: x2 = xb[9];
1152: x3 = xb[10];
1153: x4 = xb[11];
1154: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1155: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1156: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1157: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1158: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1159: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1160: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1161: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1162: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1163: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1164: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1165: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1166: v += 48;
1167: }
1168: if (usecprow) z = zarray + 12 * ridx[i];
1169: z[0] = sum1;
1170: z[1] = sum2;
1171: z[2] = sum3;
1172: z[3] = sum4;
1173: z[4] = sum5;
1174: z[5] = sum6;
1175: z[6] = sum7;
1176: z[7] = sum8;
1177: z[8] = sum9;
1178: z[9] = sum10;
1179: z[10] = sum11;
1180: z[11] = sum12;
1181: if (!usecprow) z += 12;
1182: }
1183: PetscCall(VecRestoreArrayRead(xx, &x));
1184: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1185: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1186: PetscFunctionReturn(PETSC_SUCCESS);
1187: }
1189: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1190: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1191: {
1192: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1193: PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1194: const PetscScalar *x, *xb;
1195: PetscScalar x1, x2, x3, x4, *zarray, *yarray;
1196: const MatScalar *v;
1197: const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1198: PetscInt mbs = a->mbs, i, j, n;
1199: PetscBool usecprow = a->compressedrow.use;
1201: PetscFunctionBegin;
1202: PetscCall(VecGetArrayRead(xx, &x));
1203: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1205: v = a->a;
1206: if (usecprow) {
1207: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1208: mbs = a->compressedrow.nrows;
1209: ii = a->compressedrow.i;
1210: ridx = a->compressedrow.rindex;
1211: } else {
1212: ii = a->i;
1213: y = yarray;
1214: z = zarray;
1215: }
1217: for (i = 0; i < mbs; i++) {
1218: n = ii[i + 1] - ii[i];
1219: idx = ij + ii[i];
1221: if (usecprow) {
1222: y = yarray + 12 * ridx[i];
1223: z = zarray + 12 * ridx[i];
1224: }
1225: sum1 = y[0];
1226: sum2 = y[1];
1227: sum3 = y[2];
1228: sum4 = y[3];
1229: sum5 = y[4];
1230: sum6 = y[5];
1231: sum7 = y[6];
1232: sum8 = y[7];
1233: sum9 = y[8];
1234: sum10 = y[9];
1235: sum11 = y[10];
1236: sum12 = y[11];
1238: for (j = 0; j < n; j++) {
1239: xb = x + 12 * (idx[j]);
1240: x1 = xb[0];
1241: x2 = xb[1];
1242: x3 = xb[2];
1243: x4 = xb[3];
1245: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1246: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1247: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1248: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1249: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1250: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1251: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1252: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1253: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1254: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1255: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1256: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1257: v += 48;
1259: x1 = xb[4];
1260: x2 = xb[5];
1261: x3 = xb[6];
1262: x4 = xb[7];
1264: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1265: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1266: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1267: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1268: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1269: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1270: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1271: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1272: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1273: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1274: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1275: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1276: v += 48;
1278: x1 = xb[8];
1279: x2 = xb[9];
1280: x3 = xb[10];
1281: x4 = xb[11];
1282: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1283: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1284: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1285: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1286: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1287: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1288: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1289: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1290: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1291: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1292: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1293: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1294: v += 48;
1295: }
1296: z[0] = sum1;
1297: z[1] = sum2;
1298: z[2] = sum3;
1299: z[3] = sum4;
1300: z[4] = sum5;
1301: z[5] = sum6;
1302: z[6] = sum7;
1303: z[7] = sum8;
1304: z[8] = sum9;
1305: z[9] = sum10;
1306: z[10] = sum11;
1307: z[11] = sum12;
1308: if (!usecprow) {
1309: y += 12;
1310: z += 12;
1311: }
1312: }
1313: PetscCall(VecRestoreArrayRead(xx, &x));
1314: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1315: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1316: PetscFunctionReturn(PETSC_SUCCESS);
1317: }
1319: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1320: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1321: {
1322: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1323: PetscScalar *z = NULL, *zarray;
1324: const PetscScalar *x, *work;
1325: const MatScalar *v = a->a;
1326: PetscInt mbs, i, j, n;
1327: const PetscInt *idx = a->j, *ii, *ridx = NULL;
1328: PetscBool usecprow = a->compressedrow.use;
1329: const PetscInt bs = 12, bs2 = 144;
1331: __m256d a0, a1, a2, a3, a4, a5;
1332: __m256d w0, w1, w2, w3;
1333: __m256d z0, z1, z2;
1335: PetscFunctionBegin;
1336: PetscCall(VecGetArrayRead(xx, &x));
1337: PetscCall(VecGetArrayWrite(zz, &zarray));
1339: if (usecprow) {
1340: mbs = a->compressedrow.nrows;
1341: ii = a->compressedrow.i;
1342: ridx = a->compressedrow.rindex;
1343: PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1344: } else {
1345: mbs = a->mbs;
1346: ii = a->i;
1347: z = zarray;
1348: }
1350: for (i = 0; i < mbs; i++) {
1351: z0 = _mm256_setzero_pd();
1352: z1 = _mm256_setzero_pd();
1353: z2 = _mm256_setzero_pd();
1355: n = ii[1] - ii[0];
1356: ii++;
1357: for (j = 0; j < n; j++) {
1358: work = x + bs * (*idx++);
1360: /* first column of a */
1361: w0 = _mm256_set1_pd(work[0]);
1362: a0 = _mm256_loadu_pd(v + 0);
1363: z0 = _mm256_fmadd_pd(a0, w0, z0);
1364: a1 = _mm256_loadu_pd(v + 4);
1365: z1 = _mm256_fmadd_pd(a1, w0, z1);
1366: a2 = _mm256_loadu_pd(v + 8);
1367: z2 = _mm256_fmadd_pd(a2, w0, z2);
1369: /* second column of a */
1370: w1 = _mm256_set1_pd(work[1]);
1371: a3 = _mm256_loadu_pd(v + 12);
1372: z0 = _mm256_fmadd_pd(a3, w1, z0);
1373: a4 = _mm256_loadu_pd(v + 16);
1374: z1 = _mm256_fmadd_pd(a4, w1, z1);
1375: a5 = _mm256_loadu_pd(v + 20);
1376: z2 = _mm256_fmadd_pd(a5, w1, z2);
1378: /* third column of a */
1379: w2 = _mm256_set1_pd(work[2]);
1380: a0 = _mm256_loadu_pd(v + 24);
1381: z0 = _mm256_fmadd_pd(a0, w2, z0);
1382: a1 = _mm256_loadu_pd(v + 28);
1383: z1 = _mm256_fmadd_pd(a1, w2, z1);
1384: a2 = _mm256_loadu_pd(v + 32);
1385: z2 = _mm256_fmadd_pd(a2, w2, z2);
1387: /* fourth column of a */
1388: w3 = _mm256_set1_pd(work[3]);
1389: a3 = _mm256_loadu_pd(v + 36);
1390: z0 = _mm256_fmadd_pd(a3, w3, z0);
1391: a4 = _mm256_loadu_pd(v + 40);
1392: z1 = _mm256_fmadd_pd(a4, w3, z1);
1393: a5 = _mm256_loadu_pd(v + 44);
1394: z2 = _mm256_fmadd_pd(a5, w3, z2);
1396: /* fifth column of a */
1397: w0 = _mm256_set1_pd(work[4]);
1398: a0 = _mm256_loadu_pd(v + 48);
1399: z0 = _mm256_fmadd_pd(a0, w0, z0);
1400: a1 = _mm256_loadu_pd(v + 52);
1401: z1 = _mm256_fmadd_pd(a1, w0, z1);
1402: a2 = _mm256_loadu_pd(v + 56);
1403: z2 = _mm256_fmadd_pd(a2, w0, z2);
1405: /* sixth column of a */
1406: w1 = _mm256_set1_pd(work[5]);
1407: a3 = _mm256_loadu_pd(v + 60);
1408: z0 = _mm256_fmadd_pd(a3, w1, z0);
1409: a4 = _mm256_loadu_pd(v + 64);
1410: z1 = _mm256_fmadd_pd(a4, w1, z1);
1411: a5 = _mm256_loadu_pd(v + 68);
1412: z2 = _mm256_fmadd_pd(a5, w1, z2);
1414: /* seventh column of a */
1415: w2 = _mm256_set1_pd(work[6]);
1416: a0 = _mm256_loadu_pd(v + 72);
1417: z0 = _mm256_fmadd_pd(a0, w2, z0);
1418: a1 = _mm256_loadu_pd(v + 76);
1419: z1 = _mm256_fmadd_pd(a1, w2, z1);
1420: a2 = _mm256_loadu_pd(v + 80);
1421: z2 = _mm256_fmadd_pd(a2, w2, z2);
1423: /* eighth column of a */
1424: w3 = _mm256_set1_pd(work[7]);
1425: a3 = _mm256_loadu_pd(v + 84);
1426: z0 = _mm256_fmadd_pd(a3, w3, z0);
1427: a4 = _mm256_loadu_pd(v + 88);
1428: z1 = _mm256_fmadd_pd(a4, w3, z1);
1429: a5 = _mm256_loadu_pd(v + 92);
1430: z2 = _mm256_fmadd_pd(a5, w3, z2);
1432: /* ninth column of a */
1433: w0 = _mm256_set1_pd(work[8]);
1434: a0 = _mm256_loadu_pd(v + 96);
1435: z0 = _mm256_fmadd_pd(a0, w0, z0);
1436: a1 = _mm256_loadu_pd(v + 100);
1437: z1 = _mm256_fmadd_pd(a1, w0, z1);
1438: a2 = _mm256_loadu_pd(v + 104);
1439: z2 = _mm256_fmadd_pd(a2, w0, z2);
1441: /* tenth column of a */
1442: w1 = _mm256_set1_pd(work[9]);
1443: a3 = _mm256_loadu_pd(v + 108);
1444: z0 = _mm256_fmadd_pd(a3, w1, z0);
1445: a4 = _mm256_loadu_pd(v + 112);
1446: z1 = _mm256_fmadd_pd(a4, w1, z1);
1447: a5 = _mm256_loadu_pd(v + 116);
1448: z2 = _mm256_fmadd_pd(a5, w1, z2);
1450: /* eleventh column of a */
1451: w2 = _mm256_set1_pd(work[10]);
1452: a0 = _mm256_loadu_pd(v + 120);
1453: z0 = _mm256_fmadd_pd(a0, w2, z0);
1454: a1 = _mm256_loadu_pd(v + 124);
1455: z1 = _mm256_fmadd_pd(a1, w2, z1);
1456: a2 = _mm256_loadu_pd(v + 128);
1457: z2 = _mm256_fmadd_pd(a2, w2, z2);
1459: /* twelveth column of a */
1460: w3 = _mm256_set1_pd(work[11]);
1461: a3 = _mm256_loadu_pd(v + 132);
1462: z0 = _mm256_fmadd_pd(a3, w3, z0);
1463: a4 = _mm256_loadu_pd(v + 136);
1464: z1 = _mm256_fmadd_pd(a4, w3, z1);
1465: a5 = _mm256_loadu_pd(v + 140);
1466: z2 = _mm256_fmadd_pd(a5, w3, z2);
1468: v += bs2;
1469: }
1470: if (usecprow) z = zarray + bs * ridx[i];
1471: _mm256_storeu_pd(&z[0], z0);
1472: _mm256_storeu_pd(&z[4], z1);
1473: _mm256_storeu_pd(&z[8], z2);
1474: if (!usecprow) z += bs;
1475: }
1476: PetscCall(VecRestoreArrayRead(xx, &x));
1477: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1478: PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1479: PetscFunctionReturn(PETSC_SUCCESS);
1480: }
1481: #endif
1483: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1484: /* Default MatMult for block size 15 */
1485: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1486: {
1487: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1488: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1489: const PetscScalar *x, *xb;
1490: PetscScalar *zarray, xv;
1491: const MatScalar *v;
1492: const PetscInt *ii, *ij = a->j, *idx;
1493: PetscInt mbs, i, j, k, n, *ridx = NULL;
1494: PetscBool usecprow = a->compressedrow.use;
1496: PetscFunctionBegin;
1497: PetscCall(VecGetArrayRead(xx, &x));
1498: PetscCall(VecGetArrayWrite(zz, &zarray));
1500: v = a->a;
1501: if (usecprow) {
1502: mbs = a->compressedrow.nrows;
1503: ii = a->compressedrow.i;
1504: ridx = a->compressedrow.rindex;
1505: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1506: } else {
1507: mbs = a->mbs;
1508: ii = a->i;
1509: z = zarray;
1510: }
1512: for (i = 0; i < mbs; i++) {
1513: n = ii[i + 1] - ii[i];
1514: idx = ij + ii[i];
1515: sum1 = 0.0;
1516: sum2 = 0.0;
1517: sum3 = 0.0;
1518: sum4 = 0.0;
1519: sum5 = 0.0;
1520: sum6 = 0.0;
1521: sum7 = 0.0;
1522: sum8 = 0.0;
1523: sum9 = 0.0;
1524: sum10 = 0.0;
1525: sum11 = 0.0;
1526: sum12 = 0.0;
1527: sum13 = 0.0;
1528: sum14 = 0.0;
1529: sum15 = 0.0;
1531: for (j = 0; j < n; j++) {
1532: xb = x + 15 * (idx[j]);
1534: for (k = 0; k < 15; k++) {
1535: xv = xb[k];
1536: sum1 += v[0] * xv;
1537: sum2 += v[1] * xv;
1538: sum3 += v[2] * xv;
1539: sum4 += v[3] * xv;
1540: sum5 += v[4] * xv;
1541: sum6 += v[5] * xv;
1542: sum7 += v[6] * xv;
1543: sum8 += v[7] * xv;
1544: sum9 += v[8] * xv;
1545: sum10 += v[9] * xv;
1546: sum11 += v[10] * xv;
1547: sum12 += v[11] * xv;
1548: sum13 += v[12] * xv;
1549: sum14 += v[13] * xv;
1550: sum15 += v[14] * xv;
1551: v += 15;
1552: }
1553: }
1554: if (usecprow) z = zarray + 15 * ridx[i];
1555: z[0] = sum1;
1556: z[1] = sum2;
1557: z[2] = sum3;
1558: z[3] = sum4;
1559: z[4] = sum5;
1560: z[5] = sum6;
1561: z[6] = sum7;
1562: z[7] = sum8;
1563: z[8] = sum9;
1564: z[9] = sum10;
1565: z[10] = sum11;
1566: z[11] = sum12;
1567: z[12] = sum13;
1568: z[13] = sum14;
1569: z[14] = sum15;
1571: if (!usecprow) z += 15;
1572: }
1574: PetscCall(VecRestoreArrayRead(xx, &x));
1575: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1576: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1577: PetscFunctionReturn(PETSC_SUCCESS);
1578: }
1580: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1581: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1582: {
1583: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1584: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1585: const PetscScalar *x, *xb;
1586: PetscScalar x1, x2, x3, x4, *zarray;
1587: const MatScalar *v;
1588: const PetscInt *ii, *ij = a->j, *idx;
1589: PetscInt mbs, i, j, n, *ridx = NULL;
1590: PetscBool usecprow = a->compressedrow.use;
1592: PetscFunctionBegin;
1593: PetscCall(VecGetArrayRead(xx, &x));
1594: PetscCall(VecGetArrayWrite(zz, &zarray));
1596: v = a->a;
1597: if (usecprow) {
1598: mbs = a->compressedrow.nrows;
1599: ii = a->compressedrow.i;
1600: ridx = a->compressedrow.rindex;
1601: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1602: } else {
1603: mbs = a->mbs;
1604: ii = a->i;
1605: z = zarray;
1606: }
1608: for (i = 0; i < mbs; i++) {
1609: n = ii[i + 1] - ii[i];
1610: idx = ij + ii[i];
1611: sum1 = 0.0;
1612: sum2 = 0.0;
1613: sum3 = 0.0;
1614: sum4 = 0.0;
1615: sum5 = 0.0;
1616: sum6 = 0.0;
1617: sum7 = 0.0;
1618: sum8 = 0.0;
1619: sum9 = 0.0;
1620: sum10 = 0.0;
1621: sum11 = 0.0;
1622: sum12 = 0.0;
1623: sum13 = 0.0;
1624: sum14 = 0.0;
1625: sum15 = 0.0;
1627: for (j = 0; j < n; j++) {
1628: xb = x + 15 * (idx[j]);
1629: x1 = xb[0];
1630: x2 = xb[1];
1631: x3 = xb[2];
1632: x4 = xb[3];
1634: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1635: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1636: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1637: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1638: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1639: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1640: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1641: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1642: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1643: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1644: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1645: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1646: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1647: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1648: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1650: v += 60;
1652: x1 = xb[4];
1653: x2 = xb[5];
1654: x3 = xb[6];
1655: x4 = xb[7];
1657: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1658: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1659: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1660: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1661: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1662: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1663: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1664: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1665: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1666: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1667: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1668: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1669: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1670: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1671: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1672: v += 60;
1674: x1 = xb[8];
1675: x2 = xb[9];
1676: x3 = xb[10];
1677: x4 = xb[11];
1678: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1679: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1680: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1681: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1682: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1683: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1684: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1685: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1686: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1687: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1688: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1689: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1690: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1691: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1692: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1693: v += 60;
1695: x1 = xb[12];
1696: x2 = xb[13];
1697: x3 = xb[14];
1698: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1699: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1700: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1701: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1702: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1703: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1704: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1705: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1706: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1707: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1708: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1709: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1710: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1711: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1712: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1713: v += 45;
1714: }
1715: if (usecprow) z = zarray + 15 * ridx[i];
1716: z[0] = sum1;
1717: z[1] = sum2;
1718: z[2] = sum3;
1719: z[3] = sum4;
1720: z[4] = sum5;
1721: z[5] = sum6;
1722: z[6] = sum7;
1723: z[7] = sum8;
1724: z[8] = sum9;
1725: z[9] = sum10;
1726: z[10] = sum11;
1727: z[11] = sum12;
1728: z[12] = sum13;
1729: z[13] = sum14;
1730: z[14] = sum15;
1732: if (!usecprow) z += 15;
1733: }
1735: PetscCall(VecRestoreArrayRead(xx, &x));
1736: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1737: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1738: PetscFunctionReturn(PETSC_SUCCESS);
1739: }
1741: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1742: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1743: {
1744: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1745: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1746: const PetscScalar *x, *xb;
1747: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1748: const MatScalar *v;
1749: const PetscInt *ii, *ij = a->j, *idx;
1750: PetscInt mbs, i, j, n, *ridx = NULL;
1751: PetscBool usecprow = a->compressedrow.use;
1753: PetscFunctionBegin;
1754: PetscCall(VecGetArrayRead(xx, &x));
1755: PetscCall(VecGetArrayWrite(zz, &zarray));
1757: v = a->a;
1758: if (usecprow) {
1759: mbs = a->compressedrow.nrows;
1760: ii = a->compressedrow.i;
1761: ridx = a->compressedrow.rindex;
1762: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1763: } else {
1764: mbs = a->mbs;
1765: ii = a->i;
1766: z = zarray;
1767: }
1769: for (i = 0; i < mbs; i++) {
1770: n = ii[i + 1] - ii[i];
1771: idx = ij + ii[i];
1772: sum1 = 0.0;
1773: sum2 = 0.0;
1774: sum3 = 0.0;
1775: sum4 = 0.0;
1776: sum5 = 0.0;
1777: sum6 = 0.0;
1778: sum7 = 0.0;
1779: sum8 = 0.0;
1780: sum9 = 0.0;
1781: sum10 = 0.0;
1782: sum11 = 0.0;
1783: sum12 = 0.0;
1784: sum13 = 0.0;
1785: sum14 = 0.0;
1786: sum15 = 0.0;
1788: for (j = 0; j < n; j++) {
1789: xb = x + 15 * (idx[j]);
1790: x1 = xb[0];
1791: x2 = xb[1];
1792: x3 = xb[2];
1793: x4 = xb[3];
1794: x5 = xb[4];
1795: x6 = xb[5];
1796: x7 = xb[6];
1797: x8 = xb[7];
1799: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1800: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1801: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1802: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1803: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1804: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1805: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1806: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1807: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1808: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1809: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1810: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1811: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1812: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1813: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1814: v += 120;
1816: x1 = xb[8];
1817: x2 = xb[9];
1818: x3 = xb[10];
1819: x4 = xb[11];
1820: x5 = xb[12];
1821: x6 = xb[13];
1822: x7 = xb[14];
1824: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1825: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1826: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1827: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1828: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1829: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1830: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1831: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1832: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1833: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1834: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1835: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1836: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1837: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1838: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1839: v += 105;
1840: }
1841: if (usecprow) z = zarray + 15 * ridx[i];
1842: z[0] = sum1;
1843: z[1] = sum2;
1844: z[2] = sum3;
1845: z[3] = sum4;
1846: z[4] = sum5;
1847: z[5] = sum6;
1848: z[6] = sum7;
1849: z[7] = sum8;
1850: z[8] = sum9;
1851: z[9] = sum10;
1852: z[10] = sum11;
1853: z[11] = sum12;
1854: z[12] = sum13;
1855: z[13] = sum14;
1856: z[14] = sum15;
1858: if (!usecprow) z += 15;
1859: }
1861: PetscCall(VecRestoreArrayRead(xx, &x));
1862: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1863: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1864: PetscFunctionReturn(PETSC_SUCCESS);
1865: }
1867: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1868: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1869: {
1870: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1871: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1872: const PetscScalar *x, *xb;
1873: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1874: const MatScalar *v;
1875: const PetscInt *ii, *ij = a->j, *idx;
1876: PetscInt mbs, i, j, n, *ridx = NULL;
1877: PetscBool usecprow = a->compressedrow.use;
1879: PetscFunctionBegin;
1880: PetscCall(VecGetArrayRead(xx, &x));
1881: PetscCall(VecGetArrayWrite(zz, &zarray));
1883: v = a->a;
1884: if (usecprow) {
1885: mbs = a->compressedrow.nrows;
1886: ii = a->compressedrow.i;
1887: ridx = a->compressedrow.rindex;
1888: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1889: } else {
1890: mbs = a->mbs;
1891: ii = a->i;
1892: z = zarray;
1893: }
1895: for (i = 0; i < mbs; i++) {
1896: n = ii[i + 1] - ii[i];
1897: idx = ij + ii[i];
1898: sum1 = 0.0;
1899: sum2 = 0.0;
1900: sum3 = 0.0;
1901: sum4 = 0.0;
1902: sum5 = 0.0;
1903: sum6 = 0.0;
1904: sum7 = 0.0;
1905: sum8 = 0.0;
1906: sum9 = 0.0;
1907: sum10 = 0.0;
1908: sum11 = 0.0;
1909: sum12 = 0.0;
1910: sum13 = 0.0;
1911: sum14 = 0.0;
1912: sum15 = 0.0;
1914: for (j = 0; j < n; j++) {
1915: xb = x + 15 * (idx[j]);
1916: x1 = xb[0];
1917: x2 = xb[1];
1918: x3 = xb[2];
1919: x4 = xb[3];
1920: x5 = xb[4];
1921: x6 = xb[5];
1922: x7 = xb[6];
1923: x8 = xb[7];
1924: x9 = xb[8];
1925: x10 = xb[9];
1926: x11 = xb[10];
1927: x12 = xb[11];
1928: x13 = xb[12];
1929: x14 = xb[13];
1930: x15 = xb[14];
1932: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1933: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1934: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1935: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1936: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1937: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1938: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1939: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1940: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1941: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1942: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1943: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1944: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1945: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1946: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1947: v += 225;
1948: }
1949: if (usecprow) z = zarray + 15 * ridx[i];
1950: z[0] = sum1;
1951: z[1] = sum2;
1952: z[2] = sum3;
1953: z[3] = sum4;
1954: z[4] = sum5;
1955: z[5] = sum6;
1956: z[6] = sum7;
1957: z[7] = sum8;
1958: z[8] = sum9;
1959: z[9] = sum10;
1960: z[10] = sum11;
1961: z[11] = sum12;
1962: z[12] = sum13;
1963: z[13] = sum14;
1964: z[14] = sum15;
1966: if (!usecprow) z += 15;
1967: }
1969: PetscCall(VecRestoreArrayRead(xx, &x));
1970: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1971: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1972: PetscFunctionReturn(PETSC_SUCCESS);
1973: }
1975: /*
1976: This will not work with MatScalar == float because it calls the BLAS
1977: */
1978: PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
1979: {
1980: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1981: PetscScalar *z = NULL, *work, *workt, *zarray;
1982: const PetscScalar *x, *xb;
1983: const MatScalar *v;
1984: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1985: const PetscInt *idx, *ii, *ridx = NULL;
1986: PetscInt ncols, k;
1987: PetscBool usecprow = a->compressedrow.use;
1989: PetscFunctionBegin;
1990: PetscCall(VecGetArrayRead(xx, &x));
1991: PetscCall(VecGetArrayWrite(zz, &zarray));
1993: idx = a->j;
1994: v = a->a;
1995: if (usecprow) {
1996: mbs = a->compressedrow.nrows;
1997: ii = a->compressedrow.i;
1998: ridx = a->compressedrow.rindex;
1999: PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2000: } else {
2001: mbs = a->mbs;
2002: ii = a->i;
2003: z = zarray;
2004: }
2006: if (!a->mult_work) {
2007: k = PetscMax(A->rmap->n, A->cmap->n);
2008: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2009: }
2010: work = a->mult_work;
2011: for (i = 0; i < mbs; i++) {
2012: n = ii[1] - ii[0];
2013: ii++;
2014: ncols = n * bs;
2015: workt = work;
2016: for (j = 0; j < n; j++) {
2017: xb = x + bs * (*idx++);
2018: for (k = 0; k < bs; k++) workt[k] = xb[k];
2019: workt += bs;
2020: }
2021: if (usecprow) z = zarray + bs * ridx[i];
2022: PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2023: v += n * bs2;
2024: if (!usecprow) z += bs;
2025: }
2026: PetscCall(VecRestoreArrayRead(xx, &x));
2027: PetscCall(VecRestoreArrayWrite(zz, &zarray));
2028: PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2029: PetscFunctionReturn(PETSC_SUCCESS);
2030: }
2032: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2033: {
2034: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2035: const PetscScalar *x;
2036: PetscScalar *y, *z, sum;
2037: const MatScalar *v;
2038: PetscInt mbs = a->mbs, i, n, *ridx = NULL;
2039: const PetscInt *idx, *ii;
2040: PetscBool usecprow = a->compressedrow.use;
2042: PetscFunctionBegin;
2043: PetscCall(VecGetArrayRead(xx, &x));
2044: PetscCall(VecGetArrayPair(yy, zz, &y, &z));
2046: idx = a->j;
2047: v = a->a;
2048: if (usecprow) {
2049: if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2050: mbs = a->compressedrow.nrows;
2051: ii = a->compressedrow.i;
2052: ridx = a->compressedrow.rindex;
2053: } else {
2054: ii = a->i;
2055: }
2057: for (i = 0; i < mbs; i++) {
2058: n = ii[1] - ii[0];
2059: ii++;
2060: if (!usecprow) {
2061: sum = y[i];
2062: } else {
2063: sum = y[ridx[i]];
2064: }
2065: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2066: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2067: PetscSparseDensePlusDot(sum, x, v, idx, n);
2068: v += n;
2069: idx += n;
2070: if (usecprow) {
2071: z[ridx[i]] = sum;
2072: } else {
2073: z[i] = sum;
2074: }
2075: }
2076: PetscCall(VecRestoreArrayRead(xx, &x));
2077: PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2078: PetscCall(PetscLogFlops(2.0 * a->nz));
2079: PetscFunctionReturn(PETSC_SUCCESS);
2080: }
2082: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2083: {
2084: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2085: PetscScalar *y = NULL, *z = NULL, sum1, sum2;
2086: const PetscScalar *x, *xb;
2087: PetscScalar x1, x2, *yarray, *zarray;
2088: const MatScalar *v;
2089: PetscInt mbs = a->mbs, i, n, j;
2090: const PetscInt *idx, *ii, *ridx = NULL;
2091: PetscBool usecprow = a->compressedrow.use;
2093: PetscFunctionBegin;
2094: PetscCall(VecGetArrayRead(xx, &x));
2095: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2097: idx = a->j;
2098: v = a->a;
2099: if (usecprow) {
2100: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2101: mbs = a->compressedrow.nrows;
2102: ii = a->compressedrow.i;
2103: ridx = a->compressedrow.rindex;
2104: } else {
2105: ii = a->i;
2106: y = yarray;
2107: z = zarray;
2108: }
2110: for (i = 0; i < mbs; i++) {
2111: n = ii[1] - ii[0];
2112: ii++;
2113: if (usecprow) {
2114: z = zarray + 2 * ridx[i];
2115: y = yarray + 2 * ridx[i];
2116: }
2117: sum1 = y[0];
2118: sum2 = y[1];
2119: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2120: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2121: for (j = 0; j < n; j++) {
2122: xb = x + 2 * (*idx++);
2123: x1 = xb[0];
2124: x2 = xb[1];
2126: sum1 += v[0] * x1 + v[2] * x2;
2127: sum2 += v[1] * x1 + v[3] * x2;
2128: v += 4;
2129: }
2130: z[0] = sum1;
2131: z[1] = sum2;
2132: if (!usecprow) {
2133: z += 2;
2134: y += 2;
2135: }
2136: }
2137: PetscCall(VecRestoreArrayRead(xx, &x));
2138: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2139: PetscCall(PetscLogFlops(4.0 * a->nz));
2140: PetscFunctionReturn(PETSC_SUCCESS);
2141: }
2143: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2144: {
2145: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2146: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2147: const PetscScalar *x, *xb;
2148: const MatScalar *v;
2149: PetscInt mbs = a->mbs, i, j, n;
2150: const PetscInt *idx, *ii, *ridx = NULL;
2151: PetscBool usecprow = a->compressedrow.use;
2153: PetscFunctionBegin;
2154: PetscCall(VecGetArrayRead(xx, &x));
2155: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2157: idx = a->j;
2158: v = a->a;
2159: if (usecprow) {
2160: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2161: mbs = a->compressedrow.nrows;
2162: ii = a->compressedrow.i;
2163: ridx = a->compressedrow.rindex;
2164: } else {
2165: ii = a->i;
2166: y = yarray;
2167: z = zarray;
2168: }
2170: for (i = 0; i < mbs; i++) {
2171: n = ii[1] - ii[0];
2172: ii++;
2173: if (usecprow) {
2174: z = zarray + 3 * ridx[i];
2175: y = yarray + 3 * ridx[i];
2176: }
2177: sum1 = y[0];
2178: sum2 = y[1];
2179: sum3 = y[2];
2180: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2181: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2182: for (j = 0; j < n; j++) {
2183: xb = x + 3 * (*idx++);
2184: x1 = xb[0];
2185: x2 = xb[1];
2186: x3 = xb[2];
2187: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2188: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2189: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2190: v += 9;
2191: }
2192: z[0] = sum1;
2193: z[1] = sum2;
2194: z[2] = sum3;
2195: if (!usecprow) {
2196: z += 3;
2197: y += 3;
2198: }
2199: }
2200: PetscCall(VecRestoreArrayRead(xx, &x));
2201: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2202: PetscCall(PetscLogFlops(18.0 * a->nz));
2203: PetscFunctionReturn(PETSC_SUCCESS);
2204: }
2206: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2207: {
2208: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2209: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2210: const PetscScalar *x, *xb;
2211: const MatScalar *v;
2212: PetscInt mbs = a->mbs, i, j, n;
2213: const PetscInt *idx, *ii, *ridx = NULL;
2214: PetscBool usecprow = a->compressedrow.use;
2216: PetscFunctionBegin;
2217: PetscCall(VecGetArrayRead(xx, &x));
2218: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2220: idx = a->j;
2221: v = a->a;
2222: if (usecprow) {
2223: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2224: mbs = a->compressedrow.nrows;
2225: ii = a->compressedrow.i;
2226: ridx = a->compressedrow.rindex;
2227: } else {
2228: ii = a->i;
2229: y = yarray;
2230: z = zarray;
2231: }
2233: for (i = 0; i < mbs; i++) {
2234: n = ii[1] - ii[0];
2235: ii++;
2236: if (usecprow) {
2237: z = zarray + 4 * ridx[i];
2238: y = yarray + 4 * ridx[i];
2239: }
2240: sum1 = y[0];
2241: sum2 = y[1];
2242: sum3 = y[2];
2243: sum4 = y[3];
2244: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2245: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2246: for (j = 0; j < n; j++) {
2247: xb = x + 4 * (*idx++);
2248: x1 = xb[0];
2249: x2 = xb[1];
2250: x3 = xb[2];
2251: x4 = xb[3];
2252: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2253: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2254: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2255: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2256: v += 16;
2257: }
2258: z[0] = sum1;
2259: z[1] = sum2;
2260: z[2] = sum3;
2261: z[3] = sum4;
2262: if (!usecprow) {
2263: z += 4;
2264: y += 4;
2265: }
2266: }
2267: PetscCall(VecRestoreArrayRead(xx, &x));
2268: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2269: PetscCall(PetscLogFlops(32.0 * a->nz));
2270: PetscFunctionReturn(PETSC_SUCCESS);
2271: }
2273: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2274: {
2275: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2276: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2277: const PetscScalar *x, *xb;
2278: PetscScalar *yarray, *zarray;
2279: const MatScalar *v;
2280: PetscInt mbs = a->mbs, i, j, n;
2281: const PetscInt *idx, *ii, *ridx = NULL;
2282: PetscBool usecprow = a->compressedrow.use;
2284: PetscFunctionBegin;
2285: PetscCall(VecGetArrayRead(xx, &x));
2286: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2288: idx = a->j;
2289: v = a->a;
2290: if (usecprow) {
2291: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2292: mbs = a->compressedrow.nrows;
2293: ii = a->compressedrow.i;
2294: ridx = a->compressedrow.rindex;
2295: } else {
2296: ii = a->i;
2297: y = yarray;
2298: z = zarray;
2299: }
2301: for (i = 0; i < mbs; i++) {
2302: n = ii[1] - ii[0];
2303: ii++;
2304: if (usecprow) {
2305: z = zarray + 5 * ridx[i];
2306: y = yarray + 5 * ridx[i];
2307: }
2308: sum1 = y[0];
2309: sum2 = y[1];
2310: sum3 = y[2];
2311: sum4 = y[3];
2312: sum5 = y[4];
2313: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2314: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2315: for (j = 0; j < n; j++) {
2316: xb = x + 5 * (*idx++);
2317: x1 = xb[0];
2318: x2 = xb[1];
2319: x3 = xb[2];
2320: x4 = xb[3];
2321: x5 = xb[4];
2322: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2323: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2324: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2325: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2326: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2327: v += 25;
2328: }
2329: z[0] = sum1;
2330: z[1] = sum2;
2331: z[2] = sum3;
2332: z[3] = sum4;
2333: z[4] = sum5;
2334: if (!usecprow) {
2335: z += 5;
2336: y += 5;
2337: }
2338: }
2339: PetscCall(VecRestoreArrayRead(xx, &x));
2340: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2341: PetscCall(PetscLogFlops(50.0 * a->nz));
2342: PetscFunctionReturn(PETSC_SUCCESS);
2343: }
2345: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2346: {
2347: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2348: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2349: const PetscScalar *x, *xb;
2350: PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2351: const MatScalar *v;
2352: PetscInt mbs = a->mbs, i, j, n;
2353: const PetscInt *idx, *ii, *ridx = NULL;
2354: PetscBool usecprow = a->compressedrow.use;
2356: PetscFunctionBegin;
2357: PetscCall(VecGetArrayRead(xx, &x));
2358: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2360: idx = a->j;
2361: v = a->a;
2362: if (usecprow) {
2363: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2364: mbs = a->compressedrow.nrows;
2365: ii = a->compressedrow.i;
2366: ridx = a->compressedrow.rindex;
2367: } else {
2368: ii = a->i;
2369: y = yarray;
2370: z = zarray;
2371: }
2373: for (i = 0; i < mbs; i++) {
2374: n = ii[1] - ii[0];
2375: ii++;
2376: if (usecprow) {
2377: z = zarray + 6 * ridx[i];
2378: y = yarray + 6 * ridx[i];
2379: }
2380: sum1 = y[0];
2381: sum2 = y[1];
2382: sum3 = y[2];
2383: sum4 = y[3];
2384: sum5 = y[4];
2385: sum6 = y[5];
2386: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2387: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2388: for (j = 0; j < n; j++) {
2389: xb = x + 6 * (*idx++);
2390: x1 = xb[0];
2391: x2 = xb[1];
2392: x3 = xb[2];
2393: x4 = xb[3];
2394: x5 = xb[4];
2395: x6 = xb[5];
2396: sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2397: sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2398: sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2399: sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2400: sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2401: sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2402: v += 36;
2403: }
2404: z[0] = sum1;
2405: z[1] = sum2;
2406: z[2] = sum3;
2407: z[3] = sum4;
2408: z[4] = sum5;
2409: z[5] = sum6;
2410: if (!usecprow) {
2411: z += 6;
2412: y += 6;
2413: }
2414: }
2415: PetscCall(VecRestoreArrayRead(xx, &x));
2416: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2417: PetscCall(PetscLogFlops(72.0 * a->nz));
2418: PetscFunctionReturn(PETSC_SUCCESS);
2419: }
2421: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2422: {
2423: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2424: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2425: const PetscScalar *x, *xb;
2426: PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2427: const MatScalar *v;
2428: PetscInt mbs = a->mbs, i, j, n;
2429: const PetscInt *idx, *ii, *ridx = NULL;
2430: PetscBool usecprow = a->compressedrow.use;
2432: PetscFunctionBegin;
2433: PetscCall(VecGetArrayRead(xx, &x));
2434: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2436: idx = a->j;
2437: v = a->a;
2438: if (usecprow) {
2439: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2440: mbs = a->compressedrow.nrows;
2441: ii = a->compressedrow.i;
2442: ridx = a->compressedrow.rindex;
2443: } else {
2444: ii = a->i;
2445: y = yarray;
2446: z = zarray;
2447: }
2449: for (i = 0; i < mbs; i++) {
2450: n = ii[1] - ii[0];
2451: ii++;
2452: if (usecprow) {
2453: z = zarray + 7 * ridx[i];
2454: y = yarray + 7 * ridx[i];
2455: }
2456: sum1 = y[0];
2457: sum2 = y[1];
2458: sum3 = y[2];
2459: sum4 = y[3];
2460: sum5 = y[4];
2461: sum6 = y[5];
2462: sum7 = y[6];
2463: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2464: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2465: for (j = 0; j < n; j++) {
2466: xb = x + 7 * (*idx++);
2467: x1 = xb[0];
2468: x2 = xb[1];
2469: x3 = xb[2];
2470: x4 = xb[3];
2471: x5 = xb[4];
2472: x6 = xb[5];
2473: x7 = xb[6];
2474: sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2475: sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2476: sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2477: sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2478: sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2479: sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2480: sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2481: v += 49;
2482: }
2483: z[0] = sum1;
2484: z[1] = sum2;
2485: z[2] = sum3;
2486: z[3] = sum4;
2487: z[4] = sum5;
2488: z[5] = sum6;
2489: z[6] = sum7;
2490: if (!usecprow) {
2491: z += 7;
2492: y += 7;
2493: }
2494: }
2495: PetscCall(VecRestoreArrayRead(xx, &x));
2496: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2497: PetscCall(PetscLogFlops(98.0 * a->nz));
2498: PetscFunctionReturn(PETSC_SUCCESS);
2499: }
2501: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2502: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2503: {
2504: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2505: PetscScalar *z = NULL, *work, *workt, *zarray;
2506: const PetscScalar *x, *xb;
2507: const MatScalar *v;
2508: PetscInt mbs, i, j, n;
2509: PetscInt k;
2510: PetscBool usecprow = a->compressedrow.use;
2511: const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2513: __m256d a0, a1, a2, a3, a4, a5;
2514: __m256d w0, w1, w2, w3;
2515: __m256d z0, z1, z2;
2516: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2518: PetscFunctionBegin;
2519: PetscCall(VecCopy(yy, zz));
2520: PetscCall(VecGetArrayRead(xx, &x));
2521: PetscCall(VecGetArray(zz, &zarray));
2523: idx = a->j;
2524: v = a->a;
2525: if (usecprow) {
2526: mbs = a->compressedrow.nrows;
2527: ii = a->compressedrow.i;
2528: ridx = a->compressedrow.rindex;
2529: } else {
2530: mbs = a->mbs;
2531: ii = a->i;
2532: z = zarray;
2533: }
2535: if (!a->mult_work) {
2536: k = PetscMax(A->rmap->n, A->cmap->n);
2537: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2538: }
2540: work = a->mult_work;
2541: for (i = 0; i < mbs; i++) {
2542: n = ii[1] - ii[0];
2543: ii++;
2544: workt = work;
2545: for (j = 0; j < n; j++) {
2546: xb = x + bs * (*idx++);
2547: for (k = 0; k < bs; k++) workt[k] = xb[k];
2548: workt += bs;
2549: }
2550: if (usecprow) z = zarray + bs * ridx[i];
2552: z0 = _mm256_loadu_pd(&z[0]);
2553: z1 = _mm256_loadu_pd(&z[4]);
2554: z2 = _mm256_set1_pd(z[8]);
2556: for (j = 0; j < n; j++) {
2557: /* first column of a */
2558: w0 = _mm256_set1_pd(work[j * 9]);
2559: a0 = _mm256_loadu_pd(&v[j * 81]);
2560: z0 = _mm256_fmadd_pd(a0, w0, z0);
2561: a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2562: z1 = _mm256_fmadd_pd(a1, w0, z1);
2563: a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2564: z2 = _mm256_fmadd_pd(a2, w0, z2);
2566: /* second column of a */
2567: w1 = _mm256_set1_pd(work[j * 9 + 1]);
2568: a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2569: z0 = _mm256_fmadd_pd(a0, w1, z0);
2570: a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2571: z1 = _mm256_fmadd_pd(a1, w1, z1);
2572: a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2573: z2 = _mm256_fmadd_pd(a2, w1, z2);
2575: /* third column of a */
2576: w2 = _mm256_set1_pd(work[j * 9 + 2]);
2577: a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2578: z0 = _mm256_fmadd_pd(a3, w2, z0);
2579: a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2580: z1 = _mm256_fmadd_pd(a4, w2, z1);
2581: a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2582: z2 = _mm256_fmadd_pd(a5, w2, z2);
2584: /* fourth column of a */
2585: w3 = _mm256_set1_pd(work[j * 9 + 3]);
2586: a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2587: z0 = _mm256_fmadd_pd(a0, w3, z0);
2588: a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2589: z1 = _mm256_fmadd_pd(a1, w3, z1);
2590: a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2591: z2 = _mm256_fmadd_pd(a2, w3, z2);
2593: /* fifth column of a */
2594: w0 = _mm256_set1_pd(work[j * 9 + 4]);
2595: a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2596: z0 = _mm256_fmadd_pd(a3, w0, z0);
2597: a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2598: z1 = _mm256_fmadd_pd(a4, w0, z1);
2599: a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2600: z2 = _mm256_fmadd_pd(a5, w0, z2);
2602: /* sixth column of a */
2603: w1 = _mm256_set1_pd(work[j * 9 + 5]);
2604: a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2605: z0 = _mm256_fmadd_pd(a0, w1, z0);
2606: a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2607: z1 = _mm256_fmadd_pd(a1, w1, z1);
2608: a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2609: z2 = _mm256_fmadd_pd(a2, w1, z2);
2611: /* seventh column of a */
2612: w2 = _mm256_set1_pd(work[j * 9 + 6]);
2613: a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2614: z0 = _mm256_fmadd_pd(a0, w2, z0);
2615: a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2616: z1 = _mm256_fmadd_pd(a1, w2, z1);
2617: a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2618: z2 = _mm256_fmadd_pd(a2, w2, z2);
2620: /* eighth column of a */
2621: w3 = _mm256_set1_pd(work[j * 9 + 7]);
2622: a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2623: z0 = _mm256_fmadd_pd(a3, w3, z0);
2624: a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2625: z1 = _mm256_fmadd_pd(a4, w3, z1);
2626: a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2627: z2 = _mm256_fmadd_pd(a5, w3, z2);
2629: /* ninth column of a */
2630: w0 = _mm256_set1_pd(work[j * 9 + 8]);
2631: a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2632: z0 = _mm256_fmadd_pd(a0, w0, z0);
2633: a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2634: z1 = _mm256_fmadd_pd(a1, w0, z1);
2635: a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2636: z2 = _mm256_fmadd_pd(a2, w0, z2);
2637: }
2639: _mm256_storeu_pd(&z[0], z0);
2640: _mm256_storeu_pd(&z[4], z1);
2641: _mm256_maskstore_pd(&z[8], mask1, z2);
2643: v += n * bs2;
2644: if (!usecprow) z += bs;
2645: }
2646: PetscCall(VecRestoreArrayRead(xx, &x));
2647: PetscCall(VecRestoreArray(zz, &zarray));
2648: PetscCall(PetscLogFlops(162.0 * a->nz));
2649: PetscFunctionReturn(PETSC_SUCCESS);
2650: }
2651: #endif
2653: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2654: {
2655: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2656: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2657: const PetscScalar *x, *xb;
2658: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2659: const MatScalar *v;
2660: PetscInt mbs = a->mbs, i, j, n;
2661: const PetscInt *idx, *ii, *ridx = NULL;
2662: PetscBool usecprow = a->compressedrow.use;
2664: PetscFunctionBegin;
2665: PetscCall(VecGetArrayRead(xx, &x));
2666: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2668: idx = a->j;
2669: v = a->a;
2670: if (usecprow) {
2671: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2672: mbs = a->compressedrow.nrows;
2673: ii = a->compressedrow.i;
2674: ridx = a->compressedrow.rindex;
2675: } else {
2676: ii = a->i;
2677: y = yarray;
2678: z = zarray;
2679: }
2681: for (i = 0; i < mbs; i++) {
2682: n = ii[1] - ii[0];
2683: ii++;
2684: if (usecprow) {
2685: z = zarray + 11 * ridx[i];
2686: y = yarray + 11 * ridx[i];
2687: }
2688: sum1 = y[0];
2689: sum2 = y[1];
2690: sum3 = y[2];
2691: sum4 = y[3];
2692: sum5 = y[4];
2693: sum6 = y[5];
2694: sum7 = y[6];
2695: sum8 = y[7];
2696: sum9 = y[8];
2697: sum10 = y[9];
2698: sum11 = y[10];
2699: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2700: PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2701: for (j = 0; j < n; j++) {
2702: xb = x + 11 * (*idx++);
2703: x1 = xb[0];
2704: x2 = xb[1];
2705: x3 = xb[2];
2706: x4 = xb[3];
2707: x5 = xb[4];
2708: x6 = xb[5];
2709: x7 = xb[6];
2710: x8 = xb[7];
2711: x9 = xb[8];
2712: x10 = xb[9];
2713: x11 = xb[10];
2714: sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2715: sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2716: sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2717: sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2718: sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2719: sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2720: sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2721: sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2722: sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2723: sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2724: sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2725: v += 121;
2726: }
2727: z[0] = sum1;
2728: z[1] = sum2;
2729: z[2] = sum3;
2730: z[3] = sum4;
2731: z[4] = sum5;
2732: z[5] = sum6;
2733: z[6] = sum7;
2734: z[7] = sum8;
2735: z[8] = sum9;
2736: z[9] = sum10;
2737: z[10] = sum11;
2738: if (!usecprow) {
2739: z += 11;
2740: y += 11;
2741: }
2742: }
2743: PetscCall(VecRestoreArrayRead(xx, &x));
2744: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2745: PetscCall(PetscLogFlops(242.0 * a->nz));
2746: PetscFunctionReturn(PETSC_SUCCESS);
2747: }
2749: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2750: {
2751: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2752: PetscScalar *z = NULL, *work, *workt, *zarray;
2753: const PetscScalar *x, *xb;
2754: const MatScalar *v;
2755: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2756: PetscInt ncols, k;
2757: const PetscInt *ridx = NULL, *idx, *ii;
2758: PetscBool usecprow = a->compressedrow.use;
2760: PetscFunctionBegin;
2761: PetscCall(VecCopy(yy, zz));
2762: PetscCall(VecGetArrayRead(xx, &x));
2763: PetscCall(VecGetArray(zz, &zarray));
2765: idx = a->j;
2766: v = a->a;
2767: if (usecprow) {
2768: mbs = a->compressedrow.nrows;
2769: ii = a->compressedrow.i;
2770: ridx = a->compressedrow.rindex;
2771: } else {
2772: mbs = a->mbs;
2773: ii = a->i;
2774: z = zarray;
2775: }
2777: if (!a->mult_work) {
2778: k = PetscMax(A->rmap->n, A->cmap->n);
2779: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2780: }
2781: work = a->mult_work;
2782: for (i = 0; i < mbs; i++) {
2783: n = ii[1] - ii[0];
2784: ii++;
2785: ncols = n * bs;
2786: workt = work;
2787: for (j = 0; j < n; j++) {
2788: xb = x + bs * (*idx++);
2789: for (k = 0; k < bs; k++) workt[k] = xb[k];
2790: workt += bs;
2791: }
2792: if (usecprow) z = zarray + bs * ridx[i];
2793: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2794: v += n * bs2;
2795: if (!usecprow) z += bs;
2796: }
2797: PetscCall(VecRestoreArrayRead(xx, &x));
2798: PetscCall(VecRestoreArray(zz, &zarray));
2799: PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2800: PetscFunctionReturn(PETSC_SUCCESS);
2801: }
2803: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2804: {
2805: PetscScalar zero = 0.0;
2807: PetscFunctionBegin;
2808: PetscCall(VecSet(zz, zero));
2809: PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2810: PetscFunctionReturn(PETSC_SUCCESS);
2811: }
2813: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2814: {
2815: PetscScalar zero = 0.0;
2817: PetscFunctionBegin;
2818: PetscCall(VecSet(zz, zero));
2819: PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2820: PetscFunctionReturn(PETSC_SUCCESS);
2821: }
2823: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2824: {
2825: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2826: PetscScalar *z, x1, x2, x3, x4, x5;
2827: const PetscScalar *x, *xb = NULL;
2828: const MatScalar *v;
2829: PetscInt mbs, i, rval, bs = A->rmap->bs, j, n;
2830: const PetscInt *idx, *ii, *ib, *ridx = NULL;
2831: Mat_CompressedRow cprow = a->compressedrow;
2832: PetscBool usecprow = cprow.use;
2834: PetscFunctionBegin;
2835: if (yy != zz) PetscCall(VecCopy(yy, zz));
2836: PetscCall(VecGetArrayRead(xx, &x));
2837: PetscCall(VecGetArray(zz, &z));
2839: idx = a->j;
2840: v = a->a;
2841: if (usecprow) {
2842: mbs = cprow.nrows;
2843: ii = cprow.i;
2844: ridx = cprow.rindex;
2845: } else {
2846: mbs = a->mbs;
2847: ii = a->i;
2848: xb = x;
2849: }
2851: switch (bs) {
2852: case 1:
2853: for (i = 0; i < mbs; i++) {
2854: if (usecprow) xb = x + ridx[i];
2855: x1 = xb[0];
2856: ib = idx + ii[0];
2857: n = ii[1] - ii[0];
2858: ii++;
2859: for (j = 0; j < n; j++) {
2860: rval = ib[j];
2861: z[rval] += PetscConj(*v) * x1;
2862: v++;
2863: }
2864: if (!usecprow) xb++;
2865: }
2866: break;
2867: case 2:
2868: for (i = 0; i < mbs; i++) {
2869: if (usecprow) xb = x + 2 * ridx[i];
2870: x1 = xb[0];
2871: x2 = xb[1];
2872: ib = idx + ii[0];
2873: n = ii[1] - ii[0];
2874: ii++;
2875: for (j = 0; j < n; j++) {
2876: rval = ib[j] * 2;
2877: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2878: z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2879: v += 4;
2880: }
2881: if (!usecprow) xb += 2;
2882: }
2883: break;
2884: case 3:
2885: for (i = 0; i < mbs; i++) {
2886: if (usecprow) xb = x + 3 * ridx[i];
2887: x1 = xb[0];
2888: x2 = xb[1];
2889: x3 = xb[2];
2890: ib = idx + ii[0];
2891: n = ii[1] - ii[0];
2892: ii++;
2893: for (j = 0; j < n; j++) {
2894: rval = ib[j] * 3;
2895: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2896: z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2897: z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2898: v += 9;
2899: }
2900: if (!usecprow) xb += 3;
2901: }
2902: break;
2903: case 4:
2904: for (i = 0; i < mbs; i++) {
2905: if (usecprow) xb = x + 4 * ridx[i];
2906: x1 = xb[0];
2907: x2 = xb[1];
2908: x3 = xb[2];
2909: x4 = xb[3];
2910: ib = idx + ii[0];
2911: n = ii[1] - ii[0];
2912: ii++;
2913: for (j = 0; j < n; j++) {
2914: rval = ib[j] * 4;
2915: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2916: z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2917: z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2918: z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2919: v += 16;
2920: }
2921: if (!usecprow) xb += 4;
2922: }
2923: break;
2924: case 5:
2925: for (i = 0; i < mbs; i++) {
2926: if (usecprow) xb = x + 5 * ridx[i];
2927: x1 = xb[0];
2928: x2 = xb[1];
2929: x3 = xb[2];
2930: x4 = xb[3];
2931: x5 = xb[4];
2932: ib = idx + ii[0];
2933: n = ii[1] - ii[0];
2934: ii++;
2935: for (j = 0; j < n; j++) {
2936: rval = ib[j] * 5;
2937: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2938: z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2939: z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2940: z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2941: z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2942: v += 25;
2943: }
2944: if (!usecprow) xb += 5;
2945: }
2946: break;
2947: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2948: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2949: #if 0
2950: {
2951: PetscInt ncols,k,bs2=a->bs2;
2952: PetscScalar *work,*workt,zb;
2953: const PetscScalar *xtmp;
2954: if (!a->mult_work) {
2955: k = PetscMax(A->rmap->n,A->cmap->n);
2956: PetscCall(PetscMalloc1(k+1,&a->mult_work));
2957: }
2958: work = a->mult_work;
2959: xtmp = x;
2960: for (i=0; i<mbs; i++) {
2961: n = ii[1] - ii[0]; ii++;
2962: ncols = n*bs;
2963: PetscCall(PetscArrayzero(work,ncols));
2964: if (usecprow) xtmp = x + bs*ridx[i];
2965: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2966: v += n*bs2;
2967: if (!usecprow) xtmp += bs;
2968: workt = work;
2969: for (j=0; j<n; j++) {
2970: zb = z + bs*(*idx++);
2971: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2972: workt += bs;
2973: }
2974: }
2975: }
2976: #endif
2977: }
2978: PetscCall(VecRestoreArrayRead(xx, &x));
2979: PetscCall(VecRestoreArray(zz, &z));
2980: PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2981: PetscFunctionReturn(PETSC_SUCCESS);
2982: }
2984: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2985: {
2986: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2987: PetscScalar *zb, *z, x1, x2, x3, x4, x5;
2988: const PetscScalar *x, *xb = NULL;
2989: const MatScalar *v;
2990: PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2991: const PetscInt *idx, *ii, *ib, *ridx = NULL;
2992: Mat_CompressedRow cprow = a->compressedrow;
2993: PetscBool usecprow = cprow.use;
2995: PetscFunctionBegin;
2996: if (yy != zz) PetscCall(VecCopy(yy, zz));
2997: PetscCall(VecGetArrayRead(xx, &x));
2998: PetscCall(VecGetArray(zz, &z));
3000: idx = a->j;
3001: v = a->a;
3002: if (usecprow) {
3003: mbs = cprow.nrows;
3004: ii = cprow.i;
3005: ridx = cprow.rindex;
3006: } else {
3007: mbs = a->mbs;
3008: ii = a->i;
3009: xb = x;
3010: }
3012: switch (bs) {
3013: case 1:
3014: for (i = 0; i < mbs; i++) {
3015: if (usecprow) xb = x + ridx[i];
3016: x1 = xb[0];
3017: ib = idx + ii[0];
3018: n = ii[1] - ii[0];
3019: ii++;
3020: for (j = 0; j < n; j++) {
3021: rval = ib[j];
3022: z[rval] += *v * x1;
3023: v++;
3024: }
3025: if (!usecprow) xb++;
3026: }
3027: break;
3028: case 2:
3029: for (i = 0; i < mbs; i++) {
3030: if (usecprow) xb = x + 2 * ridx[i];
3031: x1 = xb[0];
3032: x2 = xb[1];
3033: ib = idx + ii[0];
3034: n = ii[1] - ii[0];
3035: ii++;
3036: for (j = 0; j < n; j++) {
3037: rval = ib[j] * 2;
3038: z[rval++] += v[0] * x1 + v[1] * x2;
3039: z[rval++] += v[2] * x1 + v[3] * x2;
3040: v += 4;
3041: }
3042: if (!usecprow) xb += 2;
3043: }
3044: break;
3045: case 3:
3046: for (i = 0; i < mbs; i++) {
3047: if (usecprow) xb = x + 3 * ridx[i];
3048: x1 = xb[0];
3049: x2 = xb[1];
3050: x3 = xb[2];
3051: ib = idx + ii[0];
3052: n = ii[1] - ii[0];
3053: ii++;
3054: for (j = 0; j < n; j++) {
3055: rval = ib[j] * 3;
3056: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3057: z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3058: z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3059: v += 9;
3060: }
3061: if (!usecprow) xb += 3;
3062: }
3063: break;
3064: case 4:
3065: for (i = 0; i < mbs; i++) {
3066: if (usecprow) xb = x + 4 * ridx[i];
3067: x1 = xb[0];
3068: x2 = xb[1];
3069: x3 = xb[2];
3070: x4 = xb[3];
3071: ib = idx + ii[0];
3072: n = ii[1] - ii[0];
3073: ii++;
3074: for (j = 0; j < n; j++) {
3075: rval = ib[j] * 4;
3076: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3077: z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3078: z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3079: z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3080: v += 16;
3081: }
3082: if (!usecprow) xb += 4;
3083: }
3084: break;
3085: case 5:
3086: for (i = 0; i < mbs; i++) {
3087: if (usecprow) xb = x + 5 * ridx[i];
3088: x1 = xb[0];
3089: x2 = xb[1];
3090: x3 = xb[2];
3091: x4 = xb[3];
3092: x5 = xb[4];
3093: ib = idx + ii[0];
3094: n = ii[1] - ii[0];
3095: ii++;
3096: for (j = 0; j < n; j++) {
3097: rval = ib[j] * 5;
3098: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3099: z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3100: z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3101: z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3102: z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3103: v += 25;
3104: }
3105: if (!usecprow) xb += 5;
3106: }
3107: break;
3108: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3109: PetscInt ncols, k;
3110: PetscScalar *work, *workt;
3111: const PetscScalar *xtmp;
3112: if (!a->mult_work) {
3113: k = PetscMax(A->rmap->n, A->cmap->n);
3114: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3115: }
3116: work = a->mult_work;
3117: xtmp = x;
3118: for (i = 0; i < mbs; i++) {
3119: n = ii[1] - ii[0];
3120: ii++;
3121: ncols = n * bs;
3122: PetscCall(PetscArrayzero(work, ncols));
3123: if (usecprow) xtmp = x + bs * ridx[i];
3124: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3125: v += n * bs2;
3126: if (!usecprow) xtmp += bs;
3127: workt = work;
3128: for (j = 0; j < n; j++) {
3129: zb = z + bs * (*idx++);
3130: for (k = 0; k < bs; k++) zb[k] += workt[k];
3131: workt += bs;
3132: }
3133: }
3134: }
3135: }
3136: PetscCall(VecRestoreArrayRead(xx, &x));
3137: PetscCall(VecRestoreArray(zz, &z));
3138: PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3139: PetscFunctionReturn(PETSC_SUCCESS);
3140: }
3142: PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3143: {
3144: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3145: PetscInt totalnz = a->bs2 * a->nz;
3146: PetscScalar oalpha = alpha;
3147: PetscBLASInt one = 1, tnz;
3149: PetscFunctionBegin;
3150: PetscCall(PetscBLASIntCast(totalnz, &tnz));
3151: PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3152: PetscCall(PetscLogFlops(totalnz));
3153: PetscFunctionReturn(PETSC_SUCCESS);
3154: }
3156: PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3157: {
3158: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3159: MatScalar *v = a->a;
3160: PetscReal sum = 0.0;
3161: PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3163: PetscFunctionBegin;
3164: if (type == NORM_FROBENIUS) {
3165: #if defined(PETSC_USE_REAL___FP16)
3166: PetscBLASInt one = 1, cnt = bs2 * nz;
3167: PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3168: #else
3169: for (i = 0; i < bs2 * nz; i++) {
3170: sum += PetscRealPart(PetscConj(*v) * (*v));
3171: v++;
3172: }
3173: #endif
3174: *norm = PetscSqrtReal(sum);
3175: PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3176: } else if (type == NORM_1) { /* maximum column sum */
3177: PetscReal *tmp;
3178: PetscInt *bcol = a->j;
3179: PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3180: for (i = 0; i < nz; i++) {
3181: for (j = 0; j < bs; j++) {
3182: k1 = bs * (*bcol) + j; /* column index */
3183: for (k = 0; k < bs; k++) {
3184: tmp[k1] += PetscAbsScalar(*v);
3185: v++;
3186: }
3187: }
3188: bcol++;
3189: }
3190: *norm = 0.0;
3191: for (j = 0; j < A->cmap->n; j++) {
3192: if (tmp[j] > *norm) *norm = tmp[j];
3193: }
3194: PetscCall(PetscFree(tmp));
3195: PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3196: } else if (type == NORM_INFINITY) { /* maximum row sum */
3197: *norm = 0.0;
3198: for (k = 0; k < bs; k++) {
3199: for (j = 0; j < a->mbs; j++) {
3200: v = a->a + bs2 * a->i[j] + k;
3201: sum = 0.0;
3202: for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3203: for (k1 = 0; k1 < bs; k1++) {
3204: sum += PetscAbsScalar(*v);
3205: v += bs;
3206: }
3207: }
3208: if (sum > *norm) *norm = sum;
3209: }
3210: }
3211: PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3212: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3213: PetscFunctionReturn(PETSC_SUCCESS);
3214: }
3216: PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3217: {
3218: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
3220: PetscFunctionBegin;
3221: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
3222: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3223: *flg = PETSC_FALSE;
3224: PetscFunctionReturn(PETSC_SUCCESS);
3225: }
3227: /* if the a->i are the same */
3228: PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3229: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
3231: /* if a->j are the same */
3232: PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3233: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
3235: /* if a->a are the same */
3236: PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3237: PetscFunctionReturn(PETSC_SUCCESS);
3238: }
3240: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3241: {
3242: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3243: PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3244: PetscScalar *x, zero = 0.0;
3245: MatScalar *aa, *aa_j;
3247: PetscFunctionBegin;
3248: PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3249: bs = A->rmap->bs;
3250: aa = a->a;
3251: ai = a->i;
3252: aj = a->j;
3253: ambs = a->mbs;
3254: bs2 = a->bs2;
3256: PetscCall(VecSet(v, zero));
3257: PetscCall(VecGetArray(v, &x));
3258: PetscCall(VecGetLocalSize(v, &n));
3259: PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3260: for (i = 0; i < ambs; i++) {
3261: for (j = ai[i]; j < ai[i + 1]; j++) {
3262: if (aj[j] == i) {
3263: row = i * bs;
3264: aa_j = aa + j * bs2;
3265: for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3266: break;
3267: }
3268: }
3269: }
3270: PetscCall(VecRestoreArray(v, &x));
3271: PetscFunctionReturn(PETSC_SUCCESS);
3272: }
3274: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3275: {
3276: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3277: const PetscScalar *l, *r, *li, *ri;
3278: PetscScalar x;
3279: MatScalar *aa, *v;
3280: PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3281: const PetscInt *ai, *aj;
3283: PetscFunctionBegin;
3284: ai = a->i;
3285: aj = a->j;
3286: aa = a->a;
3287: m = A->rmap->n;
3288: n = A->cmap->n;
3289: bs = A->rmap->bs;
3290: mbs = a->mbs;
3291: bs2 = a->bs2;
3292: if (ll) {
3293: PetscCall(VecGetArrayRead(ll, &l));
3294: PetscCall(VecGetLocalSize(ll, &lm));
3295: PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3296: for (i = 0; i < mbs; i++) { /* for each block row */
3297: M = ai[i + 1] - ai[i];
3298: li = l + i * bs;
3299: v = aa + bs2 * ai[i];
3300: for (j = 0; j < M; j++) { /* for each block */
3301: for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3302: }
3303: }
3304: PetscCall(VecRestoreArrayRead(ll, &l));
3305: PetscCall(PetscLogFlops(a->nz));
3306: }
3308: if (rr) {
3309: PetscCall(VecGetArrayRead(rr, &r));
3310: PetscCall(VecGetLocalSize(rr, &rn));
3311: PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3312: for (i = 0; i < mbs; i++) { /* for each block row */
3313: iai = ai[i];
3314: M = ai[i + 1] - iai;
3315: v = aa + bs2 * iai;
3316: for (j = 0; j < M; j++) { /* for each block */
3317: ri = r + bs * aj[iai + j];
3318: for (k = 0; k < bs; k++) {
3319: x = ri[k];
3320: for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3321: v += bs;
3322: }
3323: }
3324: }
3325: PetscCall(VecRestoreArrayRead(rr, &r));
3326: PetscCall(PetscLogFlops(a->nz));
3327: }
3328: PetscFunctionReturn(PETSC_SUCCESS);
3329: }
3331: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3332: {
3333: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3335: PetscFunctionBegin;
3336: info->block_size = a->bs2;
3337: info->nz_allocated = a->bs2 * a->maxnz;
3338: info->nz_used = a->bs2 * a->nz;
3339: info->nz_unneeded = info->nz_allocated - info->nz_used;
3340: info->assemblies = A->num_ass;
3341: info->mallocs = A->info.mallocs;
3342: info->memory = 0; /* REVIEW ME */
3343: if (A->factortype) {
3344: info->fill_ratio_given = A->info.fill_ratio_given;
3345: info->fill_ratio_needed = A->info.fill_ratio_needed;
3346: info->factor_mallocs = A->info.factor_mallocs;
3347: } else {
3348: info->fill_ratio_given = 0;
3349: info->fill_ratio_needed = 0;
3350: info->factor_mallocs = 0;
3351: }
3352: PetscFunctionReturn(PETSC_SUCCESS);
3353: }
3355: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3356: {
3357: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3359: PetscFunctionBegin;
3360: PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3361: PetscFunctionReturn(PETSC_SUCCESS);
3362: }
3364: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3365: {
3366: PetscFunctionBegin;
3367: PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3368: C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3369: PetscFunctionReturn(PETSC_SUCCESS);
3370: }
3372: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3373: {
3374: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3375: PetscScalar *z = NULL, sum1;
3376: const PetscScalar *xb;
3377: PetscScalar x1;
3378: const MatScalar *v, *vv;
3379: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3380: PetscBool usecprow = a->compressedrow.use;
3382: PetscFunctionBegin;
3383: idx = a->j;
3384: v = a->a;
3385: if (usecprow) {
3386: mbs = a->compressedrow.nrows;
3387: ii = a->compressedrow.i;
3388: ridx = a->compressedrow.rindex;
3389: } else {
3390: mbs = a->mbs;
3391: ii = a->i;
3392: z = c;
3393: }
3395: for (i = 0; i < mbs; i++) {
3396: n = ii[1] - ii[0];
3397: ii++;
3398: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3399: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3400: if (usecprow) z = c + ridx[i];
3401: jj = idx;
3402: vv = v;
3403: for (k = 0; k < cn; k++) {
3404: idx = jj;
3405: v = vv;
3406: sum1 = 0.0;
3407: for (j = 0; j < n; j++) {
3408: xb = b + (*idx++);
3409: x1 = xb[0 + k * bm];
3410: sum1 += v[0] * x1;
3411: v += 1;
3412: }
3413: z[0 + k * cm] = sum1;
3414: }
3415: if (!usecprow) z += 1;
3416: }
3417: PetscFunctionReturn(PETSC_SUCCESS);
3418: }
3420: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3421: {
3422: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3423: PetscScalar *z = NULL, sum1, sum2;
3424: const PetscScalar *xb;
3425: PetscScalar x1, x2;
3426: const MatScalar *v, *vv;
3427: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3428: PetscBool usecprow = a->compressedrow.use;
3430: PetscFunctionBegin;
3431: idx = a->j;
3432: v = a->a;
3433: if (usecprow) {
3434: mbs = a->compressedrow.nrows;
3435: ii = a->compressedrow.i;
3436: ridx = a->compressedrow.rindex;
3437: } else {
3438: mbs = a->mbs;
3439: ii = a->i;
3440: z = c;
3441: }
3443: for (i = 0; i < mbs; i++) {
3444: n = ii[1] - ii[0];
3445: ii++;
3446: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3447: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3448: if (usecprow) z = c + 2 * ridx[i];
3449: jj = idx;
3450: vv = v;
3451: for (k = 0; k < cn; k++) {
3452: idx = jj;
3453: v = vv;
3454: sum1 = 0.0;
3455: sum2 = 0.0;
3456: for (j = 0; j < n; j++) {
3457: xb = b + 2 * (*idx++);
3458: x1 = xb[0 + k * bm];
3459: x2 = xb[1 + k * bm];
3460: sum1 += v[0] * x1 + v[2] * x2;
3461: sum2 += v[1] * x1 + v[3] * x2;
3462: v += 4;
3463: }
3464: z[0 + k * cm] = sum1;
3465: z[1 + k * cm] = sum2;
3466: }
3467: if (!usecprow) z += 2;
3468: }
3469: PetscFunctionReturn(PETSC_SUCCESS);
3470: }
3472: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3473: {
3474: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3475: PetscScalar *z = NULL, sum1, sum2, sum3;
3476: const PetscScalar *xb;
3477: PetscScalar x1, x2, x3;
3478: const MatScalar *v, *vv;
3479: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3480: PetscBool usecprow = a->compressedrow.use;
3482: PetscFunctionBegin;
3483: idx = a->j;
3484: v = a->a;
3485: if (usecprow) {
3486: mbs = a->compressedrow.nrows;
3487: ii = a->compressedrow.i;
3488: ridx = a->compressedrow.rindex;
3489: } else {
3490: mbs = a->mbs;
3491: ii = a->i;
3492: z = c;
3493: }
3495: for (i = 0; i < mbs; i++) {
3496: n = ii[1] - ii[0];
3497: ii++;
3498: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3499: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3500: if (usecprow) z = c + 3 * ridx[i];
3501: jj = idx;
3502: vv = v;
3503: for (k = 0; k < cn; k++) {
3504: idx = jj;
3505: v = vv;
3506: sum1 = 0.0;
3507: sum2 = 0.0;
3508: sum3 = 0.0;
3509: for (j = 0; j < n; j++) {
3510: xb = b + 3 * (*idx++);
3511: x1 = xb[0 + k * bm];
3512: x2 = xb[1 + k * bm];
3513: x3 = xb[2 + k * bm];
3514: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3515: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3516: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3517: v += 9;
3518: }
3519: z[0 + k * cm] = sum1;
3520: z[1 + k * cm] = sum2;
3521: z[2 + k * cm] = sum3;
3522: }
3523: if (!usecprow) z += 3;
3524: }
3525: PetscFunctionReturn(PETSC_SUCCESS);
3526: }
3528: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3529: {
3530: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3531: PetscScalar *z = NULL, sum1, sum2, sum3, sum4;
3532: const PetscScalar *xb;
3533: PetscScalar x1, x2, x3, x4;
3534: const MatScalar *v, *vv;
3535: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3536: PetscBool usecprow = a->compressedrow.use;
3538: PetscFunctionBegin;
3539: idx = a->j;
3540: v = a->a;
3541: if (usecprow) {
3542: mbs = a->compressedrow.nrows;
3543: ii = a->compressedrow.i;
3544: ridx = a->compressedrow.rindex;
3545: } else {
3546: mbs = a->mbs;
3547: ii = a->i;
3548: z = c;
3549: }
3551: for (i = 0; i < mbs; i++) {
3552: n = ii[1] - ii[0];
3553: ii++;
3554: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3555: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3556: if (usecprow) z = c + 4 * ridx[i];
3557: jj = idx;
3558: vv = v;
3559: for (k = 0; k < cn; k++) {
3560: idx = jj;
3561: v = vv;
3562: sum1 = 0.0;
3563: sum2 = 0.0;
3564: sum3 = 0.0;
3565: sum4 = 0.0;
3566: for (j = 0; j < n; j++) {
3567: xb = b + 4 * (*idx++);
3568: x1 = xb[0 + k * bm];
3569: x2 = xb[1 + k * bm];
3570: x3 = xb[2 + k * bm];
3571: x4 = xb[3 + k * bm];
3572: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3573: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3574: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3575: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3576: v += 16;
3577: }
3578: z[0 + k * cm] = sum1;
3579: z[1 + k * cm] = sum2;
3580: z[2 + k * cm] = sum3;
3581: z[3 + k * cm] = sum4;
3582: }
3583: if (!usecprow) z += 4;
3584: }
3585: PetscFunctionReturn(PETSC_SUCCESS);
3586: }
3588: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3589: {
3590: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3591: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5;
3592: const PetscScalar *xb;
3593: PetscScalar x1, x2, x3, x4, x5;
3594: const MatScalar *v, *vv;
3595: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3596: PetscBool usecprow = a->compressedrow.use;
3598: PetscFunctionBegin;
3599: idx = a->j;
3600: v = a->a;
3601: if (usecprow) {
3602: mbs = a->compressedrow.nrows;
3603: ii = a->compressedrow.i;
3604: ridx = a->compressedrow.rindex;
3605: } else {
3606: mbs = a->mbs;
3607: ii = a->i;
3608: z = c;
3609: }
3611: for (i = 0; i < mbs; i++) {
3612: n = ii[1] - ii[0];
3613: ii++;
3614: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3615: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3616: if (usecprow) z = c + 5 * ridx[i];
3617: jj = idx;
3618: vv = v;
3619: for (k = 0; k < cn; k++) {
3620: idx = jj;
3621: v = vv;
3622: sum1 = 0.0;
3623: sum2 = 0.0;
3624: sum3 = 0.0;
3625: sum4 = 0.0;
3626: sum5 = 0.0;
3627: for (j = 0; j < n; j++) {
3628: xb = b + 5 * (*idx++);
3629: x1 = xb[0 + k * bm];
3630: x2 = xb[1 + k * bm];
3631: x3 = xb[2 + k * bm];
3632: x4 = xb[3 + k * bm];
3633: x5 = xb[4 + k * bm];
3634: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3635: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3636: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3637: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3638: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3639: v += 25;
3640: }
3641: z[0 + k * cm] = sum1;
3642: z[1 + k * cm] = sum2;
3643: z[2 + k * cm] = sum3;
3644: z[3 + k * cm] = sum4;
3645: z[4 + k * cm] = sum5;
3646: }
3647: if (!usecprow) z += 5;
3648: }
3649: PetscFunctionReturn(PETSC_SUCCESS);
3650: }
3652: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3653: {
3654: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3655: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
3656: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
3657: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3658: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3659: PetscBLASInt bbs, bcn, bbm, bcm;
3660: PetscScalar *z = NULL;
3661: PetscScalar *c, *b;
3662: const MatScalar *v;
3663: const PetscInt *idx, *ii, *ridx = NULL;
3664: PetscScalar _DZero = 0.0, _DOne = 1.0;
3665: PetscBool usecprow = a->compressedrow.use;
3667: PetscFunctionBegin;
3668: if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3669: PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3670: PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3671: PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3672: b = bd->v;
3673: if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3674: PetscCall(MatDenseGetArray(C, &c));
3675: switch (bs) {
3676: case 1:
3677: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3678: break;
3679: case 2:
3680: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3681: break;
3682: case 3:
3683: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3684: break;
3685: case 4:
3686: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3687: break;
3688: case 5:
3689: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3690: break;
3691: default: /* block sizes larger than 5 by 5 are handled by BLAS */
3692: PetscCall(PetscBLASIntCast(bs, &bbs));
3693: PetscCall(PetscBLASIntCast(cn, &bcn));
3694: PetscCall(PetscBLASIntCast(bm, &bbm));
3695: PetscCall(PetscBLASIntCast(cm, &bcm));
3696: idx = a->j;
3697: v = a->a;
3698: if (usecprow) {
3699: mbs = a->compressedrow.nrows;
3700: ii = a->compressedrow.i;
3701: ridx = a->compressedrow.rindex;
3702: } else {
3703: mbs = a->mbs;
3704: ii = a->i;
3705: z = c;
3706: }
3707: for (i = 0; i < mbs; i++) {
3708: n = ii[1] - ii[0];
3709: ii++;
3710: if (usecprow) z = c + bs * ridx[i];
3711: if (n) {
3712: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3713: v += bs2;
3714: }
3715: for (j = 1; j < n; j++) {
3716: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3717: v += bs2;
3718: }
3719: if (!usecprow) z += bs;
3720: }
3721: }
3722: PetscCall(MatDenseRestoreArray(C, &c));
3723: PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3724: PetscFunctionReturn(PETSC_SUCCESS);
3725: }