Actual source code: inode.c
2: /*
3: This file provides high performance routines for the Inode format (compressed sparse row)
4: by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
8: static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
9: {
10: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
11: PetscInt i, count, m, n, min_mn, *ns_row, *ns_col;
13: PetscFunctionBegin;
14: n = A->cmap->n;
15: m = A->rmap->n;
16: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
17: ns_row = a->inode.size;
19: min_mn = (m < n) ? m : n;
20: if (!ns) {
21: for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
22: ;
23: for (; count + 1 < n; count++, i++)
24: ;
25: if (count < n) i++;
26: *size = i;
27: PetscFunctionReturn(PETSC_SUCCESS);
28: }
29: PetscCall(PetscMalloc1(n + 1, &ns_col));
31: /* Use the same row structure wherever feasible. */
32: for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
34: /* if m < n; pad up the remainder with inode_limit */
35: for (; count + 1 < n; count++, i++) ns_col[i] = 1;
36: /* The last node is the odd ball. pad it up with the remaining rows; */
37: if (count < n) {
38: ns_col[i] = n - count;
39: i++;
40: } else if (count > n) {
41: /* Adjust for the over estimation */
42: ns_col[i - 1] += n - count;
43: }
44: *size = i;
45: *ns = ns_col;
46: PetscFunctionReturn(PETSC_SUCCESS);
47: }
49: /*
50: This builds symmetric version of nonzero structure,
51: */
52: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
53: {
54: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
55: PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
56: PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
57: const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
59: PetscFunctionBegin;
60: nslim_row = a->inode.node_count;
61: m = A->rmap->n;
62: n = A->cmap->n;
63: PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
64: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
66: /* Use the row_inode as column_inode */
67: nslim_col = nslim_row;
68: ns_col = ns_row;
70: /* allocate space for reformatted inode structure */
71: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
72: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
74: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
75: nsz = ns_col[i1];
76: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
77: }
78: /* allocate space for row pointers */
79: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
80: *iia = ia;
81: PetscCall(PetscMalloc1(nslim_row + 1, &work));
83: /* determine the number of columns in each row */
84: ia[0] = oshift;
85: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
86: j = aj + ai[row] + ishift;
87: jmax = aj + ai[row + 1] + ishift;
88: if (j == jmax) continue; /* empty row */
89: col = *j++ + ishift;
90: i2 = tvc[col];
91: while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
92: ia[i1 + 1]++;
93: ia[i2 + 1]++;
94: i2++; /* Start col of next node */
95: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
96: i2 = tvc[col];
97: }
98: if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
99: }
101: /* shift ia[i] to point to next row */
102: for (i1 = 1; i1 < nslim_row + 1; i1++) {
103: row = ia[i1 - 1];
104: ia[i1] += row;
105: work[i1 - 1] = row - oshift;
106: }
108: /* allocate space for column pointers */
109: nz = ia[nslim_row] + (!ishift);
110: PetscCall(PetscMalloc1(nz, &ja));
111: *jja = ja;
113: /* loop over lower triangular part putting into ja */
114: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
115: j = aj + ai[row] + ishift;
116: jmax = aj + ai[row + 1] + ishift;
117: if (j == jmax) continue; /* empty row */
118: col = *j++ + ishift;
119: i2 = tvc[col];
120: while (i2 < i1 && j < jmax) {
121: ja[work[i2]++] = i1 + oshift;
122: ja[work[i1]++] = i2 + oshift;
123: ++i2;
124: while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
125: i2 = tvc[col];
126: }
127: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
128: }
129: PetscCall(PetscFree(work));
130: PetscCall(PetscFree2(tns, tvc));
131: PetscFunctionReturn(PETSC_SUCCESS);
132: }
134: /*
135: This builds nonsymmetric version of nonzero structure,
136: */
137: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
138: {
139: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
140: PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
141: PetscInt *tns, *tvc, nsz, i1, i2;
142: const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
144: PetscFunctionBegin;
145: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
146: nslim_row = a->inode.node_count;
147: n = A->cmap->n;
149: /* Create The column_inode for this matrix */
150: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
152: /* allocate space for reformatted column_inode structure */
153: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
154: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
156: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
157: nsz = ns_col[i1];
158: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
159: }
160: /* allocate space for row pointers */
161: PetscCall(PetscCalloc1(nslim_row + 1, &ia));
162: *iia = ia;
163: PetscCall(PetscMalloc1(nslim_row + 1, &work));
165: /* determine the number of columns in each row */
166: ia[0] = oshift;
167: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
168: j = aj + ai[row] + ishift;
169: nz = ai[row + 1] - ai[row];
170: if (!nz) continue; /* empty row */
171: col = *j++ + ishift;
172: i2 = tvc[col];
173: while (nz-- > 0) { /* off-diagonal elements */
174: ia[i1 + 1]++;
175: i2++; /* Start col of next node */
176: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
177: if (nz > 0) i2 = tvc[col];
178: }
179: }
181: /* shift ia[i] to point to next row */
182: for (i1 = 1; i1 < nslim_row + 1; i1++) {
183: row = ia[i1 - 1];
184: ia[i1] += row;
185: work[i1 - 1] = row - oshift;
186: }
188: /* allocate space for column pointers */
189: nz = ia[nslim_row] + (!ishift);
190: PetscCall(PetscMalloc1(nz, &ja));
191: *jja = ja;
193: /* loop over matrix putting into ja */
194: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
195: j = aj + ai[row] + ishift;
196: nz = ai[row + 1] - ai[row];
197: if (!nz) continue; /* empty row */
198: col = *j++ + ishift;
199: i2 = tvc[col];
200: while (nz-- > 0) {
201: ja[work[i1]++] = i2 + oshift;
202: ++i2;
203: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
204: if (nz > 0) i2 = tvc[col];
205: }
206: }
207: PetscCall(PetscFree(ns_col));
208: PetscCall(PetscFree(work));
209: PetscCall(PetscFree2(tns, tvc));
210: PetscFunctionReturn(PETSC_SUCCESS);
211: }
213: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
214: {
215: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
217: PetscFunctionBegin;
218: if (n) *n = a->inode.node_count;
219: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
220: if (!blockcompressed) {
221: PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
222: } else if (symmetric) {
223: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
224: } else {
225: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
226: }
227: PetscFunctionReturn(PETSC_SUCCESS);
228: }
230: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
231: {
232: PetscFunctionBegin;
233: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
235: if (!blockcompressed) {
236: PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
237: } else {
238: PetscCall(PetscFree(*ia));
239: PetscCall(PetscFree(*ja));
240: }
241: PetscFunctionReturn(PETSC_SUCCESS);
242: }
244: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
245: {
246: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
247: PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
248: PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
250: PetscFunctionBegin;
251: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
252: nslim_row = a->inode.node_count;
253: n = A->cmap->n;
255: /* Create The column_inode for this matrix */
256: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
258: /* allocate space for reformatted column_inode structure */
259: PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
260: for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
262: for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
263: nsz = ns_col[i1];
264: for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
265: }
266: /* allocate space for column pointers */
267: PetscCall(PetscCalloc1(nslim_col + 1, &ia));
268: *iia = ia;
269: PetscCall(PetscMalloc1(nslim_col + 1, &work));
271: /* determine the number of columns in each row */
272: ia[0] = oshift;
273: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
274: j = aj + ai[row] + ishift;
275: col = *j++ + ishift;
276: i2 = tvc[col];
277: nz = ai[row + 1] - ai[row];
278: while (nz-- > 0) { /* off-diagonal elements */
279: /* ia[i1+1]++; */
280: ia[i2 + 1]++;
281: i2++;
282: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
283: if (nz > 0) i2 = tvc[col];
284: }
285: }
287: /* shift ia[i] to point to next col */
288: for (i1 = 1; i1 < nslim_col + 1; i1++) {
289: col = ia[i1 - 1];
290: ia[i1] += col;
291: work[i1 - 1] = col - oshift;
292: }
294: /* allocate space for column pointers */
295: nz = ia[nslim_col] + (!ishift);
296: PetscCall(PetscMalloc1(nz, &ja));
297: *jja = ja;
299: /* loop over matrix putting into ja */
300: for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
301: j = aj + ai[row] + ishift;
302: col = *j++ + ishift;
303: i2 = tvc[col];
304: nz = ai[row + 1] - ai[row];
305: while (nz-- > 0) {
306: /* ja[work[i1]++] = i2 + oshift; */
307: ja[work[i2]++] = i1 + oshift;
308: i2++;
309: while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
310: if (nz > 0) i2 = tvc[col];
311: }
312: }
313: PetscCall(PetscFree(ns_col));
314: PetscCall(PetscFree(work));
315: PetscCall(PetscFree2(tns, tvc));
316: PetscFunctionReturn(PETSC_SUCCESS);
317: }
319: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
320: {
321: PetscFunctionBegin;
322: PetscCall(MatCreateColInode_Private(A, n, NULL));
323: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
325: if (!blockcompressed) {
326: PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
327: } else if (symmetric) {
328: /* Since the indices are symmetric it doesn't matter */
329: PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
330: } else {
331: PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
332: }
333: PetscFunctionReturn(PETSC_SUCCESS);
334: }
336: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
337: {
338: PetscFunctionBegin;
339: if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
340: if (!blockcompressed) {
341: PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
342: } else {
343: PetscCall(PetscFree(*ia));
344: PetscCall(PetscFree(*ja));
345: }
346: PetscFunctionReturn(PETSC_SUCCESS);
347: }
349: PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
350: {
351: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
352: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
353: PetscScalar *y;
354: const PetscScalar *x;
355: const MatScalar *v1, *v2, *v3, *v4, *v5;
356: PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
357: const PetscInt *idx, *ns, *ii;
359: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
360: #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
361: #endif
363: PetscFunctionBegin;
364: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
365: node_max = a->inode.node_count;
366: ns = a->inode.size; /* Node Size array */
367: PetscCall(VecGetArrayRead(xx, &x));
368: PetscCall(VecGetArray(yy, &y));
369: idx = a->j;
370: v1 = a->a;
371: ii = a->i;
373: for (i = 0, row = 0; i < node_max; ++i) {
374: nsz = ns[i];
375: n = ii[1] - ii[0];
376: nonzerorow += (n > 0) * nsz;
377: ii += nsz;
378: PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
379: PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
380: sz = n; /* No of non zeros in this row */
381: /* Switch on the size of Node */
382: switch (nsz) { /* Each loop in 'case' is unrolled */
383: case 1:
384: sum1 = 0.;
386: for (n = 0; n < sz - 1; n += 2) {
387: i1 = idx[0]; /* The instructions are ordered to */
388: i2 = idx[1]; /* make the compiler's job easy */
389: idx += 2;
390: tmp0 = x[i1];
391: tmp1 = x[i2];
392: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
393: v1 += 2;
394: }
396: if (n == sz - 1) { /* Take care of the last nonzero */
397: tmp0 = x[*idx++];
398: sum1 += *v1++ * tmp0;
399: }
400: y[row++] = sum1;
401: break;
402: case 2:
403: sum1 = 0.;
404: sum2 = 0.;
405: v2 = v1 + n;
407: for (n = 0; n < sz - 1; n += 2) {
408: i1 = idx[0];
409: i2 = idx[1];
410: idx += 2;
411: tmp0 = x[i1];
412: tmp1 = x[i2];
413: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
414: v1 += 2;
415: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
416: v2 += 2;
417: }
418: if (n == sz - 1) {
419: tmp0 = x[*idx++];
420: sum1 += *v1++ * tmp0;
421: sum2 += *v2++ * tmp0;
422: }
423: y[row++] = sum1;
424: y[row++] = sum2;
425: v1 = v2; /* Since the next block to be processed starts there*/
426: idx += sz;
427: break;
428: case 3:
429: sum1 = 0.;
430: sum2 = 0.;
431: sum3 = 0.;
432: v2 = v1 + n;
433: v3 = v2 + n;
435: for (n = 0; n < sz - 1; n += 2) {
436: i1 = idx[0];
437: i2 = idx[1];
438: idx += 2;
439: tmp0 = x[i1];
440: tmp1 = x[i2];
441: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
442: v1 += 2;
443: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
444: v2 += 2;
445: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
446: v3 += 2;
447: }
448: if (n == sz - 1) {
449: tmp0 = x[*idx++];
450: sum1 += *v1++ * tmp0;
451: sum2 += *v2++ * tmp0;
452: sum3 += *v3++ * tmp0;
453: }
454: y[row++] = sum1;
455: y[row++] = sum2;
456: y[row++] = sum3;
457: v1 = v3; /* Since the next block to be processed starts there*/
458: idx += 2 * sz;
459: break;
460: case 4:
461: sum1 = 0.;
462: sum2 = 0.;
463: sum3 = 0.;
464: sum4 = 0.;
465: v2 = v1 + n;
466: v3 = v2 + n;
467: v4 = v3 + n;
469: for (n = 0; n < sz - 1; n += 2) {
470: i1 = idx[0];
471: i2 = idx[1];
472: idx += 2;
473: tmp0 = x[i1];
474: tmp1 = x[i2];
475: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
476: v1 += 2;
477: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
478: v2 += 2;
479: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
480: v3 += 2;
481: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
482: v4 += 2;
483: }
484: if (n == sz - 1) {
485: tmp0 = x[*idx++];
486: sum1 += *v1++ * tmp0;
487: sum2 += *v2++ * tmp0;
488: sum3 += *v3++ * tmp0;
489: sum4 += *v4++ * tmp0;
490: }
491: y[row++] = sum1;
492: y[row++] = sum2;
493: y[row++] = sum3;
494: y[row++] = sum4;
495: v1 = v4; /* Since the next block to be processed starts there*/
496: idx += 3 * sz;
497: break;
498: case 5:
499: sum1 = 0.;
500: sum2 = 0.;
501: sum3 = 0.;
502: sum4 = 0.;
503: sum5 = 0.;
504: v2 = v1 + n;
505: v3 = v2 + n;
506: v4 = v3 + n;
507: v5 = v4 + n;
509: for (n = 0; n < sz - 1; n += 2) {
510: i1 = idx[0];
511: i2 = idx[1];
512: idx += 2;
513: tmp0 = x[i1];
514: tmp1 = x[i2];
515: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
516: v1 += 2;
517: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
518: v2 += 2;
519: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
520: v3 += 2;
521: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
522: v4 += 2;
523: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
524: v5 += 2;
525: }
526: if (n == sz - 1) {
527: tmp0 = x[*idx++];
528: sum1 += *v1++ * tmp0;
529: sum2 += *v2++ * tmp0;
530: sum3 += *v3++ * tmp0;
531: sum4 += *v4++ * tmp0;
532: sum5 += *v5++ * tmp0;
533: }
534: y[row++] = sum1;
535: y[row++] = sum2;
536: y[row++] = sum3;
537: y[row++] = sum4;
538: y[row++] = sum5;
539: v1 = v5; /* Since the next block to be processed starts there */
540: idx += 4 * sz;
541: break;
542: default:
543: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
544: }
545: }
546: PetscCall(VecRestoreArrayRead(xx, &x));
547: PetscCall(VecRestoreArray(yy, &y));
548: PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
549: PetscFunctionReturn(PETSC_SUCCESS);
550: }
552: /* Almost same code as the MatMult_SeqAIJ_Inode() */
553: PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
554: {
555: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
556: PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
557: const MatScalar *v1, *v2, *v3, *v4, *v5;
558: const PetscScalar *x;
559: PetscScalar *y, *z, *zt;
560: PetscInt i1, i2, n, i, row, node_max, nsz, sz;
561: const PetscInt *idx, *ns, *ii;
563: PetscFunctionBegin;
564: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
565: node_max = a->inode.node_count;
566: ns = a->inode.size; /* Node Size array */
568: PetscCall(VecGetArrayRead(xx, &x));
569: PetscCall(VecGetArrayPair(zz, yy, &z, &y));
570: zt = z;
572: idx = a->j;
573: v1 = a->a;
574: ii = a->i;
576: for (i = 0, row = 0; i < node_max; ++i) {
577: nsz = ns[i];
578: n = ii[1] - ii[0];
579: ii += nsz;
580: sz = n; /* No of non zeros in this row */
581: /* Switch on the size of Node */
582: switch (nsz) { /* Each loop in 'case' is unrolled */
583: case 1:
584: sum1 = *zt++;
586: for (n = 0; n < sz - 1; n += 2) {
587: i1 = idx[0]; /* The instructions are ordered to */
588: i2 = idx[1]; /* make the compiler's job easy */
589: idx += 2;
590: tmp0 = x[i1];
591: tmp1 = x[i2];
592: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
593: v1 += 2;
594: }
596: if (n == sz - 1) { /* Take care of the last nonzero */
597: tmp0 = x[*idx++];
598: sum1 += *v1++ * tmp0;
599: }
600: y[row++] = sum1;
601: break;
602: case 2:
603: sum1 = *zt++;
604: sum2 = *zt++;
605: v2 = v1 + n;
607: for (n = 0; n < sz - 1; n += 2) {
608: i1 = idx[0];
609: i2 = idx[1];
610: idx += 2;
611: tmp0 = x[i1];
612: tmp1 = x[i2];
613: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
614: v1 += 2;
615: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
616: v2 += 2;
617: }
618: if (n == sz - 1) {
619: tmp0 = x[*idx++];
620: sum1 += *v1++ * tmp0;
621: sum2 += *v2++ * tmp0;
622: }
623: y[row++] = sum1;
624: y[row++] = sum2;
625: v1 = v2; /* Since the next block to be processed starts there*/
626: idx += sz;
627: break;
628: case 3:
629: sum1 = *zt++;
630: sum2 = *zt++;
631: sum3 = *zt++;
632: v2 = v1 + n;
633: v3 = v2 + n;
635: for (n = 0; n < sz - 1; n += 2) {
636: i1 = idx[0];
637: i2 = idx[1];
638: idx += 2;
639: tmp0 = x[i1];
640: tmp1 = x[i2];
641: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
642: v1 += 2;
643: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
644: v2 += 2;
645: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
646: v3 += 2;
647: }
648: if (n == sz - 1) {
649: tmp0 = x[*idx++];
650: sum1 += *v1++ * tmp0;
651: sum2 += *v2++ * tmp0;
652: sum3 += *v3++ * tmp0;
653: }
654: y[row++] = sum1;
655: y[row++] = sum2;
656: y[row++] = sum3;
657: v1 = v3; /* Since the next block to be processed starts there*/
658: idx += 2 * sz;
659: break;
660: case 4:
661: sum1 = *zt++;
662: sum2 = *zt++;
663: sum3 = *zt++;
664: sum4 = *zt++;
665: v2 = v1 + n;
666: v3 = v2 + n;
667: v4 = v3 + n;
669: for (n = 0; n < sz - 1; n += 2) {
670: i1 = idx[0];
671: i2 = idx[1];
672: idx += 2;
673: tmp0 = x[i1];
674: tmp1 = x[i2];
675: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
676: v1 += 2;
677: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
678: v2 += 2;
679: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
680: v3 += 2;
681: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
682: v4 += 2;
683: }
684: if (n == sz - 1) {
685: tmp0 = x[*idx++];
686: sum1 += *v1++ * tmp0;
687: sum2 += *v2++ * tmp0;
688: sum3 += *v3++ * tmp0;
689: sum4 += *v4++ * tmp0;
690: }
691: y[row++] = sum1;
692: y[row++] = sum2;
693: y[row++] = sum3;
694: y[row++] = sum4;
695: v1 = v4; /* Since the next block to be processed starts there*/
696: idx += 3 * sz;
697: break;
698: case 5:
699: sum1 = *zt++;
700: sum2 = *zt++;
701: sum3 = *zt++;
702: sum4 = *zt++;
703: sum5 = *zt++;
704: v2 = v1 + n;
705: v3 = v2 + n;
706: v4 = v3 + n;
707: v5 = v4 + n;
709: for (n = 0; n < sz - 1; n += 2) {
710: i1 = idx[0];
711: i2 = idx[1];
712: idx += 2;
713: tmp0 = x[i1];
714: tmp1 = x[i2];
715: sum1 += v1[0] * tmp0 + v1[1] * tmp1;
716: v1 += 2;
717: sum2 += v2[0] * tmp0 + v2[1] * tmp1;
718: v2 += 2;
719: sum3 += v3[0] * tmp0 + v3[1] * tmp1;
720: v3 += 2;
721: sum4 += v4[0] * tmp0 + v4[1] * tmp1;
722: v4 += 2;
723: sum5 += v5[0] * tmp0 + v5[1] * tmp1;
724: v5 += 2;
725: }
726: if (n == sz - 1) {
727: tmp0 = x[*idx++];
728: sum1 += *v1++ * tmp0;
729: sum2 += *v2++ * tmp0;
730: sum3 += *v3++ * tmp0;
731: sum4 += *v4++ * tmp0;
732: sum5 += *v5++ * tmp0;
733: }
734: y[row++] = sum1;
735: y[row++] = sum2;
736: y[row++] = sum3;
737: y[row++] = sum4;
738: y[row++] = sum5;
739: v1 = v5; /* Since the next block to be processed starts there */
740: idx += 4 * sz;
741: break;
742: default:
743: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
744: }
745: }
746: PetscCall(VecRestoreArrayRead(xx, &x));
747: PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
748: PetscCall(PetscLogFlops(2.0 * a->nz));
749: PetscFunctionReturn(PETSC_SUCCESS);
750: }
752: static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
753: {
754: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
755: IS iscol = a->col, isrow = a->row;
756: const PetscInt *r, *c, *rout, *cout;
757: PetscInt i, j, n = A->rmap->n, nz;
758: PetscInt node_max, *ns, row, nsz, aii, i0, i1;
759: const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
760: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
761: PetscScalar sum1, sum2, sum3, sum4, sum5;
762: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
763: const PetscScalar *b;
765: PetscFunctionBegin;
766: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
767: node_max = a->inode.node_count;
768: ns = a->inode.size; /* Node Size array */
770: PetscCall(VecGetArrayRead(bb, &b));
771: PetscCall(VecGetArrayWrite(xx, &x));
772: tmp = a->solve_work;
774: PetscCall(ISGetIndices(isrow, &rout));
775: r = rout;
776: PetscCall(ISGetIndices(iscol, &cout));
777: c = cout + (n - 1);
779: /* forward solve the lower triangular */
780: tmps = tmp;
781: aa = a_a;
782: aj = a_j;
783: ad = a->diag;
785: for (i = 0, row = 0; i < node_max; ++i) {
786: nsz = ns[i];
787: aii = ai[row];
788: v1 = aa + aii;
789: vi = aj + aii;
790: nz = ad[row] - aii;
791: if (i < node_max - 1) {
792: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
793: * but our indexing to determine its size could. */
794: PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
795: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
796: PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
797: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
798: }
800: switch (nsz) { /* Each loop in 'case' is unrolled */
801: case 1:
802: sum1 = b[*r++];
803: for (j = 0; j < nz - 1; j += 2) {
804: i0 = vi[0];
805: i1 = vi[1];
806: vi += 2;
807: tmp0 = tmps[i0];
808: tmp1 = tmps[i1];
809: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
810: v1 += 2;
811: }
812: if (j == nz - 1) {
813: tmp0 = tmps[*vi++];
814: sum1 -= *v1++ * tmp0;
815: }
816: tmp[row++] = sum1;
817: break;
818: case 2:
819: sum1 = b[*r++];
820: sum2 = b[*r++];
821: v2 = aa + ai[row + 1];
823: for (j = 0; j < nz - 1; j += 2) {
824: i0 = vi[0];
825: i1 = vi[1];
826: vi += 2;
827: tmp0 = tmps[i0];
828: tmp1 = tmps[i1];
829: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
830: v1 += 2;
831: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
832: v2 += 2;
833: }
834: if (j == nz - 1) {
835: tmp0 = tmps[*vi++];
836: sum1 -= *v1++ * tmp0;
837: sum2 -= *v2++ * tmp0;
838: }
839: sum2 -= *v2++ * sum1;
840: tmp[row++] = sum1;
841: tmp[row++] = sum2;
842: break;
843: case 3:
844: sum1 = b[*r++];
845: sum2 = b[*r++];
846: sum3 = b[*r++];
847: v2 = aa + ai[row + 1];
848: v3 = aa + ai[row + 2];
850: for (j = 0; j < nz - 1; j += 2) {
851: i0 = vi[0];
852: i1 = vi[1];
853: vi += 2;
854: tmp0 = tmps[i0];
855: tmp1 = tmps[i1];
856: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
857: v1 += 2;
858: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
859: v2 += 2;
860: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
861: v3 += 2;
862: }
863: if (j == nz - 1) {
864: tmp0 = tmps[*vi++];
865: sum1 -= *v1++ * tmp0;
866: sum2 -= *v2++ * tmp0;
867: sum3 -= *v3++ * tmp0;
868: }
869: sum2 -= *v2++ * sum1;
870: sum3 -= *v3++ * sum1;
871: sum3 -= *v3++ * sum2;
873: tmp[row++] = sum1;
874: tmp[row++] = sum2;
875: tmp[row++] = sum3;
876: break;
878: case 4:
879: sum1 = b[*r++];
880: sum2 = b[*r++];
881: sum3 = b[*r++];
882: sum4 = b[*r++];
883: v2 = aa + ai[row + 1];
884: v3 = aa + ai[row + 2];
885: v4 = aa + ai[row + 3];
887: for (j = 0; j < nz - 1; j += 2) {
888: i0 = vi[0];
889: i1 = vi[1];
890: vi += 2;
891: tmp0 = tmps[i0];
892: tmp1 = tmps[i1];
893: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
894: v1 += 2;
895: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
896: v2 += 2;
897: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
898: v3 += 2;
899: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
900: v4 += 2;
901: }
902: if (j == nz - 1) {
903: tmp0 = tmps[*vi++];
904: sum1 -= *v1++ * tmp0;
905: sum2 -= *v2++ * tmp0;
906: sum3 -= *v3++ * tmp0;
907: sum4 -= *v4++ * tmp0;
908: }
909: sum2 -= *v2++ * sum1;
910: sum3 -= *v3++ * sum1;
911: sum4 -= *v4++ * sum1;
912: sum3 -= *v3++ * sum2;
913: sum4 -= *v4++ * sum2;
914: sum4 -= *v4++ * sum3;
916: tmp[row++] = sum1;
917: tmp[row++] = sum2;
918: tmp[row++] = sum3;
919: tmp[row++] = sum4;
920: break;
921: case 5:
922: sum1 = b[*r++];
923: sum2 = b[*r++];
924: sum3 = b[*r++];
925: sum4 = b[*r++];
926: sum5 = b[*r++];
927: v2 = aa + ai[row + 1];
928: v3 = aa + ai[row + 2];
929: v4 = aa + ai[row + 3];
930: v5 = aa + ai[row + 4];
932: for (j = 0; j < nz - 1; j += 2) {
933: i0 = vi[0];
934: i1 = vi[1];
935: vi += 2;
936: tmp0 = tmps[i0];
937: tmp1 = tmps[i1];
938: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
939: v1 += 2;
940: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
941: v2 += 2;
942: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
943: v3 += 2;
944: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
945: v4 += 2;
946: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
947: v5 += 2;
948: }
949: if (j == nz - 1) {
950: tmp0 = tmps[*vi++];
951: sum1 -= *v1++ * tmp0;
952: sum2 -= *v2++ * tmp0;
953: sum3 -= *v3++ * tmp0;
954: sum4 -= *v4++ * tmp0;
955: sum5 -= *v5++ * tmp0;
956: }
958: sum2 -= *v2++ * sum1;
959: sum3 -= *v3++ * sum1;
960: sum4 -= *v4++ * sum1;
961: sum5 -= *v5++ * sum1;
962: sum3 -= *v3++ * sum2;
963: sum4 -= *v4++ * sum2;
964: sum5 -= *v5++ * sum2;
965: sum4 -= *v4++ * sum3;
966: sum5 -= *v5++ * sum3;
967: sum5 -= *v5++ * sum4;
969: tmp[row++] = sum1;
970: tmp[row++] = sum2;
971: tmp[row++] = sum3;
972: tmp[row++] = sum4;
973: tmp[row++] = sum5;
974: break;
975: default:
976: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
977: }
978: }
979: /* backward solve the upper triangular */
980: for (i = node_max - 1, row = n - 1; i >= 0; i--) {
981: nsz = ns[i];
982: aii = ai[row + 1] - 1;
983: v1 = aa + aii;
984: vi = aj + aii;
985: nz = aii - ad[row];
986: switch (nsz) { /* Each loop in 'case' is unrolled */
987: case 1:
988: sum1 = tmp[row];
990: for (j = nz; j > 1; j -= 2) {
991: vi -= 2;
992: i0 = vi[2];
993: i1 = vi[1];
994: tmp0 = tmps[i0];
995: tmp1 = tmps[i1];
996: v1 -= 2;
997: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
998: }
999: if (j == 1) {
1000: tmp0 = tmps[*vi--];
1001: sum1 -= *v1-- * tmp0;
1002: }
1003: x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1004: row--;
1005: break;
1006: case 2:
1007: sum1 = tmp[row];
1008: sum2 = tmp[row - 1];
1009: v2 = aa + ai[row] - 1;
1010: for (j = nz; j > 1; j -= 2) {
1011: vi -= 2;
1012: i0 = vi[2];
1013: i1 = vi[1];
1014: tmp0 = tmps[i0];
1015: tmp1 = tmps[i1];
1016: v1 -= 2;
1017: v2 -= 2;
1018: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1019: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1020: }
1021: if (j == 1) {
1022: tmp0 = tmps[*vi--];
1023: sum1 -= *v1-- * tmp0;
1024: sum2 -= *v2-- * tmp0;
1025: }
1027: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1028: row--;
1029: sum2 -= *v2-- * tmp0;
1030: x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1031: row--;
1032: break;
1033: case 3:
1034: sum1 = tmp[row];
1035: sum2 = tmp[row - 1];
1036: sum3 = tmp[row - 2];
1037: v2 = aa + ai[row] - 1;
1038: v3 = aa + ai[row - 1] - 1;
1039: for (j = nz; j > 1; j -= 2) {
1040: vi -= 2;
1041: i0 = vi[2];
1042: i1 = vi[1];
1043: tmp0 = tmps[i0];
1044: tmp1 = tmps[i1];
1045: v1 -= 2;
1046: v2 -= 2;
1047: v3 -= 2;
1048: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1049: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1050: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1051: }
1052: if (j == 1) {
1053: tmp0 = tmps[*vi--];
1054: sum1 -= *v1-- * tmp0;
1055: sum2 -= *v2-- * tmp0;
1056: sum3 -= *v3-- * tmp0;
1057: }
1058: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1059: row--;
1060: sum2 -= *v2-- * tmp0;
1061: sum3 -= *v3-- * tmp0;
1062: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1063: row--;
1064: sum3 -= *v3-- * tmp0;
1065: x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1066: row--;
1068: break;
1069: case 4:
1070: sum1 = tmp[row];
1071: sum2 = tmp[row - 1];
1072: sum3 = tmp[row - 2];
1073: sum4 = tmp[row - 3];
1074: v2 = aa + ai[row] - 1;
1075: v3 = aa + ai[row - 1] - 1;
1076: v4 = aa + ai[row - 2] - 1;
1078: for (j = nz; j > 1; j -= 2) {
1079: vi -= 2;
1080: i0 = vi[2];
1081: i1 = vi[1];
1082: tmp0 = tmps[i0];
1083: tmp1 = tmps[i1];
1084: v1 -= 2;
1085: v2 -= 2;
1086: v3 -= 2;
1087: v4 -= 2;
1088: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1089: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1090: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1091: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1092: }
1093: if (j == 1) {
1094: tmp0 = tmps[*vi--];
1095: sum1 -= *v1-- * tmp0;
1096: sum2 -= *v2-- * tmp0;
1097: sum3 -= *v3-- * tmp0;
1098: sum4 -= *v4-- * tmp0;
1099: }
1101: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1102: row--;
1103: sum2 -= *v2-- * tmp0;
1104: sum3 -= *v3-- * tmp0;
1105: sum4 -= *v4-- * tmp0;
1106: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1107: row--;
1108: sum3 -= *v3-- * tmp0;
1109: sum4 -= *v4-- * tmp0;
1110: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1111: row--;
1112: sum4 -= *v4-- * tmp0;
1113: x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1114: row--;
1115: break;
1116: case 5:
1117: sum1 = tmp[row];
1118: sum2 = tmp[row - 1];
1119: sum3 = tmp[row - 2];
1120: sum4 = tmp[row - 3];
1121: sum5 = tmp[row - 4];
1122: v2 = aa + ai[row] - 1;
1123: v3 = aa + ai[row - 1] - 1;
1124: v4 = aa + ai[row - 2] - 1;
1125: v5 = aa + ai[row - 3] - 1;
1126: for (j = nz; j > 1; j -= 2) {
1127: vi -= 2;
1128: i0 = vi[2];
1129: i1 = vi[1];
1130: tmp0 = tmps[i0];
1131: tmp1 = tmps[i1];
1132: v1 -= 2;
1133: v2 -= 2;
1134: v3 -= 2;
1135: v4 -= 2;
1136: v5 -= 2;
1137: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142: }
1143: if (j == 1) {
1144: tmp0 = tmps[*vi--];
1145: sum1 -= *v1-- * tmp0;
1146: sum2 -= *v2-- * tmp0;
1147: sum3 -= *v3-- * tmp0;
1148: sum4 -= *v4-- * tmp0;
1149: sum5 -= *v5-- * tmp0;
1150: }
1152: tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1153: row--;
1154: sum2 -= *v2-- * tmp0;
1155: sum3 -= *v3-- * tmp0;
1156: sum4 -= *v4-- * tmp0;
1157: sum5 -= *v5-- * tmp0;
1158: tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1159: row--;
1160: sum3 -= *v3-- * tmp0;
1161: sum4 -= *v4-- * tmp0;
1162: sum5 -= *v5-- * tmp0;
1163: tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1164: row--;
1165: sum4 -= *v4-- * tmp0;
1166: sum5 -= *v5-- * tmp0;
1167: tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1168: row--;
1169: sum5 -= *v5-- * tmp0;
1170: x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1171: row--;
1172: break;
1173: default:
1174: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1175: }
1176: }
1177: PetscCall(ISRestoreIndices(isrow, &rout));
1178: PetscCall(ISRestoreIndices(iscol, &cout));
1179: PetscCall(VecRestoreArrayRead(bb, &b));
1180: PetscCall(VecRestoreArrayWrite(xx, &x));
1181: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1182: PetscFunctionReturn(PETSC_SUCCESS);
1183: }
1185: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1186: {
1187: Mat C = B;
1188: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1189: IS isrow = b->row, isicol = b->icol;
1190: const PetscInt *r, *ic, *ics;
1191: const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1192: PetscInt i, j, k, nz, nzL, row, *pj;
1193: const PetscInt *ajtmp, *bjtmp;
1194: MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1195: const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1196: FactorShiftCtx sctx;
1197: const PetscInt *ddiag;
1198: PetscReal rs;
1199: MatScalar d;
1200: PetscInt inod, nodesz, node_max, col;
1201: const PetscInt *ns;
1202: PetscInt *tmp_vec1, *tmp_vec2, *nsmap;
1204: PetscFunctionBegin;
1205: /* MatPivotSetUp(): initialize shift context sctx */
1206: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1208: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1209: ddiag = a->diag;
1210: sctx.shift_top = info->zeropivot;
1211: for (i = 0; i < n; i++) {
1212: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1213: d = (aa)[ddiag[i]];
1214: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1215: v = aa + ai[i];
1216: nz = ai[i + 1] - ai[i];
1217: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1218: if (rs > sctx.shift_top) sctx.shift_top = rs;
1219: }
1220: sctx.shift_top *= 1.1;
1221: sctx.nshift_max = 5;
1222: sctx.shift_lo = 0.;
1223: sctx.shift_hi = 1.;
1224: }
1226: PetscCall(ISGetIndices(isrow, &r));
1227: PetscCall(ISGetIndices(isicol, &ic));
1229: PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1230: ics = ic;
1232: node_max = a->inode.node_count;
1233: ns = a->inode.size;
1234: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1236: /* If max inode size > 4, split it into two inodes.*/
1237: /* also map the inode sizes according to the ordering */
1238: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1239: for (i = 0, j = 0; i < node_max; ++i, ++j) {
1240: if (ns[i] > 4) {
1241: tmp_vec1[j] = 4;
1242: ++j;
1243: tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1244: } else {
1245: tmp_vec1[j] = ns[i];
1246: }
1247: }
1248: /* Use the correct node_max */
1249: node_max = j;
1251: /* Now reorder the inode info based on mat re-ordering info */
1252: /* First create a row -> inode_size_array_index map */
1253: PetscCall(PetscMalloc1(n + 1, &nsmap));
1254: PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1255: for (i = 0, row = 0; i < node_max; i++) {
1256: nodesz = tmp_vec1[i];
1257: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1258: }
1259: /* Using nsmap, create a reordered ns structure */
1260: for (i = 0, j = 0; i < node_max; i++) {
1261: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1262: tmp_vec2[i] = nodesz;
1263: j += nodesz;
1264: }
1265: PetscCall(PetscFree(nsmap));
1266: PetscCall(PetscFree(tmp_vec1));
1268: /* Now use the correct ns */
1269: ns = tmp_vec2;
1271: do {
1272: sctx.newshift = PETSC_FALSE;
1273: /* Now loop over each block-row, and do the factorization */
1274: for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1275: nodesz = ns[inod];
1277: switch (nodesz) {
1278: case 1:
1279: /* zero rtmp1 */
1280: /* L part */
1281: nz = bi[i + 1] - bi[i];
1282: bjtmp = bj + bi[i];
1283: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1285: /* U part */
1286: nz = bdiag[i] - bdiag[i + 1];
1287: bjtmp = bj + bdiag[i + 1] + 1;
1288: for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1290: /* load in initial (unfactored row) */
1291: nz = ai[r[i] + 1] - ai[r[i]];
1292: ajtmp = aj + ai[r[i]];
1293: v = aa + ai[r[i]];
1294: for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1296: /* ZeropivotApply() */
1297: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1299: /* elimination */
1300: bjtmp = bj + bi[i];
1301: row = *bjtmp++;
1302: nzL = bi[i + 1] - bi[i];
1303: for (k = 0; k < nzL; k++) {
1304: pc = rtmp1 + row;
1305: if (*pc != 0.0) {
1306: pv = b->a + bdiag[row];
1307: mul1 = *pc * (*pv);
1308: *pc = mul1;
1309: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1310: pv = b->a + bdiag[row + 1] + 1;
1311: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1312: for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1313: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1314: }
1315: row = *bjtmp++;
1316: }
1318: /* finished row so stick it into b->a */
1319: rs = 0.0;
1320: /* L part */
1321: pv = b->a + bi[i];
1322: pj = b->j + bi[i];
1323: nz = bi[i + 1] - bi[i];
1324: for (j = 0; j < nz; j++) {
1325: pv[j] = rtmp1[pj[j]];
1326: rs += PetscAbsScalar(pv[j]);
1327: }
1329: /* U part */
1330: pv = b->a + bdiag[i + 1] + 1;
1331: pj = b->j + bdiag[i + 1] + 1;
1332: nz = bdiag[i] - bdiag[i + 1] - 1;
1333: for (j = 0; j < nz; j++) {
1334: pv[j] = rtmp1[pj[j]];
1335: rs += PetscAbsScalar(pv[j]);
1336: }
1338: /* Check zero pivot */
1339: sctx.rs = rs;
1340: sctx.pv = rtmp1[i];
1341: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1342: if (sctx.newshift) break;
1344: /* Mark diagonal and invert diagonal for simpler triangular solves */
1345: pv = b->a + bdiag[i];
1346: *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1347: break;
1349: case 2:
1350: /* zero rtmp1 and rtmp2 */
1351: /* L part */
1352: nz = bi[i + 1] - bi[i];
1353: bjtmp = bj + bi[i];
1354: for (j = 0; j < nz; j++) {
1355: col = bjtmp[j];
1356: rtmp1[col] = 0.0;
1357: rtmp2[col] = 0.0;
1358: }
1360: /* U part */
1361: nz = bdiag[i] - bdiag[i + 1];
1362: bjtmp = bj + bdiag[i + 1] + 1;
1363: for (j = 0; j < nz; j++) {
1364: col = bjtmp[j];
1365: rtmp1[col] = 0.0;
1366: rtmp2[col] = 0.0;
1367: }
1369: /* load in initial (unfactored row) */
1370: nz = ai[r[i] + 1] - ai[r[i]];
1371: ajtmp = aj + ai[r[i]];
1372: v1 = aa + ai[r[i]];
1373: v2 = aa + ai[r[i] + 1];
1374: for (j = 0; j < nz; j++) {
1375: col = ics[ajtmp[j]];
1376: rtmp1[col] = v1[j];
1377: rtmp2[col] = v2[j];
1378: }
1379: /* ZeropivotApply(): shift the diagonal of the matrix */
1380: rtmp1[i] += sctx.shift_amount;
1381: rtmp2[i + 1] += sctx.shift_amount;
1383: /* elimination */
1384: bjtmp = bj + bi[i];
1385: row = *bjtmp++; /* pivot row */
1386: nzL = bi[i + 1] - bi[i];
1387: for (k = 0; k < nzL; k++) {
1388: pc1 = rtmp1 + row;
1389: pc2 = rtmp2 + row;
1390: if (*pc1 != 0.0 || *pc2 != 0.0) {
1391: pv = b->a + bdiag[row];
1392: mul1 = *pc1 * (*pv);
1393: mul2 = *pc2 * (*pv);
1394: *pc1 = mul1;
1395: *pc2 = mul2;
1397: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1398: pv = b->a + bdiag[row + 1] + 1;
1399: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1400: for (j = 0; j < nz; j++) {
1401: col = pj[j];
1402: rtmp1[col] -= mul1 * pv[j];
1403: rtmp2[col] -= mul2 * pv[j];
1404: }
1405: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1406: }
1407: row = *bjtmp++;
1408: }
1410: /* finished row i; check zero pivot, then stick row i into b->a */
1411: rs = 0.0;
1412: /* L part */
1413: pc1 = b->a + bi[i];
1414: pj = b->j + bi[i];
1415: nz = bi[i + 1] - bi[i];
1416: for (j = 0; j < nz; j++) {
1417: col = pj[j];
1418: pc1[j] = rtmp1[col];
1419: rs += PetscAbsScalar(pc1[j]);
1420: }
1421: /* U part */
1422: pc1 = b->a + bdiag[i + 1] + 1;
1423: pj = b->j + bdiag[i + 1] + 1;
1424: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1425: for (j = 0; j < nz; j++) {
1426: col = pj[j];
1427: pc1[j] = rtmp1[col];
1428: rs += PetscAbsScalar(pc1[j]);
1429: }
1431: sctx.rs = rs;
1432: sctx.pv = rtmp1[i];
1433: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1434: if (sctx.newshift) break;
1435: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1436: *pc1 = 1.0 / sctx.pv;
1438: /* Now take care of diagonal 2x2 block. */
1439: pc2 = rtmp2 + i;
1440: if (*pc2 != 0.0) {
1441: mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */
1442: *pc2 = mul1; /* insert L entry */
1443: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1444: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1445: for (j = 0; j < nz; j++) {
1446: col = pj[j];
1447: rtmp2[col] -= mul1 * rtmp1[col];
1448: }
1449: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1450: }
1452: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1453: rs = 0.0;
1454: /* L part */
1455: pc2 = b->a + bi[i + 1];
1456: pj = b->j + bi[i + 1];
1457: nz = bi[i + 2] - bi[i + 1];
1458: for (j = 0; j < nz; j++) {
1459: col = pj[j];
1460: pc2[j] = rtmp2[col];
1461: rs += PetscAbsScalar(pc2[j]);
1462: }
1463: /* U part */
1464: pc2 = b->a + bdiag[i + 2] + 1;
1465: pj = b->j + bdiag[i + 2] + 1;
1466: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1467: for (j = 0; j < nz; j++) {
1468: col = pj[j];
1469: pc2[j] = rtmp2[col];
1470: rs += PetscAbsScalar(pc2[j]);
1471: }
1473: sctx.rs = rs;
1474: sctx.pv = rtmp2[i + 1];
1475: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1476: if (sctx.newshift) break;
1477: pc2 = b->a + bdiag[i + 1];
1478: *pc2 = 1.0 / sctx.pv;
1479: break;
1481: case 3:
1482: /* zero rtmp */
1483: /* L part */
1484: nz = bi[i + 1] - bi[i];
1485: bjtmp = bj + bi[i];
1486: for (j = 0; j < nz; j++) {
1487: col = bjtmp[j];
1488: rtmp1[col] = 0.0;
1489: rtmp2[col] = 0.0;
1490: rtmp3[col] = 0.0;
1491: }
1493: /* U part */
1494: nz = bdiag[i] - bdiag[i + 1];
1495: bjtmp = bj + bdiag[i + 1] + 1;
1496: for (j = 0; j < nz; j++) {
1497: col = bjtmp[j];
1498: rtmp1[col] = 0.0;
1499: rtmp2[col] = 0.0;
1500: rtmp3[col] = 0.0;
1501: }
1503: /* load in initial (unfactored row) */
1504: nz = ai[r[i] + 1] - ai[r[i]];
1505: ajtmp = aj + ai[r[i]];
1506: v1 = aa + ai[r[i]];
1507: v2 = aa + ai[r[i] + 1];
1508: v3 = aa + ai[r[i] + 2];
1509: for (j = 0; j < nz; j++) {
1510: col = ics[ajtmp[j]];
1511: rtmp1[col] = v1[j];
1512: rtmp2[col] = v2[j];
1513: rtmp3[col] = v3[j];
1514: }
1515: /* ZeropivotApply(): shift the diagonal of the matrix */
1516: rtmp1[i] += sctx.shift_amount;
1517: rtmp2[i + 1] += sctx.shift_amount;
1518: rtmp3[i + 2] += sctx.shift_amount;
1520: /* elimination */
1521: bjtmp = bj + bi[i];
1522: row = *bjtmp++; /* pivot row */
1523: nzL = bi[i + 1] - bi[i];
1524: for (k = 0; k < nzL; k++) {
1525: pc1 = rtmp1 + row;
1526: pc2 = rtmp2 + row;
1527: pc3 = rtmp3 + row;
1528: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1529: pv = b->a + bdiag[row];
1530: mul1 = *pc1 * (*pv);
1531: mul2 = *pc2 * (*pv);
1532: mul3 = *pc3 * (*pv);
1533: *pc1 = mul1;
1534: *pc2 = mul2;
1535: *pc3 = mul3;
1537: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1538: pv = b->a + bdiag[row + 1] + 1;
1539: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1540: for (j = 0; j < nz; j++) {
1541: col = pj[j];
1542: rtmp1[col] -= mul1 * pv[j];
1543: rtmp2[col] -= mul2 * pv[j];
1544: rtmp3[col] -= mul3 * pv[j];
1545: }
1546: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1547: }
1548: row = *bjtmp++;
1549: }
1551: /* finished row i; check zero pivot, then stick row i into b->a */
1552: rs = 0.0;
1553: /* L part */
1554: pc1 = b->a + bi[i];
1555: pj = b->j + bi[i];
1556: nz = bi[i + 1] - bi[i];
1557: for (j = 0; j < nz; j++) {
1558: col = pj[j];
1559: pc1[j] = rtmp1[col];
1560: rs += PetscAbsScalar(pc1[j]);
1561: }
1562: /* U part */
1563: pc1 = b->a + bdiag[i + 1] + 1;
1564: pj = b->j + bdiag[i + 1] + 1;
1565: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1566: for (j = 0; j < nz; j++) {
1567: col = pj[j];
1568: pc1[j] = rtmp1[col];
1569: rs += PetscAbsScalar(pc1[j]);
1570: }
1572: sctx.rs = rs;
1573: sctx.pv = rtmp1[i];
1574: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1575: if (sctx.newshift) break;
1576: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1577: *pc1 = 1.0 / sctx.pv;
1579: /* Now take care of 1st column of diagonal 3x3 block. */
1580: pc2 = rtmp2 + i;
1581: pc3 = rtmp3 + i;
1582: if (*pc2 != 0.0 || *pc3 != 0.0) {
1583: mul2 = (*pc2) * (*pc1);
1584: *pc2 = mul2;
1585: mul3 = (*pc3) * (*pc1);
1586: *pc3 = mul3;
1587: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1588: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1589: for (j = 0; j < nz; j++) {
1590: col = pj[j];
1591: rtmp2[col] -= mul2 * rtmp1[col];
1592: rtmp3[col] -= mul3 * rtmp1[col];
1593: }
1594: PetscCall(PetscLogFlops(2 + 4.0 * nz));
1595: }
1597: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1598: rs = 0.0;
1599: /* L part */
1600: pc2 = b->a + bi[i + 1];
1601: pj = b->j + bi[i + 1];
1602: nz = bi[i + 2] - bi[i + 1];
1603: for (j = 0; j < nz; j++) {
1604: col = pj[j];
1605: pc2[j] = rtmp2[col];
1606: rs += PetscAbsScalar(pc2[j]);
1607: }
1608: /* U part */
1609: pc2 = b->a + bdiag[i + 2] + 1;
1610: pj = b->j + bdiag[i + 2] + 1;
1611: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1612: for (j = 0; j < nz; j++) {
1613: col = pj[j];
1614: pc2[j] = rtmp2[col];
1615: rs += PetscAbsScalar(pc2[j]);
1616: }
1618: sctx.rs = rs;
1619: sctx.pv = rtmp2[i + 1];
1620: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1621: if (sctx.newshift) break;
1622: pc2 = b->a + bdiag[i + 1];
1623: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1625: /* Now take care of 2nd column of diagonal 3x3 block. */
1626: pc3 = rtmp3 + i + 1;
1627: if (*pc3 != 0.0) {
1628: mul3 = (*pc3) * (*pc2);
1629: *pc3 = mul3;
1630: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1631: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1632: for (j = 0; j < nz; j++) {
1633: col = pj[j];
1634: rtmp3[col] -= mul3 * rtmp2[col];
1635: }
1636: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1637: }
1639: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1640: rs = 0.0;
1641: /* L part */
1642: pc3 = b->a + bi[i + 2];
1643: pj = b->j + bi[i + 2];
1644: nz = bi[i + 3] - bi[i + 2];
1645: for (j = 0; j < nz; j++) {
1646: col = pj[j];
1647: pc3[j] = rtmp3[col];
1648: rs += PetscAbsScalar(pc3[j]);
1649: }
1650: /* U part */
1651: pc3 = b->a + bdiag[i + 3] + 1;
1652: pj = b->j + bdiag[i + 3] + 1;
1653: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1654: for (j = 0; j < nz; j++) {
1655: col = pj[j];
1656: pc3[j] = rtmp3[col];
1657: rs += PetscAbsScalar(pc3[j]);
1658: }
1660: sctx.rs = rs;
1661: sctx.pv = rtmp3[i + 2];
1662: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1663: if (sctx.newshift) break;
1664: pc3 = b->a + bdiag[i + 2];
1665: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1666: break;
1667: case 4:
1668: /* zero rtmp */
1669: /* L part */
1670: nz = bi[i + 1] - bi[i];
1671: bjtmp = bj + bi[i];
1672: for (j = 0; j < nz; j++) {
1673: col = bjtmp[j];
1674: rtmp1[col] = 0.0;
1675: rtmp2[col] = 0.0;
1676: rtmp3[col] = 0.0;
1677: rtmp4[col] = 0.0;
1678: }
1680: /* U part */
1681: nz = bdiag[i] - bdiag[i + 1];
1682: bjtmp = bj + bdiag[i + 1] + 1;
1683: for (j = 0; j < nz; j++) {
1684: col = bjtmp[j];
1685: rtmp1[col] = 0.0;
1686: rtmp2[col] = 0.0;
1687: rtmp3[col] = 0.0;
1688: rtmp4[col] = 0.0;
1689: }
1691: /* load in initial (unfactored row) */
1692: nz = ai[r[i] + 1] - ai[r[i]];
1693: ajtmp = aj + ai[r[i]];
1694: v1 = aa + ai[r[i]];
1695: v2 = aa + ai[r[i] + 1];
1696: v3 = aa + ai[r[i] + 2];
1697: v4 = aa + ai[r[i] + 3];
1698: for (j = 0; j < nz; j++) {
1699: col = ics[ajtmp[j]];
1700: rtmp1[col] = v1[j];
1701: rtmp2[col] = v2[j];
1702: rtmp3[col] = v3[j];
1703: rtmp4[col] = v4[j];
1704: }
1705: /* ZeropivotApply(): shift the diagonal of the matrix */
1706: rtmp1[i] += sctx.shift_amount;
1707: rtmp2[i + 1] += sctx.shift_amount;
1708: rtmp3[i + 2] += sctx.shift_amount;
1709: rtmp4[i + 3] += sctx.shift_amount;
1711: /* elimination */
1712: bjtmp = bj + bi[i];
1713: row = *bjtmp++; /* pivot row */
1714: nzL = bi[i + 1] - bi[i];
1715: for (k = 0; k < nzL; k++) {
1716: pc1 = rtmp1 + row;
1717: pc2 = rtmp2 + row;
1718: pc3 = rtmp3 + row;
1719: pc4 = rtmp4 + row;
1720: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1721: pv = b->a + bdiag[row];
1722: mul1 = *pc1 * (*pv);
1723: mul2 = *pc2 * (*pv);
1724: mul3 = *pc3 * (*pv);
1725: mul4 = *pc4 * (*pv);
1726: *pc1 = mul1;
1727: *pc2 = mul2;
1728: *pc3 = mul3;
1729: *pc4 = mul4;
1731: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1732: pv = b->a + bdiag[row + 1] + 1;
1733: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1734: for (j = 0; j < nz; j++) {
1735: col = pj[j];
1736: rtmp1[col] -= mul1 * pv[j];
1737: rtmp2[col] -= mul2 * pv[j];
1738: rtmp3[col] -= mul3 * pv[j];
1739: rtmp4[col] -= mul4 * pv[j];
1740: }
1741: PetscCall(PetscLogFlops(4 + 8.0 * nz));
1742: }
1743: row = *bjtmp++;
1744: }
1746: /* finished row i; check zero pivot, then stick row i into b->a */
1747: rs = 0.0;
1748: /* L part */
1749: pc1 = b->a + bi[i];
1750: pj = b->j + bi[i];
1751: nz = bi[i + 1] - bi[i];
1752: for (j = 0; j < nz; j++) {
1753: col = pj[j];
1754: pc1[j] = rtmp1[col];
1755: rs += PetscAbsScalar(pc1[j]);
1756: }
1757: /* U part */
1758: pc1 = b->a + bdiag[i + 1] + 1;
1759: pj = b->j + bdiag[i + 1] + 1;
1760: nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1761: for (j = 0; j < nz; j++) {
1762: col = pj[j];
1763: pc1[j] = rtmp1[col];
1764: rs += PetscAbsScalar(pc1[j]);
1765: }
1767: sctx.rs = rs;
1768: sctx.pv = rtmp1[i];
1769: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1770: if (sctx.newshift) break;
1771: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1772: *pc1 = 1.0 / sctx.pv;
1774: /* Now take care of 1st column of diagonal 4x4 block. */
1775: pc2 = rtmp2 + i;
1776: pc3 = rtmp3 + i;
1777: pc4 = rtmp4 + i;
1778: if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1779: mul2 = (*pc2) * (*pc1);
1780: *pc2 = mul2;
1781: mul3 = (*pc3) * (*pc1);
1782: *pc3 = mul3;
1783: mul4 = (*pc4) * (*pc1);
1784: *pc4 = mul4;
1785: pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1786: nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1787: for (j = 0; j < nz; j++) {
1788: col = pj[j];
1789: rtmp2[col] -= mul2 * rtmp1[col];
1790: rtmp3[col] -= mul3 * rtmp1[col];
1791: rtmp4[col] -= mul4 * rtmp1[col];
1792: }
1793: PetscCall(PetscLogFlops(3 + 6.0 * nz));
1794: }
1796: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1797: rs = 0.0;
1798: /* L part */
1799: pc2 = b->a + bi[i + 1];
1800: pj = b->j + bi[i + 1];
1801: nz = bi[i + 2] - bi[i + 1];
1802: for (j = 0; j < nz; j++) {
1803: col = pj[j];
1804: pc2[j] = rtmp2[col];
1805: rs += PetscAbsScalar(pc2[j]);
1806: }
1807: /* U part */
1808: pc2 = b->a + bdiag[i + 2] + 1;
1809: pj = b->j + bdiag[i + 2] + 1;
1810: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1811: for (j = 0; j < nz; j++) {
1812: col = pj[j];
1813: pc2[j] = rtmp2[col];
1814: rs += PetscAbsScalar(pc2[j]);
1815: }
1817: sctx.rs = rs;
1818: sctx.pv = rtmp2[i + 1];
1819: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1820: if (sctx.newshift) break;
1821: pc2 = b->a + bdiag[i + 1];
1822: *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1824: /* Now take care of 2nd column of diagonal 4x4 block. */
1825: pc3 = rtmp3 + i + 1;
1826: pc4 = rtmp4 + i + 1;
1827: if (*pc3 != 0.0 || *pc4 != 0.0) {
1828: mul3 = (*pc3) * (*pc2);
1829: *pc3 = mul3;
1830: mul4 = (*pc4) * (*pc2);
1831: *pc4 = mul4;
1832: pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1833: nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1834: for (j = 0; j < nz; j++) {
1835: col = pj[j];
1836: rtmp3[col] -= mul3 * rtmp2[col];
1837: rtmp4[col] -= mul4 * rtmp2[col];
1838: }
1839: PetscCall(PetscLogFlops(4.0 * nz));
1840: }
1842: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1843: rs = 0.0;
1844: /* L part */
1845: pc3 = b->a + bi[i + 2];
1846: pj = b->j + bi[i + 2];
1847: nz = bi[i + 3] - bi[i + 2];
1848: for (j = 0; j < nz; j++) {
1849: col = pj[j];
1850: pc3[j] = rtmp3[col];
1851: rs += PetscAbsScalar(pc3[j]);
1852: }
1853: /* U part */
1854: pc3 = b->a + bdiag[i + 3] + 1;
1855: pj = b->j + bdiag[i + 3] + 1;
1856: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1857: for (j = 0; j < nz; j++) {
1858: col = pj[j];
1859: pc3[j] = rtmp3[col];
1860: rs += PetscAbsScalar(pc3[j]);
1861: }
1863: sctx.rs = rs;
1864: sctx.pv = rtmp3[i + 2];
1865: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1866: if (sctx.newshift) break;
1867: pc3 = b->a + bdiag[i + 2];
1868: *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1870: /* Now take care of 3rd column of diagonal 4x4 block. */
1871: pc4 = rtmp4 + i + 2;
1872: if (*pc4 != 0.0) {
1873: mul4 = (*pc4) * (*pc3);
1874: *pc4 = mul4;
1875: pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */
1876: nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1877: for (j = 0; j < nz; j++) {
1878: col = pj[j];
1879: rtmp4[col] -= mul4 * rtmp3[col];
1880: }
1881: PetscCall(PetscLogFlops(1 + 2.0 * nz));
1882: }
1884: /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1885: rs = 0.0;
1886: /* L part */
1887: pc4 = b->a + bi[i + 3];
1888: pj = b->j + bi[i + 3];
1889: nz = bi[i + 4] - bi[i + 3];
1890: for (j = 0; j < nz; j++) {
1891: col = pj[j];
1892: pc4[j] = rtmp4[col];
1893: rs += PetscAbsScalar(pc4[j]);
1894: }
1895: /* U part */
1896: pc4 = b->a + bdiag[i + 4] + 1;
1897: pj = b->j + bdiag[i + 4] + 1;
1898: nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1899: for (j = 0; j < nz; j++) {
1900: col = pj[j];
1901: pc4[j] = rtmp4[col];
1902: rs += PetscAbsScalar(pc4[j]);
1903: }
1905: sctx.rs = rs;
1906: sctx.pv = rtmp4[i + 3];
1907: PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1908: if (sctx.newshift) break;
1909: pc4 = b->a + bdiag[i + 3];
1910: *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1911: break;
1913: default:
1914: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1915: }
1916: if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1917: i += nodesz; /* Update the row */
1918: }
1920: /* MatPivotRefine() */
1921: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1922: /*
1923: * if no shift in this attempt & shifting & started shifting & can refine,
1924: * then try lower shift
1925: */
1926: sctx.shift_hi = sctx.shift_fraction;
1927: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1928: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1929: sctx.newshift = PETSC_TRUE;
1930: sctx.nshift++;
1931: }
1932: } while (sctx.newshift);
1934: PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1935: PetscCall(PetscFree(tmp_vec2));
1936: PetscCall(ISRestoreIndices(isicol, &ic));
1937: PetscCall(ISRestoreIndices(isrow, &r));
1939: if (b->inode.size) {
1940: C->ops->solve = MatSolve_SeqAIJ_Inode;
1941: } else {
1942: C->ops->solve = MatSolve_SeqAIJ;
1943: }
1944: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1945: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1946: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1947: C->ops->matsolve = MatMatSolve_SeqAIJ;
1948: C->assembled = PETSC_TRUE;
1949: C->preallocated = PETSC_TRUE;
1951: PetscCall(PetscLogFlops(C->cmap->n));
1953: /* MatShiftView(A,info,&sctx) */
1954: if (sctx.nshift) {
1955: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1956: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1957: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1958: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1959: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1960: PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1961: }
1962: }
1963: PetscFunctionReturn(PETSC_SUCCESS);
1964: }
1966: #if 0
1967: // unused
1968: static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1969: {
1970: Mat C = B;
1971: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1972: IS iscol = b->col, isrow = b->row, isicol = b->icol;
1973: const PetscInt *r, *ic, *c, *ics;
1974: PetscInt n = A->rmap->n, *bi = b->i;
1975: PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1976: PetscInt i, j, idx, *bd = b->diag, node_max, nodesz;
1977: PetscInt *ai = a->i, *aj = a->j;
1978: PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1979: PetscScalar mul1, mul2, mul3, tmp;
1980: MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1981: const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1;
1982: PetscReal rs = 0.0;
1983: FactorShiftCtx sctx;
1985: PetscFunctionBegin;
1986: sctx.shift_top = 0;
1987: sctx.nshift_max = 0;
1988: sctx.shift_lo = 0;
1989: sctx.shift_hi = 0;
1990: sctx.shift_fraction = 0;
1992: /* if both shift schemes are chosen by user, only use info->shiftpd */
1993: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1994: sctx.shift_top = 0;
1995: for (i = 0; i < n; i++) {
1996: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1997: rs = 0.0;
1998: ajtmp = aj + ai[i];
1999: rtmp1 = aa + ai[i];
2000: nz = ai[i + 1] - ai[i];
2001: for (j = 0; j < nz; j++) {
2002: if (*ajtmp != i) {
2003: rs += PetscAbsScalar(*rtmp1++);
2004: } else {
2005: rs -= PetscRealPart(*rtmp1++);
2006: }
2007: ajtmp++;
2008: }
2009: if (rs > sctx.shift_top) sctx.shift_top = rs;
2010: }
2011: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2012: sctx.shift_top *= 1.1;
2013: sctx.nshift_max = 5;
2014: sctx.shift_lo = 0.;
2015: sctx.shift_hi = 1.;
2016: }
2017: sctx.shift_amount = 0;
2018: sctx.nshift = 0;
2020: PetscCall(ISGetIndices(isrow, &r));
2021: PetscCall(ISGetIndices(iscol, &c));
2022: PetscCall(ISGetIndices(isicol, &ic));
2023: PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2024: ics = ic;
2026: node_max = a->inode.node_count;
2027: ns = a->inode.size;
2028: PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2030: /* If max inode size > 3, split it into two inodes.*/
2031: /* also map the inode sizes according to the ordering */
2032: PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2033: for (i = 0, j = 0; i < node_max; ++i, ++j) {
2034: if (ns[i] > 3) {
2035: tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */
2036: ++j;
2037: tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2038: } else {
2039: tmp_vec1[j] = ns[i];
2040: }
2041: }
2042: /* Use the correct node_max */
2043: node_max = j;
2045: /* Now reorder the inode info based on mat re-ordering info */
2046: /* First create a row -> inode_size_array_index map */
2047: PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2048: for (i = 0, row = 0; i < node_max; i++) {
2049: nodesz = tmp_vec1[i];
2050: for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2051: }
2052: /* Using nsmap, create a reordered ns structure */
2053: for (i = 0, j = 0; i < node_max; i++) {
2054: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2055: tmp_vec2[i] = nodesz;
2056: j += nodesz;
2057: }
2058: PetscCall(PetscFree2(nsmap, tmp_vec1));
2059: /* Now use the correct ns */
2060: ns = tmp_vec2;
2062: do {
2063: sctx.newshift = PETSC_FALSE;
2064: /* Now loop over each block-row, and do the factorization */
2065: for (i = 0, row = 0; i < node_max; i++) {
2066: nodesz = ns[i];
2067: nz = bi[row + 1] - bi[row];
2068: bjtmp = bj + bi[row];
2070: switch (nodesz) {
2071: case 1:
2072: for (j = 0; j < nz; j++) {
2073: idx = bjtmp[j];
2074: rtmp11[idx] = 0.0;
2075: }
2077: /* load in initial (unfactored row) */
2078: idx = r[row];
2079: nz_tmp = ai[idx + 1] - ai[idx];
2080: ajtmp = aj + ai[idx];
2081: v1 = aa + ai[idx];
2083: for (j = 0; j < nz_tmp; j++) {
2084: idx = ics[ajtmp[j]];
2085: rtmp11[idx] = v1[j];
2086: }
2087: rtmp11[ics[r[row]]] += sctx.shift_amount;
2089: prow = *bjtmp++;
2090: while (prow < row) {
2091: pc1 = rtmp11 + prow;
2092: if (*pc1 != 0.0) {
2093: pv = ba + bd[prow];
2094: pj = nbj + bd[prow];
2095: mul1 = *pc1 * *pv++;
2096: *pc1 = mul1;
2097: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2098: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2099: for (j = 0; j < nz_tmp; j++) {
2100: tmp = pv[j];
2101: idx = pj[j];
2102: rtmp11[idx] -= mul1 * tmp;
2103: }
2104: }
2105: prow = *bjtmp++;
2106: }
2107: pj = bj + bi[row];
2108: pc1 = ba + bi[row];
2110: sctx.pv = rtmp11[row];
2111: rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2112: rs = 0.0;
2113: for (j = 0; j < nz; j++) {
2114: idx = pj[j];
2115: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2116: if (idx != row) rs += PetscAbsScalar(pc1[j]);
2117: }
2118: sctx.rs = rs;
2119: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2120: if (sctx.newshift) goto endofwhile;
2121: break;
2123: case 2:
2124: for (j = 0; j < nz; j++) {
2125: idx = bjtmp[j];
2126: rtmp11[idx] = 0.0;
2127: rtmp22[idx] = 0.0;
2128: }
2130: /* load in initial (unfactored row) */
2131: idx = r[row];
2132: nz_tmp = ai[idx + 1] - ai[idx];
2133: ajtmp = aj + ai[idx];
2134: v1 = aa + ai[idx];
2135: v2 = aa + ai[idx + 1];
2136: for (j = 0; j < nz_tmp; j++) {
2137: idx = ics[ajtmp[j]];
2138: rtmp11[idx] = v1[j];
2139: rtmp22[idx] = v2[j];
2140: }
2141: rtmp11[ics[r[row]]] += sctx.shift_amount;
2142: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2144: prow = *bjtmp++;
2145: while (prow < row) {
2146: pc1 = rtmp11 + prow;
2147: pc2 = rtmp22 + prow;
2148: if (*pc1 != 0.0 || *pc2 != 0.0) {
2149: pv = ba + bd[prow];
2150: pj = nbj + bd[prow];
2151: mul1 = *pc1 * *pv;
2152: mul2 = *pc2 * *pv;
2153: ++pv;
2154: *pc1 = mul1;
2155: *pc2 = mul2;
2157: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2158: for (j = 0; j < nz_tmp; j++) {
2159: tmp = pv[j];
2160: idx = pj[j];
2161: rtmp11[idx] -= mul1 * tmp;
2162: rtmp22[idx] -= mul2 * tmp;
2163: }
2164: PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2165: }
2166: prow = *bjtmp++;
2167: }
2169: /* Now take care of diagonal 2x2 block. Note: prow = row here */
2170: pc1 = rtmp11 + prow;
2171: pc2 = rtmp22 + prow;
2173: sctx.pv = *pc1;
2174: pj = bj + bi[prow];
2175: rs = 0.0;
2176: for (j = 0; j < nz; j++) {
2177: idx = pj[j];
2178: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2179: }
2180: sctx.rs = rs;
2181: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2182: if (sctx.newshift) goto endofwhile;
2184: if (*pc2 != 0.0) {
2185: pj = nbj + bd[prow];
2186: mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2187: *pc2 = mul2;
2188: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2189: for (j = 0; j < nz_tmp; j++) {
2190: idx = pj[j];
2191: tmp = rtmp11[idx];
2192: rtmp22[idx] -= mul2 * tmp;
2193: }
2194: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2195: }
2197: pj = bj + bi[row];
2198: pc1 = ba + bi[row];
2199: pc2 = ba + bi[row + 1];
2201: sctx.pv = rtmp22[row + 1];
2202: rs = 0.0;
2203: rtmp11[row] = 1.0 / rtmp11[row];
2204: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2205: /* copy row entries from dense representation to sparse */
2206: for (j = 0; j < nz; j++) {
2207: idx = pj[j];
2208: pc1[j] = rtmp11[idx];
2209: pc2[j] = rtmp22[idx];
2210: if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2211: }
2212: sctx.rs = rs;
2213: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2214: if (sctx.newshift) goto endofwhile;
2215: break;
2217: case 3:
2218: for (j = 0; j < nz; j++) {
2219: idx = bjtmp[j];
2220: rtmp11[idx] = 0.0;
2221: rtmp22[idx] = 0.0;
2222: rtmp33[idx] = 0.0;
2223: }
2224: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2225: idx = r[row];
2226: nz_tmp = ai[idx + 1] - ai[idx];
2227: ajtmp = aj + ai[idx];
2228: v1 = aa + ai[idx];
2229: v2 = aa + ai[idx + 1];
2230: v3 = aa + ai[idx + 2];
2231: for (j = 0; j < nz_tmp; j++) {
2232: idx = ics[ajtmp[j]];
2233: rtmp11[idx] = v1[j];
2234: rtmp22[idx] = v2[j];
2235: rtmp33[idx] = v3[j];
2236: }
2237: rtmp11[ics[r[row]]] += sctx.shift_amount;
2238: rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2239: rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2241: /* loop over all pivot row blocks above this row block */
2242: prow = *bjtmp++;
2243: while (prow < row) {
2244: pc1 = rtmp11 + prow;
2245: pc2 = rtmp22 + prow;
2246: pc3 = rtmp33 + prow;
2247: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2248: pv = ba + bd[prow];
2249: pj = nbj + bd[prow];
2250: mul1 = *pc1 * *pv;
2251: mul2 = *pc2 * *pv;
2252: mul3 = *pc3 * *pv;
2253: ++pv;
2254: *pc1 = mul1;
2255: *pc2 = mul2;
2256: *pc3 = mul3;
2258: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2259: /* update this row based on pivot row */
2260: for (j = 0; j < nz_tmp; j++) {
2261: tmp = pv[j];
2262: idx = pj[j];
2263: rtmp11[idx] -= mul1 * tmp;
2264: rtmp22[idx] -= mul2 * tmp;
2265: rtmp33[idx] -= mul3 * tmp;
2266: }
2267: PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2268: }
2269: prow = *bjtmp++;
2270: }
2272: /* Now take care of diagonal 3x3 block in this set of rows */
2273: /* note: prow = row here */
2274: pc1 = rtmp11 + prow;
2275: pc2 = rtmp22 + prow;
2276: pc3 = rtmp33 + prow;
2278: sctx.pv = *pc1;
2279: pj = bj + bi[prow];
2280: rs = 0.0;
2281: for (j = 0; j < nz; j++) {
2282: idx = pj[j];
2283: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2284: }
2285: sctx.rs = rs;
2286: PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2287: if (sctx.newshift) goto endofwhile;
2289: if (*pc2 != 0.0 || *pc3 != 0.0) {
2290: mul2 = (*pc2) / (*pc1);
2291: mul3 = (*pc3) / (*pc1);
2292: *pc2 = mul2;
2293: *pc3 = mul3;
2294: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2295: pj = nbj + bd[prow];
2296: for (j = 0; j < nz_tmp; j++) {
2297: idx = pj[j];
2298: tmp = rtmp11[idx];
2299: rtmp22[idx] -= mul2 * tmp;
2300: rtmp33[idx] -= mul3 * tmp;
2301: }
2302: PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2303: }
2304: ++prow;
2306: pc2 = rtmp22 + prow;
2307: pc3 = rtmp33 + prow;
2308: sctx.pv = *pc2;
2309: pj = bj + bi[prow];
2310: rs = 0.0;
2311: for (j = 0; j < nz; j++) {
2312: idx = pj[j];
2313: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2314: }
2315: sctx.rs = rs;
2316: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2317: if (sctx.newshift) goto endofwhile;
2319: if (*pc3 != 0.0) {
2320: mul3 = (*pc3) / (*pc2);
2321: *pc3 = mul3;
2322: pj = nbj + bd[prow];
2323: nz_tmp = bi[prow + 1] - bd[prow] - 1;
2324: for (j = 0; j < nz_tmp; j++) {
2325: idx = pj[j];
2326: tmp = rtmp22[idx];
2327: rtmp33[idx] -= mul3 * tmp;
2328: }
2329: PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2330: }
2332: pj = bj + bi[row];
2333: pc1 = ba + bi[row];
2334: pc2 = ba + bi[row + 1];
2335: pc3 = ba + bi[row + 2];
2337: sctx.pv = rtmp33[row + 2];
2338: rs = 0.0;
2339: rtmp11[row] = 1.0 / rtmp11[row];
2340: rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2341: rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2342: /* copy row entries from dense representation to sparse */
2343: for (j = 0; j < nz; j++) {
2344: idx = pj[j];
2345: pc1[j] = rtmp11[idx];
2346: pc2[j] = rtmp22[idx];
2347: pc3[j] = rtmp33[idx];
2348: if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2349: }
2351: sctx.rs = rs;
2352: PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2353: if (sctx.newshift) goto endofwhile;
2354: break;
2356: default:
2357: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2358: }
2359: row += nodesz; /* Update the row */
2360: }
2361: endofwhile:;
2362: } while (sctx.newshift);
2363: PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2364: PetscCall(PetscFree(tmp_vec2));
2365: PetscCall(ISRestoreIndices(isicol, &ic));
2366: PetscCall(ISRestoreIndices(isrow, &r));
2367: PetscCall(ISRestoreIndices(iscol, &c));
2369: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2370: /* do not set solve add, since MatSolve_Inode + Add is faster */
2371: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2372: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2373: C->assembled = PETSC_TRUE;
2374: C->preallocated = PETSC_TRUE;
2375: if (sctx.nshift) {
2376: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2377: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2378: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2379: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2380: }
2381: }
2382: PetscCall(PetscLogFlops(C->cmap->n));
2383: PetscCall(MatSeqAIJCheckInode(C));
2384: PetscFunctionReturn(PETSC_SUCCESS);
2385: }
2386: #endif
2388: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2389: {
2390: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2391: IS iscol = a->col, isrow = a->row;
2392: const PetscInt *r, *c, *rout, *cout;
2393: PetscInt i, j, n = A->rmap->n;
2394: PetscInt node_max, row, nsz, aii, i0, i1, nz;
2395: const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2396: PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
2397: PetscScalar sum1, sum2, sum3, sum4, sum5;
2398: const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2399: const PetscScalar *b;
2401: PetscFunctionBegin;
2402: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2403: node_max = a->inode.node_count;
2404: ns = a->inode.size; /* Node Size array */
2406: PetscCall(VecGetArrayRead(bb, &b));
2407: PetscCall(VecGetArrayWrite(xx, &x));
2408: tmp = a->solve_work;
2410: PetscCall(ISGetIndices(isrow, &rout));
2411: r = rout;
2412: PetscCall(ISGetIndices(iscol, &cout));
2413: c = cout;
2415: /* forward solve the lower triangular */
2416: tmps = tmp;
2417: aa = a_a;
2418: aj = a_j;
2419: ad = a->diag;
2421: for (i = 0, row = 0; i < node_max; ++i) {
2422: nsz = ns[i];
2423: aii = ai[row];
2424: v1 = aa + aii;
2425: vi = aj + aii;
2426: nz = ai[row + 1] - ai[row];
2428: if (i < node_max - 1) {
2429: /* Prefetch the indices for the next block */
2430: PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2431: /* Prefetch the data for the next block */
2432: PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2433: }
2435: switch (nsz) { /* Each loop in 'case' is unrolled */
2436: case 1:
2437: sum1 = b[r[row]];
2438: for (j = 0; j < nz - 1; j += 2) {
2439: i0 = vi[j];
2440: i1 = vi[j + 1];
2441: tmp0 = tmps[i0];
2442: tmp1 = tmps[i1];
2443: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2444: }
2445: if (j == nz - 1) {
2446: tmp0 = tmps[vi[j]];
2447: sum1 -= v1[j] * tmp0;
2448: }
2449: tmp[row++] = sum1;
2450: break;
2451: case 2:
2452: sum1 = b[r[row]];
2453: sum2 = b[r[row + 1]];
2454: v2 = aa + ai[row + 1];
2456: for (j = 0; j < nz - 1; j += 2) {
2457: i0 = vi[j];
2458: i1 = vi[j + 1];
2459: tmp0 = tmps[i0];
2460: tmp1 = tmps[i1];
2461: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2462: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2463: }
2464: if (j == nz - 1) {
2465: tmp0 = tmps[vi[j]];
2466: sum1 -= v1[j] * tmp0;
2467: sum2 -= v2[j] * tmp0;
2468: }
2469: sum2 -= v2[nz] * sum1;
2470: tmp[row++] = sum1;
2471: tmp[row++] = sum2;
2472: break;
2473: case 3:
2474: sum1 = b[r[row]];
2475: sum2 = b[r[row + 1]];
2476: sum3 = b[r[row + 2]];
2477: v2 = aa + ai[row + 1];
2478: v3 = aa + ai[row + 2];
2480: for (j = 0; j < nz - 1; j += 2) {
2481: i0 = vi[j];
2482: i1 = vi[j + 1];
2483: tmp0 = tmps[i0];
2484: tmp1 = tmps[i1];
2485: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2486: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2487: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2488: }
2489: if (j == nz - 1) {
2490: tmp0 = tmps[vi[j]];
2491: sum1 -= v1[j] * tmp0;
2492: sum2 -= v2[j] * tmp0;
2493: sum3 -= v3[j] * tmp0;
2494: }
2495: sum2 -= v2[nz] * sum1;
2496: sum3 -= v3[nz] * sum1;
2497: sum3 -= v3[nz + 1] * sum2;
2498: tmp[row++] = sum1;
2499: tmp[row++] = sum2;
2500: tmp[row++] = sum3;
2501: break;
2503: case 4:
2504: sum1 = b[r[row]];
2505: sum2 = b[r[row + 1]];
2506: sum3 = b[r[row + 2]];
2507: sum4 = b[r[row + 3]];
2508: v2 = aa + ai[row + 1];
2509: v3 = aa + ai[row + 2];
2510: v4 = aa + ai[row + 3];
2512: for (j = 0; j < nz - 1; j += 2) {
2513: i0 = vi[j];
2514: i1 = vi[j + 1];
2515: tmp0 = tmps[i0];
2516: tmp1 = tmps[i1];
2517: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2518: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2519: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2520: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2521: }
2522: if (j == nz - 1) {
2523: tmp0 = tmps[vi[j]];
2524: sum1 -= v1[j] * tmp0;
2525: sum2 -= v2[j] * tmp0;
2526: sum3 -= v3[j] * tmp0;
2527: sum4 -= v4[j] * tmp0;
2528: }
2529: sum2 -= v2[nz] * sum1;
2530: sum3 -= v3[nz] * sum1;
2531: sum4 -= v4[nz] * sum1;
2532: sum3 -= v3[nz + 1] * sum2;
2533: sum4 -= v4[nz + 1] * sum2;
2534: sum4 -= v4[nz + 2] * sum3;
2536: tmp[row++] = sum1;
2537: tmp[row++] = sum2;
2538: tmp[row++] = sum3;
2539: tmp[row++] = sum4;
2540: break;
2541: case 5:
2542: sum1 = b[r[row]];
2543: sum2 = b[r[row + 1]];
2544: sum3 = b[r[row + 2]];
2545: sum4 = b[r[row + 3]];
2546: sum5 = b[r[row + 4]];
2547: v2 = aa + ai[row + 1];
2548: v3 = aa + ai[row + 2];
2549: v4 = aa + ai[row + 3];
2550: v5 = aa + ai[row + 4];
2552: for (j = 0; j < nz - 1; j += 2) {
2553: i0 = vi[j];
2554: i1 = vi[j + 1];
2555: tmp0 = tmps[i0];
2556: tmp1 = tmps[i1];
2557: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2558: sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2559: sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2560: sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2561: sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2562: }
2563: if (j == nz - 1) {
2564: tmp0 = tmps[vi[j]];
2565: sum1 -= v1[j] * tmp0;
2566: sum2 -= v2[j] * tmp0;
2567: sum3 -= v3[j] * tmp0;
2568: sum4 -= v4[j] * tmp0;
2569: sum5 -= v5[j] * tmp0;
2570: }
2572: sum2 -= v2[nz] * sum1;
2573: sum3 -= v3[nz] * sum1;
2574: sum4 -= v4[nz] * sum1;
2575: sum5 -= v5[nz] * sum1;
2576: sum3 -= v3[nz + 1] * sum2;
2577: sum4 -= v4[nz + 1] * sum2;
2578: sum5 -= v5[nz + 1] * sum2;
2579: sum4 -= v4[nz + 2] * sum3;
2580: sum5 -= v5[nz + 2] * sum3;
2581: sum5 -= v5[nz + 3] * sum4;
2583: tmp[row++] = sum1;
2584: tmp[row++] = sum2;
2585: tmp[row++] = sum3;
2586: tmp[row++] = sum4;
2587: tmp[row++] = sum5;
2588: break;
2589: default:
2590: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2591: }
2592: }
2593: /* backward solve the upper triangular */
2594: for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2595: nsz = ns[i];
2596: aii = ad[row + 1] + 1;
2597: v1 = aa + aii;
2598: vi = aj + aii;
2599: nz = ad[row] - ad[row + 1] - 1;
2601: if (i > 0) {
2602: /* Prefetch the indices for the next block */
2603: PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2604: /* Prefetch the data for the next block */
2605: PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2606: }
2608: switch (nsz) { /* Each loop in 'case' is unrolled */
2609: case 1:
2610: sum1 = tmp[row];
2612: for (j = 0; j < nz - 1; j += 2) {
2613: i0 = vi[j];
2614: i1 = vi[j + 1];
2615: tmp0 = tmps[i0];
2616: tmp1 = tmps[i1];
2617: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2618: }
2619: if (j == nz - 1) {
2620: tmp0 = tmps[vi[j]];
2621: sum1 -= v1[j] * tmp0;
2622: }
2623: x[c[row]] = tmp[row] = sum1 * v1[nz];
2624: row--;
2625: break;
2626: case 2:
2627: sum1 = tmp[row];
2628: sum2 = tmp[row - 1];
2629: v2 = aa + ad[row] + 1;
2630: for (j = 0; j < nz - 1; j += 2) {
2631: i0 = vi[j];
2632: i1 = vi[j + 1];
2633: tmp0 = tmps[i0];
2634: tmp1 = tmps[i1];
2635: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2636: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2637: }
2638: if (j == nz - 1) {
2639: tmp0 = tmps[vi[j]];
2640: sum1 -= v1[j] * tmp0;
2641: sum2 -= v2[j + 1] * tmp0;
2642: }
2644: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2645: row--;
2646: sum2 -= v2[0] * tmp0;
2647: x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2648: row--;
2649: break;
2650: case 3:
2651: sum1 = tmp[row];
2652: sum2 = tmp[row - 1];
2653: sum3 = tmp[row - 2];
2654: v2 = aa + ad[row] + 1;
2655: v3 = aa + ad[row - 1] + 1;
2656: for (j = 0; j < nz - 1; j += 2) {
2657: i0 = vi[j];
2658: i1 = vi[j + 1];
2659: tmp0 = tmps[i0];
2660: tmp1 = tmps[i1];
2661: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2662: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2663: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2664: }
2665: if (j == nz - 1) {
2666: tmp0 = tmps[vi[j]];
2667: sum1 -= v1[j] * tmp0;
2668: sum2 -= v2[j + 1] * tmp0;
2669: sum3 -= v3[j + 2] * tmp0;
2670: }
2671: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2672: row--;
2673: sum2 -= v2[0] * tmp0;
2674: sum3 -= v3[1] * tmp0;
2675: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2676: row--;
2677: sum3 -= v3[0] * tmp0;
2678: x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2679: row--;
2681: break;
2682: case 4:
2683: sum1 = tmp[row];
2684: sum2 = tmp[row - 1];
2685: sum3 = tmp[row - 2];
2686: sum4 = tmp[row - 3];
2687: v2 = aa + ad[row] + 1;
2688: v3 = aa + ad[row - 1] + 1;
2689: v4 = aa + ad[row - 2] + 1;
2691: for (j = 0; j < nz - 1; j += 2) {
2692: i0 = vi[j];
2693: i1 = vi[j + 1];
2694: tmp0 = tmps[i0];
2695: tmp1 = tmps[i1];
2696: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2697: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2698: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2699: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2700: }
2701: if (j == nz - 1) {
2702: tmp0 = tmps[vi[j]];
2703: sum1 -= v1[j] * tmp0;
2704: sum2 -= v2[j + 1] * tmp0;
2705: sum3 -= v3[j + 2] * tmp0;
2706: sum4 -= v4[j + 3] * tmp0;
2707: }
2709: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2710: row--;
2711: sum2 -= v2[0] * tmp0;
2712: sum3 -= v3[1] * tmp0;
2713: sum4 -= v4[2] * tmp0;
2714: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2715: row--;
2716: sum3 -= v3[0] * tmp0;
2717: sum4 -= v4[1] * tmp0;
2718: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2719: row--;
2720: sum4 -= v4[0] * tmp0;
2721: x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2722: row--;
2723: break;
2724: case 5:
2725: sum1 = tmp[row];
2726: sum2 = tmp[row - 1];
2727: sum3 = tmp[row - 2];
2728: sum4 = tmp[row - 3];
2729: sum5 = tmp[row - 4];
2730: v2 = aa + ad[row] + 1;
2731: v3 = aa + ad[row - 1] + 1;
2732: v4 = aa + ad[row - 2] + 1;
2733: v5 = aa + ad[row - 3] + 1;
2734: for (j = 0; j < nz - 1; j += 2) {
2735: i0 = vi[j];
2736: i1 = vi[j + 1];
2737: tmp0 = tmps[i0];
2738: tmp1 = tmps[i1];
2739: sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2740: sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2741: sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2742: sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2743: sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2744: }
2745: if (j == nz - 1) {
2746: tmp0 = tmps[vi[j]];
2747: sum1 -= v1[j] * tmp0;
2748: sum2 -= v2[j + 1] * tmp0;
2749: sum3 -= v3[j + 2] * tmp0;
2750: sum4 -= v4[j + 3] * tmp0;
2751: sum5 -= v5[j + 4] * tmp0;
2752: }
2754: tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2755: row--;
2756: sum2 -= v2[0] * tmp0;
2757: sum3 -= v3[1] * tmp0;
2758: sum4 -= v4[2] * tmp0;
2759: sum5 -= v5[3] * tmp0;
2760: tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2761: row--;
2762: sum3 -= v3[0] * tmp0;
2763: sum4 -= v4[1] * tmp0;
2764: sum5 -= v5[2] * tmp0;
2765: tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2766: row--;
2767: sum4 -= v4[0] * tmp0;
2768: sum5 -= v5[1] * tmp0;
2769: tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2770: row--;
2771: sum5 -= v5[0] * tmp0;
2772: x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2773: row--;
2774: break;
2775: default:
2776: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2777: }
2778: }
2779: PetscCall(ISRestoreIndices(isrow, &rout));
2780: PetscCall(ISRestoreIndices(iscol, &cout));
2781: PetscCall(VecRestoreArrayRead(bb, &b));
2782: PetscCall(VecRestoreArrayWrite(xx, &x));
2783: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2784: PetscFunctionReturn(PETSC_SUCCESS);
2785: }
2787: /*
2788: Makes a longer coloring[] array and calls the usual code with that
2789: */
2790: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2791: {
2792: Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data;
2793: PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2794: PetscInt *colorused, i;
2795: ISColoringValue *newcolor;
2797: PetscFunctionBegin;
2798: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2799: PetscCall(PetscMalloc1(n + 1, &newcolor));
2800: /* loop over inodes, marking a color for each column*/
2801: row = 0;
2802: for (i = 0; i < m; i++) {
2803: for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2804: }
2806: /* eliminate unneeded colors */
2807: PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2808: for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2810: for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2811: ncolors = colorused[5 * ncolors - 1];
2812: for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2813: PetscCall(PetscFree(colorused));
2814: PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2815: PetscCall(PetscFree(coloring));
2816: PetscFunctionReturn(PETSC_SUCCESS);
2817: }
2819: #include <petsc/private/kernels/blockinvert.h>
2821: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2822: {
2823: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2824: PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2825: MatScalar *ibdiag, *bdiag, work[25], *t;
2826: PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2827: const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2828: const PetscScalar *xb, *b;
2829: PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2830: PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2831: PetscInt sz, k, ipvt[5];
2832: PetscBool allowzeropivot, zeropivotdetected;
2833: const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
2835: PetscFunctionBegin;
2836: allowzeropivot = PetscNot(A->erroriffailure);
2837: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2838: PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2839: PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2841: if (!a->inode.ibdiagvalid) {
2842: if (!a->inode.ibdiag) {
2843: /* calculate space needed for diagonal blocks */
2844: for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2845: a->inode.bdiagsize = cnt;
2847: PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2848: }
2850: /* copy over the diagonal blocks and invert them */
2851: ibdiag = a->inode.ibdiag;
2852: bdiag = a->inode.bdiag;
2853: cnt = 0;
2854: for (i = 0, row = 0; i < m; i++) {
2855: for (j = 0; j < sizes[i]; j++) {
2856: for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2857: }
2858: PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
2860: switch (sizes[i]) {
2861: case 1:
2862: /* Create matrix data structure */
2863: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2864: if (allowzeropivot) {
2865: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2866: A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2867: A->factorerror_zeropivot_row = row;
2868: PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2869: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2870: }
2871: ibdiag[cnt] = 1.0 / ibdiag[cnt];
2872: break;
2873: case 2:
2874: PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2875: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2876: break;
2877: case 3:
2878: PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2879: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2880: break;
2881: case 4:
2882: PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2883: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2884: break;
2885: case 5:
2886: PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2887: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2888: break;
2889: default:
2890: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2891: }
2892: cnt += sizes[i] * sizes[i];
2893: row += sizes[i];
2894: }
2895: a->inode.ibdiagvalid = PETSC_TRUE;
2896: }
2897: ibdiag = a->inode.ibdiag;
2898: bdiag = a->inode.bdiag;
2899: t = a->inode.ssor_work;
2901: PetscCall(VecGetArray(xx, &x));
2902: PetscCall(VecGetArrayRead(bb, &b));
2903: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2904: if (flag & SOR_ZERO_INITIAL_GUESS) {
2905: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2906: for (i = 0, row = 0; i < m; i++) {
2907: sz = diag[row] - ii[row];
2908: v1 = a->a + ii[row];
2909: idx = a->j + ii[row];
2911: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2912: switch (sizes[i]) {
2913: case 1:
2915: sum1 = b[row];
2916: for (n = 0; n < sz - 1; n += 2) {
2917: i1 = idx[0];
2918: i2 = idx[1];
2919: idx += 2;
2920: tmp0 = x[i1];
2921: tmp1 = x[i2];
2922: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2923: v1 += 2;
2924: }
2926: if (n == sz - 1) {
2927: tmp0 = x[*idx];
2928: sum1 -= *v1 * tmp0;
2929: }
2930: t[row] = sum1;
2931: x[row++] = sum1 * (*ibdiag++);
2932: break;
2933: case 2:
2934: v2 = a->a + ii[row + 1];
2935: sum1 = b[row];
2936: sum2 = b[row + 1];
2937: for (n = 0; n < sz - 1; n += 2) {
2938: i1 = idx[0];
2939: i2 = idx[1];
2940: idx += 2;
2941: tmp0 = x[i1];
2942: tmp1 = x[i2];
2943: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2944: v1 += 2;
2945: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2946: v2 += 2;
2947: }
2949: if (n == sz - 1) {
2950: tmp0 = x[*idx];
2951: sum1 -= v1[0] * tmp0;
2952: sum2 -= v2[0] * tmp0;
2953: }
2954: t[row] = sum1;
2955: t[row + 1] = sum2;
2956: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2957: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2958: ibdiag += 4;
2959: break;
2960: case 3:
2961: v2 = a->a + ii[row + 1];
2962: v3 = a->a + ii[row + 2];
2963: sum1 = b[row];
2964: sum2 = b[row + 1];
2965: sum3 = b[row + 2];
2966: for (n = 0; n < sz - 1; n += 2) {
2967: i1 = idx[0];
2968: i2 = idx[1];
2969: idx += 2;
2970: tmp0 = x[i1];
2971: tmp1 = x[i2];
2972: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2973: v1 += 2;
2974: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2975: v2 += 2;
2976: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2977: v3 += 2;
2978: }
2980: if (n == sz - 1) {
2981: tmp0 = x[*idx];
2982: sum1 -= v1[0] * tmp0;
2983: sum2 -= v2[0] * tmp0;
2984: sum3 -= v3[0] * tmp0;
2985: }
2986: t[row] = sum1;
2987: t[row + 1] = sum2;
2988: t[row + 2] = sum3;
2989: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2990: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2991: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2992: ibdiag += 9;
2993: break;
2994: case 4:
2995: v2 = a->a + ii[row + 1];
2996: v3 = a->a + ii[row + 2];
2997: v4 = a->a + ii[row + 3];
2998: sum1 = b[row];
2999: sum2 = b[row + 1];
3000: sum3 = b[row + 2];
3001: sum4 = b[row + 3];
3002: for (n = 0; n < sz - 1; n += 2) {
3003: i1 = idx[0];
3004: i2 = idx[1];
3005: idx += 2;
3006: tmp0 = x[i1];
3007: tmp1 = x[i2];
3008: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3009: v1 += 2;
3010: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3011: v2 += 2;
3012: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3013: v3 += 2;
3014: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3015: v4 += 2;
3016: }
3018: if (n == sz - 1) {
3019: tmp0 = x[*idx];
3020: sum1 -= v1[0] * tmp0;
3021: sum2 -= v2[0] * tmp0;
3022: sum3 -= v3[0] * tmp0;
3023: sum4 -= v4[0] * tmp0;
3024: }
3025: t[row] = sum1;
3026: t[row + 1] = sum2;
3027: t[row + 2] = sum3;
3028: t[row + 3] = sum4;
3029: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3030: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3031: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3032: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3033: ibdiag += 16;
3034: break;
3035: case 5:
3036: v2 = a->a + ii[row + 1];
3037: v3 = a->a + ii[row + 2];
3038: v4 = a->a + ii[row + 3];
3039: v5 = a->a + ii[row + 4];
3040: sum1 = b[row];
3041: sum2 = b[row + 1];
3042: sum3 = b[row + 2];
3043: sum4 = b[row + 3];
3044: sum5 = b[row + 4];
3045: for (n = 0; n < sz - 1; n += 2) {
3046: i1 = idx[0];
3047: i2 = idx[1];
3048: idx += 2;
3049: tmp0 = x[i1];
3050: tmp1 = x[i2];
3051: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3052: v1 += 2;
3053: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3054: v2 += 2;
3055: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3056: v3 += 2;
3057: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3058: v4 += 2;
3059: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3060: v5 += 2;
3061: }
3063: if (n == sz - 1) {
3064: tmp0 = x[*idx];
3065: sum1 -= v1[0] * tmp0;
3066: sum2 -= v2[0] * tmp0;
3067: sum3 -= v3[0] * tmp0;
3068: sum4 -= v4[0] * tmp0;
3069: sum5 -= v5[0] * tmp0;
3070: }
3071: t[row] = sum1;
3072: t[row + 1] = sum2;
3073: t[row + 2] = sum3;
3074: t[row + 3] = sum4;
3075: t[row + 4] = sum5;
3076: x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3077: x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3078: x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3079: x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3080: x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3081: ibdiag += 25;
3082: break;
3083: default:
3084: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3085: }
3086: }
3088: xb = t;
3089: PetscCall(PetscLogFlops(a->nz));
3090: } else xb = b;
3091: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3092: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3093: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3094: ibdiag -= sizes[i] * sizes[i];
3095: sz = ii[row + 1] - diag[row] - 1;
3096: v1 = a->a + diag[row] + 1;
3097: idx = a->j + diag[row] + 1;
3099: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3100: switch (sizes[i]) {
3101: case 1:
3103: sum1 = xb[row];
3104: for (n = 0; n < sz - 1; n += 2) {
3105: i1 = idx[0];
3106: i2 = idx[1];
3107: idx += 2;
3108: tmp0 = x[i1];
3109: tmp1 = x[i2];
3110: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3111: v1 += 2;
3112: }
3114: if (n == sz - 1) {
3115: tmp0 = x[*idx];
3116: sum1 -= *v1 * tmp0;
3117: }
3118: x[row--] = sum1 * (*ibdiag);
3119: break;
3121: case 2:
3123: sum1 = xb[row];
3124: sum2 = xb[row - 1];
3125: /* note that sum1 is associated with the second of the two rows */
3126: v2 = a->a + diag[row - 1] + 2;
3127: for (n = 0; n < sz - 1; n += 2) {
3128: i1 = idx[0];
3129: i2 = idx[1];
3130: idx += 2;
3131: tmp0 = x[i1];
3132: tmp1 = x[i2];
3133: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3134: v1 += 2;
3135: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3136: v2 += 2;
3137: }
3139: if (n == sz - 1) {
3140: tmp0 = x[*idx];
3141: sum1 -= *v1 * tmp0;
3142: sum2 -= *v2 * tmp0;
3143: }
3144: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3145: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3146: break;
3147: case 3:
3149: sum1 = xb[row];
3150: sum2 = xb[row - 1];
3151: sum3 = xb[row - 2];
3152: v2 = a->a + diag[row - 1] + 2;
3153: v3 = a->a + diag[row - 2] + 3;
3154: for (n = 0; n < sz - 1; n += 2) {
3155: i1 = idx[0];
3156: i2 = idx[1];
3157: idx += 2;
3158: tmp0 = x[i1];
3159: tmp1 = x[i2];
3160: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3161: v1 += 2;
3162: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3163: v2 += 2;
3164: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3165: v3 += 2;
3166: }
3168: if (n == sz - 1) {
3169: tmp0 = x[*idx];
3170: sum1 -= *v1 * tmp0;
3171: sum2 -= *v2 * tmp0;
3172: sum3 -= *v3 * tmp0;
3173: }
3174: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3175: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3176: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3177: break;
3178: case 4:
3180: sum1 = xb[row];
3181: sum2 = xb[row - 1];
3182: sum3 = xb[row - 2];
3183: sum4 = xb[row - 3];
3184: v2 = a->a + diag[row - 1] + 2;
3185: v3 = a->a + diag[row - 2] + 3;
3186: v4 = a->a + diag[row - 3] + 4;
3187: for (n = 0; n < sz - 1; n += 2) {
3188: i1 = idx[0];
3189: i2 = idx[1];
3190: idx += 2;
3191: tmp0 = x[i1];
3192: tmp1 = x[i2];
3193: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3194: v1 += 2;
3195: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3196: v2 += 2;
3197: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3198: v3 += 2;
3199: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3200: v4 += 2;
3201: }
3203: if (n == sz - 1) {
3204: tmp0 = x[*idx];
3205: sum1 -= *v1 * tmp0;
3206: sum2 -= *v2 * tmp0;
3207: sum3 -= *v3 * tmp0;
3208: sum4 -= *v4 * tmp0;
3209: }
3210: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3211: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3212: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3213: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3214: break;
3215: case 5:
3217: sum1 = xb[row];
3218: sum2 = xb[row - 1];
3219: sum3 = xb[row - 2];
3220: sum4 = xb[row - 3];
3221: sum5 = xb[row - 4];
3222: v2 = a->a + diag[row - 1] + 2;
3223: v3 = a->a + diag[row - 2] + 3;
3224: v4 = a->a + diag[row - 3] + 4;
3225: v5 = a->a + diag[row - 4] + 5;
3226: for (n = 0; n < sz - 1; n += 2) {
3227: i1 = idx[0];
3228: i2 = idx[1];
3229: idx += 2;
3230: tmp0 = x[i1];
3231: tmp1 = x[i2];
3232: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3233: v1 += 2;
3234: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3235: v2 += 2;
3236: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3237: v3 += 2;
3238: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3239: v4 += 2;
3240: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3241: v5 += 2;
3242: }
3244: if (n == sz - 1) {
3245: tmp0 = x[*idx];
3246: sum1 -= *v1 * tmp0;
3247: sum2 -= *v2 * tmp0;
3248: sum3 -= *v3 * tmp0;
3249: sum4 -= *v4 * tmp0;
3250: sum5 -= *v5 * tmp0;
3251: }
3252: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3253: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3254: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3255: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3256: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3257: break;
3258: default:
3259: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3260: }
3261: }
3263: PetscCall(PetscLogFlops(a->nz));
3264: }
3265: its--;
3266: }
3267: while (its--) {
3268: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3269: for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3270: sz = diag[row] - ii[row];
3271: v1 = a->a + ii[row];
3272: idx = a->j + ii[row];
3273: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3274: switch (sizes[i]) {
3275: case 1:
3276: sum1 = b[row];
3277: for (n = 0; n < sz - 1; n += 2) {
3278: i1 = idx[0];
3279: i2 = idx[1];
3280: idx += 2;
3281: tmp0 = x[i1];
3282: tmp1 = x[i2];
3283: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3284: v1 += 2;
3285: }
3286: if (n == sz - 1) {
3287: tmp0 = x[*idx++];
3288: sum1 -= *v1 * tmp0;
3289: v1++;
3290: }
3291: t[row] = sum1;
3292: sz = ii[row + 1] - diag[row] - 1;
3293: idx = a->j + diag[row] + 1;
3294: v1 += 1;
3295: for (n = 0; n < sz - 1; n += 2) {
3296: i1 = idx[0];
3297: i2 = idx[1];
3298: idx += 2;
3299: tmp0 = x[i1];
3300: tmp1 = x[i2];
3301: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3302: v1 += 2;
3303: }
3304: if (n == sz - 1) {
3305: tmp0 = x[*idx++];
3306: sum1 -= *v1 * tmp0;
3307: }
3308: /* in MatSOR_SeqAIJ this line would be
3309: *
3310: * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3311: *
3312: * but omega == 1, so this becomes
3313: *
3314: * x[row] = sum1*(*ibdiag++);
3315: *
3316: */
3317: x[row] = sum1 * (*ibdiag);
3318: break;
3319: case 2:
3320: v2 = a->a + ii[row + 1];
3321: sum1 = b[row];
3322: sum2 = b[row + 1];
3323: for (n = 0; n < sz - 1; n += 2) {
3324: i1 = idx[0];
3325: i2 = idx[1];
3326: idx += 2;
3327: tmp0 = x[i1];
3328: tmp1 = x[i2];
3329: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3330: v1 += 2;
3331: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3332: v2 += 2;
3333: }
3334: if (n == sz - 1) {
3335: tmp0 = x[*idx++];
3336: sum1 -= v1[0] * tmp0;
3337: sum2 -= v2[0] * tmp0;
3338: v1++;
3339: v2++;
3340: }
3341: t[row] = sum1;
3342: t[row + 1] = sum2;
3343: sz = ii[row + 1] - diag[row] - 2;
3344: idx = a->j + diag[row] + 2;
3345: v1 += 2;
3346: v2 += 2;
3347: for (n = 0; n < sz - 1; n += 2) {
3348: i1 = idx[0];
3349: i2 = idx[1];
3350: idx += 2;
3351: tmp0 = x[i1];
3352: tmp1 = x[i2];
3353: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3354: v1 += 2;
3355: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3356: v2 += 2;
3357: }
3358: if (n == sz - 1) {
3359: tmp0 = x[*idx];
3360: sum1 -= v1[0] * tmp0;
3361: sum2 -= v2[0] * tmp0;
3362: }
3363: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3364: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3365: break;
3366: case 3:
3367: v2 = a->a + ii[row + 1];
3368: v3 = a->a + ii[row + 2];
3369: sum1 = b[row];
3370: sum2 = b[row + 1];
3371: sum3 = b[row + 2];
3372: for (n = 0; n < sz - 1; n += 2) {
3373: i1 = idx[0];
3374: i2 = idx[1];
3375: idx += 2;
3376: tmp0 = x[i1];
3377: tmp1 = x[i2];
3378: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3379: v1 += 2;
3380: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3381: v2 += 2;
3382: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3383: v3 += 2;
3384: }
3385: if (n == sz - 1) {
3386: tmp0 = x[*idx++];
3387: sum1 -= v1[0] * tmp0;
3388: sum2 -= v2[0] * tmp0;
3389: sum3 -= v3[0] * tmp0;
3390: v1++;
3391: v2++;
3392: v3++;
3393: }
3394: t[row] = sum1;
3395: t[row + 1] = sum2;
3396: t[row + 2] = sum3;
3397: sz = ii[row + 1] - diag[row] - 3;
3398: idx = a->j + diag[row] + 3;
3399: v1 += 3;
3400: v2 += 3;
3401: v3 += 3;
3402: for (n = 0; n < sz - 1; n += 2) {
3403: i1 = idx[0];
3404: i2 = idx[1];
3405: idx += 2;
3406: tmp0 = x[i1];
3407: tmp1 = x[i2];
3408: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3409: v1 += 2;
3410: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3411: v2 += 2;
3412: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3413: v3 += 2;
3414: }
3415: if (n == sz - 1) {
3416: tmp0 = x[*idx];
3417: sum1 -= v1[0] * tmp0;
3418: sum2 -= v2[0] * tmp0;
3419: sum3 -= v3[0] * tmp0;
3420: }
3421: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3422: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3423: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3424: break;
3425: case 4:
3426: v2 = a->a + ii[row + 1];
3427: v3 = a->a + ii[row + 2];
3428: v4 = a->a + ii[row + 3];
3429: sum1 = b[row];
3430: sum2 = b[row + 1];
3431: sum3 = b[row + 2];
3432: sum4 = b[row + 3];
3433: for (n = 0; n < sz - 1; n += 2) {
3434: i1 = idx[0];
3435: i2 = idx[1];
3436: idx += 2;
3437: tmp0 = x[i1];
3438: tmp1 = x[i2];
3439: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3440: v1 += 2;
3441: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3442: v2 += 2;
3443: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3444: v3 += 2;
3445: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3446: v4 += 2;
3447: }
3448: if (n == sz - 1) {
3449: tmp0 = x[*idx++];
3450: sum1 -= v1[0] * tmp0;
3451: sum2 -= v2[0] * tmp0;
3452: sum3 -= v3[0] * tmp0;
3453: sum4 -= v4[0] * tmp0;
3454: v1++;
3455: v2++;
3456: v3++;
3457: v4++;
3458: }
3459: t[row] = sum1;
3460: t[row + 1] = sum2;
3461: t[row + 2] = sum3;
3462: t[row + 3] = sum4;
3463: sz = ii[row + 1] - diag[row] - 4;
3464: idx = a->j + diag[row] + 4;
3465: v1 += 4;
3466: v2 += 4;
3467: v3 += 4;
3468: v4 += 4;
3469: for (n = 0; n < sz - 1; n += 2) {
3470: i1 = idx[0];
3471: i2 = idx[1];
3472: idx += 2;
3473: tmp0 = x[i1];
3474: tmp1 = x[i2];
3475: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3476: v1 += 2;
3477: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3478: v2 += 2;
3479: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3480: v3 += 2;
3481: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3482: v4 += 2;
3483: }
3484: if (n == sz - 1) {
3485: tmp0 = x[*idx];
3486: sum1 -= v1[0] * tmp0;
3487: sum2 -= v2[0] * tmp0;
3488: sum3 -= v3[0] * tmp0;
3489: sum4 -= v4[0] * tmp0;
3490: }
3491: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3492: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3493: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3494: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3495: break;
3496: case 5:
3497: v2 = a->a + ii[row + 1];
3498: v3 = a->a + ii[row + 2];
3499: v4 = a->a + ii[row + 3];
3500: v5 = a->a + ii[row + 4];
3501: sum1 = b[row];
3502: sum2 = b[row + 1];
3503: sum3 = b[row + 2];
3504: sum4 = b[row + 3];
3505: sum5 = b[row + 4];
3506: for (n = 0; n < sz - 1; n += 2) {
3507: i1 = idx[0];
3508: i2 = idx[1];
3509: idx += 2;
3510: tmp0 = x[i1];
3511: tmp1 = x[i2];
3512: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3513: v1 += 2;
3514: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3515: v2 += 2;
3516: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3517: v3 += 2;
3518: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3519: v4 += 2;
3520: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3521: v5 += 2;
3522: }
3523: if (n == sz - 1) {
3524: tmp0 = x[*idx++];
3525: sum1 -= v1[0] * tmp0;
3526: sum2 -= v2[0] * tmp0;
3527: sum3 -= v3[0] * tmp0;
3528: sum4 -= v4[0] * tmp0;
3529: sum5 -= v5[0] * tmp0;
3530: v1++;
3531: v2++;
3532: v3++;
3533: v4++;
3534: v5++;
3535: }
3536: t[row] = sum1;
3537: t[row + 1] = sum2;
3538: t[row + 2] = sum3;
3539: t[row + 3] = sum4;
3540: t[row + 4] = sum5;
3541: sz = ii[row + 1] - diag[row] - 5;
3542: idx = a->j + diag[row] + 5;
3543: v1 += 5;
3544: v2 += 5;
3545: v3 += 5;
3546: v4 += 5;
3547: v5 += 5;
3548: for (n = 0; n < sz - 1; n += 2) {
3549: i1 = idx[0];
3550: i2 = idx[1];
3551: idx += 2;
3552: tmp0 = x[i1];
3553: tmp1 = x[i2];
3554: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3555: v1 += 2;
3556: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3557: v2 += 2;
3558: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3559: v3 += 2;
3560: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3561: v4 += 2;
3562: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3563: v5 += 2;
3564: }
3565: if (n == sz - 1) {
3566: tmp0 = x[*idx];
3567: sum1 -= v1[0] * tmp0;
3568: sum2 -= v2[0] * tmp0;
3569: sum3 -= v3[0] * tmp0;
3570: sum4 -= v4[0] * tmp0;
3571: sum5 -= v5[0] * tmp0;
3572: }
3573: x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3574: x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3575: x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3576: x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3577: x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3578: break;
3579: default:
3580: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3581: }
3582: }
3583: xb = t;
3584: PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3585: } else xb = b;
3587: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3588: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3589: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3590: ibdiag -= sizes[i] * sizes[i];
3592: /* set RHS */
3593: if (xb == b) {
3594: /* whole (old way) */
3595: sz = ii[row + 1] - ii[row];
3596: idx = a->j + ii[row];
3597: switch (sizes[i]) {
3598: case 5:
3599: v5 = a->a + ii[row - 4]; /* fall through */
3600: case 4:
3601: v4 = a->a + ii[row - 3]; /* fall through */
3602: case 3:
3603: v3 = a->a + ii[row - 2]; /* fall through */
3604: case 2:
3605: v2 = a->a + ii[row - 1]; /* fall through */
3606: case 1:
3607: v1 = a->a + ii[row];
3608: break;
3609: default:
3610: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3611: }
3612: } else {
3613: /* upper, no diag */
3614: sz = ii[row + 1] - diag[row] - 1;
3615: idx = a->j + diag[row] + 1;
3616: switch (sizes[i]) {
3617: case 5:
3618: v5 = a->a + diag[row - 4] + 5; /* fall through */
3619: case 4:
3620: v4 = a->a + diag[row - 3] + 4; /* fall through */
3621: case 3:
3622: v3 = a->a + diag[row - 2] + 3; /* fall through */
3623: case 2:
3624: v2 = a->a + diag[row - 1] + 2; /* fall through */
3625: case 1:
3626: v1 = a->a + diag[row] + 1;
3627: }
3628: }
3629: /* set sum */
3630: switch (sizes[i]) {
3631: case 5:
3632: sum5 = xb[row - 4]; /* fall through */
3633: case 4:
3634: sum4 = xb[row - 3]; /* fall through */
3635: case 3:
3636: sum3 = xb[row - 2]; /* fall through */
3637: case 2:
3638: sum2 = xb[row - 1]; /* fall through */
3639: case 1:
3640: /* note that sum1 is associated with the last row */
3641: sum1 = xb[row];
3642: }
3643: /* do sums */
3644: for (n = 0; n < sz - 1; n += 2) {
3645: i1 = idx[0];
3646: i2 = idx[1];
3647: idx += 2;
3648: tmp0 = x[i1];
3649: tmp1 = x[i2];
3650: switch (sizes[i]) {
3651: case 5:
3652: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3653: v5 += 2; /* fall through */
3654: case 4:
3655: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3656: v4 += 2; /* fall through */
3657: case 3:
3658: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3659: v3 += 2; /* fall through */
3660: case 2:
3661: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3662: v2 += 2; /* fall through */
3663: case 1:
3664: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3665: v1 += 2;
3666: }
3667: }
3668: /* ragged edge */
3669: if (n == sz - 1) {
3670: tmp0 = x[*idx];
3671: switch (sizes[i]) {
3672: case 5:
3673: sum5 -= *v5 * tmp0; /* fall through */
3674: case 4:
3675: sum4 -= *v4 * tmp0; /* fall through */
3676: case 3:
3677: sum3 -= *v3 * tmp0; /* fall through */
3678: case 2:
3679: sum2 -= *v2 * tmp0; /* fall through */
3680: case 1:
3681: sum1 -= *v1 * tmp0;
3682: }
3683: }
3684: /* update */
3685: if (xb == b) {
3686: /* whole (old way) w/ diag */
3687: switch (sizes[i]) {
3688: case 5:
3689: x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3690: x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3691: x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3692: x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3693: x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3694: break;
3695: case 4:
3696: x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3697: x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3698: x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3699: x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3700: break;
3701: case 3:
3702: x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3703: x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3704: x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3705: break;
3706: case 2:
3707: x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3708: x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3709: break;
3710: case 1:
3711: x[row--] += sum1 * (*ibdiag);
3712: break;
3713: }
3714: } else {
3715: /* no diag so set = */
3716: switch (sizes[i]) {
3717: case 5:
3718: x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3719: x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3720: x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3721: x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3722: x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3723: break;
3724: case 4:
3725: x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3726: x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3727: x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3728: x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3729: break;
3730: case 3:
3731: x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3732: x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3733: x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3734: break;
3735: case 2:
3736: x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3737: x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3738: break;
3739: case 1:
3740: x[row--] = sum1 * (*ibdiag);
3741: break;
3742: }
3743: }
3744: }
3745: if (xb == b) {
3746: PetscCall(PetscLogFlops(2.0 * a->nz));
3747: } else {
3748: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3749: }
3750: }
3751: }
3752: if (flag & SOR_EISENSTAT) {
3753: /*
3754: Apply (U + D)^-1 where D is now the block diagonal
3755: */
3756: ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3757: for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3758: ibdiag -= sizes[i] * sizes[i];
3759: sz = ii[row + 1] - diag[row] - 1;
3760: v1 = a->a + diag[row] + 1;
3761: idx = a->j + diag[row] + 1;
3762: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3763: switch (sizes[i]) {
3764: case 1:
3766: sum1 = b[row];
3767: for (n = 0; n < sz - 1; n += 2) {
3768: i1 = idx[0];
3769: i2 = idx[1];
3770: idx += 2;
3771: tmp0 = x[i1];
3772: tmp1 = x[i2];
3773: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3774: v1 += 2;
3775: }
3777: if (n == sz - 1) {
3778: tmp0 = x[*idx];
3779: sum1 -= *v1 * tmp0;
3780: }
3781: x[row] = sum1 * (*ibdiag);
3782: row--;
3783: break;
3785: case 2:
3787: sum1 = b[row];
3788: sum2 = b[row - 1];
3789: /* note that sum1 is associated with the second of the two rows */
3790: v2 = a->a + diag[row - 1] + 2;
3791: for (n = 0; n < sz - 1; n += 2) {
3792: i1 = idx[0];
3793: i2 = idx[1];
3794: idx += 2;
3795: tmp0 = x[i1];
3796: tmp1 = x[i2];
3797: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3798: v1 += 2;
3799: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3800: v2 += 2;
3801: }
3803: if (n == sz - 1) {
3804: tmp0 = x[*idx];
3805: sum1 -= *v1 * tmp0;
3806: sum2 -= *v2 * tmp0;
3807: }
3808: x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3809: x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3810: row -= 2;
3811: break;
3812: case 3:
3814: sum1 = b[row];
3815: sum2 = b[row - 1];
3816: sum3 = b[row - 2];
3817: v2 = a->a + diag[row - 1] + 2;
3818: v3 = a->a + diag[row - 2] + 3;
3819: for (n = 0; n < sz - 1; n += 2) {
3820: i1 = idx[0];
3821: i2 = idx[1];
3822: idx += 2;
3823: tmp0 = x[i1];
3824: tmp1 = x[i2];
3825: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3826: v1 += 2;
3827: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3828: v2 += 2;
3829: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3830: v3 += 2;
3831: }
3833: if (n == sz - 1) {
3834: tmp0 = x[*idx];
3835: sum1 -= *v1 * tmp0;
3836: sum2 -= *v2 * tmp0;
3837: sum3 -= *v3 * tmp0;
3838: }
3839: x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3840: x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3841: x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3842: row -= 3;
3843: break;
3844: case 4:
3846: sum1 = b[row];
3847: sum2 = b[row - 1];
3848: sum3 = b[row - 2];
3849: sum4 = b[row - 3];
3850: v2 = a->a + diag[row - 1] + 2;
3851: v3 = a->a + diag[row - 2] + 3;
3852: v4 = a->a + diag[row - 3] + 4;
3853: for (n = 0; n < sz - 1; n += 2) {
3854: i1 = idx[0];
3855: i2 = idx[1];
3856: idx += 2;
3857: tmp0 = x[i1];
3858: tmp1 = x[i2];
3859: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3860: v1 += 2;
3861: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3862: v2 += 2;
3863: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3864: v3 += 2;
3865: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3866: v4 += 2;
3867: }
3869: if (n == sz - 1) {
3870: tmp0 = x[*idx];
3871: sum1 -= *v1 * tmp0;
3872: sum2 -= *v2 * tmp0;
3873: sum3 -= *v3 * tmp0;
3874: sum4 -= *v4 * tmp0;
3875: }
3876: x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3877: x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3878: x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3879: x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3880: row -= 4;
3881: break;
3882: case 5:
3884: sum1 = b[row];
3885: sum2 = b[row - 1];
3886: sum3 = b[row - 2];
3887: sum4 = b[row - 3];
3888: sum5 = b[row - 4];
3889: v2 = a->a + diag[row - 1] + 2;
3890: v3 = a->a + diag[row - 2] + 3;
3891: v4 = a->a + diag[row - 3] + 4;
3892: v5 = a->a + diag[row - 4] + 5;
3893: for (n = 0; n < sz - 1; n += 2) {
3894: i1 = idx[0];
3895: i2 = idx[1];
3896: idx += 2;
3897: tmp0 = x[i1];
3898: tmp1 = x[i2];
3899: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3900: v1 += 2;
3901: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3902: v2 += 2;
3903: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3904: v3 += 2;
3905: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3906: v4 += 2;
3907: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3908: v5 += 2;
3909: }
3911: if (n == sz - 1) {
3912: tmp0 = x[*idx];
3913: sum1 -= *v1 * tmp0;
3914: sum2 -= *v2 * tmp0;
3915: sum3 -= *v3 * tmp0;
3916: sum4 -= *v4 * tmp0;
3917: sum5 -= *v5 * tmp0;
3918: }
3919: x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3920: x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3921: x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3922: x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3923: x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3924: row -= 5;
3925: break;
3926: default:
3927: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3928: }
3929: }
3930: PetscCall(PetscLogFlops(a->nz));
3932: /*
3933: t = b - D x where D is the block diagonal
3934: */
3935: cnt = 0;
3936: for (i = 0, row = 0; i < m; i++) {
3937: switch (sizes[i]) {
3938: case 1:
3939: t[row] = b[row] - bdiag[cnt++] * x[row];
3940: row++;
3941: break;
3942: case 2:
3943: x1 = x[row];
3944: x2 = x[row + 1];
3945: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3946: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3947: t[row] = b[row] - tmp1;
3948: t[row + 1] = b[row + 1] - tmp2;
3949: row += 2;
3950: cnt += 4;
3951: break;
3952: case 3:
3953: x1 = x[row];
3954: x2 = x[row + 1];
3955: x3 = x[row + 2];
3956: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3957: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3958: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3959: t[row] = b[row] - tmp1;
3960: t[row + 1] = b[row + 1] - tmp2;
3961: t[row + 2] = b[row + 2] - tmp3;
3962: row += 3;
3963: cnt += 9;
3964: break;
3965: case 4:
3966: x1 = x[row];
3967: x2 = x[row + 1];
3968: x3 = x[row + 2];
3969: x4 = x[row + 3];
3970: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3971: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3972: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3973: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3974: t[row] = b[row] - tmp1;
3975: t[row + 1] = b[row + 1] - tmp2;
3976: t[row + 2] = b[row + 2] - tmp3;
3977: t[row + 3] = b[row + 3] - tmp4;
3978: row += 4;
3979: cnt += 16;
3980: break;
3981: case 5:
3982: x1 = x[row];
3983: x2 = x[row + 1];
3984: x3 = x[row + 2];
3985: x4 = x[row + 3];
3986: x5 = x[row + 4];
3987: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3988: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3989: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3990: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3991: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3992: t[row] = b[row] - tmp1;
3993: t[row + 1] = b[row + 1] - tmp2;
3994: t[row + 2] = b[row + 2] - tmp3;
3995: t[row + 3] = b[row + 3] - tmp4;
3996: t[row + 4] = b[row + 4] - tmp5;
3997: row += 5;
3998: cnt += 25;
3999: break;
4000: default:
4001: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4002: }
4003: }
4004: PetscCall(PetscLogFlops(m));
4006: /*
4007: Apply (L + D)^-1 where D is the block diagonal
4008: */
4009: for (i = 0, row = 0; i < m; i++) {
4010: sz = diag[row] - ii[row];
4011: v1 = a->a + ii[row];
4012: idx = a->j + ii[row];
4013: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4014: switch (sizes[i]) {
4015: case 1:
4017: sum1 = t[row];
4018: for (n = 0; n < sz - 1; n += 2) {
4019: i1 = idx[0];
4020: i2 = idx[1];
4021: idx += 2;
4022: tmp0 = t[i1];
4023: tmp1 = t[i2];
4024: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4025: v1 += 2;
4026: }
4028: if (n == sz - 1) {
4029: tmp0 = t[*idx];
4030: sum1 -= *v1 * tmp0;
4031: }
4032: x[row] += t[row] = sum1 * (*ibdiag++);
4033: row++;
4034: break;
4035: case 2:
4036: v2 = a->a + ii[row + 1];
4037: sum1 = t[row];
4038: sum2 = t[row + 1];
4039: for (n = 0; n < sz - 1; n += 2) {
4040: i1 = idx[0];
4041: i2 = idx[1];
4042: idx += 2;
4043: tmp0 = t[i1];
4044: tmp1 = t[i2];
4045: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4046: v1 += 2;
4047: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4048: v2 += 2;
4049: }
4051: if (n == sz - 1) {
4052: tmp0 = t[*idx];
4053: sum1 -= v1[0] * tmp0;
4054: sum2 -= v2[0] * tmp0;
4055: }
4056: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4057: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4058: ibdiag += 4;
4059: row += 2;
4060: break;
4061: case 3:
4062: v2 = a->a + ii[row + 1];
4063: v3 = a->a + ii[row + 2];
4064: sum1 = t[row];
4065: sum2 = t[row + 1];
4066: sum3 = t[row + 2];
4067: for (n = 0; n < sz - 1; n += 2) {
4068: i1 = idx[0];
4069: i2 = idx[1];
4070: idx += 2;
4071: tmp0 = t[i1];
4072: tmp1 = t[i2];
4073: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4074: v1 += 2;
4075: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4076: v2 += 2;
4077: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4078: v3 += 2;
4079: }
4081: if (n == sz - 1) {
4082: tmp0 = t[*idx];
4083: sum1 -= v1[0] * tmp0;
4084: sum2 -= v2[0] * tmp0;
4085: sum3 -= v3[0] * tmp0;
4086: }
4087: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4088: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4089: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4090: ibdiag += 9;
4091: row += 3;
4092: break;
4093: case 4:
4094: v2 = a->a + ii[row + 1];
4095: v3 = a->a + ii[row + 2];
4096: v4 = a->a + ii[row + 3];
4097: sum1 = t[row];
4098: sum2 = t[row + 1];
4099: sum3 = t[row + 2];
4100: sum4 = t[row + 3];
4101: for (n = 0; n < sz - 1; n += 2) {
4102: i1 = idx[0];
4103: i2 = idx[1];
4104: idx += 2;
4105: tmp0 = t[i1];
4106: tmp1 = t[i2];
4107: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4108: v1 += 2;
4109: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4110: v2 += 2;
4111: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4112: v3 += 2;
4113: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4114: v4 += 2;
4115: }
4117: if (n == sz - 1) {
4118: tmp0 = t[*idx];
4119: sum1 -= v1[0] * tmp0;
4120: sum2 -= v2[0] * tmp0;
4121: sum3 -= v3[0] * tmp0;
4122: sum4 -= v4[0] * tmp0;
4123: }
4124: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4125: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4126: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4127: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4128: ibdiag += 16;
4129: row += 4;
4130: break;
4131: case 5:
4132: v2 = a->a + ii[row + 1];
4133: v3 = a->a + ii[row + 2];
4134: v4 = a->a + ii[row + 3];
4135: v5 = a->a + ii[row + 4];
4136: sum1 = t[row];
4137: sum2 = t[row + 1];
4138: sum3 = t[row + 2];
4139: sum4 = t[row + 3];
4140: sum5 = t[row + 4];
4141: for (n = 0; n < sz - 1; n += 2) {
4142: i1 = idx[0];
4143: i2 = idx[1];
4144: idx += 2;
4145: tmp0 = t[i1];
4146: tmp1 = t[i2];
4147: sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4148: v1 += 2;
4149: sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4150: v2 += 2;
4151: sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4152: v3 += 2;
4153: sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4154: v4 += 2;
4155: sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4156: v5 += 2;
4157: }
4159: if (n == sz - 1) {
4160: tmp0 = t[*idx];
4161: sum1 -= v1[0] * tmp0;
4162: sum2 -= v2[0] * tmp0;
4163: sum3 -= v3[0] * tmp0;
4164: sum4 -= v4[0] * tmp0;
4165: sum5 -= v5[0] * tmp0;
4166: }
4167: x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4168: x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4169: x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4170: x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4171: x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4172: ibdiag += 25;
4173: row += 5;
4174: break;
4175: default:
4176: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4177: }
4178: }
4179: PetscCall(PetscLogFlops(a->nz));
4180: }
4181: PetscCall(VecRestoreArray(xx, &x));
4182: PetscCall(VecRestoreArrayRead(bb, &b));
4183: PetscFunctionReturn(PETSC_SUCCESS);
4184: }
4186: static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4187: {
4188: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4189: PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4190: const MatScalar *bdiag = a->inode.bdiag;
4191: const PetscScalar *b;
4192: PetscInt m = a->inode.node_count, cnt = 0, i, row;
4193: const PetscInt *sizes = a->inode.size;
4195: PetscFunctionBegin;
4196: PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4197: PetscCall(VecGetArray(xx, &x));
4198: PetscCall(VecGetArrayRead(bb, &b));
4199: cnt = 0;
4200: for (i = 0, row = 0; i < m; i++) {
4201: switch (sizes[i]) {
4202: case 1:
4203: x[row] = b[row] * bdiag[cnt++];
4204: row++;
4205: break;
4206: case 2:
4207: x1 = b[row];
4208: x2 = b[row + 1];
4209: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4210: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4211: x[row++] = tmp1;
4212: x[row++] = tmp2;
4213: cnt += 4;
4214: break;
4215: case 3:
4216: x1 = b[row];
4217: x2 = b[row + 1];
4218: x3 = b[row + 2];
4219: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4220: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4221: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4222: x[row++] = tmp1;
4223: x[row++] = tmp2;
4224: x[row++] = tmp3;
4225: cnt += 9;
4226: break;
4227: case 4:
4228: x1 = b[row];
4229: x2 = b[row + 1];
4230: x3 = b[row + 2];
4231: x4 = b[row + 3];
4232: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4233: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4234: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4235: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4236: x[row++] = tmp1;
4237: x[row++] = tmp2;
4238: x[row++] = tmp3;
4239: x[row++] = tmp4;
4240: cnt += 16;
4241: break;
4242: case 5:
4243: x1 = b[row];
4244: x2 = b[row + 1];
4245: x3 = b[row + 2];
4246: x4 = b[row + 3];
4247: x5 = b[row + 4];
4248: tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4249: tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4250: tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4251: tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4252: tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4253: x[row++] = tmp1;
4254: x[row++] = tmp2;
4255: x[row++] = tmp3;
4256: x[row++] = tmp4;
4257: x[row++] = tmp5;
4258: cnt += 25;
4259: break;
4260: default:
4261: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4262: }
4263: }
4264: PetscCall(PetscLogFlops(2.0 * cnt));
4265: PetscCall(VecRestoreArray(xx, &x));
4266: PetscCall(VecRestoreArrayRead(bb, &b));
4267: PetscFunctionReturn(PETSC_SUCCESS);
4268: }
4270: static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4271: {
4272: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4274: PetscFunctionBegin;
4275: a->inode.node_count = 0;
4276: a->inode.use = PETSC_FALSE;
4277: a->inode.checked = PETSC_FALSE;
4278: a->inode.mat_nonzerostate = -1;
4279: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4280: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4281: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4282: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4283: A->ops->coloringpatch = NULL;
4284: A->ops->multdiagonalblock = NULL;
4285: if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4286: PetscFunctionReturn(PETSC_SUCCESS);
4287: }
4289: /*
4290: samestructure indicates that the matrix has not changed its nonzero structure so we
4291: do not need to recompute the inodes
4292: */
4293: PetscErrorCode MatSeqAIJCheckInode(Mat A)
4294: {
4295: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4296: PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size;
4297: PetscBool flag;
4298: const PetscInt *idx, *idy, *ii;
4300: PetscFunctionBegin;
4301: if (!a->inode.use) {
4302: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4303: PetscCall(PetscFree(a->inode.size));
4304: PetscFunctionReturn(PETSC_SUCCESS);
4305: }
4306: if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4308: m = A->rmap->n;
4309: if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4310: ns = a->inode.size;
4312: i = 0;
4313: node_count = 0;
4314: idx = a->j;
4315: ii = a->i;
4316: if (idx) {
4317: while (i < m) { /* For each row */
4318: nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4319: /* Limits the number of elements in a node to 'a->inode.limit' */
4320: for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4321: nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4322: if (nzy != nzx) break;
4323: idy += nzx; /* Same nonzero pattern */
4324: PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4325: if (!flag) break;
4326: }
4327: ns[node_count++] = blk_size;
4328: idx += blk_size * nzx;
4329: i = j;
4330: }
4331: }
4332: /* If not enough inodes found,, do not use inode version of the routines */
4333: if (!m || !idx || node_count > .8 * m) {
4334: PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4335: PetscCall(PetscFree(a->inode.size));
4336: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4337: } else {
4338: if (!A->factortype) {
4339: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4340: if (A->rmap->n == A->cmap->n) {
4341: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4342: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4343: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4344: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4345: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4346: }
4347: } else {
4348: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4349: }
4350: a->inode.node_count = node_count;
4351: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4352: }
4353: a->inode.checked = PETSC_TRUE;
4354: a->inode.mat_nonzerostate = A->nonzerostate;
4355: PetscFunctionReturn(PETSC_SUCCESS);
4356: }
4358: PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4359: {
4360: Mat B = *C;
4361: Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4362: PetscInt m = A->rmap->n;
4364: PetscFunctionBegin;
4365: c->inode.use = a->inode.use;
4366: c->inode.limit = a->inode.limit;
4367: c->inode.max_limit = a->inode.max_limit;
4368: c->inode.checked = PETSC_FALSE;
4369: c->inode.size = NULL;
4370: c->inode.node_count = 0;
4371: c->inode.ibdiagvalid = PETSC_FALSE;
4372: c->inode.ibdiag = NULL;
4373: c->inode.bdiag = NULL;
4374: c->inode.mat_nonzerostate = -1;
4375: if (a->inode.use) {
4376: if (a->inode.checked && a->inode.size) {
4377: PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4378: PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4380: c->inode.checked = PETSC_TRUE;
4381: c->inode.node_count = a->inode.node_count;
4382: c->inode.mat_nonzerostate = (*C)->nonzerostate;
4383: }
4384: /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4385: if (!B->factortype) {
4386: B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4387: B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4388: B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4389: B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4390: B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4391: B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4392: } else {
4393: B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4394: }
4395: }
4396: PetscFunctionReturn(PETSC_SUCCESS);
4397: }
4399: static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4400: {
4401: PetscInt k;
4402: const PetscInt *vi;
4404: PetscFunctionBegin;
4405: vi = aj + ai[row];
4406: for (k = 0; k < nzl; k++) cols[k] = vi[k];
4407: vi = aj + adiag[row];
4408: cols[nzl] = vi[0];
4409: vi = aj + adiag[row + 1] + 1;
4410: for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4411: PetscFunctionReturn(PETSC_SUCCESS);
4412: }
4413: /*
4414: MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4415: Modified from MatSeqAIJCheckInode().
4417: Input Parameters:
4418: . Mat A - ILU or LU matrix factor
4420: */
4421: PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4422: {
4423: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4424: PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4425: PetscInt *cols1, *cols2, *ns;
4426: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4427: PetscBool flag;
4429: PetscFunctionBegin;
4430: if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4431: if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4433: m = A->rmap->n;
4434: if (a->inode.size) ns = a->inode.size;
4435: else PetscCall(PetscMalloc1(m + 1, &ns));
4437: i = 0;
4438: node_count = 0;
4439: PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4440: while (i < m) { /* For each row */
4441: nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */
4442: nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4443: nzx = nzl1 + nzu1 + 1;
4444: PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4446: /* Limits the number of elements in a node to 'a->inode.limit' */
4447: for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4448: nzl2 = ai[j + 1] - ai[j];
4449: nzu2 = adiag[j] - adiag[j + 1] - 1;
4450: nzy = nzl2 + nzu2 + 1;
4451: if (nzy != nzx) break;
4452: PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4453: PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4454: if (!flag) break;
4455: }
4456: ns[node_count++] = blk_size;
4457: i = j;
4458: }
4459: PetscCall(PetscFree2(cols1, cols2));
4460: /* If not enough inodes found,, do not use inode version of the routines */
4461: if (!m || node_count > .8 * m) {
4462: PetscCall(PetscFree(ns));
4464: a->inode.node_count = 0;
4465: a->inode.size = NULL;
4466: a->inode.use = PETSC_FALSE;
4468: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4469: } else {
4470: A->ops->mult = NULL;
4471: A->ops->sor = NULL;
4472: A->ops->multadd = NULL;
4473: A->ops->getrowij = NULL;
4474: A->ops->restorerowij = NULL;
4475: A->ops->getcolumnij = NULL;
4476: A->ops->restorecolumnij = NULL;
4477: A->ops->coloringpatch = NULL;
4478: A->ops->multdiagonalblock = NULL;
4479: a->inode.node_count = node_count;
4480: a->inode.size = ns;
4482: PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4483: }
4484: a->inode.checked = PETSC_TRUE;
4485: PetscFunctionReturn(PETSC_SUCCESS);
4486: }
4488: PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4489: {
4490: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4492: PetscFunctionBegin;
4493: a->inode.ibdiagvalid = PETSC_FALSE;
4494: PetscFunctionReturn(PETSC_SUCCESS);
4495: }
4497: /*
4498: This is really ugly. if inodes are used this replaces the
4499: permutations with ones that correspond to rows/cols of the matrix
4500: rather then inode blocks
4501: */
4502: PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4503: {
4504: PetscFunctionBegin;
4505: PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4506: PetscFunctionReturn(PETSC_SUCCESS);
4507: }
4509: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4510: {
4511: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4512: PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4513: const PetscInt *ridx, *cidx;
4514: PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4515: PetscInt nslim_col, *ns_col;
4516: IS ris = *rperm, cis = *cperm;
4518: PetscFunctionBegin;
4519: if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */
4520: if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4522: PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4523: PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
4524: PetscCall(PetscMalloc2(m, &permr, n, &permc));
4526: PetscCall(ISGetIndices(ris, &ridx));
4527: PetscCall(ISGetIndices(cis, &cidx));
4529: /* Form the inode structure for the rows of permuted matric using inv perm*/
4530: for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
4532: /* Construct the permutations for rows*/
4533: for (i = 0, row = 0; i < nslim_row; ++i) {
4534: indx = ridx[i];
4535: start_val = tns[indx];
4536: end_val = tns[indx + 1];
4537: for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4538: }
4540: /* Form the inode structure for the columns of permuted matrix using inv perm*/
4541: for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
4543: /* Construct permutations for columns */
4544: for (i = 0, col = 0; i < nslim_col; ++i) {
4545: indx = cidx[i];
4546: start_val = tns[indx];
4547: end_val = tns[indx + 1];
4548: for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4549: }
4551: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4552: PetscCall(ISSetPermutation(*rperm));
4553: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4554: PetscCall(ISSetPermutation(*cperm));
4556: PetscCall(ISRestoreIndices(ris, &ridx));
4557: PetscCall(ISRestoreIndices(cis, &cidx));
4559: PetscCall(PetscFree(ns_col));
4560: PetscCall(PetscFree2(permr, permc));
4561: PetscCall(ISDestroy(&cis));
4562: PetscCall(ISDestroy(&ris));
4563: PetscCall(PetscFree(tns));
4564: PetscFunctionReturn(PETSC_SUCCESS);
4565: }
4567: /*@C
4568: MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4570: Not Collective
4572: Input Parameter:
4573: . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4575: Output Parameters:
4576: + node_count - no of inodes present in the matrix.
4577: . sizes - an array of size `node_count`, with the sizes of each inode.
4578: - limit - the max size used to generate the inodes.
4580: Level: advanced
4582: Note:
4583: It should be called after the matrix is assembled.
4584: The contents of the sizes[] array should not be changed.
4585: `NULL` may be passed for information not needed
4587: .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4588: @*/
4589: PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4590: {
4591: PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4593: PetscFunctionBegin;
4594: PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4595: PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4596: if (f) PetscCall((*f)(A, node_count, sizes, limit));
4597: PetscFunctionReturn(PETSC_SUCCESS);
4598: }
4600: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4601: {
4602: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4604: PetscFunctionBegin;
4605: if (node_count) *node_count = a->inode.node_count;
4606: if (sizes) *sizes = a->inode.size;
4607: if (limit) *limit = a->inode.limit;
4608: PetscFunctionReturn(PETSC_SUCCESS);
4609: }