Actual source code: baijfact9.c
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <petsc/private/kernels/blockinvert.h>
8: /*
9: Version for when blocks are 5 by 5
10: */
11: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_inplace(Mat C, Mat A, const MatFactorInfo *info)
12: {
13: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
14: IS isrow = b->row, isicol = b->icol;
15: const PetscInt *r, *ic, *bi = b->i, *bj = b->j, *ajtmpold, *ajtmp;
16: PetscInt i, j, n = a->mbs, nz, row, idx, ipvt[5];
17: const PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
18: MatScalar *w, *pv, *rtmp, *x, *pc;
19: const MatScalar *v, *aa = a->a;
20: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
21: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
22: MatScalar x17, x18, x19, x20, x21, x22, x23, x24, x25, p10, p11, p12, p13, p14;
23: MatScalar p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, p25, m10, m11, m12;
24: MatScalar m13, m14, m15, m16, m17, m18, m19, m20, m21, m22, m23, m24, m25;
25: MatScalar *ba = b->a, work[25];
26: PetscReal shift = info->shiftamount;
27: PetscBool allowzeropivot, zeropivotdetected;
29: PetscFunctionBegin;
30: allowzeropivot = PetscNot(A->erroriffailure);
31: PetscCall(ISGetIndices(isrow, &r));
32: PetscCall(ISGetIndices(isicol, &ic));
33: PetscCall(PetscMalloc1(25 * (n + 1), &rtmp));
35: #define PETSC_USE_MEMZERO 1
36: #define PETSC_USE_MEMCPY 1
38: for (i = 0; i < n; i++) {
39: nz = bi[i + 1] - bi[i];
40: ajtmp = bj + bi[i];
41: for (j = 0; j < nz; j++) {
42: #if defined(PETSC_USE_MEMZERO)
43: PetscCall(PetscArrayzero(rtmp + 25 * ajtmp[j], 25));
44: #else
45: x = rtmp + 25 * ajtmp[j];
46: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
47: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
48: x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
49: #endif
50: }
51: /* load in initial (unfactored row) */
52: idx = r[i];
53: nz = ai[idx + 1] - ai[idx];
54: ajtmpold = aj + ai[idx];
55: v = aa + 25 * ai[idx];
56: for (j = 0; j < nz; j++) {
57: #if defined(PETSC_USE_MEMCPY)
58: PetscCall(PetscArraycpy(rtmp + 25 * ic[ajtmpold[j]], v, 25));
59: #else
60: x = rtmp + 25 * ic[ajtmpold[j]];
61: x[0] = v[0];
62: x[1] = v[1];
63: x[2] = v[2];
64: x[3] = v[3];
65: x[4] = v[4];
66: x[5] = v[5];
67: x[6] = v[6];
68: x[7] = v[7];
69: x[8] = v[8];
70: x[9] = v[9];
71: x[10] = v[10];
72: x[11] = v[11];
73: x[12] = v[12];
74: x[13] = v[13];
75: x[14] = v[14];
76: x[15] = v[15];
77: x[16] = v[16];
78: x[17] = v[17];
79: x[18] = v[18];
80: x[19] = v[19];
81: x[20] = v[20];
82: x[21] = v[21];
83: x[22] = v[22];
84: x[23] = v[23];
85: x[24] = v[24];
86: #endif
87: v += 25;
88: }
89: row = *ajtmp++;
90: while (row < i) {
91: pc = rtmp + 25 * row;
92: p1 = pc[0];
93: p2 = pc[1];
94: p3 = pc[2];
95: p4 = pc[3];
96: p5 = pc[4];
97: p6 = pc[5];
98: p7 = pc[6];
99: p8 = pc[7];
100: p9 = pc[8];
101: p10 = pc[9];
102: p11 = pc[10];
103: p12 = pc[11];
104: p13 = pc[12];
105: p14 = pc[13];
106: p15 = pc[14];
107: p16 = pc[15];
108: p17 = pc[16];
109: p18 = pc[17];
110: p19 = pc[18];
111: p20 = pc[19];
112: p21 = pc[20];
113: p22 = pc[21];
114: p23 = pc[22];
115: p24 = pc[23];
116: p25 = pc[24];
117: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) {
118: pv = ba + 25 * diag_offset[row];
119: pj = bj + diag_offset[row] + 1;
120: x1 = pv[0];
121: x2 = pv[1];
122: x3 = pv[2];
123: x4 = pv[3];
124: x5 = pv[4];
125: x6 = pv[5];
126: x7 = pv[6];
127: x8 = pv[7];
128: x9 = pv[8];
129: x10 = pv[9];
130: x11 = pv[10];
131: x12 = pv[11];
132: x13 = pv[12];
133: x14 = pv[13];
134: x15 = pv[14];
135: x16 = pv[15];
136: x17 = pv[16];
137: x18 = pv[17];
138: x19 = pv[18];
139: x20 = pv[19];
140: x21 = pv[20];
141: x22 = pv[21];
142: x23 = pv[22];
143: x24 = pv[23];
144: x25 = pv[24];
145: pc[0] = m1 = p1 * x1 + p6 * x2 + p11 * x3 + p16 * x4 + p21 * x5;
146: pc[1] = m2 = p2 * x1 + p7 * x2 + p12 * x3 + p17 * x4 + p22 * x5;
147: pc[2] = m3 = p3 * x1 + p8 * x2 + p13 * x3 + p18 * x4 + p23 * x5;
148: pc[3] = m4 = p4 * x1 + p9 * x2 + p14 * x3 + p19 * x4 + p24 * x5;
149: pc[4] = m5 = p5 * x1 + p10 * x2 + p15 * x3 + p20 * x4 + p25 * x5;
151: pc[5] = m6 = p1 * x6 + p6 * x7 + p11 * x8 + p16 * x9 + p21 * x10;
152: pc[6] = m7 = p2 * x6 + p7 * x7 + p12 * x8 + p17 * x9 + p22 * x10;
153: pc[7] = m8 = p3 * x6 + p8 * x7 + p13 * x8 + p18 * x9 + p23 * x10;
154: pc[8] = m9 = p4 * x6 + p9 * x7 + p14 * x8 + p19 * x9 + p24 * x10;
155: pc[9] = m10 = p5 * x6 + p10 * x7 + p15 * x8 + p20 * x9 + p25 * x10;
157: pc[10] = m11 = p1 * x11 + p6 * x12 + p11 * x13 + p16 * x14 + p21 * x15;
158: pc[11] = m12 = p2 * x11 + p7 * x12 + p12 * x13 + p17 * x14 + p22 * x15;
159: pc[12] = m13 = p3 * x11 + p8 * x12 + p13 * x13 + p18 * x14 + p23 * x15;
160: pc[13] = m14 = p4 * x11 + p9 * x12 + p14 * x13 + p19 * x14 + p24 * x15;
161: pc[14] = m15 = p5 * x11 + p10 * x12 + p15 * x13 + p20 * x14 + p25 * x15;
163: pc[15] = m16 = p1 * x16 + p6 * x17 + p11 * x18 + p16 * x19 + p21 * x20;
164: pc[16] = m17 = p2 * x16 + p7 * x17 + p12 * x18 + p17 * x19 + p22 * x20;
165: pc[17] = m18 = p3 * x16 + p8 * x17 + p13 * x18 + p18 * x19 + p23 * x20;
166: pc[18] = m19 = p4 * x16 + p9 * x17 + p14 * x18 + p19 * x19 + p24 * x20;
167: pc[19] = m20 = p5 * x16 + p10 * x17 + p15 * x18 + p20 * x19 + p25 * x20;
169: pc[20] = m21 = p1 * x21 + p6 * x22 + p11 * x23 + p16 * x24 + p21 * x25;
170: pc[21] = m22 = p2 * x21 + p7 * x22 + p12 * x23 + p17 * x24 + p22 * x25;
171: pc[22] = m23 = p3 * x21 + p8 * x22 + p13 * x23 + p18 * x24 + p23 * x25;
172: pc[23] = m24 = p4 * x21 + p9 * x22 + p14 * x23 + p19 * x24 + p24 * x25;
173: pc[24] = m25 = p5 * x21 + p10 * x22 + p15 * x23 + p20 * x24 + p25 * x25;
175: nz = bi[row + 1] - diag_offset[row] - 1;
176: pv += 25;
177: for (j = 0; j < nz; j++) {
178: x1 = pv[0];
179: x2 = pv[1];
180: x3 = pv[2];
181: x4 = pv[3];
182: x5 = pv[4];
183: x6 = pv[5];
184: x7 = pv[6];
185: x8 = pv[7];
186: x9 = pv[8];
187: x10 = pv[9];
188: x11 = pv[10];
189: x12 = pv[11];
190: x13 = pv[12];
191: x14 = pv[13];
192: x15 = pv[14];
193: x16 = pv[15];
194: x17 = pv[16];
195: x18 = pv[17];
196: x19 = pv[18];
197: x20 = pv[19];
198: x21 = pv[20];
199: x22 = pv[21];
200: x23 = pv[22];
201: x24 = pv[23];
202: x25 = pv[24];
203: x = rtmp + 25 * pj[j];
204: x[0] -= m1 * x1 + m6 * x2 + m11 * x3 + m16 * x4 + m21 * x5;
205: x[1] -= m2 * x1 + m7 * x2 + m12 * x3 + m17 * x4 + m22 * x5;
206: x[2] -= m3 * x1 + m8 * x2 + m13 * x3 + m18 * x4 + m23 * x5;
207: x[3] -= m4 * x1 + m9 * x2 + m14 * x3 + m19 * x4 + m24 * x5;
208: x[4] -= m5 * x1 + m10 * x2 + m15 * x3 + m20 * x4 + m25 * x5;
210: x[5] -= m1 * x6 + m6 * x7 + m11 * x8 + m16 * x9 + m21 * x10;
211: x[6] -= m2 * x6 + m7 * x7 + m12 * x8 + m17 * x9 + m22 * x10;
212: x[7] -= m3 * x6 + m8 * x7 + m13 * x8 + m18 * x9 + m23 * x10;
213: x[8] -= m4 * x6 + m9 * x7 + m14 * x8 + m19 * x9 + m24 * x10;
214: x[9] -= m5 * x6 + m10 * x7 + m15 * x8 + m20 * x9 + m25 * x10;
216: x[10] -= m1 * x11 + m6 * x12 + m11 * x13 + m16 * x14 + m21 * x15;
217: x[11] -= m2 * x11 + m7 * x12 + m12 * x13 + m17 * x14 + m22 * x15;
218: x[12] -= m3 * x11 + m8 * x12 + m13 * x13 + m18 * x14 + m23 * x15;
219: x[13] -= m4 * x11 + m9 * x12 + m14 * x13 + m19 * x14 + m24 * x15;
220: x[14] -= m5 * x11 + m10 * x12 + m15 * x13 + m20 * x14 + m25 * x15;
222: x[15] -= m1 * x16 + m6 * x17 + m11 * x18 + m16 * x19 + m21 * x20;
223: x[16] -= m2 * x16 + m7 * x17 + m12 * x18 + m17 * x19 + m22 * x20;
224: x[17] -= m3 * x16 + m8 * x17 + m13 * x18 + m18 * x19 + m23 * x20;
225: x[18] -= m4 * x16 + m9 * x17 + m14 * x18 + m19 * x19 + m24 * x20;
226: x[19] -= m5 * x16 + m10 * x17 + m15 * x18 + m20 * x19 + m25 * x20;
228: x[20] -= m1 * x21 + m6 * x22 + m11 * x23 + m16 * x24 + m21 * x25;
229: x[21] -= m2 * x21 + m7 * x22 + m12 * x23 + m17 * x24 + m22 * x25;
230: x[22] -= m3 * x21 + m8 * x22 + m13 * x23 + m18 * x24 + m23 * x25;
231: x[23] -= m4 * x21 + m9 * x22 + m14 * x23 + m19 * x24 + m24 * x25;
232: x[24] -= m5 * x21 + m10 * x22 + m15 * x23 + m20 * x24 + m25 * x25;
234: pv += 25;
235: }
236: PetscCall(PetscLogFlops(250.0 * nz + 225.0));
237: }
238: row = *ajtmp++;
239: }
240: /* finished row so stick it into b->a */
241: pv = ba + 25 * bi[i];
242: pj = bj + bi[i];
243: nz = bi[i + 1] - bi[i];
244: for (j = 0; j < nz; j++) {
245: #if defined(PETSC_USE_MEMCPY)
246: PetscCall(PetscArraycpy(pv, rtmp + 25 * pj[j], 25));
247: #else
248: x = rtmp + 25 * pj[j];
249: pv[0] = x[0];
250: pv[1] = x[1];
251: pv[2] = x[2];
252: pv[3] = x[3];
253: pv[4] = x[4];
254: pv[5] = x[5];
255: pv[6] = x[6];
256: pv[7] = x[7];
257: pv[8] = x[8];
258: pv[9] = x[9];
259: pv[10] = x[10];
260: pv[11] = x[11];
261: pv[12] = x[12];
262: pv[13] = x[13];
263: pv[14] = x[14];
264: pv[15] = x[15];
265: pv[16] = x[16];
266: pv[17] = x[17];
267: pv[18] = x[18];
268: pv[19] = x[19];
269: pv[20] = x[20];
270: pv[21] = x[21];
271: pv[22] = x[22];
272: pv[23] = x[23];
273: pv[24] = x[24];
274: #endif
275: pv += 25;
276: }
277: /* invert diagonal block */
278: w = ba + 25 * diag_offset[i];
279: PetscCall(PetscKernel_A_gets_inverse_A_5(w, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
280: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
281: }
283: PetscCall(PetscFree(rtmp));
284: PetscCall(ISRestoreIndices(isicol, &ic));
285: PetscCall(ISRestoreIndices(isrow, &r));
287: C->ops->solve = MatSolve_SeqBAIJ_5_inplace;
288: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_inplace;
289: C->assembled = PETSC_TRUE;
291: PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * b->mbs)); /* from inverting diagonal blocks */
292: PetscFunctionReturn(PETSC_SUCCESS);
293: }
295: /* MatLUFactorNumeric_SeqBAIJ_5 -
296: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
297: PetscKernel_A_gets_A_times_B()
298: PetscKernel_A_gets_A_minus_B_times_C()
299: PetscKernel_A_gets_inverse_A()
300: */
302: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5(Mat B, Mat A, const MatFactorInfo *info)
303: {
304: Mat C = B;
305: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
306: IS isrow = b->row, isicol = b->icol;
307: const PetscInt *r, *ic;
308: PetscInt i, j, k, nz, nzL, row;
309: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
310: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
311: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a, work[25];
312: PetscInt flg, ipvt[5];
313: PetscReal shift = info->shiftamount;
314: PetscBool allowzeropivot, zeropivotdetected;
316: PetscFunctionBegin;
317: allowzeropivot = PetscNot(A->erroriffailure);
318: PetscCall(ISGetIndices(isrow, &r));
319: PetscCall(ISGetIndices(isicol, &ic));
321: /* generate work space needed by the factorization */
322: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
323: PetscCall(PetscArrayzero(rtmp, bs2 * n));
325: for (i = 0; i < n; i++) {
326: /* zero rtmp */
327: /* L part */
328: nz = bi[i + 1] - bi[i];
329: bjtmp = bj + bi[i];
330: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
332: /* U part */
333: nz = bdiag[i] - bdiag[i + 1];
334: bjtmp = bj + bdiag[i + 1] + 1;
335: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
337: /* load in initial (unfactored row) */
338: nz = ai[r[i] + 1] - ai[r[i]];
339: ajtmp = aj + ai[r[i]];
340: v = aa + bs2 * ai[r[i]];
341: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
343: /* elimination */
344: bjtmp = bj + bi[i];
345: nzL = bi[i + 1] - bi[i];
346: for (k = 0; k < nzL; k++) {
347: row = bjtmp[k];
348: pc = rtmp + bs2 * row;
349: for (flg = 0, j = 0; j < bs2; j++) {
350: if (pc[j] != 0.0) {
351: flg = 1;
352: break;
353: }
354: }
355: if (flg) {
356: pv = b->a + bs2 * bdiag[row];
357: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
358: PetscCall(PetscKernel_A_gets_A_times_B_5(pc, pv, mwork));
360: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
361: pv = b->a + bs2 * (bdiag[row + 1] + 1);
362: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
363: for (j = 0; j < nz; j++) {
364: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
365: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
366: v = rtmp + bs2 * pj[j];
367: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_5(v, pc, pv));
368: pv += bs2;
369: }
370: PetscCall(PetscLogFlops(250.0 * nz + 225)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
371: }
372: }
374: /* finished row so stick it into b->a */
375: /* L part */
376: pv = b->a + bs2 * bi[i];
377: pj = b->j + bi[i];
378: nz = bi[i + 1] - bi[i];
379: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
381: /* Mark diagonal and invert diagonal for simpler triangular solves */
382: pv = b->a + bs2 * bdiag[i];
383: pj = b->j + bdiag[i];
384: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
385: PetscCall(PetscKernel_A_gets_inverse_A_5(pv, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
386: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
388: /* U part */
389: pv = b->a + bs2 * (bdiag[i + 1] + 1);
390: pj = b->j + bdiag[i + 1] + 1;
391: nz = bdiag[i] - bdiag[i + 1] - 1;
392: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
393: }
395: PetscCall(PetscFree2(rtmp, mwork));
396: PetscCall(ISRestoreIndices(isicol, &ic));
397: PetscCall(ISRestoreIndices(isrow, &r));
399: C->ops->solve = MatSolve_SeqBAIJ_5;
400: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5;
401: C->assembled = PETSC_TRUE;
403: PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * n)); /* from inverting diagonal blocks */
404: PetscFunctionReturn(PETSC_SUCCESS);
405: }
407: /*
408: Version for when blocks are 5 by 5 Using natural ordering
409: */
410: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
411: {
412: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
413: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j, ipvt[5];
414: PetscInt *ajtmpold, *ajtmp, nz, row;
415: PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
416: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
417: MatScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
418: MatScalar x16, x17, x18, x19, x20, x21, x22, x23, x24, x25;
419: MatScalar p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15;
420: MatScalar p16, p17, p18, p19, p20, p21, p22, p23, p24, p25;
421: MatScalar m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
422: MatScalar m16, m17, m18, m19, m20, m21, m22, m23, m24, m25;
423: MatScalar *ba = b->a, *aa = a->a, work[25];
424: PetscReal shift = info->shiftamount;
425: PetscBool allowzeropivot, zeropivotdetected;
427: PetscFunctionBegin;
428: allowzeropivot = PetscNot(A->erroriffailure);
429: PetscCall(PetscMalloc1(25 * (n + 1), &rtmp));
430: for (i = 0; i < n; i++) {
431: nz = bi[i + 1] - bi[i];
432: ajtmp = bj + bi[i];
433: for (j = 0; j < nz; j++) {
434: x = rtmp + 25 * ajtmp[j];
435: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
436: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
437: x[16] = x[17] = x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
438: }
439: /* load in initial (unfactored row) */
440: nz = ai[i + 1] - ai[i];
441: ajtmpold = aj + ai[i];
442: v = aa + 25 * ai[i];
443: for (j = 0; j < nz; j++) {
444: x = rtmp + 25 * ajtmpold[j];
445: x[0] = v[0];
446: x[1] = v[1];
447: x[2] = v[2];
448: x[3] = v[3];
449: x[4] = v[4];
450: x[5] = v[5];
451: x[6] = v[6];
452: x[7] = v[7];
453: x[8] = v[8];
454: x[9] = v[9];
455: x[10] = v[10];
456: x[11] = v[11];
457: x[12] = v[12];
458: x[13] = v[13];
459: x[14] = v[14];
460: x[15] = v[15];
461: x[16] = v[16];
462: x[17] = v[17];
463: x[18] = v[18];
464: x[19] = v[19];
465: x[20] = v[20];
466: x[21] = v[21];
467: x[22] = v[22];
468: x[23] = v[23];
469: x[24] = v[24];
470: v += 25;
471: }
472: row = *ajtmp++;
473: while (row < i) {
474: pc = rtmp + 25 * row;
475: p1 = pc[0];
476: p2 = pc[1];
477: p3 = pc[2];
478: p4 = pc[3];
479: p5 = pc[4];
480: p6 = pc[5];
481: p7 = pc[6];
482: p8 = pc[7];
483: p9 = pc[8];
484: p10 = pc[9];
485: p11 = pc[10];
486: p12 = pc[11];
487: p13 = pc[12];
488: p14 = pc[13];
489: p15 = pc[14];
490: p16 = pc[15];
491: p17 = pc[16];
492: p18 = pc[17];
493: p19 = pc[18];
494: p20 = pc[19];
495: p21 = pc[20];
496: p22 = pc[21];
497: p23 = pc[22];
498: p24 = pc[23];
499: p25 = pc[24];
500: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) {
501: pv = ba + 25 * diag_offset[row];
502: pj = bj + diag_offset[row] + 1;
503: x1 = pv[0];
504: x2 = pv[1];
505: x3 = pv[2];
506: x4 = pv[3];
507: x5 = pv[4];
508: x6 = pv[5];
509: x7 = pv[6];
510: x8 = pv[7];
511: x9 = pv[8];
512: x10 = pv[9];
513: x11 = pv[10];
514: x12 = pv[11];
515: x13 = pv[12];
516: x14 = pv[13];
517: x15 = pv[14];
518: x16 = pv[15];
519: x17 = pv[16];
520: x18 = pv[17];
521: x19 = pv[18];
522: x20 = pv[19];
523: x21 = pv[20];
524: x22 = pv[21];
525: x23 = pv[22];
526: x24 = pv[23];
527: x25 = pv[24];
528: pc[0] = m1 = p1 * x1 + p6 * x2 + p11 * x3 + p16 * x4 + p21 * x5;
529: pc[1] = m2 = p2 * x1 + p7 * x2 + p12 * x3 + p17 * x4 + p22 * x5;
530: pc[2] = m3 = p3 * x1 + p8 * x2 + p13 * x3 + p18 * x4 + p23 * x5;
531: pc[3] = m4 = p4 * x1 + p9 * x2 + p14 * x3 + p19 * x4 + p24 * x5;
532: pc[4] = m5 = p5 * x1 + p10 * x2 + p15 * x3 + p20 * x4 + p25 * x5;
534: pc[5] = m6 = p1 * x6 + p6 * x7 + p11 * x8 + p16 * x9 + p21 * x10;
535: pc[6] = m7 = p2 * x6 + p7 * x7 + p12 * x8 + p17 * x9 + p22 * x10;
536: pc[7] = m8 = p3 * x6 + p8 * x7 + p13 * x8 + p18 * x9 + p23 * x10;
537: pc[8] = m9 = p4 * x6 + p9 * x7 + p14 * x8 + p19 * x9 + p24 * x10;
538: pc[9] = m10 = p5 * x6 + p10 * x7 + p15 * x8 + p20 * x9 + p25 * x10;
540: pc[10] = m11 = p1 * x11 + p6 * x12 + p11 * x13 + p16 * x14 + p21 * x15;
541: pc[11] = m12 = p2 * x11 + p7 * x12 + p12 * x13 + p17 * x14 + p22 * x15;
542: pc[12] = m13 = p3 * x11 + p8 * x12 + p13 * x13 + p18 * x14 + p23 * x15;
543: pc[13] = m14 = p4 * x11 + p9 * x12 + p14 * x13 + p19 * x14 + p24 * x15;
544: pc[14] = m15 = p5 * x11 + p10 * x12 + p15 * x13 + p20 * x14 + p25 * x15;
546: pc[15] = m16 = p1 * x16 + p6 * x17 + p11 * x18 + p16 * x19 + p21 * x20;
547: pc[16] = m17 = p2 * x16 + p7 * x17 + p12 * x18 + p17 * x19 + p22 * x20;
548: pc[17] = m18 = p3 * x16 + p8 * x17 + p13 * x18 + p18 * x19 + p23 * x20;
549: pc[18] = m19 = p4 * x16 + p9 * x17 + p14 * x18 + p19 * x19 + p24 * x20;
550: pc[19] = m20 = p5 * x16 + p10 * x17 + p15 * x18 + p20 * x19 + p25 * x20;
552: pc[20] = m21 = p1 * x21 + p6 * x22 + p11 * x23 + p16 * x24 + p21 * x25;
553: pc[21] = m22 = p2 * x21 + p7 * x22 + p12 * x23 + p17 * x24 + p22 * x25;
554: pc[22] = m23 = p3 * x21 + p8 * x22 + p13 * x23 + p18 * x24 + p23 * x25;
555: pc[23] = m24 = p4 * x21 + p9 * x22 + p14 * x23 + p19 * x24 + p24 * x25;
556: pc[24] = m25 = p5 * x21 + p10 * x22 + p15 * x23 + p20 * x24 + p25 * x25;
558: nz = bi[row + 1] - diag_offset[row] - 1;
559: pv += 25;
560: for (j = 0; j < nz; j++) {
561: x1 = pv[0];
562: x2 = pv[1];
563: x3 = pv[2];
564: x4 = pv[3];
565: x5 = pv[4];
566: x6 = pv[5];
567: x7 = pv[6];
568: x8 = pv[7];
569: x9 = pv[8];
570: x10 = pv[9];
571: x11 = pv[10];
572: x12 = pv[11];
573: x13 = pv[12];
574: x14 = pv[13];
575: x15 = pv[14];
576: x16 = pv[15];
577: x17 = pv[16];
578: x18 = pv[17];
579: x19 = pv[18];
580: x20 = pv[19];
581: x21 = pv[20];
582: x22 = pv[21];
583: x23 = pv[22];
584: x24 = pv[23];
585: x25 = pv[24];
586: x = rtmp + 25 * pj[j];
587: x[0] -= m1 * x1 + m6 * x2 + m11 * x3 + m16 * x4 + m21 * x5;
588: x[1] -= m2 * x1 + m7 * x2 + m12 * x3 + m17 * x4 + m22 * x5;
589: x[2] -= m3 * x1 + m8 * x2 + m13 * x3 + m18 * x4 + m23 * x5;
590: x[3] -= m4 * x1 + m9 * x2 + m14 * x3 + m19 * x4 + m24 * x5;
591: x[4] -= m5 * x1 + m10 * x2 + m15 * x3 + m20 * x4 + m25 * x5;
593: x[5] -= m1 * x6 + m6 * x7 + m11 * x8 + m16 * x9 + m21 * x10;
594: x[6] -= m2 * x6 + m7 * x7 + m12 * x8 + m17 * x9 + m22 * x10;
595: x[7] -= m3 * x6 + m8 * x7 + m13 * x8 + m18 * x9 + m23 * x10;
596: x[8] -= m4 * x6 + m9 * x7 + m14 * x8 + m19 * x9 + m24 * x10;
597: x[9] -= m5 * x6 + m10 * x7 + m15 * x8 + m20 * x9 + m25 * x10;
599: x[10] -= m1 * x11 + m6 * x12 + m11 * x13 + m16 * x14 + m21 * x15;
600: x[11] -= m2 * x11 + m7 * x12 + m12 * x13 + m17 * x14 + m22 * x15;
601: x[12] -= m3 * x11 + m8 * x12 + m13 * x13 + m18 * x14 + m23 * x15;
602: x[13] -= m4 * x11 + m9 * x12 + m14 * x13 + m19 * x14 + m24 * x15;
603: x[14] -= m5 * x11 + m10 * x12 + m15 * x13 + m20 * x14 + m25 * x15;
605: x[15] -= m1 * x16 + m6 * x17 + m11 * x18 + m16 * x19 + m21 * x20;
606: x[16] -= m2 * x16 + m7 * x17 + m12 * x18 + m17 * x19 + m22 * x20;
607: x[17] -= m3 * x16 + m8 * x17 + m13 * x18 + m18 * x19 + m23 * x20;
608: x[18] -= m4 * x16 + m9 * x17 + m14 * x18 + m19 * x19 + m24 * x20;
609: x[19] -= m5 * x16 + m10 * x17 + m15 * x18 + m20 * x19 + m25 * x20;
611: x[20] -= m1 * x21 + m6 * x22 + m11 * x23 + m16 * x24 + m21 * x25;
612: x[21] -= m2 * x21 + m7 * x22 + m12 * x23 + m17 * x24 + m22 * x25;
613: x[22] -= m3 * x21 + m8 * x22 + m13 * x23 + m18 * x24 + m23 * x25;
614: x[23] -= m4 * x21 + m9 * x22 + m14 * x23 + m19 * x24 + m24 * x25;
615: x[24] -= m5 * x21 + m10 * x22 + m15 * x23 + m20 * x24 + m25 * x25;
616: pv += 25;
617: }
618: PetscCall(PetscLogFlops(250.0 * nz + 225.0));
619: }
620: row = *ajtmp++;
621: }
622: /* finished row so stick it into b->a */
623: pv = ba + 25 * bi[i];
624: pj = bj + bi[i];
625: nz = bi[i + 1] - bi[i];
626: for (j = 0; j < nz; j++) {
627: x = rtmp + 25 * pj[j];
628: pv[0] = x[0];
629: pv[1] = x[1];
630: pv[2] = x[2];
631: pv[3] = x[3];
632: pv[4] = x[4];
633: pv[5] = x[5];
634: pv[6] = x[6];
635: pv[7] = x[7];
636: pv[8] = x[8];
637: pv[9] = x[9];
638: pv[10] = x[10];
639: pv[11] = x[11];
640: pv[12] = x[12];
641: pv[13] = x[13];
642: pv[14] = x[14];
643: pv[15] = x[15];
644: pv[16] = x[16];
645: pv[17] = x[17];
646: pv[18] = x[18];
647: pv[19] = x[19];
648: pv[20] = x[20];
649: pv[21] = x[21];
650: pv[22] = x[22];
651: pv[23] = x[23];
652: pv[24] = x[24];
653: pv += 25;
654: }
655: /* invert diagonal block */
656: w = ba + 25 * diag_offset[i];
657: PetscCall(PetscKernel_A_gets_inverse_A_5(w, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
658: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
659: }
661: PetscCall(PetscFree(rtmp));
663: C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_inplace;
664: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace;
665: C->assembled = PETSC_TRUE;
667: PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * b->mbs)); /* from inverting diagonal blocks */
668: PetscFunctionReturn(PETSC_SUCCESS);
669: }
671: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
672: {
673: Mat C = B;
674: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
675: PetscInt i, j, k, nz, nzL, row;
676: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
677: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
678: MatScalar *rtmp, *pc, *mwork, *v, *vv, *pv, *aa = a->a, work[25];
679: PetscInt flg, ipvt[5];
680: PetscReal shift = info->shiftamount;
681: PetscBool allowzeropivot, zeropivotdetected;
683: PetscFunctionBegin;
684: allowzeropivot = PetscNot(A->erroriffailure);
686: /* generate work space needed by the factorization */
687: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
688: PetscCall(PetscArrayzero(rtmp, bs2 * n));
690: for (i = 0; i < n; i++) {
691: /* zero rtmp */
692: /* L part */
693: nz = bi[i + 1] - bi[i];
694: bjtmp = bj + bi[i];
695: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
697: /* U part */
698: nz = bdiag[i] - bdiag[i + 1];
699: bjtmp = bj + bdiag[i + 1] + 1;
700: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
702: /* load in initial (unfactored row) */
703: nz = ai[i + 1] - ai[i];
704: ajtmp = aj + ai[i];
705: v = aa + bs2 * ai[i];
706: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
708: /* elimination */
709: bjtmp = bj + bi[i];
710: nzL = bi[i + 1] - bi[i];
711: for (k = 0; k < nzL; k++) {
712: row = bjtmp[k];
713: pc = rtmp + bs2 * row;
714: for (flg = 0, j = 0; j < bs2; j++) {
715: if (pc[j] != 0.0) {
716: flg = 1;
717: break;
718: }
719: }
720: if (flg) {
721: pv = b->a + bs2 * bdiag[row];
722: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
723: PetscCall(PetscKernel_A_gets_A_times_B_5(pc, pv, mwork));
725: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
726: pv = b->a + bs2 * (bdiag[row + 1] + 1);
727: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
728: for (j = 0; j < nz; j++) {
729: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
730: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
731: vv = rtmp + bs2 * pj[j];
732: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_5(vv, pc, pv));
733: pv += bs2;
734: }
735: PetscCall(PetscLogFlops(250.0 * nz + 225)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
736: }
737: }
739: /* finished row so stick it into b->a */
740: /* L part */
741: pv = b->a + bs2 * bi[i];
742: pj = b->j + bi[i];
743: nz = bi[i + 1] - bi[i];
744: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
746: /* Mark diagonal and invert diagonal for simpler triangular solves */
747: pv = b->a + bs2 * bdiag[i];
748: pj = b->j + bdiag[i];
749: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
750: PetscCall(PetscKernel_A_gets_inverse_A_5(pv, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
751: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
753: /* U part */
754: pv = b->a + bs2 * (bdiag[i + 1] + 1);
755: pj = b->j + bdiag[i + 1] + 1;
756: nz = bdiag[i] - bdiag[i + 1] - 1;
757: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
758: }
759: PetscCall(PetscFree2(rtmp, mwork));
761: C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering;
762: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
763: C->assembled = PETSC_TRUE;
765: PetscCall(PetscLogFlops(1.333333333333 * 5 * 5 * 5 * n)); /* from inverting diagonal blocks */
766: PetscFunctionReturn(PETSC_SUCCESS);
767: }