Actual source code: baijsolv.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
5: {
6: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
7: IS iscol = a->col, isrow = a->row;
8: const PetscInt *r, *c, *rout, *cout;
9: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *vi;
10: PetscInt i, nz;
11: const PetscInt bs = A->rmap->bs, bs2 = a->bs2;
12: const MatScalar *aa = a->a, *v;
13: PetscScalar *x, *s, *t, *ls;
14: const PetscScalar *b;
16: PetscFunctionBegin;
17: PetscCall(VecGetArrayRead(bb, &b));
18: PetscCall(VecGetArray(xx, &x));
19: t = a->solve_work;
21: PetscCall(ISGetIndices(isrow, &rout));
22: r = rout;
23: PetscCall(ISGetIndices(iscol, &cout));
24: c = cout + (n - 1);
26: /* forward solve the lower triangular */
27: PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
28: for (i = 1; i < n; i++) {
29: v = aa + bs2 * ai[i];
30: vi = aj + ai[i];
31: nz = a->diag[i] - ai[i];
32: s = t + bs * i;
33: PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
34: while (nz--) {
35: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
36: v += bs2;
37: }
38: }
39: /* backward solve the upper triangular */
40: ls = a->solve_work + A->cmap->n;
41: for (i = n - 1; i >= 0; i--) {
42: v = aa + bs2 * (a->diag[i] + 1);
43: vi = aj + a->diag[i] + 1;
44: nz = ai[i + 1] - a->diag[i] - 1;
45: PetscCall(PetscArraycpy(ls, t + i * bs, bs));
46: while (nz--) {
47: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
48: v += bs2;
49: }
50: PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
51: PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
52: }
54: PetscCall(ISRestoreIndices(isrow, &rout));
55: PetscCall(ISRestoreIndices(iscol, &cout));
56: PetscCall(VecRestoreArrayRead(bb, &b));
57: PetscCall(VecRestoreArray(xx, &x));
58: PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
59: PetscFunctionReturn(PETSC_SUCCESS);
60: }
62: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
63: {
64: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
65: IS iscol = a->col, isrow = a->row;
66: const PetscInt *r, *c, *ai = a->i, *aj = a->j;
67: const PetscInt *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
68: PetscInt i, nz, idx, idt, idc;
69: const MatScalar *aa = a->a, *v;
70: PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
71: const PetscScalar *b;
73: PetscFunctionBegin;
74: PetscCall(VecGetArrayRead(bb, &b));
75: PetscCall(VecGetArray(xx, &x));
76: t = a->solve_work;
78: PetscCall(ISGetIndices(isrow, &rout));
79: r = rout;
80: PetscCall(ISGetIndices(iscol, &cout));
81: c = cout + (n - 1);
83: /* forward solve the lower triangular */
84: idx = 7 * (*r++);
85: t[0] = b[idx];
86: t[1] = b[1 + idx];
87: t[2] = b[2 + idx];
88: t[3] = b[3 + idx];
89: t[4] = b[4 + idx];
90: t[5] = b[5 + idx];
91: t[6] = b[6 + idx];
93: for (i = 1; i < n; i++) {
94: v = aa + 49 * ai[i];
95: vi = aj + ai[i];
96: nz = diag[i] - ai[i];
97: idx = 7 * (*r++);
98: s1 = b[idx];
99: s2 = b[1 + idx];
100: s3 = b[2 + idx];
101: s4 = b[3 + idx];
102: s5 = b[4 + idx];
103: s6 = b[5 + idx];
104: s7 = b[6 + idx];
105: while (nz--) {
106: idx = 7 * (*vi++);
107: x1 = t[idx];
108: x2 = t[1 + idx];
109: x3 = t[2 + idx];
110: x4 = t[3 + idx];
111: x5 = t[4 + idx];
112: x6 = t[5 + idx];
113: x7 = t[6 + idx];
114: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
115: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
116: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
117: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
118: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
119: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
120: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
121: v += 49;
122: }
123: idx = 7 * i;
124: t[idx] = s1;
125: t[1 + idx] = s2;
126: t[2 + idx] = s3;
127: t[3 + idx] = s4;
128: t[4 + idx] = s5;
129: t[5 + idx] = s6;
130: t[6 + idx] = s7;
131: }
132: /* backward solve the upper triangular */
133: for (i = n - 1; i >= 0; i--) {
134: v = aa + 49 * diag[i] + 49;
135: vi = aj + diag[i] + 1;
136: nz = ai[i + 1] - diag[i] - 1;
137: idt = 7 * i;
138: s1 = t[idt];
139: s2 = t[1 + idt];
140: s3 = t[2 + idt];
141: s4 = t[3 + idt];
142: s5 = t[4 + idt];
143: s6 = t[5 + idt];
144: s7 = t[6 + idt];
145: while (nz--) {
146: idx = 7 * (*vi++);
147: x1 = t[idx];
148: x2 = t[1 + idx];
149: x3 = t[2 + idx];
150: x4 = t[3 + idx];
151: x5 = t[4 + idx];
152: x6 = t[5 + idx];
153: x7 = t[6 + idx];
154: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
155: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
156: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
157: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
158: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
159: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
160: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
161: v += 49;
162: }
163: idc = 7 * (*c--);
164: v = aa + 49 * diag[i];
165: x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
166: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
167: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
168: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
169: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
170: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
171: x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
172: }
174: PetscCall(ISRestoreIndices(isrow, &rout));
175: PetscCall(ISRestoreIndices(iscol, &cout));
176: PetscCall(VecRestoreArrayRead(bb, &b));
177: PetscCall(VecRestoreArray(xx, &x));
178: PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
179: PetscFunctionReturn(PETSC_SUCCESS);
180: }
182: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
183: {
184: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
185: IS iscol = a->col, isrow = a->row;
186: const PetscInt *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
187: const PetscInt n = a->mbs, *rout, *cout, *vi;
188: PetscInt i, nz, idx, idt, idc, m;
189: const MatScalar *aa = a->a, *v;
190: PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
191: const PetscScalar *b;
193: PetscFunctionBegin;
194: PetscCall(VecGetArrayRead(bb, &b));
195: PetscCall(VecGetArray(xx, &x));
196: t = a->solve_work;
198: PetscCall(ISGetIndices(isrow, &rout));
199: r = rout;
200: PetscCall(ISGetIndices(iscol, &cout));
201: c = cout;
203: /* forward solve the lower triangular */
204: idx = 7 * r[0];
205: t[0] = b[idx];
206: t[1] = b[1 + idx];
207: t[2] = b[2 + idx];
208: t[3] = b[3 + idx];
209: t[4] = b[4 + idx];
210: t[5] = b[5 + idx];
211: t[6] = b[6 + idx];
213: for (i = 1; i < n; i++) {
214: v = aa + 49 * ai[i];
215: vi = aj + ai[i];
216: nz = ai[i + 1] - ai[i];
217: idx = 7 * r[i];
218: s1 = b[idx];
219: s2 = b[1 + idx];
220: s3 = b[2 + idx];
221: s4 = b[3 + idx];
222: s5 = b[4 + idx];
223: s6 = b[5 + idx];
224: s7 = b[6 + idx];
225: for (m = 0; m < nz; m++) {
226: idx = 7 * vi[m];
227: x1 = t[idx];
228: x2 = t[1 + idx];
229: x3 = t[2 + idx];
230: x4 = t[3 + idx];
231: x5 = t[4 + idx];
232: x6 = t[5 + idx];
233: x7 = t[6 + idx];
234: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
235: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
236: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
237: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
238: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
239: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
240: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
241: v += 49;
242: }
243: idx = 7 * i;
244: t[idx] = s1;
245: t[1 + idx] = s2;
246: t[2 + idx] = s3;
247: t[3 + idx] = s4;
248: t[4 + idx] = s5;
249: t[5 + idx] = s6;
250: t[6 + idx] = s7;
251: }
252: /* backward solve the upper triangular */
253: for (i = n - 1; i >= 0; i--) {
254: v = aa + 49 * (adiag[i + 1] + 1);
255: vi = aj + adiag[i + 1] + 1;
256: nz = adiag[i] - adiag[i + 1] - 1;
257: idt = 7 * i;
258: s1 = t[idt];
259: s2 = t[1 + idt];
260: s3 = t[2 + idt];
261: s4 = t[3 + idt];
262: s5 = t[4 + idt];
263: s6 = t[5 + idt];
264: s7 = t[6 + idt];
265: for (m = 0; m < nz; m++) {
266: idx = 7 * vi[m];
267: x1 = t[idx];
268: x2 = t[1 + idx];
269: x3 = t[2 + idx];
270: x4 = t[3 + idx];
271: x5 = t[4 + idx];
272: x6 = t[5 + idx];
273: x7 = t[6 + idx];
274: s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
275: s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
276: s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
277: s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
278: s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
279: s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
280: s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
281: v += 49;
282: }
283: idc = 7 * c[i];
284: x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
285: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
286: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
287: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
288: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
289: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
290: x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
291: }
293: PetscCall(ISRestoreIndices(isrow, &rout));
294: PetscCall(ISRestoreIndices(iscol, &cout));
295: PetscCall(VecRestoreArrayRead(bb, &b));
296: PetscCall(VecRestoreArray(xx, &x));
297: PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
298: PetscFunctionReturn(PETSC_SUCCESS);
299: }
301: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
302: {
303: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
304: IS iscol = a->col, isrow = a->row;
305: const PetscInt *r, *c, *rout, *cout;
306: const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
307: PetscInt i, nz, idx, idt, idc;
308: const MatScalar *aa = a->a, *v;
309: PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
310: const PetscScalar *b;
312: PetscFunctionBegin;
313: PetscCall(VecGetArrayRead(bb, &b));
314: PetscCall(VecGetArray(xx, &x));
315: t = a->solve_work;
317: PetscCall(ISGetIndices(isrow, &rout));
318: r = rout;
319: PetscCall(ISGetIndices(iscol, &cout));
320: c = cout + (n - 1);
322: /* forward solve the lower triangular */
323: idx = 6 * (*r++);
324: t[0] = b[idx];
325: t[1] = b[1 + idx];
326: t[2] = b[2 + idx];
327: t[3] = b[3 + idx];
328: t[4] = b[4 + idx];
329: t[5] = b[5 + idx];
330: for (i = 1; i < n; i++) {
331: v = aa + 36 * ai[i];
332: vi = aj + ai[i];
333: nz = diag[i] - ai[i];
334: idx = 6 * (*r++);
335: s1 = b[idx];
336: s2 = b[1 + idx];
337: s3 = b[2 + idx];
338: s4 = b[3 + idx];
339: s5 = b[4 + idx];
340: s6 = b[5 + idx];
341: while (nz--) {
342: idx = 6 * (*vi++);
343: x1 = t[idx];
344: x2 = t[1 + idx];
345: x3 = t[2 + idx];
346: x4 = t[3 + idx];
347: x5 = t[4 + idx];
348: x6 = t[5 + idx];
349: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
350: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
351: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
352: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
353: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
354: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
355: v += 36;
356: }
357: idx = 6 * i;
358: t[idx] = s1;
359: t[1 + idx] = s2;
360: t[2 + idx] = s3;
361: t[3 + idx] = s4;
362: t[4 + idx] = s5;
363: t[5 + idx] = s6;
364: }
365: /* backward solve the upper triangular */
366: for (i = n - 1; i >= 0; i--) {
367: v = aa + 36 * diag[i] + 36;
368: vi = aj + diag[i] + 1;
369: nz = ai[i + 1] - diag[i] - 1;
370: idt = 6 * i;
371: s1 = t[idt];
372: s2 = t[1 + idt];
373: s3 = t[2 + idt];
374: s4 = t[3 + idt];
375: s5 = t[4 + idt];
376: s6 = t[5 + idt];
377: while (nz--) {
378: idx = 6 * (*vi++);
379: x1 = t[idx];
380: x2 = t[1 + idx];
381: x3 = t[2 + idx];
382: x4 = t[3 + idx];
383: x5 = t[4 + idx];
384: x6 = t[5 + idx];
385: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
386: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
387: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
388: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
389: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
390: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
391: v += 36;
392: }
393: idc = 6 * (*c--);
394: v = aa + 36 * diag[i];
395: x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
396: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
397: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
398: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
399: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
400: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
401: }
403: PetscCall(ISRestoreIndices(isrow, &rout));
404: PetscCall(ISRestoreIndices(iscol, &cout));
405: PetscCall(VecRestoreArrayRead(bb, &b));
406: PetscCall(VecRestoreArray(xx, &x));
407: PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
408: PetscFunctionReturn(PETSC_SUCCESS);
409: }
411: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
412: {
413: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
414: IS iscol = a->col, isrow = a->row;
415: const PetscInt *r, *c, *rout, *cout;
416: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
417: PetscInt i, nz, idx, idt, idc, m;
418: const MatScalar *aa = a->a, *v;
419: PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
420: const PetscScalar *b;
422: PetscFunctionBegin;
423: PetscCall(VecGetArrayRead(bb, &b));
424: PetscCall(VecGetArray(xx, &x));
425: t = a->solve_work;
427: PetscCall(ISGetIndices(isrow, &rout));
428: r = rout;
429: PetscCall(ISGetIndices(iscol, &cout));
430: c = cout;
432: /* forward solve the lower triangular */
433: idx = 6 * r[0];
434: t[0] = b[idx];
435: t[1] = b[1 + idx];
436: t[2] = b[2 + idx];
437: t[3] = b[3 + idx];
438: t[4] = b[4 + idx];
439: t[5] = b[5 + idx];
440: for (i = 1; i < n; i++) {
441: v = aa + 36 * ai[i];
442: vi = aj + ai[i];
443: nz = ai[i + 1] - ai[i];
444: idx = 6 * r[i];
445: s1 = b[idx];
446: s2 = b[1 + idx];
447: s3 = b[2 + idx];
448: s4 = b[3 + idx];
449: s5 = b[4 + idx];
450: s6 = b[5 + idx];
451: for (m = 0; m < nz; m++) {
452: idx = 6 * vi[m];
453: x1 = t[idx];
454: x2 = t[1 + idx];
455: x3 = t[2 + idx];
456: x4 = t[3 + idx];
457: x5 = t[4 + idx];
458: x6 = t[5 + idx];
459: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
460: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
461: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
462: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
463: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
464: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
465: v += 36;
466: }
467: idx = 6 * i;
468: t[idx] = s1;
469: t[1 + idx] = s2;
470: t[2 + idx] = s3;
471: t[3 + idx] = s4;
472: t[4 + idx] = s5;
473: t[5 + idx] = s6;
474: }
475: /* backward solve the upper triangular */
476: for (i = n - 1; i >= 0; i--) {
477: v = aa + 36 * (adiag[i + 1] + 1);
478: vi = aj + adiag[i + 1] + 1;
479: nz = adiag[i] - adiag[i + 1] - 1;
480: idt = 6 * i;
481: s1 = t[idt];
482: s2 = t[1 + idt];
483: s3 = t[2 + idt];
484: s4 = t[3 + idt];
485: s5 = t[4 + idt];
486: s6 = t[5 + idt];
487: for (m = 0; m < nz; m++) {
488: idx = 6 * vi[m];
489: x1 = t[idx];
490: x2 = t[1 + idx];
491: x3 = t[2 + idx];
492: x4 = t[3 + idx];
493: x5 = t[4 + idx];
494: x6 = t[5 + idx];
495: s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
496: s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
497: s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
498: s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
499: s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
500: s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
501: v += 36;
502: }
503: idc = 6 * c[i];
504: x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
505: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
506: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
507: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
508: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
509: x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
510: }
512: PetscCall(ISRestoreIndices(isrow, &rout));
513: PetscCall(ISRestoreIndices(iscol, &cout));
514: PetscCall(VecRestoreArrayRead(bb, &b));
515: PetscCall(VecRestoreArray(xx, &x));
516: PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
517: PetscFunctionReturn(PETSC_SUCCESS);
518: }
520: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
521: {
522: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
523: IS iscol = a->col, isrow = a->row;
524: const PetscInt *r, *c, *rout, *cout, *diag = a->diag;
525: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
526: PetscInt i, nz, idx, idt, idc;
527: const MatScalar *aa = a->a, *v;
528: PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
529: const PetscScalar *b;
531: PetscFunctionBegin;
532: PetscCall(VecGetArrayRead(bb, &b));
533: PetscCall(VecGetArray(xx, &x));
534: t = a->solve_work;
536: PetscCall(ISGetIndices(isrow, &rout));
537: r = rout;
538: PetscCall(ISGetIndices(iscol, &cout));
539: c = cout + (n - 1);
541: /* forward solve the lower triangular */
542: idx = 5 * (*r++);
543: t[0] = b[idx];
544: t[1] = b[1 + idx];
545: t[2] = b[2 + idx];
546: t[3] = b[3 + idx];
547: t[4] = b[4 + idx];
548: for (i = 1; i < n; i++) {
549: v = aa + 25 * ai[i];
550: vi = aj + ai[i];
551: nz = diag[i] - ai[i];
552: idx = 5 * (*r++);
553: s1 = b[idx];
554: s2 = b[1 + idx];
555: s3 = b[2 + idx];
556: s4 = b[3 + idx];
557: s5 = b[4 + idx];
558: while (nz--) {
559: idx = 5 * (*vi++);
560: x1 = t[idx];
561: x2 = t[1 + idx];
562: x3 = t[2 + idx];
563: x4 = t[3 + idx];
564: x5 = t[4 + idx];
565: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
566: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
567: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
568: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
569: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
570: v += 25;
571: }
572: idx = 5 * i;
573: t[idx] = s1;
574: t[1 + idx] = s2;
575: t[2 + idx] = s3;
576: t[3 + idx] = s4;
577: t[4 + idx] = s5;
578: }
579: /* backward solve the upper triangular */
580: for (i = n - 1; i >= 0; i--) {
581: v = aa + 25 * diag[i] + 25;
582: vi = aj + diag[i] + 1;
583: nz = ai[i + 1] - diag[i] - 1;
584: idt = 5 * i;
585: s1 = t[idt];
586: s2 = t[1 + idt];
587: s3 = t[2 + idt];
588: s4 = t[3 + idt];
589: s5 = t[4 + idt];
590: while (nz--) {
591: idx = 5 * (*vi++);
592: x1 = t[idx];
593: x2 = t[1 + idx];
594: x3 = t[2 + idx];
595: x4 = t[3 + idx];
596: x5 = t[4 + idx];
597: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
598: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
599: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
600: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
601: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
602: v += 25;
603: }
604: idc = 5 * (*c--);
605: v = aa + 25 * diag[i];
606: x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
607: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
608: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
609: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
610: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
611: }
613: PetscCall(ISRestoreIndices(isrow, &rout));
614: PetscCall(ISRestoreIndices(iscol, &cout));
615: PetscCall(VecRestoreArrayRead(bb, &b));
616: PetscCall(VecRestoreArray(xx, &x));
617: PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
618: PetscFunctionReturn(PETSC_SUCCESS);
619: }
621: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
622: {
623: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
624: IS iscol = a->col, isrow = a->row;
625: const PetscInt *r, *c, *rout, *cout;
626: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
627: PetscInt i, nz, idx, idt, idc, m;
628: const MatScalar *aa = a->a, *v;
629: PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
630: const PetscScalar *b;
632: PetscFunctionBegin;
633: PetscCall(VecGetArrayRead(bb, &b));
634: PetscCall(VecGetArray(xx, &x));
635: t = a->solve_work;
637: PetscCall(ISGetIndices(isrow, &rout));
638: r = rout;
639: PetscCall(ISGetIndices(iscol, &cout));
640: c = cout;
642: /* forward solve the lower triangular */
643: idx = 5 * r[0];
644: t[0] = b[idx];
645: t[1] = b[1 + idx];
646: t[2] = b[2 + idx];
647: t[3] = b[3 + idx];
648: t[4] = b[4 + idx];
649: for (i = 1; i < n; i++) {
650: v = aa + 25 * ai[i];
651: vi = aj + ai[i];
652: nz = ai[i + 1] - ai[i];
653: idx = 5 * r[i];
654: s1 = b[idx];
655: s2 = b[1 + idx];
656: s3 = b[2 + idx];
657: s4 = b[3 + idx];
658: s5 = b[4 + idx];
659: for (m = 0; m < nz; m++) {
660: idx = 5 * vi[m];
661: x1 = t[idx];
662: x2 = t[1 + idx];
663: x3 = t[2 + idx];
664: x4 = t[3 + idx];
665: x5 = t[4 + idx];
666: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
667: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
668: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
669: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
670: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
671: v += 25;
672: }
673: idx = 5 * i;
674: t[idx] = s1;
675: t[1 + idx] = s2;
676: t[2 + idx] = s3;
677: t[3 + idx] = s4;
678: t[4 + idx] = s5;
679: }
680: /* backward solve the upper triangular */
681: for (i = n - 1; i >= 0; i--) {
682: v = aa + 25 * (adiag[i + 1] + 1);
683: vi = aj + adiag[i + 1] + 1;
684: nz = adiag[i] - adiag[i + 1] - 1;
685: idt = 5 * i;
686: s1 = t[idt];
687: s2 = t[1 + idt];
688: s3 = t[2 + idt];
689: s4 = t[3 + idt];
690: s5 = t[4 + idt];
691: for (m = 0; m < nz; m++) {
692: idx = 5 * vi[m];
693: x1 = t[idx];
694: x2 = t[1 + idx];
695: x3 = t[2 + idx];
696: x4 = t[3 + idx];
697: x5 = t[4 + idx];
698: s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
699: s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
700: s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
701: s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
702: s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
703: v += 25;
704: }
705: idc = 5 * c[i];
706: x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
707: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
708: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
709: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
710: x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
711: }
713: PetscCall(ISRestoreIndices(isrow, &rout));
714: PetscCall(ISRestoreIndices(iscol, &cout));
715: PetscCall(VecRestoreArrayRead(bb, &b));
716: PetscCall(VecRestoreArray(xx, &x));
717: PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
718: PetscFunctionReturn(PETSC_SUCCESS);
719: }
721: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
722: {
723: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
724: IS iscol = a->col, isrow = a->row;
725: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
726: PetscInt i, nz, idx, idt, idc;
727: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
728: const MatScalar *aa = a->a, *v;
729: PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
730: const PetscScalar *b;
732: PetscFunctionBegin;
733: PetscCall(VecGetArrayRead(bb, &b));
734: PetscCall(VecGetArray(xx, &x));
735: t = a->solve_work;
737: PetscCall(ISGetIndices(isrow, &rout));
738: r = rout;
739: PetscCall(ISGetIndices(iscol, &cout));
740: c = cout + (n - 1);
742: /* forward solve the lower triangular */
743: idx = 4 * (*r++);
744: t[0] = b[idx];
745: t[1] = b[1 + idx];
746: t[2] = b[2 + idx];
747: t[3] = b[3 + idx];
748: for (i = 1; i < n; i++) {
749: v = aa + 16 * ai[i];
750: vi = aj + ai[i];
751: nz = diag[i] - ai[i];
752: idx = 4 * (*r++);
753: s1 = b[idx];
754: s2 = b[1 + idx];
755: s3 = b[2 + idx];
756: s4 = b[3 + idx];
757: while (nz--) {
758: idx = 4 * (*vi++);
759: x1 = t[idx];
760: x2 = t[1 + idx];
761: x3 = t[2 + idx];
762: x4 = t[3 + idx];
763: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
764: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
765: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
766: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
767: v += 16;
768: }
769: idx = 4 * i;
770: t[idx] = s1;
771: t[1 + idx] = s2;
772: t[2 + idx] = s3;
773: t[3 + idx] = s4;
774: }
775: /* backward solve the upper triangular */
776: for (i = n - 1; i >= 0; i--) {
777: v = aa + 16 * diag[i] + 16;
778: vi = aj + diag[i] + 1;
779: nz = ai[i + 1] - diag[i] - 1;
780: idt = 4 * i;
781: s1 = t[idt];
782: s2 = t[1 + idt];
783: s3 = t[2 + idt];
784: s4 = t[3 + idt];
785: while (nz--) {
786: idx = 4 * (*vi++);
787: x1 = t[idx];
788: x2 = t[1 + idx];
789: x3 = t[2 + idx];
790: x4 = t[3 + idx];
791: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
792: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
793: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
794: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
795: v += 16;
796: }
797: idc = 4 * (*c--);
798: v = aa + 16 * diag[i];
799: x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
800: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
801: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
802: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
803: }
805: PetscCall(ISRestoreIndices(isrow, &rout));
806: PetscCall(ISRestoreIndices(iscol, &cout));
807: PetscCall(VecRestoreArrayRead(bb, &b));
808: PetscCall(VecRestoreArray(xx, &x));
809: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
810: PetscFunctionReturn(PETSC_SUCCESS);
811: }
813: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
814: {
815: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
816: IS iscol = a->col, isrow = a->row;
817: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
818: PetscInt i, nz, idx, idt, idc, m;
819: const PetscInt *r, *c, *rout, *cout;
820: const MatScalar *aa = a->a, *v;
821: PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
822: const PetscScalar *b;
824: PetscFunctionBegin;
825: PetscCall(VecGetArrayRead(bb, &b));
826: PetscCall(VecGetArray(xx, &x));
827: t = a->solve_work;
829: PetscCall(ISGetIndices(isrow, &rout));
830: r = rout;
831: PetscCall(ISGetIndices(iscol, &cout));
832: c = cout;
834: /* forward solve the lower triangular */
835: idx = 4 * r[0];
836: t[0] = b[idx];
837: t[1] = b[1 + idx];
838: t[2] = b[2 + idx];
839: t[3] = b[3 + idx];
840: for (i = 1; i < n; i++) {
841: v = aa + 16 * ai[i];
842: vi = aj + ai[i];
843: nz = ai[i + 1] - ai[i];
844: idx = 4 * r[i];
845: s1 = b[idx];
846: s2 = b[1 + idx];
847: s3 = b[2 + idx];
848: s4 = b[3 + idx];
849: for (m = 0; m < nz; m++) {
850: idx = 4 * vi[m];
851: x1 = t[idx];
852: x2 = t[1 + idx];
853: x3 = t[2 + idx];
854: x4 = t[3 + idx];
855: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
856: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
857: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
858: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
859: v += 16;
860: }
861: idx = 4 * i;
862: t[idx] = s1;
863: t[1 + idx] = s2;
864: t[2 + idx] = s3;
865: t[3 + idx] = s4;
866: }
867: /* backward solve the upper triangular */
868: for (i = n - 1; i >= 0; i--) {
869: v = aa + 16 * (adiag[i + 1] + 1);
870: vi = aj + adiag[i + 1] + 1;
871: nz = adiag[i] - adiag[i + 1] - 1;
872: idt = 4 * i;
873: s1 = t[idt];
874: s2 = t[1 + idt];
875: s3 = t[2 + idt];
876: s4 = t[3 + idt];
877: for (m = 0; m < nz; m++) {
878: idx = 4 * vi[m];
879: x1 = t[idx];
880: x2 = t[1 + idx];
881: x3 = t[2 + idx];
882: x4 = t[3 + idx];
883: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
884: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
885: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
886: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
887: v += 16;
888: }
889: idc = 4 * c[i];
890: x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
891: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
892: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
893: x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
894: }
896: PetscCall(ISRestoreIndices(isrow, &rout));
897: PetscCall(ISRestoreIndices(iscol, &cout));
898: PetscCall(VecRestoreArrayRead(bb, &b));
899: PetscCall(VecRestoreArray(xx, &x));
900: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
901: PetscFunctionReturn(PETSC_SUCCESS);
902: }
904: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx)
905: {
906: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
907: IS iscol = a->col, isrow = a->row;
908: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
909: PetscInt i, nz, idx, idt, idc;
910: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
911: const MatScalar *aa = a->a, *v;
912: MatScalar s1, s2, s3, s4, x1, x2, x3, x4, *t;
913: PetscScalar *x;
914: const PetscScalar *b;
916: PetscFunctionBegin;
917: PetscCall(VecGetArrayRead(bb, &b));
918: PetscCall(VecGetArray(xx, &x));
919: t = (MatScalar *)a->solve_work;
921: PetscCall(ISGetIndices(isrow, &rout));
922: r = rout;
923: PetscCall(ISGetIndices(iscol, &cout));
924: c = cout + (n - 1);
926: /* forward solve the lower triangular */
927: idx = 4 * (*r++);
928: t[0] = (MatScalar)b[idx];
929: t[1] = (MatScalar)b[1 + idx];
930: t[2] = (MatScalar)b[2 + idx];
931: t[3] = (MatScalar)b[3 + idx];
932: for (i = 1; i < n; i++) {
933: v = aa + 16 * ai[i];
934: vi = aj + ai[i];
935: nz = diag[i] - ai[i];
936: idx = 4 * (*r++);
937: s1 = (MatScalar)b[idx];
938: s2 = (MatScalar)b[1 + idx];
939: s3 = (MatScalar)b[2 + idx];
940: s4 = (MatScalar)b[3 + idx];
941: while (nz--) {
942: idx = 4 * (*vi++);
943: x1 = t[idx];
944: x2 = t[1 + idx];
945: x3 = t[2 + idx];
946: x4 = t[3 + idx];
947: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
948: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
949: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
950: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
951: v += 16;
952: }
953: idx = 4 * i;
954: t[idx] = s1;
955: t[1 + idx] = s2;
956: t[2 + idx] = s3;
957: t[3 + idx] = s4;
958: }
959: /* backward solve the upper triangular */
960: for (i = n - 1; i >= 0; i--) {
961: v = aa + 16 * diag[i] + 16;
962: vi = aj + diag[i] + 1;
963: nz = ai[i + 1] - diag[i] - 1;
964: idt = 4 * i;
965: s1 = t[idt];
966: s2 = t[1 + idt];
967: s3 = t[2 + idt];
968: s4 = t[3 + idt];
969: while (nz--) {
970: idx = 4 * (*vi++);
971: x1 = t[idx];
972: x2 = t[1 + idx];
973: x3 = t[2 + idx];
974: x4 = t[3 + idx];
975: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
976: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
977: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
978: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
979: v += 16;
980: }
981: idc = 4 * (*c--);
982: v = aa + 16 * diag[i];
983: t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
984: t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
985: t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
986: t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
987: x[idc] = (PetscScalar)t[idt];
988: x[1 + idc] = (PetscScalar)t[1 + idt];
989: x[2 + idc] = (PetscScalar)t[2 + idt];
990: x[3 + idc] = (PetscScalar)t[3 + idt];
991: }
993: PetscCall(ISRestoreIndices(isrow, &rout));
994: PetscCall(ISRestoreIndices(iscol, &cout));
995: PetscCall(VecRestoreArrayRead(bb, &b));
996: PetscCall(VecRestoreArray(xx, &x));
997: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
998: PetscFunctionReturn(PETSC_SUCCESS);
999: }
1001: #if defined(PETSC_HAVE_SSE)
1003: #include PETSC_HAVE_SSE
1005: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx)
1006: {
1007: /*
1008: Note: This code uses demotion of double
1009: to float when performing the mixed-mode computation.
1010: This may not be numerically reasonable for all applications.
1011: */
1012: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1013: IS iscol = a->col, isrow = a->row;
1014: PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
1015: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1016: MatScalar *aa = a->a, *v;
1017: PetscScalar *x, *b, *t;
1019: /* Make space in temp stack for 16 Byte Aligned arrays */
1020: float ssealignedspace[11], *tmps, *tmpx;
1021: unsigned long offset;
1023: PetscFunctionBegin;
1024: SSE_SCOPE_BEGIN;
1026: offset = (unsigned long)ssealignedspace % 16;
1027: if (offset) offset = (16 - offset) / 4;
1028: tmps = &ssealignedspace[offset];
1029: tmpx = &ssealignedspace[offset + 4];
1030: PREFETCH_NTA(aa + 16 * ai[1]);
1032: PetscCall(VecGetArray(bb, &b));
1033: PetscCall(VecGetArray(xx, &x));
1034: t = a->solve_work;
1036: PetscCall(ISGetIndices(isrow, &rout));
1037: r = rout;
1038: PetscCall(ISGetIndices(iscol, &cout));
1039: c = cout + (n - 1);
1041: /* forward solve the lower triangular */
1042: idx = 4 * (*r++);
1043: t[0] = b[idx];
1044: t[1] = b[1 + idx];
1045: t[2] = b[2 + idx];
1046: t[3] = b[3 + idx];
1047: v = aa + 16 * ai[1];
1049: for (i = 1; i < n;) {
1050: PREFETCH_NTA(&v[8]);
1051: vi = aj + ai[i];
1052: nz = diag[i] - ai[i];
1053: idx = 4 * (*r++);
1055: /* Demote sum from double to float */
1056: CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
1057: LOAD_PS(tmps, XMM7);
1059: while (nz--) {
1060: PREFETCH_NTA(&v[16]);
1061: idx = 4 * (*vi++);
1063: /* Demote solution (so far) from double to float */
1064: CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);
1066: /* 4x4 Matrix-Vector product with negative accumulation: */
1067: SSE_INLINE_BEGIN_2(tmpx, v)
1068: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1070: /* First Column */
1071: SSE_COPY_PS(XMM0, XMM6)
1072: SSE_SHUFFLE(XMM0, XMM0, 0x00)
1073: SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1074: SSE_SUB_PS(XMM7, XMM0)
1076: /* Second Column */
1077: SSE_COPY_PS(XMM1, XMM6)
1078: SSE_SHUFFLE(XMM1, XMM1, 0x55)
1079: SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1080: SSE_SUB_PS(XMM7, XMM1)
1082: SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1084: /* Third Column */
1085: SSE_COPY_PS(XMM2, XMM6)
1086: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1087: SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1088: SSE_SUB_PS(XMM7, XMM2)
1090: /* Fourth Column */
1091: SSE_COPY_PS(XMM3, XMM6)
1092: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1093: SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1094: SSE_SUB_PS(XMM7, XMM3)
1095: SSE_INLINE_END_2
1097: v += 16;
1098: }
1099: idx = 4 * i;
1100: v = aa + 16 * ai[++i];
1101: PREFETCH_NTA(v);
1102: STORE_PS(tmps, XMM7);
1104: /* Promote result from float to double */
1105: CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
1106: }
1107: /* backward solve the upper triangular */
1108: idt = 4 * (n - 1);
1109: ai16 = 16 * diag[n - 1];
1110: v = aa + ai16 + 16;
1111: for (i = n - 1; i >= 0;) {
1112: PREFETCH_NTA(&v[8]);
1113: vi = aj + diag[i] + 1;
1114: nz = ai[i + 1] - diag[i] - 1;
1116: /* Demote accumulator from double to float */
1117: CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
1118: LOAD_PS(tmps, XMM7);
1120: while (nz--) {
1121: PREFETCH_NTA(&v[16]);
1122: idx = 4 * (*vi++);
1124: /* Demote solution (so far) from double to float */
1125: CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);
1127: /* 4x4 Matrix-Vector Product with negative accumulation: */
1128: SSE_INLINE_BEGIN_2(tmpx, v)
1129: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1131: /* First Column */
1132: SSE_COPY_PS(XMM0, XMM6)
1133: SSE_SHUFFLE(XMM0, XMM0, 0x00)
1134: SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1135: SSE_SUB_PS(XMM7, XMM0)
1137: /* Second Column */
1138: SSE_COPY_PS(XMM1, XMM6)
1139: SSE_SHUFFLE(XMM1, XMM1, 0x55)
1140: SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1141: SSE_SUB_PS(XMM7, XMM1)
1143: SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1145: /* Third Column */
1146: SSE_COPY_PS(XMM2, XMM6)
1147: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1148: SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1149: SSE_SUB_PS(XMM7, XMM2)
1151: /* Fourth Column */
1152: SSE_COPY_PS(XMM3, XMM6)
1153: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1154: SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1155: SSE_SUB_PS(XMM7, XMM3)
1156: SSE_INLINE_END_2
1157: v += 16;
1158: }
1159: v = aa + ai16;
1160: ai16 = 16 * diag[--i];
1161: PREFETCH_NTA(aa + ai16 + 16);
1162: /*
1163: Scale the result by the diagonal 4x4 block,
1164: which was inverted as part of the factorization
1165: */
1166: SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
1167: /* First Column */
1168: SSE_COPY_PS(XMM0, XMM7)
1169: SSE_SHUFFLE(XMM0, XMM0, 0x00)
1170: SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
1172: /* Second Column */
1173: SSE_COPY_PS(XMM1, XMM7)
1174: SSE_SHUFFLE(XMM1, XMM1, 0x55)
1175: SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
1176: SSE_ADD_PS(XMM0, XMM1)
1178: SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
1180: /* Third Column */
1181: SSE_COPY_PS(XMM2, XMM7)
1182: SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1183: SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
1184: SSE_ADD_PS(XMM0, XMM2)
1186: /* Fourth Column */
1187: SSE_COPY_PS(XMM3, XMM7)
1188: SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1189: SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
1190: SSE_ADD_PS(XMM0, XMM3)
1192: SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
1193: SSE_INLINE_END_3
1195: /* Promote solution from float to double */
1196: CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);
1198: /* Apply reordering to t and stream into x. */
1199: /* This way, x doesn't pollute the cache. */
1200: /* Be careful with size: 2 doubles = 4 floats! */
1201: idc = 4 * (*c--);
1202: SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
1203: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
1204: SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
1205: SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
1206: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1207: SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
1208: SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
1209: SSE_INLINE_END_2
1210: v = aa + ai16 + 16;
1211: idt -= 4;
1212: }
1214: PetscCall(ISRestoreIndices(isrow, &rout));
1215: PetscCall(ISRestoreIndices(iscol, &cout));
1216: PetscCall(VecRestoreArray(bb, &b));
1217: PetscCall(VecRestoreArray(xx, &x));
1218: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
1219: SSE_SCOPE_END;
1220: PetscFunctionReturn(PETSC_SUCCESS);
1221: }
1223: #endif
1225: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1226: {
1227: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1228: IS iscol = a->col, isrow = a->row;
1229: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1230: PetscInt i, nz, idx, idt, idc;
1231: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1232: const MatScalar *aa = a->a, *v;
1233: PetscScalar *x, s1, s2, s3, x1, x2, x3, *t;
1234: const PetscScalar *b;
1236: PetscFunctionBegin;
1237: PetscCall(VecGetArrayRead(bb, &b));
1238: PetscCall(VecGetArray(xx, &x));
1239: t = a->solve_work;
1241: PetscCall(ISGetIndices(isrow, &rout));
1242: r = rout;
1243: PetscCall(ISGetIndices(iscol, &cout));
1244: c = cout + (n - 1);
1246: /* forward solve the lower triangular */
1247: idx = 3 * (*r++);
1248: t[0] = b[idx];
1249: t[1] = b[1 + idx];
1250: t[2] = b[2 + idx];
1251: for (i = 1; i < n; i++) {
1252: v = aa + 9 * ai[i];
1253: vi = aj + ai[i];
1254: nz = diag[i] - ai[i];
1255: idx = 3 * (*r++);
1256: s1 = b[idx];
1257: s2 = b[1 + idx];
1258: s3 = b[2 + idx];
1259: while (nz--) {
1260: idx = 3 * (*vi++);
1261: x1 = t[idx];
1262: x2 = t[1 + idx];
1263: x3 = t[2 + idx];
1264: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1265: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1266: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1267: v += 9;
1268: }
1269: idx = 3 * i;
1270: t[idx] = s1;
1271: t[1 + idx] = s2;
1272: t[2 + idx] = s3;
1273: }
1274: /* backward solve the upper triangular */
1275: for (i = n - 1; i >= 0; i--) {
1276: v = aa + 9 * diag[i] + 9;
1277: vi = aj + diag[i] + 1;
1278: nz = ai[i + 1] - diag[i] - 1;
1279: idt = 3 * i;
1280: s1 = t[idt];
1281: s2 = t[1 + idt];
1282: s3 = t[2 + idt];
1283: while (nz--) {
1284: idx = 3 * (*vi++);
1285: x1 = t[idx];
1286: x2 = t[1 + idx];
1287: x3 = t[2 + idx];
1288: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1289: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1290: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1291: v += 9;
1292: }
1293: idc = 3 * (*c--);
1294: v = aa + 9 * diag[i];
1295: x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1296: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1297: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1298: }
1299: PetscCall(ISRestoreIndices(isrow, &rout));
1300: PetscCall(ISRestoreIndices(iscol, &cout));
1301: PetscCall(VecRestoreArrayRead(bb, &b));
1302: PetscCall(VecRestoreArray(xx, &x));
1303: PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1304: PetscFunctionReturn(PETSC_SUCCESS);
1305: }
1307: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
1308: {
1309: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1310: IS iscol = a->col, isrow = a->row;
1311: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1312: PetscInt i, nz, idx, idt, idc, m;
1313: const PetscInt *r, *c, *rout, *cout;
1314: const MatScalar *aa = a->a, *v;
1315: PetscScalar *x, s1, s2, s3, x1, x2, x3, *t;
1316: const PetscScalar *b;
1318: PetscFunctionBegin;
1319: PetscCall(VecGetArrayRead(bb, &b));
1320: PetscCall(VecGetArray(xx, &x));
1321: t = a->solve_work;
1323: PetscCall(ISGetIndices(isrow, &rout));
1324: r = rout;
1325: PetscCall(ISGetIndices(iscol, &cout));
1326: c = cout;
1328: /* forward solve the lower triangular */
1329: idx = 3 * r[0];
1330: t[0] = b[idx];
1331: t[1] = b[1 + idx];
1332: t[2] = b[2 + idx];
1333: for (i = 1; i < n; i++) {
1334: v = aa + 9 * ai[i];
1335: vi = aj + ai[i];
1336: nz = ai[i + 1] - ai[i];
1337: idx = 3 * r[i];
1338: s1 = b[idx];
1339: s2 = b[1 + idx];
1340: s3 = b[2 + idx];
1341: for (m = 0; m < nz; m++) {
1342: idx = 3 * vi[m];
1343: x1 = t[idx];
1344: x2 = t[1 + idx];
1345: x3 = t[2 + idx];
1346: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1347: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1348: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1349: v += 9;
1350: }
1351: idx = 3 * i;
1352: t[idx] = s1;
1353: t[1 + idx] = s2;
1354: t[2 + idx] = s3;
1355: }
1356: /* backward solve the upper triangular */
1357: for (i = n - 1; i >= 0; i--) {
1358: v = aa + 9 * (adiag[i + 1] + 1);
1359: vi = aj + adiag[i + 1] + 1;
1360: nz = adiag[i] - adiag[i + 1] - 1;
1361: idt = 3 * i;
1362: s1 = t[idt];
1363: s2 = t[1 + idt];
1364: s3 = t[2 + idt];
1365: for (m = 0; m < nz; m++) {
1366: idx = 3 * vi[m];
1367: x1 = t[idx];
1368: x2 = t[1 + idx];
1369: x3 = t[2 + idx];
1370: s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1371: s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1372: s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1373: v += 9;
1374: }
1375: idc = 3 * c[i];
1376: x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1377: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1378: x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1379: }
1380: PetscCall(ISRestoreIndices(isrow, &rout));
1381: PetscCall(ISRestoreIndices(iscol, &cout));
1382: PetscCall(VecRestoreArrayRead(bb, &b));
1383: PetscCall(VecRestoreArray(xx, &x));
1384: PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1385: PetscFunctionReturn(PETSC_SUCCESS);
1386: }
1388: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1389: {
1390: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1391: IS iscol = a->col, isrow = a->row;
1392: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1393: PetscInt i, nz, idx, idt, idc;
1394: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1395: const MatScalar *aa = a->a, *v;
1396: PetscScalar *x, s1, s2, x1, x2, *t;
1397: const PetscScalar *b;
1399: PetscFunctionBegin;
1400: PetscCall(VecGetArrayRead(bb, &b));
1401: PetscCall(VecGetArray(xx, &x));
1402: t = a->solve_work;
1404: PetscCall(ISGetIndices(isrow, &rout));
1405: r = rout;
1406: PetscCall(ISGetIndices(iscol, &cout));
1407: c = cout + (n - 1);
1409: /* forward solve the lower triangular */
1410: idx = 2 * (*r++);
1411: t[0] = b[idx];
1412: t[1] = b[1 + idx];
1413: for (i = 1; i < n; i++) {
1414: v = aa + 4 * ai[i];
1415: vi = aj + ai[i];
1416: nz = diag[i] - ai[i];
1417: idx = 2 * (*r++);
1418: s1 = b[idx];
1419: s2 = b[1 + idx];
1420: while (nz--) {
1421: idx = 2 * (*vi++);
1422: x1 = t[idx];
1423: x2 = t[1 + idx];
1424: s1 -= v[0] * x1 + v[2] * x2;
1425: s2 -= v[1] * x1 + v[3] * x2;
1426: v += 4;
1427: }
1428: idx = 2 * i;
1429: t[idx] = s1;
1430: t[1 + idx] = s2;
1431: }
1432: /* backward solve the upper triangular */
1433: for (i = n - 1; i >= 0; i--) {
1434: v = aa + 4 * diag[i] + 4;
1435: vi = aj + diag[i] + 1;
1436: nz = ai[i + 1] - diag[i] - 1;
1437: idt = 2 * i;
1438: s1 = t[idt];
1439: s2 = t[1 + idt];
1440: while (nz--) {
1441: idx = 2 * (*vi++);
1442: x1 = t[idx];
1443: x2 = t[1 + idx];
1444: s1 -= v[0] * x1 + v[2] * x2;
1445: s2 -= v[1] * x1 + v[3] * x2;
1446: v += 4;
1447: }
1448: idc = 2 * (*c--);
1449: v = aa + 4 * diag[i];
1450: x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1451: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1452: }
1453: PetscCall(ISRestoreIndices(isrow, &rout));
1454: PetscCall(ISRestoreIndices(iscol, &cout));
1455: PetscCall(VecRestoreArrayRead(bb, &b));
1456: PetscCall(VecRestoreArray(xx, &x));
1457: PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1458: PetscFunctionReturn(PETSC_SUCCESS);
1459: }
1461: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1462: {
1463: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1464: IS iscol = a->col, isrow = a->row;
1465: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1466: PetscInt i, nz, idx, jdx, idt, idc, m;
1467: const PetscInt *r, *c, *rout, *cout;
1468: const MatScalar *aa = a->a, *v;
1469: PetscScalar *x, s1, s2, x1, x2, *t;
1470: const PetscScalar *b;
1472: PetscFunctionBegin;
1473: PetscCall(VecGetArrayRead(bb, &b));
1474: PetscCall(VecGetArray(xx, &x));
1475: t = a->solve_work;
1477: PetscCall(ISGetIndices(isrow, &rout));
1478: r = rout;
1479: PetscCall(ISGetIndices(iscol, &cout));
1480: c = cout;
1482: /* forward solve the lower triangular */
1483: idx = 2 * r[0];
1484: t[0] = b[idx];
1485: t[1] = b[1 + idx];
1486: for (i = 1; i < n; i++) {
1487: v = aa + 4 * ai[i];
1488: vi = aj + ai[i];
1489: nz = ai[i + 1] - ai[i];
1490: idx = 2 * r[i];
1491: s1 = b[idx];
1492: s2 = b[1 + idx];
1493: for (m = 0; m < nz; m++) {
1494: jdx = 2 * vi[m];
1495: x1 = t[jdx];
1496: x2 = t[1 + jdx];
1497: s1 -= v[0] * x1 + v[2] * x2;
1498: s2 -= v[1] * x1 + v[3] * x2;
1499: v += 4;
1500: }
1501: idx = 2 * i;
1502: t[idx] = s1;
1503: t[1 + idx] = s2;
1504: }
1505: /* backward solve the upper triangular */
1506: for (i = n - 1; i >= 0; i--) {
1507: v = aa + 4 * (adiag[i + 1] + 1);
1508: vi = aj + adiag[i + 1] + 1;
1509: nz = adiag[i] - adiag[i + 1] - 1;
1510: idt = 2 * i;
1511: s1 = t[idt];
1512: s2 = t[1 + idt];
1513: for (m = 0; m < nz; m++) {
1514: idx = 2 * vi[m];
1515: x1 = t[idx];
1516: x2 = t[1 + idx];
1517: s1 -= v[0] * x1 + v[2] * x2;
1518: s2 -= v[1] * x1 + v[3] * x2;
1519: v += 4;
1520: }
1521: idc = 2 * c[i];
1522: x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1523: x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1524: }
1525: PetscCall(ISRestoreIndices(isrow, &rout));
1526: PetscCall(ISRestoreIndices(iscol, &cout));
1527: PetscCall(VecRestoreArrayRead(bb, &b));
1528: PetscCall(VecRestoreArray(xx, &x));
1529: PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1530: PetscFunctionReturn(PETSC_SUCCESS);
1531: }
1533: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1534: {
1535: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1536: IS iscol = a->col, isrow = a->row;
1537: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1538: PetscInt i, nz;
1539: const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1540: const MatScalar *aa = a->a, *v;
1541: PetscScalar *x, s1, *t;
1542: const PetscScalar *b;
1544: PetscFunctionBegin;
1545: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1547: PetscCall(VecGetArrayRead(bb, &b));
1548: PetscCall(VecGetArray(xx, &x));
1549: t = a->solve_work;
1551: PetscCall(ISGetIndices(isrow, &rout));
1552: r = rout;
1553: PetscCall(ISGetIndices(iscol, &cout));
1554: c = cout + (n - 1);
1556: /* forward solve the lower triangular */
1557: t[0] = b[*r++];
1558: for (i = 1; i < n; i++) {
1559: v = aa + ai[i];
1560: vi = aj + ai[i];
1561: nz = diag[i] - ai[i];
1562: s1 = b[*r++];
1563: while (nz--) s1 -= (*v++) * t[*vi++];
1564: t[i] = s1;
1565: }
1566: /* backward solve the upper triangular */
1567: for (i = n - 1; i >= 0; i--) {
1568: v = aa + diag[i] + 1;
1569: vi = aj + diag[i] + 1;
1570: nz = ai[i + 1] - diag[i] - 1;
1571: s1 = t[i];
1572: while (nz--) s1 -= (*v++) * t[*vi++];
1573: x[*c--] = t[i] = aa[diag[i]] * s1;
1574: }
1576: PetscCall(ISRestoreIndices(isrow, &rout));
1577: PetscCall(ISRestoreIndices(iscol, &cout));
1578: PetscCall(VecRestoreArrayRead(bb, &b));
1579: PetscCall(VecRestoreArray(xx, &x));
1580: PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
1581: PetscFunctionReturn(PETSC_SUCCESS);
1582: }
1584: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1585: {
1586: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1587: IS iscol = a->col, isrow = a->row;
1588: PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1589: const PetscInt *rout, *cout, *r, *c;
1590: PetscScalar *x, *tmp, sum;
1591: const PetscScalar *b;
1592: const MatScalar *aa = a->a, *v;
1594: PetscFunctionBegin;
1595: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1597: PetscCall(VecGetArrayRead(bb, &b));
1598: PetscCall(VecGetArray(xx, &x));
1599: tmp = a->solve_work;
1601: PetscCall(ISGetIndices(isrow, &rout));
1602: r = rout;
1603: PetscCall(ISGetIndices(iscol, &cout));
1604: c = cout;
1606: /* forward solve the lower triangular */
1607: tmp[0] = b[r[0]];
1608: v = aa;
1609: vi = aj;
1610: for (i = 1; i < n; i++) {
1611: nz = ai[i + 1] - ai[i];
1612: sum = b[r[i]];
1613: PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1614: tmp[i] = sum;
1615: v += nz;
1616: vi += nz;
1617: }
1619: /* backward solve the upper triangular */
1620: for (i = n - 1; i >= 0; i--) {
1621: v = aa + adiag[i + 1] + 1;
1622: vi = aj + adiag[i + 1] + 1;
1623: nz = adiag[i] - adiag[i + 1] - 1;
1624: sum = tmp[i];
1625: PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1626: x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1627: }
1629: PetscCall(ISRestoreIndices(isrow, &rout));
1630: PetscCall(ISRestoreIndices(iscol, &cout));
1631: PetscCall(VecRestoreArrayRead(bb, &b));
1632: PetscCall(VecRestoreArray(xx, &x));
1633: PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1634: PetscFunctionReturn(PETSC_SUCCESS);
1635: }