Actual source code: baijsolv.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
  5: {
  6:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  7:   IS                 iscol = a->col, isrow = a->row;
  8:   const PetscInt    *r, *c, *rout, *cout;
  9:   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
 10:   PetscInt           i, nz;
 11:   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
 12:   const MatScalar   *aa = a->a, *v;
 13:   PetscScalar       *x, *s, *t, *ls;
 14:   const PetscScalar *b;

 16:   PetscFunctionBegin;
 17:   PetscCall(VecGetArrayRead(bb, &b));
 18:   PetscCall(VecGetArray(xx, &x));
 19:   t = a->solve_work;

 21:   PetscCall(ISGetIndices(isrow, &rout));
 22:   r = rout;
 23:   PetscCall(ISGetIndices(iscol, &cout));
 24:   c = cout + (n - 1);

 26:   /* forward solve the lower triangular */
 27:   PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
 28:   for (i = 1; i < n; i++) {
 29:     v  = aa + bs2 * ai[i];
 30:     vi = aj + ai[i];
 31:     nz = a->diag[i] - ai[i];
 32:     s  = t + bs * i;
 33:     PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
 34:     while (nz--) {
 35:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
 36:       v += bs2;
 37:     }
 38:   }
 39:   /* backward solve the upper triangular */
 40:   ls = a->solve_work + A->cmap->n;
 41:   for (i = n - 1; i >= 0; i--) {
 42:     v  = aa + bs2 * (a->diag[i] + 1);
 43:     vi = aj + a->diag[i] + 1;
 44:     nz = ai[i + 1] - a->diag[i] - 1;
 45:     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
 46:     while (nz--) {
 47:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
 48:       v += bs2;
 49:     }
 50:     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
 51:     PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
 52:   }

 54:   PetscCall(ISRestoreIndices(isrow, &rout));
 55:   PetscCall(ISRestoreIndices(iscol, &cout));
 56:   PetscCall(VecRestoreArrayRead(bb, &b));
 57:   PetscCall(VecRestoreArray(xx, &x));
 58:   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
 59:   PetscFunctionReturn(PETSC_SUCCESS);
 60: }

 62: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
 63: {
 64:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
 65:   IS                 iscol = a->col, isrow = a->row;
 66:   const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
 67:   const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
 68:   PetscInt           i, nz, idx, idt, idc;
 69:   const MatScalar   *aa = a->a, *v;
 70:   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
 71:   const PetscScalar *b;

 73:   PetscFunctionBegin;
 74:   PetscCall(VecGetArrayRead(bb, &b));
 75:   PetscCall(VecGetArray(xx, &x));
 76:   t = a->solve_work;

 78:   PetscCall(ISGetIndices(isrow, &rout));
 79:   r = rout;
 80:   PetscCall(ISGetIndices(iscol, &cout));
 81:   c = cout + (n - 1);

 83:   /* forward solve the lower triangular */
 84:   idx  = 7 * (*r++);
 85:   t[0] = b[idx];
 86:   t[1] = b[1 + idx];
 87:   t[2] = b[2 + idx];
 88:   t[3] = b[3 + idx];
 89:   t[4] = b[4 + idx];
 90:   t[5] = b[5 + idx];
 91:   t[6] = b[6 + idx];

 93:   for (i = 1; i < n; i++) {
 94:     v   = aa + 49 * ai[i];
 95:     vi  = aj + ai[i];
 96:     nz  = diag[i] - ai[i];
 97:     idx = 7 * (*r++);
 98:     s1  = b[idx];
 99:     s2  = b[1 + idx];
100:     s3  = b[2 + idx];
101:     s4  = b[3 + idx];
102:     s5  = b[4 + idx];
103:     s6  = b[5 + idx];
104:     s7  = b[6 + idx];
105:     while (nz--) {
106:       idx = 7 * (*vi++);
107:       x1  = t[idx];
108:       x2  = t[1 + idx];
109:       x3  = t[2 + idx];
110:       x4  = t[3 + idx];
111:       x5  = t[4 + idx];
112:       x6  = t[5 + idx];
113:       x7  = t[6 + idx];
114:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
115:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
116:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
117:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
118:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
119:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
120:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
121:       v += 49;
122:     }
123:     idx        = 7 * i;
124:     t[idx]     = s1;
125:     t[1 + idx] = s2;
126:     t[2 + idx] = s3;
127:     t[3 + idx] = s4;
128:     t[4 + idx] = s5;
129:     t[5 + idx] = s6;
130:     t[6 + idx] = s7;
131:   }
132:   /* backward solve the upper triangular */
133:   for (i = n - 1; i >= 0; i--) {
134:     v   = aa + 49 * diag[i] + 49;
135:     vi  = aj + diag[i] + 1;
136:     nz  = ai[i + 1] - diag[i] - 1;
137:     idt = 7 * i;
138:     s1  = t[idt];
139:     s2  = t[1 + idt];
140:     s3  = t[2 + idt];
141:     s4  = t[3 + idt];
142:     s5  = t[4 + idt];
143:     s6  = t[5 + idt];
144:     s7  = t[6 + idt];
145:     while (nz--) {
146:       idx = 7 * (*vi++);
147:       x1  = t[idx];
148:       x2  = t[1 + idx];
149:       x3  = t[2 + idx];
150:       x4  = t[3 + idx];
151:       x5  = t[4 + idx];
152:       x6  = t[5 + idx];
153:       x7  = t[6 + idx];
154:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
155:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
156:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
157:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
158:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
159:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
160:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
161:       v += 49;
162:     }
163:     idc    = 7 * (*c--);
164:     v      = aa + 49 * diag[i];
165:     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
166:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
167:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
168:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
169:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
170:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
171:     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
172:   }

174:   PetscCall(ISRestoreIndices(isrow, &rout));
175:   PetscCall(ISRestoreIndices(iscol, &cout));
176:   PetscCall(VecRestoreArrayRead(bb, &b));
177:   PetscCall(VecRestoreArray(xx, &x));
178:   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
179:   PetscFunctionReturn(PETSC_SUCCESS);
180: }

182: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
183: {
184:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
185:   IS                 iscol = a->col, isrow = a->row;
186:   const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
187:   const PetscInt     n = a->mbs, *rout, *cout, *vi;
188:   PetscInt           i, nz, idx, idt, idc, m;
189:   const MatScalar   *aa = a->a, *v;
190:   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
191:   const PetscScalar *b;

193:   PetscFunctionBegin;
194:   PetscCall(VecGetArrayRead(bb, &b));
195:   PetscCall(VecGetArray(xx, &x));
196:   t = a->solve_work;

198:   PetscCall(ISGetIndices(isrow, &rout));
199:   r = rout;
200:   PetscCall(ISGetIndices(iscol, &cout));
201:   c = cout;

203:   /* forward solve the lower triangular */
204:   idx  = 7 * r[0];
205:   t[0] = b[idx];
206:   t[1] = b[1 + idx];
207:   t[2] = b[2 + idx];
208:   t[3] = b[3 + idx];
209:   t[4] = b[4 + idx];
210:   t[5] = b[5 + idx];
211:   t[6] = b[6 + idx];

213:   for (i = 1; i < n; i++) {
214:     v   = aa + 49 * ai[i];
215:     vi  = aj + ai[i];
216:     nz  = ai[i + 1] - ai[i];
217:     idx = 7 * r[i];
218:     s1  = b[idx];
219:     s2  = b[1 + idx];
220:     s3  = b[2 + idx];
221:     s4  = b[3 + idx];
222:     s5  = b[4 + idx];
223:     s6  = b[5 + idx];
224:     s7  = b[6 + idx];
225:     for (m = 0; m < nz; m++) {
226:       idx = 7 * vi[m];
227:       x1  = t[idx];
228:       x2  = t[1 + idx];
229:       x3  = t[2 + idx];
230:       x4  = t[3 + idx];
231:       x5  = t[4 + idx];
232:       x6  = t[5 + idx];
233:       x7  = t[6 + idx];
234:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
235:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
236:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
237:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
238:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
239:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
240:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
241:       v += 49;
242:     }
243:     idx        = 7 * i;
244:     t[idx]     = s1;
245:     t[1 + idx] = s2;
246:     t[2 + idx] = s3;
247:     t[3 + idx] = s4;
248:     t[4 + idx] = s5;
249:     t[5 + idx] = s6;
250:     t[6 + idx] = s7;
251:   }
252:   /* backward solve the upper triangular */
253:   for (i = n - 1; i >= 0; i--) {
254:     v   = aa + 49 * (adiag[i + 1] + 1);
255:     vi  = aj + adiag[i + 1] + 1;
256:     nz  = adiag[i] - adiag[i + 1] - 1;
257:     idt = 7 * i;
258:     s1  = t[idt];
259:     s2  = t[1 + idt];
260:     s3  = t[2 + idt];
261:     s4  = t[3 + idt];
262:     s5  = t[4 + idt];
263:     s6  = t[5 + idt];
264:     s7  = t[6 + idt];
265:     for (m = 0; m < nz; m++) {
266:       idx = 7 * vi[m];
267:       x1  = t[idx];
268:       x2  = t[1 + idx];
269:       x3  = t[2 + idx];
270:       x4  = t[3 + idx];
271:       x5  = t[4 + idx];
272:       x6  = t[5 + idx];
273:       x7  = t[6 + idx];
274:       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
275:       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
276:       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
277:       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
278:       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
279:       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
280:       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
281:       v += 49;
282:     }
283:     idc    = 7 * c[i];
284:     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
285:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
286:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
287:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
288:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
289:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
290:     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
291:   }

293:   PetscCall(ISRestoreIndices(isrow, &rout));
294:   PetscCall(ISRestoreIndices(iscol, &cout));
295:   PetscCall(VecRestoreArrayRead(bb, &b));
296:   PetscCall(VecRestoreArray(xx, &x));
297:   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
298:   PetscFunctionReturn(PETSC_SUCCESS);
299: }

301: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
302: {
303:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
304:   IS                 iscol = a->col, isrow = a->row;
305:   const PetscInt    *r, *c, *rout, *cout;
306:   const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
307:   PetscInt           i, nz, idx, idt, idc;
308:   const MatScalar   *aa = a->a, *v;
309:   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
310:   const PetscScalar *b;

312:   PetscFunctionBegin;
313:   PetscCall(VecGetArrayRead(bb, &b));
314:   PetscCall(VecGetArray(xx, &x));
315:   t = a->solve_work;

317:   PetscCall(ISGetIndices(isrow, &rout));
318:   r = rout;
319:   PetscCall(ISGetIndices(iscol, &cout));
320:   c = cout + (n - 1);

322:   /* forward solve the lower triangular */
323:   idx  = 6 * (*r++);
324:   t[0] = b[idx];
325:   t[1] = b[1 + idx];
326:   t[2] = b[2 + idx];
327:   t[3] = b[3 + idx];
328:   t[4] = b[4 + idx];
329:   t[5] = b[5 + idx];
330:   for (i = 1; i < n; i++) {
331:     v   = aa + 36 * ai[i];
332:     vi  = aj + ai[i];
333:     nz  = diag[i] - ai[i];
334:     idx = 6 * (*r++);
335:     s1  = b[idx];
336:     s2  = b[1 + idx];
337:     s3  = b[2 + idx];
338:     s4  = b[3 + idx];
339:     s5  = b[4 + idx];
340:     s6  = b[5 + idx];
341:     while (nz--) {
342:       idx = 6 * (*vi++);
343:       x1  = t[idx];
344:       x2  = t[1 + idx];
345:       x3  = t[2 + idx];
346:       x4  = t[3 + idx];
347:       x5  = t[4 + idx];
348:       x6  = t[5 + idx];
349:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
350:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
351:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
352:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
353:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
354:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
355:       v += 36;
356:     }
357:     idx        = 6 * i;
358:     t[idx]     = s1;
359:     t[1 + idx] = s2;
360:     t[2 + idx] = s3;
361:     t[3 + idx] = s4;
362:     t[4 + idx] = s5;
363:     t[5 + idx] = s6;
364:   }
365:   /* backward solve the upper triangular */
366:   for (i = n - 1; i >= 0; i--) {
367:     v   = aa + 36 * diag[i] + 36;
368:     vi  = aj + diag[i] + 1;
369:     nz  = ai[i + 1] - diag[i] - 1;
370:     idt = 6 * i;
371:     s1  = t[idt];
372:     s2  = t[1 + idt];
373:     s3  = t[2 + idt];
374:     s4  = t[3 + idt];
375:     s5  = t[4 + idt];
376:     s6  = t[5 + idt];
377:     while (nz--) {
378:       idx = 6 * (*vi++);
379:       x1  = t[idx];
380:       x2  = t[1 + idx];
381:       x3  = t[2 + idx];
382:       x4  = t[3 + idx];
383:       x5  = t[4 + idx];
384:       x6  = t[5 + idx];
385:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
386:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
387:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
388:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
389:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
390:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
391:       v += 36;
392:     }
393:     idc    = 6 * (*c--);
394:     v      = aa + 36 * diag[i];
395:     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
396:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
397:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
398:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
399:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
400:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
401:   }

403:   PetscCall(ISRestoreIndices(isrow, &rout));
404:   PetscCall(ISRestoreIndices(iscol, &cout));
405:   PetscCall(VecRestoreArrayRead(bb, &b));
406:   PetscCall(VecRestoreArray(xx, &x));
407:   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
408:   PetscFunctionReturn(PETSC_SUCCESS);
409: }

411: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
412: {
413:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
414:   IS                 iscol = a->col, isrow = a->row;
415:   const PetscInt    *r, *c, *rout, *cout;
416:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
417:   PetscInt           i, nz, idx, idt, idc, m;
418:   const MatScalar   *aa = a->a, *v;
419:   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
420:   const PetscScalar *b;

422:   PetscFunctionBegin;
423:   PetscCall(VecGetArrayRead(bb, &b));
424:   PetscCall(VecGetArray(xx, &x));
425:   t = a->solve_work;

427:   PetscCall(ISGetIndices(isrow, &rout));
428:   r = rout;
429:   PetscCall(ISGetIndices(iscol, &cout));
430:   c = cout;

432:   /* forward solve the lower triangular */
433:   idx  = 6 * r[0];
434:   t[0] = b[idx];
435:   t[1] = b[1 + idx];
436:   t[2] = b[2 + idx];
437:   t[3] = b[3 + idx];
438:   t[4] = b[4 + idx];
439:   t[5] = b[5 + idx];
440:   for (i = 1; i < n; i++) {
441:     v   = aa + 36 * ai[i];
442:     vi  = aj + ai[i];
443:     nz  = ai[i + 1] - ai[i];
444:     idx = 6 * r[i];
445:     s1  = b[idx];
446:     s2  = b[1 + idx];
447:     s3  = b[2 + idx];
448:     s4  = b[3 + idx];
449:     s5  = b[4 + idx];
450:     s6  = b[5 + idx];
451:     for (m = 0; m < nz; m++) {
452:       idx = 6 * vi[m];
453:       x1  = t[idx];
454:       x2  = t[1 + idx];
455:       x3  = t[2 + idx];
456:       x4  = t[3 + idx];
457:       x5  = t[4 + idx];
458:       x6  = t[5 + idx];
459:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
460:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
461:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
462:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
463:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
464:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
465:       v += 36;
466:     }
467:     idx        = 6 * i;
468:     t[idx]     = s1;
469:     t[1 + idx] = s2;
470:     t[2 + idx] = s3;
471:     t[3 + idx] = s4;
472:     t[4 + idx] = s5;
473:     t[5 + idx] = s6;
474:   }
475:   /* backward solve the upper triangular */
476:   for (i = n - 1; i >= 0; i--) {
477:     v   = aa + 36 * (adiag[i + 1] + 1);
478:     vi  = aj + adiag[i + 1] + 1;
479:     nz  = adiag[i] - adiag[i + 1] - 1;
480:     idt = 6 * i;
481:     s1  = t[idt];
482:     s2  = t[1 + idt];
483:     s3  = t[2 + idt];
484:     s4  = t[3 + idt];
485:     s5  = t[4 + idt];
486:     s6  = t[5 + idt];
487:     for (m = 0; m < nz; m++) {
488:       idx = 6 * vi[m];
489:       x1  = t[idx];
490:       x2  = t[1 + idx];
491:       x3  = t[2 + idx];
492:       x4  = t[3 + idx];
493:       x5  = t[4 + idx];
494:       x6  = t[5 + idx];
495:       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
496:       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
497:       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
498:       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
499:       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
500:       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
501:       v += 36;
502:     }
503:     idc    = 6 * c[i];
504:     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
505:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
506:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
507:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
508:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
509:     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
510:   }

512:   PetscCall(ISRestoreIndices(isrow, &rout));
513:   PetscCall(ISRestoreIndices(iscol, &cout));
514:   PetscCall(VecRestoreArrayRead(bb, &b));
515:   PetscCall(VecRestoreArray(xx, &x));
516:   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
517:   PetscFunctionReturn(PETSC_SUCCESS);
518: }

520: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
521: {
522:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
523:   IS                 iscol = a->col, isrow = a->row;
524:   const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
525:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
526:   PetscInt           i, nz, idx, idt, idc;
527:   const MatScalar   *aa = a->a, *v;
528:   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
529:   const PetscScalar *b;

531:   PetscFunctionBegin;
532:   PetscCall(VecGetArrayRead(bb, &b));
533:   PetscCall(VecGetArray(xx, &x));
534:   t = a->solve_work;

536:   PetscCall(ISGetIndices(isrow, &rout));
537:   r = rout;
538:   PetscCall(ISGetIndices(iscol, &cout));
539:   c = cout + (n - 1);

541:   /* forward solve the lower triangular */
542:   idx  = 5 * (*r++);
543:   t[0] = b[idx];
544:   t[1] = b[1 + idx];
545:   t[2] = b[2 + idx];
546:   t[3] = b[3 + idx];
547:   t[4] = b[4 + idx];
548:   for (i = 1; i < n; i++) {
549:     v   = aa + 25 * ai[i];
550:     vi  = aj + ai[i];
551:     nz  = diag[i] - ai[i];
552:     idx = 5 * (*r++);
553:     s1  = b[idx];
554:     s2  = b[1 + idx];
555:     s3  = b[2 + idx];
556:     s4  = b[3 + idx];
557:     s5  = b[4 + idx];
558:     while (nz--) {
559:       idx = 5 * (*vi++);
560:       x1  = t[idx];
561:       x2  = t[1 + idx];
562:       x3  = t[2 + idx];
563:       x4  = t[3 + idx];
564:       x5  = t[4 + idx];
565:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
566:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
567:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
568:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
569:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
570:       v += 25;
571:     }
572:     idx        = 5 * i;
573:     t[idx]     = s1;
574:     t[1 + idx] = s2;
575:     t[2 + idx] = s3;
576:     t[3 + idx] = s4;
577:     t[4 + idx] = s5;
578:   }
579:   /* backward solve the upper triangular */
580:   for (i = n - 1; i >= 0; i--) {
581:     v   = aa + 25 * diag[i] + 25;
582:     vi  = aj + diag[i] + 1;
583:     nz  = ai[i + 1] - diag[i] - 1;
584:     idt = 5 * i;
585:     s1  = t[idt];
586:     s2  = t[1 + idt];
587:     s3  = t[2 + idt];
588:     s4  = t[3 + idt];
589:     s5  = t[4 + idt];
590:     while (nz--) {
591:       idx = 5 * (*vi++);
592:       x1  = t[idx];
593:       x2  = t[1 + idx];
594:       x3  = t[2 + idx];
595:       x4  = t[3 + idx];
596:       x5  = t[4 + idx];
597:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
598:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
599:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
600:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
601:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
602:       v += 25;
603:     }
604:     idc    = 5 * (*c--);
605:     v      = aa + 25 * diag[i];
606:     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
607:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
608:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
609:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
610:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
611:   }

613:   PetscCall(ISRestoreIndices(isrow, &rout));
614:   PetscCall(ISRestoreIndices(iscol, &cout));
615:   PetscCall(VecRestoreArrayRead(bb, &b));
616:   PetscCall(VecRestoreArray(xx, &x));
617:   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
618:   PetscFunctionReturn(PETSC_SUCCESS);
619: }

621: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
622: {
623:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
624:   IS                 iscol = a->col, isrow = a->row;
625:   const PetscInt    *r, *c, *rout, *cout;
626:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
627:   PetscInt           i, nz, idx, idt, idc, m;
628:   const MatScalar   *aa = a->a, *v;
629:   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
630:   const PetscScalar *b;

632:   PetscFunctionBegin;
633:   PetscCall(VecGetArrayRead(bb, &b));
634:   PetscCall(VecGetArray(xx, &x));
635:   t = a->solve_work;

637:   PetscCall(ISGetIndices(isrow, &rout));
638:   r = rout;
639:   PetscCall(ISGetIndices(iscol, &cout));
640:   c = cout;

642:   /* forward solve the lower triangular */
643:   idx  = 5 * r[0];
644:   t[0] = b[idx];
645:   t[1] = b[1 + idx];
646:   t[2] = b[2 + idx];
647:   t[3] = b[3 + idx];
648:   t[4] = b[4 + idx];
649:   for (i = 1; i < n; i++) {
650:     v   = aa + 25 * ai[i];
651:     vi  = aj + ai[i];
652:     nz  = ai[i + 1] - ai[i];
653:     idx = 5 * r[i];
654:     s1  = b[idx];
655:     s2  = b[1 + idx];
656:     s3  = b[2 + idx];
657:     s4  = b[3 + idx];
658:     s5  = b[4 + idx];
659:     for (m = 0; m < nz; m++) {
660:       idx = 5 * vi[m];
661:       x1  = t[idx];
662:       x2  = t[1 + idx];
663:       x3  = t[2 + idx];
664:       x4  = t[3 + idx];
665:       x5  = t[4 + idx];
666:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
667:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
668:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
669:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
670:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
671:       v += 25;
672:     }
673:     idx        = 5 * i;
674:     t[idx]     = s1;
675:     t[1 + idx] = s2;
676:     t[2 + idx] = s3;
677:     t[3 + idx] = s4;
678:     t[4 + idx] = s5;
679:   }
680:   /* backward solve the upper triangular */
681:   for (i = n - 1; i >= 0; i--) {
682:     v   = aa + 25 * (adiag[i + 1] + 1);
683:     vi  = aj + adiag[i + 1] + 1;
684:     nz  = adiag[i] - adiag[i + 1] - 1;
685:     idt = 5 * i;
686:     s1  = t[idt];
687:     s2  = t[1 + idt];
688:     s3  = t[2 + idt];
689:     s4  = t[3 + idt];
690:     s5  = t[4 + idt];
691:     for (m = 0; m < nz; m++) {
692:       idx = 5 * vi[m];
693:       x1  = t[idx];
694:       x2  = t[1 + idx];
695:       x3  = t[2 + idx];
696:       x4  = t[3 + idx];
697:       x5  = t[4 + idx];
698:       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
699:       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
700:       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
701:       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
702:       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
703:       v += 25;
704:     }
705:     idc    = 5 * c[i];
706:     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
707:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
708:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
709:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
710:     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
711:   }

713:   PetscCall(ISRestoreIndices(isrow, &rout));
714:   PetscCall(ISRestoreIndices(iscol, &cout));
715:   PetscCall(VecRestoreArrayRead(bb, &b));
716:   PetscCall(VecRestoreArray(xx, &x));
717:   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
718:   PetscFunctionReturn(PETSC_SUCCESS);
719: }

721: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
722: {
723:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
724:   IS                 iscol = a->col, isrow = a->row;
725:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
726:   PetscInt           i, nz, idx, idt, idc;
727:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
728:   const MatScalar   *aa = a->a, *v;
729:   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
730:   const PetscScalar *b;

732:   PetscFunctionBegin;
733:   PetscCall(VecGetArrayRead(bb, &b));
734:   PetscCall(VecGetArray(xx, &x));
735:   t = a->solve_work;

737:   PetscCall(ISGetIndices(isrow, &rout));
738:   r = rout;
739:   PetscCall(ISGetIndices(iscol, &cout));
740:   c = cout + (n - 1);

742:   /* forward solve the lower triangular */
743:   idx  = 4 * (*r++);
744:   t[0] = b[idx];
745:   t[1] = b[1 + idx];
746:   t[2] = b[2 + idx];
747:   t[3] = b[3 + idx];
748:   for (i = 1; i < n; i++) {
749:     v   = aa + 16 * ai[i];
750:     vi  = aj + ai[i];
751:     nz  = diag[i] - ai[i];
752:     idx = 4 * (*r++);
753:     s1  = b[idx];
754:     s2  = b[1 + idx];
755:     s3  = b[2 + idx];
756:     s4  = b[3 + idx];
757:     while (nz--) {
758:       idx = 4 * (*vi++);
759:       x1  = t[idx];
760:       x2  = t[1 + idx];
761:       x3  = t[2 + idx];
762:       x4  = t[3 + idx];
763:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
764:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
765:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
766:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
767:       v += 16;
768:     }
769:     idx        = 4 * i;
770:     t[idx]     = s1;
771:     t[1 + idx] = s2;
772:     t[2 + idx] = s3;
773:     t[3 + idx] = s4;
774:   }
775:   /* backward solve the upper triangular */
776:   for (i = n - 1; i >= 0; i--) {
777:     v   = aa + 16 * diag[i] + 16;
778:     vi  = aj + diag[i] + 1;
779:     nz  = ai[i + 1] - diag[i] - 1;
780:     idt = 4 * i;
781:     s1  = t[idt];
782:     s2  = t[1 + idt];
783:     s3  = t[2 + idt];
784:     s4  = t[3 + idt];
785:     while (nz--) {
786:       idx = 4 * (*vi++);
787:       x1  = t[idx];
788:       x2  = t[1 + idx];
789:       x3  = t[2 + idx];
790:       x4  = t[3 + idx];
791:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
792:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
793:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
794:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
795:       v += 16;
796:     }
797:     idc    = 4 * (*c--);
798:     v      = aa + 16 * diag[i];
799:     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
800:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
801:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
802:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
803:   }

805:   PetscCall(ISRestoreIndices(isrow, &rout));
806:   PetscCall(ISRestoreIndices(iscol, &cout));
807:   PetscCall(VecRestoreArrayRead(bb, &b));
808:   PetscCall(VecRestoreArray(xx, &x));
809:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
810:   PetscFunctionReturn(PETSC_SUCCESS);
811: }

813: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
814: {
815:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
816:   IS                 iscol = a->col, isrow = a->row;
817:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
818:   PetscInt           i, nz, idx, idt, idc, m;
819:   const PetscInt    *r, *c, *rout, *cout;
820:   const MatScalar   *aa = a->a, *v;
821:   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
822:   const PetscScalar *b;

824:   PetscFunctionBegin;
825:   PetscCall(VecGetArrayRead(bb, &b));
826:   PetscCall(VecGetArray(xx, &x));
827:   t = a->solve_work;

829:   PetscCall(ISGetIndices(isrow, &rout));
830:   r = rout;
831:   PetscCall(ISGetIndices(iscol, &cout));
832:   c = cout;

834:   /* forward solve the lower triangular */
835:   idx  = 4 * r[0];
836:   t[0] = b[idx];
837:   t[1] = b[1 + idx];
838:   t[2] = b[2 + idx];
839:   t[3] = b[3 + idx];
840:   for (i = 1; i < n; i++) {
841:     v   = aa + 16 * ai[i];
842:     vi  = aj + ai[i];
843:     nz  = ai[i + 1] - ai[i];
844:     idx = 4 * r[i];
845:     s1  = b[idx];
846:     s2  = b[1 + idx];
847:     s3  = b[2 + idx];
848:     s4  = b[3 + idx];
849:     for (m = 0; m < nz; m++) {
850:       idx = 4 * vi[m];
851:       x1  = t[idx];
852:       x2  = t[1 + idx];
853:       x3  = t[2 + idx];
854:       x4  = t[3 + idx];
855:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
856:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
857:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
858:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
859:       v += 16;
860:     }
861:     idx        = 4 * i;
862:     t[idx]     = s1;
863:     t[1 + idx] = s2;
864:     t[2 + idx] = s3;
865:     t[3 + idx] = s4;
866:   }
867:   /* backward solve the upper triangular */
868:   for (i = n - 1; i >= 0; i--) {
869:     v   = aa + 16 * (adiag[i + 1] + 1);
870:     vi  = aj + adiag[i + 1] + 1;
871:     nz  = adiag[i] - adiag[i + 1] - 1;
872:     idt = 4 * i;
873:     s1  = t[idt];
874:     s2  = t[1 + idt];
875:     s3  = t[2 + idt];
876:     s4  = t[3 + idt];
877:     for (m = 0; m < nz; m++) {
878:       idx = 4 * vi[m];
879:       x1  = t[idx];
880:       x2  = t[1 + idx];
881:       x3  = t[2 + idx];
882:       x4  = t[3 + idx];
883:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
884:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
885:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
886:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
887:       v += 16;
888:     }
889:     idc    = 4 * c[i];
890:     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
891:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
892:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
893:     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
894:   }

896:   PetscCall(ISRestoreIndices(isrow, &rout));
897:   PetscCall(ISRestoreIndices(iscol, &cout));
898:   PetscCall(VecRestoreArrayRead(bb, &b));
899:   PetscCall(VecRestoreArray(xx, &x));
900:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
901:   PetscFunctionReturn(PETSC_SUCCESS);
902: }

904: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx)
905: {
906:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
907:   IS                 iscol = a->col, isrow = a->row;
908:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
909:   PetscInt           i, nz, idx, idt, idc;
910:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
911:   const MatScalar   *aa = a->a, *v;
912:   MatScalar          s1, s2, s3, s4, x1, x2, x3, x4, *t;
913:   PetscScalar       *x;
914:   const PetscScalar *b;

916:   PetscFunctionBegin;
917:   PetscCall(VecGetArrayRead(bb, &b));
918:   PetscCall(VecGetArray(xx, &x));
919:   t = (MatScalar *)a->solve_work;

921:   PetscCall(ISGetIndices(isrow, &rout));
922:   r = rout;
923:   PetscCall(ISGetIndices(iscol, &cout));
924:   c = cout + (n - 1);

926:   /* forward solve the lower triangular */
927:   idx  = 4 * (*r++);
928:   t[0] = (MatScalar)b[idx];
929:   t[1] = (MatScalar)b[1 + idx];
930:   t[2] = (MatScalar)b[2 + idx];
931:   t[3] = (MatScalar)b[3 + idx];
932:   for (i = 1; i < n; i++) {
933:     v   = aa + 16 * ai[i];
934:     vi  = aj + ai[i];
935:     nz  = diag[i] - ai[i];
936:     idx = 4 * (*r++);
937:     s1  = (MatScalar)b[idx];
938:     s2  = (MatScalar)b[1 + idx];
939:     s3  = (MatScalar)b[2 + idx];
940:     s4  = (MatScalar)b[3 + idx];
941:     while (nz--) {
942:       idx = 4 * (*vi++);
943:       x1  = t[idx];
944:       x2  = t[1 + idx];
945:       x3  = t[2 + idx];
946:       x4  = t[3 + idx];
947:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
948:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
949:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
950:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
951:       v += 16;
952:     }
953:     idx        = 4 * i;
954:     t[idx]     = s1;
955:     t[1 + idx] = s2;
956:     t[2 + idx] = s3;
957:     t[3 + idx] = s4;
958:   }
959:   /* backward solve the upper triangular */
960:   for (i = n - 1; i >= 0; i--) {
961:     v   = aa + 16 * diag[i] + 16;
962:     vi  = aj + diag[i] + 1;
963:     nz  = ai[i + 1] - diag[i] - 1;
964:     idt = 4 * i;
965:     s1  = t[idt];
966:     s2  = t[1 + idt];
967:     s3  = t[2 + idt];
968:     s4  = t[3 + idt];
969:     while (nz--) {
970:       idx = 4 * (*vi++);
971:       x1  = t[idx];
972:       x2  = t[1 + idx];
973:       x3  = t[2 + idx];
974:       x4  = t[3 + idx];
975:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
976:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
977:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
978:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
979:       v += 16;
980:     }
981:     idc        = 4 * (*c--);
982:     v          = aa + 16 * diag[i];
983:     t[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
984:     t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
985:     t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
986:     t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
987:     x[idc]     = (PetscScalar)t[idt];
988:     x[1 + idc] = (PetscScalar)t[1 + idt];
989:     x[2 + idc] = (PetscScalar)t[2 + idt];
990:     x[3 + idc] = (PetscScalar)t[3 + idt];
991:   }

993:   PetscCall(ISRestoreIndices(isrow, &rout));
994:   PetscCall(ISRestoreIndices(iscol, &cout));
995:   PetscCall(VecRestoreArrayRead(bb, &b));
996:   PetscCall(VecRestoreArray(xx, &x));
997:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
998:   PetscFunctionReturn(PETSC_SUCCESS);
999: }

1001: #if defined(PETSC_HAVE_SSE)

1003:   #include PETSC_HAVE_SSE

1005: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx)
1006: {
1007:   /*
1008:      Note: This code uses demotion of double
1009:      to float when performing the mixed-mode computation.
1010:      This may not be numerically reasonable for all applications.
1011:   */
1012:   Mat_SeqBAIJ    *a     = (Mat_SeqBAIJ *)A->data;
1013:   IS              iscol = a->col, isrow = a->row;
1014:   PetscInt        i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
1015:   const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1016:   MatScalar      *aa = a->a, *v;
1017:   PetscScalar    *x, *b, *t;

1019:   /* Make space in temp stack for 16 Byte Aligned arrays */
1020:   float         ssealignedspace[11], *tmps, *tmpx;
1021:   unsigned long offset;

1023:   PetscFunctionBegin;
1024:   SSE_SCOPE_BEGIN;

1026:   offset = (unsigned long)ssealignedspace % 16;
1027:   if (offset) offset = (16 - offset) / 4;
1028:   tmps = &ssealignedspace[offset];
1029:   tmpx = &ssealignedspace[offset + 4];
1030:   PREFETCH_NTA(aa + 16 * ai[1]);

1032:   PetscCall(VecGetArray(bb, &b));
1033:   PetscCall(VecGetArray(xx, &x));
1034:   t = a->solve_work;

1036:   PetscCall(ISGetIndices(isrow, &rout));
1037:   r = rout;
1038:   PetscCall(ISGetIndices(iscol, &cout));
1039:   c = cout + (n - 1);

1041:   /* forward solve the lower triangular */
1042:   idx  = 4 * (*r++);
1043:   t[0] = b[idx];
1044:   t[1] = b[1 + idx];
1045:   t[2] = b[2 + idx];
1046:   t[3] = b[3 + idx];
1047:   v    = aa + 16 * ai[1];

1049:   for (i = 1; i < n;) {
1050:     PREFETCH_NTA(&v[8]);
1051:     vi  = aj + ai[i];
1052:     nz  = diag[i] - ai[i];
1053:     idx = 4 * (*r++);

1055:     /* Demote sum from double to float */
1056:     CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
1057:     LOAD_PS(tmps, XMM7);

1059:     while (nz--) {
1060:       PREFETCH_NTA(&v[16]);
1061:       idx = 4 * (*vi++);

1063:       /* Demote solution (so far) from double to float */
1064:       CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);

1066:       /* 4x4 Matrix-Vector product with negative accumulation: */
1067:       SSE_INLINE_BEGIN_2(tmpx, v)
1068:       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

1070:       /* First Column */
1071:       SSE_COPY_PS(XMM0, XMM6)
1072:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1073:       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1074:       SSE_SUB_PS(XMM7, XMM0)

1076:       /* Second Column */
1077:       SSE_COPY_PS(XMM1, XMM6)
1078:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1079:       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1080:       SSE_SUB_PS(XMM7, XMM1)

1082:       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

1084:       /* Third Column */
1085:       SSE_COPY_PS(XMM2, XMM6)
1086:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1087:       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1088:       SSE_SUB_PS(XMM7, XMM2)

1090:       /* Fourth Column */
1091:       SSE_COPY_PS(XMM3, XMM6)
1092:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1093:       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1094:       SSE_SUB_PS(XMM7, XMM3)
1095:       SSE_INLINE_END_2

1097:       v += 16;
1098:     }
1099:     idx = 4 * i;
1100:     v   = aa + 16 * ai[++i];
1101:     PREFETCH_NTA(v);
1102:     STORE_PS(tmps, XMM7);

1104:     /* Promote result from float to double */
1105:     CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
1106:   }
1107:   /* backward solve the upper triangular */
1108:   idt  = 4 * (n - 1);
1109:   ai16 = 16 * diag[n - 1];
1110:   v    = aa + ai16 + 16;
1111:   for (i = n - 1; i >= 0;) {
1112:     PREFETCH_NTA(&v[8]);
1113:     vi = aj + diag[i] + 1;
1114:     nz = ai[i + 1] - diag[i] - 1;

1116:     /* Demote accumulator from double to float */
1117:     CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
1118:     LOAD_PS(tmps, XMM7);

1120:     while (nz--) {
1121:       PREFETCH_NTA(&v[16]);
1122:       idx = 4 * (*vi++);

1124:       /* Demote solution (so far) from double to float */
1125:       CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);

1127:       /* 4x4 Matrix-Vector Product with negative accumulation: */
1128:       SSE_INLINE_BEGIN_2(tmpx, v)
1129:       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

1131:       /* First Column */
1132:       SSE_COPY_PS(XMM0, XMM6)
1133:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1134:       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1135:       SSE_SUB_PS(XMM7, XMM0)

1137:       /* Second Column */
1138:       SSE_COPY_PS(XMM1, XMM6)
1139:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1140:       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1141:       SSE_SUB_PS(XMM7, XMM1)

1143:       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

1145:       /* Third Column */
1146:       SSE_COPY_PS(XMM2, XMM6)
1147:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1148:       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1149:       SSE_SUB_PS(XMM7, XMM2)

1151:       /* Fourth Column */
1152:       SSE_COPY_PS(XMM3, XMM6)
1153:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1154:       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1155:       SSE_SUB_PS(XMM7, XMM3)
1156:       SSE_INLINE_END_2
1157:       v += 16;
1158:     }
1159:     v    = aa + ai16;
1160:     ai16 = 16 * diag[--i];
1161:     PREFETCH_NTA(aa + ai16 + 16);
1162:     /*
1163:        Scale the result by the diagonal 4x4 block,
1164:        which was inverted as part of the factorization
1165:     */
1166:     SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
1167:     /* First Column */
1168:     SSE_COPY_PS(XMM0, XMM7)
1169:     SSE_SHUFFLE(XMM0, XMM0, 0x00)
1170:     SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

1172:     /* Second Column */
1173:     SSE_COPY_PS(XMM1, XMM7)
1174:     SSE_SHUFFLE(XMM1, XMM1, 0x55)
1175:     SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
1176:     SSE_ADD_PS(XMM0, XMM1)

1178:     SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

1180:     /* Third Column */
1181:     SSE_COPY_PS(XMM2, XMM7)
1182:     SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1183:     SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
1184:     SSE_ADD_PS(XMM0, XMM2)

1186:     /* Fourth Column */
1187:     SSE_COPY_PS(XMM3, XMM7)
1188:     SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1189:     SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
1190:     SSE_ADD_PS(XMM0, XMM3)

1192:     SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
1193:     SSE_INLINE_END_3

1195:     /* Promote solution from float to double */
1196:     CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);

1198:     /* Apply reordering to t and stream into x.    */
1199:     /* This way, x doesn't pollute the cache.      */
1200:     /* Be careful with size: 2 doubles = 4 floats! */
1201:     idc = 4 * (*c--);
1202:     SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
1203:     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1204:     SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
1205:     SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
1206:     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1207:     SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
1208:     SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
1209:     SSE_INLINE_END_2
1210:     v = aa + ai16 + 16;
1211:     idt -= 4;
1212:   }

1214:   PetscCall(ISRestoreIndices(isrow, &rout));
1215:   PetscCall(ISRestoreIndices(iscol, &cout));
1216:   PetscCall(VecRestoreArray(bb, &b));
1217:   PetscCall(VecRestoreArray(xx, &x));
1218:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
1219:   SSE_SCOPE_END;
1220:   PetscFunctionReturn(PETSC_SUCCESS);
1221: }

1223: #endif

1225: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1226: {
1227:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1228:   IS                 iscol = a->col, isrow = a->row;
1229:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1230:   PetscInt           i, nz, idx, idt, idc;
1231:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1232:   const MatScalar   *aa = a->a, *v;
1233:   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1234:   const PetscScalar *b;

1236:   PetscFunctionBegin;
1237:   PetscCall(VecGetArrayRead(bb, &b));
1238:   PetscCall(VecGetArray(xx, &x));
1239:   t = a->solve_work;

1241:   PetscCall(ISGetIndices(isrow, &rout));
1242:   r = rout;
1243:   PetscCall(ISGetIndices(iscol, &cout));
1244:   c = cout + (n - 1);

1246:   /* forward solve the lower triangular */
1247:   idx  = 3 * (*r++);
1248:   t[0] = b[idx];
1249:   t[1] = b[1 + idx];
1250:   t[2] = b[2 + idx];
1251:   for (i = 1; i < n; i++) {
1252:     v   = aa + 9 * ai[i];
1253:     vi  = aj + ai[i];
1254:     nz  = diag[i] - ai[i];
1255:     idx = 3 * (*r++);
1256:     s1  = b[idx];
1257:     s2  = b[1 + idx];
1258:     s3  = b[2 + idx];
1259:     while (nz--) {
1260:       idx = 3 * (*vi++);
1261:       x1  = t[idx];
1262:       x2  = t[1 + idx];
1263:       x3  = t[2 + idx];
1264:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1265:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1266:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1267:       v += 9;
1268:     }
1269:     idx        = 3 * i;
1270:     t[idx]     = s1;
1271:     t[1 + idx] = s2;
1272:     t[2 + idx] = s3;
1273:   }
1274:   /* backward solve the upper triangular */
1275:   for (i = n - 1; i >= 0; i--) {
1276:     v   = aa + 9 * diag[i] + 9;
1277:     vi  = aj + diag[i] + 1;
1278:     nz  = ai[i + 1] - diag[i] - 1;
1279:     idt = 3 * i;
1280:     s1  = t[idt];
1281:     s2  = t[1 + idt];
1282:     s3  = t[2 + idt];
1283:     while (nz--) {
1284:       idx = 3 * (*vi++);
1285:       x1  = t[idx];
1286:       x2  = t[1 + idx];
1287:       x3  = t[2 + idx];
1288:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1289:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1290:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1291:       v += 9;
1292:     }
1293:     idc    = 3 * (*c--);
1294:     v      = aa + 9 * diag[i];
1295:     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1296:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1297:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1298:   }
1299:   PetscCall(ISRestoreIndices(isrow, &rout));
1300:   PetscCall(ISRestoreIndices(iscol, &cout));
1301:   PetscCall(VecRestoreArrayRead(bb, &b));
1302:   PetscCall(VecRestoreArray(xx, &x));
1303:   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1304:   PetscFunctionReturn(PETSC_SUCCESS);
1305: }

1307: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
1308: {
1309:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1310:   IS                 iscol = a->col, isrow = a->row;
1311:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1312:   PetscInt           i, nz, idx, idt, idc, m;
1313:   const PetscInt    *r, *c, *rout, *cout;
1314:   const MatScalar   *aa = a->a, *v;
1315:   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1316:   const PetscScalar *b;

1318:   PetscFunctionBegin;
1319:   PetscCall(VecGetArrayRead(bb, &b));
1320:   PetscCall(VecGetArray(xx, &x));
1321:   t = a->solve_work;

1323:   PetscCall(ISGetIndices(isrow, &rout));
1324:   r = rout;
1325:   PetscCall(ISGetIndices(iscol, &cout));
1326:   c = cout;

1328:   /* forward solve the lower triangular */
1329:   idx  = 3 * r[0];
1330:   t[0] = b[idx];
1331:   t[1] = b[1 + idx];
1332:   t[2] = b[2 + idx];
1333:   for (i = 1; i < n; i++) {
1334:     v   = aa + 9 * ai[i];
1335:     vi  = aj + ai[i];
1336:     nz  = ai[i + 1] - ai[i];
1337:     idx = 3 * r[i];
1338:     s1  = b[idx];
1339:     s2  = b[1 + idx];
1340:     s3  = b[2 + idx];
1341:     for (m = 0; m < nz; m++) {
1342:       idx = 3 * vi[m];
1343:       x1  = t[idx];
1344:       x2  = t[1 + idx];
1345:       x3  = t[2 + idx];
1346:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1347:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1348:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1349:       v += 9;
1350:     }
1351:     idx        = 3 * i;
1352:     t[idx]     = s1;
1353:     t[1 + idx] = s2;
1354:     t[2 + idx] = s3;
1355:   }
1356:   /* backward solve the upper triangular */
1357:   for (i = n - 1; i >= 0; i--) {
1358:     v   = aa + 9 * (adiag[i + 1] + 1);
1359:     vi  = aj + adiag[i + 1] + 1;
1360:     nz  = adiag[i] - adiag[i + 1] - 1;
1361:     idt = 3 * i;
1362:     s1  = t[idt];
1363:     s2  = t[1 + idt];
1364:     s3  = t[2 + idt];
1365:     for (m = 0; m < nz; m++) {
1366:       idx = 3 * vi[m];
1367:       x1  = t[idx];
1368:       x2  = t[1 + idx];
1369:       x3  = t[2 + idx];
1370:       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1371:       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1372:       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1373:       v += 9;
1374:     }
1375:     idc    = 3 * c[i];
1376:     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1377:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1378:     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1379:   }
1380:   PetscCall(ISRestoreIndices(isrow, &rout));
1381:   PetscCall(ISRestoreIndices(iscol, &cout));
1382:   PetscCall(VecRestoreArrayRead(bb, &b));
1383:   PetscCall(VecRestoreArray(xx, &x));
1384:   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1385:   PetscFunctionReturn(PETSC_SUCCESS);
1386: }

1388: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1389: {
1390:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1391:   IS                 iscol = a->col, isrow = a->row;
1392:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1393:   PetscInt           i, nz, idx, idt, idc;
1394:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1395:   const MatScalar   *aa = a->a, *v;
1396:   PetscScalar       *x, s1, s2, x1, x2, *t;
1397:   const PetscScalar *b;

1399:   PetscFunctionBegin;
1400:   PetscCall(VecGetArrayRead(bb, &b));
1401:   PetscCall(VecGetArray(xx, &x));
1402:   t = a->solve_work;

1404:   PetscCall(ISGetIndices(isrow, &rout));
1405:   r = rout;
1406:   PetscCall(ISGetIndices(iscol, &cout));
1407:   c = cout + (n - 1);

1409:   /* forward solve the lower triangular */
1410:   idx  = 2 * (*r++);
1411:   t[0] = b[idx];
1412:   t[1] = b[1 + idx];
1413:   for (i = 1; i < n; i++) {
1414:     v   = aa + 4 * ai[i];
1415:     vi  = aj + ai[i];
1416:     nz  = diag[i] - ai[i];
1417:     idx = 2 * (*r++);
1418:     s1  = b[idx];
1419:     s2  = b[1 + idx];
1420:     while (nz--) {
1421:       idx = 2 * (*vi++);
1422:       x1  = t[idx];
1423:       x2  = t[1 + idx];
1424:       s1 -= v[0] * x1 + v[2] * x2;
1425:       s2 -= v[1] * x1 + v[3] * x2;
1426:       v += 4;
1427:     }
1428:     idx        = 2 * i;
1429:     t[idx]     = s1;
1430:     t[1 + idx] = s2;
1431:   }
1432:   /* backward solve the upper triangular */
1433:   for (i = n - 1; i >= 0; i--) {
1434:     v   = aa + 4 * diag[i] + 4;
1435:     vi  = aj + diag[i] + 1;
1436:     nz  = ai[i + 1] - diag[i] - 1;
1437:     idt = 2 * i;
1438:     s1  = t[idt];
1439:     s2  = t[1 + idt];
1440:     while (nz--) {
1441:       idx = 2 * (*vi++);
1442:       x1  = t[idx];
1443:       x2  = t[1 + idx];
1444:       s1 -= v[0] * x1 + v[2] * x2;
1445:       s2 -= v[1] * x1 + v[3] * x2;
1446:       v += 4;
1447:     }
1448:     idc    = 2 * (*c--);
1449:     v      = aa + 4 * diag[i];
1450:     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1451:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1452:   }
1453:   PetscCall(ISRestoreIndices(isrow, &rout));
1454:   PetscCall(ISRestoreIndices(iscol, &cout));
1455:   PetscCall(VecRestoreArrayRead(bb, &b));
1456:   PetscCall(VecRestoreArray(xx, &x));
1457:   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1458:   PetscFunctionReturn(PETSC_SUCCESS);
1459: }

1461: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1462: {
1463:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1464:   IS                 iscol = a->col, isrow = a->row;
1465:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1466:   PetscInt           i, nz, idx, jdx, idt, idc, m;
1467:   const PetscInt    *r, *c, *rout, *cout;
1468:   const MatScalar   *aa = a->a, *v;
1469:   PetscScalar       *x, s1, s2, x1, x2, *t;
1470:   const PetscScalar *b;

1472:   PetscFunctionBegin;
1473:   PetscCall(VecGetArrayRead(bb, &b));
1474:   PetscCall(VecGetArray(xx, &x));
1475:   t = a->solve_work;

1477:   PetscCall(ISGetIndices(isrow, &rout));
1478:   r = rout;
1479:   PetscCall(ISGetIndices(iscol, &cout));
1480:   c = cout;

1482:   /* forward solve the lower triangular */
1483:   idx  = 2 * r[0];
1484:   t[0] = b[idx];
1485:   t[1] = b[1 + idx];
1486:   for (i = 1; i < n; i++) {
1487:     v   = aa + 4 * ai[i];
1488:     vi  = aj + ai[i];
1489:     nz  = ai[i + 1] - ai[i];
1490:     idx = 2 * r[i];
1491:     s1  = b[idx];
1492:     s2  = b[1 + idx];
1493:     for (m = 0; m < nz; m++) {
1494:       jdx = 2 * vi[m];
1495:       x1  = t[jdx];
1496:       x2  = t[1 + jdx];
1497:       s1 -= v[0] * x1 + v[2] * x2;
1498:       s2 -= v[1] * x1 + v[3] * x2;
1499:       v += 4;
1500:     }
1501:     idx        = 2 * i;
1502:     t[idx]     = s1;
1503:     t[1 + idx] = s2;
1504:   }
1505:   /* backward solve the upper triangular */
1506:   for (i = n - 1; i >= 0; i--) {
1507:     v   = aa + 4 * (adiag[i + 1] + 1);
1508:     vi  = aj + adiag[i + 1] + 1;
1509:     nz  = adiag[i] - adiag[i + 1] - 1;
1510:     idt = 2 * i;
1511:     s1  = t[idt];
1512:     s2  = t[1 + idt];
1513:     for (m = 0; m < nz; m++) {
1514:       idx = 2 * vi[m];
1515:       x1  = t[idx];
1516:       x2  = t[1 + idx];
1517:       s1 -= v[0] * x1 + v[2] * x2;
1518:       s2 -= v[1] * x1 + v[3] * x2;
1519:       v += 4;
1520:     }
1521:     idc    = 2 * c[i];
1522:     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1523:     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1524:   }
1525:   PetscCall(ISRestoreIndices(isrow, &rout));
1526:   PetscCall(ISRestoreIndices(iscol, &cout));
1527:   PetscCall(VecRestoreArrayRead(bb, &b));
1528:   PetscCall(VecRestoreArray(xx, &x));
1529:   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1530:   PetscFunctionReturn(PETSC_SUCCESS);
1531: }

1533: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1534: {
1535:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1536:   IS                 iscol = a->col, isrow = a->row;
1537:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1538:   PetscInt           i, nz;
1539:   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1540:   const MatScalar   *aa = a->a, *v;
1541:   PetscScalar       *x, s1, *t;
1542:   const PetscScalar *b;

1544:   PetscFunctionBegin;
1545:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);

1547:   PetscCall(VecGetArrayRead(bb, &b));
1548:   PetscCall(VecGetArray(xx, &x));
1549:   t = a->solve_work;

1551:   PetscCall(ISGetIndices(isrow, &rout));
1552:   r = rout;
1553:   PetscCall(ISGetIndices(iscol, &cout));
1554:   c = cout + (n - 1);

1556:   /* forward solve the lower triangular */
1557:   t[0] = b[*r++];
1558:   for (i = 1; i < n; i++) {
1559:     v  = aa + ai[i];
1560:     vi = aj + ai[i];
1561:     nz = diag[i] - ai[i];
1562:     s1 = b[*r++];
1563:     while (nz--) s1 -= (*v++) * t[*vi++];
1564:     t[i] = s1;
1565:   }
1566:   /* backward solve the upper triangular */
1567:   for (i = n - 1; i >= 0; i--) {
1568:     v  = aa + diag[i] + 1;
1569:     vi = aj + diag[i] + 1;
1570:     nz = ai[i + 1] - diag[i] - 1;
1571:     s1 = t[i];
1572:     while (nz--) s1 -= (*v++) * t[*vi++];
1573:     x[*c--] = t[i] = aa[diag[i]] * s1;
1574:   }

1576:   PetscCall(ISRestoreIndices(isrow, &rout));
1577:   PetscCall(ISRestoreIndices(iscol, &cout));
1578:   PetscCall(VecRestoreArrayRead(bb, &b));
1579:   PetscCall(VecRestoreArray(xx, &x));
1580:   PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
1581:   PetscFunctionReturn(PETSC_SUCCESS);
1582: }

1584: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1585: {
1586:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1587:   IS                 iscol = a->col, isrow = a->row;
1588:   PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1589:   const PetscInt    *rout, *cout, *r, *c;
1590:   PetscScalar       *x, *tmp, sum;
1591:   const PetscScalar *b;
1592:   const MatScalar   *aa = a->a, *v;

1594:   PetscFunctionBegin;
1595:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);

1597:   PetscCall(VecGetArrayRead(bb, &b));
1598:   PetscCall(VecGetArray(xx, &x));
1599:   tmp = a->solve_work;

1601:   PetscCall(ISGetIndices(isrow, &rout));
1602:   r = rout;
1603:   PetscCall(ISGetIndices(iscol, &cout));
1604:   c = cout;

1606:   /* forward solve the lower triangular */
1607:   tmp[0] = b[r[0]];
1608:   v      = aa;
1609:   vi     = aj;
1610:   for (i = 1; i < n; i++) {
1611:     nz  = ai[i + 1] - ai[i];
1612:     sum = b[r[i]];
1613:     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1614:     tmp[i] = sum;
1615:     v += nz;
1616:     vi += nz;
1617:   }

1619:   /* backward solve the upper triangular */
1620:   for (i = n - 1; i >= 0; i--) {
1621:     v   = aa + adiag[i + 1] + 1;
1622:     vi  = aj + adiag[i + 1] + 1;
1623:     nz  = adiag[i] - adiag[i + 1] - 1;
1624:     sum = tmp[i];
1625:     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1626:     x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1627:   }

1629:   PetscCall(ISRestoreIndices(isrow, &rout));
1630:   PetscCall(ISRestoreIndices(iscol, &cout));
1631:   PetscCall(VecRestoreArrayRead(bb, &b));
1632:   PetscCall(VecRestoreArray(xx, &x));
1633:   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1634:   PetscFunctionReturn(PETSC_SUCCESS);
1635: }