Actual source code: baijfact13.c


  2: /*
  3:     Factorization code for BAIJ format.
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <petsc/private/kernels/blockinvert.h>

  8: /*
  9:       Version for when blocks are 3 by 3
 10: */
 11: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info)
 12: {
 13:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
 14:   IS              isrow = b->row, isicol = b->icol;
 15:   const PetscInt *r, *ic;
 16:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
 17:   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
 18:   PetscInt       *diag_offset = b->diag, idx, *pj;
 19:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
 20:   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
 21:   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
 22:   MatScalar      *ba = b->a, *aa = a->a;
 23:   PetscReal       shift = info->shiftamount;
 24:   PetscBool       allowzeropivot, zeropivotdetected;

 26:   PetscFunctionBegin;
 27:   PetscCall(ISGetIndices(isrow, &r));
 28:   PetscCall(ISGetIndices(isicol, &ic));
 29:   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
 30:   allowzeropivot = PetscNot(A->erroriffailure);

 32:   for (i = 0; i < n; i++) {
 33:     nz    = bi[i + 1] - bi[i];
 34:     ajtmp = bj + bi[i];
 35:     for (j = 0; j < nz; j++) {
 36:       x    = rtmp + 9 * ajtmp[j];
 37:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
 38:     }
 39:     /* load in initial (unfactored row) */
 40:     idx      = r[i];
 41:     nz       = ai[idx + 1] - ai[idx];
 42:     ajtmpold = aj + ai[idx];
 43:     v        = aa + 9 * ai[idx];
 44:     for (j = 0; j < nz; j++) {
 45:       x    = rtmp + 9 * ic[ajtmpold[j]];
 46:       x[0] = v[0];
 47:       x[1] = v[1];
 48:       x[2] = v[2];
 49:       x[3] = v[3];
 50:       x[4] = v[4];
 51:       x[5] = v[5];
 52:       x[6] = v[6];
 53:       x[7] = v[7];
 54:       x[8] = v[8];
 55:       v += 9;
 56:     }
 57:     row = *ajtmp++;
 58:     while (row < i) {
 59:       pc = rtmp + 9 * row;
 60:       p1 = pc[0];
 61:       p2 = pc[1];
 62:       p3 = pc[2];
 63:       p4 = pc[3];
 64:       p5 = pc[4];
 65:       p6 = pc[5];
 66:       p7 = pc[6];
 67:       p8 = pc[7];
 68:       p9 = pc[8];
 69:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
 70:         pv    = ba + 9 * diag_offset[row];
 71:         pj    = bj + diag_offset[row] + 1;
 72:         x1    = pv[0];
 73:         x2    = pv[1];
 74:         x3    = pv[2];
 75:         x4    = pv[3];
 76:         x5    = pv[4];
 77:         x6    = pv[5];
 78:         x7    = pv[6];
 79:         x8    = pv[7];
 80:         x9    = pv[8];
 81:         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
 82:         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
 83:         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;

 85:         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
 86:         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
 87:         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;

 89:         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
 90:         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
 91:         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
 92:         nz         = bi[row + 1] - diag_offset[row] - 1;
 93:         pv += 9;
 94:         for (j = 0; j < nz; j++) {
 95:           x1 = pv[0];
 96:           x2 = pv[1];
 97:           x3 = pv[2];
 98:           x4 = pv[3];
 99:           x5 = pv[4];
100:           x6 = pv[5];
101:           x7 = pv[6];
102:           x8 = pv[7];
103:           x9 = pv[8];
104:           x  = rtmp + 9 * pj[j];
105:           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
106:           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
107:           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;

109:           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
110:           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
111:           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;

113:           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
114:           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
115:           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
116:           pv += 9;
117:         }
118:         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
119:       }
120:       row = *ajtmp++;
121:     }
122:     /* finished row so stick it into b->a */
123:     pv = ba + 9 * bi[i];
124:     pj = bj + bi[i];
125:     nz = bi[i + 1] - bi[i];
126:     for (j = 0; j < nz; j++) {
127:       x     = rtmp + 9 * pj[j];
128:       pv[0] = x[0];
129:       pv[1] = x[1];
130:       pv[2] = x[2];
131:       pv[3] = x[3];
132:       pv[4] = x[4];
133:       pv[5] = x[5];
134:       pv[6] = x[6];
135:       pv[7] = x[7];
136:       pv[8] = x[8];
137:       pv += 9;
138:     }
139:     /* invert diagonal block */
140:     w = ba + 9 * diag_offset[i];
141:     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
142:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
143:   }

145:   PetscCall(PetscFree(rtmp));
146:   PetscCall(ISRestoreIndices(isicol, &ic));
147:   PetscCall(ISRestoreIndices(isrow, &r));

149:   C->ops->solve          = MatSolve_SeqBAIJ_3_inplace;
150:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace;
151:   C->assembled           = PETSC_TRUE;

153:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
154:   PetscFunctionReturn(PETSC_SUCCESS);
155: }

157: /* MatLUFactorNumeric_SeqBAIJ_3 -
158:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
159:        PetscKernel_A_gets_A_times_B()
160:        PetscKernel_A_gets_A_minus_B_times_C()
161:        PetscKernel_A_gets_inverse_A()
162: */
163: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info)
164: {
165:   Mat             C = B;
166:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
167:   IS              isrow = b->row, isicol = b->icol;
168:   const PetscInt *r, *ic;
169:   PetscInt        i, j, k, nz, nzL, row;
170:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
171:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
172:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
173:   PetscInt        flg;
174:   PetscReal       shift = info->shiftamount;
175:   PetscBool       allowzeropivot, zeropivotdetected;

177:   PetscFunctionBegin;
178:   PetscCall(ISGetIndices(isrow, &r));
179:   PetscCall(ISGetIndices(isicol, &ic));
180:   allowzeropivot = PetscNot(A->erroriffailure);

182:   /* generate work space needed by the factorization */
183:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
184:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

186:   for (i = 0; i < n; i++) {
187:     /* zero rtmp */
188:     /* L part */
189:     nz    = bi[i + 1] - bi[i];
190:     bjtmp = bj + bi[i];
191:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

193:     /* U part */
194:     nz    = bdiag[i] - bdiag[i + 1];
195:     bjtmp = bj + bdiag[i + 1] + 1;
196:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

198:     /* load in initial (unfactored row) */
199:     nz    = ai[r[i] + 1] - ai[r[i]];
200:     ajtmp = aj + ai[r[i]];
201:     v     = aa + bs2 * ai[r[i]];
202:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));

204:     /* elimination */
205:     bjtmp = bj + bi[i];
206:     nzL   = bi[i + 1] - bi[i];
207:     for (k = 0; k < nzL; k++) {
208:       row = bjtmp[k];
209:       pc  = rtmp + bs2 * row;
210:       for (flg = 0, j = 0; j < bs2; j++) {
211:         if (pc[j] != 0.0) {
212:           flg = 1;
213:           break;
214:         }
215:       }
216:       if (flg) {
217:         pv = b->a + bs2 * bdiag[row];
218:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
219:         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));

221:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
222:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
223:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
224:         for (j = 0; j < nz; j++) {
225:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
226:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
227:           v = rtmp + bs2 * pj[j];
228:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
229:           pv += bs2;
230:         }
231:         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
232:       }
233:     }

235:     /* finished row so stick it into b->a */
236:     /* L part */
237:     pv = b->a + bs2 * bi[i];
238:     pj = b->j + bi[i];
239:     nz = bi[i + 1] - bi[i];
240:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

242:     /* Mark diagonal and invert diagonal for simpler triangular solves */
243:     pv = b->a + bs2 * bdiag[i];
244:     pj = b->j + bdiag[i];
245:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
246:     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
247:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

249:     /* U part */
250:     pj = b->j + bdiag[i + 1] + 1;
251:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
252:     nz = bdiag[i] - bdiag[i + 1] - 1;
253:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
254:   }

256:   PetscCall(PetscFree2(rtmp, mwork));
257:   PetscCall(ISRestoreIndices(isicol, &ic));
258:   PetscCall(ISRestoreIndices(isrow, &r));

260:   C->ops->solve          = MatSolve_SeqBAIJ_3;
261:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
262:   C->assembled           = PETSC_TRUE;

264:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
265:   PetscFunctionReturn(PETSC_SUCCESS);
266: }

268: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
269: {
270:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
271:   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
272:   PetscInt    *ajtmpold, *ajtmp, nz, row;
273:   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
274:   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
275:   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
276:   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
277:   MatScalar   *ba = b->a, *aa = a->a;
278:   PetscReal    shift = info->shiftamount;
279:   PetscBool    allowzeropivot, zeropivotdetected;

281:   PetscFunctionBegin;
282:   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
283:   allowzeropivot = PetscNot(A->erroriffailure);

285:   for (i = 0; i < n; i++) {
286:     nz    = bi[i + 1] - bi[i];
287:     ajtmp = bj + bi[i];
288:     for (j = 0; j < nz; j++) {
289:       x    = rtmp + 9 * ajtmp[j];
290:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
291:     }
292:     /* load in initial (unfactored row) */
293:     nz       = ai[i + 1] - ai[i];
294:     ajtmpold = aj + ai[i];
295:     v        = aa + 9 * ai[i];
296:     for (j = 0; j < nz; j++) {
297:       x    = rtmp + 9 * ajtmpold[j];
298:       x[0] = v[0];
299:       x[1] = v[1];
300:       x[2] = v[2];
301:       x[3] = v[3];
302:       x[4] = v[4];
303:       x[5] = v[5];
304:       x[6] = v[6];
305:       x[7] = v[7];
306:       x[8] = v[8];
307:       v += 9;
308:     }
309:     row = *ajtmp++;
310:     while (row < i) {
311:       pc = rtmp + 9 * row;
312:       p1 = pc[0];
313:       p2 = pc[1];
314:       p3 = pc[2];
315:       p4 = pc[3];
316:       p5 = pc[4];
317:       p6 = pc[5];
318:       p7 = pc[6];
319:       p8 = pc[7];
320:       p9 = pc[8];
321:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
322:         pv    = ba + 9 * diag_offset[row];
323:         pj    = bj + diag_offset[row] + 1;
324:         x1    = pv[0];
325:         x2    = pv[1];
326:         x3    = pv[2];
327:         x4    = pv[3];
328:         x5    = pv[4];
329:         x6    = pv[5];
330:         x7    = pv[6];
331:         x8    = pv[7];
332:         x9    = pv[8];
333:         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
334:         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
335:         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;

337:         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
338:         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
339:         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;

341:         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
342:         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
343:         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;

345:         nz = bi[row + 1] - diag_offset[row] - 1;
346:         pv += 9;
347:         for (j = 0; j < nz; j++) {
348:           x1 = pv[0];
349:           x2 = pv[1];
350:           x3 = pv[2];
351:           x4 = pv[3];
352:           x5 = pv[4];
353:           x6 = pv[5];
354:           x7 = pv[6];
355:           x8 = pv[7];
356:           x9 = pv[8];
357:           x  = rtmp + 9 * pj[j];
358:           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
359:           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
360:           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;

362:           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
363:           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
364:           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;

366:           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
367:           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
368:           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
369:           pv += 9;
370:         }
371:         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
372:       }
373:       row = *ajtmp++;
374:     }
375:     /* finished row so stick it into b->a */
376:     pv = ba + 9 * bi[i];
377:     pj = bj + bi[i];
378:     nz = bi[i + 1] - bi[i];
379:     for (j = 0; j < nz; j++) {
380:       x     = rtmp + 9 * pj[j];
381:       pv[0] = x[0];
382:       pv[1] = x[1];
383:       pv[2] = x[2];
384:       pv[3] = x[3];
385:       pv[4] = x[4];
386:       pv[5] = x[5];
387:       pv[6] = x[6];
388:       pv[7] = x[7];
389:       pv[8] = x[8];
390:       pv += 9;
391:     }
392:     /* invert diagonal block */
393:     w = ba + 9 * diag_offset[i];
394:     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
395:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
396:   }

398:   PetscCall(PetscFree(rtmp));

400:   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace;
401:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace;
402:   C->assembled           = PETSC_TRUE;

404:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
405:   PetscFunctionReturn(PETSC_SUCCESS);
406: }

408: /*
409:   MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering -
410:     copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace()
411: */
412: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
413: {
414:   Mat             C = B;
415:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
416:   PetscInt        i, j, k, nz, nzL, row;
417:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
418:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
419:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
420:   PetscInt        flg;
421:   PetscReal       shift = info->shiftamount;
422:   PetscBool       allowzeropivot, zeropivotdetected;

424:   PetscFunctionBegin;
425:   allowzeropivot = PetscNot(A->erroriffailure);

427:   /* generate work space needed by the factorization */
428:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
429:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

431:   for (i = 0; i < n; i++) {
432:     /* zero rtmp */
433:     /* L part */
434:     nz    = bi[i + 1] - bi[i];
435:     bjtmp = bj + bi[i];
436:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

438:     /* U part */
439:     nz    = bdiag[i] - bdiag[i + 1];
440:     bjtmp = bj + bdiag[i + 1] + 1;
441:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

443:     /* load in initial (unfactored row) */
444:     nz    = ai[i + 1] - ai[i];
445:     ajtmp = aj + ai[i];
446:     v     = aa + bs2 * ai[i];
447:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

449:     /* elimination */
450:     bjtmp = bj + bi[i];
451:     nzL   = bi[i + 1] - bi[i];
452:     for (k = 0; k < nzL; k++) {
453:       row = bjtmp[k];
454:       pc  = rtmp + bs2 * row;
455:       for (flg = 0, j = 0; j < bs2; j++) {
456:         if (pc[j] != 0.0) {
457:           flg = 1;
458:           break;
459:         }
460:       }
461:       if (flg) {
462:         pv = b->a + bs2 * bdiag[row];
463:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
464:         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));

466:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
467:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
468:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
469:         for (j = 0; j < nz; j++) {
470:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
471:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
472:           v = rtmp + bs2 * pj[j];
473:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
474:           pv += bs2;
475:         }
476:         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
477:       }
478:     }

480:     /* finished row so stick it into b->a */
481:     /* L part */
482:     pv = b->a + bs2 * bi[i];
483:     pj = b->j + bi[i];
484:     nz = bi[i + 1] - bi[i];
485:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

487:     /* Mark diagonal and invert diagonal for simpler triangular solves */
488:     pv = b->a + bs2 * bdiag[i];
489:     pj = b->j + bdiag[i];
490:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
491:     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
492:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

494:     /* U part */
495:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
496:     pj = b->j + bdiag[i + 1] + 1;
497:     nz = bdiag[i] - bdiag[i + 1] - 1;
498:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
499:   }
500:   PetscCall(PetscFree2(rtmp, mwork));

502:   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering;
503:   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_3_NaturalOrdering;
504:   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering;
505:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
506:   C->assembled           = PETSC_TRUE;

508:   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
509:   PetscFunctionReturn(PETSC_SUCCESS);
510: }