Actual source code: baijfact.c


  2: /*
  3:     Factorization code for BAIJ format.
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <petsc/private/kernels/blockinvert.h>

  8: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
  9: {
 10:   Mat             C = B;
 11:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
 12:   IS              isrow = b->row, isicol = b->icol;
 13:   const PetscInt *r, *ic;
 14:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
 15:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
 16:   MatScalar      *rtmp, *pc, *mwork, *pv;
 17:   MatScalar      *aa             = a->a, *v;
 18:   PetscReal       shift          = info->shiftamount;
 19:   const PetscBool allowzeropivot = PetscNot(A->erroriffailure);

 21:   PetscFunctionBegin;
 22:   PetscCall(ISGetIndices(isrow, &r));
 23:   PetscCall(ISGetIndices(isicol, &ic));

 25:   /* generate work space needed by the factorization */
 26:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
 27:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

 29:   for (PetscInt i = 0; i < n; i++) {
 30:     /* zero rtmp */
 31:     /* L part */
 32:     PetscInt *pj;
 33:     PetscInt  nzL, nz = bi[i + 1] - bi[i];
 34:     bjtmp = bj + bi[i];
 35:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

 37:     /* U part */
 38:     nz    = bdiag[i] - bdiag[i + 1];
 39:     bjtmp = bj + bdiag[i + 1] + 1;
 40:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

 42:     /* load in initial (unfactored row) */
 43:     nz    = ai[r[i] + 1] - ai[r[i]];
 44:     ajtmp = aj + ai[r[i]];
 45:     v     = aa + bs2 * ai[r[i]];
 46:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));

 48:     /* elimination */
 49:     bjtmp = bj + bi[i];
 50:     nzL   = bi[i + 1] - bi[i];
 51:     for (PetscInt k = 0; k < nzL; k++) {
 52:       PetscBool flg = PETSC_FALSE;
 53:       PetscInt  row = bjtmp[k];

 55:       pc = rtmp + bs2 * row;
 56:       for (PetscInt j = 0; j < bs2; j++) {
 57:         if (pc[j] != (PetscScalar)0.0) {
 58:           flg = PETSC_TRUE;
 59:           break;
 60:         }
 61:       }
 62:       if (flg) {
 63:         pv = b->a + bs2 * bdiag[row];
 64:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
 65:         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));

 67:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
 68:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
 69:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
 70:         for (PetscInt j = 0; j < nz; j++) {
 71:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
 72:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
 73:           v = rtmp + 4 * pj[j];
 74:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
 75:           pv += 4;
 76:         }
 77:         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
 78:       }
 79:     }

 81:     /* finished row so stick it into b->a */
 82:     /* L part */
 83:     pv = b->a + bs2 * bi[i];
 84:     pj = b->j + bi[i];
 85:     nz = bi[i + 1] - bi[i];
 86:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

 88:     /* Mark diagonal and invert diagonal for simpler triangular solves */
 89:     pv = b->a + bs2 * bdiag[i];
 90:     pj = b->j + bdiag[i];
 91:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
 92:     {
 93:       PetscBool zeropivotdetected;

 95:       PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
 96:       if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
 97:     }

 99:     /* U part */
100:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
101:     pj = b->j + bdiag[i + 1] + 1;
102:     nz = bdiag[i] - bdiag[i + 1] - 1;
103:     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
104:   }

106:   PetscCall(PetscFree2(rtmp, mwork));
107:   PetscCall(ISRestoreIndices(isicol, &ic));
108:   PetscCall(ISRestoreIndices(isrow, &r));

110:   C->ops->solve          = MatSolve_SeqBAIJ_2;
111:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
112:   C->assembled           = PETSC_TRUE;

114:   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
115:   PetscFunctionReturn(PETSC_SUCCESS);
116: }

118: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
119: {
120:   Mat             C = B;
121:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
122:   PetscInt        i, j, k, nz, nzL, row, *pj;
123:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
124:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
125:   MatScalar      *rtmp, *pc, *mwork, *pv;
126:   MatScalar      *aa = a->a, *v;
127:   PetscInt        flg;
128:   PetscReal       shift = info->shiftamount;
129:   PetscBool       allowzeropivot, zeropivotdetected;

131:   PetscFunctionBegin;
132:   allowzeropivot = PetscNot(A->erroriffailure);

134:   /* generate work space needed by the factorization */
135:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
136:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

138:   for (i = 0; i < n; i++) {
139:     /* zero rtmp */
140:     /* L part */
141:     nz    = bi[i + 1] - bi[i];
142:     bjtmp = bj + bi[i];
143:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

145:     /* U part */
146:     nz    = bdiag[i] - bdiag[i + 1];
147:     bjtmp = bj + bdiag[i + 1] + 1;
148:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

150:     /* load in initial (unfactored row) */
151:     nz    = ai[i + 1] - ai[i];
152:     ajtmp = aj + ai[i];
153:     v     = aa + bs2 * ai[i];
154:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

156:     /* elimination */
157:     bjtmp = bj + bi[i];
158:     nzL   = bi[i + 1] - bi[i];
159:     for (k = 0; k < nzL; k++) {
160:       row = bjtmp[k];
161:       pc  = rtmp + bs2 * row;
162:       for (flg = 0, j = 0; j < bs2; j++) {
163:         if (pc[j] != (PetscScalar)0.0) {
164:           flg = 1;
165:           break;
166:         }
167:       }
168:       if (flg) {
169:         pv = b->a + bs2 * bdiag[row];
170:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
171:         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));

173:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
174:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
175:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
176:         for (j = 0; j < nz; j++) {
177:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
178:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
179:           v = rtmp + 4 * pj[j];
180:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
181:           pv += 4;
182:         }
183:         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
184:       }
185:     }

187:     /* finished row so stick it into b->a */
188:     /* L part */
189:     pv = b->a + bs2 * bi[i];
190:     pj = b->j + bi[i];
191:     nz = bi[i + 1] - bi[i];
192:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

194:     /* Mark diagonal and invert diagonal for simpler triangular solves */
195:     pv = b->a + bs2 * bdiag[i];
196:     pj = b->j + bdiag[i];
197:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
198:     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
199:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

201:     /* U part */
202:     /*
203:     pv = b->a + bs2*bi[2*n-i];
204:     pj = b->j + bi[2*n-i];
205:     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
206:     */
207:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
208:     pj = b->j + bdiag[i + 1] + 1;
209:     nz = bdiag[i] - bdiag[i + 1] - 1;
210:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
211:   }
212:   PetscCall(PetscFree2(rtmp, mwork));

214:   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
215:   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
216:   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
217:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
218:   C->assembled           = PETSC_TRUE;

220:   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
221:   PetscFunctionReturn(PETSC_SUCCESS);
222: }

224: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
225: {
226:   Mat             C = B;
227:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
228:   IS              isrow = b->row, isicol = b->icol;
229:   const PetscInt *r, *ic;
230:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
231:   PetscInt       *ajtmpold, *ajtmp, nz, row;
232:   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
233:   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
234:   MatScalar       p1, p2, p3, p4;
235:   MatScalar      *ba = b->a, *aa = a->a;
236:   PetscReal       shift = info->shiftamount;
237:   PetscBool       allowzeropivot, zeropivotdetected;

239:   PetscFunctionBegin;
240:   allowzeropivot = PetscNot(A->erroriffailure);
241:   PetscCall(ISGetIndices(isrow, &r));
242:   PetscCall(ISGetIndices(isicol, &ic));
243:   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));

245:   for (i = 0; i < n; i++) {
246:     nz    = bi[i + 1] - bi[i];
247:     ajtmp = bj + bi[i];
248:     for (j = 0; j < nz; j++) {
249:       x    = rtmp + 4 * ajtmp[j];
250:       x[0] = x[1] = x[2] = x[3] = 0.0;
251:     }
252:     /* load in initial (unfactored row) */
253:     idx      = r[i];
254:     nz       = ai[idx + 1] - ai[idx];
255:     ajtmpold = aj + ai[idx];
256:     v        = aa + 4 * ai[idx];
257:     for (j = 0; j < nz; j++) {
258:       x    = rtmp + 4 * ic[ajtmpold[j]];
259:       x[0] = v[0];
260:       x[1] = v[1];
261:       x[2] = v[2];
262:       x[3] = v[3];
263:       v += 4;
264:     }
265:     row = *ajtmp++;
266:     while (row < i) {
267:       pc = rtmp + 4 * row;
268:       p1 = pc[0];
269:       p2 = pc[1];
270:       p3 = pc[2];
271:       p4 = pc[3];
272:       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
273:         pv    = ba + 4 * diag_offset[row];
274:         pj    = bj + diag_offset[row] + 1;
275:         x1    = pv[0];
276:         x2    = pv[1];
277:         x3    = pv[2];
278:         x4    = pv[3];
279:         pc[0] = m1 = p1 * x1 + p3 * x2;
280:         pc[1] = m2 = p2 * x1 + p4 * x2;
281:         pc[2] = m3 = p1 * x3 + p3 * x4;
282:         pc[3] = m4 = p2 * x3 + p4 * x4;
283:         nz         = bi[row + 1] - diag_offset[row] - 1;
284:         pv += 4;
285:         for (j = 0; j < nz; j++) {
286:           x1 = pv[0];
287:           x2 = pv[1];
288:           x3 = pv[2];
289:           x4 = pv[3];
290:           x  = rtmp + 4 * pj[j];
291:           x[0] -= m1 * x1 + m3 * x2;
292:           x[1] -= m2 * x1 + m4 * x2;
293:           x[2] -= m1 * x3 + m3 * x4;
294:           x[3] -= m2 * x3 + m4 * x4;
295:           pv += 4;
296:         }
297:         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
298:       }
299:       row = *ajtmp++;
300:     }
301:     /* finished row so stick it into b->a */
302:     pv = ba + 4 * bi[i];
303:     pj = bj + bi[i];
304:     nz = bi[i + 1] - bi[i];
305:     for (j = 0; j < nz; j++) {
306:       x     = rtmp + 4 * pj[j];
307:       pv[0] = x[0];
308:       pv[1] = x[1];
309:       pv[2] = x[2];
310:       pv[3] = x[3];
311:       pv += 4;
312:     }
313:     /* invert diagonal block */
314:     w = ba + 4 * diag_offset[i];
315:     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
316:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
317:   }

319:   PetscCall(PetscFree(rtmp));
320:   PetscCall(ISRestoreIndices(isicol, &ic));
321:   PetscCall(ISRestoreIndices(isrow, &r));

323:   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
324:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
325:   C->assembled           = PETSC_TRUE;

327:   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
328:   PetscFunctionReturn(PETSC_SUCCESS);
329: }
330: /*
331:       Version for when blocks are 2 by 2 Using natural ordering
332: */
333: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
334: {
335:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
336:   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
337:   PetscInt    *ajtmpold, *ajtmp, nz, row;
338:   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
339:   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
340:   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
341:   MatScalar   *ba = b->a, *aa = a->a;
342:   PetscReal    shift = info->shiftamount;
343:   PetscBool    allowzeropivot, zeropivotdetected;

345:   PetscFunctionBegin;
346:   allowzeropivot = PetscNot(A->erroriffailure);
347:   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
348:   for (i = 0; i < n; i++) {
349:     nz    = bi[i + 1] - bi[i];
350:     ajtmp = bj + bi[i];
351:     for (j = 0; j < nz; j++) {
352:       x    = rtmp + 4 * ajtmp[j];
353:       x[0] = x[1] = x[2] = x[3] = 0.0;
354:     }
355:     /* load in initial (unfactored row) */
356:     nz       = ai[i + 1] - ai[i];
357:     ajtmpold = aj + ai[i];
358:     v        = aa + 4 * ai[i];
359:     for (j = 0; j < nz; j++) {
360:       x    = rtmp + 4 * ajtmpold[j];
361:       x[0] = v[0];
362:       x[1] = v[1];
363:       x[2] = v[2];
364:       x[3] = v[3];
365:       v += 4;
366:     }
367:     row = *ajtmp++;
368:     while (row < i) {
369:       pc = rtmp + 4 * row;
370:       p1 = pc[0];
371:       p2 = pc[1];
372:       p3 = pc[2];
373:       p4 = pc[3];
374:       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
375:         pv    = ba + 4 * diag_offset[row];
376:         pj    = bj + diag_offset[row] + 1;
377:         x1    = pv[0];
378:         x2    = pv[1];
379:         x3    = pv[2];
380:         x4    = pv[3];
381:         pc[0] = m1 = p1 * x1 + p3 * x2;
382:         pc[1] = m2 = p2 * x1 + p4 * x2;
383:         pc[2] = m3 = p1 * x3 + p3 * x4;
384:         pc[3] = m4 = p2 * x3 + p4 * x4;
385:         nz         = bi[row + 1] - diag_offset[row] - 1;
386:         pv += 4;
387:         for (j = 0; j < nz; j++) {
388:           x1 = pv[0];
389:           x2 = pv[1];
390:           x3 = pv[2];
391:           x4 = pv[3];
392:           x  = rtmp + 4 * pj[j];
393:           x[0] -= m1 * x1 + m3 * x2;
394:           x[1] -= m2 * x1 + m4 * x2;
395:           x[2] -= m1 * x3 + m3 * x4;
396:           x[3] -= m2 * x3 + m4 * x4;
397:           pv += 4;
398:         }
399:         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
400:       }
401:       row = *ajtmp++;
402:     }
403:     /* finished row so stick it into b->a */
404:     pv = ba + 4 * bi[i];
405:     pj = bj + bi[i];
406:     nz = bi[i + 1] - bi[i];
407:     for (j = 0; j < nz; j++) {
408:       x     = rtmp + 4 * pj[j];
409:       pv[0] = x[0];
410:       pv[1] = x[1];
411:       pv[2] = x[2];
412:       pv[3] = x[3];
413:       /*
414:       printf(" col %d:",pj[j]);
415:       PetscInt j1;
416:       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
417:       printf("\n");
418:       */
419:       pv += 4;
420:     }
421:     /* invert diagonal block */
422:     w = ba + 4 * diag_offset[i];
423:     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
424:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
425:   }

427:   PetscCall(PetscFree(rtmp));

429:   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
430:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
431:   C->assembled           = PETSC_TRUE;

433:   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
434:   PetscFunctionReturn(PETSC_SUCCESS);
435: }

437: /*
438:      Version for when blocks are 1 by 1.
439: */
440: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
441: {
442:   Mat              C = B;
443:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
444:   IS               isrow = b->row, isicol = b->icol;
445:   const PetscInt  *r, *ic, *ics;
446:   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
447:   PetscInt         i, j, k, nz, nzL, row, *pj;
448:   const PetscInt  *ajtmp, *bjtmp;
449:   MatScalar       *rtmp, *pc, multiplier, *pv;
450:   const MatScalar *aa = a->a, *v;
451:   PetscBool        row_identity, col_identity;
452:   FactorShiftCtx   sctx;
453:   const PetscInt  *ddiag;
454:   PetscReal        rs;
455:   MatScalar        d;

457:   PetscFunctionBegin;
458:   /* MatPivotSetUp(): initialize shift context sctx */
459:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

461:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
462:     ddiag          = a->diag;
463:     sctx.shift_top = info->zeropivot;
464:     for (i = 0; i < n; i++) {
465:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
466:       d  = (aa)[ddiag[i]];
467:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
468:       v  = aa + ai[i];
469:       nz = ai[i + 1] - ai[i];
470:       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
471:       if (rs > sctx.shift_top) sctx.shift_top = rs;
472:     }
473:     sctx.shift_top *= 1.1;
474:     sctx.nshift_max = 5;
475:     sctx.shift_lo   = 0.;
476:     sctx.shift_hi   = 1.;
477:   }

479:   PetscCall(ISGetIndices(isrow, &r));
480:   PetscCall(ISGetIndices(isicol, &ic));
481:   PetscCall(PetscMalloc1(n + 1, &rtmp));
482:   ics = ic;

484:   do {
485:     sctx.newshift = PETSC_FALSE;
486:     for (i = 0; i < n; i++) {
487:       /* zero rtmp */
488:       /* L part */
489:       nz    = bi[i + 1] - bi[i];
490:       bjtmp = bj + bi[i];
491:       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

493:       /* U part */
494:       nz    = bdiag[i] - bdiag[i + 1];
495:       bjtmp = bj + bdiag[i + 1] + 1;
496:       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;

498:       /* load in initial (unfactored row) */
499:       nz    = ai[r[i] + 1] - ai[r[i]];
500:       ajtmp = aj + ai[r[i]];
501:       v     = aa + ai[r[i]];
502:       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];

504:       /* ZeropivotApply() */
505:       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */

507:       /* elimination */
508:       bjtmp = bj + bi[i];
509:       row   = *bjtmp++;
510:       nzL   = bi[i + 1] - bi[i];
511:       for (k = 0; k < nzL; k++) {
512:         pc = rtmp + row;
513:         if (*pc != (PetscScalar)0.0) {
514:           pv         = b->a + bdiag[row];
515:           multiplier = *pc * (*pv);
516:           *pc        = multiplier;

518:           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
519:           pv = b->a + bdiag[row + 1] + 1;
520:           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
521:           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
522:           PetscCall(PetscLogFlops(2.0 * nz));
523:         }
524:         row = *bjtmp++;
525:       }

527:       /* finished row so stick it into b->a */
528:       rs = 0.0;
529:       /* L part */
530:       pv = b->a + bi[i];
531:       pj = b->j + bi[i];
532:       nz = bi[i + 1] - bi[i];
533:       for (j = 0; j < nz; j++) {
534:         pv[j] = rtmp[pj[j]];
535:         rs += PetscAbsScalar(pv[j]);
536:       }

538:       /* U part */
539:       pv = b->a + bdiag[i + 1] + 1;
540:       pj = b->j + bdiag[i + 1] + 1;
541:       nz = bdiag[i] - bdiag[i + 1] - 1;
542:       for (j = 0; j < nz; j++) {
543:         pv[j] = rtmp[pj[j]];
544:         rs += PetscAbsScalar(pv[j]);
545:       }

547:       sctx.rs = rs;
548:       sctx.pv = rtmp[i];
549:       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
550:       if (sctx.newshift) break; /* break for-loop */
551:       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */

553:       /* Mark diagonal and invert diagonal for simpler triangular solves */
554:       pv  = b->a + bdiag[i];
555:       *pv = (PetscScalar)1.0 / rtmp[i];

557:     } /* endof for (i=0; i<n; i++) { */

559:     /* MatPivotRefine() */
560:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
561:       /*
562:        * if no shift in this attempt & shifting & started shifting & can refine,
563:        * then try lower shift
564:        */
565:       sctx.shift_hi       = sctx.shift_fraction;
566:       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
567:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
568:       sctx.newshift       = PETSC_TRUE;
569:       sctx.nshift++;
570:     }
571:   } while (sctx.newshift);

573:   PetscCall(PetscFree(rtmp));
574:   PetscCall(ISRestoreIndices(isicol, &ic));
575:   PetscCall(ISRestoreIndices(isrow, &r));

577:   PetscCall(ISIdentity(isrow, &row_identity));
578:   PetscCall(ISIdentity(isicol, &col_identity));
579:   if (row_identity && col_identity) {
580:     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
581:     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
582:     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
583:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
584:   } else {
585:     C->ops->solve          = MatSolve_SeqBAIJ_1;
586:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
587:   }
588:   C->assembled = PETSC_TRUE;
589:   PetscCall(PetscLogFlops(C->cmap->n));

591:   /* MatShiftView(A,info,&sctx) */
592:   if (sctx.nshift) {
593:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
594:       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
595:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
596:       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
597:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
598:       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
599:     }
600:   }
601:   PetscFunctionReturn(PETSC_SUCCESS);
602: }

604: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
605: {
606:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
607:   IS              isrow = b->row, isicol = b->icol;
608:   const PetscInt *r, *ic;
609:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
610:   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
611:   PetscInt       *diag_offset = b->diag, diag, *pj;
612:   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
613:   MatScalar      *ba = b->a, *aa = a->a;
614:   PetscBool       row_identity, col_identity;

616:   PetscFunctionBegin;
617:   PetscCall(ISGetIndices(isrow, &r));
618:   PetscCall(ISGetIndices(isicol, &ic));
619:   PetscCall(PetscMalloc1(n + 1, &rtmp));

621:   for (i = 0; i < n; i++) {
622:     nz    = bi[i + 1] - bi[i];
623:     ajtmp = bj + bi[i];
624:     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;

626:     /* load in initial (unfactored row) */
627:     nz       = ai[r[i] + 1] - ai[r[i]];
628:     ajtmpold = aj + ai[r[i]];
629:     v        = aa + ai[r[i]];
630:     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];

632:     row = *ajtmp++;
633:     while (row < i) {
634:       pc = rtmp + row;
635:       if (*pc != 0.0) {
636:         pv         = ba + diag_offset[row];
637:         pj         = bj + diag_offset[row] + 1;
638:         multiplier = *pc * *pv++;
639:         *pc        = multiplier;
640:         nz         = bi[row + 1] - diag_offset[row] - 1;
641:         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
642:         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
643:       }
644:       row = *ajtmp++;
645:     }
646:     /* finished row so stick it into b->a */
647:     pv = ba + bi[i];
648:     pj = bj + bi[i];
649:     nz = bi[i + 1] - bi[i];
650:     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
651:     diag = diag_offset[i] - bi[i];
652:     /* check pivot entry for current row */
653:     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
654:     pv[diag] = 1.0 / pv[diag];
655:   }

657:   PetscCall(PetscFree(rtmp));
658:   PetscCall(ISRestoreIndices(isicol, &ic));
659:   PetscCall(ISRestoreIndices(isrow, &r));
660:   PetscCall(ISIdentity(isrow, &row_identity));
661:   PetscCall(ISIdentity(isicol, &col_identity));
662:   if (row_identity && col_identity) {
663:     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
664:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
665:   } else {
666:     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
667:     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
668:   }
669:   C->assembled = PETSC_TRUE;
670:   PetscCall(PetscLogFlops(C->cmap->n));
671:   PetscFunctionReturn(PETSC_SUCCESS);
672: }

674: static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
675: {
676:   PetscFunctionBegin;
677:   *type = MATSOLVERPETSC;
678:   PetscFunctionReturn(PETSC_SUCCESS);
679: }

681: PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
682: {
683:   PetscInt n = A->rmap->n;

685:   PetscFunctionBegin;
686: #if defined(PETSC_USE_COMPLEX)
687:   PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
688: #endif
689:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
690:   PetscCall(MatSetSizes(*B, n, n, n, n));
691:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
692:     PetscCall(MatSetType(*B, MATSEQBAIJ));

694:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
695:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
696:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
697:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
698:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
699:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
700:     PetscCall(MatSetType(*B, MATSEQSBAIJ));
701:     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));

703:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
704:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
705:     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
706:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
707:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
708:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
709:   (*B)->factortype     = ftype;
710:   (*B)->canuseordering = PETSC_TRUE;

712:   PetscCall(PetscFree((*B)->solvertype));
713:   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
714:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
715:   PetscFunctionReturn(PETSC_SUCCESS);
716: }

718: PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
719: {
720:   Mat C;

722:   PetscFunctionBegin;
723:   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
724:   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
725:   PetscCall(MatLUFactorNumeric(C, A, info));

727:   A->ops->solve          = C->ops->solve;
728:   A->ops->solvetranspose = C->ops->solvetranspose;

730:   PetscCall(MatHeaderMerge(A, &C));
731:   PetscFunctionReturn(PETSC_SUCCESS);
732: }

734: #include <../src/mat/impls/sbaij/seq/sbaij.h>
735: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
736: {
737:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
738:   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
739:   IS              ip = b->row;
740:   const PetscInt *rip;
741:   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
742:   PetscInt       *ai = a->i, *aj = a->j;
743:   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
744:   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
745:   PetscReal       rs;
746:   FactorShiftCtx  sctx;

748:   PetscFunctionBegin;
749:   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
750:     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
751:     PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info));
752:     PetscCall(MatDestroy(&a->sbaijMat));
753:     PetscFunctionReturn(PETSC_SUCCESS);
754:   }

756:   /* MatPivotSetUp(): initialize shift context sctx */
757:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

759:   PetscCall(ISGetIndices(ip, &rip));
760:   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));

762:   sctx.shift_amount = 0.;
763:   sctx.nshift       = 0;
764:   do {
765:     sctx.newshift = PETSC_FALSE;
766:     for (i = 0; i < mbs; i++) {
767:       rtmp[i] = 0.0;
768:       jl[i]   = mbs;
769:       il[0]   = 0;
770:     }

772:     for (k = 0; k < mbs; k++) {
773:       bval = ba + bi[k];
774:       /* initialize k-th row by the perm[k]-th row of A */
775:       jmin = ai[rip[k]];
776:       jmax = ai[rip[k] + 1];
777:       for (j = jmin; j < jmax; j++) {
778:         col = rip[aj[j]];
779:         if (col >= k) { /* only take upper triangular entry */
780:           rtmp[col] = aa[j];
781:           *bval++   = 0.0; /* for in-place factorization */
782:         }
783:       }

785:       /* shift the diagonal of the matrix */
786:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;

788:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
789:       dk = rtmp[k];
790:       i  = jl[k]; /* first row to be added to k_th row  */

792:       while (i < k) {
793:         nexti = jl[i]; /* next row to be added to k_th row */

795:         /* compute multiplier, update diag(k) and U(i,k) */
796:         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
797:         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
798:         dk += uikdi * ba[ili];
799:         ba[ili] = uikdi; /* -U(i,k) */

801:         /* add multiple of row i to k-th row */
802:         jmin = ili + 1;
803:         jmax = bi[i + 1];
804:         if (jmin < jmax) {
805:           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
806:           /* update il and jl for row i */
807:           il[i] = jmin;
808:           j     = bj[jmin];
809:           jl[i] = jl[j];
810:           jl[j] = i;
811:         }
812:         i = nexti;
813:       }

815:       /* shift the diagonals when zero pivot is detected */
816:       /* compute rs=sum of abs(off-diagonal) */
817:       rs   = 0.0;
818:       jmin = bi[k] + 1;
819:       nz   = bi[k + 1] - jmin;
820:       if (nz) {
821:         bcol = bj + jmin;
822:         while (nz--) {
823:           rs += PetscAbsScalar(rtmp[*bcol]);
824:           bcol++;
825:         }
826:       }

828:       sctx.rs = rs;
829:       sctx.pv = dk;
830:       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
831:       if (sctx.newshift) break;
832:       dk = sctx.pv;

834:       /* copy data into U(k,:) */
835:       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
836:       jmin      = bi[k] + 1;
837:       jmax      = bi[k + 1];
838:       if (jmin < jmax) {
839:         for (j = jmin; j < jmax; j++) {
840:           col       = bj[j];
841:           ba[j]     = rtmp[col];
842:           rtmp[col] = 0.0;
843:         }
844:         /* add the k-th row into il and jl */
845:         il[k] = jmin;
846:         i     = bj[jmin];
847:         jl[k] = jl[i];
848:         jl[i] = k;
849:       }
850:     }
851:   } while (sctx.newshift);
852:   PetscCall(PetscFree3(rtmp, il, jl));

854:   PetscCall(ISRestoreIndices(ip, &rip));

856:   C->assembled    = PETSC_TRUE;
857:   C->preallocated = PETSC_TRUE;

859:   PetscCall(PetscLogFlops(C->rmap->N));
860:   if (sctx.nshift) {
861:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
862:       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
863:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
864:       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
865:     }
866:   }
867:   PetscFunctionReturn(PETSC_SUCCESS);
868: }

870: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
871: {
872:   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
873:   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
874:   PetscInt       i, j, am = a->mbs;
875:   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
876:   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
877:   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
878:   PetscReal      rs;
879:   FactorShiftCtx sctx;

881:   PetscFunctionBegin;
882:   /* MatPivotSetUp(): initialize shift context sctx */
883:   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));

885:   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));

887:   do {
888:     sctx.newshift = PETSC_FALSE;
889:     for (i = 0; i < am; i++) {
890:       rtmp[i] = 0.0;
891:       jl[i]   = am;
892:       il[0]   = 0;
893:     }

895:     for (k = 0; k < am; k++) {
896:       /* initialize k-th row with elements nonzero in row perm(k) of A */
897:       nz   = ai[k + 1] - ai[k];
898:       acol = aj + ai[k];
899:       aval = aa + ai[k];
900:       bval = ba + bi[k];
901:       while (nz--) {
902:         if (*acol < k) { /* skip lower triangular entries */
903:           acol++;
904:           aval++;
905:         } else {
906:           rtmp[*acol++] = *aval++;
907:           *bval++       = 0.0; /* for in-place factorization */
908:         }
909:       }

911:       /* shift the diagonal of the matrix */
912:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;

914:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
915:       dk = rtmp[k];
916:       i  = jl[k]; /* first row to be added to k_th row  */

918:       while (i < k) {
919:         nexti = jl[i]; /* next row to be added to k_th row */
920:         /* compute multiplier, update D(k) and U(i,k) */
921:         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
922:         uikdi = -ba[ili] * ba[bi[i]];
923:         dk += uikdi * ba[ili];
924:         ba[ili] = uikdi; /* -U(i,k) */

926:         /* add multiple of row i to k-th row ... */
927:         jmin = ili + 1;
928:         nz   = bi[i + 1] - jmin;
929:         if (nz > 0) {
930:           bcol = bj + jmin;
931:           bval = ba + jmin;
932:           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
933:           /* update il and jl for i-th row */
934:           il[i] = jmin;
935:           j     = bj[jmin];
936:           jl[i] = jl[j];
937:           jl[j] = i;
938:         }
939:         i = nexti;
940:       }

942:       /* shift the diagonals when zero pivot is detected */
943:       /* compute rs=sum of abs(off-diagonal) */
944:       rs   = 0.0;
945:       jmin = bi[k] + 1;
946:       nz   = bi[k + 1] - jmin;
947:       if (nz) {
948:         bcol = bj + jmin;
949:         while (nz--) {
950:           rs += PetscAbsScalar(rtmp[*bcol]);
951:           bcol++;
952:         }
953:       }

955:       sctx.rs = rs;
956:       sctx.pv = dk;
957:       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
958:       if (sctx.newshift) break; /* sctx.shift_amount is updated */
959:       dk = sctx.pv;

961:       /* copy data into U(k,:) */
962:       ba[bi[k]] = 1.0 / dk;
963:       jmin      = bi[k] + 1;
964:       nz        = bi[k + 1] - jmin;
965:       if (nz) {
966:         bcol = bj + jmin;
967:         bval = ba + jmin;
968:         while (nz--) {
969:           *bval++       = rtmp[*bcol];
970:           rtmp[*bcol++] = 0.0;
971:         }
972:         /* add k-th row into il and jl */
973:         il[k] = jmin;
974:         i     = bj[jmin];
975:         jl[k] = jl[i];
976:         jl[i] = k;
977:       }
978:     }
979:   } while (sctx.newshift);
980:   PetscCall(PetscFree3(rtmp, il, jl));

982:   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
983:   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
984:   C->assembled           = PETSC_TRUE;
985:   C->preallocated        = PETSC_TRUE;

987:   PetscCall(PetscLogFlops(C->rmap->N));
988:   if (sctx.nshift) {
989:     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
990:       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
991:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
992:       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
993:     }
994:   }
995:   PetscFunctionReturn(PETSC_SUCCESS);
996: }

998: #include <petscbt.h>
999: #include <../src/mat/utils/freespace.h>
1000: PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1001: {
1002:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1003:   Mat_SeqSBAIJ      *b;
1004:   Mat                B;
1005:   PetscBool          perm_identity, missing;
1006:   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1007:   const PetscInt    *rip;
1008:   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1009:   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1010:   PetscReal          fill = info->fill, levels = info->levels;
1011:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1012:   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1013:   PetscBT            lnkbt;

1015:   PetscFunctionBegin;
1016:   PetscCall(MatMissingDiagonal(A, &missing, &i));
1017:   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);

1019:   if (bs > 1) {
1020:     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1021:     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */

1023:     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1024:     PetscFunctionReturn(PETSC_SUCCESS);
1025:   }

1027:   PetscCall(ISIdentity(perm, &perm_identity));
1028:   PetscCall(ISGetIndices(perm, &rip));

1030:   /* special case that simply copies fill pattern */
1031:   if (!levels && perm_identity) {
1032:     PetscCall(PetscMalloc1(am + 1, &ui));
1033:     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1034:     B = fact;
1035:     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));

1037:     b  = (Mat_SeqSBAIJ *)B->data;
1038:     uj = b->j;
1039:     for (i = 0; i < am; i++) {
1040:       aj = a->j + a->diag[i];
1041:       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1042:       b->ilen[i] = ui[i];
1043:     }
1044:     PetscCall(PetscFree(ui));

1046:     B->factortype = MAT_FACTOR_NONE;

1048:     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1049:     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1050:     B->factortype = MAT_FACTOR_ICC;

1052:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1053:     PetscFunctionReturn(PETSC_SUCCESS);
1054:   }

1056:   /* initialization */
1057:   PetscCall(PetscMalloc1(am + 1, &ui));
1058:   ui[0] = 0;
1059:   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));

1061:   /* jl: linked list for storing indices of the pivot rows
1062:      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1063:   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1064:   for (i = 0; i < am; i++) {
1065:     jl[i] = am;
1066:     il[i] = 0;
1067:   }

1069:   /* create and initialize a linked list for storing column indices of the active row k */
1070:   nlnk = am + 1;
1071:   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));

1073:   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1074:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));

1076:   current_space = free_space;

1078:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1079:   current_space_lvl = free_space_lvl;

1081:   for (k = 0; k < am; k++) { /* for each active row k */
1082:     /* initialize lnk by the column indices of row rip[k] of A */
1083:     nzk         = 0;
1084:     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1085:     ncols_upper = 0;
1086:     cols        = cols_lvl + am;
1087:     for (j = 0; j < ncols; j++) {
1088:       i = rip[*(aj + ai[rip[k]] + j)];
1089:       if (i >= k) { /* only take upper triangular entry */
1090:         cols[ncols_upper]     = i;
1091:         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1092:         ncols_upper++;
1093:       }
1094:     }
1095:     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1096:     nzk += nlnk;

1098:     /* update lnk by computing fill-in for each pivot row to be merged in */
1099:     prow = jl[k]; /* 1st pivot row */

1101:     while (prow < k) {
1102:       nextprow = jl[prow];

1104:       /* merge prow into k-th row */
1105:       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1106:       jmax  = ui[prow + 1];
1107:       ncols = jmax - jmin;
1108:       i     = jmin - ui[prow];
1109:       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1110:       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1111:       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1112:       nzk += nlnk;

1114:       /* update il and jl for prow */
1115:       if (jmin < jmax) {
1116:         il[prow] = jmin;

1118:         j        = *cols;
1119:         jl[prow] = jl[j];
1120:         jl[j]    = prow;
1121:       }
1122:       prow = nextprow;
1123:     }

1125:     /* if free space is not available, make more free space */
1126:     if (current_space->local_remaining < nzk) {
1127:       i = am - k + 1;                                                             /* num of unfactored rows */
1128:       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1129:       PetscCall(PetscFreeSpaceGet(i, &current_space));
1130:       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1131:       reallocs++;
1132:     }

1134:     /* copy data into free_space and free_space_lvl, then initialize lnk */
1135:     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));

1137:     /* add the k-th row into il and jl */
1138:     if (nzk - 1 > 0) {
1139:       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1140:       jl[k] = jl[i];
1141:       jl[i] = k;
1142:       il[k] = ui[k] + 1;
1143:     }
1144:     uj_ptr[k]     = current_space->array;
1145:     uj_lvl_ptr[k] = current_space_lvl->array;

1147:     current_space->array += nzk;
1148:     current_space->local_used += nzk;
1149:     current_space->local_remaining -= nzk;

1151:     current_space_lvl->array += nzk;
1152:     current_space_lvl->local_used += nzk;
1153:     current_space_lvl->local_remaining -= nzk;

1155:     ui[k + 1] = ui[k] + nzk;
1156:   }

1158:   PetscCall(ISRestoreIndices(perm, &rip));
1159:   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1160:   PetscCall(PetscFree(cols_lvl));

1162:   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1163:   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1164:   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1165:   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1166:   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));

1168:   /* put together the new matrix in MATSEQSBAIJ format */
1169:   B = fact;
1170:   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));

1172:   b               = (Mat_SeqSBAIJ *)B->data;
1173:   b->singlemalloc = PETSC_FALSE;
1174:   b->free_a       = PETSC_TRUE;
1175:   b->free_ij      = PETSC_TRUE;

1177:   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));

1179:   b->j             = uj;
1180:   b->i             = ui;
1181:   b->diag          = NULL;
1182:   b->ilen          = NULL;
1183:   b->imax          = NULL;
1184:   b->row           = perm;
1185:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

1187:   PetscCall(PetscObjectReference((PetscObject)perm));

1189:   b->icol = perm;

1191:   PetscCall(PetscObjectReference((PetscObject)perm));
1192:   PetscCall(PetscMalloc1(am + 1, &b->solve_work));

1194:   b->maxnz = b->nz = ui[am];

1196:   B->info.factor_mallocs   = reallocs;
1197:   B->info.fill_ratio_given = fill;
1198:   if (ai[am] != 0.) {
1199:     /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1200:     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1201:   } else {
1202:     B->info.fill_ratio_needed = 0.0;
1203:   }
1204: #if defined(PETSC_USE_INFO)
1205:   if (ai[am] != 0) {
1206:     PetscReal af = B->info.fill_ratio_needed;
1207:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1208:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1209:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1210:   } else {
1211:     PetscCall(PetscInfo(A, "Empty matrix\n"));
1212:   }
1213: #endif
1214:   if (perm_identity) {
1215:     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1216:     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1217:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1218:   } else {
1219:     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1220:   }
1221:   PetscFunctionReturn(PETSC_SUCCESS);
1222: }

1224: PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1225: {
1226:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1227:   Mat_SeqSBAIJ      *b;
1228:   Mat                B;
1229:   PetscBool          perm_identity, missing;
1230:   PetscReal          fill = info->fill;
1231:   const PetscInt    *rip;
1232:   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1233:   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1234:   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1235:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1236:   PetscBT            lnkbt;

1238:   PetscFunctionBegin;
1239:   if (bs > 1) { /* convert to seqsbaij */
1240:     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1241:     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */

1243:     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1244:     PetscFunctionReturn(PETSC_SUCCESS);
1245:   }

1247:   PetscCall(MatMissingDiagonal(A, &missing, &i));
1248:   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);

1250:   /* check whether perm is the identity mapping */
1251:   PetscCall(ISIdentity(perm, &perm_identity));
1252:   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1253:   PetscCall(ISGetIndices(perm, &rip));

1255:   /* initialization */
1256:   PetscCall(PetscMalloc1(mbs + 1, &ui));
1257:   ui[0] = 0;

1259:   /* jl: linked list for storing indices of the pivot rows
1260:      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1261:   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1262:   for (i = 0; i < mbs; i++) {
1263:     jl[i] = mbs;
1264:     il[i] = 0;
1265:   }

1267:   /* create and initialize a linked list for storing column indices of the active row k */
1268:   nlnk = mbs + 1;
1269:   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));

1271:   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1272:   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));

1274:   current_space = free_space;

1276:   for (k = 0; k < mbs; k++) { /* for each active row k */
1277:     /* initialize lnk by the column indices of row rip[k] of A */
1278:     nzk         = 0;
1279:     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1280:     ncols_upper = 0;
1281:     for (j = 0; j < ncols; j++) {
1282:       i = rip[*(aj + ai[rip[k]] + j)];
1283:       if (i >= k) { /* only take upper triangular entry */
1284:         cols[ncols_upper] = i;
1285:         ncols_upper++;
1286:       }
1287:     }
1288:     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1289:     nzk += nlnk;

1291:     /* update lnk by computing fill-in for each pivot row to be merged in */
1292:     prow = jl[k]; /* 1st pivot row */

1294:     while (prow < k) {
1295:       nextprow = jl[prow];
1296:       /* merge prow into k-th row */
1297:       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1298:       jmax   = ui[prow + 1];
1299:       ncols  = jmax - jmin;
1300:       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1301:       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1302:       nzk += nlnk;

1304:       /* update il and jl for prow */
1305:       if (jmin < jmax) {
1306:         il[prow] = jmin;
1307:         j        = *uj_ptr;
1308:         jl[prow] = jl[j];
1309:         jl[j]    = prow;
1310:       }
1311:       prow = nextprow;
1312:     }

1314:     /* if free space is not available, make more free space */
1315:     if (current_space->local_remaining < nzk) {
1316:       i = mbs - k + 1;                                                            /* num of unfactored rows */
1317:       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1318:       PetscCall(PetscFreeSpaceGet(i, &current_space));
1319:       reallocs++;
1320:     }

1322:     /* copy data into free space, then initialize lnk */
1323:     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));

1325:     /* add the k-th row into il and jl */
1326:     if (nzk - 1 > 0) {
1327:       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1328:       jl[k] = jl[i];
1329:       jl[i] = k;
1330:       il[k] = ui[k] + 1;
1331:     }
1332:     ui_ptr[k] = current_space->array;
1333:     current_space->array += nzk;
1334:     current_space->local_used += nzk;
1335:     current_space->local_remaining -= nzk;

1337:     ui[k + 1] = ui[k] + nzk;
1338:   }

1340:   PetscCall(ISRestoreIndices(perm, &rip));
1341:   PetscCall(PetscFree4(ui_ptr, il, jl, cols));

1343:   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1344:   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1345:   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1346:   PetscCall(PetscLLDestroy(lnk, lnkbt));

1348:   /* put together the new matrix in MATSEQSBAIJ format */
1349:   B = fact;
1350:   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));

1352:   b               = (Mat_SeqSBAIJ *)B->data;
1353:   b->singlemalloc = PETSC_FALSE;
1354:   b->free_a       = PETSC_TRUE;
1355:   b->free_ij      = PETSC_TRUE;

1357:   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));

1359:   b->j             = uj;
1360:   b->i             = ui;
1361:   b->diag          = NULL;
1362:   b->ilen          = NULL;
1363:   b->imax          = NULL;
1364:   b->row           = perm;
1365:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

1367:   PetscCall(PetscObjectReference((PetscObject)perm));
1368:   b->icol = perm;
1369:   PetscCall(PetscObjectReference((PetscObject)perm));
1370:   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1371:   b->maxnz = b->nz = ui[mbs];

1373:   B->info.factor_mallocs   = reallocs;
1374:   B->info.fill_ratio_given = fill;
1375:   if (ai[mbs] != 0.) {
1376:     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1377:     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1378:   } else {
1379:     B->info.fill_ratio_needed = 0.0;
1380:   }
1381: #if defined(PETSC_USE_INFO)
1382:   if (ai[mbs] != 0.) {
1383:     PetscReal af = B->info.fill_ratio_needed;
1384:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1385:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1386:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1387:   } else {
1388:     PetscCall(PetscInfo(A, "Empty matrix\n"));
1389:   }
1390: #endif
1391:   if (perm_identity) {
1392:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1393:   } else {
1394:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1395:   }
1396:   PetscFunctionReturn(PETSC_SUCCESS);
1397: }

1399: PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1400: {
1401:   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1402:   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1403:   PetscInt           i, k, n                       = a->mbs;
1404:   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1405:   const MatScalar   *aa = a->a, *v;
1406:   PetscScalar       *x, *s, *t, *ls;
1407:   const PetscScalar *b;

1409:   PetscFunctionBegin;
1410:   PetscCall(VecGetArrayRead(bb, &b));
1411:   PetscCall(VecGetArray(xx, &x));
1412:   t = a->solve_work;

1414:   /* forward solve the lower triangular */
1415:   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */

1417:   for (i = 1; i < n; i++) {
1418:     v  = aa + bs2 * ai[i];
1419:     vi = aj + ai[i];
1420:     nz = ai[i + 1] - ai[i];
1421:     s  = t + bs * i;
1422:     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1423:     for (k = 0; k < nz; k++) {
1424:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1425:       v += bs2;
1426:     }
1427:   }

1429:   /* backward solve the upper triangular */
1430:   ls = a->solve_work + A->cmap->n;
1431:   for (i = n - 1; i >= 0; i--) {
1432:     v  = aa + bs2 * (adiag[i + 1] + 1);
1433:     vi = aj + adiag[i + 1] + 1;
1434:     nz = adiag[i] - adiag[i + 1] - 1;
1435:     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1436:     for (k = 0; k < nz; k++) {
1437:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1438:       v += bs2;
1439:     }
1440:     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1441:     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1442:   }

1444:   PetscCall(VecRestoreArrayRead(bb, &b));
1445:   PetscCall(VecRestoreArray(xx, &x));
1446:   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1447:   PetscFunctionReturn(PETSC_SUCCESS);
1448: }

1450: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1451: {
1452:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1453:   IS                 iscol = a->col, isrow = a->row;
1454:   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1455:   PetscInt           i, m, n = a->mbs;
1456:   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1457:   const MatScalar   *aa = a->a, *v;
1458:   PetscScalar       *x, *s, *t, *ls;
1459:   const PetscScalar *b;

1461:   PetscFunctionBegin;
1462:   PetscCall(VecGetArrayRead(bb, &b));
1463:   PetscCall(VecGetArray(xx, &x));
1464:   t = a->solve_work;

1466:   PetscCall(ISGetIndices(isrow, &rout));
1467:   r = rout;
1468:   PetscCall(ISGetIndices(iscol, &cout));
1469:   c = cout;

1471:   /* forward solve the lower triangular */
1472:   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1473:   for (i = 1; i < n; i++) {
1474:     v  = aa + bs2 * ai[i];
1475:     vi = aj + ai[i];
1476:     nz = ai[i + 1] - ai[i];
1477:     s  = t + bs * i;
1478:     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1479:     for (m = 0; m < nz; m++) {
1480:       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1481:       v += bs2;
1482:     }
1483:   }

1485:   /* backward solve the upper triangular */
1486:   ls = a->solve_work + A->cmap->n;
1487:   for (i = n - 1; i >= 0; i--) {
1488:     v  = aa + bs2 * (adiag[i + 1] + 1);
1489:     vi = aj + adiag[i + 1] + 1;
1490:     nz = adiag[i] - adiag[i + 1] - 1;
1491:     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1492:     for (m = 0; m < nz; m++) {
1493:       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1494:       v += bs2;
1495:     }
1496:     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1497:     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1498:   }
1499:   PetscCall(ISRestoreIndices(isrow, &rout));
1500:   PetscCall(ISRestoreIndices(iscol, &cout));
1501:   PetscCall(VecRestoreArrayRead(bb, &b));
1502:   PetscCall(VecRestoreArray(xx, &x));
1503:   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1504:   PetscFunctionReturn(PETSC_SUCCESS);
1505: }

1507: /*
1508:     For each block in an block array saves the largest absolute value in the block into another array
1509: */
1510: static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray)
1511: {
1512:   PetscInt i, j;

1514:   PetscFunctionBegin;
1515:   PetscCall(PetscArrayzero(absarray, nbs + 1));
1516:   for (i = 0; i < nbs; i++) {
1517:     for (j = 0; j < bs2; j++) {
1518:       if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1519:     }
1520:   }
1521:   PetscFunctionReturn(PETSC_SUCCESS);
1522: }

1524: /*
1525:      This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1526: */
1527: PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact)
1528: {
1529:   Mat             B = *fact;
1530:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b;
1531:   IS              isicol;
1532:   const PetscInt *r, *ic;
1533:   PetscInt        i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1534:   PetscInt       *bi, *bj, *bdiag;

1536:   PetscInt   row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1537:   PetscInt   nlnk, *lnk;
1538:   PetscBT    lnkbt;
1539:   PetscBool  row_identity, icol_identity;
1540:   MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1541:   PetscInt   j, nz, *pj, *bjtmp, k, ncut, *jtmp;

1543:   PetscReal  dt = info->dt; /* shift=info->shiftamount; */
1544:   PetscInt   nnz_max;
1545:   PetscBool  missing;
1546:   PetscReal *vtmp_abs;
1547:   MatScalar *v_work;
1548:   PetscInt  *v_pivots;
1549:   PetscBool  allowzeropivot, zeropivotdetected = PETSC_FALSE;

1551:   PetscFunctionBegin;
1552:   /* symbolic factorization, can be reused */
1553:   PetscCall(MatMissingDiagonal(A, &missing, &i));
1554:   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1555:   adiag = a->diag;

1557:   PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));

1559:   /* bdiag is location of diagonal in factor */
1560:   PetscCall(PetscMalloc1(mbs + 1, &bdiag));

1562:   /* allocate row pointers bi */
1563:   PetscCall(PetscMalloc1(2 * mbs + 2, &bi));

1565:   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1566:   dtcount = (PetscInt)info->dtcount;
1567:   if (dtcount > mbs - 1) dtcount = mbs - 1;
1568:   nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1569:   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1570:   PetscCall(PetscMalloc1(nnz_max, &bj));
1571:   nnz_max = nnz_max * bs2;
1572:   PetscCall(PetscMalloc1(nnz_max, &ba));

1574:   /* put together the new matrix */
1575:   PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));

1577:   b               = (Mat_SeqBAIJ *)(B)->data;
1578:   b->free_a       = PETSC_TRUE;
1579:   b->free_ij      = PETSC_TRUE;
1580:   b->singlemalloc = PETSC_FALSE;

1582:   b->a    = ba;
1583:   b->j    = bj;
1584:   b->i    = bi;
1585:   b->diag = bdiag;
1586:   b->ilen = NULL;
1587:   b->imax = NULL;
1588:   b->row  = isrow;
1589:   b->col  = iscol;

1591:   PetscCall(PetscObjectReference((PetscObject)isrow));
1592:   PetscCall(PetscObjectReference((PetscObject)iscol));

1594:   b->icol = isicol;
1595:   PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work));
1596:   b->maxnz = nnz_max / bs2;

1598:   (B)->factortype            = MAT_FACTOR_ILUDT;
1599:   (B)->info.factor_mallocs   = 0;
1600:   (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1601:   /* end of symbolic factorization */
1602:   PetscCall(ISGetIndices(isrow, &r));
1603:   PetscCall(ISGetIndices(isicol, &ic));

1605:   /* linked list for storing column indices of the active row */
1606:   nlnk = mbs + 1;
1607:   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));

1609:   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1610:   PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp));
1611:   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1612:   PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp));
1613:   PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs));
1614:   PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots));

1616:   allowzeropivot  = PetscNot(A->erroriffailure);
1617:   bi[0]           = 0;
1618:   bdiag[0]        = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1619:   bi[2 * mbs + 1] = bdiag[0] + 1;        /* endof bj and ba array */
1620:   for (i = 0; i < mbs; i++) {
1621:     /* copy initial fill into linked list */
1622:     nzi = ai[r[i] + 1] - ai[r[i]];
1623:     PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
1624:     nzi_al = adiag[r[i]] - ai[r[i]];
1625:     nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;

1627:     /* load in initial unfactored row */
1628:     ajtmp = aj + ai[r[i]];
1629:     PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt));
1630:     PetscCall(PetscArrayzero(rtmp, mbs * bs2));
1631:     aatmp = a->a + bs2 * ai[r[i]];
1632:     for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2));

1634:     /* add pivot rows into linked list */
1635:     row = lnk[mbs];
1636:     while (row < i) {
1637:       nzi_bl = bi[row + 1] - bi[row] + 1;
1638:       bjtmp  = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1639:       PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im));
1640:       nzi += nlnk;
1641:       row = lnk[row];
1642:     }

1644:     /* copy data from lnk into jtmp, then initialize lnk */
1645:     PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt));

1647:     /* numerical factorization */
1648:     bjtmp = jtmp;
1649:     row   = *bjtmp++; /* 1st pivot row */

1651:     while (row < i) {
1652:       pc = rtmp + bs2 * row;
1653:       pv = ba + bs2 * bdiag[row];                           /* inv(diag) of the pivot row */
1654:       PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1655:       PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs));
1656:       if (vtmp_abs[0] > dt) {         /* apply tolerance dropping rule */
1657:         pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1658:         pv = ba + bs2 * (bdiag[row + 1] + 1);
1659:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1660:         for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
1661:         /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */
1662:       }
1663:       row = *bjtmp++;
1664:     }

1666:     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1667:     nzi_bl = 0;
1668:     j      = 0;
1669:     while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1670:       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1671:       nzi_bl++;
1672:       j++;
1673:     }
1674:     nzi_bu = nzi - nzi_bl - 1;

1676:     while (j < nzi) { /* U-part */
1677:       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1678:       j++;
1679:     }

1681:     PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs));

1683:     bjtmp = bj + bi[i];
1684:     batmp = ba + bs2 * bi[i];
1685:     /* apply level dropping rule to L part */
1686:     ncut = nzi_al + dtcount;
1687:     if (ncut < nzi_bl) {
1688:       PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp));
1689:       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp));
1690:     } else {
1691:       ncut = nzi_bl;
1692:     }
1693:     for (j = 0; j < ncut; j++) {
1694:       bjtmp[j] = jtmp[j];
1695:       PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2));
1696:     }
1697:     bi[i + 1] = bi[i] + ncut;
1698:     nzi       = ncut + 1;

1700:     /* apply level dropping rule to U part */
1701:     ncut = nzi_au + dtcount;
1702:     if (ncut < nzi_bu) {
1703:       PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1));
1704:       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1));
1705:     } else {
1706:       ncut = nzi_bu;
1707:     }
1708:     nzi += ncut;

1710:     /* mark bdiagonal */
1711:     bdiag[i + 1]    = bdiag[i] - (ncut + 1);
1712:     bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);

1714:     bjtmp = bj + bdiag[i];
1715:     batmp = ba + bs2 * bdiag[i];
1716:     PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2));
1717:     *bjtmp = i;

1719:     bjtmp = bj + bdiag[i + 1] + 1;
1720:     batmp = ba + (bdiag[i + 1] + 1) * bs2;

1722:     for (k = 0; k < ncut; k++) {
1723:       bjtmp[k] = jtmp[nzi_bl + 1 + k];
1724:       PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2));
1725:     }

1727:     im[i] = nzi; /* used by PetscLLAddSortedLU() */

1729:     /* invert diagonal block for simpler triangular solves - add shift??? */
1730:     batmp = ba + bs2 * bdiag[i];

1732:     PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
1733:     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1734:   } /* for (i=0; i<mbs; i++) */
1735:   PetscCall(PetscFree3(v_work, multiplier, v_pivots));

1737:   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1738:   PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]);

1740:   PetscCall(ISRestoreIndices(isrow, &r));
1741:   PetscCall(ISRestoreIndices(isicol, &ic));

1743:   PetscCall(PetscLLDestroy(lnk, lnkbt));

1745:   PetscCall(PetscFree2(im, jtmp));
1746:   PetscCall(PetscFree2(rtmp, vtmp));

1748:   PetscCall(PetscLogFlops(bs2 * B->cmap->n));
1749:   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];

1751:   PetscCall(ISIdentity(isrow, &row_identity));
1752:   PetscCall(ISIdentity(isicol, &icol_identity));
1753:   if (row_identity && icol_identity) {
1754:     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1755:   } else {
1756:     B->ops->solve = MatSolve_SeqBAIJ_N;
1757:   }

1759:   B->ops->solveadd          = NULL;
1760:   B->ops->solvetranspose    = NULL;
1761:   B->ops->solvetransposeadd = NULL;
1762:   B->ops->matsolve          = NULL;
1763:   B->assembled              = PETSC_TRUE;
1764:   B->preallocated           = PETSC_TRUE;
1765:   PetscFunctionReturn(PETSC_SUCCESS);
1766: }