Actual source code: baijfact.c
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <petsc/private/kernels/blockinvert.h>
8: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
9: {
10: Mat C = B;
11: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
12: IS isrow = b->row, isicol = b->icol;
13: const PetscInt *r, *ic;
14: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
15: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
16: MatScalar *rtmp, *pc, *mwork, *pv;
17: MatScalar *aa = a->a, *v;
18: PetscReal shift = info->shiftamount;
19: const PetscBool allowzeropivot = PetscNot(A->erroriffailure);
21: PetscFunctionBegin;
22: PetscCall(ISGetIndices(isrow, &r));
23: PetscCall(ISGetIndices(isicol, &ic));
25: /* generate work space needed by the factorization */
26: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
27: PetscCall(PetscArrayzero(rtmp, bs2 * n));
29: for (PetscInt i = 0; i < n; i++) {
30: /* zero rtmp */
31: /* L part */
32: PetscInt *pj;
33: PetscInt nzL, nz = bi[i + 1] - bi[i];
34: bjtmp = bj + bi[i];
35: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
37: /* U part */
38: nz = bdiag[i] - bdiag[i + 1];
39: bjtmp = bj + bdiag[i + 1] + 1;
40: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
42: /* load in initial (unfactored row) */
43: nz = ai[r[i] + 1] - ai[r[i]];
44: ajtmp = aj + ai[r[i]];
45: v = aa + bs2 * ai[r[i]];
46: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
48: /* elimination */
49: bjtmp = bj + bi[i];
50: nzL = bi[i + 1] - bi[i];
51: for (PetscInt k = 0; k < nzL; k++) {
52: PetscBool flg = PETSC_FALSE;
53: PetscInt row = bjtmp[k];
55: pc = rtmp + bs2 * row;
56: for (PetscInt j = 0; j < bs2; j++) {
57: if (pc[j] != (PetscScalar)0.0) {
58: flg = PETSC_TRUE;
59: break;
60: }
61: }
62: if (flg) {
63: pv = b->a + bs2 * bdiag[row];
64: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
65: PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
67: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
68: pv = b->a + bs2 * (bdiag[row + 1] + 1);
69: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
70: for (PetscInt j = 0; j < nz; j++) {
71: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
72: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
73: v = rtmp + 4 * pj[j];
74: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
75: pv += 4;
76: }
77: PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
78: }
79: }
81: /* finished row so stick it into b->a */
82: /* L part */
83: pv = b->a + bs2 * bi[i];
84: pj = b->j + bi[i];
85: nz = bi[i + 1] - bi[i];
86: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
88: /* Mark diagonal and invert diagonal for simpler triangular solves */
89: pv = b->a + bs2 * bdiag[i];
90: pj = b->j + bdiag[i];
91: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
92: {
93: PetscBool zeropivotdetected;
95: PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
96: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
97: }
99: /* U part */
100: pv = b->a + bs2 * (bdiag[i + 1] + 1);
101: pj = b->j + bdiag[i + 1] + 1;
102: nz = bdiag[i] - bdiag[i + 1] - 1;
103: for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
104: }
106: PetscCall(PetscFree2(rtmp, mwork));
107: PetscCall(ISRestoreIndices(isicol, &ic));
108: PetscCall(ISRestoreIndices(isrow, &r));
110: C->ops->solve = MatSolve_SeqBAIJ_2;
111: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
112: C->assembled = PETSC_TRUE;
114: PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
115: PetscFunctionReturn(PETSC_SUCCESS);
116: }
118: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
119: {
120: Mat C = B;
121: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
122: PetscInt i, j, k, nz, nzL, row, *pj;
123: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
124: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
125: MatScalar *rtmp, *pc, *mwork, *pv;
126: MatScalar *aa = a->a, *v;
127: PetscInt flg;
128: PetscReal shift = info->shiftamount;
129: PetscBool allowzeropivot, zeropivotdetected;
131: PetscFunctionBegin;
132: allowzeropivot = PetscNot(A->erroriffailure);
134: /* generate work space needed by the factorization */
135: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
136: PetscCall(PetscArrayzero(rtmp, bs2 * n));
138: for (i = 0; i < n; i++) {
139: /* zero rtmp */
140: /* L part */
141: nz = bi[i + 1] - bi[i];
142: bjtmp = bj + bi[i];
143: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
145: /* U part */
146: nz = bdiag[i] - bdiag[i + 1];
147: bjtmp = bj + bdiag[i + 1] + 1;
148: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
150: /* load in initial (unfactored row) */
151: nz = ai[i + 1] - ai[i];
152: ajtmp = aj + ai[i];
153: v = aa + bs2 * ai[i];
154: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
156: /* elimination */
157: bjtmp = bj + bi[i];
158: nzL = bi[i + 1] - bi[i];
159: for (k = 0; k < nzL; k++) {
160: row = bjtmp[k];
161: pc = rtmp + bs2 * row;
162: for (flg = 0, j = 0; j < bs2; j++) {
163: if (pc[j] != (PetscScalar)0.0) {
164: flg = 1;
165: break;
166: }
167: }
168: if (flg) {
169: pv = b->a + bs2 * bdiag[row];
170: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
171: PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
173: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
174: pv = b->a + bs2 * (bdiag[row + 1] + 1);
175: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
176: for (j = 0; j < nz; j++) {
177: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
178: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
179: v = rtmp + 4 * pj[j];
180: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
181: pv += 4;
182: }
183: PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
184: }
185: }
187: /* finished row so stick it into b->a */
188: /* L part */
189: pv = b->a + bs2 * bi[i];
190: pj = b->j + bi[i];
191: nz = bi[i + 1] - bi[i];
192: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
194: /* Mark diagonal and invert diagonal for simpler triangular solves */
195: pv = b->a + bs2 * bdiag[i];
196: pj = b->j + bdiag[i];
197: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
198: PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
199: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
201: /* U part */
202: /*
203: pv = b->a + bs2*bi[2*n-i];
204: pj = b->j + bi[2*n-i];
205: nz = bi[2*n-i+1] - bi[2*n-i] - 1;
206: */
207: pv = b->a + bs2 * (bdiag[i + 1] + 1);
208: pj = b->j + bdiag[i + 1] + 1;
209: nz = bdiag[i] - bdiag[i + 1] - 1;
210: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
211: }
212: PetscCall(PetscFree2(rtmp, mwork));
214: C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
215: C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
216: C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
217: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
218: C->assembled = PETSC_TRUE;
220: PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
221: PetscFunctionReturn(PETSC_SUCCESS);
222: }
224: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
225: {
226: Mat C = B;
227: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
228: IS isrow = b->row, isicol = b->icol;
229: const PetscInt *r, *ic;
230: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
231: PetscInt *ajtmpold, *ajtmp, nz, row;
232: PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
233: MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
234: MatScalar p1, p2, p3, p4;
235: MatScalar *ba = b->a, *aa = a->a;
236: PetscReal shift = info->shiftamount;
237: PetscBool allowzeropivot, zeropivotdetected;
239: PetscFunctionBegin;
240: allowzeropivot = PetscNot(A->erroriffailure);
241: PetscCall(ISGetIndices(isrow, &r));
242: PetscCall(ISGetIndices(isicol, &ic));
243: PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
245: for (i = 0; i < n; i++) {
246: nz = bi[i + 1] - bi[i];
247: ajtmp = bj + bi[i];
248: for (j = 0; j < nz; j++) {
249: x = rtmp + 4 * ajtmp[j];
250: x[0] = x[1] = x[2] = x[3] = 0.0;
251: }
252: /* load in initial (unfactored row) */
253: idx = r[i];
254: nz = ai[idx + 1] - ai[idx];
255: ajtmpold = aj + ai[idx];
256: v = aa + 4 * ai[idx];
257: for (j = 0; j < nz; j++) {
258: x = rtmp + 4 * ic[ajtmpold[j]];
259: x[0] = v[0];
260: x[1] = v[1];
261: x[2] = v[2];
262: x[3] = v[3];
263: v += 4;
264: }
265: row = *ajtmp++;
266: while (row < i) {
267: pc = rtmp + 4 * row;
268: p1 = pc[0];
269: p2 = pc[1];
270: p3 = pc[2];
271: p4 = pc[3];
272: if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
273: pv = ba + 4 * diag_offset[row];
274: pj = bj + diag_offset[row] + 1;
275: x1 = pv[0];
276: x2 = pv[1];
277: x3 = pv[2];
278: x4 = pv[3];
279: pc[0] = m1 = p1 * x1 + p3 * x2;
280: pc[1] = m2 = p2 * x1 + p4 * x2;
281: pc[2] = m3 = p1 * x3 + p3 * x4;
282: pc[3] = m4 = p2 * x3 + p4 * x4;
283: nz = bi[row + 1] - diag_offset[row] - 1;
284: pv += 4;
285: for (j = 0; j < nz; j++) {
286: x1 = pv[0];
287: x2 = pv[1];
288: x3 = pv[2];
289: x4 = pv[3];
290: x = rtmp + 4 * pj[j];
291: x[0] -= m1 * x1 + m3 * x2;
292: x[1] -= m2 * x1 + m4 * x2;
293: x[2] -= m1 * x3 + m3 * x4;
294: x[3] -= m2 * x3 + m4 * x4;
295: pv += 4;
296: }
297: PetscCall(PetscLogFlops(16.0 * nz + 12.0));
298: }
299: row = *ajtmp++;
300: }
301: /* finished row so stick it into b->a */
302: pv = ba + 4 * bi[i];
303: pj = bj + bi[i];
304: nz = bi[i + 1] - bi[i];
305: for (j = 0; j < nz; j++) {
306: x = rtmp + 4 * pj[j];
307: pv[0] = x[0];
308: pv[1] = x[1];
309: pv[2] = x[2];
310: pv[3] = x[3];
311: pv += 4;
312: }
313: /* invert diagonal block */
314: w = ba + 4 * diag_offset[i];
315: PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
316: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
317: }
319: PetscCall(PetscFree(rtmp));
320: PetscCall(ISRestoreIndices(isicol, &ic));
321: PetscCall(ISRestoreIndices(isrow, &r));
323: C->ops->solve = MatSolve_SeqBAIJ_2_inplace;
324: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
325: C->assembled = PETSC_TRUE;
327: PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
328: PetscFunctionReturn(PETSC_SUCCESS);
329: }
330: /*
331: Version for when blocks are 2 by 2 Using natural ordering
332: */
333: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
334: {
335: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
336: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
337: PetscInt *ajtmpold, *ajtmp, nz, row;
338: PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
339: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
340: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
341: MatScalar *ba = b->a, *aa = a->a;
342: PetscReal shift = info->shiftamount;
343: PetscBool allowzeropivot, zeropivotdetected;
345: PetscFunctionBegin;
346: allowzeropivot = PetscNot(A->erroriffailure);
347: PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
348: for (i = 0; i < n; i++) {
349: nz = bi[i + 1] - bi[i];
350: ajtmp = bj + bi[i];
351: for (j = 0; j < nz; j++) {
352: x = rtmp + 4 * ajtmp[j];
353: x[0] = x[1] = x[2] = x[3] = 0.0;
354: }
355: /* load in initial (unfactored row) */
356: nz = ai[i + 1] - ai[i];
357: ajtmpold = aj + ai[i];
358: v = aa + 4 * ai[i];
359: for (j = 0; j < nz; j++) {
360: x = rtmp + 4 * ajtmpold[j];
361: x[0] = v[0];
362: x[1] = v[1];
363: x[2] = v[2];
364: x[3] = v[3];
365: v += 4;
366: }
367: row = *ajtmp++;
368: while (row < i) {
369: pc = rtmp + 4 * row;
370: p1 = pc[0];
371: p2 = pc[1];
372: p3 = pc[2];
373: p4 = pc[3];
374: if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
375: pv = ba + 4 * diag_offset[row];
376: pj = bj + diag_offset[row] + 1;
377: x1 = pv[0];
378: x2 = pv[1];
379: x3 = pv[2];
380: x4 = pv[3];
381: pc[0] = m1 = p1 * x1 + p3 * x2;
382: pc[1] = m2 = p2 * x1 + p4 * x2;
383: pc[2] = m3 = p1 * x3 + p3 * x4;
384: pc[3] = m4 = p2 * x3 + p4 * x4;
385: nz = bi[row + 1] - diag_offset[row] - 1;
386: pv += 4;
387: for (j = 0; j < nz; j++) {
388: x1 = pv[0];
389: x2 = pv[1];
390: x3 = pv[2];
391: x4 = pv[3];
392: x = rtmp + 4 * pj[j];
393: x[0] -= m1 * x1 + m3 * x2;
394: x[1] -= m2 * x1 + m4 * x2;
395: x[2] -= m1 * x3 + m3 * x4;
396: x[3] -= m2 * x3 + m4 * x4;
397: pv += 4;
398: }
399: PetscCall(PetscLogFlops(16.0 * nz + 12.0));
400: }
401: row = *ajtmp++;
402: }
403: /* finished row so stick it into b->a */
404: pv = ba + 4 * bi[i];
405: pj = bj + bi[i];
406: nz = bi[i + 1] - bi[i];
407: for (j = 0; j < nz; j++) {
408: x = rtmp + 4 * pj[j];
409: pv[0] = x[0];
410: pv[1] = x[1];
411: pv[2] = x[2];
412: pv[3] = x[3];
413: /*
414: printf(" col %d:",pj[j]);
415: PetscInt j1;
416: for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
417: printf("\n");
418: */
419: pv += 4;
420: }
421: /* invert diagonal block */
422: w = ba + 4 * diag_offset[i];
423: PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
424: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
425: }
427: PetscCall(PetscFree(rtmp));
429: C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
430: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
431: C->assembled = PETSC_TRUE;
433: PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
434: PetscFunctionReturn(PETSC_SUCCESS);
435: }
437: /*
438: Version for when blocks are 1 by 1.
439: */
440: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
441: {
442: Mat C = B;
443: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
444: IS isrow = b->row, isicol = b->icol;
445: const PetscInt *r, *ic, *ics;
446: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
447: PetscInt i, j, k, nz, nzL, row, *pj;
448: const PetscInt *ajtmp, *bjtmp;
449: MatScalar *rtmp, *pc, multiplier, *pv;
450: const MatScalar *aa = a->a, *v;
451: PetscBool row_identity, col_identity;
452: FactorShiftCtx sctx;
453: const PetscInt *ddiag;
454: PetscReal rs;
455: MatScalar d;
457: PetscFunctionBegin;
458: /* MatPivotSetUp(): initialize shift context sctx */
459: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
461: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
462: ddiag = a->diag;
463: sctx.shift_top = info->zeropivot;
464: for (i = 0; i < n; i++) {
465: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
466: d = (aa)[ddiag[i]];
467: rs = -PetscAbsScalar(d) - PetscRealPart(d);
468: v = aa + ai[i];
469: nz = ai[i + 1] - ai[i];
470: for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
471: if (rs > sctx.shift_top) sctx.shift_top = rs;
472: }
473: sctx.shift_top *= 1.1;
474: sctx.nshift_max = 5;
475: sctx.shift_lo = 0.;
476: sctx.shift_hi = 1.;
477: }
479: PetscCall(ISGetIndices(isrow, &r));
480: PetscCall(ISGetIndices(isicol, &ic));
481: PetscCall(PetscMalloc1(n + 1, &rtmp));
482: ics = ic;
484: do {
485: sctx.newshift = PETSC_FALSE;
486: for (i = 0; i < n; i++) {
487: /* zero rtmp */
488: /* L part */
489: nz = bi[i + 1] - bi[i];
490: bjtmp = bj + bi[i];
491: for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
493: /* U part */
494: nz = bdiag[i] - bdiag[i + 1];
495: bjtmp = bj + bdiag[i + 1] + 1;
496: for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
498: /* load in initial (unfactored row) */
499: nz = ai[r[i] + 1] - ai[r[i]];
500: ajtmp = aj + ai[r[i]];
501: v = aa + ai[r[i]];
502: for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
504: /* ZeropivotApply() */
505: rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
507: /* elimination */
508: bjtmp = bj + bi[i];
509: row = *bjtmp++;
510: nzL = bi[i + 1] - bi[i];
511: for (k = 0; k < nzL; k++) {
512: pc = rtmp + row;
513: if (*pc != (PetscScalar)0.0) {
514: pv = b->a + bdiag[row];
515: multiplier = *pc * (*pv);
516: *pc = multiplier;
518: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
519: pv = b->a + bdiag[row + 1] + 1;
520: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
521: for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
522: PetscCall(PetscLogFlops(2.0 * nz));
523: }
524: row = *bjtmp++;
525: }
527: /* finished row so stick it into b->a */
528: rs = 0.0;
529: /* L part */
530: pv = b->a + bi[i];
531: pj = b->j + bi[i];
532: nz = bi[i + 1] - bi[i];
533: for (j = 0; j < nz; j++) {
534: pv[j] = rtmp[pj[j]];
535: rs += PetscAbsScalar(pv[j]);
536: }
538: /* U part */
539: pv = b->a + bdiag[i + 1] + 1;
540: pj = b->j + bdiag[i + 1] + 1;
541: nz = bdiag[i] - bdiag[i + 1] - 1;
542: for (j = 0; j < nz; j++) {
543: pv[j] = rtmp[pj[j]];
544: rs += PetscAbsScalar(pv[j]);
545: }
547: sctx.rs = rs;
548: sctx.pv = rtmp[i];
549: PetscCall(MatPivotCheck(B, A, info, &sctx, i));
550: if (sctx.newshift) break; /* break for-loop */
551: rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
553: /* Mark diagonal and invert diagonal for simpler triangular solves */
554: pv = b->a + bdiag[i];
555: *pv = (PetscScalar)1.0 / rtmp[i];
557: } /* endof for (i=0; i<n; i++) { */
559: /* MatPivotRefine() */
560: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
561: /*
562: * if no shift in this attempt & shifting & started shifting & can refine,
563: * then try lower shift
564: */
565: sctx.shift_hi = sctx.shift_fraction;
566: sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
567: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
568: sctx.newshift = PETSC_TRUE;
569: sctx.nshift++;
570: }
571: } while (sctx.newshift);
573: PetscCall(PetscFree(rtmp));
574: PetscCall(ISRestoreIndices(isicol, &ic));
575: PetscCall(ISRestoreIndices(isrow, &r));
577: PetscCall(ISIdentity(isrow, &row_identity));
578: PetscCall(ISIdentity(isicol, &col_identity));
579: if (row_identity && col_identity) {
580: C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
581: C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
582: C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
583: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
584: } else {
585: C->ops->solve = MatSolve_SeqBAIJ_1;
586: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
587: }
588: C->assembled = PETSC_TRUE;
589: PetscCall(PetscLogFlops(C->cmap->n));
591: /* MatShiftView(A,info,&sctx) */
592: if (sctx.nshift) {
593: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
594: PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
595: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
596: PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
597: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
598: PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
599: }
600: }
601: PetscFunctionReturn(PETSC_SUCCESS);
602: }
604: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
605: {
606: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
607: IS isrow = b->row, isicol = b->icol;
608: const PetscInt *r, *ic;
609: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
610: PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
611: PetscInt *diag_offset = b->diag, diag, *pj;
612: MatScalar *pv, *v, *rtmp, multiplier, *pc;
613: MatScalar *ba = b->a, *aa = a->a;
614: PetscBool row_identity, col_identity;
616: PetscFunctionBegin;
617: PetscCall(ISGetIndices(isrow, &r));
618: PetscCall(ISGetIndices(isicol, &ic));
619: PetscCall(PetscMalloc1(n + 1, &rtmp));
621: for (i = 0; i < n; i++) {
622: nz = bi[i + 1] - bi[i];
623: ajtmp = bj + bi[i];
624: for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
626: /* load in initial (unfactored row) */
627: nz = ai[r[i] + 1] - ai[r[i]];
628: ajtmpold = aj + ai[r[i]];
629: v = aa + ai[r[i]];
630: for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
632: row = *ajtmp++;
633: while (row < i) {
634: pc = rtmp + row;
635: if (*pc != 0.0) {
636: pv = ba + diag_offset[row];
637: pj = bj + diag_offset[row] + 1;
638: multiplier = *pc * *pv++;
639: *pc = multiplier;
640: nz = bi[row + 1] - diag_offset[row] - 1;
641: for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
642: PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
643: }
644: row = *ajtmp++;
645: }
646: /* finished row so stick it into b->a */
647: pv = ba + bi[i];
648: pj = bj + bi[i];
649: nz = bi[i + 1] - bi[i];
650: for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
651: diag = diag_offset[i] - bi[i];
652: /* check pivot entry for current row */
653: PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
654: pv[diag] = 1.0 / pv[diag];
655: }
657: PetscCall(PetscFree(rtmp));
658: PetscCall(ISRestoreIndices(isicol, &ic));
659: PetscCall(ISRestoreIndices(isrow, &r));
660: PetscCall(ISIdentity(isrow, &row_identity));
661: PetscCall(ISIdentity(isicol, &col_identity));
662: if (row_identity && col_identity) {
663: C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
664: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
665: } else {
666: C->ops->solve = MatSolve_SeqBAIJ_1_inplace;
667: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
668: }
669: C->assembled = PETSC_TRUE;
670: PetscCall(PetscLogFlops(C->cmap->n));
671: PetscFunctionReturn(PETSC_SUCCESS);
672: }
674: static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
675: {
676: PetscFunctionBegin;
677: *type = MATSOLVERPETSC;
678: PetscFunctionReturn(PETSC_SUCCESS);
679: }
681: PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
682: {
683: PetscInt n = A->rmap->n;
685: PetscFunctionBegin;
686: #if defined(PETSC_USE_COMPLEX)
687: PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
688: #endif
689: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
690: PetscCall(MatSetSizes(*B, n, n, n, n));
691: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
692: PetscCall(MatSetType(*B, MATSEQBAIJ));
694: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ;
695: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
696: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
697: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
698: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
699: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
700: PetscCall(MatSetType(*B, MATSEQSBAIJ));
701: PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
703: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ;
704: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
705: /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
706: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
707: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
708: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
709: (*B)->factortype = ftype;
710: (*B)->canuseordering = PETSC_TRUE;
712: PetscCall(PetscFree((*B)->solvertype));
713: PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
714: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
715: PetscFunctionReturn(PETSC_SUCCESS);
716: }
718: PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
719: {
720: Mat C;
722: PetscFunctionBegin;
723: PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
724: PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
725: PetscCall(MatLUFactorNumeric(C, A, info));
727: A->ops->solve = C->ops->solve;
728: A->ops->solvetranspose = C->ops->solvetranspose;
730: PetscCall(MatHeaderMerge(A, &C));
731: PetscFunctionReturn(PETSC_SUCCESS);
732: }
734: #include <../src/mat/impls/sbaij/seq/sbaij.h>
735: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
736: {
737: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
738: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
739: IS ip = b->row;
740: const PetscInt *rip;
741: PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
742: PetscInt *ai = a->i, *aj = a->j;
743: PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
744: MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
745: PetscReal rs;
746: FactorShiftCtx sctx;
748: PetscFunctionBegin;
749: if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
750: if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
751: PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info));
752: PetscCall(MatDestroy(&a->sbaijMat));
753: PetscFunctionReturn(PETSC_SUCCESS);
754: }
756: /* MatPivotSetUp(): initialize shift context sctx */
757: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
759: PetscCall(ISGetIndices(ip, &rip));
760: PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
762: sctx.shift_amount = 0.;
763: sctx.nshift = 0;
764: do {
765: sctx.newshift = PETSC_FALSE;
766: for (i = 0; i < mbs; i++) {
767: rtmp[i] = 0.0;
768: jl[i] = mbs;
769: il[0] = 0;
770: }
772: for (k = 0; k < mbs; k++) {
773: bval = ba + bi[k];
774: /* initialize k-th row by the perm[k]-th row of A */
775: jmin = ai[rip[k]];
776: jmax = ai[rip[k] + 1];
777: for (j = jmin; j < jmax; j++) {
778: col = rip[aj[j]];
779: if (col >= k) { /* only take upper triangular entry */
780: rtmp[col] = aa[j];
781: *bval++ = 0.0; /* for in-place factorization */
782: }
783: }
785: /* shift the diagonal of the matrix */
786: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
788: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
789: dk = rtmp[k];
790: i = jl[k]; /* first row to be added to k_th row */
792: while (i < k) {
793: nexti = jl[i]; /* next row to be added to k_th row */
795: /* compute multiplier, update diag(k) and U(i,k) */
796: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
797: uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
798: dk += uikdi * ba[ili];
799: ba[ili] = uikdi; /* -U(i,k) */
801: /* add multiple of row i to k-th row */
802: jmin = ili + 1;
803: jmax = bi[i + 1];
804: if (jmin < jmax) {
805: for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
806: /* update il and jl for row i */
807: il[i] = jmin;
808: j = bj[jmin];
809: jl[i] = jl[j];
810: jl[j] = i;
811: }
812: i = nexti;
813: }
815: /* shift the diagonals when zero pivot is detected */
816: /* compute rs=sum of abs(off-diagonal) */
817: rs = 0.0;
818: jmin = bi[k] + 1;
819: nz = bi[k + 1] - jmin;
820: if (nz) {
821: bcol = bj + jmin;
822: while (nz--) {
823: rs += PetscAbsScalar(rtmp[*bcol]);
824: bcol++;
825: }
826: }
828: sctx.rs = rs;
829: sctx.pv = dk;
830: PetscCall(MatPivotCheck(C, A, info, &sctx, k));
831: if (sctx.newshift) break;
832: dk = sctx.pv;
834: /* copy data into U(k,:) */
835: ba[bi[k]] = 1.0 / dk; /* U(k,k) */
836: jmin = bi[k] + 1;
837: jmax = bi[k + 1];
838: if (jmin < jmax) {
839: for (j = jmin; j < jmax; j++) {
840: col = bj[j];
841: ba[j] = rtmp[col];
842: rtmp[col] = 0.0;
843: }
844: /* add the k-th row into il and jl */
845: il[k] = jmin;
846: i = bj[jmin];
847: jl[k] = jl[i];
848: jl[i] = k;
849: }
850: }
851: } while (sctx.newshift);
852: PetscCall(PetscFree3(rtmp, il, jl));
854: PetscCall(ISRestoreIndices(ip, &rip));
856: C->assembled = PETSC_TRUE;
857: C->preallocated = PETSC_TRUE;
859: PetscCall(PetscLogFlops(C->rmap->N));
860: if (sctx.nshift) {
861: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
862: PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
863: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
864: PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
865: }
866: }
867: PetscFunctionReturn(PETSC_SUCCESS);
868: }
870: PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
871: {
872: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
873: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
874: PetscInt i, j, am = a->mbs;
875: PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
876: PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
877: MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
878: PetscReal rs;
879: FactorShiftCtx sctx;
881: PetscFunctionBegin;
882: /* MatPivotSetUp(): initialize shift context sctx */
883: PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
885: PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
887: do {
888: sctx.newshift = PETSC_FALSE;
889: for (i = 0; i < am; i++) {
890: rtmp[i] = 0.0;
891: jl[i] = am;
892: il[0] = 0;
893: }
895: for (k = 0; k < am; k++) {
896: /* initialize k-th row with elements nonzero in row perm(k) of A */
897: nz = ai[k + 1] - ai[k];
898: acol = aj + ai[k];
899: aval = aa + ai[k];
900: bval = ba + bi[k];
901: while (nz--) {
902: if (*acol < k) { /* skip lower triangular entries */
903: acol++;
904: aval++;
905: } else {
906: rtmp[*acol++] = *aval++;
907: *bval++ = 0.0; /* for in-place factorization */
908: }
909: }
911: /* shift the diagonal of the matrix */
912: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
914: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
915: dk = rtmp[k];
916: i = jl[k]; /* first row to be added to k_th row */
918: while (i < k) {
919: nexti = jl[i]; /* next row to be added to k_th row */
920: /* compute multiplier, update D(k) and U(i,k) */
921: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
922: uikdi = -ba[ili] * ba[bi[i]];
923: dk += uikdi * ba[ili];
924: ba[ili] = uikdi; /* -U(i,k) */
926: /* add multiple of row i to k-th row ... */
927: jmin = ili + 1;
928: nz = bi[i + 1] - jmin;
929: if (nz > 0) {
930: bcol = bj + jmin;
931: bval = ba + jmin;
932: while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
933: /* update il and jl for i-th row */
934: il[i] = jmin;
935: j = bj[jmin];
936: jl[i] = jl[j];
937: jl[j] = i;
938: }
939: i = nexti;
940: }
942: /* shift the diagonals when zero pivot is detected */
943: /* compute rs=sum of abs(off-diagonal) */
944: rs = 0.0;
945: jmin = bi[k] + 1;
946: nz = bi[k + 1] - jmin;
947: if (nz) {
948: bcol = bj + jmin;
949: while (nz--) {
950: rs += PetscAbsScalar(rtmp[*bcol]);
951: bcol++;
952: }
953: }
955: sctx.rs = rs;
956: sctx.pv = dk;
957: PetscCall(MatPivotCheck(C, A, info, &sctx, k));
958: if (sctx.newshift) break; /* sctx.shift_amount is updated */
959: dk = sctx.pv;
961: /* copy data into U(k,:) */
962: ba[bi[k]] = 1.0 / dk;
963: jmin = bi[k] + 1;
964: nz = bi[k + 1] - jmin;
965: if (nz) {
966: bcol = bj + jmin;
967: bval = ba + jmin;
968: while (nz--) {
969: *bval++ = rtmp[*bcol];
970: rtmp[*bcol++] = 0.0;
971: }
972: /* add k-th row into il and jl */
973: il[k] = jmin;
974: i = bj[jmin];
975: jl[k] = jl[i];
976: jl[i] = k;
977: }
978: }
979: } while (sctx.newshift);
980: PetscCall(PetscFree3(rtmp, il, jl));
982: C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
983: C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
984: C->assembled = PETSC_TRUE;
985: C->preallocated = PETSC_TRUE;
987: PetscCall(PetscLogFlops(C->rmap->N));
988: if (sctx.nshift) {
989: if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
990: PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
991: } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
992: PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
993: }
994: }
995: PetscFunctionReturn(PETSC_SUCCESS);
996: }
998: #include <petscbt.h>
999: #include <../src/mat/utils/freespace.h>
1000: PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1001: {
1002: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1003: Mat_SeqSBAIJ *b;
1004: Mat B;
1005: PetscBool perm_identity, missing;
1006: PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1007: const PetscInt *rip;
1008: PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1009: PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1010: PetscReal fill = info->fill, levels = info->levels;
1011: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1012: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1013: PetscBT lnkbt;
1015: PetscFunctionBegin;
1016: PetscCall(MatMissingDiagonal(A, &missing, &i));
1017: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1019: if (bs > 1) {
1020: if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1021: (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1023: PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1024: PetscFunctionReturn(PETSC_SUCCESS);
1025: }
1027: PetscCall(ISIdentity(perm, &perm_identity));
1028: PetscCall(ISGetIndices(perm, &rip));
1030: /* special case that simply copies fill pattern */
1031: if (!levels && perm_identity) {
1032: PetscCall(PetscMalloc1(am + 1, &ui));
1033: for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1034: B = fact;
1035: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1037: b = (Mat_SeqSBAIJ *)B->data;
1038: uj = b->j;
1039: for (i = 0; i < am; i++) {
1040: aj = a->j + a->diag[i];
1041: for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1042: b->ilen[i] = ui[i];
1043: }
1044: PetscCall(PetscFree(ui));
1046: B->factortype = MAT_FACTOR_NONE;
1048: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1049: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1050: B->factortype = MAT_FACTOR_ICC;
1052: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1053: PetscFunctionReturn(PETSC_SUCCESS);
1054: }
1056: /* initialization */
1057: PetscCall(PetscMalloc1(am + 1, &ui));
1058: ui[0] = 0;
1059: PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1061: /* jl: linked list for storing indices of the pivot rows
1062: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1063: PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1064: for (i = 0; i < am; i++) {
1065: jl[i] = am;
1066: il[i] = 0;
1067: }
1069: /* create and initialize a linked list for storing column indices of the active row k */
1070: nlnk = am + 1;
1071: PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1073: /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1074: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1076: current_space = free_space;
1078: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1079: current_space_lvl = free_space_lvl;
1081: for (k = 0; k < am; k++) { /* for each active row k */
1082: /* initialize lnk by the column indices of row rip[k] of A */
1083: nzk = 0;
1084: ncols = ai[rip[k] + 1] - ai[rip[k]];
1085: ncols_upper = 0;
1086: cols = cols_lvl + am;
1087: for (j = 0; j < ncols; j++) {
1088: i = rip[*(aj + ai[rip[k]] + j)];
1089: if (i >= k) { /* only take upper triangular entry */
1090: cols[ncols_upper] = i;
1091: cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1092: ncols_upper++;
1093: }
1094: }
1095: PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1096: nzk += nlnk;
1098: /* update lnk by computing fill-in for each pivot row to be merged in */
1099: prow = jl[k]; /* 1st pivot row */
1101: while (prow < k) {
1102: nextprow = jl[prow];
1104: /* merge prow into k-th row */
1105: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1106: jmax = ui[prow + 1];
1107: ncols = jmax - jmin;
1108: i = jmin - ui[prow];
1109: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1110: for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1111: PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1112: nzk += nlnk;
1114: /* update il and jl for prow */
1115: if (jmin < jmax) {
1116: il[prow] = jmin;
1118: j = *cols;
1119: jl[prow] = jl[j];
1120: jl[j] = prow;
1121: }
1122: prow = nextprow;
1123: }
1125: /* if free space is not available, make more free space */
1126: if (current_space->local_remaining < nzk) {
1127: i = am - k + 1; /* num of unfactored rows */
1128: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1129: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
1130: PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl));
1131: reallocs++;
1132: }
1134: /* copy data into free_space and free_space_lvl, then initialize lnk */
1135: PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1137: /* add the k-th row into il and jl */
1138: if (nzk - 1 > 0) {
1139: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1140: jl[k] = jl[i];
1141: jl[i] = k;
1142: il[k] = ui[k] + 1;
1143: }
1144: uj_ptr[k] = current_space->array;
1145: uj_lvl_ptr[k] = current_space_lvl->array;
1147: current_space->array += nzk;
1148: current_space->local_used += nzk;
1149: current_space->local_remaining -= nzk;
1151: current_space_lvl->array += nzk;
1152: current_space_lvl->local_used += nzk;
1153: current_space_lvl->local_remaining -= nzk;
1155: ui[k + 1] = ui[k] + nzk;
1156: }
1158: PetscCall(ISRestoreIndices(perm, &rip));
1159: PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1160: PetscCall(PetscFree(cols_lvl));
1162: /* copy free_space into uj and free free_space; set uj in new datastructure; */
1163: PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1164: PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1165: PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1166: PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1168: /* put together the new matrix in MATSEQSBAIJ format */
1169: B = fact;
1170: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1172: b = (Mat_SeqSBAIJ *)B->data;
1173: b->singlemalloc = PETSC_FALSE;
1174: b->free_a = PETSC_TRUE;
1175: b->free_ij = PETSC_TRUE;
1177: PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1179: b->j = uj;
1180: b->i = ui;
1181: b->diag = NULL;
1182: b->ilen = NULL;
1183: b->imax = NULL;
1184: b->row = perm;
1185: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1187: PetscCall(PetscObjectReference((PetscObject)perm));
1189: b->icol = perm;
1191: PetscCall(PetscObjectReference((PetscObject)perm));
1192: PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1194: b->maxnz = b->nz = ui[am];
1196: B->info.factor_mallocs = reallocs;
1197: B->info.fill_ratio_given = fill;
1198: if (ai[am] != 0.) {
1199: /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1200: B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1201: } else {
1202: B->info.fill_ratio_needed = 0.0;
1203: }
1204: #if defined(PETSC_USE_INFO)
1205: if (ai[am] != 0) {
1206: PetscReal af = B->info.fill_ratio_needed;
1207: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1208: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1209: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1210: } else {
1211: PetscCall(PetscInfo(A, "Empty matrix\n"));
1212: }
1213: #endif
1214: if (perm_identity) {
1215: B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1216: B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1217: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1218: } else {
1219: (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1220: }
1221: PetscFunctionReturn(PETSC_SUCCESS);
1222: }
1224: PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1225: {
1226: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1227: Mat_SeqSBAIJ *b;
1228: Mat B;
1229: PetscBool perm_identity, missing;
1230: PetscReal fill = info->fill;
1231: const PetscInt *rip;
1232: PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1233: PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1234: PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1235: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1236: PetscBT lnkbt;
1238: PetscFunctionBegin;
1239: if (bs > 1) { /* convert to seqsbaij */
1240: if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1241: (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1243: PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1244: PetscFunctionReturn(PETSC_SUCCESS);
1245: }
1247: PetscCall(MatMissingDiagonal(A, &missing, &i));
1248: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1250: /* check whether perm is the identity mapping */
1251: PetscCall(ISIdentity(perm, &perm_identity));
1252: PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1253: PetscCall(ISGetIndices(perm, &rip));
1255: /* initialization */
1256: PetscCall(PetscMalloc1(mbs + 1, &ui));
1257: ui[0] = 0;
1259: /* jl: linked list for storing indices of the pivot rows
1260: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1261: PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1262: for (i = 0; i < mbs; i++) {
1263: jl[i] = mbs;
1264: il[i] = 0;
1265: }
1267: /* create and initialize a linked list for storing column indices of the active row k */
1268: nlnk = mbs + 1;
1269: PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1271: /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1272: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1274: current_space = free_space;
1276: for (k = 0; k < mbs; k++) { /* for each active row k */
1277: /* initialize lnk by the column indices of row rip[k] of A */
1278: nzk = 0;
1279: ncols = ai[rip[k] + 1] - ai[rip[k]];
1280: ncols_upper = 0;
1281: for (j = 0; j < ncols; j++) {
1282: i = rip[*(aj + ai[rip[k]] + j)];
1283: if (i >= k) { /* only take upper triangular entry */
1284: cols[ncols_upper] = i;
1285: ncols_upper++;
1286: }
1287: }
1288: PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1289: nzk += nlnk;
1291: /* update lnk by computing fill-in for each pivot row to be merged in */
1292: prow = jl[k]; /* 1st pivot row */
1294: while (prow < k) {
1295: nextprow = jl[prow];
1296: /* merge prow into k-th row */
1297: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1298: jmax = ui[prow + 1];
1299: ncols = jmax - jmin;
1300: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1301: PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1302: nzk += nlnk;
1304: /* update il and jl for prow */
1305: if (jmin < jmax) {
1306: il[prow] = jmin;
1307: j = *uj_ptr;
1308: jl[prow] = jl[j];
1309: jl[j] = prow;
1310: }
1311: prow = nextprow;
1312: }
1314: /* if free space is not available, make more free space */
1315: if (current_space->local_remaining < nzk) {
1316: i = mbs - k + 1; /* num of unfactored rows */
1317: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1318: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
1319: reallocs++;
1320: }
1322: /* copy data into free space, then initialize lnk */
1323: PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1325: /* add the k-th row into il and jl */
1326: if (nzk - 1 > 0) {
1327: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1328: jl[k] = jl[i];
1329: jl[i] = k;
1330: il[k] = ui[k] + 1;
1331: }
1332: ui_ptr[k] = current_space->array;
1333: current_space->array += nzk;
1334: current_space->local_used += nzk;
1335: current_space->local_remaining -= nzk;
1337: ui[k + 1] = ui[k] + nzk;
1338: }
1340: PetscCall(ISRestoreIndices(perm, &rip));
1341: PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1343: /* copy free_space into uj and free free_space; set uj in new datastructure; */
1344: PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1345: PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1346: PetscCall(PetscLLDestroy(lnk, lnkbt));
1348: /* put together the new matrix in MATSEQSBAIJ format */
1349: B = fact;
1350: PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1352: b = (Mat_SeqSBAIJ *)B->data;
1353: b->singlemalloc = PETSC_FALSE;
1354: b->free_a = PETSC_TRUE;
1355: b->free_ij = PETSC_TRUE;
1357: PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1359: b->j = uj;
1360: b->i = ui;
1361: b->diag = NULL;
1362: b->ilen = NULL;
1363: b->imax = NULL;
1364: b->row = perm;
1365: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1367: PetscCall(PetscObjectReference((PetscObject)perm));
1368: b->icol = perm;
1369: PetscCall(PetscObjectReference((PetscObject)perm));
1370: PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1371: b->maxnz = b->nz = ui[mbs];
1373: B->info.factor_mallocs = reallocs;
1374: B->info.fill_ratio_given = fill;
1375: if (ai[mbs] != 0.) {
1376: /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1377: B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1378: } else {
1379: B->info.fill_ratio_needed = 0.0;
1380: }
1381: #if defined(PETSC_USE_INFO)
1382: if (ai[mbs] != 0.) {
1383: PetscReal af = B->info.fill_ratio_needed;
1384: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1385: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1386: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1387: } else {
1388: PetscCall(PetscInfo(A, "Empty matrix\n"));
1389: }
1390: #endif
1391: if (perm_identity) {
1392: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1393: } else {
1394: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1395: }
1396: PetscFunctionReturn(PETSC_SUCCESS);
1397: }
1399: PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1400: {
1401: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1402: const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1403: PetscInt i, k, n = a->mbs;
1404: PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1405: const MatScalar *aa = a->a, *v;
1406: PetscScalar *x, *s, *t, *ls;
1407: const PetscScalar *b;
1409: PetscFunctionBegin;
1410: PetscCall(VecGetArrayRead(bb, &b));
1411: PetscCall(VecGetArray(xx, &x));
1412: t = a->solve_work;
1414: /* forward solve the lower triangular */
1415: PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1417: for (i = 1; i < n; i++) {
1418: v = aa + bs2 * ai[i];
1419: vi = aj + ai[i];
1420: nz = ai[i + 1] - ai[i];
1421: s = t + bs * i;
1422: PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1423: for (k = 0; k < nz; k++) {
1424: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1425: v += bs2;
1426: }
1427: }
1429: /* backward solve the upper triangular */
1430: ls = a->solve_work + A->cmap->n;
1431: for (i = n - 1; i >= 0; i--) {
1432: v = aa + bs2 * (adiag[i + 1] + 1);
1433: vi = aj + adiag[i + 1] + 1;
1434: nz = adiag[i] - adiag[i + 1] - 1;
1435: PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1436: for (k = 0; k < nz; k++) {
1437: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1438: v += bs2;
1439: }
1440: PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1441: PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1442: }
1444: PetscCall(VecRestoreArrayRead(bb, &b));
1445: PetscCall(VecRestoreArray(xx, &x));
1446: PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1447: PetscFunctionReturn(PETSC_SUCCESS);
1448: }
1450: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1451: {
1452: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1453: IS iscol = a->col, isrow = a->row;
1454: const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1455: PetscInt i, m, n = a->mbs;
1456: PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1457: const MatScalar *aa = a->a, *v;
1458: PetscScalar *x, *s, *t, *ls;
1459: const PetscScalar *b;
1461: PetscFunctionBegin;
1462: PetscCall(VecGetArrayRead(bb, &b));
1463: PetscCall(VecGetArray(xx, &x));
1464: t = a->solve_work;
1466: PetscCall(ISGetIndices(isrow, &rout));
1467: r = rout;
1468: PetscCall(ISGetIndices(iscol, &cout));
1469: c = cout;
1471: /* forward solve the lower triangular */
1472: PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1473: for (i = 1; i < n; i++) {
1474: v = aa + bs2 * ai[i];
1475: vi = aj + ai[i];
1476: nz = ai[i + 1] - ai[i];
1477: s = t + bs * i;
1478: PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1479: for (m = 0; m < nz; m++) {
1480: PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1481: v += bs2;
1482: }
1483: }
1485: /* backward solve the upper triangular */
1486: ls = a->solve_work + A->cmap->n;
1487: for (i = n - 1; i >= 0; i--) {
1488: v = aa + bs2 * (adiag[i + 1] + 1);
1489: vi = aj + adiag[i + 1] + 1;
1490: nz = adiag[i] - adiag[i + 1] - 1;
1491: PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1492: for (m = 0; m < nz; m++) {
1493: PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1494: v += bs2;
1495: }
1496: PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1497: PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1498: }
1499: PetscCall(ISRestoreIndices(isrow, &rout));
1500: PetscCall(ISRestoreIndices(iscol, &cout));
1501: PetscCall(VecRestoreArrayRead(bb, &b));
1502: PetscCall(VecRestoreArray(xx, &x));
1503: PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1504: PetscFunctionReturn(PETSC_SUCCESS);
1505: }
1507: /*
1508: For each block in an block array saves the largest absolute value in the block into another array
1509: */
1510: static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray)
1511: {
1512: PetscInt i, j;
1514: PetscFunctionBegin;
1515: PetscCall(PetscArrayzero(absarray, nbs + 1));
1516: for (i = 0; i < nbs; i++) {
1517: for (j = 0; j < bs2; j++) {
1518: if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1519: }
1520: }
1521: PetscFunctionReturn(PETSC_SUCCESS);
1522: }
1524: /*
1525: This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1526: */
1527: PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact)
1528: {
1529: Mat B = *fact;
1530: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b;
1531: IS isicol;
1532: const PetscInt *r, *ic;
1533: PetscInt i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1534: PetscInt *bi, *bj, *bdiag;
1536: PetscInt row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1537: PetscInt nlnk, *lnk;
1538: PetscBT lnkbt;
1539: PetscBool row_identity, icol_identity;
1540: MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1541: PetscInt j, nz, *pj, *bjtmp, k, ncut, *jtmp;
1543: PetscReal dt = info->dt; /* shift=info->shiftamount; */
1544: PetscInt nnz_max;
1545: PetscBool missing;
1546: PetscReal *vtmp_abs;
1547: MatScalar *v_work;
1548: PetscInt *v_pivots;
1549: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
1551: PetscFunctionBegin;
1552: /* symbolic factorization, can be reused */
1553: PetscCall(MatMissingDiagonal(A, &missing, &i));
1554: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1555: adiag = a->diag;
1557: PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
1559: /* bdiag is location of diagonal in factor */
1560: PetscCall(PetscMalloc1(mbs + 1, &bdiag));
1562: /* allocate row pointers bi */
1563: PetscCall(PetscMalloc1(2 * mbs + 2, &bi));
1565: /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1566: dtcount = (PetscInt)info->dtcount;
1567: if (dtcount > mbs - 1) dtcount = mbs - 1;
1568: nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1569: /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1570: PetscCall(PetscMalloc1(nnz_max, &bj));
1571: nnz_max = nnz_max * bs2;
1572: PetscCall(PetscMalloc1(nnz_max, &ba));
1574: /* put together the new matrix */
1575: PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1577: b = (Mat_SeqBAIJ *)(B)->data;
1578: b->free_a = PETSC_TRUE;
1579: b->free_ij = PETSC_TRUE;
1580: b->singlemalloc = PETSC_FALSE;
1582: b->a = ba;
1583: b->j = bj;
1584: b->i = bi;
1585: b->diag = bdiag;
1586: b->ilen = NULL;
1587: b->imax = NULL;
1588: b->row = isrow;
1589: b->col = iscol;
1591: PetscCall(PetscObjectReference((PetscObject)isrow));
1592: PetscCall(PetscObjectReference((PetscObject)iscol));
1594: b->icol = isicol;
1595: PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work));
1596: b->maxnz = nnz_max / bs2;
1598: (B)->factortype = MAT_FACTOR_ILUDT;
1599: (B)->info.factor_mallocs = 0;
1600: (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1601: /* end of symbolic factorization */
1602: PetscCall(ISGetIndices(isrow, &r));
1603: PetscCall(ISGetIndices(isicol, &ic));
1605: /* linked list for storing column indices of the active row */
1606: nlnk = mbs + 1;
1607: PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1609: /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1610: PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp));
1611: /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1612: PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp));
1613: PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs));
1614: PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots));
1616: allowzeropivot = PetscNot(A->erroriffailure);
1617: bi[0] = 0;
1618: bdiag[0] = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1619: bi[2 * mbs + 1] = bdiag[0] + 1; /* endof bj and ba array */
1620: for (i = 0; i < mbs; i++) {
1621: /* copy initial fill into linked list */
1622: nzi = ai[r[i] + 1] - ai[r[i]];
1623: PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
1624: nzi_al = adiag[r[i]] - ai[r[i]];
1625: nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;
1627: /* load in initial unfactored row */
1628: ajtmp = aj + ai[r[i]];
1629: PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt));
1630: PetscCall(PetscArrayzero(rtmp, mbs * bs2));
1631: aatmp = a->a + bs2 * ai[r[i]];
1632: for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2));
1634: /* add pivot rows into linked list */
1635: row = lnk[mbs];
1636: while (row < i) {
1637: nzi_bl = bi[row + 1] - bi[row] + 1;
1638: bjtmp = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1639: PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im));
1640: nzi += nlnk;
1641: row = lnk[row];
1642: }
1644: /* copy data from lnk into jtmp, then initialize lnk */
1645: PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt));
1647: /* numerical factorization */
1648: bjtmp = jtmp;
1649: row = *bjtmp++; /* 1st pivot row */
1651: while (row < i) {
1652: pc = rtmp + bs2 * row;
1653: pv = ba + bs2 * bdiag[row]; /* inv(diag) of the pivot row */
1654: PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1655: PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs));
1656: if (vtmp_abs[0] > dt) { /* apply tolerance dropping rule */
1657: pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1658: pv = ba + bs2 * (bdiag[row + 1] + 1);
1659: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1660: for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
1661: /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */
1662: }
1663: row = *bjtmp++;
1664: }
1666: /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1667: nzi_bl = 0;
1668: j = 0;
1669: while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1670: PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1671: nzi_bl++;
1672: j++;
1673: }
1674: nzi_bu = nzi - nzi_bl - 1;
1676: while (j < nzi) { /* U-part */
1677: PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1678: j++;
1679: }
1681: PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs));
1683: bjtmp = bj + bi[i];
1684: batmp = ba + bs2 * bi[i];
1685: /* apply level dropping rule to L part */
1686: ncut = nzi_al + dtcount;
1687: if (ncut < nzi_bl) {
1688: PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp));
1689: PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp));
1690: } else {
1691: ncut = nzi_bl;
1692: }
1693: for (j = 0; j < ncut; j++) {
1694: bjtmp[j] = jtmp[j];
1695: PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2));
1696: }
1697: bi[i + 1] = bi[i] + ncut;
1698: nzi = ncut + 1;
1700: /* apply level dropping rule to U part */
1701: ncut = nzi_au + dtcount;
1702: if (ncut < nzi_bu) {
1703: PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1));
1704: PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1));
1705: } else {
1706: ncut = nzi_bu;
1707: }
1708: nzi += ncut;
1710: /* mark bdiagonal */
1711: bdiag[i + 1] = bdiag[i] - (ncut + 1);
1712: bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);
1714: bjtmp = bj + bdiag[i];
1715: batmp = ba + bs2 * bdiag[i];
1716: PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2));
1717: *bjtmp = i;
1719: bjtmp = bj + bdiag[i + 1] + 1;
1720: batmp = ba + (bdiag[i + 1] + 1) * bs2;
1722: for (k = 0; k < ncut; k++) {
1723: bjtmp[k] = jtmp[nzi_bl + 1 + k];
1724: PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2));
1725: }
1727: im[i] = nzi; /* used by PetscLLAddSortedLU() */
1729: /* invert diagonal block for simpler triangular solves - add shift??? */
1730: batmp = ba + bs2 * bdiag[i];
1732: PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
1733: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1734: } /* for (i=0; i<mbs; i++) */
1735: PetscCall(PetscFree3(v_work, multiplier, v_pivots));
1737: /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1738: PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]);
1740: PetscCall(ISRestoreIndices(isrow, &r));
1741: PetscCall(ISRestoreIndices(isicol, &ic));
1743: PetscCall(PetscLLDestroy(lnk, lnkbt));
1745: PetscCall(PetscFree2(im, jtmp));
1746: PetscCall(PetscFree2(rtmp, vtmp));
1748: PetscCall(PetscLogFlops(bs2 * B->cmap->n));
1749: b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1751: PetscCall(ISIdentity(isrow, &row_identity));
1752: PetscCall(ISIdentity(isicol, &icol_identity));
1753: if (row_identity && icol_identity) {
1754: B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1755: } else {
1756: B->ops->solve = MatSolve_SeqBAIJ_N;
1757: }
1759: B->ops->solveadd = NULL;
1760: B->ops->solvetranspose = NULL;
1761: B->ops->solvetransposeadd = NULL;
1762: B->ops->matsolve = NULL;
1763: B->assembled = PETSC_TRUE;
1764: B->preallocated = PETSC_TRUE;
1765: PetscFunctionReturn(PETSC_SUCCESS);
1766: }