Actual source code: baijfact2.c
2: /*
3: Factorization code for BAIJ format.
4: */
6: #include <../src/mat/impls/baij/seq/baij.h>
7: #include <petsc/private/kernels/blockinvert.h>
8: #include <petscbt.h>
9: #include <../src/mat/utils/freespace.h>
11: extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat, Mat, MatDuplicateOption, PetscBool);
13: /*
14: This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
15: */
16: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
17: {
18: Mat C = B;
19: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
20: PetscInt i, j, k, ipvt[15];
21: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *ajtmp, *bjtmp, *bdiag = b->diag, *pj;
22: PetscInt nz, nzL, row;
23: MatScalar *rtmp, *pc, *mwork, *pv, *vv, work[225];
24: const MatScalar *v, *aa = a->a;
25: PetscInt bs2 = a->bs2, bs = A->rmap->bs, flg;
26: PetscInt sol_ver;
27: PetscBool allowzeropivot, zeropivotdetected;
29: PetscFunctionBegin;
30: allowzeropivot = PetscNot(A->erroriffailure);
31: PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)A)->prefix, "-sol_ver", &sol_ver, NULL));
33: /* generate work space needed by the factorization */
34: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
35: PetscCall(PetscArrayzero(rtmp, bs2 * n));
37: for (i = 0; i < n; i++) {
38: /* zero rtmp */
39: /* L part */
40: nz = bi[i + 1] - bi[i];
41: bjtmp = bj + bi[i];
42: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
44: /* U part */
45: nz = bdiag[i] - bdiag[i + 1];
46: bjtmp = bj + bdiag[i + 1] + 1;
47: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
49: /* load in initial (unfactored row) */
50: nz = ai[i + 1] - ai[i];
51: ajtmp = aj + ai[i];
52: v = aa + bs2 * ai[i];
53: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
55: /* elimination */
56: bjtmp = bj + bi[i];
57: nzL = bi[i + 1] - bi[i];
58: for (k = 0; k < nzL; k++) {
59: row = bjtmp[k];
60: pc = rtmp + bs2 * row;
61: for (flg = 0, j = 0; j < bs2; j++) {
62: if (pc[j] != 0.0) {
63: flg = 1;
64: break;
65: }
66: }
67: if (flg) {
68: pv = b->a + bs2 * bdiag[row];
69: PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork);
70: /* PetscCall(PetscKernel_A_gets_A_times_B_15(pc,pv,mwork)); */
71: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
72: pv = b->a + bs2 * (bdiag[row + 1] + 1);
73: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
74: for (j = 0; j < nz; j++) {
75: vv = rtmp + bs2 * pj[j];
76: PetscKernel_A_gets_A_minus_B_times_C(bs, vv, pc, pv);
77: /* PetscCall(PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv)); */
78: pv += bs2;
79: }
80: PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
81: }
82: }
84: /* finished row so stick it into b->a */
85: /* L part */
86: pv = b->a + bs2 * bi[i];
87: pj = b->j + bi[i];
88: nz = bi[i + 1] - bi[i];
89: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
91: /* Mark diagonal and invert diagonal for simpler triangular solves */
92: pv = b->a + bs2 * bdiag[i];
93: pj = b->j + bdiag[i];
94: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
95: PetscCall(PetscKernel_A_gets_inverse_A_15(pv, ipvt, work, info->shiftamount, allowzeropivot, &zeropivotdetected));
96: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
98: /* U part */
99: pv = b->a + bs2 * (bdiag[i + 1] + 1);
100: pj = b->j + bdiag[i + 1] + 1;
101: nz = bdiag[i] - bdiag[i + 1] - 1;
102: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103: }
105: PetscCall(PetscFree2(rtmp, mwork));
107: C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
108: C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
109: C->assembled = PETSC_TRUE;
111: PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
112: PetscFunctionReturn(PETSC_SUCCESS);
113: }
115: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B, Mat A, const MatFactorInfo *info)
116: {
117: Mat C = B;
118: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
119: IS isrow = b->row, isicol = b->icol;
120: const PetscInt *r, *ic;
121: PetscInt i, j, k, n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
122: PetscInt *ajtmp, *bjtmp, nz, nzL, row, *bdiag = b->diag, *pj;
123: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
124: PetscInt bs = A->rmap->bs, bs2 = a->bs2, *v_pivots, flg;
125: MatScalar *v_work;
126: PetscBool col_identity, row_identity, both_identity;
127: PetscBool allowzeropivot, zeropivotdetected;
129: PetscFunctionBegin;
130: PetscCall(ISGetIndices(isrow, &r));
131: PetscCall(ISGetIndices(isicol, &ic));
132: allowzeropivot = PetscNot(A->erroriffailure);
134: PetscCall(PetscCalloc1(bs2 * n, &rtmp));
136: /* generate work space needed by dense LU factorization */
137: PetscCall(PetscMalloc3(bs, &v_work, bs2, &mwork, bs, &v_pivots));
139: for (i = 0; i < n; i++) {
140: /* zero rtmp */
141: /* L part */
142: nz = bi[i + 1] - bi[i];
143: bjtmp = bj + bi[i];
144: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
146: /* U part */
147: nz = bdiag[i] - bdiag[i + 1];
148: bjtmp = bj + bdiag[i + 1] + 1;
149: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
151: /* load in initial (unfactored row) */
152: nz = ai[r[i] + 1] - ai[r[i]];
153: ajtmp = aj + ai[r[i]];
154: v = aa + bs2 * ai[r[i]];
155: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
157: /* elimination */
158: bjtmp = bj + bi[i];
159: nzL = bi[i + 1] - bi[i];
160: for (k = 0; k < nzL; k++) {
161: row = bjtmp[k];
162: pc = rtmp + bs2 * row;
163: for (flg = 0, j = 0; j < bs2; j++) {
164: if (pc[j] != 0.0) {
165: flg = 1;
166: break;
167: }
168: }
169: if (flg) {
170: pv = b->a + bs2 * bdiag[row];
171: PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); /* *pc = *pc * (*pv); */
172: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
173: pv = b->a + bs2 * (bdiag[row + 1] + 1);
174: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
175: for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
176: PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
177: }
178: }
180: /* finished row so stick it into b->a */
181: /* L part */
182: pv = b->a + bs2 * bi[i];
183: pj = b->j + bi[i];
184: nz = bi[i + 1] - bi[i];
185: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
187: /* Mark diagonal and invert diagonal for simpler triangular solves */
188: pv = b->a + bs2 * bdiag[i];
189: pj = b->j + bdiag[i];
190: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
192: PetscCall(PetscKernel_A_gets_inverse_A(bs, pv, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
193: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
195: /* U part */
196: pv = b->a + bs2 * (bdiag[i + 1] + 1);
197: pj = b->j + bdiag[i + 1] + 1;
198: nz = bdiag[i] - bdiag[i + 1] - 1;
199: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
200: }
202: PetscCall(PetscFree(rtmp));
203: PetscCall(PetscFree3(v_work, mwork, v_pivots));
204: PetscCall(ISRestoreIndices(isicol, &ic));
205: PetscCall(ISRestoreIndices(isrow, &r));
207: PetscCall(ISIdentity(isrow, &row_identity));
208: PetscCall(ISIdentity(isicol, &col_identity));
210: both_identity = (PetscBool)(row_identity && col_identity);
211: if (both_identity) {
212: switch (bs) {
213: case 9:
214: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
215: C->ops->solve = MatSolve_SeqBAIJ_9_NaturalOrdering;
216: #else
217: C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
218: #endif
219: break;
220: case 11:
221: C->ops->solve = MatSolve_SeqBAIJ_11_NaturalOrdering;
222: break;
223: case 12:
224: C->ops->solve = MatSolve_SeqBAIJ_12_NaturalOrdering;
225: break;
226: case 13:
227: C->ops->solve = MatSolve_SeqBAIJ_13_NaturalOrdering;
228: break;
229: case 14:
230: C->ops->solve = MatSolve_SeqBAIJ_14_NaturalOrdering;
231: break;
232: default:
233: C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
234: break;
235: }
236: } else {
237: C->ops->solve = MatSolve_SeqBAIJ_N;
238: }
239: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
241: C->assembled = PETSC_TRUE;
243: PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
244: PetscFunctionReturn(PETSC_SUCCESS);
245: }
247: /*
248: ilu(0) with natural ordering under new data structure.
249: See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
250: because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
251: */
253: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
254: {
255: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b;
256: PetscInt n = a->mbs, *ai = a->i, *aj, *adiag = a->diag, bs2 = a->bs2;
257: PetscInt i, j, nz, *bi, *bj, *bdiag, bi_temp;
259: PetscFunctionBegin;
260: PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_FALSE));
261: b = (Mat_SeqBAIJ *)(fact)->data;
263: /* allocate matrix arrays for new data structure */
264: PetscCall(PetscMalloc3(bs2 * ai[n] + 1, &b->a, ai[n] + 1, &b->j, n + 1, &b->i));
266: b->singlemalloc = PETSC_TRUE;
267: b->free_a = PETSC_TRUE;
268: b->free_ij = PETSC_TRUE;
269: fact->preallocated = PETSC_TRUE;
270: fact->assembled = PETSC_TRUE;
271: if (!b->diag) PetscCall(PetscMalloc1(n + 1, &b->diag));
272: bdiag = b->diag;
274: if (n > 0) PetscCall(PetscArrayzero(b->a, bs2 * ai[n]));
276: /* set bi and bj with new data structure */
277: bi = b->i;
278: bj = b->j;
280: /* L part */
281: bi[0] = 0;
282: for (i = 0; i < n; i++) {
283: nz = adiag[i] - ai[i];
284: bi[i + 1] = bi[i] + nz;
285: aj = a->j + ai[i];
286: for (j = 0; j < nz; j++) {
287: *bj = aj[j];
288: bj++;
289: }
290: }
292: /* U part */
293: bi_temp = bi[n];
294: bdiag[n] = bi[n] - 1;
295: for (i = n - 1; i >= 0; i--) {
296: nz = ai[i + 1] - adiag[i] - 1;
297: bi_temp = bi_temp + nz + 1;
298: aj = a->j + adiag[i] + 1;
299: for (j = 0; j < nz; j++) {
300: *bj = aj[j];
301: bj++;
302: }
303: /* diag[i] */
304: *bj = i;
305: bj++;
306: bdiag[i] = bi_temp - 1;
307: }
308: PetscFunctionReturn(PETSC_SUCCESS);
309: }
311: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
312: {
313: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b;
314: IS isicol;
315: const PetscInt *r, *ic;
316: PetscInt n = a->mbs, *ai = a->i, *aj = a->j, d;
317: PetscInt *bi, *cols, nnz, *cols_lvl;
318: PetscInt *bdiag, prow, fm, nzbd, reallocs = 0, dcount = 0;
319: PetscInt i, levels, diagonal_fill;
320: PetscBool col_identity, row_identity, both_identity;
321: PetscReal f;
322: PetscInt nlnk, *lnk, *lnk_lvl = NULL;
323: PetscBT lnkbt;
324: PetscInt nzi, *bj, **bj_ptr, **bjlvl_ptr;
325: PetscFreeSpaceList free_space = NULL, current_space = NULL;
326: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
327: PetscBool missing;
328: PetscInt bs = A->rmap->bs, bs2 = a->bs2;
330: PetscFunctionBegin;
331: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
332: if (bs > 1) { /* check shifttype */
333: PetscCheck(info->shifttype != (PetscReal)MAT_SHIFT_NONZERO && info->shifttype != (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
334: }
336: PetscCall(MatMissingDiagonal(A, &missing, &d));
337: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);
339: f = info->fill;
340: levels = (PetscInt)info->levels;
341: diagonal_fill = (PetscInt)info->diagonal_fill;
343: PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
345: PetscCall(ISIdentity(isrow, &row_identity));
346: PetscCall(ISIdentity(iscol, &col_identity));
348: both_identity = (PetscBool)(row_identity && col_identity);
350: if (!levels && both_identity) {
351: /* special case: ilu(0) with natural ordering */
352: PetscCall(MatILUFactorSymbolic_SeqBAIJ_ilu0(fact, A, isrow, iscol, info));
353: PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity));
355: fact->factortype = MAT_FACTOR_ILU;
356: (fact)->info.factor_mallocs = 0;
357: (fact)->info.fill_ratio_given = info->fill;
358: (fact)->info.fill_ratio_needed = 1.0;
360: b = (Mat_SeqBAIJ *)(fact)->data;
361: b->row = isrow;
362: b->col = iscol;
363: b->icol = isicol;
364: PetscCall(PetscObjectReference((PetscObject)isrow));
365: PetscCall(PetscObjectReference((PetscObject)iscol));
366: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
368: PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work));
369: PetscFunctionReturn(PETSC_SUCCESS);
370: }
372: PetscCall(ISGetIndices(isrow, &r));
373: PetscCall(ISGetIndices(isicol, &ic));
375: /* get new row pointers */
376: PetscCall(PetscMalloc1(n + 1, &bi));
377: bi[0] = 0;
378: /* bdiag is location of diagonal in factor */
379: PetscCall(PetscMalloc1(n + 1, &bdiag));
380: bdiag[0] = 0;
382: PetscCall(PetscMalloc2(n, &bj_ptr, n, &bjlvl_ptr));
384: /* create a linked list for storing column indices of the active row */
385: nlnk = n + 1;
386: PetscCall(PetscIncompleteLLCreate(n, n, nlnk, lnk, lnk_lvl, lnkbt));
388: /* initial FreeSpace size is f*(ai[n]+1) */
389: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space));
390: current_space = free_space;
391: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space_lvl));
392: current_space_lvl = free_space_lvl;
394: for (i = 0; i < n; i++) {
395: nzi = 0;
396: /* copy current row into linked list */
397: nnz = ai[r[i] + 1] - ai[r[i]];
398: PetscCheck(nnz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
399: cols = aj + ai[r[i]];
400: lnk[i] = -1; /* marker to indicate if diagonal exists */
401: PetscCall(PetscIncompleteLLInit(nnz, cols, n, ic, &nlnk, lnk, lnk_lvl, lnkbt));
402: nzi += nlnk;
404: /* make sure diagonal entry is included */
405: if (diagonal_fill && lnk[i] == -1) {
406: fm = n;
407: while (lnk[fm] < i) fm = lnk[fm];
408: lnk[i] = lnk[fm]; /* insert diagonal into linked list */
409: lnk[fm] = i;
410: lnk_lvl[i] = 0;
411: nzi++;
412: dcount++;
413: }
415: /* add pivot rows into the active row */
416: nzbd = 0;
417: prow = lnk[n];
418: while (prow < i) {
419: nnz = bdiag[prow];
420: cols = bj_ptr[prow] + nnz + 1;
421: cols_lvl = bjlvl_ptr[prow] + nnz + 1;
422: nnz = bi[prow + 1] - bi[prow] - nnz - 1;
424: PetscCall(PetscILULLAddSorted(nnz, cols, levels, cols_lvl, prow, &nlnk, lnk, lnk_lvl, lnkbt, prow));
425: nzi += nlnk;
426: prow = lnk[prow];
427: nzbd++;
428: }
429: bdiag[i] = nzbd;
430: bi[i + 1] = bi[i] + nzi;
432: /* if free space is not available, make more free space */
433: if (current_space->local_remaining < nzi) {
434: nnz = PetscIntMultTruncate(2, PetscIntMultTruncate(nzi, (n - i))); /* estimated and max additional space needed */
435: PetscCall(PetscFreeSpaceGet(nnz, ¤t_space));
436: PetscCall(PetscFreeSpaceGet(nnz, ¤t_space_lvl));
437: reallocs++;
438: }
440: /* copy data into free_space and free_space_lvl, then initialize lnk */
441: PetscCall(PetscIncompleteLLClean(n, n, nzi, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
443: bj_ptr[i] = current_space->array;
444: bjlvl_ptr[i] = current_space_lvl->array;
446: /* make sure the active row i has diagonal entry */
447: PetscCheck(*(bj_ptr[i] + bdiag[i]) == i, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", i);
449: current_space->array += nzi;
450: current_space->local_used += nzi;
451: current_space->local_remaining -= nzi;
453: current_space_lvl->array += nzi;
454: current_space_lvl->local_used += nzi;
455: current_space_lvl->local_remaining -= nzi;
456: }
458: PetscCall(ISRestoreIndices(isrow, &r));
459: PetscCall(ISRestoreIndices(isicol, &ic));
461: /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
462: PetscCall(PetscMalloc1(bi[n] + 1, &bj));
463: PetscCall(PetscFreeSpaceContiguous_LU(&free_space, bj, n, bi, bdiag));
465: PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
466: PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
467: PetscCall(PetscFree2(bj_ptr, bjlvl_ptr));
469: #if defined(PETSC_USE_INFO)
470: {
471: PetscReal af = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]);
472: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
473: PetscCall(PetscInfo(A, "Run with -[sub_]pc_factor_fill %g or use \n", (double)af));
474: PetscCall(PetscInfo(A, "PCFactorSetFill([sub]pc,%g);\n", (double)af));
475: PetscCall(PetscInfo(A, "for best performance.\n"));
476: if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount));
477: }
478: #endif
480: /* put together the new matrix */
481: PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
483: b = (Mat_SeqBAIJ *)(fact)->data;
484: b->free_a = PETSC_TRUE;
485: b->free_ij = PETSC_TRUE;
486: b->singlemalloc = PETSC_FALSE;
488: PetscCall(PetscMalloc1(bs2 * (bdiag[0] + 1), &b->a));
490: b->j = bj;
491: b->i = bi;
492: b->diag = bdiag;
493: b->free_diag = PETSC_TRUE;
494: b->ilen = NULL;
495: b->imax = NULL;
496: b->row = isrow;
497: b->col = iscol;
498: PetscCall(PetscObjectReference((PetscObject)isrow));
499: PetscCall(PetscObjectReference((PetscObject)iscol));
500: b->icol = isicol;
502: PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work));
503: /* In b structure: Free imax, ilen, old a, old j.
504: Allocate bdiag, solve_work, new a, new j */
505: b->maxnz = b->nz = bdiag[0] + 1;
507: fact->info.factor_mallocs = reallocs;
508: fact->info.fill_ratio_given = f;
509: fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]);
511: PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity));
512: PetscFunctionReturn(PETSC_SUCCESS);
513: }
515: #if 0
516: // unused
517: /*
518: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
519: except that the data structure of Mat_SeqAIJ is slightly different.
520: Not a good example of code reuse.
521: */
522: static PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
523: {
524: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b;
525: IS isicol;
526: const PetscInt *r, *ic, *ai = a->i, *aj = a->j, *xi;
527: PetscInt prow, n = a->mbs, *ainew, *ajnew, jmax, *fill, nz, *im, *ajfill, *flev, *xitmp;
528: PetscInt *dloc, idx, row, m, fm, nzf, nzi, reallocate = 0, dcount = 0;
529: PetscInt incrlev, nnz, i, bs = A->rmap->bs, bs2 = a->bs2, levels, diagonal_fill, dd;
530: PetscBool col_identity, row_identity, both_identity, flg;
531: PetscReal f;
533: PetscFunctionBegin;
534: PetscCall(MatMissingDiagonal_SeqBAIJ(A, &flg, &dd));
535: PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix A is missing diagonal entry in row %" PetscInt_FMT, dd);
537: f = info->fill;
538: levels = (PetscInt)info->levels;
539: diagonal_fill = (PetscInt)info->diagonal_fill;
541: PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
543: PetscCall(ISIdentity(isrow, &row_identity));
544: PetscCall(ISIdentity(iscol, &col_identity));
545: both_identity = (PetscBool)(row_identity && col_identity);
547: if (!levels && both_identity) { /* special case copy the nonzero structure */
548: PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE));
549: PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity));
551: fact->factortype = MAT_FACTOR_ILU;
552: b = (Mat_SeqBAIJ *)fact->data;
553: b->row = isrow;
554: b->col = iscol;
555: PetscCall(PetscObjectReference((PetscObject)isrow));
556: PetscCall(PetscObjectReference((PetscObject)iscol));
557: b->icol = isicol;
558: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
560: PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work));
561: PetscFunctionReturn(PETSC_SUCCESS);
562: }
564: /* general case perform the symbolic factorization */
565: PetscCall(ISGetIndices(isrow, &r));
566: PetscCall(ISGetIndices(isicol, &ic));
568: /* get new row pointers */
569: PetscCall(PetscMalloc1(n + 1, &ainew));
570: ainew[0] = 0;
571: /* don't know how many column pointers are needed so estimate */
572: jmax = (PetscInt)(f * ai[n] + 1);
573: PetscCall(PetscMalloc1(jmax, &ajnew));
574: /* ajfill is level of fill for each fill entry */
575: PetscCall(PetscMalloc1(jmax, &ajfill));
576: /* fill is a linked list of nonzeros in active row */
577: PetscCall(PetscMalloc1(n + 1, &fill));
578: /* im is level for each filled value */
579: PetscCall(PetscMalloc1(n + 1, &im));
580: /* dloc is location of diagonal in factor */
581: PetscCall(PetscMalloc1(n + 1, &dloc));
582: dloc[0] = 0;
583: for (prow = 0; prow < n; prow++) {
584: /* copy prow into linked list */
585: nzf = nz = ai[r[prow] + 1] - ai[r[prow]];
586: PetscCheck(nz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[prow], prow);
587: xi = aj + ai[r[prow]];
588: fill[n] = n;
589: fill[prow] = -1; /* marker for diagonal entry */
590: while (nz--) {
591: fm = n;
592: idx = ic[*xi++];
593: do {
594: m = fm;
595: fm = fill[m];
596: } while (fm < idx);
597: fill[m] = idx;
598: fill[idx] = fm;
599: im[idx] = 0;
600: }
602: /* make sure diagonal entry is included */
603: if (diagonal_fill && fill[prow] == -1) {
604: fm = n;
605: while (fill[fm] < prow) fm = fill[fm];
606: fill[prow] = fill[fm]; /* insert diagonal into linked list */
607: fill[fm] = prow;
608: im[prow] = 0;
609: nzf++;
610: dcount++;
611: }
613: nzi = 0;
614: row = fill[n];
615: while (row < prow) {
616: incrlev = im[row] + 1;
617: nz = dloc[row];
618: xi = ajnew + ainew[row] + nz + 1;
619: flev = ajfill + ainew[row] + nz + 1;
620: nnz = ainew[row + 1] - ainew[row] - nz - 1;
621: fm = row;
622: while (nnz-- > 0) {
623: idx = *xi++;
624: if (*flev + incrlev > levels) {
625: flev++;
626: continue;
627: }
628: do {
629: m = fm;
630: fm = fill[m];
631: } while (fm < idx);
632: if (fm != idx) {
633: im[idx] = *flev + incrlev;
634: fill[m] = idx;
635: fill[idx] = fm;
636: fm = idx;
637: nzf++;
638: } else if (im[idx] > *flev + incrlev) im[idx] = *flev + incrlev;
639: flev++;
640: }
641: row = fill[row];
642: nzi++;
643: }
644: /* copy new filled row into permanent storage */
645: ainew[prow + 1] = ainew[prow] + nzf;
646: if (ainew[prow + 1] > jmax) {
647: /* estimate how much additional space we will need */
648: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
649: /* just double the memory each time */
650: PetscInt maxadd = jmax;
651: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
652: if (maxadd < nzf) maxadd = (n - prow) * (nzf + 1);
653: jmax += maxadd;
655: /* allocate a longer ajnew and ajfill */
656: PetscCall(PetscMalloc1(jmax, &xitmp));
657: PetscCall(PetscArraycpy(xitmp, ajnew, ainew[prow]));
658: PetscCall(PetscFree(ajnew));
659: ajnew = xitmp;
660: PetscCall(PetscMalloc1(jmax, &xitmp));
661: PetscCall(PetscArraycpy(xitmp, ajfill, ainew[prow]));
662: PetscCall(PetscFree(ajfill));
663: ajfill = xitmp;
664: reallocate++; /* count how many reallocations are needed */
665: }
666: xitmp = ajnew + ainew[prow];
667: flev = ajfill + ainew[prow];
668: dloc[prow] = nzi;
669: fm = fill[n];
670: while (nzf--) {
671: *xitmp++ = fm;
672: *flev++ = im[fm];
673: fm = fill[fm];
674: }
675: /* make sure row has diagonal entry */
676: PetscCheck(ajnew[ainew[prow] + dloc[prow]] == prow, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\n\
677: try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",
678: prow);
679: }
680: PetscCall(PetscFree(ajfill));
681: PetscCall(ISRestoreIndices(isrow, &r));
682: PetscCall(ISRestoreIndices(isicol, &ic));
683: PetscCall(PetscFree(fill));
684: PetscCall(PetscFree(im));
686: #if defined(PETSC_USE_INFO)
687: {
688: PetscReal af = ((PetscReal)ainew[n]) / ((PetscReal)ai[n]);
689: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocate, (double)f, (double)af));
690: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
691: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
692: PetscCall(PetscInfo(A, "for best performance.\n"));
693: if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount));
694: }
695: #endif
697: /* put together the new matrix */
698: PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
699: b = (Mat_SeqBAIJ *)fact->data;
701: b->free_a = PETSC_TRUE;
702: b->free_ij = PETSC_TRUE;
703: b->singlemalloc = PETSC_FALSE;
705: PetscCall(PetscMalloc1(bs2 * ainew[n], &b->a));
707: b->j = ajnew;
708: b->i = ainew;
709: for (i = 0; i < n; i++) dloc[i] += ainew[i];
710: b->diag = dloc;
711: b->free_diag = PETSC_TRUE;
712: b->ilen = NULL;
713: b->imax = NULL;
714: b->row = isrow;
715: b->col = iscol;
716: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
718: PetscCall(PetscObjectReference((PetscObject)isrow));
719: PetscCall(PetscObjectReference((PetscObject)iscol));
720: b->icol = isicol;
721: PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work));
722: /* In b structure: Free imax, ilen, old a, old j.
723: Allocate dloc, solve_work, new a, new j */
724: b->maxnz = b->nz = ainew[n];
726: fact->info.factor_mallocs = reallocate;
727: fact->info.fill_ratio_given = f;
728: fact->info.fill_ratio_needed = ((PetscReal)ainew[n]) / ((PetscReal)ai[prow]);
730: PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity));
731: PetscFunctionReturn(PETSC_SUCCESS);
732: }
733: #endif
735: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
736: {
737: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */
738: /* int i,*AJ=a->j,nz=a->nz; */
740: PetscFunctionBegin;
741: /* Undo Column scaling */
742: /* while (nz--) { */
743: /* AJ[i] = AJ[i]/4; */
744: /* } */
745: /* This should really invoke a push/pop logic, but we don't have that yet. */
746: A->ops->setunfactored = NULL;
747: PetscFunctionReturn(PETSC_SUCCESS);
748: }
750: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
751: {
752: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
753: PetscInt *AJ = a->j, nz = a->nz;
754: unsigned short *aj = (unsigned short *)AJ;
756: PetscFunctionBegin;
757: /* Is this really necessary? */
758: while (nz--) { AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ }
759: A->ops->setunfactored = NULL;
760: PetscFunctionReturn(PETSC_SUCCESS);
761: }