Actual source code: baijfact3.c
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <petsc/private/kernels/blockinvert.h>
8: /*
9: This is used to set the numeric factorization for both LU and ILU symbolic factorization
10: */
11: PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat fact, PetscBool natural)
12: {
13: PetscFunctionBegin;
14: if (natural) {
15: switch (fact->rmap->bs) {
16: case 1:
17: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
18: break;
19: case 2:
20: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
21: break;
22: case 3:
23: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
24: break;
25: case 4:
26: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
27: break;
28: case 5:
29: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
30: break;
31: case 6:
32: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
33: break;
34: case 7:
35: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
36: break;
37: case 9:
38: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
39: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_9_NaturalOrdering;
40: #else
41: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N;
42: #endif
43: break;
44: case 15:
45: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering;
46: break;
47: default:
48: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N;
49: break;
50: }
51: } else {
52: switch (fact->rmap->bs) {
53: case 1:
54: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
55: break;
56: case 2:
57: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2;
58: break;
59: case 3:
60: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3;
61: break;
62: case 4:
63: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4;
64: break;
65: case 5:
66: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5;
67: break;
68: case 6:
69: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6;
70: break;
71: case 7:
72: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7;
73: break;
74: default:
75: fact->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N;
76: break;
77: }
78: }
79: PetscFunctionReturn(PETSC_SUCCESS);
80: }
82: PetscErrorCode MatSeqBAIJSetNumericFactorization_inplace(Mat inA, PetscBool natural)
83: {
84: PetscFunctionBegin;
85: if (natural) {
86: switch (inA->rmap->bs) {
87: case 1:
88: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1_inplace;
89: break;
90: case 2:
91: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace;
92: break;
93: case 3:
94: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace;
95: break;
96: case 4:
97: #if defined(PETSC_USE_REAL_MAT_SINGLE)
98: {
99: PetscBool sse_enabled_local;
100: PetscCall(PetscSSEIsEnabled(inA->comm, &sse_enabled_local, NULL));
101: if (sse_enabled_local) {
102: #if defined(PETSC_HAVE_SSE)
103: int i, *AJ = a->j, nz = a->nz, n = a->mbs;
104: if (n == (unsigned short)n) {
105: unsigned short *aj = (unsigned short *)AJ;
106: for (i = 0; i < nz; i++) aj[i] = (unsigned short)AJ[i];
108: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
109: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
111: PetscCall(PetscInfo(inA, "Using special SSE, in-place natural ordering, ushort j index factor BS=4\n"));
112: } else {
113: /* Scale the column indices for easier indexing in MatSolve. */
114: /* for (i=0;i<nz;i++) { */
115: /* AJ[i] = AJ[i]*4; */
116: /* } */
117: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
118: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
120: PetscCall(PetscInfo(inA, "Using special SSE, in-place natural ordering, int j index factor BS=4\n"));
121: }
122: #else
123: /* This should never be reached. If so, problem in PetscSSEIsEnabled. */
124: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "SSE Hardware unavailable");
125: #endif
126: } else {
127: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace;
128: }
129: }
130: #else
131: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace;
132: #endif
133: break;
134: case 5:
135: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering_inplace;
136: break;
137: case 6:
138: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering_inplace;
139: break;
140: case 7:
141: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering_inplace;
142: break;
143: default:
144: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_inplace;
145: break;
146: }
147: } else {
148: switch (inA->rmap->bs) {
149: case 1:
150: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1_inplace;
151: break;
152: case 2:
153: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_inplace;
154: break;
155: case 3:
156: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_inplace;
157: break;
158: case 4:
159: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_inplace;
160: break;
161: case 5:
162: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_inplace;
163: break;
164: case 6:
165: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_inplace;
166: break;
167: case 7:
168: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_inplace;
169: break;
170: default:
171: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_inplace;
172: break;
173: }
174: }
175: PetscFunctionReturn(PETSC_SUCCESS);
176: }
178: /*
179: The symbolic factorization code is identical to that for AIJ format,
180: except for very small changes since this is now a SeqBAIJ datastructure.
181: NOT good code reuse.
182: */
183: #include <petscbt.h>
184: #include <../src/mat/utils/freespace.h>
186: PetscErrorCode MatLUFactorSymbolic_SeqBAIJ(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
187: {
188: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b;
189: PetscInt n = a->mbs, bs = A->rmap->bs, bs2 = a->bs2;
190: PetscBool row_identity, col_identity, both_identity;
191: IS isicol;
192: const PetscInt *r, *ic;
193: PetscInt i, *ai = a->i, *aj = a->j;
194: PetscInt *bi, *bj, *ajtmp;
195: PetscInt *bdiag, row, nnz, nzi, reallocs = 0, nzbd, *im;
196: PetscReal f;
197: PetscInt nlnk, *lnk, k, **bi_ptr;
198: PetscFreeSpaceList free_space = NULL, current_space = NULL;
199: PetscBT lnkbt;
200: PetscBool missing;
202: PetscFunctionBegin;
203: PetscCheck(A->rmap->N == A->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "matrix must be square");
204: PetscCall(MatMissingDiagonal(A, &missing, &i));
205: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
207: if (bs > 1) { /* check shifttype */
208: PetscCheck(info->shifttype != (PetscReal)MAT_SHIFT_NONZERO && info->shifttype != (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
209: }
211: PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
212: PetscCall(ISGetIndices(isrow, &r));
213: PetscCall(ISGetIndices(isicol, &ic));
215: /* get new row and diagonal pointers, must be allocated separately because they will be given to the Mat_SeqAIJ and freed separately */
216: PetscCall(PetscMalloc1(n + 1, &bi));
217: PetscCall(PetscMalloc1(n + 1, &bdiag));
218: bi[0] = bdiag[0] = 0;
220: /* linked list for storing column indices of the active row */
221: nlnk = n + 1;
222: PetscCall(PetscLLCreate(n, n, nlnk, lnk, lnkbt));
224: PetscCall(PetscMalloc2(n + 1, &bi_ptr, n + 1, &im));
226: /* initial FreeSpace size is f*(ai[n]+1) */
227: f = info->fill;
228: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space));
230: current_space = free_space;
232: for (i = 0; i < n; i++) {
233: /* copy previous fill into linked list */
234: nzi = 0;
235: nnz = ai[r[i] + 1] - ai[r[i]];
236: ajtmp = aj + ai[r[i]];
237: PetscCall(PetscLLAddPerm(nnz, ajtmp, ic, n, &nlnk, lnk, lnkbt));
238: nzi += nlnk;
240: /* add pivot rows into linked list */
241: row = lnk[n];
242: while (row < i) {
243: nzbd = bdiag[row] + 1; /* num of entries in the row with column index <= row */
244: ajtmp = bi_ptr[row] + nzbd; /* points to the entry next to the diagonal */
245: PetscCall(PetscLLAddSortedLU(ajtmp, row, &nlnk, lnk, lnkbt, i, nzbd, im));
246: nzi += nlnk;
247: row = lnk[row];
248: }
249: bi[i + 1] = bi[i] + nzi;
250: im[i] = nzi;
252: /* mark bdiag */
253: nzbd = 0;
254: nnz = nzi;
255: k = lnk[n];
256: while (nnz-- && k < i) {
257: nzbd++;
258: k = lnk[k];
259: }
260: bdiag[i] = nzbd; /* note : bdaig[i] = nnzL as input for PetscFreeSpaceContiguous_LU() */
262: /* if free space is not available, make more free space */
263: if (current_space->local_remaining < nzi) {
264: nnz = PetscIntMultTruncate(2, PetscIntMultTruncate(n - i, nzi)); /* estimated and max additional space needed */
265: PetscCall(PetscFreeSpaceGet(nnz, ¤t_space));
266: reallocs++;
267: }
269: /* copy data into free space, then initialize lnk */
270: PetscCall(PetscLLClean(n, n, nzi, lnk, current_space->array, lnkbt));
272: bi_ptr[i] = current_space->array;
273: current_space->array += nzi;
274: current_space->local_used += nzi;
275: current_space->local_remaining -= nzi;
276: }
278: PetscCall(ISRestoreIndices(isrow, &r));
279: PetscCall(ISRestoreIndices(isicol, &ic));
281: /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
282: PetscCall(PetscMalloc1(bi[n] + 1, &bj));
283: PetscCall(PetscFreeSpaceContiguous_LU(&free_space, bj, n, bi, bdiag));
284: PetscCall(PetscLLDestroy(lnk, lnkbt));
285: PetscCall(PetscFree2(bi_ptr, im));
287: /* put together the new matrix */
288: PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
289: b = (Mat_SeqBAIJ *)(B)->data;
291: b->free_a = PETSC_TRUE;
292: b->free_ij = PETSC_TRUE;
293: b->singlemalloc = PETSC_FALSE;
295: PetscCall(PetscMalloc1((bdiag[0] + 1) * bs2, &b->a));
296: b->j = bj;
297: b->i = bi;
298: b->diag = bdiag;
299: b->free_diag = PETSC_TRUE;
300: b->ilen = NULL;
301: b->imax = NULL;
302: b->row = isrow;
303: b->col = iscol;
304: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
306: PetscCall(PetscObjectReference((PetscObject)isrow));
307: PetscCall(PetscObjectReference((PetscObject)iscol));
308: b->icol = isicol;
309: PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work));
311: b->maxnz = b->nz = bdiag[0] + 1;
313: B->factortype = MAT_FACTOR_LU;
314: B->info.factor_mallocs = reallocs;
315: B->info.fill_ratio_given = f;
317: if (ai[n] != 0) {
318: B->info.fill_ratio_needed = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]);
319: } else {
320: B->info.fill_ratio_needed = 0.0;
321: }
322: #if defined(PETSC_USE_INFO)
323: if (ai[n] != 0) {
324: PetscReal af = B->info.fill_ratio_needed;
325: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
326: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
327: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
328: PetscCall(PetscInfo(A, "for best performance.\n"));
329: } else {
330: PetscCall(PetscInfo(A, "Empty matrix\n"));
331: }
332: #endif
334: PetscCall(ISIdentity(isrow, &row_identity));
335: PetscCall(ISIdentity(iscol, &col_identity));
337: both_identity = (PetscBool)(row_identity && col_identity);
339: PetscCall(MatSeqBAIJSetNumericFactorization(B, both_identity));
340: PetscFunctionReturn(PETSC_SUCCESS);
341: }
343: #if 0
344: // unused
345: static PetscErrorCode MatLUFactorSymbolic_SeqBAIJ_inplace(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
346: {
347: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b;
348: PetscInt n = a->mbs, bs = A->rmap->bs, bs2 = a->bs2;
349: PetscBool row_identity, col_identity, both_identity;
350: IS isicol;
351: const PetscInt *r, *ic;
352: PetscInt i, *ai = a->i, *aj = a->j;
353: PetscInt *bi, *bj, *ajtmp;
354: PetscInt *bdiag, row, nnz, nzi, reallocs = 0, nzbd, *im;
355: PetscReal f;
356: PetscInt nlnk, *lnk, k, **bi_ptr;
357: PetscFreeSpaceList free_space = NULL, current_space = NULL;
358: PetscBT lnkbt;
359: PetscBool missing;
361: PetscFunctionBegin;
362: PetscCheck(A->rmap->N == A->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "matrix must be square");
363: PetscCall(MatMissingDiagonal(A, &missing, &i));
364: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
366: PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
367: PetscCall(ISGetIndices(isrow, &r));
368: PetscCall(ISGetIndices(isicol, &ic));
370: /* get new row and diagonal pointers, must be allocated separately because they will be given to the Mat_SeqAIJ and freed separately */
371: PetscCall(PetscMalloc1(n + 1, &bi));
372: PetscCall(PetscMalloc1(n + 1, &bdiag));
374: bi[0] = bdiag[0] = 0;
376: /* linked list for storing column indices of the active row */
377: nlnk = n + 1;
378: PetscCall(PetscLLCreate(n, n, nlnk, lnk, lnkbt));
380: PetscCall(PetscMalloc2(n + 1, &bi_ptr, n + 1, &im));
382: /* initial FreeSpace size is f*(ai[n]+1) */
383: f = info->fill;
384: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space));
385: current_space = free_space;
387: for (i = 0; i < n; i++) {
388: /* copy previous fill into linked list */
389: nzi = 0;
390: nnz = ai[r[i] + 1] - ai[r[i]];
391: ajtmp = aj + ai[r[i]];
392: PetscCall(PetscLLAddPerm(nnz, ajtmp, ic, n, &nlnk, lnk, lnkbt));
393: nzi += nlnk;
395: /* add pivot rows into linked list */
396: row = lnk[n];
397: while (row < i) {
398: nzbd = bdiag[row] - bi[row] + 1; /* num of entries in the row with column index <= row */
399: ajtmp = bi_ptr[row] + nzbd; /* points to the entry next to the diagonal */
400: PetscCall(PetscLLAddSortedLU(ajtmp, row, &nlnk, lnk, lnkbt, i, nzbd, im));
401: nzi += nlnk;
402: row = lnk[row];
403: }
404: bi[i + 1] = bi[i] + nzi;
405: im[i] = nzi;
407: /* mark bdiag */
408: nzbd = 0;
409: nnz = nzi;
410: k = lnk[n];
411: while (nnz-- && k < i) {
412: nzbd++;
413: k = lnk[k];
414: }
415: bdiag[i] = bi[i] + nzbd;
417: /* if free space is not available, make more free space */
418: if (current_space->local_remaining < nzi) {
419: nnz = PetscIntMultTruncate(n - i, nzi); /* estimated and max additional space needed */
420: PetscCall(PetscFreeSpaceGet(nnz, ¤t_space));
421: reallocs++;
422: }
424: /* copy data into free space, then initialize lnk */
425: PetscCall(PetscLLClean(n, n, nzi, lnk, current_space->array, lnkbt));
427: bi_ptr[i] = current_space->array;
428: current_space->array += nzi;
429: current_space->local_used += nzi;
430: current_space->local_remaining -= nzi;
431: }
432: #if defined(PETSC_USE_INFO)
433: if (ai[n] != 0) {
434: PetscReal af = ((PetscReal)bi[n]) / ((PetscReal)ai[n]);
435: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
436: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
437: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
438: PetscCall(PetscInfo(A, "for best performance.\n"));
439: } else {
440: PetscCall(PetscInfo(A, "Empty matrix\n"));
441: }
442: #endif
444: PetscCall(ISRestoreIndices(isrow, &r));
445: PetscCall(ISRestoreIndices(isicol, &ic));
447: /* destroy list of free space and other temporary array(s) */
448: PetscCall(PetscMalloc1(bi[n] + 1, &bj));
449: PetscCall(PetscFreeSpaceContiguous(&free_space, bj));
450: PetscCall(PetscLLDestroy(lnk, lnkbt));
451: PetscCall(PetscFree2(bi_ptr, im));
453: /* put together the new matrix */
454: PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
455: b = (Mat_SeqBAIJ *)(B)->data;
457: b->free_a = PETSC_TRUE;
458: b->free_ij = PETSC_TRUE;
459: b->singlemalloc = PETSC_FALSE;
461: PetscCall(PetscMalloc1((bi[n] + 1) * bs2, &b->a));
462: b->j = bj;
463: b->i = bi;
464: b->diag = bdiag;
465: b->free_diag = PETSC_TRUE;
466: b->ilen = NULL;
467: b->imax = NULL;
468: b->row = isrow;
469: b->col = iscol;
470: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
472: PetscCall(PetscObjectReference((PetscObject)isrow));
473: PetscCall(PetscObjectReference((PetscObject)iscol));
474: b->icol = isicol;
476: PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work));
478: b->maxnz = b->nz = bi[n];
480: (B)->factortype = MAT_FACTOR_LU;
481: (B)->info.factor_mallocs = reallocs;
482: (B)->info.fill_ratio_given = f;
484: if (ai[n] != 0) {
485: (B)->info.fill_ratio_needed = ((PetscReal)bi[n]) / ((PetscReal)ai[n]);
486: } else {
487: (B)->info.fill_ratio_needed = 0.0;
488: }
490: PetscCall(ISIdentity(isrow, &row_identity));
491: PetscCall(ISIdentity(iscol, &col_identity));
493: both_identity = (PetscBool)(row_identity && col_identity);
495: PetscCall(MatSeqBAIJSetNumericFactorization_inplace(B, both_identity));
496: PetscFunctionReturn(PETSC_SUCCESS);
497: }
498: #endif