Actual source code: sbaijfact2.c
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
7: #include <../src/mat/impls/baij/seq/baij.h>
8: #include <petsc/private/kernels/blockinvert.h>
10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
11: {
12: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
13: IS isrow = a->row;
14: PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
15: const PetscInt *r;
16: PetscInt nz, *vj, k, idx, k1;
17: PetscInt bs = A->rmap->bs, bs2 = a->bs2;
18: const MatScalar *aa = a->a, *v, *diag;
19: PetscScalar *x, *xk, *xj, *xk_tmp, *t;
20: const PetscScalar *b;
22: PetscFunctionBegin;
23: PetscCall(VecGetArrayRead(bb, &b));
24: PetscCall(VecGetArray(xx, &x));
25: t = a->solve_work;
26: PetscCall(ISGetIndices(isrow, &r));
27: PetscCall(PetscMalloc1(bs, &xk_tmp));
29: /* solve U^T * D * y = b by forward substitution */
30: xk = t;
31: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
32: idx = bs * r[k];
33: for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1];
34: }
35: for (k = 0; k < mbs; k++) {
36: v = aa + bs2 * ai[k];
37: xk = t + k * bs; /* Dk*xk = k-th block of x */
38: PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
39: nz = ai[k + 1] - ai[k];
40: vj = aj + ai[k];
41: xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */
42: while (nz--) {
43: /* x(:) += U(k,:)^T*(Dk*xk) */
44: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
45: vj++;
46: xj = t + (*vj) * bs;
47: v += bs2;
48: }
49: /* xk = inv(Dk)*(Dk*xk) */
50: diag = aa + k * bs2; /* ptr to inv(Dk) */
51: PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
52: }
54: /* solve U*x = y by back substitution */
55: for (k = mbs - 1; k >= 0; k--) {
56: v = aa + bs2 * ai[k];
57: xk = t + k * bs; /* xk */
58: nz = ai[k + 1] - ai[k];
59: vj = aj + ai[k];
60: xj = t + (*vj) * bs;
61: while (nz--) {
62: /* xk += U(k,:)*x(:) */
63: PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
64: vj++;
65: v += bs2;
66: xj = t + (*vj) * bs;
67: }
68: idx = bs * r[k];
69: for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++;
70: }
72: PetscCall(PetscFree(xk_tmp));
73: PetscCall(ISRestoreIndices(isrow, &r));
74: PetscCall(VecRestoreArrayRead(bb, &b));
75: PetscCall(VecRestoreArray(xx, &x));
76: PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs));
77: PetscFunctionReturn(PETSC_SUCCESS);
78: }
80: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
81: {
82: PetscFunctionBegin;
83: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet");
84: }
86: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
87: {
88: PetscFunctionBegin;
89: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented");
90: }
92: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
93: {
94: PetscInt nz, k;
95: const PetscInt *vj, bs2 = bs * bs;
96: const MatScalar *v, *diag;
97: PetscScalar *xk, *xj, *xk_tmp;
99: PetscFunctionBegin;
100: PetscCall(PetscMalloc1(bs, &xk_tmp));
101: for (k = 0; k < mbs; k++) {
102: v = aa + bs2 * ai[k];
103: xk = x + k * bs; /* Dk*xk = k-th block of x */
104: PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
105: nz = ai[k + 1] - ai[k];
106: vj = aj + ai[k];
107: xj = x + (size_t)(*vj) * bs; /* *vj-th block of x, *vj>k */
108: while (nz--) {
109: /* x(:) += U(k,:)^T*(Dk*xk) */
110: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
111: vj++;
112: xj = x + (size_t)(*vj) * bs;
113: v += bs2;
114: }
115: /* xk = inv(Dk)*(Dk*xk) */
116: diag = aa + k * bs2; /* ptr to inv(Dk) */
117: PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
118: }
119: PetscCall(PetscFree(xk_tmp));
120: PetscFunctionReturn(PETSC_SUCCESS);
121: }
123: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
124: {
125: PetscInt nz, k;
126: const PetscInt *vj, bs2 = bs * bs;
127: const MatScalar *v;
128: PetscScalar *xk, *xj;
130: PetscFunctionBegin;
131: for (k = mbs - 1; k >= 0; k--) {
132: v = aa + bs2 * ai[k];
133: xk = x + k * bs; /* xk */
134: nz = ai[k + 1] - ai[k];
135: vj = aj + ai[k];
136: xj = x + (size_t)(*vj) * bs;
137: while (nz--) {
138: /* xk += U(k,:)*x(:) */
139: PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
140: vj++;
141: v += bs2;
142: xj = x + (size_t)(*vj) * bs;
143: }
144: }
145: PetscFunctionReturn(PETSC_SUCCESS);
146: }
148: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
149: {
150: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
151: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
152: PetscInt bs = A->rmap->bs;
153: const MatScalar *aa = a->a;
154: PetscScalar *x;
155: const PetscScalar *b;
157: PetscFunctionBegin;
158: PetscCall(VecGetArrayRead(bb, &b));
159: PetscCall(VecGetArray(xx, &x));
161: /* solve U^T * D * y = b by forward substitution */
162: PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
163: PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
165: /* solve U*x = y by back substitution */
166: PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
168: PetscCall(VecRestoreArrayRead(bb, &b));
169: PetscCall(VecRestoreArray(xx, &x));
170: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs));
171: PetscFunctionReturn(PETSC_SUCCESS);
172: }
174: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
175: {
176: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
177: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
178: PetscInt bs = A->rmap->bs;
179: const MatScalar *aa = a->a;
180: const PetscScalar *b;
181: PetscScalar *x;
183: PetscFunctionBegin;
184: PetscCall(VecGetArrayRead(bb, &b));
185: PetscCall(VecGetArray(xx, &x));
186: PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
187: PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
188: PetscCall(VecRestoreArrayRead(bb, &b));
189: PetscCall(VecRestoreArray(xx, &x));
190: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs));
191: PetscFunctionReturn(PETSC_SUCCESS);
192: }
194: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
195: {
196: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
197: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
198: PetscInt bs = A->rmap->bs;
199: const MatScalar *aa = a->a;
200: const PetscScalar *b;
201: PetscScalar *x;
203: PetscFunctionBegin;
204: PetscCall(VecGetArrayRead(bb, &b));
205: PetscCall(VecGetArray(xx, &x));
206: PetscCall(PetscArraycpy(x, b, bs * mbs));
207: PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
208: PetscCall(VecRestoreArrayRead(bb, &b));
209: PetscCall(VecRestoreArray(xx, &x));
210: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
211: PetscFunctionReturn(PETSC_SUCCESS);
212: }
214: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
215: {
216: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
217: IS isrow = a->row;
218: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
219: PetscInt nz, k, idx;
220: const MatScalar *aa = a->a, *v, *d;
221: PetscScalar *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp;
222: const PetscScalar *b;
224: PetscFunctionBegin;
225: PetscCall(VecGetArrayRead(bb, &b));
226: PetscCall(VecGetArray(xx, &x));
227: t = a->solve_work;
228: PetscCall(ISGetIndices(isrow, &r));
230: /* solve U^T * D * y = b by forward substitution */
231: tp = t;
232: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
233: idx = 7 * r[k];
234: tp[0] = b[idx];
235: tp[1] = b[idx + 1];
236: tp[2] = b[idx + 2];
237: tp[3] = b[idx + 3];
238: tp[4] = b[idx + 4];
239: tp[5] = b[idx + 5];
240: tp[6] = b[idx + 6];
241: tp += 7;
242: }
244: for (k = 0; k < mbs; k++) {
245: v = aa + 49 * ai[k];
246: vj = aj + ai[k];
247: tp = t + k * 7;
248: x0 = tp[0];
249: x1 = tp[1];
250: x2 = tp[2];
251: x3 = tp[3];
252: x4 = tp[4];
253: x5 = tp[5];
254: x6 = tp[6];
255: nz = ai[k + 1] - ai[k];
256: tp = t + (*vj) * 7;
257: while (nz--) {
258: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
259: tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
260: tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
261: tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
262: tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
263: tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
264: tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
265: vj++;
266: tp = t + (*vj) * 7;
267: v += 49;
268: }
270: /* xk = inv(Dk)*(Dk*xk) */
271: d = aa + k * 49; /* ptr to inv(Dk) */
272: tp = t + k * 7;
273: tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
274: tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
275: tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
276: tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
277: tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
278: tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
279: tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
280: }
282: /* solve U*x = y by back substitution */
283: for (k = mbs - 1; k >= 0; k--) {
284: v = aa + 49 * ai[k];
285: vj = aj + ai[k];
286: tp = t + k * 7;
287: x0 = tp[0];
288: x1 = tp[1];
289: x2 = tp[2];
290: x3 = tp[3];
291: x4 = tp[4];
292: x5 = tp[5];
293: x6 = tp[6]; /* xk */
294: nz = ai[k + 1] - ai[k];
296: tp = t + (*vj) * 7;
297: while (nz--) {
298: /* xk += U(k,:)*x(:) */
299: x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6];
300: x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6];
301: x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6];
302: x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6];
303: x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6];
304: x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6];
305: x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6];
306: vj++;
307: tp = t + (*vj) * 7;
308: v += 49;
309: }
310: tp = t + k * 7;
311: tp[0] = x0;
312: tp[1] = x1;
313: tp[2] = x2;
314: tp[3] = x3;
315: tp[4] = x4;
316: tp[5] = x5;
317: tp[6] = x6;
318: idx = 7 * r[k];
319: x[idx] = x0;
320: x[idx + 1] = x1;
321: x[idx + 2] = x2;
322: x[idx + 3] = x3;
323: x[idx + 4] = x4;
324: x[idx + 5] = x5;
325: x[idx + 6] = x6;
326: }
328: PetscCall(ISRestoreIndices(isrow, &r));
329: PetscCall(VecRestoreArrayRead(bb, &b));
330: PetscCall(VecRestoreArray(xx, &x));
331: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
332: PetscFunctionReturn(PETSC_SUCCESS);
333: }
335: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
336: {
337: const MatScalar *v, *d;
338: PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6;
339: PetscInt nz, k;
340: const PetscInt *vj;
342: PetscFunctionBegin;
343: for (k = 0; k < mbs; k++) {
344: v = aa + 49 * ai[k];
345: xp = x + k * 7;
346: x0 = xp[0];
347: x1 = xp[1];
348: x2 = xp[2];
349: x3 = xp[3];
350: x4 = xp[4];
351: x5 = xp[5];
352: x6 = xp[6]; /* Dk*xk = k-th block of x */
353: nz = ai[k + 1] - ai[k];
354: vj = aj + ai[k];
355: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
356: PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
357: while (nz--) {
358: xp = x + (*vj) * 7;
359: /* x(:) += U(k,:)^T*(Dk*xk) */
360: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
361: xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
362: xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
363: xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
364: xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
365: xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
366: xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
367: vj++;
368: v += 49;
369: }
370: /* xk = inv(Dk)*(Dk*xk) */
371: d = aa + k * 49; /* ptr to inv(Dk) */
372: xp = x + k * 7;
373: xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
374: xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
375: xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
376: xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
377: xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
378: xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
379: xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
380: }
381: PetscFunctionReturn(PETSC_SUCCESS);
382: }
384: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
385: {
386: const MatScalar *v;
387: PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6;
388: PetscInt nz, k;
389: const PetscInt *vj;
391: PetscFunctionBegin;
392: for (k = mbs - 1; k >= 0; k--) {
393: v = aa + 49 * ai[k];
394: xp = x + k * 7;
395: x0 = xp[0];
396: x1 = xp[1];
397: x2 = xp[2];
398: x3 = xp[3];
399: x4 = xp[4];
400: x5 = xp[5];
401: x6 = xp[6]; /* xk */
402: nz = ai[k + 1] - ai[k];
403: vj = aj + ai[k];
404: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
405: PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
406: while (nz--) {
407: xp = x + (*vj) * 7;
408: /* xk += U(k,:)*x(:) */
409: x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6];
410: x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6];
411: x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6];
412: x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6];
413: x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6];
414: x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6];
415: x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6];
416: vj++;
417: v += 49;
418: }
419: xp = x + k * 7;
420: xp[0] = x0;
421: xp[1] = x1;
422: xp[2] = x2;
423: xp[3] = x3;
424: xp[4] = x4;
425: xp[5] = x5;
426: xp[6] = x6;
427: }
428: PetscFunctionReturn(PETSC_SUCCESS);
429: }
431: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
432: {
433: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
434: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
435: const MatScalar *aa = a->a;
436: PetscScalar *x;
437: const PetscScalar *b;
439: PetscFunctionBegin;
440: PetscCall(VecGetArrayRead(bb, &b));
441: PetscCall(VecGetArray(xx, &x));
443: /* solve U^T * D * y = b by forward substitution */
444: PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */
445: PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
447: /* solve U*x = y by back substitution */
448: PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
450: PetscCall(VecRestoreArrayRead(bb, &b));
451: PetscCall(VecRestoreArray(xx, &x));
452: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
453: PetscFunctionReturn(PETSC_SUCCESS);
454: }
456: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
457: {
458: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
459: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
460: const MatScalar *aa = a->a;
461: PetscScalar *x;
462: const PetscScalar *b;
464: PetscFunctionBegin;
465: PetscCall(VecGetArrayRead(bb, &b));
466: PetscCall(VecGetArray(xx, &x));
467: PetscCall(PetscArraycpy(x, b, 7 * mbs));
468: PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
469: PetscCall(VecRestoreArrayRead(bb, &b));
470: PetscCall(VecRestoreArray(xx, &x));
471: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
472: PetscFunctionReturn(PETSC_SUCCESS);
473: }
475: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
476: {
477: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
478: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
479: const MatScalar *aa = a->a;
480: PetscScalar *x;
481: const PetscScalar *b;
483: PetscFunctionBegin;
484: PetscCall(VecGetArrayRead(bb, &b));
485: PetscCall(VecGetArray(xx, &x));
486: PetscCall(PetscArraycpy(x, b, 7 * mbs));
487: PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
488: PetscCall(VecRestoreArrayRead(bb, &b));
489: PetscCall(VecRestoreArray(xx, &x));
490: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
491: PetscFunctionReturn(PETSC_SUCCESS);
492: }
494: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
495: {
496: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
497: IS isrow = a->row;
498: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
499: PetscInt nz, k, idx;
500: const MatScalar *aa = a->a, *v, *d;
501: PetscScalar *x, x0, x1, x2, x3, x4, x5, *t, *tp;
502: const PetscScalar *b;
504: PetscFunctionBegin;
505: PetscCall(VecGetArrayRead(bb, &b));
506: PetscCall(VecGetArray(xx, &x));
507: t = a->solve_work;
508: PetscCall(ISGetIndices(isrow, &r));
510: /* solve U^T * D * y = b by forward substitution */
511: tp = t;
512: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
513: idx = 6 * r[k];
514: tp[0] = b[idx];
515: tp[1] = b[idx + 1];
516: tp[2] = b[idx + 2];
517: tp[3] = b[idx + 3];
518: tp[4] = b[idx + 4];
519: tp[5] = b[idx + 5];
520: tp += 6;
521: }
523: for (k = 0; k < mbs; k++) {
524: v = aa + 36 * ai[k];
525: vj = aj + ai[k];
526: tp = t + k * 6;
527: x0 = tp[0];
528: x1 = tp[1];
529: x2 = tp[2];
530: x3 = tp[3];
531: x4 = tp[4];
532: x5 = tp[5];
533: nz = ai[k + 1] - ai[k];
534: tp = t + (*vj) * 6;
535: while (nz--) {
536: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
537: tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
538: tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
539: tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
540: tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
541: tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
542: vj++;
543: tp = t + (*vj) * 6;
544: v += 36;
545: }
547: /* xk = inv(Dk)*(Dk*xk) */
548: d = aa + k * 36; /* ptr to inv(Dk) */
549: tp = t + k * 6;
550: tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
551: tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
552: tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
553: tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
554: tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
555: tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
556: }
558: /* solve U*x = y by back substitution */
559: for (k = mbs - 1; k >= 0; k--) {
560: v = aa + 36 * ai[k];
561: vj = aj + ai[k];
562: tp = t + k * 6;
563: x0 = tp[0];
564: x1 = tp[1];
565: x2 = tp[2];
566: x3 = tp[3];
567: x4 = tp[4];
568: x5 = tp[5]; /* xk */
569: nz = ai[k + 1] - ai[k];
571: tp = t + (*vj) * 6;
572: while (nz--) {
573: /* xk += U(k,:)*x(:) */
574: x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5];
575: x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5];
576: x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5];
577: x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5];
578: x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5];
579: x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5];
580: vj++;
581: tp = t + (*vj) * 6;
582: v += 36;
583: }
584: tp = t + k * 6;
585: tp[0] = x0;
586: tp[1] = x1;
587: tp[2] = x2;
588: tp[3] = x3;
589: tp[4] = x4;
590: tp[5] = x5;
591: idx = 6 * r[k];
592: x[idx] = x0;
593: x[idx + 1] = x1;
594: x[idx + 2] = x2;
595: x[idx + 3] = x3;
596: x[idx + 4] = x4;
597: x[idx + 5] = x5;
598: }
600: PetscCall(ISRestoreIndices(isrow, &r));
601: PetscCall(VecRestoreArrayRead(bb, &b));
602: PetscCall(VecRestoreArray(xx, &x));
603: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
604: PetscFunctionReturn(PETSC_SUCCESS);
605: }
607: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
608: {
609: const MatScalar *v, *d;
610: PetscScalar *xp, x0, x1, x2, x3, x4, x5;
611: PetscInt nz, k;
612: const PetscInt *vj;
614: PetscFunctionBegin;
615: for (k = 0; k < mbs; k++) {
616: v = aa + 36 * ai[k];
617: xp = x + k * 6;
618: x0 = xp[0];
619: x1 = xp[1];
620: x2 = xp[2];
621: x3 = xp[3];
622: x4 = xp[4];
623: x5 = xp[5]; /* Dk*xk = k-th block of x */
624: nz = ai[k + 1] - ai[k];
625: vj = aj + ai[k];
626: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
627: PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
628: while (nz--) {
629: xp = x + (*vj) * 6;
630: /* x(:) += U(k,:)^T*(Dk*xk) */
631: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
632: xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
633: xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
634: xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
635: xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
636: xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
637: vj++;
638: v += 36;
639: }
640: /* xk = inv(Dk)*(Dk*xk) */
641: d = aa + k * 36; /* ptr to inv(Dk) */
642: xp = x + k * 6;
643: xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
644: xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
645: xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
646: xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
647: xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
648: xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
649: }
650: PetscFunctionReturn(PETSC_SUCCESS);
651: }
652: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
653: {
654: const MatScalar *v;
655: PetscScalar *xp, x0, x1, x2, x3, x4, x5;
656: PetscInt nz, k;
657: const PetscInt *vj;
659: PetscFunctionBegin;
660: for (k = mbs - 1; k >= 0; k--) {
661: v = aa + 36 * ai[k];
662: xp = x + k * 6;
663: x0 = xp[0];
664: x1 = xp[1];
665: x2 = xp[2];
666: x3 = xp[3];
667: x4 = xp[4];
668: x5 = xp[5]; /* xk */
669: nz = ai[k + 1] - ai[k];
670: vj = aj + ai[k];
671: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
672: PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
673: while (nz--) {
674: xp = x + (*vj) * 6;
675: /* xk += U(k,:)*x(:) */
676: x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5];
677: x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5];
678: x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5];
679: x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5];
680: x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5];
681: x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5];
682: vj++;
683: v += 36;
684: }
685: xp = x + k * 6;
686: xp[0] = x0;
687: xp[1] = x1;
688: xp[2] = x2;
689: xp[3] = x3;
690: xp[4] = x4;
691: xp[5] = x5;
692: }
693: PetscFunctionReturn(PETSC_SUCCESS);
694: }
696: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
697: {
698: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
699: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
700: const MatScalar *aa = a->a;
701: PetscScalar *x;
702: const PetscScalar *b;
704: PetscFunctionBegin;
705: PetscCall(VecGetArrayRead(bb, &b));
706: PetscCall(VecGetArray(xx, &x));
708: /* solve U^T * D * y = b by forward substitution */
709: PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
710: PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
712: /* solve U*x = y by back substitution */
713: PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
715: PetscCall(VecRestoreArrayRead(bb, &b));
716: PetscCall(VecRestoreArray(xx, &x));
717: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
718: PetscFunctionReturn(PETSC_SUCCESS);
719: }
721: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
722: {
723: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
724: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
725: const MatScalar *aa = a->a;
726: PetscScalar *x;
727: const PetscScalar *b;
729: PetscFunctionBegin;
730: PetscCall(VecGetArrayRead(bb, &b));
731: PetscCall(VecGetArray(xx, &x));
732: PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
733: PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
734: PetscCall(VecRestoreArrayRead(bb, &b));
735: PetscCall(VecRestoreArray(xx, &x));
736: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
737: PetscFunctionReturn(PETSC_SUCCESS);
738: }
740: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
741: {
742: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
743: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
744: const MatScalar *aa = a->a;
745: PetscScalar *x;
746: const PetscScalar *b;
748: PetscFunctionBegin;
749: PetscCall(VecGetArrayRead(bb, &b));
750: PetscCall(VecGetArray(xx, &x));
751: PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
752: PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
753: PetscCall(VecRestoreArrayRead(bb, &b));
754: PetscCall(VecRestoreArray(xx, &x));
755: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
756: PetscFunctionReturn(PETSC_SUCCESS);
757: }
759: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
760: {
761: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
762: IS isrow = a->row;
763: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
764: const PetscInt *r, *vj;
765: PetscInt nz, k, idx;
766: const MatScalar *aa = a->a, *v, *diag;
767: PetscScalar *x, x0, x1, x2, x3, x4, *t, *tp;
768: const PetscScalar *b;
770: PetscFunctionBegin;
771: PetscCall(VecGetArrayRead(bb, &b));
772: PetscCall(VecGetArray(xx, &x));
773: t = a->solve_work;
774: PetscCall(ISGetIndices(isrow, &r));
776: /* solve U^T * D * y = b by forward substitution */
777: tp = t;
778: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
779: idx = 5 * r[k];
780: tp[0] = b[idx];
781: tp[1] = b[idx + 1];
782: tp[2] = b[idx + 2];
783: tp[3] = b[idx + 3];
784: tp[4] = b[idx + 4];
785: tp += 5;
786: }
788: for (k = 0; k < mbs; k++) {
789: v = aa + 25 * ai[k];
790: vj = aj + ai[k];
791: tp = t + k * 5;
792: x0 = tp[0];
793: x1 = tp[1];
794: x2 = tp[2];
795: x3 = tp[3];
796: x4 = tp[4];
797: nz = ai[k + 1] - ai[k];
799: tp = t + (*vj) * 5;
800: while (nz--) {
801: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
802: tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
803: tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
804: tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
805: tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
806: vj++;
807: tp = t + (*vj) * 5;
808: v += 25;
809: }
811: /* xk = inv(Dk)*(Dk*xk) */
812: diag = aa + k * 25; /* ptr to inv(Dk) */
813: tp = t + k * 5;
814: tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
815: tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
816: tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
817: tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
818: tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
819: }
821: /* solve U*x = y by back substitution */
822: for (k = mbs - 1; k >= 0; k--) {
823: v = aa + 25 * ai[k];
824: vj = aj + ai[k];
825: tp = t + k * 5;
826: x0 = tp[0];
827: x1 = tp[1];
828: x2 = tp[2];
829: x3 = tp[3];
830: x4 = tp[4]; /* xk */
831: nz = ai[k + 1] - ai[k];
833: tp = t + (*vj) * 5;
834: while (nz--) {
835: /* xk += U(k,:)*x(:) */
836: x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4];
837: x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4];
838: x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4];
839: x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4];
840: x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4];
841: vj++;
842: tp = t + (*vj) * 5;
843: v += 25;
844: }
845: tp = t + k * 5;
846: tp[0] = x0;
847: tp[1] = x1;
848: tp[2] = x2;
849: tp[3] = x3;
850: tp[4] = x4;
851: idx = 5 * r[k];
852: x[idx] = x0;
853: x[idx + 1] = x1;
854: x[idx + 2] = x2;
855: x[idx + 3] = x3;
856: x[idx + 4] = x4;
857: }
859: PetscCall(ISRestoreIndices(isrow, &r));
860: PetscCall(VecRestoreArrayRead(bb, &b));
861: PetscCall(VecRestoreArray(xx, &x));
862: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
863: PetscFunctionReturn(PETSC_SUCCESS);
864: }
866: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
867: {
868: const MatScalar *v, *diag;
869: PetscScalar *xp, x0, x1, x2, x3, x4;
870: PetscInt nz, k;
871: const PetscInt *vj;
873: PetscFunctionBegin;
874: for (k = 0; k < mbs; k++) {
875: v = aa + 25 * ai[k];
876: xp = x + k * 5;
877: x0 = xp[0];
878: x1 = xp[1];
879: x2 = xp[2];
880: x3 = xp[3];
881: x4 = xp[4]; /* Dk*xk = k-th block of x */
882: nz = ai[k + 1] - ai[k];
883: vj = aj + ai[k];
884: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
885: PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
886: while (nz--) {
887: xp = x + (*vj) * 5;
888: /* x(:) += U(k,:)^T*(Dk*xk) */
889: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
890: xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
891: xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
892: xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
893: xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
894: vj++;
895: v += 25;
896: }
897: /* xk = inv(Dk)*(Dk*xk) */
898: diag = aa + k * 25; /* ptr to inv(Dk) */
899: xp = x + k * 5;
900: xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
901: xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
902: xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
903: xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
904: xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
905: }
906: PetscFunctionReturn(PETSC_SUCCESS);
907: }
909: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
910: {
911: const MatScalar *v;
912: PetscScalar *xp, x0, x1, x2, x3, x4;
913: PetscInt nz, k;
914: const PetscInt *vj;
916: PetscFunctionBegin;
917: for (k = mbs - 1; k >= 0; k--) {
918: v = aa + 25 * ai[k];
919: xp = x + k * 5;
920: x0 = xp[0];
921: x1 = xp[1];
922: x2 = xp[2];
923: x3 = xp[3];
924: x4 = xp[4]; /* xk */
925: nz = ai[k + 1] - ai[k];
926: vj = aj + ai[k];
927: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
928: PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
929: while (nz--) {
930: xp = x + (*vj) * 5;
931: /* xk += U(k,:)*x(:) */
932: x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4];
933: x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4];
934: x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4];
935: x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4];
936: x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4];
937: vj++;
938: v += 25;
939: }
940: xp = x + k * 5;
941: xp[0] = x0;
942: xp[1] = x1;
943: xp[2] = x2;
944: xp[3] = x3;
945: xp[4] = x4;
946: }
947: PetscFunctionReturn(PETSC_SUCCESS);
948: }
950: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
951: {
952: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
953: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
954: const MatScalar *aa = a->a;
955: PetscScalar *x;
956: const PetscScalar *b;
958: PetscFunctionBegin;
959: PetscCall(VecGetArrayRead(bb, &b));
960: PetscCall(VecGetArray(xx, &x));
962: /* solve U^T * D * y = b by forward substitution */
963: PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
964: PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
966: /* solve U*x = y by back substitution */
967: PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
969: PetscCall(VecRestoreArrayRead(bb, &b));
970: PetscCall(VecRestoreArray(xx, &x));
971: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
972: PetscFunctionReturn(PETSC_SUCCESS);
973: }
975: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
976: {
977: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
978: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
979: const MatScalar *aa = a->a;
980: PetscScalar *x;
981: const PetscScalar *b;
983: PetscFunctionBegin;
984: PetscCall(VecGetArrayRead(bb, &b));
985: PetscCall(VecGetArray(xx, &x));
986: PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
987: PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
988: PetscCall(VecRestoreArrayRead(bb, &b));
989: PetscCall(VecRestoreArray(xx, &x));
990: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
991: PetscFunctionReturn(PETSC_SUCCESS);
992: }
994: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
995: {
996: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
997: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
998: const MatScalar *aa = a->a;
999: PetscScalar *x;
1000: const PetscScalar *b;
1002: PetscFunctionBegin;
1003: PetscCall(VecGetArrayRead(bb, &b));
1004: PetscCall(VecGetArray(xx, &x));
1005: PetscCall(PetscArraycpy(x, b, 5 * mbs));
1006: PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
1007: PetscCall(VecRestoreArrayRead(bb, &b));
1008: PetscCall(VecRestoreArray(xx, &x));
1009: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1010: PetscFunctionReturn(PETSC_SUCCESS);
1011: }
1013: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
1014: {
1015: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1016: IS isrow = a->row;
1017: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1018: const PetscInt *r, *vj;
1019: PetscInt nz, k, idx;
1020: const MatScalar *aa = a->a, *v, *diag;
1021: PetscScalar *x, x0, x1, x2, x3, *t, *tp;
1022: const PetscScalar *b;
1024: PetscFunctionBegin;
1025: PetscCall(VecGetArrayRead(bb, &b));
1026: PetscCall(VecGetArray(xx, &x));
1027: t = a->solve_work;
1028: PetscCall(ISGetIndices(isrow, &r));
1030: /* solve U^T * D * y = b by forward substitution */
1031: tp = t;
1032: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1033: idx = 4 * r[k];
1034: tp[0] = b[idx];
1035: tp[1] = b[idx + 1];
1036: tp[2] = b[idx + 2];
1037: tp[3] = b[idx + 3];
1038: tp += 4;
1039: }
1041: for (k = 0; k < mbs; k++) {
1042: v = aa + 16 * ai[k];
1043: vj = aj + ai[k];
1044: tp = t + k * 4;
1045: x0 = tp[0];
1046: x1 = tp[1];
1047: x2 = tp[2];
1048: x3 = tp[3];
1049: nz = ai[k + 1] - ai[k];
1051: tp = t + (*vj) * 4;
1052: while (nz--) {
1053: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1054: tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1055: tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1056: tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1057: vj++;
1058: tp = t + (*vj) * 4;
1059: v += 16;
1060: }
1062: /* xk = inv(Dk)*(Dk*xk) */
1063: diag = aa + k * 16; /* ptr to inv(Dk) */
1064: tp = t + k * 4;
1065: tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1066: tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1067: tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1068: tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1069: }
1071: /* solve U*x = y by back substitution */
1072: for (k = mbs - 1; k >= 0; k--) {
1073: v = aa + 16 * ai[k];
1074: vj = aj + ai[k];
1075: tp = t + k * 4;
1076: x0 = tp[0];
1077: x1 = tp[1];
1078: x2 = tp[2];
1079: x3 = tp[3]; /* xk */
1080: nz = ai[k + 1] - ai[k];
1082: tp = t + (*vj) * 4;
1083: while (nz--) {
1084: /* xk += U(k,:)*x(:) */
1085: x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3];
1086: x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3];
1087: x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3];
1088: x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3];
1089: vj++;
1090: tp = t + (*vj) * 4;
1091: v += 16;
1092: }
1093: tp = t + k * 4;
1094: tp[0] = x0;
1095: tp[1] = x1;
1096: tp[2] = x2;
1097: tp[3] = x3;
1098: idx = 4 * r[k];
1099: x[idx] = x0;
1100: x[idx + 1] = x1;
1101: x[idx + 2] = x2;
1102: x[idx + 3] = x3;
1103: }
1105: PetscCall(ISRestoreIndices(isrow, &r));
1106: PetscCall(VecRestoreArrayRead(bb, &b));
1107: PetscCall(VecRestoreArray(xx, &x));
1108: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1109: PetscFunctionReturn(PETSC_SUCCESS);
1110: }
1112: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1113: {
1114: const MatScalar *v, *diag;
1115: PetscScalar *xp, x0, x1, x2, x3;
1116: PetscInt nz, k;
1117: const PetscInt *vj;
1119: PetscFunctionBegin;
1120: for (k = 0; k < mbs; k++) {
1121: v = aa + 16 * ai[k];
1122: xp = x + k * 4;
1123: x0 = xp[0];
1124: x1 = xp[1];
1125: x2 = xp[2];
1126: x3 = xp[3]; /* Dk*xk = k-th block of x */
1127: nz = ai[k + 1] - ai[k];
1128: vj = aj + ai[k];
1129: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1130: PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1131: while (nz--) {
1132: xp = x + (*vj) * 4;
1133: /* x(:) += U(k,:)^T*(Dk*xk) */
1134: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1135: xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1136: xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1137: xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1138: vj++;
1139: v += 16;
1140: }
1141: /* xk = inv(Dk)*(Dk*xk) */
1142: diag = aa + k * 16; /* ptr to inv(Dk) */
1143: xp = x + k * 4;
1144: xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1145: xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1146: xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1147: xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1148: }
1149: PetscFunctionReturn(PETSC_SUCCESS);
1150: }
1152: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1153: {
1154: const MatScalar *v;
1155: PetscScalar *xp, x0, x1, x2, x3;
1156: PetscInt nz, k;
1157: const PetscInt *vj;
1159: PetscFunctionBegin;
1160: for (k = mbs - 1; k >= 0; k--) {
1161: v = aa + 16 * ai[k];
1162: xp = x + k * 4;
1163: x0 = xp[0];
1164: x1 = xp[1];
1165: x2 = xp[2];
1166: x3 = xp[3]; /* xk */
1167: nz = ai[k + 1] - ai[k];
1168: vj = aj + ai[k];
1169: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1170: PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1171: while (nz--) {
1172: xp = x + (*vj) * 4;
1173: /* xk += U(k,:)*x(:) */
1174: x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3];
1175: x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3];
1176: x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3];
1177: x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3];
1178: vj++;
1179: v += 16;
1180: }
1181: xp = x + k * 4;
1182: xp[0] = x0;
1183: xp[1] = x1;
1184: xp[2] = x2;
1185: xp[3] = x3;
1186: }
1187: PetscFunctionReturn(PETSC_SUCCESS);
1188: }
1190: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1191: {
1192: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1193: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1194: const MatScalar *aa = a->a;
1195: PetscScalar *x;
1196: const PetscScalar *b;
1198: PetscFunctionBegin;
1199: PetscCall(VecGetArrayRead(bb, &b));
1200: PetscCall(VecGetArray(xx, &x));
1202: /* solve U^T * D * y = b by forward substitution */
1203: PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1204: PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1206: /* solve U*x = y by back substitution */
1207: PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1208: PetscCall(VecRestoreArrayRead(bb, &b));
1209: PetscCall(VecRestoreArray(xx, &x));
1210: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1211: PetscFunctionReturn(PETSC_SUCCESS);
1212: }
1214: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1215: {
1216: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1217: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1218: const MatScalar *aa = a->a;
1219: PetscScalar *x;
1220: const PetscScalar *b;
1222: PetscFunctionBegin;
1223: PetscCall(VecGetArrayRead(bb, &b));
1224: PetscCall(VecGetArray(xx, &x));
1225: PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1226: PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1227: PetscCall(VecRestoreArrayRead(bb, &b));
1228: PetscCall(VecRestoreArray(xx, &x));
1229: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1230: PetscFunctionReturn(PETSC_SUCCESS);
1231: }
1233: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1234: {
1235: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1236: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1237: const MatScalar *aa = a->a;
1238: PetscScalar *x;
1239: const PetscScalar *b;
1241: PetscFunctionBegin;
1242: PetscCall(VecGetArrayRead(bb, &b));
1243: PetscCall(VecGetArray(xx, &x));
1244: PetscCall(PetscArraycpy(x, b, 4 * mbs));
1245: PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1246: PetscCall(VecRestoreArrayRead(bb, &b));
1247: PetscCall(VecRestoreArray(xx, &x));
1248: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1249: PetscFunctionReturn(PETSC_SUCCESS);
1250: }
1252: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1253: {
1254: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1255: IS isrow = a->row;
1256: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1257: const PetscInt *r;
1258: PetscInt nz, k, idx;
1259: const PetscInt *vj;
1260: const MatScalar *aa = a->a, *v, *diag;
1261: PetscScalar *x, x0, x1, x2, *t, *tp;
1262: const PetscScalar *b;
1264: PetscFunctionBegin;
1265: PetscCall(VecGetArrayRead(bb, &b));
1266: PetscCall(VecGetArray(xx, &x));
1267: t = a->solve_work;
1268: PetscCall(ISGetIndices(isrow, &r));
1270: /* solve U^T * D * y = b by forward substitution */
1271: tp = t;
1272: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1273: idx = 3 * r[k];
1274: tp[0] = b[idx];
1275: tp[1] = b[idx + 1];
1276: tp[2] = b[idx + 2];
1277: tp += 3;
1278: }
1280: for (k = 0; k < mbs; k++) {
1281: v = aa + 9 * ai[k];
1282: vj = aj + ai[k];
1283: tp = t + k * 3;
1284: x0 = tp[0];
1285: x1 = tp[1];
1286: x2 = tp[2];
1287: nz = ai[k + 1] - ai[k];
1289: tp = t + (*vj) * 3;
1290: while (nz--) {
1291: tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1292: tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1293: tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1294: vj++;
1295: tp = t + (*vj) * 3;
1296: v += 9;
1297: }
1299: /* xk = inv(Dk)*(Dk*xk) */
1300: diag = aa + k * 9; /* ptr to inv(Dk) */
1301: tp = t + k * 3;
1302: tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1303: tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1304: tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1305: }
1307: /* solve U*x = y by back substitution */
1308: for (k = mbs - 1; k >= 0; k--) {
1309: v = aa + 9 * ai[k];
1310: vj = aj + ai[k];
1311: tp = t + k * 3;
1312: x0 = tp[0];
1313: x1 = tp[1];
1314: x2 = tp[2]; /* xk */
1315: nz = ai[k + 1] - ai[k];
1317: tp = t + (*vj) * 3;
1318: while (nz--) {
1319: /* xk += U(k,:)*x(:) */
1320: x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2];
1321: x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2];
1322: x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2];
1323: vj++;
1324: tp = t + (*vj) * 3;
1325: v += 9;
1326: }
1327: tp = t + k * 3;
1328: tp[0] = x0;
1329: tp[1] = x1;
1330: tp[2] = x2;
1331: idx = 3 * r[k];
1332: x[idx] = x0;
1333: x[idx + 1] = x1;
1334: x[idx + 2] = x2;
1335: }
1337: PetscCall(ISRestoreIndices(isrow, &r));
1338: PetscCall(VecRestoreArrayRead(bb, &b));
1339: PetscCall(VecRestoreArray(xx, &x));
1340: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1341: PetscFunctionReturn(PETSC_SUCCESS);
1342: }
1344: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1345: {
1346: const MatScalar *v, *diag;
1347: PetscScalar *xp, x0, x1, x2;
1348: PetscInt nz, k;
1349: const PetscInt *vj;
1351: PetscFunctionBegin;
1352: for (k = 0; k < mbs; k++) {
1353: v = aa + 9 * ai[k];
1354: xp = x + k * 3;
1355: x0 = xp[0];
1356: x1 = xp[1];
1357: x2 = xp[2]; /* Dk*xk = k-th block of x */
1358: nz = ai[k + 1] - ai[k];
1359: vj = aj + ai[k];
1360: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1361: PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1362: while (nz--) {
1363: xp = x + (*vj) * 3;
1364: /* x(:) += U(k,:)^T*(Dk*xk) */
1365: xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1366: xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1367: xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1368: vj++;
1369: v += 9;
1370: }
1371: /* xk = inv(Dk)*(Dk*xk) */
1372: diag = aa + k * 9; /* ptr to inv(Dk) */
1373: xp = x + k * 3;
1374: xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1375: xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1376: xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1377: }
1378: PetscFunctionReturn(PETSC_SUCCESS);
1379: }
1381: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1382: {
1383: const MatScalar *v;
1384: PetscScalar *xp, x0, x1, x2;
1385: PetscInt nz, k;
1386: const PetscInt *vj;
1388: PetscFunctionBegin;
1389: for (k = mbs - 1; k >= 0; k--) {
1390: v = aa + 9 * ai[k];
1391: xp = x + k * 3;
1392: x0 = xp[0];
1393: x1 = xp[1];
1394: x2 = xp[2]; /* xk */
1395: nz = ai[k + 1] - ai[k];
1396: vj = aj + ai[k];
1397: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1398: PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1399: while (nz--) {
1400: xp = x + (*vj) * 3;
1401: /* xk += U(k,:)*x(:) */
1402: x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2];
1403: x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2];
1404: x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2];
1405: vj++;
1406: v += 9;
1407: }
1408: xp = x + k * 3;
1409: xp[0] = x0;
1410: xp[1] = x1;
1411: xp[2] = x2;
1412: }
1413: PetscFunctionReturn(PETSC_SUCCESS);
1414: }
1416: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1417: {
1418: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1419: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1420: const MatScalar *aa = a->a;
1421: PetscScalar *x;
1422: const PetscScalar *b;
1424: PetscFunctionBegin;
1425: PetscCall(VecGetArrayRead(bb, &b));
1426: PetscCall(VecGetArray(xx, &x));
1428: /* solve U^T * D * y = b by forward substitution */
1429: PetscCall(PetscArraycpy(x, b, 3 * mbs));
1430: PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1432: /* solve U*x = y by back substitution */
1433: PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1435: PetscCall(VecRestoreArrayRead(bb, &b));
1436: PetscCall(VecRestoreArray(xx, &x));
1437: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1438: PetscFunctionReturn(PETSC_SUCCESS);
1439: }
1441: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1442: {
1443: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1444: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1445: const MatScalar *aa = a->a;
1446: PetscScalar *x;
1447: const PetscScalar *b;
1449: PetscFunctionBegin;
1450: PetscCall(VecGetArrayRead(bb, &b));
1451: PetscCall(VecGetArray(xx, &x));
1452: PetscCall(PetscArraycpy(x, b, 3 * mbs));
1453: PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1454: PetscCall(VecRestoreArrayRead(bb, &b));
1455: PetscCall(VecRestoreArray(xx, &x));
1456: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1457: PetscFunctionReturn(PETSC_SUCCESS);
1458: }
1460: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1461: {
1462: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1463: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1464: const MatScalar *aa = a->a;
1465: PetscScalar *x;
1466: const PetscScalar *b;
1468: PetscFunctionBegin;
1469: PetscCall(VecGetArrayRead(bb, &b));
1470: PetscCall(VecGetArray(xx, &x));
1471: PetscCall(PetscArraycpy(x, b, 3 * mbs));
1472: PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1473: PetscCall(VecRestoreArrayRead(bb, &b));
1474: PetscCall(VecRestoreArray(xx, &x));
1475: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1476: PetscFunctionReturn(PETSC_SUCCESS);
1477: }
1479: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1480: {
1481: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1482: IS isrow = a->row;
1483: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1484: const PetscInt *r, *vj;
1485: PetscInt nz, k, k2, idx;
1486: const MatScalar *aa = a->a, *v, *diag;
1487: PetscScalar *x, x0, x1, *t;
1488: const PetscScalar *b;
1490: PetscFunctionBegin;
1491: PetscCall(VecGetArrayRead(bb, &b));
1492: PetscCall(VecGetArray(xx, &x));
1493: t = a->solve_work;
1494: PetscCall(ISGetIndices(isrow, &r));
1496: /* solve U^T * D * y = perm(b) by forward substitution */
1497: for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1498: idx = 2 * r[k];
1499: t[k * 2] = b[idx];
1500: t[k * 2 + 1] = b[idx + 1];
1501: }
1502: for (k = 0; k < mbs; k++) {
1503: v = aa + 4 * ai[k];
1504: vj = aj + ai[k];
1505: k2 = k * 2;
1506: x0 = t[k2];
1507: x1 = t[k2 + 1];
1508: nz = ai[k + 1] - ai[k];
1509: while (nz--) {
1510: t[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1511: t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1512: vj++;
1513: v += 4;
1514: }
1515: diag = aa + k * 4; /* ptr to inv(Dk) */
1516: t[k2] = diag[0] * x0 + diag[2] * x1;
1517: t[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1518: }
1520: /* solve U*x = y by back substitution */
1521: for (k = mbs - 1; k >= 0; k--) {
1522: v = aa + 4 * ai[k];
1523: vj = aj + ai[k];
1524: k2 = k * 2;
1525: x0 = t[k2];
1526: x1 = t[k2 + 1];
1527: nz = ai[k + 1] - ai[k];
1528: while (nz--) {
1529: x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1];
1530: x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1];
1531: vj++;
1532: v += 4;
1533: }
1534: t[k2] = x0;
1535: t[k2 + 1] = x1;
1536: idx = 2 * r[k];
1537: x[idx] = x0;
1538: x[idx + 1] = x1;
1539: }
1541: PetscCall(ISRestoreIndices(isrow, &r));
1542: PetscCall(VecRestoreArrayRead(bb, &b));
1543: PetscCall(VecRestoreArray(xx, &x));
1544: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1545: PetscFunctionReturn(PETSC_SUCCESS);
1546: }
1548: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1549: {
1550: const MatScalar *v, *diag;
1551: PetscScalar x0, x1;
1552: PetscInt nz, k, k2;
1553: const PetscInt *vj;
1555: PetscFunctionBegin;
1556: for (k = 0; k < mbs; k++) {
1557: v = aa + 4 * ai[k];
1558: vj = aj + ai[k];
1559: k2 = k * 2;
1560: x0 = x[k2];
1561: x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */
1562: nz = ai[k + 1] - ai[k];
1563: PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1564: PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1565: while (nz--) {
1566: /* x(:) += U(k,:)^T*(Dk*xk) */
1567: x[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1568: x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1569: vj++;
1570: v += 4;
1571: }
1572: /* xk = inv(Dk)*(Dk*xk) */
1573: diag = aa + k * 4; /* ptr to inv(Dk) */
1574: x[k2] = diag[0] * x0 + diag[2] * x1;
1575: x[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1576: }
1577: PetscFunctionReturn(PETSC_SUCCESS);
1578: }
1580: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1581: {
1582: const MatScalar *v;
1583: PetscScalar x0, x1;
1584: PetscInt nz, k, k2;
1585: const PetscInt *vj;
1587: PetscFunctionBegin;
1588: for (k = mbs - 1; k >= 0; k--) {
1589: v = aa + 4 * ai[k];
1590: vj = aj + ai[k];
1591: k2 = k * 2;
1592: x0 = x[k2];
1593: x1 = x[k2 + 1]; /* xk */
1594: nz = ai[k + 1] - ai[k];
1595: PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1596: PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1597: while (nz--) {
1598: /* xk += U(k,:)*x(:) */
1599: x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1];
1600: x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1];
1601: vj++;
1602: v += 4;
1603: }
1604: x[k2] = x0;
1605: x[k2 + 1] = x1;
1606: }
1607: PetscFunctionReturn(PETSC_SUCCESS);
1608: }
1610: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1611: {
1612: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1613: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1614: const MatScalar *aa = a->a;
1615: PetscScalar *x;
1616: const PetscScalar *b;
1618: PetscFunctionBegin;
1619: PetscCall(VecGetArrayRead(bb, &b));
1620: PetscCall(VecGetArray(xx, &x));
1622: /* solve U^T * D * y = b by forward substitution */
1623: PetscCall(PetscArraycpy(x, b, 2 * mbs));
1624: PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1626: /* solve U*x = y by back substitution */
1627: PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1629: PetscCall(VecRestoreArrayRead(bb, &b));
1630: PetscCall(VecRestoreArray(xx, &x));
1631: PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1632: PetscFunctionReturn(PETSC_SUCCESS);
1633: }
1635: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1636: {
1637: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1638: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1639: const MatScalar *aa = a->a;
1640: PetscScalar *x;
1641: const PetscScalar *b;
1643: PetscFunctionBegin;
1644: PetscCall(VecGetArrayRead(bb, &b));
1645: PetscCall(VecGetArray(xx, &x));
1646: PetscCall(PetscArraycpy(x, b, 2 * mbs));
1647: PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1648: PetscCall(VecRestoreArrayRead(bb, &b));
1649: PetscCall(VecRestoreArray(xx, &x));
1650: PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1651: PetscFunctionReturn(PETSC_SUCCESS);
1652: }
1654: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1655: {
1656: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1657: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1658: const MatScalar *aa = a->a;
1659: PetscScalar *x;
1660: const PetscScalar *b;
1662: PetscFunctionBegin;
1663: PetscCall(VecGetArrayRead(bb, &b));
1664: PetscCall(VecGetArray(xx, &x));
1665: PetscCall(PetscArraycpy(x, b, 2 * mbs));
1666: PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1667: PetscCall(VecRestoreArrayRead(bb, &b));
1668: PetscCall(VecRestoreArray(xx, &x));
1669: PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1670: PetscFunctionReturn(PETSC_SUCCESS);
1671: }
1673: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1674: {
1675: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1676: IS isrow = a->row;
1677: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1678: const MatScalar *aa = a->a, *v;
1679: const PetscScalar *b;
1680: PetscScalar *x, xk, *t;
1681: PetscInt nz, k, j;
1683: PetscFunctionBegin;
1684: PetscCall(VecGetArrayRead(bb, &b));
1685: PetscCall(VecGetArray(xx, &x));
1686: t = a->solve_work;
1687: PetscCall(ISGetIndices(isrow, &rp));
1689: /* solve U^T*D*y = perm(b) by forward substitution */
1690: for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1691: for (k = 0; k < mbs; k++) {
1692: v = aa + ai[k];
1693: vj = aj + ai[k];
1694: xk = t[k];
1695: nz = ai[k + 1] - ai[k] - 1;
1696: for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk;
1697: t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */
1698: }
1700: /* solve U*perm(x) = y by back substitution */
1701: for (k = mbs - 1; k >= 0; k--) {
1702: v = aa + adiag[k] - 1;
1703: vj = aj + adiag[k] - 1;
1704: nz = ai[k + 1] - ai[k] - 1;
1705: for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]];
1706: x[rp[k]] = t[k];
1707: }
1709: PetscCall(ISRestoreIndices(isrow, &rp));
1710: PetscCall(VecRestoreArrayRead(bb, &b));
1711: PetscCall(VecRestoreArray(xx, &x));
1712: PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs));
1713: PetscFunctionReturn(PETSC_SUCCESS);
1714: }
1716: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1717: {
1718: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1719: IS isrow = a->row;
1720: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1721: const MatScalar *aa = a->a, *v;
1722: PetscScalar *x, xk, *t;
1723: const PetscScalar *b;
1724: PetscInt nz, k;
1726: PetscFunctionBegin;
1727: PetscCall(VecGetArrayRead(bb, &b));
1728: PetscCall(VecGetArray(xx, &x));
1729: t = a->solve_work;
1730: PetscCall(ISGetIndices(isrow, &rp));
1732: /* solve U^T*D*y = perm(b) by forward substitution */
1733: for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1734: for (k = 0; k < mbs; k++) {
1735: v = aa + ai[k] + 1;
1736: vj = aj + ai[k] + 1;
1737: xk = t[k];
1738: nz = ai[k + 1] - ai[k] - 1;
1739: while (nz--) t[*vj++] += (*v++) * xk;
1740: t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */
1741: }
1743: /* solve U*perm(x) = y by back substitution */
1744: for (k = mbs - 1; k >= 0; k--) {
1745: v = aa + ai[k] + 1;
1746: vj = aj + ai[k] + 1;
1747: nz = ai[k + 1] - ai[k] - 1;
1748: while (nz--) t[k] += (*v++) * t[*vj++];
1749: x[rp[k]] = t[k];
1750: }
1752: PetscCall(ISRestoreIndices(isrow, &rp));
1753: PetscCall(VecRestoreArrayRead(bb, &b));
1754: PetscCall(VecRestoreArray(xx, &x));
1755: PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
1756: PetscFunctionReturn(PETSC_SUCCESS);
1757: }
1759: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1760: {
1761: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1762: IS isrow = a->row;
1763: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1764: const MatScalar *aa = a->a, *v;
1765: PetscReal diagk;
1766: PetscScalar *x, xk;
1767: const PetscScalar *b;
1768: PetscInt nz, k;
1770: PetscFunctionBegin;
1771: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1772: PetscCall(VecGetArrayRead(bb, &b));
1773: PetscCall(VecGetArray(xx, &x));
1774: PetscCall(ISGetIndices(isrow, &rp));
1776: for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1777: for (k = 0; k < mbs; k++) {
1778: v = aa + ai[k];
1779: vj = aj + ai[k];
1780: xk = x[k];
1781: nz = ai[k + 1] - ai[k] - 1;
1782: while (nz--) x[*vj++] += (*v++) * xk;
1784: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1785: PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1786: x[k] = xk * PetscSqrtReal(diagk);
1787: }
1788: PetscCall(ISRestoreIndices(isrow, &rp));
1789: PetscCall(VecRestoreArrayRead(bb, &b));
1790: PetscCall(VecRestoreArray(xx, &x));
1791: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1792: PetscFunctionReturn(PETSC_SUCCESS);
1793: }
1795: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1796: {
1797: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1798: IS isrow = a->row;
1799: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1800: const MatScalar *aa = a->a, *v;
1801: PetscReal diagk;
1802: PetscScalar *x, xk;
1803: const PetscScalar *b;
1804: PetscInt nz, k;
1806: PetscFunctionBegin;
1807: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1808: PetscCall(VecGetArrayRead(bb, &b));
1809: PetscCall(VecGetArray(xx, &x));
1810: PetscCall(ISGetIndices(isrow, &rp));
1812: for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1813: for (k = 0; k < mbs; k++) {
1814: v = aa + ai[k] + 1;
1815: vj = aj + ai[k] + 1;
1816: xk = x[k];
1817: nz = ai[k + 1] - ai[k] - 1;
1818: while (nz--) x[*vj++] += (*v++) * xk;
1820: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1821: PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1822: x[k] = xk * PetscSqrtReal(diagk);
1823: }
1824: PetscCall(ISRestoreIndices(isrow, &rp));
1825: PetscCall(VecRestoreArrayRead(bb, &b));
1826: PetscCall(VecRestoreArray(xx, &x));
1827: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1828: PetscFunctionReturn(PETSC_SUCCESS);
1829: }
1831: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1832: {
1833: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1834: IS isrow = a->row;
1835: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1836: const MatScalar *aa = a->a, *v;
1837: PetscReal diagk;
1838: PetscScalar *x, *t;
1839: const PetscScalar *b;
1840: PetscInt nz, k;
1842: PetscFunctionBegin;
1843: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1844: PetscCall(VecGetArrayRead(bb, &b));
1845: PetscCall(VecGetArray(xx, &x));
1846: t = a->solve_work;
1847: PetscCall(ISGetIndices(isrow, &rp));
1849: for (k = mbs - 1; k >= 0; k--) {
1850: v = aa + ai[k];
1851: vj = aj + ai[k];
1852: diagk = PetscRealPart(aa[adiag[k]]);
1853: PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1854: t[k] = b[k] * PetscSqrtReal(diagk);
1855: nz = ai[k + 1] - ai[k] - 1;
1856: while (nz--) t[k] += (*v++) * t[*vj++];
1857: x[rp[k]] = t[k];
1858: }
1859: PetscCall(ISRestoreIndices(isrow, &rp));
1860: PetscCall(VecRestoreArrayRead(bb, &b));
1861: PetscCall(VecRestoreArray(xx, &x));
1862: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1863: PetscFunctionReturn(PETSC_SUCCESS);
1864: }
1866: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1867: {
1868: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1869: IS isrow = a->row;
1870: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1871: const MatScalar *aa = a->a, *v;
1872: PetscReal diagk;
1873: PetscScalar *x, *t;
1874: const PetscScalar *b;
1875: PetscInt nz, k;
1877: PetscFunctionBegin;
1878: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1879: PetscCall(VecGetArrayRead(bb, &b));
1880: PetscCall(VecGetArray(xx, &x));
1881: t = a->solve_work;
1882: PetscCall(ISGetIndices(isrow, &rp));
1884: for (k = mbs - 1; k >= 0; k--) {
1885: v = aa + ai[k] + 1;
1886: vj = aj + ai[k] + 1;
1887: diagk = PetscRealPart(aa[ai[k]]);
1888: PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1889: t[k] = b[k] * PetscSqrtReal(diagk);
1890: nz = ai[k + 1] - ai[k] - 1;
1891: while (nz--) t[k] += (*v++) * t[*vj++];
1892: x[rp[k]] = t[k];
1893: }
1894: PetscCall(ISRestoreIndices(isrow, &rp));
1895: PetscCall(VecRestoreArrayRead(bb, &b));
1896: PetscCall(VecRestoreArray(xx, &x));
1897: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1898: PetscFunctionReturn(PETSC_SUCCESS);
1899: }
1901: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx)
1902: {
1903: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1905: PetscFunctionBegin;
1906: if (A->rmap->bs == 1) {
1907: PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v));
1908: } else {
1909: IS isrow = a->row;
1910: const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1911: const MatScalar *aa = a->a, *v;
1912: PetscScalar *x, *t;
1913: const PetscScalar *b;
1914: PetscInt nz, k, n, i, j;
1916: if (bb->n > a->solves_work_n) {
1917: PetscCall(PetscFree(a->solves_work));
1918: PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1919: a->solves_work_n = bb->n;
1920: }
1921: n = bb->n;
1922: PetscCall(VecGetArrayRead(bb->v, &b));
1923: PetscCall(VecGetArray(xx->v, &x));
1924: t = a->solves_work;
1926: PetscCall(ISGetIndices(isrow, &rp));
1928: /* solve U^T*D*y = perm(b) by forward substitution */
1929: for (k = 0; k < mbs; k++) {
1930: for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1931: }
1932: for (k = 0; k < mbs; k++) {
1933: v = aa + ai[k];
1934: vj = aj + ai[k];
1935: nz = ai[k + 1] - ai[k] - 1;
1936: for (j = 0; j < nz; j++) {
1937: for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1938: v++;
1939: vj++;
1940: }
1941: for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1942: }
1944: /* solve U*perm(x) = y by back substitution */
1945: for (k = mbs - 1; k >= 0; k--) {
1946: v = aa + ai[k] - 1;
1947: vj = aj + ai[k] - 1;
1948: nz = ai[k + 1] - ai[k] - 1;
1949: for (j = 0; j < nz; j++) {
1950: for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1951: v++;
1952: vj++;
1953: }
1954: for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1955: }
1957: PetscCall(ISRestoreIndices(isrow, &rp));
1958: PetscCall(VecRestoreArrayRead(bb->v, &b));
1959: PetscCall(VecRestoreArray(xx->v, &x));
1960: PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
1961: }
1962: PetscFunctionReturn(PETSC_SUCCESS);
1963: }
1965: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx)
1966: {
1967: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1969: PetscFunctionBegin;
1970: if (A->rmap->bs == 1) {
1971: PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v));
1972: } else {
1973: IS isrow = a->row;
1974: const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1975: const MatScalar *aa = a->a, *v;
1976: PetscScalar *x, *t;
1977: const PetscScalar *b;
1978: PetscInt nz, k, n, i;
1980: if (bb->n > a->solves_work_n) {
1981: PetscCall(PetscFree(a->solves_work));
1982: PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1983: a->solves_work_n = bb->n;
1984: }
1985: n = bb->n;
1986: PetscCall(VecGetArrayRead(bb->v, &b));
1987: PetscCall(VecGetArray(xx->v, &x));
1988: t = a->solves_work;
1990: PetscCall(ISGetIndices(isrow, &rp));
1992: /* solve U^T*D*y = perm(b) by forward substitution */
1993: for (k = 0; k < mbs; k++) {
1994: for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1995: }
1996: for (k = 0; k < mbs; k++) {
1997: v = aa + ai[k];
1998: vj = aj + ai[k];
1999: nz = ai[k + 1] - ai[k];
2000: while (nz--) {
2001: for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
2002: v++;
2003: vj++;
2004: }
2005: for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */
2006: }
2008: /* solve U*perm(x) = y by back substitution */
2009: for (k = mbs - 1; k >= 0; k--) {
2010: v = aa + ai[k];
2011: vj = aj + ai[k];
2012: nz = ai[k + 1] - ai[k];
2013: while (nz--) {
2014: for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
2015: v++;
2016: vj++;
2017: }
2018: for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
2019: }
2021: PetscCall(ISRestoreIndices(isrow, &rp));
2022: PetscCall(VecRestoreArrayRead(bb->v, &b));
2023: PetscCall(VecRestoreArray(xx->v, &x));
2024: PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
2025: }
2026: PetscFunctionReturn(PETSC_SUCCESS);
2027: }
2029: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2030: {
2031: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2032: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2033: const MatScalar *aa = a->a, *v;
2034: const PetscScalar *b;
2035: PetscScalar *x, xi;
2036: PetscInt nz, i, j;
2038: PetscFunctionBegin;
2039: PetscCall(VecGetArrayRead(bb, &b));
2040: PetscCall(VecGetArray(xx, &x));
2041: /* solve U^T*D*y = b by forward substitution */
2042: PetscCall(PetscArraycpy(x, b, mbs));
2043: for (i = 0; i < mbs; i++) {
2044: v = aa + ai[i];
2045: vj = aj + ai[i];
2046: xi = x[i];
2047: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2048: for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2049: x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2050: }
2051: /* solve U*x = y by backward substitution */
2052: for (i = mbs - 2; i >= 0; i--) {
2053: xi = x[i];
2054: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2055: vj = aj + adiag[i] - 1;
2056: nz = ai[i + 1] - ai[i] - 1;
2057: for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2058: x[i] = xi;
2059: }
2060: PetscCall(VecRestoreArrayRead(bb, &b));
2061: PetscCall(VecRestoreArray(xx, &x));
2062: PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2063: PetscFunctionReturn(PETSC_SUCCESS);
2064: }
2066: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X)
2067: {
2068: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2069: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2070: const MatScalar *aa = a->a, *v;
2071: const PetscScalar *b;
2072: PetscScalar *x, xi;
2073: PetscInt nz, i, j, neq, ldb, ldx;
2074: PetscBool isdense;
2076: PetscFunctionBegin;
2077: if (!mbs) PetscFunctionReturn(PETSC_SUCCESS);
2078: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
2079: PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
2080: if (X != B) {
2081: PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
2082: PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
2083: }
2084: PetscCall(MatDenseGetArrayRead(B, &b));
2085: PetscCall(MatDenseGetLDA(B, &ldb));
2086: PetscCall(MatDenseGetArray(X, &x));
2087: PetscCall(MatDenseGetLDA(X, &ldx));
2088: for (neq = 0; neq < B->cmap->n; neq++) {
2089: /* solve U^T*D*y = b by forward substitution */
2090: PetscCall(PetscArraycpy(x, b, mbs));
2091: for (i = 0; i < mbs; i++) {
2092: v = aa + ai[i];
2093: vj = aj + ai[i];
2094: xi = x[i];
2095: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2096: for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2097: x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2098: }
2099: /* solve U*x = y by backward substitution */
2100: for (i = mbs - 2; i >= 0; i--) {
2101: xi = x[i];
2102: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2103: vj = aj + adiag[i] - 1;
2104: nz = ai[i + 1] - ai[i] - 1;
2105: for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2106: x[i] = xi;
2107: }
2108: b += ldb;
2109: x += ldx;
2110: }
2111: PetscCall(MatDenseRestoreArrayRead(B, &b));
2112: PetscCall(MatDenseRestoreArray(X, &x));
2113: PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs)));
2114: PetscFunctionReturn(PETSC_SUCCESS);
2115: }
2117: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2118: {
2119: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2120: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2121: const MatScalar *aa = a->a, *v;
2122: PetscScalar *x, xk;
2123: const PetscScalar *b;
2124: PetscInt nz, k;
2126: PetscFunctionBegin;
2127: PetscCall(VecGetArrayRead(bb, &b));
2128: PetscCall(VecGetArray(xx, &x));
2130: /* solve U^T*D*y = b by forward substitution */
2131: PetscCall(PetscArraycpy(x, b, mbs));
2132: for (k = 0; k < mbs; k++) {
2133: v = aa + ai[k] + 1;
2134: vj = aj + ai[k] + 1;
2135: xk = x[k];
2136: nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2137: while (nz--) x[*vj++] += (*v++) * xk;
2138: x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2139: }
2141: /* solve U*x = y by back substitution */
2142: for (k = mbs - 2; k >= 0; k--) {
2143: v = aa + ai[k] + 1;
2144: vj = aj + ai[k] + 1;
2145: xk = x[k];
2146: nz = ai[k + 1] - ai[k] - 1;
2147: while (nz--) xk += (*v++) * x[*vj++];
2148: x[k] = xk;
2149: }
2151: PetscCall(VecRestoreArrayRead(bb, &b));
2152: PetscCall(VecRestoreArray(xx, &x));
2153: PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2154: PetscFunctionReturn(PETSC_SUCCESS);
2155: }
2157: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2158: {
2159: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2160: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2161: const MatScalar *aa = a->a, *v;
2162: PetscReal diagk;
2163: PetscScalar *x;
2164: const PetscScalar *b;
2165: PetscInt nz, k;
2167: PetscFunctionBegin;
2168: /* solve U^T*D^(1/2)*x = b by forward substitution */
2169: PetscCall(VecGetArrayRead(bb, &b));
2170: PetscCall(VecGetArray(xx, &x));
2171: PetscCall(PetscArraycpy(x, b, mbs));
2172: for (k = 0; k < mbs; k++) {
2173: v = aa + ai[k];
2174: vj = aj + ai[k];
2175: nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2176: while (nz--) x[*vj++] += (*v++) * x[k];
2177: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2178: PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]]));
2179: x[k] *= PetscSqrtReal(diagk);
2180: }
2181: PetscCall(VecRestoreArrayRead(bb, &b));
2182: PetscCall(VecRestoreArray(xx, &x));
2183: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2184: PetscFunctionReturn(PETSC_SUCCESS);
2185: }
2187: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2188: {
2189: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2190: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2191: const MatScalar *aa = a->a, *v;
2192: PetscReal diagk;
2193: PetscScalar *x;
2194: const PetscScalar *b;
2195: PetscInt nz, k;
2197: PetscFunctionBegin;
2198: /* solve U^T*D^(1/2)*x = b by forward substitution */
2199: PetscCall(VecGetArrayRead(bb, &b));
2200: PetscCall(VecGetArray(xx, &x));
2201: PetscCall(PetscArraycpy(x, b, mbs));
2202: for (k = 0; k < mbs; k++) {
2203: v = aa + ai[k] + 1;
2204: vj = aj + ai[k] + 1;
2205: nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2206: while (nz--) x[*vj++] += (*v++) * x[k];
2207: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2208: PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]]));
2209: x[k] *= PetscSqrtReal(diagk);
2210: }
2211: PetscCall(VecRestoreArrayRead(bb, &b));
2212: PetscCall(VecRestoreArray(xx, &x));
2213: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2214: PetscFunctionReturn(PETSC_SUCCESS);
2215: }
2217: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2218: {
2219: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2220: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2221: const MatScalar *aa = a->a, *v;
2222: PetscReal diagk;
2223: PetscScalar *x;
2224: const PetscScalar *b;
2225: PetscInt nz, k;
2227: PetscFunctionBegin;
2228: /* solve D^(1/2)*U*x = b by back substitution */
2229: PetscCall(VecGetArrayRead(bb, &b));
2230: PetscCall(VecGetArray(xx, &x));
2232: for (k = mbs - 1; k >= 0; k--) {
2233: v = aa + ai[k];
2234: vj = aj + ai[k];
2235: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2236: PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2237: x[k] = PetscSqrtReal(diagk) * b[k];
2238: nz = ai[k + 1] - ai[k] - 1;
2239: while (nz--) x[k] += (*v++) * x[*vj++];
2240: }
2241: PetscCall(VecRestoreArrayRead(bb, &b));
2242: PetscCall(VecRestoreArray(xx, &x));
2243: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2244: PetscFunctionReturn(PETSC_SUCCESS);
2245: }
2247: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2248: {
2249: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2250: const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2251: const MatScalar *aa = a->a, *v;
2252: PetscReal diagk;
2253: PetscScalar *x;
2254: const PetscScalar *b;
2255: PetscInt nz, k;
2257: PetscFunctionBegin;
2258: /* solve D^(1/2)*U*x = b by back substitution */
2259: PetscCall(VecGetArrayRead(bb, &b));
2260: PetscCall(VecGetArray(xx, &x));
2262: for (k = mbs - 1; k >= 0; k--) {
2263: v = aa + ai[k] + 1;
2264: vj = aj + ai[k] + 1;
2265: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2266: PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2267: x[k] = PetscSqrtReal(diagk) * b[k];
2268: nz = ai[k + 1] - ai[k] - 1;
2269: while (nz--) x[k] += (*v++) * x[*vj++];
2270: }
2271: PetscCall(VecRestoreArrayRead(bb, &b));
2272: PetscCall(VecRestoreArray(xx, &x));
2273: PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2274: PetscFunctionReturn(PETSC_SUCCESS);
2275: }
2277: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2278: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2279: {
2280: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b;
2281: const PetscInt *rip, mbs = a->mbs, *ai, *aj;
2282: PetscInt *jutmp, bs = A->rmap->bs, i;
2283: PetscInt m, reallocs = 0, *levtmp;
2284: PetscInt *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
2285: PetscInt incrlev, *lev, shift, prow, nz;
2286: PetscReal f = info->fill, levels = info->levels;
2287: PetscBool perm_identity;
2289: PetscFunctionBegin;
2290: /* check whether perm is the identity mapping */
2291: PetscCall(ISIdentity(perm, &perm_identity));
2293: PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2294: a->permute = PETSC_FALSE;
2295: ai = a->i;
2296: aj = a->j;
2298: /* initialization */
2299: PetscCall(ISGetIndices(perm, &rip));
2300: umax = (PetscInt)(f * ai[mbs] + 1);
2301: PetscCall(PetscMalloc1(umax, &lev));
2302: umax += mbs + 1;
2303: shift = mbs + 1;
2304: PetscCall(PetscMalloc1(mbs + 1, &iu));
2305: PetscCall(PetscMalloc1(umax, &ju));
2306: iu[0] = mbs + 1;
2307: juidx = mbs + 1;
2308: /* prowl: linked list for pivot row */
2309: PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp));
2310: /* q: linked list for col index */
2312: for (i = 0; i < mbs; i++) {
2313: prowl[i] = mbs;
2314: q[i] = 0;
2315: }
2317: /* for each row k */
2318: for (k = 0; k < mbs; k++) {
2319: nzk = 0;
2320: q[k] = mbs;
2321: /* copy current row into linked list */
2322: nz = ai[rip[k] + 1] - ai[rip[k]];
2323: j = ai[rip[k]];
2324: while (nz--) {
2325: vj = rip[aj[j++]];
2326: if (vj > k) {
2327: qm = k;
2328: do {
2329: m = qm;
2330: qm = q[m];
2331: } while (qm < vj);
2332: PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
2333: nzk++;
2334: q[m] = vj;
2335: q[vj] = qm;
2336: levtmp[vj] = 0; /* initialize lev for nonzero element */
2337: }
2338: }
2340: /* modify nonzero structure of k-th row by computing fill-in
2341: for each row prow to be merged in */
2342: prow = k;
2343: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2345: while (prow < k) {
2346: /* merge row prow into k-th row */
2347: jmin = iu[prow] + 1;
2348: jmax = iu[prow + 1];
2349: qm = k;
2350: for (j = jmin; j < jmax; j++) {
2351: incrlev = lev[j - shift] + 1;
2352: if (incrlev > levels) continue;
2353: vj = ju[j];
2354: do {
2355: m = qm;
2356: qm = q[m];
2357: } while (qm < vj);
2358: if (qm != vj) { /* a fill */
2359: nzk++;
2360: q[m] = vj;
2361: q[vj] = qm;
2362: qm = vj;
2363: levtmp[vj] = incrlev;
2364: } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2365: }
2366: prow = prowl[prow]; /* next pivot row */
2367: }
2369: /* add k to row list for first nonzero element in k-th row */
2370: if (nzk > 1) {
2371: i = q[k]; /* col value of first nonzero element in k_th row of U */
2372: prowl[k] = prowl[i];
2373: prowl[i] = k;
2374: }
2375: iu[k + 1] = iu[k] + nzk;
2377: /* allocate more space to ju and lev if needed */
2378: if (iu[k + 1] > umax) {
2379: /* estimate how much additional space we will need */
2380: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2381: /* just double the memory each time */
2382: maxadd = umax;
2383: if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
2384: umax += maxadd;
2386: /* allocate a longer ju */
2387: PetscCall(PetscMalloc1(umax, &jutmp));
2388: PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
2389: PetscCall(PetscFree(ju));
2390: ju = jutmp;
2392: PetscCall(PetscMalloc1(umax, &jutmp));
2393: PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift));
2394: PetscCall(PetscFree(lev));
2395: lev = jutmp;
2396: reallocs += 2; /* count how many times we realloc */
2397: }
2399: /* save nonzero structure of k-th row in ju */
2400: i = k;
2401: while (nzk--) {
2402: i = q[i];
2403: ju[juidx] = i;
2404: lev[juidx - shift] = levtmp[i];
2405: juidx++;
2406: }
2407: }
2409: #if defined(PETSC_USE_INFO)
2410: if (ai[mbs] != 0) {
2411: PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2412: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
2413: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2414: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
2415: PetscCall(PetscInfo(A, "for best performance.\n"));
2416: } else {
2417: PetscCall(PetscInfo(A, "Empty matrix\n"));
2418: }
2419: #endif
2421: PetscCall(ISRestoreIndices(perm, &rip));
2422: PetscCall(PetscFree3(prowl, q, levtmp));
2423: PetscCall(PetscFree(lev));
2425: /* put together the new matrix */
2426: PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL));
2428: b = (Mat_SeqSBAIJ *)(B)->data;
2429: PetscCall(PetscFree2(b->imax, b->ilen));
2431: b->singlemalloc = PETSC_FALSE;
2432: b->free_a = PETSC_TRUE;
2433: b->free_ij = PETSC_TRUE;
2434: /* the next line frees the default space generated by the Create() */
2435: PetscCall(PetscFree3(b->a, b->j, b->i));
2436: PetscCall(PetscMalloc1((iu[mbs] + 1) * a->bs2, &b->a));
2437: b->j = ju;
2438: b->i = iu;
2439: b->diag = NULL;
2440: b->ilen = NULL;
2441: b->imax = NULL;
2443: PetscCall(ISDestroy(&b->row));
2444: PetscCall(ISDestroy(&b->icol));
2445: b->row = perm;
2446: b->icol = perm;
2447: PetscCall(PetscObjectReference((PetscObject)perm));
2448: PetscCall(PetscObjectReference((PetscObject)perm));
2449: PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
2450: /* In b structure: Free imax, ilen, old a, old j.
2451: Allocate idnew, solve_work, new a, new j */
2452: b->maxnz = b->nz = iu[mbs];
2454: (B)->info.factor_mallocs = reallocs;
2455: (B)->info.fill_ratio_given = f;
2456: if (ai[mbs] != 0) {
2457: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2458: } else {
2459: (B)->info.fill_ratio_needed = 0.0;
2460: }
2461: PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity));
2462: PetscFunctionReturn(PETSC_SUCCESS);
2463: }
2465: /*
2466: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2467: */
2468: #include <petscbt.h>
2469: #include <../src/mat/utils/freespace.h>
2470: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2471: {
2472: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b;
2473: PetscBool perm_identity, free_ij = PETSC_TRUE, missing;
2474: PetscInt bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j;
2475: const PetscInt *rip;
2476: PetscInt reallocs = 0, i, *ui, *udiag, *cols;
2477: PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2478: PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr;
2479: PetscReal fill = info->fill, levels = info->levels;
2480: PetscFreeSpaceList free_space = NULL, current_space = NULL;
2481: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2482: PetscBT lnkbt;
2484: PetscFunctionBegin;
2485: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2486: PetscCall(MatMissingDiagonal(A, &missing, &d));
2487: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);
2488: if (bs > 1) {
2489: PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
2490: PetscFunctionReturn(PETSC_SUCCESS);
2491: }
2493: /* check whether perm is the identity mapping */
2494: PetscCall(ISIdentity(perm, &perm_identity));
2495: PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2496: a->permute = PETSC_FALSE;
2498: PetscCall(PetscMalloc1(am + 1, &ui));
2499: PetscCall(PetscMalloc1(am + 1, &udiag));
2500: ui[0] = 0;
2502: /* ICC(0) without matrix ordering: simply rearrange column indices */
2503: if (!levels) {
2504: /* reuse the column pointers and row offsets for memory savings */
2505: for (i = 0; i < am; i++) {
2506: ncols = ai[i + 1] - ai[i];
2507: ui[i + 1] = ui[i] + ncols;
2508: udiag[i] = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
2509: }
2510: PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2511: cols = uj;
2512: for (i = 0; i < am; i++) {
2513: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2514: ncols = ai[i + 1] - ai[i] - 1;
2515: for (j = 0; j < ncols; j++) *cols++ = aj[j];
2516: *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2517: }
2518: } else { /* case: levels>0 */
2519: PetscCall(ISGetIndices(perm, &rip));
2521: /* initialization */
2522: /* jl: linked list for storing indices of the pivot rows
2523: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2524: PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2525: for (i = 0; i < am; i++) {
2526: jl[i] = am;
2527: il[i] = 0;
2528: }
2530: /* create and initialize a linked list for storing column indices of the active row k */
2531: nlnk = am + 1;
2532: PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2534: /* initial FreeSpace size is fill*(ai[am]+1) */
2535: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2537: current_space = free_space;
2539: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2541: current_space_lvl = free_space_lvl;
2543: for (k = 0; k < am; k++) { /* for each active row k */
2544: /* initialize lnk by the column indices of row k */
2545: nzk = 0;
2546: ncols = ai[k + 1] - ai[k];
2547: PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
2548: cols = aj + ai[k];
2549: PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2550: nzk += nlnk;
2552: /* update lnk by computing fill-in for each pivot row to be merged in */
2553: prow = jl[k]; /* 1st pivot row */
2555: while (prow < k) {
2556: nextprow = jl[prow];
2558: /* merge prow into k-th row */
2559: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2560: jmax = ui[prow + 1];
2561: ncols = jmax - jmin;
2562: i = jmin - ui[prow];
2564: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2565: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2566: j = *(uj - 1);
2567: PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2568: nzk += nlnk;
2570: /* update il and jl for prow */
2571: if (jmin < jmax) {
2572: il[prow] = jmin;
2574: j = *cols;
2575: jl[prow] = jl[j];
2576: jl[j] = prow;
2577: }
2578: prow = nextprow;
2579: }
2581: /* if free space is not available, make more free space */
2582: if (current_space->local_remaining < nzk) {
2583: i = am - k + 1; /* num of unfactored rows */
2584: i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2585: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
2586: PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl));
2587: reallocs++;
2588: }
2590: /* copy data into free_space and free_space_lvl, then initialize lnk */
2591: PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k);
2592: PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2594: /* add the k-th row into il and jl */
2595: if (nzk > 1) {
2596: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2597: jl[k] = jl[i];
2598: jl[i] = k;
2599: il[k] = ui[k] + 1;
2600: }
2601: uj_ptr[k] = current_space->array;
2602: uj_lvl_ptr[k] = current_space_lvl->array;
2604: current_space->array += nzk;
2605: current_space->local_used += nzk;
2606: current_space->local_remaining -= nzk;
2607: current_space_lvl->array += nzk;
2608: current_space_lvl->local_used += nzk;
2609: current_space_lvl->local_remaining -= nzk;
2611: ui[k + 1] = ui[k] + nzk;
2612: }
2614: PetscCall(ISRestoreIndices(perm, &rip));
2615: PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2617: /* destroy list of free space and other temporary array(s) */
2618: PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2619: PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor */
2620: PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2621: PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2623: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2625: /* put together the new matrix in MATSEQSBAIJ format */
2626: PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2628: b = (Mat_SeqSBAIJ *)(fact)->data;
2629: PetscCall(PetscFree2(b->imax, b->ilen));
2631: b->singlemalloc = PETSC_FALSE;
2632: b->free_a = PETSC_TRUE;
2633: b->free_ij = free_ij;
2635: PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
2637: b->j = uj;
2638: b->i = ui;
2639: b->diag = udiag;
2640: b->free_diag = PETSC_TRUE;
2641: b->ilen = NULL;
2642: b->imax = NULL;
2643: b->row = perm;
2644: b->col = perm;
2646: PetscCall(PetscObjectReference((PetscObject)perm));
2647: PetscCall(PetscObjectReference((PetscObject)perm));
2649: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2651: PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2653: b->maxnz = b->nz = ui[am];
2655: fact->info.factor_mallocs = reallocs;
2656: fact->info.fill_ratio_given = fill;
2657: if (ai[am] != 0) {
2658: fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am];
2659: } else {
2660: fact->info.fill_ratio_needed = 0.0;
2661: }
2662: #if defined(PETSC_USE_INFO)
2663: if (ai[am] != 0) {
2664: PetscReal af = fact->info.fill_ratio_needed;
2665: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2666: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2667: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2668: } else {
2669: PetscCall(PetscInfo(A, "Empty matrix\n"));
2670: }
2671: #endif
2672: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2673: PetscFunctionReturn(PETSC_SUCCESS);
2674: }
2676: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2677: {
2678: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2679: Mat_SeqSBAIJ *b;
2680: PetscBool perm_identity, free_ij = PETSC_TRUE;
2681: PetscInt bs = A->rmap->bs, am = a->mbs;
2682: const PetscInt *cols, *rip, *ai = a->i, *aj = a->j;
2683: PetscInt reallocs = 0, i, *ui;
2684: PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2685: PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
2686: PetscReal fill = info->fill, levels = info->levels, ratio_needed;
2687: PetscFreeSpaceList free_space = NULL, current_space = NULL;
2688: PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2689: PetscBT lnkbt;
2691: PetscFunctionBegin;
2692: /*
2693: This code originally uses Modified Sparse Row (MSR) storage
2694: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2695: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2696: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2697: thus the original code in MSR format is still used for these cases.
2698: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2699: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2700: */
2701: if (bs > 1) {
2702: PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
2703: PetscFunctionReturn(PETSC_SUCCESS);
2704: }
2706: /* check whether perm is the identity mapping */
2707: PetscCall(ISIdentity(perm, &perm_identity));
2708: PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2709: a->permute = PETSC_FALSE;
2711: /* special case that simply copies fill pattern */
2712: if (!levels) {
2713: /* reuse the column pointers and row offsets for memory savings */
2714: ui = a->i;
2715: uj = a->j;
2716: free_ij = PETSC_FALSE;
2717: ratio_needed = 1.0;
2718: } else { /* case: levels>0 */
2719: PetscCall(ISGetIndices(perm, &rip));
2721: /* initialization */
2722: PetscCall(PetscMalloc1(am + 1, &ui));
2723: ui[0] = 0;
2725: /* jl: linked list for storing indices of the pivot rows
2726: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2727: PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2728: for (i = 0; i < am; i++) {
2729: jl[i] = am;
2730: il[i] = 0;
2731: }
2733: /* create and initialize a linked list for storing column indices of the active row k */
2734: nlnk = am + 1;
2735: PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2737: /* initial FreeSpace size is fill*(ai[am]+1) */
2738: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2740: current_space = free_space;
2742: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2744: current_space_lvl = free_space_lvl;
2746: for (k = 0; k < am; k++) { /* for each active row k */
2747: /* initialize lnk by the column indices of row rip[k] */
2748: nzk = 0;
2749: ncols = ai[rip[k] + 1] - ai[rip[k]];
2750: cols = aj + ai[rip[k]];
2751: PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2752: nzk += nlnk;
2754: /* update lnk by computing fill-in for each pivot row to be merged in */
2755: prow = jl[k]; /* 1st pivot row */
2757: while (prow < k) {
2758: nextprow = jl[prow];
2760: /* merge prow into k-th row */
2761: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2762: jmax = ui[prow + 1];
2763: ncols = jmax - jmin;
2764: i = jmin - ui[prow];
2765: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2766: j = *(uj_lvl_ptr[prow] + i - 1);
2767: cols_lvl = uj_lvl_ptr[prow] + i;
2768: PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2769: nzk += nlnk;
2771: /* update il and jl for prow */
2772: if (jmin < jmax) {
2773: il[prow] = jmin;
2774: j = *cols;
2775: jl[prow] = jl[j];
2776: jl[j] = prow;
2777: }
2778: prow = nextprow;
2779: }
2781: /* if free space is not available, make more free space */
2782: if (current_space->local_remaining < nzk) {
2783: i = am - k + 1; /* num of unfactored rows */
2784: i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2785: PetscCall(PetscFreeSpaceGet(i, ¤t_space));
2786: PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl));
2787: reallocs++;
2788: }
2790: /* copy data into free_space and free_space_lvl, then initialize lnk */
2791: PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2793: /* add the k-th row into il and jl */
2794: if (nzk - 1 > 0) {
2795: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2796: jl[k] = jl[i];
2797: jl[i] = k;
2798: il[k] = ui[k] + 1;
2799: }
2800: uj_ptr[k] = current_space->array;
2801: uj_lvl_ptr[k] = current_space_lvl->array;
2803: current_space->array += nzk;
2804: current_space->local_used += nzk;
2805: current_space->local_remaining -= nzk;
2806: current_space_lvl->array += nzk;
2807: current_space_lvl->local_used += nzk;
2808: current_space_lvl->local_remaining -= nzk;
2810: ui[k + 1] = ui[k] + nzk;
2811: }
2813: PetscCall(ISRestoreIndices(perm, &rip));
2814: PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2816: /* destroy list of free space and other temporary array(s) */
2817: PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2818: PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
2819: PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2820: PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2821: if (ai[am] != 0) {
2822: ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]);
2823: } else {
2824: ratio_needed = 0.0;
2825: }
2826: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2828: /* put together the new matrix in MATSEQSBAIJ format */
2829: PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2831: b = (Mat_SeqSBAIJ *)(fact)->data;
2833: PetscCall(PetscFree2(b->imax, b->ilen));
2835: b->singlemalloc = PETSC_FALSE;
2836: b->free_a = PETSC_TRUE;
2837: b->free_ij = free_ij;
2839: PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
2841: b->j = uj;
2842: b->i = ui;
2843: b->diag = NULL;
2844: b->ilen = NULL;
2845: b->imax = NULL;
2846: b->row = perm;
2847: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2849: PetscCall(PetscObjectReference((PetscObject)perm));
2850: b->icol = perm;
2851: PetscCall(PetscObjectReference((PetscObject)perm));
2852: PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2854: b->maxnz = b->nz = ui[am];
2856: fact->info.factor_mallocs = reallocs;
2857: fact->info.fill_ratio_given = fill;
2858: fact->info.fill_ratio_needed = ratio_needed;
2859: #if defined(PETSC_USE_INFO)
2860: if (ai[am] != 0) {
2861: PetscReal af = fact->info.fill_ratio_needed;
2862: PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2863: PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2864: PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2865: } else {
2866: PetscCall(PetscInfo(A, "Empty matrix\n"));
2867: }
2868: #endif
2869: if (perm_identity) {
2870: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2871: } else {
2872: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2873: }
2874: PetscFunctionReturn(PETSC_SUCCESS);
2875: }