Actual source code: sbaijfact2.c


  2: /*
  3:     Factorization code for SBAIJ format.
  4: */

  6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  7: #include <../src/mat/impls/baij/seq/baij.h>
  8: #include <petsc/private/kernels/blockinvert.h>

 10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
 11: {
 12:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
 13:   IS                 isrow = a->row;
 14:   PetscInt           mbs = a->mbs, *ai = a->i, *aj = a->j;
 15:   const PetscInt    *r;
 16:   PetscInt           nz, *vj, k, idx, k1;
 17:   PetscInt           bs = A->rmap->bs, bs2 = a->bs2;
 18:   const MatScalar   *aa = a->a, *v, *diag;
 19:   PetscScalar       *x, *xk, *xj, *xk_tmp, *t;
 20:   const PetscScalar *b;

 22:   PetscFunctionBegin;
 23:   PetscCall(VecGetArrayRead(bb, &b));
 24:   PetscCall(VecGetArray(xx, &x));
 25:   t = a->solve_work;
 26:   PetscCall(ISGetIndices(isrow, &r));
 27:   PetscCall(PetscMalloc1(bs, &xk_tmp));

 29:   /* solve U^T * D * y = b by forward substitution */
 30:   xk = t;
 31:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
 32:     idx = bs * r[k];
 33:     for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1];
 34:   }
 35:   for (k = 0; k < mbs; k++) {
 36:     v  = aa + bs2 * ai[k];
 37:     xk = t + k * bs;                          /* Dk*xk = k-th block of x */
 38:     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
 39:     nz = ai[k + 1] - ai[k];
 40:     vj = aj + ai[k];
 41:     xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */
 42:     while (nz--) {
 43:       /* x(:) += U(k,:)^T*(Dk*xk) */
 44:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
 45:       vj++;
 46:       xj = t + (*vj) * bs;
 47:       v += bs2;
 48:     }
 49:     /* xk = inv(Dk)*(Dk*xk) */
 50:     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
 51:     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
 52:   }

 54:   /* solve U*x = y by back substitution */
 55:   for (k = mbs - 1; k >= 0; k--) {
 56:     v  = aa + bs2 * ai[k];
 57:     xk = t + k * bs; /* xk */
 58:     nz = ai[k + 1] - ai[k];
 59:     vj = aj + ai[k];
 60:     xj = t + (*vj) * bs;
 61:     while (nz--) {
 62:       /* xk += U(k,:)*x(:) */
 63:       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
 64:       vj++;
 65:       v += bs2;
 66:       xj = t + (*vj) * bs;
 67:     }
 68:     idx = bs * r[k];
 69:     for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++;
 70:   }

 72:   PetscCall(PetscFree(xk_tmp));
 73:   PetscCall(ISRestoreIndices(isrow, &r));
 74:   PetscCall(VecRestoreArrayRead(bb, &b));
 75:   PetscCall(VecRestoreArray(xx, &x));
 76:   PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs));
 77:   PetscFunctionReturn(PETSC_SUCCESS);
 78: }

 80: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
 81: {
 82:   PetscFunctionBegin;
 83:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet");
 84: }

 86: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
 87: {
 88:   PetscFunctionBegin;
 89:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented");
 90: }

 92: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
 93: {
 94:   PetscInt         nz, k;
 95:   const PetscInt  *vj, bs2 = bs * bs;
 96:   const MatScalar *v, *diag;
 97:   PetscScalar     *xk, *xj, *xk_tmp;

 99:   PetscFunctionBegin;
100:   PetscCall(PetscMalloc1(bs, &xk_tmp));
101:   for (k = 0; k < mbs; k++) {
102:     v  = aa + bs2 * ai[k];
103:     xk = x + k * bs;                          /* Dk*xk = k-th block of x */
104:     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
105:     nz = ai[k + 1] - ai[k];
106:     vj = aj + ai[k];
107:     xj = x + (size_t)(*vj) * bs; /* *vj-th block of x, *vj>k */
108:     while (nz--) {
109:       /* x(:) += U(k,:)^T*(Dk*xk) */
110:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
111:       vj++;
112:       xj = x + (size_t)(*vj) * bs;
113:       v += bs2;
114:     }
115:     /* xk = inv(Dk)*(Dk*xk) */
116:     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
117:     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
118:   }
119:   PetscCall(PetscFree(xk_tmp));
120:   PetscFunctionReturn(PETSC_SUCCESS);
121: }

123: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
124: {
125:   PetscInt         nz, k;
126:   const PetscInt  *vj, bs2 = bs * bs;
127:   const MatScalar *v;
128:   PetscScalar     *xk, *xj;

130:   PetscFunctionBegin;
131:   for (k = mbs - 1; k >= 0; k--) {
132:     v  = aa + bs2 * ai[k];
133:     xk = x + k * bs; /* xk */
134:     nz = ai[k + 1] - ai[k];
135:     vj = aj + ai[k];
136:     xj = x + (size_t)(*vj) * bs;
137:     while (nz--) {
138:       /* xk += U(k,:)*x(:) */
139:       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
140:       vj++;
141:       v += bs2;
142:       xj = x + (size_t)(*vj) * bs;
143:     }
144:   }
145:   PetscFunctionReturn(PETSC_SUCCESS);
146: }

148: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
149: {
150:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
151:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
152:   PetscInt           bs = A->rmap->bs;
153:   const MatScalar   *aa = a->a;
154:   PetscScalar       *x;
155:   const PetscScalar *b;

157:   PetscFunctionBegin;
158:   PetscCall(VecGetArrayRead(bb, &b));
159:   PetscCall(VecGetArray(xx, &x));

161:   /* solve U^T * D * y = b by forward substitution */
162:   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
163:   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));

165:   /* solve U*x = y by back substitution */
166:   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));

168:   PetscCall(VecRestoreArrayRead(bb, &b));
169:   PetscCall(VecRestoreArray(xx, &x));
170:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs));
171:   PetscFunctionReturn(PETSC_SUCCESS);
172: }

174: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
175: {
176:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
177:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
178:   PetscInt           bs = A->rmap->bs;
179:   const MatScalar   *aa = a->a;
180:   const PetscScalar *b;
181:   PetscScalar       *x;

183:   PetscFunctionBegin;
184:   PetscCall(VecGetArrayRead(bb, &b));
185:   PetscCall(VecGetArray(xx, &x));
186:   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
187:   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
188:   PetscCall(VecRestoreArrayRead(bb, &b));
189:   PetscCall(VecRestoreArray(xx, &x));
190:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs));
191:   PetscFunctionReturn(PETSC_SUCCESS);
192: }

194: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
195: {
196:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
197:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
198:   PetscInt           bs = A->rmap->bs;
199:   const MatScalar   *aa = a->a;
200:   const PetscScalar *b;
201:   PetscScalar       *x;

203:   PetscFunctionBegin;
204:   PetscCall(VecGetArrayRead(bb, &b));
205:   PetscCall(VecGetArray(xx, &x));
206:   PetscCall(PetscArraycpy(x, b, bs * mbs));
207:   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
208:   PetscCall(VecRestoreArrayRead(bb, &b));
209:   PetscCall(VecRestoreArray(xx, &x));
210:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
211:   PetscFunctionReturn(PETSC_SUCCESS);
212: }

214: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
215: {
216:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
217:   IS                 isrow = a->row;
218:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
219:   PetscInt           nz, k, idx;
220:   const MatScalar   *aa = a->a, *v, *d;
221:   PetscScalar       *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp;
222:   const PetscScalar *b;

224:   PetscFunctionBegin;
225:   PetscCall(VecGetArrayRead(bb, &b));
226:   PetscCall(VecGetArray(xx, &x));
227:   t = a->solve_work;
228:   PetscCall(ISGetIndices(isrow, &r));

230:   /* solve U^T * D * y = b by forward substitution */
231:   tp = t;
232:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
233:     idx   = 7 * r[k];
234:     tp[0] = b[idx];
235:     tp[1] = b[idx + 1];
236:     tp[2] = b[idx + 2];
237:     tp[3] = b[idx + 3];
238:     tp[4] = b[idx + 4];
239:     tp[5] = b[idx + 5];
240:     tp[6] = b[idx + 6];
241:     tp += 7;
242:   }

244:   for (k = 0; k < mbs; k++) {
245:     v  = aa + 49 * ai[k];
246:     vj = aj + ai[k];
247:     tp = t + k * 7;
248:     x0 = tp[0];
249:     x1 = tp[1];
250:     x2 = tp[2];
251:     x3 = tp[3];
252:     x4 = tp[4];
253:     x5 = tp[5];
254:     x6 = tp[6];
255:     nz = ai[k + 1] - ai[k];
256:     tp = t + (*vj) * 7;
257:     while (nz--) {
258:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
259:       tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
260:       tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
261:       tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
262:       tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
263:       tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
264:       tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
265:       vj++;
266:       tp = t + (*vj) * 7;
267:       v += 49;
268:     }

270:     /* xk = inv(Dk)*(Dk*xk) */
271:     d     = aa + k * 49; /* ptr to inv(Dk) */
272:     tp    = t + k * 7;
273:     tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
274:     tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
275:     tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
276:     tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
277:     tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
278:     tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
279:     tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
280:   }

282:   /* solve U*x = y by back substitution */
283:   for (k = mbs - 1; k >= 0; k--) {
284:     v  = aa + 49 * ai[k];
285:     vj = aj + ai[k];
286:     tp = t + k * 7;
287:     x0 = tp[0];
288:     x1 = tp[1];
289:     x2 = tp[2];
290:     x3 = tp[3];
291:     x4 = tp[4];
292:     x5 = tp[5];
293:     x6 = tp[6]; /* xk */
294:     nz = ai[k + 1] - ai[k];

296:     tp = t + (*vj) * 7;
297:     while (nz--) {
298:       /* xk += U(k,:)*x(:) */
299:       x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6];
300:       x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6];
301:       x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6];
302:       x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6];
303:       x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6];
304:       x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6];
305:       x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6];
306:       vj++;
307:       tp = t + (*vj) * 7;
308:       v += 49;
309:     }
310:     tp         = t + k * 7;
311:     tp[0]      = x0;
312:     tp[1]      = x1;
313:     tp[2]      = x2;
314:     tp[3]      = x3;
315:     tp[4]      = x4;
316:     tp[5]      = x5;
317:     tp[6]      = x6;
318:     idx        = 7 * r[k];
319:     x[idx]     = x0;
320:     x[idx + 1] = x1;
321:     x[idx + 2] = x2;
322:     x[idx + 3] = x3;
323:     x[idx + 4] = x4;
324:     x[idx + 5] = x5;
325:     x[idx + 6] = x6;
326:   }

328:   PetscCall(ISRestoreIndices(isrow, &r));
329:   PetscCall(VecRestoreArrayRead(bb, &b));
330:   PetscCall(VecRestoreArray(xx, &x));
331:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
332:   PetscFunctionReturn(PETSC_SUCCESS);
333: }

335: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
336: {
337:   const MatScalar *v, *d;
338:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
339:   PetscInt         nz, k;
340:   const PetscInt  *vj;

342:   PetscFunctionBegin;
343:   for (k = 0; k < mbs; k++) {
344:     v  = aa + 49 * ai[k];
345:     xp = x + k * 7;
346:     x0 = xp[0];
347:     x1 = xp[1];
348:     x2 = xp[2];
349:     x3 = xp[3];
350:     x4 = xp[4];
351:     x5 = xp[5];
352:     x6 = xp[6]; /* Dk*xk = k-th block of x */
353:     nz = ai[k + 1] - ai[k];
354:     vj = aj + ai[k];
355:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
356:     PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
357:     while (nz--) {
358:       xp = x + (*vj) * 7;
359:       /* x(:) += U(k,:)^T*(Dk*xk) */
360:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
361:       xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
362:       xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
363:       xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
364:       xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
365:       xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
366:       xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
367:       vj++;
368:       v += 49;
369:     }
370:     /* xk = inv(Dk)*(Dk*xk) */
371:     d     = aa + k * 49; /* ptr to inv(Dk) */
372:     xp    = x + k * 7;
373:     xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
374:     xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
375:     xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
376:     xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
377:     xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
378:     xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
379:     xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
380:   }
381:   PetscFunctionReturn(PETSC_SUCCESS);
382: }

384: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
385: {
386:   const MatScalar *v;
387:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
388:   PetscInt         nz, k;
389:   const PetscInt  *vj;

391:   PetscFunctionBegin;
392:   for (k = mbs - 1; k >= 0; k--) {
393:     v  = aa + 49 * ai[k];
394:     xp = x + k * 7;
395:     x0 = xp[0];
396:     x1 = xp[1];
397:     x2 = xp[2];
398:     x3 = xp[3];
399:     x4 = xp[4];
400:     x5 = xp[5];
401:     x6 = xp[6]; /* xk */
402:     nz = ai[k + 1] - ai[k];
403:     vj = aj + ai[k];
404:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
405:     PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
406:     while (nz--) {
407:       xp = x + (*vj) * 7;
408:       /* xk += U(k,:)*x(:) */
409:       x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6];
410:       x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6];
411:       x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6];
412:       x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6];
413:       x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6];
414:       x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6];
415:       x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6];
416:       vj++;
417:       v += 49;
418:     }
419:     xp    = x + k * 7;
420:     xp[0] = x0;
421:     xp[1] = x1;
422:     xp[2] = x2;
423:     xp[3] = x3;
424:     xp[4] = x4;
425:     xp[5] = x5;
426:     xp[6] = x6;
427:   }
428:   PetscFunctionReturn(PETSC_SUCCESS);
429: }

431: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
432: {
433:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
434:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
435:   const MatScalar   *aa = a->a;
436:   PetscScalar       *x;
437:   const PetscScalar *b;

439:   PetscFunctionBegin;
440:   PetscCall(VecGetArrayRead(bb, &b));
441:   PetscCall(VecGetArray(xx, &x));

443:   /* solve U^T * D * y = b by forward substitution */
444:   PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */
445:   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));

447:   /* solve U*x = y by back substitution */
448:   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));

450:   PetscCall(VecRestoreArrayRead(bb, &b));
451:   PetscCall(VecRestoreArray(xx, &x));
452:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
453:   PetscFunctionReturn(PETSC_SUCCESS);
454: }

456: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
457: {
458:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
459:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
460:   const MatScalar   *aa = a->a;
461:   PetscScalar       *x;
462:   const PetscScalar *b;

464:   PetscFunctionBegin;
465:   PetscCall(VecGetArrayRead(bb, &b));
466:   PetscCall(VecGetArray(xx, &x));
467:   PetscCall(PetscArraycpy(x, b, 7 * mbs));
468:   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
469:   PetscCall(VecRestoreArrayRead(bb, &b));
470:   PetscCall(VecRestoreArray(xx, &x));
471:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
472:   PetscFunctionReturn(PETSC_SUCCESS);
473: }

475: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
476: {
477:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
478:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
479:   const MatScalar   *aa = a->a;
480:   PetscScalar       *x;
481:   const PetscScalar *b;

483:   PetscFunctionBegin;
484:   PetscCall(VecGetArrayRead(bb, &b));
485:   PetscCall(VecGetArray(xx, &x));
486:   PetscCall(PetscArraycpy(x, b, 7 * mbs));
487:   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
488:   PetscCall(VecRestoreArrayRead(bb, &b));
489:   PetscCall(VecRestoreArray(xx, &x));
490:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
491:   PetscFunctionReturn(PETSC_SUCCESS);
492: }

494: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
495: {
496:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
497:   IS                 isrow = a->row;
498:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
499:   PetscInt           nz, k, idx;
500:   const MatScalar   *aa = a->a, *v, *d;
501:   PetscScalar       *x, x0, x1, x2, x3, x4, x5, *t, *tp;
502:   const PetscScalar *b;

504:   PetscFunctionBegin;
505:   PetscCall(VecGetArrayRead(bb, &b));
506:   PetscCall(VecGetArray(xx, &x));
507:   t = a->solve_work;
508:   PetscCall(ISGetIndices(isrow, &r));

510:   /* solve U^T * D * y = b by forward substitution */
511:   tp = t;
512:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
513:     idx   = 6 * r[k];
514:     tp[0] = b[idx];
515:     tp[1] = b[idx + 1];
516:     tp[2] = b[idx + 2];
517:     tp[3] = b[idx + 3];
518:     tp[4] = b[idx + 4];
519:     tp[5] = b[idx + 5];
520:     tp += 6;
521:   }

523:   for (k = 0; k < mbs; k++) {
524:     v  = aa + 36 * ai[k];
525:     vj = aj + ai[k];
526:     tp = t + k * 6;
527:     x0 = tp[0];
528:     x1 = tp[1];
529:     x2 = tp[2];
530:     x3 = tp[3];
531:     x4 = tp[4];
532:     x5 = tp[5];
533:     nz = ai[k + 1] - ai[k];
534:     tp = t + (*vj) * 6;
535:     while (nz--) {
536:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
537:       tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
538:       tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
539:       tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
540:       tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
541:       tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
542:       vj++;
543:       tp = t + (*vj) * 6;
544:       v += 36;
545:     }

547:     /* xk = inv(Dk)*(Dk*xk) */
548:     d     = aa + k * 36; /* ptr to inv(Dk) */
549:     tp    = t + k * 6;
550:     tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
551:     tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
552:     tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
553:     tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
554:     tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
555:     tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
556:   }

558:   /* solve U*x = y by back substitution */
559:   for (k = mbs - 1; k >= 0; k--) {
560:     v  = aa + 36 * ai[k];
561:     vj = aj + ai[k];
562:     tp = t + k * 6;
563:     x0 = tp[0];
564:     x1 = tp[1];
565:     x2 = tp[2];
566:     x3 = tp[3];
567:     x4 = tp[4];
568:     x5 = tp[5]; /* xk */
569:     nz = ai[k + 1] - ai[k];

571:     tp = t + (*vj) * 6;
572:     while (nz--) {
573:       /* xk += U(k,:)*x(:) */
574:       x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5];
575:       x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5];
576:       x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5];
577:       x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5];
578:       x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5];
579:       x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5];
580:       vj++;
581:       tp = t + (*vj) * 6;
582:       v += 36;
583:     }
584:     tp         = t + k * 6;
585:     tp[0]      = x0;
586:     tp[1]      = x1;
587:     tp[2]      = x2;
588:     tp[3]      = x3;
589:     tp[4]      = x4;
590:     tp[5]      = x5;
591:     idx        = 6 * r[k];
592:     x[idx]     = x0;
593:     x[idx + 1] = x1;
594:     x[idx + 2] = x2;
595:     x[idx + 3] = x3;
596:     x[idx + 4] = x4;
597:     x[idx + 5] = x5;
598:   }

600:   PetscCall(ISRestoreIndices(isrow, &r));
601:   PetscCall(VecRestoreArrayRead(bb, &b));
602:   PetscCall(VecRestoreArray(xx, &x));
603:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
604:   PetscFunctionReturn(PETSC_SUCCESS);
605: }

607: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
608: {
609:   const MatScalar *v, *d;
610:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
611:   PetscInt         nz, k;
612:   const PetscInt  *vj;

614:   PetscFunctionBegin;
615:   for (k = 0; k < mbs; k++) {
616:     v  = aa + 36 * ai[k];
617:     xp = x + k * 6;
618:     x0 = xp[0];
619:     x1 = xp[1];
620:     x2 = xp[2];
621:     x3 = xp[3];
622:     x4 = xp[4];
623:     x5 = xp[5]; /* Dk*xk = k-th block of x */
624:     nz = ai[k + 1] - ai[k];
625:     vj = aj + ai[k];
626:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
627:     PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
628:     while (nz--) {
629:       xp = x + (*vj) * 6;
630:       /* x(:) += U(k,:)^T*(Dk*xk) */
631:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
632:       xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
633:       xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
634:       xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
635:       xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
636:       xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
637:       vj++;
638:       v += 36;
639:     }
640:     /* xk = inv(Dk)*(Dk*xk) */
641:     d     = aa + k * 36; /* ptr to inv(Dk) */
642:     xp    = x + k * 6;
643:     xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
644:     xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
645:     xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
646:     xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
647:     xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
648:     xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
649:   }
650:   PetscFunctionReturn(PETSC_SUCCESS);
651: }
652: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
653: {
654:   const MatScalar *v;
655:   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
656:   PetscInt         nz, k;
657:   const PetscInt  *vj;

659:   PetscFunctionBegin;
660:   for (k = mbs - 1; k >= 0; k--) {
661:     v  = aa + 36 * ai[k];
662:     xp = x + k * 6;
663:     x0 = xp[0];
664:     x1 = xp[1];
665:     x2 = xp[2];
666:     x3 = xp[3];
667:     x4 = xp[4];
668:     x5 = xp[5]; /* xk */
669:     nz = ai[k + 1] - ai[k];
670:     vj = aj + ai[k];
671:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
672:     PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
673:     while (nz--) {
674:       xp = x + (*vj) * 6;
675:       /* xk += U(k,:)*x(:) */
676:       x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5];
677:       x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5];
678:       x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5];
679:       x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5];
680:       x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5];
681:       x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5];
682:       vj++;
683:       v += 36;
684:     }
685:     xp    = x + k * 6;
686:     xp[0] = x0;
687:     xp[1] = x1;
688:     xp[2] = x2;
689:     xp[3] = x3;
690:     xp[4] = x4;
691:     xp[5] = x5;
692:   }
693:   PetscFunctionReturn(PETSC_SUCCESS);
694: }

696: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
697: {
698:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
699:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
700:   const MatScalar   *aa = a->a;
701:   PetscScalar       *x;
702:   const PetscScalar *b;

704:   PetscFunctionBegin;
705:   PetscCall(VecGetArrayRead(bb, &b));
706:   PetscCall(VecGetArray(xx, &x));

708:   /* solve U^T * D * y = b by forward substitution */
709:   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
710:   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));

712:   /* solve U*x = y by back substitution */
713:   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));

715:   PetscCall(VecRestoreArrayRead(bb, &b));
716:   PetscCall(VecRestoreArray(xx, &x));
717:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
718:   PetscFunctionReturn(PETSC_SUCCESS);
719: }

721: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
722: {
723:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
724:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
725:   const MatScalar   *aa = a->a;
726:   PetscScalar       *x;
727:   const PetscScalar *b;

729:   PetscFunctionBegin;
730:   PetscCall(VecGetArrayRead(bb, &b));
731:   PetscCall(VecGetArray(xx, &x));
732:   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
733:   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
734:   PetscCall(VecRestoreArrayRead(bb, &b));
735:   PetscCall(VecRestoreArray(xx, &x));
736:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
737:   PetscFunctionReturn(PETSC_SUCCESS);
738: }

740: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
741: {
742:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
743:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
744:   const MatScalar   *aa = a->a;
745:   PetscScalar       *x;
746:   const PetscScalar *b;

748:   PetscFunctionBegin;
749:   PetscCall(VecGetArrayRead(bb, &b));
750:   PetscCall(VecGetArray(xx, &x));
751:   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
752:   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
753:   PetscCall(VecRestoreArrayRead(bb, &b));
754:   PetscCall(VecRestoreArray(xx, &x));
755:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
756:   PetscFunctionReturn(PETSC_SUCCESS);
757: }

759: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
760: {
761:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
762:   IS                 isrow = a->row;
763:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
764:   const PetscInt    *r, *vj;
765:   PetscInt           nz, k, idx;
766:   const MatScalar   *aa = a->a, *v, *diag;
767:   PetscScalar       *x, x0, x1, x2, x3, x4, *t, *tp;
768:   const PetscScalar *b;

770:   PetscFunctionBegin;
771:   PetscCall(VecGetArrayRead(bb, &b));
772:   PetscCall(VecGetArray(xx, &x));
773:   t = a->solve_work;
774:   PetscCall(ISGetIndices(isrow, &r));

776:   /* solve U^T * D * y = b by forward substitution */
777:   tp = t;
778:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
779:     idx   = 5 * r[k];
780:     tp[0] = b[idx];
781:     tp[1] = b[idx + 1];
782:     tp[2] = b[idx + 2];
783:     tp[3] = b[idx + 3];
784:     tp[4] = b[idx + 4];
785:     tp += 5;
786:   }

788:   for (k = 0; k < mbs; k++) {
789:     v  = aa + 25 * ai[k];
790:     vj = aj + ai[k];
791:     tp = t + k * 5;
792:     x0 = tp[0];
793:     x1 = tp[1];
794:     x2 = tp[2];
795:     x3 = tp[3];
796:     x4 = tp[4];
797:     nz = ai[k + 1] - ai[k];

799:     tp = t + (*vj) * 5;
800:     while (nz--) {
801:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
802:       tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
803:       tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
804:       tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
805:       tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
806:       vj++;
807:       tp = t + (*vj) * 5;
808:       v += 25;
809:     }

811:     /* xk = inv(Dk)*(Dk*xk) */
812:     diag  = aa + k * 25; /* ptr to inv(Dk) */
813:     tp    = t + k * 5;
814:     tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
815:     tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
816:     tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
817:     tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
818:     tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
819:   }

821:   /* solve U*x = y by back substitution */
822:   for (k = mbs - 1; k >= 0; k--) {
823:     v  = aa + 25 * ai[k];
824:     vj = aj + ai[k];
825:     tp = t + k * 5;
826:     x0 = tp[0];
827:     x1 = tp[1];
828:     x2 = tp[2];
829:     x3 = tp[3];
830:     x4 = tp[4]; /* xk */
831:     nz = ai[k + 1] - ai[k];

833:     tp = t + (*vj) * 5;
834:     while (nz--) {
835:       /* xk += U(k,:)*x(:) */
836:       x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4];
837:       x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4];
838:       x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4];
839:       x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4];
840:       x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4];
841:       vj++;
842:       tp = t + (*vj) * 5;
843:       v += 25;
844:     }
845:     tp         = t + k * 5;
846:     tp[0]      = x0;
847:     tp[1]      = x1;
848:     tp[2]      = x2;
849:     tp[3]      = x3;
850:     tp[4]      = x4;
851:     idx        = 5 * r[k];
852:     x[idx]     = x0;
853:     x[idx + 1] = x1;
854:     x[idx + 2] = x2;
855:     x[idx + 3] = x3;
856:     x[idx + 4] = x4;
857:   }

859:   PetscCall(ISRestoreIndices(isrow, &r));
860:   PetscCall(VecRestoreArrayRead(bb, &b));
861:   PetscCall(VecRestoreArray(xx, &x));
862:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
863:   PetscFunctionReturn(PETSC_SUCCESS);
864: }

866: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
867: {
868:   const MatScalar *v, *diag;
869:   PetscScalar     *xp, x0, x1, x2, x3, x4;
870:   PetscInt         nz, k;
871:   const PetscInt  *vj;

873:   PetscFunctionBegin;
874:   for (k = 0; k < mbs; k++) {
875:     v  = aa + 25 * ai[k];
876:     xp = x + k * 5;
877:     x0 = xp[0];
878:     x1 = xp[1];
879:     x2 = xp[2];
880:     x3 = xp[3];
881:     x4 = xp[4]; /* Dk*xk = k-th block of x */
882:     nz = ai[k + 1] - ai[k];
883:     vj = aj + ai[k];
884:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
885:     PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
886:     while (nz--) {
887:       xp = x + (*vj) * 5;
888:       /* x(:) += U(k,:)^T*(Dk*xk) */
889:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
890:       xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
891:       xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
892:       xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
893:       xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
894:       vj++;
895:       v += 25;
896:     }
897:     /* xk = inv(Dk)*(Dk*xk) */
898:     diag  = aa + k * 25; /* ptr to inv(Dk) */
899:     xp    = x + k * 5;
900:     xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
901:     xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
902:     xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
903:     xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
904:     xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
905:   }
906:   PetscFunctionReturn(PETSC_SUCCESS);
907: }

909: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
910: {
911:   const MatScalar *v;
912:   PetscScalar     *xp, x0, x1, x2, x3, x4;
913:   PetscInt         nz, k;
914:   const PetscInt  *vj;

916:   PetscFunctionBegin;
917:   for (k = mbs - 1; k >= 0; k--) {
918:     v  = aa + 25 * ai[k];
919:     xp = x + k * 5;
920:     x0 = xp[0];
921:     x1 = xp[1];
922:     x2 = xp[2];
923:     x3 = xp[3];
924:     x4 = xp[4]; /* xk */
925:     nz = ai[k + 1] - ai[k];
926:     vj = aj + ai[k];
927:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
928:     PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
929:     while (nz--) {
930:       xp = x + (*vj) * 5;
931:       /* xk += U(k,:)*x(:) */
932:       x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4];
933:       x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4];
934:       x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4];
935:       x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4];
936:       x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4];
937:       vj++;
938:       v += 25;
939:     }
940:     xp    = x + k * 5;
941:     xp[0] = x0;
942:     xp[1] = x1;
943:     xp[2] = x2;
944:     xp[3] = x3;
945:     xp[4] = x4;
946:   }
947:   PetscFunctionReturn(PETSC_SUCCESS);
948: }

950: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
951: {
952:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
953:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
954:   const MatScalar   *aa = a->a;
955:   PetscScalar       *x;
956:   const PetscScalar *b;

958:   PetscFunctionBegin;
959:   PetscCall(VecGetArrayRead(bb, &b));
960:   PetscCall(VecGetArray(xx, &x));

962:   /* solve U^T * D * y = b by forward substitution */
963:   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
964:   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));

966:   /* solve U*x = y by back substitution */
967:   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));

969:   PetscCall(VecRestoreArrayRead(bb, &b));
970:   PetscCall(VecRestoreArray(xx, &x));
971:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
972:   PetscFunctionReturn(PETSC_SUCCESS);
973: }

975: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
976: {
977:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
978:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
979:   const MatScalar   *aa = a->a;
980:   PetscScalar       *x;
981:   const PetscScalar *b;

983:   PetscFunctionBegin;
984:   PetscCall(VecGetArrayRead(bb, &b));
985:   PetscCall(VecGetArray(xx, &x));
986:   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
987:   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
988:   PetscCall(VecRestoreArrayRead(bb, &b));
989:   PetscCall(VecRestoreArray(xx, &x));
990:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
991:   PetscFunctionReturn(PETSC_SUCCESS);
992: }

994: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
995: {
996:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
997:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
998:   const MatScalar   *aa = a->a;
999:   PetscScalar       *x;
1000:   const PetscScalar *b;

1002:   PetscFunctionBegin;
1003:   PetscCall(VecGetArrayRead(bb, &b));
1004:   PetscCall(VecGetArray(xx, &x));
1005:   PetscCall(PetscArraycpy(x, b, 5 * mbs));
1006:   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
1007:   PetscCall(VecRestoreArrayRead(bb, &b));
1008:   PetscCall(VecRestoreArray(xx, &x));
1009:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1010:   PetscFunctionReturn(PETSC_SUCCESS);
1011: }

1013: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
1014: {
1015:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1016:   IS                 isrow = a->row;
1017:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1018:   const PetscInt    *r, *vj;
1019:   PetscInt           nz, k, idx;
1020:   const MatScalar   *aa = a->a, *v, *diag;
1021:   PetscScalar       *x, x0, x1, x2, x3, *t, *tp;
1022:   const PetscScalar *b;

1024:   PetscFunctionBegin;
1025:   PetscCall(VecGetArrayRead(bb, &b));
1026:   PetscCall(VecGetArray(xx, &x));
1027:   t = a->solve_work;
1028:   PetscCall(ISGetIndices(isrow, &r));

1030:   /* solve U^T * D * y = b by forward substitution */
1031:   tp = t;
1032:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1033:     idx   = 4 * r[k];
1034:     tp[0] = b[idx];
1035:     tp[1] = b[idx + 1];
1036:     tp[2] = b[idx + 2];
1037:     tp[3] = b[idx + 3];
1038:     tp += 4;
1039:   }

1041:   for (k = 0; k < mbs; k++) {
1042:     v  = aa + 16 * ai[k];
1043:     vj = aj + ai[k];
1044:     tp = t + k * 4;
1045:     x0 = tp[0];
1046:     x1 = tp[1];
1047:     x2 = tp[2];
1048:     x3 = tp[3];
1049:     nz = ai[k + 1] - ai[k];

1051:     tp = t + (*vj) * 4;
1052:     while (nz--) {
1053:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1054:       tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1055:       tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1056:       tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1057:       vj++;
1058:       tp = t + (*vj) * 4;
1059:       v += 16;
1060:     }

1062:     /* xk = inv(Dk)*(Dk*xk) */
1063:     diag  = aa + k * 16; /* ptr to inv(Dk) */
1064:     tp    = t + k * 4;
1065:     tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1066:     tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1067:     tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1068:     tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1069:   }

1071:   /* solve U*x = y by back substitution */
1072:   for (k = mbs - 1; k >= 0; k--) {
1073:     v  = aa + 16 * ai[k];
1074:     vj = aj + ai[k];
1075:     tp = t + k * 4;
1076:     x0 = tp[0];
1077:     x1 = tp[1];
1078:     x2 = tp[2];
1079:     x3 = tp[3]; /* xk */
1080:     nz = ai[k + 1] - ai[k];

1082:     tp = t + (*vj) * 4;
1083:     while (nz--) {
1084:       /* xk += U(k,:)*x(:) */
1085:       x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3];
1086:       x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3];
1087:       x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3];
1088:       x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3];
1089:       vj++;
1090:       tp = t + (*vj) * 4;
1091:       v += 16;
1092:     }
1093:     tp         = t + k * 4;
1094:     tp[0]      = x0;
1095:     tp[1]      = x1;
1096:     tp[2]      = x2;
1097:     tp[3]      = x3;
1098:     idx        = 4 * r[k];
1099:     x[idx]     = x0;
1100:     x[idx + 1] = x1;
1101:     x[idx + 2] = x2;
1102:     x[idx + 3] = x3;
1103:   }

1105:   PetscCall(ISRestoreIndices(isrow, &r));
1106:   PetscCall(VecRestoreArrayRead(bb, &b));
1107:   PetscCall(VecRestoreArray(xx, &x));
1108:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1109:   PetscFunctionReturn(PETSC_SUCCESS);
1110: }

1112: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1113: {
1114:   const MatScalar *v, *diag;
1115:   PetscScalar     *xp, x0, x1, x2, x3;
1116:   PetscInt         nz, k;
1117:   const PetscInt  *vj;

1119:   PetscFunctionBegin;
1120:   for (k = 0; k < mbs; k++) {
1121:     v  = aa + 16 * ai[k];
1122:     xp = x + k * 4;
1123:     x0 = xp[0];
1124:     x1 = xp[1];
1125:     x2 = xp[2];
1126:     x3 = xp[3]; /* Dk*xk = k-th block of x */
1127:     nz = ai[k + 1] - ai[k];
1128:     vj = aj + ai[k];
1129:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1130:     PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1131:     while (nz--) {
1132:       xp = x + (*vj) * 4;
1133:       /* x(:) += U(k,:)^T*(Dk*xk) */
1134:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1135:       xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1136:       xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1137:       xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1138:       vj++;
1139:       v += 16;
1140:     }
1141:     /* xk = inv(Dk)*(Dk*xk) */
1142:     diag  = aa + k * 16; /* ptr to inv(Dk) */
1143:     xp    = x + k * 4;
1144:     xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1145:     xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1146:     xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1147:     xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1148:   }
1149:   PetscFunctionReturn(PETSC_SUCCESS);
1150: }

1152: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1153: {
1154:   const MatScalar *v;
1155:   PetscScalar     *xp, x0, x1, x2, x3;
1156:   PetscInt         nz, k;
1157:   const PetscInt  *vj;

1159:   PetscFunctionBegin;
1160:   for (k = mbs - 1; k >= 0; k--) {
1161:     v  = aa + 16 * ai[k];
1162:     xp = x + k * 4;
1163:     x0 = xp[0];
1164:     x1 = xp[1];
1165:     x2 = xp[2];
1166:     x3 = xp[3]; /* xk */
1167:     nz = ai[k + 1] - ai[k];
1168:     vj = aj + ai[k];
1169:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1170:     PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1171:     while (nz--) {
1172:       xp = x + (*vj) * 4;
1173:       /* xk += U(k,:)*x(:) */
1174:       x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3];
1175:       x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3];
1176:       x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3];
1177:       x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3];
1178:       vj++;
1179:       v += 16;
1180:     }
1181:     xp    = x + k * 4;
1182:     xp[0] = x0;
1183:     xp[1] = x1;
1184:     xp[2] = x2;
1185:     xp[3] = x3;
1186:   }
1187:   PetscFunctionReturn(PETSC_SUCCESS);
1188: }

1190: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1191: {
1192:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1193:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1194:   const MatScalar   *aa = a->a;
1195:   PetscScalar       *x;
1196:   const PetscScalar *b;

1198:   PetscFunctionBegin;
1199:   PetscCall(VecGetArrayRead(bb, &b));
1200:   PetscCall(VecGetArray(xx, &x));

1202:   /* solve U^T * D * y = b by forward substitution */
1203:   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1204:   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));

1206:   /* solve U*x = y by back substitution */
1207:   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1208:   PetscCall(VecRestoreArrayRead(bb, &b));
1209:   PetscCall(VecRestoreArray(xx, &x));
1210:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1211:   PetscFunctionReturn(PETSC_SUCCESS);
1212: }

1214: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1215: {
1216:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1217:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1218:   const MatScalar   *aa = a->a;
1219:   PetscScalar       *x;
1220:   const PetscScalar *b;

1222:   PetscFunctionBegin;
1223:   PetscCall(VecGetArrayRead(bb, &b));
1224:   PetscCall(VecGetArray(xx, &x));
1225:   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1226:   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1227:   PetscCall(VecRestoreArrayRead(bb, &b));
1228:   PetscCall(VecRestoreArray(xx, &x));
1229:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1230:   PetscFunctionReturn(PETSC_SUCCESS);
1231: }

1233: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1234: {
1235:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1236:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1237:   const MatScalar   *aa = a->a;
1238:   PetscScalar       *x;
1239:   const PetscScalar *b;

1241:   PetscFunctionBegin;
1242:   PetscCall(VecGetArrayRead(bb, &b));
1243:   PetscCall(VecGetArray(xx, &x));
1244:   PetscCall(PetscArraycpy(x, b, 4 * mbs));
1245:   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1246:   PetscCall(VecRestoreArrayRead(bb, &b));
1247:   PetscCall(VecRestoreArray(xx, &x));
1248:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1249:   PetscFunctionReturn(PETSC_SUCCESS);
1250: }

1252: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1253: {
1254:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1255:   IS                 isrow = a->row;
1256:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1257:   const PetscInt    *r;
1258:   PetscInt           nz, k, idx;
1259:   const PetscInt    *vj;
1260:   const MatScalar   *aa = a->a, *v, *diag;
1261:   PetscScalar       *x, x0, x1, x2, *t, *tp;
1262:   const PetscScalar *b;

1264:   PetscFunctionBegin;
1265:   PetscCall(VecGetArrayRead(bb, &b));
1266:   PetscCall(VecGetArray(xx, &x));
1267:   t = a->solve_work;
1268:   PetscCall(ISGetIndices(isrow, &r));

1270:   /* solve U^T * D * y = b by forward substitution */
1271:   tp = t;
1272:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1273:     idx   = 3 * r[k];
1274:     tp[0] = b[idx];
1275:     tp[1] = b[idx + 1];
1276:     tp[2] = b[idx + 2];
1277:     tp += 3;
1278:   }

1280:   for (k = 0; k < mbs; k++) {
1281:     v  = aa + 9 * ai[k];
1282:     vj = aj + ai[k];
1283:     tp = t + k * 3;
1284:     x0 = tp[0];
1285:     x1 = tp[1];
1286:     x2 = tp[2];
1287:     nz = ai[k + 1] - ai[k];

1289:     tp = t + (*vj) * 3;
1290:     while (nz--) {
1291:       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1292:       tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1293:       tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1294:       vj++;
1295:       tp = t + (*vj) * 3;
1296:       v += 9;
1297:     }

1299:     /* xk = inv(Dk)*(Dk*xk) */
1300:     diag  = aa + k * 9; /* ptr to inv(Dk) */
1301:     tp    = t + k * 3;
1302:     tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1303:     tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1304:     tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1305:   }

1307:   /* solve U*x = y by back substitution */
1308:   for (k = mbs - 1; k >= 0; k--) {
1309:     v  = aa + 9 * ai[k];
1310:     vj = aj + ai[k];
1311:     tp = t + k * 3;
1312:     x0 = tp[0];
1313:     x1 = tp[1];
1314:     x2 = tp[2]; /* xk */
1315:     nz = ai[k + 1] - ai[k];

1317:     tp = t + (*vj) * 3;
1318:     while (nz--) {
1319:       /* xk += U(k,:)*x(:) */
1320:       x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2];
1321:       x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2];
1322:       x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2];
1323:       vj++;
1324:       tp = t + (*vj) * 3;
1325:       v += 9;
1326:     }
1327:     tp         = t + k * 3;
1328:     tp[0]      = x0;
1329:     tp[1]      = x1;
1330:     tp[2]      = x2;
1331:     idx        = 3 * r[k];
1332:     x[idx]     = x0;
1333:     x[idx + 1] = x1;
1334:     x[idx + 2] = x2;
1335:   }

1337:   PetscCall(ISRestoreIndices(isrow, &r));
1338:   PetscCall(VecRestoreArrayRead(bb, &b));
1339:   PetscCall(VecRestoreArray(xx, &x));
1340:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1341:   PetscFunctionReturn(PETSC_SUCCESS);
1342: }

1344: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1345: {
1346:   const MatScalar *v, *diag;
1347:   PetscScalar     *xp, x0, x1, x2;
1348:   PetscInt         nz, k;
1349:   const PetscInt  *vj;

1351:   PetscFunctionBegin;
1352:   for (k = 0; k < mbs; k++) {
1353:     v  = aa + 9 * ai[k];
1354:     xp = x + k * 3;
1355:     x0 = xp[0];
1356:     x1 = xp[1];
1357:     x2 = xp[2]; /* Dk*xk = k-th block of x */
1358:     nz = ai[k + 1] - ai[k];
1359:     vj = aj + ai[k];
1360:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1361:     PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1362:     while (nz--) {
1363:       xp = x + (*vj) * 3;
1364:       /* x(:) += U(k,:)^T*(Dk*xk) */
1365:       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1366:       xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1367:       xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1368:       vj++;
1369:       v += 9;
1370:     }
1371:     /* xk = inv(Dk)*(Dk*xk) */
1372:     diag  = aa + k * 9; /* ptr to inv(Dk) */
1373:     xp    = x + k * 3;
1374:     xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1375:     xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1376:     xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1377:   }
1378:   PetscFunctionReturn(PETSC_SUCCESS);
1379: }

1381: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1382: {
1383:   const MatScalar *v;
1384:   PetscScalar     *xp, x0, x1, x2;
1385:   PetscInt         nz, k;
1386:   const PetscInt  *vj;

1388:   PetscFunctionBegin;
1389:   for (k = mbs - 1; k >= 0; k--) {
1390:     v  = aa + 9 * ai[k];
1391:     xp = x + k * 3;
1392:     x0 = xp[0];
1393:     x1 = xp[1];
1394:     x2 = xp[2]; /* xk */
1395:     nz = ai[k + 1] - ai[k];
1396:     vj = aj + ai[k];
1397:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1398:     PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1399:     while (nz--) {
1400:       xp = x + (*vj) * 3;
1401:       /* xk += U(k,:)*x(:) */
1402:       x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2];
1403:       x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2];
1404:       x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2];
1405:       vj++;
1406:       v += 9;
1407:     }
1408:     xp    = x + k * 3;
1409:     xp[0] = x0;
1410:     xp[1] = x1;
1411:     xp[2] = x2;
1412:   }
1413:   PetscFunctionReturn(PETSC_SUCCESS);
1414: }

1416: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1417: {
1418:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1419:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1420:   const MatScalar   *aa = a->a;
1421:   PetscScalar       *x;
1422:   const PetscScalar *b;

1424:   PetscFunctionBegin;
1425:   PetscCall(VecGetArrayRead(bb, &b));
1426:   PetscCall(VecGetArray(xx, &x));

1428:   /* solve U^T * D * y = b by forward substitution */
1429:   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1430:   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));

1432:   /* solve U*x = y by back substitution */
1433:   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));

1435:   PetscCall(VecRestoreArrayRead(bb, &b));
1436:   PetscCall(VecRestoreArray(xx, &x));
1437:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1438:   PetscFunctionReturn(PETSC_SUCCESS);
1439: }

1441: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1442: {
1443:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1444:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1445:   const MatScalar   *aa = a->a;
1446:   PetscScalar       *x;
1447:   const PetscScalar *b;

1449:   PetscFunctionBegin;
1450:   PetscCall(VecGetArrayRead(bb, &b));
1451:   PetscCall(VecGetArray(xx, &x));
1452:   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1453:   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1454:   PetscCall(VecRestoreArrayRead(bb, &b));
1455:   PetscCall(VecRestoreArray(xx, &x));
1456:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1457:   PetscFunctionReturn(PETSC_SUCCESS);
1458: }

1460: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1461: {
1462:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1463:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1464:   const MatScalar   *aa = a->a;
1465:   PetscScalar       *x;
1466:   const PetscScalar *b;

1468:   PetscFunctionBegin;
1469:   PetscCall(VecGetArrayRead(bb, &b));
1470:   PetscCall(VecGetArray(xx, &x));
1471:   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1472:   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1473:   PetscCall(VecRestoreArrayRead(bb, &b));
1474:   PetscCall(VecRestoreArray(xx, &x));
1475:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1476:   PetscFunctionReturn(PETSC_SUCCESS);
1477: }

1479: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1480: {
1481:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1482:   IS                 isrow = a->row;
1483:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1484:   const PetscInt    *r, *vj;
1485:   PetscInt           nz, k, k2, idx;
1486:   const MatScalar   *aa = a->a, *v, *diag;
1487:   PetscScalar       *x, x0, x1, *t;
1488:   const PetscScalar *b;

1490:   PetscFunctionBegin;
1491:   PetscCall(VecGetArrayRead(bb, &b));
1492:   PetscCall(VecGetArray(xx, &x));
1493:   t = a->solve_work;
1494:   PetscCall(ISGetIndices(isrow, &r));

1496:   /* solve U^T * D * y = perm(b) by forward substitution */
1497:   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1498:     idx          = 2 * r[k];
1499:     t[k * 2]     = b[idx];
1500:     t[k * 2 + 1] = b[idx + 1];
1501:   }
1502:   for (k = 0; k < mbs; k++) {
1503:     v  = aa + 4 * ai[k];
1504:     vj = aj + ai[k];
1505:     k2 = k * 2;
1506:     x0 = t[k2];
1507:     x1 = t[k2 + 1];
1508:     nz = ai[k + 1] - ai[k];
1509:     while (nz--) {
1510:       t[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1511:       t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1512:       vj++;
1513:       v += 4;
1514:     }
1515:     diag      = aa + k * 4; /* ptr to inv(Dk) */
1516:     t[k2]     = diag[0] * x0 + diag[2] * x1;
1517:     t[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1518:   }

1520:   /* solve U*x = y by back substitution */
1521:   for (k = mbs - 1; k >= 0; k--) {
1522:     v  = aa + 4 * ai[k];
1523:     vj = aj + ai[k];
1524:     k2 = k * 2;
1525:     x0 = t[k2];
1526:     x1 = t[k2 + 1];
1527:     nz = ai[k + 1] - ai[k];
1528:     while (nz--) {
1529:       x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1];
1530:       x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1];
1531:       vj++;
1532:       v += 4;
1533:     }
1534:     t[k2]      = x0;
1535:     t[k2 + 1]  = x1;
1536:     idx        = 2 * r[k];
1537:     x[idx]     = x0;
1538:     x[idx + 1] = x1;
1539:   }

1541:   PetscCall(ISRestoreIndices(isrow, &r));
1542:   PetscCall(VecRestoreArrayRead(bb, &b));
1543:   PetscCall(VecRestoreArray(xx, &x));
1544:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1545:   PetscFunctionReturn(PETSC_SUCCESS);
1546: }

1548: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1549: {
1550:   const MatScalar *v, *diag;
1551:   PetscScalar      x0, x1;
1552:   PetscInt         nz, k, k2;
1553:   const PetscInt  *vj;

1555:   PetscFunctionBegin;
1556:   for (k = 0; k < mbs; k++) {
1557:     v  = aa + 4 * ai[k];
1558:     vj = aj + ai[k];
1559:     k2 = k * 2;
1560:     x0 = x[k2];
1561:     x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */
1562:     nz = ai[k + 1] - ai[k];
1563:     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1564:     PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1565:     while (nz--) {
1566:       /* x(:) += U(k,:)^T*(Dk*xk) */
1567:       x[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1568:       x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1569:       vj++;
1570:       v += 4;
1571:     }
1572:     /* xk = inv(Dk)*(Dk*xk) */
1573:     diag      = aa + k * 4; /* ptr to inv(Dk) */
1574:     x[k2]     = diag[0] * x0 + diag[2] * x1;
1575:     x[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1576:   }
1577:   PetscFunctionReturn(PETSC_SUCCESS);
1578: }

1580: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1581: {
1582:   const MatScalar *v;
1583:   PetscScalar      x0, x1;
1584:   PetscInt         nz, k, k2;
1585:   const PetscInt  *vj;

1587:   PetscFunctionBegin;
1588:   for (k = mbs - 1; k >= 0; k--) {
1589:     v  = aa + 4 * ai[k];
1590:     vj = aj + ai[k];
1591:     k2 = k * 2;
1592:     x0 = x[k2];
1593:     x1 = x[k2 + 1]; /* xk */
1594:     nz = ai[k + 1] - ai[k];
1595:     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1596:     PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1597:     while (nz--) {
1598:       /* xk += U(k,:)*x(:) */
1599:       x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1];
1600:       x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1];
1601:       vj++;
1602:       v += 4;
1603:     }
1604:     x[k2]     = x0;
1605:     x[k2 + 1] = x1;
1606:   }
1607:   PetscFunctionReturn(PETSC_SUCCESS);
1608: }

1610: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1611: {
1612:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1613:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1614:   const MatScalar   *aa = a->a;
1615:   PetscScalar       *x;
1616:   const PetscScalar *b;

1618:   PetscFunctionBegin;
1619:   PetscCall(VecGetArrayRead(bb, &b));
1620:   PetscCall(VecGetArray(xx, &x));

1622:   /* solve U^T * D * y = b by forward substitution */
1623:   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1624:   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));

1626:   /* solve U*x = y by back substitution */
1627:   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));

1629:   PetscCall(VecRestoreArrayRead(bb, &b));
1630:   PetscCall(VecRestoreArray(xx, &x));
1631:   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1632:   PetscFunctionReturn(PETSC_SUCCESS);
1633: }

1635: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1636: {
1637:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1638:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1639:   const MatScalar   *aa = a->a;
1640:   PetscScalar       *x;
1641:   const PetscScalar *b;

1643:   PetscFunctionBegin;
1644:   PetscCall(VecGetArrayRead(bb, &b));
1645:   PetscCall(VecGetArray(xx, &x));
1646:   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1647:   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1648:   PetscCall(VecRestoreArrayRead(bb, &b));
1649:   PetscCall(VecRestoreArray(xx, &x));
1650:   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1651:   PetscFunctionReturn(PETSC_SUCCESS);
1652: }

1654: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1655: {
1656:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1657:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1658:   const MatScalar   *aa = a->a;
1659:   PetscScalar       *x;
1660:   const PetscScalar *b;

1662:   PetscFunctionBegin;
1663:   PetscCall(VecGetArrayRead(bb, &b));
1664:   PetscCall(VecGetArray(xx, &x));
1665:   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1666:   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1667:   PetscCall(VecRestoreArrayRead(bb, &b));
1668:   PetscCall(VecRestoreArray(xx, &x));
1669:   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1670:   PetscFunctionReturn(PETSC_SUCCESS);
1671: }

1673: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1674: {
1675:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1676:   IS                 isrow = a->row;
1677:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1678:   const MatScalar   *aa = a->a, *v;
1679:   const PetscScalar *b;
1680:   PetscScalar       *x, xk, *t;
1681:   PetscInt           nz, k, j;

1683:   PetscFunctionBegin;
1684:   PetscCall(VecGetArrayRead(bb, &b));
1685:   PetscCall(VecGetArray(xx, &x));
1686:   t = a->solve_work;
1687:   PetscCall(ISGetIndices(isrow, &rp));

1689:   /* solve U^T*D*y = perm(b) by forward substitution */
1690:   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1691:   for (k = 0; k < mbs; k++) {
1692:     v  = aa + ai[k];
1693:     vj = aj + ai[k];
1694:     xk = t[k];
1695:     nz = ai[k + 1] - ai[k] - 1;
1696:     for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk;
1697:     t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */
1698:   }

1700:   /* solve U*perm(x) = y by back substitution */
1701:   for (k = mbs - 1; k >= 0; k--) {
1702:     v  = aa + adiag[k] - 1;
1703:     vj = aj + adiag[k] - 1;
1704:     nz = ai[k + 1] - ai[k] - 1;
1705:     for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]];
1706:     x[rp[k]] = t[k];
1707:   }

1709:   PetscCall(ISRestoreIndices(isrow, &rp));
1710:   PetscCall(VecRestoreArrayRead(bb, &b));
1711:   PetscCall(VecRestoreArray(xx, &x));
1712:   PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs));
1713:   PetscFunctionReturn(PETSC_SUCCESS);
1714: }

1716: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1717: {
1718:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1719:   IS                 isrow = a->row;
1720:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1721:   const MatScalar   *aa = a->a, *v;
1722:   PetscScalar       *x, xk, *t;
1723:   const PetscScalar *b;
1724:   PetscInt           nz, k;

1726:   PetscFunctionBegin;
1727:   PetscCall(VecGetArrayRead(bb, &b));
1728:   PetscCall(VecGetArray(xx, &x));
1729:   t = a->solve_work;
1730:   PetscCall(ISGetIndices(isrow, &rp));

1732:   /* solve U^T*D*y = perm(b) by forward substitution */
1733:   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1734:   for (k = 0; k < mbs; k++) {
1735:     v  = aa + ai[k] + 1;
1736:     vj = aj + ai[k] + 1;
1737:     xk = t[k];
1738:     nz = ai[k + 1] - ai[k] - 1;
1739:     while (nz--) t[*vj++] += (*v++) * xk;
1740:     t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */
1741:   }

1743:   /* solve U*perm(x) = y by back substitution */
1744:   for (k = mbs - 1; k >= 0; k--) {
1745:     v  = aa + ai[k] + 1;
1746:     vj = aj + ai[k] + 1;
1747:     nz = ai[k + 1] - ai[k] - 1;
1748:     while (nz--) t[k] += (*v++) * t[*vj++];
1749:     x[rp[k]] = t[k];
1750:   }

1752:   PetscCall(ISRestoreIndices(isrow, &rp));
1753:   PetscCall(VecRestoreArrayRead(bb, &b));
1754:   PetscCall(VecRestoreArray(xx, &x));
1755:   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
1756:   PetscFunctionReturn(PETSC_SUCCESS);
1757: }

1759: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1760: {
1761:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1762:   IS                 isrow = a->row;
1763:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1764:   const MatScalar   *aa = a->a, *v;
1765:   PetscReal          diagk;
1766:   PetscScalar       *x, xk;
1767:   const PetscScalar *b;
1768:   PetscInt           nz, k;

1770:   PetscFunctionBegin;
1771:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1772:   PetscCall(VecGetArrayRead(bb, &b));
1773:   PetscCall(VecGetArray(xx, &x));
1774:   PetscCall(ISGetIndices(isrow, &rp));

1776:   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1777:   for (k = 0; k < mbs; k++) {
1778:     v  = aa + ai[k];
1779:     vj = aj + ai[k];
1780:     xk = x[k];
1781:     nz = ai[k + 1] - ai[k] - 1;
1782:     while (nz--) x[*vj++] += (*v++) * xk;

1784:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1785:     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1786:     x[k] = xk * PetscSqrtReal(diagk);
1787:   }
1788:   PetscCall(ISRestoreIndices(isrow, &rp));
1789:   PetscCall(VecRestoreArrayRead(bb, &b));
1790:   PetscCall(VecRestoreArray(xx, &x));
1791:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1792:   PetscFunctionReturn(PETSC_SUCCESS);
1793: }

1795: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1796: {
1797:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1798:   IS                 isrow = a->row;
1799:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1800:   const MatScalar   *aa = a->a, *v;
1801:   PetscReal          diagk;
1802:   PetscScalar       *x, xk;
1803:   const PetscScalar *b;
1804:   PetscInt           nz, k;

1806:   PetscFunctionBegin;
1807:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1808:   PetscCall(VecGetArrayRead(bb, &b));
1809:   PetscCall(VecGetArray(xx, &x));
1810:   PetscCall(ISGetIndices(isrow, &rp));

1812:   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1813:   for (k = 0; k < mbs; k++) {
1814:     v  = aa + ai[k] + 1;
1815:     vj = aj + ai[k] + 1;
1816:     xk = x[k];
1817:     nz = ai[k + 1] - ai[k] - 1;
1818:     while (nz--) x[*vj++] += (*v++) * xk;

1820:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1821:     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1822:     x[k] = xk * PetscSqrtReal(diagk);
1823:   }
1824:   PetscCall(ISRestoreIndices(isrow, &rp));
1825:   PetscCall(VecRestoreArrayRead(bb, &b));
1826:   PetscCall(VecRestoreArray(xx, &x));
1827:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1828:   PetscFunctionReturn(PETSC_SUCCESS);
1829: }

1831: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1832: {
1833:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1834:   IS                 isrow = a->row;
1835:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1836:   const MatScalar   *aa = a->a, *v;
1837:   PetscReal          diagk;
1838:   PetscScalar       *x, *t;
1839:   const PetscScalar *b;
1840:   PetscInt           nz, k;

1842:   PetscFunctionBegin;
1843:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1844:   PetscCall(VecGetArrayRead(bb, &b));
1845:   PetscCall(VecGetArray(xx, &x));
1846:   t = a->solve_work;
1847:   PetscCall(ISGetIndices(isrow, &rp));

1849:   for (k = mbs - 1; k >= 0; k--) {
1850:     v     = aa + ai[k];
1851:     vj    = aj + ai[k];
1852:     diagk = PetscRealPart(aa[adiag[k]]);
1853:     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1854:     t[k] = b[k] * PetscSqrtReal(diagk);
1855:     nz   = ai[k + 1] - ai[k] - 1;
1856:     while (nz--) t[k] += (*v++) * t[*vj++];
1857:     x[rp[k]] = t[k];
1858:   }
1859:   PetscCall(ISRestoreIndices(isrow, &rp));
1860:   PetscCall(VecRestoreArrayRead(bb, &b));
1861:   PetscCall(VecRestoreArray(xx, &x));
1862:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1863:   PetscFunctionReturn(PETSC_SUCCESS);
1864: }

1866: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1867: {
1868:   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1869:   IS                 isrow = a->row;
1870:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1871:   const MatScalar   *aa = a->a, *v;
1872:   PetscReal          diagk;
1873:   PetscScalar       *x, *t;
1874:   const PetscScalar *b;
1875:   PetscInt           nz, k;

1877:   PetscFunctionBegin;
1878:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1879:   PetscCall(VecGetArrayRead(bb, &b));
1880:   PetscCall(VecGetArray(xx, &x));
1881:   t = a->solve_work;
1882:   PetscCall(ISGetIndices(isrow, &rp));

1884:   for (k = mbs - 1; k >= 0; k--) {
1885:     v     = aa + ai[k] + 1;
1886:     vj    = aj + ai[k] + 1;
1887:     diagk = PetscRealPart(aa[ai[k]]);
1888:     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1889:     t[k] = b[k] * PetscSqrtReal(diagk);
1890:     nz   = ai[k + 1] - ai[k] - 1;
1891:     while (nz--) t[k] += (*v++) * t[*vj++];
1892:     x[rp[k]] = t[k];
1893:   }
1894:   PetscCall(ISRestoreIndices(isrow, &rp));
1895:   PetscCall(VecRestoreArrayRead(bb, &b));
1896:   PetscCall(VecRestoreArray(xx, &x));
1897:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1898:   PetscFunctionReturn(PETSC_SUCCESS);
1899: }

1901: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx)
1902: {
1903:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1905:   PetscFunctionBegin;
1906:   if (A->rmap->bs == 1) {
1907:     PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v));
1908:   } else {
1909:     IS                 isrow = a->row;
1910:     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1911:     const MatScalar   *aa = a->a, *v;
1912:     PetscScalar       *x, *t;
1913:     const PetscScalar *b;
1914:     PetscInt           nz, k, n, i, j;

1916:     if (bb->n > a->solves_work_n) {
1917:       PetscCall(PetscFree(a->solves_work));
1918:       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1919:       a->solves_work_n = bb->n;
1920:     }
1921:     n = bb->n;
1922:     PetscCall(VecGetArrayRead(bb->v, &b));
1923:     PetscCall(VecGetArray(xx->v, &x));
1924:     t = a->solves_work;

1926:     PetscCall(ISGetIndices(isrow, &rp));

1928:     /* solve U^T*D*y = perm(b) by forward substitution */
1929:     for (k = 0; k < mbs; k++) {
1930:       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1931:     }
1932:     for (k = 0; k < mbs; k++) {
1933:       v  = aa + ai[k];
1934:       vj = aj + ai[k];
1935:       nz = ai[k + 1] - ai[k] - 1;
1936:       for (j = 0; j < nz; j++) {
1937:         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1938:         v++;
1939:         vj++;
1940:       }
1941:       for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1942:     }

1944:     /* solve U*perm(x) = y by back substitution */
1945:     for (k = mbs - 1; k >= 0; k--) {
1946:       v  = aa + ai[k] - 1;
1947:       vj = aj + ai[k] - 1;
1948:       nz = ai[k + 1] - ai[k] - 1;
1949:       for (j = 0; j < nz; j++) {
1950:         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1951:         v++;
1952:         vj++;
1953:       }
1954:       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1955:     }

1957:     PetscCall(ISRestoreIndices(isrow, &rp));
1958:     PetscCall(VecRestoreArrayRead(bb->v, &b));
1959:     PetscCall(VecRestoreArray(xx->v, &x));
1960:     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
1961:   }
1962:   PetscFunctionReturn(PETSC_SUCCESS);
1963: }

1965: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx)
1966: {
1967:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1969:   PetscFunctionBegin;
1970:   if (A->rmap->bs == 1) {
1971:     PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v));
1972:   } else {
1973:     IS                 isrow = a->row;
1974:     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1975:     const MatScalar   *aa = a->a, *v;
1976:     PetscScalar       *x, *t;
1977:     const PetscScalar *b;
1978:     PetscInt           nz, k, n, i;

1980:     if (bb->n > a->solves_work_n) {
1981:       PetscCall(PetscFree(a->solves_work));
1982:       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1983:       a->solves_work_n = bb->n;
1984:     }
1985:     n = bb->n;
1986:     PetscCall(VecGetArrayRead(bb->v, &b));
1987:     PetscCall(VecGetArray(xx->v, &x));
1988:     t = a->solves_work;

1990:     PetscCall(ISGetIndices(isrow, &rp));

1992:     /* solve U^T*D*y = perm(b) by forward substitution */
1993:     for (k = 0; k < mbs; k++) {
1994:       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1995:     }
1996:     for (k = 0; k < mbs; k++) {
1997:       v  = aa + ai[k];
1998:       vj = aj + ai[k];
1999:       nz = ai[k + 1] - ai[k];
2000:       while (nz--) {
2001:         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
2002:         v++;
2003:         vj++;
2004:       }
2005:       for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */
2006:     }

2008:     /* solve U*perm(x) = y by back substitution */
2009:     for (k = mbs - 1; k >= 0; k--) {
2010:       v  = aa + ai[k];
2011:       vj = aj + ai[k];
2012:       nz = ai[k + 1] - ai[k];
2013:       while (nz--) {
2014:         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
2015:         v++;
2016:         vj++;
2017:       }
2018:       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
2019:     }

2021:     PetscCall(ISRestoreIndices(isrow, &rp));
2022:     PetscCall(VecRestoreArrayRead(bb->v, &b));
2023:     PetscCall(VecRestoreArray(xx->v, &x));
2024:     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
2025:   }
2026:   PetscFunctionReturn(PETSC_SUCCESS);
2027: }

2029: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2030: {
2031:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2032:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2033:   const MatScalar   *aa = a->a, *v;
2034:   const PetscScalar *b;
2035:   PetscScalar       *x, xi;
2036:   PetscInt           nz, i, j;

2038:   PetscFunctionBegin;
2039:   PetscCall(VecGetArrayRead(bb, &b));
2040:   PetscCall(VecGetArray(xx, &x));
2041:   /* solve U^T*D*y = b by forward substitution */
2042:   PetscCall(PetscArraycpy(x, b, mbs));
2043:   for (i = 0; i < mbs; i++) {
2044:     v  = aa + ai[i];
2045:     vj = aj + ai[i];
2046:     xi = x[i];
2047:     nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2048:     for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2049:     x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2050:   }
2051:   /* solve U*x = y by backward substitution */
2052:   for (i = mbs - 2; i >= 0; i--) {
2053:     xi = x[i];
2054:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2055:     vj = aj + adiag[i] - 1;
2056:     nz = ai[i + 1] - ai[i] - 1;
2057:     for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2058:     x[i] = xi;
2059:   }
2060:   PetscCall(VecRestoreArrayRead(bb, &b));
2061:   PetscCall(VecRestoreArray(xx, &x));
2062:   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2063:   PetscFunctionReturn(PETSC_SUCCESS);
2064: }

2066: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X)
2067: {
2068:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2069:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2070:   const MatScalar   *aa = a->a, *v;
2071:   const PetscScalar *b;
2072:   PetscScalar       *x, xi;
2073:   PetscInt           nz, i, j, neq, ldb, ldx;
2074:   PetscBool          isdense;

2076:   PetscFunctionBegin;
2077:   if (!mbs) PetscFunctionReturn(PETSC_SUCCESS);
2078:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
2079:   PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
2080:   if (X != B) {
2081:     PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
2082:     PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
2083:   }
2084:   PetscCall(MatDenseGetArrayRead(B, &b));
2085:   PetscCall(MatDenseGetLDA(B, &ldb));
2086:   PetscCall(MatDenseGetArray(X, &x));
2087:   PetscCall(MatDenseGetLDA(X, &ldx));
2088:   for (neq = 0; neq < B->cmap->n; neq++) {
2089:     /* solve U^T*D*y = b by forward substitution */
2090:     PetscCall(PetscArraycpy(x, b, mbs));
2091:     for (i = 0; i < mbs; i++) {
2092:       v  = aa + ai[i];
2093:       vj = aj + ai[i];
2094:       xi = x[i];
2095:       nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2096:       for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2097:       x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2098:     }
2099:     /* solve U*x = y by backward substitution */
2100:     for (i = mbs - 2; i >= 0; i--) {
2101:       xi = x[i];
2102:       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2103:       vj = aj + adiag[i] - 1;
2104:       nz = ai[i + 1] - ai[i] - 1;
2105:       for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2106:       x[i] = xi;
2107:     }
2108:     b += ldb;
2109:     x += ldx;
2110:   }
2111:   PetscCall(MatDenseRestoreArrayRead(B, &b));
2112:   PetscCall(MatDenseRestoreArray(X, &x));
2113:   PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs)));
2114:   PetscFunctionReturn(PETSC_SUCCESS);
2115: }

2117: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2118: {
2119:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2120:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2121:   const MatScalar   *aa = a->a, *v;
2122:   PetscScalar       *x, xk;
2123:   const PetscScalar *b;
2124:   PetscInt           nz, k;

2126:   PetscFunctionBegin;
2127:   PetscCall(VecGetArrayRead(bb, &b));
2128:   PetscCall(VecGetArray(xx, &x));

2130:   /* solve U^T*D*y = b by forward substitution */
2131:   PetscCall(PetscArraycpy(x, b, mbs));
2132:   for (k = 0; k < mbs; k++) {
2133:     v  = aa + ai[k] + 1;
2134:     vj = aj + ai[k] + 1;
2135:     xk = x[k];
2136:     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2137:     while (nz--) x[*vj++] += (*v++) * xk;
2138:     x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2139:   }

2141:   /* solve U*x = y by back substitution */
2142:   for (k = mbs - 2; k >= 0; k--) {
2143:     v  = aa + ai[k] + 1;
2144:     vj = aj + ai[k] + 1;
2145:     xk = x[k];
2146:     nz = ai[k + 1] - ai[k] - 1;
2147:     while (nz--) xk += (*v++) * x[*vj++];
2148:     x[k] = xk;
2149:   }

2151:   PetscCall(VecRestoreArrayRead(bb, &b));
2152:   PetscCall(VecRestoreArray(xx, &x));
2153:   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2154:   PetscFunctionReturn(PETSC_SUCCESS);
2155: }

2157: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2158: {
2159:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2160:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2161:   const MatScalar   *aa = a->a, *v;
2162:   PetscReal          diagk;
2163:   PetscScalar       *x;
2164:   const PetscScalar *b;
2165:   PetscInt           nz, k;

2167:   PetscFunctionBegin;
2168:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2169:   PetscCall(VecGetArrayRead(bb, &b));
2170:   PetscCall(VecGetArray(xx, &x));
2171:   PetscCall(PetscArraycpy(x, b, mbs));
2172:   for (k = 0; k < mbs; k++) {
2173:     v  = aa + ai[k];
2174:     vj = aj + ai[k];
2175:     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2176:     while (nz--) x[*vj++] += (*v++) * x[k];
2177:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2178:     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]]));
2179:     x[k] *= PetscSqrtReal(diagk);
2180:   }
2181:   PetscCall(VecRestoreArrayRead(bb, &b));
2182:   PetscCall(VecRestoreArray(xx, &x));
2183:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2184:   PetscFunctionReturn(PETSC_SUCCESS);
2185: }

2187: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2188: {
2189:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2190:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2191:   const MatScalar   *aa = a->a, *v;
2192:   PetscReal          diagk;
2193:   PetscScalar       *x;
2194:   const PetscScalar *b;
2195:   PetscInt           nz, k;

2197:   PetscFunctionBegin;
2198:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2199:   PetscCall(VecGetArrayRead(bb, &b));
2200:   PetscCall(VecGetArray(xx, &x));
2201:   PetscCall(PetscArraycpy(x, b, mbs));
2202:   for (k = 0; k < mbs; k++) {
2203:     v  = aa + ai[k] + 1;
2204:     vj = aj + ai[k] + 1;
2205:     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2206:     while (nz--) x[*vj++] += (*v++) * x[k];
2207:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2208:     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]]));
2209:     x[k] *= PetscSqrtReal(diagk);
2210:   }
2211:   PetscCall(VecRestoreArrayRead(bb, &b));
2212:   PetscCall(VecRestoreArray(xx, &x));
2213:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2214:   PetscFunctionReturn(PETSC_SUCCESS);
2215: }

2217: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2218: {
2219:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2220:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2221:   const MatScalar   *aa = a->a, *v;
2222:   PetscReal          diagk;
2223:   PetscScalar       *x;
2224:   const PetscScalar *b;
2225:   PetscInt           nz, k;

2227:   PetscFunctionBegin;
2228:   /* solve D^(1/2)*U*x = b by back substitution */
2229:   PetscCall(VecGetArrayRead(bb, &b));
2230:   PetscCall(VecGetArray(xx, &x));

2232:   for (k = mbs - 1; k >= 0; k--) {
2233:     v     = aa + ai[k];
2234:     vj    = aj + ai[k];
2235:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2236:     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2237:     x[k] = PetscSqrtReal(diagk) * b[k];
2238:     nz   = ai[k + 1] - ai[k] - 1;
2239:     while (nz--) x[k] += (*v++) * x[*vj++];
2240:   }
2241:   PetscCall(VecRestoreArrayRead(bb, &b));
2242:   PetscCall(VecRestoreArray(xx, &x));
2243:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2244:   PetscFunctionReturn(PETSC_SUCCESS);
2245: }

2247: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2248: {
2249:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2250:   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2251:   const MatScalar   *aa = a->a, *v;
2252:   PetscReal          diagk;
2253:   PetscScalar       *x;
2254:   const PetscScalar *b;
2255:   PetscInt           nz, k;

2257:   PetscFunctionBegin;
2258:   /* solve D^(1/2)*U*x = b by back substitution */
2259:   PetscCall(VecGetArrayRead(bb, &b));
2260:   PetscCall(VecGetArray(xx, &x));

2262:   for (k = mbs - 1; k >= 0; k--) {
2263:     v     = aa + ai[k] + 1;
2264:     vj    = aj + ai[k] + 1;
2265:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2266:     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2267:     x[k] = PetscSqrtReal(diagk) * b[k];
2268:     nz   = ai[k + 1] - ai[k] - 1;
2269:     while (nz--) x[k] += (*v++) * x[*vj++];
2270:   }
2271:   PetscCall(VecRestoreArrayRead(bb, &b));
2272:   PetscCall(VecRestoreArray(xx, &x));
2273:   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2274:   PetscFunctionReturn(PETSC_SUCCESS);
2275: }

2277: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2278: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2279: {
2280:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b;
2281:   const PetscInt *rip, mbs    = a->mbs, *ai, *aj;
2282:   PetscInt       *jutmp, bs   = A->rmap->bs, i;
2283:   PetscInt        m, reallocs = 0, *levtmp;
2284:   PetscInt       *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
2285:   PetscInt        incrlev, *lev, shift, prow, nz;
2286:   PetscReal       f = info->fill, levels = info->levels;
2287:   PetscBool       perm_identity;

2289:   PetscFunctionBegin;
2290:   /* check whether perm is the identity mapping */
2291:   PetscCall(ISIdentity(perm, &perm_identity));

2293:   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2294:   a->permute = PETSC_FALSE;
2295:   ai         = a->i;
2296:   aj         = a->j;

2298:   /* initialization */
2299:   PetscCall(ISGetIndices(perm, &rip));
2300:   umax = (PetscInt)(f * ai[mbs] + 1);
2301:   PetscCall(PetscMalloc1(umax, &lev));
2302:   umax += mbs + 1;
2303:   shift = mbs + 1;
2304:   PetscCall(PetscMalloc1(mbs + 1, &iu));
2305:   PetscCall(PetscMalloc1(umax, &ju));
2306:   iu[0] = mbs + 1;
2307:   juidx = mbs + 1;
2308:   /* prowl: linked list for pivot row */
2309:   PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp));
2310:   /* q: linked list for col index */

2312:   for (i = 0; i < mbs; i++) {
2313:     prowl[i] = mbs;
2314:     q[i]     = 0;
2315:   }

2317:   /* for each row k */
2318:   for (k = 0; k < mbs; k++) {
2319:     nzk  = 0;
2320:     q[k] = mbs;
2321:     /* copy current row into linked list */
2322:     nz = ai[rip[k] + 1] - ai[rip[k]];
2323:     j  = ai[rip[k]];
2324:     while (nz--) {
2325:       vj = rip[aj[j++]];
2326:       if (vj > k) {
2327:         qm = k;
2328:         do {
2329:           m  = qm;
2330:           qm = q[m];
2331:         } while (qm < vj);
2332:         PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
2333:         nzk++;
2334:         q[m]       = vj;
2335:         q[vj]      = qm;
2336:         levtmp[vj] = 0; /* initialize lev for nonzero element */
2337:       }
2338:     }

2340:     /* modify nonzero structure of k-th row by computing fill-in
2341:        for each row prow to be merged in */
2342:     prow = k;
2343:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */

2345:     while (prow < k) {
2346:       /* merge row prow into k-th row */
2347:       jmin = iu[prow] + 1;
2348:       jmax = iu[prow + 1];
2349:       qm   = k;
2350:       for (j = jmin; j < jmax; j++) {
2351:         incrlev = lev[j - shift] + 1;
2352:         if (incrlev > levels) continue;
2353:         vj = ju[j];
2354:         do {
2355:           m  = qm;
2356:           qm = q[m];
2357:         } while (qm < vj);
2358:         if (qm != vj) { /* a fill */
2359:           nzk++;
2360:           q[m]       = vj;
2361:           q[vj]      = qm;
2362:           qm         = vj;
2363:           levtmp[vj] = incrlev;
2364:         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2365:       }
2366:       prow = prowl[prow]; /* next pivot row */
2367:     }

2369:     /* add k to row list for first nonzero element in k-th row */
2370:     if (nzk > 1) {
2371:       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2372:       prowl[k] = prowl[i];
2373:       prowl[i] = k;
2374:     }
2375:     iu[k + 1] = iu[k] + nzk;

2377:     /* allocate more space to ju and lev if needed */
2378:     if (iu[k + 1] > umax) {
2379:       /* estimate how much additional space we will need */
2380:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2381:       /* just double the memory each time */
2382:       maxadd = umax;
2383:       if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
2384:       umax += maxadd;

2386:       /* allocate a longer ju */
2387:       PetscCall(PetscMalloc1(umax, &jutmp));
2388:       PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
2389:       PetscCall(PetscFree(ju));
2390:       ju = jutmp;

2392:       PetscCall(PetscMalloc1(umax, &jutmp));
2393:       PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift));
2394:       PetscCall(PetscFree(lev));
2395:       lev = jutmp;
2396:       reallocs += 2; /* count how many times we realloc */
2397:     }

2399:     /* save nonzero structure of k-th row in ju */
2400:     i = k;
2401:     while (nzk--) {
2402:       i                  = q[i];
2403:       ju[juidx]          = i;
2404:       lev[juidx - shift] = levtmp[i];
2405:       juidx++;
2406:     }
2407:   }

2409: #if defined(PETSC_USE_INFO)
2410:   if (ai[mbs] != 0) {
2411:     PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2412:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
2413:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2414:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
2415:     PetscCall(PetscInfo(A, "for best performance.\n"));
2416:   } else {
2417:     PetscCall(PetscInfo(A, "Empty matrix\n"));
2418:   }
2419: #endif

2421:   PetscCall(ISRestoreIndices(perm, &rip));
2422:   PetscCall(PetscFree3(prowl, q, levtmp));
2423:   PetscCall(PetscFree(lev));

2425:   /* put together the new matrix */
2426:   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL));

2428:   b = (Mat_SeqSBAIJ *)(B)->data;
2429:   PetscCall(PetscFree2(b->imax, b->ilen));

2431:   b->singlemalloc = PETSC_FALSE;
2432:   b->free_a       = PETSC_TRUE;
2433:   b->free_ij      = PETSC_TRUE;
2434:   /* the next line frees the default space generated by the Create() */
2435:   PetscCall(PetscFree3(b->a, b->j, b->i));
2436:   PetscCall(PetscMalloc1((iu[mbs] + 1) * a->bs2, &b->a));
2437:   b->j    = ju;
2438:   b->i    = iu;
2439:   b->diag = NULL;
2440:   b->ilen = NULL;
2441:   b->imax = NULL;

2443:   PetscCall(ISDestroy(&b->row));
2444:   PetscCall(ISDestroy(&b->icol));
2445:   b->row  = perm;
2446:   b->icol = perm;
2447:   PetscCall(PetscObjectReference((PetscObject)perm));
2448:   PetscCall(PetscObjectReference((PetscObject)perm));
2449:   PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
2450:   /* In b structure:  Free imax, ilen, old a, old j.
2451:      Allocate idnew, solve_work, new a, new j */
2452:   b->maxnz = b->nz = iu[mbs];

2454:   (B)->info.factor_mallocs   = reallocs;
2455:   (B)->info.fill_ratio_given = f;
2456:   if (ai[mbs] != 0) {
2457:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2458:   } else {
2459:     (B)->info.fill_ratio_needed = 0.0;
2460:   }
2461:   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity));
2462:   PetscFunctionReturn(PETSC_SUCCESS);
2463: }

2465: /*
2466:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2467: */
2468: #include <petscbt.h>
2469: #include <../src/mat/utils/freespace.h>
2470: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2471: {
2472:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data, *b;
2473:   PetscBool          perm_identity, free_ij = PETSC_TRUE, missing;
2474:   PetscInt           bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j;
2475:   const PetscInt    *rip;
2476:   PetscInt           reallocs = 0, i, *ui, *udiag, *cols;
2477:   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2478:   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr;
2479:   PetscReal          fill = info->fill, levels = info->levels;
2480:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2481:   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2482:   PetscBT            lnkbt;

2484:   PetscFunctionBegin;
2485:   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2486:   PetscCall(MatMissingDiagonal(A, &missing, &d));
2487:   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);
2488:   if (bs > 1) {
2489:     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
2490:     PetscFunctionReturn(PETSC_SUCCESS);
2491:   }

2493:   /* check whether perm is the identity mapping */
2494:   PetscCall(ISIdentity(perm, &perm_identity));
2495:   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2496:   a->permute = PETSC_FALSE;

2498:   PetscCall(PetscMalloc1(am + 1, &ui));
2499:   PetscCall(PetscMalloc1(am + 1, &udiag));
2500:   ui[0] = 0;

2502:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2503:   if (!levels) {
2504:     /* reuse the column pointers and row offsets for memory savings */
2505:     for (i = 0; i < am; i++) {
2506:       ncols     = ai[i + 1] - ai[i];
2507:       ui[i + 1] = ui[i] + ncols;
2508:       udiag[i]  = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
2509:     }
2510:     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2511:     cols = uj;
2512:     for (i = 0; i < am; i++) {
2513:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2514:       ncols = ai[i + 1] - ai[i] - 1;
2515:       for (j = 0; j < ncols; j++) *cols++ = aj[j];
2516:       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2517:     }
2518:   } else { /* case: levels>0 */
2519:     PetscCall(ISGetIndices(perm, &rip));

2521:     /* initialization */
2522:     /* jl: linked list for storing indices of the pivot rows
2523:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2524:     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2525:     for (i = 0; i < am; i++) {
2526:       jl[i] = am;
2527:       il[i] = 0;
2528:     }

2530:     /* create and initialize a linked list for storing column indices of the active row k */
2531:     nlnk = am + 1;
2532:     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));

2534:     /* initial FreeSpace size is fill*(ai[am]+1) */
2535:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));

2537:     current_space = free_space;

2539:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));

2541:     current_space_lvl = free_space_lvl;

2543:     for (k = 0; k < am; k++) { /* for each active row k */
2544:       /* initialize lnk by the column indices of row k */
2545:       nzk   = 0;
2546:       ncols = ai[k + 1] - ai[k];
2547:       PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
2548:       cols = aj + ai[k];
2549:       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2550:       nzk += nlnk;

2552:       /* update lnk by computing fill-in for each pivot row to be merged in */
2553:       prow = jl[k]; /* 1st pivot row */

2555:       while (prow < k) {
2556:         nextprow = jl[prow];

2558:         /* merge prow into k-th row */
2559:         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2560:         jmax  = ui[prow + 1];
2561:         ncols = jmax - jmin;
2562:         i     = jmin - ui[prow];

2564:         cols = uj_ptr[prow] + i;     /* points to the 2nd nzero entry in U(prow,k:am-1) */
2565:         uj   = uj_lvl_ptr[prow] + i; /* levels of cols */
2566:         j    = *(uj - 1);
2567:         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2568:         nzk += nlnk;

2570:         /* update il and jl for prow */
2571:         if (jmin < jmax) {
2572:           il[prow] = jmin;

2574:           j        = *cols;
2575:           jl[prow] = jl[j];
2576:           jl[j]    = prow;
2577:         }
2578:         prow = nextprow;
2579:       }

2581:       /* if free space is not available, make more free space */
2582:       if (current_space->local_remaining < nzk) {
2583:         i = am - k + 1;                                    /* num of unfactored rows */
2584:         i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2585:         PetscCall(PetscFreeSpaceGet(i, &current_space));
2586:         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2587:         reallocs++;
2588:       }

2590:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2591:       PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k);
2592:       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));

2594:       /* add the k-th row into il and jl */
2595:       if (nzk > 1) {
2596:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2597:         jl[k] = jl[i];
2598:         jl[i] = k;
2599:         il[k] = ui[k] + 1;
2600:       }
2601:       uj_ptr[k]     = current_space->array;
2602:       uj_lvl_ptr[k] = current_space_lvl->array;

2604:       current_space->array += nzk;
2605:       current_space->local_used += nzk;
2606:       current_space->local_remaining -= nzk;
2607:       current_space_lvl->array += nzk;
2608:       current_space_lvl->local_used += nzk;
2609:       current_space_lvl->local_remaining -= nzk;

2611:       ui[k + 1] = ui[k] + nzk;
2612:     }

2614:     PetscCall(ISRestoreIndices(perm, &rip));
2615:     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));

2617:     /* destroy list of free space and other temporary array(s) */
2618:     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2619:     PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor  */
2620:     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2621:     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));

2623:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2625:   /* put together the new matrix in MATSEQSBAIJ format */
2626:   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));

2628:   b = (Mat_SeqSBAIJ *)(fact)->data;
2629:   PetscCall(PetscFree2(b->imax, b->ilen));

2631:   b->singlemalloc = PETSC_FALSE;
2632:   b->free_a       = PETSC_TRUE;
2633:   b->free_ij      = free_ij;

2635:   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));

2637:   b->j         = uj;
2638:   b->i         = ui;
2639:   b->diag      = udiag;
2640:   b->free_diag = PETSC_TRUE;
2641:   b->ilen      = NULL;
2642:   b->imax      = NULL;
2643:   b->row       = perm;
2644:   b->col       = perm;

2646:   PetscCall(PetscObjectReference((PetscObject)perm));
2647:   PetscCall(PetscObjectReference((PetscObject)perm));

2649:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2651:   PetscCall(PetscMalloc1(am + 1, &b->solve_work));

2653:   b->maxnz = b->nz = ui[am];

2655:   fact->info.factor_mallocs   = reallocs;
2656:   fact->info.fill_ratio_given = fill;
2657:   if (ai[am] != 0) {
2658:     fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am];
2659:   } else {
2660:     fact->info.fill_ratio_needed = 0.0;
2661:   }
2662: #if defined(PETSC_USE_INFO)
2663:   if (ai[am] != 0) {
2664:     PetscReal af = fact->info.fill_ratio_needed;
2665:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2666:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2667:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2668:   } else {
2669:     PetscCall(PetscInfo(A, "Empty matrix\n"));
2670:   }
2671: #endif
2672:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2673:   PetscFunctionReturn(PETSC_SUCCESS);
2674: }

2676: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2677: {
2678:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
2679:   Mat_SeqSBAIJ      *b;
2680:   PetscBool          perm_identity, free_ij = PETSC_TRUE;
2681:   PetscInt           bs = A->rmap->bs, am = a->mbs;
2682:   const PetscInt    *cols, *rip, *ai = a->i, *aj = a->j;
2683:   PetscInt           reallocs = 0, i, *ui;
2684:   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2685:   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
2686:   PetscReal          fill = info->fill, levels = info->levels, ratio_needed;
2687:   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2688:   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2689:   PetscBT            lnkbt;

2691:   PetscFunctionBegin;
2692:   /*
2693:    This code originally uses Modified Sparse Row (MSR) storage
2694:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2695:    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2696:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2697:    thus the original code in MSR format is still used for these cases.
2698:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2699:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2700:   */
2701:   if (bs > 1) {
2702:     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
2703:     PetscFunctionReturn(PETSC_SUCCESS);
2704:   }

2706:   /* check whether perm is the identity mapping */
2707:   PetscCall(ISIdentity(perm, &perm_identity));
2708:   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2709:   a->permute = PETSC_FALSE;

2711:   /* special case that simply copies fill pattern */
2712:   if (!levels) {
2713:     /* reuse the column pointers and row offsets for memory savings */
2714:     ui           = a->i;
2715:     uj           = a->j;
2716:     free_ij      = PETSC_FALSE;
2717:     ratio_needed = 1.0;
2718:   } else { /* case: levels>0 */
2719:     PetscCall(ISGetIndices(perm, &rip));

2721:     /* initialization */
2722:     PetscCall(PetscMalloc1(am + 1, &ui));
2723:     ui[0] = 0;

2725:     /* jl: linked list for storing indices of the pivot rows
2726:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2727:     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2728:     for (i = 0; i < am; i++) {
2729:       jl[i] = am;
2730:       il[i] = 0;
2731:     }

2733:     /* create and initialize a linked list for storing column indices of the active row k */
2734:     nlnk = am + 1;
2735:     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));

2737:     /* initial FreeSpace size is fill*(ai[am]+1) */
2738:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));

2740:     current_space = free_space;

2742:     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));

2744:     current_space_lvl = free_space_lvl;

2746:     for (k = 0; k < am; k++) { /* for each active row k */
2747:       /* initialize lnk by the column indices of row rip[k] */
2748:       nzk   = 0;
2749:       ncols = ai[rip[k] + 1] - ai[rip[k]];
2750:       cols  = aj + ai[rip[k]];
2751:       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2752:       nzk += nlnk;

2754:       /* update lnk by computing fill-in for each pivot row to be merged in */
2755:       prow = jl[k]; /* 1st pivot row */

2757:       while (prow < k) {
2758:         nextprow = jl[prow];

2760:         /* merge prow into k-th row */
2761:         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2762:         jmax     = ui[prow + 1];
2763:         ncols    = jmax - jmin;
2764:         i        = jmin - ui[prow];
2765:         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2766:         j        = *(uj_lvl_ptr[prow] + i - 1);
2767:         cols_lvl = uj_lvl_ptr[prow] + i;
2768:         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2769:         nzk += nlnk;

2771:         /* update il and jl for prow */
2772:         if (jmin < jmax) {
2773:           il[prow] = jmin;
2774:           j        = *cols;
2775:           jl[prow] = jl[j];
2776:           jl[j]    = prow;
2777:         }
2778:         prow = nextprow;
2779:       }

2781:       /* if free space is not available, make more free space */
2782:       if (current_space->local_remaining < nzk) {
2783:         i = am - k + 1;                                                             /* num of unfactored rows */
2784:         i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2785:         PetscCall(PetscFreeSpaceGet(i, &current_space));
2786:         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2787:         reallocs++;
2788:       }

2790:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2791:       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));

2793:       /* add the k-th row into il and jl */
2794:       if (nzk - 1 > 0) {
2795:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2796:         jl[k] = jl[i];
2797:         jl[i] = k;
2798:         il[k] = ui[k] + 1;
2799:       }
2800:       uj_ptr[k]     = current_space->array;
2801:       uj_lvl_ptr[k] = current_space_lvl->array;

2803:       current_space->array += nzk;
2804:       current_space->local_used += nzk;
2805:       current_space->local_remaining -= nzk;
2806:       current_space_lvl->array += nzk;
2807:       current_space_lvl->local_used += nzk;
2808:       current_space_lvl->local_remaining -= nzk;

2810:       ui[k + 1] = ui[k] + nzk;
2811:     }

2813:     PetscCall(ISRestoreIndices(perm, &rip));
2814:     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));

2816:     /* destroy list of free space and other temporary array(s) */
2817:     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2818:     PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
2819:     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2820:     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2821:     if (ai[am] != 0) {
2822:       ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]);
2823:     } else {
2824:       ratio_needed = 0.0;
2825:     }
2826:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2828:   /* put together the new matrix in MATSEQSBAIJ format */
2829:   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));

2831:   b = (Mat_SeqSBAIJ *)(fact)->data;

2833:   PetscCall(PetscFree2(b->imax, b->ilen));

2835:   b->singlemalloc = PETSC_FALSE;
2836:   b->free_a       = PETSC_TRUE;
2837:   b->free_ij      = free_ij;

2839:   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));

2841:   b->j             = uj;
2842:   b->i             = ui;
2843:   b->diag          = NULL;
2844:   b->ilen          = NULL;
2845:   b->imax          = NULL;
2846:   b->row           = perm;
2847:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2849:   PetscCall(PetscObjectReference((PetscObject)perm));
2850:   b->icol = perm;
2851:   PetscCall(PetscObjectReference((PetscObject)perm));
2852:   PetscCall(PetscMalloc1(am + 1, &b->solve_work));

2854:   b->maxnz = b->nz = ui[am];

2856:   fact->info.factor_mallocs    = reallocs;
2857:   fact->info.fill_ratio_given  = fill;
2858:   fact->info.fill_ratio_needed = ratio_needed;
2859: #if defined(PETSC_USE_INFO)
2860:   if (ai[am] != 0) {
2861:     PetscReal af = fact->info.fill_ratio_needed;
2862:     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2863:     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2864:     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2865:   } else {
2866:     PetscCall(PetscInfo(A, "Empty matrix\n"));
2867:   }
2868: #endif
2869:   if (perm_identity) {
2870:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2871:   } else {
2872:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2873:   }
2874:   PetscFunctionReturn(PETSC_SUCCESS);
2875: }