Actual source code: baijsolvtrann.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
  5: {
  6:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
  7:   IS                 iscol = a->col, isrow = a->row;
  8:   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *vi;
  9:   PetscInt           i, nz, j;
 10:   const PetscInt     n = a->mbs, bs = A->rmap->bs, bs2 = a->bs2;
 11:   const MatScalar   *aa = a->a, *v;
 12:   PetscScalar       *x, *t, *ls;
 13:   const PetscScalar *b;

 15:   PetscFunctionBegin;
 16:   PetscCall(VecGetArrayRead(bb, &b));
 17:   PetscCall(VecGetArray(xx, &x));
 18:   t = a->solve_work;

 20:   PetscCall(ISGetIndices(isrow, &rout));
 21:   r = rout;
 22:   PetscCall(ISGetIndices(iscol, &cout));
 23:   c = cout;

 25:   /* copy the b into temp work space according to permutation */
 26:   for (i = 0; i < n; i++) {
 27:     for (j = 0; j < bs; j++) t[i * bs + j] = b[c[i] * bs + j];
 28:   }

 30:   /* forward solve the upper triangular transpose */
 31:   ls = a->solve_work + A->cmap->n;
 32:   for (i = 0; i < n; i++) {
 33:     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
 34:     PetscKernel_w_gets_transA_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
 35:     v  = aa + bs2 * (a->diag[i] + 1);
 36:     vi = aj + a->diag[i] + 1;
 37:     nz = ai[i + 1] - a->diag[i] - 1;
 38:     while (nz--) {
 39:       PetscKernel_v_gets_v_minus_transA_times_w(bs, t + bs * (*vi++), v, t + i * bs);
 40:       v += bs2;
 41:     }
 42:   }

 44:   /* backward solve the lower triangular transpose */
 45:   for (i = n - 1; i >= 0; i--) {
 46:     v  = aa + bs2 * ai[i];
 47:     vi = aj + ai[i];
 48:     nz = a->diag[i] - ai[i];
 49:     while (nz--) {
 50:       PetscKernel_v_gets_v_minus_transA_times_w(bs, t + bs * (*vi++), v, t + i * bs);
 51:       v += bs2;
 52:     }
 53:   }

 55:   /* copy t into x according to permutation */
 56:   for (i = 0; i < n; i++) {
 57:     for (j = 0; j < bs; j++) x[bs * r[i] + j] = t[bs * i + j];
 58:   }

 60:   PetscCall(ISRestoreIndices(isrow, &rout));
 61:   PetscCall(ISRestoreIndices(iscol, &cout));
 62:   PetscCall(VecRestoreArrayRead(bb, &b));
 63:   PetscCall(VecRestoreArray(xx, &x));
 64:   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
 65:   PetscFunctionReturn(PETSC_SUCCESS);
 66: }

 68: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
 69: {
 70:   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
 71:   IS                 iscol = a->col, isrow = a->row;
 72:   const PetscInt    *r, *c, *rout, *cout;
 73:   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi, *diag = a->diag;
 74:   PetscInt           i, j, nz;
 75:   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
 76:   const MatScalar   *aa = a->a, *v;
 77:   PetscScalar       *x, *t, *ls;
 78:   const PetscScalar *b;

 80:   PetscFunctionBegin;
 81:   PetscCall(VecGetArrayRead(bb, &b));
 82:   PetscCall(VecGetArray(xx, &x));
 83:   t = a->solve_work;

 85:   PetscCall(ISGetIndices(isrow, &rout));
 86:   r = rout;
 87:   PetscCall(ISGetIndices(iscol, &cout));
 88:   c = cout;

 90:   /* copy the b into temp work space according to permutation */
 91:   for (i = 0; i < n; i++) {
 92:     for (j = 0; j < bs; j++) t[i * bs + j] = b[c[i] * bs + j];
 93:   }

 95:   /* forward solve the upper triangular transpose */
 96:   ls = a->solve_work + A->cmap->n;
 97:   for (i = 0; i < n; i++) {
 98:     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
 99:     PetscKernel_w_gets_transA_times_v(bs, ls, aa + bs2 * diag[i], t + i * bs);
100:     v  = aa + bs2 * (diag[i] - 1);
101:     vi = aj + diag[i] - 1;
102:     nz = diag[i] - diag[i + 1] - 1;
103:     for (j = 0; j > -nz; j--) {
104:       PetscKernel_v_gets_v_minus_transA_times_w(bs, t + bs * (vi[j]), v, t + i * bs);
105:       v -= bs2;
106:     }
107:   }

109:   /* backward solve the lower triangular transpose */
110:   for (i = n - 1; i >= 0; i--) {
111:     v  = aa + bs2 * ai[i];
112:     vi = aj + ai[i];
113:     nz = ai[i + 1] - ai[i];
114:     for (j = 0; j < nz; j++) {
115:       PetscKernel_v_gets_v_minus_transA_times_w(bs, t + bs * (vi[j]), v, t + i * bs);
116:       v += bs2;
117:     }
118:   }

120:   /* copy t into x according to permutation */
121:   for (i = 0; i < n; i++) {
122:     for (j = 0; j < bs; j++) x[bs * r[i] + j] = t[bs * i + j];
123:   }

125:   PetscCall(ISRestoreIndices(isrow, &rout));
126:   PetscCall(ISRestoreIndices(iscol, &cout));
127:   PetscCall(VecRestoreArrayRead(bb, &b));
128:   PetscCall(VecRestoreArray(xx, &x));
129:   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
130:   PetscFunctionReturn(PETSC_SUCCESS);
131: }