/* baijsolvnat4.c */

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <petsc/private/kernels/blockinvert.h>

  4: /*
  5:       Special case where the matrix was ILU(0) factored in the natural
  6:    ordering. This eliminates the need for the column and row permutation.
  7: */
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_inplace - Solves A x = b where A was ILU(0)
   factored in place in the natural ordering (no row/column permutation applied).

   Input Parameters:
+  A  - the factored matrix (SeqBAIJ, block size 4)
-  bb - the right-hand side vector

   Output Parameter:
.  xx - the solution vector

   Storage (inplace factorization): for block row i, blocks [ai[i], diag[i]) belong to
   L, diag[i] is the (inverted) diagonal block, and (diag[i], ai[i+1]) belong to U.
   Each 4x4 block is 16 contiguous MatScalars stored column-major: block row r is
   v[r], v[r+4], v[r+8], v[r+12].
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
  PetscInt           n  = a->mbs;           /* number of 4x4 block rows */
  const PetscInt    *ai = a->i, *aj = a->j; /* block-CSR row offsets and block column indices */
  const PetscInt    *diag = a->diag;        /* offset of the diagonal block within each row */
  const MatScalar   *aa   = a->a;           /* factor values, 16 scalars per block */
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
#else
  {
    PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    /* L has a unit diagonal, so the first block row of the solution is just a copy of b */
    idx  = 0;
    x[0] = b[0];
    x[1] = b[1];
    x[2] = b[2];
    x[3] = b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];
      vi = aj + ai[i];
      nz = diag[i] - ai[i]; /* number of L blocks strictly left of the diagonal */
      idx += 4;
      s1 = b[idx];
      s2 = b[1 + idx];
      s3 = b[2 + idx];
      s4 = b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++); /* scalar index of the already-solved block column */
        x1  = x[jdx];
        x2  = x[1 + jdx];
        x3  = x[2 + jdx];
        x4  = x[3 + jdx];
        /* s -= block * x, column-major 4x4 block */
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      x[idx]     = s1;
      x[1 + idx] = s2;
      x[2 + idx] = s3;
      x[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];           /* scalar offset of row i's diagonal block */
      v    = aa + ai16 + 16;         /* first U block, immediately after the diagonal */
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1; /* number of U blocks strictly right of the diagonal */
      s1   = x[idt];
      s2   = x[1 + idt];
      s3   = x[2 + idt];
      s4   = x[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = x[idx];
        x2  = x[1 + idx];
        x3  = x[2 + idx];
        x4  = x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      /* multiply by the diagonal block, which the factorization stored inverted */
      v          = aa + ai16;
      x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
      x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
      x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
      x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
      idt -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); /* 2 flops per stored scalar, minus the copy rows */
  PetscFunctionReturn(PETSC_SUCCESS);
}

107: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
108: {
109:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
110:   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
111:   PetscInt           i, k, nz, idx, jdx, idt;
112:   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
113:   const MatScalar   *aa = a->a, *v;
114:   PetscScalar       *x;
115:   const PetscScalar *b;
116:   PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;

118:   PetscFunctionBegin;
119:   PetscCall(VecGetArrayRead(bb, &b));
120:   PetscCall(VecGetArray(xx, &x));
121:   /* forward solve the lower triangular */
122:   idx  = 0;
123:   x[0] = b[idx];
124:   x[1] = b[1 + idx];
125:   x[2] = b[2 + idx];
126:   x[3] = b[3 + idx];
127:   for (i = 1; i < n; i++) {
128:     v   = aa + bs2 * ai[i];
129:     vi  = aj + ai[i];
130:     nz  = ai[i + 1] - ai[i];
131:     idx = bs * i;
132:     s1  = b[idx];
133:     s2  = b[1 + idx];
134:     s3  = b[2 + idx];
135:     s4  = b[3 + idx];
136:     for (k = 0; k < nz; k++) {
137:       jdx = bs * vi[k];
138:       x1  = x[jdx];
139:       x2  = x[1 + jdx];
140:       x3  = x[2 + jdx];
141:       x4  = x[3 + jdx];
142:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
143:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
144:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
145:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

147:       v += bs2;
148:     }

150:     x[idx]     = s1;
151:     x[1 + idx] = s2;
152:     x[2 + idx] = s3;
153:     x[3 + idx] = s4;
154:   }

156:   /* backward solve the upper triangular */
157:   for (i = n - 1; i >= 0; i--) {
158:     v   = aa + bs2 * (adiag[i + 1] + 1);
159:     vi  = aj + adiag[i + 1] + 1;
160:     nz  = adiag[i] - adiag[i + 1] - 1;
161:     idt = bs * i;
162:     s1  = x[idt];
163:     s2  = x[1 + idt];
164:     s3  = x[2 + idt];
165:     s4  = x[3 + idt];

167:     for (k = 0; k < nz; k++) {
168:       idx = bs * vi[k];
169:       x1  = x[idx];
170:       x2  = x[1 + idx];
171:       x3  = x[2 + idx];
172:       x4  = x[3 + idx];
173:       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
174:       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
175:       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
176:       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

178:       v += bs2;
179:     }
180:     /* x = inv_diagonal*x */
181:     x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
182:     x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
183:     x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
184:     x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
185:   }

187:   PetscCall(VecRestoreArrayRead(bb, &b));
188:   PetscCall(VecRestoreArray(xx, &x));
189:   PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
190:   PetscFunctionReturn(PETSC_SUCCESS);
191: }

/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - Mixed-precision variant of
   MatSolve_SeqBAIJ_4_NaturalOrdering_inplace: arithmetic runs in MatScalar precision
   (presumably narrower than PetscScalar in "demotion" builds — confirm against the
   build configuration) and the result is promoted back to PetscScalar in place.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *diag = a->diag;
  const MatScalar   *aa = a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

  {
    MatScalar        s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    /* x's storage is reused as MatScalar scratch: the forward sweep writes demoted
       values through t, and the backward sweep promotes them back through x.
       NOTE(review): this type-puns PetscScalar storage as MatScalar, which relies on
       the compiler tolerating the aliasing — verify if strict-aliasing issues arise. */
    MatScalar       *t = (MatScalar *)x;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];
      vi = aj + ai[i];
      nz = diag[i] - ai[i]; /* L blocks strictly left of the diagonal */
      idx += 4;
      /* demote this block row of the RHS */
      s1 = (MatScalar)b[idx];
      s2 = (MatScalar)b[1 + idx];
      s3 = (MatScalar)b[2 + idx];
      s4 = (MatScalar)b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        /* already-solved rows are still in demoted form here, so read via t */
        x1  = t[jdx];
        x2  = t[1 + jdx];
        x3  = t[2 + jdx];
        x4  = t[3 + jdx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      t[idx]     = s1;
      t[1 + idx] = s2;
      t[2 + idx] = s3;
      t[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;          /* first U block after the diagonal */
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1; /* U blocks strictly right of the diagonal */
      /* current row is still demoted: read through t */
      s1   = t[idt];
      s2   = t[1 + idt];
      s3   = t[2 + idt];
      s4   = t[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        /* U columns refer to block rows > i, which this sweep already promoted,
           so they must be read back through x (full precision), then demoted */
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1 + idx];
        x3  = (MatScalar)x[2 + idx];
        x4  = (MatScalar)x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      /* multiply by the inverted diagonal block and promote row i in place.
         NOTE(review): writing PetscScalars at x[idt] overwrites scratch floats only at
         indices the descending sweep has already consumed — preserve this order. */
      v          = aa + ai16;
      x[idt]     = (PetscScalar)(v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4);
      x[1 + idt] = (PetscScalar)(v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4);
      x[2 + idt] = (PetscScalar)(v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4);
      x[3 + idt] = (PetscScalar)(v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4);
      idt -= 4;
    }
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

282: #if defined(PETSC_HAVE_SSE)

284:   #include PETSC_HAVE_SSE
285: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx)
286: {
287:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
288:   unsigned short *aj = (unsigned short *)a->j;
289:   int            *ai = a->i, n = a->mbs, *diag = a->diag;
290:   MatScalar      *aa = a->a;
291:   PetscScalar    *x, *b;

293:   PetscFunctionBegin;
294:   SSE_SCOPE_BEGIN;
295:   /*
296:      Note: This code currently uses demotion of double
297:      to float when performing the mixed-mode computation.
298:      This may not be numerically reasonable for all applications.
299:   */
300:   PREFETCH_NTA(aa + 16 * ai[1]);

302:   PetscCall(VecGetArray(bb, &b));
303:   PetscCall(VecGetArray(xx, &x));
304:   {
305:     /* x will first be computed in single precision then promoted inplace to double */
306:     MatScalar      *v, *t = (MatScalar *)x;
307:     int             nz, i, idt, ai16;
308:     unsigned int    jdx, idx;
309:     unsigned short *vi;
310:     /* Forward solve the lower triangular factor. */

312:     /* First block is the identity. */
313:     idx = 0;
314:     CONVERT_DOUBLE4_FLOAT4(t, b);
315:     v = aa + 16 * ((unsigned int)ai[1]);

317:     for (i = 1; i < n;) {
318:       PREFETCH_NTA(&v[8]);
319:       vi = aj + ai[i];
320:       nz = diag[i] - ai[i];
321:       idx += 4;

323:       /* Demote RHS from double to float. */
324:       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
325:       LOAD_PS(&t[idx], XMM7);

327:       while (nz--) {
328:         PREFETCH_NTA(&v[16]);
329:         jdx = 4 * ((unsigned int)(*vi++));

331:         /* 4x4 Matrix-Vector product with negative accumulation: */
332:         SSE_INLINE_BEGIN_2(&t[jdx], v)
333:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

335:         /* First Column */
336:         SSE_COPY_PS(XMM0, XMM6)
337:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
338:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
339:         SSE_SUB_PS(XMM7, XMM0)

341:         /* Second Column */
342:         SSE_COPY_PS(XMM1, XMM6)
343:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
344:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
345:         SSE_SUB_PS(XMM7, XMM1)

347:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

349:         /* Third Column */
350:         SSE_COPY_PS(XMM2, XMM6)
351:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
352:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
353:         SSE_SUB_PS(XMM7, XMM2)

355:         /* Fourth Column */
356:         SSE_COPY_PS(XMM3, XMM6)
357:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
358:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
359:         SSE_SUB_PS(XMM7, XMM3)
360:         SSE_INLINE_END_2

362:         v += 16;
363:       }
364:       v = aa + 16 * ai[++i];
365:       PREFETCH_NTA(v);
366:       STORE_PS(&t[idx], XMM7);
367:     }

369:     /* Backward solve the upper triangular factor.*/

371:     idt  = 4 * (n - 1);
372:     ai16 = 16 * diag[n - 1];
373:     v    = aa + ai16 + 16;
374:     for (i = n - 1; i >= 0;) {
375:       PREFETCH_NTA(&v[8]);
376:       vi = aj + diag[i] + 1;
377:       nz = ai[i + 1] - diag[i] - 1;

379:       LOAD_PS(&t[idt], XMM7);

381:       while (nz--) {
382:         PREFETCH_NTA(&v[16]);
383:         idx = 4 * ((unsigned int)(*vi++));

385:         /* 4x4 Matrix-Vector Product with negative accumulation: */
386:         SSE_INLINE_BEGIN_2(&t[idx], v)
387:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

389:         /* First Column */
390:         SSE_COPY_PS(XMM0, XMM6)
391:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
392:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
393:         SSE_SUB_PS(XMM7, XMM0)

395:         /* Second Column */
396:         SSE_COPY_PS(XMM1, XMM6)
397:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
398:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
399:         SSE_SUB_PS(XMM7, XMM1)

401:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

403:         /* Third Column */
404:         SSE_COPY_PS(XMM2, XMM6)
405:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
406:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
407:         SSE_SUB_PS(XMM7, XMM2)

409:         /* Fourth Column */
410:         SSE_COPY_PS(XMM3, XMM6)
411:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
412:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
413:         SSE_SUB_PS(XMM7, XMM3)
414:         SSE_INLINE_END_2
415:         v += 16;
416:       }
417:       v    = aa + ai16;
418:       ai16 = 16 * diag[--i];
419:       PREFETCH_NTA(aa + ai16 + 16);
420:       /*
421:          Scale the result by the diagonal 4x4 block,
422:          which was inverted as part of the factorization
423:       */
424:       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
425:       /* First Column */
426:       SSE_COPY_PS(XMM0, XMM7)
427:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
428:       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

430:       /* Second Column */
431:       SSE_COPY_PS(XMM1, XMM7)
432:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
433:       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
434:       SSE_ADD_PS(XMM0, XMM1)

436:       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

438:       /* Third Column */
439:       SSE_COPY_PS(XMM2, XMM7)
440:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
441:       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
442:       SSE_ADD_PS(XMM0, XMM2)

444:       /* Fourth Column */
445:       SSE_COPY_PS(XMM3, XMM7)
446:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
447:       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
448:       SSE_ADD_PS(XMM0, XMM3)

450:       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
451:       SSE_INLINE_END_3

453:       v = aa + ai16 + 16;
454:       idt -= 4;
455:     }

457:     /* Convert t from single precision back to double precision (inplace)*/
458:     idt = 4 * (n - 1);
459:     for (i = n - 1; i >= 0; i--) {
460:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
461:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
462:       PetscScalar *xtemp = &x[idt];
463:       MatScalar   *ttemp = &t[idt];
464:       xtemp[3]           = (PetscScalar)ttemp[3];
465:       xtemp[2]           = (PetscScalar)ttemp[2];
466:       xtemp[1]           = (PetscScalar)ttemp[1];
467:       xtemp[0]           = (PetscScalar)ttemp[0];
468:       idt -= 4;
469:     }

471:   } /* End of artificial scope. */
472:   PetscCall(VecRestoreArray(bb, &b));
473:   PetscCall(VecRestoreArray(xx, &x));
474:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
475:   SSE_SCOPE_END;
476:   PetscFunctionReturn(PETSC_SUCCESS);
477: }

479: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx)
480: {
481:   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
482:   int         *aj = a->j;
483:   int         *ai = a->i, n = a->mbs, *diag = a->diag;
484:   MatScalar   *aa = a->a;
485:   PetscScalar *x, *b;

487:   PetscFunctionBegin;
488:   SSE_SCOPE_BEGIN;
489:   /*
490:      Note: This code currently uses demotion of double
491:      to float when performing the mixed-mode computation.
492:      This may not be numerically reasonable for all applications.
493:   */
494:   PREFETCH_NTA(aa + 16 * ai[1]);

496:   PetscCall(VecGetArray(bb, &b));
497:   PetscCall(VecGetArray(xx, &x));
498:   {
499:     /* x will first be computed in single precision then promoted inplace to double */
500:     MatScalar *v, *t = (MatScalar *)x;
501:     int        nz, i, idt, ai16;
502:     int        jdx, idx;
503:     int       *vi;
504:     /* Forward solve the lower triangular factor. */

506:     /* First block is the identity. */
507:     idx = 0;
508:     CONVERT_DOUBLE4_FLOAT4(t, b);
509:     v = aa + 16 * ai[1];

511:     for (i = 1; i < n;) {
512:       PREFETCH_NTA(&v[8]);
513:       vi = aj + ai[i];
514:       nz = diag[i] - ai[i];
515:       idx += 4;

517:       /* Demote RHS from double to float. */
518:       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
519:       LOAD_PS(&t[idx], XMM7);

521:       while (nz--) {
522:         PREFETCH_NTA(&v[16]);
523:         jdx = 4 * (*vi++);
524:         /*          jdx = *vi++; */

526:         /* 4x4 Matrix-Vector product with negative accumulation: */
527:         SSE_INLINE_BEGIN_2(&t[jdx], v)
528:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

530:         /* First Column */
531:         SSE_COPY_PS(XMM0, XMM6)
532:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
533:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
534:         SSE_SUB_PS(XMM7, XMM0)

536:         /* Second Column */
537:         SSE_COPY_PS(XMM1, XMM6)
538:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
539:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
540:         SSE_SUB_PS(XMM7, XMM1)

542:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

544:         /* Third Column */
545:         SSE_COPY_PS(XMM2, XMM6)
546:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
547:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
548:         SSE_SUB_PS(XMM7, XMM2)

550:         /* Fourth Column */
551:         SSE_COPY_PS(XMM3, XMM6)
552:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
553:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
554:         SSE_SUB_PS(XMM7, XMM3)
555:         SSE_INLINE_END_2

557:         v += 16;
558:       }
559:       v = aa + 16 * ai[++i];
560:       PREFETCH_NTA(v);
561:       STORE_PS(&t[idx], XMM7);
562:     }

564:     /* Backward solve the upper triangular factor.*/

566:     idt  = 4 * (n - 1);
567:     ai16 = 16 * diag[n - 1];
568:     v    = aa + ai16 + 16;
569:     for (i = n - 1; i >= 0;) {
570:       PREFETCH_NTA(&v[8]);
571:       vi = aj + diag[i] + 1;
572:       nz = ai[i + 1] - diag[i] - 1;

574:       LOAD_PS(&t[idt], XMM7);

576:       while (nz--) {
577:         PREFETCH_NTA(&v[16]);
578:         idx = 4 * (*vi++);
579:         /*          idx = *vi++; */

581:         /* 4x4 Matrix-Vector Product with negative accumulation: */
582:         SSE_INLINE_BEGIN_2(&t[idx], v)
583:         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

585:         /* First Column */
586:         SSE_COPY_PS(XMM0, XMM6)
587:         SSE_SHUFFLE(XMM0, XMM0, 0x00)
588:         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
589:         SSE_SUB_PS(XMM7, XMM0)

591:         /* Second Column */
592:         SSE_COPY_PS(XMM1, XMM6)
593:         SSE_SHUFFLE(XMM1, XMM1, 0x55)
594:         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
595:         SSE_SUB_PS(XMM7, XMM1)

597:         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

599:         /* Third Column */
600:         SSE_COPY_PS(XMM2, XMM6)
601:         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
602:         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
603:         SSE_SUB_PS(XMM7, XMM2)

605:         /* Fourth Column */
606:         SSE_COPY_PS(XMM3, XMM6)
607:         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
608:         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
609:         SSE_SUB_PS(XMM7, XMM3)
610:         SSE_INLINE_END_2
611:         v += 16;
612:       }
613:       v    = aa + ai16;
614:       ai16 = 16 * diag[--i];
615:       PREFETCH_NTA(aa + ai16 + 16);
616:       /*
617:          Scale the result by the diagonal 4x4 block,
618:          which was inverted as part of the factorization
619:       */
620:       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
621:       /* First Column */
622:       SSE_COPY_PS(XMM0, XMM7)
623:       SSE_SHUFFLE(XMM0, XMM0, 0x00)
624:       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

626:       /* Second Column */
627:       SSE_COPY_PS(XMM1, XMM7)
628:       SSE_SHUFFLE(XMM1, XMM1, 0x55)
629:       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
630:       SSE_ADD_PS(XMM0, XMM1)

632:       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

634:       /* Third Column */
635:       SSE_COPY_PS(XMM2, XMM7)
636:       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
637:       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
638:       SSE_ADD_PS(XMM0, XMM2)

640:       /* Fourth Column */
641:       SSE_COPY_PS(XMM3, XMM7)
642:       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
643:       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
644:       SSE_ADD_PS(XMM0, XMM3)

646:       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
647:       SSE_INLINE_END_3

649:       v = aa + ai16 + 16;
650:       idt -= 4;
651:     }

653:     /* Convert t from single precision back to double precision (inplace)*/
654:     idt = 4 * (n - 1);
655:     for (i = n - 1; i >= 0; i--) {
656:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
657:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
658:       PetscScalar *xtemp = &x[idt];
659:       MatScalar   *ttemp = &t[idt];
660:       xtemp[3]           = (PetscScalar)ttemp[3];
661:       xtemp[2]           = (PetscScalar)ttemp[2];
662:       xtemp[1]           = (PetscScalar)ttemp[1];
663:       xtemp[0]           = (PetscScalar)ttemp[0];
664:       idt -= 4;
665:     }

667:   } /* End of artificial scope. */
668:   PetscCall(VecRestoreArray(bb, &b));
669:   PetscCall(VecRestoreArray(xx, &x));
670:   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
671:   SSE_SCOPE_END;
672:   PetscFunctionReturn(PETSC_SUCCESS);
673: }

675: #endif