Actual source code: baijfact11.c


  2: /*
  3:     Factorization code for BAIJ format.
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <petsc/private/kernels/blockinvert.h>

  8: /*
  9:       Version for when blocks are 4 by 4
 10: */
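/*
   A sketch of what the hand-unrolled code below computes (each 4 x 4 block is
   stored column-major in 16 consecutive MatScalar entries):

     for each block row i (in the ordering given by isrow/isicol):
       . scatter the sparse block row of A into the dense work row rtmp
       . for each block column row < i with a nonzero block L(i,row):
           L(i,row)  = L(i,row) * inv(U(row,row))     (diagonal already inverted)
           rtmp(i,:) = rtmp(i,:) - L(i,row) * U(row,:)
       . copy rtmp back into b->a and invert the diagonal block in place, so the
         triangular solves can multiply by it rather than solve with it
*/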
 11: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
 12: {
 13:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
 14:   IS              isrow = b->row, isicol = b->icol;
 15:   const PetscInt *r, *ic;
 16:   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
 17:   PetscInt       *ajtmpold, *ajtmp, nz, row;
 18:   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
 19:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
 20:   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
 21:   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
 22:   MatScalar       p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
 23:   MatScalar       m13, m14, m15, m16;
 24:   MatScalar      *ba = b->a, *aa = a->a;
 25:   PetscBool       pivotinblocks = b->pivotinblocks;
 26:   PetscReal       shift         = info->shiftamount;
 27:   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

 29:   PetscFunctionBegin;
 30:   PetscCall(ISGetIndices(isrow, &r));
 31:   PetscCall(ISGetIndices(isicol, &ic));
 32:   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
 33:   allowzeropivot = PetscNot(A->erroriffailure);

 35:   for (i = 0; i < n; i++) {
 36:     nz    = bi[i + 1] - bi[i];
 37:     ajtmp = bj + bi[i];
 38:     for (j = 0; j < nz; j++) {
 39:       x    = rtmp + 16 * ajtmp[j];
 40:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
 41:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
 42:     }
 43:     /* load in initial (unfactored row) */
 44:     idx      = r[i];
 45:     nz       = ai[idx + 1] - ai[idx];
 46:     ajtmpold = aj + ai[idx];
 47:     v        = aa + 16 * ai[idx];
 48:     for (j = 0; j < nz; j++) {
 49:       x     = rtmp + 16 * ic[ajtmpold[j]];
 50:       x[0]  = v[0];
 51:       x[1]  = v[1];
 52:       x[2]  = v[2];
 53:       x[3]  = v[3];
 54:       x[4]  = v[4];
 55:       x[5]  = v[5];
 56:       x[6]  = v[6];
 57:       x[7]  = v[7];
 58:       x[8]  = v[8];
 59:       x[9]  = v[9];
 60:       x[10] = v[10];
 61:       x[11] = v[11];
 62:       x[12] = v[12];
 63:       x[13] = v[13];
 64:       x[14] = v[14];
 65:       x[15] = v[15];
 66:       v += 16;
 67:     }
 68:     row = *ajtmp++;
 69:     while (row < i) {
 70:       pc  = rtmp + 16 * row;
 71:       p1  = pc[0];
 72:       p2  = pc[1];
 73:       p3  = pc[2];
 74:       p4  = pc[3];
 75:       p5  = pc[4];
 76:       p6  = pc[5];
 77:       p7  = pc[6];
 78:       p8  = pc[7];
 79:       p9  = pc[8];
 80:       p10 = pc[9];
 81:       p11 = pc[10];
 82:       p12 = pc[11];
 83:       p13 = pc[12];
 84:       p14 = pc[13];
 85:       p15 = pc[14];
 86:       p16 = pc[15];
 87:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
 88:         pv    = ba + 16 * diag_offset[row];
 89:         pj    = bj + diag_offset[row] + 1;
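        /* pv points at the already-inverted diagonal block inv(U(row,row)); the
           products below form the multiplier pc := pc * pv for this 4x4 block */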
 90:         x1    = pv[0];
 91:         x2    = pv[1];
 92:         x3    = pv[2];
 93:         x4    = pv[3];
 94:         x5    = pv[4];
 95:         x6    = pv[5];
 96:         x7    = pv[6];
 97:         x8    = pv[7];
 98:         x9    = pv[8];
 99:         x10   = pv[9];
100:         x11   = pv[10];
101:         x12   = pv[11];
102:         x13   = pv[12];
103:         x14   = pv[13];
104:         x15   = pv[14];
105:         x16   = pv[15];
106:         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
107:         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
108:         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
109:         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;

111:         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
112:         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
113:         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
114:         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;

116:         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
117:         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
118:         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
119:         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;

121:         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
122:         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
123:         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
124:         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;

126:         nz = bi[row + 1] - diag_offset[row] - 1;
127:         pv += 16;
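        /* update the rest of the working row: for each block of U(row,:) to the
           right of the diagonal, rtmp(i,pj[j]) -= pc * pv, with pv stepping through U(row,:) */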
128:         for (j = 0; j < nz; j++) {
129:           x1  = pv[0];
130:           x2  = pv[1];
131:           x3  = pv[2];
132:           x4  = pv[3];
133:           x5  = pv[4];
134:           x6  = pv[5];
135:           x7  = pv[6];
136:           x8  = pv[7];
137:           x9  = pv[8];
138:           x10 = pv[9];
139:           x11 = pv[10];
140:           x12 = pv[11];
141:           x13 = pv[12];
142:           x14 = pv[13];
143:           x15 = pv[14];
144:           x16 = pv[15];
145:           x   = rtmp + 16 * pj[j];
146:           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
147:           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
148:           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
149:           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;

151:           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
152:           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
153:           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
154:           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;

156:           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
157:           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
158:           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
159:           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;

161:           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
162:           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
163:           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
164:           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;

166:           pv += 16;
167:         }
168:         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
169:       }
170:       row = *ajtmp++;
171:     }
172:     /* finished row so stick it into b->a */
173:     pv = ba + 16 * bi[i];
174:     pj = bj + bi[i];
175:     nz = bi[i + 1] - bi[i];
176:     for (j = 0; j < nz; j++) {
177:       x      = rtmp + 16 * pj[j];
178:       pv[0]  = x[0];
179:       pv[1]  = x[1];
180:       pv[2]  = x[2];
181:       pv[3]  = x[3];
182:       pv[4]  = x[4];
183:       pv[5]  = x[5];
184:       pv[6]  = x[6];
185:       pv[7]  = x[7];
186:       pv[8]  = x[8];
187:       pv[9]  = x[9];
188:       pv[10] = x[10];
189:       pv[11] = x[11];
190:       pv[12] = x[12];
191:       pv[13] = x[13];
192:       pv[14] = x[14];
193:       pv[15] = x[15];
194:       pv += 16;
195:     }
196:     /* invert diagonal block */
197:     w = ba + 16 * diag_offset[i];
198:     if (pivotinblocks) {
199:       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
200:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
201:     } else {
202:       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
203:     }
204:   }

206:   PetscCall(PetscFree(rtmp));
207:   PetscCall(ISRestoreIndices(isicol, &ic));
208:   PetscCall(ISRestoreIndices(isrow, &r));

210:   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
211:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
212:   C->assembled           = PETSC_TRUE;

214:   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
215:   PetscFunctionReturn(PETSC_SUCCESS);
216: }

218: /* MatLUFactorNumeric_SeqBAIJ_4 -
219:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
220:        PetscKernel_A_gets_A_times_B()
221:        PetscKernel_A_gets_A_minus_B_times_C()
222:        PetscKernel_A_gets_inverse_A()
223: */
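/*
   As used below (see the inline comments at the call sites), the bs = 4 kernels
   amount to dense 4x4 block operations:
     PetscKernel_A_gets_A_times_B_4(pc,pv,mwork)      ...  *pc = (*pc) * (*pv)
     PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv)  ...  *v  = (*v) - (*pc) * (*pv)
     PetscKernel_A_gets_inverse_A_4(pv,shift,...)     ...  *pv = inv(*pv), flagging zero pivots
*/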

225: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
226: {
227:   Mat             C = B;
228:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
229:   IS              isrow = b->row, isicol = b->icol;
230:   const PetscInt *r, *ic;
231:   PetscInt        i, j, k, nz, nzL, row;
232:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
233:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
234:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
235:   PetscInt        flg;
236:   PetscReal       shift;
237:   PetscBool       allowzeropivot, zeropivotdetected;

239:   PetscFunctionBegin;
240:   allowzeropivot = PetscNot(A->erroriffailure);
241:   PetscCall(ISGetIndices(isrow, &r));
242:   PetscCall(ISGetIndices(isicol, &ic));

244:   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
245:     shift = 0;
246:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
247:     shift = info->shiftamount;
248:   }

250:   /* generate work space needed by the factorization */
251:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
252:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

254:   for (i = 0; i < n; i++) {
255:     /* zero rtmp */
256:     /* L part */
257:     nz    = bi[i + 1] - bi[i];
258:     bjtmp = bj + bi[i];
259:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

261:     /* U part */
262:     nz    = bdiag[i] - bdiag[i + 1];
263:     bjtmp = bj + bdiag[i + 1] + 1;
264:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

266:     /* load in initial (unfactored row) */
267:     nz    = ai[r[i] + 1] - ai[r[i]];
268:     ajtmp = aj + ai[r[i]];
269:     v     = aa + bs2 * ai[r[i]];
270:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));

272:     /* elimination */
273:     bjtmp = bj + bi[i];
274:     nzL   = bi[i + 1] - bi[i];
275:     for (k = 0; k < nzL; k++) {
276:       row = bjtmp[k];
277:       pc  = rtmp + bs2 * row;
278:       for (flg = 0, j = 0; j < bs2; j++) {
279:         if (pc[j] != 0.0) {
280:           flg = 1;
281:           break;
282:         }
283:       }
284:       if (flg) {
285:         pv = b->a + bs2 * bdiag[row];
286:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
287:         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));

289:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
290:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
 291:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
292:         for (j = 0; j < nz; j++) {
293:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
294:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
295:           v = rtmp + bs2 * pj[j];
296:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
297:           pv += bs2;
298:         }
 299:         PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
300:       }
301:     }

303:     /* finished row so stick it into b->a */
304:     /* L part */
305:     pv = b->a + bs2 * bi[i];
306:     pj = b->j + bi[i];
307:     nz = bi[i + 1] - bi[i];
308:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

310:     /* Mark diagonal and invert diagonal for simpler triangular solves */
311:     pv = b->a + bs2 * bdiag[i];
312:     pj = b->j + bdiag[i];
313:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
314:     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
315:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

317:     /* U part */
318:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
319:     pj = b->j + bdiag[i + 1] + 1;
320:     nz = bdiag[i] - bdiag[i + 1] - 1;
321:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
322:   }

324:   PetscCall(PetscFree2(rtmp, mwork));
325:   PetscCall(ISRestoreIndices(isicol, &ic));
326:   PetscCall(ISRestoreIndices(isrow, &r));

328:   C->ops->solve          = MatSolve_SeqBAIJ_4;
329:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
330:   C->assembled           = PETSC_TRUE;

332:   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
333:   PetscFunctionReturn(PETSC_SUCCESS);
334: }

336: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
337: {
 338:   /*
 339:      Default version for when blocks are 4 by 4 using natural ordering
 340:   */
341:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
342:   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
343:   PetscInt    *ajtmpold, *ajtmp, nz, row;
344:   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
345:   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
346:   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
347:   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
348:   MatScalar    p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
349:   MatScalar    m13, m14, m15, m16;
350:   MatScalar   *ba = b->a, *aa = a->a;
351:   PetscBool    pivotinblocks = b->pivotinblocks;
352:   PetscReal    shift         = info->shiftamount;
353:   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

355:   PetscFunctionBegin;
356:   allowzeropivot = PetscNot(A->erroriffailure);
357:   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));

359:   for (i = 0; i < n; i++) {
360:     nz    = bi[i + 1] - bi[i];
361:     ajtmp = bj + bi[i];
362:     for (j = 0; j < nz; j++) {
363:       x    = rtmp + 16 * ajtmp[j];
364:       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
365:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
366:     }
367:     /* load in initial (unfactored row) */
368:     nz       = ai[i + 1] - ai[i];
369:     ajtmpold = aj + ai[i];
370:     v        = aa + 16 * ai[i];
371:     for (j = 0; j < nz; j++) {
372:       x     = rtmp + 16 * ajtmpold[j];
373:       x[0]  = v[0];
374:       x[1]  = v[1];
375:       x[2]  = v[2];
376:       x[3]  = v[3];
377:       x[4]  = v[4];
378:       x[5]  = v[5];
379:       x[6]  = v[6];
380:       x[7]  = v[7];
381:       x[8]  = v[8];
382:       x[9]  = v[9];
383:       x[10] = v[10];
384:       x[11] = v[11];
385:       x[12] = v[12];
386:       x[13] = v[13];
387:       x[14] = v[14];
388:       x[15] = v[15];
389:       v += 16;
390:     }
391:     row = *ajtmp++;
392:     while (row < i) {
393:       pc  = rtmp + 16 * row;
394:       p1  = pc[0];
395:       p2  = pc[1];
396:       p3  = pc[2];
397:       p4  = pc[3];
398:       p5  = pc[4];
399:       p6  = pc[5];
400:       p7  = pc[6];
401:       p8  = pc[7];
402:       p9  = pc[8];
403:       p10 = pc[9];
404:       p11 = pc[10];
405:       p12 = pc[11];
406:       p13 = pc[12];
407:       p14 = pc[13];
408:       p15 = pc[14];
409:       p16 = pc[15];
410:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
411:         pv    = ba + 16 * diag_offset[row];
412:         pj    = bj + diag_offset[row] + 1;
413:         x1    = pv[0];
414:         x2    = pv[1];
415:         x3    = pv[2];
416:         x4    = pv[3];
417:         x5    = pv[4];
418:         x6    = pv[5];
419:         x7    = pv[6];
420:         x8    = pv[7];
421:         x9    = pv[8];
422:         x10   = pv[9];
423:         x11   = pv[10];
424:         x12   = pv[11];
425:         x13   = pv[12];
426:         x14   = pv[13];
427:         x15   = pv[14];
428:         x16   = pv[15];
429:         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
430:         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
431:         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
432:         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;

434:         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
435:         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
436:         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
437:         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;

439:         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
440:         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
441:         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
442:         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;

444:         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
445:         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
446:         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
447:         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
448:         nz           = bi[row + 1] - diag_offset[row] - 1;
449:         pv += 16;
450:         for (j = 0; j < nz; j++) {
451:           x1  = pv[0];
452:           x2  = pv[1];
453:           x3  = pv[2];
454:           x4  = pv[3];
455:           x5  = pv[4];
456:           x6  = pv[5];
457:           x7  = pv[6];
458:           x8  = pv[7];
459:           x9  = pv[8];
460:           x10 = pv[9];
461:           x11 = pv[10];
462:           x12 = pv[11];
463:           x13 = pv[12];
464:           x14 = pv[13];
465:           x15 = pv[14];
466:           x16 = pv[15];
467:           x   = rtmp + 16 * pj[j];
468:           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
469:           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
470:           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
471:           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;

473:           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
474:           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
475:           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
476:           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;

478:           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
479:           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
480:           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
481:           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;

483:           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
484:           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
485:           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
486:           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;

488:           pv += 16;
489:         }
490:         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
491:       }
492:       row = *ajtmp++;
493:     }
494:     /* finished row so stick it into b->a */
495:     pv = ba + 16 * bi[i];
496:     pj = bj + bi[i];
497:     nz = bi[i + 1] - bi[i];
498:     for (j = 0; j < nz; j++) {
499:       x      = rtmp + 16 * pj[j];
500:       pv[0]  = x[0];
501:       pv[1]  = x[1];
502:       pv[2]  = x[2];
503:       pv[3]  = x[3];
504:       pv[4]  = x[4];
505:       pv[5]  = x[5];
506:       pv[6]  = x[6];
507:       pv[7]  = x[7];
508:       pv[8]  = x[8];
509:       pv[9]  = x[9];
510:       pv[10] = x[10];
511:       pv[11] = x[11];
512:       pv[12] = x[12];
513:       pv[13] = x[13];
514:       pv[14] = x[14];
515:       pv[15] = x[15];
516:       pv += 16;
517:     }
518:     /* invert diagonal block */
519:     w = ba + 16 * diag_offset[i];
520:     if (pivotinblocks) {
521:       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
522:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
523:     } else {
524:       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
525:     }
526:   }

528:   PetscCall(PetscFree(rtmp));

530:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
531:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
532:   C->assembled           = PETSC_TRUE;

534:   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
535:   PetscFunctionReturn(PETSC_SUCCESS);
536: }

538: /*
539:   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
540:     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
541: */
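/*
   Same algorithm as MatLUFactorNumeric_SeqBAIJ_4() above, but for the natural
   ordering: there are no isrow/isicol index sets, so row i of A is loaded
   directly through ai/aj/aa without the r[] and ic[] permutations.
*/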
542: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
543: {
544:   Mat             C = B;
545:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
546:   PetscInt        i, j, k, nz, nzL, row;
547:   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
548:   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
549:   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
550:   PetscInt        flg;
551:   PetscReal       shift;
552:   PetscBool       allowzeropivot, zeropivotdetected;

554:   PetscFunctionBegin;
555:   allowzeropivot = PetscNot(A->erroriffailure);

557:   /* generate work space needed by the factorization */
558:   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
559:   PetscCall(PetscArrayzero(rtmp, bs2 * n));

561:   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
562:     shift = 0;
563:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
564:     shift = info->shiftamount;
565:   }

567:   for (i = 0; i < n; i++) {
568:     /* zero rtmp */
569:     /* L part */
570:     nz    = bi[i + 1] - bi[i];
571:     bjtmp = bj + bi[i];
572:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

574:     /* U part */
575:     nz    = bdiag[i] - bdiag[i + 1];
576:     bjtmp = bj + bdiag[i + 1] + 1;
577:     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

579:     /* load in initial (unfactored row) */
580:     nz    = ai[i + 1] - ai[i];
581:     ajtmp = aj + ai[i];
582:     v     = aa + bs2 * ai[i];
583:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

585:     /* elimination */
586:     bjtmp = bj + bi[i];
587:     nzL   = bi[i + 1] - bi[i];
588:     for (k = 0; k < nzL; k++) {
589:       row = bjtmp[k];
590:       pc  = rtmp + bs2 * row;
591:       for (flg = 0, j = 0; j < bs2; j++) {
592:         if (pc[j] != 0.0) {
593:           flg = 1;
594:           break;
595:         }
596:       }
597:       if (flg) {
598:         pv = b->a + bs2 * bdiag[row];
599:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
600:         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));

602:         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
603:         pv = b->a + bs2 * (bdiag[row + 1] + 1);
 604:         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
605:         for (j = 0; j < nz; j++) {
606:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
607:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
608:           v = rtmp + bs2 * pj[j];
609:           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
610:           pv += bs2;
611:         }
 612:         PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
613:       }
614:     }

616:     /* finished row so stick it into b->a */
617:     /* L part */
618:     pv = b->a + bs2 * bi[i];
619:     pj = b->j + bi[i];
620:     nz = bi[i + 1] - bi[i];
621:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

623:     /* Mark diagonal and invert diagonal for simpler triangular solves */
624:     pv = b->a + bs2 * bdiag[i];
625:     pj = b->j + bdiag[i];
626:     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
627:     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
628:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

630:     /* U part */
631:     pv = b->a + bs2 * (bdiag[i + 1] + 1);
632:     pj = b->j + bdiag[i + 1] + 1;
633:     nz = bdiag[i] - bdiag[i + 1] - 1;
634:     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
635:   }
636:   PetscCall(PetscFree2(rtmp, mwork));

638:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
639:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
640:   C->assembled           = PETSC_TRUE;

642:   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
643:   PetscFunctionReturn(PETSC_SUCCESS);
644: }

646: #if defined(PETSC_HAVE_SSE)

648:   #include PETSC_HAVE_SSE

 650: /* SSE version for when blocks are 4 by 4 using natural ordering */
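/* This kernel assumes single-precision MatScalar (the packed SSE_*_PS operations
   act on four floats at a time, so a 4x4 block occupies four XMM registers) and
   requires 16-byte alignment of aa, ba and rtmp, which is checked below; the
   multiplier block is kept columnwise in XMM0-XMM3 during the update. */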
 651: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C, Mat A, const MatFactorInfo *info)
652: {
653:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
654:   int          i, j, n = a->mbs;
655:   int         *bj = b->j, *bjtmp, *pj;
656:   int          row;
657:   int         *ajtmpold, nz, *bi = b->i;
658:   int         *diag_offset = b->diag, *ai = a->i, *aj = a->j;
659:   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
660:   MatScalar   *ba = b->a, *aa = a->a;
661:   int          nonzero       = 0;
662:   PetscBool    pivotinblocks = b->pivotinblocks;
663:   PetscReal    shift         = info->shiftamount;
664:   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

666:   PetscFunctionBegin;
667:   allowzeropivot = PetscNot(A->erroriffailure);
668:   SSE_SCOPE_BEGIN;

670:   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
671:   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
672:   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
673:   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
674:   /*    if ((unsigned long)bj==(unsigned long)aj) { */
675:   /*      colscale = 4; */
676:   /*    } */
677:   for (i = 0; i < n; i++) {
678:     nz    = bi[i + 1] - bi[i];
679:     bjtmp = bj + bi[i];
680:     /* zero out the 4x4 block accumulators */
681:     /* zero out one register */
682:     XOR_PS(XMM7, XMM7);
683:     for (j = 0; j < nz; j++) {
684:       x = rtmp + 16 * bjtmp[j];
685:       /*        x = rtmp+4*bjtmp[j]; */
686:       SSE_INLINE_BEGIN_1(x)
687:       /* Copy zero register to memory locations */
688:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
689:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
690:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
691:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
692:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
693:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
694:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
695:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
696:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
697:       SSE_INLINE_END_1;
698:     }
699:     /* load in initial (unfactored row) */
700:     nz       = ai[i + 1] - ai[i];
701:     ajtmpold = aj + ai[i];
702:     v        = aa + 16 * ai[i];
703:     for (j = 0; j < nz; j++) {
704:       x = rtmp + 16 * ajtmpold[j];
705:       /*        x = rtmp+colscale*ajtmpold[j]; */
706:       /* Copy v block into x block */
707:       SSE_INLINE_BEGIN_2(v, x)
708:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
709:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
710:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

712:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
713:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

715:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
716:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

718:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
719:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

721:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
722:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

724:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
725:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

727:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
728:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

730:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
731:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
732:       SSE_INLINE_END_2;

734:       v += 16;
735:     }
736:     /*      row = (*bjtmp++)/4; */
737:     row = *bjtmp++;
738:     while (row < i) {
739:       pc = rtmp + 16 * row;
740:       SSE_INLINE_BEGIN_1(pc)
741:       /* Load block from lower triangle */
742:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
743:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
744:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

746:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
747:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

749:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
750:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

752:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
753:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

755:       /* Compare block to zero block */

757:       SSE_COPY_PS(XMM4, XMM7)
758:       SSE_CMPNEQ_PS(XMM4, XMM0)

760:       SSE_COPY_PS(XMM5, XMM7)
761:       SSE_CMPNEQ_PS(XMM5, XMM1)

763:       SSE_COPY_PS(XMM6, XMM7)
764:       SSE_CMPNEQ_PS(XMM6, XMM2)

766:       SSE_CMPNEQ_PS(XMM7, XMM3)

768:       /* Reduce the comparisons to one SSE register */
769:       SSE_OR_PS(XMM6, XMM7)
770:       SSE_OR_PS(XMM5, XMM4)
771:       SSE_OR_PS(XMM5, XMM6)
772:       SSE_INLINE_END_1;

774:       /* Reduce the one SSE register to an integer register for branching */
775:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
776:       MOVEMASK(nonzero, XMM5);

778:       /* If block is nonzero ... */
779:       if (nonzero) {
780:         pv = ba + 16 * diag_offset[row];
781:         PREFETCH_L1(&pv[16]);
782:         pj = bj + diag_offset[row] + 1;

784:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
785:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
786:         /* but the diagonal was inverted already */
787:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

789:         SSE_INLINE_BEGIN_2(pv, pc)
790:         /* Column 0, product is accumulated in XMM4 */
791:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
792:         SSE_SHUFFLE(XMM4, XMM4, 0x00)
793:         SSE_MULT_PS(XMM4, XMM0)

795:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
796:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
797:         SSE_MULT_PS(XMM5, XMM1)
798:         SSE_ADD_PS(XMM4, XMM5)

800:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
801:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
802:         SSE_MULT_PS(XMM6, XMM2)
803:         SSE_ADD_PS(XMM4, XMM6)

805:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
806:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
807:         SSE_MULT_PS(XMM7, XMM3)
808:         SSE_ADD_PS(XMM4, XMM7)

810:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
811:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

813:         /* Column 1, product is accumulated in XMM5 */
814:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
815:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
816:         SSE_MULT_PS(XMM5, XMM0)

818:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
819:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
820:         SSE_MULT_PS(XMM6, XMM1)
821:         SSE_ADD_PS(XMM5, XMM6)

823:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
824:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
825:         SSE_MULT_PS(XMM7, XMM2)
826:         SSE_ADD_PS(XMM5, XMM7)

828:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
829:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
830:         SSE_MULT_PS(XMM6, XMM3)
831:         SSE_ADD_PS(XMM5, XMM6)

833:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
834:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

836:         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

838:         /* Column 2, product is accumulated in XMM6 */
839:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
840:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
841:         SSE_MULT_PS(XMM6, XMM0)

843:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
844:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
845:         SSE_MULT_PS(XMM7, XMM1)
846:         SSE_ADD_PS(XMM6, XMM7)

848:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
849:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
850:         SSE_MULT_PS(XMM7, XMM2)
851:         SSE_ADD_PS(XMM6, XMM7)

853:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
854:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
855:         SSE_MULT_PS(XMM7, XMM3)
856:         SSE_ADD_PS(XMM6, XMM7)

858:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
859:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

861:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
862:         /* Column 3, product is accumulated in XMM0 */
863:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
864:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
865:         SSE_MULT_PS(XMM0, XMM7)

867:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
868:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
869:         SSE_MULT_PS(XMM1, XMM7)
870:         SSE_ADD_PS(XMM0, XMM1)

872:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
873:         SSE_SHUFFLE(XMM1, XMM1, 0x00)
874:         SSE_MULT_PS(XMM1, XMM2)
875:         SSE_ADD_PS(XMM0, XMM1)

877:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
878:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
879:         SSE_MULT_PS(XMM3, XMM7)
880:         SSE_ADD_PS(XMM0, XMM3)

882:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
883:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

885:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
886:         /* This is code to be maintained and read by humans after all. */
887:         /* Copy Multiplier Col 3 into XMM3 */
888:         SSE_COPY_PS(XMM3, XMM0)
889:         /* Copy Multiplier Col 2 into XMM2 */
890:         SSE_COPY_PS(XMM2, XMM6)
891:         /* Copy Multiplier Col 1 into XMM1 */
892:         SSE_COPY_PS(XMM1, XMM5)
893:         /* Copy Multiplier Col 0 into XMM0 */
894:         SSE_COPY_PS(XMM0, XMM4)
895:         SSE_INLINE_END_2;

897:         /* Update the row: */
898:         nz = bi[row + 1] - diag_offset[row] - 1;
899:         pv += 16;
900:         for (j = 0; j < nz; j++) {
901:           PREFETCH_L1(&pv[16]);
902:           x = rtmp + 16 * pj[j];
903:           /*            x = rtmp + 4*pj[j]; */

905:           /* X:=X-M*PV, One column at a time */
906:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
907:           SSE_INLINE_BEGIN_2(x, pv)
908:           /* Load First Column of X*/
909:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
910:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

912:           /* Matrix-Vector Product: */
913:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
914:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
915:           SSE_MULT_PS(XMM5, XMM0)
916:           SSE_SUB_PS(XMM4, XMM5)

918:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
919:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
920:           SSE_MULT_PS(XMM6, XMM1)
921:           SSE_SUB_PS(XMM4, XMM6)

923:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
924:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
925:           SSE_MULT_PS(XMM7, XMM2)
926:           SSE_SUB_PS(XMM4, XMM7)

928:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
929:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
930:           SSE_MULT_PS(XMM5, XMM3)
931:           SSE_SUB_PS(XMM4, XMM5)

933:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
934:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

936:           /* Second Column */
937:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
938:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

940:           /* Matrix-Vector Product: */
941:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
942:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
943:           SSE_MULT_PS(XMM6, XMM0)
944:           SSE_SUB_PS(XMM5, XMM6)

946:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
947:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
948:           SSE_MULT_PS(XMM7, XMM1)
949:           SSE_SUB_PS(XMM5, XMM7)

951:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
952:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
953:           SSE_MULT_PS(XMM4, XMM2)
954:           SSE_SUB_PS(XMM5, XMM4)

956:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
957:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
958:           SSE_MULT_PS(XMM6, XMM3)
959:           SSE_SUB_PS(XMM5, XMM6)

961:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
962:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

964:           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

966:           /* Third Column */
967:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
968:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

970:           /* Matrix-Vector Product: */
971:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
972:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
973:           SSE_MULT_PS(XMM7, XMM0)
974:           SSE_SUB_PS(XMM6, XMM7)

976:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
977:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
978:           SSE_MULT_PS(XMM4, XMM1)
979:           SSE_SUB_PS(XMM6, XMM4)

981:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
982:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
983:           SSE_MULT_PS(XMM5, XMM2)
984:           SSE_SUB_PS(XMM6, XMM5)

986:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
987:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
988:           SSE_MULT_PS(XMM7, XMM3)
989:           SSE_SUB_PS(XMM6, XMM7)

991:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
992:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

994:           /* Fourth Column */
995:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
996:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

998:           /* Matrix-Vector Product: */
999:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1000:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1001:           SSE_MULT_PS(XMM5, XMM0)
1002:           SSE_SUB_PS(XMM4, XMM5)

1004:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1005:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1006:           SSE_MULT_PS(XMM6, XMM1)
1007:           SSE_SUB_PS(XMM4, XMM6)

1009:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1010:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1011:           SSE_MULT_PS(XMM7, XMM2)
1012:           SSE_SUB_PS(XMM4, XMM7)

1014:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1015:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1016:           SSE_MULT_PS(XMM5, XMM3)
1017:           SSE_SUB_PS(XMM4, XMM5)

1019:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1020:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1021:           SSE_INLINE_END_2;
1022:           pv += 16;
1023:         }
1024:         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1025:       }
1026:       row = *bjtmp++;
1027:       /*        row = (*bjtmp++)/4; */
1028:     }
1029:     /* finished row so stick it into b->a */
1030:     pv = ba + 16 * bi[i];
1031:     pj = bj + bi[i];
1032:     nz = bi[i + 1] - bi[i];

1034:     /* Copy x block back into pv block */
1035:     for (j = 0; j < nz; j++) {
1036:       x = rtmp + 16 * pj[j];
1037:       /*        x  = rtmp+4*pj[j]; */

1039:       SSE_INLINE_BEGIN_2(x, pv)
1040:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1041:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1042:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

1044:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1045:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

1047:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1048:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

1050:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1051:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

1053:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1054:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

1056:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1057:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

1059:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1060:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

1062:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1063:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1064:       SSE_INLINE_END_2;
1065:       pv += 16;
1066:     }
1067:     /* invert diagonal block */
1068:     w = ba + 16 * diag_offset[i];
1069:     if (pivotinblocks) {
1070:       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1071:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1072:     } else {
1073:       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1074:     }
1075:     /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1076:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1077:   }

1079:   PetscCall(PetscFree(rtmp));

1081:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1082:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1083:   C->assembled           = PETSC_TRUE;

1085:   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
1086:   /* Flop Count from inverting diagonal blocks */
1087:   SSE_SCOPE_END;
1088:   PetscFunctionReturn(PETSC_SUCCESS);
1089: }
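/* The _usj variants below reinterpret the column-index arrays as unsigned short
   (presumably "usj" = unsigned-short j) to halve the index memory traffic; each
   index is cast back to an unsigned int before being used as a block offset. */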

1091: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C, const MatFactorInfo *info)
1092: {
1093:   Mat             A = C;
1094:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1095:   int             i, j, n = a->mbs;
1096:   unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1097:   unsigned short *aj = (unsigned short *)(a->j), *ajtmp;
1098:   unsigned int    row;
1099:   int             nz, *bi = b->i;
1100:   int            *diag_offset = b->diag, *ai = a->i;
1101:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1102:   MatScalar      *ba = b->a, *aa = a->a;
1103:   int             nonzero       = 0;
1104:   PetscBool       pivotinblocks = b->pivotinblocks;
1105:   PetscReal       shift         = info->shiftamount;
1106:   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

1108:   PetscFunctionBegin;
1109:   allowzeropivot = PetscNot(A->erroriffailure);
1110:   SSE_SCOPE_BEGIN;

1112:   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
1113:   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
1114:   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1115:   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1116:   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1117:   /*      colscale = 4; */
1118:   /*    } */

1120:   for (i = 0; i < n; i++) {
1121:     nz    = bi[i + 1] - bi[i];
1122:     bjtmp = bj + bi[i];
1123:     /* zero out the 4x4 block accumulators */
1124:     /* zero out one register */
1125:     XOR_PS(XMM7, XMM7);
1126:     for (j = 0; j < nz; j++) {
1127:       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1128:       /*        x = rtmp+4*bjtmp[j]; */
1129:       SSE_INLINE_BEGIN_1(x)
1130:       /* Copy zero register to memory locations */
1131:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1132:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1133:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1134:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1135:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1136:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1137:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1138:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1139:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1140:       SSE_INLINE_END_1;
1141:     }
1142:     /* load in initial (unfactored row) */
1143:     nz    = ai[i + 1] - ai[i];
1144:     ajtmp = aj + ai[i];
1145:     v     = aa + 16 * ai[i];
1146:     for (j = 0; j < nz; j++) {
1147:       x = rtmp + 16 * ((unsigned int)ajtmp[j]);
1148:       /*        x = rtmp+colscale*ajtmp[j]; */
1149:       /* Copy v block into x block */
1150:       SSE_INLINE_BEGIN_2(v, x)
1151:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1152:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1153:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

1155:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1156:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

1158:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1159:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

1161:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1162:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

1164:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1165:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

1167:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1168:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

1170:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1171:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

1173:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1174:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1175:       SSE_INLINE_END_2;

1177:       v += 16;
1178:     }
1179:     /*      row = (*bjtmp++)/4; */
1180:     row = (unsigned int)(*bjtmp++);
1181:     while (row < i) {
1182:       pc = rtmp + 16 * row;
1183:       SSE_INLINE_BEGIN_1(pc)
1184:       /* Load block from lower triangle */
1185:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1186:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1187:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

1189:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1190:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

1192:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1193:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

1195:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1196:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

1198:       /* Compare block to zero block */

1200:       SSE_COPY_PS(XMM4, XMM7)
1201:       SSE_CMPNEQ_PS(XMM4, XMM0)

1203:       SSE_COPY_PS(XMM5, XMM7)
1204:       SSE_CMPNEQ_PS(XMM5, XMM1)

1206:       SSE_COPY_PS(XMM6, XMM7)
1207:       SSE_CMPNEQ_PS(XMM6, XMM2)

1209:       SSE_CMPNEQ_PS(XMM7, XMM3)

1211:       /* Reduce the comparisons to one SSE register */
1212:       SSE_OR_PS(XMM6, XMM7)
1213:       SSE_OR_PS(XMM5, XMM4)
1214:       SSE_OR_PS(XMM5, XMM6)
1215:       SSE_INLINE_END_1;

1217:       /* Reduce the one SSE register to an integer register for branching */
1218:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1219:       MOVEMASK(nonzero, XMM5);

1221:       /* If block is nonzero ... */
1222:       if (nonzero) {
1223:         pv = ba + 16 * diag_offset[row];
1224:         PREFETCH_L1(&pv[16]);
1225:         pj = bj + diag_offset[row] + 1;

1227:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1228:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1229:         /* but the diagonal was inverted already */
1230:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1232:         SSE_INLINE_BEGIN_2(pv, pc)
1233:         /* Column 0, product is accumulated in XMM4 */
1234:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1235:         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1236:         SSE_MULT_PS(XMM4, XMM0)

1238:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1239:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1240:         SSE_MULT_PS(XMM5, XMM1)
1241:         SSE_ADD_PS(XMM4, XMM5)

1243:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1244:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1245:         SSE_MULT_PS(XMM6, XMM2)
1246:         SSE_ADD_PS(XMM4, XMM6)

1248:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1249:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1250:         SSE_MULT_PS(XMM7, XMM3)
1251:         SSE_ADD_PS(XMM4, XMM7)

1253:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1254:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

1256:         /* Column 1, product is accumulated in XMM5 */
1257:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1258:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1259:         SSE_MULT_PS(XMM5, XMM0)

1261:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1262:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1263:         SSE_MULT_PS(XMM6, XMM1)
1264:         SSE_ADD_PS(XMM5, XMM6)

1266:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1267:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1268:         SSE_MULT_PS(XMM7, XMM2)
1269:         SSE_ADD_PS(XMM5, XMM7)

1271:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1272:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1273:         SSE_MULT_PS(XMM6, XMM3)
1274:         SSE_ADD_PS(XMM5, XMM6)

1276:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1277:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

1279:         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

1281:         /* Column 2, product is accumulated in XMM6 */
1282:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1283:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1284:         SSE_MULT_PS(XMM6, XMM0)

1286:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1287:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1288:         SSE_MULT_PS(XMM7, XMM1)
1289:         SSE_ADD_PS(XMM6, XMM7)

1291:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1292:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1293:         SSE_MULT_PS(XMM7, XMM2)
1294:         SSE_ADD_PS(XMM6, XMM7)

1296:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1297:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1298:         SSE_MULT_PS(XMM7, XMM3)
1299:         SSE_ADD_PS(XMM6, XMM7)

1301:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1302:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

1304:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1305:         /* Column 3, product is accumulated in XMM0 */
1306:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1307:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1308:         SSE_MULT_PS(XMM0, XMM7)

1310:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1311:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1312:         SSE_MULT_PS(XMM1, XMM7)
1313:         SSE_ADD_PS(XMM0, XMM1)

1315:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1316:         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1317:         SSE_MULT_PS(XMM1, XMM2)
1318:         SSE_ADD_PS(XMM0, XMM1)

1320:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1321:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1322:         SSE_MULT_PS(XMM3, XMM7)
1323:         SSE_ADD_PS(XMM0, XMM3)

1325:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1326:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

1328:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1329:         /* This is code to be maintained and read by humans after all. */
1330:         /* Copy Multiplier Col 3 into XMM3 */
1331:         SSE_COPY_PS(XMM3, XMM0)
1332:         /* Copy Multiplier Col 2 into XMM2 */
1333:         SSE_COPY_PS(XMM2, XMM6)
1334:         /* Copy Multiplier Col 1 into XMM1 */
1335:         SSE_COPY_PS(XMM1, XMM5)
1336:         /* Copy Multiplier Col 0 into XMM0 */
1337:         SSE_COPY_PS(XMM0, XMM4)
1338:         SSE_INLINE_END_2;

1340:         /* Update the row: */
1341:         nz = bi[row + 1] - diag_offset[row] - 1;
1342:         pv += 16;
1343:         for (j = 0; j < nz; j++) {
1344:           PREFETCH_L1(&pv[16]);
1345:           x = rtmp + 16 * ((unsigned int)pj[j]);
1346:           /*            x = rtmp + 4*pj[j]; */

1348:           /* X:=X-M*PV, One column at a time */
1349:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1350:           SSE_INLINE_BEGIN_2(x, pv)
1351:           /* Load First Column of X*/
1352:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1353:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

1355:           /* Matrix-Vector Product: */
1356:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1357:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1358:           SSE_MULT_PS(XMM5, XMM0)
1359:           SSE_SUB_PS(XMM4, XMM5)

1361:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1362:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1363:           SSE_MULT_PS(XMM6, XMM1)
1364:           SSE_SUB_PS(XMM4, XMM6)

1366:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1367:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1368:           SSE_MULT_PS(XMM7, XMM2)
1369:           SSE_SUB_PS(XMM4, XMM7)

1371:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1372:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1373:           SSE_MULT_PS(XMM5, XMM3)
1374:           SSE_SUB_PS(XMM4, XMM5)

1376:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1377:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

1379:           /* Second Column */
1380:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1381:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

1383:           /* Matrix-Vector Product: */
1384:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1385:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1386:           SSE_MULT_PS(XMM6, XMM0)
1387:           SSE_SUB_PS(XMM5, XMM6)

1389:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1390:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1391:           SSE_MULT_PS(XMM7, XMM1)
1392:           SSE_SUB_PS(XMM5, XMM7)

1394:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1395:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1396:           SSE_MULT_PS(XMM4, XMM2)
1397:           SSE_SUB_PS(XMM5, XMM4)

1399:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1400:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1401:           SSE_MULT_PS(XMM6, XMM3)
1402:           SSE_SUB_PS(XMM5, XMM6)

1404:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1405:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

1407:           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

1409:           /* Third Column */
1410:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1411:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

1413:           /* Matrix-Vector Product: */
1414:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1415:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1416:           SSE_MULT_PS(XMM7, XMM0)
1417:           SSE_SUB_PS(XMM6, XMM7)

1419:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1420:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1421:           SSE_MULT_PS(XMM4, XMM1)
1422:           SSE_SUB_PS(XMM6, XMM4)

1424:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1425:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1426:           SSE_MULT_PS(XMM5, XMM2)
1427:           SSE_SUB_PS(XMM6, XMM5)

1429:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1430:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1431:           SSE_MULT_PS(XMM7, XMM3)
1432:           SSE_SUB_PS(XMM6, XMM7)

1434:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1435:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

1437:           /* Fourth Column */
1438:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1439:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

1441:           /* Matrix-Vector Product: */
1442:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1443:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1444:           SSE_MULT_PS(XMM5, XMM0)
1445:           SSE_SUB_PS(XMM4, XMM5)

1447:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1448:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1449:           SSE_MULT_PS(XMM6, XMM1)
1450:           SSE_SUB_PS(XMM4, XMM6)

1452:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1453:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1454:           SSE_MULT_PS(XMM7, XMM2)
1455:           SSE_SUB_PS(XMM4, XMM7)

1457:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1458:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1459:           SSE_MULT_PS(XMM5, XMM3)
1460:           SSE_SUB_PS(XMM4, XMM5)

1462:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1463:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1464:           SSE_INLINE_END_2;
1465:           pv += 16;
1466:         }
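              /* Flop accounting for this elimination step: forming the 4x4 multiplier above is a
                 4x4 * 4x4 product (64 multiplies + 48 adds = 112 flops), and each of the nz trailing
                 blocks receives a multiply-subtract update (64 multiplies + 64 subtracts = 128 flops). */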
1467:         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1468:       }
1469:       row = (unsigned int)(*bjtmp++);
1470:       /*        row = (*bjtmp++)/4; */
1471:       /*        bjtmp++; */
1472:     }
1473:     /* finished row so stick it into b->a */
1474:     pv = ba + 16 * bi[i];
1475:     pj = bj + bi[i];
1476:     nz = bi[i + 1] - bi[i];

1478:     /* Copy x block back into pv block */
1479:     for (j = 0; j < nz; j++) {
1480:       x = rtmp + 16 * ((unsigned int)pj[j]);
1481:       /*        x  = rtmp+4*pj[j]; */

1483:       SSE_INLINE_BEGIN_2(x, pv)
1484:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1485:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1486:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

1488:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1489:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

1491:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1492:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

1494:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1495:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

1497:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1498:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

1500:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1501:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

1503:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1504:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

1506:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1507:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1508:       SSE_INLINE_END_2;
1509:       pv += 16;
1510:     }
1511:     /* invert diagonal block */
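           /* b->pivotinblocks chooses the 4x4 inversion kernel that pivots inside the block (and can
              report a zero pivot); the _nopivot variant assumes the block can be inverted without
              row interchanges. */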
1512:     w = ba + 16 * diag_offset[i];
1513:     if (pivotinblocks) {
1514:       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1515:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1516:     } else {
1517:       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1518:     }
1519:     /* Note: the 4x4 inversion uses Cramer's rule, so the flop count below may be unfairly high or low */
1520:   }

1522:   PetscCall(PetscFree(rtmp));

1524:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1525:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1526:   C->assembled           = PETSC_TRUE;

1528:   PetscCall(PetscLogFlops(1.333333333333 * 4.0 * 16.0 * b->mbs)); /* bs = 4, bs2 = 16 */
1529:   /* Flop Count from inverting diagonal blocks */
1530:   SSE_SCOPE_END;
1531:   PetscFunctionReturn(PETSC_SUCCESS);
1532: }
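
/* MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj: "usj" = unsigned short j.  The factor's
   block-column indices b->j are read as unsigned short (see the cast below), which assumes every
   block-column index of the factor fits in 16 bits; when b->j aliases a->j the routine hands off
   to the _usj_Inplace variant. */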

1534: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info)
1535: {
1536:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1537:   int             i, j, n = a->mbs;
1538:   unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1539:   unsigned int    row;
1540:   int            *ajtmpold, nz, *bi = b->i;
1541:   int            *diag_offset = b->diag, *ai = a->i, *aj = a->j;
1542:   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1543:   MatScalar      *ba = b->a, *aa = a->a;
1544:   int             nonzero       = 0;
1545:   PetscBool       pivotinblocks = b->pivotinblocks;
1546:   PetscReal       shift         = info->shiftamount;
1547:   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

1549:   PetscFunctionBegin;
1550:   allowzeropivot = PetscNot(A->erroriffailure);
1551:   SSE_SCOPE_BEGIN;

1553:   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
1554:   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
1555:   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1556:   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
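        /* The SSE blocks below rely on 16-byte alignment of aa, ba, and rtmp, so the checks above
           fail early with a clear message rather than faulting inside the inline SSE code. */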
1557:   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1558:   /*      colscale = 4; */
1559:   /*    } */
1560:   if ((unsigned long)bj == (unsigned long)aj) { PetscCall(PetscFree(rtmp)); PetscCall(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); PetscFunctionReturn(PETSC_SUCCESS); }

1562:   for (i = 0; i < n; i++) {
1563:     nz    = bi[i + 1] - bi[i];
1564:     bjtmp = bj + bi[i];
1565:     /* zero out the 4x4 block accumulators */
1566:     /* zero out one register */
1567:     XOR_PS(XMM7, XMM7);
1568:     for (j = 0; j < nz; j++) {
1569:       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1570:       /*        x = rtmp+4*bjtmp[j]; */
1571:       SSE_INLINE_BEGIN_1(x)
1572:       /* Copy zero register to memory locations */
1573:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1574:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1575:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1576:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1577:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1578:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1579:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1580:       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1581:       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1582:       SSE_INLINE_END_1;
1583:     }
1584:     /* load in initial (unfactored row) */
1585:     nz       = ai[i + 1] - ai[i];
1586:     ajtmpold = aj + ai[i];
1587:     v        = aa + 16 * ai[i];
1588:     for (j = 0; j < nz; j++) {
1589:       x = rtmp + 16 * ajtmpold[j];
1590:       /*        x = rtmp+colscale*ajtmpold[j]; */
1591:       /* Copy v block into x block */
1592:       SSE_INLINE_BEGIN_2(v, x)
1593:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1594:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1595:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

1597:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1598:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

1600:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1601:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

1603:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1604:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

1606:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1607:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

1609:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1610:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

1612:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1613:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

1615:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1616:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1617:       SSE_INLINE_END_2;

1619:       v += 16;
1620:     }
1621:     /*      row = (*bjtmp++)/4; */
1622:     row = (unsigned int)(*bjtmp++);
1623:     while (row < i) {
1624:       pc = rtmp + 16 * row;
1625:       SSE_INLINE_BEGIN_1(pc)
1626:       /* Load block from lower triangle */
1627:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1628:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1629:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

1631:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1632:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

1634:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1635:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

1637:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1638:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

1640:       /* Compare block to zero block */

1642:       SSE_COPY_PS(XMM4, XMM7)
1643:       SSE_CMPNEQ_PS(XMM4, XMM0)

1645:       SSE_COPY_PS(XMM5, XMM7)
1646:       SSE_CMPNEQ_PS(XMM5, XMM1)

1648:       SSE_COPY_PS(XMM6, XMM7)
1649:       SSE_CMPNEQ_PS(XMM6, XMM2)

1651:       SSE_CMPNEQ_PS(XMM7, XMM3)

1653:       /* Reduce the comparisons to one SSE register */
1654:       SSE_OR_PS(XMM6, XMM7)
1655:       SSE_OR_PS(XMM5, XMM4)
1656:       SSE_OR_PS(XMM5, XMM6)
1657:       SSE_INLINE_END_1;

1659:       /* Reduce the one SSE register to an integer register for branching */
1660:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1661:       MOVEMASK(nonzero, XMM5);
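            /* nonzero collects one bit per SIMD lane of the OR-reduced comparison; it is zero only
               if all 16 entries of the block are zero, in which case the elimination step below is
               skipped for this stored-but-numerically-zero block. */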

1663:       /* If block is nonzero ... */
1664:       if (nonzero) {
1665:         pv = ba + 16 * diag_offset[row];
1666:         PREFETCH_L1(&pv[16]);
1667:         pj = bj + diag_offset[row] + 1;

1669:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1670:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1671:         /* but the diagonal was inverted already */
1672:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
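              /* Equivalent scalar sketch (pc[] holds L_ij columnwise in XMM0-XMM3, pv[] holds the
                 already-inverted diagonal block): each new column of the multiplier is a linear
                 combination of the four old columns of pc, weighted by the entries of pv:
                   for (c = 0; c < 4; c++)
                     for (r = 0; r < 4; r++)
                       pc[4*c + r] = old0[r]*pv[4*c + 0] + old1[r]*pv[4*c + 1]
                                   + old2[r]*pv[4*c + 2] + old3[r]*pv[4*c + 3];
                 where old0..old3 are copies of the original columns of pc (the registers play that
                 role here). */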

1674:         SSE_INLINE_BEGIN_2(pv, pc)
1675:         /* Column 0, product is accumulated in XMM4 */
1676:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1677:         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1678:         SSE_MULT_PS(XMM4, XMM0)

1680:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1681:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1682:         SSE_MULT_PS(XMM5, XMM1)
1683:         SSE_ADD_PS(XMM4, XMM5)

1685:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1686:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1687:         SSE_MULT_PS(XMM6, XMM2)
1688:         SSE_ADD_PS(XMM4, XMM6)

1690:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1691:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1692:         SSE_MULT_PS(XMM7, XMM3)
1693:         SSE_ADD_PS(XMM4, XMM7)

1695:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1696:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

1698:         /* Column 1, product is accumulated in XMM5 */
1699:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1700:         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1701:         SSE_MULT_PS(XMM5, XMM0)

1703:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1704:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1705:         SSE_MULT_PS(XMM6, XMM1)
1706:         SSE_ADD_PS(XMM5, XMM6)

1708:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1709:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1710:         SSE_MULT_PS(XMM7, XMM2)
1711:         SSE_ADD_PS(XMM5, XMM7)

1713:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1714:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1715:         SSE_MULT_PS(XMM6, XMM3)
1716:         SSE_ADD_PS(XMM5, XMM6)

1718:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1719:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

1721:         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

1723:         /* Column 2, product is accumulated in XMM6 */
1724:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1725:         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1726:         SSE_MULT_PS(XMM6, XMM0)

1728:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1729:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1730:         SSE_MULT_PS(XMM7, XMM1)
1731:         SSE_ADD_PS(XMM6, XMM7)

1733:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1734:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1735:         SSE_MULT_PS(XMM7, XMM2)
1736:         SSE_ADD_PS(XMM6, XMM7)

1738:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1739:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1740:         SSE_MULT_PS(XMM7, XMM3)
1741:         SSE_ADD_PS(XMM6, XMM7)

1743:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1744:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

1746:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1747:         /* Column 3, product is accumulated in XMM0 */
1748:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1749:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1750:         SSE_MULT_PS(XMM0, XMM7)

1752:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1753:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1754:         SSE_MULT_PS(XMM1, XMM7)
1755:         SSE_ADD_PS(XMM0, XMM1)

1757:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1758:         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1759:         SSE_MULT_PS(XMM1, XMM2)
1760:         SSE_ADD_PS(XMM0, XMM1)

1762:         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1763:         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1764:         SSE_MULT_PS(XMM3, XMM7)
1765:         SSE_ADD_PS(XMM0, XMM3)

1767:         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1768:         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

1770:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1771:         /* This is code to be maintained and read by humans after all. */
1772:         /* Copy Multiplier Col 3 into XMM3 */
1773:         SSE_COPY_PS(XMM3, XMM0)
1774:         /* Copy Multiplier Col 2 into XMM2 */
1775:         SSE_COPY_PS(XMM2, XMM6)
1776:         /* Copy Multiplier Col 1 into XMM1 */
1777:         SSE_COPY_PS(XMM1, XMM5)
1778:         /* Copy Multiplier Col 0 into XMM0 */
1779:         SSE_COPY_PS(XMM0, XMM4)
1780:         SSE_INLINE_END_2;

1782:         /* Update the row: */
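              /* nz counts the blocks of row 'row' that lie strictly to the right of its diagonal
                 block; those already-factored blocks are the ones that update row i below. */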
1783:         nz = bi[row + 1] - diag_offset[row] - 1;
1784:         pv += 16;
1785:         for (j = 0; j < nz; j++) {
1786:           PREFETCH_L1(&pv[16]);
1787:           x = rtmp + 16 * ((unsigned int)pj[j]);
1788:           /*            x = rtmp + 4*pj[j]; */

1790:           /* X:=X-M*PV, One column at a time */
1791:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1792:           SSE_INLINE_BEGIN_2(x, pv)
1793:           /* Load First Column of X*/
1794:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1795:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

1797:           /* Matrix-Vector Product: */
1798:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1799:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1800:           SSE_MULT_PS(XMM5, XMM0)
1801:           SSE_SUB_PS(XMM4, XMM5)

1803:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1804:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1805:           SSE_MULT_PS(XMM6, XMM1)
1806:           SSE_SUB_PS(XMM4, XMM6)

1808:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1809:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1810:           SSE_MULT_PS(XMM7, XMM2)
1811:           SSE_SUB_PS(XMM4, XMM7)

1813:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1814:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1815:           SSE_MULT_PS(XMM5, XMM3)
1816:           SSE_SUB_PS(XMM4, XMM5)

1818:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1819:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

1821:           /* Second Column */
1822:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1823:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

1825:           /* Matrix-Vector Product: */
1826:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1827:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1828:           SSE_MULT_PS(XMM6, XMM0)
1829:           SSE_SUB_PS(XMM5, XMM6)

1831:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1832:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1833:           SSE_MULT_PS(XMM7, XMM1)
1834:           SSE_SUB_PS(XMM5, XMM7)

1836:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1837:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1838:           SSE_MULT_PS(XMM4, XMM2)
1839:           SSE_SUB_PS(XMM5, XMM4)

1841:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1842:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1843:           SSE_MULT_PS(XMM6, XMM3)
1844:           SSE_SUB_PS(XMM5, XMM6)

1846:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1847:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

1849:           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

1851:           /* Third Column */
1852:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1853:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

1855:           /* Matrix-Vector Product: */
1856:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1857:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1858:           SSE_MULT_PS(XMM7, XMM0)
1859:           SSE_SUB_PS(XMM6, XMM7)

1861:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1862:           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1863:           SSE_MULT_PS(XMM4, XMM1)
1864:           SSE_SUB_PS(XMM6, XMM4)

1866:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1867:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1868:           SSE_MULT_PS(XMM5, XMM2)
1869:           SSE_SUB_PS(XMM6, XMM5)

1871:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1872:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1873:           SSE_MULT_PS(XMM7, XMM3)
1874:           SSE_SUB_PS(XMM6, XMM7)

1876:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1877:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

1879:           /* Fourth Column */
1880:           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1881:           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

1883:           /* Matrix-Vector Product: */
1884:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1885:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1886:           SSE_MULT_PS(XMM5, XMM0)
1887:           SSE_SUB_PS(XMM4, XMM5)

1889:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1890:           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1891:           SSE_MULT_PS(XMM6, XMM1)
1892:           SSE_SUB_PS(XMM4, XMM6)

1894:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1895:           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1896:           SSE_MULT_PS(XMM7, XMM2)
1897:           SSE_SUB_PS(XMM4, XMM7)

1899:           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1900:           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1901:           SSE_MULT_PS(XMM5, XMM3)
1902:           SSE_SUB_PS(XMM4, XMM5)

1904:           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1905:           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1906:           SSE_INLINE_END_2;
1907:           pv += 16;
1908:         }
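              /* Same accounting as in the routine above: 112 flops to form the 4x4 multiplier plus
                 128 flops per updated trailing block. */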
1909:         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1910:       }
1911:       row = (unsigned int)(*bjtmp++);
1912:       /*        row = (*bjtmp++)/4; */
1913:       /*        bjtmp++; */
1914:     }
1915:     /* finished row so stick it into b->a */
1916:     pv = ba + 16 * bi[i];
1917:     pj = bj + bi[i];
1918:     nz = bi[i + 1] - bi[i];

1920:     /* Copy x block back into pv block */
1921:     for (j = 0; j < nz; j++) {
1922:       x = rtmp + 16 * ((unsigned int)pj[j]);
1923:       /*        x  = rtmp+4*pj[j]; */

1925:       SSE_INLINE_BEGIN_2(x, pv)
1926:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1927:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1928:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

1930:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1931:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

1933:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1934:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

1936:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1937:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

1939:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1940:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

1942:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1943:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

1945:       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1946:       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

1948:       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1949:       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1950:       SSE_INLINE_END_2;
1951:       pv += 16;
1952:     }
1953:     /* invert diagonal block */
1954:     w = ba + 16 * diag_offset[i];
1955:     if (pivotinblocks) {
1956:       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1957:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1958:     } else {
1959:       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1960:     }
1961:     /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1962:     /* Note: the 4x4 inversion uses Cramer's rule, so the flop count below may be unfairly high or low */
1963:   }

1965:   PetscCall(PetscFree(rtmp));

1967:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1968:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1969:   C->assembled           = PETSC_TRUE;

1971:   PetscCall(PetscLogFlops(1.333333333333 * 4.0 * 16.0 * b->mbs)); /* bs = 4, bs2 = 16 */
1972:   /* Flop Count from inverting diagonal blocks */
1973:   SSE_SCOPE_END;
1974:   PetscFunctionReturn(PETSC_SUCCESS);
1975: }

1977: #endif