Actual source code: baijfact11.c
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <petsc/private/kernels/blockinvert.h>
8: /*
9: Version for when blocks are 4 by 4
10: */
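/*
   Each pass of the outer loop below factors one block row: the permuted row r[i] is
   scattered into the dense work array rtmp (one 4x4 block per block column), updated
   against the previously factored rows using their already inverted diagonal blocks,
   copied back into the factor, and finally its own diagonal block is inverted for use
   by later rows and by the triangular solves.
*/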
11: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
12: {
13: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
14: IS isrow = b->row, isicol = b->icol;
15: const PetscInt *r, *ic;
16: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
17: PetscInt *ajtmpold, *ajtmp, nz, row;
18: PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
19: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
20: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
21: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
22: MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
23: MatScalar m13, m14, m15, m16;
24: MatScalar *ba = b->a, *aa = a->a;
25: PetscBool pivotinblocks = b->pivotinblocks;
26: PetscReal shift = info->shiftamount;
27: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
29: PetscFunctionBegin;
30: PetscCall(ISGetIndices(isrow, &r));
31: PetscCall(ISGetIndices(isicol, &ic));
32: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
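/* rtmp provides one dense 16-entry (4x4) work block for every block column of the current row */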
33: allowzeropivot = PetscNot(A->erroriffailure);
35: for (i = 0; i < n; i++) {
36: nz = bi[i + 1] - bi[i];
37: ajtmp = bj + bi[i];
38: for (j = 0; j < nz; j++) {
39: x = rtmp + 16 * ajtmp[j];
40: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
41: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
42: }
43: /* load in the initial (unfactored) row */
44: idx = r[i];
45: nz = ai[idx + 1] - ai[idx];
46: ajtmpold = aj + ai[idx];
47: v = aa + 16 * ai[idx];
48: for (j = 0; j < nz; j++) {
49: x = rtmp + 16 * ic[ajtmpold[j]];
50: x[0] = v[0];
51: x[1] = v[1];
52: x[2] = v[2];
53: x[3] = v[3];
54: x[4] = v[4];
55: x[5] = v[5];
56: x[6] = v[6];
57: x[7] = v[7];
58: x[8] = v[8];
59: x[9] = v[9];
60: x[10] = v[10];
61: x[11] = v[11];
62: x[12] = v[12];
63: x[13] = v[13];
64: x[14] = v[14];
65: x[15] = v[15];
66: v += 16;
67: }
68: row = *ajtmp++;
69: while (row < i) {
70: pc = rtmp + 16 * row;
71: p1 = pc[0];
72: p2 = pc[1];
73: p3 = pc[2];
74: p4 = pc[3];
75: p5 = pc[4];
76: p6 = pc[5];
77: p7 = pc[6];
78: p8 = pc[7];
79: p9 = pc[8];
80: p10 = pc[9];
81: p11 = pc[10];
82: p12 = pc[11];
83: p13 = pc[12];
84: p14 = pc[13];
85: p15 = pc[14];
86: p16 = pc[15];
87: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
88: pv = ba + 16 * diag_offset[row];
89: pj = bj + diag_offset[row] + 1;
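/* pv points to the already inverted diagonal block of pivot row 'row'; the products below form the multiplier pc = pc*pv (blocks are stored in column-major order) */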
90: x1 = pv[0];
91: x2 = pv[1];
92: x3 = pv[2];
93: x4 = pv[3];
94: x5 = pv[4];
95: x6 = pv[5];
96: x7 = pv[6];
97: x8 = pv[7];
98: x9 = pv[8];
99: x10 = pv[9];
100: x11 = pv[10];
101: x12 = pv[11];
102: x13 = pv[12];
103: x14 = pv[13];
104: x15 = pv[14];
105: x16 = pv[15];
106: pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
107: pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
108: pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
109: pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
111: pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
112: pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
113: pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
114: pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
116: pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
117: pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
118: pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
119: pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
121: pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
122: pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
123: pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
124: pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
126: nz = bi[row + 1] - diag_offset[row] - 1;
127: pv += 16;
128: for (j = 0; j < nz; j++) {
129: x1 = pv[0];
130: x2 = pv[1];
131: x3 = pv[2];
132: x4 = pv[3];
133: x5 = pv[4];
134: x6 = pv[5];
135: x7 = pv[6];
136: x8 = pv[7];
137: x9 = pv[8];
138: x10 = pv[9];
139: x11 = pv[10];
140: x12 = pv[11];
141: x13 = pv[12];
142: x14 = pv[13];
143: x15 = pv[14];
144: x16 = pv[15];
145: x = rtmp + 16 * pj[j];
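/* x = x - m*pv: subtract the multiplier times the U(row,pj[j]) block from the working row */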
146: x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
147: x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
148: x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
149: x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
151: x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
152: x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
153: x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
154: x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
156: x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
157: x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
158: x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
159: x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
161: x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
162: x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
163: x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
164: x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
166: pv += 16;
167: }
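/* 128*nz = 2*bs^3 flops per updated block, plus 112 = 2*bs^3 - bs^2 flops to form the multiplier */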
168: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
169: }
170: row = *ajtmp++;
171: }
172: /* finished row so stick it into b->a */
173: pv = ba + 16 * bi[i];
174: pj = bj + bi[i];
175: nz = bi[i + 1] - bi[i];
176: for (j = 0; j < nz; j++) {
177: x = rtmp + 16 * pj[j];
178: pv[0] = x[0];
179: pv[1] = x[1];
180: pv[2] = x[2];
181: pv[3] = x[3];
182: pv[4] = x[4];
183: pv[5] = x[5];
184: pv[6] = x[6];
185: pv[7] = x[7];
186: pv[8] = x[8];
187: pv[9] = x[9];
188: pv[10] = x[10];
189: pv[11] = x[11];
190: pv[12] = x[12];
191: pv[13] = x[13];
192: pv[14] = x[14];
193: pv[15] = x[15];
194: pv += 16;
195: }
196: /* invert diagonal block */
197: w = ba + 16 * diag_offset[i];
198: if (pivotinblocks) {
199: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
200: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
201: } else {
202: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
203: }
204: }
206: PetscCall(PetscFree(rtmp));
207: PetscCall(ISRestoreIndices(isicol, &ic));
208: PetscCall(ISRestoreIndices(isrow, &r));
210: C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
211: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
212: C->assembled = PETSC_TRUE;
214: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
215: PetscFunctionReturn(PETSC_SUCCESS);
216: }
218: /* MatLUFactorNumeric_SeqBAIJ_4 -
219: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
220: PetscKernel_A_gets_A_times_B()
221: PetscKernel_A_gets_A_minus_B_times_C()
222: PetscKernel_A_gets_inverse_A()
223: */
225: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
226: {
227: Mat C = B;
228: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
229: IS isrow = b->row, isicol = b->icol;
230: const PetscInt *r, *ic;
231: PetscInt i, j, k, nz, nzL, row;
232: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
233: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
234: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
235: PetscInt flg;
236: PetscReal shift;
237: PetscBool allowzeropivot, zeropivotdetected;
239: PetscFunctionBegin;
240: allowzeropivot = PetscNot(A->erroriffailure);
241: PetscCall(ISGetIndices(isrow, &r));
242: PetscCall(ISGetIndices(isicol, &ic));
244: if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
245: shift = 0;
246: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
247: shift = info->shiftamount;
248: }
250: /* generate work space needed by the factorization */
251: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
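/* rtmp holds the current working block row (bs2 = 16 scalars per block); mwork is a bs2-sized scratch block used by PetscKernel_A_gets_A_times_B_4() */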
252: PetscCall(PetscArrayzero(rtmp, bs2 * n));
254: for (i = 0; i < n; i++) {
255: /* zero rtmp */
256: /* L part */
257: nz = bi[i + 1] - bi[i];
258: bjtmp = bj + bi[i];
259: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
261: /* U part */
262: nz = bdiag[i] - bdiag[i + 1];
263: bjtmp = bj + bdiag[i + 1] + 1;
264: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
266: /* load in the initial (unfactored) row */
267: nz = ai[r[i] + 1] - ai[r[i]];
268: ajtmp = aj + ai[r[i]];
269: v = aa + bs2 * ai[r[i]];
270: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
272: /* elimination */
273: bjtmp = bj + bi[i];
274: nzL = bi[i + 1] - bi[i];
275: for (k = 0; k < nzL; k++) {
276: row = bjtmp[k];
277: pc = rtmp + bs2 * row;
278: for (flg = 0, j = 0; j < bs2; j++) {
279: if (pc[j] != 0.0) {
280: flg = 1;
281: break;
282: }
283: }
284: if (flg) {
285: pv = b->a + bs2 * bdiag[row];
286: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
287: PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
289: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
290: pv = b->a + bs2 * (bdiag[row + 1] + 1);
291: nz = bdiag[row] - bdiag[row + 1] - 1; /* number of entries in U(row,:), excluding the diagonal */
292: for (j = 0; j < nz; j++) {
293: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
294: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
295: v = rtmp + bs2 * pj[j];
296: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
297: pv += bs2;
298: }
299: PetscCall(PetscLogFlops(128.0 * nz + 112.0)); /* flops = 2*bs^3*nz + (2*bs^3 - bs2) */
300: }
301: }
303: /* finished row so stick it into b->a */
304: /* L part */
305: pv = b->a + bs2 * bi[i];
306: pj = b->j + bi[i];
307: nz = bi[i + 1] - bi[i];
308: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
310: /* Mark diagonal and invert diagonal for simpler triangular solves */
311: pv = b->a + bs2 * bdiag[i];
312: pj = b->j + bdiag[i];
313: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
314: PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
315: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
317: /* U part */
318: pv = b->a + bs2 * (bdiag[i + 1] + 1);
319: pj = b->j + bdiag[i + 1] + 1;
320: nz = bdiag[i] - bdiag[i + 1] - 1;
321: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
322: }
324: PetscCall(PetscFree2(rtmp, mwork));
325: PetscCall(ISRestoreIndices(isicol, &ic));
326: PetscCall(ISRestoreIndices(isrow, &r));
328: C->ops->solve = MatSolve_SeqBAIJ_4;
329: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
330: C->assembled = PETSC_TRUE;
332: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
333: PetscFunctionReturn(PETSC_SUCCESS);
334: }
336: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
337: {
338: /*
339: Default version for when blocks are 4 by 4, using the natural ordering
340: */
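/* Same algorithm as MatLUFactorNumeric_SeqBAIJ_4_inplace(), specialized to the natural ordering: rows are addressed directly by i, so the isrow/isicol index sets are not needed. */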
341: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
342: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
343: PetscInt *ajtmpold, *ajtmp, nz, row;
344: PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
345: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
346: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
347: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
348: MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
349: MatScalar m13, m14, m15, m16;
350: MatScalar *ba = b->a, *aa = a->a;
351: PetscBool pivotinblocks = b->pivotinblocks;
352: PetscReal shift = info->shiftamount;
353: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
355: PetscFunctionBegin;
356: allowzeropivot = PetscNot(A->erroriffailure);
357: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
359: for (i = 0; i < n; i++) {
360: nz = bi[i + 1] - bi[i];
361: ajtmp = bj + bi[i];
362: for (j = 0; j < nz; j++) {
363: x = rtmp + 16 * ajtmp[j];
364: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
365: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
366: }
367: /* load in the initial (unfactored) row */
368: nz = ai[i + 1] - ai[i];
369: ajtmpold = aj + ai[i];
370: v = aa + 16 * ai[i];
371: for (j = 0; j < nz; j++) {
372: x = rtmp + 16 * ajtmpold[j];
373: x[0] = v[0];
374: x[1] = v[1];
375: x[2] = v[2];
376: x[3] = v[3];
377: x[4] = v[4];
378: x[5] = v[5];
379: x[6] = v[6];
380: x[7] = v[7];
381: x[8] = v[8];
382: x[9] = v[9];
383: x[10] = v[10];
384: x[11] = v[11];
385: x[12] = v[12];
386: x[13] = v[13];
387: x[14] = v[14];
388: x[15] = v[15];
389: v += 16;
390: }
391: row = *ajtmp++;
392: while (row < i) {
393: pc = rtmp + 16 * row;
394: p1 = pc[0];
395: p2 = pc[1];
396: p3 = pc[2];
397: p4 = pc[3];
398: p5 = pc[4];
399: p6 = pc[5];
400: p7 = pc[6];
401: p8 = pc[7];
402: p9 = pc[8];
403: p10 = pc[9];
404: p11 = pc[10];
405: p12 = pc[11];
406: p13 = pc[12];
407: p14 = pc[13];
408: p15 = pc[14];
409: p16 = pc[15];
410: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
411: pv = ba + 16 * diag_offset[row];
412: pj = bj + diag_offset[row] + 1;
413: x1 = pv[0];
414: x2 = pv[1];
415: x3 = pv[2];
416: x4 = pv[3];
417: x5 = pv[4];
418: x6 = pv[5];
419: x7 = pv[6];
420: x8 = pv[7];
421: x9 = pv[8];
422: x10 = pv[9];
423: x11 = pv[10];
424: x12 = pv[11];
425: x13 = pv[12];
426: x14 = pv[13];
427: x15 = pv[14];
428: x16 = pv[15];
429: pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
430: pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
431: pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
432: pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
434: pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
435: pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
436: pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
437: pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
439: pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
440: pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
441: pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
442: pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
444: pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
445: pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
446: pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
447: pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
448: nz = bi[row + 1] - diag_offset[row] - 1;
449: pv += 16;
450: for (j = 0; j < nz; j++) {
451: x1 = pv[0];
452: x2 = pv[1];
453: x3 = pv[2];
454: x4 = pv[3];
455: x5 = pv[4];
456: x6 = pv[5];
457: x7 = pv[6];
458: x8 = pv[7];
459: x9 = pv[8];
460: x10 = pv[9];
461: x11 = pv[10];
462: x12 = pv[11];
463: x13 = pv[12];
464: x14 = pv[13];
465: x15 = pv[14];
466: x16 = pv[15];
467: x = rtmp + 16 * pj[j];
468: x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
469: x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
470: x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
471: x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
473: x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
474: x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
475: x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
476: x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
478: x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
479: x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
480: x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
481: x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
483: x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
484: x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
485: x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
486: x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
488: pv += 16;
489: }
490: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
491: }
492: row = *ajtmp++;
493: }
494: /* finished row so stick it into b->a */
495: pv = ba + 16 * bi[i];
496: pj = bj + bi[i];
497: nz = bi[i + 1] - bi[i];
498: for (j = 0; j < nz; j++) {
499: x = rtmp + 16 * pj[j];
500: pv[0] = x[0];
501: pv[1] = x[1];
502: pv[2] = x[2];
503: pv[3] = x[3];
504: pv[4] = x[4];
505: pv[5] = x[5];
506: pv[6] = x[6];
507: pv[7] = x[7];
508: pv[8] = x[8];
509: pv[9] = x[9];
510: pv[10] = x[10];
511: pv[11] = x[11];
512: pv[12] = x[12];
513: pv[13] = x[13];
514: pv[14] = x[14];
515: pv[15] = x[15];
516: pv += 16;
517: }
518: /* invert diagonal block */
519: w = ba + 16 * diag_offset[i];
520: if (pivotinblocks) {
521: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
522: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
523: } else {
524: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
525: }
526: }
528: PetscCall(PetscFree(rtmp));
530: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
531: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
532: C->assembled = PETSC_TRUE;
534: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
535: PetscFunctionReturn(PETSC_SUCCESS);
536: }
538: /*
539: MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
540: copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
541: */
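/* Same algorithm as MatLUFactorNumeric_SeqBAIJ_4(), specialized to the natural ordering: rows are loaded directly by i instead of through the r[] and ic[] permutations. */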
542: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
543: {
544: Mat C = B;
545: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
546: PetscInt i, j, k, nz, nzL, row;
547: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
548: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
549: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
550: PetscInt flg;
551: PetscReal shift;
552: PetscBool allowzeropivot, zeropivotdetected;
554: PetscFunctionBegin;
555: allowzeropivot = PetscNot(A->erroriffailure);
557: /* generate work space needed by the factorization */
558: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
559: PetscCall(PetscArrayzero(rtmp, bs2 * n));
561: if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
562: shift = 0;
563: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
564: shift = info->shiftamount;
565: }
567: for (i = 0; i < n; i++) {
568: /* zero rtmp */
569: /* L part */
570: nz = bi[i + 1] - bi[i];
571: bjtmp = bj + bi[i];
572: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
574: /* U part */
575: nz = bdiag[i] - bdiag[i + 1];
576: bjtmp = bj + bdiag[i + 1] + 1;
577: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
579: /* load in the initial (unfactored) row */
580: nz = ai[i + 1] - ai[i];
581: ajtmp = aj + ai[i];
582: v = aa + bs2 * ai[i];
583: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
585: /* elimination */
586: bjtmp = bj + bi[i];
587: nzL = bi[i + 1] - bi[i];
588: for (k = 0; k < nzL; k++) {
589: row = bjtmp[k];
590: pc = rtmp + bs2 * row;
591: for (flg = 0, j = 0; j < bs2; j++) {
592: if (pc[j] != 0.0) {
593: flg = 1;
594: break;
595: }
596: }
597: if (flg) {
598: pv = b->a + bs2 * bdiag[row];
599: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
600: PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
602: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
603: pv = b->a + bs2 * (bdiag[row + 1] + 1);
604: nz = bdiag[row] - bdiag[row + 1] - 1; /* number of entries in U(row,:), excluding the diagonal */
605: for (j = 0; j < nz; j++) {
606: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
607: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
608: v = rtmp + bs2 * pj[j];
609: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
610: pv += bs2;
611: }
612: PetscCall(PetscLogFlops(128.0 * nz + 112.0)); /* flops = 2*bs^3*nz + (2*bs^3 - bs2) */
613: }
614: }
616: /* finished row so stick it into b->a */
617: /* L part */
618: pv = b->a + bs2 * bi[i];
619: pj = b->j + bi[i];
620: nz = bi[i + 1] - bi[i];
621: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
623: /* Mark diagonal and invert diagonal for simpler triangular solves */
624: pv = b->a + bs2 * bdiag[i];
625: pj = b->j + bdiag[i];
626: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
627: PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
628: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
630: /* U part */
631: pv = b->a + bs2 * (bdiag[i + 1] + 1);
632: pj = b->j + bdiag[i + 1] + 1;
633: nz = bdiag[i] - bdiag[i + 1] - 1;
634: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
635: }
636: PetscCall(PetscFree2(rtmp, mwork));
638: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
639: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
640: C->assembled = PETSC_TRUE;
642: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
643: PetscFunctionReturn(PETSC_SUCCESS);
644: }
646: #if defined(PETSC_HAVE_SSE)
648: #include PETSC_HAVE_SSE
650: /* SSE version for when blocks are 4 by 4, using the natural ordering */
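/* These kernels use packed single-precision SSE operations, so MatScalar must be a 4-byte float here and aa, ba, and rtmp must be 16-byte aligned (verified below). */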
651: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C, Mat A, const MatFactorInfo *info)
652: {
653: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
654: int i, j, n = a->mbs;
655: int *bj = b->j, *bjtmp, *pj;
656: int row;
657: int *ajtmpold, nz, *bi = b->i;
658: int *diag_offset = b->diag, *ai = a->i, *aj = a->j;
659: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
660: MatScalar *ba = b->a, *aa = a->a;
661: int nonzero = 0;
662: PetscBool pivotinblocks = b->pivotinblocks;
663: PetscReal shift = info->shiftamount;
664: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
666: PetscFunctionBegin;
667: allowzeropivot = PetscNot(A->erroriffailure);
668: SSE_SCOPE_BEGIN;
670: PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
671: PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
672: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
673: PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work.");
674: /* if ((unsigned long)bj==(unsigned long)aj) { */
675: /* colscale = 4; */
676: /* } */
677: for (i = 0; i < n; i++) {
678: nz = bi[i + 1] - bi[i];
679: bjtmp = bj + bi[i];
680: /* zero out the 4x4 block accumulators */
681: /* zero out one register */
682: XOR_PS(XMM7, XMM7);
683: for (j = 0; j < nz; j++) {
684: x = rtmp + 16 * bjtmp[j];
685: /* x = rtmp+4*bjtmp[j]; */
686: SSE_INLINE_BEGIN_1(x)
687: /* Copy zero register to memory locations */
688: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
689: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
690: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
691: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
692: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
693: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
694: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
695: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
696: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
697: SSE_INLINE_END_1;
698: }
699: /* load in the initial (unfactored) row */
700: nz = ai[i + 1] - ai[i];
701: ajtmpold = aj + ai[i];
702: v = aa + 16 * ai[i];
703: for (j = 0; j < nz; j++) {
704: x = rtmp + 16 * ajtmpold[j];
705: /* x = rtmp+colscale*ajtmpold[j]; */
706: /* Copy v block into x block */
707: SSE_INLINE_BEGIN_2(v, x)
708: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
709: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
710: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
712: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
713: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
715: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
716: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
718: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
719: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
721: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
722: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
724: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
725: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
727: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
728: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
730: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
731: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
732: SSE_INLINE_END_2;
734: v += 16;
735: }
736: /* row = (*bjtmp++)/4; */
737: row = *bjtmp++;
738: while (row < i) {
739: pc = rtmp + 16 * row;
740: SSE_INLINE_BEGIN_1(pc)
741: /* Load block from lower triangle */
742: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
743: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
744: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
746: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
747: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
749: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
750: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
752: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
753: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
755: /* Compare block to zero block */
757: SSE_COPY_PS(XMM4, XMM7)
758: SSE_CMPNEQ_PS(XMM4, XMM0)
760: SSE_COPY_PS(XMM5, XMM7)
761: SSE_CMPNEQ_PS(XMM5, XMM1)
763: SSE_COPY_PS(XMM6, XMM7)
764: SSE_CMPNEQ_PS(XMM6, XMM2)
766: SSE_CMPNEQ_PS(XMM7, XMM3)
768: /* Reduce the comparisons to one SSE register */
769: SSE_OR_PS(XMM6, XMM7)
770: SSE_OR_PS(XMM5, XMM4)
771: SSE_OR_PS(XMM5, XMM6)
772: SSE_INLINE_END_1;
774: /* Reduce the one SSE register to an integer register for branching */
775: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
776: MOVEMASK(nonzero, XMM5);
778: /* If block is nonzero ... */
779: if (nonzero) {
780: pv = ba + 16 * diag_offset[row];
781: PREFETCH_L1(&pv[16]);
782: pj = bj + diag_offset[row] + 1;
784: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
785: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
786: /* but the diagonal was inverted already */
787: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
789: SSE_INLINE_BEGIN_2(pv, pc)
790: /* Column 0, product is accumulated in XMM4 */
791: SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
792: SSE_SHUFFLE(XMM4, XMM4, 0x00)
793: SSE_MULT_PS(XMM4, XMM0)
795: SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
796: SSE_SHUFFLE(XMM5, XMM5, 0x00)
797: SSE_MULT_PS(XMM5, XMM1)
798: SSE_ADD_PS(XMM4, XMM5)
800: SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
801: SSE_SHUFFLE(XMM6, XMM6, 0x00)
802: SSE_MULT_PS(XMM6, XMM2)
803: SSE_ADD_PS(XMM4, XMM6)
805: SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
806: SSE_SHUFFLE(XMM7, XMM7, 0x00)
807: SSE_MULT_PS(XMM7, XMM3)
808: SSE_ADD_PS(XMM4, XMM7)
810: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
811: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
813: /* Column 1, product is accumulated in XMM5 */
814: SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
815: SSE_SHUFFLE(XMM5, XMM5, 0x00)
816: SSE_MULT_PS(XMM5, XMM0)
818: SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
819: SSE_SHUFFLE(XMM6, XMM6, 0x00)
820: SSE_MULT_PS(XMM6, XMM1)
821: SSE_ADD_PS(XMM5, XMM6)
823: SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
824: SSE_SHUFFLE(XMM7, XMM7, 0x00)
825: SSE_MULT_PS(XMM7, XMM2)
826: SSE_ADD_PS(XMM5, XMM7)
828: SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
829: SSE_SHUFFLE(XMM6, XMM6, 0x00)
830: SSE_MULT_PS(XMM6, XMM3)
831: SSE_ADD_PS(XMM5, XMM6)
833: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
834: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
836: SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
838: /* Column 2, product is accumulated in XMM6 */
839: SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
840: SSE_SHUFFLE(XMM6, XMM6, 0x00)
841: SSE_MULT_PS(XMM6, XMM0)
843: SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
844: SSE_SHUFFLE(XMM7, XMM7, 0x00)
845: SSE_MULT_PS(XMM7, XMM1)
846: SSE_ADD_PS(XMM6, XMM7)
848: SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
849: SSE_SHUFFLE(XMM7, XMM7, 0x00)
850: SSE_MULT_PS(XMM7, XMM2)
851: SSE_ADD_PS(XMM6, XMM7)
853: SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
854: SSE_SHUFFLE(XMM7, XMM7, 0x00)
855: SSE_MULT_PS(XMM7, XMM3)
856: SSE_ADD_PS(XMM6, XMM7)
858: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
859: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
861: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
862: /* Column 3, product is accumulated in XMM0 */
863: SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
864: SSE_SHUFFLE(XMM7, XMM7, 0x00)
865: SSE_MULT_PS(XMM0, XMM7)
867: SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
868: SSE_SHUFFLE(XMM7, XMM7, 0x00)
869: SSE_MULT_PS(XMM1, XMM7)
870: SSE_ADD_PS(XMM0, XMM1)
872: SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
873: SSE_SHUFFLE(XMM1, XMM1, 0x00)
874: SSE_MULT_PS(XMM1, XMM2)
875: SSE_ADD_PS(XMM0, XMM1)
877: SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
878: SSE_SHUFFLE(XMM7, XMM7, 0x00)
879: SSE_MULT_PS(XMM3, XMM7)
880: SSE_ADD_PS(XMM0, XMM3)
882: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
883: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
885: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
886: /* This is code to be maintained and read by humans after all. */
887: /* Copy Multiplier Col 3 into XMM3 */
888: SSE_COPY_PS(XMM3, XMM0)
889: /* Copy Multiplier Col 2 into XMM2 */
890: SSE_COPY_PS(XMM2, XMM6)
891: /* Copy Multiplier Col 1 into XMM1 */
892: SSE_COPY_PS(XMM1, XMM5)
893: /* Copy Multiplier Col 0 into XMM0 */
894: SSE_COPY_PS(XMM0, XMM4)
895: SSE_INLINE_END_2;
897: /* Update the row: */
898: nz = bi[row + 1] - diag_offset[row] - 1;
899: pv += 16;
900: for (j = 0; j < nz; j++) {
901: PREFETCH_L1(&pv[16]);
902: x = rtmp + 16 * pj[j];
903: /* x = rtmp + 4*pj[j]; */
905: /* X:=X-M*PV, One column at a time */
906: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
907: SSE_INLINE_BEGIN_2(x, pv)
908: /* Load first column of X */
909: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
910: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
912: /* Matrix-Vector Product: */
913: SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
914: SSE_SHUFFLE(XMM5, XMM5, 0x00)
915: SSE_MULT_PS(XMM5, XMM0)
916: SSE_SUB_PS(XMM4, XMM5)
918: SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
919: SSE_SHUFFLE(XMM6, XMM6, 0x00)
920: SSE_MULT_PS(XMM6, XMM1)
921: SSE_SUB_PS(XMM4, XMM6)
923: SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
924: SSE_SHUFFLE(XMM7, XMM7, 0x00)
925: SSE_MULT_PS(XMM7, XMM2)
926: SSE_SUB_PS(XMM4, XMM7)
928: SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
929: SSE_SHUFFLE(XMM5, XMM5, 0x00)
930: SSE_MULT_PS(XMM5, XMM3)
931: SSE_SUB_PS(XMM4, XMM5)
933: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
934: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
936: /* Second Column */
937: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
938: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
940: /* Matrix-Vector Product: */
941: SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
942: SSE_SHUFFLE(XMM6, XMM6, 0x00)
943: SSE_MULT_PS(XMM6, XMM0)
944: SSE_SUB_PS(XMM5, XMM6)
946: SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
947: SSE_SHUFFLE(XMM7, XMM7, 0x00)
948: SSE_MULT_PS(XMM7, XMM1)
949: SSE_SUB_PS(XMM5, XMM7)
951: SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
952: SSE_SHUFFLE(XMM4, XMM4, 0x00)
953: SSE_MULT_PS(XMM4, XMM2)
954: SSE_SUB_PS(XMM5, XMM4)
956: SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
957: SSE_SHUFFLE(XMM6, XMM6, 0x00)
958: SSE_MULT_PS(XMM6, XMM3)
959: SSE_SUB_PS(XMM5, XMM6)
961: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
962: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
964: SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
966: /* Third Column */
967: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
968: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
970: /* Matrix-Vector Product: */
971: SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
972: SSE_SHUFFLE(XMM7, XMM7, 0x00)
973: SSE_MULT_PS(XMM7, XMM0)
974: SSE_SUB_PS(XMM6, XMM7)
976: SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
977: SSE_SHUFFLE(XMM4, XMM4, 0x00)
978: SSE_MULT_PS(XMM4, XMM1)
979: SSE_SUB_PS(XMM6, XMM4)
981: SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
982: SSE_SHUFFLE(XMM5, XMM5, 0x00)
983: SSE_MULT_PS(XMM5, XMM2)
984: SSE_SUB_PS(XMM6, XMM5)
986: SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
987: SSE_SHUFFLE(XMM7, XMM7, 0x00)
988: SSE_MULT_PS(XMM7, XMM3)
989: SSE_SUB_PS(XMM6, XMM7)
991: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
992: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
994: /* Fourth Column */
995: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
996: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
998: /* Matrix-Vector Product: */
999: SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1000: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1001: SSE_MULT_PS(XMM5, XMM0)
1002: SSE_SUB_PS(XMM4, XMM5)
1004: SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1005: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1006: SSE_MULT_PS(XMM6, XMM1)
1007: SSE_SUB_PS(XMM4, XMM6)
1009: SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1010: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1011: SSE_MULT_PS(XMM7, XMM2)
1012: SSE_SUB_PS(XMM4, XMM7)
1014: SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1015: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1016: SSE_MULT_PS(XMM5, XMM3)
1017: SSE_SUB_PS(XMM4, XMM5)
1019: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1020: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1021: SSE_INLINE_END_2;
1022: pv += 16;
1023: }
1024: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1025: }
1026: row = *bjtmp++;
1027: /* row = (*bjtmp++)/4; */
1028: }
1029: /* finished row so stick it into b->a */
1030: pv = ba + 16 * bi[i];
1031: pj = bj + bi[i];
1032: nz = bi[i + 1] - bi[i];
1034: /* Copy x block back into pv block */
1035: for (j = 0; j < nz; j++) {
1036: x = rtmp + 16 * pj[j];
1037: /* x = rtmp+4*pj[j]; */
1039: SSE_INLINE_BEGIN_2(x, pv)
1040: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1041: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1042: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1044: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1045: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1047: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1048: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1050: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1051: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1053: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1054: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1056: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1057: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1059: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1060: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1062: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1063: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1064: SSE_INLINE_END_2;
1065: pv += 16;
1066: }
1067: /* invert diagonal block */
1068: w = ba + 16 * diag_offset[i];
1069: if (pivotinblocks) {
1070: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1071: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1072: } else {
1073: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1074: }
1075: /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1076: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1077: }
1079: PetscCall(PetscFree(rtmp));
1081: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1082: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1083: C->assembled = PETSC_TRUE;
1085: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
1086: /* Flop Count from inverting diagonal blocks */
1087: SSE_SCOPE_END;
1088: PetscFunctionReturn(PETSC_SUCCESS);
1089: }
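/* The _usj ("unsigned short j") variants below read the factor's block column indices as 16-bit unsigned shorts, presumably to reduce index traffic; this assumes the indices were stored in that form and that the number of block rows fits in 16 bits. */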
1091: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C, const MatFactorInfo *info)
1092: {
1093: Mat A = C;
1094: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1095: int i, j, n = a->mbs;
1096: unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1097: unsigned short *aj = (unsigned short *)(a->j), *ajtmp;
1098: unsigned int row;
1099: int nz, *bi = b->i;
1100: int *diag_offset = b->diag, *ai = a->i;
1101: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
1102: MatScalar *ba = b->a, *aa = a->a;
1103: int nonzero = 0;
1104: PetscBool pivotinblocks = b->pivotinblocks;
1105: PetscReal shift = info->shiftamount;
1106: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
1108: PetscFunctionBegin;
1109: allowzeropivot = PetscNot(A->erroriffailure);
1110: SSE_SCOPE_BEGIN;
1112: PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
1113: PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
1114: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1115: PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work.");
1116: /* if ((unsigned long)bj==(unsigned long)aj) { */
1117: /* colscale = 4; */
1118: /* } */
1120: for (i = 0; i < n; i++) {
1121: nz = bi[i + 1] - bi[i];
1122: bjtmp = bj + bi[i];
1123: /* zero out the 4x4 block accumulators */
1124: /* zero out one register */
1125: XOR_PS(XMM7, XMM7);
1126: for (j = 0; j < nz; j++) {
1127: x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1128: /* x = rtmp+4*bjtmp[j]; */
1129: SSE_INLINE_BEGIN_1(x)
1130: /* Copy zero register to memory locations */
1131: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1132: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1133: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1134: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1135: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1136: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1137: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1138: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1139: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1140: SSE_INLINE_END_1;
1141: }
1142: /* load in the initial (unfactored) row */
1143: nz = ai[i + 1] - ai[i];
1144: ajtmp = aj + ai[i];
1145: v = aa + 16 * ai[i];
1146: for (j = 0; j < nz; j++) {
1147: x = rtmp + 16 * ((unsigned int)ajtmp[j]);
1148: /* x = rtmp+colscale*ajtmp[j]; */
1149: /* Copy v block into x block */
1150: SSE_INLINE_BEGIN_2(v, x)
1151: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1152: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1153: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1155: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1156: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1158: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1159: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1161: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1162: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1164: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1165: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1167: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1168: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1170: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1171: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1173: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1174: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1175: SSE_INLINE_END_2;
1177: v += 16;
1178: }
1179: /* row = (*bjtmp++)/4; */
1180: row = (unsigned int)(*bjtmp++);
1181: while (row < i) {
1182: pc = rtmp + 16 * row;
1183: SSE_INLINE_BEGIN_1(pc)
1184: /* Load block from lower triangle */
1185: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1186: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1187: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1189: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1190: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1192: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1193: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1195: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1196: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1198: /* Compare block to zero block */
1200: SSE_COPY_PS(XMM4, XMM7)
1201: SSE_CMPNEQ_PS(XMM4, XMM0)
1203: SSE_COPY_PS(XMM5, XMM7)
1204: SSE_CMPNEQ_PS(XMM5, XMM1)
1206: SSE_COPY_PS(XMM6, XMM7)
1207: SSE_CMPNEQ_PS(XMM6, XMM2)
1209: SSE_CMPNEQ_PS(XMM7, XMM3)
1211: /* Reduce the comparisons to one SSE register */
1212: SSE_OR_PS(XMM6, XMM7)
1213: SSE_OR_PS(XMM5, XMM4)
1214: SSE_OR_PS(XMM5, XMM6)
1215: SSE_INLINE_END_1;
1217: /* Reduce the one SSE register to an integer register for branching */
1218: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1219: MOVEMASK(nonzero, XMM5);
1221: /* If block is nonzero ... */
1222: if (nonzero) {
1223: pv = ba + 16 * diag_offset[row];
1224: PREFETCH_L1(&pv[16]);
1225: pj = bj + diag_offset[row] + 1;
1227: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1228: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1229: /* but the diagonal was inverted already */
1230: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1232: SSE_INLINE_BEGIN_2(pv, pc)
1233: /* Column 0, product is accumulated in XMM4 */
1234: SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1235: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1236: SSE_MULT_PS(XMM4, XMM0)
1238: SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1239: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1240: SSE_MULT_PS(XMM5, XMM1)
1241: SSE_ADD_PS(XMM4, XMM5)
1243: SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1244: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1245: SSE_MULT_PS(XMM6, XMM2)
1246: SSE_ADD_PS(XMM4, XMM6)
1248: SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1249: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1250: SSE_MULT_PS(XMM7, XMM3)
1251: SSE_ADD_PS(XMM4, XMM7)
1253: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1254: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1256: /* Column 1, product is accumulated in XMM5 */
1257: SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1258: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1259: SSE_MULT_PS(XMM5, XMM0)
1261: SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1262: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1263: SSE_MULT_PS(XMM6, XMM1)
1264: SSE_ADD_PS(XMM5, XMM6)
1266: SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1267: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1268: SSE_MULT_PS(XMM7, XMM2)
1269: SSE_ADD_PS(XMM5, XMM7)
1271: SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1272: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1273: SSE_MULT_PS(XMM6, XMM3)
1274: SSE_ADD_PS(XMM5, XMM6)
1276: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1277: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1279: SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1281: /* Column 2, product is accumulated in XMM6 */
1282: SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1283: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1284: SSE_MULT_PS(XMM6, XMM0)
1286: SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1287: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1288: SSE_MULT_PS(XMM7, XMM1)
1289: SSE_ADD_PS(XMM6, XMM7)
1291: SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1292: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1293: SSE_MULT_PS(XMM7, XMM2)
1294: SSE_ADD_PS(XMM6, XMM7)
1296: SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1297: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1298: SSE_MULT_PS(XMM7, XMM3)
1299: SSE_ADD_PS(XMM6, XMM7)
1301: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1302: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1304: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1305: /* Column 3, product is accumulated in XMM0 */
1306: SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1307: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1308: SSE_MULT_PS(XMM0, XMM7)
1310: SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1311: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1312: SSE_MULT_PS(XMM1, XMM7)
1313: SSE_ADD_PS(XMM0, XMM1)
1315: SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1316: SSE_SHUFFLE(XMM1, XMM1, 0x00)
1317: SSE_MULT_PS(XMM1, XMM2)
1318: SSE_ADD_PS(XMM0, XMM1)
1320: SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1321: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1322: SSE_MULT_PS(XMM3, XMM7)
1323: SSE_ADD_PS(XMM0, XMM3)
1325: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1326: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1328: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1329: /* This is code to be maintained and read by humans after all. */
1330: /* Copy Multiplier Col 3 into XMM3 */
1331: SSE_COPY_PS(XMM3, XMM0)
1332: /* Copy Multiplier Col 2 into XMM2 */
1333: SSE_COPY_PS(XMM2, XMM6)
1334: /* Copy Multiplier Col 1 into XMM1 */
1335: SSE_COPY_PS(XMM1, XMM5)
1336: /* Copy Multiplier Col 0 into XMM0 */
1337: SSE_COPY_PS(XMM0, XMM4)
1338: SSE_INLINE_END_2;
1340: /* Update the row: */
1341: nz = bi[row + 1] - diag_offset[row] - 1;
1342: pv += 16;
1343: for (j = 0; j < nz; j++) {
1344: PREFETCH_L1(&pv[16]);
1345: x = rtmp + 16 * ((unsigned int)pj[j]);
1346: /* x = rtmp + 4*pj[j]; */
1348: /* X:=X-M*PV, One column at a time */
1349: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1350: SSE_INLINE_BEGIN_2(x, pv)
1351: /* Load first column of X */
1352: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1353: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1355: /* Matrix-Vector Product: */
1356: SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1357: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1358: SSE_MULT_PS(XMM5, XMM0)
1359: SSE_SUB_PS(XMM4, XMM5)
1361: SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1362: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1363: SSE_MULT_PS(XMM6, XMM1)
1364: SSE_SUB_PS(XMM4, XMM6)
1366: SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1367: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1368: SSE_MULT_PS(XMM7, XMM2)
1369: SSE_SUB_PS(XMM4, XMM7)
1371: SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1372: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1373: SSE_MULT_PS(XMM5, XMM3)
1374: SSE_SUB_PS(XMM4, XMM5)
1376: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1377: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1379: /* Second Column */
1380: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1381: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1383: /* Matrix-Vector Product: */
1384: SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1385: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1386: SSE_MULT_PS(XMM6, XMM0)
1387: SSE_SUB_PS(XMM5, XMM6)
1389: SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1390: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1391: SSE_MULT_PS(XMM7, XMM1)
1392: SSE_SUB_PS(XMM5, XMM7)
1394: SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1395: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1396: SSE_MULT_PS(XMM4, XMM2)
1397: SSE_SUB_PS(XMM5, XMM4)
1399: SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1400: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1401: SSE_MULT_PS(XMM6, XMM3)
1402: SSE_SUB_PS(XMM5, XMM6)
1404: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1405: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1407: SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1409: /* Third Column */
1410: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1411: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1413: /* Matrix-Vector Product: */
1414: SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1415: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1416: SSE_MULT_PS(XMM7, XMM0)
1417: SSE_SUB_PS(XMM6, XMM7)
1419: SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1420: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1421: SSE_MULT_PS(XMM4, XMM1)
1422: SSE_SUB_PS(XMM6, XMM4)
1424: SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1425: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1426: SSE_MULT_PS(XMM5, XMM2)
1427: SSE_SUB_PS(XMM6, XMM5)
1429: SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1430: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1431: SSE_MULT_PS(XMM7, XMM3)
1432: SSE_SUB_PS(XMM6, XMM7)
1434: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1435: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1437: /* Fourth Column */
1438: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1439: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1441: /* Matrix-Vector Product: */
1442: SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1443: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1444: SSE_MULT_PS(XMM5, XMM0)
1445: SSE_SUB_PS(XMM4, XMM5)
1447: SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1448: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1449: SSE_MULT_PS(XMM6, XMM1)
1450: SSE_SUB_PS(XMM4, XMM6)
1452: SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1453: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1454: SSE_MULT_PS(XMM7, XMM2)
1455: SSE_SUB_PS(XMM4, XMM7)
1457: SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1458: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1459: SSE_MULT_PS(XMM5, XMM3)
1460: SSE_SUB_PS(XMM4, XMM5)
1462: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1463: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1464: SSE_INLINE_END_2;
1465: pv += 16;
1466: }
1467: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1468: }
1469: row = (unsigned int)(*bjtmp++);
1470: /* row = (*bjtmp++)/4; */
1471: /* bjtmp++; */
1472: }
1473: /* finished row so stick it into b->a */
1474: pv = ba + 16 * bi[i];
1475: pj = bj + bi[i];
1476: nz = bi[i + 1] - bi[i];
1478: /* Copy x block back into pv block */
1479: for (j = 0; j < nz; j++) {
1480: x = rtmp + 16 * ((unsigned int)pj[j]);
1481: /* x = rtmp+4*pj[j]; */
1483: SSE_INLINE_BEGIN_2(x, pv)
1484: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1485: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1486: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1488: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1489: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1491: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1492: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1494: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1495: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1497: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1498: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1500: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1501: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1503: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1504: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1506: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1507: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1508: SSE_INLINE_END_2;
1509: pv += 16;
1510: }
1511: /* invert diagonal block */
1512: w = ba + 16 * diag_offset[i];
1513: if (pivotinblocks) {
1514: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1515: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1516: } else {
1517: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1518: }
1519: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1520: }
1522: PetscCall(PetscFree(rtmp));
1524: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1525: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1526: C->assembled = PETSC_TRUE;
1528: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
1529: /* Flop Count from inverting diagonal blocks */
1530: SSE_SCOPE_END;
1531: PetscFunctionReturn(PETSC_SUCCESS);
1532: }
1534: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info)
1535: {
1536: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1537: int i, j, n = a->mbs;
1538: unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1539: unsigned int row;
1540: int *ajtmpold, nz, *bi = b->i;
1541: int *diag_offset = b->diag, *ai = a->i, *aj = a->j;
1542: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
1543: MatScalar *ba = b->a, *aa = a->a;
1544: int nonzero = 0;
1545: PetscBool pivotinblocks = b->pivotinblocks;
1546: PetscReal shift = info->shiftamount;
1547: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
1549: PetscFunctionBegin;
1550: allowzeropivot = PetscNot(A->erroriffailure);
1551: SSE_SCOPE_BEGIN;
1553: PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
1554: PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
1555: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1556: PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work.");
1557: /* if ((unsigned long)bj==(unsigned long)aj) { */
1558: /* colscale = 4; */
1559: /* } */
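/* If the factor shares its column index array with A, the factorization is being performed in place; hand off to the in-place variant. */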
1560: if ((unsigned long)bj == (unsigned long)aj) { PetscCall(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C, info)); PetscFunctionReturn(PETSC_SUCCESS); }
1562: for (i = 0; i < n; i++) {
1563: nz = bi[i + 1] - bi[i];
1564: bjtmp = bj + bi[i];
1565: /* zero out the 4x4 block accumulators */
1566: /* zero out one register */
1567: XOR_PS(XMM7, XMM7);
1568: for (j = 0; j < nz; j++) {
1569: x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1570: /* x = rtmp+4*bjtmp[j]; */
1571: SSE_INLINE_BEGIN_1(x)
1572: /* Copy zero register to memory locations */
1573: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1574: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1575: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1576: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1577: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1578: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1579: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1580: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1581: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1582: SSE_INLINE_END_1;
1583: }
1584: /* load in the initial (unfactored) row */
1585: nz = ai[i + 1] - ai[i];
1586: ajtmpold = aj + ai[i];
1587: v = aa + 16 * ai[i];
1588: for (j = 0; j < nz; j++) {
1589: x = rtmp + 16 * ajtmpold[j];
1590: /* x = rtmp+colscale*ajtmpold[j]; */
1591: /* Copy v block into x block */
1592: SSE_INLINE_BEGIN_2(v, x)
1593: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1594: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1595: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1597: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1598: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1600: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1601: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1603: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1604: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1606: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1607: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1609: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1610: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1612: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1613: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1615: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1616: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1617: SSE_INLINE_END_2;
1619: v += 16;
1620: }
1621: /* row = (*bjtmp++)/4; */
1622: row = (unsigned int)(*bjtmp++);
1623: while (row < i) {
1624: pc = rtmp + 16 * row;
1625: SSE_INLINE_BEGIN_1(pc)
1626: /* Load block from lower triangle */
1627: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1628: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1629: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1631: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1632: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1634: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1635: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1637: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1638: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1640: /* Compare block to zero block */
1642: SSE_COPY_PS(XMM4, XMM7)
1643: SSE_CMPNEQ_PS(XMM4, XMM0)
1645: SSE_COPY_PS(XMM5, XMM7)
1646: SSE_CMPNEQ_PS(XMM5, XMM1)
1648: SSE_COPY_PS(XMM6, XMM7)
1649: SSE_CMPNEQ_PS(XMM6, XMM2)
1651: SSE_CMPNEQ_PS(XMM7, XMM3)
1653: /* Reduce the comparisons to one SSE register */
1654: SSE_OR_PS(XMM6, XMM7)
1655: SSE_OR_PS(XMM5, XMM4)
1656: SSE_OR_PS(XMM5, XMM6)
1657: SSE_INLINE_END_1;
1659: /* Reduce the one SSE register to an integer register for branching */
1660: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1661: MOVEMASK(nonzero, XMM5);
1663: /* If block is nonzero ... */
1664: if (nonzero) {
1665: pv = ba + 16 * diag_offset[row];
1666: PREFETCH_L1(&pv[16]);
1667: pj = bj + diag_offset[row] + 1;
1669: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1670: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1671: /* but the diagonal was inverted already */
1672: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1674: SSE_INLINE_BEGIN_2(pv, pc)
1675: /* Column 0, product is accumulated in XMM4 */
1676: SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1677: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1678: SSE_MULT_PS(XMM4, XMM0)
1680: SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1681: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1682: SSE_MULT_PS(XMM5, XMM1)
1683: SSE_ADD_PS(XMM4, XMM5)
1685: SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1686: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1687: SSE_MULT_PS(XMM6, XMM2)
1688: SSE_ADD_PS(XMM4, XMM6)
1690: SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1691: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1692: SSE_MULT_PS(XMM7, XMM3)
1693: SSE_ADD_PS(XMM4, XMM7)
1695: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1696: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1698: /* Column 1, product is accumulated in XMM5 */
1699: SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1700: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1701: SSE_MULT_PS(XMM5, XMM0)
1703: SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1704: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1705: SSE_MULT_PS(XMM6, XMM1)
1706: SSE_ADD_PS(XMM5, XMM6)
1708: SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1709: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1710: SSE_MULT_PS(XMM7, XMM2)
1711: SSE_ADD_PS(XMM5, XMM7)
1713: SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1714: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1715: SSE_MULT_PS(XMM6, XMM3)
1716: SSE_ADD_PS(XMM5, XMM6)
1718: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1719: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1721: SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1723: /* Column 2, product is accumulated in XMM6 */
1724: SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1725: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1726: SSE_MULT_PS(XMM6, XMM0)
1728: SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1729: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1730: SSE_MULT_PS(XMM7, XMM1)
1731: SSE_ADD_PS(XMM6, XMM7)
1733: SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1734: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1735: SSE_MULT_PS(XMM7, XMM2)
1736: SSE_ADD_PS(XMM6, XMM7)
1738: SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1739: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1740: SSE_MULT_PS(XMM7, XMM3)
1741: SSE_ADD_PS(XMM6, XMM7)
1743: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1744: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1746: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1747: /* Column 3, product is accumulated in XMM0 */
1748: SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1749: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1750: SSE_MULT_PS(XMM0, XMM7)
1752: SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1753: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1754: SSE_MULT_PS(XMM1, XMM7)
1755: SSE_ADD_PS(XMM0, XMM1)
1757: SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1758: SSE_SHUFFLE(XMM1, XMM1, 0x00)
1759: SSE_MULT_PS(XMM1, XMM2)
1760: SSE_ADD_PS(XMM0, XMM1)
1762: SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1763: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1764: SSE_MULT_PS(XMM3, XMM7)
1765: SSE_ADD_PS(XMM0, XMM3)
1767: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1768: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1770: /* Bookkeeping copies -- not strictly necessary (the update loop could use the registers as they stand), */
1771: /* but they put the multiplier columns back in XMM0-XMM3, which keeps the code below readable. */
1772: /* Copy Multiplier Col 3 into XMM3 */
1773: SSE_COPY_PS(XMM3, XMM0)
1774: /* Copy Multiplier Col 2 into XMM2 */
1775: SSE_COPY_PS(XMM2, XMM6)
1776: /* Copy Multiplier Col 1 into XMM1 */
1777: SSE_COPY_PS(XMM1, XMM5)
1778: /* Copy Multiplier Col 0 into XMM0 */
1779: SSE_COPY_PS(XMM0, XMM4)
1780: SSE_INLINE_END_2;
1782: /* Update the row: */
1783: nz = bi[row + 1] - diag_offset[row] - 1;
1784: pv += 16;
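/* nz counts the 4x4 blocks strictly to the right of the diagonal in factored row */
/* `row`; pv now points at the first of those blocks.                             */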
1785: for (j = 0; j < nz; j++) {
1786: PREFETCH_L1(&pv[16]);
1787: x = rtmp + 16 * ((unsigned int)pj[j]);
1788: /* x = rtmp + 4*pj[j]; */
1790: /* X:=X-M*PV, One column at a time */
1791: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
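/* In scalar terms this inner block computes x := x - m*pv for one 4x4 block:   */
/*   for (col = 0; col < 4; col++)                                              */
/*     for (r = 0; r < 4; r++)                                                  */
/*       for (k = 0; k < 4; k++) x[4*col + r] -= m[4*k + r] * pv[4*col + k];    */
/* where m is the multiplier held columnwise in XMM0-XMM3.                      */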
1792: SSE_INLINE_BEGIN_2(x, pv)
1793: /* Load First Column of X*/
1794: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1795: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1797: /* Matrix-Vector Product: */
1798: SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1799: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1800: SSE_MULT_PS(XMM5, XMM0)
1801: SSE_SUB_PS(XMM4, XMM5)
1803: SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1804: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1805: SSE_MULT_PS(XMM6, XMM1)
1806: SSE_SUB_PS(XMM4, XMM6)
1808: SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1809: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1810: SSE_MULT_PS(XMM7, XMM2)
1811: SSE_SUB_PS(XMM4, XMM7)
1813: SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1814: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1815: SSE_MULT_PS(XMM5, XMM3)
1816: SSE_SUB_PS(XMM4, XMM5)
1818: SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1819: SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1821: /* Second Column */
1822: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1823: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1825: /* Matrix-Vector Product: */
1826: SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1827: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1828: SSE_MULT_PS(XMM6, XMM0)
1829: SSE_SUB_PS(XMM5, XMM6)
1831: SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1832: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1833: SSE_MULT_PS(XMM7, XMM1)
1834: SSE_SUB_PS(XMM5, XMM7)
1836: SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1837: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1838: SSE_MULT_PS(XMM4, XMM2)
1839: SSE_SUB_PS(XMM5, XMM4)
1841: SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1842: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1843: SSE_MULT_PS(XMM6, XMM3)
1844: SSE_SUB_PS(XMM5, XMM6)
1846: SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1847: SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1849: SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1851: /* Third Column */
1852: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1853: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1855: /* Matrix-Vector Product: */
1856: SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1857: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1858: SSE_MULT_PS(XMM7, XMM0)
1859: SSE_SUB_PS(XMM6, XMM7)
1861: SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1862: SSE_SHUFFLE(XMM4, XMM4, 0x00)
1863: SSE_MULT_PS(XMM4, XMM1)
1864: SSE_SUB_PS(XMM6, XMM4)
1866: SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1867: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1868: SSE_MULT_PS(XMM5, XMM2)
1869: SSE_SUB_PS(XMM6, XMM5)
1871: SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1872: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1873: SSE_MULT_PS(XMM7, XMM3)
1874: SSE_SUB_PS(XMM6, XMM7)
1876: SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1877: SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1879: /* Fourth Column */
1880: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1881: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1883: /* Matrix-Vector Product: */
1884: SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1885: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1886: SSE_MULT_PS(XMM5, XMM0)
1887: SSE_SUB_PS(XMM4, XMM5)
1889: SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1890: SSE_SHUFFLE(XMM6, XMM6, 0x00)
1891: SSE_MULT_PS(XMM6, XMM1)
1892: SSE_SUB_PS(XMM4, XMM6)
1894: SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1895: SSE_SHUFFLE(XMM7, XMM7, 0x00)
1896: SSE_MULT_PS(XMM7, XMM2)
1897: SSE_SUB_PS(XMM4, XMM7)
1899: SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1900: SSE_SHUFFLE(XMM5, XMM5, 0x00)
1901: SSE_MULT_PS(XMM5, XMM3)
1902: SSE_SUB_PS(XMM4, XMM5)
1904: SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1905: SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1906: SSE_INLINE_END_2;
1907: pv += 16;
1908: }
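/* Flop count: forming the multiplier costs 16*(4 mults + 3 adds) = 112 flops, and */
/* each of the nz updated blocks costs 16*(4 mults + 4 adds) = 128 flops.          */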
1909: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1910: }
1911: row = (unsigned int)(*bjtmp++);
1912: /* row = (*bjtmp++)/4; */
1913: /* bjtmp++; */
1914: }
1915: /* finished row so stick it into b->a */
1916: pv = ba + 16 * bi[i];
1917: pj = bj + bi[i];
1918: nz = bi[i + 1] - bi[i];
1920: /* Copy x block back into pv block */
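/* Each iteration below moves one 16-entry block; the paired loads/stores are */
/* equivalent to copying 16 MatScalars from x to pv.                          */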
1921: for (j = 0; j < nz; j++) {
1922: x = rtmp + 16 * ((unsigned int)pj[j]);
1923: /* x = rtmp+4*pj[j]; */
1925: SSE_INLINE_BEGIN_2(x, pv)
1926: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1927: SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1928: SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1930: SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1931: SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1933: SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1934: SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1936: SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1937: SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1939: SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1940: SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1942: SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1943: SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1945: SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1946: SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1948: SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1949: SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1950: SSE_INLINE_END_2;
1951: pv += 16;
1952: }
1953: /* invert diagonal block */
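/* pivotinblocks selects the pivoted 4x4 inversion kernel, which can report a zero */
/* pivot (recorded in C->factorerrortype); otherwise the cheaper unpivoted kernel  */
/* is used.                                                                         */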
1954: w = ba + 16 * diag_offset[i];
1955: if (pivotinblocks) {
1956: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1957: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1958: } else {
1959: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1960: }
1961: /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1962: /* Note: since Cramer's rule is used for the inversion, the flop count logged below may be unfairly high or low. */
1963: }
1965: PetscCall(PetscFree(rtmp));
1967: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1968: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1969: C->assembled = PETSC_TRUE;
1971: PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs));
1972: /* Flop Count from inverting diagonal blocks */
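/* With bs = 4 and bs2 = bs*bs = 16, this charges (4/3)*bs^3 (about 85) flops per */
/* diagonal block inverted, one block per block row.                              */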
1973: SSE_SCOPE_END;
1974: PetscFunctionReturn(PETSC_SUCCESS);
1975: }
1977: #endif