Actual source code: baijsolvnat4.c

#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/*
  Special case where the matrix was ILU(0) factored in the natural
  ordering. This eliminates the need for the column and row permutation.
*/
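/*
  Overview of the kernels below: the ILU(0) factors are stored in block
  compressed row format with 4x4 blocks laid out column-major (v[0..3] is
  the first column of a block). L has unit 4x4 diagonal blocks, and the
  diagonal blocks of U were inverted during factorization, so both sweeps
  reduce to block matrix-vector updates plus one multiply with the
  precomputed inverse diagonal block per row.
*/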
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ *)A->data;
  PetscInt           n    = a->mbs;
  const PetscInt    *ai   = a->i, *aj = a->j;
  const PetscInt    *diag = a->diag;
  const MatScalar   *aa   = a->a;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    static PetscScalar w[2000]; /* very BAD need to fix */
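    /* Note: a fixed-size static workspace is not reentrant (not thread-safe)
       and caps the number of block rows the Fortran kernel can handle. */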
    fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
#else
  {
    PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
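    /* L has unit 4x4 diagonal blocks, so each row needs only the update
       x_i = b_i - sum_{j<i} L_ij x_j; the first block row is the identity. */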
    idx  = 0;
    x[0] = b[0];
    x[1] = b[1];
    x[2] = b[2];
    x[3] = b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;
      s1 = b[idx];
      s2 = b[1 + idx];
      s3 = b[2 + idx];
      s4 = b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = x[jdx];
        x2  = x[1 + jdx];
        x3  = x[2 + jdx];
        x4  = x[3 + jdx];
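        /* 4x4 block times x_j, accumulated negatively; the block is stored
           column-major, so v[0..3] is its first column */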
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      x[idx]     = s1;
      x[1 + idx] = s2;
      x[2 + idx] = s3;
      x[3 + idx] = s4;
    }

    /* backward solve the upper triangular */
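    /* x_i = D_i^{-1} (x_i - sum_{j>i} U_ij x_j), where the 4x4 diagonal
       blocks D_i were already inverted during the factorization */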
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;
      s1   = x[idt];
      s2   = x[1 + idt];
      s3   = x[2 + idt];
      s4   = x[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = x[idx];
        x2  = x[1 + idx];
        x3  = x[2 + idx];
        x4  = x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      v          = aa + ai16;
      x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
      x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
      x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
      x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
      idt -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
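
/*
  Same solve for the current factor layout: ai/aj/aa hold the block rows of
  L, followed by the rows of U stored in reverse row order. adiag[i] points
  at the inverted 4x4 diagonal block of row i, and the off-diagonal U blocks
  of row i sit at positions adiag[i + 1] + 1 .. adiag[i] - 1.
*/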
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
  PetscInt           i, k, nz, idx, jdx, idt;
  const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx];
  x[1] = b[1 + idx];
  x[2] = b[2 + idx];
  x[3] = b[3 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + bs2 * ai[i];
    vi  = aj + ai[i];
    nz  = ai[i + 1] - ai[i];
    idx = bs * i;
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    for (k = 0; k < nz; k++) {
      jdx = bs * vi[k];
      x1  = x[jdx];
      x2  = x[1 + jdx];
      x3  = x[2 + jdx];
      x4  = x[3 + jdx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

      v += bs2;
    }

    x[idx]     = s1;
    x[1 + idx] = s2;
    x[2 + idx] = s3;
    x[3 + idx] = s4;
  }

  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + bs2 * (adiag[i + 1] + 1);
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    idt = bs * i;
    s1  = x[idt];
    s2  = x[1 + idt];
    s3  = x[2 + idt];
    s4  = x[3 + idt];

    for (k = 0; k < nz; k++) {
      idx = bs * vi[k];
      x1  = x[idx];
      x2  = x[1 + idx];
      x3  = x[2 + idx];
      x4  = x[3 + idx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

      v += bs2;
    }
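    /* after the loop v points at aa + bs2 * adiag[i], the inverted 4x4
       diagonal block of row i */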
    /* x = inv_diagonal*x */
    x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
    x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
    x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
    x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
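
/*
  Mixed-precision variant: when MatScalar is single precision and PetscScalar
  is double, the triangular solves run in single precision inside the buffer
  of xx itself (t aliases x as MatScalar*), and the backward sweep writes the
  final double-precision result in place.
*/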
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *diag = a->diag;
  const MatScalar   *aa = a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

  {
    MatScalar        s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    MatScalar       *t = (MatScalar *)x;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;
      s1 = (MatScalar)b[idx];
      s2 = (MatScalar)b[1 + idx];
      s3 = (MatScalar)b[2 + idx];
      s4 = (MatScalar)b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = t[jdx];
        x2  = t[1 + jdx];
        x3  = t[2 + jdx];
        x4  = t[3 + jdx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      t[idx]     = s1;
      t[1 + idx] = s2;
      t[2 + idx] = s3;
      t[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
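    /* The backward sweep stores doubles into x while still reading floats
       from t, which aliases the same buffer. This is safe because rows are
       processed bottom up: entries x[j] for j > idt have already been
       promoted, and the doubles written at byte offset 8*idt never overwrite
       the floats below byte offset 4*idt that are still needed. */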
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;
      s1   = t[idt];
      s2   = t[1 + idt];
      s3   = t[2 + idt];
      s4   = t[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1 + idx];
        x3  = (MatScalar)x[2 + idx];
        x4  = (MatScalar)x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      v          = aa + ai16;
      x[idt]     = (PetscScalar)(v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4);
      x[1 + idt] = (PetscScalar)(v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4);
      x[2 + idt] = (PetscScalar)(v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4);
      x[3 + idt] = (PetscScalar)(v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4);
      idt -= 4;
    }
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE
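
/*
  The "usj" variant reads the column indices as unsigned short rather than
  int, halving the index bandwidth of the solve. It presumably relies on the
  SSE setup path having converted a->j to 16-bit storage, so it is only
  valid when the number of block rows fits in 16 bits.
*/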
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
  unsigned short *aj = (unsigned short *)a->j;
  int            *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar      *aa = a->a;
  PetscScalar    *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v, *t = (MatScalar *)x;
    int             nz, i, idt, ai16;
    unsigned int    jdx, idx;
    unsigned short *vi;

    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ((unsigned int)ai[1]);

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
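        /* XMM7 holds the running 4-vector s; XMM6 holds x_j. Each
           SSE_SHUFFLE broadcasts one element of x_j to all four lanes
           (0x00 -> element 0, 0x55 -> 1, 0xAA -> 2, 0xFF -> 3), which is
           then multiplied by one column of the block and subtracted. */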
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}
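
/* Identical to the usj kernel above except that the column indices are read
   as plain int. */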
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
  int         *aj = a->j;
  int         *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar   *aa = a->a;
  PetscScalar *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v, *t = (MatScalar *)x;
    int        nz, i, idt, ai16;
    int        jdx, idx;
    int       *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ai[1];

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * (*vi++);
        /* jdx = *vi++; */

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * (*vi++);
        /* idx = *vi++; */

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#endif