Actual source code: sell.c
2: /*
3: Defines the basic matrix operations for the SELL matrix storage format.
4: */
5: #include <../src/mat/impls/sell/seq/sell.h>
6: #include <petscblaslapack.h>
7: #include <petsc/private/kernels/blocktranspose.h>
9: static PetscBool cited = PETSC_FALSE;
10: static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11: " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12: " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13: " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14: " year = 2018\n"
15: "}\n";
17: #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
19: #include <immintrin.h>
21: #if !defined(_MM_SCALE_8)
22: #define _MM_SCALE_8 8
23: #endif
25: #if defined(__AVX512F__)
26: /* these do not work
27: vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28: vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29: */
30: #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31: /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32: vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \
33: vec_vals = _mm512_loadu_pd(aval); \
34: vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35: vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36: #elif defined(__AVX2__) && defined(__FMA__)
37: #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38: vec_vals = _mm256_loadu_pd(aval); \
39: vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40: vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41: vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42: #endif
43: #endif /* PETSC_HAVE_IMMINTRIN_H */
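/*
   For reference, a scalar sketch (illustrative only, not compiled here) of what the
   AVX512_Mult_Private/AVX2_Mult_Private macros above compute for one slice column: a gather of x
   at the stored column indices followed by a fused multiply-add into the per-row accumulators.
   AVX-512 handles 8 doubles per step, AVX2 handles 4.

     for (r = 0; r < 8; r++) yacc[r] += aval[r] * x[acolidx[r]];

   where yacc plays the role of the vec_y accumulator register.
*/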
45: /*@C
46: MatSeqSELLSetPreallocation - For good matrix assembly performance
47: the user should preallocate the matrix storage by setting the parameter `rlenmax`
48: (or the array `rlen`).
50: Collective
52: Input Parameters:
53: + B - The `MATSEQSELL` matrix
54: . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
55: - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
57: Level: intermediate
59: Notes:
60: Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
61: Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
62: allocation.
64: You can call `MatGetInfo()` to get information on how effective the preallocation was,
65: for example the fields mallocs, nz_allocated, nz_used, and nz_unneeded.
66: You can also run with the option `-info` and look for messages containing the string
67: malloc to see if additional memory allocation was needed.
69: Developer Note:
70: Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
71: entries or column indices.
73: The maximum number of nonzeros in any row should be as accurate as possible.
74: If it is underestimated, you will get bad performance due to reallocation
75: (`MatSeqXSELLReallocateSELL()`).
77: .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
78: @*/
79: PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
80: {
81: PetscFunctionBegin;
84: PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
85: PetscFunctionReturn(PETSC_SUCCESS);
86: }
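/*
   Illustrative usage sketch (not part of this file's compiled code; the matrix size n is assumed
   to be defined by the caller): create a sequential SELL matrix and preallocate 5 nonzeros per row.

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, n, n, n, n));
     PetscCall(MatSetType(A, MATSEQSELL));
     PetscCall(MatSeqSELLSetPreallocation(A, 5, NULL));
*/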
88: PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
89: {
90: Mat_SeqSELL *b;
91: PetscInt i, j, totalslices;
92: PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
94: PetscFunctionBegin;
95: if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
96: if (maxallocrow == MAT_SKIP_ALLOCATION) {
97: skipallocation = PETSC_TRUE;
98: maxallocrow = 0;
99: }
101: PetscCall(PetscLayoutSetUp(B->rmap));
102: PetscCall(PetscLayoutSetUp(B->cmap));
104: /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
105: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
106: PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
107: if (rlen) {
108: for (i = 0; i < B->rmap->n; i++) {
109: PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
110: PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
111: }
112: }
114: B->preallocated = PETSC_TRUE;
116: b = (Mat_SeqSELL *)B->data;
118: totalslices = PetscCeilInt(B->rmap->n, 8);
119: b->totalslices = totalslices;
120: if (!skipallocation) {
121:     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not a multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));
123: if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
124: PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
125: }
126:     if (!rlen) { /* if rlen is not provided, allocate the same space for all slices */
127: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
128: else if (maxallocrow < 0) maxallocrow = 1;
129: for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
130: } else {
131: maxallocrow = 0;
132: b->sliidx[0] = 0;
133: for (i = 1; i < totalslices; i++) {
134: b->sliidx[i] = 0;
135: for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
136: maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
137: PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
138: }
139: /* last slice */
140: b->sliidx[totalslices] = 0;
141: for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
142: maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow);
143: b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
144: }
146: /* allocate space for val, colidx, rlen */
147: /* FIXME: should B's old memory be unlogged? */
148: PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
149: /* FIXME: assuming an element of the bit array takes 8 bits */
150: PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
151:     /* b->rlen will count the nonzeros in each row so far. We don't copy rlen to b->rlen because the matrix entries have not been set yet. */
152: PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
154: b->singlemalloc = PETSC_TRUE;
155: b->free_val = PETSC_TRUE;
156: b->free_colidx = PETSC_TRUE;
157: } else {
158: b->free_val = PETSC_FALSE;
159: b->free_colidx = PETSC_FALSE;
160: }
162: b->nz = 0;
163: b->maxallocrow = maxallocrow;
164: b->rlenmax = maxallocrow;
165: b->maxallocmat = b->sliidx[totalslices];
166: B->info.nz_unneeded = (double)b->maxallocmat;
167: if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
168: PetscFunctionReturn(PETSC_SUCCESS);
169: }
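/*
   Addressing sketch for the SELL layout set up above (slice height fixed at 8 throughout this
   file): the j-th stored entry of row `row` lives at

     idx = sliidx[row / 8] + (row % 8) + 8 * j,   0 <= j < rlen[row]

   with colidx[idx] giving its column index and val[idx] its value. The routines below use this
   formula in the form shift = sliidx[row >> 3] + (row & 0x07).
*/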
171: PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
172: {
173: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
174: PetscInt shift;
176: PetscFunctionBegin;
177: PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
178: if (nz) *nz = a->rlen[row];
179: shift = a->sliidx[row >> 3] + (row & 0x07);
180: if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
181: if (idx) {
182: PetscInt j;
183: for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
184: *idx = a->getrowcols;
185: }
186: if (v) {
187: PetscInt j;
188: for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
189: *v = a->getrowvals;
190: }
191: PetscFunctionReturn(PETSC_SUCCESS);
192: }
194: PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
195: {
196: PetscFunctionBegin;
197: PetscFunctionReturn(PETSC_SUCCESS);
198: }
200: PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
201: {
202: Mat B;
203: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
204: PetscInt i;
206: PetscFunctionBegin;
207: if (reuse == MAT_REUSE_MATRIX) {
208: B = *newmat;
209: PetscCall(MatZeroEntries(B));
210: } else {
211: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
212: PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
213: PetscCall(MatSetType(B, MATSEQAIJ));
214: PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
215: }
217: for (i = 0; i < A->rmap->n; i++) {
218: PetscInt nz = 0, *cols = NULL;
219: PetscScalar *vals = NULL;
221: PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
222: PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
223: PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
224: }
226: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
227: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
228: B->rmap->bs = A->rmap->bs;
230: if (reuse == MAT_INPLACE_MATRIX) {
231: PetscCall(MatHeaderReplace(A, &B));
232: } else {
233: *newmat = B;
234: }
235: PetscFunctionReturn(PETSC_SUCCESS);
236: }
238: #include <../src/mat/impls/aij/seq/aij.h>
240: PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
241: {
242: Mat B;
243: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
244: PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
245: const PetscInt *cols;
246: const PetscScalar *vals;
248: PetscFunctionBegin;
250: if (reuse == MAT_REUSE_MATRIX) {
251: B = *newmat;
252: } else {
253: if (PetscDefined(USE_DEBUG) || !a->ilen) {
254: PetscCall(PetscMalloc1(m, &rowlengths));
255: for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
256: }
257: if (PetscDefined(USE_DEBUG) && a->ilen) {
258: PetscBool eq;
259: PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
260: PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
261: PetscCall(PetscFree(rowlengths));
262: rowlengths = a->ilen;
263: } else if (a->ilen) rowlengths = a->ilen;
264: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
265: PetscCall(MatSetSizes(B, m, n, m, n));
266: PetscCall(MatSetType(B, MATSEQSELL));
267: PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
268: if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
269: }
271: for (row = 0; row < m; row++) {
272: PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
273: PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
274: PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
275: }
276: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
277: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
278: B->rmap->bs = A->rmap->bs;
280: if (reuse == MAT_INPLACE_MATRIX) {
281: PetscCall(MatHeaderReplace(A, &B));
282: } else {
283: *newmat = B;
284: }
285: PetscFunctionReturn(PETSC_SUCCESS);
286: }
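/*
   These converters are normally reached through MatConvert(); an illustrative sketch, assuming an
   existing MATSEQAIJ matrix Aaij:

     Mat Asell;
     PetscCall(MatConvert(Aaij, MATSEQSELL, MAT_INITIAL_MATRIX, &Asell));
     ...
     PetscCall(MatConvert(Asell, MATSEQAIJ, MAT_INPLACE_MATRIX, &Asell));
*/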
288: PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
289: {
290: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
291: PetscScalar *y;
292: const PetscScalar *x;
293: const MatScalar *aval = a->val;
294: PetscInt totalslices = a->totalslices;
295: const PetscInt *acolidx = a->colidx;
296: PetscInt i, j;
297: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
298: __m512d vec_x, vec_y, vec_vals;
299: __m256i vec_idx;
300: __mmask8 mask;
301: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
302: __m256i vec_idx2, vec_idx3, vec_idx4;
303: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
304: __m128i vec_idx;
305: __m256d vec_x, vec_y, vec_y2, vec_vals;
306: MatScalar yval;
307: PetscInt r, rows_left, row, nnz_in_row;
308: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
309: __m128d vec_x_tmp;
310: __m256d vec_x, vec_y, vec_y2, vec_vals;
311: MatScalar yval;
312: PetscInt r, rows_left, row, nnz_in_row;
313: #else
314: PetscScalar sum[8];
315: #endif
317: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
318: #pragma disjoint(*x, *y, *aval)
319: #endif
321: PetscFunctionBegin;
322: PetscCall(VecGetArrayRead(xx, &x));
323: PetscCall(VecGetArray(yy, &y));
324: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
325: for (i = 0; i < totalslices; i++) { /* loop over slices */
326: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
327: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
329: vec_y = _mm512_setzero_pd();
330: vec_y2 = _mm512_setzero_pd();
331: vec_y3 = _mm512_setzero_pd();
332: vec_y4 = _mm512_setzero_pd();
334: j = a->sliidx[i] >> 3; /* 8 entries are processed at a time, corresponding to one slice column */
335: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
336: case 3:
337: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
338: acolidx += 8;
339: aval += 8;
340: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
341: acolidx += 8;
342: aval += 8;
343: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
344: acolidx += 8;
345: aval += 8;
346: j += 3;
347: break;
348: case 2:
349: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
350: acolidx += 8;
351: aval += 8;
352: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
353: acolidx += 8;
354: aval += 8;
355: j += 2;
356: break;
357: case 1:
358: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
359: acolidx += 8;
360: aval += 8;
361: j += 1;
362: break;
363: }
364: #pragma novector
365: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
366: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
367: acolidx += 8;
368: aval += 8;
369: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
370: acolidx += 8;
371: aval += 8;
372: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
373: acolidx += 8;
374: aval += 8;
375: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
376: acolidx += 8;
377: aval += 8;
378: }
380: vec_y = _mm512_add_pd(vec_y, vec_y2);
381: vec_y = _mm512_add_pd(vec_y, vec_y3);
382: vec_y = _mm512_add_pd(vec_y, vec_y4);
383: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
384: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
385: _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
386: } else {
387: _mm512_storeu_pd(&y[8 * i], vec_y);
388: }
389: }
390: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
391: for (i = 0; i < totalslices; i++) { /* loop over full slices */
392: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
393: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
395: /* last slice may have padding rows. Don't use vectorization. */
396: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
397: rows_left = A->rmap->n - 8 * i;
398: for (r = 0; r < rows_left; ++r) {
399: yval = (MatScalar)0;
400: row = 8 * i + r;
401: nnz_in_row = a->rlen[row];
402: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
403: y[row] = yval;
404: }
405: break;
406: }
408: vec_y = _mm256_setzero_pd();
409: vec_y2 = _mm256_setzero_pd();
411: /* Process a slice of height 8 (512 bits) as two subslices of height 4 (256 bits) using AVX */
412: #pragma novector
413: #pragma unroll(2)
414: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
415: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
416: aval += 4;
417: acolidx += 4;
418: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
419: aval += 4;
420: acolidx += 4;
421: }
423: _mm256_storeu_pd(y + i * 8, vec_y);
424: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
425: }
426: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
427: for (i = 0; i < totalslices; i++) { /* loop over full slices */
428: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
429: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
431: vec_y = _mm256_setzero_pd();
432: vec_y2 = _mm256_setzero_pd();
434: /* last slice may have padding rows. Don't use vectorization. */
435: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
436: rows_left = A->rmap->n - 8 * i;
437: for (r = 0; r < rows_left; ++r) {
438: yval = (MatScalar)0;
439: row = 8 * i + r;
440: nnz_in_row = a->rlen[row];
441: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
442: y[row] = yval;
443: }
444: break;
445: }
447: /* Process a slice of height 8 (512 bits) as two subslices of height 4 (256 bits) using AVX */
448: #pragma novector
449: #pragma unroll(2)
450: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
451: vec_vals = _mm256_loadu_pd(aval);
452: vec_x_tmp = _mm_setzero_pd();
453: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
454: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
455: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
456: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
457: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
458: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
459: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
460: aval += 4;
462: vec_vals = _mm256_loadu_pd(aval);
463: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
464: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
465: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
466: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
467: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
468: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
469: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
470: aval += 4;
471: }
473: _mm256_storeu_pd(y + i * 8, vec_y);
474: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
475: }
476: #else
477: for (i = 0; i < totalslices; i++) { /* loop over slices */
478: for (j = 0; j < 8; j++) sum[j] = 0.0;
479: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
480: sum[0] += aval[j] * x[acolidx[j]];
481: sum[1] += aval[j + 1] * x[acolidx[j + 1]];
482: sum[2] += aval[j + 2] * x[acolidx[j + 2]];
483: sum[3] += aval[j + 3] * x[acolidx[j + 3]];
484: sum[4] += aval[j + 4] * x[acolidx[j + 4]];
485: sum[5] += aval[j + 5] * x[acolidx[j + 5]];
486: sum[6] += aval[j + 6] * x[acolidx[j + 6]];
487: sum[7] += aval[j + 7] * x[acolidx[j + 7]];
488: }
489: if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
490: for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
491: } else {
492: for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
493: }
494: }
495: #endif
497: PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
498: PetscCall(VecRestoreArrayRead(xx, &x));
499: PetscCall(VecRestoreArray(yy, &y));
500: PetscFunctionReturn(PETSC_SUCCESS);
501: }
503: #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
504: PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
505: {
506: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
507: PetscScalar *y, *z;
508: const PetscScalar *x;
509: const MatScalar *aval = a->val;
510: PetscInt totalslices = a->totalslices;
511: const PetscInt *acolidx = a->colidx;
512: PetscInt i, j;
513: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
514: __m512d vec_x, vec_y, vec_vals;
515: __m256i vec_idx;
516: __mmask8 mask;
517: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
518: __m256i vec_idx2, vec_idx3, vec_idx4;
519: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
520: __m128d vec_x_tmp;
521: __m256d vec_x, vec_y, vec_y2, vec_vals;
522: MatScalar yval;
523: PetscInt r, row, nnz_in_row;
524: #else
525: PetscScalar sum[8];
526: #endif
528: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
529: #pragma disjoint(*x, *y, *aval)
530: #endif
532: PetscFunctionBegin;
533: PetscCall(VecGetArrayRead(xx, &x));
534: PetscCall(VecGetArrayPair(yy, zz, &y, &z));
535: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
536: for (i = 0; i < totalslices; i++) { /* loop over slices */
537: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
538: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
540: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
541: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
542: vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
543: } else {
544: vec_y = _mm512_loadu_pd(&y[8 * i]);
545: }
546: vec_y2 = _mm512_setzero_pd();
547: vec_y3 = _mm512_setzero_pd();
548: vec_y4 = _mm512_setzero_pd();
550: j = a->sliidx[i] >> 3; /* 8 entries are processed at a time, corresponding to one slice column */
551: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
552: case 3:
553: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
554: acolidx += 8;
555: aval += 8;
556: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
557: acolidx += 8;
558: aval += 8;
559: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
560: acolidx += 8;
561: aval += 8;
562: j += 3;
563: break;
564: case 2:
565: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
566: acolidx += 8;
567: aval += 8;
568: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
569: acolidx += 8;
570: aval += 8;
571: j += 2;
572: break;
573: case 1:
574: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
575: acolidx += 8;
576: aval += 8;
577: j += 1;
578: break;
579: }
580: #pragma novector
581: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
582: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
583: acolidx += 8;
584: aval += 8;
585: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
586: acolidx += 8;
587: aval += 8;
588: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
589: acolidx += 8;
590: aval += 8;
591: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
592: acolidx += 8;
593: aval += 8;
594: }
596: vec_y = _mm512_add_pd(vec_y, vec_y2);
597: vec_y = _mm512_add_pd(vec_y, vec_y3);
598: vec_y = _mm512_add_pd(vec_y, vec_y4);
599: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
600: _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
601: } else {
602: _mm512_storeu_pd(&z[8 * i], vec_y);
603: }
604: }
605: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
606: for (i = 0; i < totalslices; i++) { /* loop over full slices */
607: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
608: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
610: /* last slice may have padding rows. Don't use vectorization. */
611: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
612: for (r = 0; r < (A->rmap->n & 0x07); ++r) {
613: row = 8 * i + r;
614: yval = (MatScalar)0.0;
615: nnz_in_row = a->rlen[row];
616: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
617: z[row] = y[row] + yval;
618: }
619: break;
620: }
622: vec_y = _mm256_loadu_pd(y + 8 * i);
623: vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
625: /* Process a slice of height 8 (512 bits) as two subslices of height 4 (256 bits) using AVX */
626: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
627: vec_vals = _mm256_loadu_pd(aval);
628: vec_x_tmp = _mm_setzero_pd();
629: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
630: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
631: vec_x = _mm256_setzero_pd();
632: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
633: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
634: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
635: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
636: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
637: aval += 4;
639: vec_vals = _mm256_loadu_pd(aval);
640: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
641: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
642: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
643: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
644: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
645: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
646: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
647: aval += 4;
648: }
650: _mm256_storeu_pd(z + i * 8, vec_y);
651: _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
652: }
653: #else
654: for (i = 0; i < totalslices; i++) { /* loop over slices */
655: for (j = 0; j < 8; j++) sum[j] = 0.0;
656: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
657: sum[0] += aval[j] * x[acolidx[j]];
658: sum[1] += aval[j + 1] * x[acolidx[j + 1]];
659: sum[2] += aval[j + 2] * x[acolidx[j + 2]];
660: sum[3] += aval[j + 3] * x[acolidx[j + 3]];
661: sum[4] += aval[j + 4] * x[acolidx[j + 4]];
662: sum[5] += aval[j + 5] * x[acolidx[j + 5]];
663: sum[6] += aval[j + 6] * x[acolidx[j + 6]];
664: sum[7] += aval[j + 7] * x[acolidx[j + 7]];
665: }
666: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
667: for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
668: } else {
669: for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
670: }
671: }
672: #endif
674: PetscCall(PetscLogFlops(2.0 * a->nz));
675: PetscCall(VecRestoreArrayRead(xx, &x));
676: PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
677: PetscFunctionReturn(PETSC_SUCCESS);
678: }
680: PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
681: {
682: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
683: PetscScalar *y;
684: const PetscScalar *x;
685: const MatScalar *aval = a->val;
686: const PetscInt *acolidx = a->colidx;
687: PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices;
689: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
690: #pragma disjoint(*x, *y, *aval)
691: #endif
693: PetscFunctionBegin;
694: if (A->symmetric == PETSC_BOOL3_TRUE) {
695: PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
696: PetscFunctionReturn(PETSC_SUCCESS);
697: }
698: if (zz != yy) PetscCall(VecCopy(zz, yy));
699: PetscCall(VecGetArrayRead(xx, &x));
700: PetscCall(VecGetArray(yy, &y));
701: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
702: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
703: for (r = 0; r < (A->rmap->n & 0x07); ++r) {
704: row = 8 * i + r;
705: nnz_in_row = a->rlen[row];
706: for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
707: }
708: break;
709: }
710: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
711: y[acolidx[j]] += aval[j] * x[8 * i];
712: y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
713: y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
714: y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
715: y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
716: y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
717: y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
718: y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
719: }
720: }
721: PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices]));
722: PetscCall(VecRestoreArrayRead(xx, &x));
723: PetscCall(VecRestoreArray(yy, &y));
724: PetscFunctionReturn(PETSC_SUCCESS);
725: }
727: PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
728: {
729: PetscFunctionBegin;
730: if (A->symmetric == PETSC_BOOL3_TRUE) {
731: PetscCall(MatMult_SeqSELL(A, xx, yy));
732: } else {
733: PetscCall(VecSet(yy, 0.0));
734: PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
735: }
736: PetscFunctionReturn(PETSC_SUCCESS);
737: }
739: /*
740:      Checks for missing diagonal entries
741: */
742: PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
743: {
744: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
745: PetscInt *diag, i;
747: PetscFunctionBegin;
748: *missing = PETSC_FALSE;
749: if (A->rmap->n > 0 && !(a->colidx)) {
750: *missing = PETSC_TRUE;
751: if (d) *d = 0;
752: PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
753: } else {
754: diag = a->diag;
755: for (i = 0; i < A->rmap->n; i++) {
756: if (diag[i] == -1) {
757: *missing = PETSC_TRUE;
758: if (d) *d = i;
759: PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
760: break;
761: }
762: }
763: }
764: PetscFunctionReturn(PETSC_SUCCESS);
765: }
767: PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
768: {
769: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
770: PetscInt i, j, m = A->rmap->n, shift;
772: PetscFunctionBegin;
773: if (!a->diag) {
774: PetscCall(PetscMalloc1(m, &a->diag));
775: a->free_diag = PETSC_TRUE;
776: }
777: for (i = 0; i < m; i++) { /* loop over rows */
778: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of row i */
779: a->diag[i] = -1;
780: for (j = 0; j < a->rlen[i]; j++) {
781: if (a->colidx[shift + j * 8] == i) {
782: a->diag[i] = shift + j * 8;
783: break;
784: }
785: }
786: }
787: PetscFunctionReturn(PETSC_SUCCESS);
788: }
790: /*
791:      A negative fshift indicates that no error should be generated if there is a zero diagonal; just invert it anyway
792: */
793: PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
794: {
795: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
796: PetscInt i, *diag, m = A->rmap->n;
797: MatScalar *val = a->val;
798: PetscScalar *idiag, *mdiag;
800: PetscFunctionBegin;
801: if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
802: PetscCall(MatMarkDiagonal_SeqSELL(A));
803: diag = a->diag;
804: if (!a->idiag) {
805: PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
806: val = a->val;
807: }
808: mdiag = a->mdiag;
809: idiag = a->idiag;
811: if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
812: for (i = 0; i < m; i++) {
813: mdiag[i] = val[diag[i]];
814: if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
815: PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
816: PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
817: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
818: A->factorerror_zeropivot_value = 0.0;
819: A->factorerror_zeropivot_row = i;
820: }
821: idiag[i] = 1.0 / val[diag[i]];
822: }
823: PetscCall(PetscLogFlops(m));
824: } else {
825: for (i = 0; i < m; i++) {
826: mdiag[i] = val[diag[i]];
827: idiag[i] = omega / (fshift + val[diag[i]]);
828: }
829: PetscCall(PetscLogFlops(2.0 * m));
830: }
831: a->idiagvalid = PETSC_TRUE;
832: PetscFunctionReturn(PETSC_SUCCESS);
833: }
835: PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
836: {
837: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
839: PetscFunctionBegin;
840: PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
841: PetscCall(MatSeqSELLInvalidateDiagonal(A));
842: PetscFunctionReturn(PETSC_SUCCESS);
843: }
845: PetscErrorCode MatDestroy_SeqSELL(Mat A)
846: {
847: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
849: PetscFunctionBegin;
850: #if defined(PETSC_USE_LOG)
851: PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
852: #endif
853: PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
854: PetscCall(ISDestroy(&a->row));
855: PetscCall(ISDestroy(&a->col));
856: PetscCall(PetscFree(a->diag));
857: PetscCall(PetscFree(a->rlen));
858: PetscCall(PetscFree(a->sliidx));
859: PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
860: PetscCall(PetscFree(a->solve_work));
861: PetscCall(ISDestroy(&a->icol));
862: PetscCall(PetscFree(a->saved_values));
863: PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
865: PetscCall(PetscFree(A->data));
867: PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
868: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
869: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
870: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
871: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
872: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
873: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
874: PetscFunctionReturn(PETSC_SUCCESS);
875: }
877: PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
878: {
879: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
881: PetscFunctionBegin;
882: switch (op) {
883: case MAT_ROW_ORIENTED:
884: a->roworiented = flg;
885: break;
886: case MAT_KEEP_NONZERO_PATTERN:
887: a->keepnonzeropattern = flg;
888: break;
889: case MAT_NEW_NONZERO_LOCATIONS:
890: a->nonew = (flg ? 0 : 1);
891: break;
892: case MAT_NEW_NONZERO_LOCATION_ERR:
893: a->nonew = (flg ? -1 : 0);
894: break;
895: case MAT_NEW_NONZERO_ALLOCATION_ERR:
896: a->nonew = (flg ? -2 : 0);
897: break;
898: case MAT_UNUSED_NONZERO_LOCATION_ERR:
899: a->nounused = (flg ? -1 : 0);
900: break;
901: case MAT_FORCE_DIAGONAL_ENTRIES:
902: case MAT_IGNORE_OFF_PROC_ENTRIES:
903: case MAT_USE_HASH_TABLE:
904: case MAT_SORTED_FULL:
905: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
906: break;
907: case MAT_SPD:
908: case MAT_SYMMETRIC:
909: case MAT_STRUCTURALLY_SYMMETRIC:
910: case MAT_HERMITIAN:
911: case MAT_SYMMETRY_ETERNAL:
912: case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
913: case MAT_SPD_ETERNAL:
914: /* These options are handled directly by MatSetOption() */
915: break;
916: default:
917: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
918: }
919: PetscFunctionReturn(PETSC_SUCCESS);
920: }
922: PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
923: {
924: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
925: PetscInt i, j, n, shift;
926: PetscScalar *x, zero = 0.0;
928: PetscFunctionBegin;
929: PetscCall(VecGetLocalSize(v, &n));
930: PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
932: if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
933: PetscInt *diag = a->diag;
934: PetscCall(VecGetArray(v, &x));
935: for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
936: PetscCall(VecRestoreArray(v, &x));
937: PetscFunctionReturn(PETSC_SUCCESS);
938: }
940: PetscCall(VecSet(v, zero));
941: PetscCall(VecGetArray(v, &x));
942: for (i = 0; i < n; i++) { /* loop over rows */
943: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of row i */
944: x[i] = 0;
945: for (j = 0; j < a->rlen[i]; j++) {
946: if (a->colidx[shift + j * 8] == i) {
947: x[i] = a->val[shift + j * 8];
948: break;
949: }
950: }
951: }
952: PetscCall(VecRestoreArray(v, &x));
953: PetscFunctionReturn(PETSC_SUCCESS);
954: }
956: PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
957: {
958: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
959: const PetscScalar *l, *r;
960: PetscInt i, j, m, n, row;
962: PetscFunctionBegin;
963: if (ll) {
964: /* The local size is used so that VecMPI can be passed to this routine
965: by MatDiagonalScale_MPISELL */
966: PetscCall(VecGetLocalSize(ll, &m));
967: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
968: PetscCall(VecGetArrayRead(ll, &l));
969: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
970: if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
971: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
972: if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
973: }
974: } else {
975: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
976: }
977: }
978: PetscCall(VecRestoreArrayRead(ll, &l));
979: PetscCall(PetscLogFlops(a->nz));
980: }
981: if (rr) {
982: PetscCall(VecGetLocalSize(rr, &n));
983: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
984: PetscCall(VecGetArrayRead(rr, &r));
985: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
986: if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
987: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
988: if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
989: }
990: } else {
991: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
992: }
993: }
994: PetscCall(VecRestoreArrayRead(rr, &r));
995: PetscCall(PetscLogFlops(a->nz));
996: }
997: PetscCall(MatSeqSELLInvalidateDiagonal(A));
998: PetscFunctionReturn(PETSC_SUCCESS);
999: }
1001: PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1002: {
1003: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1004: PetscInt *cp, i, k, low, high, t, row, col, l;
1005: PetscInt shift;
1006: MatScalar *vp;
1008: PetscFunctionBegin;
1009: for (k = 0; k < m; k++) { /* loop over requested rows */
1010: row = im[k];
1011: if (row < 0) continue;
1012: PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1013: shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1014: cp = a->colidx + shift; /* pointer to the row */
1015: vp = a->val + shift; /* pointer to the row */
1016: for (l = 0; l < n; l++) { /* loop over requested columns */
1017: col = in[l];
1018: if (col < 0) continue;
1019: PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1020: high = a->rlen[row];
1021: low = 0; /* assume unsorted */
1022: while (high - low > 5) {
1023: t = (low + high) / 2;
1024: if (*(cp + t * 8) > col) high = t;
1025: else low = t;
1026: }
1027: for (i = low; i < high; i++) {
1028: if (*(cp + 8 * i) > col) break;
1029: if (*(cp + 8 * i) == col) {
1030: *v++ = *(vp + 8 * i);
1031: goto finished;
1032: }
1033: }
1034: *v++ = 0.0;
1035: finished:;
1036: }
1037: }
1038: PetscFunctionReturn(PETSC_SUCCESS);
1039: }
1041: PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1042: {
1043: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1044: PetscInt i, j, m = A->rmap->n, shift;
1045: const char *name;
1046: PetscViewerFormat format;
1048: PetscFunctionBegin;
1049: PetscCall(PetscViewerGetFormat(viewer, &format));
1050: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1051: PetscInt nofinalvalue = 0;
1052: /*
1053: if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1054: nofinalvalue = 1;
1055: }
1056: */
1057: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1058: PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1059: PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1060: #if defined(PETSC_USE_COMPLEX)
1061: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1062: #else
1063: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1064: #endif
1065: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1067: for (i = 0; i < m; i++) {
1068: shift = a->sliidx[i >> 3] + (i & 0x07);
1069: for (j = 0; j < a->rlen[i]; j++) {
1070: #if defined(PETSC_USE_COMPLEX)
1071: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1072: #else
1073: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]));
1074: #endif
1075: }
1076: }
1077: /*
1078: if (nofinalvalue) {
1079: #if defined(PETSC_USE_COMPLEX)
1080: PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1081: #else
1082: PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0));
1083: #endif
1084: }
1085: */
1086: PetscCall(PetscObjectGetName((PetscObject)A, &name));
1087: PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1088: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1089: } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1090: PetscFunctionReturn(PETSC_SUCCESS);
1091: } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1092: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1093: for (i = 0; i < m; i++) {
1094: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1095: shift = a->sliidx[i >> 3] + (i & 0x07);
1096: for (j = 0; j < a->rlen[i]; j++) {
1097: #if defined(PETSC_USE_COMPLEX)
1098: if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1099: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1100: } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1101: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1102: } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1103: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1104: }
1105: #else
1106: if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1107: #endif
1108: }
1109: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1110: }
1111: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1112: } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1113: PetscInt cnt = 0, jcnt;
1114: PetscScalar value;
1115: #if defined(PETSC_USE_COMPLEX)
1116: PetscBool realonly = PETSC_TRUE;
1117: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1118: if (PetscImaginaryPart(a->val[i]) != 0.0) {
1119: realonly = PETSC_FALSE;
1120: break;
1121: }
1122: }
1123: #endif
1125: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1126: for (i = 0; i < m; i++) {
1127: jcnt = 0;
1128: shift = a->sliidx[i >> 3] + (i & 0x07);
1129: for (j = 0; j < A->cmap->n; j++) {
1130: if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1131: value = a->val[cnt++];
1132: jcnt++;
1133: } else {
1134: value = 0.0;
1135: }
1136: #if defined(PETSC_USE_COMPLEX)
1137: if (realonly) {
1138: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1139: } else {
1140: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1141: }
1142: #else
1143: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1144: #endif
1145: }
1146: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1147: }
1148: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1149: } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1150: PetscInt fshift = 1;
1151: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1152: #if defined(PETSC_USE_COMPLEX)
1153: PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1154: #else
1155: PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1156: #endif
1157: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1158: for (i = 0; i < m; i++) {
1159: shift = a->sliidx[i >> 3] + (i & 0x07);
1160: for (j = 0; j < a->rlen[i]; j++) {
1161: #if defined(PETSC_USE_COMPLEX)
1162: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1163: #else
1164: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]));
1165: #endif
1166: }
1167: }
1168: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1169: } else if (format == PETSC_VIEWER_NATIVE) {
1170: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1171: PetscInt row;
1172: PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1173: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1174: #if defined(PETSC_USE_COMPLEX)
1175: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1176: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1177: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1178: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1179: } else {
1180: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1181: }
1182: #else
1183: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]));
1184: #endif
1185: }
1186: }
1187: } else {
1188: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1189: if (A->factortype) {
1190: for (i = 0; i < m; i++) {
1191: shift = a->sliidx[i >> 3] + (i & 0x07);
1192: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1193: /* L part */
1194: for (j = shift; j < a->diag[i]; j += 8) {
1195: #if defined(PETSC_USE_COMPLEX)
1196: if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1197: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1198: } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1199: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1200: } else {
1201: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1202: }
1203: #else
1204: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1205: #endif
1206: }
1207: /* diagonal */
1208: j = a->diag[i];
1209: #if defined(PETSC_USE_COMPLEX)
1210: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1211: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1212: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1213: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1214: } else {
1215: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1216: }
1217: #else
1218: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1219: #endif
1221: /* U part */
1222: for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1223: #if defined(PETSC_USE_COMPLEX)
1224: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1225: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1226: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1227: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1228: } else {
1229: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1230: }
1231: #else
1232: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1233: #endif
1234: }
1235: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1236: }
1237: } else {
1238: for (i = 0; i < m; i++) {
1239: shift = a->sliidx[i >> 3] + (i & 0x07);
1240: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1241: for (j = 0; j < a->rlen[i]; j++) {
1242: #if defined(PETSC_USE_COMPLEX)
1243: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1244: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1245: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1246: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1247: } else {
1248: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1249: }
1250: #else
1251: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1252: #endif
1253: }
1254: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1255: }
1256: }
1257: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1258: }
1259: PetscCall(PetscViewerFlush(viewer));
1260: PetscFunctionReturn(PETSC_SUCCESS);
1261: }
1263: #include <petscdraw.h>
1264: PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1265: {
1266: Mat A = (Mat)Aa;
1267: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1268: PetscInt i, j, m = A->rmap->n, shift;
1269: int color;
1270: PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1271: PetscViewer viewer;
1272: PetscViewerFormat format;
1274: PetscFunctionBegin;
1275: PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1276: PetscCall(PetscViewerGetFormat(viewer, &format));
1277: PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1279: /* loop over matrix elements drawing boxes */
1281: if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1282: PetscDrawCollectiveBegin(draw);
1283: /* Blue for negative, Cyan for zero and Red for positive */
1284: color = PETSC_DRAW_BLUE;
1285: for (i = 0; i < m; i++) {
1286: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of row i */
1287: y_l = m - i - 1.0;
1288: y_r = y_l + 1.0;
1289: for (j = 0; j < a->rlen[i]; j++) {
1290: x_l = a->colidx[shift + j * 8];
1291: x_r = x_l + 1.0;
1292: if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1293: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1294: }
1295: }
1296: color = PETSC_DRAW_CYAN;
1297: for (i = 0; i < m; i++) {
1298: shift = a->sliidx[i >> 3] + (i & 0x07);
1299: y_l = m - i - 1.0;
1300: y_r = y_l + 1.0;
1301: for (j = 0; j < a->rlen[i]; j++) {
1302: x_l = a->colidx[shift + j * 8];
1303: x_r = x_l + 1.0;
1304: if (a->val[shift + 8 * j] != 0.) continue;
1305: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1306: }
1307: }
1308: color = PETSC_DRAW_RED;
1309: for (i = 0; i < m; i++) {
1310: shift = a->sliidx[i >> 3] + (i & 0x07);
1311: y_l = m - i - 1.0;
1312: y_r = y_l + 1.0;
1313: for (j = 0; j < a->rlen[i]; j++) {
1314: x_l = a->colidx[shift + j * 8];
1315: x_r = x_l + 1.0;
1316: if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1317: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1318: }
1319: }
1320: PetscDrawCollectiveEnd(draw);
1321: } else {
1322: /* use contour shading to indicate magnitude of values */
1323: /* first determine max of all nonzero values */
1324: PetscReal minv = 0.0, maxv = 0.0;
1325: PetscInt count = 0;
1326: PetscDraw popup;
1327: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1328: if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1329: }
1330: if (minv >= maxv) maxv = minv + PETSC_SMALL;
1331: PetscCall(PetscDrawGetPopup(draw, &popup));
1332: PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1334: PetscDrawCollectiveBegin(draw);
1335: for (i = 0; i < m; i++) {
1336: shift = a->sliidx[i >> 3] + (i & 0x07);
1337: y_l = m - i - 1.0;
1338: y_r = y_l + 1.0;
1339: for (j = 0; j < a->rlen[i]; j++) {
1340: x_l = a->colidx[shift + j * 8];
1341: x_r = x_l + 1.0;
1342: color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1343: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1344: count++;
1345: }
1346: }
1347: PetscDrawCollectiveEnd(draw);
1348: }
1349: PetscFunctionReturn(PETSC_SUCCESS);
1350: }
1352: #include <petscdraw.h>
1353: PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1354: {
1355: PetscDraw draw;
1356: PetscReal xr, yr, xl, yl, h, w;
1357: PetscBool isnull;
1359: PetscFunctionBegin;
1360: PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1361: PetscCall(PetscDrawIsNull(draw, &isnull));
1362: if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1364: xr = A->cmap->n;
1365: yr = A->rmap->n;
1366: h = yr / 10.0;
1367: w = xr / 10.0;
1368: xr += w;
1369: yr += h;
1370: xl = -w;
1371: yl = -h;
1372: PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1373: PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1374: PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1375: PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1376: PetscCall(PetscDrawSave(draw));
1377: PetscFunctionReturn(PETSC_SUCCESS);
1378: }
1380: PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1381: {
1382: PetscBool iascii, isbinary, isdraw;
1384: PetscFunctionBegin;
1385: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1386: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1387: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1388: if (iascii) {
1389: PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1390: } else if (isbinary) {
1391: /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1392: } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1393: PetscFunctionReturn(PETSC_SUCCESS);
1394: }
1396: PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1397: {
1398: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1399: PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1400: MatScalar *vp;
1402: PetscFunctionBegin;
1403: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1404: /* To do: compress out the unused elements */
1405: PetscCall(MatMarkDiagonal_SeqSELL(A));
1406: PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1407: PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1408: PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1409: /* Set unused slots for column indices to a nearby valid column index, and unused slots for values to zero. This allows the use of unmasked intrinsics for higher performance. */
1410: for (i = 0; i < a->totalslices; ++i) {
1411: shift = a->sliidx[i]; /* starting index of the slice */
1412: cp = a->colidx + shift; /* pointer to the column indices of the slice */
1413: vp = a->val + shift; /* pointer to the nonzero values of the slice */
1414: for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1415: row = 8 * i + row_in_slice;
1416: nrow = a->rlen[row]; /* number of nonzeros in row */
1417: /*
1418: Search for a nearby valid column index to use for padding. Simply padding with index 0 could cause extra communication in the parallel case.
1419: If the entire slice is empty, however, 0 is fine since the padded index will never be loaded.
1420: */
1421: lastcol = 0;
1422: if (nrow > 0) { /* nonempty row */
1423: lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index of the last nonzero in the current row */
1424: } else if (!row_in_slice) { /* the first row of the current slice is empty */
1425: for (j = 1; j < 8; j++) {
1426: if (a->rlen[8 * i + j]) {
1427: lastcol = cp[j];
1428: break;
1429: }
1430: }
1431: } else {
1432: if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1433: }
1435: for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1436: cp[8 * k + row_in_slice] = lastcol;
1437: vp[8 * k + row_in_slice] = (MatScalar)0;
1438: }
1439: }
1440: }
1442: A->info.mallocs += a->reallocs;
1443: a->reallocs = 0;
1445: PetscCall(MatSeqSELLInvalidateDiagonal(A));
1446: PetscFunctionReturn(PETSC_SUCCESS);
1447: }
1449: PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1450: {
1451: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1453: PetscFunctionBegin;
1454: info->block_size = 1.0;
1455: info->nz_allocated = a->maxallocmat;
1456: info->nz_used = a->sliidx[a->totalslices]; /* includes the padding zeros */
1457: info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]);
1458: info->assemblies = A->num_ass;
1459: info->mallocs = A->info.mallocs;
1460: info->memory = 0; /* REVIEW ME */
1461: if (A->factortype) {
1462: info->fill_ratio_given = A->info.fill_ratio_given;
1463: info->fill_ratio_needed = A->info.fill_ratio_needed;
1464: info->factor_mallocs = A->info.factor_mallocs;
1465: } else {
1466: info->fill_ratio_given = 0;
1467: info->fill_ratio_needed = 0;
1468: info->factor_mallocs = 0;
1469: }
1470: PetscFunctionReturn(PETSC_SUCCESS);
1471: }
1473: PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1474: {
1475: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1476: PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow;
1477: PetscInt *cp, nonew = a->nonew, lastcol = -1;
1478: MatScalar *vp, value;
1480: PetscFunctionBegin;
1481: for (k = 0; k < m; k++) { /* loop over added rows */
1482: row = im[k];
1483: if (row < 0) continue;
1484: PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1485: shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1486: cp = a->colidx + shift; /* pointer to the row */
1487: vp = a->val + shift; /* pointer to the row */
1488: nrow = a->rlen[row];
1489: low = 0;
1490: high = nrow;
1492: for (l = 0; l < n; l++) { /* loop over added columns */
1493: col = in[l];
1494: if (col < 0) continue;
1495: PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: col %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1496: if (a->roworiented) {
1497: value = v[l + k * n];
1498: } else {
1499: value = v[k + l * m];
1500: }
1501: if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1503: /* search this row for the specified column; on exit i indicates the slot where the value is to be placed */
1504: if (col <= lastcol) low = 0;
1505: else high = nrow;
1506: lastcol = col;
1507: while (high - low > 5) {
1508: t = (low + high) / 2;
1509: if (*(cp + t * 8) > col) high = t;
1510: else low = t;
1511: }
1512: for (i = low; i < high; i++) {
1513: if (*(cp + i * 8) > col) break;
1514: if (*(cp + i * 8) == col) {
1515: if (is == ADD_VALUES) *(vp + i * 8) += value;
1516: else *(vp + i * 8) = value;
1517: low = i + 1;
1518: goto noinsert;
1519: }
1520: }
1521: if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1522: if (nonew == 1) goto noinsert;
1523: PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1524: /* If the current row is already full (nrow == slice width), allocate more space; otherwise do nothing */
1525: MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1526: /* shift the elements at and after position i in the current row right by one slot, then insert the new nonzero at position i */
1527: for (ii = nrow - 1; ii >= i; ii--) {
1528: *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1529: *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1530: }
1531: a->rlen[row]++;
1532: *(cp + i * 8) = col;
1533: *(vp + i * 8) = value;
1534: a->nz++;
1535: A->nonzerostate++;
1536: low = i + 1;
1537: high++;
1538: nrow++;
1539: noinsert:;
1540: }
1541: a->rlen[row] = nrow;
1542: }
1543: PetscFunctionReturn(PETSC_SUCCESS);
1544: }
1546: PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1547: {
1548: PetscFunctionBegin;
1549: /* If the two matrices have the same copy implementation, use fast copy. */
1550: if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1551: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1552: Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1554: PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in the two matrices is different");
1555: PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1556: } else {
1557: PetscCall(MatCopy_Basic(A, B, str));
1558: }
1559: PetscFunctionReturn(PETSC_SUCCESS);
1560: }
1562: PetscErrorCode MatSetUp_SeqSELL(Mat A)
1563: {
1564: PetscFunctionBegin;
1565: PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1566: PetscFunctionReturn(PETSC_SUCCESS);
1567: }
1569: PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1570: {
1571: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1573: PetscFunctionBegin;
1574: *array = a->val;
1575: PetscFunctionReturn(PETSC_SUCCESS);
1576: }
1578: PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1579: {
1580: PetscFunctionBegin;
1581: PetscFunctionReturn(PETSC_SUCCESS);
1582: }
1584: PetscErrorCode MatRealPart_SeqSELL(Mat A)
1585: {
1586: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1587: PetscInt i;
1588: MatScalar *aval = a->val;
1590: PetscFunctionBegin;
1591: for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1592: PetscFunctionReturn(PETSC_SUCCESS);
1593: }
1595: PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1596: {
1597: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1598: PetscInt i;
1599: MatScalar *aval = a->val;
1601: PetscFunctionBegin;
1602: for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1603: PetscCall(MatSeqSELLInvalidateDiagonal(A));
1604: PetscFunctionReturn(PETSC_SUCCESS);
1605: }
1607: PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1608: {
1609: Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data;
1610: MatScalar *aval = a->val;
1611: PetscScalar oalpha = alpha;
1612: PetscBLASInt one = 1, size;
1614: PetscFunctionBegin;
1615: PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1616: PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1617: PetscCall(PetscLogFlops(a->nz));
1618: PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1619: PetscFunctionReturn(PETSC_SUCCESS);
1620: }
1622: PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1623: {
1624: Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1626: PetscFunctionBegin;
1627: if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1628: PetscCall(MatShift_Basic(Y, a));
1629: PetscFunctionReturn(PETSC_SUCCESS);
1630: }
1632: PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1633: {
1634: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1635: PetscScalar *x, sum, *t;
1636: const MatScalar *idiag = NULL, *mdiag;
1637: const PetscScalar *b, *xb;
1638: PetscInt n, m = A->rmap->n, i, j, shift;
1639: const PetscInt *diag;
1641: PetscFunctionBegin;
1642: its = its * lits;
1644: if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1645: if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1646: a->fshift = fshift;
1647: a->omega = omega;
1649: diag = a->diag;
1650: t = a->ssor_work;
1651: idiag = a->idiag;
1652: mdiag = a->mdiag;
1654: PetscCall(VecGetArray(xx, &x));
1655: PetscCall(VecGetArrayRead(bb, &b));
1656: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1657: PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1658: PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1659: PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1661: if (flag & SOR_ZERO_INITIAL_GUESS) {
1662: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1663: for (i = 0; i < m; i++) {
1664: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1665: sum = b[i];
1666: n = (diag[i] - shift) / 8;
1667: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1668: t[i] = sum;
1669: x[i] = sum * idiag[i];
1670: }
1671: xb = t;
1672: PetscCall(PetscLogFlops(a->nz));
1673: } else xb = b;
1674: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1675: for (i = m - 1; i >= 0; i--) {
1676: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1677: sum = xb[i];
1678: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1679: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1680: if (xb == b) {
1681: x[i] = sum * idiag[i];
1682: } else {
1683: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1684: }
1685: }
1686: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1687: }
1688: its--;
1689: }
1690: while (its--) {
1691: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1692: for (i = 0; i < m; i++) {
1693: /* lower */
1694: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1695: sum = b[i];
1696: n = (diag[i] - shift) / 8;
1697: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1698: t[i] = sum; /* save application of the lower-triangular part */
1699: /* upper */
1700: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1701: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1702: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1703: }
1704: xb = t;
1705: PetscCall(PetscLogFlops(2.0 * a->nz));
1706: } else xb = b;
1707: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1708: for (i = m - 1; i >= 0; i--) {
1709: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1710: sum = xb[i];
1711: if (xb == b) {
1712: /* whole matrix (no checkpointing available) */
1713: n = a->rlen[i];
1714: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1715: x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1716: } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1717: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1718: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1719: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1720: }
1721: }
1722: if (xb == b) {
1723: PetscCall(PetscLogFlops(2.0 * a->nz));
1724: } else {
1725: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1726: }
1727: }
1728: }
1729: PetscCall(VecRestoreArray(xx, &x));
1730: PetscCall(VecRestoreArrayRead(bb, &b));
1731: PetscFunctionReturn(PETSC_SUCCESS);
1732: }
1734: static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1735: MatGetRow_SeqSELL,
1736: MatRestoreRow_SeqSELL,
1737: MatMult_SeqSELL,
1738: /* 4*/ MatMultAdd_SeqSELL,
1739: MatMultTranspose_SeqSELL,
1740: MatMultTransposeAdd_SeqSELL,
1741: NULL,
1742: NULL,
1743: NULL,
1744: /* 10*/ NULL,
1745: NULL,
1746: NULL,
1747: MatSOR_SeqSELL,
1748: NULL,
1749: /* 15*/ MatGetInfo_SeqSELL,
1750: MatEqual_SeqSELL,
1751: MatGetDiagonal_SeqSELL,
1752: MatDiagonalScale_SeqSELL,
1753: NULL,
1754: /* 20*/ NULL,
1755: MatAssemblyEnd_SeqSELL,
1756: MatSetOption_SeqSELL,
1757: MatZeroEntries_SeqSELL,
1758: /* 24*/ NULL,
1759: NULL,
1760: NULL,
1761: NULL,
1762: NULL,
1763: /* 29*/ MatSetUp_SeqSELL,
1764: NULL,
1765: NULL,
1766: NULL,
1767: NULL,
1768: /* 34*/ MatDuplicate_SeqSELL,
1769: NULL,
1770: NULL,
1771: NULL,
1772: NULL,
1773: /* 39*/ NULL,
1774: NULL,
1775: NULL,
1776: MatGetValues_SeqSELL,
1777: MatCopy_SeqSELL,
1778: /* 44*/ NULL,
1779: MatScale_SeqSELL,
1780: MatShift_SeqSELL,
1781: NULL,
1782: NULL,
1783: /* 49*/ NULL,
1784: NULL,
1785: NULL,
1786: NULL,
1787: NULL,
1788: /* 54*/ MatFDColoringCreate_SeqXAIJ,
1789: NULL,
1790: NULL,
1791: NULL,
1792: NULL,
1793: /* 59*/ NULL,
1794: MatDestroy_SeqSELL,
1795: MatView_SeqSELL,
1796: NULL,
1797: NULL,
1798: /* 64*/ NULL,
1799: NULL,
1800: NULL,
1801: NULL,
1802: NULL,
1803: /* 69*/ NULL,
1804: NULL,
1805: NULL,
1806: NULL,
1807: NULL,
1808: /* 74*/ NULL,
1809: MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1810: NULL,
1811: NULL,
1812: NULL,
1813: /* 79*/ NULL,
1814: NULL,
1815: NULL,
1816: NULL,
1817: NULL,
1818: /* 84*/ NULL,
1819: NULL,
1820: NULL,
1821: NULL,
1822: NULL,
1823: /* 89*/ NULL,
1824: NULL,
1825: NULL,
1826: NULL,
1827: NULL,
1828: /* 94*/ NULL,
1829: NULL,
1830: NULL,
1831: NULL,
1832: NULL,
1833: /* 99*/ NULL,
1834: NULL,
1835: NULL,
1836: MatConjugate_SeqSELL,
1837: NULL,
1838: /*104*/ NULL,
1839: NULL,
1840: NULL,
1841: NULL,
1842: NULL,
1843: /*109*/ NULL,
1844: NULL,
1845: NULL,
1846: NULL,
1847: MatMissingDiagonal_SeqSELL,
1848: /*114*/ NULL,
1849: NULL,
1850: NULL,
1851: NULL,
1852: NULL,
1853: /*119*/ NULL,
1854: NULL,
1855: NULL,
1856: NULL,
1857: NULL,
1858: /*124*/ NULL,
1859: NULL,
1860: NULL,
1861: NULL,
1862: NULL,
1863: /*129*/ NULL,
1864: NULL,
1865: NULL,
1866: NULL,
1867: NULL,
1868: /*134*/ NULL,
1869: NULL,
1870: NULL,
1871: NULL,
1872: NULL,
1873: /*139*/ NULL,
1874: NULL,
1875: NULL,
1876: MatFDColoringSetUp_SeqXAIJ,
1877: NULL,
1878: /*144*/ NULL,
1879: NULL,
1880: NULL,
1881: NULL,
1882: NULL,
1883: NULL,
1884: /*150*/ NULL,
1885: NULL};
1887: PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1888: {
1889: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1891: PetscFunctionBegin;
1892: PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE); first");
1894: /* allocate space for values if not already there */
1895: if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1897: /* copy values over */
1898: PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1899: PetscFunctionReturn(PETSC_SUCCESS);
1900: }
1902: PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1903: {
1904: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1906: PetscFunctionBegin;
1907: PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE); first");
1908: PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A); first");
1909: PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1910: PetscFunctionReturn(PETSC_SUCCESS);
1911: }
1913: /*@C
1914: MatSeqSELLRestoreArray - restores access to the array where the data for a `MATSEQSELL` matrix is stored, obtained with `MatSeqSELLGetArray()`
1916: Not Collective
1918: Input Parameters:
1919: + mat - a `MATSEQSELL` matrix
1920: - array - pointer to the data
1922: Level: intermediate
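Example Usage:
A minimal sketch of pairing the get/restore calls (error checking with `PetscCall()` omitted; the array name is illustrative):
.vb
   PetscScalar *vals;
   MatSeqSELLGetArray(A, &vals);
   /* read or modify the stored values (including the padding slots) */
   MatSeqSELLRestoreArray(A, &vals);
.ve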
1924: .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1925: @*/
1926: PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
1927: {
1928: PetscFunctionBegin;
1929: PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1930: PetscFunctionReturn(PETSC_SUCCESS);
1931: }
1933: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1934: {
1935: Mat_SeqSELL *b;
1936: PetscMPIInt size;
1938: PetscFunctionBegin;
1939: PetscCall(PetscCitationsRegister(citation, &cited));
1940: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
1941: PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
1943: PetscCall(PetscNew(&b));
1945: B->data = (void *)b;
1947: PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
1949: b->row = NULL;
1950: b->col = NULL;
1951: b->icol = NULL;
1952: b->reallocs = 0;
1953: b->ignorezeroentries = PETSC_FALSE;
1954: b->roworiented = PETSC_TRUE;
1955: b->nonew = 0;
1956: b->diag = NULL;
1957: b->solve_work = NULL;
1958: B->spptr = NULL;
1959: b->saved_values = NULL;
1960: b->idiag = NULL;
1961: b->mdiag = NULL;
1962: b->ssor_work = NULL;
1963: b->omega = 1.0;
1964: b->fshift = 0.0;
1965: b->idiagvalid = PETSC_FALSE;
1966: b->keepnonzeropattern = PETSC_FALSE;
1968: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
1969: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
1970: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
1971: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
1972: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
1973: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
1974: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
1975: PetscFunctionReturn(PETSC_SUCCESS);
1976: }
1978: /*
1979: Given a matrix C generated with MatGetFactor(), duplicates all the information in A into C
1980: */
1981: PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
1982: {
1983: Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1984: PetscInt i, m = A->rmap->n;
1985: PetscInt totalslices = a->totalslices;
1987: PetscFunctionBegin;
1988: C->factortype = A->factortype;
1989: c->row = NULL;
1990: c->col = NULL;
1991: c->icol = NULL;
1992: c->reallocs = 0;
1993: C->assembled = PETSC_TRUE;
1995: PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
1996: PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
1998: PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
1999: PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2001: for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2002: for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2004: /* allocate the matrix space */
2005: if (mallocmatspace) {
2006: PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2008: c->singlemalloc = PETSC_TRUE;
2010: if (m > 0) {
2011: PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2012: if (cpvalues == MAT_COPY_VALUES) {
2013: PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2014: } else {
2015: PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2016: }
2017: }
2018: }
2020: c->ignorezeroentries = a->ignorezeroentries;
2021: c->roworiented = a->roworiented;
2022: c->nonew = a->nonew;
2023: if (a->diag) {
2024: PetscCall(PetscMalloc1(m, &c->diag));
2025: for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2026: } else c->diag = NULL;
2028: c->solve_work = NULL;
2029: c->saved_values = NULL;
2030: c->idiag = NULL;
2031: c->ssor_work = NULL;
2032: c->keepnonzeropattern = a->keepnonzeropattern;
2033: c->free_val = PETSC_TRUE;
2034: c->free_colidx = PETSC_TRUE;
2036: c->maxallocmat = a->maxallocmat;
2037: c->maxallocrow = a->maxallocrow;
2038: c->rlenmax = a->rlenmax;
2039: c->nz = a->nz;
2040: C->preallocated = PETSC_TRUE;
2042: c->nonzerorowcnt = a->nonzerorowcnt;
2043: C->nonzerostate = A->nonzerostate;
2045: PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2046: PetscFunctionReturn(PETSC_SUCCESS);
2047: }
2049: PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2050: {
2051: PetscFunctionBegin;
2052: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2053: PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2054: if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2055: PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2056: PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2057: PetscFunctionReturn(PETSC_SUCCESS);
2058: }
2060: /*MC
2061: MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2062: based on the sliced Ellpack format
2064: Options Database Key:
2065: . -mat_type seqsell - sets the matrix type to `MATSEQSELL` during a call to `MatSetFromOptions()`
2067: Level: beginner
2069: .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2070: M*/
2072: /*MC
2073: MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2075: This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2076: and `MATMPISELL` otherwise. As a result, for single process communicators,
2077: `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2078: for communicators controlling multiple processes. It is recommended that you call both of
2079: the above preallocation routines for simplicity.
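   A minimal sketch of this pattern (the names rlenmax, d_rlenmax, and o_rlenmax are illustrative placeholders; the preallocation routine that does not match the actual type is simply ignored):
.vb
   MatCreate(comm, &A);
   MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, M, N);
   MatSetType(A, MATSELL);
   MatSetFromOptions(A);
   MatSeqSELLSetPreallocation(A, rlenmax, NULL);                    /* used when the communicator has one process */
   MatMPISELLSetPreallocation(A, d_rlenmax, NULL, o_rlenmax, NULL); /* used when it has more than one */
.ve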
2081: Options Database Key:
2082: . -mat_type sell - sets the matrix type to `MATSELL` during a call to `MatSetFromOptions()`
2084: Level: beginner
2086: Notes:
2087: This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2089: It can provide better performance on Intel and AMD processors with AVX2 or AVX-512 support for matrices that have a similar number of
2090: nonzeros in contiguous groups of rows. However, if the computation is memory-bandwidth limited it may not provide much improvement.
2092: Developer Notes:
2093: On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2095: The sparse matrix format is as follows. For simplicity we assume a slice size of 2; the actual slice size used in the code is 8.
2096: .vb
2097:                             (2 0 3 4)
2098:    Consider the matrix A =  (5 0 6 0)
2099:                             (0 0 7 8)
2100:                             (0 0 9 9)
2102:    symbolically the Ellpack format can be written as
2104:        (2 3 4 |)            (0 2 3 |)
2105:    v = (5 6 0 |)   colidx = (0 2 2 |)
2106:        ---------            ---------
2107:        (7 8   |)            (2 3   |)
2108:        (9 9   |)            (2 3   |)
2110:    The data for 2 contiguous rows of the matrix are stored together in column-major order (with any left-over rows handled as a special case).
2111:    Any row in a slice with fewer nonzeros than the rest of the slice (row 1 above) is padded with a previously used valid column index in its "extra" colidx[] locations and
2112:    zeros in its "extra" v locations, so that the matrix operations do not need special code to handle rows of different lengths within the 2 rows of a slice.
2114:    The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx it is (0 0 2 2 3 2 2 2 3 3)
2116: .ve
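   To make the layout concrete, the following scalar loop is a simplified sketch of y = A*x computed directly from the sliidx/colidx/val arrays with the slice height of 8 used in the code (it is not the vectorized kernel; variable names are illustrative). Because of the padding described above, every row in a slice can be traversed with the same stride and the padded entries contribute zero:
.vb
   for (slice = 0; slice < totalslices; slice++) {
     for (r = 0; r < 8; r++) {                 /* row within the slice */
       PetscInt    row = 8 * slice + r;
       PetscScalar sum = 0.0;
       if (row >= m) break;                    /* the last slice may cover fewer than 8 rows */
       /* the entries of one row are strided by 8 inside its slice */
       for (k = sliidx[slice] + r; k < sliidx[slice + 1]; k += 8) sum += val[k] * x[colidx[k]];
       y[row] = sum;
     }
   }
.ve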
2118: See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2120: References:
2121: . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2122: Proceedings of the 47th International Conference on Parallel Processing, 2018.
2124: .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2125: M*/
2127: /*@C
2128: MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2130: Collective
2132: Input Parameters:
2133: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2134: . m - number of rows
2135: . n - number of columns
2136: . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2137: - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2139: Output Parameter:
2140: . A - the matrix
2142: Level: intermediate
2144: Notes:
2145: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2146: MatXXXXSetPreallocation() paradigm instead of this routine directly.
2147: [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2149: Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2150: Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2151: allocation.
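   Example Usage:
   A minimal sketch of the recommended paradigm (error checking with `PetscCall()` omitted; m, n, and rlenmax are placeholders supplied by the caller):
.vb
   Mat A;
   MatCreate(PETSC_COMM_SELF, &A);
   MatSetSizes(A, m, n, m, n);
   MatSetType(A, MATSEQSELL);
   MatSeqSELLSetPreallocation(A, rlenmax, NULL);
   /* fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() */
   MatDestroy(&A);
.ve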
2153: .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2154: @*/
2155: PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2156: {
2157: PetscFunctionBegin;
2158: PetscCall(MatCreate(comm, A));
2159: PetscCall(MatSetSizes(*A, m, n, m, n));
2160: PetscCall(MatSetType(*A, MATSEQSELL));
2161: PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2162: PetscFunctionReturn(PETSC_SUCCESS);
2163: }
2165: PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2166: {
2167: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2168: PetscInt totalslices = a->totalslices;
2170: PetscFunctionBegin;
2171: /* if the matrix dimensions, numbers of nonzeros, or maximum row lengths differ, the matrices are not equal */
2172: if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2173: *flg = PETSC_FALSE;
2174: PetscFunctionReturn(PETSC_SUCCESS);
2175: }
2177: /* check whether the column indices are the same */
2177: PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2178: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2180: /* check whether the values are the same */
2180: PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2181: PetscFunctionReturn(PETSC_SUCCESS);
2182: }
2184: PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2185: {
2186: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2188: PetscFunctionBegin;
2189: a->idiagvalid = PETSC_FALSE;
2190: PetscFunctionReturn(PETSC_SUCCESS);
2191: }
2193: PetscErrorCode MatConjugate_SeqSELL(Mat A)
2194: {
2195: #if defined(PETSC_USE_COMPLEX)
2196: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2197: PetscInt i;
2198: PetscScalar *val = a->val;
2200: PetscFunctionBegin;
2201: for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2202: #else
2203: PetscFunctionBegin;
2204: #endif
2205: PetscFunctionReturn(PETSC_SUCCESS);
2206: }