Actual source code: aijperm.c
2: /*
3: Defines basic operations for the MATSEQAIJPERM matrix class.
4: This class is derived from the MATSEQAIJ class and retains the
5: compressed row storage (aka Yale sparse matrix format) but augments
6: it with some permutation information that enables some operations
7: to be more vectorizable. A physically rearranged copy of the matrix
8: may be stored if the user desires.
10: Eventually a variety of permutations may be supported.
11: */
13: #include <../src/mat/impls/aij/seq/aij.h>
15: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16: #include <immintrin.h>
18: #if !defined(_MM_SCALE_8)
19: #define _MM_SCALE_8 8
20: #endif
21: #if !defined(_MM_SCALE_4)
22: #define _MM_SCALE_4 4
23: #endif
24: #endif
26: #define NDIM 512
27: /* NDIM specifies how many rows at a time we should work with when
28: * performing the vectorized mat-vec. This depends on various factors
29: * such as vector register length, etc., and I really need to add a
30: * way for the user (or the library) to tune this. I'm setting it to
31: * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32: * routines. */
34: typedef struct {
35: PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
37: PetscInt ngroup;
38: PetscInt *xgroup;
39: /* Denotes where groups of rows with same number of nonzeros
40: * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41: * where the ith group begins. */
43: PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */
44: PetscInt *iperm; /* The permutation vector. */
46: /* Some of this stuff is for Ed's recursive triangular solve.
47: * I'm not sure what I need yet. */
48: PetscInt blocksize;
49: PetscInt nstep;
50: PetscInt *jstart_list;
51: PetscInt *jend_list;
52: PetscInt *action_list;
53: PetscInt *ngroup_list;
54: PetscInt **ipointer_list;
55: PetscInt **xgroup_list;
56: PetscInt **nzgroup_list;
57: PetscInt **iperm_list;
58: } Mat_SeqAIJPERM;
60: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat)
61: {
62: /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63: /* so we will ignore 'MatType type'. */
64: Mat B = *newmat;
65: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
67: PetscFunctionBegin;
68: if (reuse == MAT_INITIAL_MATRIX) {
69: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
70: aijperm = (Mat_SeqAIJPERM *)B->spptr;
71: }
73: /* Reset the original function pointers. */
74: B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
75: B->ops->destroy = MatDestroy_SeqAIJ;
76: B->ops->duplicate = MatDuplicate_SeqAIJ;
77: B->ops->mult = MatMult_SeqAIJ;
78: B->ops->multadd = MatMultAdd_SeqAIJ;
80: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));
82: /* Free everything in the Mat_SeqAIJPERM data structure.*/
83: PetscCall(PetscFree(aijperm->xgroup));
84: PetscCall(PetscFree(aijperm->nzgroup));
85: PetscCall(PetscFree(aijperm->iperm));
86: PetscCall(PetscFree(B->spptr));
88: /* Change the type of B to MATSEQAIJ. */
89: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));
91: *newmat = B;
92: PetscFunctionReturn(PETSC_SUCCESS);
93: }
95: PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
96: {
97: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
99: PetscFunctionBegin;
100: if (aijperm) {
101: /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
102: PetscCall(PetscFree(aijperm->xgroup));
103: PetscCall(PetscFree(aijperm->nzgroup));
104: PetscCall(PetscFree(aijperm->iperm));
105: PetscCall(PetscFree(A->spptr));
106: }
107: /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
108: * to destroy everything that remains. */
109: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
110: /* Note that I don't call MatSetType(). I believe this is because that
111: * is only to be called when *building* a matrix. I could be wrong, but
112: * that is how things work for the SuperLU matrix class. */
113: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
114: PetscCall(MatDestroy_SeqAIJ(A));
115: PetscFunctionReturn(PETSC_SUCCESS);
116: }
118: PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
119: {
120: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
121: Mat_SeqAIJPERM *aijperm_dest;
122: PetscBool perm;
124: PetscFunctionBegin;
125: PetscCall(MatDuplicate_SeqAIJ(A, op, M));
126: PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
127: if (perm) {
128: aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
129: PetscCall(PetscFree(aijperm_dest->xgroup));
130: PetscCall(PetscFree(aijperm_dest->nzgroup));
131: PetscCall(PetscFree(aijperm_dest->iperm));
132: } else {
133: PetscCall(PetscNew(&aijperm_dest));
134: (*M)->spptr = (void *)aijperm_dest;
135: PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
136: PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
137: }
138: PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
139: /* Allocate space for, and copy the grouping and permutation info.
140: * I note that when the groups are initially determined in
141: * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
142: * necessary. But at this point, we know how large they need to be, and
143: * allocate only the necessary amount of memory. So the duplicated matrix
144: * may actually use slightly less storage than the original! */
145: PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
146: PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
147: PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
148: PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
149: PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
150: PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
151: PetscFunctionReturn(PETSC_SUCCESS);
152: }
154: PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
155: {
156: Mat_SeqAIJ *a = (Mat_SeqAIJ *)(A)->data;
157: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
158: PetscInt m; /* Number of rows in the matrix. */
159: PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */
160: PetscInt maxnz; /* Maximum number of nonzeros in any row. */
161: PetscInt *rows_in_bucket;
162: /* To construct the permutation, we sort each row into one of maxnz
163: * buckets based on how many nonzeros are in the row. */
164: PetscInt nz;
165: PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
166: PetscInt *ipnz;
167: /* When constructing the iperm permutation vector,
168: * ipnz[nz] is used to point to the next place in the permutation vector
169: * that a row with nz nonzero elements should be placed.*/
170: PetscInt i, ngroup, istart, ipos;
172: PetscFunctionBegin;
173: if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); /* permutation exists and matches current nonzero structure */
174: aijperm->nonzerostate = A->nonzerostate;
175: /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
176: PetscCall(PetscFree(aijperm->xgroup));
177: PetscCall(PetscFree(aijperm->nzgroup));
178: PetscCall(PetscFree(aijperm->iperm));
180: m = A->rmap->n;
181: ia = a->i;
183: /* Allocate the arrays that will hold the permutation vector. */
184: PetscCall(PetscMalloc1(m, &aijperm->iperm));
186: /* Allocate some temporary work arrays that will be used in
187: * calculating the permutation vector and groupings. */
188: PetscCall(PetscMalloc1(m, &nz_in_row));
190: /* Now actually figure out the permutation and grouping. */
192: /* First pass: Determine number of nonzeros in each row, maximum
193: * number of nonzeros in any row, and how many rows fall into each
194: * "bucket" of rows with same number of nonzeros. */
195: maxnz = 0;
196: for (i = 0; i < m; i++) {
197: nz_in_row[i] = ia[i + 1] - ia[i];
198: if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
199: }
200: PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
201: PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));
203: for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
204: for (i = 0; i < m; i++) {
205: nz = nz_in_row[i];
206: rows_in_bucket[nz]++;
207: }
209: /* Allocate space for the grouping info. There will be at most (maxnz + 1)
210: * groups. (It is maxnz + 1 instead of simply maxnz because there may be
211: * rows with no nonzero elements.) If there are (maxnz + 1) groups,
212: * then xgroup[] must consist of (maxnz + 2) elements, since the last
213: * element of xgroup will tell us where the (maxnz + 1)th group ends.
214: * We allocate space for the maximum number of groups;
215: * that is potentially a little wasteful, but not too much so.
216: * Perhaps I should fix it later. */
217: PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
218: PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));
220: /* Second pass. Look at what is in the buckets and create the groupings.
221: * Note that it is OK to have a group of rows with no non-zero values. */
222: ngroup = 0;
223: istart = 0;
224: for (i = 0; i <= maxnz; i++) {
225: if (rows_in_bucket[i] > 0) {
226: aijperm->nzgroup[ngroup] = i;
227: aijperm->xgroup[ngroup] = istart;
228: ngroup++;
229: istart += rows_in_bucket[i];
230: }
231: }
233: aijperm->xgroup[ngroup] = istart;
234: aijperm->ngroup = ngroup;
236: /* Now fill in the permutation vector iperm. */
237: ipnz[0] = 0;
238: for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];
240: for (i = 0; i < m; i++) {
241: nz = nz_in_row[i];
242: ipos = ipnz[nz];
243: aijperm->iperm[ipos] = i;
244: ipnz[nz]++;
245: }
247: /* Clean up temporary work arrays. */
248: PetscCall(PetscFree(rows_in_bucket));
249: PetscCall(PetscFree(ipnz));
250: PetscCall(PetscFree(nz_in_row));
251: PetscFunctionReturn(PETSC_SUCCESS);
252: }
254: PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
255: {
256: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
258: PetscFunctionBegin;
259: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
261: /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
262: * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
263: * I'm not sure if this is the best way to do this, but it avoids
264: * a lot of code duplication.
265: * I also note that currently MATSEQAIJPERM doesn't know anything about
266: * the Mat_CompressedRow data structure that SeqAIJ now uses when there
267: * are many zero rows. If the SeqAIJ assembly end routine decides to use
268: * this, this may break things. (Don't know... haven't looked at it.) */
269: a->inode.use = PETSC_FALSE;
270: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
272: /* Now calculate the permutation and grouping information. */
273: PetscCall(MatSeqAIJPERM_create_perm(A));
274: PetscFunctionReturn(PETSC_SUCCESS);
275: }
277: PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy)
278: {
279: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
280: const PetscScalar *x;
281: PetscScalar *y;
282: const MatScalar *aa;
283: const PetscInt *aj, *ai;
284: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
285: PetscInt i, j;
286: #endif
287: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
288: __m512d vec_x, vec_y, vec_vals;
289: __m256i vec_idx, vec_ipos, vec_j;
290: __mmask8 mask;
291: #endif
293: /* Variables that don't appear in MatMult_SeqAIJ. */
294: Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
295: PetscInt *iperm; /* Points to the permutation vector. */
296: PetscInt *xgroup;
297: /* Denotes where groups of rows with same number of nonzeros
298: * begin and end in iperm. */
299: PetscInt *nzgroup;
300: PetscInt ngroup;
301: PetscInt igroup;
302: PetscInt jstart, jend;
303: /* jstart is used in loops to denote the position in iperm where a
304: * group starts; jend denotes the position where it ends.
305: * (jend + 1 is where the next group starts.) */
306: PetscInt iold, nz;
307: PetscInt istart, iend, isize;
308: PetscInt ipos;
309: PetscScalar yp[NDIM];
310: PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
312: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
313: #pragma disjoint(*x, *y, *aa)
314: #endif
316: PetscFunctionBegin;
317: PetscCall(VecGetArrayRead(xx, &x));
318: PetscCall(VecGetArray(yy, &y));
319: aj = a->j; /* aj[k] gives column index for element aa[k]. */
320: aa = a->a; /* Nonzero elements stored row-by-row. */
321: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
323: /* Get the info we need about the permutations and groupings. */
324: iperm = aijperm->iperm;
325: ngroup = aijperm->ngroup;
326: xgroup = aijperm->xgroup;
327: nzgroup = aijperm->nzgroup;
329: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
330: fortranmultaijperm_(&m, x, ii, aj, aa, y);
331: #else
333: for (igroup = 0; igroup < ngroup; igroup++) {
334: jstart = xgroup[igroup];
335: jend = xgroup[igroup + 1] - 1;
336: nz = nzgroup[igroup];
338: /* Handle the special cases where the number of nonzeros per row
339: * in the group is either 0 or 1. */
340: if (nz == 0) {
341: for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
342: } else if (nz == 1) {
343: for (i = jstart; i <= jend; i++) {
344: iold = iperm[i];
345: ipos = ai[iold];
346: y[iold] = aa[ipos] * x[aj[ipos]];
347: }
348: } else {
349: /* We work our way through the current group in chunks of NDIM rows
350: * at a time. */
352: for (istart = jstart; istart <= jend; istart += NDIM) {
353: /* Figure out where the chunk of 'isize' rows ends in iperm.
354: * 'isize may of course be less than NDIM for the last chunk. */
355: iend = istart + (NDIM - 1);
357: if (iend > jend) iend = jend;
359: isize = iend - istart + 1;
361: /* Initialize the yp[] array that will be used to hold part of
362: * the permuted results vector, and figure out where in aa each
363: * row of the chunk will begin. */
364: for (i = 0; i < isize; i++) {
365: iold = iperm[istart + i];
366: /* iold is a row number from the matrix A *before* reordering. */
367: ip[i] = ai[iold];
368: /* ip[i] tells us where the ith row of the chunk begins in aa. */
369: yp[i] = (PetscScalar)0.0;
370: }
372: /* If the number of zeros per row exceeds the number of rows in
373: * the chunk, we should vectorize along nz, that is, perform the
374: * mat-vec one row at a time as in the usual CSR case. */
375: if (nz > isize) {
376: #if defined(PETSC_HAVE_CRAY_VECTOR)
377: #pragma _CRI preferstream
378: #endif
379: for (i = 0; i < isize; i++) {
380: #if defined(PETSC_HAVE_CRAY_VECTOR)
381: #pragma _CRI prefervector
382: #endif
384: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
385: vec_y = _mm512_setzero_pd();
386: ipos = ip[i];
387: for (j = 0; j < (nz >> 3); j++) {
388: vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
389: vec_vals = _mm512_loadu_pd(&aa[ipos]);
390: vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
391: vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
392: ipos += 8;
393: }
394: if ((nz & 0x07) > 2) {
395: mask = (__mmask8)(0xff >> (8 - (nz & 0x07)));
396: vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
397: vec_vals = _mm512_loadu_pd(&aa[ipos]);
398: vec_x = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
399: vec_y = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
400: } else if ((nz & 0x07) == 2) {
401: yp[i] += aa[ipos] * x[aj[ipos]];
402: yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
403: } else if ((nz & 0x07) == 1) {
404: yp[i] += aa[ipos] * x[aj[ipos]];
405: }
406: yp[i] += _mm512_reduce_add_pd(vec_y);
407: #else
408: for (j = 0; j < nz; j++) {
409: ipos = ip[i] + j;
410: yp[i] += aa[ipos] * x[aj[ipos]];
411: }
412: #endif
413: }
414: } else {
415: /* Otherwise, there are enough rows in the chunk to make it
416: * worthwhile to vectorize across the rows, that is, to do the
417: * matvec by operating with "columns" of the chunk. */
418: for (j = 0; j < nz; j++) {
419: #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
420: vec_j = _mm256_set1_epi32(j);
421: for (i = 0; i < ((isize >> 3) << 3); i += 8) {
422: vec_y = _mm512_loadu_pd(&yp[i]);
423: vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
424: vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
425: vec_idx = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
426: vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
427: vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
428: vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
429: _mm512_storeu_pd(&yp[i], vec_y);
430: }
431: for (i = isize - (isize & 0x07); i < isize; i++) {
432: ipos = ip[i] + j;
433: yp[i] += aa[ipos] * x[aj[ipos]];
434: }
435: #else
436: for (i = 0; i < isize; i++) {
437: ipos = ip[i] + j;
438: yp[i] += aa[ipos] * x[aj[ipos]];
439: }
440: #endif
441: }
442: }
444: #if defined(PETSC_HAVE_CRAY_VECTOR)
445: #pragma _CRI ivdep
446: #endif
447: /* Put results from yp[] into non-permuted result vector y. */
448: for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
449: } /* End processing chunk of isize rows of a group. */
450: } /* End handling matvec for chunk with nz > 1. */
451: } /* End loop over igroup. */
452: #endif
453: PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
454: PetscCall(VecRestoreArrayRead(xx, &x));
455: PetscCall(VecRestoreArray(yy, &y));
456: PetscFunctionReturn(PETSC_SUCCESS);
457: }
459: /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
460: * Note that the names I used to designate the vectors differs from that
461: * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent
462: * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
463: /*
464: I hate having virtually identical code for the mult and the multadd!!!
465: */
466: PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy)
467: {
468: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
469: const PetscScalar *x;
470: PetscScalar *y, *w;
471: const MatScalar *aa;
472: const PetscInt *aj, *ai;
473: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
474: PetscInt i, j;
475: #endif
477: /* Variables that don't appear in MatMultAdd_SeqAIJ. */
478: Mat_SeqAIJPERM *aijperm;
479: PetscInt *iperm; /* Points to the permutation vector. */
480: PetscInt *xgroup;
481: /* Denotes where groups of rows with same number of nonzeros
482: * begin and end in iperm. */
483: PetscInt *nzgroup;
484: PetscInt ngroup;
485: PetscInt igroup;
486: PetscInt jstart, jend;
487: /* jstart is used in loops to denote the position in iperm where a
488: * group starts; jend denotes the position where it ends.
489: * (jend + 1 is where the next group starts.) */
490: PetscInt iold, nz;
491: PetscInt istart, iend, isize;
492: PetscInt ipos;
493: PetscScalar yp[NDIM];
494: PetscInt ip[NDIM];
495: /* yp[] and ip[] are treated as vector "registers" for performing
496: * the mat-vec. */
498: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
499: #pragma disjoint(*x, *y, *aa)
500: #endif
502: PetscFunctionBegin;
503: PetscCall(VecGetArrayRead(xx, &x));
504: PetscCall(VecGetArrayPair(yy, ww, &y, &w));
506: aj = a->j; /* aj[k] gives column index for element aa[k]. */
507: aa = a->a; /* Nonzero elements stored row-by-row. */
508: ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
510: /* Get the info we need about the permutations and groupings. */
511: aijperm = (Mat_SeqAIJPERM *)A->spptr;
512: iperm = aijperm->iperm;
513: ngroup = aijperm->ngroup;
514: xgroup = aijperm->xgroup;
515: nzgroup = aijperm->nzgroup;
517: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
518: fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w);
519: #else
521: for (igroup = 0; igroup < ngroup; igroup++) {
522: jstart = xgroup[igroup];
523: jend = xgroup[igroup + 1] - 1;
525: nz = nzgroup[igroup];
527: /* Handle the special cases where the number of nonzeros per row
528: * in the group is either 0 or 1. */
529: if (nz == 0) {
530: for (i = jstart; i <= jend; i++) {
531: iold = iperm[i];
532: y[iold] = w[iold];
533: }
534: } else if (nz == 1) {
535: for (i = jstart; i <= jend; i++) {
536: iold = iperm[i];
537: ipos = ai[iold];
538: y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
539: }
540: }
541: /* For the general case: */
542: else {
543: /* We work our way through the current group in chunks of NDIM rows
544: * at a time. */
546: for (istart = jstart; istart <= jend; istart += NDIM) {
547: /* Figure out where the chunk of 'isize' rows ends in iperm.
548: * 'isize may of course be less than NDIM for the last chunk. */
549: iend = istart + (NDIM - 1);
550: if (iend > jend) iend = jend;
551: isize = iend - istart + 1;
553: /* Initialize the yp[] array that will be used to hold part of
554: * the permuted results vector, and figure out where in aa each
555: * row of the chunk will begin. */
556: for (i = 0; i < isize; i++) {
557: iold = iperm[istart + i];
558: /* iold is a row number from the matrix A *before* reordering. */
559: ip[i] = ai[iold];
560: /* ip[i] tells us where the ith row of the chunk begins in aa. */
561: yp[i] = w[iold];
562: }
564: /* If the number of zeros per row exceeds the number of rows in
565: * the chunk, we should vectorize along nz, that is, perform the
566: * mat-vec one row at a time as in the usual CSR case. */
567: if (nz > isize) {
568: #if defined(PETSC_HAVE_CRAY_VECTOR)
569: #pragma _CRI preferstream
570: #endif
571: for (i = 0; i < isize; i++) {
572: #if defined(PETSC_HAVE_CRAY_VECTOR)
573: #pragma _CRI prefervector
574: #endif
575: for (j = 0; j < nz; j++) {
576: ipos = ip[i] + j;
577: yp[i] += aa[ipos] * x[aj[ipos]];
578: }
579: }
580: }
581: /* Otherwise, there are enough rows in the chunk to make it
582: * worthwhile to vectorize across the rows, that is, to do the
583: * matvec by operating with "columns" of the chunk. */
584: else {
585: for (j = 0; j < nz; j++) {
586: for (i = 0; i < isize; i++) {
587: ipos = ip[i] + j;
588: yp[i] += aa[ipos] * x[aj[ipos]];
589: }
590: }
591: }
593: #if defined(PETSC_HAVE_CRAY_VECTOR)
594: #pragma _CRI ivdep
595: #endif
596: /* Put results from yp[] into non-permuted result vector y. */
597: for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
598: } /* End processing chunk of isize rows of a group. */
600: } /* End handling matvec for chunk with nz > 1. */
601: } /* End loop over igroup. */
603: #endif
604: PetscCall(PetscLogFlops(2.0 * a->nz));
605: PetscCall(VecRestoreArrayRead(xx, &x));
606: PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
607: PetscFunctionReturn(PETSC_SUCCESS);
608: }
610: /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
611: * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM()
612: * routine, but can also be used to convert an assembled SeqAIJ matrix
613: * into a SeqAIJPERM one. */
614: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat)
615: {
616: Mat B = *newmat;
617: Mat_SeqAIJPERM *aijperm;
618: PetscBool sametype;
620: PetscFunctionBegin;
621: if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
622: PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
623: if (sametype) PetscFunctionReturn(PETSC_SUCCESS);
625: PetscCall(PetscNew(&aijperm));
626: B->spptr = (void *)aijperm;
628: /* Set function pointers for methods that we inherit from AIJ but override. */
629: B->ops->duplicate = MatDuplicate_SeqAIJPERM;
630: B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
631: B->ops->destroy = MatDestroy_SeqAIJPERM;
632: B->ops->mult = MatMult_SeqAIJPERM;
633: B->ops->multadd = MatMultAdd_SeqAIJPERM;
635: aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
636: /* If A has already been assembled, compute the permutation. */
637: if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));
639: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
641: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM));
642: *newmat = B;
643: PetscFunctionReturn(PETSC_SUCCESS);
644: }
646: /*@C
647: MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.
648: This type inherits from `MATSEQAIJ`, but calculates some additional permutation
649: information that is used to allow better vectorization of some
650: operations. At the cost of increased storage, the `MATSEQAIJ` formatted
651: matrix can be copied to a format in which pieces of the matrix are
652: stored in ELLPACK format, allowing the vectorized matrix multiply
653: routine to use stride-1 memory accesses.
655: Collective
657: Input Parameters:
658: + comm - MPI communicator, set to `PETSC_COMM_SELF`
659: . m - number of rows
660: . n - number of columns
661: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is given
662: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
664: Output Parameter:
665: . A - the matrix
667: Level: intermediate
669: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
670: @*/
671: PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
672: {
673: PetscFunctionBegin;
674: PetscCall(MatCreate(comm, A));
675: PetscCall(MatSetSizes(*A, m, n, m, n));
676: PetscCall(MatSetType(*A, MATSEQAIJPERM));
677: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz));
678: PetscFunctionReturn(PETSC_SUCCESS);
679: }
681: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
682: {
683: PetscFunctionBegin;
684: PetscCall(MatSetType(A, MATSEQAIJ));
685: PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A));
686: PetscFunctionReturn(PETSC_SUCCESS);
687: }