Actual source code: petscaijdevice.h

  1: #ifndef PETSCAIJDEVICE_H
  2: #define PETSCAIJDEVICE_H

  4: #include <petsc/private/matimpl.h>

  6: /* SUBMANSEC = Mat */

  8: typedef struct _n_SplitCSRMat *PetscSplitCSRDataStructure;

 10: /* 64-bit floating-point version of atomicAdd() is only natively supported by
 11:    CUDA devices of compute capability 6.x and higher. See also sfcuda.cu
 12: */
 13: #if defined(PETSC_USE_REAL_DOUBLE) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
 14: __device__ double atomicAdd(double *x, double y)
 15: {
 16:   typedef unsigned long long int ullint;
 17:   double                        *address = x, val = y;
 18:   ullint                        *address_as_ull = (ullint *)address;
 19:   ullint                         old            = *address_as_ull, assumed;
 20:   do {
 21:     assumed = old;
 22:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
 23:   } while (assumed != old);
 24:   return __longlong_as_double(old);
 25: }
 26: #endif

 28: #if defined(KOKKOS_INLINE_FUNCTION)
 29:   #define PetscAtomicAdd(a, b) Kokkos::atomic_fetch_add(a, b)
 30: #elif defined(__CUDA_ARCH__)
 31:   #if defined(PETSC_USE_COMPLEX)
 32:     #define PetscAtomicAdd(a, b) \
 33:       { \
 34:         PetscReal *_a = (PetscReal *)(a); \
 35:         PetscReal *_b = (PetscReal *)&(b); \
 36:         atomicAdd(&_a[0], _b[0]); \
 37:         atomicAdd(&_a[1], _b[1]); \
 38:       }
 39:   #else
 40:     #define PetscAtomicAdd(a, b) atomicAdd(a, b)
 41:   #endif
 42: #else
 43:   /* TODO: support devices other than CUDA and Kokkos */
 44:   #define PetscAtomicAdd(a, b) *(a) += b
 45: #endif

 47: #define MatSetValues_SeqAIJ_A_Private(row, col, value, addv) \
 48:   { \
 49:     inserted = 0; \
 50:     if (col <= lastcol1) low1 = 0; \
 51:     else high1 = nrow1; \
 52:     lastcol1 = col; \
 53:     while (high1 - low1 > 5) { \
 54:       t = (low1 + high1) / 2; \
 55:       if (rp1[t] > col) high1 = t; \
 56:       else low1 = t; \
 57:     } \
 58:     for (_i = low1; _i < high1; _i++) { \
 59:       if (rp1[_i] > col) break; \
 60:       if (rp1[_i] == col) { \
 61:         if (addv == ADD_VALUES) { \
 62:           PetscAtomicAdd(&ap1[_i], value); \
 63:         } else ap1[_i] = value; \
 64:         inserted = 1; \
 65:         break; \
 66:       } \
 67:     } \
 68:   }

 70: #define MatSetValues_SeqAIJ_B_Private(row, col, value, addv) \
 71:   { \
 72:     inserted = 0; \
 73:     if (col <= lastcol2) low2 = 0; \
 74:     else high2 = nrow2; \
 75:     lastcol2 = col; \
 76:     while (high2 - low2 > 5) { \
 77:       t = (low2 + high2) / 2; \
 78:       if (rp2[t] > col) high2 = t; \
 79:       else low2 = t; \
 80:     } \
 81:     for (_i = low2; _i < high2; _i++) { \
 82:       if (rp2[_i] > col) break; \
 83:       if (rp2[_i] == col) { \
 84:         if (addv == ADD_VALUES) { \
 85:           PetscAtomicAdd(&ap2[_i], value); \
 86:         } else ap2[_i] = value; \
 87:         inserted = 1; \
 88:         break; \
 89:       } \
 90:     } \
 91:   }

 93: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
 94:   #define SETERR return (printf("[%d] ERROR in %s() at %s:%d: Location (%ld,%ld) not found! v=%g\n", d_mat->rank, __func__, __FILE__, __LINE__, (long int)im[i], (long int)in[j], PetscRealPart(value)), PETSC_ERR_ARG_OUTOFRANGE)
 95: #else
 96:   #define SETERR return PETSC_ERR_ARG_OUTOFRANGE
 97: #endif

 99: #if defined(__CUDA_ARCH__) || defined(PETSC_HAVE_HIP)
100: __device__
101: #elif defined(KOKKOS_INLINE_FUNCTION)
102: KOKKOS_INLINE_FUNCTION
103: #else
104: static
105: #endif

107:   /*@C
108:        MatSetValuesDevice - sets a set of values into a matrix, this may be called by CUDA or KOKKOS kernels

110:     Input Parameters:
111: +   d_mat - an object obtained with `MatCUSPARSEGetDeviceMatWrite()` or `MatKokkosGetDeviceMatWrite()`
112: .   m - the number of rows to insert or add to
113: .   im - the rows to insert or add to
114: .   n - number of columns to insert or add to
115: .   in - the columns to insert or add to
116: .   v - the values to insert or add to the matrix (treated as a  by n row oriented dense array
117: -   is - either `INSERT_VALUES` or `ADD_VALUES`

119:    Level: advanced

121:     Notes:
122:     Any row or column indices that are outside the bounds of the matrix on the rank are discarded

124:     It is recommended that `MatSetValuesCOO()` be used instead of this routine for efficiency

126: .seealso: `MatSetValues()`, `MatCreate()`, `MatCreateDenseCUDA()`, `MatCreateAIJCUSPARSE()`, `MatKokkosGetDeviceMatWrite()`,
127:           `MatCUSPARSEGetDeviceMatWrite()`, `MatSetValuesCOO()`
128: @*/
129:   int
130:   MatSetValuesDevice(PetscSplitCSRDataStructure d_mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
131: {
132:   PetscScalar     value;
133:   const PetscInt *rp1, *rp2 = NULL, *ai = d_mat->diag.i, *aj = d_mat->diag.j;
134:   const PetscInt *bi = d_mat->offdiag.i, *bj = d_mat->offdiag.j;
135:   PetscScalar    *ba = d_mat->offdiag.a, *aa = d_mat->diag.a;
136:   PetscInt        nrow1, nrow2 = 0, _i, low1, high1, low2 = 0, high2 = 0, t, lastcol1, lastcol2 = 0, inserted;
137:   PetscScalar    *ap1, *ap2 = NULL;
138:   PetscBool       roworiented = PETSC_TRUE;
139:   PetscInt        i, j, row, col;
140:   const PetscInt  rstart = d_mat->rstart, rend = d_mat->rend, cstart = d_mat->cstart, cend = d_mat->cend, M = d_mat->M;

142:   for (i = 0; i < m; i++) {
143:     if (im[i] >= rstart && im[i] < rend) { // silently ignore off processor rows
144:       row      = (int)(im[i] - rstart);
145:       lastcol1 = -1;
146:       rp1      = aj + ai[row];
147:       ap1      = aa + ai[row];
148:       nrow1    = ai[row + 1] - ai[row];
149:       low1     = 0;
150:       high1    = nrow1;
151:       if (bj) {
152:         lastcol2 = -1;
153:         rp2      = bj + bi[row];
154:         ap2      = ba + bi[row];
155:         nrow2    = bi[row + 1] - bi[row];
156:         low2     = 0;
157:         high2    = nrow2;
158:       } else {
159:         high2 = low2 = 0;
160:       }
161:       for (j = 0; j < n; j++) {
162:         value = roworiented ? v[i * n + j] : v[i + j * m];
163:         if (in[j] >= cstart && in[j] < cend) {
164:           col = (in[j] - cstart);
165:           MatSetValues_SeqAIJ_A_Private(row, col, value, is);
166:           if (!inserted) SETERR;
167:         } else if (in[j] < 0) { // silently ignore off processor rows
168:           continue;
169:         } else if (in[j] >= M) SETERR;
170:         else {
171:           col = d_mat->colmap[in[j]] - 1;
172:           if (col < 0) SETERR;
173:           MatSetValues_SeqAIJ_B_Private(row, col, value, is);
174:           if (!inserted) SETERR;
175:         }
176:       }
177:     }
178:   }
179:   return PETSC_SUCCESS;
180: }

182: #undef MatSetValues_SeqAIJ_A_Private
183: #undef MatSetValues_SeqAIJ_B_Private
184: #undef SETERR
185: #undef PetscAtomicAdd

187: #endif