Actual source code: petscaijdevice.h
1: #ifndef PETSCAIJDEVICE_H
2: #define PETSCAIJDEVICE_H
4: #include <petsc/private/matimpl.h>
6: /* SUBMANSEC = Mat */
8: typedef struct _n_SplitCSRMat *PetscSplitCSRDataStructure;
10: /* 64-bit floating-point version of atomicAdd() is only natively supported by
11: CUDA devices of compute capability 6.x and higher. See also sfcuda.cu
12: */
13: #if defined(PETSC_USE_REAL_DOUBLE) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
14: __device__ double atomicAdd(double *x, double y)
15: {
16: typedef unsigned long long int ullint;
17: double *address = x, val = y;
18: ullint *address_as_ull = (ullint *)address;
19: ullint old = *address_as_ull, assumed;
20: do {
21: assumed = old;
22: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
23: } while (assumed != old);
24: return __longlong_as_double(old);
25: }
26: #endif
28: #if defined(KOKKOS_INLINE_FUNCTION)
29: #define PetscAtomicAdd(a, b) Kokkos::atomic_fetch_add(a, b)
30: #elif defined(__CUDA_ARCH__)
31: #if defined(PETSC_USE_COMPLEX)
32: #define PetscAtomicAdd(a, b) \
33: { \
34: PetscReal *_a = (PetscReal *)(a); \
35: PetscReal *_b = (PetscReal *)&(b); \
36: atomicAdd(&_a[0], _b[0]); \
37: atomicAdd(&_a[1], _b[1]); \
38: }
39: #else
40: #define PetscAtomicAdd(a, b) atomicAdd(a, b)
41: #endif
42: #else
43: /* TODO: support devices other than CUDA and Kokkos */
44: #define PetscAtomicAdd(a, b) *(a) += b
45: #endif
47: #define MatSetValues_SeqAIJ_A_Private(row, col, value, addv) \
48: { \
49: inserted = 0; \
50: if (col <= lastcol1) low1 = 0; \
51: else high1 = nrow1; \
52: lastcol1 = col; \
53: while (high1 - low1 > 5) { \
54: t = (low1 + high1) / 2; \
55: if (rp1[t] > col) high1 = t; \
56: else low1 = t; \
57: } \
58: for (_i = low1; _i < high1; _i++) { \
59: if (rp1[_i] > col) break; \
60: if (rp1[_i] == col) { \
61: if (addv == ADD_VALUES) { \
62: PetscAtomicAdd(&ap1[_i], value); \
63: } else ap1[_i] = value; \
64: inserted = 1; \
65: break; \
66: } \
67: } \
68: }
70: #define MatSetValues_SeqAIJ_B_Private(row, col, value, addv) \
71: { \
72: inserted = 0; \
73: if (col <= lastcol2) low2 = 0; \
74: else high2 = nrow2; \
75: lastcol2 = col; \
76: while (high2 - low2 > 5) { \
77: t = (low2 + high2) / 2; \
78: if (rp2[t] > col) high2 = t; \
79: else low2 = t; \
80: } \
81: for (_i = low2; _i < high2; _i++) { \
82: if (rp2[_i] > col) break; \
83: if (rp2[_i] == col) { \
84: if (addv == ADD_VALUES) { \
85: PetscAtomicAdd(&ap2[_i], value); \
86: } else ap2[_i] = value; \
87: inserted = 1; \
88: break; \
89: } \
90: } \
91: }
93: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
94: #define SETERR return (printf("[%d] ERROR in %s() at %s:%d: Location (%ld,%ld) not found! v=%g\n", d_mat->rank, __func__, __FILE__, __LINE__, (long int)im[i], (long int)in[j], PetscRealPart(value)), PETSC_ERR_ARG_OUTOFRANGE)
95: #else
96: #define SETERR return PETSC_ERR_ARG_OUTOFRANGE
97: #endif
99: #if defined(__CUDA_ARCH__) || defined(PETSC_HAVE_HIP)
100: __device__
101: #elif defined(KOKKOS_INLINE_FUNCTION)
102: KOKKOS_INLINE_FUNCTION
103: #else
104: static
105: #endif
107: /*@C
108: MatSetValuesDevice - sets a set of values into a matrix, this may be called by CUDA or KOKKOS kernels
110: Input Parameters:
111: + d_mat - an object obtained with `MatCUSPARSEGetDeviceMatWrite()` or `MatKokkosGetDeviceMatWrite()`
112: . m - the number of rows to insert or add to
113: . im - the rows to insert or add to
114: . n - number of columns to insert or add to
115: . in - the columns to insert or add to
116: . v - the values to insert or add to the matrix (treated as a by n row oriented dense array
117: - is - either `INSERT_VALUES` or `ADD_VALUES`
119: Level: advanced
121: Notes:
122: Any row or column indices that are outside the bounds of the matrix on the rank are discarded
124: It is recommended that `MatSetValuesCOO()` be used instead of this routine for efficiency
126: .seealso: `MatSetValues()`, `MatCreate()`, `MatCreateDenseCUDA()`, `MatCreateAIJCUSPARSE()`, `MatKokkosGetDeviceMatWrite()`,
127: `MatCUSPARSEGetDeviceMatWrite()`, `MatSetValuesCOO()`
128: @*/
129: int
130: MatSetValuesDevice(PetscSplitCSRDataStructure d_mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
131: {
132: PetscScalar value;
133: const PetscInt *rp1, *rp2 = NULL, *ai = d_mat->diag.i, *aj = d_mat->diag.j;
134: const PetscInt *bi = d_mat->offdiag.i, *bj = d_mat->offdiag.j;
135: PetscScalar *ba = d_mat->offdiag.a, *aa = d_mat->diag.a;
136: PetscInt nrow1, nrow2 = 0, _i, low1, high1, low2 = 0, high2 = 0, t, lastcol1, lastcol2 = 0, inserted;
137: PetscScalar *ap1, *ap2 = NULL;
138: PetscBool roworiented = PETSC_TRUE;
139: PetscInt i, j, row, col;
140: const PetscInt rstart = d_mat->rstart, rend = d_mat->rend, cstart = d_mat->cstart, cend = d_mat->cend, M = d_mat->M;
142: for (i = 0; i < m; i++) {
143: if (im[i] >= rstart && im[i] < rend) { // silently ignore off processor rows
144: row = (int)(im[i] - rstart);
145: lastcol1 = -1;
146: rp1 = aj + ai[row];
147: ap1 = aa + ai[row];
148: nrow1 = ai[row + 1] - ai[row];
149: low1 = 0;
150: high1 = nrow1;
151: if (bj) {
152: lastcol2 = -1;
153: rp2 = bj + bi[row];
154: ap2 = ba + bi[row];
155: nrow2 = bi[row + 1] - bi[row];
156: low2 = 0;
157: high2 = nrow2;
158: } else {
159: high2 = low2 = 0;
160: }
161: for (j = 0; j < n; j++) {
162: value = roworiented ? v[i * n + j] : v[i + j * m];
163: if (in[j] >= cstart && in[j] < cend) {
164: col = (in[j] - cstart);
165: MatSetValues_SeqAIJ_A_Private(row, col, value, is);
166: if (!inserted) SETERR;
167: } else if (in[j] < 0) { // silently ignore off processor rows
168: continue;
169: } else if (in[j] >= M) SETERR;
170: else {
171: col = d_mat->colmap[in[j]] - 1;
172: if (col < 0) SETERR;
173: MatSetValues_SeqAIJ_B_Private(row, col, value, is);
174: if (!inserted) SETERR;
175: }
176: }
177: }
178: }
179: return PETSC_SUCCESS;
180: }
182: #undef MatSetValues_SeqAIJ_A_Private
183: #undef MatSetValues_SeqAIJ_B_Private
184: #undef SETERR
185: #undef PetscAtomicAdd
187: #endif