Actual source code: aijcusparseband.cu
/*
  AIJCUSPARSE methods implemented with CUDA kernels. Uses cuSparse/Thrust maps from AIJCUSPARSE
*/
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 600 && PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
#define AIJBANDUSEGROUPS 1
#endif
#if defined(AIJBANDUSEGROUPS)
#include <cooperative_groups.h>
#endif

/*
  LU band factorization, with an optimization for block-diagonal matrices (Nf blocks) in natural order
  (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf > 1 fields).

  Requires a structurally symmetric matrix; this restriction could be lifted with transpose/column metadata.
*/
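/*
  Example (a sketch, not from this file): the band factorization is typically selected at runtime with
  options such as

    -pc_type lu -pc_factor_mat_solver_type cusparseband -mat_no_inode -pc_factor_mat_ordering_type rcm

  on a MATSEQAIJCUSPARSE matrix; the exact option set depends on the application.
*/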
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat, Mat, const MatFactorInfo *);

/*
  The GPU LU factor kernel
*/
__global__ void __launch_bounds__(1024, 1) mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
{
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n / Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt nloc_i = (nloc / Nblk + !!(nloc % Nblk)), start_i = field * nloc + blkIdx * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);

  // set i (row+1)
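  // bi_csr[rowb+1] counts the stored entries in rows 0..rowb of a band with half-width bw:
  // min(r,bw) L entries per row (n1L for the first bw rows plus n2L for the rest), rowb+1 diagonal
  // entries (i), and bw U entries per row (nug) minus those that would run past column n-1 in the
  // lower-right corner (clip).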
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i && threadIdx.x == 0) {
      PetscInt i = rowb + 1, ni = (rowb > bw) ? bw + 1 : i, n1L = ni * (ni - 1) / 2, nug = i * bw, n2L = bw * ((rowb > bw) ? (rowb - bw) : 0), mi = bw + rowb + 1 - n, clip = (mi > 0) ? mi * (mi - 1) / 2 + mi : 0;
      bi_csr[rowb + 1] = n1L + nug - clip + n2L + i;
    }
  }
}
// copy AIJ to AIJ_BAND
__global__ void __launch_bounds__(1024, 1) mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[], const int ai_d[], const int aj_d[], const PetscScalar aa_d[], const int bi_csr[], PetscScalar ba_csr[])
{
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n / Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt nloc_i = (nloc / Nblk + !!(nloc % Nblk)), start_i = field * nloc + blkIdx * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);

  // zero B
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      PetscScalar *batmp = ba_csr + bi_csr[rowb];
      const PetscInt nzb = bi_csr[rowb + 1] - bi_csr[rowb];
      for (int j = threadIdx.x; j < nzb; j += blockDim.x) {
        if (j < nzb) batmp[j] = 0;
      }
    }
  }

  // copy A into B with CSR format -- these two loops can be fused
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      const PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
      const int *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb > bw) ? rowb - bw : 0;
      const PetscScalar *av = aa_d + ai_d[rowa];
      PetscScalar *batmp = ba_csr + bi_csr[rowb];
      /* load in initial (unfactored row) */
      for (int j = threadIdx.x; j < nza; j += blockDim.x) {
        if (j < nza) {
          PetscInt colb = ic[ajtmp[j]], idx = colb - bjStart;
          PetscScalar vala = av[j];
          batmp[idx] = vala;
        }
      }
    }
  }
}
// print AIJ_BAND
__global__ void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
{
  // debug
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) {
    printf("B (AIJ) n=%d:\n", (int)n);
    for (int rowb = 0; rowb < n; rowb++) {
      const PetscInt nz = bi_csr[rowb + 1] - bi_csr[rowb];
      const PetscScalar *batmp = ba_csr + bi_csr[rowb];
      for (int j = 0; j < nz; j++) printf("(%13.6e) ", PetscRealPart(batmp[j]));
      printf(" bi=%d\n", bi_csr[rowb + 1]);
    }
  }
}
// Band LU kernel --- ba_csr bi_csr
__global__ void __launch_bounds__(1024, 1) mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[], int *use_group_sync)
{
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n / Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt start = field * nloc, end = start + nloc;
#if defined(AIJBANDUSEGROUPS)
  auto g = cooperative_groups::this_grid();
#endif
  // A22 panel update for each row A(1,:) and col A(:,1)
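  // Right-looking LU over the band: for each pivot glbDD, the threads with threadIdx.x == 0 scale the
  // sub-diagonal column L(glbDD+1:glbDD+bw, glbDD) by 1/Bdd, then all threads apply the rank-1 update
  // A(i,j) -= L(i,glbDD) * U(glbDD,j) to the trailing block inside the band. Rows are strided over
  // blocks (blkIdx) and threadIdx.y, so a grid-wide sync (cooperative groups) is needed between pivots
  // whenever more than one block works on a field.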
  for (int glbDD = start, locDD = 0; glbDD < end; glbDD++, locDD++) {
    PetscInt tnzUd = bw, maxU = end - 1 - glbDD; // we are chopping off the inter ears
    const PetscInt nzUd = (tnzUd > maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
    PetscScalar *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
    const PetscScalar *baUd = pBdd + 1; // vector of data U(i,i+1:end)
    const PetscScalar Bdd = *pBdd;
    const PetscInt offset = blkIdx * blockDim.y + threadIdx.y, inc = Nblk * blockDim.y;
    if (threadIdx.x == 0) {
      for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) { /* assuming symmetric structure */
        const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi - glbDD); // cuts off just the first (global) block
        PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
        *Aid = *Aid / Bdd;
      }
    }
    __syncthreads(); // sync on threadIdx.x only
    for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) {
      const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi - glbDD); // cuts off just the first (global) block
      PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
      PetscScalar *Aij = Aid + 1;
      const PetscScalar Lid = *Aid;
      for (int jIdx = threadIdx.x; jIdx < nzUd; jIdx += blockDim.x) Aij[jIdx] -= Lid * baUd[jIdx];
    }
#if defined(AIJBANDUSEGROUPS)
    if (use_group_sync) {
      g.sync();
    } else {
      __syncthreads();
    }
#else
    __syncthreads();
#endif
  } /* end of for (glbDD = start; glbDD < end; glbDD++) */
}
static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat, Vec, Vec);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructA;
  CsrMatrix *matrixA;
  const PetscInt n = A->rmap->n, *ic, *r;
  const int *ai_d, *aj_d;
  const PetscScalar *aa_d;
  PetscScalar *ba_t = cusparseTriFactors->a_band_d;
  int *bi_t = cusparseTriFactors->i_band_d;
  int Ni = 10, team_size = 9, Nf = 1, nVec = 56, nconcurrent = 1; // Nf is batch size - not used
#if defined(AIJBANDUSEGROUPS) || defined(PETSC_USE_INFO)
  int nsm = -1;
#endif

  PetscFunctionBegin;
  if (A->rmap->n == 0) PetscFunctionReturn(PETSC_SUCCESS);
  // cusparse setup
  PetscCheck(cusparsestructA, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparsestructA");
  matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat; // matstruct->cprowIndices
  PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing mat struct");
  matrixA = (CsrMatrix *)matstructA->mat;
  PetscCheck(matrixA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing matrix cusparsestructA->mat->mat");

  // get data
  ic = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
  ai_d = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  aj_d = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa_d = thrust::raw_pointer_cast(matrixA->values->data().get());
  r = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeBegin());
  {
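    // Recover the half-bandwidth from the nonzero count set in the symbolic phase,
    // nz = n + (2n-1)*bw - bw*bw, by solving the quadratic for bw:
    //   bw = ((2n-1) - sqrt((2n-1)^2 - 4(nz-n)))/2 = (2n - 1 - sqrt(1 + 4(n^2 - nz)))/2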
    int bw = (int)(2. * (double)n - 1. - (double)(PetscSqrtReal(1. + 4. * ((double)n * (double)n - (double)b->nz)) + PETSC_MACHINE_EPSILON)) / 2;
#if defined(PETSC_USE_LOG)
    int bm1 = bw - 1, nl = n / Nf;
#endif
#if !defined(AIJBANDUSEGROUPS)
    Ni = 1 / nconcurrent;
    Ni = 1;
#else
    if (!cusparseTriFactors->init_dev_prop) {
      int gpuid;
      cusparseTriFactors->init_dev_prop = PETSC_TRUE;
      cudaGetDevice(&gpuid);
      cudaGetDeviceProperties(&cusparseTriFactors->dev_prop, gpuid);
    }
    nsm = cusparseTriFactors->dev_prop.multiProcessorCount;
    Ni = nsm / Nf / nconcurrent;
#endif
    team_size = bw / Ni + !!(bw % Ni);
    nVec = PetscMin(bw, 1024 / team_size);
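    // Launch geometry: blockDim = (nVec, team_size), so threadIdx.y strides over band rows and
    // threadIdx.x vectorizes across the columns of a row, capped at 1024 threads per block;
    // gridDim = (Nf, Ni) gives one x-slot per field and Ni blocks (SMs) per field.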
    PetscCall(PetscInfo(A, "Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d, thread group size = %d,%d\n", bw, Ni, nconcurrent, Nf, nsm, team_size, nVec));
    {
      dim3 dimBlockTeam(nVec, team_size);
      dim3 dimBlockLeague(Nf, Ni);
      mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague, dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
      PetscCUDACheckLaunch; // does a sync
#if defined(AIJBANDUSEGROUPS)
      if (Ni > 1) {
        void *kernelArgs[] = {(void *)&n, (void *)&bw, (void *)&bi_t, (void *)&ba_t, (void *)&nsm};
        cudaLaunchCooperativeKernel((void *)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, 0, NULL);
      } else {
        mat_lu_factor_band<<<dimBlockLeague, dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
      }
#else
      mat_lu_factor_band<<<dimBlockLeague, dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
#endif
      PetscCUDACheckLaunch; // does a sync
#if defined(PETSC_USE_LOG)
      PetscCall(PetscLogGpuFlops((PetscLogDouble)Nf * (bm1 * (bm1 + 1) * (PetscLogDouble)(2 * bm1 + 1) / 3 + (PetscLogDouble)2 * (nl - bw) * bw * bw + (PetscLogDouble)nl * (nl + 1) / 2)));
#endif
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  /* determine which version of MatSolve needs to be used, from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
  B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
  B->ops->solvetranspose = NULL; // need transpose
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b;
  IS isicol;
  const PetscInt *ic, *ai = a->i, *aj = a->j;
  PetscScalar *ba_t;
  int *bi_t;
  PetscInt i, n = A->rmap->n, Nf = 1; // Nf is batch size - not used
  PetscInt nzBcsr, bwL, bwU;
  PetscBool missing;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCheck(A->rmap->N == A->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "matrix must be square");
  PetscCall(MatMissingDiagonal(A, &missing, &i));
  PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "!cusparseTriFactors");
  PetscCall(MatIsStructurallySymmetric(A, &missing));
  PetscCheck(missing, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "only structurally symmetric matrices supported");

  PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
  PetscCall(ISGetIndices(isicol, &ic));

  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
  b = (Mat_SeqAIJ *)(B)->data;

  /* get band widths; MatComputeBandwidth should take a reordering ic and do this */
  bwL = bwU = 0;
  for (int rwb = 0; rwb < n; rwb++) {
    const PetscInt rwa = ic[rwb], anz = ai[rwb + 1] - ai[rwb], *ajtmp = aj + ai[rwb];
    for (int j = 0; j < anz; j++) {
      PetscInt colb = ic[ajtmp[j]];
      if (colb < rwa) { // L
        if (rwa - colb > bwL) bwL = rwa - colb;
      } else {
        if (colb - rwa > bwU) bwU = colb - rwa;
      }
    }
  }
  PetscCall(ISRestoreIndices(isicol, &ic));
  /* only structurally symmetric matrices are supported (for now), though an unsymmetric structure might work */
  PetscCheck(bwL == bwU, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Only symmetric structure supported (now) W_L=%" PetscInt_FMT " W_U=%" PetscInt_FMT, bwL, bwU);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
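  // Stored size of the band factor: n*(2*bwU + 1) entries for a full band of half-width bwU,
  // minus the bwU*(bwU + 1) entries clipped off at the upper-left and lower-right corners,
  // which simplifies to the expression below.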
  nzBcsr = n + (2 * n - 1) * bwU - bwU * bwU;
  b->maxnz = b->nz = nzBcsr;
  cusparseTriFactors->nnz = b->nz; // only metadata needed: n & nz
  PetscCall(PetscInfo(A, "Matrix Bandwidth = %" PetscInt_FMT ", nnz = %" PetscInt_FMT "\n", bwL, b->nz));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  PetscCallCUDA(cudaMalloc(&ba_t, (b->nz + 1) * sizeof(PetscScalar))); // include a place for flops
  PetscCallCUDA(cudaMalloc(&bi_t, (n + 1) * sizeof(int)));
  cusparseTriFactors->a_band_d = ba_t;
  cusparseTriFactors->i_band_d = bi_t;
  /* In b structure: Free imax, ilen, old a, old j. Allocate solve_work, new a, new j */
  {
    dim3 dimBlockTeam(1, 128);
    dim3 dimBlockLeague(Nf, 1);
    mat_lu_factor_band_init_set_i<<<dimBlockLeague, dimBlockTeam>>>(n, bwU, bi_t);
  }
  PetscCUDACheckLaunch; // does a sync

  // setup data
  if (!cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  /* upper triangular indices */
  if (!cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(isicol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* put together the new matrix */
  b->free_a = PETSC_FALSE;
  b->free_ij = PETSC_FALSE;
  b->singlemalloc = PETSC_FALSE;
  b->ilen = NULL;
  b->imax = NULL;
  b->row = isrow;
  b->col = iscol;
  PetscCall(PetscObjectReference((PetscObject)isrow));
  PetscCall(PetscObjectReference((PetscObject)iscol));
  b->icol = isicol;
  PetscCall(PetscMalloc1(n + 1, &b->solve_work));

  B->factortype = MAT_FACTOR_LU;
  B->info.factor_mallocs = 0;
  B->info.fill_ratio_given = 0;

  if (ai[n]) {
    B->info.fill_ratio_needed = ((PetscReal)(nzBcsr)) / ((PetscReal)ai[n]);
  } else {
    B->info.fill_ratio_needed = 0.0;
  }
#if defined(PETSC_USE_INFO)
  if (ai[n] != 0) {
    PetscReal af = B->info.fill_ratio_needed;
    PetscCall(PetscInfo(A, "Band fill ratio %g\n", (double)af));
  } else {
    PetscCall(PetscInfo(A, "Empty matrix\n"));
  }
#endif
  if (a->inode.size) PetscCall(PetscInfo(A, "Warning: using inodes in band solver.\n"));
  PetscCall(MatSeqAIJCheckInode_FactorLU(B));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
  B->offloadmask = PETSC_OFFLOAD_GPU;

  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Use -pc_factor_mat_solver_type cusparseband */
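/*
  Sketch of programmatic use (hypothetical snippet, not part of this file): with a MATSEQAIJCUSPARSE
  matrix one would select this factorization through the PC, e.g.

    PetscCall(PCSetType(pc, PCLU));
    PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSEBAND));

  which, via the solver-type registry, ends up in MatGetFactor_seqaijcusparse_cusparse_band() below.
*/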
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSEBAND;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (ftype == MAT_FACTOR_LU) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    (*B)->ops->ilufactorsymbolic = NULL; // MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
    PetscCall(PetscStrallocpy(MATORDERINGRCM, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSEBAND Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse_band));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#define WARP_SIZE 32
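// warp-level sum reduction with shuffle-down; the full sum ends up in lane 0 of each warp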
template <typename T>
__forceinline__ __device__ T wreduce(T a)
{
  T b;
#pragma unroll
  for (int i = WARP_SIZE / 2; i >= 1; i = i >> 1) {
    b = __shfl_down_sync(0xffffffff, a, i);
    a += b;
  }
  return a;
}
// reduce in a block, returns result in thread 0
template <typename T, int BLOCK_SIZE>
__device__ T breduce(T a)
{
  constexpr int NWARP = BLOCK_SIZE / WARP_SIZE;
  __shared__ double buf[NWARP];
  int wid = threadIdx.x / WARP_SIZE;
  int laneid = threadIdx.x % WARP_SIZE;
  T b = wreduce<T>(a);
  if (laneid == 0) buf[wid] = b;
  __syncthreads();
  if (wid == 0) {
    if (threadIdx.x < NWARP) a = buf[threadIdx.x];
    else a = 0;
    for (int i = (NWARP + 1) / 2; i >= 1; i = i >> 1) a += __shfl_down_sync(0xffffffff, a, i);
  }
  return a;
}

// Band LU solve (forward/backward substitution) kernel --- ba_csr bi_csr
template <int BLOCK_SIZE>
__global__ void __launch_bounds__(256, 1) mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
{
  const PetscInt Nf = gridDim.x, nloc = n / Nf, field = blockIdx.x, start = field * nloc, end = start + nloc, chopnz = bw * (bw + 1) / 2, blocknz = (2 * bw + 1) * nloc, blocknz_0 = blocknz - chopnz;
  const PetscScalar *pLi;
  const int tid = threadIdx.x;

  /* Next, solve L */
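  // Forward substitution, one field per block: for each row glbDD the block computes the dot product
  // of the (unit-diagonal) L row with the already-solved entries of x, reduces it across the block
  // with breduce, and thread 0 subtracts it from x[glbDD]; pLi is then advanced to the start of the
  // next L row inside the packed band storage.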
  pLi = ba_csr + (field == 0 ? 0 : blocknz_0 + (field - 1) * blocknz + bw); // diagonal (0,0) in field
  for (int glbDD = start, locDD = 0; glbDD < end; glbDD++, locDD++) {
    const PetscInt col = locDD < bw ? start : (glbDD - bw);
    PetscScalar t = 0;
    for (int j = col + tid, idx = tid; j < glbDD; j += blockDim.x, idx += blockDim.x) t += pLi[idx] * x[j];
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal, BLOCK_SIZE>(tr), breduce<PetscReal, BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal, BLOCK_SIZE>(t);
#endif
    if (threadIdx.x == 0) x[glbDD] -= t; // /1.0
    __syncthreads();
    // inc
    pLi += glbDD - col; // get to diagonal
    if (glbDD > n - 1 - bw) pLi += n - 1 - glbDD; // skip over U, only last block has funny offset
    else pLi += bw;
    pLi += 1; // skip to next row
    if (field > 0 && (locDD + 1) < bw) pLi += bw - (locDD + 1); // skip padding at beginning (ear)
  }
  /* Then, solve U */
  pLi = ba_csr + Nf * blocknz - 2 * chopnz - 1; // end of real data on block (diagonal)
  if (field != Nf - 1) pLi -= blocknz_0 + (Nf - 2 - field) * blocknz + bw; // diagonal of last local row

  for (int glbDD = end - 1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
    const PetscInt col = (locDD < bw) ? end - 1 : glbDD + bw; // end of row in U
    PetscScalar t = 0;
    for (int j = col - tid, idx = tid; j > glbDD; j -= blockDim.x, idx += blockDim.x) t += pLi[-idx] * x[j];
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal, BLOCK_SIZE>(tr), breduce<PetscReal, BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal, BLOCK_SIZE>(PetscRealPart(t));
#endif
    pLi -= col - glbDD; // diagonal
    if (threadIdx.x == 0) {
      x[glbDD] -= t;
      x[glbDD] /= pLi[0];
    }
    __syncthreads();
    // inc past L to start of previous U
    pLi -= bw + 1;
    if (glbDD < bw) pLi += bw - glbDD; // overshot in top left corner
    if (((locDD + 1) < bw) && field != Nf - 1) pLi -= (bw - (locDD + 1)); // skip past right corner
  }
}
static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A, Vec bb, Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
  PetscInt n = A->rmap->n, nz = cusparseTriFactors->nnz, Nf = 1; // Nf is batch size - not used
  PetscInt bw = (int)(2. * (double)n - 1. - (double)(PetscSqrtReal(1. + 4. * ((double)n * (double)n - (double)nz)) + PETSC_MACHINE_EPSILON)) / 2; // quadratic formula for bandwidth

  PetscFunctionBegin;
  if (A->rmap->n == 0) PetscFunctionReturn(PETSC_SUCCESS);

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
  constexpr int block = 128;
  mat_solve_band<block><<<Nf, block>>>(n, bw, cusparseTriFactors->a_band_d, tempGPU->data().get());
  PetscCUDACheckLaunch; // does a sync

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscCall(PetscLogGpuTimeEnd());

  PetscFunctionReturn(PETSC_SUCCESS);
}