Actual source code: aijhipsparseband.hip.cpp
/*
  AIJHIPSPARSE methods implemented with HIP kernels. Uses hipSparse/Thrust maps from AIJHIPSPARSE
  Portions of this code are under:
  Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*/
#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
#define AIJBANDUSEGROUPS 1
#if defined(AIJBANDUSEGROUPS)
  #include <hip/hip_cooperative_groups.h>
#endif

/*
  LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)

  requires:
    structurally symmetric: fix with transpose/column meta data
*/
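/*
  Storage used by the kernels below (summarized here from the code, for reference): the factor is
  held as a dense band in ba_csr, where row r stores columns max(0,r-bw)..min(n-1,r+bw) contiguously
  and bi_csr[r] is the offset of row r, so bi_csr[n] is the total nonzero count (one extra scalar
  past the end is used as scratch). With Nf > 1 the matrix is treated as Nf independent diagonal
  blocks ("fields") of n/Nf rows each.
*/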

static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSEBAND(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSEBAND(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatSolve_SeqAIJHIPSPARSEBAND(Mat, Vec, Vec);

/*
  The GPU LU factor kernel
*/
__global__ void __launch_bounds__(1024, 1) mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
{
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n / Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt nloc_i = (nloc / Nblk + !!(nloc % Nblk)), start_i = field * nloc + blkIdx * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);

  // set i (row+1)
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i && threadIdx.x == 0) {
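      /* Closed-form prefix sum of banded row lengths (bi_csr[rowb+1] counts all entries in rows
         0..rowb): n1L counts lower entries in the growing top "ear" (row r <= bw has r of them),
         n2L counts bw lower entries for each full row past the ear, nug = i*bw over-counts bw upper
         entries per row, clip removes the shortfall where the band would run past column n-1, and
         the final +i counts the diagonals. */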
      PetscInt i = rowb + 1, ni = (rowb > bw) ? bw + 1 : i, n1L = ni * (ni - 1) / 2, nug = i * bw, n2L = bw * ((rowb > bw) ? (rowb - bw) : 0), mi = bw + rowb + 1 - n, clip = (mi > 0) ? mi * (mi - 1) / 2 + mi : 0;
      bi_csr[rowb + 1] = n1L + nug - clip + n2L + i;
    }
  }
}
// copy AIJ to AIJ_BAND
__global__ void __launch_bounds__(1024, 1) mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[], const int ai_d[], const int aj_d[], const PetscScalar aa_d[], const int bi_csr[], PetscScalar ba_csr[])
{
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n / Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt nloc_i = (nloc / Nblk + !!(nloc % Nblk)), start_i = field * nloc + blkIdx * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);

  // zero B
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      PetscScalar *batmp = ba_csr + bi_csr[rowb];
      const PetscInt nzb = bi_csr[rowb + 1] - bi_csr[rowb];
      for (int j = threadIdx.x; j < nzb; j += blockDim.x) {
        if (j < nzb) { batmp[j] = 0; }
      }
    }
  }

  // copy A into B with CSR format -- these two loops can be fused
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    if (rowb < end_i) {
      const PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
      const int *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb > bw) ? rowb - bw : 0;
      const PetscScalar *av = aa_d + ai_d[rowa];
      PetscScalar *batmp = ba_csr + bi_csr[rowb];
      /* load in initial (unfactored row) */
      for (int j = threadIdx.x; j < nza; j += blockDim.x) {
        if (j < nza) {
          PetscInt colb = ic[ajtmp[j]], idx = colb - bjStart;
          PetscScalar vala = av[j];
          batmp[idx] = vala;
        }
      }
    }
  }
}
// print AIJ_BAND
__global__ void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
{
  // debug
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) {
    printf("B (AIJ) n=%d:\n", (int)n);
    for (int rowb = 0; rowb < n; rowb++) {
      const PetscInt nz = bi_csr[rowb + 1] - bi_csr[rowb];
      const PetscScalar *batmp = ba_csr + bi_csr[rowb];
      for (int j = 0; j < nz; j++) printf("(%13.6e) ", PetscRealPart(batmp[j]));
      printf(" bi=%d\n", bi_csr[rowb + 1]);
    }
  }
}
// Band LU kernel --- ba_csr bi_csr
__global__ void __launch_bounds__(1024, 1) mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[], int *use_group_sync)
{
  const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n / Nf;
  const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt start = field * nloc, end = start + nloc;
#if defined(AIJBANDUSEGROUPS)
  auto g = cooperative_groups::this_grid();
#endif
  // A22 panel update for each row A(1,:) and col A(:,1)
  for (int glbDD = start, locDD = 0; glbDD < end; glbDD++, locDD++) {
    PetscInt tnzUd = bw, maxU = end - 1 - glbDD; // we are chopping off the inter ears
    const PetscInt nzUd = (tnzUd > maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
    PetscScalar *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
    const PetscScalar *baUd = pBdd + 1; // vector of data U(i,i+1:end)
    const PetscScalar Bdd = *pBdd;
    const PetscInt offset = blkIdx * blockDim.y + threadIdx.y, inc = Nblk * blockDim.y;
    if (threadIdx.x == 0) {
      for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) { /* assuming symmetric structure */
        const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi - glbDD); // cuts off just the first (global) block
        PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
        *Aid = *Aid / Bdd;
      }
    }
    __syncthreads(); // synch on threadIdx.x only
    for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) {
      const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi - glbDD); // cuts off just the first (global) block
      PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
      PetscScalar *Aij = Aid + 1;
      const PetscScalar Lid = *Aid;
      for (int jIdx = threadIdx.x; jIdx < nzUd; jIdx += blockDim.x) { Aij[jIdx] -= Lid * baUd[jIdx]; }
    }
#if defined(AIJBANDUSEGROUPS)
    if (use_group_sync) {
      g.sync();
    } else {
      __syncthreads();
    }
#else
    __syncthreads();
#endif
  } /* end of for (glbDD = start; glbDD < end; glbDD++) */
}

static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSEBAND(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
  PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");

  Mat_SeqAIJHIPSPARSE *hipsparsestructA = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstructA;
  CsrMatrix *matrixA;
  const PetscInt n = A->rmap->n, *ic, *r;
  const int *ai_d, *aj_d;
  const PetscScalar *aa_d;
  PetscScalar *ba_t = hipsparseTriFactors->a_band_d;
  int *bi_t = hipsparseTriFactors->i_band_d;
  int Ni = 10, team_size = 9, Nf = 1, nVec = 56, nconcurrent = 1, nsm = -1; // Nf is batch size - not used

  PetscFunctionBegin;
  if (A->rmap->n == 0) PetscFunctionReturn(PETSC_SUCCESS);
  // hipsparse setup
  PetscCheck(hipsparsestructA, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparsestructA");
  matstructA = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestructA->mat; // matstruct->cprowIndices
  PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing mat struct");
  matrixA = (CsrMatrix *)matstructA->mat;
  PetscCheck(matrixA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing matrix hipsparsestructA->mat->mat");

  // get data
  ic = thrust::raw_pointer_cast(hipsparseTriFactors->cpermIndices->data());
  ai_d = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  aj_d = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa_d = thrust::raw_pointer_cast(matrixA->values->data().get());
  r = thrust::raw_pointer_cast(hipsparseTriFactors->rpermIndices->data());

  PetscCallHIP(WaitForHIP());
  PetscCall(PetscLogGpuTimeBegin());
  {
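    /* Recover the half-bandwidth bw from the stored nonzero count: the symbolic phase sets
       nz = n + (2n-1)*bw - bw*bw, so bw is the smaller root of bw^2 - (2n-1)*bw + (nz-n) = 0,
       i.e. bw = (2n - 1 - sqrt(1 + 4(n^2 - nz)))/2. */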
    int bw = (int)(2. * (double)n - 1. - (double)(PetscSqrtReal(1. + 4. * ((double)n * (double)n - (double)b->nz)) + PETSC_MACHINE_EPSILON)) / 2, bm1 = bw - 1, nl = n / Nf;
#if !defined(AIJBANDUSEGROUPS)
    Ni = 1 / nconcurrent;
    Ni = 1;
#else
    if (!hipsparseTriFactors->init_dev_prop) {
      int gpuid;
      hipsparseTriFactors->init_dev_prop = PETSC_TRUE;
      PetscCallHIP(hipGetDevice(&gpuid));
      PetscCallHIP(hipGetDeviceProperties(&hipsparseTriFactors->dev_prop, gpuid));
    }
    nsm = hipsparseTriFactors->dev_prop.multiProcessorCount;
    Ni = nsm / Nf / nconcurrent;
#endif
    team_size = bw / Ni + !!(bw % Ni);
    nVec = PetscMin(bw, 1024 / team_size);
    PetscCall(PetscInfo(A, "Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d, thread group size = %d,%d\n", bw, Ni, nconcurrent, Nf, nsm, team_size, nVec));
    {
      dim3 dimBlockTeam(nVec, team_size);
      dim3 dimBlockLeague(Nf, Ni);
      hipLaunchKernelGGL(mat_lu_factor_band_copy_aij_aij, dim3(dimBlockLeague), dim3(dimBlockTeam), 0, 0, n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
      PetscHIPCheckLaunch; // does a sync
#if defined(AIJBANDUSEGROUPS)
      if (Ni > 1) {
        void *kernelArgs[] = {(void *)&n, (void *)&bw, (void *)&bi_t, (void *)&ba_t, (void *)&nsm};
        PetscCallHIP(hipLaunchCooperativeKernel((void *)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, 0, NULL));
      } else {
        hipLaunchKernelGGL(mat_lu_factor_band, dim3(dimBlockLeague), dim3(dimBlockTeam), 0, 0, n, bw, bi_t, ba_t, NULL);
      }
#else
      hipLaunchKernelGGL(mat_lu_factor_band, dim3(dimBlockLeague), dim3(dimBlockTeam), 0, 0, n, bw, bi_t, ba_t, NULL);
#endif
      PetscHIPCheckLaunch; // does a sync
#if defined(PETSC_USE_LOG)
      PetscCall(PetscLogGpuFlops((PetscLogDouble)Nf * (bm1 * (bm1 + 1) * (PetscLogDouble)(2 * bm1 + 1) / 3 + (PetscLogDouble)2 * (nl - bw) * bw * bw + (PetscLogDouble)nl * (nl + 1) / 2)));
#endif
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  /* determine which version of MatSolve needs to be used. from MatLUFactorNumeric_AIJ_SeqAIJHIPSPARSE */
  B->ops->solve = MatSolve_SeqAIJHIPSPARSEBAND;
  B->ops->solvetranspose = NULL; /* need transpose */
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSEBAND(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b;
  IS isicol;
  const PetscInt *ic, *ai = a->i, *aj = a->j;
  PetscScalar *ba_t;
  int *bi_t;
  PetscInt i, n = A->rmap->n, Nf = 1; /* Nf batch size - not used */
  PetscInt nzBcsr, bwL, bwU;
  PetscBool missing;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCheck(A->rmap->N == A->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "matrix must be square");
  PetscCall(MatMissingDiagonal(A, &missing, &i));
  PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "!hipsparseTriFactors");
  PetscCall(MatGetOption(A, MAT_STRUCTURALLY_SYMMETRIC, &missing));
  PetscCheck(missing, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "only structurally symmetric matrices supported");
  PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
  PetscCall(ISGetIndices(isicol, &ic));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
  b = (Mat_SeqAIJ *)(B)->data;

  /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
  bwL = bwU = 0;
  for (int rwb = 0; rwb < n; rwb++) {
    const PetscInt rwa = ic[rwb], anz = ai[rwb + 1] - ai[rwb], *ajtmp = aj + ai[rwb];
    for (int j = 0; j < anz; j++) {
      PetscInt colb = ic[ajtmp[j]];
      if (colb < rwa) { // L
        if (rwa - colb > bwL) bwL = rwa - colb;
      } else {
        if (colb - rwa > bwU) bwU = colb - rwa;
      }
    }
  }
  PetscCall(ISRestoreIndices(isicol, &ic));
  /* only support structurally symmetric, but it might work */
  PetscCheck(bwL == bwU, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Only symmetric structure supported (now) W_L=%" PetscInt_FMT " W_U=%" PetscInt_FMT, bwL, bwU);
  PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
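  /* Band nonzero count: n rows of 2*bwU+1 entries each, minus the two triangular corners ("ears")
     of bwU*(bwU+1)/2 entries apiece that fall outside the matrix, which simplifies to the
     expression below. */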
  nzBcsr = n + (2 * n - 1) * bwU - bwU * bwU;
  b->maxnz = b->nz = nzBcsr;
  hipsparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
  PetscCall(PetscInfo(A, "Matrix Bandwidth = %" PetscInt_FMT ", nnz = %" PetscInt_FMT "\n", bwL, b->nz));
  if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
  PetscCallHIP(hipMalloc(&ba_t, (b->nz + 1) * sizeof(PetscScalar))); // include a place for flops
  PetscCallHIP(hipMalloc(&bi_t, (n + 1) * sizeof(int)));
  hipsparseTriFactors->a_band_d = ba_t;
  hipsparseTriFactors->i_band_d = bi_t;
  /* In b structure: Free imax, ilen, old a, old j. Allocate solve_work, new a, new j */
  {
    dim3 dimBlockTeam(1, 128);
    dim3 dimBlockLeague(Nf, 1);
    hipLaunchKernelGGL(mat_lu_factor_band_init_set_i, dim3(dimBlockLeague), dim3(dimBlockTeam), 0, 0, n, bwU, bi_t);
  }
  PetscHIPCheckLaunch; // does a sync

  // setup data
  if (!hipsparseTriFactors->rpermIndices) {
    const PetscInt *r;
    PetscCall(ISGetIndices(isrow, &r));
    hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    hipsparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  /* upper triangular indices */
  if (!hipsparseTriFactors->cpermIndices) {
    const PetscInt *c;
    PetscCall(ISGetIndices(isicol, &c));
    hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    hipsparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* put together the new matrix */
  b->free_a = PETSC_FALSE;
  b->free_ij = PETSC_FALSE;
  b->singlemalloc = PETSC_FALSE;
  b->ilen = NULL;
  b->imax = NULL;
  b->row = isrow;
  b->col = iscol;
  PetscCall(PetscObjectReference((PetscObject)isrow));
  PetscCall(PetscObjectReference((PetscObject)iscol));
  b->icol = isicol;
  PetscCall(PetscMalloc1(n + 1, &b->solve_work));

  B->factortype = MAT_FACTOR_LU;
  B->info.factor_mallocs = 0;
  B->info.fill_ratio_given = 0;

  if (ai[n]) B->info.fill_ratio_needed = ((PetscReal)(nzBcsr)) / ((PetscReal)ai[n]);
  else B->info.fill_ratio_needed = 0.0;
#if defined(PETSC_USE_INFO)
  if (ai[n] != 0) {
    PetscReal af = B->info.fill_ratio_needed;
    PetscCall(PetscInfo(A, "Band fill ratio %g\n", (double)af));
  } else PetscCall(PetscInfo(A, "Empty matrix\n"));
#endif
  if (a->inode.size) PetscCall(PetscInfo(A, "Warning: using inodes in band solver.\n"));
  PetscCall(MatSeqAIJCheckInode_FactorLU(B));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSEBAND;
  B->offloadmask = PETSC_OFFLOAD_GPU;

  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Use -pc_factor_mat_solver_type hipsparseband */
PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse_band(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERHIPSPARSEBAND;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse_band(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));

  if (ftype == MAT_FACTOR_LU) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    (*B)->ops->ilufactorsymbolic = NULL; // MatILUFactorSymbolic_SeqAIJHIPSPARSE;
    (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJHIPSPARSEBAND;
    PetscCall(PetscStrallocpy(MATORDERINGRCM, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSEBAND Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse_band));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#define WARP_SIZE 32 // to be consistent with Nvidia terminology. WARP == Wavefront
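// Note: HIP's __shfl_down(var, lane_delta) takes no mask argument, unlike CUDA's
// __shfl_down_sync(mask, var, lane_delta); the warp/block reductions below use the HIP form.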
template <typename T>
__forceinline__ __device__ T wreduce(T a)
{
  T b;

#pragma unroll
  for (int i = WARP_SIZE / 2; i >= 1; i = i >> 1) {
    b = __shfl_down(a, i);
    a += b;
  }
  return a;
}
// reduce in a block, returns result in thread 0
template <typename T, int BLOCK_SIZE>
__device__ T breduce(T a)
{
  constexpr int NWARP = BLOCK_SIZE / WARP_SIZE;
  __shared__ double buf[NWARP];
  int wid = threadIdx.x / WARP_SIZE;
  int laneid = threadIdx.x % WARP_SIZE;
  T b = wreduce<T>(a);
  if (laneid == 0) buf[wid] = b;
  __syncthreads();
  if (wid == 0) {
    if (threadIdx.x < NWARP) a = buf[threadIdx.x];
    else a = 0;
    for (int i = (NWARP + 1) / 2; i >= 1; i = i >> 1) { a += __shfl_down(a, i); }
  }
  return a;
}

// Band LU solve kernel --- ba_csr bi_csr
template <int BLOCK_SIZE>
__global__ void __launch_bounds__(256, 1) mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
{
  const PetscInt Nf = gridDim.x, nloc = n / Nf, field = blockIdx.x, start = field * nloc, end = start + nloc, chopnz = bw * (bw + 1) / 2, blocknz = (2 * bw + 1) * nloc, blocknz_0 = blocknz - chopnz;
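  /* chopnz is one triangular "ear" of the band that falls outside the matrix; blocknz is the full
     band size of one field's block, and blocknz_0 is the first block, which is missing its top ear. */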
  const PetscScalar *pLi;
  const int tid = threadIdx.x;

  /* Next, solve L */
  pLi = ba_csr + (field == 0 ? 0 : blocknz_0 + (field - 1) * blocknz + bw); // diagonal (0,0) in field
  for (int glbDD = start, locDD = 0; glbDD < end; glbDD++, locDD++) {
    const PetscInt col = locDD < bw ? start : (glbDD - bw);
    PetscScalar t = 0;
    for (int j = col + tid, idx = tid; j < glbDD; j += blockDim.x, idx += blockDim.x) { t += pLi[idx] * x[j]; }
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal, BLOCK_SIZE>(tr), breduce<PetscReal, BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal, BLOCK_SIZE>(t);
#endif
    if (threadIdx.x == 0) x[glbDD] -= t; // /1.0
    __syncthreads();
    // inc
    pLi += glbDD - col; // get to diagonal
    if (glbDD > n - 1 - bw) pLi += n - 1 - glbDD; // skip over U, only last block has funny offset
    else pLi += bw;
    pLi += 1; // skip to next row
    if (field > 0 && (locDD + 1) < bw) pLi += bw - (locDD + 1); // skip padding at beginning (ear)
  }
  /* Then, solve U */
  pLi = ba_csr + Nf * blocknz - 2 * chopnz - 1; // end of real data on block (diagonal)
  if (field != Nf - 1) pLi -= blocknz_0 + (Nf - 2 - field) * blocknz + bw; // diagonal of last local row

  for (int glbDD = end - 1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
    const PetscInt col = (locDD < bw) ? end - 1 : glbDD + bw; // end of row in U
    PetscScalar t = 0;
    for (int j = col - tid, idx = tid; j > glbDD; j -= blockDim.x, idx += blockDim.x) t += pLi[-idx] * x[j];
#if defined(PETSC_USE_COMPLEX)
    PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
    PetscScalar tt(breduce<PetscReal, BLOCK_SIZE>(tr), breduce<PetscReal, BLOCK_SIZE>(ti));
    t = tt;
#else
    t = breduce<PetscReal, BLOCK_SIZE>(PetscRealPart(t));
#endif
    pLi -= col - glbDD; // diagonal
    if (threadIdx.x == 0) {
      x[glbDD] -= t;
      x[glbDD] /= pLi[0];
    }
    __syncthreads();
    // inc past L to start of previous U
    pLi -= bw + 1;
    if (glbDD < bw) pLi += bw - glbDD; // overshot in top left corner
    if (((locDD + 1) < bw) && field != Nf - 1) pLi -= (bw - (locDD + 1)); // skip past right corner
  }
}

static PetscErrorCode MatSolve_SeqAIJHIPSPARSEBAND(Mat A, Vec bb, Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
  PetscInt n = A->rmap->n, nz = hipsparseTriFactors->nnz, Nf = 1; // Nf is batch size - not used
  PetscInt bw = (int)(2. * (double)n - 1. - (double)(PetscSqrtReal(1. + 4. * ((double)n * (double)n - (double)nz)) + PETSC_MACHINE_EPSILON)) / 2; // quadratic formula for bandwidth

  PetscFunctionBegin;
  if (A->rmap->n == 0) PetscFunctionReturn(PETSC_SUCCESS);

  /* Get the GPU pointers */
  PetscCall(VecHIPGetArrayWrite(xx, &xarray));
  PetscCall(VecHIPGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());
  constexpr int block = 128;
  hipLaunchKernelGGL(HIP_KERNEL_NAME(mat_solve_band<block>), dim3(Nf), dim3(block), 0, 0, n, bw, hipsparseTriFactors->a_band_d, tempGPU->data().get());
  PetscHIPCheckLaunch; // does a sync

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecHIPRestoreArrayRead(bb, &barray));
  PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
  PetscCall(PetscLogGpuTimeEnd());

  PetscFunctionReturn(PETSC_SUCCESS);
}