Actual source code: bjkokkos.kokkos.cxx
1: #include <petscvec_kokkos.hpp>
2: #include <petsc/private/deviceimpl.h>
3: #include <petsc/private/pcimpl.h>
4: #include <petsc/private/kspimpl.h>
5: #include <petscksp.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp>
8: #include "petscsection.h"
9: #include <petscdmcomposite.h>
10: #include "Kokkos_Core.hpp"
12: #include <../src/mat/impls/aij/seq/aij.h>
13: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
15: #if defined(PETSC_HAVE_CUDA)
16: #include <nvToolsExt.h>
17: #endif
19: #include <petscdevice_cupm.h>
21: #define PCBJKOKKOS_SHARED_LEVEL 1 // 0 is shared, 1 is global
22: #define PCBJKOKKOS_VEC_SIZE 16
23: #define PCBJKOKKOS_TEAM_SIZE 16
25: #define PCBJKOKKOS_VERBOSE_LEVEL 1
27: typedef Kokkos::DefaultExecutionSpace exec_space;
28: using layout = Kokkos::LayoutRight;
29: using IntView = Kokkos::View<PetscInt **, layout, exec_space>;
30: using AMatrixValueView = const Kokkos::View<PetscScalar **, layout, exec_space>;
31: using XYType = const Kokkos::View<PetscScalar **, layout, exec_space>;
33: typedef enum {
34: BATCH_KSP_BICG_IDX,
35: BATCH_KSP_TFQMR_IDX,
36: BATCH_KSP_GMRES_IDX,
37: NUM_BATCH_TYPES
38: } KSPIndex;
39: typedef struct {
40: Vec vec_diag;
41: PetscInt nBlocks; /* total number of blocks */
42: PetscInt n; // cache host version of d_bid_eqOffset_k[nBlocks]
43: KSP ksp; // Used just for options. Should have one for each block
44: Kokkos::View<PetscInt *, Kokkos::LayoutRight> *d_bid_eqOffset_k;
45: Kokkos::View<PetscScalar *, Kokkos::LayoutRight> *d_idiag_k;
46: Kokkos::View<PetscInt *> *d_isrow_k;
47: Kokkos::View<PetscInt *> *d_isicol_k;
48: KSPIndex ksp_type_idx;
49: PetscInt nwork;
50: PetscInt const_block_size; // used to decide to use shared memory for work vectors
51: PetscInt *dm_Nf; // Number of fields in each DM
52: PetscInt num_dms;
53: // diagnostics
54: PetscBool reason;
55: PetscBool monitor;
56: PetscInt batch_target;
57: PetscInt nsolves_team;
58: PetscInt max_nits;
59: // caches
60: IntView *rowOffsets;
61: IntView *colIndices;
62: XYType *batch_b;
63: XYType *batch_x;
64: AMatrixValueView *batch_values;
65: } PC_PCBJKOKKOS;
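/* Per-PC context for the batched block-Jacobi solver. Each MPI rank owns nBlocks independent
   diagonal blocks; after the reordering computed in PCSetUp_BJKOKKOS, block b spans equations
   d_bid_eqOffset_k[b] .. d_bid_eqOffset_k[b+1], and d_isrow_k / d_isicol_k translate between the
   PETSc (Plex) ordering and that block-contiguous ordering. The cached rowOffsets / colIndices /
   batch_* views are used only by the KokkosKernels batched-GMRES path below. */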
67: #if defined(PETSC_HAVE_KOKKOS_KERNELS_GMRES)
68: #include <fstream>
70: #include "Kokkos_Timer.hpp"
71: #include "Kokkos_Random.hpp"
72: #include "Kokkos_UnorderedMap.hpp"
73: #include "Kokkos_Sort.hpp"
75: /// KokkosKernels headers
76: #include "KokkosBatched_Util.hpp"
77: #include "KokkosBatched_Vector.hpp"
79: #include <Kokkos_ArithTraits.hpp>
80: #include <KokkosBatched_Util.hpp>
81: #include <KokkosBatched_Vector.hpp>
82: #include <KokkosBatched_Copy_Decl.hpp>
83: #include <KokkosBatched_Copy_Impl.hpp>
84: #include <KokkosBatched_AddRadial_Decl.hpp>
85: #include <KokkosBatched_AddRadial_Impl.hpp>
86: #include <KokkosBatched_Gemm_Decl.hpp>
87: #include <KokkosBatched_Gemm_Serial_Impl.hpp>
88: #include <KokkosBatched_Gemm_Team_Impl.hpp>
89: #include <KokkosBatched_Gemv_Decl.hpp>
90: #include <KokkosBatched_Gemv_Serial_Impl.hpp>
91: #include <KokkosBatched_Gemv_Team_Impl.hpp>
92: #include <KokkosBatched_Trsm_Decl.hpp>
93: #include <KokkosBatched_Trsm_Serial_Impl.hpp>
94: #include <KokkosBatched_Trsm_Team_Impl.hpp>
95: #include <KokkosBatched_Trsv_Decl.hpp>
96: #include <KokkosBatched_Trsv_Serial_Impl.hpp>
97: #include <KokkosBatched_Trsv_Team_Impl.hpp>
98: #include <KokkosBatched_LU_Decl.hpp>
99: #include <KokkosBatched_LU_Serial_Impl.hpp>
100: #include <KokkosBatched_LU_Team_Impl.hpp>
101: #include <KokkosSparse_CrsMatrix.hpp>
102: #include "KokkosBatched_Spmv.hpp"
103: #include "KokkosBatched_CrsMatrix.hpp"
104: #include "KokkosBatched_Krylov_Handle.hpp"
105: #include "KokkosBatched_GMRES.hpp"
106: #include "KokkosBatched_JacobiPrec.hpp"
108: template <typename DeviceType, typename ValuesViewType, typename IntView, typename VectorViewType, typename KrylovHandleType>
109: struct Functor_TestBatchedTeamVectorGMRES {
110: const ValuesViewType _D;
111: const ValuesViewType _diag;
112: const IntView _r;
113: const IntView _c;
114: const VectorViewType _X;
115: const VectorViewType _B;
116: const int _N_team, _team_size, _vector_length;
117: const int _N_iteration;
118: const double _tol;
119: const int _ortho_strategy;
120: const int _scratch_pad_level;
121: KrylovHandleType _handle;
123: KOKKOS_INLINE_FUNCTION
124: Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team, const int team_size, const int vector_length, const int N_iteration, const double tol, const int ortho_strategy, const int scratch_pad_level, KrylovHandleType &handle) :
125: _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team), _team_size(team_size), _vector_length(vector_length), _N_iteration(N_iteration), _tol(tol), _ortho_strategy(ortho_strategy), _scratch_pad_level(scratch_pad_level), _handle(handle)
126: {
127: }
129: KOKKOS_INLINE_FUNCTION
130: Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team, const int team_size, const int vector_length, const int N_iteration, const double tol, int ortho_strategy, const int scratch_pad_level, KrylovHandleType &handle) :
131: _D(D), _diag(diag), _r(r), _c(c), _X(X), _B(B), _N_team(N_team), _team_size(team_size), _vector_length(vector_length), _N_iteration(N_iteration), _tol(tol), _ortho_strategy(ortho_strategy), _scratch_pad_level(scratch_pad_level), _handle(handle)
132: {
133: }
135: template <typename MemberType>
136: KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const
137: {
138: const int first_matrix = static_cast<int>(member.league_rank()) * _N_team;
139: const int N = _D.extent(0);
140: const int last_matrix = (static_cast<int>(member.league_rank() + 1) * _N_team < N ? static_cast<int>(member.league_rank() + 1) * _N_team : N);
141: const int graphID = static_cast<int>(member.league_rank());
142: using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy<MemberType, KokkosBatched::Trans::NoTranspose, 1>;
144: auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL);
145: auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL);
146: auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL);
147: using ScratchPadIntViewType = Kokkos::View<typename IntView::non_const_value_type *, typename IntView::array_layout, typename IntView::execution_space::scratch_memory_space>;
148: using ScratchPadValuesViewType = Kokkos::View<typename ValuesViewType::non_const_value_type **, typename ValuesViewType::array_layout, typename ValuesViewType::execution_space::scratch_memory_space>;
150: using Operator = KokkosBatched::CrsMatrix<ValuesViewType, ScratchPadIntViewType>;
151: ScratchPadIntViewType r(member.team_scratch(1), _r.extent(1));
152: ScratchPadIntViewType c(member.team_scratch(1), _c.extent(1));
154: TeamVectorCopy1D::invoke(member, Kokkos::subview(_r, graphID, Kokkos::ALL), r);
155: TeamVectorCopy1D::invoke(member, Kokkos::subview(_c, graphID, Kokkos::ALL), c);
156: Operator A(d, r, c);
158: ScratchPadValuesViewType diag(member.team_scratch(1), last_matrix - first_matrix, _diag.extent(1));
159: using PrecOperator = KokkosBatched::JacobiPrec<ScratchPadValuesViewType>;
161: KokkosBatched::TeamVectorCopy<MemberType>::invoke(member, Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL), diag);
162: PrecOperator P(diag);
163: P.setComputedInverse();
165: KokkosBatched::TeamVectorGMRES<MemberType>::template invoke<Operator, VectorViewType, PrecOperator, KrylovHandleType>(member, A, b, x, P, _handle);
166: }
167: inline double run(PC pc)
168: {
169: typedef typename ValuesViewType::value_type value_type;
170: std::string name("KokkosBatched::Test::TeamVectorGMRES");
171: Kokkos::Timer timer;
172: Kokkos::Profiling::pushRegion(name.c_str());
174: Kokkos::TeamPolicy<DeviceType> auto_policy(ceil(1. * _D.extent(0) / _N_team), Kokkos::AUTO(), Kokkos::AUTO());
175: Kokkos::TeamPolicy<DeviceType> tuned_policy(ceil(1. * _D.extent(0) / _N_team), _team_size, _vector_length);
176: Kokkos::TeamPolicy<DeviceType> policy;
178: if (_team_size < 1) policy = auto_policy;
179: else policy = tuned_policy;
181: _handle.set_max_iteration(_N_iteration);
182: _handle.set_tolerance(_tol);
183: _handle.set_ortho_strategy(_ortho_strategy);
184: _handle.set_scratch_pad_level(_scratch_pad_level);
185: _handle.set_compute_last_residual(true);
187: int maximum_iteration = _handle.get_max_iteration();
189: using ScalarType = typename ValuesViewType::non_const_value_type;
190: using Layout = typename ValuesViewType::array_layout;
191: using EXSP = typename ValuesViewType::execution_space;
193: using MagnitudeType = typename Kokkos::Details::ArithTraits<ScalarType>::mag_type;
195: using ViewType1D = Kokkos::View<MagnitudeType *, Layout, EXSP>;
196: using ViewType2D = Kokkos::View<ScalarType **, Layout, EXSP>;
197: using ViewType3D = Kokkos::View<ScalarType ***, Layout, EXSP>;
198: using IntViewType1D = Kokkos::View<PetscInt *, Layout, EXSP>;
200: size_t bytes_1D = ViewType2D::shmem_size(_N_team, 1);
201: size_t bytes_row_ptr = IntViewType1D::shmem_size(_r.extent(1));
202: size_t bytes_col_idc = IntViewType1D::shmem_size(_c.extent(1));
203: size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1));
204: size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1);
206: size_t bytes_diag = bytes_2D_1;
207: size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2;
209: policy.set_scratch_size(0, Kokkos::PerTeam(bytes_tmp));
210: policy.set_scratch_size(1, Kokkos::PerTeam(bytes_col_idc + bytes_row_ptr + bytes_diag));
211: PetscCall(PetscInfo(pc, "%d scratch memory(0) = %d + %d + %d bytes_diag=%d; %d scratch memory(1); %d maximum_iterations\n", (int)(bytes_tmp), 2 * (int)bytes_2D_1, 2 * (int)bytes_1D, (int)bytes_2D_2, (int)bytes_diag, (int)(bytes_row_ptr + bytes_col_idc + bytes_diag), (int)maximum_iteration));
212: exec_space().fence();
213: timer.reset();
214: Kokkos::parallel_for(name.c_str(), policy, *this);
215: exec_space().fence();
216: double sec = timer.seconds();
218: return sec;
219: }
220: };
221: #endif // KK GMRES
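/* Usage sketch for the functor above, mirroring the call made in PCApply_BJKOKKOS below: construct it
   with the batched CRS data, the inverse diagonals and a KrylovHandle, then call run(pc), which picks
   a TeamPolicy (tuned or AUTO), sets the scratch sizes, and launches one TeamVectorGMRES solve per matrix:

     KrylovHandleType handle(Nsolves, Nsolves_team, n_iterations, true);
     handle.Arnoldi_view = Scalar3DViewType("", Nsolves, n_iterations, Nloc + n_iterations + 3);
     double sec = Functor_TestBatchedTeamVectorGMRES<exec_space, AMatrixValueView, IntView, XYType, KrylovHandleType>(
       batch_values, inv_diag, rowOffsets, colIndices, batch_x, batch_b, Nsolves_team, team_size,
       vector_length, n_iterations, tol, ortho_strategy, 0, handle).run(pc);
*/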
223: typedef Kokkos::TeamPolicy<>::member_type team_member;
225: static PetscErrorCode PCBJKOKKOSCreateKSP_BJKOKKOS(PC pc)
226: {
227: const char *prefix;
228: PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;
229: DM dm;
231: PetscFunctionBegin;
232: PetscCall(KSPCreate(PetscObjectComm((PetscObject)pc), &jac->ksp));
233: PetscCall(KSPSetErrorIfNotConverged(jac->ksp, pc->erroriffailure));
234: PetscCall(PetscObjectIncrementTabLevel((PetscObject)jac->ksp, (PetscObject)pc, 1));
235: PetscCall(PCGetOptionsPrefix(pc, &prefix));
236: PetscCall(KSPSetOptionsPrefix(jac->ksp, prefix));
237: PetscCall(KSPAppendOptionsPrefix(jac->ksp, "pc_bjkokkos_"));
238: PetscCall(PCGetDM(pc, &dm));
239: if (dm) {
240: PetscCall(KSPSetDM(jac->ksp, dm));
241: PetscCall(KSPSetDMActive(jac->ksp, PETSC_FALSE));
242: }
243: jac->reason = PETSC_FALSE;
244: jac->monitor = PETSC_FALSE;
245: jac->batch_target = -1;
246: jac->nsolves_team = 1;
247: jac->ksp->max_it = 50; // this is really for GMRES w/o restarts
248: PetscFunctionReturn(PETSC_SUCCESS);
249: }
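// Note: jac->ksp never runs a solve itself; PCSetUp_BJKOKKOS only reads its type to select the
// batched kernel (BiCG/TFQMR/GMRES) and PCApply_BJKOKKOS reads its tolerances (rtol/abstol/divtol)
// and max_it.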
251: // y <-- Ax
252: KOKKOS_INLINE_FUNCTION PetscErrorCode MatMult(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, const PetscInt start, const PetscInt end, const PetscScalar *x_loc, PetscScalar *y_loc)
253: {
254: Kokkos::parallel_for(Kokkos::TeamThreadRange(team, start, end), [=](const int rowb) {
255: int rowa = ic[rowb];
256: int n = glb_Aai[rowa + 1] - glb_Aai[rowa];
257: const PetscInt *aj = glb_Aaj + glb_Aai[rowa]; // global
258: const PetscScalar *aa = glb_Aaa + glb_Aai[rowa];
259: PetscScalar sum;
260: Kokkos::parallel_reduce(
261: Kokkos::ThreadVectorRange(team, n), [=](const int i, PetscScalar &lsum) { lsum += aa[i] * x_loc[r[aj[i]] - start]; }, sum);
262: Kokkos::single(Kokkos::PerThread(team), [=]() { y_loc[rowb - start] = sum; });
263: });
264: team.team_barrier();
265: return PETSC_SUCCESS;
266: }
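/* Serial reference for the kernel above: each team handles rows [start,end) of its block, ic[] maps a
   reordered row to its CSR row in the global matrix, and r[] maps a global column index back into the
   block-local range [0, end-start):

     for (int rowb = start; rowb < end; rowb++) {
       const int   rowa = ic[rowb];
       PetscScalar sum  = 0;
       for (PetscInt k = glb_Aai[rowa]; k < glb_Aai[rowa + 1]; k++) sum += glb_Aaa[k] * x_loc[r[glb_Aaj[k]] - start];
       y_loc[rowb - start] = sum;
     }
*/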
268: // temp buffer per thread with reduction at end?
269: KOKKOS_INLINE_FUNCTION PetscErrorCode MatMultTranspose(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, const PetscInt start, const PetscInt end, const PetscScalar *x_loc, PetscScalar *y_loc)
270: {
271: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, end - start), [=](int i) { y_loc[i] = 0; });
272: team.team_barrier();
273: Kokkos::parallel_for(Kokkos::TeamThreadRange(team, start, end), [=](const int rowb) {
274: int rowa = ic[rowb];
275: int n = glb_Aai[rowa + 1] - glb_Aai[rowa];
276: const PetscInt *aj = glb_Aaj + glb_Aai[rowa]; // global
277: const PetscScalar *aa = glb_Aaa + glb_Aai[rowa];
278: const PetscScalar xx = x_loc[rowb - start]; // rowb = ic[rowa] = ic[r[rowb]]
279: Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, n), [=](const int &i) {
280: PetscScalar val = aa[i] * xx;
281: Kokkos::atomic_fetch_add(&y_loc[r[aj[i]] - start], val);
282: });
283: });
284: team.team_barrier();
285: return PETSC_SUCCESS;
286: }
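// Note: the transpose product is formed by zeroing y_loc and scattering each row's contributions with
// atomic adds (several rows can hit the same column); the "temp buffer per thread" comment above
// records a possible reduction-based alternative.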
288: typedef struct Batch_MetaData_TAG {
289: PetscInt flops;
290: PetscInt its;
291: KSPConvergedReason reason;
292: } Batch_MetaData;
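// One Batch_MetaData entry per block is filled on the device by the solvers below and copied back to
// the host in PCApply_BJKOKKOS for flop logging and per-block convergence reporting.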
294: // Solve Ax = b with TFQMR, right preconditioned ((A B^-1)(B x) = b) so the un-preconditioned residual is monitored
295: KOKKOS_INLINE_FUNCTION PetscErrorCode BJSolve_TFQMR(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, PetscScalar *work_space_global, const int stride_global, const int nShareVec, PetscScalar *work_space_shared, const int stride_shared, PetscReal rtol, PetscReal atol, PetscReal dtol, PetscInt maxit, Batch_MetaData *metad, const PetscInt start, const PetscInt end, const PetscScalar glb_idiag[], const PetscScalar *glb_b, PetscScalar *glb_x, bool monitor)
296: {
297: using Kokkos::parallel_for;
298: using Kokkos::parallel_reduce;
299: int Nblk = end - start, i, m, stride = stride_shared, idx = 0;
300: PetscReal dp, dpold, w, dpest, tau, psi, cm, r0;
301: const PetscScalar *Diag = &glb_idiag[start];
302: PetscScalar *ptr = work_space_shared, rho, rhoold, a, s, b, eta, etaold, psiold, cf, dpi;
304: if (idx++ == nShareVec) {
305: ptr = work_space_global;
306: stride = stride_global;
307: }
308: PetscScalar *XX = ptr;
309: ptr += stride;
310: if (idx++ == nShareVec) {
311: ptr = work_space_global;
312: stride = stride_global;
313: }
314: PetscScalar *R = ptr;
315: ptr += stride;
316: if (idx++ == nShareVec) {
317: ptr = work_space_global;
318: stride = stride_global;
319: }
320: PetscScalar *RP = ptr;
321: ptr += stride;
322: if (idx++ == nShareVec) {
323: ptr = work_space_global;
324: stride = stride_global;
325: }
326: PetscScalar *V = ptr;
327: ptr += stride;
328: if (idx++ == nShareVec) {
329: ptr = work_space_global;
330: stride = stride_global;
331: }
332: PetscScalar *T = ptr;
333: ptr += stride;
334: if (idx++ == nShareVec) {
335: ptr = work_space_global;
336: stride = stride_global;
337: }
338: PetscScalar *Q = ptr;
339: ptr += stride;
340: if (idx++ == nShareVec) {
341: ptr = work_space_global;
342: stride = stride_global;
343: }
344: PetscScalar *P = ptr;
345: ptr += stride;
346: if (idx++ == nShareVec) {
347: ptr = work_space_global;
348: stride = stride_global;
349: }
350: PetscScalar *U = ptr;
351: ptr += stride;
352: if (idx++ == nShareVec) {
353: ptr = work_space_global;
354: stride = stride_global;
355: }
356: PetscScalar *D = ptr;
357: ptr += stride;
358: if (idx++ == nShareVec) {
359: ptr = work_space_global;
360: stride = stride_global;
361: }
362: PetscScalar *AUQ = V;
364: // init: get b, zero x
365: parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) {
366: int rowa = ic[rowb];
367: R[rowb - start] = glb_b[rowa];
368: XX[rowb - start] = 0;
369: });
370: team.team_barrier();
371: parallel_reduce(
372: Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += R[idx] * PetscConj(R[idx]); }, dpi);
373: team.team_barrier();
374: r0 = dp = PetscSqrtReal(PetscRealPart(dpi));
375: // diagnostics
376: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
377: if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", 0, (double)dp); });
378: #endif
379: if (dp < atol) {
380: metad->reason = KSP_CONVERGED_ATOL_NORMAL;
381: return PETSC_SUCCESS;
382: }
383: if (0 == maxit) {
384: metad->reason = KSP_DIVERGED_ITS;
385: return PETSC_SUCCESS;
386: }
388: /* Make the initial Rp = R */
389: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { RP[idx] = R[idx]; });
390: team.team_barrier();
391: /* Set the initial conditions */
392: etaold = 0.0;
393: psiold = 0.0;
394: tau = dp;
395: dpold = dp;
397: /* rhoold = (r,rp) */
398: parallel_reduce(
399: Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += R[idx] * PetscConj(RP[idx]); }, rhoold);
400: team.team_barrier();
401: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
402: U[idx] = R[idx];
403: P[idx] = R[idx];
404: T[idx] = Diag[idx] * P[idx];
405: D[idx] = 0;
406: });
407: team.team_barrier();
408: static_cast<void>(MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, V));
410: i = 0;
411: do {
412: /* s <- (v,rp) */
413: parallel_reduce(
414: Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += V[idx] * PetscConj(RP[idx]); }, s);
415: team.team_barrier();
416: a = rhoold / s; /* a <- rho / s */
417: /* q <- u - a v VecWAXPY(w,alpha,x,y): w = alpha x + y. */
418: /* t <- u + q */
419: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
420: Q[idx] = U[idx] - a * V[idx];
421: T[idx] = U[idx] + Q[idx];
422: });
423: team.team_barrier();
424: // KSP_PCApplyBAorAB
425: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { T[idx] = Diag[idx] * T[idx]; });
426: team.team_barrier();
427: static_cast<void>(MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, AUQ));
428: /* r <- r - a K (u + q) */
429: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { R[idx] = R[idx] - a * AUQ[idx]; });
430: team.team_barrier();
431: parallel_reduce(
432: Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += R[idx] * PetscConj(R[idx]); }, dpi);
433: team.team_barrier();
434: dp = PetscSqrtReal(PetscRealPart(dpi));
435: for (m = 0; m < 2; m++) {
436: if (!m) w = PetscSqrtReal(dp * dpold);
437: else w = dp;
438: psi = w / tau;
439: cm = 1.0 / PetscSqrtReal(1.0 + psi * psi);
440: tau = tau * psi * cm;
441: eta = cm * cm * a;
442: cf = psiold * psiold * etaold / a;
443: if (!m) {
444: /* D = U + cf D */
445: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { D[idx] = U[idx] + cf * D[idx]; });
446: } else {
447: /* D = Q + cf D */
448: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { D[idx] = Q[idx] + cf * D[idx]; });
449: }
450: team.team_barrier();
451: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { XX[idx] = XX[idx] + eta * D[idx]; });
452: team.team_barrier();
453: dpest = PetscSqrtReal(2 * i + m + 2.0) * tau;
454: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
455: if (monitor && m == 1) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", i + 1, (double)dpest); });
456: #endif
457: if (dpest < atol) {
458: metad->reason = KSP_CONVERGED_ATOL_NORMAL;
459: goto done;
460: }
461: if (dpest / r0 < rtol) {
462: metad->reason = KSP_CONVERGED_RTOL_NORMAL;
463: goto done;
464: }
465: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
466: if (dpest / r0 > dtol) {
467: metad->reason = KSP_DIVERGED_DTOL;
468: Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("ERROR block %d diverged: %d it, res=%e, r_0=%e\n", team.league_rank(), i, dpest, r0); });
469: goto done;
470: }
471: #else
472: if (dpest / r0 > dtol) {
473: metad->reason = KSP_DIVERGED_DTOL;
474: goto done;
475: }
476: #endif
477: if (i + 1 == maxit) {
478: metad->reason = KSP_DIVERGED_ITS;
479: goto done;
480: }
482: etaold = eta;
483: psiold = psi;
484: }
486: /* rho <- (r,rp) */
487: parallel_reduce(
488: Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += R[idx] * PetscConj(RP[idx]); }, rho);
489: team.team_barrier();
490: b = rho / rhoold; /* b <- rho / rhoold */
491: /* u <- r + b q */
492: /* p <- u + b(q + b p) */
493: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
494: U[idx] = R[idx] + b * Q[idx];
495: Q[idx] = Q[idx] + b * P[idx];
496: P[idx] = U[idx] + b * Q[idx];
497: });
498: /* v <- K p */
499: team.team_barrier();
500: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { T[idx] = Diag[idx] * P[idx]; });
501: team.team_barrier();
502: static_cast<void>(MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, T, V));
504: rhoold = rho;
505: dpold = dp;
507: i++;
508: } while (i < maxit);
509: done:
510: // KSPUnwindPreconditioner
511: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) { XX[idx] = Diag[idx] * XX[idx]; });
512: team.team_barrier();
513: // put x into Plex order
514: parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) {
515: int rowa = ic[rowb];
516: glb_x[rowa] = XX[rowb - start];
517: });
518: metad->its = i + 1;
519: if (1) {
520: int nnz;
521: parallel_reduce(
522: Kokkos::TeamVectorRange(team, start, end), [=](const int idx, int &lsum) { lsum += (glb_Aai[idx + 1] - glb_Aai[idx]); }, nnz);
523: metad->flops = 2 * (metad->its * (10 * Nblk + 2 * nnz) + 5 * Nblk);
524: } else {
525: metad->flops = 2 * (metad->its * (10 * Nblk + 2 * 50 * Nblk) + 5 * Nblk); // guess
526: }
527: return PETSC_SUCCESS;
528: }
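/* Work-vector layout shared by BJSolve_TFQMR and BJSolve_BICG: vectors are carved out of team scratch
   (stride_shared apart) until nShareVec of them have been handed out, after which the pointer switches
   to this block's slice of the global buffer (stride_global apart). The repeated idiom above is,
   schematically (next_vec() is not a real helper, just a sketch):

     PetscScalar *next_vec() {
       if (idx++ == nShareVec) { ptr = work_space_global; stride = stride_global; }
       PetscScalar *v = ptr;
       ptr += stride;
       return v;
     }

   PCSetUp_BJKOKKOS sizes the buffers via jac->nwork (10 for TFQMR, 7 for BiCG). */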
530: // Solve Ax = b with preconditioned BiCG
531: KOKKOS_INLINE_FUNCTION PetscErrorCode BJSolve_BICG(const team_member team, const PetscInt *glb_Aai, const PetscInt *glb_Aaj, const PetscScalar *glb_Aaa, const PetscInt *r, const PetscInt *ic, PetscScalar *work_space_global, const int stride_global, const int nShareVec, PetscScalar *work_space_shared, const int stride_shared, PetscReal rtol, PetscReal atol, PetscReal dtol, PetscInt maxit, Batch_MetaData *metad, const PetscInt start, const PetscInt end, const PetscScalar glb_idiag[], const PetscScalar *glb_b, PetscScalar *glb_x, bool monitor)
532: {
533: using Kokkos::parallel_for;
534: using Kokkos::parallel_reduce;
535: int Nblk = end - start, i, stride = stride_shared, idx = 0; // start in shared mem
536: PetscReal dp, r0;
537: const PetscScalar *Di = &glb_idiag[start];
538: PetscScalar *ptr = work_space_shared, dpi, a = 1.0, beta, betaold = 1.0, b, b2, ma, mac;
540: if (idx++ == nShareVec) {
541: ptr = work_space_global;
542: stride = stride_global;
543: }
544: PetscScalar *XX = ptr;
545: ptr += stride;
546: if (idx++ == nShareVec) {
547: ptr = work_space_global;
548: stride = stride_global;
549: }
550: PetscScalar *Rl = ptr;
551: ptr += stride;
552: if (idx++ == nShareVec) {
553: ptr = work_space_global;
554: stride = stride_global;
555: }
556: PetscScalar *Zl = ptr;
557: ptr += stride;
558: if (idx++ == nShareVec) {
559: ptr = work_space_global;
560: stride = stride_global;
561: }
562: PetscScalar *Pl = ptr;
563: ptr += stride;
564: if (idx++ == nShareVec) {
565: ptr = work_space_global;
566: stride = stride_global;
567: }
568: PetscScalar *Rr = ptr;
569: ptr += stride;
570: if (idx++ == nShareVec) {
571: ptr = work_space_global;
572: stride = stride_global;
573: }
574: PetscScalar *Zr = ptr;
575: ptr += stride;
576: if (idx++ == nShareVec) {
577: ptr = work_space_global;
578: stride = stride_global;
579: }
580: PetscScalar *Pr = ptr;
581: ptr += stride;
583: /* r <- b (x is 0) */
584: parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) {
585: int rowa = ic[rowb];
586: Rl[rowb - start] = Rr[rowb - start] = glb_b[rowa];
587: XX[rowb - start] = 0;
588: });
589: team.team_barrier();
590: /* z <- Br */
591: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
592: Zr[idx] = Di[idx] * Rr[idx];
593: Zl[idx] = Di[idx] * Rl[idx];
594: });
595: team.team_barrier();
596: /* dp <- r'*r */
597: parallel_reduce(
598: Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Rr[idx] * PetscConj(Rr[idx]); }, dpi);
599: team.team_barrier();
600: r0 = dp = PetscSqrtReal(PetscRealPart(dpi));
601: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
602: if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", 0, (double)dp); });
603: #endif
604: if (dp < atol) {
605: metad->reason = KSP_CONVERGED_ATOL_NORMAL;
606: return PETSC_SUCCESS;
607: }
608: if (0 == maxit) {
609: metad->reason = KSP_DIVERGED_ITS;
610: return PETSC_SUCCESS;
611: }
612: i = 0;
613: do {
614: /* beta <- r'z */
615: parallel_reduce(
616: Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &dot) { dot += Zr[idx] * PetscConj(Rl[idx]); }, beta);
617: team.team_barrier();
618: #if PCBJKOKKOS_VERBOSE_LEVEL >= 6
619: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
620: Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%7d beta = Z.R = %22.14e \n", i, (double)beta); });
621: #endif
622: #endif
623: if (!i) {
624: if (beta == 0.0) {
625: metad->reason = KSP_DIVERGED_BREAKDOWN_BICG;
626: goto done;
627: }
628: /* p <- z */
629: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
630: Pr[idx] = Zr[idx];
631: Pl[idx] = Zl[idx];
632: });
633: } else {
634: b = beta / betaold;
635: /* p <- z + b* p */
636: b2 = PetscConj(b);
637: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
638: Pr[idx] = b * Pr[idx] + Zr[idx];
639: Pl[idx] = b2 * Pl[idx] + Zl[idx];
640: });
641: }
642: team.team_barrier();
643: betaold = beta;
644: /* z <- Kp */
645: static_cast<void>(MatMult(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, Pr, Zr));
646: static_cast<void>(MatMultTranspose(team, glb_Aai, glb_Aaj, glb_Aaa, r, ic, start, end, Pl, Zl));
647: /* dpi <- z'p */
648: parallel_reduce(
649: Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Zr[idx] * PetscConj(Pl[idx]); }, dpi);
650: team.team_barrier();
651: //
652: a = beta / dpi; /* a = beta/p'z */
653: ma = -a;
654: mac = PetscConj(ma);
655: /* x <- x + ap */
656: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
657: XX[idx] = XX[idx] + a * Pr[idx];
658: Rr[idx] = Rr[idx] + ma * Zr[idx];
659: Rl[idx] = Rl[idx] + mac * Zl[idx];
660: });
661: team.team_barrier();
662: team.team_barrier();
663: /* dp <- r'*r */
664: parallel_reduce(
665: Kokkos::TeamVectorRange(team, Nblk), [=](const int idx, PetscScalar &lsum) { lsum += Rr[idx] * PetscConj(Rr[idx]); }, dpi);
666: team.team_barrier();
667: dp = PetscSqrtReal(PetscRealPart(dpi));
668: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
669: if (monitor) Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("%3d KSP Residual norm %14.12e \n", i + 1, (double)dp); });
670: #endif
671: if (dp < atol) {
672: metad->reason = KSP_CONVERGED_ATOL_NORMAL;
673: goto done;
674: }
675: if (dp / r0 < rtol) {
676: metad->reason = KSP_CONVERGED_RTOL_NORMAL;
677: goto done;
678: }
679: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
680: if (dp / r0 > dtol) {
681: metad->reason = KSP_DIVERGED_DTOL;
682: Kokkos::single(Kokkos::PerTeam(team), [=]() { printf("ERROR block %d diverged: %d it, res=%e, r_0=%e\n", team.league_rank(), i, dp, r0); });
683: goto done;
684: }
685: #else
686: if (dp / r0 > dtol) {
687: metad->reason = KSP_DIVERGED_DTOL;
688: goto done;
689: }
690: #endif
691: if (i + 1 == maxit) {
692: metad->reason = KSP_DIVERGED_ITS;
693: goto done;
694: }
695: /* z <- Br */
696: parallel_for(Kokkos::TeamVectorRange(team, Nblk), [=](int idx) {
697: Zr[idx] = Di[idx] * Rr[idx];
698: Zl[idx] = Di[idx] * Rl[idx];
699: });
700: i++;
701: team.team_barrier();
702: } while (i < maxit);
703: done:
704: // put x back into Plex order
705: parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) {
706: int rowa = ic[rowb];
707: glb_x[rowa] = XX[rowb - start];
708: });
709: metad->its = i + 1;
710: if (1) {
711: int nnz;
712: parallel_reduce(
713: Kokkos::TeamVectorRange(team, start, end), [=](const int idx, int &lsum) { lsum += (glb_Aai[idx + 1] - glb_Aai[idx]); }, nnz);
714: metad->flops = 2 * (metad->its * (10 * Nblk + 2 * nnz) + 5 * Nblk);
715: } else {
716: metad->flops = 2 * (metad->its * (10 * Nblk + 2 * 50 * Nblk) + 5 * Nblk); // guess
717: }
718: return PETSC_SUCCESS;
719: }
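/* Each BiCG iteration above applies both A (MatMult, to Pr) and A^T (MatMultTranspose, to Pl), which
   is why the transpose kernel exists; TFQMR only needs A.

   PCApply_BJKOKKOS (below): get device pointers for b, x, the AIJKokkos CSR arrays, the cached
   permutations and the inverse diagonal; then either (a) pack the blocks into batched Kokkos views and
   run the KokkosKernels TeamVectorGMRES functor, or (b) launch one team per block running
   BJSolve_BICG / BJSolve_TFQMR with shared/global work vectors; finally gather the per-block iteration
   counts and convergence reasons and call PCSetFailedReason accordingly. */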
721: // KSP solver: solve Ax = b; xout is output, bin is input
722: static PetscErrorCode PCApply_BJKOKKOS(PC pc, Vec bin, Vec xout)
723: {
724: PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;
725: Mat A = pc->pmat;
726: Mat_SeqAIJKokkos *aijkok;
728: PetscFunctionBegin;
729: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
730: if (!aijkok) aijkok = static_cast<Mat_SeqAIJKokkos *>(((Mat_MPIAIJ *)A->data)->A->spptr);
731: PetscCheck(aijkok, PetscObjectComm((PetscObject)pc), PETSC_ERR_USER, "No aijkok");
732: {
733: PetscInt maxit = jac->ksp->max_it;
734: const PetscInt conc = Kokkos::DefaultExecutionSpace().concurrency(), openmp = !!(conc < 1000), team_size = (openmp == 0 && PCBJKOKKOS_VEC_SIZE != 1) ? PCBJKOKKOS_TEAM_SIZE : 1;
735: const PetscInt nwork = jac->nwork, nBlk = jac->nBlocks;
736: PetscScalar *glb_xdata = NULL;
737: PetscReal rtol = jac->ksp->rtol, atol = jac->ksp->abstol, dtol = jac->ksp->divtol;
738: const PetscScalar *glb_idiag = jac->d_idiag_k->data(), *glb_bdata = NULL;
739: const PetscInt *glb_Aai = aijkok->i_device_data(), *glb_Aaj = aijkok->j_device_data(), *d_bid_eqOffset = jac->d_bid_eqOffset_k->data();
740: const PetscScalar *glb_Aaa = aijkok->a_device_data();
741: const PetscInt *d_isicol = jac->d_isicol_k->data(), *d_isrow = jac->d_isrow_k->data();
742: PCFailedReason pcreason;
743: KSPIndex ksp_type_idx = jac->ksp_type_idx;
744: PetscMemType mtype;
745: PetscContainer container;
746: PetscInt batch_sz; // the number of repeated DMs, [DM_e_1, DM_e_2, ..., DM_e_batch_sz, DM_i_1, ...]
747: VecScatter plex_batch = NULL; // not used
748: Vec bvec; // a copy of b for scatter (just alias to bin now)
749: PetscBool monitor = jac->monitor; // captured
750: PetscInt view_bid = jac->batch_target;
751: MatInfo info;
752: //PetscCall(MatGetOwnershipRange(A, &Istart, NULL));
753: jac->max_nits = 0;
754: if (view_bid < 0) view_bid = 0;
755: PetscCall(MatGetInfo(A, MAT_LOCAL, &info));
756: // "plex_batch_is" (if attached) provides a scatter that maps Plex I/O to/from block/field-major ordering
757: PetscCall(PetscObjectQuery((PetscObject)A, "plex_batch_is", (PetscObject *)&container));
758: if (container) {
759: PetscCall(VecDuplicate(bin, &bvec));
760: PetscCall(PetscContainerGetPointer(container, (void **)&plex_batch));
761: PetscCall(VecScatterBegin(plex_batch, bin, bvec, INSERT_VALUES, SCATTER_FORWARD));
762: PetscCall(VecScatterEnd(plex_batch, bin, bvec, INSERT_VALUES, SCATTER_FORWARD));
763: SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "No plex_batch_is -- require NO field major ordering for now");
764: } else {
765: bvec = bin;
766: }
767: // get x
768: PetscCall(VecGetArrayAndMemType(xout, &glb_xdata, &mtype));
769: #if defined(PETSC_HAVE_CUDA)
770: PetscCheck(PetscMemTypeDevice(mtype), PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "No GPU data for x %" PetscInt_FMT " != %" PetscInt_FMT, mtype, PETSC_MEMTYPE_DEVICE);
771: #endif
772: PetscCall(VecGetArrayReadAndMemType(bvec, &glb_bdata, &mtype));
773: #if defined(PETSC_HAVE_CUDA)
774: PetscCheck(PetscMemTypeDevice(mtype), PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "No GPU data for b");
775: #endif
776: // get batch size
777: PetscCall(PetscObjectQuery((PetscObject)A, "batch size", (PetscObject *)&container));
778: if (container) {
779: PetscInt *pNf = NULL;
780: PetscCall(PetscContainerGetPointer(container, (void **)&pNf));
781: batch_sz = *pNf; // number of times to repeat the DMs
782: } else batch_sz = 1;
783: PetscCheck(nBlk % batch_sz == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "batch_sz = %" PetscInt_FMT ", nBlk = %" PetscInt_FMT, batch_sz, nBlk);
784: if (ksp_type_idx == BATCH_KSP_GMRES_IDX) { // KK solver - move PETSc data into Kokkos Views, setup solver, solve, move data out of Kokkos, process metadata (convergence tests, etc.)
785: #if defined(PETSC_HAVE_KOKKOS_KERNELS_GMRES)
786: int Nsolves_team = jac->nsolves_team, fill_idx = 0;
787: int Nloc = jac->const_block_size; // same grids
788: const int Nsolves = nBlk;
789: const int nnz = (int)info.nz_used / Nsolves; // fix for variable grid size
790: if (Nsolves_team > batch_sz) Nsolves_team = batch_sz; // silently fix this
791: PetscCheck(jac->const_block_size, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "Kokkos (GMRES) solver requires constant block size (but can be made to work with species ordering or N_team==1)");
792: PetscCheck(Nsolves % Nsolves_team == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "Nsolves.mod(Nsolves_team) != 0: Nsolves = %d, Nsolves_team = %d", Nsolves, Nsolves_team);
793: PetscCheck(((int)info.nz_used) % Nsolves == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "info.nz_used.mod(Nsolves) != 0: info.nz_used = %g, Nsolves = %d", info.nz_used, Nsolves);
794: #if defined(PETSC_HAVE_CUDA)
795: nvtxRangePushA("gmres-kk");
796: #endif
797: Kokkos::View<PetscScalar **, layout, exec_space, Kokkos::MemoryTraits<Kokkos::Unmanaged>> inv_diag((PetscScalar *)glb_idiag, Nsolves, Nloc); // in correct order
798: if (!jac->rowOffsets) {
799: jac->rowOffsets = new IntView("rowOffsets", Nsolves / Nsolves_team, Nloc + 1); // same grids
800: jac->colIndices = new IntView("colIndices", Nsolves / Nsolves_team, nnz);
801: jac->batch_b = new XYType("batch rhs", Nsolves, Nloc);
802: jac->batch_x = new XYType("batch sol", Nsolves, Nloc);
803: jac->batch_values = new AMatrixValueView("batch values", Nsolves, nnz);
804: fill_idx = 1;
805: PetscCall(PetscInfo(pc, "Setup indices Nloc=%d, nnz=%d\n", Nloc, nnz));
806: }
807: IntView &rowOffsets = *jac->rowOffsets;
808: IntView &colIndices = *jac->colIndices;
809: XYType &batch_b = *jac->batch_b;
810: XYType &batch_x = *jac->batch_x;
811: AMatrixValueView &batch_values = *jac->batch_values;
813: Kokkos::deep_copy(batch_x, 0.);
814: PetscCall(PetscInfo(pc, "\tjac->n = %" PetscInt_FMT ", Nloc = %d, Nsolves = %d, nnz = %d, Nsolves_team = %d, league size = %d, maxit = %" PetscInt_FMT "\n", jac->n, Nloc, Nsolves, nnz, Nsolves_team, Nsolves / Nsolves_team, maxit));
815: Kokkos::parallel_for(
816: "rowOffsets+map", Kokkos::TeamPolicy<>(Nsolves, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) {
817: const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1];
818: if (fill_idx) {
819: if (blkID % Nsolves_team == 0) { // first matrix on this member
820: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](const int rowb) { // Nloc
821: int rowa = d_isicol[rowb];
822: int n = glb_Aai[rowa + 1] - glb_Aai[rowa];
823: rowOffsets(blkID / Nsolves_team, rowb + 1 - start) = n; // save sizes
824: });
825: }
826: }
827: // map b into field major space
828: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) {
829: int rowa = d_isicol[rowb];
830: batch_b(blkID, rowb - start) = glb_bdata[rowa];
831: });
832: });
833: Kokkos::fence();
834: if (fill_idx) {
835: Kokkos::parallel_for(
836: "prefix sum", Kokkos::TeamPolicy<>(Nsolves / Nsolves_team, 1, 1), KOKKOS_LAMBDA(const team_member team) {
837: const int graphID = team.league_rank();
838: rowOffsets(graphID, 0) = 0;
839: for (size_t i = 0; i < Nloc; ++i) rowOffsets(graphID, i + 1) += rowOffsets(graphID, i);
840: });
841: Kokkos::fence();
842: }
843: Kokkos::parallel_for(
844: "copy matrix", Kokkos::TeamPolicy<>(Nsolves /* /batch_sz */, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) {
845: const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1], graphID = blkID / Nsolves_team;
846: Kokkos::parallel_for(Kokkos::TeamThreadRange(team, start, end), [=](const int rowb) {
847: int rowa = d_isicol[rowb];
848: int n = glb_Aai[rowa + 1] - glb_Aai[rowa];
849: const PetscInt *aj = glb_Aaj + glb_Aai[rowa]; // global index
850: const PetscScalar *aa = glb_Aaa + glb_Aai[rowa];
851: Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, n), [=](const int &i) {
852: PetscScalar val = aa[i];
853: if (fill_idx && blkID % Nsolves_team == 0) colIndices(graphID, rowOffsets(graphID, rowb - start) + i) = d_isrow[aj[i]] - blkID * Nloc; // local = global - block start
854: batch_values(blkID, rowOffsets(graphID, rowb - start) + i) = val;
855: });
856: });
857: });
858: Kokkos::fence();
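// At this point the batched CRS data is assembled: systems are grouped Nsolves_team at a time, each
// group ("graph") sharing one rowOffsets/colIndices sparsity pattern, while batch_values and batch_b
// hold per-system matrix values and right-hand sides in the reordered (field-major) layout.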
859: // setup solver
860: using ScalarType = typename AMatrixValueView::non_const_value_type;
861: using MagnitudeType = typename Kokkos::Details::ArithTraits<ScalarType>::mag_type;
862: using NormViewType = Kokkos::View<MagnitudeType *, layout, exec_space>;
863: using Norm2DViewType = Kokkos::View<MagnitudeType **, layout, exec_space>;
864: using Scalar3DViewType = Kokkos::View<ScalarType ***, layout, exec_space>;
865: using IntViewType = Kokkos::View<int *, layout, exec_space>;
866: using KrylovHandleType = KokkosBatched::KrylovHandle<Norm2DViewType, IntViewType, Scalar3DViewType>;
867: const int n_iterations = maxit;
868: const int team_size = -1;
869: const int vector_length = -1;
870: const double tol = rtol;
871: const int ortho_strategy = 0;
872: KrylovHandleType handle(Nsolves, Nsolves_team, n_iterations, true);
873: handle.Arnoldi_view = Scalar3DViewType("", Nsolves, n_iterations, Nloc + n_iterations + 3);
874: // solve
875: double time = Functor_TestBatchedTeamVectorGMRES<exec_space, AMatrixValueView, IntView, XYType, KrylovHandleType>(batch_values, inv_diag, rowOffsets, colIndices, batch_x, batch_b, Nsolves_team, team_size, vector_length, n_iterations, tol, ortho_strategy, 0, handle)
876: .run(pc);
877: Kokkos::fence();
878: // get data back
879: Kokkos::parallel_for(
880: "map", Kokkos::TeamPolicy<>(Nsolves /* /batch_sz */, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) {
881: const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1]; // 0
882: // map x into Plex/PETSc
883: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start, end), [=](int rowb) {
884: int rowa = d_isicol[rowb];
885: glb_xdata[rowa] = batch_x(blkID, rowb - start);
886: });
887: });
888: // output, assuming species-major ordering - cloned from the Kokkos solvers
889: #if PCBJKOKKOS_VERBOSE_LEVEL >= 3
890: #if PCBJKOKKOS_VERBOSE_LEVEL >= 4
891: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "Iterations\n"));
892: #else
893: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "max iterations per species (gmres) :"));
894: #endif
895: for (PetscInt dmIdx = 0, s = 0, head = 0; dmIdx < jac->num_dms; dmIdx += batch_sz) {
896: for (PetscInt f = 0, idx = head; f < jac->dm_Nf[dmIdx]; f++, s++, idx++) {
897: #if PCBJKOKKOS_VERBOSE_LEVEL >= 4
898: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%2D:", s));
899: for (int bid = 0; bid < batch_sz; bid++) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3D ", handle.get_iteration_host(idx + bid * jac->dm_Nf[dmIdx])));
900: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n"));
901: #else
902: int count = 0, ii;
903: for (int bid = 0; bid < batch_sz; bid++) {
904: if ((ii = handle.get_iteration_host(idx + bid * jac->dm_Nf[dmIdx])) > count) count = ii;
905: }
906: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3d", count));
907: #endif
908: }
909: head += batch_sz * jac->dm_Nf[dmIdx];
910: }
911: #if PCBJKOKKOS_VERBOSE_LEVEL == 3
912: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n"));
913: #endif
914: #endif
915: // return error code, get max it
916: PetscInt count = 0, mbid = 0;
917: if (handle.is_converged_host()) {
918: pcreason = PC_NOERROR;
919: if (!jac->max_nits) {
920: for (int blkID = 0; blkID < nBlk; blkID++) {
921: if (handle.get_iteration_host(blkID) > jac->max_nits) {
922: jac->max_nits = handle.get_iteration_host(blkID);
923: mbid = blkID;
924: }
925: }
926: }
927: } else {
928: PetscCall(PetscPrintf(PETSC_COMM_SELF, "There is at least one system that did not converge."));
929: pcreason = PC_SUBPC_ERROR;
930: }
931: // output - assume species major order
932: for (int blkID = 0; blkID < nBlk; blkID++) {
933: if (jac->reason) { // -pc_bjkokkos_ksp_converged_reason
934: if (jac->batch_target == blkID) {
935: if (batch_sz != 1)
936: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve %s in %d iterations, batch %" PetscInt_FMT ", species %" PetscInt_FMT "\n", handle.is_converged_host(blkID) ? "converged" : "diverged", handle.get_iteration_host(blkID), blkID % batch_sz, blkID / batch_sz));
937: else PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve %s in %d iterations, block %" PetscInt_FMT "\n", handle.is_converged_host(blkID) ? "converged" : "diverged", handle.get_iteration_host(blkID), blkID));
938: } else if (jac->batch_target == -1 && handle.get_iteration_host(blkID) >= count) {
939: jac->max_nits = count = handle.get_iteration_host(blkID);
940: mbid = blkID;
941: }
942: if (!handle.is_converged_host(blkID)) PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR species %d, batch %d did not converge with %d iterations\n", (int)(blkID / batch_sz), (int)blkID % batch_sz, handle.get_iteration_host(blkID)));
943: }
944: }
945: if (jac->batch_target == -1 && jac->reason) {
946: if (batch_sz != 1)
947: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve %s in %d iterations, batch %" PetscInt_FMT ", species %" PetscInt_FMT "\n", handle.is_converged_host(mbid) ? "converged" : "diverged", jac->max_nits, mbid % batch_sz, mbid / batch_sz));
948: else PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve %s in %d iterations, block %" PetscInt_FMT "\n", handle.is_converged_host(mbid) ? "converged" : "diverged", jac->max_nits, mbid));
949: }
950: #if defined(PETSC_HAVE_CUDA)
951: nvtxRangePop();
952: #endif
953: #else
954: SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "batch GMRES not supported");
955: #endif
956: } else { // Kokkos Krylov
957: using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
958: using vect2D_scr_t = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, scr_mem_t>;
959: Kokkos::View<Batch_MetaData *, Kokkos::DefaultExecutionSpace> d_metadata("solver meta data", nBlk);
960: int stride_shared, stride_global, global_buff_words;
961: d_bid_eqOffset = jac->d_bid_eqOffset_k->data();
962: // solve each block independently
963: int scr_bytes_team_shared = 0, nShareVec = 0, nGlobBVec = 0;
964: if (jac->const_block_size) { // use shared memory for work vectors only if constant block size - todo: test efficiency loss
965: int maximum_shared_mem_size = 64000;
966: PetscDevice device;
967: PetscCall(PetscDeviceGetDefault_Internal(&device));
968: PetscCall(PetscDeviceGetAttribute(device, PETSC_DEVICE_ATTR_SIZE_T_SHARED_MEM_PER_BLOCK, &maximum_shared_mem_size));
969: stride_shared = jac->const_block_size; // captured
970: nShareVec = maximum_shared_mem_size / (jac->const_block_size * sizeof(PetscScalar)); // integer floor, number of vectors that fit in shared
971: if (nShareVec > nwork) nShareVec = nwork;
972: else nGlobBVec = nwork - nShareVec;
973: global_buff_words = jac->n * nGlobBVec;
974: scr_bytes_team_shared = jac->const_block_size * nShareVec * sizeof(PetscScalar);
975: } else {
976: scr_bytes_team_shared = 0;
977: stride_shared = 0;
978: global_buff_words = jac->n * nwork;
979: nGlobBVec = nwork; // not needed == fix
980: }
981: stride_global = jac->n; // captured
982: #if defined(PETSC_HAVE_CUDA)
983: nvtxRangePushA("batch-kokkos-solve");
984: #endif
985: Kokkos::View<PetscScalar *, Kokkos::DefaultExecutionSpace> d_work_vecs_k("workvectors", global_buff_words); // global work vectors
986: PetscCall(PetscInfo(pc, "\tn = %d. %d shared bytes/team, %d global mem bytes, rtol=%e, num blocks %d, team_size=%d, %d vector threads, %d shared vectors, %d global vectors\n", (int)jac->n, scr_bytes_team_shared, global_buff_words, rtol, (int)nBlk, (int)team_size, PCBJKOKKOS_VEC_SIZE, nShareVec, nGlobBVec));
987: PetscScalar *d_work_vecs = d_work_vecs_k.data();
988: Kokkos::parallel_for(
989: "Solve", Kokkos::TeamPolicy<Kokkos::LaunchBounds<256, 4>>(nBlk, team_size, PCBJKOKKOS_VEC_SIZE).set_scratch_size(PCBJKOKKOS_SHARED_LEVEL, Kokkos::PerTeam(scr_bytes_team_shared)), KOKKOS_LAMBDA(const team_member team) {
990: const int blkID = team.league_rank(), start = d_bid_eqOffset[blkID], end = d_bid_eqOffset[blkID + 1];
991: vect2D_scr_t work_vecs_shared(team.team_scratch(PCBJKOKKOS_SHARED_LEVEL), end - start, nShareVec);
992: PetscScalar *work_buff_shared = work_vecs_shared.data();
993: PetscScalar *work_buff_global = &d_work_vecs[start]; // block offset 'start' already applied here
994: bool print = monitor && (blkID == view_bid);
995: switch (ksp_type_idx) {
996: case BATCH_KSP_BICG_IDX:
997: static_cast<void>(BJSolve_BICG(team, glb_Aai, glb_Aaj, glb_Aaa, d_isrow, d_isicol, work_buff_global, stride_global, nShareVec, work_buff_shared, stride_shared, rtol, atol, dtol, maxit, &d_metadata[blkID], start, end, glb_idiag, glb_bdata, glb_xdata, print));
998: break;
999: case BATCH_KSP_TFQMR_IDX:
1000: static_cast<void>(BJSolve_TFQMR(team, glb_Aai, glb_Aaj, glb_Aaa, d_isrow, d_isicol, work_buff_global, stride_global, nShareVec, work_buff_shared, stride_shared, rtol, atol, dtol, maxit, &d_metadata[blkID], start, end, glb_idiag, glb_bdata, glb_xdata, print));
1001: break;
1002: case BATCH_KSP_GMRES_IDX:
1003: //BJSolve_GMRES();
1004: break;
1005: default:
1006: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1007: printf("Unknown KSP type %d\n", ksp_type_idx);
1008: #else
1009: /* void */;
1010: #endif
1011: }
1012: });
1013: Kokkos::fence();
1014: #if defined(PETSC_HAVE_CUDA)
1015: nvtxRangePop();
1016: nvtxRangePushA("Post-solve-metadata");
1017: #endif
1018: auto h_metadata = Kokkos::create_mirror(Kokkos::HostSpace::memory_space(), d_metadata);
1019: Kokkos::deep_copy(h_metadata, d_metadata);
1020: if (jac->reason) { // -pc_bjkokkos_ksp_converged_reason
1021: #if PCBJKOKKOS_VERBOSE_LEVEL >= 3
1022: #if PCBJKOKKOS_VERBOSE_LEVEL >= 4
1023: PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Iterations\n"));
1024: #endif
1025: // assume species major
1026: #if PCBJKOKKOS_VERBOSE_LEVEL < 4
1027: if (batch_sz != 1) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%s: max iterations per species:", ksp_type_idx == BATCH_KSP_BICG_IDX ? "bicg" : "tfqmr"));
1028: else PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve converged due to %s iterations ", ksp_type_idx == BATCH_KSP_BICG_IDX ? "bicg" : "tfqmr"));
1029: #endif
1030: for (PetscInt dmIdx = 0, s = 0, head = 0; dmIdx < jac->num_dms; dmIdx += batch_sz) {
1031: for (PetscInt f = 0, idx = head; f < jac->dm_Nf[dmIdx]; f++, s++, idx++) {
1032: #if PCBJKOKKOS_VERBOSE_LEVEL >= 4
1033: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%2" PetscInt_FMT ":", s));
1034: for (int bid = 0; bid < batch_sz; bid++) PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3" PetscInt_FMT " ", h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its));
1035: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n"));
1036: #else
1037: PetscInt count = 0;
1038: for (int bid = 0; bid < batch_sz; bid++) {
1039: if (h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its > count) count = h_metadata[idx + bid * jac->dm_Nf[dmIdx]].its;
1040: }
1041: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "%3" PetscInt_FMT " ", count));
1042: #endif
1043: }
1044: head += batch_sz * jac->dm_Nf[dmIdx];
1045: }
1046: #if PCBJKOKKOS_VERBOSE_LEVEL == 3
1047: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\n"));
1048: #endif
1049: #endif
1050: PetscInt count = 0, mbid = 0;
1051: for (int blkID = 0; blkID < nBlk; blkID++) {
1052: PetscCall(PetscLogGpuFlops((PetscLogDouble)h_metadata[blkID].flops));
1053: #if PCBJKOKKOS_VERBOSE_LEVEL < 3
1054: if (jac->batch_target == blkID) {
1055: if (batch_sz != 1)
1056: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve converged due to %s iterations %d, batch %" PetscInt_FMT ", species %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[blkID].reason], h_metadata[blkID].its, blkID % batch_sz, blkID / batch_sz));
1057: else PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve converged due to %s iterations %d, block %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[blkID].reason], h_metadata[blkID].its, blkID));
1058: } else if (jac->batch_target == -1 && h_metadata[blkID].its >= count) {
1059: jac->max_nits = count = h_metadata[blkID].its;
1060: mbid = blkID;
1061: }
1062: #endif
1063: #if PCBJKOKKOS_VERBOSE_LEVEL > 0
1064: if (h_metadata[blkID].reason < 0) {
1065: PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR reason=%s, its=%" PetscInt_FMT ". species %" PetscInt_FMT ", batch %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[blkID].reason], h_metadata[blkID].its, blkID / batch_sz, blkID % batch_sz));
1066: }
1067: #endif
1068: }
1069: if (jac->batch_target == -1) {
1070: if (batch_sz != 1)
1071: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve converged due to %s iterations %d, batch %" PetscInt_FMT ", species %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[mbid].reason], h_metadata[mbid].its, mbid % batch_sz, mbid / batch_sz));
1072: else PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), " Linear solve converged due to %s iterations %d, block %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[mbid].reason], h_metadata[mbid].its, mbid));
1073: }
1074: }
1075: for (int blkID = 0; blkID < nBlk; blkID++) {
1076: PetscCall(PetscLogGpuFlops((PetscLogDouble)h_metadata[blkID].flops));
1077: #if PCBJKOKKOS_VERBOSE_LEVEL > 0
1078: if (h_metadata[blkID].reason < 0) {
1079: PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR reason=%s, its=%" PetscInt_FMT ". species %" PetscInt_FMT ", batch %" PetscInt_FMT "\n", KSPConvergedReasons[h_metadata[blkID].reason], h_metadata[blkID].its, blkID / batch_sz, blkID % batch_sz));
1080: }
1081: #endif
1082: }
1083: {
1084: int errsum;
1085: Kokkos::parallel_reduce(
1086: nBlk,
1087: KOKKOS_LAMBDA(const int idx, int &lsum) {
1088: if (d_metadata[idx].reason < 0) ++lsum;
1089: },
1090: errsum);
1091: pcreason = errsum ? PC_SUBPC_ERROR : PC_NOERROR;
1092: if (!errsum && !jac->max_nits) { // set max its to give back to top KSP
1093: for (int blkID = 0; blkID < nBlk; blkID++) {
1094: if (h_metadata[blkID].its > jac->max_nits) jac->max_nits = h_metadata[blkID].its;
1095: }
1096: } else if (errsum) {
1097: PetscCall(PetscPrintf(PETSC_COMM_SELF, "ERROR Kokkos batch solver did not converge in all solves\n"));
1098: }
1099: }
1100: #if defined(PETSC_HAVE_CUDA)
1101: nvtxRangePop();
1102: #endif
1103: } // end of Kokkos (not Kernels) solvers block
1104: PetscCall(VecRestoreArrayAndMemType(xout, &glb_xdata));
1105: PetscCall(VecRestoreArrayReadAndMemType(bvec, &glb_bdata));
1106: PetscCall(PCSetFailedReason(pc, pcreason));
1107: // map back to Plex space - not used
1108: if (plex_batch) {
1109: PetscCall(VecCopy(xout, bvec));
1110: PetscCall(VecScatterBegin(plex_batch, bvec, xout, INSERT_VALUES, SCATTER_REVERSE));
1111: PetscCall(VecScatterEnd(plex_batch, bvec, xout, INSERT_VALUES, SCATTER_REVERSE));
1112: PetscCall(VecDestroy(&bvec));
1113: }
1114: } // whole 'have aijkok' block
1115: PetscFunctionReturn(PETSC_SUCCESS);
1116: }
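/* PCSetUp_BJKOKKOS (below): reorder the sequential (diagonal) part of the matrix with
   MatGetOrdering(MATORDERINGRCM) so each block becomes a contiguous index range, copy the resulting
   permutations to the device, determine the block layout either from an attached DMCOMPOSITE or by
   walking the reordered sparsity pattern, create the options-only inner KSP to select the batched
   kernel and its work-vector count, and allocate d_idiag_k for the inverse-diagonal (Jacobi)
   preconditioner. */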
1118: static PetscErrorCode PCSetUp_BJKOKKOS(PC pc)
1119: {
1120: PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;
1121: Mat A = pc->pmat, Aseq = A; // use filtered block matrix, really "P"
1122: Mat_SeqAIJKokkos *aijkok;
1123: PetscBool flg;
1125: PetscFunctionBegin;
1126: PetscCheck(!pc->useAmat, PetscObjectComm((PetscObject)pc), PETSC_ERR_SUP, "No support for using 'use_amat'");
1127: PetscCheck(A, PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "No matrix - A is used above");
1128: PetscCall(PetscObjectTypeCompareAny((PetscObject)A, &flg, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, MATAIJKOKKOS, ""));
1129: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "must use '-[dm_]mat_type aijkokkos -[dm_]vec_type kokkos' for -pc_type bjkokkos");
1130: aijkok = static_cast<Mat_SeqAIJKokkos *>(Aseq->spptr);
1131: if (!aijkok) {
1132: PetscCall(MatMPIAIJGetSeqAIJ(A, &Aseq, NULL, NULL));
1133: aijkok = static_cast<Mat_SeqAIJKokkos *>(Aseq->spptr);
1134: }
1135: PetscCheck((aijkok), PetscObjectComm((PetscObject)A), PETSC_ERR_USER, "No aijkok");
1136: {
1137: PetscInt Istart, Iend;
1138: PetscMPIInt rank;
1139: PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
1140: PetscCall(MatGetOwnershipRange(A, &Istart, &Iend));
1141: if (!jac->vec_diag) {
1142: Vec *subX = NULL;
1143: DM pack, *subDM = NULL;
1144: PetscInt nDMs, n, *block_sizes = NULL;
1145: IS isrow, isicol;
1146: { // Permute the matrix to get a block diagonal system: d_isrow_k, d_isicol_k
1147: MatOrderingType rtype;
1148: const PetscInt *rowindices, *icolindices;
1149: rtype = MATORDERINGRCM;
1150: // get the permutation and invert it. Should we convert to local indices?
1151: PetscCall(MatGetOrdering(Aseq, rtype, &isrow, &isicol)); // only seems to work for seq matrix
1152: PetscCall(ISDestroy(&isrow));
1153: PetscCall(ISInvertPermutation(isicol, PETSC_DECIDE, &isrow)); // THIS IS BACKWARD -- isrow is inverse
1154: // if (rank==1) PetscCall(ISView(isicol, PETSC_VIEWER_STDOUT_SELF));
1155: if (1) {
1156: Mat mat_block_order; // debug
1157: PetscCall(ISShift(isicol, Istart, isicol));
1158: PetscCall(MatCreateSubMatrix(A, isicol, isicol, MAT_INITIAL_MATRIX, &mat_block_order));
1159: PetscCall(ISShift(isicol, -Istart, isicol));
1160: PetscCall(MatViewFromOptions(mat_block_order, NULL, "-ksp_batch_reorder_view"));
1161: PetscCall(MatDestroy(&mat_block_order));
1162: }
1163: PetscCall(ISGetIndices(isrow, &rowindices)); // local idx
1164: PetscCall(ISGetIndices(isicol, &icolindices));
1165: const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_isrow_k((PetscInt *)rowindices, A->rmap->n);
1166: const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_isicol_k((PetscInt *)icolindices, A->rmap->n);
1167: jac->d_isrow_k = new Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_isrow_k));
1168: jac->d_isicol_k = new Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_isicol_k));
1169: Kokkos::deep_copy(*jac->d_isrow_k, h_isrow_k);
1170: Kokkos::deep_copy(*jac->d_isicol_k, h_isicol_k);
1171: PetscCall(ISRestoreIndices(isrow, &rowindices));
1172: PetscCall(ISRestoreIndices(isicol, &icolindices));
1173: // if (rank==1) PetscCall(ISView(isicol, PETSC_VIEWER_STDOUT_SELF));
1174: }
1175: // get block sizes & allocate vec_diag
1176: PetscCall(PCGetDM(pc, &pack));
1177: if (pack) {
1178: PetscCall(PetscObjectTypeCompare((PetscObject)pack, DMCOMPOSITE, &flg));
1179: if (flg) {
1180: PetscCall(DMCompositeGetNumberDM(pack, &nDMs));
1181: PetscCall(DMCreateGlobalVector(pack, &jac->vec_diag));
1182: } else pack = NULL; // flag for no DM
1183: }
1184: if (!jac->vec_diag) { // get 'nDMs' and sizes 'block_sizes' w/o DMComposite. User could provide ISs (todo)
1185: PetscInt bsrt, bend, ncols, ntot = 0;
1186: const PetscInt *colsA, nloc = Iend - Istart;
1187: const PetscInt *rowindices, *icolindices;
1188: PetscCall(PetscMalloc1(nloc, &block_sizes)); // very inefficient, too big
1189: PetscCall(ISGetIndices(isrow, &rowindices));
1190: PetscCall(ISGetIndices(isicol, &icolindices));
1191: nDMs = 0;
1192: bsrt = 0;
1193: bend = 1;
1194: for (PetscInt row_B = 0; row_B < nloc; row_B++) { // for all rows in block diagonal space
1195: PetscInt rowA = icolindices[row_B], minj = PETSC_MAX_INT, maxj = 0;
1196: //PetscCall(PetscPrintf(PETSC_COMM_SELF, "\t[%d] rowA = %d\n",rank,rowA));
1197: PetscCall(MatGetRow(Aseq, rowA, &ncols, &colsA, NULL)); // not sorted in permutation
1198: PetscCheck(ncols, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "Empty row not supported: %" PetscInt_FMT "\n", row_B);
1199: for (PetscInt colj = 0; colj < ncols; colj++) {
1200: PetscInt colB = rowindices[colsA[colj]]; // use local idx
1201: //PetscCall(PetscPrintf(PETSC_COMM_SELF, "\t\t[%d] colB = %d\n",rank,colB));
1202: PetscCheck(colB >= 0 && colB < nloc, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "colB < 0: %" PetscInt_FMT "\n", colB);
1203: if (colB > maxj) maxj = colB;
1204: if (colB < minj) minj = colB;
1205: }
1206: PetscCall(MatRestoreRow(Aseq, rowA, &ncols, &colsA, NULL));
1207: if (minj >= bend) { // this row's smallest column is past the current block's end -- close the block and start a new one
1208: //PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "\t\t finish block %d, N loc = %d (%d,%d)\n", nDMs+1, bend - bsrt,bsrt,bend));
1209: block_sizes[nDMs] = bend - bsrt;
1210: ntot += block_sizes[nDMs];
1211: PetscCheck(minj == bend, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "minj != bend: %" PetscInt_FMT " != %" PetscInt_FMT "\n", minj, bend);
1212: bsrt = bend;
1213: bend++; // start with size 1 in new block
1214: nDMs++;
1215: }
1216: if (maxj + 1 > bend) bend = maxj + 1;
1217: PetscCheck(minj >= bsrt || row_B == Iend - 1, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "%" PetscInt_FMT ") minj < bsrt: %" PetscInt_FMT " < %" PetscInt_FMT "\n", rowA, minj, bsrt);
1218: //PetscCall(PetscPrintf(PETSC_COMM_SELF, "[%d] %d) row %d.%d) cols %d : %d ; bsrt = %d, bend = %d\n",rank,row_B,nDMs,rowA,minj,maxj,bsrt,bend));
1219: }
1220: // do last block
1221: //PetscCall(PetscPrintf(PETSC_COMM_SELF, "\t\t\t [%d] finish block %d, N loc = %d (%d,%d)\n", rank, nDMs+1, bend - bsrt,bsrt,bend));
1222: block_sizes[nDMs] = bend - bsrt;
1223: ntot += block_sizes[nDMs];
1224: nDMs++;
1225: // cleanup
1226: PetscCheck(ntot == nloc, PetscObjectComm((PetscObject)pc), PETSC_ERR_ARG_WRONG, "n total != n local: %" PetscInt_FMT " != %" PetscInt_FMT "\n", ntot, nloc);
1227: PetscCall(ISRestoreIndices(isrow, &rowindices));
1228: PetscCall(ISRestoreIndices(isicol, &icolindices));
1229: PetscCall(PetscRealloc(sizeof(PetscInt) * nDMs, &block_sizes));
1230: PetscCall(MatCreateVecs(A, &jac->vec_diag, NULL));
1231: PetscCall(PetscInfo(pc, "Set up matrix-based metadata (no DMComposite attached to the PC): %" PetscInt_FMT " subdomains\n", nDMs));
1232: }
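// Worked example (illustrative, not from the source): if, in the permuted ordering, rows 0..2
// only touch columns 0..2 and rows 3..6 only touch columns 3..6 (each row containing its
// diagonal), the scan above yields nDMs = 2 and block_sizes = {3, 4}, and ntot = 7 must equal nloc.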
1233: PetscCall(ISDestroy(&isrow));
1234: PetscCall(ISDestroy(&isicol));
1235: jac->num_dms = nDMs;
1236: PetscCall(VecGetLocalSize(jac->vec_diag, &n));
1237: jac->n = n;
1238: jac->d_idiag_k = new Kokkos::View<PetscScalar *, Kokkos::LayoutRight>("idiag", n);
1239: // options
1240: PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc));
1241: PetscCall(KSPSetFromOptions(jac->ksp));
1242: PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPBICG, ""));
1243: if (flg) {
1244: jac->ksp_type_idx = BATCH_KSP_BICG_IDX;
1245: jac->nwork = 7;
1246: } else {
1247: PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPTFQMR, ""));
1248: if (flg) {
1249: jac->ksp_type_idx = BATCH_KSP_TFQMR_IDX;
1250: jac->nwork = 10;
1251: } else {
1252: PetscCall(PetscObjectTypeCompareAny((PetscObject)jac->ksp, &flg, KSPGMRES, ""));
1253: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unsupported batch ksp type");
1254: jac->ksp_type_idx = BATCH_KSP_GMRES_IDX;
1255: jac->nwork = 0;
1256: }
1257: }
1258: PetscOptionsBegin(PetscObjectComm((PetscObject)jac->ksp), ((PetscObject)jac->ksp)->prefix, "Options for Kokkos batch solver", "none");
1259: PetscCall(PetscOptionsBool("-ksp_converged_reason", "", "bjkokkos.kokkos.cxx.c", jac->reason, &jac->reason, NULL));
1260: PetscCall(PetscOptionsBool("-ksp_monitor", "", "bjkokkos.kokkos.cxx.c", jac->monitor, &jac->monitor, NULL));
1261: PetscCall(PetscOptionsInt("-ksp_batch_target", "", "bjkokkos.kokkos.cxx.c", jac->batch_target, &jac->batch_target, NULL));
1262: PetscCall(PetscOptionsInt("-ksp_batch_nsolves_team", "", "bjkokkos.kokkos.cxx.c", jac->nsolves_team, &jac->nsolves_team, NULL));
1263: PetscCheck(jac->batch_target < jac->num_dms, PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "-ksp_batch_target (%" PetscInt_FMT ") >= number of DMs (%" PetscInt_FMT ")", jac->batch_target, jac->num_dms);
1264: PetscOptionsEnd();
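// Illustrative command line (assuming the inner KSP uses the -pc_bjkokkos_ prefix documented in
// the PCBJKOKKOS manual page below; adjust to your actual prefix):
//   -ksp_type preonly -pc_type bjkokkos -pc_bjkokkos_ksp_type tfqmr
//   -pc_bjkokkos_ksp_converged_reason -pc_bjkokkos_ksp_batch_target 0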
1265: // get blocks - jac->d_bid_eqOffset_k
1266: if (pack) {
1267: PetscCall(PetscMalloc(sizeof(*subX) * nDMs, &subX));
1268: PetscCall(PetscMalloc(sizeof(*subDM) * nDMs, &subDM));
1269: }
1270: PetscCall(PetscMalloc(sizeof(*jac->dm_Nf) * nDMs, &jac->dm_Nf));
1271: PetscCall(PetscInfo(pc, "Have %" PetscInt_FMT " blocks, n=%" PetscInt_FMT " rtol=%g type = %s\n", nDMs, n, (double)jac->ksp->rtol, ((PetscObject)jac->ksp)->type_name));
1272: if (pack) PetscCall(DMCompositeGetEntriesArray(pack, subDM));
1273: jac->nBlocks = 0;
1274: for (PetscInt ii = 0; ii < nDMs; ii++) {
1275: PetscInt Nf;
1276: if (subDM) {
1277: DM dm = subDM[ii];
1278: PetscSection section;
1279: PetscCall(DMGetLocalSection(dm, &section));
1280: PetscCall(PetscSectionGetNumFields(section, &Nf));
1281: } else Nf = 1;
1282: jac->nBlocks += Nf;
1283: #if PCBJKOKKOS_VERBOSE_LEVEL <= 2
1284: if (ii == 0) PetscCall(PetscInfo(pc, "%" PetscInt_FMT ") %" PetscInt_FMT " blocks (%" PetscInt_FMT " total)\n", ii, Nf, jac->nBlocks));
1285: #else
1286: PetscCall(PetscInfo(pc, "%" PetscInt_FMT ") %" PetscInt_FMT " blocks (%" PetscInt_FMT " total)\n", ii, Nf, jac->nBlocks));
1287: #endif
1288: jac->dm_Nf[ii] = Nf;
1289: }
1290: { // d_bid_eqOffset_k
1291: Kokkos::View<PetscInt *, Kokkos::LayoutRight, Kokkos::HostSpace> h_block_offsets("block_offsets", jac->nBlocks + 1);
1292: if (pack) PetscCall(DMCompositeGetAccessArray(pack, jac->vec_diag, nDMs, NULL, subX));
1293: h_block_offsets[0] = 0;
1294: jac->const_block_size = -1;
1295: for (PetscInt ii = 0, idx = 0; ii < nDMs; ii++) {
1296: PetscInt nloc, nblk;
1297: if (pack) PetscCall(VecGetSize(subX[ii], &nloc));
1298: else nloc = block_sizes[ii];
1299: nblk = nloc / jac->dm_Nf[ii];
1300: PetscCheck(nloc % jac->dm_Nf[ii] == 0, PetscObjectComm((PetscObject)pc), PETSC_ERR_USER, "nloc %% dm_Nf[ii] (%" PetscInt_FMT ") != 0", nloc % jac->dm_Nf[ii]);
1301: for (PetscInt jj = 0; jj < jac->dm_Nf[ii]; jj++, idx++) {
1302: h_block_offsets[idx + 1] = h_block_offsets[idx] + nblk;
1303: #if PCBJKOKKOS_VERBOSE_LEVEL <= 2
1304: if (idx == 0) PetscCall(PetscInfo(pc, "\t%" PetscInt_FMT ") Add block with %" PetscInt_FMT " equations of %" PetscInt_FMT "\n", idx + 1, nblk, jac->nBlocks));
1305: #else
1306: PetscCall(PetscInfo(pc, "\t%" PetscInt_FMT ") Add block with %" PetscInt_FMT " equations of %" PetscInt_FMT "\n", idx + 1, nblk, jac->nBlocks));
1307: #endif
1308: if (jac->const_block_size == -1) jac->const_block_size = nblk;
1309: else if (jac->const_block_size > 0 && jac->const_block_size != nblk) jac->const_block_size = 0;
1310: }
1311: }
1312: if (pack) {
1313: PetscCall(DMCompositeRestoreAccessArray(pack, jac->vec_diag, nDMs, NULL, subX)); // must match the count passed to DMCompositeGetAccessArray
1314: PetscCall(PetscFree(subX));
1315: PetscCall(PetscFree(subDM));
1316: }
1317: jac->d_bid_eqOffset_k = new Kokkos::View<PetscInt *, Kokkos::LayoutRight>(Kokkos::create_mirror(Kokkos::DefaultExecutionSpace::memory_space(), h_block_offsets));
1318: Kokkos::deep_copy(*jac->d_bid_eqOffset_k, h_block_offsets);
1319: }
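// Illustration (hypothetical sizes): with nDMs = 2, dm_Nf = {2, 1} and per-DM sizes {8, 5},
// the offsets become h_block_offsets = {0, 4, 8, 13}; const_block_size ends up 0 here because
// the block sizes (4, 4, 5) are not all equal.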
1320: if (!pack) PetscCall(PetscFree(block_sizes));
1321: }
1322: { // get jac->d_idiag_k (PC setup),
1323: const PetscInt *d_ai = aijkok->i_device_data(), *d_aj = aijkok->j_device_data();
1324: const PetscScalar *d_aa = aijkok->a_device_data();
1325: const PetscInt conc = Kokkos::DefaultExecutionSpace().concurrency(), openmp = !!(conc < 1000), team_size = (openmp == 0 && PCBJKOKKOS_VEC_SIZE != 1) ? PCBJKOKKOS_TEAM_SIZE : 1;
1326: const PetscInt *d_bid_eqOffset = jac->d_bid_eqOffset_k->data(), *r = jac->d_isrow_k->data(), *ic = jac->d_isicol_k->data();
1327: PetscScalar *d_idiag = jac->d_idiag_k->data();
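// Kernel structure (descriptive note): one team per block (league_rank() == block ID), a
// TeamThreadRange over the rows of that block, and a ThreadVectorRange reduction over each
// row's nonzeros that stores 1/diagonal into d_idiag and counts how many diagonal hits occurred.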
1328: Kokkos::parallel_for(
1329: "Diag", Kokkos::TeamPolicy<>(jac->nBlocks, team_size, PCBJKOKKOS_VEC_SIZE), KOKKOS_LAMBDA(const team_member team) {
1330: const PetscInt blkID = team.league_rank();
1331: Kokkos::parallel_for(Kokkos::TeamThreadRange(team, d_bid_eqOffset[blkID], d_bid_eqOffset[blkID + 1]), [=](const int rowb) {
1332: const PetscInt rowa = ic[rowb], ai = d_ai[rowa], *aj = d_aj + ai; // grab original data
1333: const PetscScalar *aa = d_aa + ai;
1334: const PetscInt nrow = d_ai[rowa + 1] - ai;
1335: int found;
1336: Kokkos::parallel_reduce(
1337: Kokkos::ThreadVectorRange(team, nrow),
1338: [=](const int &j, int &count) {
1339: const PetscInt colb = r[aj[j]];
1340: if (colb == rowb) {
1341: d_idiag[rowb] = 1. / aa[j];
1342: count++;
1343: }
1344: },
1345: found);
1346: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1347: if (found != 1) Kokkos::single(Kokkos::PerThread(team), [=]() { printf("ERROR: row %d, found = %d\n", rowb, found); });
1348: #endif
1349: });
1350: });
1351: }
1352: }
1353: PetscFunctionReturn(PETSC_SUCCESS);
1354: }
1356: /* Reset the PC state; safe to call even if the PC was never set up */
1357: static PetscErrorCode PCReset_BJKOKKOS(PC pc)
1358: {
1359: PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;
1361: PetscFunctionBegin;
1362: PetscCall(KSPDestroy(&jac->ksp));
1363: PetscCall(VecDestroy(&jac->vec_diag));
1364: if (jac->d_bid_eqOffset_k) delete jac->d_bid_eqOffset_k;
1365: if (jac->d_idiag_k) delete jac->d_idiag_k;
1366: if (jac->d_isrow_k) delete jac->d_isrow_k;
1367: if (jac->d_isicol_k) delete jac->d_isicol_k;
1368: jac->d_bid_eqOffset_k = NULL;
1369: jac->d_idiag_k = NULL;
1370: jac->d_isrow_k = NULL;
1371: jac->d_isicol_k = NULL;
1372: PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSGetKSP_C", NULL)); // not published now (causes configure errors)
1373: PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSSetKSP_C", NULL));
1374: PetscCall(PetscFree(jac->dm_Nf));
1375: jac->dm_Nf = NULL;
1376: if (jac->rowOffsets) delete jac->rowOffsets;
1377: if (jac->colIndices) delete jac->colIndices;
1378: if (jac->batch_b) delete jac->batch_b;
1379: if (jac->batch_x) delete jac->batch_x;
1380: if (jac->batch_values) delete jac->batch_values;
1381: jac->rowOffsets = NULL;
1382: jac->colIndices = NULL;
1383: jac->batch_b = NULL;
1384: jac->batch_x = NULL;
1385: jac->batch_values = NULL;
1387: PetscFunctionReturn(PETSC_SUCCESS);
1388: }
1390: static PetscErrorCode PCDestroy_BJKOKKOS(PC pc)
1391: {
1392: PetscFunctionBegin;
1393: PetscCall(PCReset_BJKOKKOS(pc));
1394: PetscCall(PetscFree(pc->data));
1395: PetscFunctionReturn(PETSC_SUCCESS);
1396: }
1398: static PetscErrorCode PCView_BJKOKKOS(PC pc, PetscViewer viewer)
1399: {
1400: PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;
1401: PetscBool iascii;
1403: PetscFunctionBegin;
1404: if (!jac->ksp) PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc));
1405: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1406: if (iascii) {
1407: PetscCall(PetscViewerASCIIPrintf(viewer, " Batched device linear solver: Krylov (KSP) method with Jacobi preconditioning\n"));
1408: PetscCall(PetscViewerASCIIPrintf(viewer, "\t\tnwork = %" PetscInt_FMT ", rel tol = %e, abs tol = %e, div tol = %e, max it = %" PetscInt_FMT ", type = %s\n", jac->nwork, (double)jac->ksp->rtol, (double)jac->ksp->abstol, (double)jac->ksp->divtol, jac->ksp->max_it,
1409: ((PetscObject)jac->ksp)->type_name));
1410: }
1411: PetscFunctionReturn(PETSC_SUCCESS);
1412: }
1414: static PetscErrorCode PCSetFromOptions_BJKOKKOS(PC pc, PetscOptionItems *PetscOptionsObject)
1415: {
1416: PetscFunctionBegin;
1417: PetscOptionsHeadBegin(PetscOptionsObject, "PC BJKOKKOS options");
1418: PetscOptionsHeadEnd();
1419: PetscFunctionReturn(PETSC_SUCCESS);
1420: }
1422: static PetscErrorCode PCBJKOKKOSSetKSP_BJKOKKOS(PC pc, KSP ksp)
1423: {
1424: PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;
1426: PetscFunctionBegin;
1427: PetscCall(PetscObjectReference((PetscObject)ksp));
1428: PetscCall(KSPDestroy(&jac->ksp));
1429: jac->ksp = ksp;
1430: PetscFunctionReturn(PETSC_SUCCESS);
1431: }
1433: /*@C
1434: PCBJKOKKOSSetKSP - Sets the `KSP` context for `PCBJKOKKOS`
1436: Collective
1438: Input Parameters:
1439: + pc - the `PCBJKOKKOS` preconditioner context
1440: - ksp - the `KSP` solver
1442: Level: advanced
1444: Notes:
1445: The `PC` and the `KSP` must have the same communicator
1447: If the `PC` is not `PCBJKOKKOS` this function returns without doing anything
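Example Usage (a minimal sketch; the inner `KSP` created here is illustrative):
.vb
   KSP inner;
   PetscCall(KSPCreate(PetscObjectComm((PetscObject)pc), &inner));
   PetscCall(KSPSetType(inner, KSPTFQMR));
   PetscCall(PCBJKOKKOSSetKSP(pc, inner));
   PetscCall(KSPDestroy(&inner)); // the PC keeps its own reference
.ve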
1449: .seealso: `PCBJKOKKOSGetKSP()`, `PCBJKOKKOS`
1450: @*/
1451: PetscErrorCode PCBJKOKKOSSetKSP(PC pc, KSP ksp)
1452: {
1453: PetscFunctionBegin;
1456: PetscCheckSameComm(pc, 1, ksp, 2);
1457: PetscTryMethod(pc, "PCBJKOKKOSSetKSP_C", (PC, KSP), (pc, ksp));
1458: PetscFunctionReturn(PETSC_SUCCESS);
1459: }
1461: static PetscErrorCode PCBJKOKKOSGetKSP_BJKOKKOS(PC pc, KSP *ksp)
1462: {
1463: PC_PCBJKOKKOS *jac = (PC_PCBJKOKKOS *)pc->data;
1465: PetscFunctionBegin;
1466: if (!jac->ksp) PetscCall(PCBJKOKKOSCreateKSP_BJKOKKOS(pc));
1467: *ksp = jac->ksp;
1468: PetscFunctionReturn(PETSC_SUCCESS);
1469: }
1471: /*@C
1472: PCBJKOKKOSGetKSP - Gets the `KSP` context for the `PCBJKOKKOS` preconditioner
1474: Not Collective, but the `KSP` returned is parallel if the `PC` was parallel
1476: Input Parameter:
1477: . pc - the preconditioner context
1479: Output Parameter:
1480: . ksp - the `KSP` solver
1482: Level: advanced
1484: Notes:
1485: You must call `KSPSetUp()` before calling `PCBJKOKKOSGetKSP()`.
1487: If the `PC` is not a `PCBJKOKKOS` object it raises an error
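Example Usage (a minimal sketch; assumes `pc` is a set-up `PCBJKOKKOS`):
.vb
   KSP inner;
   PetscCall(PCBJKOKKOSGetKSP(pc, &inner));
   PetscCall(KSPSetTolerances(inner, 1.e-8, PETSC_DEFAULT, PETSC_DEFAULT, 100));
.ve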
1489: .seealso: `PCBJKOKKOS`, `PCBJKOKKOSSetKSP()`
1490: @*/
1491: PetscErrorCode PCBJKOKKOSGetKSP(PC pc, KSP *ksp)
1492: {
1493: PetscFunctionBegin;
1496: PetscUseMethod(pc, "PCBJKOKKOSGetKSP_C", (PC, KSP *), (pc, ksp));
1497: PetscFunctionReturn(PETSC_SUCCESS);
1498: }
1500: /*MC
1501: PCBJKOKKOS - Defines a preconditioner that applies a Krylov solver and preconditioner to the blocks in a `MATSEQAIJ` matrix on the GPU using Kokkos
1503: Options Database Key:
1504: . -pc_bjkokkos_ - options prefix for its `KSP` options
1506: Level: intermediate
1508: Note:
1509: Intended for use with `-ksp_type preonly` so that the batched block solves run entirely on the device and no Krylov computation is done on the CPU
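Example Usage (an illustrative command line; -pc_bjkokkos_ forwards options to the inner `KSP`):
.vb
   -ksp_type preonly -pc_type bjkokkos -pc_bjkokkos_ksp_type tfqmr -pc_bjkokkos_ksp_rtol 1.e-6
.ve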
1511: Developer Notes:
1512: The documentation is incomplete. Is this a block Jacobi preconditioner?
1514: Why does it have its own `KSP`? Where is the `KSP` run if used with -ksp_type preonly?
1516: .seealso: `PCCreate()`, `PCSetType()`, `PCType`, `PC`, `PCBJACOBI`,
1517: `PCSHELL`, `PCCOMPOSITE`, `PCSetUseAmat()`, `PCBJKOKKOSGetKSP()`
1518: M*/
1520: PETSC_EXTERN PetscErrorCode PCCreate_BJKOKKOS(PC pc)
1521: {
1522: PC_PCBJKOKKOS *jac;
1524: PetscFunctionBegin;
1525: PetscCall(PetscNew(&jac));
1526: pc->data = (void *)jac;
1528: jac->ksp = NULL;
1529: jac->vec_diag = NULL;
1530: jac->d_bid_eqOffset_k = NULL;
1531: jac->d_idiag_k = NULL;
1532: jac->d_isrow_k = NULL;
1533: jac->d_isicol_k = NULL;
1534: jac->nBlocks = 1;
1535: jac->max_nits = 0;
1537: PetscCall(PetscMemzero(pc->ops, sizeof(struct _PCOps)));
1538: pc->ops->apply = PCApply_BJKOKKOS;
1539: pc->ops->applytranspose = NULL;
1540: pc->ops->setup = PCSetUp_BJKOKKOS;
1541: pc->ops->reset = PCReset_BJKOKKOS;
1542: pc->ops->destroy = PCDestroy_BJKOKKOS;
1543: pc->ops->setfromoptions = PCSetFromOptions_BJKOKKOS;
1544: pc->ops->view = PCView_BJKOKKOS;
1546: jac->rowOffsets = NULL;
1547: jac->colIndices = NULL;
1548: jac->batch_b = NULL;
1549: jac->batch_x = NULL;
1550: jac->batch_values = NULL;
1552: PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSGetKSP_C", PCBJKOKKOSGetKSP_BJKOKKOS));
1553: PetscCall(PetscObjectComposeFunction((PetscObject)pc, "PCBJKOKKOSSetKSP_C", PCBJKOKKOSSetKSP_BJKOKKOS));
1554: PetscFunctionReturn(PETSC_SUCCESS);
1555: }