Actual source code: landaucu.cu

/*
  Implements the Landau kernel
*/
#include <petscconf.h>

#include <petsclandau.h>
#if defined(PETSC_HAVE_CUDA_CLANG)
  #define LANDAU_NOT_IMPLEMENTED SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not supported with CLANG")
PetscErrorCode LandauCUDAJacobian(DM[], const PetscInt, const PetscInt, const PetscInt, const PetscInt[], PetscReal[], PetscScalar[], const PetscScalar[], const LandauStaticData *, const PetscReal, const PetscLogEvent[], const PetscInt[], const PetscInt[], Mat[], Mat)
{
  LANDAU_NOT_IMPLEMENTED;
}
PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps *, pointInterpolationP4est (*)[LANDAU_MAX_Q_FACE], PetscInt[], PetscInt, PetscInt)
{
  LANDAU_NOT_IMPLEMENTED;
}
PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps *, PetscInt)
{
  LANDAU_NOT_IMPLEMENTED;
}
PetscErrorCode LandauCUDAStaticDataSet(DM, const PetscInt, const PetscInt, const PetscInt, PetscInt[], PetscInt[], PetscInt[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], LandauStaticData *)
{
  LANDAU_NOT_IMPLEMENTED;
}
PetscErrorCode LandauCUDAStaticDataClear(LandauStaticData *)
{
  LANDAU_NOT_IMPLEMENTED;
}
#else
  #include <petsc/private/dmpleximpl.h>
  #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  #include <../src/mat/impls/aij/seq/aij.h>
  #include <petscmat.h>
  #include <petscdevice_cuda.h>

  #include "../land_tensors.h"
  #include <petscaijdevice.h>

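// Copy the host-side assembly maps for one grid to the device: the c_maps and
// gIdx arrays are deep-copied, and a device-resident copy of the whole
// P4estVertexMaps struct is cached in maps[grid].d_self.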
PETSC_EXTERN PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps maps[], pointInterpolationP4est (*pointMaps)[LANDAU_MAX_Q_FACE], PetscInt Nf[], PetscInt, PetscInt grid)
{
  P4estVertexMaps h_maps;
  PetscFunctionBegin;
  h_maps.num_elements = maps[grid].num_elements;
  h_maps.num_face     = maps[grid].num_face;
  h_maps.num_reduced  = maps[grid].num_reduced;
  h_maps.deviceType   = maps[grid].deviceType;
  h_maps.Nf           = Nf[grid];
  h_maps.numgrids     = maps[grid].numgrids;
  PetscCallCUDA(cudaMalloc((void **)&h_maps.c_maps, maps[grid].num_reduced * sizeof *pointMaps));
  PetscCallCUDA(cudaMemcpy(h_maps.c_maps, maps[grid].c_maps, maps[grid].num_reduced * sizeof *pointMaps, cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&h_maps.gIdx, maps[grid].num_elements * sizeof *maps[grid].gIdx));
  PetscCallCUDA(cudaMemcpy(h_maps.gIdx, maps[grid].gIdx, maps[grid].num_elements * sizeof *maps[grid].gIdx, cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&maps[grid].d_self, sizeof(P4estVertexMaps)));
  PetscCallCUDA(cudaMemcpy(maps[grid].d_self, &h_maps, sizeof(P4estVertexMaps), cudaMemcpyHostToDevice));
  PetscFunctionReturn(PETSC_SUCCESS);
}

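// Free the device-side maps created in LandauCUDACreateMatMaps, one grid at a time.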
PETSC_EXTERN PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps maps[], PetscInt num_grids)
{
  PetscFunctionBegin;
  for (PetscInt grid = 0; grid < num_grids; grid++) {
    P4estVertexMaps *d_maps = maps[grid].d_self, h_maps;
    PetscCallCUDA(cudaMemcpy(&h_maps, d_maps, sizeof(P4estVertexMaps), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_maps.c_maps));
    PetscCallCUDA(cudaFree(h_maps.gIdx));
    PetscCallCUDA(cudaFree(d_maps));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

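// Allocate and fill the static (per-configuration) device data used by the kernels:
// the basis tabulations B and D, the species parameters (nu_alpha, nu_beta, invMass),
// the quadrature-point geometry (invJ, x, y, [z], w), and the per-grid offset tables.
// Buffers for the dynamic data (Eq_m, f, df/dx, ...) are also allocated here once and
// refilled on every call to LandauCUDAJacobian.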
PetscErrorCode LandauCUDAStaticDataSet(DM plex, const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids, PetscInt a_numCells[], PetscInt a_species_offset[], PetscInt a_mat_offset[], PetscReal nu_alpha[], PetscReal nu_beta[], PetscReal a_invMass[], PetscReal[], PetscReal a_invJ[], PetscReal a_x[], PetscReal a_y[], PetscReal a_z[], PetscReal a_w[], LandauStaticData *SData_d)
{
  PetscTabulation *Tf;
  PetscReal       *BB, *DD;
  PetscInt         dim, Nb = Nq, szf = sizeof(PetscReal), szs = sizeof(PetscScalar), szi = sizeof(PetscInt);
  PetscInt         h_ip_offset[LANDAU_MAX_GRIDS + 1], h_ipf_offset[LANDAU_MAX_GRIDS + 1], h_elem_offset[LANDAU_MAX_GRIDS + 1], nip, IPfdf_sz, Nf;
  PetscDS          prob;

  PetscFunctionBegin;
  PetscCall(DMGetDimension(plex, &dim));
  PetscCall(DMGetDS(plex, &prob));
  PetscCheck(LANDAU_DIM == dim, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "dim %" PetscInt_FMT " != LANDAU_DIM %d", dim, LANDAU_DIM);
  PetscCall(PetscDSGetTabulation(prob, &Tf));
  BB = Tf[0]->T[0];
  DD = Tf[0]->T[1];
  Nf = h_ip_offset[0] = h_ipf_offset[0] = h_elem_offset[0] = 0;
  nip                                                      = 0;
  IPfdf_sz                                                 = 0;
  for (PetscInt grid = 0; grid < num_grids; grid++) {
    PetscInt nfloc          = a_species_offset[grid + 1] - a_species_offset[grid];
    h_elem_offset[grid + 1] = h_elem_offset[grid] + a_numCells[grid];
    nip += a_numCells[grid] * Nq;
    h_ip_offset[grid + 1] = nip;
    IPfdf_sz += Nq * nfloc * a_numCells[grid];
    h_ipf_offset[grid + 1] = IPfdf_sz;
  }
  Nf = a_species_offset[num_grids];
  {
    PetscCallCUDA(cudaMalloc((void **)&SData_d->B, Nq * Nb * szf)); // kernel input
    PetscCallCUDA(cudaMemcpy(SData_d->B, BB, Nq * Nb * szf, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&SData_d->D, Nq * Nb * dim * szf)); // kernel input
    PetscCallCUDA(cudaMemcpy(SData_d->D, DD, Nq * Nb * dim * szf, cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMalloc((void **)&SData_d->alpha, Nf * szf));   // kernel input
    PetscCallCUDA(cudaMalloc((void **)&SData_d->beta, Nf * szf));    // kernel input
    PetscCallCUDA(cudaMalloc((void **)&SData_d->invMass, Nf * szf)); // kernel input

    PetscCallCUDA(cudaMemcpy(SData_d->alpha, nu_alpha, Nf * szf, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(SData_d->beta, nu_beta, Nf * szf, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(SData_d->invMass, a_invMass, Nf * szf, cudaMemcpyHostToDevice));

    // collect geometry
    PetscCallCUDA(cudaMalloc((void **)&SData_d->invJ, nip * dim * dim * szf)); // kernel input
    PetscCallCUDA(cudaMemcpy(SData_d->invJ, a_invJ, nip * dim * dim * szf, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&SData_d->x, nip * szf)); // kernel input
    PetscCallCUDA(cudaMemcpy(SData_d->x, a_x, nip * szf, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&SData_d->y, nip * szf)); // kernel input
    PetscCallCUDA(cudaMemcpy(SData_d->y, a_y, nip * szf, cudaMemcpyHostToDevice));
  #if LANDAU_DIM == 3
    PetscCallCUDA(cudaMalloc((void **)&SData_d->z, nip * szf)); // kernel input
    PetscCallCUDA(cudaMemcpy(SData_d->z, a_z, nip * szf, cudaMemcpyHostToDevice));
  #else
    (void)a_z;
  #endif
    PetscCallCUDA(cudaMalloc((void **)&SData_d->w, nip * szf)); // kernel input
    PetscCallCUDA(cudaMemcpy(SData_d->w, a_w, nip * szf, cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMalloc((void **)&SData_d->NCells, num_grids * szi));
    PetscCallCUDA(cudaMemcpy(SData_d->NCells, a_numCells, num_grids * szi, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&SData_d->species_offset, (num_grids + 1) * szi));
    PetscCallCUDA(cudaMemcpy(SData_d->species_offset, a_species_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&SData_d->mat_offset, (num_grids + 1) * szi));
    PetscCallCUDA(cudaMemcpy(SData_d->mat_offset, a_mat_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&SData_d->ip_offset, (num_grids + 1) * szi));
    PetscCallCUDA(cudaMemcpy(SData_d->ip_offset, h_ip_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&SData_d->ipf_offset, (num_grids + 1) * szi));
    PetscCallCUDA(cudaMemcpy(SData_d->ipf_offset, h_ipf_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&SData_d->elem_offset, (num_grids + 1) * szi));
    PetscCallCUDA(cudaMemcpy(SData_d->elem_offset, h_elem_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&SData_d->maps, num_grids * sizeof(P4estVertexMaps *)));
    // allocate space for dynamic data once
    PetscCallCUDA(cudaMalloc((void **)&SData_d->Eq_m, Nf * szf));               // this could be for each vertex (todo?)
    PetscCallCUDA(cudaMalloc((void **)&SData_d->f, nip * Nf * szs * batch_sz)); // for each vertex in batch
    PetscCallCUDA(cudaMalloc((void **)&SData_d->dfdx, nip * Nf * szs * batch_sz));
    PetscCallCUDA(cudaMalloc((void **)&SData_d->dfdy, nip * Nf * szs * batch_sz));
  #if LANDAU_DIM == 3
    PetscCallCUDA(cudaMalloc((void **)&SData_d->dfdz, nip * Nf * szs * batch_sz));
  #endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

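// Free everything allocated in LandauCUDAStaticDataSet; SData_d->alpha doubles as
// the "is allocated" flag.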
PetscErrorCode LandauCUDAStaticDataClear(LandauStaticData *SData_d)
{
  PetscFunctionBegin;
  if (SData_d->alpha) {
    PetscCallCUDA(cudaFree(SData_d->alpha));
    SData_d->alpha = NULL;
    PetscCallCUDA(cudaFree(SData_d->beta));
    PetscCallCUDA(cudaFree(SData_d->invMass));
    PetscCallCUDA(cudaFree(SData_d->B));
    PetscCallCUDA(cudaFree(SData_d->D));
    PetscCallCUDA(cudaFree(SData_d->invJ));
  #if LANDAU_DIM == 3
    PetscCallCUDA(cudaFree(SData_d->z));
  #endif
    PetscCallCUDA(cudaFree(SData_d->x));
    PetscCallCUDA(cudaFree(SData_d->y));
    PetscCallCUDA(cudaFree(SData_d->w));
    // dynamic data
    PetscCallCUDA(cudaFree(SData_d->Eq_m));
    PetscCallCUDA(cudaFree(SData_d->f));
    PetscCallCUDA(cudaFree(SData_d->dfdx));
    PetscCallCUDA(cudaFree(SData_d->dfdy));
  #if LANDAU_DIM == 3
    PetscCallCUDA(cudaFree(SData_d->dfdz));
  #endif
    PetscCallCUDA(cudaFree(SData_d->NCells));
    PetscCallCUDA(cudaFree(SData_d->species_offset));
    PetscCallCUDA(cudaFree(SData_d->mat_offset));
    PetscCallCUDA(cudaFree(SData_d->ip_offset));
    PetscCallCUDA(cudaFree(SData_d->ipf_offset));
    PetscCallCUDA(cudaFree(SData_d->elem_offset));
    PetscCallCUDA(cudaFree(SData_d->maps));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
//
// The GPU Landau kernel
//
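// Launch layout (set up in LandauCUDAJacobian): blockIdx.x is the batch (vertex)
// index, blockIdx.y is the element index across all grids, threadIdx.y is the
// quadrature point, and threadIdx.x vectorizes over species/basis functions.
// This kernel evaluates f and its reference-space gradient at every quadrature
// point, mapping the gradient to real space with the inverse Jacobian invJ.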
__global__ void landau_form_fdf(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_invJ[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar *d_vertex_f, P4estVertexMaps *d_maps[], PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
  #if LANDAU_DIM == 3
                                PetscReal      d_dfdz[],
  #endif
                                const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[]) // output
{
  const PetscInt   Nq = blockDim.y, myQi = threadIdx.y;
  const PetscInt   b_elem_idx = blockIdx.y, b_id = blockIdx.x, IPf_sz_glb = d_ipf_offset[num_grids];
  const PetscReal *Bq = &BB[myQi * Nb], *Dq = &DD[myQi * Nb * dim];
  PetscInt         grid = 0, f, d, b, e, q;
  while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
  {
    const PetscInt     loc_nip = d_numCells[grid] * Nq, loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
    const PetscInt     moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset);
    const PetscScalar *coef;
    PetscReal          u_x[LANDAU_DIM];
    const PetscReal   *invJ = &d_invJ[(d_ip_offset[grid] + loc_elem * Nq + myQi) * dim * dim];
    PetscScalar        coef_buff[LANDAU_MAX_SPECIES * LANDAU_MAX_NQ];
    if (!d_maps) {
      coef = &d_vertex_f[b_id * IPf_sz_glb + d_ipf_offset[grid] + loc_elem * Nb * loc_Nf]; // closure and IP indexing are the same
    } else {
      coef = coef_buff;
      for (f = 0; f < loc_Nf; ++f) {
        LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][f][0];
        for (b = 0; b < Nb; ++b) {
          PetscInt idx = Idxs[b];
          if (idx >= 0) {
            coef_buff[f * Nb + b] = d_vertex_f[idx + moffset];
          } else {
            idx                   = -idx - 1;
            coef_buff[f * Nb + b] = 0;
            for (q = 0; q < d_maps[grid]->num_face; q++) {
              PetscInt  id    = d_maps[grid]->c_maps[idx][q].gid;
              PetscReal scale = d_maps[grid]->c_maps[idx][q].scale;
              if (id >= 0) coef_buff[f * Nb + b] += scale * d_vertex_f[id + moffset];
            }
          }
        }
      }
    }

    /* get f and df */
    for (f = threadIdx.x; f < loc_Nf; f += blockDim.x) {
      PetscReal      refSpaceDer[LANDAU_DIM];
      const PetscInt idx = b_id * IPf_sz_glb + d_ipf_offset[grid] + f * loc_nip + loc_elem * Nq + myQi;
      d_f[idx]           = 0.0;
      for (d = 0; d < LANDAU_DIM; ++d) refSpaceDer[d] = 0.0;
      for (b = 0; b < Nb; ++b) {
        const PetscInt cidx = b;
        d_f[idx] += Bq[cidx] * PetscRealPart(coef[f * Nb + cidx]);
        for (d = 0; d < dim; ++d) refSpaceDer[d] += Dq[cidx * dim + d] * PetscRealPart(coef[f * Nb + cidx]);
      }
      for (d = 0; d < dim; ++d) {
        for (e = 0, u_x[d] = 0.0; e < dim; ++e) u_x[d] += invJ[e * dim + d] * refSpaceDer[e];
      }
      d_dfdx[idx] = u_x[0];
      d_dfdy[idx] = u_x[1];
  #if LANDAU_DIM == 3
      d_dfdz[idx] = u_x[2];
  #endif
    }
  }
}

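// Device function that does the work for one quadrature point jpidx of one element:
// the inner loop over all quadrature points of all grids accumulates the Landau
// tensor contributions into gg2 (the K, or drift, term) and gg3 (the D, or
// diffusion, term), which are then reduced across the vector lanes, transformed
// with the element Jacobian, and assembled into the element (or global) matrix.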
__device__ void landau_jac_kernel(const PetscInt num_grids, const PetscInt jpidx, PetscInt nip_global, const PetscInt grid, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], const PetscReal invJj[], const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar *elemMat, P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, // output
                                  PetscScalar s_fieldMats[][LANDAU_MAX_NQ], // all these arrays are in shared memory
                                  PetscReal s_scale[][LANDAU_MAX_Q_FACE], PetscInt s_idx[][LANDAU_MAX_Q_FACE], PetscReal s_g2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_g3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_gg2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_gg3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_nu_alpha[], PetscReal s_nu_beta[], PetscReal s_invMass[], PetscReal s_f[], PetscReal s_dfx[], PetscReal s_dfy[], PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[], // global memory
  #if LANDAU_DIM == 3
                                  const PetscReal zz[], PetscReal s_dfz[], PetscReal d_dfdz[],
  #endif
                                  const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
{
  const PetscInt  Nq = blockDim.y, myQi = threadIdx.y;
  const PetscInt  b_elem_idx = blockIdx.y, b_id = blockIdx.x, IPf_sz_glb = d_ipf_offset[num_grids];
  const PetscInt  loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
  const PetscInt  moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset);
  int             delta, d, f, g, d2, dp, d3, fieldA, ipidx_b;
  PetscReal       gg2_temp[LANDAU_DIM], gg3_temp[LANDAU_DIM][LANDAU_DIM];
  #if LANDAU_DIM == 2
  const PetscReal vj[3] = {xx[jpidx], yy[jpidx]};
  constexpr int   dim   = 2;
  #else
  const PetscReal vj[3] = {xx[jpidx], yy[jpidx], zz[jpidx]};
  constexpr int   dim   = 3;
  #endif
  const PetscInt  f_off = d_species_offset[grid], Nb = Nq;
  // create g2 & g3
  for (f = threadIdx.x; f < loc_Nf; f += blockDim.x) {
    for (d = 0; d < dim; d++) { // clear accumulation data D & K
      s_gg2[d][myQi][f] = 0;
      for (d2 = 0; d2 < dim; d2++) s_gg3[d][d2][myQi][f] = 0;
    }
  }
  #pragma unroll
  for (d2 = 0; d2 < dim; d2++) {
    gg2_temp[d2] = 0;
  #pragma unroll
    for (d3 = 0; d3 < dim; d3++) gg3_temp[d2][d3] = 0;
  }
  if (threadIdx.y == 0) {
    // copy species into shared memory
    for (fieldA = threadIdx.x; fieldA < Nftot; fieldA += blockDim.x) {
      s_nu_alpha[fieldA] = nu_alpha[fieldA];
      s_nu_beta[fieldA]  = nu_beta[fieldA];
      s_invMass[fieldA]  = invMass[fieldA];
    }
  }
  __syncthreads();
  // inner integral, collect gg2/3
  for (ipidx_b = 0; ipidx_b < nip_global; ipidx_b += blockDim.x) {
    const PetscInt ipidx = ipidx_b + threadIdx.x;
    PetscInt       f_off_r, grid_r, loc_Nf_r, nip_loc_r, ipidx_g, fieldB, IPf_idx_r;
    __syncthreads();
    if (ipidx < nip_global) {
      grid_r = 0;
      while (ipidx >= d_ip_offset[grid_r + 1]) grid_r++;
      f_off_r   = d_species_offset[grid_r];
      ipidx_g   = ipidx - d_ip_offset[grid_r];
      nip_loc_r = d_numCells[grid_r] * Nq;
      loc_Nf_r  = d_species_offset[grid_r + 1] - d_species_offset[grid_r];
      IPf_idx_r = b_id * IPf_sz_glb + d_ipf_offset[grid_r] + ipidx_g;
      for (fieldB = threadIdx.y; fieldB < loc_Nf_r; fieldB += blockDim.y) {
        const PetscInt idx                       = IPf_idx_r + fieldB * nip_loc_r;
        s_f[fieldB * blockDim.x + threadIdx.x]   = d_f[idx]; // all vector threads get copy of data
        s_dfx[fieldB * blockDim.x + threadIdx.x] = d_dfdx[idx];
        s_dfy[fieldB * blockDim.x + threadIdx.x] = d_dfdy[idx];
  #if LANDAU_DIM == 3
        s_dfz[fieldB * blockDim.x + threadIdx.x] = d_dfdz[idx];
  #endif
      }
    }
    __syncthreads();
    if (ipidx < nip_global) {
      const PetscReal wi = ww[ipidx], x = xx[ipidx], y = yy[ipidx];
      PetscReal       temp1[3] = {0, 0, 0}, temp2 = 0;
  #if LANDAU_DIM == 2
      PetscReal       Ud[2][2], Uk[2][2], mask = (PetscAbs(vj[0] - x) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1] - y) < 100 * PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
      LandauTensor2D(vj, x, y, Ud, Uk, mask);
  #else
      PetscReal U[3][3], z = zz[ipidx], mask = (PetscAbs(vj[0] - x) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1] - y) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[2] - z) < 100 * PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
      LandauTensor3D(vj, x, y, z, U, mask);
  #endif
      for (int fieldB = 0; fieldB < loc_Nf_r; fieldB++) {
        temp1[0] += s_dfx[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r] * 7; // TODO: bring lambdas in
        temp1[1] += s_dfy[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r] * 7;
  #if LANDAU_DIM == 3
        temp1[2] += s_dfz[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r] * 7;
  #endif
        temp2 += s_f[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * 7;
      }
      temp1[0] *= wi;
      temp1[1] *= wi;
  #if LANDAU_DIM == 3
      temp1[2] *= wi;
  #endif
      temp2 *= wi;
  #if LANDAU_DIM == 2
    #pragma unroll
      for (d2 = 0; d2 < 2; d2++) {
    #pragma unroll
        for (d3 = 0; d3 < 2; ++d3) {
          /* K = U * grad(f): g2=e: i,A */
          gg2_temp[d2] += Uk[d2][d3] * temp1[d3];
          /* D = -U * (I \kron (fx)): g3=f: i,j,A */
          gg3_temp[d2][d3] += Ud[d2][d3] * temp2;
        }
      }
  #else
    #pragma unroll
      for (d2 = 0; d2 < 3; ++d2) {
    #pragma unroll
        for (d3 = 0; d3 < 3; ++d3) {
          /* K = U * grad(f): g2 = e: i,A */
          gg2_temp[d2] += U[d2][d3] * temp1[d3];
          /* D = -U * (I \kron (fx)): g3 = f: i,j,A */
          gg3_temp[d2][d3] += U[d2][d3] * temp2;
        }
      }
  #endif
    }
  } /* IPs */

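  /* Butterfly reduction across the vector lanes with __shfl_xor_sync: after
     log2(blockDim.x) rounds every lane holds the complete sum over the quadrature
     points it visited (blockDim.x is normally nnn, a power of two <= 16, set in
     LandauCUDAJacobian) */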
  /* reduce gg temp sums across threads */
  for (delta = blockDim.x / 2; delta > 0; delta /= 2) {
  #pragma unroll
    for (d2 = 0; d2 < dim; d2++) {
      gg2_temp[d2] += __shfl_xor_sync(0xffffffff, gg2_temp[d2], delta, blockDim.x);
  #pragma unroll
      for (d3 = 0; d3 < dim; d3++) gg3_temp[d2][d3] += __shfl_xor_sync(0xffffffff, gg3_temp[d2][d3], delta, blockDim.x);
    }
  }
  // add alpha and put in gg2/3
  for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) {
  #pragma unroll
    for (d2 = 0; d2 < dim; d2++) {
      s_gg2[d2][myQi][fieldA] += gg2_temp[d2] * s_nu_alpha[fieldA + f_off];
  #pragma unroll
      for (d3 = 0; d3 < dim; d3++) s_gg3[d2][d3][myQi][fieldA] -= gg3_temp[d2][d3] * s_nu_alpha[fieldA + f_off] * s_invMass[fieldA + f_off];
    }
  }
  __syncthreads();
  /* add electric field term once per IP */
  for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) s_gg2[dim - 1][myQi][fieldA] += Eq_m[fieldA + f_off];
  __syncthreads();
  /* Jacobian transform - g2 */
  for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) {
    PetscReal wj = ww[jpidx];
    for (d = 0; d < dim; ++d) {
      s_g2[d][myQi][fieldA] = 0.0;
      for (d2 = 0; d2 < dim; ++d2) {
        s_g2[d][myQi][fieldA] += invJj[d * dim + d2] * s_gg2[d2][myQi][fieldA];
        s_g3[d][d2][myQi][fieldA] = 0.0;
        for (d3 = 0; d3 < dim; ++d3) {
          for (dp = 0; dp < dim; ++dp) s_g3[d][d2][myQi][fieldA] += invJj[d * dim + d3] * s_gg3[d3][dp][myQi][fieldA] * invJj[d2 * dim + dp];
        }
        s_g3[d][d2][myQi][fieldA] *= wj;
      }
      s_g2[d][myQi][fieldA] *= wj;
    }
  }
  __syncthreads(); // Synchronize (ensure all the data is available) and sum IP matrices
  /* FE matrix construction */
  {
    int fieldA, d, qj, d2, q, idx, totDim = Nb * loc_Nf;
    /* assemble */
    for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
      for (f = threadIdx.y; f < Nb; f += blockDim.y) {
        for (g = threadIdx.x; g < Nb; g += blockDim.x) {
          PetscScalar t = 0;
          for (qj = 0; qj < Nq; qj++) {
            const PetscReal *BJq = &BB[qj * Nb], *DIq = &DD[qj * Nb * dim];
            for (d = 0; d < dim; ++d) {
              t += DIq[f * dim + d] * s_g2[d][qj][fieldA] * BJq[g];
              for (d2 = 0; d2 < dim; ++d2) t += DIq[f * dim + d] * s_g3[d][d2][qj][fieldA] * DIq[g * dim + d2];
            }
          }
          if (elemMat) {
            const PetscInt fOff = (fieldA * Nb + f) * totDim + fieldA * Nb + g;
            elemMat[fOff] += t; // accumulate into the diagonal (fieldA, fieldA) block
          } else s_fieldMats[f][g] = t;
        }
      }
      if (s_fieldMats) {
        PetscScalar            vals[LANDAU_MAX_Q_FACE * LANDAU_MAX_Q_FACE];
        PetscInt               nr, nc;
        const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
        __syncthreads();
        if (threadIdx.y == 0) {
          for (f = threadIdx.x; f < Nb; f += blockDim.x) {
            idx = Idxs[f];
            if (idx >= 0) {
              s_idx[f][0]   = idx + moffset;
              s_scale[f][0] = 1.;
            } else {
              idx = -idx - 1;
              for (q = 0; q < d_maps[grid]->num_face; q++) {
                if (d_maps[grid]->c_maps[idx][q].gid >= 0) s_idx[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
                else s_idx[f][q] = -1;
                s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
              }
            }
          }
        }
        __syncthreads();
        for (f = threadIdx.y; f < Nb; f += blockDim.y) {
          idx = Idxs[f];
          if (idx >= 0) {
            nr = 1;
          } else {
            nr = d_maps[grid]->num_face;
          }
          for (g = threadIdx.x; g < Nb; g += blockDim.x) {
            idx = Idxs[g];
            if (idx >= 0) {
              nc = 1;
            } else {
              nc = d_maps[grid]->num_face;
            }
            for (q = 0; q < nr; q++) {
              for (d = 0; d < nc; d++) vals[q * nc + d] = s_scale[f][q] * s_scale[g][d] * s_fieldMats[f][g];
            }
            static_cast<void>(MatSetValuesDevice(d_mat, nr, s_idx[f], nc, s_idx[g], vals, ADD_VALUES));
          }
        }
        __syncthreads();
      }
    }
  }
}

//
// The CUDA Landau kernel
//
__global__ void __launch_bounds__(256, 2) landau_jacobian(const PetscInt nip_global, const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal invJj[], const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal *const BB, const PetscReal *const DD, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
  #if LANDAU_DIM == 3
                                                          const PetscReal zz[], PetscReal d_dfdz[],
  #endif
                                                          const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
{
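  // Dynamic shared memory is carved out of smem by hand; `size` counts PetscReal
  // words and must stay consistent with the byte count `ii * szf` computed at the
  // launch site in LandauCUDAJacobian.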
  extern __shared__ PetscReal smem[];
  int                         size                                = 0;
  PetscReal(*s_g2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
  size += LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM;
  PetscReal(*s_g3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
  size += LANDAU_DIM * LANDAU_DIM * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES;
  PetscReal(*s_gg2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
  size += LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM;
  PetscReal(*s_gg3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
  size += LANDAU_DIM * LANDAU_DIM * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES;
  PetscReal *s_nu_alpha = &smem[size];
  size += LANDAU_MAX_SPECIES;
  PetscReal *s_nu_beta = &smem[size];
  size += LANDAU_MAX_SPECIES;
  PetscReal *s_invMass = &smem[size];
  size += LANDAU_MAX_SPECIES;
  PetscReal *s_f = &smem[size];
  size += blockDim.x * LANDAU_MAX_SPECIES;
  PetscReal *s_dfx = &smem[size];
  size += blockDim.x * LANDAU_MAX_SPECIES;
  PetscReal *s_dfy = &smem[size];
  size += blockDim.x * LANDAU_MAX_SPECIES;
  #if LANDAU_DIM == 3
  PetscReal *s_dfz = &smem[size];
  size += blockDim.x * LANDAU_MAX_SPECIES;
  #endif
  PetscScalar(*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
  PetscReal(*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE] = nullptr;
  PetscInt(*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]    = nullptr;
  const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x;
  PetscInt       Nq = blockDim.y, grid = 0; // Nq == Nb
  PetscScalar   *elemMat = NULL;            /* my output */
  while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
  {
    const PetscInt   loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
    const PetscInt   myQi  = threadIdx.y;
    const PetscInt   jpidx = d_ip_offset[grid] + myQi + loc_elem * Nq;
    const PetscReal *invJ  = &invJj[jpidx * dim * dim];
    if (d_elem_mats) {
      PetscInt totDim = loc_Nf * Nb;
      elemMat         = d_elem_mats; // start at the beginning and advance to my element matrix
      for (PetscInt b_id2 = 0; b_id2 < b_id; b_id2++) {
        for (PetscInt grid2 = 0; grid2 < num_grids; grid2++) {
          PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
          elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
        }
      }
      for (PetscInt grid2 = 0; grid2 < grid; grid2++) {
        PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
        elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
      }
      elemMat += loc_elem * totDim * totDim; // index into local matrix & zero out
      for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < totDim * totDim; i += blockDim.x * blockDim.y) elemMat[i] = 0;
    }
    __syncthreads();
    if (d_maps) {
      // reuse the space for fieldMats
      s_fieldMats = (PetscScalar(*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) & smem[size];
      size += LANDAU_MAX_NQ * LANDAU_MAX_NQ;
      s_scale = (PetscReal(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
      size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
      s_idx = (PetscInt(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
      size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE; // this is too big, idx is an integer
    } else {
      s_fieldMats = NULL;
    }
    __syncthreads();
    landau_jac_kernel(num_grids, jpidx, nip_global, grid, xx, yy, ww, invJ, Nftot, nu_alpha, nu_beta, invMass, Eq_m, BB, DD, elemMat, d_maps, d_mat, *s_fieldMats, *s_scale, *s_idx, *s_g2, *s_g3, *s_gg2, *s_gg3, s_nu_alpha, s_nu_beta, s_invMass, s_f, s_dfx, s_dfy, d_f, d_dfdx, d_dfdy,
  #if LANDAU_DIM == 3
                      zz, s_dfz, d_dfdz,
  #endif
                      d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
  }
}

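// FE mass matrix kernel: like landau_jacobian but only the quadrature weights are
// needed, scaled by the caller-supplied `shift`. The extra 2*pi in the 2D branch
// is presumably the azimuthal measure of the cylindrically symmetric (r,z)
// formulation.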
__global__ void __launch_bounds__(256, 4) landau_mass(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_w[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal shift, const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_elem_offset[])
{
  extern __shared__ PetscReal smem[];
  const PetscInt              Nq = blockDim.y, b_elem_idx = blockIdx.y, b_id = blockIdx.x;
  PetscScalar                *elemMat = NULL; /* my output */
  PetscInt                    fieldA, d, qj, q, idx, f, g, grid = 0, size = 0;
  PetscScalar(*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
  PetscReal(*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
  PetscInt(*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
  if (d_maps) {
    // reuse the space for fieldMats
    s_fieldMats = (PetscScalar(*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) & smem[size];
    size += LANDAU_MAX_NQ * LANDAU_MAX_NQ;
    s_scale = (PetscReal(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
    size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
    s_idx = (PetscInt(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
    size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE; // this is too big, idx is an integer
  } else {
    s_fieldMats = NULL;
  }
  while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
  {
    const PetscInt loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
    const PetscInt moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset), totDim = loc_Nf * Nq;
    if (d_elem_mats) {
      elemMat = d_elem_mats; // start at the beginning
      for (PetscInt b_id2 = 0; b_id2 < b_id; b_id2++) {
        for (PetscInt grid2 = 0; grid2 < num_grids; grid2++) {
          PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
          elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
        }
      }
      for (PetscInt grid2 = 0; grid2 < grid; grid2++) {
        PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
        elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
      }
      elemMat += loc_elem * totDim * totDim;
      for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < totDim * totDim; i += blockDim.x * blockDim.y) elemMat[i] = 0;
    }
    __syncthreads();
    /* FE mass matrix construction */
    for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
      PetscScalar vals[LANDAU_MAX_Q_FACE * LANDAU_MAX_Q_FACE];
      PetscInt    nr, nc;
      for (f = threadIdx.y; f < Nb; f += blockDim.y) {
        for (g = threadIdx.x; g < Nb; g += blockDim.x) {
          PetscScalar t = 0;
          for (qj = 0; qj < Nq; qj++) {
            const PetscReal *BJq   = &BB[qj * Nb];
            const PetscInt   jpidx = d_ip_offset[grid] + qj + loc_elem * Nq;
            if (dim == 2) {
              t += BJq[f] * d_w[jpidx] * shift * BJq[g] * 2. * PETSC_PI;
            } else {
              t += BJq[f] * d_w[jpidx] * shift * BJq[g];
            }
          }
          if (elemMat) {
            const PetscInt fOff = (fieldA * Nb + f) * totDim + fieldA * Nb + g;
            elemMat[fOff] += t; // accumulate into the diagonal (fieldA, fieldA) block
          } else (*s_fieldMats)[f][g] = t;
        }
      }
      if (!elemMat) {
        const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
        __syncthreads();
        if (threadIdx.y == 0) {
          for (f = threadIdx.x; f < Nb; f += blockDim.x) {
            idx = Idxs[f];
            if (idx >= 0) {
              (*s_idx)[f][0]   = idx + moffset;
              (*s_scale)[f][0] = 1.;
            } else {
              idx = -idx - 1;
              for (q = 0; q < d_maps[grid]->num_face; q++) {
                if (d_maps[grid]->c_maps[idx][q].gid >= 0) (*s_idx)[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
                else (*s_idx)[f][q] = -1;
                (*s_scale)[f][q] = d_maps[grid]->c_maps[idx][q].scale;
              }
            }
          }
        }
        __syncthreads();
        for (f = threadIdx.y; f < Nb; f += blockDim.y) {
          idx = Idxs[f];
          if (idx >= 0) {
            nr = 1;
          } else {
            nr = d_maps[grid]->num_face;
          }
          for (g = threadIdx.x; g < Nb; g += blockDim.x) {
            idx = Idxs[g];
            if (idx >= 0) {
              nc = 1;
            } else {
              nc = d_maps[grid]->num_face;
            }
            for (q = 0; q < nr; q++) {
              for (d = 0; d < nc; d++) vals[q * nc + d] = (*s_scale)[f][q] * (*s_scale)[g][d] * (*s_fieldMats)[f][g];
            }
            static_cast<void>(MatSetValuesDevice(d_mat, nr, (*s_idx)[f], nc, (*s_idx)[g], vals, ADD_VALUES));
          }
        }
      }
      __syncthreads();
    }
  }
}

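// Driver: forms f and df (landau_form_fdf), then launches either the Jacobian
// kernel (landau_jacobian) or the mass kernel (landau_mass). With GPU assembly
// (d_maps set) values go straight into the CUSPARSE matrix; otherwise the element
// matrices are copied back to the host and assembled into JacP via
// DMPlexMatSetClosure. The events[] entries bracket the setup, f/df, kernel, and
// assembly phases for logging.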
PetscErrorCode LandauCUDAJacobian(DM plex[], const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids, const PetscInt a_numCells[], PetscReal a_Eq_m[], PetscScalar a_elem_closure[], const PetscScalar a_xarray[], const LandauStaticData *SData_d, const PetscReal shift, const PetscLogEvent events[], const PetscInt a_mat_offset[], const PetscInt a_species_offset[], Mat subJ[], Mat JacP)
{
  cudaError_t                cerr;
  PetscInt                   Nb = Nq, dim, nip_global, num_cells_batch, elem_mat_size_tot;
  PetscInt                  *d_numCells, *d_species_offset, *d_mat_offset, *d_ip_offset, *d_ipf_offset, *d_elem_offset;
  PetscInt                   szf = sizeof(PetscReal), szs = sizeof(PetscScalar), Nftot = a_species_offset[num_grids];
  PetscReal                 *d_BB = NULL, *d_DD = NULL, *d_invJj = NULL, *d_nu_alpha = NULL, *d_nu_beta = NULL, *d_invMass = NULL, *d_Eq_m = NULL, *d_x = NULL, *d_y = NULL, *d_w = NULL;
  PetscScalar               *d_elem_mats = NULL, *d_vertex_f = NULL;
  PetscReal                 *d_f = NULL, *d_dfdx = NULL, *d_dfdy = NULL;
  #if LANDAU_DIM == 3
  PetscReal                 *d_dfdz = NULL, *d_z = NULL;
  #endif
  LandauCtx                 *ctx;
  PetscSplitCSRDataStructure d_mat = NULL;
  P4estVertexMaps          **d_maps, *maps[LANDAU_MAX_GRIDS];
  int                        nnn = 256 / Nq; // machine dependent
  PetscContainer             container;

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(events[3], 0, 0, 0, 0));
  while (nnn & nnn - 1) nnn = nnn & nnn - 1; // round the vector length down to a power of two
  if (nnn > 16) nnn = 16;                    // and cap it at 16
  PetscCall(DMGetApplicationContext(plex[0], &ctx));
  PetscCheck(ctx, PETSC_COMM_SELF, PETSC_ERR_PLIB, "no context");
  PetscCall(DMGetDimension(plex[0], &dim));
  PetscCheck(dim == LANDAU_DIM, PETSC_COMM_SELF, PETSC_ERR_PLIB, "LANDAU_DIM %d != dim %" PetscInt_FMT, LANDAU_DIM, dim);
  if (ctx->gpu_assembly) {
    PetscCall(PetscObjectQuery((PetscObject)JacP, "assembly_maps", (PetscObject *)&container));
    if (container) {       // not present on the first call
      static int init = 0; // hack: just do this every time, or put it in setup (but that is in base class code), or add an init_maps flag
      if (!init++) {
        P4estVertexMaps *h_maps = NULL;
        PetscCall(PetscContainerGetPointer(container, (void **)&h_maps));
        for (PetscInt grid = 0; grid < num_grids; grid++) {
          if (h_maps[grid].d_self) {
            maps[grid] = h_maps[grid].d_self;
          } else {
            SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
          }
        }
        PetscCallCUDA(cudaMemcpy(SData_d->maps, maps, num_grids * sizeof(P4estVertexMaps *), cudaMemcpyHostToDevice));
      }
      d_maps = (P4estVertexMaps **)SData_d->maps;
      // this does the setup the first time called
      PetscCall(MatCUSPARSEGetDeviceMatWrite(JacP, &d_mat));
    } else {
      d_maps = NULL;
    }
  } else {
    container = NULL;
    d_maps    = NULL;
  }
  PetscCall(PetscLogEventEnd(events[3], 0, 0, 0, 0));
  {
    PetscInt elem_mat_size = 0;
    nip_global = num_cells_batch = 0;
    for (PetscInt grid = 0; grid < num_grids; grid++) {
      PetscInt Nfloc = a_species_offset[grid + 1] - a_species_offset[grid], totDim = Nfloc * Nb;
      nip_global += a_numCells[grid] * Nq;
      num_cells_batch += a_numCells[grid];                 // is in d_elem_offset, but not on host
      elem_mat_size += a_numCells[grid] * totDim * totDim; // could save in an offset here -- batch major ordering
    }
    elem_mat_size_tot = d_maps ? 0 : elem_mat_size;
  }
  dim3 dimGrid(batch_sz, num_cells_batch);
  if (elem_mat_size_tot) {
    PetscCallCUDA(cudaMalloc((void **)&d_elem_mats, batch_sz * elem_mat_size_tot * szs)); // kernel output - first call is on CPU
  } else d_elem_mats = NULL;
  // create data
  d_BB = (PetscReal *)SData_d->B;
  d_DD = (PetscReal *)SData_d->D;
  if (a_elem_closure || a_xarray) { // form f and df
    PetscCall(PetscLogEventBegin(events[1], 0, 0, 0, 0));
    PetscCallCUDA(cudaMemcpy(SData_d->Eq_m, a_Eq_m, Nftot * szf, cudaMemcpyHostToDevice));
    d_invJj    = (PetscReal *)SData_d->invJ;
    d_nu_alpha = (PetscReal *)SData_d->alpha;
    d_nu_beta  = (PetscReal *)SData_d->beta;
    d_invMass  = (PetscReal *)SData_d->invMass;
    d_x        = (PetscReal *)SData_d->x;
    d_y        = (PetscReal *)SData_d->y;
    d_w        = (PetscReal *)SData_d->w;
    d_Eq_m     = (PetscReal *)SData_d->Eq_m;
    d_dfdx     = (PetscReal *)SData_d->dfdx;
    d_dfdy     = (PetscReal *)SData_d->dfdy;
  #if LANDAU_DIM == 3
    d_dfdz     = (PetscReal *)SData_d->dfdz;
    d_z        = (PetscReal *)SData_d->z;
  #endif
    d_f        = (PetscReal *)SData_d->f;
    // get a d_vertex_f
    if (a_elem_closure) {
      PetscInt closure_sz = 0; // not stored on the host, so recompute it here
      for (PetscInt grid = 0; grid < num_grids; grid++) {
        PetscInt nfloc = a_species_offset[grid + 1] - a_species_offset[grid];
        closure_sz += Nq * nfloc * a_numCells[grid];
      }
      closure_sz *= batch_sz;
      PetscCallCUDA(cudaMalloc((void **)&d_vertex_f, closure_sz * sizeof(*a_elem_closure)));
      PetscCallCUDA(cudaMemcpy(d_vertex_f, a_elem_closure, closure_sz * sizeof(*a_elem_closure), cudaMemcpyHostToDevice));
    } else {
      d_vertex_f = (PetscScalar *)a_xarray;
    }
    PetscCall(PetscLogEventEnd(events[1], 0, 0, 0, 0));
  } else {
    d_w = (PetscReal *)SData_d->w; // mass just needs the weights
  }
  //
  d_numCells       = (PetscInt *)SData_d->NCells; // redundant -- remove
  d_species_offset = (PetscInt *)SData_d->species_offset;
  d_mat_offset     = (PetscInt *)SData_d->mat_offset;
  d_ip_offset      = (PetscInt *)SData_d->ip_offset;
  d_ipf_offset     = (PetscInt *)SData_d->ipf_offset;
  d_elem_offset    = (PetscInt *)SData_d->elem_offset;
  if (a_elem_closure || a_xarray) { // form f and df
    dim3 dimBlockFDF(nnn > Nftot ? Nftot : nnn, Nq), dimBlock((nip_global > nnn) ? nnn : nip_global, Nq);
    PetscCall(PetscLogEventBegin(events[8], 0, 0, 0, 0));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(PetscInfo(plex[0], "Form F and dF/dx vectors: nip_global=%" PetscInt_FMT " num_grids=%" PetscInt_FMT "\n", nip_global, num_grids));
    landau_form_fdf<<<dimGrid, dimBlockFDF>>>(dim, Nb, num_grids, d_invJj, d_BB, d_DD, d_vertex_f, d_maps, d_f, d_dfdx, d_dfdy,
  #if LANDAU_DIM == 3
                                              d_dfdz,
  #endif
                                              d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
    PetscCUDACheckLaunch;
    PetscCall(PetscLogGpuFlops(batch_sz * nip_global * (PetscLogDouble)(2 * Nb * (1 + dim))));
    if (a_elem_closure) {
      PetscCallCUDA(cudaFree(d_vertex_f));
      d_vertex_f = NULL;
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogEventEnd(events[8], 0, 0, 0, 0));
    // Jacobian
    PetscCall(PetscLogEventBegin(events[4], 0, 0, 0, 0));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(PetscLogGpuFlops(batch_sz * nip_global * (PetscLogDouble)(a_elem_closure ? (nip_global * (11 * Nftot + 4 * dim * dim) + 6 * Nftot * dim * dim * dim + 10 * Nftot * dim * dim + 4 * Nftot * dim + Nb * Nftot * Nb * Nq * dim * dim * 5) : Nb * Nftot * Nb * Nq * 4)));
    PetscInt ii = 2 * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM * (1 + LANDAU_DIM) + 3 * LANDAU_MAX_SPECIES + (1 + LANDAU_DIM) * dimBlock.x * LANDAU_MAX_SPECIES + LANDAU_MAX_NQ * LANDAU_MAX_NQ + 2 * LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
    if (ii * szf >= 49152) {
      cerr = cudaFuncSetAttribute(landau_jacobian, cudaFuncAttributeMaxDynamicSharedMemorySize, 98304);
      PetscCallCUDA(cerr);
    }
    PetscCall(PetscInfo(plex[0], "Jacobian shared memory size: %" PetscInt_FMT " words, d_elem_mats=%p d_maps=%p\n", ii, d_elem_mats, d_maps));
    landau_jacobian<<<dimGrid, dimBlock, ii * szf>>>(nip_global, dim, Nb, num_grids, d_invJj, Nftot, d_nu_alpha, d_nu_beta, d_invMass, d_Eq_m, d_BB, d_DD, d_x, d_y, d_w, d_elem_mats, d_maps, d_mat, d_f, d_dfdx, d_dfdy,
  #if LANDAU_DIM == 3
                                                     d_z, d_dfdz,
  #endif
                                                     d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
    PetscCUDACheckLaunch; // has sync
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogEventEnd(events[4], 0, 0, 0, 0));
  } else { // mass
    dim3     dimBlock(nnn, Nq);
    PetscInt ii = LANDAU_MAX_NQ * LANDAU_MAX_NQ + 2 * LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
    if (ii * szf >= 49152) {
      cerr = cudaFuncSetAttribute(landau_mass, cudaFuncAttributeMaxDynamicSharedMemorySize, 98304);
      PetscCallCUDA(cerr);
    }
    PetscCall(PetscInfo(plex[0], "Mass d_maps = %p. Nq=%" PetscInt_FMT ", vector size %d num_cells_batch=%" PetscInt_FMT ", %" PetscInt_FMT " shared memory words\n", d_maps, Nq, nnn, num_cells_batch, ii));
    PetscCall(PetscLogEventBegin(events[16], 0, 0, 0, 0));
    PetscCall(PetscLogGpuTimeBegin());
    landau_mass<<<dimGrid, dimBlock, ii * szf>>>(dim, Nb, num_grids, d_w, d_BB, d_DD, d_elem_mats, d_maps, d_mat, shift, d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_elem_offset);
    PetscCUDACheckLaunch; // has sync
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogEventEnd(events[16], 0, 0, 0, 0));
  }
  // First time assembly with or without GPU assembly
  if (d_elem_mats) {
    PetscInt elem_mats_idx = 0;
    for (PetscInt b_id = 0; b_id < batch_sz; b_id++) {    // OpenMP (once)
      for (PetscInt grid = 0; grid < num_grids; grid++) { // elem_mats_idx += totDim*totDim*a_numCells[grid];
        const PetscInt     Nfloc = a_species_offset[grid + 1] - a_species_offset[grid], totDim = Nfloc * Nq;
        PetscScalar       *elemMats = NULL, *elMat;
        PetscSection       section, globalSection;
        PetscInt           cStart, cEnd, ej;
        PetscInt           moffset = LAND_MOFFSET(b_id, grid, batch_sz, num_grids, a_mat_offset), nloc, nzl, colbuf[1024], row;
        const PetscInt    *cols;
        const PetscScalar *vals;
        Mat                B = subJ[LAND_PACK_IDX(b_id, grid)];
        PetscCall(PetscLogEventBegin(events[5], 0, 0, 0, 0));
        PetscCall(DMPlexGetHeightStratum(plex[grid], 0, &cStart, &cEnd));
        PetscCall(DMGetLocalSection(plex[grid], &section));
        PetscCall(DMGetGlobalSection(plex[grid], &globalSection));
        PetscCall(PetscMalloc1(totDim * totDim * a_numCells[grid], &elemMats));
        PetscCallCUDA(cudaMemcpy(elemMats, &d_elem_mats[elem_mats_idx], totDim * totDim * a_numCells[grid] * sizeof(*elemMats), cudaMemcpyDeviceToHost));
        PetscCall(PetscLogEventEnd(events[5], 0, 0, 0, 0));
        PetscCall(PetscLogEventBegin(events[6], 0, 0, 0, 0));
        for (ej = cStart, elMat = elemMats; ej < cEnd; ++ej, elMat += totDim * totDim) {
          PetscCall(DMPlexMatSetClosure(plex[grid], section, globalSection, B, ej, elMat, ADD_VALUES));
          if (ej == -1) {
            int d, f;
            PetscCall(PetscPrintf(PETSC_COMM_SELF, "GPU Element matrix\n"));
            for (d = 0; d < totDim; ++d) {
              for (f = 0; f < totDim; ++f) PetscCall(PetscPrintf(PETSC_COMM_SELF, " %12.5e", PetscRealPart(elMat[d * totDim + f])));
              PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
            }
          }
        }
        PetscCall(PetscFree(elemMats));
        PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
        PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
        // move nest matrix to global JacP
        PetscCall(MatGetSize(B, &nloc, NULL));
        for (int i = 0; i < nloc; i++) {
          PetscCall(MatGetRow(B, i, &nzl, &cols, &vals));
          PetscCheck(nzl <= 1024, PetscObjectComm((PetscObject)B), PETSC_ERR_PLIB, "Row too big: %" PetscInt_FMT, nzl);
          for (int j = 0; j < nzl; j++) colbuf[j] = cols[j] + moffset;
          row = i + moffset;
          PetscCall(MatSetValues(JacP, 1, &row, nzl, colbuf, vals, ADD_VALUES));
          PetscCall(MatRestoreRow(B, i, &nzl, &cols, &vals));
        }
        PetscCall(MatDestroy(&B));
        PetscCall(PetscLogEventEnd(events[6], 0, 0, 0, 0));
        elem_mats_idx += totDim * totDim * a_numCells[grid]; // this can be a stored offset?
      }                                                      // grids
    }
    PetscCheck(elem_mats_idx == batch_sz * elem_mat_size_tot, PetscObjectComm((PetscObject)JacP), PETSC_ERR_PLIB, "elem_mats_idx != batch_sz*elem_mat_size_tot: %" PetscInt_FMT " %" PetscInt_FMT, elem_mats_idx, batch_sz * elem_mat_size_tot);
    PetscCallCUDA(cudaFree(d_elem_mats));
  }

  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif