Actual source code: landaucu.cu
1: /*
2: Implements the Landau kernel
3: */
4: #include <petscconf.h>
6: #include <petsclandau.h>
7: #if defined(PETSC_HAVE_CUDA_CLANG)
8: #define LANDAU_NOT_IMPLEMENTED SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not supported with CLANG")
9: PetscErrorCode LandauCUDAJacobian(DM[], const PetscInt, const PetscInt, const PetscInt, const PetscInt[], PetscReal[], PetscScalar[], const PetscScalar[], const LandauStaticData *, const PetscReal, const PetscLogEvent[], const PetscInt[], const PetscInt[], Mat[], Mat)
10: {
11: LANDAU_NOT_IMPLEMENTED;
12: }
13: PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps *, pointInterpolationP4est (*)[LANDAU_MAX_Q_FACE], PetscInt[], PetscInt, PetscInt)
14: {
15: LANDAU_NOT_IMPLEMENTED;
16: }
17: PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps *, PetscInt)
18: {
19: LANDAU_NOT_IMPLEMENTED;
20: }
21: PetscErrorCode LandauCUDAStaticDataSet(DM, const PetscInt, const PetscInt, const PetscInt, PetscInt[], PetscInt[], PetscInt[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], PetscReal[], LandauStaticData *)
22: {
23: LANDAU_NOT_IMPLEMENTED;
24: }
25: PetscErrorCode LandauCUDAStaticDataClear(LandauStaticData *)
26: {
27: LANDAU_NOT_IMPLEMENTED;
28: }
29: #else
30: #include <petsc/private/dmpleximpl.h>
31: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
32: #include <../src/mat/impls/aij/seq/aij.h>
33: #include <petscmat.h>
34: #include <petscdevice_cuda.h>
36: #include "../land_tensors.h"
37: #include <petscaijdevice.h>
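/*
  Editorial note (summary of the routine below): LandauCUDACreateMatMaps copies one grid's
  host-side assembly maps to the device. The constraint (face) interpolation table c_maps and
  the element-to-global index table gIdx are cudaMalloc'd and copied, and a device copy of the
  whole P4estVertexMaps struct is then stored in maps[grid].d_self for use inside the kernels.
*/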
39: PETSC_EXTERN PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps maps[], pointInterpolationP4est (*pointMaps)[LANDAU_MAX_Q_FACE], PetscInt Nf[], PetscInt, PetscInt grid)
40: {
41: P4estVertexMaps h_maps;
42: PetscFunctionBegin;
43: h_maps.num_elements = maps[grid].num_elements;
44: h_maps.num_face = maps[grid].num_face;
45: h_maps.num_reduced = maps[grid].num_reduced;
46: h_maps.deviceType = maps[grid].deviceType;
47: h_maps.Nf = Nf[grid];
48: h_maps.numgrids = maps[grid].numgrids;
49: PetscCallCUDA(cudaMalloc((void **)&h_maps.c_maps, maps[grid].num_reduced * sizeof *pointMaps));
50: PetscCallCUDA(cudaMemcpy(h_maps.c_maps, maps[grid].c_maps, maps[grid].num_reduced * sizeof *pointMaps, cudaMemcpyHostToDevice));
51: PetscCallCUDA(cudaMalloc((void **)&h_maps.gIdx, maps[grid].num_elements * sizeof *maps[grid].gIdx));
52: PetscCallCUDA(cudaMemcpy(h_maps.gIdx, maps[grid].gIdx, maps[grid].num_elements * sizeof *maps[grid].gIdx, cudaMemcpyHostToDevice));
53: PetscCallCUDA(cudaMalloc((void **)&maps[grid].d_self, sizeof(P4estVertexMaps)));
54: PetscCallCUDA(cudaMemcpy(maps[grid].d_self, &h_maps, sizeof(P4estVertexMaps), cudaMemcpyHostToDevice));
55: PetscFunctionReturn(PETSC_SUCCESS);
56: }
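/*
  Editorial note: LandauCUDADestroyMatMaps pulls each grid's device-resident P4estVertexMaps
  struct back to the host to recover the c_maps and gIdx device pointers, then frees those
  arrays and the device struct itself.
*/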
58: PETSC_EXTERN PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps maps[], PetscInt num_grids)
59: {
60: PetscFunctionBegin;
61: for (PetscInt grid = 0; grid < num_grids; grid++) {
62: P4estVertexMaps *d_maps = maps[grid].d_self, h_maps;
63: PetscCallCUDA(cudaMemcpy(&h_maps, d_maps, sizeof(P4estVertexMaps), cudaMemcpyDeviceToHost));
64: PetscCallCUDA(cudaFree(h_maps.c_maps));
65: PetscCallCUDA(cudaFree(h_maps.gIdx));
66: PetscCallCUDA(cudaFree(d_maps));
67: }
68: PetscFunctionReturn(PETSC_SUCCESS);
69: }
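/*
  Editorial note: LandauCUDAStaticDataSet is a one-time upload of everything that does not
  change between calls: the basis tabulations B and D, species coefficients (nu_alpha, nu_beta,
  invMass), quadrature geometry (invJ, x, y[, z], w), and the per-grid offset tables. Device
  work space for Eq_m and for f, df/dx, df/dy[, df/dz] of every vertex in the batch is also
  allocated here so the kernels can reuse it on every call.
*/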
71: PetscErrorCode LandauCUDAStaticDataSet(DM plex, const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids, PetscInt a_numCells[], PetscInt a_species_offset[], PetscInt a_mat_offset[], PetscReal nu_alpha[], PetscReal nu_beta[], PetscReal a_invMass[], PetscReal[], PetscReal a_invJ[], PetscReal a_x[], PetscReal a_y[], PetscReal a_z[], PetscReal a_w[], LandauStaticData *SData_d)
72: {
73: PetscTabulation *Tf;
74: PetscReal *BB, *DD;
75: PetscInt dim, Nb = Nq, szf = sizeof(PetscReal), szs = sizeof(PetscScalar), szi = sizeof(PetscInt);
76: PetscInt h_ip_offset[LANDAU_MAX_GRIDS + 1], h_ipf_offset[LANDAU_MAX_GRIDS + 1], h_elem_offset[LANDAU_MAX_GRIDS + 1], nip, IPfdf_sz, Nf;
77: PetscDS prob;
79: PetscFunctionBegin;
80: PetscCall(DMGetDimension(plex, &dim));
81: PetscCall(DMGetDS(plex, &prob));
82: PetscCheck(LANDAU_DIM == dim, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "dim %" PetscInt_FMT " != LANDAU_DIM %d", dim, LANDAU_DIM);
83: PetscCall(PetscDSGetTabulation(prob, &Tf));
84: BB = Tf[0]->T[0];
85: DD = Tf[0]->T[1];
86: Nf = h_ip_offset[0] = h_ipf_offset[0] = h_elem_offset[0] = 0;
87: nip = 0;
88: IPfdf_sz = 0;
89: for (PetscInt grid = 0; grid < num_grids; grid++) {
90: PetscInt nfloc = a_species_offset[grid + 1] - a_species_offset[grid];
91: h_elem_offset[grid + 1] = h_elem_offset[grid] + a_numCells[grid];
92: nip += a_numCells[grid] * Nq;
93: h_ip_offset[grid + 1] = nip;
94: IPfdf_sz += Nq * nfloc * a_numCells[grid];
95: h_ipf_offset[grid + 1] = IPfdf_sz;
96: }
97: Nf = a_species_offset[num_grids];
98: {
99: PetscCallCUDA(cudaMalloc((void **)&SData_d->B, Nq * Nb * szf)); // kernel input
100: PetscCallCUDA(cudaMemcpy(SData_d->B, BB, Nq * Nb * szf, cudaMemcpyHostToDevice));
101: PetscCallCUDA(cudaMalloc((void **)&SData_d->D, Nq * Nb * dim * szf)); // kernel input
102: PetscCallCUDA(cudaMemcpy(SData_d->D, DD, Nq * Nb * dim * szf, cudaMemcpyHostToDevice));
104: PetscCallCUDA(cudaMalloc((void **)&SData_d->alpha, Nf * szf)); // kernel input
105: PetscCallCUDA(cudaMalloc((void **)&SData_d->beta, Nf * szf)); // kernel input
106: PetscCallCUDA(cudaMalloc((void **)&SData_d->invMass, Nf * szf)); // kernel input
108: PetscCallCUDA(cudaMemcpy(SData_d->alpha, nu_alpha, Nf * szf, cudaMemcpyHostToDevice));
109: PetscCallCUDA(cudaMemcpy(SData_d->beta, nu_beta, Nf * szf, cudaMemcpyHostToDevice));
110: PetscCallCUDA(cudaMemcpy(SData_d->invMass, a_invMass, Nf * szf, cudaMemcpyHostToDevice));
112: // collect geometry
113: PetscCallCUDA(cudaMalloc((void **)&SData_d->invJ, nip * dim * dim * szf)); // kernel input
114: PetscCallCUDA(cudaMemcpy(SData_d->invJ, a_invJ, nip * dim * dim * szf, cudaMemcpyHostToDevice));
115: PetscCallCUDA(cudaMalloc((void **)&SData_d->x, nip * szf)); // kernel input
116: PetscCallCUDA(cudaMemcpy(SData_d->x, a_x, nip * szf, cudaMemcpyHostToDevice));
117: PetscCallCUDA(cudaMalloc((void **)&SData_d->y, nip * szf)); // kernel input
118: PetscCallCUDA(cudaMemcpy(SData_d->y, a_y, nip * szf, cudaMemcpyHostToDevice));
119: #if LANDAU_DIM == 3
120: PetscCallCUDA(cudaMalloc((void **)&SData_d->z, nip * szf)); // kernel input
121: PetscCallCUDA(cudaMemcpy(SData_d->z, a_z, nip * szf, cudaMemcpyHostToDevice));
122: #else
123: (void)a_z;
124: #endif
125: PetscCallCUDA(cudaMalloc((void **)&SData_d->w, nip * szf)); // kernel input
126: PetscCallCUDA(cudaMemcpy(SData_d->w, a_w, nip * szf, cudaMemcpyHostToDevice));
128: PetscCallCUDA(cudaMalloc((void **)&SData_d->NCells, num_grids * szi));
129: PetscCallCUDA(cudaMemcpy(SData_d->NCells, a_numCells, num_grids * szi, cudaMemcpyHostToDevice));
130: PetscCallCUDA(cudaMalloc((void **)&SData_d->species_offset, (num_grids + 1) * szi));
131: PetscCallCUDA(cudaMemcpy(SData_d->species_offset, a_species_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice));
132: PetscCallCUDA(cudaMalloc((void **)&SData_d->mat_offset, (num_grids + 1) * szi));
133: PetscCallCUDA(cudaMemcpy(SData_d->mat_offset, a_mat_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice));
134: PetscCallCUDA(cudaMalloc((void **)&SData_d->ip_offset, (num_grids + 1) * szi));
135: PetscCallCUDA(cudaMemcpy(SData_d->ip_offset, h_ip_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice));
136: PetscCallCUDA(cudaMalloc((void **)&SData_d->ipf_offset, (num_grids + 1) * szi));
137: PetscCallCUDA(cudaMemcpy(SData_d->ipf_offset, h_ipf_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice));
138: PetscCallCUDA(cudaMalloc((void **)&SData_d->elem_offset, (num_grids + 1) * szi));
139: PetscCallCUDA(cudaMemcpy(SData_d->elem_offset, h_elem_offset, (num_grids + 1) * szi, cudaMemcpyHostToDevice));
140: PetscCallCUDA(cudaMalloc((void **)&SData_d->maps, num_grids * sizeof(P4estVertexMaps *)));
141: // allocate space for dynamic data once
142: PetscCallCUDA(cudaMalloc((void **)&SData_d->Eq_m, Nf * szf)); // this could be for each vertex (todo?)
143: PetscCallCUDA(cudaMalloc((void **)&SData_d->f, nip * Nf * szs * batch_sz)); // for each vertex in batch
144: PetscCallCUDA(cudaMalloc((void **)&SData_d->dfdx, nip * Nf * szs * batch_sz));
145: PetscCallCUDA(cudaMalloc((void **)&SData_d->dfdy, nip * Nf * szs * batch_sz));
146: #if LANDAU_DIM == 3
147: PetscCallCUDA(cudaMalloc((void **)&SData_d->dfdz, nip * Nf * szs * batch_sz));
148: #endif
149: }
150: PetscFunctionReturn(PETSC_SUCCESS);
151: }
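/*
  Editorial note: LandauCUDAStaticDataClear frees everything allocated in
  LandauCUDAStaticDataSet; SData_d->alpha doubles as the "is allocated" flag.
*/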
153: PetscErrorCode LandauCUDAStaticDataClear(LandauStaticData *SData_d)
154: {
155: PetscFunctionBegin;
156: if (SData_d->alpha) {
157: PetscCallCUDA(cudaFree(SData_d->alpha));
158: SData_d->alpha = NULL;
159: PetscCallCUDA(cudaFree(SData_d->beta));
160: PetscCallCUDA(cudaFree(SData_d->invMass));
161: PetscCallCUDA(cudaFree(SData_d->B));
162: PetscCallCUDA(cudaFree(SData_d->D));
163: PetscCallCUDA(cudaFree(SData_d->invJ));
164: #if LANDAU_DIM == 3
165: PetscCallCUDA(cudaFree(SData_d->z));
166: #endif
167: PetscCallCUDA(cudaFree(SData_d->x));
168: PetscCallCUDA(cudaFree(SData_d->y));
169: PetscCallCUDA(cudaFree(SData_d->w));
170: // dynamic data
171: PetscCallCUDA(cudaFree(SData_d->Eq_m));
172: PetscCallCUDA(cudaFree(SData_d->f));
173: PetscCallCUDA(cudaFree(SData_d->dfdx));
174: PetscCallCUDA(cudaFree(SData_d->dfdy));
175: #if LANDAU_DIM == 3
176: PetscCallCUDA(cudaFree(SData_d->dfdz));
177: #endif
178: PetscCallCUDA(cudaFree(SData_d->NCells));
179: PetscCallCUDA(cudaFree(SData_d->species_offset));
180: PetscCallCUDA(cudaFree(SData_d->mat_offset));
181: PetscCallCUDA(cudaFree(SData_d->ip_offset));
182: PetscCallCUDA(cudaFree(SData_d->ipf_offset));
183: PetscCallCUDA(cudaFree(SData_d->elem_offset));
184: PetscCallCUDA(cudaFree(SData_d->maps));
185: }
186: PetscFunctionReturn(PETSC_SUCCESS);
187: }
188: //
189: // The GPU Landau kernel
190: //
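// Editorial note: landau_form_fdf runs one block per (batch vertex = blockIdx.x, element =
// blockIdx.y); threadIdx.y indexes the quadrature point and threadIdx.x strides over the local
// species. For each (element, IP, species) it gathers the element closure coefficients, either
// directly from d_vertex_f or through the GPU-assembly maps, and evaluates f and its
// physical-space gradient df/dx, df/dy[, df/dz] at the integration point.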
191: __global__ void landau_form_fdf(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_invJ[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar *d_vertex_f, P4estVertexMaps *d_maps[], PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
192: #if LANDAU_DIM == 3
193: PetscReal d_dfdz[],
194: #endif
195: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[]) // output
196: {
197: const PetscInt Nq = blockDim.y, myQi = threadIdx.y;
198: const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x, IPf_sz_glb = d_ipf_offset[num_grids];
199: const PetscReal *Bq = &BB[myQi * Nb], *Dq = &DD[myQi * Nb * dim];
200: PetscInt grid = 0, f, d, b, e, q;
201: while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
202: {
203: const PetscInt loc_nip = d_numCells[grid] * Nq, loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
204: const PetscInt moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset);
205: const PetscScalar *coef;
206: PetscReal u_x[LANDAU_DIM];
207: const PetscReal *invJ = &d_invJ[(d_ip_offset[grid] + loc_elem * Nq + myQi) * dim * dim];
208: PetscScalar coef_buff[LANDAU_MAX_SPECIES * LANDAU_MAX_NQ];
209: if (!d_maps) {
210: coef = &d_vertex_f[b_id * IPf_sz_glb + d_ipf_offset[grid] + loc_elem * Nb * loc_Nf]; // closure and IP indexing are the same
211: } else {
212: coef = coef_buff;
213: for (f = 0; f < loc_Nf; ++f) {
214: LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][f][0];
215: for (b = 0; b < Nb; ++b) {
216: PetscInt idx = Idxs[b];
217: if (idx >= 0) {
218: coef_buff[f * Nb + b] = d_vertex_f[idx + moffset];
219: } else {
220: idx = -idx - 1;
221: coef_buff[f * Nb + b] = 0;
222: for (q = 0; q < d_maps[grid]->num_face; q++) {
223: PetscInt id = d_maps[grid]->c_maps[idx][q].gid;
224: PetscReal scale = d_maps[grid]->c_maps[idx][q].scale;
225: if (id >= 0) coef_buff[f * Nb + b] += scale * d_vertex_f[id + moffset];
226: }
227: }
228: }
229: }
230: }
232: /* get f and df */
233: for (f = threadIdx.x; f < loc_Nf; f += blockDim.x) {
234: PetscReal refSpaceDer[LANDAU_DIM];
235: const PetscInt idx = b_id * IPf_sz_glb + d_ipf_offset[grid] + f * loc_nip + loc_elem * Nq + myQi;
236: d_f[idx] = 0.0;
237: for (d = 0; d < LANDAU_DIM; ++d) refSpaceDer[d] = 0.0;
238: for (b = 0; b < Nb; ++b) {
239: const PetscInt cidx = b;
240: d_f[idx] += Bq[cidx] * PetscRealPart(coef[f * Nb + cidx]);
241: for (d = 0; d < dim; ++d) refSpaceDer[d] += Dq[cidx * dim + d] * PetscRealPart(coef[f * Nb + cidx]);
242: }
243: for (d = 0; d < dim; ++d) {
244: for (e = 0, u_x[d] = 0.0; e < dim; ++e) u_x[d] += invJ[e * dim + d] * refSpaceDer[e];
245: }
246: d_dfdx[idx] = u_x[0];
247: d_dfdy[idx] = u_x[1];
248: #if LANDAU_DIM == 3
249: d_dfdz[idx] = u_x[2];
250: #endif
251: }
252: }
253: }
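// Editorial note: landau_jac_kernel is the per-IP body of the Jacobian kernel. The x-thread
// dimension strides over all integration points of all grids (the inner collision integral),
// accumulating gg2_temp/gg3_temp, which are then reduced across the thread block with warp
// shuffles, scaled by nu_alpha and invMass, mapped to the reference element with invJj, and
// finally contracted with the basis tabulations to form the element matrix (or scattered
// directly into the global matrix with MatSetValuesDevice when GPU-assembly maps are present).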
255: __device__ void landau_jac_kernel(const PetscInt num_grids, const PetscInt jpidx, PetscInt nip_global, const PetscInt grid, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], const PetscReal invJj[], const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar *elemMat, P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, // output
256: PetscScalar s_fieldMats[][LANDAU_MAX_NQ], // all these arrays are in shared memory
257: PetscReal s_scale[][LANDAU_MAX_Q_FACE], PetscInt s_idx[][LANDAU_MAX_Q_FACE], PetscReal s_g2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_g3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_gg2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_gg3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES], PetscReal s_nu_alpha[], PetscReal s_nu_beta[], PetscReal s_invMass[], PetscReal s_f[], PetscReal s_dfx[], PetscReal s_dfy[], PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[], // global memory
258: #if LANDAU_DIM == 3
259: const PetscReal zz[], PetscReal s_dfz[], PetscReal d_dfdz[],
260: #endif
261: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
262: {
263: const PetscInt Nq = blockDim.y, myQi = threadIdx.y;
264: const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x, IPf_sz_glb = d_ipf_offset[num_grids];
265: const PetscInt loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
266: const PetscInt moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset);
267: int delta, d, f, g, d2, dp, d3, fieldA, ipidx_b;
268: PetscReal gg2_temp[LANDAU_DIM], gg3_temp[LANDAU_DIM][LANDAU_DIM];
269: #if LANDAU_DIM == 2
270: const PetscReal vj[3] = {xx[jpidx], yy[jpidx]};
271: constexpr int dim = 2;
272: #else
273: const PetscReal vj[3] = {xx[jpidx], yy[jpidx], zz[jpidx]};
274: constexpr int dim = 3;
275: #endif
276: const PetscInt f_off = d_species_offset[grid], Nb = Nq;
277: // create g2 & g3
278: for (f = threadIdx.x; f < loc_Nf; f += blockDim.x) {
279: for (d = 0; d < dim; d++) { // clear accumulation data D & K
280: s_gg2[d][myQi][f] = 0;
281: for (d2 = 0; d2 < dim; d2++) s_gg3[d][d2][myQi][f] = 0;
282: }
283: }
284: #pragma unroll
285: for (d2 = 0; d2 < dim; d2++) {
286: gg2_temp[d2] = 0;
287: #pragma unroll
288: for (d3 = 0; d3 < dim; d3++) gg3_temp[d2][d3] = 0;
289: }
290: if (threadIdx.y == 0) {
291: // copy species into shared memory
292: for (fieldA = threadIdx.x; fieldA < Nftot; fieldA += blockDim.x) {
293: s_nu_alpha[fieldA] = nu_alpha[fieldA];
294: s_nu_beta[fieldA] = nu_beta[fieldA];
295: s_invMass[fieldA] = invMass[fieldA];
296: }
297: }
298: __syncthreads();
299: // inner integral, collect gg2/3
300: for (ipidx_b = 0; ipidx_b < nip_global; ipidx_b += blockDim.x) {
301: const PetscInt ipidx = ipidx_b + threadIdx.x;
302: PetscInt f_off_r, grid_r, loc_Nf_r, nip_loc_r, ipidx_g, fieldB, IPf_idx_r;
303: __syncthreads();
304: if (ipidx < nip_global) {
305: grid_r = 0;
306: while (ipidx >= d_ip_offset[grid_r + 1]) grid_r++;
307: f_off_r = d_species_offset[grid_r];
308: ipidx_g = ipidx - d_ip_offset[grid_r];
309: nip_loc_r = d_numCells[grid_r] * Nq;
310: loc_Nf_r = d_species_offset[grid_r + 1] - d_species_offset[grid_r];
311: IPf_idx_r = b_id * IPf_sz_glb + d_ipf_offset[grid_r] + ipidx_g;
312: for (fieldB = threadIdx.y; fieldB < loc_Nf_r; fieldB += blockDim.y) {
313: const PetscInt idx = IPf_idx_r + fieldB * nip_loc_r;
314: s_f[fieldB * blockDim.x + threadIdx.x] = d_f[idx]; // all vector threads get copy of data
315: s_dfx[fieldB * blockDim.x + threadIdx.x] = d_dfdx[idx];
316: s_dfy[fieldB * blockDim.x + threadIdx.x] = d_dfdy[idx];
317: #if LANDAU_DIM == 3
318: s_dfz[fieldB * blockDim.x + threadIdx.x] = d_dfdz[idx];
319: #endif
320: }
321: }
322: __syncthreads();
323: if (ipidx < nip_global) {
324: const PetscReal wi = ww[ipidx], x = xx[ipidx], y = yy[ipidx];
325: PetscReal temp1[3] = {0, 0, 0}, temp2 = 0;
326: #if LANDAU_DIM == 2
327: PetscReal Ud[2][2], Uk[2][2], mask = (PetscAbs(vj[0] - x) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1] - y) < 100 * PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
328: LandauTensor2D(vj, x, y, Ud, Uk, mask);
329: #else
330: PetscReal U[3][3], z = zz[ipidx], mask = (PetscAbs(vj[0] - x) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1] - y) < 100 * PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[2] - z) < 100 * PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
331: LandauTensor3D(vj, x, y, z, U, mask);
332: #endif
333: for (int fieldB = 0; fieldB < loc_Nf_r; fieldB++) {
334: temp1[0] += s_dfx[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r] * 7; // todo : bring lambdas in
335: temp1[1] += s_dfy[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r] * 7;
336: #if LANDAU_DIM == 3
337: temp1[2] += s_dfz[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * s_invMass[fieldB + f_off_r] * 7;
338: #endif
339: temp2 += s_f[fieldB * blockDim.x + threadIdx.x] * s_nu_beta[fieldB + f_off_r] * 7;
340: }
341: temp1[0] *= wi;
342: temp1[1] *= wi;
343: #if LANDAU_DIM == 3
344: temp1[2] *= wi;
345: #endif
346: temp2 *= wi;
347: #if LANDAU_DIM == 2
348: #pragma unroll
349: for (d2 = 0; d2 < 2; d2++) {
350: #pragma unroll
351: for (d3 = 0; d3 < 2; ++d3) {
352: /* K = U * grad(f): g2=e: i,A */
353: gg2_temp[d2] += Uk[d2][d3] * temp1[d3];
354: /* D = -U * (I \kron (fx)): g3=f: i,j,A */
355: gg3_temp[d2][d3] += Ud[d2][d3] * temp2;
356: }
357: }
358: #else
359: #pragma unroll
360: for (d2 = 0; d2 < 3; ++d2) {
361: #pragma unroll
362: for (d3 = 0; d3 < 3; ++d3) {
363: /* K = U * grad(f): g2 = e: i,A */
364: gg2_temp[d2] += U[d2][d3] * temp1[d3];
365: /* D = -U * (I \kron (fx)): g3 = f: i,j,A */
366: gg3_temp[d2][d3] += U[d2][d3] * temp2;
367: }
368: }
369: #endif
370: }
371: } /* IPs */
373: /* reduce gg temp sums across threads */
374: for (delta = blockDim.x / 2; delta > 0; delta /= 2) {
375: #pragma unroll
376: for (d2 = 0; d2 < dim; d2++) {
377: gg2_temp[d2] += __shfl_xor_sync(0xffffffff, gg2_temp[d2], delta, blockDim.x);
378: #pragma unroll
379: for (d3 = 0; d3 < dim; d3++) gg3_temp[d2][d3] += __shfl_xor_sync(0xffffffff, gg3_temp[d2][d3], delta, blockDim.x);
380: }
381: }
382: // add alpha and put in gg2/3
383: for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) {
384: #pragma unroll
385: for (d2 = 0; d2 < dim; d2++) {
386: s_gg2[d2][myQi][fieldA] += gg2_temp[d2] * s_nu_alpha[fieldA + f_off];
387: #pragma unroll
388: for (d3 = 0; d3 < dim; d3++) s_gg3[d2][d3][myQi][fieldA] -= gg3_temp[d2][d3] * s_nu_alpha[fieldA + f_off] * s_invMass[fieldA + f_off];
389: }
390: }
391: __syncthreads();
392: /* add electric field term once per IP */
393: for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) s_gg2[dim - 1][myQi][fieldA] += Eq_m[fieldA + f_off];
394: __syncthreads();
395: /* Jacobian transform - g2 */
396: for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) {
397: PetscReal wj = ww[jpidx];
398: for (d = 0; d < dim; ++d) {
399: s_g2[d][myQi][fieldA] = 0.0;
400: for (d2 = 0; d2 < dim; ++d2) {
401: s_g2[d][myQi][fieldA] += invJj[d * dim + d2] * s_gg2[d2][myQi][fieldA];
402: s_g3[d][d2][myQi][fieldA] = 0.0;
403: for (d3 = 0; d3 < dim; ++d3) {
404: for (dp = 0; dp < dim; ++dp) s_g3[d][d2][myQi][fieldA] += invJj[d * dim + d3] * s_gg3[d3][dp][myQi][fieldA] * invJj[d2 * dim + dp];
405: }
406: s_g3[d][d2][myQi][fieldA] *= wj;
407: }
408: s_g2[d][myQi][fieldA] *= wj;
409: }
410: }
411: __syncthreads(); // Synchronize (ensure all the data is available) and sum IP matrices
412: /* FE matrix construction */
413: {
414: int fieldA, d, qj, d2, q, idx, totDim = Nb * loc_Nf;
415: /* assemble */
416: for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
417: for (f = threadIdx.y; f < Nb; f += blockDim.y) {
418: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
419: PetscScalar t = 0;
420: for (qj = 0; qj < Nq; qj++) {
421: const PetscReal *BJq = &BB[qj * Nb], *DIq = &DD[qj * Nb * dim];
422: for (d = 0; d < dim; ++d) {
423: t += DIq[f * dim + d] * s_g2[d][qj][fieldA] * BJq[g];
424: for (d2 = 0; d2 < dim; ++d2) t += DIq[f * dim + d] * s_g3[d][d2][qj][fieldA] * DIq[g * dim + d2];
425: }
426: }
427: if (elemMat) {
428: const PetscInt fOff = (fieldA * Nb + f) * totDim + fieldA * Nb + g;
429: elemMat[fOff] += t; // ????
430: } else s_fieldMats[f][g] = t;
431: }
432: }
433: if (s_fieldMats) {
434: PetscScalar vals[LANDAU_MAX_Q_FACE * LANDAU_MAX_Q_FACE];
435: PetscInt nr, nc;
436: const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
437: __syncthreads();
438: if (threadIdx.y == 0) {
439: for (f = threadIdx.x; f < Nb; f += blockDim.x) {
440: idx = Idxs[f];
441: if (idx >= 0) {
442: s_idx[f][0] = idx + moffset;
443: s_scale[f][0] = 1.;
444: } else {
445: idx = -idx - 1;
446: for (q = 0; q < d_maps[grid]->num_face; q++) {
447: if (d_maps[grid]->c_maps[idx][q].gid >= 0) s_idx[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
448: else s_idx[f][q] = -1;
449: s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
450: }
451: }
452: }
453: }
454: __syncthreads();
455: for (f = threadIdx.y; f < Nb; f += blockDim.y) {
456: idx = Idxs[f];
457: if (idx >= 0) {
458: nr = 1;
459: } else {
460: nr = d_maps[grid]->num_face;
461: }
462: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
463: idx = Idxs[g];
464: if (idx >= 0) {
465: nc = 1;
466: } else {
467: nc = d_maps[grid]->num_face;
468: }
469: for (q = 0; q < nr; q++) {
470: for (d = 0; d < nc; d++) vals[q * nc + d] = s_scale[f][q] * s_scale[g][d] * s_fieldMats[f][g];
471: }
472: static_cast<void>(MatSetValuesDevice(d_mat, nr, s_idx[f], nc, s_idx[g], vals, ADD_VALUES));
473: }
474: }
475: __syncthreads();
476: }
477: }
478: }
479: }
481: //
482: // The CUDA Landau kernel
483: //
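// Editorial note: landau_jacobian carves the dynamically sized shared-memory buffer smem into
// the g2/g3 accumulators, the shared species coefficients, and the per-thread f/df staging
// arrays, locates this block's grid and element from d_elem_offset, optionally zeroes its
// slice of d_elem_mats, and then calls landau_jac_kernel to do the work.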
484: __global__ void __launch_bounds__(256, 2) landau_jacobian(const PetscInt nip_global, const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal invJj[], const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal *const BB, const PetscReal *const DD, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
485: #if LANDAU_DIM == 3
486: const PetscReal zz[], PetscReal d_dfdz[],
487: #endif
488: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
489: {
490: extern __shared__ PetscReal smem[];
491: int size = 0;
492: PetscReal(*s_g2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
493: size += LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM;
494: PetscReal(*s_g3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
495: size += LANDAU_DIM * LANDAU_DIM * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES;
496: PetscReal(*s_gg2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
497: size += LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM;
498: PetscReal(*s_gg3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = (PetscReal(*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) & smem[size];
499: size += LANDAU_DIM * LANDAU_DIM * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES;
500: PetscReal *s_nu_alpha = &smem[size];
501: size += LANDAU_MAX_SPECIES;
502: PetscReal *s_nu_beta = &smem[size];
503: size += LANDAU_MAX_SPECIES;
504: PetscReal *s_invMass = &smem[size];
505: size += LANDAU_MAX_SPECIES;
506: PetscReal *s_f = &smem[size];
507: size += blockDim.x * LANDAU_MAX_SPECIES;
508: PetscReal *s_dfx = &smem[size];
509: size += blockDim.x * LANDAU_MAX_SPECIES;
510: PetscReal *s_dfy = &smem[size];
511: size += blockDim.x * LANDAU_MAX_SPECIES;
512: #if LANDAU_DIM == 3
513: PetscReal *s_dfz = &smem[size];
514: size += blockDim.x * LANDAU_MAX_SPECIES;
515: #endif
516: PetscScalar(*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
517: PetscReal(*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE] = nullptr;
518: PetscInt(*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE] = nullptr;
519: const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x;
520: PetscInt Nq = blockDim.y, grid = 0; // Nq == Nb
521: PetscScalar *elemMat = NULL; /* my output */
522: while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
523: {
524: const PetscInt loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
525: const PetscInt myQi = threadIdx.y;
526: const PetscInt jpidx = d_ip_offset[grid] + myQi + loc_elem * Nq;
527: const PetscReal *invJ = &invJj[jpidx * dim * dim];
528: if (d_elem_mats) {
529: PetscInt totDim = loc_Nf * Nb;
530: elemMat = d_elem_mats; // start at the beginning and get to my element matrix
531: for (PetscInt b_id2 = 0; b_id2 < b_id; b_id2++) {
532: for (PetscInt grid2 = 0; grid2 < num_grids; grid2++) {
533: PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
534: elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
535: }
536: }
537: for (PetscInt grid2 = 0; grid2 < grid; grid2++) {
538: PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
539: elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
540: }
541: elemMat += loc_elem * totDim * totDim; // index into local matrix & zero out
542: for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < totDim * totDim; i += blockDim.x * blockDim.y) elemMat[i] = 0;
543: }
544: __syncthreads();
545: if (d_maps) {
546: // reuse the space for fieldMats
547: s_fieldMats = (PetscScalar(*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) & smem[size];
548: size += LANDAU_MAX_NQ * LANDAU_MAX_NQ;
549: s_scale = (PetscReal(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
550: size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
551: s_idx = (PetscInt(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
552: size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE; // this is too big, idx is an integer
553: } else {
554: s_fieldMats = NULL;
555: }
556: __syncthreads();
557: landau_jac_kernel(num_grids, jpidx, nip_global, grid, xx, yy, ww, invJ, Nftot, nu_alpha, nu_beta, invMass, Eq_m, BB, DD, elemMat, d_maps, d_mat, *s_fieldMats, *s_scale, *s_idx, *s_g2, *s_g3, *s_gg2, *s_gg3, s_nu_alpha, s_nu_beta, s_invMass, s_f, s_dfx, s_dfy, d_f, d_dfdx, d_dfdy,
558: #if LANDAU_DIM == 3
559: zz, s_dfz, d_dfdz,
560: #endif
561: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
562: }
563: }
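// Editorial note: landau_mass assembles the species-block-diagonal mass matrix. Each entry is
// sum_q B_f(q) * w_q * shift * B_g(q), with an extra 2*pi factor in 2D (presumably the
// azimuthal measure of the cylindrical coordinates), written either to d_elem_mats or directly
// into the global matrix through the assembly maps.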
565: __global__ void __launch_bounds__(256, 4) landau_mass(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_w[], const PetscReal *const BB, const PetscReal *const DD, PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal shift, const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_elem_offset[])
566: {
567: extern __shared__ PetscReal smem[];
568: const PetscInt Nq = blockDim.y, b_elem_idx = blockIdx.y, b_id = blockIdx.x;
569: PetscScalar *elemMat = NULL; /* my output */
570: PetscInt fieldA, d, qj, q, idx, f, g, grid = 0, size = 0;
571: PetscScalar(*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
572: PetscReal(*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
573: PetscInt(*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
574: if (d_maps) {
575: // reuse the space for fieldMats
576: s_fieldMats = (PetscScalar(*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) & smem[size];
577: size += LANDAU_MAX_NQ * LANDAU_MAX_NQ;
578: s_scale = (PetscReal(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
579: size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
580: s_idx = (PetscInt(*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) & smem[size];
581: size += LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE; // this is too big, idx is an integer
582: } else {
583: s_fieldMats = NULL;
584: }
585: while (b_elem_idx >= d_elem_offset[grid + 1]) grid++;
586: {
587: const PetscInt loc_Nf = d_species_offset[grid + 1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
588: const PetscInt moffset = LAND_MOFFSET(b_id, grid, gridDim.x, num_grids, d_mat_offset), totDim = loc_Nf * Nq;
589: if (d_elem_mats) {
590: elemMat = d_elem_mats; // start at the beginning
591: for (PetscInt b_id2 = 0; b_id2 < b_id; b_id2++) {
592: for (PetscInt grid2 = 0; grid2 < num_grids; grid2++) {
593: PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
594: elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
595: }
596: }
597: for (PetscInt grid2 = 0; grid2 < grid; grid2++) {
598: PetscInt Nfloc2 = d_species_offset[grid2 + 1] - d_species_offset[grid2], totDim2 = Nfloc2 * Nb;
599: elemMat += d_numCells[grid2] * totDim2 * totDim2; // jump past grids, could be in an offset
600: }
601: elemMat += loc_elem * totDim * totDim;
602: for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < totDim * totDim; i += blockDim.x * blockDim.y) elemMat[i] = 0;
603: }
604: __syncthreads();
605: /* FE mass matrix construction */
606: for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
607: PetscScalar vals[LANDAU_MAX_Q_FACE * LANDAU_MAX_Q_FACE];
608: PetscInt nr, nc;
609: for (f = threadIdx.y; f < Nb; f += blockDim.y) {
610: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
611: PetscScalar t = 0;
612: for (qj = 0; qj < Nq; qj++) {
613: const PetscReal *BJq = &BB[qj * Nb];
614: const PetscInt jpidx = d_ip_offset[grid] + qj + loc_elem * Nq;
615: if (dim == 2) {
616: t += BJq[f] * d_w[jpidx] * shift * BJq[g] * 2. * PETSC_PI;
617: } else {
618: t += BJq[f] * d_w[jpidx] * shift * BJq[g];
619: }
620: }
621: if (elemMat) {
622: const PetscInt fOff = (fieldA * Nb + f) * totDim + fieldA * Nb + g;
623: elemMat[fOff] += t; // ????
624: } else (*s_fieldMats)[f][g] = t;
625: }
626: }
627: if (!elemMat) {
628: const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
629: __syncthreads();
630: if (threadIdx.y == 0) {
631: for (f = threadIdx.x; f < Nb; f += blockDim.x) {
632: idx = Idxs[f];
633: if (idx >= 0) {
634: (*s_idx)[f][0] = idx + moffset;
635: (*s_scale)[f][0] = 1.;
636: } else {
637: idx = -idx - 1;
638: for (q = 0; q < d_maps[grid]->num_face; q++) {
639: if (d_maps[grid]->c_maps[idx][q].gid >= 0) (*s_idx)[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
640: else (*s_idx)[f][q] = -1;
641: (*s_scale)[f][q] = d_maps[grid]->c_maps[idx][q].scale;
642: }
643: }
644: }
645: }
646: __syncthreads();
647: for (f = threadIdx.y; f < Nb; f += blockDim.y) {
648: idx = Idxs[f];
649: if (idx >= 0) {
650: nr = 1;
651: } else {
652: nr = d_maps[grid]->num_face;
653: }
654: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
655: idx = Idxs[g];
656: if (idx >= 0) {
657: nc = 1;
658: } else {
659: nc = d_maps[grid]->num_face;
660: }
661: for (q = 0; q < nr; q++) {
662: for (d = 0; d < nc; d++) vals[q * nc + d] = (*s_scale)[f][q] * (*s_scale)[g][d] * (*s_fieldMats)[f][g];
663: }
664: static_cast<void>(MatSetValuesDevice(d_mat, nr, (*s_idx)[f], nc, (*s_idx)[g], vals, ADD_VALUES));
665: }
666: }
667: }
668: __syncthreads();
669: }
670: }
671: }
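/*
  Editorial note: LandauCUDAJacobian is the host driver. It picks a power-of-two vector length
  nnn (capped at 16), fetches the device assembly maps from the "assembly_maps" container when
  GPU assembly is enabled, uploads Eq_m and the vertex closure for the Jacobian path, launches
  landau_form_fdf followed by landau_jacobian (or landau_mass when neither a_elem_closure nor
  a_xarray is given), and, when element matrices were produced on the device (no GPU assembly),
  copies them back and assembles them into subJ and then JacP with DMPlexMatSetClosure and
  MatSetValues.
*/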
673: PetscErrorCode LandauCUDAJacobian(DM plex[], const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids, const PetscInt a_numCells[], PetscReal a_Eq_m[], PetscScalar a_elem_closure[], const PetscScalar a_xarray[], const LandauStaticData *SData_d, const PetscReal shift, const PetscLogEvent events[], const PetscInt a_mat_offset[], const PetscInt a_species_offset[], Mat subJ[], Mat JacP)
674: {
675: cudaError_t cerr;
676: PetscInt Nb = Nq, dim, nip_global, num_cells_batch, elem_mat_size_tot;
677: PetscInt *d_numCells, *d_species_offset, *d_mat_offset, *d_ip_offset, *d_ipf_offset, *d_elem_offset;
678: PetscInt szf = sizeof(PetscReal), szs = sizeof(PetscScalar), Nftot = a_species_offset[num_grids];
679: PetscReal *d_BB = NULL, *d_DD = NULL, *d_invJj = NULL, *d_nu_alpha = NULL, *d_nu_beta = NULL, *d_invMass = NULL, *d_Eq_m = NULL, *d_x = NULL, *d_y = NULL, *d_w = NULL;
680: PetscScalar *d_elem_mats = NULL, *d_vertex_f = NULL;
681: PetscReal *d_f = NULL, *d_dfdx = NULL, *d_dfdy = NULL;
682: #if LANDAU_DIM == 3
683: PetscReal *d_dfdz = NULL, *d_z = NULL;
684: #endif
685: LandauCtx *ctx;
686: PetscSplitCSRDataStructure d_mat = NULL;
687: P4estVertexMaps **d_maps, *maps[LANDAU_MAX_GRIDS];
688: int nnn = 256 / Nq; // machine dependent
689: PetscContainer container;
691: PetscFunctionBegin;
692: PetscCall(PetscLogEventBegin(events[3], 0, 0, 0, 0));
693: while (nnn & nnn - 1) nnn = nnn & nnn - 1;
694: if (nnn > 16) nnn = 16;
695: PetscCall(DMGetApplicationContext(plex[0], &ctx));
696: PetscCheck(ctx, PETSC_COMM_SELF, PETSC_ERR_PLIB, "no context");
697: PetscCall(DMGetDimension(plex[0], &dim));
698: PetscCheck(dim == LANDAU_DIM, PETSC_COMM_SELF, PETSC_ERR_PLIB, "LANDAU_DIM %d != dim %" PetscInt_FMT, LANDAU_DIM, dim);
699: if (ctx->gpu_assembly) {
700: PetscCall(PetscObjectQuery((PetscObject)JacP, "assembly_maps", (PetscObject *)&container));
701: if (container) { // not here first call
702: static int init = 0; // hack. just do every time, or put in setup (but that is in base class code), or add init_maps flag
703: if (!init++) {
704: P4estVertexMaps *h_maps = NULL;
705: PetscCall(PetscContainerGetPointer(container, (void **)&h_maps));
706: for (PetscInt grid = 0; grid < num_grids; grid++) {
707: if (h_maps[grid].d_self) {
708: maps[grid] = h_maps[grid].d_self;
709: } else {
710: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
711: }
712: }
713: PetscCallCUDA(cudaMemcpy(SData_d->maps, maps, num_grids * sizeof(P4estVertexMaps *), cudaMemcpyHostToDevice));
714: }
715: d_maps = (P4estVertexMaps **)SData_d->maps;
716: // this does the setup the first time called
717: PetscCall(MatCUSPARSEGetDeviceMatWrite(JacP, &d_mat));
718: } else {
719: d_maps = NULL;
720: }
721: } else {
722: container = NULL;
723: d_maps = NULL;
724: }
725: PetscCall(PetscLogEventEnd(events[3], 0, 0, 0, 0));
726: {
727: PetscInt elem_mat_size = 0;
728: nip_global = num_cells_batch = 0;
729: for (PetscInt grid = 0; grid < num_grids; grid++) {
730: PetscInt Nfloc = a_species_offset[grid + 1] - a_species_offset[grid], totDim = Nfloc * Nb;
731: nip_global += a_numCells[grid] * Nq;
732: num_cells_batch += a_numCells[grid]; // is in d_elem_offset, but not on host
733: elem_mat_size += a_numCells[grid] * totDim * totDim; // could save in an offset here -- batch major ordering
734: }
735: elem_mat_size_tot = d_maps ? 0 : elem_mat_size;
736: }
737: dim3 dimGrid(batch_sz, num_cells_batch);
738: if (elem_mat_size_tot) {
739: PetscCallCUDA(cudaMalloc((void **)&d_elem_mats, batch_sz * elem_mat_size_tot * szs)); // kernel output - first call is on CPU
740: } else d_elem_mats = NULL;
741: // create data
742: d_BB = (PetscReal *)SData_d->B;
743: d_DD = (PetscReal *)SData_d->D;
744: if (a_elem_closure || a_xarray) { // form f and df
745: PetscCall(PetscLogEventBegin(events[1], 0, 0, 0, 0));
746: PetscCallCUDA(cudaMemcpy(SData_d->Eq_m, a_Eq_m, Nftot * szf, cudaMemcpyHostToDevice));
747: d_invJj = (PetscReal *)SData_d->invJ;
748: d_nu_alpha = (PetscReal *)SData_d->alpha;
749: d_nu_beta = (PetscReal *)SData_d->beta;
750: d_invMass = (PetscReal *)SData_d->invMass;
751: d_x = (PetscReal *)SData_d->x;
752: d_y = (PetscReal *)SData_d->y;
753: d_w = (PetscReal *)SData_d->w;
754: d_Eq_m = (PetscReal *)SData_d->Eq_m;
755: d_dfdx = (PetscReal *)SData_d->dfdx;
756: d_dfdy = (PetscReal *)SData_d->dfdy;
757: #if LANDAU_DIM == 3
758: d_dfdz = (PetscReal *)SData_d->dfdz;
759: d_z = (PetscReal *)SData_d->z;
760: #endif
761: d_f = (PetscReal *)SData_d->f;
762: // get a d_vertex_f
763: if (a_elem_closure) {
764: PetscInt closure_sz = 0; // argh, don't have this on the host!!!
765: for (PetscInt grid = 0; grid < num_grids; grid++) {
766: PetscInt nfloc = a_species_offset[grid + 1] - a_species_offset[grid];
767: closure_sz += Nq * nfloc * a_numCells[grid];
768: }
769: closure_sz *= batch_sz;
770: PetscCallCUDA(cudaMalloc((void **)&d_vertex_f, closure_sz * sizeof(*a_elem_closure)));
771: PetscCallCUDA(cudaMemcpy(d_vertex_f, a_elem_closure, closure_sz * sizeof(*a_elem_closure), cudaMemcpyHostToDevice));
772: } else {
773: d_vertex_f = (PetscScalar *)a_xarray;
774: }
775: PetscCall(PetscLogEventEnd(events[1], 0, 0, 0, 0));
776: } else {
777: d_w = (PetscReal *)SData_d->w; // mass just needs the weights
778: }
779: //
780: d_numCells = (PetscInt *)SData_d->NCells; // redundant -- remove
781: d_species_offset = (PetscInt *)SData_d->species_offset;
782: d_mat_offset = (PetscInt *)SData_d->mat_offset;
783: d_ip_offset = (PetscInt *)SData_d->ip_offset;
784: d_ipf_offset = (PetscInt *)SData_d->ipf_offset;
785: d_elem_offset = (PetscInt *)SData_d->elem_offset;
786: if (a_elem_closure || a_xarray) { // form f and df
787: dim3 dimBlockFDF(nnn > Nftot ? Nftot : nnn, Nq), dimBlock((nip_global > nnn) ? nnn : nip_global, Nq);
788: PetscCall(PetscLogEventBegin(events[8], 0, 0, 0, 0));
789: PetscCall(PetscLogGpuTimeBegin());
790: PetscCall(PetscInfo(plex[0], "Form F and dF/dx vectors: nip_global=%" PetscInt_FMT " num_grids=%" PetscInt_FMT "\n", nip_global, num_grids));
791: landau_form_fdf<<<dimGrid, dimBlockFDF>>>(dim, Nb, num_grids, d_invJj, d_BB, d_DD, d_vertex_f, d_maps, d_f, d_dfdx, d_dfdy,
792: #if LANDAU_DIM == 3
793: d_dfdz,
794: #endif
795: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
796: PetscCUDACheckLaunch;
797: PetscCall(PetscLogGpuFlops(batch_sz * nip_global * (PetscLogDouble)(2 * Nb * (1 + dim))));
798: if (a_elem_closure) {
799: PetscCallCUDA(cudaFree(d_vertex_f));
800: d_vertex_f = NULL;
801: }
802: PetscCall(PetscLogGpuTimeEnd());
803: PetscCall(PetscLogEventEnd(events[8], 0, 0, 0, 0));
804: // Jacobian
805: PetscCall(PetscLogEventBegin(events[4], 0, 0, 0, 0));
806: PetscCall(PetscLogGpuTimeBegin());
807: PetscCall(PetscLogGpuFlops(batch_sz * nip_global * (PetscLogDouble)(a_elem_closure ? (nip_global * (11 * Nftot + 4 * dim * dim) + 6 * Nftot * dim * dim * dim + 10 * Nftot * dim * dim + 4 * Nftot * dim + Nb * Nftot * Nb * Nq * dim * dim * 5) : Nb * Nftot * Nb * Nq * 4)));
808: PetscInt ii = 2 * LANDAU_MAX_NQ * LANDAU_MAX_SPECIES * LANDAU_DIM * (1 + LANDAU_DIM) + 3 * LANDAU_MAX_SPECIES + (1 + LANDAU_DIM) * dimBlock.x * LANDAU_MAX_SPECIES + LANDAU_MAX_NQ * LANDAU_MAX_NQ + 2 * LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
809: if (ii * szf >= 49152) {
810: cerr = cudaFuncSetAttribute(landau_jacobian, cudaFuncAttributeMaxDynamicSharedMemorySize, 98304);
811: PetscCallCUDA(cerr);
812: }
813: PetscCall(PetscInfo(plex[0], "Jacobian shared memory size: %" PetscInt_FMT " bytes, d_elem_mats=%p d_maps=%p\n", ii, d_elem_mats, d_maps));
814: landau_jacobian<<<dimGrid, dimBlock, ii * szf>>>(nip_global, dim, Nb, num_grids, d_invJj, Nftot, d_nu_alpha, d_nu_beta, d_invMass, d_Eq_m, d_BB, d_DD, d_x, d_y, d_w, d_elem_mats, d_maps, d_mat, d_f, d_dfdx, d_dfdy,
815: #if LANDAU_DIM == 3
816: d_z, d_dfdz,
817: #endif
818: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
819: PetscCUDACheckLaunch; // has sync
820: PetscCall(PetscLogGpuTimeEnd());
821: PetscCall(PetscLogEventEnd(events[4], 0, 0, 0, 0));
822: } else { // mass
823: dim3 dimBlock(nnn, Nq);
824: PetscInt ii = LANDAU_MAX_NQ * LANDAU_MAX_NQ + 2 * LANDAU_MAX_NQ * LANDAU_MAX_Q_FACE;
825: if (ii * szf >= 49152) {
826: cerr = cudaFuncSetAttribute(landau_mass, cudaFuncAttributeMaxDynamicSharedMemorySize, 98304);
827: PetscCallCUDA(cerr);
828: }
829: PetscCall(PetscInfo(plex[0], "Mass d_maps = %p. Nq=%" PetscInt_FMT ", vector size %d num_cells_batch=%" PetscInt_FMT ", %" PetscInt_FMT " shared memory words\n", d_maps, Nq, nnn, num_cells_batch, ii));
830: PetscCall(PetscLogEventBegin(events[16], 0, 0, 0, 0));
831: PetscCall(PetscLogGpuTimeBegin());
832: landau_mass<<<dimGrid, dimBlock, ii * szf>>>(dim, Nb, num_grids, d_w, d_BB, d_DD, d_elem_mats, d_maps, d_mat, shift, d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_elem_offset);
833: PetscCUDACheckLaunch; // has sync
834: PetscCall(PetscLogGpuTimeEnd());
835: PetscCall(PetscLogEventEnd(events[16], 0, 0, 0, 0));
836: }
837: // First time assembly with or without GPU assembly
838: if (d_elem_mats) {
839: PetscInt elem_mats_idx = 0;
840: for (PetscInt b_id = 0; b_id < batch_sz; b_id++) { // OpenMP (once)
841: for (PetscInt grid = 0; grid < num_grids; grid++) { // elem_mats_idx += totDim*totDim*a_numCells[grid];
842: const PetscInt Nfloc = a_species_offset[grid + 1] - a_species_offset[grid], totDim = Nfloc * Nq;
843: PetscScalar *elemMats = NULL, *elMat;
844: PetscSection section, globalSection;
845: PetscInt cStart, cEnd, ej;
846: PetscInt moffset = LAND_MOFFSET(b_id, grid, batch_sz, num_grids, a_mat_offset), nloc, nzl, colbuf[1024], row;
847: const PetscInt *cols;
848: const PetscScalar *vals;
849: Mat B = subJ[LAND_PACK_IDX(b_id, grid)];
850: PetscCall(PetscLogEventBegin(events[5], 0, 0, 0, 0));
851: PetscCall(DMPlexGetHeightStratum(plex[grid], 0, &cStart, &cEnd));
852: PetscCall(DMGetLocalSection(plex[grid], &section));
853: PetscCall(DMGetGlobalSection(plex[grid], &globalSection));
854: PetscCall(PetscMalloc1(totDim * totDim * a_numCells[grid], &elemMats));
855: PetscCallCUDA(cudaMemcpy(elemMats, &d_elem_mats[elem_mats_idx], totDim * totDim * a_numCells[grid] * sizeof(*elemMats), cudaMemcpyDeviceToHost));
856: PetscCall(PetscLogEventEnd(events[5], 0, 0, 0, 0));
857: PetscCall(PetscLogEventBegin(events[6], 0, 0, 0, 0));
858: for (ej = cStart, elMat = elemMats; ej < cEnd; ++ej, elMat += totDim * totDim) {
859: PetscCall(DMPlexMatSetClosure(plex[grid], section, globalSection, B, ej, elMat, ADD_VALUES));
860: if (ej == -1) {
861: int d, f;
862: PetscCall(PetscPrintf(PETSC_COMM_SELF, "GPU Element matrix\n"));
863: for (d = 0; d < totDim; ++d) {
864: for (f = 0; f < totDim; ++f) PetscCall(PetscPrintf(PETSC_COMM_SELF, " %12.5e", PetscRealPart(elMat[d * totDim + f])));
865: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
866: }
867: }
868: }
869: PetscCall(PetscFree(elemMats));
870: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
871: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
872: // move nest matrix to global JacP
873: PetscCall(MatGetSize(B, &nloc, NULL));
874: for (int i = 0; i < nloc; i++) {
875: PetscCall(MatGetRow(B, i, &nzl, &cols, &vals));
876: PetscCheck(nzl <= 1024, PetscObjectComm((PetscObject)B), PETSC_ERR_PLIB, "Row too big: %" PetscInt_FMT, nzl);
877: for (int j = 0; j < nzl; j++) colbuf[j] = cols[j] + moffset;
878: row = i + moffset;
879: PetscCall(MatSetValues(JacP, 1, &row, nzl, colbuf, vals, ADD_VALUES));
880: PetscCall(MatRestoreRow(B, i, &nzl, &cols, &vals));
881: }
882: PetscCall(MatDestroy(&B));
883: PetscCall(PetscLogEventEnd(events[6], 0, 0, 0, 0));
884: elem_mats_idx += totDim * totDim * a_numCells[grid]; // this can be a stored offset?
885: } // grids
886: }
887: PetscCheck(elem_mats_idx == batch_sz * elem_mat_size_tot, PetscObjectComm((PetscObject)JacP), PETSC_ERR_PLIB, "elem_mats_idx != batch_sz*elem_mat_size_tot: %" PetscInt_FMT " %" PetscInt_FMT, elem_mats_idx, batch_sz * elem_mat_size_tot);
888: PetscCallCUDA(cudaFree(d_elem_mats));
889: }
891: PetscFunctionReturn(PETSC_SUCCESS);
892: }
893: #endif