Actual source code: landaucu.cu

petsc-3.15.0 2021-03-30
  1: /*
  2:   Implements the Landau kernel
  3: */
  4: #include <petscconf.h>
  5: #include <petsc/private/dmpleximpl.h>
  6: #include <petsclandau.h>
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <petscmat.h>
 10: #include <petsccublas.h>

 12: // hack to avoid configure problems in CI. Delete when resolved
 13: #if !defined (PETSC_HAVE_CUDA_ATOMIC)
 14: #define atomicAdd(e, f) (*e) += f
 15: #endif
 16: #define PETSC_DEVICE_FUNC_DECL __device__
 17: #include "../land_tensors.h"
 18: #include <petscaijdevice.h>

 20: #define CHECK_LAUNCH_ERROR()                                                             \
 21: do {                                                                                     \
 22:   /* Check synchronous errors, i.e. pre-launch */                                        \
 23:   cudaError_t err = cudaGetLastError();                                                  \
 24:   if (cudaSuccess != err) {                                                              \
 25:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
 26:   }                                                                                      \
 27:   /* Check asynchronous errors, i.e. kernel failed (ULF) */                              \
 28:   err = cudaDeviceSynchronize();                                                         \
 29:   if (cudaSuccess != err) {                                                              \
 30:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
 31:   }                                                                                      \
 32:  } while (0)
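
The macro is meant to be invoked on the host right after a kernel launch: the first cudaGetLastError() catches synchronous launch-configuration errors and the cudaDeviceSynchronize() surfaces asynchronous failures from the kernel itself. A minimal usage sketch follows; the kernel demo_scale, demo_launch, and the launch shape are illustrative and not part of this file.

__global__ void demo_scale(PetscReal *x, PetscInt n)
{
  PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) x[i] *= 2.0; /* trivial body, just something to launch */
}

static PetscErrorCode demo_launch(PetscReal *d_x, PetscInt n)
{
  demo_scale<<<(int)((n+255)/256),256>>>(d_x, n); /* asynchronous launch */
  CHECK_LAUNCH_ERROR();                           /* SETERRQ1s on launch or in-kernel failure */
  return 0;
}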

 34: PETSC_EXTERN PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps *maps, pointInterpolationP4est (*points)[LANDAU_MAX_Q_FACE], PetscInt Nf, PetscInt Nq)
 35: {
 36:   P4estVertexMaps h_maps;
 37:   cudaError_t     cerr;
 39:   h_maps.num_elements = maps->num_elements;
 40:   h_maps.num_face = maps->num_face;
 41:   h_maps.num_reduced = maps->num_reduced;
 42:   h_maps.deviceType = maps->deviceType;
 43:   h_maps.Nf = Nf;
 44:   h_maps.Nq = Nq;
 45:   cerr = cudaMalloc((void **)&h_maps.c_maps,               maps->num_reduced  * sizeof *points);CHKERRCUDA(cerr);
 46:   cerr = cudaMemcpy(          h_maps.c_maps, maps->c_maps, maps->num_reduced  * sizeof *points, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
 47:   cerr = cudaMalloc((void **)&h_maps.gIdx,                 maps->num_elements * sizeof *maps->gIdx);CHKERRCUDA(cerr);
 48:   cerr = cudaMemcpy(          h_maps.gIdx, maps->gIdx,     maps->num_elements * sizeof *maps->gIdx, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
 49:   cerr = cudaMalloc((void **)&maps->data, sizeof(P4estVertexMaps));CHKERRCUDA(cerr);
 50:   cerr = cudaMemcpy(          maps->data,   &h_maps, sizeof(P4estVertexMaps), cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
 51:   return(0);
 52: }

 54: PETSC_EXTERN PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps *pMaps)
 55: {
 56:   P4estVertexMaps *d_maps = pMaps->data, h_maps;
 57:   cudaError_t     cerr;
 59:   cerr = cudaMemcpy(&h_maps, d_maps, sizeof(P4estVertexMaps), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
 60:   cerr = cudaFree(h_maps.c_maps);CHKERRCUDA(cerr);
 61:   cerr = cudaFree(h_maps.gIdx);CHKERRCUDA(cerr);
 62:   cerr = cudaFree(d_maps);CHKERRCUDA(cerr);
 63:   return(0);
 64: }
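
LandauCUDACreateMatMaps() uses a common CUDA idiom: the array members are allocated and filled on the device first, their device pointers are recorded in a host-side staging copy of the struct, and the staging copy is then copied wholesale into a device allocation so kernels can dereference the struct directly. Below is a minimal sketch of the same idiom with a hypothetical struct; DeviceTable and DeviceTableCreate are illustrative names only.

typedef struct { PetscInt n; PetscReal *vals; } DeviceTable;

static PetscErrorCode DeviceTableCreate(const PetscReal *h_vals, PetscInt n, DeviceTable **d_table)
{
  DeviceTable h_stage; /* host staging copy; h_stage.vals will hold a device pointer */
  cudaError_t cerr;
  h_stage.n = n;
  cerr = cudaMalloc((void **)&h_stage.vals, n*sizeof(PetscReal));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(h_stage.vals, h_vals, n*sizeof(PetscReal), cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)d_table, sizeof(DeviceTable));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(*d_table, &h_stage, sizeof(DeviceTable), cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  return 0;
}

Destruction mirrors LandauCUDADestroyMatMaps(): copy the struct back to the host to recover the embedded device pointers, cudaFree() those, then cudaFree() the struct itself.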

 66: // Kernel that forms f and its derivatives at the integration points
 67: //
 68: __global__
 69: void landau_form_fdf(const PetscInt nip, const PetscInt dim, const PetscInt Nf, const PetscInt Nb, const PetscReal invJ_a[],
 70:                      const PetscReal * const BB, const PetscReal * const DD, LandauIPReal *IPDataRaw, LandauIPReal d_f[], LandauIPReal d_dfdx[], LandauIPReal d_dfdy[],
 71: #if LANDAU_DIM==3
 72:                      LandauIPReal d_dfdz[],
 73: #endif
 74:                      PetscErrorCode *ierr) // output
 75: {
 76:   const PetscInt  Nq = blockDim.x, myelem = blockIdx.x;
 77:   const PetscInt  myQi = threadIdx.x;
 78:   const PetscInt  jpidx = myQi + myelem * Nq;
 79:   const PetscReal *invJ = &invJ_a[jpidx*dim*dim];
 80:   const PetscReal *Bq = &BB[myQi*Nb], *Dq = &DD[myQi*Nb*dim];
 81:   // unpack IPData
 82:   LandauIPReal    *IPData_coefs = &IPDataRaw[nip*(dim+1)];
 83:   LandauIPReal    *coef = &IPData_coefs[myelem*Nb*Nf];
 84:   PetscInt        f,d,b,e;
 85:   PetscScalar     u_x[LANDAU_MAX_SPECIES][LANDAU_DIM];
 86:   *ierr = 0;
 87:   /* get f and df */
 88:   for (f = 0; f < Nf; ++f) {
 89:     PetscScalar refSpaceDer[LANDAU_DIM];
 90:     d_f[jpidx + f*nip] = 0.0;
 91:     for (d = 0; d < LANDAU_DIM; ++d) refSpaceDer[d] = 0.0;
 92:     for (b = 0; b < Nb; ++b) {
 93:       const PetscInt    cidx = b;
 94:       d_f[jpidx + f*nip] += Bq[cidx]*coef[f*Nb+cidx];
 95:       for (d = 0; d < dim; ++d) refSpaceDer[d] += Dq[cidx*dim+d]*coef[f*Nb+cidx];
 96:     }
 97:     for (d = 0; d < dim; ++d) {
 98:       for (e = 0, u_x[f][d] = 0.0; e < dim; ++e) {
 99:         u_x[f][d] += invJ[e*dim+d]*refSpaceDer[e];
100:       }
101:     }
102:   }
103:   for (f=0;f<Nf;f++) {
104:     d_dfdx[jpidx + f*nip] = PetscRealPart(u_x[f][0]);
105:     d_dfdy[jpidx + f*nip] = PetscRealPart(u_x[f][1]);
106: #if LANDAU_DIM==3
107:     d_dfdz[jpidx + f*nip] = PetscRealPart(u_x[f][2]);
108: #endif
109:   }
110: }
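
Each thread above evaluates one field at one quadrature point: the value is the tabulated basis B contracted with the element coefficients, and the physical gradient is the reference-space gradient (from D) pushed forward through the inverse Jacobian, using the same invJ[e*dim+d] indexing as the kernel. A host-side reference sketch of that per-point computation follows; the function name interp_at_qp is illustrative.

static void interp_at_qp(PetscInt q, PetscInt Nb, PetscInt dim, const PetscReal BB[], const PetscReal DD[],
                         const PetscReal invJ[] /* dim*dim for this point */, const PetscReal coef[] /* Nb */,
                         PetscReal *f, PetscReal grad[] /* dim */)
{
  PetscReal refDer[3] = {0, 0, 0};
  *f = 0;
  for (PetscInt b = 0; b < Nb; ++b) {
    *f += BB[q*Nb + b]*coef[b];                                          /* value */
    for (PetscInt d = 0; d < dim; ++d) refDer[d] += DD[(q*Nb + b)*dim + d]*coef[b]; /* reference gradient */
  }
  for (PetscInt d = 0; d < dim; ++d) {                                   /* map to physical space */
    grad[d] = 0;
    for (PetscInt e = 0; e < dim; ++e) grad[d] += invJ[e*dim + d]*refDer[e];
  }
}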

112: __device__ void
113: landau_inner_integral_v2(const PetscInt myQi, const PetscInt jpidx, PetscInt nip, const PetscInt Nq, const PetscInt Nf, const PetscInt Nb,
114:                          const PetscInt dim, LandauIPReal *IPDataRaw, const PetscReal invJj[], const PetscReal nu_alpha[],
115:                          const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
116:                          const PetscReal * const BB, const PetscReal * const DD,
117:                          PetscScalar *elemMat, P4estVertexMaps *d_maps, PetscSplitCSRDataStructure *d_mat, // output
118:                          PetscScalar fieldMats[][LANDAU_MAX_NQ], // all these arrays are in shared memory
119:                          PetscReal g2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
120:                          PetscReal g3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
121:                          PetscReal gg2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
122:                          PetscReal gg3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
123:                          PetscReal s_nu_alpha[],
124:                          PetscReal s_nu_beta[],
125:                          PetscReal s_invMass[],
126:                          PetscReal s_f[],
127:                          PetscReal s_dfx[],
128:                          PetscReal s_dfy[],
129:                          LandauIPReal d_f[], LandauIPReal d_dfdx[], LandauIPReal d_dfdy[], // global memory
130: #if LANDAU_DIM==3
131:                          PetscReal s_dfz[], LandauIPReal d_dfdz[],
132: #endif
133:                          PetscReal d_mass_w[], PetscReal shift,
134:                          PetscInt myelem, PetscErrorCode *ierr)
135: {
136:   int           delta,d,f,g,d2,dp,d3,fieldA,ipidx_b,nip_pad = nip; // vectorization padding not supported;
137:   *ierr = 0;
138:   if (!d_mass_w) { // get g2 & g3
139:     PetscReal     gg2_temp[LANDAU_DIM], gg3_temp[LANDAU_DIM][LANDAU_DIM];
140:     LandauIPData  IPData;
141:     // create g2 & g3
142:     for (f=threadIdx.x; f<Nf; f+=blockDim.x) {
143:       for (d=0;d<dim;d++) { // clear accumulation data D & K
144:         gg2[d][myQi][f] = 0;
145:         for (d2=0;d2<dim;d2++) gg3[d][d2][myQi][f] = 0;
146:       }
147:     }
148:     if (threadIdx.y == 0) {
149:       for (int i = threadIdx.x; i < Nf; i += blockDim.x) {
150:         s_nu_alpha[i] = nu_alpha[i];
151:         s_nu_beta[i] = nu_beta[i];
152:         s_invMass[i] = invMass[i];
153:       }
154:     }
155:     for (d2 = 0; d2 < dim; d2++) {
156:       gg2_temp[d2] = 0;
157:       for (d3 = 0; d3 < dim; d3++) {
158:         gg3_temp[d2][d3] = 0;
159:       }
160:     }
161:     __syncthreads();
163:     // unpack IPData
163:     IPData.w   = IPDataRaw;
164:     IPData.x   = IPDataRaw + 1*nip_pad;
165:     IPData.y   = IPDataRaw + 2*nip_pad;
166:     IPData.z   = IPDataRaw + 3*nip_pad;

168:     for (ipidx_b = 0; ipidx_b < nip; ipidx_b += blockDim.x) {
169:       const PetscReal vj[3] = {IPData.x[jpidx], IPData.y[jpidx], IPData.z ? IPData.z[jpidx] : 0};
170:       int ipidx = ipidx_b + threadIdx.x;

172:       __syncthreads();
173:       if (ipidx < nip) {
174:         for (fieldA = threadIdx.y; fieldA < Nf; fieldA += blockDim.y) {
175:           s_f  [fieldA*blockDim.x+threadIdx.x] =    d_f[ipidx + fieldA*nip_pad];
176:           s_dfx[fieldA*blockDim.x+threadIdx.x] = d_dfdx[ipidx + fieldA*nip_pad];
177:           s_dfy[fieldA*blockDim.x+threadIdx.x] = d_dfdy[ipidx + fieldA*nip_pad];
178: #if LANDAU_DIM==3
179:           s_dfz[fieldA*blockDim.x+threadIdx.x] = d_dfdz[ipidx + fieldA*nip_pad];
180: #endif
181:         }
182:       }
183:       __syncthreads();
184:       if (ipidx < nip) {
185:         const PetscReal wi = IPData.w[ipidx], x = IPData.x[ipidx], y = IPData.y[ipidx];
186:         PetscReal       temp1[3] = {0, 0, 0}, temp2 = 0;
187: #if LANDAU_DIM==2
188:         PetscReal Ud[2][2], Uk[2][2];
189:         LandauTensor2D(vj, x, y, Ud, Uk, (ipidx==jpidx) ? 0. : 1.);
190: #else
191:         PetscReal U[3][3], z = IPData.z[ipidx];
192:         LandauTensor3D(vj, x, y, z, U, (ipidx==jpidx) ? 0. : 1.);
193: #endif
194:         for (fieldA = 0; fieldA < Nf; fieldA++) {
195:           temp1[0] += s_dfx[fieldA*blockDim.x+threadIdx.x]*s_nu_beta[fieldA]*s_invMass[fieldA];
196:           temp1[1] += s_dfy[fieldA*blockDim.x+threadIdx.x]*s_nu_beta[fieldA]*s_invMass[fieldA];
197: #if LANDAU_DIM==3
198:           temp1[2] += s_dfz[fieldA*blockDim.x+threadIdx.x]*s_nu_beta[fieldA]*s_invMass[fieldA];
199: #endif
200:           temp2    += s_f  [fieldA*blockDim.x+threadIdx.x]*s_nu_beta[fieldA];
201:         }
202:         temp1[0] *= wi;
203:         temp1[1] *= wi;
204: #if LANDAU_DIM==3
205:         temp1[2] *= wi;
206: #endif
207:         temp2    *= wi;
208: #if LANDAU_DIM==2
209:         for (d2 = 0; d2 < 2; d2++) {
210:           for (d3 = 0; d3 < 2; ++d3) {
211:             /* K = U * grad(f): g2=e: i,A */
212:             gg2_temp[d2] += Uk[d2][d3]*temp1[d3];
213:             /* D = -U * (I \kron (fx)): g3=f: i,j,A */
214:             gg3_temp[d2][d3] += Ud[d2][d3]*temp2;
215:           }
216:         }
217: #else
218:         for (d2 = 0; d2 < 3; ++d2) {
219:           for (d3 = 0; d3 < 3; ++d3) {
220:             /* K = U * grad(f): g2 = e: i,A */
221:             gg2_temp[d2] += U[d2][d3]*temp1[d3];
222:             /* D = -U * (I \kron (fx)): g3 = f: i,j,A */
223:             gg3_temp[d2][d3] += U[d2][d3]*temp2;
224:           }
225:         }
226: #endif
227:       }
228:     } /* IPs */

230:     /* reduce gg temp sums across threads */
231:     for (delta = blockDim.x/2; delta > 0; delta /= 2) {
232:       for (d2 = 0; d2 < dim; d2++) {
233:         gg2_temp[d2] += __shfl_xor_sync(0xffffffff, gg2_temp[d2], delta, blockDim.x);
234:         for (d3 = 0; d3 < dim; d3++) {
235:           gg3_temp[d2][d3] += __shfl_xor_sync(0xffffffff, gg3_temp[d2][d3], delta, blockDim.x);
236:         }
237:       }
238:     }

240:     // add alpha and put in gg2/3
241:     for (fieldA = threadIdx.x; fieldA < Nf; fieldA += blockDim.x) {
242:       for (d2 = 0; d2 < dim; d2++) {
243:         gg2[d2][myQi][fieldA] += gg2_temp[d2]*s_nu_alpha[fieldA];
244:         for (d3 = 0; d3 < dim; d3++) {
245:           gg3[d2][d3][myQi][fieldA] -= gg3_temp[d2][d3]*s_nu_alpha[fieldA]*s_invMass[fieldA];
246:         }
247:       }
248:     }
249:     __syncthreads();

251:     /* add electric field term once per IP */
252:     for (fieldA = threadIdx.x; fieldA < Nf; fieldA += blockDim.x) {
253:       gg2[dim-1][myQi][fieldA] += Eq_m[fieldA];
254:     }
255:     __syncthreads();
256:     //printf("%d %d gg2[1][1]=%g\n",myelem,qj_start,gg2[1][dim-1]);
257:     /* Jacobian transform - g2 */
258:     for (fieldA = threadIdx.x; fieldA < Nf; fieldA += blockDim.x) {
259:       PetscReal wj = IPData.w[jpidx];
260:       for (d = 0; d < dim; ++d) {
261:         g2[d][myQi][fieldA] = 0.0;
262:         for (d2 = 0; d2 < dim; ++d2) {
263:           g2[d][myQi][fieldA] += invJj[d*dim+d2]*gg2[d2][myQi][fieldA];
264:           g3[d][d2][myQi][fieldA] = 0.0;
265:           for (d3 = 0; d3 < dim; ++d3) {
266:             for (dp = 0; dp < dim; ++dp) {
267:               g3[d][d2][myQi][fieldA] += invJj[d*dim + d3]*gg3[d3][dp][myQi][fieldA]*invJj[d2*dim + dp];
268:             }
269:           }
270:           g3[d][d2][myQi][fieldA] *= wj;
271:         }
272:         g2[d][myQi][fieldA] *= wj;
273:       }
274:     }
275:     __syncthreads();  // Synchronize (ensure all the data is available) and sum IP matrices
276:   } // !mass_w
277:   /* FE matrix construction */
278:   {
279:     int fieldA,d,qj,d2,q,idx,totDim=Nb*Nf;
280:     /* assemble */
281:     for (fieldA = 0; fieldA < Nf; fieldA++) {
282:       if (fieldMats) {
283:         for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
284:           for (g = threadIdx.x; g < Nb; g += blockDim.x) {
285:             fieldMats[f][g] = 0;
286:           }
287:         }
288:       }
289:       for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
290:         const PetscInt i = fieldA*Nb + f; /* Element matrix row */
291:         for (g = threadIdx.x; g < Nb; g += blockDim.x) {
292:           const PetscInt j    = fieldA*Nb + g; /* Element matrix column */
293:           const PetscInt fOff = i*totDim + j;
294:           PetscScalar t = elemMat ? elemMat[fOff] : fieldMats[f][g];
295:           for (qj = 0 ; qj < Nq ; qj++) {
296:             const PetscReal *BJq = &BB[qj*Nb], *DIq = &DD[qj*Nb*dim];
297:             if (!d_mass_w) {
298:               for (d = 0; d < dim; ++d) {
299:                 t += DIq[f*dim+d]*g2[d][qj][fieldA]*BJq[g];
300:                 for (d2 = 0; d2 < dim; ++d2) {
301:                   t += DIq[f*dim + d]*g3[d][d2][qj][fieldA]*DIq[g*dim + d2];
302:                 }
303:               }
304:             } else {
305:               const PetscInt jpidx = qj + myelem * Nq;
306:               t += BJq[f] * d_mass_w[jpidx]*shift * BJq[g];
307:             }
308:           }
309:           if (elemMat) elemMat[fOff] = t;
310:           else fieldMats[f][g] = t;
311:         }
312:       }
313:       if (fieldMats) {
314:         PetscScalar            vals[LANDAU_MAX_Q_FACE*LANDAU_MAX_Q_FACE];
315:         PetscReal              row_scale[LANDAU_MAX_Q_FACE],col_scale[LANDAU_MAX_Q_FACE];
316:         PetscInt               nr,nc,rows0[LANDAU_MAX_Q_FACE],cols0[LANDAU_MAX_Q_FACE],rows[LANDAU_MAX_Q_FACE],cols[LANDAU_MAX_Q_FACE];
317:         const LandauIdx *const Idxs = &d_maps->gIdx[myelem][fieldA][0];
318:         for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
319:           idx = Idxs[f];
320:           if (idx >= 0) {
321:             nr = 1;
322:             rows0[0] = idx;
323:             row_scale[0] = 1.;
324:           } else {
325:             idx = -idx - 1;
326:             nr = d_maps->num_face;
327:             for (q = 0; q < d_maps->num_face; q++) {
328:               rows0[q]     = d_maps->c_maps[idx][q].gid;
329:               row_scale[q] = d_maps->c_maps[idx][q].scale;
330:             }
331:           }
332:           for (g = threadIdx.x; g < Nb; g += blockDim.x) {
333:             idx = Idxs[g];
334:             if (idx >= 0) {
335:               nc = 1;
336:               cols0[0] = idx;
337:               col_scale[0] = 1.;
338:             } else {
339:               idx = -idx - 1;
340:               nc = d_maps->num_face;
341:               for (q = 0; q < d_maps->num_face; q++) {
342:                 cols0[q]     = d_maps->c_maps[idx][q].gid;
343:                 col_scale[q] = d_maps->c_maps[idx][q].scale;
344:               }
345:             }
346:             for (q = 0; q < nr; q++) rows[q] = rows0[q];
347:             for (q = 0; q < nc; q++) cols[q] = cols0[q];
348:             for (q = 0; q < nr; q++) {
349:               for (d = 0; d < nc; d++) {
350:                 vals[q*nc + d] = row_scale[q]*col_scale[d]*fieldMats[f][g];
351:               }
352:             }
353:             MatSetValuesDevice(d_mat,nr,rows,nc,cols,vals,ADD_VALUES,ierr);
354:             if (*ierr) return;
355:           }
356:         }
357:       }
358:     }
359:   }
360: }
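
The gg2/gg3 reduction above is a warp-level butterfly: each pass of __shfl_xor_sync() exchanges partial sums between lanes whose indices differ by delta, so after log2(blockDim.x) passes every lane in a row holds the full sum without touching shared memory. It relies on blockDim.x being a power of two no larger than a warp (the caller rounds it down to one) and on all lanes of each warp being active. A standalone sketch of the pattern follows; the kernel name row_sum_demo is illustrative.

__global__ void row_sum_demo(const PetscReal *in, PetscReal *out)
{
  /* one value per thread; each row (fixed threadIdx.y) reduces over its blockDim.x lanes */
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  PetscReal v   = in[row*blockDim.x + threadIdx.x];
  for (int delta = blockDim.x/2; delta > 0; delta /= 2)
    v += __shfl_xor_sync(0xffffffff, v, delta, blockDim.x);
  if (threadIdx.x == 0) out[row] = v; /* every lane now holds the row sum */
}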

362: //
363: // The GPU Landau kernel
364: //
365: __global__
366: void __launch_bounds__(256,1) landau_kernel_v2(const PetscInt nip, const PetscInt dim, const PetscInt totDim, const PetscInt Nf, const PetscInt Nb, const PetscReal invJj[],
367:                                                const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
368:                                                const PetscReal * const BB, const PetscReal * const DD, LandauIPReal *IPDataRaw,
369:                                                PetscScalar elemMats_out[], P4estVertexMaps *d_maps, PetscSplitCSRDataStructure *d_mat, LandauIPReal d_f[], LandauIPReal d_dfdx[], LandauIPReal d_dfdy[],
370: #if LANDAU_DIM==3
371:                                                LandauIPReal d_dfdz[],
372: #endif
373:                                                PetscReal d_mass_w[], PetscReal shift,
374:                                                PetscErrorCode *ierr)
375: {
376:   const PetscInt  Nq = blockDim.y, myelem = blockIdx.x;
377:   extern __shared__ PetscReal smem[];
378:   int size = 0;
379:   PetscReal (*g2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]              = // shared mem not needed when mass_w
380:     (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES])             &smem[size];
381:   size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
382:   PetscReal (*g3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]  =
383:     (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
384:   size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
385:   PetscReal (*gg2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]             =
386:     (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES])             &smem[size];
387:   size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
388:   PetscReal (*gg3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
389:     (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
390:   size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
391:   PetscReal *s_nu_alpha = &smem[size];
392:   size += LANDAU_MAX_SPECIES;
393:   PetscReal *s_nu_beta  = &smem[size];
394:   size += LANDAU_MAX_SPECIES;
395:   PetscReal *s_invMass  = &smem[size];
396:   size += LANDAU_MAX_SPECIES;
397:   PetscReal *s_f        = &smem[size];
398:   size += blockDim.x*LANDAU_MAX_SPECIES;
399:   PetscReal *s_dfx      = &smem[size];
400:   size += blockDim.x*LANDAU_MAX_SPECIES;
401:   PetscReal *s_dfy      = &smem[size];
402:   size += blockDim.x*LANDAU_MAX_SPECIES;
403: #if LANDAU_DIM==3
404:   PetscReal *s_dfz      = &smem[size];
405:   size += blockDim.x*LANDAU_MAX_SPECIES;
406: #endif
407:   PetscScalar (*fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ] = d_maps ?
408:     (PetscScalar (*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) &smem[size] : NULL;
409:   if (d_maps) size += LANDAU_MAX_NQ*LANDAU_MAX_NQ;
410:   const PetscInt  myQi = threadIdx.y;
411:   const PetscInt  jpidx = myQi + myelem * Nq;
412:   //const PetscInt  subblocksz = nip/nSubBlks + !!(nip%nSubBlks), ip_start = mySubBlk*subblocksz, ip_end = (mySubBlk+1)*subblocksz > nip ? nip : (mySubBlk+1)*subblocksz; /* this could be wrong with very few global IPs */
413:   PetscScalar     *elemMat  = elemMats_out ? &elemMats_out[myelem*totDim*totDim] : NULL; /* my output */
414:   int tid = threadIdx.x + threadIdx.y*blockDim.x;
415:   const PetscReal *invJ = invJj ? &invJj[jpidx*dim*dim] : NULL;
416:   if (elemMat) for (int i = tid; i < totDim*totDim; i += blockDim.x*blockDim.y) elemMat[i] = 0;
417:   __syncthreads();

419:   landau_inner_integral_v2(myQi, jpidx, nip, Nq, Nf, Nb, dim, IPDataRaw, invJ, nu_alpha, nu_beta, invMass, Eq_m, BB, DD,
420:                            elemMat, d_maps, d_mat, *fieldMats, *g2, *g3, *gg2, *gg3, s_nu_alpha, s_nu_beta, s_invMass, s_f, s_dfx, s_dfy, d_f, d_dfdx, d_dfdy,
421: #if LANDAU_DIM==3
422:                            s_dfz, d_dfdz,
423: #endif
424:                            d_mass_w, shift,
425:                            myelem, ierr); /* compact */
426: }
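
landau_kernel_v2 carves one extern __shared__ array into typed views by advancing a running offset, so a single dynamic-shared-memory allocation covers all of the per-block scratch arrays; the size passed at launch (ii*szf in LandauCUDAJacobian()) must match that layout. A minimal sketch of the idiom follows; the kernel carve_demo and the sizes NA/NB are illustrative.

__global__ void carve_demo(int NA, int NB)
{
  extern __shared__ PetscReal smem[];
  int        size = 0;
  PetscReal *a = &smem[size]; size += NA; /* first slice: NA reals */
  PetscReal *b = &smem[size]; size += NB; /* second slice: NB reals */
  for (int i = threadIdx.x; i < NA; i += blockDim.x) a[i] = 0;
  for (int i = threadIdx.x; i < NB; i += blockDim.x) b[i] = 0;
  __syncthreads();
}
/* launched with the total carved size as the dynamic shared memory argument:
   carve_demo<<<grid, block, (NA+NB)*sizeof(PetscReal)>>>(NA, NB);            */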

428: PetscErrorCode LandauCUDAJacobian(DM plex, const PetscInt Nq, const PetscReal nu_alpha[],const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
429:                                   const LandauIPData *const IPData, const PetscReal invJj[],  PetscReal *mass_w, PetscReal shift, const PetscLogEvent events[], Mat JacP)
430: {
431:   PetscErrorCode    ierr,*d_ierr;
432:   cudaError_t       cerr;
433:   PetscInt          ii,ej,*Nbf,Nb,nip_dim2,cStart,cEnd,Nf,dim,numGCells,totDim,nip,szf=sizeof(LandauIPReal),ipdatasz;
434:   PetscReal         *d_BB,*d_DD,*d_invJj=NULL,*d_nu_alpha,*d_nu_beta,*d_invMass,*d_Eq_m,*d_mass_w=NULL;
435:   PetscScalar       *d_elemMats=NULL;
436:   LandauIPReal       *d_f=NULL, *d_dfdx=NULL, *d_dfdy=NULL;
437: #if LANDAU_DIM==3
438:   LandauIPReal      *d_dfdz=NULL;
439: #endif
440:   PetscTabulation   *Tf;
441:   PetscDS           prob;
442:   PetscSection      section, globalSection;
443:   LandauIPReal      *d_IPDataRaw=NULL;
444:   LandauCtx         *ctx;
445:   PetscSplitCSRDataStructure *d_mat=NULL;
446:   P4estVertexMaps            *h_maps, *d_maps=NULL;
447:   int               nnn = 256/Nq;

450:   while (nnn & nnn - 1) nnn = nnn & nnn - 1;
451:   if (nnn>16) nnn = 16;
452:   PetscLogEventBegin(events[3],0,0,0,0);
453:   DMGetDimension(plex, &dim);
454:   if (dim!=LANDAU_DIM) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "LANDAU_DIM %D != dim %d",LANDAU_DIM,dim);
455:   DMPlexGetHeightStratum(plex,0,&cStart,&cEnd);
456:   numGCells = cEnd - cStart;
457:   nip  = numGCells*Nq; /* length of inner global iteration */
458:   DMGetDS(plex, &prob);
459:   PetscDSGetNumFields(prob, &Nf);
460:   PetscDSGetDimensions(prob, &Nbf); Nb = Nbf[0];
461:   if (Nq != Nb) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Nq != Nb. %D  %D",Nq,Nb);
462:   PetscDSGetTotalDimension(prob, &totDim);
463:   PetscDSGetTabulation(prob, &Tf);
464:   DMGetLocalSection(plex, &section);
465:   DMGetGlobalSection(plex, &globalSection);
466:   // create data
467:   cerr = cudaMalloc((void **)&d_BB,              Nq*Nb*szf);CHKERRCUDA(cerr);     // kernel input
468:   cerr = cudaMemcpy(          d_BB, Tf[0]->T[0], Nq*Nb*szf,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
469:   cerr = cudaMalloc((void **)&d_DD,              Nq*Nb*dim*szf);CHKERRCUDA(cerr); // kernel input
470:   cerr = cudaMemcpy(          d_DD, Tf[0]->T[1], Nq*Nb*dim*szf,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
471:   nip_dim2 = Nq*numGCells*dim*dim;
472:   if (mass_w) {
473:     cerr = cudaMalloc((void **)&d_mass_w,        nip*szf);CHKERRCUDA(cerr); // kernel input
474:     cerr = cudaMemcpy(          d_mass_w, mass_w,nip*szf,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
475:   } else {
476:     ipdatasz = LandauGetIPDataSize(IPData);
477:     cerr = cudaMalloc((void **)&d_IPDataRaw,ipdatasz*szf);CHKERRCUDA(cerr); // kernel input
478:     cerr = cudaMemcpy(d_IPDataRaw, IPData->w, ipdatasz*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr); // assumes IPData starts with 'w'
479:     cerr = cudaMalloc((void **)&d_nu_alpha, Nf*szf);CHKERRCUDA(cerr); // kernel input
480:     cerr = cudaMalloc((void **)&d_nu_beta,  Nf*szf);CHKERRCUDA(cerr); // kernel input
481:     cerr = cudaMalloc((void **)&d_invMass,  Nf*szf);CHKERRCUDA(cerr); // kernel input
482:     cerr = cudaMalloc((void **)&d_Eq_m,     Nf*szf);CHKERRCUDA(cerr); // kernel input
483:     cerr = cudaMemcpy(d_nu_alpha, nu_alpha, Nf*szf,       cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
484:     cerr = cudaMemcpy(d_nu_beta,  nu_beta,  Nf*szf,       cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
485:     cerr = cudaMemcpy(d_invMass,  invMass,  Nf*szf,       cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
486:     cerr = cudaMemcpy(d_Eq_m,     Eq_m,     Nf*szf,       cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
487:     // f and df
488:     cerr = cudaMalloc((void **)&d_f,    nip*Nf*szf);CHKERRCUDA(cerr);     // kernel input
489:     cerr = cudaMalloc((void **)&d_dfdx, nip*Nf*szf);CHKERRCUDA(cerr);     // kernel input
490:     cerr = cudaMalloc((void **)&d_dfdy, nip*Nf*szf);CHKERRCUDA(cerr);     // kernel input
491: #if LANDAU_DIM==3
492:     cerr = cudaMalloc((void **)&d_dfdz, nip*Nf*szf);CHKERRCUDA(cerr);     // kernel input
493: #endif
494:     // collect geometry
495:     cerr = cudaMalloc((void **)&d_invJj, nip_dim2*szf);CHKERRCUDA(cerr); // kernel input
496:     cerr = cudaMemcpy(d_invJj, invJj, nip_dim2*szf,       cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
497:   }

499:   DMGetApplicationContext(plex, &ctx);
500:   if (!ctx) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "no context");
501:   if (ctx->gpu_assembly) {
502:     PetscContainer container;
503:     PetscObjectQuery((PetscObject) JacP, "assembly_maps", (PetscObject *) &container);
504:     if (container) { // the container is not present on the first call
505:       PetscContainerGetPointer(container, (void **) &h_maps);
506:       if (h_maps->data) {
507:         d_maps = h_maps->data;
508:         if (!d_maps) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata");
509:       } else {
510:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
511:       }
512:       // this does the setup the first time called
513:       MatCUSPARSEGetDeviceMatWrite(JacP,&d_mat);
514:     } else {
515:       cerr = cudaMalloc((void **)&d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar));CHKERRCUDA(cerr); // kernel output - first call is on CPU
516:     }
517:   } else {
518:     cerr = cudaMalloc((void **)&d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar));CHKERRCUDA(cerr); // kernel output - no GPU assembly
519:   }
520:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
521:   PetscLogEventEnd(events[3],0,0,0,0);

523:   cerr = cudaMalloc((void **)&d_ierr, sizeof(ierr));CHKERRCUDA(cerr); // kernel input
524:   if (!mass_w) { // form f and df
525:     dim3 dimBlock(Nq,1);
526:     PetscLogEventBegin(events[8],0,0,0,0);
527:     PetscLogGpuTimeBegin();
528:     ii = 0;
529:     // PetscPrintf(PETSC_COMM_SELF, "numGCells=%d dim.x=%d Nq=%d nThreads=%d, %d kB shared mem\n",numGCells,n,Nq,Nq*n,ii*szf/1024);
530:     landau_form_fdf<<<numGCells,dimBlock,ii*szf>>>( nip, dim, Nf, Nb, d_invJj, d_BB, d_DD, d_IPDataRaw, d_f, d_dfdx, d_dfdy,
531: #if LANDAU_DIM==3
532:                                                     d_dfdz,
533: #endif
534:                                                     d_ierr);
535:     CHECK_LAUNCH_ERROR();
536:     PetscLogGpuFlops(nip*(PetscLogDouble)(2*Nb*(1+dim)));
537:     PetscLogGpuTimeEnd();
538:     cerr = cudaMemcpy(&ierr, d_ierr, sizeof(ierr), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
539:     
540:     PetscLogEventEnd(events[8],0,0,0,0);
541:   }
542:   PetscLogEventBegin(events[4],0,0,0,0);
543:   {
544:     dim3 dimBlock(nnn,Nq);
545:     PetscLogGpuTimeBegin();
546:     PetscLogGpuFlops(nip*(PetscLogDouble)(mass_w ? (nip*(11*Nf+ 4*dim*dim) + 6*Nf*dim*dim*dim + 10*Nf*dim*dim + 4*Nf*dim + Nb*Nf*Nb*Nq*dim*dim*5) : Nb*Nf*Nb*Nq*4));
547:     ii = 2*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*(1+LANDAU_DIM) + 3*LANDAU_MAX_SPECIES + (1+LANDAU_DIM)*dimBlock.x*LANDAU_MAX_SPECIES;
548:     ii += (LANDAU_MAX_NQ*LANDAU_MAX_NQ)*LANDAU_MAX_SPECIES;
549:     if (ii*szf >= 49152) {
550:       cerr = cudaFuncSetAttribute(landau_kernel_v2,
551:                                   cudaFuncAttributeMaxDynamicSharedMemorySize,
552:                                   98304);CHKERRCUDA(cerr);
553:     }
554:     // PetscPrintf(PETSC_COMM_SELF, "numGCells=%d dim.x=%d Nq=%d nThreads=%d, %d kB shared mem\n",numGCells,n,Nq,Nq*n,ii*szf/1024);
555:     landau_kernel_v2<<<numGCells,dimBlock,ii*szf>>>(nip,dim,totDim,Nf,Nb,d_invJj,d_nu_alpha,d_nu_beta,d_invMass,d_Eq_m,
556:                                                     d_BB, d_DD, d_IPDataRaw, d_elemMats, d_maps, d_mat, d_f, d_dfdx, d_dfdy,
557: #if LANDAU_DIM==3
558:                                                     d_dfdz,
559: #endif
560:                                                     d_mass_w, shift,
561:                                                     d_ierr);
562:     CHECK_LAUNCH_ERROR();
563:     PetscLogGpuTimeEnd();
564:     //cerr = cudaMemcpy(&ierr, d_ierr, sizeof(ierr), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
565:     //
566:   }
567:   cerr = cudaFree(d_ierr);CHKERRCUDA(cerr);
568:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
569:   PetscLogEventEnd(events[4],0,0,0,0);
570:   // delete device data
571:   PetscLogEventBegin(events[5],0,0,0,0);
572:   cerr = cudaFree(d_BB);CHKERRCUDA(cerr);
573:   cerr = cudaFree(d_DD);CHKERRCUDA(cerr);
574:   if (mass_w) {
575:     cerr = cudaFree(d_mass_w);CHKERRCUDA(cerr);
576:   } else {
577:     cerr = cudaFree(d_IPDataRaw);CHKERRCUDA(cerr);
578:     cerr = cudaFree(d_f);CHKERRCUDA(cerr);
579:     cerr = cudaFree(d_dfdx);CHKERRCUDA(cerr);
580:     cerr = cudaFree(d_dfdy);CHKERRCUDA(cerr);
581: #if LANDAU_DIM==3
582:     cerr = cudaFree(d_dfdz);CHKERRCUDA(cerr);
583: #endif
584:     cerr = cudaFree(d_invJj);CHKERRCUDA(cerr);
585:     cerr = cudaFree(d_nu_alpha);CHKERRCUDA(cerr);
586:     cerr = cudaFree(d_nu_beta);CHKERRCUDA(cerr);
587:     cerr = cudaFree(d_invMass);CHKERRCUDA(cerr);
588:     cerr = cudaFree(d_Eq_m);CHKERRCUDA(cerr);
589:   }
590:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
591:   PetscLogEventEnd(events[5],0,0,0,0);
592:   // First time assembly even with GPU assembly
593:   if (d_elemMats) {
594:     PetscScalar *elemMats=NULL,*elMat;
595:     PetscLogEventBegin(events[5],0,0,0,0);
596:     PetscMalloc1(totDim*totDim*numGCells,&elemMats);
597:     cerr = cudaMemcpy(elemMats, d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
598:     cerr = cudaFree(d_elemMats);CHKERRCUDA(cerr);
599:     PetscLogEventEnd(events[5],0,0,0,0);
600:     PetscLogEventBegin(events[6],0,0,0,0);
601:     for (ej = cStart, elMat = elemMats ; ej < cEnd; ++ej, elMat += totDim*totDim) {
602:       DMPlexMatSetClosure(plex, section, globalSection, JacP, ej, elMat, ADD_VALUES);
603:       if (ej==-1) {
604:         int d,f;
605:         PetscPrintf(PETSC_COMM_SELF,"GPU Element matrix\n");
606:         for (d = 0; d < totDim; ++d){
607:           for (f = 0; f < totDim; ++f) PetscPrintf(PETSC_COMM_SELF," %12.5e",  PetscRealPart(elMat[d*totDim + f]));
608:           PetscPrintf(PETSC_COMM_SELF,"\n");
609:         }
610:       }
611:     }
612:     PetscFree(elemMats);
613:     PetscLogEventEnd(events[6],0,0,0,0);
614:   }
615:   return(0);
616: }
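
LandauCUDAJacobian() passes the kernels a device-resident PetscErrorCode (d_ierr) so that device-side assembly failures can be reported back to the host: the flag is allocated with cudaMalloc(), written by the kernel, copied back after synchronization, and freed. A minimal sketch of that round trip follows; flag_demo and run_flag_demo are illustrative names, not part of this file.

__global__ void flag_demo(PetscErrorCode *ierr)
{
  *ierr = 0; /* a real kernel overwrites this on failure, as landau_inner_integral_v2 does */
}

static PetscErrorCode run_flag_demo(void)
{
  PetscErrorCode ierr, *d_ierr;
  cudaError_t    cerr;
  cerr = cudaMalloc((void **)&d_ierr, sizeof(ierr));CHKERRCUDA(cerr);
  flag_demo<<<1,1>>>(d_ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr); /* synchronize before reading the flag */
  cerr = cudaMemcpy(&ierr, d_ierr, sizeof(ierr), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = cudaFree(d_ierr);CHKERRCUDA(cerr);
  if (ierr) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "device kernel flagged an error");
  return 0;
}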