Actual source code: veccuda2.cu

petsc-3.15.0 2021-03-30
  1: /*
  2:    Implements the sequential cuda vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX

  8: #include <petscconf.h>
  9: #include <petsc/private/vecimpl.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/cudavecimpl.h>

 13: #include <cuda_runtime.h>
 14: #include <thrust/device_ptr.h>
 15: #include <thrust/transform.h>
 16: #include <thrust/functional.h>
 17: #include <thrust/reduce.h>

 19: /*
 20:     Allocates space for the vector array on the GPU if it does not exist.
 21:    Does NOT change the vector's offload mask (v->offloadmask)
 22:     Does NOT zero the CUDA array

 24:  */
 25: PetscErrorCode VecCUDAAllocateCheck(Vec v)
 26: {
 28:   cudaError_t    err;
 29:   Vec_CUDA       *veccuda;
 30:   PetscBool      option_set;

 33:   if (!v->spptr) {
 34:     PetscReal pinned_memory_min;
 35:     PetscCalloc(sizeof(Vec_CUDA),&v->spptr);
 36:     veccuda = (Vec_CUDA*)v->spptr;
 37:     err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
 38:     veccuda->GPUarray = veccuda->GPUarray_allocated;
 39:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 40:       if (v->data && ((Vec_Seq*)v->data)->array) {
 41:         v->offloadmask = PETSC_OFFLOAD_CPU;
 42:       } else {
 43:         v->offloadmask = PETSC_OFFLOAD_GPU;
 44:       }
 45:     }
 46:     pinned_memory_min = 0;

 48:     /* Need to parse the command line here for the minimum size at which host allocations use pinned memory.
 49:        Note: this same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
 50:     PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
 51:     PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
 52:     if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
 53:     PetscOptionsEnd();
 54:   }
 55:   return(0);
 56: }
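
/* Added usage sketch (not part of veccuda2.cu): the -vec_pinned_memory_min option read above can
   also be set programmatically with VecSetPinnedMemoryMin(), the routine named in the option
   description. The 1 MiB threshold below is purely illustrative; error checking is omitted.

     // from the command line:   ./app -vec_pinned_memory_min 1048576
     VecSetPinnedMemoryMin(v,1048576);   // host buffers for v at or above this size use pinned memory
*/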

 58: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 59: PetscErrorCode VecCUDACopyToGPU(Vec v)
 60: {
 62:   cudaError_t    err;
 63:   Vec_CUDA       *veccuda;
 64:   PetscScalar    *varray;

 68:   VecCUDAAllocateCheck(v);
 69:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 70:     PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
 71:     veccuda            = (Vec_CUDA*)v->spptr;
 72:     varray             = veccuda->GPUarray;
 73:     err                = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
 74:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
 75:     PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
 76:     v->offloadmask = PETSC_OFFLOAD_BOTH;
 77:   }
 78:   return(0);
 79: }

 81: /*
 82:      VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
 83: */
 84: PetscErrorCode VecCUDACopyFromGPU(Vec v)
 85: {
 87:   cudaError_t    err;
 88:   Vec_CUDA       *veccuda;
 89:   PetscScalar    *varray;

 93:   VecCUDAAllocateCheckHost(v);
 94:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
 95:     PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
 96:     veccuda            = (Vec_CUDA*)v->spptr;
 97:     varray             = veccuda->GPUarray;
 98:     err                = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
 99:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
100:     PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
101:     v->offloadmask     = PETSC_OFFLOAD_BOTH;
102:   }
103:   return(0);
104: }
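
/* Added sketch (not part of veccuda2.cu): a rough illustration of how the two copy routines above
   are triggered by the offload mask through ordinary PETSc calls; x is assumed to be a VECSEQCUDA
   vector and error checking is omitted.

     PetscScalar *a;
     PetscReal    nrm;
     VecGetArray(x,&a);        // host write access; on restore the mask becomes PETSC_OFFLOAD_CPU
     a[0] = 2.0;
     VecRestoreArray(x,&a);
     VecNorm(x,NORM_2,&nrm);   // device operation: VecCUDACopyToGPU() runs, mask -> PETSC_OFFLOAD_BOTH
*/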

106: /*MC
107:    VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA

109:    Options Database Keys:
110: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()

112:   Level: beginner

114: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
115: M*/
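
/* Added usage sketch (not part of veccuda2.cu): creating a VECSEQCUDA vector through the standard
   PETSc API. Assumes PetscInitialize() has been called and <petscvec.h> is included; error checking
   is omitted for brevity.

     Vec      x;
     PetscInt n = 128;
     VecCreate(PETSC_COMM_SELF,&x);
     VecSetSizes(x,n,n);
     VecSetType(x,VECSEQCUDA);    // or VecSetFromOptions(x) together with -vec_type seqcuda
     VecSet(x,1.0);
     VecDestroy(&x);
*/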

117: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
118: {
119:   const PetscScalar *xarray;
120:   PetscScalar       *yarray;
121:   PetscErrorCode    ierr;
122:   PetscBLASInt      one = 1,bn = 0;
123:   PetscScalar       sone = 1.0;
124:   cublasHandle_t    cublasv2handle;
125:   cublasStatus_t    cberr;
126:   cudaError_t       err;

129:   PetscCUBLASGetHandle(&cublasv2handle);
130:   PetscBLASIntCast(yin->map->n,&bn);
131:   VecCUDAGetArrayRead(xin,&xarray);
132:   VecCUDAGetArray(yin,&yarray);
133:   PetscLogGpuTimeBegin();
134:   if (alpha == (PetscScalar)0.0) {
135:     err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
136:   } else if (alpha == (PetscScalar)1.0) {
137:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
138:     PetscLogGpuFlops(1.0*yin->map->n);
139:   } else {
140:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
141:     cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
142:     PetscLogGpuFlops(2.0*yin->map->n);
143:   }
144:   err  = WaitForCUDA();CHKERRCUDA(err);
145:   PetscLogGpuTimeEnd();
146:   VecCUDARestoreArrayRead(xin,&xarray);
147:   VecCUDARestoreArray(yin,&yarray);
148:   PetscLogCpuToGpu(sizeof(PetscScalar));
149:   return(0);
150: }

152: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
153: {
154:   const PetscScalar *xarray;
155:   PetscScalar       *yarray;
156:   PetscErrorCode    ierr;
157:   PetscBLASInt      one = 1,bn = 0;
158:   cublasHandle_t    cublasv2handle;
159:   cublasStatus_t    cberr;
160:   PetscBool         xiscuda;
161:   cudaError_t       err;

164:   if (alpha == (PetscScalar)0.0) return(0);
165:   PetscCUBLASGetHandle(&cublasv2handle);
166:   PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
167:   if (xiscuda) {
168:     PetscBLASIntCast(yin->map->n,&bn);
169:     VecCUDAGetArrayRead(xin,&xarray);
170:     VecCUDAGetArray(yin,&yarray);
171:     PetscLogGpuTimeBegin();
172:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
173:     err  = WaitForCUDA();CHKERRCUDA(err);
174:     PetscLogGpuTimeEnd();
175:     VecCUDARestoreArrayRead(xin,&xarray);
176:     VecCUDARestoreArray(yin,&yarray);
177:     PetscLogGpuFlops(2.0*yin->map->n);
178:     PetscLogCpuToGpu(sizeof(PetscScalar));
179:   } else {
180:     VecAXPY_Seq(yin,alpha,xin);
181:   }
182:   return(0);
183: }

185: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
186: {
187:   PetscInt                              n = xin->map->n;
188:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
189:   PetscScalar                           *warray=NULL;
190:   thrust::device_ptr<const PetscScalar> xptr,yptr;
191:   thrust::device_ptr<PetscScalar>       wptr;
192:   PetscErrorCode                        ierr;
193:   cudaError_t                           err;

196:   VecCUDAGetArrayWrite(win,&warray);
197:   VecCUDAGetArrayRead(xin,&xarray);
198:   VecCUDAGetArrayRead(yin,&yarray);
199:   PetscLogGpuTimeBegin();
200:   try {
201:     wptr = thrust::device_pointer_cast(warray);
202:     xptr = thrust::device_pointer_cast(xarray);
203:     yptr = thrust::device_pointer_cast(yarray);
204:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
205:     err  = WaitForCUDA();CHKERRCUDA(err);
206:   } catch (char *ex) {
207:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
208:   }
209:   PetscLogGpuTimeEnd();
210:   PetscLogGpuFlops(n);
211:   VecCUDARestoreArrayRead(xin,&xarray);
212:   VecCUDARestoreArrayRead(yin,&yarray);
213:   VecCUDARestoreArrayWrite(win,&warray);
214:   return(0);
215: }

217: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
218: {
219:   const PetscScalar *xarray=NULL,*yarray=NULL;
220:   PetscScalar       *warray=NULL;
221:   PetscErrorCode    ierr;
222:   PetscBLASInt      one = 1,bn = 0;
223:   cublasHandle_t    cublasv2handle;
224:   cublasStatus_t    stat;
225:   cudaError_t       cerr;
226:   cudaStream_t      stream;

229:   PetscCUBLASGetHandle(&cublasv2handle);
230:   PetscBLASIntCast(win->map->n,&bn);
231:   if (alpha == (PetscScalar)0.0) {
232:     VecCopy_SeqCUDA(yin,win);
233:   } else {
234:     VecCUDAGetArrayRead(xin,&xarray);
235:     VecCUDAGetArrayRead(yin,&yarray);
236:     VecCUDAGetArrayWrite(win,&warray);
237:     PetscLogGpuTimeBegin();
238:     stat = cublasGetStream(cublasv2handle,&stream);CHKERRCUBLAS(stat);
239:     cerr = cudaMemcpyAsync(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,stream);CHKERRCUDA(cerr);
240:     stat = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(stat);
241:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
242:     PetscLogGpuTimeEnd();
243:     PetscLogGpuFlops(2*win->map->n);
244:     VecCUDARestoreArrayRead(xin,&xarray);
245:     VecCUDARestoreArrayRead(yin,&yarray);
246:     VecCUDARestoreArrayWrite(win,&warray);
247:     PetscLogCpuToGpu(sizeof(PetscScalar));
248:   }
249:   return(0);
250: }

252: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
253: {
254:   PetscErrorCode    ierr;
255:   cudaError_t       err;
256:   PetscInt          n = xin->map->n,j;
257:   PetscScalar       *xarray;
258:   const PetscScalar *yarray;
259:   PetscBLASInt      one = 1,bn = 0;
260:   cublasHandle_t    cublasv2handle;
261:   cublasStatus_t    cberr;

264:   PetscLogGpuFlops(nv*2.0*n);
265:   PetscLogCpuToGpu(nv*sizeof(PetscScalar));
266:   PetscCUBLASGetHandle(&cublasv2handle);
267:   PetscBLASIntCast(n,&bn);
268:   PetscLogGpuTimeBegin();
269:   VecCUDAGetArray(xin,&xarray);
270:   for (j=0; j<nv; j++) {
271:     VecCUDAGetArrayRead(y[j],&yarray);
272:     cberr = cublasXaxpy(cublasv2handle,bn,alpha+j,yarray,one,xarray,one);CHKERRCUBLAS(cberr);
273:     VecCUDARestoreArrayRead(y[j],&yarray);
274:   }
275:   err  = WaitForCUDA();CHKERRCUDA(err);
276:   PetscLogGpuTimeEnd();
277:   VecCUDARestoreArray(xin,&xarray);
278:   return(0);
279: }

281: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
282: {
283:   const PetscScalar *xarray,*yarray;
284:   PetscErrorCode    ierr;
285:   PetscBLASInt      one = 1,bn = 0;
286:   cublasHandle_t    cublasv2handle;
287:   cublasStatus_t    cerr;

290:   PetscCUBLASGetHandle(&cublasv2handle);
291:   PetscBLASIntCast(yin->map->n,&bn);
292:   VecCUDAGetArrayRead(xin,&xarray);
293:   VecCUDAGetArrayRead(yin,&yarray);
294:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
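  /* Added note (not in the original source), spelling the convention out: PETSc defines
     VecDot(x,y) = sum_i x_i * conj(y_i), while cublasXdot() (dotc) conjugates its first argument,
     i.e. it computes sum_i conj(first_i) * second_i; calling it as cublasXdot(...,y,...,x,...)
     therefore yields sum_i conj(y_i) * x_i, which matches the PETSc definition. */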
295:   PetscLogGpuTimeBegin();
296:   cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
297:   PetscLogGpuTimeEnd();
298:   if (xin->map->n >0) {
299:     PetscLogGpuFlops(2.0*xin->map->n-1);
300:   }
301:   PetscLogGpuToCpu(sizeof(PetscScalar));
302:   VecCUDARestoreArrayRead(xin,&xarray);
303:   VecCUDARestoreArrayRead(yin,&yarray);
304:   return(0);
305: }

307: //
308: // CUDA kernels for MDot to follow
309: //

311: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
312: #define MDOT_WORKGROUP_SIZE 128
313: #define MDOT_WORKGROUP_NUM  128
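
// Added layout note (not in the original source): each VecMDot kernel below is launched with
// MDOT_WORKGROUP_NUM blocks of MDOT_WORKGROUP_SIZE threads; block b writes its partial sum for the
// k-th y-vector of that launch to group_results[b + k*gridDim.x]. Since gridDim.x == MDOT_WORKGROUP_NUM,
// the scratch buffer used by VecMDot_SeqCUDA() holds nv1*MDOT_WORKGROUP_NUM partial sums in total, and
// z[j] is obtained on the CPU by summing entries j*MDOT_WORKGROUP_NUM .. (j+1)*MDOT_WORKGROUP_NUM-1.
// The in-block tree reduction assumes blockDim.x is a power of 2, which is why 128 is used above.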

315: #if !defined(PETSC_USE_COMPLEX)
316: // M = 2:
317: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
318:                                         PetscInt size, PetscScalar *group_results)
319: {
320:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
321:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
322:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
323:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
324:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

326:   PetscScalar entry_x    = 0;
327:   PetscScalar group_sum0 = 0;
328:   PetscScalar group_sum1 = 0;
329:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
330:     entry_x     = x[i];   // load only once from global memory!
331:     group_sum0 += entry_x * y0[i];
332:     group_sum1 += entry_x * y1[i];
333:   }
334:   tmp_buffer[threadIdx.x]                       = group_sum0;
335:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

337:   // parallel reduction
338:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
339:     __syncthreads();
340:     if (threadIdx.x < stride) {
341:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
342:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
343:     }
344:   }

346:   // write result of group to group_results
347:   if (threadIdx.x == 0) {
348:     group_results[blockIdx.x]             = tmp_buffer[0];
349:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
350:   }
351: }

353: // M = 3:
354: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
355:                                         PetscInt size, PetscScalar *group_results)
356: {
357:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
358:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
359:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
360:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
361:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

363:   PetscScalar entry_x    = 0;
364:   PetscScalar group_sum0 = 0;
365:   PetscScalar group_sum1 = 0;
366:   PetscScalar group_sum2 = 0;
367:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
368:     entry_x     = x[i];   // load only once from global memory!
369:     group_sum0 += entry_x * y0[i];
370:     group_sum1 += entry_x * y1[i];
371:     group_sum2 += entry_x * y2[i];
372:   }
373:   tmp_buffer[threadIdx.x]                           = group_sum0;
374:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
375:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

377:   // parallel reduction
378:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
379:     __syncthreads();
380:     if (threadIdx.x < stride) {
381:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
382:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
383:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
384:     }
385:   }

387:   // write result of group to group_results
388:   if (threadIdx.x == 0) {
389:     group_results[blockIdx.x                ] = tmp_buffer[0];
390:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
391:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
392:   }
393: }

395: // M = 4:
396: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
397:                                         PetscInt size, PetscScalar *group_results)
398: {
399:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
400:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
401:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
402:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
403:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

405:   PetscScalar entry_x    = 0;
406:   PetscScalar group_sum0 = 0;
407:   PetscScalar group_sum1 = 0;
408:   PetscScalar group_sum2 = 0;
409:   PetscScalar group_sum3 = 0;
410:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
411:     entry_x     = x[i];   // load only once from global memory!
412:     group_sum0 += entry_x * y0[i];
413:     group_sum1 += entry_x * y1[i];
414:     group_sum2 += entry_x * y2[i];
415:     group_sum3 += entry_x * y3[i];
416:   }
417:   tmp_buffer[threadIdx.x]                           = group_sum0;
418:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
419:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
420:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

422:   // parallel reduction
423:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
424:     __syncthreads();
425:     if (threadIdx.x < stride) {
426:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
427:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
428:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
429:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
430:     }
431:   }

433:   // write result of group to group_results
434:   if (threadIdx.x == 0) {
435:     group_results[blockIdx.x                ] = tmp_buffer[0];
436:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
437:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
438:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
439:   }
440: }

442: // M = 8:
443: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
444:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
445:                                           PetscInt size, PetscScalar *group_results)
446: {
447:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
448:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
449:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
450:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
451:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

453:   PetscScalar entry_x    = 0;
454:   PetscScalar group_sum0 = 0;
455:   PetscScalar group_sum1 = 0;
456:   PetscScalar group_sum2 = 0;
457:   PetscScalar group_sum3 = 0;
458:   PetscScalar group_sum4 = 0;
459:   PetscScalar group_sum5 = 0;
460:   PetscScalar group_sum6 = 0;
461:   PetscScalar group_sum7 = 0;
462:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
463:     entry_x     = x[i];   // load only once from global memory!
464:     group_sum0 += entry_x * y0[i];
465:     group_sum1 += entry_x * y1[i];
466:     group_sum2 += entry_x * y2[i];
467:     group_sum3 += entry_x * y3[i];
468:     group_sum4 += entry_x * y4[i];
469:     group_sum5 += entry_x * y5[i];
470:     group_sum6 += entry_x * y6[i];
471:     group_sum7 += entry_x * y7[i];
472:   }
473:   tmp_buffer[threadIdx.x]                           = group_sum0;
474:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
475:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
476:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
477:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
478:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
479:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
480:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

482:   // parallel reduction
483:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
484:     __syncthreads();
485:     if (threadIdx.x < stride) {
486:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
487:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
488:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
489:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
490:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
491:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
492:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
493:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
494:     }
495:   }

497:   // write result of group to group_results
498:   if (threadIdx.x == 0) {
499:     group_results[blockIdx.x                ] = tmp_buffer[0];
500:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
501:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
502:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
503:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
504:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
505:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
506:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
507:   }
508: }
509: #endif /* !defined(PETSC_USE_COMPLEX) */

511: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
512: {
513:   PetscErrorCode    ierr;
514:   PetscInt          i,n = xin->map->n,current_y_index = 0;
515:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
516: #if !defined(PETSC_USE_COMPLEX)
517:   PetscInt          nv1 = ((nv % 4) == 1) ? nv-1: nv,j;
518:   PetscScalar       *group_results_gpu,group_results_cpu[nv1*MDOT_WORKGROUP_NUM];
519:   cudaError_t       cuda_ierr;
520: #endif
521:   PetscBLASInt      one = 1,bn = 0;
522:   cublasHandle_t    cublasv2handle;
523:   cublasStatus_t    cberr;
524:   cudaError_t       err;

527:   PetscCUBLASGetHandle(&cublasv2handle);
528:   PetscBLASIntCast(xin->map->n,&bn);
529:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
530:   /* Handle the case of local size zero first */
531:   if (!xin->map->n) {
532:     for (i=0; i<nv; ++i) z[i] = 0;
533:     return(0);
534:   }

536: #if !defined(PETSC_USE_COMPLEX)
537:   // allocate scratchpad memory for the results of individual work groups:
538:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);CHKERRCUDA(cuda_ierr);
539: #endif
540:   VecCUDAGetArrayRead(xin,&xptr);
541:   PetscLogGpuTimeBegin();

543:   while (current_y_index < nv)
544:   {
545:     switch (nv - current_y_index) {

547:       case 7:
548:       case 6:
549:       case 5:
550:       case 4:
551:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
552:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
553:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
554:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
555: #if defined(PETSC_USE_COMPLEX)
556:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
557:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
558:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
559:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
560: #else
561:         VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);

563: #endif
564:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
565:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
566:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
567:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
568:         current_y_index += 4;
569:         break;

571:       case 3:
572:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
573:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
574:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);

576: #if defined(PETSC_USE_COMPLEX)
577:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
578:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
579:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
580: #else
581:         VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
582: #endif
583:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
584:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
585:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
586:         current_y_index += 3;
587:         break;

589:       case 2:
590:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
591:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
592: #if defined(PETSC_USE_COMPLEX)
593:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
594:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
595: #else
596:         VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
597: #endif
598:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
599:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
600:         current_y_index += 2;
601:         break;

603:       case 1:
604:         VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
605:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
606:         VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
607:         current_y_index += 1;
608:         break;

610:       default: // 8 or more vectors left
611:         VecCUDAGetArrayRead(yin[current_y_index  ],&y0ptr);
612:         VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
613:         VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
614:         VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
615:         VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
616:         VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
617:         VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
618:         VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
619: #if defined(PETSC_USE_COMPLEX)
620:         cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
621:         cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
622:         cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
623:         cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
624:         cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
625:         cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
626:         cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
627:         cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
628: #else
629:         VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
630: #endif
631:         VecCUDARestoreArrayRead(yin[current_y_index  ],&y0ptr);
632:         VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
633:         VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
634:         VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
635:         VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
636:         VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
637:         VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
638:         VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
639:         current_y_index += 8;
640:         break;
641:     }
642:   }
643:   err  = WaitForCUDA();CHKERRCUDA(err);
644:   PetscLogGpuTimeEnd();
645:   VecCUDARestoreArrayRead(xin,&xptr);

647: #if defined(PETSC_USE_COMPLEX)
648:   PetscLogGpuToCpu(nv*sizeof(PetscScalar));
649: #else
650:   // copy results to CPU
651:   cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);

653:   // sum group results into z
654:   for (j=0; j<nv1; ++j) {
655:     z[j] = 0;
656:     for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
657:   }
658:   PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
659:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
660:   PetscLogGpuToCpu(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
661: #endif
662:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
663:   return(0);
664: }

666: #undef MDOT_WORKGROUP_SIZE
667: #undef MDOT_WORKGROUP_NUM

669: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
670: {
671:   PetscInt                        n = xin->map->n;
672:   PetscScalar                     *xarray = NULL;
673:   thrust::device_ptr<PetscScalar> xptr;
674:   PetscErrorCode                  ierr;
675:   cudaError_t                     err;

678:   VecCUDAGetArrayWrite(xin,&xarray);
679:   PetscLogGpuTimeBegin();
680:   if (alpha == (PetscScalar)0.0) {
681:     err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
682:   } else {
683:     try {
684:       xptr = thrust::device_pointer_cast(xarray);
685:       thrust::fill(xptr,xptr+n,alpha);
686:     } catch (char *ex) {
687:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
688:     }
689:     PetscLogCpuToGpu(sizeof(PetscScalar));
690:   }
691:   err  = WaitForCUDA();CHKERRCUDA(err);
692:   PetscLogGpuTimeEnd();
693:   VecCUDARestoreArrayWrite(xin,&xarray);
694:   return(0);
695: }

697: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
698: {
699:   PetscScalar    *xarray;
701:   PetscBLASInt   one = 1,bn = 0;
702:   cublasHandle_t cublasv2handle;
703:   cublasStatus_t cberr;
704:   cudaError_t    err;

707:   if (alpha == (PetscScalar)0.0) {
708:     VecSet_SeqCUDA(xin,alpha);
709:     err  = WaitForCUDA();CHKERRCUDA(err);
710:   } else if (alpha != (PetscScalar)1.0) {
711:     PetscCUBLASGetHandle(&cublasv2handle);
712:     PetscBLASIntCast(xin->map->n,&bn);
713:     VecCUDAGetArray(xin,&xarray);
714:     PetscLogGpuTimeBegin();
715:     cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
716:     VecCUDARestoreArray(xin,&xarray);
717:     err  = WaitForCUDA();CHKERRCUDA(err);
718:     PetscLogGpuTimeEnd();
719:     PetscLogCpuToGpu(sizeof(PetscScalar));
720:     PetscLogGpuFlops(xin->map->n);
721:   }
722:   return(0);
723: }

725: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
726: {
727:   const PetscScalar *xarray,*yarray;
728:   PetscErrorCode    ierr;
729:   PetscBLASInt      one = 1,bn = 0;
730:   cublasHandle_t    cublasv2handle;
731:   cublasStatus_t    cerr;

734:   PetscCUBLASGetHandle(&cublasv2handle);
735:   PetscBLASIntCast(xin->map->n,&bn);
736:   VecCUDAGetArrayRead(xin,&xarray);
737:   VecCUDAGetArrayRead(yin,&yarray);
738:   PetscLogGpuTimeBegin();
739:   cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
740:   PetscLogGpuTimeEnd();
741:   if (xin->map->n > 0) {
742:     PetscLogGpuFlops(2.0*xin->map->n-1);
743:   }
744:   PetscLogGpuToCpu(sizeof(PetscScalar));
745:   VecCUDARestoreArrayRead(yin,&yarray);
746:   VecCUDARestoreArrayRead(xin,&xarray);
747:   return(0);
748: }

750: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
751: {
752:   const PetscScalar *xarray;
753:   PetscScalar       *yarray;
754:   PetscErrorCode    ierr;
755:   cudaError_t       err;

758:   if (xin != yin) {
759:     if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
760:       PetscBool yiscuda;

762:       PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
763:       VecCUDAGetArrayRead(xin,&xarray);
764:       if (yiscuda) {
765:         VecCUDAGetArrayWrite(yin,&yarray);
766:       } else {
767:         VecGetArrayWrite(yin,&yarray);
768:       }
769:       PetscLogGpuTimeBegin();
770:       if (yiscuda) {
771:         err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
772:       } else {
773:         err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
774:       }
775:       err  = WaitForCUDA();CHKERRCUDA(err);
776:       PetscLogGpuTimeEnd();
777:       VecCUDARestoreArrayRead(xin,&xarray);
778:       if (yiscuda) {
779:         VecCUDARestoreArrayWrite(yin,&yarray);
780:       } else {
781:         VecRestoreArrayWrite(yin,&yarray);
782:       }
783:     } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
784:       /* copy in CPU if we are on the CPU */
785:       VecCopy_SeqCUDA_Private(xin,yin);
786:     } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
787:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to use it next) */
788:       if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
789:         /* copy in CPU */
790:         VecCopy_SeqCUDA_Private(xin,yin);
791:       } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
792:         /* copy in GPU */
793:         VecCUDAGetArrayRead(xin,&xarray);
794:         VecCUDAGetArrayWrite(yin,&yarray);
795:         PetscLogGpuTimeBegin();
796:         err  = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
797:         PetscLogGpuTimeEnd();
798:         VecCUDARestoreArrayRead(xin,&xarray);
799:         VecCUDARestoreArrayWrite(yin,&yarray);
800:       } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
801:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
802:            default to copy in GPU (this is an arbitrary choice) */
803:         VecCUDAGetArrayRead(xin,&xarray);
804:         VecCUDAGetArrayWrite(yin,&yarray);
805:         PetscLogGpuTimeBegin();
806:         err  = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
807:         PetscLogGpuTimeEnd();
808:         VecCUDARestoreArrayRead(xin,&xarray);
809:         VecCUDARestoreArrayWrite(yin,&yarray);
810:       } else {
811:         VecCopy_SeqCUDA_Private(xin,yin);
812:       }
813:     }
814:   }
815:   return(0);
816: }

818: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
819: {
821:   PetscBLASInt   one = 1,bn = 0;
822:   PetscScalar    *xarray,*yarray;
823:   cublasHandle_t cublasv2handle;
824:   cublasStatus_t cberr;
825:   cudaError_t    err;

828:   PetscCUBLASGetHandle(&cublasv2handle);
829:   PetscBLASIntCast(xin->map->n,&bn);
830:   if (xin != yin) {
831:     VecCUDAGetArray(xin,&xarray);
832:     VecCUDAGetArray(yin,&yarray);
833:     PetscLogGpuTimeBegin();
834:     cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
835:     err  = WaitForCUDA();CHKERRCUDA(err);
836:     PetscLogGpuTimeEnd();
837:     VecCUDARestoreArray(xin,&xarray);
838:     VecCUDARestoreArray(yin,&yarray);
839:   }
840:   return(0);
841: }

843: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
844: {
845:   PetscErrorCode    ierr;
846:   PetscScalar       a = alpha,b = beta;
847:   const PetscScalar *xarray;
848:   PetscScalar       *yarray;
849:   PetscBLASInt      one = 1, bn = 0;
850:   cublasHandle_t    cublasv2handle;
851:   cublasStatus_t    cberr;
852:   cudaError_t       err;

855:   PetscCUBLASGetHandle(&cublasv2handle);
856:   PetscBLASIntCast(yin->map->n,&bn);
857:   if (a == (PetscScalar)0.0) {
858:     VecScale_SeqCUDA(yin,beta);
859:   } else if (b == (PetscScalar)1.0) {
860:     VecAXPY_SeqCUDA(yin,alpha,xin);
861:   } else if (a == (PetscScalar)1.0) {
862:     VecAYPX_SeqCUDA(yin,beta,xin);
863:   } else if (b == (PetscScalar)0.0) {
864:     VecCUDAGetArrayRead(xin,&xarray);
865:     VecCUDAGetArray(yin,&yarray);
866:     PetscLogGpuTimeBegin();
867:     err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
868:     cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
869:     err  = WaitForCUDA();CHKERRCUDA(err);
870:     PetscLogGpuTimeEnd();
871:     PetscLogGpuFlops(xin->map->n);
872:     PetscLogCpuToGpu(sizeof(PetscScalar));
873:     VecCUDARestoreArrayRead(xin,&xarray);
874:     VecCUDARestoreArray(yin,&yarray);
875:   } else {
876:     VecCUDAGetArrayRead(xin,&xarray);
877:     VecCUDAGetArray(yin,&yarray);
878:     PetscLogGpuTimeBegin();
879:     cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
880:     cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
881:     VecCUDARestoreArrayRead(xin,&xarray);
882:     VecCUDARestoreArray(yin,&yarray);
883:     err  = WaitForCUDA();CHKERRCUDA(err);
884:     PetscLogGpuTimeEnd();
885:     PetscLogGpuFlops(3.0*xin->map->n);
886:     PetscLogCpuToGpu(2*sizeof(PetscScalar));
887:   }
888:   return(0);
889: }

891: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
892: {
894:   PetscInt       n = zin->map->n;

897:   if (gamma == (PetscScalar)1.0) {
898:     /* z = ax + b*y + z */
899:     VecAXPY_SeqCUDA(zin,alpha,xin);
900:     VecAXPY_SeqCUDA(zin,beta,yin);
901:     PetscLogGpuFlops(4.0*n);
902:   } else {
903:     /* z = a*x + b*y + c*z */
904:     VecScale_SeqCUDA(zin,gamma);
905:     VecAXPY_SeqCUDA(zin,alpha,xin);
906:     VecAXPY_SeqCUDA(zin,beta,yin);
907:     PetscLogGpuFlops(5.0*n);
908:   }
909:   return(0);
910: }
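
/* Added flop-count note (not in the original source): the gamma == 1 branch above performs two AXPY
   updates of 2n flops each, hence PetscLogGpuFlops(4.0*n); the general branch adds a scale of z
   (n flops) before the two AXPYs, hence 5.0*n. */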

912: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
913: {
914:   PetscInt                              n = win->map->n;
915:   const PetscScalar                     *xarray,*yarray;
916:   PetscScalar                           *warray;
917:   thrust::device_ptr<const PetscScalar> xptr,yptr;
918:   thrust::device_ptr<PetscScalar>       wptr;
919:   PetscErrorCode                        ierr;
920:   cudaError_t                           err;

923:   VecCUDAGetArrayRead(xin,&xarray);
924:   VecCUDAGetArrayRead(yin,&yarray);
925:   VecCUDAGetArrayWrite(win,&warray);
926:   PetscLogGpuTimeBegin();
927:   try {
928:     wptr = thrust::device_pointer_cast(warray);
929:     xptr = thrust::device_pointer_cast(xarray);
930:     yptr = thrust::device_pointer_cast(yarray);
931:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
932:     err  = WaitForCUDA();CHKERRCUDA(err);
933:   } catch (char *ex) {
934:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
935:   }
936:   PetscLogGpuTimeEnd();
937:   VecCUDARestoreArrayRead(xin,&xarray);
938:   VecCUDARestoreArrayRead(yin,&yarray);
939:   VecCUDARestoreArrayWrite(win,&warray);
940:   PetscLogGpuFlops(n);
941:   return(0);
942: }

944: /* TODO: compute the infinity norm entirely in CUDA; the current code uses cublasIXamax() plus a one-element device-to-host copy */

946: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
947: {
948:   PetscErrorCode    ierr;
949:   PetscInt          n = xin->map->n;
950:   PetscBLASInt      one = 1, bn = 0;
951:   const PetscScalar *xarray;
952:   cublasHandle_t    cublasv2handle;
953:   cublasStatus_t    cberr;
954:   cudaError_t       err;

957:   PetscCUBLASGetHandle(&cublasv2handle);
958:   PetscBLASIntCast(n,&bn);
959:   if (type == NORM_2 || type == NORM_FROBENIUS) {
960:     VecCUDAGetArrayRead(xin,&xarray);
961:     PetscLogGpuTimeBegin();
962:     cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
963:     PetscLogGpuTimeEnd();
964:     VecCUDARestoreArrayRead(xin,&xarray);
965:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
966:   } else if (type == NORM_INFINITY) {
967:     int  i;
968:     VecCUDAGetArrayRead(xin,&xarray);
969:     PetscLogGpuTimeBegin();
970:     cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
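    /* Added note: cublasIXamax() returns a 1-based (Fortran-style) index, hence the xarray+i-1 below */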
971:     PetscLogGpuTimeEnd();
972:     if (bn) {
973:       PetscScalar zs;
974:       err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
975:       *z = PetscAbsScalar(zs);
976:     } else *z = 0.0;
977:     VecCUDARestoreArrayRead(xin,&xarray);
978:   } else if (type == NORM_1) {
979:     VecCUDAGetArrayRead(xin,&xarray);
980:     PetscLogGpuTimeBegin();
981:     cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
982:     VecCUDARestoreArrayRead(xin,&xarray);
983:     PetscLogGpuTimeEnd();
984:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
985:   } else if (type == NORM_1_AND_2) {
986:     VecNorm_SeqCUDA(xin,NORM_1,z);
987:     VecNorm_SeqCUDA(xin,NORM_2,z+1);
988:   }
989:   PetscLogGpuToCpu(sizeof(PetscReal));
990:   return(0);
991: }

993: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
994: {

998:   VecDot_SeqCUDA(s,t,dp);
999:   VecDot_SeqCUDA(t,t,nm);
1000:   return(0);
1001: }

1003: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1004: {
1006:   cudaError_t    cerr;
1007:   Vec_CUDA       *veccuda = (Vec_CUDA*)v->spptr;

1010:   if (v->spptr) {
1011:     if (veccuda->GPUarray_allocated) {
1012:      #if defined(PETSC_HAVE_NVSHMEM)
1013:       if (veccuda->nvshmem) {
1014:         PetscNvshmemFree(veccuda->GPUarray_allocated);
1015:         veccuda->nvshmem = PETSC_FALSE;
1016:       }
1017:       else
1018:      #endif
1019:       {cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);}
1020:       veccuda->GPUarray_allocated = NULL;
1021:     }
1022:     if (veccuda->stream) {
1023:       cerr = cudaStreamDestroy(veccuda->stream);CHKERRCUDA(cerr);
1024:     }
1025:   }
1026:   VecDestroy_SeqCUDA_Private(v);
1027:   PetscFree(v->spptr);
1028:   return(0);
1029: }

1031: #if defined(PETSC_USE_COMPLEX)
1032: struct conjugate
1033: {
1034:   __host__ __device__
1035:     PetscScalar operator()(PetscScalar x)
1036:     {
1037:       return PetscConj(x);
1038:     }
1039: };
1040: #endif

1042: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1043: {
1044: #if defined(PETSC_USE_COMPLEX)
1045:   PetscScalar                     *xarray;
1046:   PetscErrorCode                  ierr;
1047:   PetscInt                        n = xin->map->n;
1048:   thrust::device_ptr<PetscScalar> xptr;
1049:   cudaError_t                     err;

1052:   VecCUDAGetArray(xin,&xarray);
1053:   PetscLogGpuTimeBegin();
1054:   try {
1055:     xptr = thrust::device_pointer_cast(xarray);
1056:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1057:     err  = WaitForCUDA();CHKERRCUDA(err);
1058:   } catch (char *ex) {
1059:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1060:   }
1061:   PetscLogGpuTimeEnd();
1062:   VecCUDARestoreArray(xin,&xarray);
1063: #else
1065: #endif
1066:   return(0);
1067: }

1069: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1070: {
1072:   cudaError_t    err;

1079:   if (w->data) {
1080:     if (((Vec_Seq*)w->data)->array_allocated) {
1081:       if (w->pinned_memory) {
1082:         PetscMallocSetCUDAHost();
1083:       }
1084:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1085:       if (w->pinned_memory) {
1086:         PetscMallocResetCUDAHost();
1087:         w->pinned_memory = PETSC_FALSE;
1088:       }
1089:     }
1090:     ((Vec_Seq*)w->data)->array = NULL;
1091:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1092:   }
1093:   if (w->spptr) {
1095:     if (((Vec_CUDA*)w->spptr)->GPUarray) {
1096:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1097:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1098:     }
1099:     if (((Vec_CUDA*)w->spptr)->stream) {
1100:       err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1101:     }
1102:     PetscFree(w->spptr);
1103:   }

1105:   if (v->petscnative) {
1106:     PetscFree(w->data);
1107:     w->data = v->data;
1108:     w->offloadmask = v->offloadmask;
1109:     w->pinned_memory = v->pinned_memory;
1110:     w->spptr = v->spptr;
1111:     PetscObjectStateIncrease((PetscObject)w);
1112:   } else {
1113:     VecGetArray(v,&((Vec_Seq*)w->data)->array);
1114:     w->offloadmask = PETSC_OFFLOAD_CPU;
1115:     VecCUDAAllocateCheck(w);
1116:   }
1117:   return(0);
1118: }

1120: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1121: {
1123:   cudaError_t    err;


1131:   if (v->petscnative) {
1132:     v->data = w->data;
1133:     v->offloadmask = w->offloadmask;
1134:     v->pinned_memory = w->pinned_memory;
1135:     v->spptr = w->spptr;
1136:     w->data = 0;
1137:     w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1138:     w->spptr = 0;
1139:   } else {
1140:     VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1141:     if ((Vec_CUDA*)w->spptr) {
1142:       err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1143:       ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1144:       if (((Vec_CUDA*)v->spptr)->stream) {
1145:         err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1146:       }
1147:       PetscFree(w->spptr);
1148:     }
1149:   }
1150:   return(0);
1151: }

1153: struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
1154: {
1155:   __host__ __device__
1156:   PetscReal operator()(PetscScalar x) {
1157:     return PetscRealPart(x);
1158:   }
1159: };

1161: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1162: {
1163:   __host__ __device__
1164:   thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscScalar, PetscInt> x) {
1165:     return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
1166:   }
1167: };

1169: struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1170: {
1171:   __host__ __device__
1172:   PetscReal operator()(PetscReal x, PetscReal y) {
1173:     return x < y ? y : x;
1174:   }
1175: };

1177: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1178: {
1179:   __host__ __device__
1180:   thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscReal, PetscInt> x, thrust::tuple<PetscReal, PetscInt> y) {
1181:     return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1182:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1183:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1184:   }
1185: };

1187: struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1188: {
1189:   __host__ __device__
1190:   PetscReal operator()(PetscReal x, PetscReal y) {
1191:     return x < y ? x : y;
1192:   }
1193: };

1195: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1196: {
1197:   __host__ __device__
1198:   thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscReal, PetscInt> x, thrust::tuple<PetscReal, PetscInt> y) {
1199:     return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1200:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1201:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1202:   }
1203: };
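
/* Added note (not in the original source): petscmaxi/petscmini reduce (value,index) pairs for
   VecMax/VecMin with index output; the larger (resp. smaller) value wins and, when the values are
   equal, the pair with the smaller index is kept, e.g. petscmaxi((3.0,5),(3.0,2)) returns (3.0,2). */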

1205: PetscErrorCode VecMax_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1206: {
1207:   PetscErrorCode                        ierr;
1208:   PetscInt                              n = v->map->n;
1209:   const PetscScalar                     *av;
1210:   thrust::device_ptr<const PetscScalar> avpt;

1214:   if (!n) {
1215:     *m = PETSC_MIN_REAL;
1216:     if (p) *p = -1;
1217:     return(0);
1218:   }
1219:   VecCUDAGetArrayRead(v,&av);
1220:   avpt = thrust::device_pointer_cast(av);
1221:   PetscLogGpuTimeBegin();
1222:   if (p) {
1223:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
1224:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1225:     try {
1226: #if defined(PETSC_USE_COMPLEX)
1227:       res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmaxi());
1228: #else
1229:       res = thrust::reduce(zibit,zibit+n,res,petscmaxi());
1230: #endif
1231:     } catch (char *ex) {
1232:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1233:     }
1234:     *m = res.get<0>();
1235:     *p = res.get<1>();
1236:   } else {
1237:     try {
1238: #if defined(PETSC_USE_COMPLEX)
1239:       *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
1240: #else
1241:       *m = thrust::reduce(avpt,avpt+n,PETSC_MIN_REAL,petscmax());
1242: #endif
1243:     } catch (char *ex) {
1244:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1245:     }
1246:   }
1247:   PetscLogGpuTimeEnd();
1248:   VecCUDARestoreArrayRead(v,&av);
1249:   return(0);
1250: }

1252: PetscErrorCode VecMin_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1253: {
1254:   PetscErrorCode                        ierr;
1255:   PetscInt                              n = v->map->n;
1256:   const PetscScalar                     *av;
1257:   thrust::device_ptr<const PetscScalar> avpt;

1261:   if (!n) {
1262:     *m = PETSC_MAX_REAL;
1263:     if (p) *p = -1;
1264:     return(0);
1265:   }
1266:   VecCUDAGetArrayRead(v,&av);
1267:   avpt = thrust::device_pointer_cast(av);
1268:   PetscLogGpuTimeBegin();
1269:   if (p) {
1270:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
1271:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1272:     try {
1273: #if defined(PETSC_USE_COMPLEX)
1274:       res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmini());
1275: #else
1276:       res = thrust::reduce(zibit,zibit+n,res,petscmini());
1277: #endif
1278:     } catch (char *ex) {
1279:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1280:     }
1281:     *m = res.get<0>();
1282:     *p = res.get<1>();
1283:   } else {
1284:     try {
1285: #if defined(PETSC_USE_COMPLEX)
1286:       *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
1287: #else
1288:       *m = thrust::reduce(avpt,avpt+n,PETSC_MAX_REAL,petscmin());
1289: #endif
1290:     } catch (char *ex) {
1291:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1292:     }
1293:   }
1294:   PetscLogGpuTimeEnd();
1295:   VecCUDARestoreArrayRead(v,&av);
1296:   return(0);
1297: }
1298: #if defined(PETSC_HAVE_NVSHMEM)
1299: /* Free the old CUDA array and re-allocate a new one from the nvshmem symmetric heap.
1300:    The new array does not retain the values of the old array. The offload mask is not changed.

1302:    Note: the function is only meant to be used in MatAssemblyEnd_MPIAIJCUSPARSE.
1303:  */
1304: PetscErrorCode  VecAllocateNVSHMEM_SeqCUDA(Vec v)
1305: {
1307:   cudaError_t    cerr;
1308:   Vec_CUDA       *veccuda = (Vec_CUDA*)v->spptr;
1309:   PetscInt       n;

1312:   cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);
1313:   VecGetLocalSize(v,&n);
1314:   MPIU_Allreduce(MPI_IN_PLACE,&n,1,MPIU_INT,MPI_MAX,PETSC_COMM_WORLD);
1315:   PetscNvshmemMalloc(n*sizeof(PetscScalar),(void**)&veccuda->GPUarray_allocated);
1316:   veccuda->GPUarray = veccuda->GPUarray_allocated;
1317:   veccuda->nvshmem  = PETSC_TRUE;
1318:   return(0);
1319: }
1320: #endif