Actual source code: veccuda2.cu
petsc-3.15.0 2021-03-30
1: /*
2:    Implements the sequential CUDA vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_CXX_COMPLEX_FIX
8: #include <petscconf.h>
9: #include <petsc/private/vecimpl.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/cudavecimpl.h>
13: #include <cuda_runtime.h>
14: #include <thrust/device_ptr.h>
15: #include <thrust/transform.h>
16: #include <thrust/functional.h>
17: #include <thrust/reduce.h>
19: /*
20: Allocates space for the vector array on the GPU if it does not exist.
21:    Does NOT change the vector's offload mask, except to initialize it when it is PETSC_OFFLOAD_UNALLOCATED
22: Does NOT zero the CUDA array
24: */
25: PetscErrorCode VecCUDAAllocateCheck(Vec v)
26: {
28: cudaError_t err;
29: Vec_CUDA *veccuda;
30: PetscBool option_set;
33: if (!v->spptr) {
34: PetscReal pinned_memory_min;
35: PetscCalloc(sizeof(Vec_CUDA),&v->spptr);
36: veccuda = (Vec_CUDA*)v->spptr;
37: err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
38: veccuda->GPUarray = veccuda->GPUarray_allocated;
39: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
40: if (v->data && ((Vec_Seq*)v->data)->array) {
41: v->offloadmask = PETSC_OFFLOAD_CPU;
42: } else {
43: v->offloadmask = PETSC_OFFLOAD_GPU;
44: }
45: }
46: pinned_memory_min = 0;
48: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
49:        Note: This same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
50: PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
51: PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
52: if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
53: PetscOptionsEnd();
54: }
55: return(0);
56: }
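/* Offload-mask convention used by VecCUDAAllocateCheck() above and the copy helpers below:
     PETSC_OFFLOAD_UNALLOCATED - no up-to-date array on either side yet
     PETSC_OFFLOAD_CPU         - the host array holds the current values
     PETSC_OFFLOAD_GPU         - the device array holds the current values
     PETSC_OFFLOAD_BOTH        - host and device copies are in sync
   The copy routines transfer data only when the destination is stale and then set the mask
   to PETSC_OFFLOAD_BOTH. */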
58: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
59: PetscErrorCode VecCUDACopyToGPU(Vec v)
60: {
62: cudaError_t err;
63: Vec_CUDA *veccuda;
64: PetscScalar *varray;
68: VecCUDAAllocateCheck(v);
69: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
70: PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
71: veccuda = (Vec_CUDA*)v->spptr;
72: varray = veccuda->GPUarray;
73: err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
74: PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
75: PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
76: v->offloadmask = PETSC_OFFLOAD_BOTH;
77: }
78: return(0);
79: }
81: /*
82: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
83: */
84: PetscErrorCode VecCUDACopyFromGPU(Vec v)
85: {
87: cudaError_t err;
88: Vec_CUDA *veccuda;
89: PetscScalar *varray;
93: VecCUDAAllocateCheckHost(v);
94: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
95: PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
96: veccuda = (Vec_CUDA*)v->spptr;
97: varray = veccuda->GPUarray;
98: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
99: PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
100: PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
101: v->offloadmask = PETSC_OFFLOAD_BOTH;
102: }
103: return(0);
104: }
106: /*MC
107: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
109: Options Database Keys:
110: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
112: Level: beginner
114: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
115: M*/
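/* Illustrative usage sketch (not taken from this file; error checking omitted): create a
   length-n VECSEQCUDA vector, either explicitly with VecSetType() or by combining
   VecSetFromOptions() with the -vec_type seqcuda option.

     Vec      x;
     PetscInt n = 100;
     VecCreate(PETSC_COMM_SELF,&x);
     VecSetSizes(x,n,n);
     VecSetType(x,VECSEQCUDA);
     VecSet(x,1.0);
     VecDestroy(&x);
*/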
117: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
118: {
119: const PetscScalar *xarray;
120: PetscScalar *yarray;
121: PetscErrorCode ierr;
122: PetscBLASInt one = 1,bn = 0;
123: PetscScalar sone = 1.0;
124: cublasHandle_t cublasv2handle;
125: cublasStatus_t cberr;
126: cudaError_t err;
129: PetscCUBLASGetHandle(&cublasv2handle);
130: PetscBLASIntCast(yin->map->n,&bn);
131: VecCUDAGetArrayRead(xin,&xarray);
132: VecCUDAGetArray(yin,&yarray);
133: PetscLogGpuTimeBegin();
134: if (alpha == (PetscScalar)0.0) {
135: err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
136: } else if (alpha == (PetscScalar)1.0) {
137: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
138: PetscLogGpuFlops(1.0*yin->map->n);
139: } else {
140: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
141: cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
142: PetscLogGpuFlops(2.0*yin->map->n);
143: }
144: err = WaitForCUDA();CHKERRCUDA(err);
145: PetscLogGpuTimeEnd();
146: VecCUDARestoreArrayRead(xin,&xarray);
147: VecCUDARestoreArray(yin,&yarray);
148: PetscLogCpuToGpu(sizeof(PetscScalar));
149: return(0);
150: }
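/* Summary of the branches above: alpha == 0 copies x into y (y = x); alpha == 1 uses a single
   axpy (y = x + y); otherwise y is first scaled by alpha and x is then added (y = alpha*y + x). */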
152: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
153: {
154: const PetscScalar *xarray;
155: PetscScalar *yarray;
156: PetscErrorCode ierr;
157: PetscBLASInt one = 1,bn = 0;
158: cublasHandle_t cublasv2handle;
159: cublasStatus_t cberr;
160: PetscBool xiscuda;
161: cudaError_t err;
164: if (alpha == (PetscScalar)0.0) return(0);
165: PetscCUBLASGetHandle(&cublasv2handle);
166: PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
167: if (xiscuda) {
168: PetscBLASIntCast(yin->map->n,&bn);
169: VecCUDAGetArrayRead(xin,&xarray);
170: VecCUDAGetArray(yin,&yarray);
171: PetscLogGpuTimeBegin();
172: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
173: err = WaitForCUDA();CHKERRCUDA(err);
174: PetscLogGpuTimeEnd();
175: VecCUDARestoreArrayRead(xin,&xarray);
176: VecCUDARestoreArray(yin,&yarray);
177: PetscLogGpuFlops(2.0*yin->map->n);
178: PetscLogCpuToGpu(sizeof(PetscScalar));
179: } else {
180: VecAXPY_Seq(yin,alpha,xin);
181: }
182: return(0);
183: }
185: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
186: {
187: PetscInt n = xin->map->n;
188: const PetscScalar *xarray=NULL,*yarray=NULL;
189: PetscScalar *warray=NULL;
190: thrust::device_ptr<const PetscScalar> xptr,yptr;
191: thrust::device_ptr<PetscScalar> wptr;
192: PetscErrorCode ierr;
193: cudaError_t err;
196: VecCUDAGetArrayWrite(win,&warray);
197: VecCUDAGetArrayRead(xin,&xarray);
198: VecCUDAGetArrayRead(yin,&yarray);
199: PetscLogGpuTimeBegin();
200: try {
201: wptr = thrust::device_pointer_cast(warray);
202: xptr = thrust::device_pointer_cast(xarray);
203: yptr = thrust::device_pointer_cast(yarray);
204: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
205: err = WaitForCUDA();CHKERRCUDA(err);
206: } catch (char *ex) {
207: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
208: }
209: PetscLogGpuTimeEnd();
210: PetscLogGpuFlops(n);
211: VecCUDARestoreArrayRead(xin,&xarray);
212: VecCUDARestoreArrayRead(yin,&yarray);
213: VecCUDARestoreArrayWrite(win,&warray);
214: return(0);
215: }
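/* The thrust::transform call above computes w[i] = x[i]/y[i] elementwise; zero entries in y are
   not treated specially. */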
217: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
218: {
219: const PetscScalar *xarray=NULL,*yarray=NULL;
220: PetscScalar *warray=NULL;
221: PetscErrorCode ierr;
222: PetscBLASInt one = 1,bn = 0;
223: cublasHandle_t cublasv2handle;
224: cublasStatus_t stat;
225: cudaError_t cerr;
226: cudaStream_t stream;
229: PetscCUBLASGetHandle(&cublasv2handle);
230: PetscBLASIntCast(win->map->n,&bn);
231: if (alpha == (PetscScalar)0.0) {
232: VecCopy_SeqCUDA(yin,win);
233: } else {
234: VecCUDAGetArrayRead(xin,&xarray);
235: VecCUDAGetArrayRead(yin,&yarray);
236: VecCUDAGetArrayWrite(win,&warray);
237: PetscLogGpuTimeBegin();
238: stat = cublasGetStream(cublasv2handle,&stream);CHKERRCUBLAS(stat);
239: cerr = cudaMemcpyAsync(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,stream);CHKERRCUDA(cerr);
240: stat = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(stat);
241: cerr = WaitForCUDA();CHKERRCUDA(cerr);
242: PetscLogGpuTimeEnd();
243: PetscLogGpuFlops(2*win->map->n);
244: VecCUDARestoreArrayRead(xin,&xarray);
245: VecCUDARestoreArrayRead(yin,&yarray);
246: VecCUDARestoreArrayWrite(win,&warray);
247: PetscLogCpuToGpu(sizeof(PetscScalar));
248: }
249: return(0);
250: }
252: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
253: {
254: PetscErrorCode ierr;
255: cudaError_t err;
256: PetscInt n = xin->map->n,j;
257: PetscScalar *xarray;
258: const PetscScalar *yarray;
259: PetscBLASInt one = 1,bn = 0;
260: cublasHandle_t cublasv2handle;
261: cublasStatus_t cberr;
264: PetscLogGpuFlops(nv*2.0*n);
265: PetscLogCpuToGpu(nv*sizeof(PetscScalar));
266: PetscCUBLASGetHandle(&cublasv2handle);
267: PetscBLASIntCast(n,&bn);
268: PetscLogGpuTimeBegin();
269: VecCUDAGetArray(xin,&xarray);
270: for (j=0; j<nv; j++) {
271: VecCUDAGetArrayRead(y[j],&yarray);
272: cberr = cublasXaxpy(cublasv2handle,bn,alpha+j,yarray,one,xarray,one);CHKERRCUBLAS(cberr);
273: VecCUDARestoreArrayRead(y[j],&yarray);
274: }
275: err = WaitForCUDA();CHKERRCUDA(err);
276: PetscLogGpuTimeEnd();
277: VecCUDARestoreArray(xin,&xarray);
278: return(0);
279: }
281: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
282: {
283: const PetscScalar *xarray,*yarray;
284: PetscErrorCode ierr;
285: PetscBLASInt one = 1,bn = 0;
286: cublasHandle_t cublasv2handle;
287: cublasStatus_t cerr;
290: PetscCUBLASGetHandle(&cublasv2handle);
291: PetscBLASIntCast(yin->map->n,&bn);
292: VecCUDAGetArrayRead(xin,&xarray);
293: VecCUDAGetArrayRead(yin,&yarray);
294: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
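     /* i.e. z = y^H x = sum_i x[i]*conj(y[i]); cublasXdot() conjugates its first vector argument,
        so passing yarray first reproduces PETSc's convention (for real scalars the conjugation is a no-op) */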
295: PetscLogGpuTimeBegin();
296: cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
297: PetscLogGpuTimeEnd();
298:   if (xin->map->n > 0) {
299: PetscLogGpuFlops(2.0*xin->map->n-1);
300: }
301: PetscLogGpuToCpu(sizeof(PetscScalar));
302: VecCUDARestoreArrayRead(xin,&xarray);
303: VecCUDARestoreArrayRead(yin,&yarray);
304: return(0);
305: }
307: //
308: // CUDA kernels for MDot to follow
309: //
311: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
312: #define MDOT_WORKGROUP_SIZE 128
313: #define MDOT_WORKGROUP_NUM 128
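// Worked example of the partitioning used by the kernels below: launched with
// MDOT_WORKGROUP_NUM = 128 blocks of MDOT_WORKGROUP_SIZE = 128 threads and size = 1000000,
// each block gets entries_per_group = (1000000-1)/128 + 1 = 7813 entries; block b covers
// indices [b*7813, min((b+1)*7813, size)), with its threads striding through that range by blockDim.x.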
315: #if !defined(PETSC_USE_COMPLEX)
316: // M = 2:
317: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
318: PetscInt size, PetscScalar *group_results)
319: {
320: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
321: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
322: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
323: PetscInt vec_start_index = blockIdx.x * entries_per_group;
324: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
326: PetscScalar entry_x = 0;
327: PetscScalar group_sum0 = 0;
328: PetscScalar group_sum1 = 0;
329: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
330: entry_x = x[i]; // load only once from global memory!
331: group_sum0 += entry_x * y0[i];
332: group_sum1 += entry_x * y1[i];
333: }
334: tmp_buffer[threadIdx.x] = group_sum0;
335: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
337: // parallel reduction
338: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
339: __syncthreads();
340: if (threadIdx.x < stride) {
341: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
342: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
343: }
344: }
346: // write result of group to group_results
347: if (threadIdx.x == 0) {
348: group_results[blockIdx.x] = tmp_buffer[0];
349: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
350: }
351: }
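// Layout used by the kernel above (and by the 3-, 4- and 8-vector variants below): tmp_buffer
// holds one partial sum per thread per dot product, in consecutive slabs of MDOT_WORKGROUP_SIZE
// entries; after the tree reduction, thread 0 of block b stores its contribution to the k-th dot
// product in group_results[b + k*gridDim.x]. The final sum over the gridDim.x partial results per
// dot product is performed on the host in VecMDot_SeqCUDA().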
353: // M = 3:
354: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
355: PetscInt size, PetscScalar *group_results)
356: {
357: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
358: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
359: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
360: PetscInt vec_start_index = blockIdx.x * entries_per_group;
361: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
363: PetscScalar entry_x = 0;
364: PetscScalar group_sum0 = 0;
365: PetscScalar group_sum1 = 0;
366: PetscScalar group_sum2 = 0;
367: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
368: entry_x = x[i]; // load only once from global memory!
369: group_sum0 += entry_x * y0[i];
370: group_sum1 += entry_x * y1[i];
371: group_sum2 += entry_x * y2[i];
372: }
373: tmp_buffer[threadIdx.x] = group_sum0;
374: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
375: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
377: // parallel reduction
378: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
379: __syncthreads();
380: if (threadIdx.x < stride) {
381: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
382: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
383: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
384: }
385: }
387: // write result of group to group_results
388: if (threadIdx.x == 0) {
389: group_results[blockIdx.x ] = tmp_buffer[0];
390: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
391: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
392: }
393: }
395: // M = 4:
396: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
397: PetscInt size, PetscScalar *group_results)
398: {
399: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
400: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
401: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
402: PetscInt vec_start_index = blockIdx.x * entries_per_group;
403: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
405: PetscScalar entry_x = 0;
406: PetscScalar group_sum0 = 0;
407: PetscScalar group_sum1 = 0;
408: PetscScalar group_sum2 = 0;
409: PetscScalar group_sum3 = 0;
410: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
411: entry_x = x[i]; // load only once from global memory!
412: group_sum0 += entry_x * y0[i];
413: group_sum1 += entry_x * y1[i];
414: group_sum2 += entry_x * y2[i];
415: group_sum3 += entry_x * y3[i];
416: }
417: tmp_buffer[threadIdx.x] = group_sum0;
418: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
419: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
420: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
422: // parallel reduction
423: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
424: __syncthreads();
425: if (threadIdx.x < stride) {
426: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
427: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
428: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
429: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
430: }
431: }
433: // write result of group to group_results
434: if (threadIdx.x == 0) {
435: group_results[blockIdx.x ] = tmp_buffer[0];
436: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
437: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
438: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
439: }
440: }
442: // M = 8:
443: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
444: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
445: PetscInt size, PetscScalar *group_results)
446: {
447: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
448: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
449: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
450: PetscInt vec_start_index = blockIdx.x * entries_per_group;
451: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
453: PetscScalar entry_x = 0;
454: PetscScalar group_sum0 = 0;
455: PetscScalar group_sum1 = 0;
456: PetscScalar group_sum2 = 0;
457: PetscScalar group_sum3 = 0;
458: PetscScalar group_sum4 = 0;
459: PetscScalar group_sum5 = 0;
460: PetscScalar group_sum6 = 0;
461: PetscScalar group_sum7 = 0;
462: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
463: entry_x = x[i]; // load only once from global memory!
464: group_sum0 += entry_x * y0[i];
465: group_sum1 += entry_x * y1[i];
466: group_sum2 += entry_x * y2[i];
467: group_sum3 += entry_x * y3[i];
468: group_sum4 += entry_x * y4[i];
469: group_sum5 += entry_x * y5[i];
470: group_sum6 += entry_x * y6[i];
471: group_sum7 += entry_x * y7[i];
472: }
473: tmp_buffer[threadIdx.x] = group_sum0;
474: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
475: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
476: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
477: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
478: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
479: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
480: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
482: // parallel reduction
483: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
484: __syncthreads();
485: if (threadIdx.x < stride) {
486: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
487: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
488: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
489: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
490: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
491: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
492: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
493: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
494: }
495: }
497: // write result of group to group_results
498: if (threadIdx.x == 0) {
499: group_results[blockIdx.x ] = tmp_buffer[0];
500: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
501: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
502: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
503: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
504: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
505: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
506: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
507: }
508: }
509: #endif /* !defined(PETSC_USE_COMPLEX) */
511: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
512: {
513: PetscErrorCode ierr;
514: PetscInt i,n = xin->map->n,current_y_index = 0;
515: const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
516: #if !defined(PETSC_USE_COMPLEX)
517: PetscInt nv1 = ((nv % 4) == 1) ? nv-1: nv,j;
518: PetscScalar *group_results_gpu,group_results_cpu[nv1*MDOT_WORKGROUP_NUM];
519: cudaError_t cuda_ierr;
520: #endif
521: PetscBLASInt one = 1,bn = 0;
522: cublasHandle_t cublasv2handle;
523: cublasStatus_t cberr;
524: cudaError_t err;
527: PetscCUBLASGetHandle(&cublasv2handle);
528: PetscBLASIntCast(xin->map->n,&bn);
529: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
530: /* Handle the case of local size zero first */
531: if (!xin->map->n) {
532: for (i=0; i<nv; ++i) z[i] = 0;
533: return(0);
534: }
536: #if !defined(PETSC_USE_COMPLEX)
537: // allocate scratchpad memory for the results of individual work groups:
538:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);CHKERRCUDA(cuda_ierr);
539: #endif
540: VecCUDAGetArrayRead(xin,&xptr);
541: PetscLogGpuTimeBegin();
543: while (current_y_index < nv)
544: {
545: switch (nv - current_y_index) {
547: case 7:
548: case 6:
549: case 5:
550: case 4:
551: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
552: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
553: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
554: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
555: #if defined(PETSC_USE_COMPLEX)
556: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
557: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
558: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
559: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
560: #else
561: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
563: #endif
564: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
565: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
566: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
567: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
568: current_y_index += 4;
569: break;
571: case 3:
572: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
573: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
574: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
576: #if defined(PETSC_USE_COMPLEX)
577: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
578: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
579: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
580: #else
581: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
582: #endif
583: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
584: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
585: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
586: current_y_index += 3;
587: break;
589: case 2:
590: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
591: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
592: #if defined(PETSC_USE_COMPLEX)
593: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
594: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
595: #else
596: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
597: #endif
598: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
599: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
600: current_y_index += 2;
601: break;
603: case 1:
604: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
605: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
606: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
607: current_y_index += 1;
608: break;
610: default: // 8 or more vectors left
611: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
612: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
613: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
614: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
615: VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
616: VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
617: VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
618: VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
619: #if defined(PETSC_USE_COMPLEX)
620: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
621: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
622: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
623: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
624: cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
625: cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
626: cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
627: cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
628: #else
629: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
630: #endif
631: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
632: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
633: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
634: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
635: VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
636: VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
637: VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
638: VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
639: current_y_index += 8;
640: break;
641: }
642: }
643: err = WaitForCUDA();CHKERRCUDA(err);
644: PetscLogGpuTimeEnd();
645: VecCUDARestoreArrayRead(xin,&xptr);
647: #if defined(PETSC_USE_COMPLEX)
648: PetscLogGpuToCpu(nv*sizeof(PetscScalar));
649: #else
650: // copy results to CPU
651:   cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
653: // sum group results into z
654: for (j=0; j<nv1; ++j) {
655: z[j] = 0;
656: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
657: }
658: PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
659:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
660: PetscLogGpuToCpu(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
661: #endif
662: PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
663: return(0);
664: }
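/* Note on the structure above: the hand-written kernels accumulate sum_i x[i]*y[i] without
   conjugation, so they are only compiled and used for real scalars; complex builds fall back to
   one cublasXdot() call per y vector, and the grouped-results buffer is not needed in that case. */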
666: #undef MDOT_WORKGROUP_SIZE
667: #undef MDOT_WORKGROUP_NUM
669: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
670: {
671: PetscInt n = xin->map->n;
672: PetscScalar *xarray = NULL;
673: thrust::device_ptr<PetscScalar> xptr;
674: PetscErrorCode ierr;
675: cudaError_t err;
678: VecCUDAGetArrayWrite(xin,&xarray);
679: PetscLogGpuTimeBegin();
680: if (alpha == (PetscScalar)0.0) {
681: err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
682: } else {
683: try {
684: xptr = thrust::device_pointer_cast(xarray);
685: thrust::fill(xptr,xptr+n,alpha);
686: } catch (char *ex) {
687: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
688: }
689: PetscLogCpuToGpu(sizeof(PetscScalar));
690: }
691: err = WaitForCUDA();CHKERRCUDA(err);
692: PetscLogGpuTimeEnd();
693: VecCUDARestoreArrayWrite(xin,&xarray);
694: return(0);
695: }
697: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
698: {
699: PetscScalar *xarray;
701: PetscBLASInt one = 1,bn = 0;
702: cublasHandle_t cublasv2handle;
703: cublasStatus_t cberr;
704: cudaError_t err;
707: if (alpha == (PetscScalar)0.0) {
708: VecSet_SeqCUDA(xin,alpha);
709: err = WaitForCUDA();CHKERRCUDA(err);
710: } else if (alpha != (PetscScalar)1.0) {
711: PetscCUBLASGetHandle(&cublasv2handle);
712: PetscBLASIntCast(xin->map->n,&bn);
713: VecCUDAGetArray(xin,&xarray);
714: PetscLogGpuTimeBegin();
715: cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
716: VecCUDARestoreArray(xin,&xarray);
717: err = WaitForCUDA();CHKERRCUDA(err);
718: PetscLogGpuTimeEnd();
719: PetscLogCpuToGpu(sizeof(PetscScalar));
720: PetscLogGpuFlops(xin->map->n);
721: }
722: return(0);
723: }
725: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
726: {
727: const PetscScalar *xarray,*yarray;
728: PetscErrorCode ierr;
729: PetscBLASInt one = 1,bn = 0;
730: cublasHandle_t cublasv2handle;
731: cublasStatus_t cerr;
734: PetscCUBLASGetHandle(&cublasv2handle);
735: PetscBLASIntCast(xin->map->n,&bn);
736: VecCUDAGetArrayRead(xin,&xarray);
737: VecCUDAGetArrayRead(yin,&yarray);
738: PetscLogGpuTimeBegin();
739: cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
740: PetscLogGpuTimeEnd();
741: if (xin->map->n > 0) {
742: PetscLogGpuFlops(2.0*xin->map->n-1);
743: }
744: PetscLogGpuToCpu(sizeof(PetscScalar));
745: VecCUDARestoreArrayRead(yin,&yarray);
746: VecCUDARestoreArrayRead(xin,&xarray);
747: return(0);
748: }
750: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
751: {
752: const PetscScalar *xarray;
753: PetscScalar *yarray;
754: PetscErrorCode ierr;
755: cudaError_t err;
758: if (xin != yin) {
759: if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
760: PetscBool yiscuda;
762: PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
763: VecCUDAGetArrayRead(xin,&xarray);
764: if (yiscuda) {
765: VecCUDAGetArrayWrite(yin,&yarray);
766: } else {
767: VecGetArrayWrite(yin,&yarray);
768: }
769: PetscLogGpuTimeBegin();
770: if (yiscuda) {
771: err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
772: } else {
773: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
774: }
775: err = WaitForCUDA();CHKERRCUDA(err);
776: PetscLogGpuTimeEnd();
777: VecCUDARestoreArrayRead(xin,&xarray);
778: if (yiscuda) {
779: VecCUDARestoreArrayWrite(yin,&yarray);
780: } else {
781: VecRestoreArrayWrite(yin,&yarray);
782: }
783: } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
784:       /* copy on the CPU since that is where the data is */
785: VecCopy_SeqCUDA_Private(xin,yin);
786: } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
787: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
788: if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
789:         /* copy on the CPU */
790: VecCopy_SeqCUDA_Private(xin,yin);
791: } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
792:         /* copy on the GPU */
793: VecCUDAGetArrayRead(xin,&xarray);
794: VecCUDAGetArrayWrite(yin,&yarray);
795: PetscLogGpuTimeBegin();
796: err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
797: PetscLogGpuTimeEnd();
798: VecCUDARestoreArrayRead(xin,&xarray);
799: VecCUDARestoreArrayWrite(yin,&yarray);
800: } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
801:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
802:            default to copying on the GPU (this is an arbitrary choice) */
803: VecCUDAGetArrayRead(xin,&xarray);
804: VecCUDAGetArrayWrite(yin,&yarray);
805: PetscLogGpuTimeBegin();
806: err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
807: PetscLogGpuTimeEnd();
808: VecCUDARestoreArrayRead(xin,&xarray);
809: VecCUDARestoreArrayWrite(yin,&yarray);
810: } else {
811: VecCopy_SeqCUDA_Private(xin,yin);
812: }
813: }
814: }
815: return(0);
816: }
818: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
819: {
821: PetscBLASInt one = 1,bn = 0;
822: PetscScalar *xarray,*yarray;
823: cublasHandle_t cublasv2handle;
824: cublasStatus_t cberr;
825: cudaError_t err;
828: PetscCUBLASGetHandle(&cublasv2handle);
829: PetscBLASIntCast(xin->map->n,&bn);
830: if (xin != yin) {
831: VecCUDAGetArray(xin,&xarray);
832: VecCUDAGetArray(yin,&yarray);
833: PetscLogGpuTimeBegin();
834: cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
835: err = WaitForCUDA();CHKERRCUDA(err);
836: PetscLogGpuTimeEnd();
837: VecCUDARestoreArray(xin,&xarray);
838: VecCUDARestoreArray(yin,&yarray);
839: }
840: return(0);
841: }
843: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
844: {
845: PetscErrorCode ierr;
846: PetscScalar a = alpha,b = beta;
847: const PetscScalar *xarray;
848: PetscScalar *yarray;
849: PetscBLASInt one = 1, bn = 0;
850: cublasHandle_t cublasv2handle;
851: cublasStatus_t cberr;
852: cudaError_t err;
855: PetscCUBLASGetHandle(&cublasv2handle);
856: PetscBLASIntCast(yin->map->n,&bn);
857: if (a == (PetscScalar)0.0) {
858: VecScale_SeqCUDA(yin,beta);
859: } else if (b == (PetscScalar)1.0) {
860: VecAXPY_SeqCUDA(yin,alpha,xin);
861: } else if (a == (PetscScalar)1.0) {
862: VecAYPX_SeqCUDA(yin,beta,xin);
863: } else if (b == (PetscScalar)0.0) {
864: VecCUDAGetArrayRead(xin,&xarray);
865: VecCUDAGetArray(yin,&yarray);
866: PetscLogGpuTimeBegin();
867: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
868: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
869: err = WaitForCUDA();CHKERRCUDA(err);
870: PetscLogGpuTimeEnd();
871: PetscLogGpuFlops(xin->map->n);
872: PetscLogCpuToGpu(sizeof(PetscScalar));
873: VecCUDARestoreArrayRead(xin,&xarray);
874: VecCUDARestoreArray(yin,&yarray);
875: } else {
876: VecCUDAGetArrayRead(xin,&xarray);
877: VecCUDAGetArray(yin,&yarray);
878: PetscLogGpuTimeBegin();
879: cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
880: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
881: VecCUDARestoreArrayRead(xin,&xarray);
882: VecCUDARestoreArray(yin,&yarray);
883: err = WaitForCUDA();CHKERRCUDA(err);
884: PetscLogGpuTimeEnd();
885: PetscLogGpuFlops(3.0*xin->map->n);
886: PetscLogCpuToGpu(2*sizeof(PetscScalar));
887: }
888: return(0);
889: }
891: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
892: {
894: PetscInt n = zin->map->n;
897: if (gamma == (PetscScalar)1.0) {
898: /* z = ax + b*y + z */
899: VecAXPY_SeqCUDA(zin,alpha,xin);
900: VecAXPY_SeqCUDA(zin,beta,yin);
901: PetscLogGpuFlops(4.0*n);
902: } else {
903: /* z = a*x + b*y + c*z */
904: VecScale_SeqCUDA(zin,gamma);
905: VecAXPY_SeqCUDA(zin,alpha,xin);
906: VecAXPY_SeqCUDA(zin,beta,yin);
907: PetscLogGpuFlops(5.0*n);
908: }
909: return(0);
910: }
912: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
913: {
914: PetscInt n = win->map->n;
915: const PetscScalar *xarray,*yarray;
916: PetscScalar *warray;
917: thrust::device_ptr<const PetscScalar> xptr,yptr;
918: thrust::device_ptr<PetscScalar> wptr;
919: PetscErrorCode ierr;
920: cudaError_t err;
923: VecCUDAGetArrayRead(xin,&xarray);
924: VecCUDAGetArrayRead(yin,&yarray);
925: VecCUDAGetArrayWrite(win,&warray);
926: PetscLogGpuTimeBegin();
927: try {
928: wptr = thrust::device_pointer_cast(warray);
929: xptr = thrust::device_pointer_cast(xarray);
930: yptr = thrust::device_pointer_cast(yarray);
931: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
932: err = WaitForCUDA();CHKERRCUDA(err);
933: } catch (char *ex) {
934: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
935: }
936: PetscLogGpuTimeEnd();
937: VecCUDARestoreArrayRead(xin,&xarray);
938: VecCUDARestoreArrayRead(yin,&yarray);
939: VecCUDARestoreArrayWrite(win,&warray);
940: PetscLogGpuFlops(n);
941: return(0);
942: }
944: /* should do infinity norm in cuda */
946: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
947: {
948: PetscErrorCode ierr;
949: PetscInt n = xin->map->n;
950: PetscBLASInt one = 1, bn = 0;
951: const PetscScalar *xarray;
952: cublasHandle_t cublasv2handle;
953: cublasStatus_t cberr;
954: cudaError_t err;
957: PetscCUBLASGetHandle(&cublasv2handle);
958: PetscBLASIntCast(n,&bn);
959: if (type == NORM_2 || type == NORM_FROBENIUS) {
960: VecCUDAGetArrayRead(xin,&xarray);
961: PetscLogGpuTimeBegin();
962: cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
963: PetscLogGpuTimeEnd();
964: VecCUDARestoreArrayRead(xin,&xarray);
965: PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
966: } else if (type == NORM_INFINITY) {
967: int i;
968: VecCUDAGetArrayRead(xin,&xarray);
969: PetscLogGpuTimeBegin();
970: cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
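    /* cublasI?amax() returns a 1-based index (BLAS/Fortran convention), hence the xarray+i-1 below */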
971: PetscLogGpuTimeEnd();
972: if (bn) {
973: PetscScalar zs;
974: err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
975: *z = PetscAbsScalar(zs);
976: } else *z = 0.0;
977: VecCUDARestoreArrayRead(xin,&xarray);
978: } else if (type == NORM_1) {
979: VecCUDAGetArrayRead(xin,&xarray);
980: PetscLogGpuTimeBegin();
981: cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
982: VecCUDARestoreArrayRead(xin,&xarray);
983: PetscLogGpuTimeEnd();
984: PetscLogGpuFlops(PetscMax(n-1.0,0.0));
985: } else if (type == NORM_1_AND_2) {
986: VecNorm_SeqCUDA(xin,NORM_1,z);
987: VecNorm_SeqCUDA(xin,NORM_2,z+1);
988: }
989: PetscLogGpuToCpu(sizeof(PetscReal));
990: return(0);
991: }
993: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
994: {
998: VecDot_SeqCUDA(s,t,dp);
999: VecDot_SeqCUDA(t,t,nm);
1000: return(0);
1001: }
1003: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1004: {
1006: cudaError_t cerr;
1007: Vec_CUDA *veccuda = (Vec_CUDA*)v->spptr;
1010: if (v->spptr) {
1011: if (veccuda->GPUarray_allocated) {
1012: #if defined(PETSC_HAVE_NVSHMEM)
1013: if (veccuda->nvshmem) {
1014: PetscNvshmemFree(veccuda->GPUarray_allocated);
1015: veccuda->nvshmem = PETSC_FALSE;
1016: }
1017: else
1018: #endif
1019: {cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);}
1020: veccuda->GPUarray_allocated = NULL;
1021: }
1022: if (veccuda->stream) {
1023: cerr = cudaStreamDestroy(veccuda->stream);CHKERRCUDA(cerr);
1024: }
1025: }
1026: VecDestroy_SeqCUDA_Private(v);
1027: PetscFree(v->spptr);
1028: return(0);
1029: }
1031: #if defined(PETSC_USE_COMPLEX)
1032: struct conjugate
1033: {
1034: __host__ __device__
1035: PetscScalar operator()(PetscScalar x)
1036: {
1037: return PetscConj(x);
1038: }
1039: };
1040: #endif
1042: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1043: {
1044: #if defined(PETSC_USE_COMPLEX)
1045: PetscScalar *xarray;
1046: PetscErrorCode ierr;
1047: PetscInt n = xin->map->n;
1048: thrust::device_ptr<PetscScalar> xptr;
1049: cudaError_t err;
1052: VecCUDAGetArray(xin,&xarray);
1053: PetscLogGpuTimeBegin();
1054: try {
1055: xptr = thrust::device_pointer_cast(xarray);
1056: thrust::transform(xptr,xptr+n,xptr,conjugate());
1057: err = WaitForCUDA();CHKERRCUDA(err);
1058: } catch (char *ex) {
1059: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1060: }
1061: PetscLogGpuTimeEnd();
1062: VecCUDARestoreArray(xin,&xarray);
1063: #else
1065: #endif
1066: return(0);
1067: }
1069: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1070: {
1072: cudaError_t err;
1079: if (w->data) {
1080: if (((Vec_Seq*)w->data)->array_allocated) {
1081: if (w->pinned_memory) {
1082: PetscMallocSetCUDAHost();
1083: }
1084: PetscFree(((Vec_Seq*)w->data)->array_allocated);
1085: if (w->pinned_memory) {
1086: PetscMallocResetCUDAHost();
1087: w->pinned_memory = PETSC_FALSE;
1088: }
1089: }
1090: ((Vec_Seq*)w->data)->array = NULL;
1091: ((Vec_Seq*)w->data)->unplacedarray = NULL;
1092: }
1093: if (w->spptr) {
1095: if (((Vec_CUDA*)w->spptr)->GPUarray) {
1096: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1097: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1098: }
1099: if (((Vec_CUDA*)w->spptr)->stream) {
1100: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1101: }
1102: PetscFree(w->spptr);
1103: }
1105: if (v->petscnative) {
1106: PetscFree(w->data);
1107: w->data = v->data;
1108: w->offloadmask = v->offloadmask;
1109: w->pinned_memory = v->pinned_memory;
1110: w->spptr = v->spptr;
1111: PetscObjectStateIncrease((PetscObject)w);
1112: } else {
1113: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1114: w->offloadmask = PETSC_OFFLOAD_CPU;
1115: VecCUDAAllocateCheck(w);
1116: }
1117: return(0);
1118: }
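/* For PETSc-native vectors the routine above copies no data: it frees w's own storage and hands
   w the data pointer, spptr, offload mask and pinned-memory flag of v; VecRestoreLocalVector_SeqCUDA()
   below returns them. Non-native vectors instead expose v's host array through VecGetArray(). */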
1120: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1121: {
1123: cudaError_t err;
1131: if (v->petscnative) {
1132: v->data = w->data;
1133: v->offloadmask = w->offloadmask;
1134: v->pinned_memory = w->pinned_memory;
1135: v->spptr = w->spptr;
1136: w->data = 0;
1137: w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1138: w->spptr = 0;
1139: } else {
1140: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1141: if ((Vec_CUDA*)w->spptr) {
1142: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1143: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1144:     if (((Vec_CUDA*)w->spptr)->stream) {
1145: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1146: }
1147: PetscFree(w->spptr);
1148: }
1149: }
1150: return(0);
1151: }
1153: struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
1154: {
1155: __host__ __device__
1156: PetscReal operator()(PetscScalar x) {
1157: return PetscRealPart(x);
1158: }
1159: };
1161: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1162: {
1163: __host__ __device__
1164: thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscScalar, PetscInt> x) {
1165: return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
1166: }
1167: };
1169: struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1170: {
1171: __host__ __device__
1172: PetscReal operator()(PetscReal x, PetscReal y) {
1173: return x < y ? y : x;
1174: }
1175: };
1177: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1178: {
1179: __host__ __device__
1180: thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscReal, PetscInt> x, thrust::tuple<PetscReal, PetscInt> y) {
1181: return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1182: (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1183: (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1184: }
1185: };
1187: struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1188: {
1189: __host__ __device__
1190: PetscReal operator()(PetscReal x, PetscReal y) {
1191: return x < y ? x : y;
1192: }
1193: };
1195: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1196: {
1197: __host__ __device__
1198: thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscReal, PetscInt> x, thrust::tuple<PetscReal, PetscInt> y) {
1199: return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1200: (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1201: (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1202: }
1203: };
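/* The functors above implement the reductions used by VecMax_SeqCUDA() and VecMin_SeqCUDA():
   petscmax/petscmin reduce plain PetscReal values, while petscmaxi/petscmini reduce (value, index)
   tuples produced by a zip iterator, breaking ties in favour of the smaller index;
   petscrealpart/petscrealparti strip the imaginary part first in complex builds. */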
1205: PetscErrorCode VecMax_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1206: {
1207: PetscErrorCode ierr;
1208: PetscInt n = v->map->n;
1209: const PetscScalar *av;
1210: thrust::device_ptr<const PetscScalar> avpt;
1214: if (!n) {
1215: *m = PETSC_MIN_REAL;
1216: if (p) *p = -1;
1217: return(0);
1218: }
1219: VecCUDAGetArrayRead(v,&av);
1220: avpt = thrust::device_pointer_cast(av);
1221: PetscLogGpuTimeBegin();
1222: if (p) {
1223: thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
1224: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1225: try {
1226: #if defined(PETSC_USE_COMPLEX)
1227: res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmaxi());
1228: #else
1229: res = thrust::reduce(zibit,zibit+n,res,petscmaxi());
1230: #endif
1231: } catch (char *ex) {
1232: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1233: }
1234: *m = res.get<0>();
1235: *p = res.get<1>();
1236: } else {
1237: try {
1238: #if defined(PETSC_USE_COMPLEX)
1239: *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
1240: #else
1241: *m = thrust::reduce(avpt,avpt+n,PETSC_MIN_REAL,petscmax());
1242: #endif
1243: } catch (char *ex) {
1244: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1245: }
1246: }
1247: PetscLogGpuTimeEnd();
1248: VecCUDARestoreArrayRead(v,&av);
1249: return(0);
1250: }
1252: PetscErrorCode VecMin_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1253: {
1254: PetscErrorCode ierr;
1255: PetscInt n = v->map->n;
1256: const PetscScalar *av;
1257: thrust::device_ptr<const PetscScalar> avpt;
1261: if (!n) {
1262: *m = PETSC_MAX_REAL;
1263: if (p) *p = -1;
1264: return(0);
1265: }
1266: VecCUDAGetArrayRead(v,&av);
1267: avpt = thrust::device_pointer_cast(av);
1268: PetscLogGpuTimeBegin();
1269: if (p) {
1270: thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
1271: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1272: try {
1273: #if defined(PETSC_USE_COMPLEX)
1274: res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmini());
1275: #else
1276: res = thrust::reduce(zibit,zibit+n,res,petscmini());
1277: #endif
1278: } catch (char *ex) {
1279: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1280: }
1281: *m = res.get<0>();
1282: *p = res.get<1>();
1283: } else {
1284: try {
1285: #if defined(PETSC_USE_COMPLEX)
1286: *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
1287: #else
1288: *m = thrust::reduce(avpt,avpt+n,PETSC_MAX_REAL,petscmin());
1289: #endif
1290: } catch (char *ex) {
1291: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1292: }
1293: }
1294: PetscLogGpuTimeEnd();
1295: VecCUDARestoreArrayRead(v,&av);
1296: return(0);
1297: }
1298: #if defined(PETSC_HAVE_NVSHMEM)
1299: /* Free old CUDA array and re-allocate a new one from nvshmem symmetric heap.
1300: New array does not retain values in the old array. The offload mask is not changed.
1302: Note: the function is only meant to be used in MatAssemblyEnd_MPIAIJCUSPARSE.
1303: */
1304: PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec v)
1305: {
1307: cudaError_t cerr;
1308: Vec_CUDA *veccuda = (Vec_CUDA*)v->spptr;
1309: PetscInt n;
1312: cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);
1313: VecGetLocalSize(v,&n);
1314: MPIU_Allreduce(MPI_IN_PLACE,&n,1,MPIU_INT,MPI_MAX,PETSC_COMM_WORLD);
1315: PetscNvshmemMalloc(n*sizeof(PetscScalar),(void**)&veccuda->GPUarray_allocated);
1316: veccuda->GPUarray = veccuda->GPUarray_allocated;
1317: veccuda->nvshmem = PETSC_TRUE;
1318: return(0);
1319: }
1320: #endif