Actual source code: densecuda.cu
petsc-3.15.0 2021-03-30
1: /*
2: Defines the matrix operations for sequential dense with CUDA
3: */
4: #include <petscpkg_version.h>
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6: #include <../src/mat/impls/dense/seq/dense.h>
7: #include <petsccublas.h>
9: /* cublas definitions are here */
10: #include <petsc/private/cudavecimpl.h>
12: #if defined(PETSC_USE_COMPLEX)
13: #if defined(PETSC_USE_REAL_SINGLE)
14: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
15: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
16: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
17: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
18: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
19: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
20: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
21: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
22: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
23: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
24: #else /* complex double */
25: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
26: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
27: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
28: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
29: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
30: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
31: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
32: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
33: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
34: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
35: #endif
36: #else /* real */
37: #if defined(PETSC_USE_REAL_SINGLE)
38: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
39: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
40: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
41: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
42: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
43: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
44: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
45: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
46: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
47: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
48: #else /* real double */
49: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h) cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
50: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
51: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i) cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
52: #define cusolverDnXpotri(a,b,c,d,e,f,g,h) cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
53: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
54: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i) cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
55: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e) cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
56: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h) cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
57: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
58: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j) cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
59: #endif
60: #endif
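/*
   The cusolverDnX* macros above dispatch, at compile time, to the cuSOLVER routine
   matching the configured PetscScalar (S/D for real single/double, C/Z for complex
   single/double), casting scalar pointers to cuComplex/cuDoubleComplex where needed.
   A sketch of the expansion: in a real double precision build,

     cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,work,lwork,info)

   expands to cusolverDnDpotrf() with the same arguments, while a complex double
   build produces cusolverDnZpotrf() with da and work cast to cuDoubleComplex*.
*/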
62: typedef struct {
63: PetscScalar *d_v; /* pointer to the matrix on the GPU */
64: PetscBool user_alloc;
65: PetscScalar *unplacedarray; /* if one called MatCUDADensePlaceArray(), this is where it stashed the original */
66: PetscBool unplaced_user_alloc;
67: /* factorization support */
68: int *d_fact_ipiv; /* device pivots */
69: PetscScalar *d_fact_work; /* device workspace */
70: int fact_lwork;
71: int *d_fact_info; /* device info */
72: /* workspace */
73: Vec workvec;
74: } Mat_SeqDenseCUDA;
76: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
77: {
78: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
79: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
80: PetscErrorCode ierr;
81: PetscBool iscuda;
82: cudaError_t cerr;
85: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&iscuda);
86: if (!iscuda) return(0);
87:   /* CPU preallocation may not have been performed yet */
88: PetscLayoutSetUp(A->rmap);
89: PetscLayoutSetUp(A->cmap);
90: if (cA->lda <= 0) cA->lda = A->rmap->n;
91: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
92: if (!d_data) { /* petsc-allocated storage */
93: PetscIntMultError(cA->lda,A->cmap->n,NULL);
94: cerr = cudaMalloc((void**)&dA->d_v,cA->lda*A->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
95: dA->user_alloc = PETSC_FALSE;
96: } else { /* user-allocated storage */
97: dA->d_v = d_data;
98: dA->user_alloc = PETSC_TRUE;
99: A->offloadmask = PETSC_OFFLOAD_GPU;
100: }
101: A->preallocated = PETSC_TRUE;
102: A->assembled = PETSC_TRUE;
103: return(0);
104: }
106: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
107: {
108: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
109: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
110: PetscErrorCode ierr;
111: cudaError_t cerr;
115: PetscInfo3(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
116: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
117: if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
118: MatSeqDenseSetPreallocation(A,NULL);
119: }
120: PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
121: if (cA->lda > A->rmap->n) {
122: PetscInt j,m = A->rmap->n;
124:       for (j=0; j<A->cmap->n; j++) { /* TODO: these strided column copies could be a single cudaMemcpy2D(); see the sketch after MatSeqDenseCUDACopyToGPU() */
125: cerr = cudaMemcpy(cA->v + j*cA->lda,dA->d_v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
126: }
127: } else {
128: cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
129: }
130: PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
131: PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);
133: A->offloadmask = PETSC_OFFLOAD_BOTH;
134: }
135: return(0);
136: }
138: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
139: {
140: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
141: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
142: PetscBool copy;
143: PetscErrorCode ierr;
144: cudaError_t cerr;
148: if (A->boundtocpu) return(0);
149: copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
150: PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
151: if (copy) {
152: if (!dA->d_v) { /* Allocate GPU memory if not present */
153: MatSeqDenseCUDASetPreallocation(A,NULL);
154: }
155: PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
156: if (cA->lda > A->rmap->n) {
157: PetscInt j,m = A->rmap->n;
159:       for (j=0; j<A->cmap->n; j++) { /* TODO: these strided column copies could be a single cudaMemcpy2D(); see the sketch after this function */
160: cerr = cudaMemcpy(dA->d_v + j*cA->lda,cA->v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
161: }
162: } else {
163: cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
164: }
165: PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
166: PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);
168: A->offloadmask = PETSC_OFFLOAD_BOTH;
169: }
170: return(0);
171: }
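/*
   The column-by-column loops marked TODO in MatSeqDenseCUDACopyFromGPU() and
   MatSeqDenseCUDACopyToGPU() above could, in principle, be replaced by one strided
   copy. A hedged sketch (untested here) for the host-to-device direction, reusing
   the same variables:

     cerr = cudaMemcpy2D(dA->d_v,cA->lda*sizeof(PetscScalar),
                         cA->v,cA->lda*sizeof(PetscScalar),
                         m*sizeof(PetscScalar),A->cmap->n,
                         cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

   cudaMemcpy2D() copies A->cmap->n rows of m*sizeof(PetscScalar) bytes each, with
   both source and destination separated by the lda pitch, i.e. exactly one matrix
   column per "row" of the 2D copy; the device-to-host loop is analogous with the
   pointers and copy kind swapped.
*/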
173: static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A,Mat B,MatStructure str)
174: {
175: Mat_SeqDense *a = (Mat_SeqDense*)A->data,*b = (Mat_SeqDense*)B->data;
176: PetscErrorCode ierr;
177: const PetscScalar *va;
178: PetscScalar *vb;
179: PetscInt lda1=a->lda,lda2=b->lda, m=A->rmap->n,n=A->cmap->n, j;
180: cudaError_t cerr;
183: /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
184: if (A->ops->copy != B->ops->copy) {
185: MatCopy_Basic(A,B,str);
186: return(0);
187: }
188: if (m != B->rmap->n || n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"size(B) != size(A)");
189: MatDenseCUDAGetArrayRead(A,&va);
190: MatDenseCUDAGetArrayWrite(B,&vb);
191: PetscLogGpuTimeBegin();
192: if (lda1>m || lda2>m) {
193: for (j=0; j<n; j++) {
194: cerr = cudaMemcpy(vb+j*lda2,va+j*lda1,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
195: }
196: } else {
197: cerr = cudaMemcpy(vb,va,m*(n*sizeof(PetscScalar)),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
198: }
199: cerr = WaitForCUDA();CHKERRCUDA(cerr);
200: PetscLogGpuTimeEnd();
201: MatDenseCUDARestoreArray(B,&vb);
202: MatDenseCUDARestoreArrayRead(A,&va);
203: return(0);
204: }
206: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
207: {
208: Mat_SeqDense *aa = (Mat_SeqDense*)A->data;
209: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
210: PetscErrorCode ierr;
213: if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
214: if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
215: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
216: if (aa->v) { MatSeqDenseCUDACopyToGPU(A); }
217: dA->unplacedarray = dA->d_v;
218: dA->unplaced_user_alloc = dA->user_alloc;
219: dA->d_v = (PetscScalar*)a;
220: dA->user_alloc = PETSC_TRUE;
221: return(0);
222: }
224: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
225: {
226: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
227: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
228: PetscErrorCode ierr;
231: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
232: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
233: if (a->v) { MatSeqDenseCUDACopyToGPU(A); }
234: dA->d_v = dA->unplacedarray;
235: dA->user_alloc = dA->unplaced_user_alloc;
236: dA->unplacedarray = NULL;
237: return(0);
238: }
240: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
241: {
242: Mat_SeqDense *aa = (Mat_SeqDense*)A->data;
243: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
244: cudaError_t cerr;
247: if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
248: if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
249: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
250: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
251: dA->d_v = (PetscScalar*)a;
252: dA->user_alloc = PETSC_FALSE;
253: return(0);
254: }
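/*
   Usage sketch for the place/reset/replace callbacks above, through their public
   wrappers (d_tmp, m and n are hypothetical, assuming the leading dimension equals m):

     PetscScalar *d_tmp;
     cudaError_t  cerr;

     cerr = cudaMalloc((void**)&d_tmp,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
     ierr = MatDenseCUDAPlaceArray(A,d_tmp);CHKERRQ(ierr);
     ...                                   use A, which now operates on d_tmp
     ierr = MatDenseCUDAResetArray(A);CHKERRQ(ierr);
     cerr = cudaFree(d_tmp);CHKERRCUDA(cerr);

   MatDenseCUDAPlaceArray() stashes the previous device array and must be undone with
   MatDenseCUDAResetArray(); MatDenseCUDAReplaceArray() instead frees the PETSc-owned
   storage and takes ownership of the new pointer (the matrix will cudaFree() it), so
   no reset or user cudaFree() is needed afterwards.
*/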
256: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
257: {
258: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
259: PetscErrorCode ierr;
262: if (!dA->d_v) {
263: MatSeqDenseCUDASetPreallocation(A,NULL);
264: }
265: *a = dA->d_v;
266: return(0);
267: }
269: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
270: {
272: *a = NULL;
273: return(0);
274: }
276: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
277: {
278: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
279: PetscErrorCode ierr;
282: MatSeqDenseCUDACopyToGPU(A);
283: *a = dA->d_v;
284: return(0);
285: }
287: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
288: {
290: *a = NULL;
291: return(0);
292: }
294: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
295: {
296: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
297: PetscErrorCode ierr;
300: MatSeqDenseCUDACopyToGPU(A);
301: *a = dA->d_v;
302: return(0);
303: }
305: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
306: {
308: *a = NULL;
309: return(0);
310: }
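/*
   Access sketch for the device-array accessors above (via the public
   MatDenseCUDAGetArrayRead()/MatDenseCUDAGetArray()/MatDenseCUDAGetArrayWrite()):
   the Read and read-write variants first call MatSeqDenseCUDACopyToGPU(), while the
   Write variant only allocates device storage if needed and skips the host-to-device
   copy, so existing host values are not transferred. For read-only use:

     const PetscScalar *d_a;

     ierr = MatDenseCUDAGetArrayRead(A,&d_a);CHKERRQ(ierr);
     ...                                   pass d_a to a kernel or cuBLAS call
     ierr = MatDenseCUDARestoreArrayRead(A,&d_a);CHKERRQ(ierr);
*/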
312: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
313: {
314: #if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
315: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
316: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
317: PetscScalar *da;
318: PetscErrorCode ierr;
319: cudaError_t ccer;
320: cusolverStatus_t cerr;
321: cusolverDnHandle_t handle;
322: int n,lda;
323: #if defined(PETSC_USE_DEBUG)
324: int info;
325: #endif
328: if (!A->rmap->n || !A->cmap->n) return(0);
329: PetscCUSOLVERDnGetHandle(&handle);
330: PetscMPIIntCast(A->cmap->n,&n);
331: PetscMPIIntCast(a->lda,&lda);
332: if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
333: else if (A->factortype == MAT_FACTOR_CHOLESKY) {
334: if (!dA->d_fact_ipiv) { /* spd */
335: int il;
337: MatDenseCUDAGetArray(A,&da);
338: cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
339: if (il > dA->fact_lwork) {
340: dA->fact_lwork = il;
342: ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
343: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
344: }
345: PetscLogGpuTimeBegin();
346: cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
347: ccer = WaitForCUDA();CHKERRCUDA(ccer);
348: PetscLogGpuTimeEnd();
349: MatDenseCUDARestoreArray(A,&da);
350:       /* TODO: write a CUDA kernel for this symmetrization */
351: MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
352: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
353: }
354: #if defined(PETSC_USE_DEBUG)
355: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
356: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
357: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
358: #endif
359: PetscLogGpuFlops(1.0*n*n*n/3.0);
360: A->ops->solve = NULL;
361: A->ops->solvetranspose = NULL;
362: A->ops->matsolve = NULL;
363: A->factortype = MAT_FACTOR_NONE;
365: PetscFree(A->solvertype);
366: return(0);
367: #else
368: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
369: #endif
370: }
372: static PetscErrorCode MatMatSolve_SeqDenseCUDA(Mat A,Mat B,Mat X)
373: {
374: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
375: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
376: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
377: const PetscScalar *da;
378: PetscScalar *dx;
379: cusolverDnHandle_t handle;
380: PetscBool iscuda;
381: int nrhs,n,lda,ldx;
382: #if defined(PETSC_USE_DEBUG)
383: int info;
384: #endif
385: cudaError_t ccer;
386: cusolverStatus_t cerr;
387: PetscErrorCode ierr;
390: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
391: if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
392: PetscObjectTypeCompareAny((PetscObject)X,&iscuda,VECSEQCUDA,VECMPICUDA,"");
393: if (X != B) {
394: MatCopy(B,X,SAME_NONZERO_PATTERN);
395: }
396: MatDenseCUDAGetArrayRead(A,&da);
397: /* MatMatSolve does not have a dispatching mechanism, we may end up with a MATSEQDENSE here */
398: PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&iscuda);
399: if (!iscuda) {
400: MatConvert(X,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&X);
401: }
402: MatDenseCUDAGetArray(X,&dx);
403: PetscMPIIntCast(A->rmap->n,&n);
404: PetscMPIIntCast(X->cmap->n,&nrhs);
405: PetscMPIIntCast(a->lda,&lda);
406: PetscMPIIntCast(x->lda,&ldx);
407: PetscCUSOLVERDnGetHandle(&handle);
408: PetscLogGpuTimeBegin();
409: if (A->factortype == MAT_FACTOR_LU) {
410: PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
411: cerr = cusolverDnXgetrs(handle,CUBLAS_OP_N,n,nrhs,da,lda,dA->d_fact_ipiv,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
412: } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
413: PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
414: if (!dA->d_fact_ipiv) { /* spd */
415:       /* note: cuda-memcheck may report a benign cudaErrorNotReady (error 34) from cudaEventQuery during this call */
416: cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,nrhs,da,lda,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
417: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
418: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
419: ccer = WaitForCUDA();CHKERRCUDA(ccer);
420: PetscLogGpuTimeEnd();
421: MatDenseCUDARestoreArrayRead(A,&da);
422: MatDenseCUDARestoreArray(X,&dx);
423: if (!iscuda) {
424: MatConvert(X,MATSEQDENSE,MAT_INPLACE_MATRIX,&X);
425: }
426: #if defined(PETSC_USE_DEBUG)
427: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
428: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
429: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
430: #endif
431: PetscLogGpuFlops(nrhs*(2.0*n*n - n));
432: return(0);
433: }
435: static PetscErrorCode MatSolve_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,PetscBool trans)
436: {
437: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
438: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
439: const PetscScalar *da;
440: PetscScalar *y;
441: cusolverDnHandle_t handle;
442: int one = 1,n,lda;
443: #if defined(PETSC_USE_DEBUG)
444: int info;
445: #endif
446: cudaError_t ccer;
447: cusolverStatus_t cerr;
448: PetscBool iscuda;
449: PetscErrorCode ierr;
452: if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
453: if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
454: PetscMPIIntCast(A->rmap->n,&n);
455: /* MatSolve does not have a dispatching mechanism, we may end up with a VECSTANDARD here */
456: PetscObjectTypeCompareAny((PetscObject)yy,&iscuda,VECSEQCUDA,VECMPICUDA,"");
457: if (iscuda) {
458: VecCopy(xx,yy);
459: VecCUDAGetArray(yy,&y);
460: } else {
461: if (!dA->workvec) {
462: MatCreateVecs(A,&dA->workvec,NULL);
463: }
464: VecCopy(xx,dA->workvec);
465: VecCUDAGetArray(dA->workvec,&y);
466: }
467: MatDenseCUDAGetArrayRead(A,&da);
468: PetscMPIIntCast(a->lda,&lda);
469: PetscCUSOLVERDnGetHandle(&handle);
470: PetscLogGpuTimeBegin();
471: if (A->factortype == MAT_FACTOR_LU) {
472: PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
473: cerr = cusolverDnXgetrs(handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,n,one,da,lda,dA->d_fact_ipiv,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
474: } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
475: PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
476: if (!dA->d_fact_ipiv) { /* spd */
477:       /* note: cuda-memcheck may report a benign cudaErrorNotReady (error 34) from cudaEventQuery during this call */
478: cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,one,da,lda,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
479: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
480: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
481: ccer = WaitForCUDA();CHKERRCUDA(ccer);
482: PetscLogGpuTimeEnd();
483: if (iscuda) {
484: VecCUDARestoreArray(yy,&y);
485: } else {
486: VecCUDARestoreArray(dA->workvec,&y);
487: VecCopy(dA->workvec,yy);
488: }
489: MatDenseCUDARestoreArrayRead(A,&da);
490: #if defined(PETSC_USE_DEBUG)
491: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
492: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
493: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
494: #endif
495: PetscLogGpuFlops(2.0*n*n - n);
496: return(0);
497: }
499: static PetscErrorCode MatSolve_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
500: {
501: PetscErrorCode ierr;
504: MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_FALSE);
505: return(0);
506: }
508: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
509: {
510: PetscErrorCode ierr;
513: MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_TRUE);
514: return(0);
515: }
517: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
518: {
519: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
520: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
521: PetscScalar *da;
522: int m,n,lda;
523: #if defined(PETSC_USE_DEBUG)
524: int info;
525: #endif
526: cusolverStatus_t cerr;
527: cusolverDnHandle_t handle;
528: cudaError_t ccer;
529: PetscErrorCode ierr;
532: if (!A->rmap->n || !A->cmap->n) return(0);
533: PetscCUSOLVERDnGetHandle(&handle);
534: MatDenseCUDAGetArray(A,&da);
535: PetscMPIIntCast(A->cmap->n,&n);
536: PetscMPIIntCast(A->rmap->n,&m);
537: PetscMPIIntCast(a->lda,&lda);
538: PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
539: if (!dA->d_fact_ipiv) {
540: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
541: }
542: if (!dA->fact_lwork) {
543: cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
544: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
545: }
546: if (!dA->d_fact_info) {
547: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
548: }
549: PetscLogGpuTimeBegin();
550: cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
551: ccer = WaitForCUDA();CHKERRCUDA(ccer);
552: PetscLogGpuTimeEnd();
553: MatDenseCUDARestoreArray(A,&da);
554: #if defined(PETSC_USE_DEBUG)
555: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
556: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
557: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
558: #endif
559: A->factortype = MAT_FACTOR_LU;
560: PetscLogGpuFlops(2.0*n*n*m/3.0);
562: A->ops->solve = MatSolve_SeqDenseCUDA;
563: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
564: A->ops->matsolve = MatMatSolve_SeqDenseCUDA;
566: PetscFree(A->solvertype);
567: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
568: return(0);
569: }
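/*
   A minimal in-place usage sketch for the factorization above (b and x are
   hypothetical vectors of conforming size; NULL orderings and a NULL MatFactorInfo
   should select the defaults):

     ierr = MatLUFactor(A,NULL,NULL,NULL);CHKERRQ(ierr);
     ierr = MatSolve(A,b,x);CHKERRQ(ierr);

   For a MATSEQDENSECUDA matrix these dispatch to MatLUFactor_SeqDenseCUDA() and
   MatSolve_SeqDenseCUDA() installed here.
*/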
571: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
572: {
573: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
574: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
575: PetscScalar *da;
576: int n,lda;
577: #if defined(PETSC_USE_DEBUG)
578: int info;
579: #endif
580: cusolverStatus_t cerr;
581: cusolverDnHandle_t handle;
582: cudaError_t ccer;
583: PetscErrorCode ierr;
586: if (!A->rmap->n || !A->cmap->n) return(0);
587: PetscCUSOLVERDnGetHandle(&handle);
588: PetscMPIIntCast(A->rmap->n,&n);
589: PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
590: if (A->spd) {
591: MatDenseCUDAGetArray(A,&da);
592: PetscMPIIntCast(a->lda,&lda);
593: if (!dA->fact_lwork) {
594: cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
595: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
596: }
597: if (!dA->d_fact_info) {
598: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
599: }
600: PetscLogGpuTimeBegin();
601: cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
602: ccer = WaitForCUDA();CHKERRCUDA(ccer);
603: PetscLogGpuTimeEnd();
605: MatDenseCUDARestoreArray(A,&da);
606: #if defined(PETSC_USE_DEBUG)
607: ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
608: if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
609: else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
610: #endif
611: A->factortype = MAT_FACTOR_CHOLESKY;
612: PetscLogGpuFlops(1.0*n*n*n/3.0);
613: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
614: #if 0
615:   /* At the time this interface was written (CUDA 10.0), cusolverDn did not implement the *sytrs and *hetr* routines.
616:      The code below should work and can be activated once the *sytrs routines become available. */
617: if (!dA->d_fact_ipiv) {
618: ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
619: }
620: if (!dA->fact_lwork) {
621: cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
622: ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
623: }
624: if (!dA->d_fact_info) {
625: ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
626: }
627: PetscLogGpuTimeBegin();
628: cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
629: PetscLogGpuTimeEnd();
630: #endif
632: A->ops->solve = MatSolve_SeqDenseCUDA;
633: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
634: A->ops->matsolve = MatMatSolve_SeqDenseCUDA;
636: PetscFree(A->solvertype);
637: PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
638: return(0);
639: }
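/*
   Note that the Cholesky path above requires the SPD flag, since only *potrf/*potrs
   are used (the *sytrf code is compiled out). A hedged in-place sketch, with b and x
   hypothetical vectors of conforming size:

     ierr = MatSetOption(A,MAT_SPD,PETSC_TRUE);CHKERRQ(ierr);
     ierr = MatCholeskyFactor(A,NULL,NULL);CHKERRQ(ierr);
     ierr = MatSolve(A,b,x);CHKERRQ(ierr);
*/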
641: /* GEMM: C = op(A)*op(B); tA and tB flag transposition of A and B, respectively */
642: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA,PetscBool tB)
643: {
644: const PetscScalar *da,*db;
645: PetscScalar *dc;
646: PetscScalar one=1.0,zero=0.0;
647: int m,n,k;
648: PetscInt alda,blda,clda;
649: PetscErrorCode ierr;
650: cublasHandle_t cublasv2handle;
651: PetscBool Aiscuda,Biscuda;
652: cublasStatus_t berr;
653: cudaError_t cerr;
656: /* we may end up with SEQDENSE as one of the arguments */
657: PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&Aiscuda);
658: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&Biscuda);
659: if (!Aiscuda) {
660: MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
661: }
662: if (!Biscuda) {
663: MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
664: }
665: PetscMPIIntCast(C->rmap->n,&m);
666: PetscMPIIntCast(C->cmap->n,&n);
667: if (tA) {
668: PetscMPIIntCast(A->rmap->n,&k);
669: } else {
670: PetscMPIIntCast(A->cmap->n,&k);
671: }
672: if (!m || !n || !k) return(0);
673: PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
674: MatDenseCUDAGetArrayRead(A,&da);
675: MatDenseCUDAGetArrayRead(B,&db);
676: MatDenseCUDAGetArrayWrite(C,&dc);
677: MatDenseGetLDA(A,&alda);
678: MatDenseGetLDA(B,&blda);
679: MatDenseGetLDA(C,&clda);
680: PetscCUBLASGetHandle(&cublasv2handle);
681: PetscLogGpuTimeBegin();
682: berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
683: m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
684: cerr = WaitForCUDA();CHKERRCUDA(cerr);
685: PetscLogGpuTimeEnd();
686: PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
687: MatDenseCUDARestoreArrayRead(A,&da);
688: MatDenseCUDARestoreArrayRead(B,&db);
689: MatDenseCUDARestoreArrayWrite(C,&dc);
690: if (!Aiscuda) {
691: MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
692: }
693: if (!Biscuda) {
694: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
695: }
696: return(0);
697: }
699: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
700: {
704: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
705: return(0);
706: }
708: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
709: {
713: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
714: return(0);
715: }
717: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
718: {
722: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
723: return(0);
724: }
726: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
727: {
731: MatProductSetFromOptions_SeqDense(C);
732: return(0);
733: }
735: /* zz = op(A)*xx + yy
736:    if yy == NULL, compute zz = op(A)*xx only (i.e. plain MatMult) */
737: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
738: {
739: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
740: const PetscScalar *xarray,*da;
741: PetscScalar *zarray;
742: PetscScalar one=1.0,zero=0.0;
743:   int m, n, lda; /* plain int is used since PetscMPIInt is typedef'ed to int */
744: cublasHandle_t cublasv2handle;
745: cublasStatus_t berr;
746: PetscErrorCode ierr;
749: if (yy && yy != zz) { /* mult add */
750: VecCopy_SeqCUDA(yy,zz);
751: }
752: if (!A->rmap->n || !A->cmap->n) {
753: if (!yy) { /* mult only */
754: VecSet_SeqCUDA(zz,0.0);
755: }
756: return(0);
757: }
758: PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
759: PetscMPIIntCast(A->rmap->n,&m);
760: PetscMPIIntCast(A->cmap->n,&n);
761: PetscMPIIntCast(mat->lda,&lda);
762: PetscCUBLASGetHandle(&cublasv2handle);
763: MatDenseCUDAGetArrayRead(A,&da);
764: VecCUDAGetArrayRead(xx,&xarray);
765: VecCUDAGetArray(zz,&zarray);
766: PetscLogGpuTimeBegin();
767: berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
768: m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
769: PetscLogGpuTimeEnd();
770: PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
771: VecCUDARestoreArrayRead(xx,&xarray);
772: VecCUDARestoreArray(zz,&zarray);
773: MatDenseCUDARestoreArrayRead(A,&da);
774: return(0);
775: }
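/*
   The cublasXgemv() call above computes zarray = 1.0*op(A)*xarray + beta*zarray with
   beta = 1 when yy is given (zz already holds yy, either because they alias or because
   of the VecCopy_SeqCUDA() at the top) and beta = 0 for a plain MatMult().
*/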
777: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
778: {
782: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
783: return(0);
784: }
786: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
787: {
791: MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
792: return(0);
793: }
795: PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
796: {
800: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
801: return(0);
802: }
804: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
805: {
809: MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
810: return(0);
811: }
813: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar **array)
814: {
815: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
819: MatSeqDenseCUDACopyFromGPU(A);
820: *array = mat->v;
821: return(0);
822: }
824: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A,PetscScalar **array)
825: {
826: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
830: if (!mat->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
831: MatSeqDenseSetPreallocation(A,NULL);
832: }
833: *array = mat->v;
834: A->offloadmask = PETSC_OFFLOAD_CPU;
835: return(0);
836: }
838: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar **array)
839: {
840: Mat_SeqDense *mat = (Mat_SeqDense*)A->data;
844: MatSeqDenseCUDACopyFromGPU(A);
845: *array = mat->v;
846: A->offloadmask = PETSC_OFFLOAD_CPU;
847: return(0);
848: }
850: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y,PetscScalar alpha)
851: {
852: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
853: PetscScalar *dy;
854: int j,N,m,lday,one = 1;
855: cublasHandle_t cublasv2handle;
856: cublasStatus_t berr;
858: cudaError_t cerr;
861: PetscCUBLASGetHandle(&cublasv2handle);
862: MatDenseCUDAGetArray(Y,&dy);
863: PetscMPIIntCast(Y->rmap->n*Y->cmap->n,&N);
864: PetscMPIIntCast(Y->rmap->n,&m);
865: PetscMPIIntCast(y->lda,&lday);
866: PetscInfo2(Y,"Performing Scale %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
867: PetscLogGpuTimeBegin();
868: if (lday>m) {
869: for (j=0; j<Y->cmap->n; j++) {
870: berr = cublasXscal(cublasv2handle,m,&alpha,dy+lday*j,one);CHKERRCUBLAS(berr);
871: }
872: } else {
873: berr = cublasXscal(cublasv2handle,N,&alpha,dy,one);CHKERRCUBLAS(berr);
874: }
875: cerr = WaitForCUDA();CHKERRCUDA(cerr);
876: PetscLogGpuTimeEnd();
877: PetscLogGpuFlops(N);
878: MatDenseCUDARestoreArray(Y,&dy);
879: return(0);
880: }
882: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
883: {
884: Mat_SeqDense *x = (Mat_SeqDense*)X->data;
885: Mat_SeqDense *y = (Mat_SeqDense*)Y->data;
886: const PetscScalar *dx;
887: PetscScalar *dy;
888: int j,N,m,ldax,lday,one = 1;
889: cublasHandle_t cublasv2handle;
890: cublasStatus_t berr;
891: PetscErrorCode ierr;
892: cudaError_t cerr;
895: if (!X->rmap->n || !X->cmap->n) return(0);
896: PetscCUBLASGetHandle(&cublasv2handle);
897: MatDenseCUDAGetArrayRead(X,&dx);
898: if (alpha != 0.0) {
899: MatDenseCUDAGetArray(Y,&dy);
900: } else {
901: MatDenseCUDAGetArrayWrite(Y,&dy);
902: }
903: PetscMPIIntCast(X->rmap->n*X->cmap->n,&N);
904: PetscMPIIntCast(X->rmap->n,&m);
905: PetscMPIIntCast(x->lda,&ldax);
906: PetscMPIIntCast(y->lda,&lday);
907: PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
908: PetscLogGpuTimeBegin();
909: if (ldax>m || lday>m) {
910: for (j=0; j<X->cmap->n; j++) {
911: berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
912: }
913: } else {
914: berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
915: }
916: cerr = WaitForCUDA();CHKERRCUDA(cerr);
917: PetscLogGpuTimeEnd();
918: PetscLogGpuFlops(PetscMax(2.*N-1,0));
919: MatDenseCUDARestoreArrayRead(X,&dx);
920: if (alpha != 0.0) {
921: MatDenseCUDARestoreArray(Y,&dy);
922: } else {
923: MatDenseCUDARestoreArrayWrite(Y,&dy);
924: }
925: return(0);
926: }
928: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
929: {
930: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
931: cudaError_t cerr;
932: PetscErrorCode ierr;
935: if (dA) {
936: if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
937: if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
938: cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
939: cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
940: cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
941: VecDestroy(&dA->workvec);
942: }
943: PetscFree(A->spptr);
944: return(0);
945: }
947: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
948: {
949: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
953:   /* prevent copying the data back from the GPU if we own the data pointer (the matrix is being destroyed) */
954: if (!a->user_alloc) { A->offloadmask = PETSC_OFFLOAD_CPU; }
955: MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
956: MatDestroy_SeqDense(A);
957: return(0);
958: }
960: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
961: {
965: MatCreate(PetscObjectComm((PetscObject)A),B);
966: MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
967: MatSetType(*B,((PetscObject)A)->type_name);
968: MatDuplicateNoCreate_SeqDense(*B,A,cpvalues);
969: if (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) {
970: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
971: const PetscScalar *da;
972: PetscScalar *db;
973: cudaError_t cerr;
974: PetscInt ldb;
976: MatDenseCUDAGetArrayRead(A,&da);
977: MatDenseCUDAGetArrayWrite(*B,&db);
978: MatDenseGetLDA(*B,&ldb);
979: if (a->lda > A->rmap->n || ldb > A->rmap->n) {
980: PetscInt j,m = A->rmap->n;
982:       for (j=0; j<A->cmap->n; j++) { /* this column loop could be a single strided copy */
983: cerr = cudaMemcpy(db+j*ldb,da+j*a->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
984: }
985: } else {
986: cerr = cudaMemcpy(db,da,(sizeof(PetscScalar)*A->cmap->n)*A->rmap->n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
987: }
988: MatDenseCUDARestoreArrayRead(A,&da);
989: MatDenseCUDARestoreArrayWrite(*B,&db);
990: (*B)->offloadmask = PETSC_OFFLOAD_BOTH;
991: }
992: return(0);
993: }
995: #include <petsc/private/vecimpl.h>
997: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A,Vec v,PetscInt col)
998: {
999: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1000: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1001: PetscErrorCode ierr;
1002: PetscScalar *x;
1003: PetscBool viscuda;
1004: cudaError_t cerr;
1007: PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
1008: if (viscuda && !v->boundtocpu) { /* update device data */
1009: VecCUDAGetArrayWrite(v,&x);
1010: if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1011:       cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1012: } else {
1013: cerr = cudaMemcpy(x,a->v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1014: }
1015: VecCUDARestoreArrayWrite(v,&x);
1016: } else { /* update host data */
1017: VecGetArrayWrite(v,&x);
1018: if (A->offloadmask & PETSC_OFFLOAD_CPU) {
1019: PetscArraycpy(x,a->v+col*a->lda,A->rmap->n);
1020: } else {
1021: cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1022: }
1023: VecRestoreArrayWrite(v,&x);
1024: }
1025: return(0);
1026: }
1028: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
1029: {
1033: MatCreate(PetscObjectComm((PetscObject)A),fact);
1034: MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1035: MatSetType(*fact,MATSEQDENSECUDA);
1036: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1037: (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1038: } else {
1039: (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1040: }
1041: (*fact)->factortype = ftype;
1043: PetscFree((*fact)->solvertype);
1044: PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
1045: return(0);
1046: }
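/*
   Sketch of how this factory is reached: either programmatically,

     Mat F;

     ierr = MatGetFactor(A,MATSOLVERCUDA,MAT_FACTOR_LU,&F);CHKERRQ(ierr);

   followed by the usual symbolic/numeric factorization and MatSolve()/MatMatSolve(),
   or at runtime through options such as -pc_type lu -pc_factor_mat_solver_type cuda
   on a dense CUDA matrix (a hedged example, not taken from this file).
*/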
1048: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1049: {
1050: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1054: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1055: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1056: MatDenseCUDAGetArray(A,(PetscScalar**)&a->ptrinuse);
1057: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1058: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1059: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1060: }
1061: a->vecinuse = col + 1;
1062: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1063: *v = a->cvec;
1064: return(0);
1065: }
1067: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1068: {
1069: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1073: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1074: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1075: a->vecinuse = 0;
1076: VecCUDAResetArray(a->cvec);
1077: MatDenseCUDARestoreArray(A,(PetscScalar**)&a->ptrinuse);
1078: *v = NULL;
1079: return(0);
1080: }
1082: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1083: {
1084: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1088: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1089: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1090: MatDenseCUDAGetArrayRead(A,&a->ptrinuse);
1091: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1092: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1093: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1094: }
1095: a->vecinuse = col + 1;
1096: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1097: VecLockReadPush(a->cvec);
1098: *v = a->cvec;
1099: return(0);
1100: }
1102: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1103: {
1104: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1108: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1109: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1110: a->vecinuse = 0;
1111: VecLockReadPop(a->cvec);
1112: VecCUDAResetArray(a->cvec);
1113: MatDenseCUDARestoreArrayRead(A,&a->ptrinuse);
1114: *v = NULL;
1115: return(0);
1116: }
1118: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1119: {
1120: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1124: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1125: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1126: MatDenseCUDAGetArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1127: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1128: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1129: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1130: }
1131: a->vecinuse = col + 1;
1132: VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1133: *v = a->cvec;
1134: return(0);
1135: }
1137: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1138: {
1139: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1143: if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1144: if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1145: a->vecinuse = 0;
1146: VecCUDAResetArray(a->cvec);
1147: MatDenseCUDARestoreArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1148: *v = NULL;
1149: return(0);
1150: }
1152: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A,PetscInt cbegin,PetscInt cend,Mat *v)
1153: {
1154: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1155: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1156: PetscErrorCode ierr;
1159: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1160: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1161: if (a->cmat && cend-cbegin != a->cmat->cmap->N) {
1162: MatDestroy(&a->cmat);
1163: }
1164: MatSeqDenseCUDACopyToGPU(A);
1165: if (!a->cmat) {
1166: MatCreateDenseCUDA(PetscObjectComm((PetscObject)A),A->rmap->n,PETSC_DECIDE,A->rmap->N,cend-cbegin,dA->d_v + (size_t)cbegin * (size_t)a->lda,&a->cmat);
1167: PetscLogObjectParent((PetscObject)A,(PetscObject)a->cmat);
1168: } else {
1169: MatDenseCUDAPlaceArray(a->cmat,dA->d_v + (size_t)cbegin * (size_t)a->lda);
1170: }
1171: MatDenseSetLDA(a->cmat,a->lda);
1172: if (a->v) { MatDensePlaceArray(a->cmat,a->v + (size_t)cbegin * (size_t)a->lda); }
1173: a->cmat->offloadmask = A->offloadmask;
1174: a->matinuse = cbegin + 1;
1175: *v = a->cmat;
1176: return(0);
1177: }
1179: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A,Mat *v)
1180: {
1181: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1185: if (!a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetSubMatrix() first");
1186: if (!a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column matrix");
1187: if (*v != a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Not the matrix obtained from MatDenseGetSubMatrix()");
1188: a->matinuse = 0;
1189: A->offloadmask = PETSC_OFFLOAD_GPU;
1190: MatDenseCUDAResetArray(a->cmat);
1191: MatDenseResetArray(a->cmat);
1192: *v = NULL;
1193: return(0);
1194: }
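/*
   Access sketch for the column-vector and sub-matrix getters above (col, cbegin and
   cend are hypothetical indices): every Get must be paired with the matching Restore
   before any other access, as enforced by the vecinuse/matinuse checks.

     Vec       cv;
     PetscReal nrm;

     ierr = MatDenseGetColumnVecRead(A,col,&cv);CHKERRQ(ierr);
     ierr = VecNorm(cv,NORM_2,&nrm);CHKERRQ(ierr);
     ierr = MatDenseRestoreColumnVecRead(A,col,&cv);CHKERRQ(ierr);

     Mat sub;

     ierr = MatDenseGetSubMatrix(A,cbegin,cend,&sub);CHKERRQ(ierr);
     ...                                   sub views columns cbegin..cend-1 of A
     ierr = MatDenseRestoreSubMatrix(A,&sub);CHKERRQ(ierr);
*/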
1196: static PetscErrorCode MatDenseSetLDA_SeqDenseCUDA(Mat A,PetscInt lda)
1197: {
1198: Mat_SeqDense *cA = (Mat_SeqDense*)A->data;
1199: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1200: PetscBool data;
1203: data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1204: if (!dA->user_alloc && data && cA->lda!=lda) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"LDA cannot be changed after allocation of internal storage");
1205: if (lda < A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"LDA %D must be at least matrix dimension %D",lda,A->rmap->n);
1206: cA->lda = lda;
1207: return(0);
1208: }
1210: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
1211: {
1212: Mat_SeqDense *a = (Mat_SeqDense*)A->data;
1216: if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1217: if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1218: A->boundtocpu = flg;
1219: if (!flg) {
1220: PetscBool iscuda;
1222: PetscObjectTypeCompare((PetscObject)a->cvec,VECSEQCUDA,&iscuda);
1223: if (!iscuda) {
1224: VecDestroy(&a->cvec);
1225: }
1226: PetscObjectTypeCompare((PetscObject)a->cmat,MATSEQDENSECUDA,&iscuda);
1227: if (!iscuda) {
1228: MatDestroy(&a->cmat);
1229: }
1230: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDenseCUDA);
1231: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArrayRead_SeqDenseCUDA);
1232: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArrayWrite_SeqDenseCUDA);
1233: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDenseCUDA);
1234: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDenseCUDA);
1235: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDenseCUDA);
1236: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1237: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDenseCUDA);
1238: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1239: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDenseCUDA);
1240: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDenseCUDA);
1241: PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDenseCUDA);
1243: A->ops->duplicate = MatDuplicate_SeqDenseCUDA;
1244: A->ops->mult = MatMult_SeqDenseCUDA;
1245: A->ops->multadd = MatMultAdd_SeqDenseCUDA;
1246: A->ops->multtranspose = MatMultTranspose_SeqDenseCUDA;
1247: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDenseCUDA;
1248: A->ops->matmultnumeric = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1249: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1250: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1251: A->ops->axpy = MatAXPY_SeqDenseCUDA;
1252: A->ops->choleskyfactor = MatCholeskyFactor_SeqDenseCUDA;
1253: A->ops->lufactor = MatLUFactor_SeqDenseCUDA;
1254: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDenseCUDA;
1255: A->ops->getcolumnvector = MatGetColumnVector_SeqDenseCUDA;
1256: A->ops->scale = MatScale_SeqDenseCUDA;
1257: A->ops->copy = MatCopy_SeqDenseCUDA;
1258: } else {
1259: /* make sure we have an up-to-date copy on the CPU */
1260: MatSeqDenseCUDACopyFromGPU(A);
1261: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDense);
1262: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArray_SeqDense);
1263: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArray_SeqDense);
1264: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDense);
1265: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDense);
1266: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDense);
1267: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDense);
1268: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDense);
1269: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDense);
1270: PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDense);
1271: PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDense);
1272: PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDense);
1274: A->ops->duplicate = MatDuplicate_SeqDense;
1275: A->ops->mult = MatMult_SeqDense;
1276: A->ops->multadd = MatMultAdd_SeqDense;
1277: A->ops->multtranspose = MatMultTranspose_SeqDense;
1278: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDense;
1279: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1280: A->ops->matmultnumeric = MatMatMultNumeric_SeqDense_SeqDense;
1281: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1282: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1283: A->ops->axpy = MatAXPY_SeqDense;
1284: A->ops->choleskyfactor = MatCholeskyFactor_SeqDense;
1285: A->ops->lufactor = MatLUFactor_SeqDense;
1286: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1287: A->ops->getcolumnvector = MatGetColumnVector_SeqDense;
1288: A->ops->scale = MatScale_SeqDense;
1289: A->ops->copy = MatCopy_SeqDense;
1290: }
1291: if (a->cmat) {
1292: MatBindToCPU(a->cmat,flg);
1293: }
1294: return(0);
1295: }
1297: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1298: {
1299: Mat B;
1300: PetscErrorCode ierr;
1303: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1304: /* TODO these cases should be optimized */
1305: MatConvert_Basic(M,type,reuse,newmat);
1306: return(0);
1307: }
1309: B = *newmat;
1310: MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
1311: MatReset_SeqDenseCUDA(B);
1312: PetscFree(B->defaultvectype);
1313: PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
1314: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
1315: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
1316: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",NULL);
1317: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",NULL);
1318: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",NULL);
1319: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",NULL);
1320: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",NULL);
1321: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",NULL);
1322: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",NULL);
1323: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",NULL);
1324: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",NULL);
1325: PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",NULL);
1327: B->ops->bindtocpu = NULL;
1328: B->ops->destroy = MatDestroy_SeqDense;
1329: B->offloadmask = PETSC_OFFLOAD_CPU;
1330: return(0);
1331: }
1333: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1334: {
1335: Mat_SeqDenseCUDA *dB;
1336: Mat B;
1337: PetscErrorCode ierr;
1340: PetscCUDAInitializeCheck();
1341: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1342: /* TODO these cases should be optimized */
1343: MatConvert_Basic(M,type,reuse,newmat);
1344: return(0);
1345: }
1347: B = *newmat;
1348: PetscFree(B->defaultvectype);
1349: PetscStrallocpy(VECCUDA,&B->defaultvectype);
1350: PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
1351: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C", MatConvert_SeqDenseCUDA_SeqDense);
1352: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C", MatDenseCUDAGetArray_SeqDenseCUDA);
1353: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C", MatDenseCUDAGetArrayRead_SeqDenseCUDA);
1354: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C", MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
1355: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C", MatDenseCUDARestoreArray_SeqDenseCUDA);
1356: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C", MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
1357: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C", MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
1358: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C", MatDenseCUDAPlaceArray_SeqDenseCUDA);
1359: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C", MatDenseCUDAResetArray_SeqDenseCUDA);
1360: PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C", MatDenseCUDAReplaceArray_SeqDenseCUDA);
1361: PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",MatProductSetFromOptions_SeqAIJ_SeqDense);
1363: PetscNewLog(B,&dB);
1364: B->spptr = dB;
1366: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1368: MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
1369: B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1370: B->ops->destroy = MatDestroy_SeqDenseCUDA;
1371: return(0);
1372: }
1374: /*@C
1375: MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.
1377: Collective
1379: Input Parameters:
1380: + comm - MPI communicator
1381: . m - number of rows
1382: . n - number of columns
1383: - data - optional location of GPU matrix data. Set data=NULL for PETSc
1384: to control matrix memory allocation.
1386: Output Parameter:
1387: . A - the matrix
1389: Notes:
1391: Level: intermediate
1393: .seealso: MatCreate(), MatCreateSeqDense()
1394: @*/
1395: PetscErrorCode MatCreateSeqDenseCUDA(MPI_Comm comm,PetscInt m,PetscInt n,PetscScalar *data,Mat *A)
1396: {
1398: PetscMPIInt size;
1401: MPI_Comm_size(comm,&size);
1402: if (size > 1) SETERRQ1(comm,PETSC_ERR_ARG_WRONG,"Invalid communicator size %d",size);
1403: MatCreate(comm,A);
1404: MatSetSizes(*A,m,n,m,n);
1405: MatSetType(*A,MATSEQDENSECUDA);
1406: MatSeqDenseCUDASetPreallocation(*A,data);
1407: return(0);
1408: }
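/*
   Usage sketch (d_user is a hypothetical device buffer of at least m*n scalars in
   column-major order; pass NULL instead to let PETSc allocate the device storage):

     Mat          A;
     PetscScalar *d_user;
     cudaError_t  cerr;

     cerr = cudaMalloc((void**)&d_user,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
     ierr = MatCreateSeqDenseCUDA(PETSC_COMM_SELF,m,n,d_user,&A);CHKERRQ(ierr);
     ...                                   use A; it does not take ownership of d_user
     ierr = MatDestroy(&A);CHKERRQ(ierr);
     cerr = cudaFree(d_user);CHKERRCUDA(cerr);
*/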
1410: /*MC
1411: MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.
1413: Options Database Keys:
1414: . -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()
1416: Level: beginner
1417: M*/
1418: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1419: {
1423: PetscCUDAInitializeCheck();
1424: MatCreate_SeqDense(B);
1425: MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1426: return(0);
1427: }