Actual source code: densecuda.cu

petsc-3.15.0 2021-03-30
  1: /*
  2:      Defines the matrix operations for sequential dense with CUDA
  3: */
  4: #include <petscpkg_version.h>
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  6: #include <../src/mat/impls/dense/seq/dense.h>
  7: #include <petsccublas.h>

  9: /* cublas definitions are here */
 10: #include <petsc/private/cudavecimpl.h>

 12: #if defined(PETSC_USE_COMPLEX)
 13: #if defined(PETSC_USE_REAL_SINGLE)
 14: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 15: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 16: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
 17: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 18: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 19: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
 20: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
 21: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 22: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 23: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
 24: #else /* complex double */
 25: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 26: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 27: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
 28: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 29: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 30: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
 31: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
 32: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 33: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 34: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
 35: #endif
 36: #else /* real */
 37: #if defined(PETSC_USE_REAL_SINGLE)
 38: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
 39: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
 40: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
 41: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
 42: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
 43: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
 44: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
 45: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
 46: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
 47: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
 48: #else /* real double */
 49: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
 50: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
 51: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
 52: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
 53: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
 54: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
 55: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
 56: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
 57: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
 58: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
 59: #endif
 60: #endif

 62: typedef struct {
 63:   PetscScalar *d_v; /* pointer to the matrix on the GPU */
 64:   PetscBool   user_alloc;
 65:   PetscScalar *unplacedarray; /* if one called MatDenseCUDAPlaceArray(), this is where the original is stashed */
 66:   PetscBool   unplaced_user_alloc;
 67:   /* factorization support */
 68:   int         *d_fact_ipiv; /* device pivots */
 69:   PetscScalar *d_fact_work; /* device workspace */
 70:   int         fact_lwork;
 71:   int         *d_fact_info; /* device info */
 72:   /* workspace */
 73:   Vec         workvec;
 74: } Mat_SeqDenseCUDA;

 76: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
 77: {
 78:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
 79:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
 80:   PetscErrorCode   ierr;
 81:   PetscBool        iscuda;
 82:   cudaError_t      cerr;

 85:   PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&iscuda);
 86:   if (!iscuda) return(0);
 87:   /* CPU preallocation may not have been performed yet */
 88:   PetscLayoutSetUp(A->rmap);
 89:   PetscLayoutSetUp(A->cmap);
 90:   if (cA->lda <= 0) cA->lda = A->rmap->n;
 91:   if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
 92:   if (!d_data) { /* petsc-allocated storage */
 93:     PetscIntMultError(cA->lda,A->cmap->n,NULL);
 94:     cerr = cudaMalloc((void**)&dA->d_v,cA->lda*A->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
 95:     dA->user_alloc = PETSC_FALSE;
 96:   } else { /* user-allocated storage */
 97:     dA->d_v        = d_data;
 98:     dA->user_alloc = PETSC_TRUE;
 99:     A->offloadmask = PETSC_OFFLOAD_GPU;
100:   }
101:   A->preallocated = PETSC_TRUE;
102:   A->assembled = PETSC_TRUE;
103:   return(0);
104: }

106: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
107: {
108:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
109:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
110:   PetscErrorCode   ierr;
111:   cudaError_t      cerr;

115:   PetscInfo3(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
116:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
117:     if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
118:       MatSeqDenseSetPreallocation(A,NULL);
119:     }
120:     PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
121:     if (cA->lda > A->rmap->n) {
122:       PetscInt j,m = A->rmap->n;

124:       for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
125:         cerr = cudaMemcpy(cA->v + j*cA->lda,dA->d_v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
126:       }
127:     } else {
128:       cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
129:     }
130:     PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
131:     PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);

133:     A->offloadmask = PETSC_OFFLOAD_BOTH;
134:   }
135:   return(0);
136: }

138: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
139: {
140:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
141:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
142:   PetscBool        copy;
143:   PetscErrorCode   ierr;
144:   cudaError_t      cerr;

148:   if (A->boundtocpu) return(0);
149:   copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
150:   PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
151:   if (copy) {
152:     if (!dA->d_v) { /* Allocate GPU memory if not present */
153:       MatSeqDenseCUDASetPreallocation(A,NULL);
154:     }
155:     PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
156:     if (cA->lda > A->rmap->n) {
157:       PetscInt j,m = A->rmap->n;

159:       for (j=0; j<A->cmap->n; j++) { /* TODO: it can be done better */
160:         cerr = cudaMemcpy(dA->d_v + j*cA->lda,cA->v + j*cA->lda,m*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
161:       }
162:     } else {
163:       cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
164:     }
165:     PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
166:     PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);

168:     A->offloadmask = PETSC_OFFLOAD_BOTH;
169:   }
170:   return(0);
171: }

173: static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A,Mat B,MatStructure str)
174: {
175:   Mat_SeqDense      *a = (Mat_SeqDense*)A->data,*b = (Mat_SeqDense*)B->data;
176:   PetscErrorCode    ierr;
177:   const PetscScalar *va;
178:   PetscScalar       *vb;
179:   PetscInt          lda1=a->lda,lda2=b->lda, m=A->rmap->n,n=A->cmap->n, j;
180:   cudaError_t       cerr;

183:   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
184:   if (A->ops->copy != B->ops->copy) {
185:     MatCopy_Basic(A,B,str);
186:     return(0);
187:   }
188:   if (m != B->rmap->n || n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"size(B) != size(A)");
189:   MatDenseCUDAGetArrayRead(A,&va);
190:   MatDenseCUDAGetArrayWrite(B,&vb);
191:   PetscLogGpuTimeBegin();
192:   if (lda1>m || lda2>m) {
193:     for (j=0; j<n; j++) {
194:       cerr = cudaMemcpy(vb+j*lda2,va+j*lda1,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
195:     }
196:   } else {
197:     cerr = cudaMemcpy(vb,va,m*(n*sizeof(PetscScalar)),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
198:   }
199:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
200:   PetscLogGpuTimeEnd();
201:   MatDenseCUDARestoreArray(B,&vb);
202:   MatDenseCUDARestoreArrayRead(A,&va);
203:   return(0);
204: }

206: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
207: {
208:   Mat_SeqDense     *aa = (Mat_SeqDense*)A->data;
209:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
210:   PetscErrorCode   ierr;

213:   if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
214:   if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
215:   if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
216:   if (aa->v) { MatSeqDenseCUDACopyToGPU(A); }
217:   dA->unplacedarray = dA->d_v;
218:   dA->unplaced_user_alloc = dA->user_alloc;
219:   dA->d_v = (PetscScalar*)a;
220:   dA->user_alloc = PETSC_TRUE;
221:   return(0);
222: }

224: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
225: {
226:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
227:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
228:   PetscErrorCode   ierr;

231:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
232:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
233:   if (a->v) { MatSeqDenseCUDACopyToGPU(A); }
234:   dA->d_v = dA->unplacedarray;
235:   dA->user_alloc = dA->unplaced_user_alloc;
236:   dA->unplacedarray = NULL;
237:   return(0);
238: }

240: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
241: {
242:   Mat_SeqDense     *aa = (Mat_SeqDense*)A->data;
243:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
244:   cudaError_t      cerr;

247:   if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
248:   if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
249:   if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
250:   if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
251:   dA->d_v = (PetscScalar*)a;
252:   dA->user_alloc = PETSC_FALSE;
253:   return(0);
254: }

256: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
257: {
258:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
259:   PetscErrorCode   ierr;

262:   if (!dA->d_v) {
263:     MatSeqDenseCUDASetPreallocation(A,NULL);
264:   }
265:   *a = dA->d_v;
266:   return(0);
267: }

269: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
270: {
272:   *a = NULL;
273:   return(0);
274: }

276: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
277: {
278:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
279:   PetscErrorCode   ierr;

282:   MatSeqDenseCUDACopyToGPU(A);
283:   *a   = dA->d_v;
284:   return(0);
285: }

287: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
288: {
290:   *a = NULL;
291:   return(0);
292: }

294: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
295: {
296:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
297:   PetscErrorCode   ierr;

300:   MatSeqDenseCUDACopyToGPU(A);
301:   *a   = dA->d_v;
302:   return(0);
303: }

305: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
306: {
308:   *a = NULL;
309:   return(0);
310: }

312: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
313: {
314: #if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
315:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
316:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
317:   PetscScalar        *da;
318:   PetscErrorCode     ierr;
319:   cudaError_t        ccer;
320:   cusolverStatus_t   cerr;
321:   cusolverDnHandle_t handle;
322:   int                n,lda;
323: #if defined(PETSC_USE_DEBUG)
324:   int                info;
325: #endif

328:   if (!A->rmap->n || !A->cmap->n) return(0);
329:   PetscCUSOLVERDnGetHandle(&handle);
330:   PetscMPIIntCast(A->cmap->n,&n);
331:   PetscMPIIntCast(a->lda,&lda);
332:   if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
333:   else if (A->factortype == MAT_FACTOR_CHOLESKY) {
334:     if (!dA->d_fact_ipiv) { /* spd */
335:       int il;

337:       MatDenseCUDAGetArray(A,&da);
338:       cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
339:       if (il > dA->fact_lwork) {
340:         dA->fact_lwork = il;

342:         ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
343:         ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
344:       }
345:       PetscLogGpuTimeBegin();
346:       cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
347:       ccer = WaitForCUDA();CHKERRCUDA(ccer);
348:       PetscLogGpuTimeEnd();
349:       MatDenseCUDARestoreArray(A,&da);
350:       /* TODO (write cuda kernel) */
351:       MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
352:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
353:   }
354: #if defined(PETSC_USE_DEBUG)
355:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
356:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
357:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
358: #endif
359:   PetscLogGpuFlops(1.0*n*n*n/3.0);
360:   A->ops->solve          = NULL;
361:   A->ops->solvetranspose = NULL;
362:   A->ops->matsolve       = NULL;
363:   A->factortype          = MAT_FACTOR_NONE;

365:   PetscFree(A->solvertype);
366:   return(0);
367: #else
368:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
369: #endif
370: }

372: static PetscErrorCode MatMatSolve_SeqDenseCUDA(Mat A,Mat B,Mat X)
373: {
374:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
375:   Mat_SeqDense       *x = (Mat_SeqDense*)X->data;
376:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
377:   const PetscScalar  *da;
378:   PetscScalar        *dx;
379:   cusolverDnHandle_t handle;
380:   PetscBool          iscuda;
381:   int                nrhs,n,lda,ldx;
382: #if defined(PETSC_USE_DEBUG)
383:   int                info;
384: #endif
385:   cudaError_t        ccer;
386:   cusolverStatus_t   cerr;
387:   PetscErrorCode     ierr;

390:   if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
391:   if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
392:   PetscObjectTypeCompareAny((PetscObject)X,&iscuda,VECSEQCUDA,VECMPICUDA,"");
393:   if (X != B) {
394:     MatCopy(B,X,SAME_NONZERO_PATTERN);
395:   }
396:   MatDenseCUDAGetArrayRead(A,&da);
397:   /* MatMatSolve does not have a dispatching mechanism; we may end up with a MATSEQDENSE here */
398:   PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&iscuda);
399:   if (!iscuda) {
400:     MatConvert(X,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&X);
401:   }
402:   MatDenseCUDAGetArray(X,&dx);
403:   PetscMPIIntCast(A->rmap->n,&n);
404:   PetscMPIIntCast(X->cmap->n,&nrhs);
405:   PetscMPIIntCast(a->lda,&lda);
406:   PetscMPIIntCast(x->lda,&ldx);
407:   PetscCUSOLVERDnGetHandle(&handle);
408:   PetscLogGpuTimeBegin();
409:   if (A->factortype == MAT_FACTOR_LU) {
410:     PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
411:     cerr = cusolverDnXgetrs(handle,CUBLAS_OP_N,n,nrhs,da,lda,dA->d_fact_ipiv,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
412:   } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
413:     PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
414:     if (!dA->d_fact_ipiv) { /* spd */
415:       /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
416:       cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,nrhs,da,lda,dx,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
417:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
418:   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
419:   ccer = WaitForCUDA();CHKERRCUDA(ccer);
420:   PetscLogGpuTimeEnd();
421:   MatDenseCUDARestoreArrayRead(A,&da);
422:   MatDenseCUDARestoreArray(X,&dx);
423:   if (!iscuda) {
424:     MatConvert(X,MATSEQDENSE,MAT_INPLACE_MATRIX,&X);
425:   }
426: #if defined(PETSC_USE_DEBUG)
427:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
428:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
429:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
430: #endif
431:   PetscLogGpuFlops(nrhs*(2.0*n*n - n));
432:   return(0);
433: }

435: static PetscErrorCode MatSolve_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,PetscBool trans)
436: {
437:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
438:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
439:   const PetscScalar  *da;
440:   PetscScalar        *y;
441:   cusolverDnHandle_t handle;
442:   int                one = 1,n,lda;
443: #if defined(PETSC_USE_DEBUG)
444:   int                info;
445: #endif
446:   cudaError_t        ccer;
447:   cusolverStatus_t   cerr;
448:   PetscBool          iscuda;
449:   PetscErrorCode     ierr;

452:   if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
453:   if (!dA->d_fact_work) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
454:   PetscMPIIntCast(A->rmap->n,&n);
455:   /* MatSolve does not have a dispatching mechanism; we may end up with a VECSTANDARD here */
456:   PetscObjectTypeCompareAny((PetscObject)yy,&iscuda,VECSEQCUDA,VECMPICUDA,"");
457:   if (iscuda) {
458:     VecCopy(xx,yy);
459:     VecCUDAGetArray(yy,&y);
460:   } else {
461:     if (!dA->workvec) {
462:       MatCreateVecs(A,&dA->workvec,NULL);
463:     }
464:     VecCopy(xx,dA->workvec);
465:     VecCUDAGetArray(dA->workvec,&y);
466:   }
467:   MatDenseCUDAGetArrayRead(A,&da);
468:   PetscMPIIntCast(a->lda,&lda);
469:   PetscCUSOLVERDnGetHandle(&handle);
470:   PetscLogGpuTimeBegin();
471:   if (A->factortype == MAT_FACTOR_LU) {
472:     PetscInfo2(A,"LU solve %d x %d on backend\n",n,n);
473:     cerr = cusolverDnXgetrs(handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,n,one,da,lda,dA->d_fact_ipiv,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
474:   } else if (A->factortype == MAT_FACTOR_CHOLESKY) {
475:     PetscInfo2(A,"Cholesky solve %d x %d on backend\n",n,n);
476:     if (!dA->d_fact_ipiv) { /* spd */
477:       /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
478:       cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,n,one,da,lda,y,n,dA->d_fact_info);CHKERRCUSOLVER(cerr);
479:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
480:   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unknown factor type %d",A->factortype);
481:   ccer = WaitForCUDA();CHKERRCUDA(ccer);
482:   PetscLogGpuTimeEnd();
483:   if (iscuda) {
484:     VecCUDARestoreArray(yy,&y);
485:   } else {
486:     VecCUDARestoreArray(dA->workvec,&y);
487:     VecCopy(dA->workvec,yy);
488:   }
489:   MatDenseCUDARestoreArrayRead(A,&da);
490: #if defined(PETSC_USE_DEBUG)
491:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
492:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
493:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
494: #endif
495:   PetscLogGpuFlops(2.0*n*n - n);
496:   return(0);
497: }

499: static PetscErrorCode MatSolve_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
500: {
501:   PetscErrorCode     ierr;

504:   MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_FALSE);
505:   return(0);
506: }

508: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
509: {
510:   PetscErrorCode     ierr;

513:   MatSolve_SeqDenseCUDA_Private(A,xx,yy,PETSC_TRUE);
514:   return(0);
515: }

517: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
518: {
519:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
520:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
521:   PetscScalar        *da;
522:   int                m,n,lda;
523: #if defined(PETSC_USE_DEBUG)
524:   int                info;
525: #endif
526:   cusolverStatus_t   cerr;
527:   cusolverDnHandle_t handle;
528:   cudaError_t        ccer;
529:   PetscErrorCode     ierr;

532:   if (!A->rmap->n || !A->cmap->n) return(0);
533:   PetscCUSOLVERDnGetHandle(&handle);
534:   MatDenseCUDAGetArray(A,&da);
535:   PetscMPIIntCast(A->cmap->n,&n);
536:   PetscMPIIntCast(A->rmap->n,&m);
537:   PetscMPIIntCast(a->lda,&lda);
538:   PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
539:   if (!dA->d_fact_ipiv) {
540:     ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
541:   }
542:   if (!dA->fact_lwork) {
543:     cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
544:     ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
545:   }
546:   if (!dA->d_fact_info) {
547:     ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
548:   }
549:   PetscLogGpuTimeBegin();
550:   cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
551:   ccer = WaitForCUDA();CHKERRCUDA(ccer);
552:   PetscLogGpuTimeEnd();
553:   MatDenseCUDARestoreArray(A,&da);
554: #if defined(PETSC_USE_DEBUG)
555:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
556:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
557:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
558: #endif
559:   A->factortype = MAT_FACTOR_LU;
560:   PetscLogGpuFlops(2.0*n*n*m/3.0);

562:   A->ops->solve          = MatSolve_SeqDenseCUDA;
563:   A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
564:   A->ops->matsolve       = MatMatSolve_SeqDenseCUDA;

566:   PetscFree(A->solvertype);
567:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
568:   return(0);
569: }

571: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
572: {
573:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
574:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
575:   PetscScalar        *da;
576:   int                n,lda;
577: #if defined(PETSC_USE_DEBUG)
578:   int                info;
579: #endif
580:   cusolverStatus_t   cerr;
581:   cusolverDnHandle_t handle;
582:   cudaError_t        ccer;
583:   PetscErrorCode     ierr;

586:   if (!A->rmap->n || !A->cmap->n) return(0);
587:   PetscCUSOLVERDnGetHandle(&handle);
588:   PetscMPIIntCast(A->rmap->n,&n);
589:   PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
590:   if (A->spd) {
591:     MatDenseCUDAGetArray(A,&da);
592:     PetscMPIIntCast(a->lda,&lda);
593:     if (!dA->fact_lwork) {
594:       cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
595:       ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
596:     }
597:     if (!dA->d_fact_info) {
598:       ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
599:     }
600:     PetscLogGpuTimeBegin();
601:     cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
602:     ccer = WaitForCUDA();CHKERRCUDA(ccer);
603:     PetscLogGpuTimeEnd();

605:     MatDenseCUDARestoreArray(A,&da);
606: #if defined(PETSC_USE_DEBUG)
607:     ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(int), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
608:     if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
609:     else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
610: #endif
611:     A->factortype = MAT_FACTOR_CHOLESKY;
612:     PetscLogGpuFlops(1.0*n*n*n/3.0);
613:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
614: #if 0
615:     /* At the time of writing this interface (CUDA 10.0), cusolverDn does not implement the *sytrs and *hetr* routines.
616:        The code below should work and can be activated when the *sytrs routines become available */
617:     if (!dA->d_fact_ipiv) {
618:       ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
619:     }
620:     if (!dA->fact_lwork) {
621:       cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
622:       ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
623:     }
624:     if (!dA->d_fact_info) {
625:       ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
626:     }
627:     PetscLogGpuTimeBegin();
628:     cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
629:     PetscLogGpuTimeEnd();
630: #endif

632:   A->ops->solve          = MatSolve_SeqDenseCUDA;
633:   A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA;
634:   A->ops->matsolve       = MatMatSolve_SeqDenseCUDA;

636:   PetscFree(A->solvertype);
637:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
638:   return(0);
639: }

641: /* GEMM kernel: C = op(A)*op(B); tA and tB flag transposition of A and B, respectively */
642: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA,PetscBool tB)
643: {
644:   const PetscScalar *da,*db;
645:   PetscScalar       *dc;
646:   PetscScalar       one=1.0,zero=0.0;
647:   int               m,n,k;
648:   PetscInt          alda,blda,clda;
649:   PetscErrorCode    ierr;
650:   cublasHandle_t    cublasv2handle;
651:   PetscBool         Aiscuda,Biscuda;
652:   cublasStatus_t    berr;
653:   cudaError_t       cerr;

656:   /* we may end up with SEQDENSE as one of the arguments */
657:   PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&Aiscuda);
658:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&Biscuda);
659:   if (!Aiscuda) {
660:     MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
661:   }
662:   if (!Biscuda) {
663:     MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
664:   }
665:   PetscMPIIntCast(C->rmap->n,&m);
666:   PetscMPIIntCast(C->cmap->n,&n);
667:   if (tA) {
668:     PetscMPIIntCast(A->rmap->n,&k);
669:   } else {
670:     PetscMPIIntCast(A->cmap->n,&k);
671:   }
672:   if (!m || !n || !k) return(0);
673:   PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
674:   MatDenseCUDAGetArrayRead(A,&da);
675:   MatDenseCUDAGetArrayRead(B,&db);
676:   MatDenseCUDAGetArrayWrite(C,&dc);
677:   MatDenseGetLDA(A,&alda);
678:   MatDenseGetLDA(B,&blda);
679:   MatDenseGetLDA(C,&clda);
680:   PetscCUBLASGetHandle(&cublasv2handle);
681:   PetscLogGpuTimeBegin();
682:   berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
683:                      m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
684:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
685:   PetscLogGpuTimeEnd();
686:   PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
687:   MatDenseCUDARestoreArrayRead(A,&da);
688:   MatDenseCUDARestoreArrayRead(B,&db);
689:   MatDenseCUDARestoreArrayWrite(C,&dc);
690:   if (!Aiscuda) {
691:     MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
692:   }
693:   if (!Biscuda) {
694:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
695:   }
696:   return(0);
697: }

699: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
700: {

704:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
705:   return(0);
706: }

708: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
709: {

713:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
714:   return(0);
715: }

717: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
718: {

722:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
723:   return(0);
724: }

726: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
727: {

731:   MatProductSetFromOptions_SeqDense(C);
732:   return(0);
733: }

735: /* zz = op(A)*xx + yy
736:    if yy == NULL, only MatMult */
737: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
738: {
739:   Mat_SeqDense      *mat = (Mat_SeqDense*)A->data;
740:   const PetscScalar *xarray,*da;
741:   PetscScalar       *zarray;
742:   PetscScalar       one=1.0,zero=0.0;
743:   int               m, n, lda; /* Use PetscMPIInt as it is typedef'ed to int */
744:   cublasHandle_t    cublasv2handle;
745:   cublasStatus_t    berr;
746:   PetscErrorCode    ierr;

749:   if (yy && yy != zz) { /* mult add */
750:     VecCopy_SeqCUDA(yy,zz);
751:   }
752:   if (!A->rmap->n || !A->cmap->n) {
753:     if (!yy) { /* mult only */
754:       VecSet_SeqCUDA(zz,0.0);
755:     }
756:     return(0);
757:   }
758:   PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
759:   PetscMPIIntCast(A->rmap->n,&m);
760:   PetscMPIIntCast(A->cmap->n,&n);
761:   PetscMPIIntCast(mat->lda,&lda);
762:   PetscCUBLASGetHandle(&cublasv2handle);
763:   MatDenseCUDAGetArrayRead(A,&da);
764:   VecCUDAGetArrayRead(xx,&xarray);
765:   VecCUDAGetArray(zz,&zarray);
766:   PetscLogGpuTimeBegin();
767:   berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
768:                      m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
769:   PetscLogGpuTimeEnd();
770:   PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
771:   VecCUDARestoreArrayRead(xx,&xarray);
772:   VecCUDARestoreArray(zz,&zarray);
773:   MatDenseCUDARestoreArrayRead(A,&da);
774:   return(0);
775: }

777: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
778: {

782:   MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
783:   return(0);
784: }

786: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
787: {

791:   MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
792:   return(0);
793: }

795: PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
796: {

800:   MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
801:   return(0);
802: }

804: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
805: {

809:   MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
810:   return(0);
811: }

813: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar **array)
814: {
815:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

819:   MatSeqDenseCUDACopyFromGPU(A);
820:   *array = mat->v;
821:   return(0);
822: }

824: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A,PetscScalar **array)
825: {
826:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

830:   if (!mat->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
831:     MatSeqDenseSetPreallocation(A,NULL);
832:   }
833:   *array = mat->v;
834:   A->offloadmask = PETSC_OFFLOAD_CPU;
835:   return(0);
836: }

838: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar **array)
839: {
840:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

844:   MatSeqDenseCUDACopyFromGPU(A);
845:   *array = mat->v;
846:   A->offloadmask = PETSC_OFFLOAD_CPU;
847:   return(0);
848: }

850: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y,PetscScalar alpha)
851: {
852:   Mat_SeqDense   *y = (Mat_SeqDense*)Y->data;
853:   PetscScalar    *dy;
854:   int            j,N,m,lday,one = 1;
855:   cublasHandle_t cublasv2handle;
856:   cublasStatus_t berr;
858:   cudaError_t    cerr;

861:   PetscCUBLASGetHandle(&cublasv2handle);
862:   MatDenseCUDAGetArray(Y,&dy);
863:   PetscMPIIntCast(Y->rmap->n*Y->cmap->n,&N);
864:   PetscMPIIntCast(Y->rmap->n,&m);
865:   PetscMPIIntCast(y->lda,&lday);
866:   PetscInfo2(Y,"Performing Scale %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
867:   PetscLogGpuTimeBegin();
868:   if (lday>m) {
869:     for (j=0; j<Y->cmap->n; j++) {
870:       berr = cublasXscal(cublasv2handle,m,&alpha,dy+lday*j,one);CHKERRCUBLAS(berr);
871:     }
872:   } else {
873:     berr = cublasXscal(cublasv2handle,N,&alpha,dy,one);CHKERRCUBLAS(berr);
874:   }
875:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
876:   PetscLogGpuTimeEnd();
877:   PetscLogGpuFlops(N);
878:   MatDenseCUDARestoreArray(Y,&dy);
879:   return(0);
880: }

882: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
883: {
884:   Mat_SeqDense      *x = (Mat_SeqDense*)X->data;
885:   Mat_SeqDense      *y = (Mat_SeqDense*)Y->data;
886:   const PetscScalar *dx;
887:   PetscScalar       *dy;
888:   int               j,N,m,ldax,lday,one = 1;
889:   cublasHandle_t    cublasv2handle;
890:   cublasStatus_t    berr;
891:   PetscErrorCode    ierr;
892:   cudaError_t       cerr;

895:   if (!X->rmap->n || !X->cmap->n) return(0);
896:   PetscCUBLASGetHandle(&cublasv2handle);
897:   MatDenseCUDAGetArrayRead(X,&dx);
898:   if (alpha != 0.0) {
899:     MatDenseCUDAGetArray(Y,&dy);
900:   } else {
901:     MatDenseCUDAGetArrayWrite(Y,&dy);
902:   }
903:   PetscMPIIntCast(X->rmap->n*X->cmap->n,&N);
904:   PetscMPIIntCast(X->rmap->n,&m);
905:   PetscMPIIntCast(x->lda,&ldax);
906:   PetscMPIIntCast(y->lda,&lday);
907:   PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
908:   PetscLogGpuTimeBegin();
909:   if (ldax>m || lday>m) {
910:     for (j=0; j<X->cmap->n; j++) {
911:       berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
912:     }
913:   } else {
914:     berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
915:   }
916:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
917:   PetscLogGpuTimeEnd();
918:   PetscLogGpuFlops(PetscMax(2.*N-1,0));
919:   MatDenseCUDARestoreArrayRead(X,&dx);
920:   if (alpha != 0.0) {
921:     MatDenseCUDARestoreArray(Y,&dy);
922:   } else {
923:     MatDenseCUDARestoreArrayWrite(Y,&dy);
924:   }
925:   return(0);
926: }

928: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
929: {
930:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
931:   cudaError_t      cerr;
932:   PetscErrorCode   ierr;

935:   if (dA) {
936:     if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
937:     if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
938:     cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
939:     cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
940:     cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
941:     VecDestroy(&dA->workvec);
942:   }
943:   PetscFree(A->spptr);
944:   return(0);
945: }

947: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
948: {
949:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

953:   /* prevent copying the data back if we own the data pointer */
954:   if (!a->user_alloc) { A->offloadmask = PETSC_OFFLOAD_CPU; }
955:   MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
956:   MatDestroy_SeqDense(A);
957:   return(0);
958: }

960: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
961: {

965:   MatCreate(PetscObjectComm((PetscObject)A),B);
966:   MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
967:   MatSetType(*B,((PetscObject)A)->type_name);
968:   MatDuplicateNoCreate_SeqDense(*B,A,cpvalues);
969:   if (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) {
970:     Mat_SeqDense      *a = (Mat_SeqDense*)A->data;
971:     const PetscScalar *da;
972:     PetscScalar       *db;
973:     cudaError_t       cerr;
974:     PetscInt          ldb;

976:     MatDenseCUDAGetArrayRead(A,&da);
977:     MatDenseCUDAGetArrayWrite(*B,&db);
978:     MatDenseGetLDA(*B,&ldb);
979:     if (a->lda > A->rmap->n || ldb > A->rmap->n) {
980:       PetscInt j,m = A->rmap->n;

982:       for (j=0; j<A->cmap->n; j++) { /* it can be done better */
983:         cerr = cudaMemcpy(db+j*ldb,da+j*a->lda,m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
984:       }
985:     } else {
986:       cerr = cudaMemcpy(db,da,(sizeof(PetscScalar)*A->cmap->n)*A->rmap->n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
987:     }
988:     MatDenseCUDARestoreArrayRead(A,&da);
989:     MatDenseCUDARestoreArrayWrite(*B,&db);
990:     (*B)->offloadmask = PETSC_OFFLOAD_BOTH;
991:   }
992:   return(0);
993: }

995: #include <petsc/private/vecimpl.h>

997: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A,Vec v,PetscInt col)
998: {
999:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
1000:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1001:   PetscErrorCode   ierr;
1002:   PetscScalar      *x;
1003:   PetscBool        viscuda;
1004:   cudaError_t      cerr;

1007:   PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
1008:   if (viscuda && !v->boundtocpu) { /* update device data */
1009:     VecCUDAGetArrayWrite(v,&x);
1010:     if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1011:       cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1012:     } else {
1013:       cerr = cudaMemcpy(x,a->v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1014:     }
1015:     VecCUDARestoreArrayWrite(v,&x);
1016:   } else { /* update host data */
1017:     VecGetArrayWrite(v,&x);
1018:     if (A->offloadmask & PETSC_OFFLOAD_CPU) {
1019:       PetscArraycpy(x,a->v+col*a->lda,A->rmap->n);
1020:     } else {
1021:       cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1022:     }
1023:     VecRestoreArrayWrite(v,&x);
1024:   }
1025:   return(0);
1026: }

1028: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
1029: {

1033:   MatCreate(PetscObjectComm((PetscObject)A),fact);
1034:   MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1035:   MatSetType(*fact,MATSEQDENSECUDA);
1036:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1037:     (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1038:   } else {
1039:     (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1040:   }
1041:   (*fact)->factortype = ftype;

1043:   PetscFree((*fact)->solvertype);
1044:   PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
1045:   return(0);
1046: }

1048: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1049: {
1050:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1054:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1055:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1056:   MatDenseCUDAGetArray(A,(PetscScalar**)&a->ptrinuse);
1057:   if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1058:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1059:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1060:   }
1061:   a->vecinuse = col + 1;
1062:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1063:   *v   = a->cvec;
1064:   return(0);
1065: }

1067: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1068: {
1069:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1073:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1074:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1075:   a->vecinuse = 0;
1076:   VecCUDAResetArray(a->cvec);
1077:   MatDenseCUDARestoreArray(A,(PetscScalar**)&a->ptrinuse);
1078:   *v   = NULL;
1079:   return(0);
1080: }

1082: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1083: {
1084:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1088:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1089:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1090:   MatDenseCUDAGetArrayRead(A,&a->ptrinuse);
1091:   if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1092:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1093:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1094:   }
1095:   a->vecinuse = col + 1;
1096:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1097:   VecLockReadPush(a->cvec);
1098:   *v   = a->cvec;
1099:   return(0);
1100: }

1102: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1103: {
1104:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1108:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1109:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1110:   a->vecinuse = 0;
1111:   VecLockReadPop(a->cvec);
1112:   VecCUDAResetArray(a->cvec);
1113:   MatDenseCUDARestoreArrayRead(A,&a->ptrinuse);
1114:   *v   = NULL;
1115:   return(0);
1116: }

1118: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1119: {
1120:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1124:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1125:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1126:   MatDenseCUDAGetArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1127:   if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1128:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1129:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1130:   }
1131:   a->vecinuse = col + 1;
1132:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1133:   *v   = a->cvec;
1134:   return(0);
1135: }

1137: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1138: {
1139:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1143:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1144:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1145:   a->vecinuse = 0;
1146:   VecCUDAResetArray(a->cvec);
1147:   MatDenseCUDARestoreArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1148:   *v   = NULL;
1149:   return(0);
1150: }

1152: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A,PetscInt cbegin,PetscInt cend,Mat *v)
1153: {
1154:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
1155:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1156:   PetscErrorCode   ierr;

1159:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1160:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1161:   if (a->cmat && cend-cbegin != a->cmat->cmap->N) {
1162:     MatDestroy(&a->cmat);
1163:   }
1164:   MatSeqDenseCUDACopyToGPU(A);
1165:   if (!a->cmat) {
1166:     MatCreateDenseCUDA(PetscObjectComm((PetscObject)A),A->rmap->n,PETSC_DECIDE,A->rmap->N,cend-cbegin,dA->d_v + (size_t)cbegin * (size_t)a->lda,&a->cmat);
1167:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cmat);
1168:   } else {
1169:     MatDenseCUDAPlaceArray(a->cmat,dA->d_v + (size_t)cbegin * (size_t)a->lda);
1170:   }
1171:   MatDenseSetLDA(a->cmat,a->lda);
1172:   if (a->v) { MatDensePlaceArray(a->cmat,a->v + (size_t)cbegin * (size_t)a->lda); }
1173:   a->cmat->offloadmask = A->offloadmask;
1174:   a->matinuse = cbegin + 1;
1175:   *v = a->cmat;
1176:   return(0);
1177: }

1179: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A,Mat *v)
1180: {
1181:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1185:   if (!a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetSubMatrix() first");
1186:   if (!a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column matrix");
1187:   if (*v != a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Not the matrix obtained from MatDenseGetSubMatrix()");
1188:   a->matinuse = 0;
1189:   A->offloadmask = PETSC_OFFLOAD_GPU;
1190:   MatDenseCUDAResetArray(a->cmat);
1191:   MatDenseResetArray(a->cmat);
1192:   *v = NULL;
1193:   return(0);
1194: }

1196: static PetscErrorCode  MatDenseSetLDA_SeqDenseCUDA(Mat A,PetscInt lda)
1197: {
1198:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
1199:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1200:   PetscBool        data;

1203:   data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1204:   if (!dA->user_alloc && data && cA->lda!=lda) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"LDA cannot be changed after allocation of internal storage");
1205:   if (lda < A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"LDA %D must be at least matrix dimension %D",lda,A->rmap->n);
1206:   cA->lda = lda;
1207:   return(0);
1208: }

1210: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
1211: {
1212:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1216:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1217:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1218:   A->boundtocpu = flg;
1219:   if (!flg) {
1220:     PetscBool iscuda;

1222:     PetscObjectTypeCompare((PetscObject)a->cvec,VECSEQCUDA,&iscuda);
1223:     if (!iscuda) {
1224:       VecDestroy(&a->cvec);
1225:     }
1226:     PetscObjectTypeCompare((PetscObject)a->cmat,MATSEQDENSECUDA,&iscuda);
1227:     if (!iscuda) {
1228:       MatDestroy(&a->cmat);
1229:     }
1230:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDenseCUDA);
1231:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArrayRead_SeqDenseCUDA);
1232:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArrayWrite_SeqDenseCUDA);
1233:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDenseCUDA);
1234:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDenseCUDA);
1235:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDenseCUDA);
1236:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1237:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDenseCUDA);
1238:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1239:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDenseCUDA);
1240:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDenseCUDA);
1241:     PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDenseCUDA);

1243:     A->ops->duplicate               = MatDuplicate_SeqDenseCUDA;
1244:     A->ops->mult                    = MatMult_SeqDenseCUDA;
1245:     A->ops->multadd                 = MatMultAdd_SeqDenseCUDA;
1246:     A->ops->multtranspose           = MatMultTranspose_SeqDenseCUDA;
1247:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDenseCUDA;
1248:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1249:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1250:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1251:     A->ops->axpy                    = MatAXPY_SeqDenseCUDA;
1252:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDenseCUDA;
1253:     A->ops->lufactor                = MatLUFactor_SeqDenseCUDA;
1254:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDenseCUDA;
1255:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDenseCUDA;
1256:     A->ops->scale                   = MatScale_SeqDenseCUDA;
1257:     A->ops->copy                    = MatCopy_SeqDenseCUDA;
1258:   } else {
1259:     /* make sure we have an up-to-date copy on the CPU */
1260:     MatSeqDenseCUDACopyFromGPU(A);
1261:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDense);
1262:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArray_SeqDense);
1263:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArray_SeqDense);
1264:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDense);
1265:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDense);
1266:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDense);
1267:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDense);
1268:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDense);
1269:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDense);
1270:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDense);
1271:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDense);
1272:     PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDense);

1274:     A->ops->duplicate               = MatDuplicate_SeqDense;
1275:     A->ops->mult                    = MatMult_SeqDense;
1276:     A->ops->multadd                 = MatMultAdd_SeqDense;
1277:     A->ops->multtranspose           = MatMultTranspose_SeqDense;
1278:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDense;
1279:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1280:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDense_SeqDense;
1281:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1282:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1283:     A->ops->axpy                    = MatAXPY_SeqDense;
1284:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDense;
1285:     A->ops->lufactor                = MatLUFactor_SeqDense;
1286:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1287:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDense;
1288:     A->ops->scale                   = MatScale_SeqDense;
1289:     A->ops->copy                    = MatCopy_SeqDense;
1290:   }
1291:   if (a->cmat) {
1292:     MatBindToCPU(a->cmat,flg);
1293:   }
1294:   return(0);
1295: }

1297: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1298: {
1299:   Mat              B;
1300:   PetscErrorCode   ierr;

1303:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1304:     /* TODO these cases should be optimized */
1305:     MatConvert_Basic(M,type,reuse,newmat);
1306:     return(0);
1307:   }

1309:   B    = *newmat;
1310:   MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
1311:   MatReset_SeqDenseCUDA(B);
1312:   PetscFree(B->defaultvectype);
1313:   PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
1314:   PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
1315:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
1316:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",NULL);
1317:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",NULL);
1318:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",NULL);
1319:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",NULL);
1320:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",NULL);
1321:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",NULL);
1322:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",NULL);
1323:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",NULL);
1324:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",NULL);
1325:   PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",NULL);

1327:   B->ops->bindtocpu = NULL;
1328:   B->ops->destroy = MatDestroy_SeqDense;
1329:   B->offloadmask = PETSC_OFFLOAD_CPU;
1330:   return(0);
1331: }

1333: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1334: {
1335:   Mat_SeqDenseCUDA *dB;
1336:   Mat              B;
1337:   PetscErrorCode   ierr;

1340:   PetscCUDAInitializeCheck();
1341:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1342:     /* TODO these cases should be optimized */
1343:     MatConvert_Basic(M,type,reuse,newmat);
1344:     return(0);
1345:   }

1347:   B    = *newmat;
1348:   PetscFree(B->defaultvectype);
1349:   PetscStrallocpy(VECCUDA,&B->defaultvectype);
1350:   PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
1351:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",            MatConvert_SeqDenseCUDA_SeqDense);
1352:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",                        MatDenseCUDAGetArray_SeqDenseCUDA);
1353:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",                    MatDenseCUDAGetArrayRead_SeqDenseCUDA);
1354:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",                   MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
1355:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",                    MatDenseCUDARestoreArray_SeqDenseCUDA);
1356:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",                MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
1357:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",               MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
1358:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",                      MatDenseCUDAPlaceArray_SeqDenseCUDA);
1359:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",                      MatDenseCUDAResetArray_SeqDenseCUDA);
1360:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",                    MatDenseCUDAReplaceArray_SeqDenseCUDA);
1361:   PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",MatProductSetFromOptions_SeqAIJ_SeqDense);

1363:   PetscNewLog(B,&dB);
1364:   B->spptr = dB;

1366:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

1368:   MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
1369:   B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1370:   B->ops->destroy  = MatDestroy_SeqDenseCUDA;
1371:   return(0);
1372: }

1374: /*@C
1375:    MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.

1377:    Collective

1379:    Input Parameters:
1380: +  comm - MPI communicator
1381: .  m - number of rows
1382: .  n - number of columns
1383: -  data - optional location of GPU matrix data.  Set data=NULL for PETSc
1384:    to control matrix memory allocation.

1386:    Output Parameter:
1387: .  A - the matrix

1389:    Notes:

1391:    Level: intermediate

1393: .seealso: MatCreate(), MatCreateSeqDense()
1394: @*/
1395: PetscErrorCode  MatCreateSeqDenseCUDA(MPI_Comm comm,PetscInt m,PetscInt n,PetscScalar *data,Mat *A)
1396: {
1398:   PetscMPIInt    size;

1401:   MPI_Comm_size(comm,&size);
1402:   if (size > 1) SETERRQ1(comm,PETSC_ERR_ARG_WRONG,"Invalid communicator size %d",size);
1403:   MatCreate(comm,A);
1404:   MatSetSizes(*A,m,n,m,n);
1405:   MatSetType(*A,MATSEQDENSECUDA);
1406:   MatSeqDenseCUDASetPreallocation(*A,data);
1407:   return(0);
1408: }
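
/* Editor's usage sketch (not part of the original PETSc source): a minimal example of
   calling MatCreateSeqDenseCUDA() with user-provided device memory, assuming the caller
   manages that memory. The helper name ExampleCreateSeqDenseCUDA is hypothetical; the
   calls used (cudaMalloc, MatCreateSeqDenseCUDA, MatDestroy, cudaFree) are the documented
   ones exercised by the routine above. */
static PetscErrorCode ExampleCreateSeqDenseCUDA(PetscInt m,PetscInt n)
{
  Mat         A;
  PetscScalar *d_data;
  cudaError_t cerr;

  /* column-major device storage with lda = m; PETSc does not free user-provided memory */
  cerr = cudaMalloc((void**)&d_data,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  MatCreateSeqDenseCUDA(PETSC_COMM_SELF,m,n,d_data,&A);
  /* ... fill d_data on the device and use A ... */
  MatDestroy(&A);
  cerr = cudaFree(d_data);CHKERRCUDA(cerr);
  return(0);
}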

1410: /*MC
1411:    MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.

1413:    Options Database Keys:
1414: . -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()

1416:   Level: beginner
1417: M*/
1418: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1419: {

1423:   PetscCUDAInitializeCheck();
1424:   MatCreate_SeqDense(B);
1425:   MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1426:   return(0);
1427: }