Actual source code: aijcusparse.cu
petsc-3.15.0 2021-03-30
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_CXX_COMPLEX_FIX
7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
9: #include <petscconf.h>
10: #include <../src/mat/impls/aij/seq/aij.h>
11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
12: #include <../src/vec/vec/impls/dvecimpl.h>
13: #include <petsc/private/vecimpl.h>
14: #undef VecType
15: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16: #include <thrust/async/for_each.h>
17: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
18: #include <cooperative_groups.h>
19: #endif
20: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
21: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
22: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
23: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
25: typedef enum {
26: CUSPARSE_MV_ALG_DEFAULT = 0,
27: CUSPARSE_COOMV_ALG = 1,
28: CUSPARSE_CSRMV_ALG1 = 2,
29: CUSPARSE_CSRMV_ALG2 = 3
30: } cusparseSpMVAlg_t;
32: typedef enum {
33: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
34: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
35: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
36: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
37: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
38: CUSPARSE_SPMM_ALG_DEFAULT = 0,
39: CUSPARSE_SPMM_COO_ALG1 = 1,
40: CUSPARSE_SPMM_COO_ALG2 = 2,
41: CUSPARSE_SPMM_COO_ALG3 = 3,
42: CUSPARSE_SPMM_COO_ALG4 = 5,
43: CUSPARSE_SPMM_CSR_ALG1 = 4,
44: CUSPARSE_SPMM_CSR_ALG2 = 6,
45: } cusparseSpMMAlg_t;
47: typedef enum {
48: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
49: CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic
50: } cusparseCsr2CscAlg_t;
51: */
52: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
53: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
54: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
55: #endif
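/* Illustrative sketch of the position-based mapping described above: PetscOptionsEnum() returns the
   index of the chosen string, so entry i of each array must equal the cuSPARSE enum whose integer
   value is i. For example, with the arrays above:

     MatCUSPARSESpMVAlgorithms[2] == "CSRMV_ALG1"   and   CUSPARSE_CSRMV_ALG1    == 2
     MatCUSPARSESpMMAlgorithms[4] == "CSR_ALG1"     and   CUSPARSE_SPMM_CSR_ALG1 == 4

   This is exactly the consistency that MatSetFromOptions_SeqAIJCUSPARSE() checks when a user passes
   -mat_cusparse_spmv_alg or -mat_cusparse_spmm_alg. */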
57: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
58: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
59: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
62: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
63: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
67: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
82: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
86: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
87: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
89: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
90: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
93: PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
94: PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
96: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
98: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
99: {
100: cusparseStatus_t stat;
101: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
104: if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
105: cusparsestruct->stream = stream;
106: stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
107: return(0);
108: }
110: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
111: {
112: cusparseStatus_t stat;
113: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
116: if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
117: if (cusparsestruct->handle != handle) {
118: if (cusparsestruct->handle) {
119: stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
120: }
121: cusparsestruct->handle = handle;
122: }
123: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
124: return(0);
125: }
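/* Minimal usage sketch for the two setters above (assumes a user-created handle and stream;
   error checking omitted):

     cudaStream_t     stream;
     cusparseHandle_t handle;
     cudaStreamCreate(&stream);
     cusparseCreate(&handle);
     MatCUSPARSESetHandle(A,handle);   // A is a MATSEQAIJCUSPARSE matrix
     MatCUSPARSESetStream(A,stream);   // subsequent cuSPARSE calls for A use this stream
*/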
127: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
128: {
129: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
130: PetscBool flg;
131: PetscErrorCode ierr;
134: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
135: if (!flg || !cusparsestruct) return(0);
136: if (cusparsestruct->handle) cusparsestruct->handle = 0;
137: return(0);
138: }
140: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
141: {
143: *type = MATSOLVERCUSPARSE;
144: return(0);
145: }
147: /*MC
148: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
149: on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
150: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
151: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
152: CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
153: algorithms are not recommended. This class does NOT support direct solver operations.
155: Level: beginner
157: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158: M*/
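/* Example usage sketch for MATSOLVERCUSPARSE (error checking omitted; isrow, iscol, info, b, x are
   assumed to be set up elsewhere):

     -pc_type ilu -pc_factor_mat_solver_type cusparse        (command line, with a MATSEQAIJCUSPARSE matrix)

   or programmatically:

     Mat F;
     MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);
     MatILUFactorSymbolic(F,A,isrow,iscol,&info);
     MatLUFactorNumeric(F,A,&info);
     MatSolve(F,b,x);
*/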
160: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
161: {
163: PetscInt n = A->rmap->n;
166: MatCreate(PetscObjectComm((PetscObject)A),B);
167: MatSetSizes(*B,n,n,n,n);
168: (*B)->factortype = ftype;
169: (*B)->useordering = PETSC_TRUE;
170: MatSetType(*B,MATSEQAIJCUSPARSE);
172: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
173: MatSetBlockSizesFromMats(*B,A,A);
174: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
175: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
176: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
177: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
178: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
179: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
181: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
182: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
183: return(0);
184: }
186: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
187: {
188: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
191: switch (op) {
192: case MAT_CUSPARSE_MULT:
193: cusparsestruct->format = format;
194: break;
195: case MAT_CUSPARSE_ALL:
196: cusparsestruct->format = format;
197: break;
198: default:
199: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
200: }
201: return(0);
202: }
204: /*@
205: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
206: operation. Only the MatMult operation can use different GPU storage formats
207: for MPIAIJCUSPARSE matrices.
208: Not Collective
210: Input Parameters:
211: + A - Matrix of type SEQAIJCUSPARSE
212: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
213: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
217: Level: intermediate
219: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
220: @*/
221: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
222: {
227: PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
228: return(0);
229: }
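/* Minimal usage sketch (A is a MATSEQAIJCUSPARSE matrix; error checking omitted):

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);   // ELL storage for MatMult only
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);    // CSR storage for all operations

   or from the command line: -mat_cusparse_mult_storage_format ell
*/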
231: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
232: {
236: switch (op) {
237: case MAT_FORM_EXPLICIT_TRANSPOSE:
238: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
239: if (A->form_explicit_transpose && !flg) {MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);}
240: A->form_explicit_transpose = flg;
241: break;
242: default:
243: MatSetOption_SeqAIJ(A,op,flg);
244: break;
245: }
246: return(0);
247: }
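/* Usage sketch for the option handled above (error checking omitted):

     MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE);
     MatMultTranspose(A,x,y);   // builds (and then reuses) an explicit transpose on the GPU
*/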
249: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
251: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
252: {
253: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
254: IS isrow = b->row,iscol = b->col;
255: PetscBool row_identity,col_identity;
259: MatSeqAIJCUSPARSECopyFromGPU(A);
260: MatLUFactorNumeric_SeqAIJ(B,A,info);
261: B->offloadmask = PETSC_OFFLOAD_CPU;
262: /* determine which version of MatSolve needs to be used. */
263: ISIdentity(isrow,&row_identity);
264: ISIdentity(iscol,&col_identity);
265: if (row_identity && col_identity) {
266: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
267: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
268: B->ops->matsolve = NULL;
269: B->ops->matsolvetranspose = NULL;
270: } else {
271: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
272: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
273: B->ops->matsolve = NULL;
274: B->ops->matsolvetranspose = NULL;
275: }
277: /* get the triangular factors */
278: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
279: return(0);
280: }
282: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
283: {
284: PetscErrorCode ierr;
285: MatCUSPARSEStorageFormat format;
286: PetscBool flg;
287: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
290: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
291: if (A->factortype == MAT_FACTOR_NONE) {
292: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
293: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
294: if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}
296: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
297: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
298: if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
299: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
300: PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
301: "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
302: /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
303: if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
305: PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
306: "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
307: if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
309: PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
310: "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
311: if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
312: #endif
313: }
314: PetscOptionsTail();
315: return(0);
316: }
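/* Example of the options processed above (values are case-insensitive; the last three require CUDA >= 11):

     -mat_cusparse_storage_format csr
     -mat_cusparse_mult_storage_format ell
     -mat_cusparse_spmv_alg csrmv_alg1
     -mat_cusparse_spmm_alg csr_alg1
     -mat_cusparse_csr2csc_alg alg1
*/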
318: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
319: {
320: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
321: PetscErrorCode ierr;
324: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
325: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
326: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
327: return(0);
328: }
330: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
331: {
332: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
333: PetscErrorCode ierr;
336: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
337: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
338: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
339: return(0);
340: }
342: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
343: {
344: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
345: PetscErrorCode ierr;
348: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
349: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
350: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
351: return(0);
352: }
354: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
355: {
356: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
357: PetscErrorCode ierr;
360: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
361: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
362: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
363: return(0);
364: }
366: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
367: {
368: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
369: PetscInt n = A->rmap->n;
370: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
371: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
372: cusparseStatus_t stat;
373: const PetscInt *ai = a->i,*aj = a->j,*vi;
374: const MatScalar *aa = a->a,*v;
375: PetscInt *AiLo, *AjLo;
376: PetscInt i,nz, nzLower, offset, rowOffset;
377: PetscErrorCode ierr;
378: cudaError_t cerr;
381: if (!n) return(0);
382: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
383: try {
384: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
385: nzLower=n+ai[n]-ai[1];
386: if (!loTriFactor) {
387: PetscScalar *AALo;
389: cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
391: /* Allocate Space for the lower triangular matrix */
392: cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
393: cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
395: /* Fill the lower triangular matrix */
396: AiLo[0] = (PetscInt) 0;
397: AiLo[n] = nzLower;
398: AjLo[0] = (PetscInt) 0;
399: AALo[0] = (MatScalar) 1.0;
400: v = aa;
401: vi = aj;
402: offset = 1;
403: rowOffset= 1;
404: for (i=1; i<n; i++) {
405: nz = ai[i+1] - ai[i];
406: /* additional 1 for the term on the diagonal */
407: AiLo[i] = rowOffset;
408: rowOffset += nz+1;
410: PetscArraycpy(&(AjLo[offset]), vi, nz);
411: PetscArraycpy(&(AALo[offset]), v, nz);
413: offset += nz;
414: AjLo[offset] = (PetscInt) i;
415: AALo[offset] = (MatScalar) 1.0;
416: offset += 1;
418: v += nz;
419: vi += nz;
420: }
422: /* allocate space for the triangular factor information */
423: PetscNew(&loTriFactor);
424: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
425: /* Create the matrix description */
426: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
427: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
428: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
429: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
430: #else
431: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
432: #endif
433: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
434: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
436: /* set the operation */
437: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
439: /* set the matrix */
440: loTriFactor->csrMat = new CsrMatrix;
441: loTriFactor->csrMat->num_rows = n;
442: loTriFactor->csrMat->num_cols = n;
443: loTriFactor->csrMat->num_entries = nzLower;
445: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
446: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
448: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
449: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
451: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
452: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
454: /* Create the solve analysis information */
455: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
456: stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
457: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
458: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
459: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
460: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
461: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
462: &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
463: cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
464: #endif
466: /* perform the solve analysis */
467: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
468: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
469: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
470: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
471: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
472: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
473: #endif
474: );CHKERRCUSPARSE(stat);
475: cerr = WaitForCUDA();CHKERRCUDA(cerr);
476: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
478: /* assign the pointer */
479: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
480: loTriFactor->AA_h = AALo;
481: cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
482: cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
483: PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
484: } else { /* update values only */
485: if (!loTriFactor->AA_h) {
486: cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
487: }
488: /* Fill the lower triangular matrix */
489: loTriFactor->AA_h[0] = 1.0;
490: v = aa;
491: vi = aj;
492: offset = 1;
493: for (i=1; i<n; i++) {
494: nz = ai[i+1] - ai[i];
495: PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
496: offset += nz;
497: loTriFactor->AA_h[offset] = 1.0;
498: offset += 1;
499: v += nz;
500: }
501: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
502: PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
503: }
504: } catch(char *ex) {
505: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
506: }
507: }
508: return(0);
509: }
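/* Worked layout sketch for the lower factor built above (the unit diagonal is stored explicitly,
   appended after the strictly-lower entries of each row). For n = 3 with strictly-lower entries
   l10 (row 1) and l20, l21 (row 2), the host arrays are

     AiLo = {0, 1, 3, 6}
     AjLo = {0,   0, 1,   0, 1, 2}
     AALo = {1,  l10, 1,  l20, l21, 1}

   i.e. row 0 holds only the unit diagonal, and row i holds its strictly-lower entries followed by
   a 1.0 on the diagonal. */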
511: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
512: {
513: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
514: PetscInt n = A->rmap->n;
515: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
516: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
517: cusparseStatus_t stat;
518: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
519: const MatScalar *aa = a->a,*v;
520: PetscInt *AiUp, *AjUp;
521: PetscInt i,nz, nzUpper, offset;
522: PetscErrorCode ierr;
523: cudaError_t cerr;
526: if (!n) return(0);
527: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
528: try {
529: /* next, figure out the number of nonzeros in the upper triangular matrix. */
530: nzUpper = adiag[0]-adiag[n];
531: if (!upTriFactor) {
532: PetscScalar *AAUp;
534: cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
536: /* Allocate Space for the upper triangular matrix */
537: cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
538: cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
540: /* Fill the upper triangular matrix */
541: AiUp[0]=(PetscInt) 0;
542: AiUp[n]=nzUpper;
543: offset = nzUpper;
544: for (i=n-1; i>=0; i--) {
545: v = aa + adiag[i+1] + 1;
546: vi = aj + adiag[i+1] + 1;
548: /* number of elements NOT on the diagonal */
549: nz = adiag[i] - adiag[i+1]-1;
551: /* decrement the offset */
552: offset -= (nz+1);
554: /* first, set the diagonal elements */
555: AjUp[offset] = (PetscInt) i;
556: AAUp[offset] = (MatScalar)1./v[nz];
557: AiUp[i] = AiUp[i+1] - (nz+1);
559: PetscArraycpy(&(AjUp[offset+1]), vi, nz);
560: PetscArraycpy(&(AAUp[offset+1]), v, nz);
561: }
563: /* allocate space for the triangular factor information */
564: PetscNew(&upTriFactor);
565: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
567: /* Create the matrix description */
568: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
569: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
570: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
571: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
572: #else
573: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
574: #endif
575: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
576: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
578: /* set the operation */
579: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
581: /* set the matrix */
582: upTriFactor->csrMat = new CsrMatrix;
583: upTriFactor->csrMat->num_rows = n;
584: upTriFactor->csrMat->num_cols = n;
585: upTriFactor->csrMat->num_entries = nzUpper;
587: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
588: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
590: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
591: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
593: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
594: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
596: /* Create the solve analysis information */
597: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
598: stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
599: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
600: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
601: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
602: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
603: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
604: &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
605: cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
606: #endif
608: /* perform the solve analysis */
609: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
612: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
613: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
615: #endif
616: );CHKERRCUSPARSE(stat);
617: cerr = WaitForCUDA();CHKERRCUDA(cerr);
618: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
620: /* assign the pointer */
621: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
622: upTriFactor->AA_h = AAUp;
623: cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
624: cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
625: PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
626: } else {
627: if (!upTriFactor->AA_h) {
628: cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
629: }
630: /* Fill the upper triangular matrix */
631: offset = nzUpper;
632: for (i=n-1; i>=0; i--) {
633: v = aa + adiag[i+1] + 1;
635: /* number of elements NOT on the diagonal */
636: nz = adiag[i] - adiag[i+1]-1;
638: /* decrement the offset */
639: offset -= (nz+1);
641: /* first, set the diagonal elements */
642: upTriFactor->AA_h[offset] = 1./v[nz];
643: PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);
644: }
645: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
646: PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
647: }
648: } catch(char *ex) {
649: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
650: }
651: }
652: return(0);
653: }
655: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
656: {
657: PetscErrorCode ierr;
658: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
659: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
660: IS isrow = a->row,iscol = a->icol;
661: PetscBool row_identity,col_identity;
662: PetscInt n = A->rmap->n;
665: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
666: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
667: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
669: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
670: cusparseTriFactors->nnz=a->nz;
672: A->offloadmask = PETSC_OFFLOAD_BOTH;
673: /* row permutation indices (applied to the right-hand side before the lower triangular solve) */
674: ISIdentity(isrow,&row_identity);
675: if (!row_identity && !cusparseTriFactors->rpermIndices) {
676: const PetscInt *r;
678: ISGetIndices(isrow,&r);
679: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
680: cusparseTriFactors->rpermIndices->assign(r, r+n);
681: ISRestoreIndices(isrow,&r);
682: PetscLogCpuToGpu(n*sizeof(PetscInt));
683: }
685: /* column permutation indices (applied to the solution after the upper triangular solve) */
686: ISIdentity(iscol,&col_identity);
687: if (!col_identity && !cusparseTriFactors->cpermIndices) {
688: const PetscInt *c;
690: ISGetIndices(iscol,&c);
691: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
692: cusparseTriFactors->cpermIndices->assign(c, c+n);
693: ISRestoreIndices(iscol,&c);
694: PetscLogCpuToGpu(n*sizeof(PetscInt));
695: }
696: return(0);
697: }
699: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
700: {
701: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
702: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
703: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
704: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
705: cusparseStatus_t stat;
706: PetscErrorCode ierr;
707: cudaError_t cerr;
708: PetscInt *AiUp, *AjUp;
709: PetscScalar *AAUp;
710: PetscScalar *AALo;
711: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
712: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
713: const PetscInt *ai = b->i,*aj = b->j,*vj;
714: const MatScalar *aa = b->a,*v;
717: if (!n) return(0);
718: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
719: try {
720: cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
721: cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
722: if (!upTriFactor && !loTriFactor) {
723: /* Allocate Space for the upper triangular matrix */
724: cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
725: cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
727: /* Fill the upper triangular matrix */
728: AiUp[0]=(PetscInt) 0;
729: AiUp[n]=nzUpper;
730: offset = 0;
731: for (i=0; i<n; i++) {
732: /* set the pointers */
733: v = aa + ai[i];
734: vj = aj + ai[i];
735: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
737: /* first, set the diagonal elements */
738: AjUp[offset] = (PetscInt) i;
739: AAUp[offset] = (MatScalar)1.0/v[nz];
740: AiUp[i] = offset;
741: AALo[offset] = (MatScalar)1.0/v[nz];
743: offset+=1;
744: if (nz>0) {
745: PetscArraycpy(&(AjUp[offset]), vj, nz);
746: PetscArraycpy(&(AAUp[offset]), v, nz);
747: for (j=offset; j<offset+nz; j++) {
748: AAUp[j] = -AAUp[j];
749: AALo[j] = AAUp[j]/v[nz];
750: }
751: offset+=nz;
752: }
753: }
755: /* allocate space for the triangular factor information */
756: PetscNew(&upTriFactor);
757: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
759: /* Create the matrix description */
760: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
761: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
762: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
763: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
764: #else
765: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
766: #endif
767: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
768: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
770: /* set the matrix */
771: upTriFactor->csrMat = new CsrMatrix;
772: upTriFactor->csrMat->num_rows = A->rmap->n;
773: upTriFactor->csrMat->num_cols = A->cmap->n;
774: upTriFactor->csrMat->num_entries = a->nz;
776: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
777: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
779: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
780: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
782: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
783: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
785: /* set the operation */
786: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
788: /* Create the solve analysis information */
789: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
790: stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
791: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
792: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
793: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
794: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
795: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
796: &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
797: cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
798: #endif
800: /* perform the solve analysis */
801: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
802: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
803: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
804: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
805: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
806: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
807: #endif
808: );CHKERRCUSPARSE(stat);
809: cerr = WaitForCUDA();CHKERRCUDA(cerr);
810: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
812: /* assign the pointer */
813: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
815: /* allocate space for the triangular factor information */
816: PetscNew(&loTriFactor);
817: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
819: /* Create the matrix description */
820: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
821: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
822: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
823: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
824: #else
825: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
826: #endif
827: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
828: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
830: /* set the operation */
831: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
833: /* set the matrix */
834: loTriFactor->csrMat = new CsrMatrix;
835: loTriFactor->csrMat->num_rows = A->rmap->n;
836: loTriFactor->csrMat->num_cols = A->cmap->n;
837: loTriFactor->csrMat->num_entries = a->nz;
839: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
840: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
842: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
843: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
845: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
846: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
848: /* Create the solve analysis information */
849: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
850: stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
851: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
852: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
853: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
854: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
855: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
856: &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
857: cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
858: #endif
860: /* perform the solve analysis */
861: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
862: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
863: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
864: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
865: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
866: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
867: #endif
868: );CHKERRCUSPARSE(stat);
869: cerr = WaitForCUDA();CHKERRCUDA(cerr);
870: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
872: /* assign the pointer */
873: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
875: PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
876: cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
877: cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
878: } else {
879: /* Fill the upper triangular matrix */
880: offset = 0;
881: for (i=0; i<n; i++) {
882: /* set the pointers */
883: v = aa + ai[i];
884: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
886: /* first, set the diagonal elements */
887: AAUp[offset] = 1.0/v[nz];
888: AALo[offset] = 1.0/v[nz];
890: offset+=1;
891: if (nz>0) {
892: PetscArraycpy(&(AAUp[offset]), v, nz);
893: for (j=offset; j<offset+nz; j++) {
894: AAUp[j] = -AAUp[j];
895: AALo[j] = AAUp[j]/v[nz];
896: }
897: offset+=nz;
898: }
899: }
900: if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
901: if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
902: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
903: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
904: PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
905: }
906: cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
907: cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
908: } catch(char *ex) {
909: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
910: }
911: }
912: return(0);
913: }
915: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
916: {
917: PetscErrorCode ierr;
918: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
919: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
920: IS ip = a->row;
921: PetscBool perm_identity;
922: PetscInt n = A->rmap->n;
925: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
926: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
927: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
928: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
930: A->offloadmask = PETSC_OFFLOAD_BOTH;
932: /* row and column permutation indices for the Cholesky/ICC ordering */
933: ISIdentity(ip,&perm_identity);
934: if (!perm_identity) {
935: IS iip;
936: const PetscInt *irip,*rip;
938: ISInvertPermutation(ip,PETSC_DECIDE,&iip);
939: ISGetIndices(iip,&irip);
940: ISGetIndices(ip,&rip);
941: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
942: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
943: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
944: cusparseTriFactors->cpermIndices->assign(irip, irip+n);
945: ISRestoreIndices(iip,&irip);
946: ISDestroy(&iip);
947: ISRestoreIndices(ip,&rip);
948: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
949: }
950: return(0);
951: }
953: #define CHECK_LAUNCH_ERROR() \
954: do { \
955: /* Check synchronous errors, i.e. pre-launch */ \
956: cudaError_t err = cudaGetLastError(); \
957: if (cudaSuccess != err) { \
958: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
959: } \
960: /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
961: err = cudaDeviceSynchronize(); \
962: if (cudaSuccess != err) { \
963: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
964: } \
965: } while (0)
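/* Usage sketch: invoke the macro right after a kernel launch so that both launch-time and
   execution-time failures surface as PETSc errors (illustrative kernel name):

     MyKernel<<<blocks,threads>>>(args);
     CHECK_LAUNCH_ERROR();

   Note that the cudaDeviceSynchronize() inside the macro makes this a debugging aid rather than
   something to leave in performance-critical paths. */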
967: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
968: {
969: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
970: IS ip = b->row;
971: PetscBool perm_identity;
975: MatSeqAIJCUSPARSECopyFromGPU(A);
976: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
977: B->offloadmask = PETSC_OFFLOAD_CPU;
978: /* determine which version of MatSolve needs to be used. */
979: ISIdentity(ip,&perm_identity);
980: if (perm_identity) {
981: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
982: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
983: B->ops->matsolve = NULL;
984: B->ops->matsolvetranspose = NULL;
985: } else {
986: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
987: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
988: B->ops->matsolve = NULL;
989: B->ops->matsolvetranspose = NULL;
990: }
992: /* get the triangular factors */
993: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
994: return(0);
995: }
997: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
998: {
999: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1000: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1001: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1002: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1003: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1004: cusparseStatus_t stat;
1005: cusparseIndexBase_t indexBase;
1006: cusparseMatrixType_t matrixType;
1007: cusparseFillMode_t fillMode;
1008: cusparseDiagType_t diagType;
1009: cudaError_t cerr;
1010: PetscErrorCode ierr;
1013: /* allocate space for the transpose of the lower triangular factor */
1014: PetscNew(&loTriFactorT);
1015: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1017: /* set the matrix descriptors of the lower triangular factor */
1018: matrixType = cusparseGetMatType(loTriFactor->descr);
1019: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1020: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1021: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1022: diagType = cusparseGetMatDiagType(loTriFactor->descr);
1024: /* Create the matrix description */
1025: stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1026: stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1027: stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1028: stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1029: stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1031: /* set the operation */
1032: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1034: /* allocate GPU space for the CSC of the lower triangular factor*/
1035: loTriFactorT->csrMat = new CsrMatrix;
1036: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1037: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1038: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1039: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1040: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1041: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1043: /* compute the transpose of the lower triangular factor, i.e. the CSC */
1044: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1045: stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1046: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1047: loTriFactor->csrMat->values->data().get(),
1048: loTriFactor->csrMat->row_offsets->data().get(),
1049: loTriFactor->csrMat->column_indices->data().get(),
1050: loTriFactorT->csrMat->values->data().get(),
1051: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1052: CUSPARSE_ACTION_NUMERIC,indexBase,
1053: CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1054: cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1055: #endif
1057: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1058: stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1059: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1060: loTriFactor->csrMat->values->data().get(),
1061: loTriFactor->csrMat->row_offsets->data().get(),
1062: loTriFactor->csrMat->column_indices->data().get(),
1063: loTriFactorT->csrMat->values->data().get(),
1064: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1065: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1066: CUSPARSE_ACTION_NUMERIC, indexBase,
1067: CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1068: #else
1069: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1070: CUSPARSE_ACTION_NUMERIC, indexBase
1071: #endif
1072: );CHKERRCUSPARSE(stat);
1073: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1074: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1076: /* Create the solve analysis information */
1077: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1078: stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1079: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1080: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1081: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1082: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1083: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1084: &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1085: cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1086: #endif
1088: /* perform the solve analysis */
1089: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1090: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1091: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1092: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
1093: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1094: ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1095: #endif
1096: );CHKERRCUSPARSE(stat);
1097: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1098: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1100: /* assign the pointer */
1101: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1103: /*********************************************/
1104: /* Now the Transpose of the Upper Tri Factor */
1105: /*********************************************/
1107: /* allocate space for the transpose of the upper triangular factor */
1108: PetscNew(&upTriFactorT);
1109: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1111: /* set the matrix descriptors of the upper triangular factor */
1112: matrixType = cusparseGetMatType(upTriFactor->descr);
1113: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1114: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1115: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1116: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1118: /* Create the matrix description */
1119: stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1120: stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1121: stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1122: stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1123: stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1125: /* set the operation */
1126: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1128: /* allocate GPU space for the CSC of the upper triangular factor*/
1129: upTriFactorT->csrMat = new CsrMatrix;
1130: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1131: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1132: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1133: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1134: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1135: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1137: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1138: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1139: stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1140: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1141: upTriFactor->csrMat->values->data().get(),
1142: upTriFactor->csrMat->row_offsets->data().get(),
1143: upTriFactor->csrMat->column_indices->data().get(),
1144: upTriFactorT->csrMat->values->data().get(),
1145: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1146: CUSPARSE_ACTION_NUMERIC,indexBase,
1147: CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1148: cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1149: #endif
1151: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1152: stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1153: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1154: upTriFactor->csrMat->values->data().get(),
1155: upTriFactor->csrMat->row_offsets->data().get(),
1156: upTriFactor->csrMat->column_indices->data().get(),
1157: upTriFactorT->csrMat->values->data().get(),
1158: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1159: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1160: CUSPARSE_ACTION_NUMERIC, indexBase,
1161: CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1162: #else
1163: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1164: CUSPARSE_ACTION_NUMERIC, indexBase
1165: #endif
1166: );CHKERRCUSPARSE(stat);
1167: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1168: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1170: /* Create the solve analysis information */
1171: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1172: stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1173: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1174: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1175: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1176: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1177: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1178: &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1179: cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1180: #endif
1182: /* perform the solve analysis */
1183: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1184: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1185: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1186: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1187: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1188: ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1189: #endif
1190: );CHKERRCUSPARSE(stat);
1191: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1194: /* assign the pointer */
1195: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196: return(0);
1197: }
1199: struct PetscScalarToPetscInt
1200: {
1201: __host__ __device__
1202: PetscInt operator()(PetscScalar s)
1203: {
1204: return (PetscInt)PetscRealPart(s);
1205: }
1206: };
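/* Usage sketch: this functor lets Thrust convert a device array of PetscScalar holding integral
   values into PetscInt, e.g. (illustrative variable names):

     THRUSTARRAY    vals(n);   // PetscScalar values on the device
     THRUSTINTARRAY ints(n);
     thrust::transform(vals.begin(),vals.end(),ints.begin(),PetscScalarToPetscInt());
*/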
1208: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1209: {
1210: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1213: cusparseStatus_t stat;
1214: cusparseIndexBase_t indexBase;
1215: cudaError_t err;
1216: PetscErrorCode ierr;
1219: if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) return(0);
1220: MatSeqAIJCUSPARSECopyToGPU(A);
1221: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1222: if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1223: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1224: if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
1225: if (A->transupdated) return(0);
1226: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1227: if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1228: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1229: }
1230: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1232: stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1233: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1234: stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1235: stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1237: /* set alpha and beta */
1238: err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1239: err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1240: err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1241: err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1242: err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1243: err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1245: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246: CsrMatrix *matrixT = new CsrMatrix;
1247: matstructT->mat = matrixT;
1248: matrixT->num_rows = A->cmap->n;
1249: matrixT->num_cols = A->rmap->n;
1250: matrixT->num_entries = a->nz;
1251: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1252: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253: matrixT->values = new THRUSTARRAY(a->nz);
1255: if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1256: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1258: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259: stat = cusparseCreateCsr(&matstructT->matDescr,
1260: matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1261: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1262: matrixT->values->data().get(),
1263: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1264: indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1265: #endif
1266: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1267: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1268: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1269: #else
1270: CsrMatrix *temp = new CsrMatrix;
1271: CsrMatrix *tempT = new CsrMatrix;
1272: /* First convert HYB to CSR */
1273: temp->num_rows = A->rmap->n;
1274: temp->num_cols = A->cmap->n;
1275: temp->num_entries = a->nz;
1276: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1277: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1278: temp->values = new THRUSTARRAY(a->nz);
1280: stat = cusparse_hyb2csr(cusparsestruct->handle,
1281: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1282: temp->values->data().get(),
1283: temp->row_offsets->data().get(),
1284: temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1286: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1287: tempT->num_rows = A->rmap->n;
1288: tempT->num_cols = A->cmap->n;
1289: tempT->num_entries = a->nz;
1290: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1291: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1292: tempT->values = new THRUSTARRAY(a->nz);
1294: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1295: temp->num_cols, temp->num_entries,
1296: temp->values->data().get(),
1297: temp->row_offsets->data().get(),
1298: temp->column_indices->data().get(),
1299: tempT->values->data().get(),
1300: tempT->column_indices->data().get(),
1301: tempT->row_offsets->data().get(),
1302: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1304: /* Last, convert CSC to HYB */
1305: cusparseHybMat_t hybMat;
1306: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1307: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1308: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1309: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1310: matstructT->descr, tempT->values->data().get(),
1311: tempT->row_offsets->data().get(),
1312: tempT->column_indices->data().get(),
1313: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1315: /* assign the pointer */
1316: matstructT->mat = hybMat;
1317: A->transupdated = PETSC_TRUE;
1318: /* delete temporaries */
1319: if (tempT) {
1320: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1321: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1322: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1323: delete (CsrMatrix*) tempT;
1324: }
1325: if (temp) {
1326: if (temp->values) delete (THRUSTARRAY*) temp->values;
1327: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1328: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1329: delete (CsrMatrix*) temp;
1330: }
1331: #endif
1332: }
1333: }
1334: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1335: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1336: CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1337: if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1338: if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1339: if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1340: if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1341: if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1342: if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1343: if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1344: if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1345: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1346: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1347: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1348: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
1349: }
1350: if (!cusparsestruct->csr2csc_i) {
1351: THRUSTARRAY csr2csc_a(matrix->num_entries);
1352: PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1354: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1355: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1356: void *csr2cscBuffer;
1357: size_t csr2cscBufferSize;
1358: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1359: A->cmap->n, matrix->num_entries,
1360: matrix->values->data().get(),
1361: cusparsestruct->rowoffsets_gpu->data().get(),
1362: matrix->column_indices->data().get(),
1363: matrixT->values->data().get(),
1364: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1365: CUSPARSE_ACTION_NUMERIC,indexBase,
1366: cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1367: err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1368: #endif
1370: if (matrix->num_entries) {
1371: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1372: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1373: I checked every parameter and they were all fine. I have no clue why cusparse complains.
1375: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1376: should be filled with indexBase. So I just take a shortcut here.
1377: */
1378: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1379: A->cmap->n,matrix->num_entries,
1380: csr2csc_a.data().get(),
1381: cusparsestruct->rowoffsets_gpu->data().get(),
1382: matrix->column_indices->data().get(),
1383: matrixT->values->data().get(),
1384: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1385: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1386: CUSPARSE_ACTION_NUMERIC,indexBase,
1387: cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1388: #else
1389: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1390: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1391: #endif
1392: } else {
1393: matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1394: }
1396: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1397: PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1398: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1399: err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1400: #endif
1401: }
1402: PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1403: thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1404: matrixT->values->begin()));
1405: }
1406: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1407: /* the compressed row indices are not used for matTranspose */
1408: matstructT->cprowIndices = NULL;
1409: /* assign the pointer */
1410: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1411: A->transupdated = PETSC_TRUE;
1412: return(0);
1413: }
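/* For reference, a host-side sketch of what the csr2csc conversion used above
   computes: the CSC arrays of an m x n CSR matrix are exactly the CSR arrays of
   its transpose. This is the classic counting-sort formulation; the function
   name and the use of double scalars are illustrative only. */
#if 0
static void ExampleCsrToCsc(int m, int n, const int *ai, const int *aj, const double *aa,
                            int *bi, int *bj, double *ba)
{
  int i, j, nnz = ai[m];
  for (j = 0; j <= n; j++) bi[j] = 0;
  for (j = 0; j < nnz; j++) bi[aj[j]+1]++;       /* count entries in each column       */
  for (j = 0; j < n; j++)   bi[j+1] += bi[j];    /* prefix sum -> column start offsets */
  for (i = 0; i < m; i++) {                      /* scatter entries column by column   */
    for (j = ai[i]; j < ai[i+1]; j++) {
      int dest = bi[aj[j]]++;
      bj[dest] = i;
      ba[dest] = aa[j];
    }
  }
  for (j = n; j > 0; j--) bi[j] = bi[j-1];       /* undo the in-place pointer shift    */
  bi[0] = 0;
}
#endif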
1415: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1416: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1417: {
1418: PetscInt n = xx->map->n;
1419: const PetscScalar *barray;
1420: PetscScalar *xarray;
1421: thrust::device_ptr<const PetscScalar> bGPU;
1422: thrust::device_ptr<PetscScalar> xGPU;
1423: cusparseStatus_t stat;
1424: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1425: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1426: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1427: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1428: PetscErrorCode ierr;
1429: cudaError_t cerr;
1432: /* Analyze the matrix and create the transpose ... on the fly */
1433: if (!loTriFactorT && !upTriFactorT) {
1434: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1435: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1436: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1437: }
1439: /* Get the GPU pointers */
1440: VecCUDAGetArrayWrite(xx,&xarray);
1441: VecCUDAGetArrayRead(bb,&barray);
1442: xGPU = thrust::device_pointer_cast(xarray);
1443: bGPU = thrust::device_pointer_cast(barray);
1445: PetscLogGpuTimeBegin();
1446: /* First, reorder with the row permutation */
1447: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1448: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1449: xGPU);
1451: /* Next, solve U */
1452: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1453: upTriFactorT->csrMat->num_rows,
1454: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1455: upTriFactorT->csrMat->num_entries,
1456: #endif
1457: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1458: upTriFactorT->csrMat->values->data().get(),
1459: upTriFactorT->csrMat->row_offsets->data().get(),
1460: upTriFactorT->csrMat->column_indices->data().get(),
1461: upTriFactorT->solveInfo,
1462: xarray, tempGPU->data().get()
1463: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1464: ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1465: #endif
1466: );CHKERRCUSPARSE(stat);
1468: /* Then, solve L */
1469: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1470: loTriFactorT->csrMat->num_rows,
1471: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1472: loTriFactorT->csrMat->num_entries,
1473: #endif
1474: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1475: loTriFactorT->csrMat->values->data().get(),
1476: loTriFactorT->csrMat->row_offsets->data().get(),
1477: loTriFactorT->csrMat->column_indices->data().get(),
1478: loTriFactorT->solveInfo,
1479: tempGPU->data().get(), xarray
1480: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1481: ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1482: #endif
1483: );CHKERRCUSPARSE(stat);
1485: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1486: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1487: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1488: tempGPU->begin());
1490: /* Copy the temporary to the full solution. */
1491: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1493: /* restore */
1494: VecCUDARestoreArrayRead(bb,&barray);
1495: VecCUDARestoreArrayWrite(xx,&xarray);
1496: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1497: PetscLogGpuTimeEnd();
1498: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1499: return(0);
1500: }
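/* A minimal usage sketch showing how the transposed solve above is typically
   reached from user code, assuming A is an assembled MATSEQAIJCUSPARSE matrix
   and b, x are matching vectors (error checking omitted): */
#if 0
Mat           F;
IS            rperm, cperm;
MatFactorInfo info;
MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_LU,&F);
MatGetOrdering(A,MATORDERINGND,&rperm,&cperm);
MatFactorInfoInitialize(&info);
MatLUFactorSymbolic(F,A,rperm,cperm,&info);
MatLUFactorNumeric(F,A,&info);
MatSolve(F,b,x);            /* forward/backward solve with L and U           */
MatSolveTranspose(F,b,x);   /* triggers the transposed-factor analysis above */
ISDestroy(&rperm); ISDestroy(&cperm); MatDestroy(&F);
#endif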
1502: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1503: {
1504: const PetscScalar *barray;
1505: PetscScalar *xarray;
1506: cusparseStatus_t stat;
1507: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1508: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1509: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1510: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1511: PetscErrorCode ierr;
1512: cudaError_t cerr;
1515: /* Analyze the matrix and create the transpose ... on the fly */
1516: if (!loTriFactorT && !upTriFactorT) {
1517: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1518: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1519: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1520: }
1522: /* Get the GPU pointers */
1523: VecCUDAGetArrayWrite(xx,&xarray);
1524: VecCUDAGetArrayRead(bb,&barray);
1526: PetscLogGpuTimeBegin();
1527: /* First, solve U */
1528: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1529: upTriFactorT->csrMat->num_rows,
1530: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1531: upTriFactorT->csrMat->num_entries,
1532: #endif
1533: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1534: upTriFactorT->csrMat->values->data().get(),
1535: upTriFactorT->csrMat->row_offsets->data().get(),
1536: upTriFactorT->csrMat->column_indices->data().get(),
1537: upTriFactorT->solveInfo,
1538: barray, tempGPU->data().get()
1539: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1540: ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1541: #endif
1542: );CHKERRCUSPARSE(stat);
1544: /* Then, solve L */
1545: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1546: loTriFactorT->csrMat->num_rows,
1547: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1548: loTriFactorT->csrMat->num_entries,
1549: #endif
1550: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1551: loTriFactorT->csrMat->values->data().get(),
1552: loTriFactorT->csrMat->row_offsets->data().get(),
1553: loTriFactorT->csrMat->column_indices->data().get(),
1554: loTriFactorT->solveInfo,
1555: tempGPU->data().get(), xarray
1556: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1557: ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1558: #endif
1559: );CHKERRCUSPARSE(stat);
1561: /* restore */
1562: VecCUDARestoreArrayRead(bb,&barray);
1563: VecCUDARestoreArrayWrite(xx,&xarray);
1564: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1565: PetscLogGpuTimeEnd();
1566: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1567: return(0);
1568: }
1570: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1571: {
1572: const PetscScalar *barray;
1573: PetscScalar *xarray;
1574: thrust::device_ptr<const PetscScalar> bGPU;
1575: thrust::device_ptr<PetscScalar> xGPU;
1576: cusparseStatus_t stat;
1577: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1578: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1579: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1580: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1581: PetscErrorCode ierr;
1582: cudaError_t cerr;
1586: /* Get the GPU pointers */
1587: VecCUDAGetArrayWrite(xx,&xarray);
1588: VecCUDAGetArrayRead(bb,&barray);
1589: xGPU = thrust::device_pointer_cast(xarray);
1590: bGPU = thrust::device_pointer_cast(barray);
1592: PetscLogGpuTimeBegin();
1593: /* First, reorder with the row permutation */
1594: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1595: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1596: tempGPU->begin());
1598: /* Next, solve L */
1599: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1600: loTriFactor->csrMat->num_rows,
1601: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1602: loTriFactor->csrMat->num_entries,
1603: #endif
1604: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1605: loTriFactor->csrMat->values->data().get(),
1606: loTriFactor->csrMat->row_offsets->data().get(),
1607: loTriFactor->csrMat->column_indices->data().get(),
1608: loTriFactor->solveInfo,
1609: tempGPU->data().get(), xarray
1610: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1611: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1612: #endif
1613: );CHKERRCUSPARSE(stat);
1615: /* Then, solve U */
1616: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1617: upTriFactor->csrMat->num_rows,
1618: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1619: upTriFactor->csrMat->num_entries,
1620: #endif
1621: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1622: upTriFactor->csrMat->values->data().get(),
1623: upTriFactor->csrMat->row_offsets->data().get(),
1624: upTriFactor->csrMat->column_indices->data().get(),
1625: upTriFactor->solveInfo,
1626: xarray, tempGPU->data().get()
1627: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1628: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1629: #endif
1630: );CHKERRCUSPARSE(stat);
1632: /* Last, reorder with the column permutation */
1633: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1634: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1635: xGPU);
1637: VecCUDARestoreArrayRead(bb,&barray);
1638: VecCUDARestoreArrayWrite(xx,&xarray);
1639: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1640: PetscLogGpuTimeEnd();
1641: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1642: return(0);
1643: }
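/* The row/column reorderings above are plain gathers expressed with thrust
   permutation iterators. A self-contained sketch of the same pattern, with
   illustrative names and double scalars: */
#if 0
#include <thrust/device_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/copy.h>
static void ExamplePermutedGather(void)
{
  thrust::device_vector<double> b(4);
  b[0] = 10.; b[1] = 20.; b[2] = 30.; b[3] = 40.;
  thrust::device_vector<int> perm(4);
  perm[0] = 2; perm[1] = 0; perm[2] = 3; perm[3] = 1;
  thrust::device_vector<double> t(4);
  /* t[i] = b[perm[i]], the same gather used with rpermIndices/cpermIndices */
  thrust::copy(thrust::make_permutation_iterator(b.begin(), perm.begin()),
               thrust::make_permutation_iterator(b.begin(), perm.end()),
               t.begin());
}
#endif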
1645: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1646: {
1647: const PetscScalar *barray;
1648: PetscScalar *xarray;
1649: cusparseStatus_t stat;
1650: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1651: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1652: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1653: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1654: PetscErrorCode ierr;
1655: cudaError_t cerr;
1658: /* Get the GPU pointers */
1659: VecCUDAGetArrayWrite(xx,&xarray);
1660: VecCUDAGetArrayRead(bb,&barray);
1662: PetscLogGpuTimeBegin();
1663: /* First, solve L */
1664: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1665: loTriFactor->csrMat->num_rows,
1666: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1667: loTriFactor->csrMat->num_entries,
1668: #endif
1669: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1670: loTriFactor->csrMat->values->data().get(),
1671: loTriFactor->csrMat->row_offsets->data().get(),
1672: loTriFactor->csrMat->column_indices->data().get(),
1673: loTriFactor->solveInfo,
1674: barray, tempGPU->data().get()
1675: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1676: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1677: #endif
1678: );CHKERRCUSPARSE(stat);
1680: /* Next, solve U */
1681: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1682: upTriFactor->csrMat->num_rows,
1683: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1684: upTriFactor->csrMat->num_entries,
1685: #endif
1686: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1687: upTriFactor->csrMat->values->data().get(),
1688: upTriFactor->csrMat->row_offsets->data().get(),
1689: upTriFactor->csrMat->column_indices->data().get(),
1690: upTriFactor->solveInfo,
1691: tempGPU->data().get(), xarray
1692: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1693: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1694: #endif
1695: );CHKERRCUSPARSE(stat);
1697: VecCUDARestoreArrayRead(bb,&barray);
1698: VecCUDARestoreArrayWrite(xx,&xarray);
1699: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1700: PetscLogGpuTimeEnd();
1701: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1702: return(0);
1703: }
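/* Sketch of the usual way these GPU triangular solves are exercised: an ILU
   preconditioner with the cusparse solver type. With natural ordering PETSc
   can take the permutation-free *_NaturalOrdering path above. Assumes A, b, x
   already exist and A is MATSEQAIJCUSPARSE; error checking omitted. */
#if 0
KSP ksp;
PC  pc;
KSPCreate(PETSC_COMM_SELF,&ksp);
KSPSetOperators(ksp,A,A);
KSPGetPC(ksp,&pc);
PCSetType(pc,PCILU);
PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);
PCFactorSetMatOrderingType(pc,MATORDERINGNATURAL);
KSPSetFromOptions(ksp);
KSPSolve(ksp,b,x);
KSPDestroy(&ksp);
#endif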
1705: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1706: {
1707: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1708: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1709: cudaError_t cerr;
1710: PetscErrorCode ierr;
1713: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1714: CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
1716: PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1717: cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1718: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1719: PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1720: PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1721: A->offloadmask = PETSC_OFFLOAD_BOTH;
1722: }
1723: return(0);
1724: }
1726: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1727: {
1728: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1732: MatSeqAIJCUSPARSECopyFromGPU(A);
1733: *array = a->a;
1734: A->offloadmask = PETSC_OFFLOAD_CPU;
1735: return(0);
1736: }
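/* Sketch of the user-visible effect of the two routines above: asking for the
   host array forces a device-to-host copy when the GPU holds the latest values,
   and afterwards the CPU copy is treated as the valid one. Assumes A is an
   assembled MATSEQAIJCUSPARSE matrix that has already been used on the GPU. */
#if 0
PetscScalar *vals;
MatSeqAIJGetArray(A,&vals);        /* triggers MatSeqAIJCUSPARSECopyFromGPU if needed */
vals[0] *= 2.0;                    /* modify values on the host                       */
MatSeqAIJRestoreArray(A,&vals);    /* the next GPU use copies the values back         */
#endif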
1738: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1739: {
1740: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1741: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1742: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1743: PetscInt m = A->rmap->n,*ii,*ridx,tmp;
1744: PetscErrorCode ierr;
1745: cusparseStatus_t stat;
1746: PetscBool both = PETSC_TRUE;
1747: cudaError_t err;
1750: if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1751: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1752: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1753: CsrMatrix *matrix;
1754: matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1756: if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1757: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1758: matrix->values->assign(a->a, a->a+a->nz);
1759: err = WaitForCUDA();CHKERRCUDA(err);
1760: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1761: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1762: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
1763: } else {
1764: PetscInt nnz;
1765: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1766: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1767: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1768: delete cusparsestruct->workVector;
1769: delete cusparsestruct->rowoffsets_gpu;
1770: cusparsestruct->workVector = NULL;
1771: cusparsestruct->rowoffsets_gpu = NULL;
1772: try {
1773: if (a->compressedrow.use) {
1774: m = a->compressedrow.nrows;
1775: ii = a->compressedrow.i;
1776: ridx = a->compressedrow.rindex;
1777: } else {
1778: m = A->rmap->n;
1779: ii = a->i;
1780: ridx = NULL;
1781: }
1782: if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1783: if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1784: if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1785: else nnz = a->nz;
1787: /* create cusparse matrix */
1788: cusparsestruct->nrows = m;
1789: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1790: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1791: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1792: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1794: err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1795: err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1796: err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1797: err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1798: err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1799: err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1800: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1802: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1803: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1804: /* set the matrix */
1805: CsrMatrix *mat= new CsrMatrix;
1806: mat->num_rows = m;
1807: mat->num_cols = A->cmap->n;
1808: mat->num_entries = nnz;
1809: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1810: mat->row_offsets->assign(ii, ii + m+1);
1812: mat->column_indices = new THRUSTINTARRAY32(nnz);
1813: mat->column_indices->assign(a->j, a->j+nnz);
1815: mat->values = new THRUSTARRAY(nnz);
1816: if (a->a) mat->values->assign(a->a, a->a+nnz);
1818: /* assign the pointer */
1819: matstruct->mat = mat;
1820: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1821: if (mat->num_rows) { /* cusparse errors on empty matrices! */
1822: stat = cusparseCreateCsr(&matstruct->matDescr,
1823: mat->num_rows, mat->num_cols, mat->num_entries,
1824: mat->row_offsets->data().get(), mat->column_indices->data().get(),
1825: mat->values->data().get(),
1826: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1827: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1828: }
1829: #endif
1830: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1831: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1832: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1833: #else
1834: CsrMatrix *mat= new CsrMatrix;
1835: mat->num_rows = m;
1836: mat->num_cols = A->cmap->n;
1837: mat->num_entries = nnz;
1838: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1839: mat->row_offsets->assign(ii, ii + m+1);
1841: mat->column_indices = new THRUSTINTARRAY32(nnz);
1842: mat->column_indices->assign(a->j, a->j+nnz);
1844: mat->values = new THRUSTARRAY(nnz);
1845: if (a->a) mat->values->assign(a->a, a->a+nnz);
1847: cusparseHybMat_t hybMat;
1848: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1849: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1850: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1851: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1852: matstruct->descr, mat->values->data().get(),
1853: mat->row_offsets->data().get(),
1854: mat->column_indices->data().get(),
1855: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1856: /* assign the pointer */
1857: matstruct->mat = hybMat;
1859: if (mat) {
1860: if (mat->values) delete (THRUSTARRAY*)mat->values;
1861: if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1862: if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1863: delete (CsrMatrix*)mat;
1864: }
1865: #endif
1866: }
1868: /* assign the compressed row indices */
1869: if (a->compressedrow.use) {
1870: cusparsestruct->workVector = new THRUSTARRAY(m);
1871: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1872: matstruct->cprowIndices->assign(ridx,ridx+m);
1873: tmp = m;
1874: } else {
1875: cusparsestruct->workVector = NULL;
1876: matstruct->cprowIndices = NULL;
1877: tmp = 0;
1878: }
1879: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1881: /* assign the pointer */
1882: cusparsestruct->mat = matstruct;
1883: } catch(char *ex) {
1884: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1885: }
1886: err = WaitForCUDA();CHKERRCUDA(err);
1887: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1888: cusparsestruct->nonzerostate = A->nonzerostate;
1889: }
1890: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1891: }
1892: return(0);
1893: }
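/* Sketch of when the copy above runs: the device copy is built lazily on first
   GPU use, and when only the values change (same nonzero pattern) the cheaper
   values-only branch should be taken on the next use. A small illustrative
   example, error checking omitted: */
#if 0
Mat         A;
Vec         x,y;
PetscInt    i,n = 4;
PetscScalar v   = 2.0;
MatCreate(PETSC_COMM_SELF,&A);
MatSetSizes(A,n,n,n,n);
MatSetType(A,MATSEQAIJCUSPARSE);
MatSeqAIJSetPreallocation(A,1,NULL);
for (i = 0; i < n; i++) { MatSetValues(A,1,&i,1,&i,&v,INSERT_VALUES); }
MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
MatCreateVecs(A,&x,&y);
VecSet(x,1.0);
MatMult(A,x,y);                       /* first GPU use: structure + values are copied */
v = 3.0;
for (i = 0; i < n; i++) { MatSetValues(A,1,&i,1,&i,&v,INSERT_VALUES); }
MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
MatMult(A,x,y);                       /* same pattern: only the values are re-copied  */
#endif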
1895: struct VecCUDAPlusEquals
1896: {
1897: template <typename Tuple>
1898: __host__ __device__
1899: void operator()(Tuple t)
1900: {
1901: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1902: }
1903: };
1905: struct VecCUDAEquals
1906: {
1907: template <typename Tuple>
1908: __host__ __device__
1909: void operator()(Tuple t)
1910: {
1911: thrust::get<1>(t) = thrust::get<0>(t);
1912: }
1913: };
1915: struct VecCUDAEqualsReverse
1916: {
1917: template <typename Tuple>
1918: __host__ __device__
1919: void operator()(Tuple t)
1920: {
1921: thrust::get<0>(t) = thrust::get<1>(t);
1922: }
1923: };
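/* These small functors are intended for thrust::for_each over zip iterators,
   e.g. to scatter or accumulate compressed-row results into a full-length
   vector. A self-contained sketch of that pattern, with illustrative names: */
#if 0
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/for_each.h>
static void ExamplePlusEquals(void)
{
  thrust::device_vector<double> src(3,1.0), dst(3,2.0);
  /* dst[i] += src[i] for each zipped pair */
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(src.begin(),dst.begin())),
                   thrust::make_zip_iterator(thrust::make_tuple(src.end(),  dst.end())),
                   VecCUDAPlusEquals());
}
#endif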
1925: struct MatMatCusparse {
1926: PetscBool cisdense;
1927: PetscScalar *Bt;
1928: Mat X;
1929: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1930: PetscLogDouble flops;
1931: CsrMatrix *Bcsr;
1932: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1933: cusparseSpMatDescr_t matSpBDescr;
1934: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
1935: cusparseDnMatDescr_t matBDescr;
1936: cusparseDnMatDescr_t matCDescr;
1937: PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
1938: size_t mmBufferSize;
1939: void *mmBuffer;
1940: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1941: cusparseSpGEMMDescr_t spgemmDesc;
1942: #endif
1943: };
1945: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1946: {
1947: PetscErrorCode ierr;
1948: MatMatCusparse *mmdata = (MatMatCusparse *)data;
1949: cudaError_t cerr;
1950: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1951: cusparseStatus_t stat;
1952: #endif
1955: cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1956: delete mmdata->Bcsr;
1957: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1958: if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1959: if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1960: if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1961: if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1962: if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1963: if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1964: #endif
1965: MatDestroy(&mmdata->X);
1966: PetscFree(data);
1967: return(0);
1968: }
1970: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1972: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1973: {
1974: Mat_Product *product = C->product;
1975: Mat A,B;
1976: PetscInt m,n,blda,clda;
1977: PetscBool flg,biscuda;
1978: Mat_SeqAIJCUSPARSE *cusp;
1979: cusparseStatus_t stat;
1980: cusparseOperation_t opA;
1981: const PetscScalar *barray;
1982: PetscScalar *carray;
1983: PetscErrorCode ierr;
1984: MatMatCusparse *mmdata;
1985: Mat_SeqAIJCUSPARSEMultStruct *mat;
1986: CsrMatrix *csrmat;
1987: cudaError_t cerr;
1990: MatCheckProduct(C,1);
1991: if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1992: mmdata = (MatMatCusparse*)product->data;
1993: A = product->A;
1994: B = product->B;
1995: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1996: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1997: /* currently CopyToGPU does not copy if the matrix is bound to the CPU;
1998: instead of silently accepting a wrong answer, I prefer to raise an error */
1999: if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2000: MatSeqAIJCUSPARSECopyToGPU(A);
2001: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2002: switch (product->type) {
2003: case MATPRODUCT_AB:
2004: case MATPRODUCT_PtAP:
2005: mat = cusp->mat;
2006: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2007: m = A->rmap->n;
2008: n = B->cmap->n;
2009: break;
2010: case MATPRODUCT_AtB:
2011: if (!A->form_explicit_transpose) {
2012: mat = cusp->mat;
2013: opA = CUSPARSE_OPERATION_TRANSPOSE;
2014: } else {
2015: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2016: mat = cusp->matTranspose;
2017: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2018: }
2019: m = A->cmap->n;
2020: n = B->cmap->n;
2021: break;
2022: case MATPRODUCT_ABt:
2023: case MATPRODUCT_RARt:
2024: mat = cusp->mat;
2025: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2026: m = A->rmap->n;
2027: n = B->rmap->n;
2028: break;
2029: default:
2030: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2031: }
2032: if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2033: csrmat = (CsrMatrix*)mat->mat;
2034: /* if the user passed a CPU matrix, copy the data to the GPU */
2035: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
2036: if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
2037: MatDenseCUDAGetArrayRead(B,&barray);
2039: MatDenseGetLDA(B,&blda);
2040: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2041: MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
2042: MatDenseGetLDA(mmdata->X,&clda);
2043: } else {
2044: MatDenseCUDAGetArrayWrite(C,&carray);
2045: MatDenseGetLDA(C,&clda);
2046: }
2048: PetscLogGpuTimeBegin();
2049: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2050: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2051: /* (re)allocate mmBuffer if not initialized or the LDAs have changed */
2052: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2053: size_t mmBufferSize;
2054: if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2055: if (!mmdata->matBDescr) {
2056: stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2057: mmdata->Blda = blda;
2058: }
2060: if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2061: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2062: stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2063: mmdata->Clda = clda;
2064: }
2066: if (!mat->matDescr) {
2067: stat = cusparseCreateCsr(&mat->matDescr,
2068: csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2069: csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2070: csrmat->values->data().get(),
2071: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2072: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2073: }
2074: stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2075: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2076: mmdata->matCDescr,cusparse_scalartype,
2077: cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2078: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2079: cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2080: cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2081: mmdata->mmBufferSize = mmBufferSize;
2082: }
2083: mmdata->initialized = PETSC_TRUE;
2084: } else {
2085: /* to be safe, always update pointers of the mats */
2086: stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2087: stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2088: stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2089: }
2091: /* do cusparseSpMM, which supports transpose on B */
2092: stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2093: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2094: mmdata->matCDescr,cusparse_scalartype,
2095: cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2096: #else
2097: PetscInt k;
2098: /* cusparseXcsrmm does not support transpose on B */
2099: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2100: cublasHandle_t cublasv2handle;
2101: cublasStatus_t cerr;
2103: PetscCUBLASGetHandle(&cublasv2handle);
2104: cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2105: B->cmap->n,B->rmap->n,
2106: &PETSC_CUSPARSE_ONE ,barray,blda,
2107: &PETSC_CUSPARSE_ZERO,barray,blda,
2108: mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2109: blda = B->cmap->n;
2110: k = B->cmap->n;
2111: } else {
2112: k = B->rmap->n;
2113: }
2115: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2116: stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2117: csrmat->num_entries,mat->alpha_one,mat->descr,
2118: csrmat->values->data().get(),
2119: csrmat->row_offsets->data().get(),
2120: csrmat->column_indices->data().get(),
2121: mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2122: carray,clda);CHKERRCUSPARSE(stat);
2123: #endif
2124: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2125: PetscLogGpuTimeEnd();
2126: PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2127: MatDenseCUDARestoreArrayRead(B,&barray);
2128: if (product->type == MATPRODUCT_RARt) {
2129: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2130: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2131: } else if (product->type == MATPRODUCT_PtAP) {
2132: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2133: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2134: } else {
2135: MatDenseCUDARestoreArrayWrite(C,&carray);
2136: }
2137: if (mmdata->cisdense) {
2138: MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2139: }
2140: if (!biscuda) {
2141: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2142: }
2143: return(0);
2144: }
2146: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2147: {
2148: Mat_Product *product = C->product;
2149: Mat A,B;
2150: PetscInt m,n;
2151: PetscBool cisdense,flg;
2152: PetscErrorCode ierr;
2153: MatMatCusparse *mmdata;
2154: Mat_SeqAIJCUSPARSE *cusp;
2157: MatCheckProduct(C,1);
2158: if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2159: A = product->A;
2160: B = product->B;
2161: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2162: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2163: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2164: if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2165: switch (product->type) {
2166: case MATPRODUCT_AB:
2167: m = A->rmap->n;
2168: n = B->cmap->n;
2169: break;
2170: case MATPRODUCT_AtB:
2171: m = A->cmap->n;
2172: n = B->cmap->n;
2173: break;
2174: case MATPRODUCT_ABt:
2175: m = A->rmap->n;
2176: n = B->rmap->n;
2177: break;
2178: case MATPRODUCT_PtAP:
2179: m = B->cmap->n;
2180: n = B->cmap->n;
2181: break;
2182: case MATPRODUCT_RARt:
2183: m = B->rmap->n;
2184: n = B->rmap->n;
2185: break;
2186: default:
2187: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2188: }
2189: MatSetSizes(C,m,n,m,n);
2190: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2191: PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2192: MatSetType(C,MATSEQDENSECUDA);
2194: /* product data */
2195: PetscNew(&mmdata);
2196: mmdata->cisdense = cisdense;
2197: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2198: /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2199: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2200: cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2201: }
2202: #endif
2203: /* for these products we need intermediate storage */
2204: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2205: MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2206: MatSetType(mmdata->X,MATSEQDENSECUDA);
2207: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2208: MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2209: } else {
2210: MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2211: }
2212: }
2213: C->product->data = mmdata;
2214: C->product->destroy = MatDestroy_MatMatCusparse;
2216: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2217: return(0);
2218: }
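/* A minimal sketch of the user-level MatProduct sequence that reaches the
   symbolic and numeric routines above, assuming A is MATSEQAIJCUSPARSE and B
   is dense (MATSEQDENSE or MATSEQDENSECUDA); error checking omitted. */
#if 0
Mat C;
MatProductCreate(A,B,NULL,&C);
MatProductSetType(C,MATPRODUCT_AB);
MatProductSetFromOptions(C);
MatProductSymbolic(C);   /* sets sizes and type, allocates the MatMatCusparse data */
MatProductNumeric(C);    /* runs cusparseSpMM (CUDA >= 11) or csrmm                */
MatDestroy(&C);
#endif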
2220: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2221: {
2222: Mat_Product *product = C->product;
2223: Mat A,B;
2224: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2225: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
2226: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2227: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2228: PetscBool flg;
2229: PetscErrorCode ierr;
2230: cusparseStatus_t stat;
2231: cudaError_t cerr;
2232: MatProductType ptype;
2233: MatMatCusparse *mmdata;
2234: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2235: cusparseSpMatDescr_t BmatSpDescr;
2236: #endif
2239: MatCheckProduct(C,1);
2240: if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
2241: PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);
2242: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
2243: mmdata = (MatMatCusparse*)C->product->data;
2244: A = product->A;
2245: B = product->B;
2246: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2247: mmdata->reusesym = PETSC_FALSE;
2248: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2249: if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2250: Cmat = Ccusp->mat;
2251: if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2252: Ccsr = (CsrMatrix*)Cmat->mat;
2253: if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2254: goto finalize;
2255: }
2256: if (!c->nz) goto finalize;
2257: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2258: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2259: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2260: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2261: if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2262: if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2263: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2264: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2265: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2266: if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2267: if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2268: if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2269: MatSeqAIJCUSPARSECopyToGPU(A);
2270: MatSeqAIJCUSPARSECopyToGPU(B);
2272: ptype = product->type;
2273: if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2274: if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2275: switch (ptype) {
2276: case MATPRODUCT_AB:
2277: Amat = Acusp->mat;
2278: Bmat = Bcusp->mat;
2279: break;
2280: case MATPRODUCT_AtB:
2281: Amat = Acusp->matTranspose;
2282: Bmat = Bcusp->mat;
2283: break;
2284: case MATPRODUCT_ABt:
2285: Amat = Acusp->mat;
2286: Bmat = Bcusp->matTranspose;
2287: break;
2288: default:
2289: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2290: }
2291: Cmat = Ccusp->mat;
2292: if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2293: if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2294: if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2295: Acsr = (CsrMatrix*)Amat->mat;
2296: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2297: Ccsr = (CsrMatrix*)Cmat->mat;
2298: if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2299: if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2300: if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2301: PetscLogGpuTimeBegin();
2302: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2303: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2304: stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2305: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2306: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2307: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2308: stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2309: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2310: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2311: #else
2312: stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2313: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2314: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2315: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2316: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2317: #endif
2318: PetscLogGpuFlops(mmdata->flops);
2319: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2320: PetscLogGpuTimeEnd();
2321: C->offloadmask = PETSC_OFFLOAD_GPU;
2322: finalize:
2323: /* shorter version of MatAssemblyEnd_SeqAIJ */
2324: PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);
2325: PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
2326: PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);
2327: c->reallocs = 0;
2328: C->info.mallocs += 0;
2329: C->info.nz_unneeded = 0;
2330: C->assembled = C->was_assembled = PETSC_TRUE;
2331: C->num_ass++;
2332: return(0);
2333: }
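/* Sketch of the simplest user-level path into the sparse-sparse product above,
   assuming A and B are assembled MATSEQAIJCUSPARSE matrices: */
#if 0
Mat C;
MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);  /* symbolic + numeric (SpGEMM on CUDA >= 11) */
/* after changing values (same nonzero pattern) in A or B, reuse the symbolic data: */
MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);
MatDestroy(&C);
#endif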
2335: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2336: {
2337: Mat_Product *product = C->product;
2338: Mat A,B;
2339: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2340: Mat_SeqAIJ *a,*b,*c;
2341: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2342: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2343: PetscInt i,j,m,n,k;
2344: PetscBool flg;
2345: PetscErrorCode ierr;
2346: cusparseStatus_t stat;
2347: cudaError_t cerr;
2348: MatProductType ptype;
2349: MatMatCusparse *mmdata;
2350: PetscLogDouble flops;
2351: PetscBool biscompressed,ciscompressed;
2352: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2353: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2354: size_t bufSize2;
2355: cusparseSpMatDescr_t BmatSpDescr;
2356: #else
2357: int cnz;
2358: #endif
2361: MatCheckProduct(C,1);
2362: if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2363: A = product->A;
2364: B = product->B;
2365: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2366: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2367: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2368: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2369: a = (Mat_SeqAIJ*)A->data;
2370: b = (Mat_SeqAIJ*)B->data;
2371: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2372: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2373: if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2374: if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2376: /* product data */
2377: PetscNew(&mmdata);
2378: C->product->data = mmdata;
2379: C->product->destroy = MatDestroy_MatMatCusparse;
2381: MatSeqAIJCUSPARSECopyToGPU(A);
2382: MatSeqAIJCUSPARSECopyToGPU(B);
2383: ptype = product->type;
2384: if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2385: if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2386: biscompressed = PETSC_FALSE;
2387: ciscompressed = PETSC_FALSE;
2388: switch (ptype) {
2389: case MATPRODUCT_AB:
2390: m = A->rmap->n;
2391: n = B->cmap->n;
2392: k = A->cmap->n;
2393: Amat = Acusp->mat;
2394: Bmat = Bcusp->mat;
2395: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2396: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2397: break;
2398: case MATPRODUCT_AtB:
2399: m = A->cmap->n;
2400: n = B->cmap->n;
2401: k = A->rmap->n;
2402: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2403: Amat = Acusp->matTranspose;
2404: Bmat = Bcusp->mat;
2405: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2406: break;
2407: case MATPRODUCT_ABt:
2408: m = A->rmap->n;
2409: n = B->rmap->n;
2410: k = A->cmap->n;
2411: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
2412: Amat = Acusp->mat;
2413: Bmat = Bcusp->matTranspose;
2414: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2415: break;
2416: default:
2417: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2418: }
2420: /* create cusparse matrix */
2421: MatSetSizes(C,m,n,m,n);
2422: MatSetType(C,MATSEQAIJCUSPARSE);
2423: c = (Mat_SeqAIJ*)C->data;
2424: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2425: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
2426: Ccsr = new CsrMatrix;
2428: c->compressedrow.use = ciscompressed;
2429: if (c->compressedrow.use) { /* if a is in compressed row format, then c will be as well */
2430: c->compressedrow.nrows = a->compressedrow.nrows;
2431: PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);
2432: PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);
2433: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2434: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2435: Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2436: } else {
2437: c->compressedrow.nrows = 0;
2438: c->compressedrow.i = NULL;
2439: c->compressedrow.rindex = NULL;
2440: Ccusp->workVector = NULL;
2441: Cmat->cprowIndices = NULL;
2442: }
2443: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2444: Ccusp->mat = Cmat;
2445: Ccusp->mat->mat = Ccsr;
2446: Ccsr->num_rows = Ccusp->nrows;
2447: Ccsr->num_cols = n;
2448: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2449: stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2450: stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2451: stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2452: cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2453: cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2454: cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2455: cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2456: cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2457: cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2458: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2459: thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2460: c->nz = 0;
2461: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2462: Ccsr->values = new THRUSTARRAY(c->nz);
2463: goto finalizesym;
2464: }
2466: if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2467: if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2468: Acsr = (CsrMatrix*)Amat->mat;
2469: if (!biscompressed) {
2470: Bcsr = (CsrMatrix*)Bmat->mat;
2471: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2472: BmatSpDescr = Bmat->matDescr;
2473: #endif
2474: } else { /* we need to use row offsets for the full matrix */
2475: CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2476: Bcsr = new CsrMatrix;
2477: Bcsr->num_rows = B->rmap->n;
2478: Bcsr->num_cols = cBcsr->num_cols;
2479: Bcsr->num_entries = cBcsr->num_entries;
2480: Bcsr->column_indices = cBcsr->column_indices;
2481: Bcsr->values = cBcsr->values;
2482: if (!Bcusp->rowoffsets_gpu) {
2483: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2484: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2485: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
2486: }
2487: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2488: mmdata->Bcsr = Bcsr;
2489: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2490: if (Bcsr->num_rows && Bcsr->num_cols) {
2491: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2492: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2493: Bcsr->values->data().get(),
2494: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2495: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2496: }
2497: BmatSpDescr = mmdata->matSpBDescr;
2498: #endif
2499: }
2500: if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2501: if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2502: /* precompute flops count */
2503: if (ptype == MATPRODUCT_AB) {
2504: for (i=0, flops = 0; i<A->rmap->n; i++) {
2505: const PetscInt st = a->i[i];
2506: const PetscInt en = a->i[i+1];
2507: for (j=st; j<en; j++) {
2508: const PetscInt brow = a->j[j];
2509: flops += 2.*(b->i[brow+1] - b->i[brow]);
2510: }
2511: }
2512: } else if (ptype == MATPRODUCT_AtB) {
2513: for (i=0, flops = 0; i<A->rmap->n; i++) {
2514: const PetscInt anzi = a->i[i+1] - a->i[i];
2515: const PetscInt bnzi = b->i[i+1] - b->i[i];
2516: flops += (2.*anzi)*bnzi;
2517: }
2518: } else { /* TODO */
2519: flops = 0.;
2520: }
2522: mmdata->flops = flops;
2523: PetscLogGpuTimeBegin();
2524: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2525: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2526: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2527: NULL, NULL, NULL,
2528: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2529: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2530: stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2531: /* query the number of bytes of external memory (bufSize2) needed by the work-estimation step */
2532: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2533: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2534: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2535: mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2536: cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2537: /* inspect the matrices A and B to understand the memory requirement for the next step */
2538: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2539: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2540: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2541: mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2542: /* query the number of bytes of external memory again, this time for the compute step */
2543: stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2544: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2545: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2546: mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2547: /* Neither the CUSPARSE documentation nor the API makes this clear:
2548: we need both buffers to perform the operations properly!
2549: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2550: it only appears in the workEstimation calls, yet it seems to be needed in compute, so its address
2551: is probably stored in the descriptor! What a messy API... */
2552: cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2553: /* compute the intermediate product of A * B */
2554: stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2556: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2557: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2558: /* get matrix C non-zero entries C_nnz1 */
2559: stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2560: c->nz = (PetscInt) C_nnz1;
2561: PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);
2562: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2563: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2564: Ccsr->values = new THRUSTARRAY(c->nz);
2565: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2566: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2567: Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2568: stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2569: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2570: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2571: #else
2572: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2573: stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2574: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2575: Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2576: Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2577: Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2578: c->nz = cnz;
2579: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2580: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2581: Ccsr->values = new THRUSTARRAY(c->nz);
2582: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2584: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2585: /* with the old gemm interface (removed from CUDA 11.0 on) we cannot compute the symbolic factorization only.
2586: I have tried the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for the values, but it seems quite buggy when
2587: D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2588: stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2589: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2590: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2591: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2592: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2593: #endif
2594: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2595: PetscLogGpuFlops(mmdata->flops);
2596: PetscLogGpuTimeEnd();
2597: finalizesym:
2598: c->singlemalloc = PETSC_FALSE;
2599: c->free_a = PETSC_TRUE;
2600: c->free_ij = PETSC_TRUE;
2601: PetscMalloc1(m+1,&c->i);
2602: PetscMalloc1(c->nz,&c->j);
2603: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2604: PetscInt *d_i = c->i;
2605: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2606: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2607: ii = *Ccsr->row_offsets;
2608: jj = *Ccsr->column_indices;
2609: if (ciscompressed) d_i = c->compressedrow.i;
2610: cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2611: cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2612: } else {
2613: PetscInt *d_i = c->i;
2614: if (ciscompressed) d_i = c->compressedrow.i;
2615: cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2616: cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2617: }
2618: if (ciscompressed) { /* need to expand host row offsets */
2619: PetscInt r = 0;
2620: c->i[0] = 0;
2621: for (k = 0; k < c->compressedrow.nrows; k++) {
2622: const PetscInt next = c->compressedrow.rindex[k];
2623: const PetscInt old = c->compressedrow.i[k];
2624: for (; r < next; r++) c->i[r+1] = old;
2625: }
2626: for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2627: }
2628: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
2629: PetscMalloc1(m,&c->ilen);
2630: PetscMalloc1(m,&c->imax);
2631: c->maxnz = c->nz;
2632: c->nonzerorowcnt = 0;
2633: c->rmax = 0;
2634: for (k = 0; k < m; k++) {
2635: const PetscInt nn = c->i[k+1] - c->i[k];
2636: c->ilen[k] = c->imax[k] = nn;
2637: c->nonzerorowcnt += (PetscInt)!!nn;
2638: c->rmax = PetscMax(c->rmax,nn);
2639: }
2640: MatMarkDiagonal_SeqAIJ(C);
2641: PetscMalloc1(c->nz,&c->a);
2642: Ccsr->num_entries = c->nz;
2644: C->nonzerostate++;
2645: PetscLayoutSetUp(C->rmap);
2646: PetscLayoutSetUp(C->cmap);
2647: Ccusp->nonzerostate = C->nonzerostate;
2648: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2649: C->preallocated = PETSC_TRUE;
2650: C->assembled = PETSC_FALSE;
2651: C->was_assembled = PETSC_FALSE;
2652: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2653: mmdata->reusesym = PETSC_TRUE;
2654: C->offloadmask = PETSC_OFFLOAD_GPU;
2655: }
2656: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2657: return(0);
2658: }
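/*
   Illustrative sketch: the symbolic routine above is normally reached through the generic
   MatProduct API (or the MatMatMult() convenience wrapper), assuming A and B are assembled
   MATSEQAIJCUSPARSE matrices; error checking is omitted.

     Mat C;
     MatProductCreate(A,B,NULL,&C);      // container for C = A*B
     MatProductSetType(C,MATPRODUCT_AB);
     MatProductSetFromOptions(C);        // dispatches to MatProductSetFromOptions_SeqAIJCUSPARSE below
     MatProductSymbolic(C);              // builds the sparsity pattern of C on the GPU (SpGEMM)
     MatProductNumeric(C);               // computes the values of C on the GPU

   Equivalently: MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);
*/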
2660: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2662: /* handles sparse or dense B */
2663: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2664: {
2665: Mat_Product *product = mat->product;
2667: PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2670: MatCheckProduct(mat,1);
2671: PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);
2672: if (!product->A->boundtocpu && !product->B->boundtocpu) {
2673: PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);
2674: }
2675: if (product->type == MATPRODUCT_ABC) {
2676: Ciscusp = PETSC_FALSE;
2677: if (!product->C->boundtocpu) {
2678: PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);
2679: }
2680: }
2681: if (isdense) {
2682: switch (product->type) {
2683: case MATPRODUCT_AB:
2684: case MATPRODUCT_AtB:
2685: case MATPRODUCT_ABt:
2686: case MATPRODUCT_PtAP:
2687: case MATPRODUCT_RARt:
2688: if (product->A->boundtocpu) {
2689: MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2690: } else {
2691: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2692: }
2693: break;
2694: case MATPRODUCT_ABC:
2695: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2696: break;
2697: default:
2698: break;
2699: }
2700: } else if (Biscusp && Ciscusp) {
2701: switch (product->type) {
2702: case MATPRODUCT_AB:
2703: case MATPRODUCT_AtB:
2704: case MATPRODUCT_ABt:
2705: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2706: break;
2707: case MATPRODUCT_PtAP:
2708: case MATPRODUCT_RARt:
2709: case MATPRODUCT_ABC:
2710: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2711: break;
2712: default:
2713: break;
2714: }
2715: } else { /* fallback for AIJ */
2716: MatProductSetFromOptions_SeqAIJ(mat);
2717: }
2718: return(0);
2719: }
2721: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2722: {
2726: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2727: return(0);
2728: }
2730: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2731: {
2735: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2736: return(0);
2737: }
2739: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2740: {
2744: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2745: return(0);
2746: }
2748: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2749: {
2753: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2754: return(0);
2755: }
2757: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2758: {
2762: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2763: return(0);
2764: }
2766: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2767: {
2768: int i = blockIdx.x*blockDim.x + threadIdx.x;
2769: if (i < n) y[idx[i]] += x[i];
2770: }
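/*
   Illustrative sketch: ScatterAdd is launched in MatMultAddKernel_SeqAIJCUSPARSE() below with a
   1-D grid covering the n compressed rows, e.g.

     ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,idx,work,z);  // z[idx[i]] += work[i]
*/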
2772: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2773: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2774: {
2775: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2776: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2777: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2778: PetscScalar *xarray,*zarray,*dptr,*beta,*xptr;
2779: PetscErrorCode ierr;
2780: cudaError_t cerr;
2781: cusparseStatus_t stat;
2782: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2783: PetscBool compressed;
2784: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2785: PetscInt nx,ny;
2786: #endif
2789: if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2790: if (!a->nonzerorowcnt) {
2791: if (!yy) {VecSet_SeqCUDA(zz,0);}
2792: else {VecCopy_SeqCUDA(yy,zz);}
2793: return(0);
2794: }
2795: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2796: MatSeqAIJCUSPARSECopyToGPU(A);
2797: if (!trans) {
2798: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2799: if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2800: } else {
2801: if (herm || !A->form_explicit_transpose) {
2802: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2803: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2804: } else {
2805: if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);}
2806: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2807: }
2808: }
2809: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2810: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2812: try {
2813: VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2814: if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
2815: else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */
2817: PetscLogGpuTimeBegin();
2818: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2819: /* z = A x + beta y.
2820: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2821: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2822: */
2823: xptr = xarray;
2824: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2825: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2826: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2827: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2828: allocated to accommodate different uses. So we get the length info directly from mat.
2829: */
2830: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2831: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2832: nx = mat->num_cols;
2833: ny = mat->num_rows;
2834: }
2835: #endif
2836: } else {
2837: /* z = A^T x + beta y
2838: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2839: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2840: */
2841: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2842: dptr = zarray;
2843: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2844: if (compressed) { /* Scatter x to work vector */
2845: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2846: thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2847: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2848: VecCUDAEqualsReverse());
2849: }
2850: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2851: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2852: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2853: nx = mat->num_rows;
2854: ny = mat->num_cols;
2855: }
2856: #endif
2857: }
2859: /* csr_spmv does y = alpha op(A) x + beta y */
2860: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2861: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2862: if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2863: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2864: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2865: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2866: stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2867: matstruct->matDescr,
2868: matstruct->cuSpMV[opA].vecXDescr, beta,
2869: matstruct->cuSpMV[opA].vecYDescr,
2870: cusparse_scalartype,
2871: cusparsestruct->spmvAlg,
2872: &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2873: cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2875: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2876: } else {
2877: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2878: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2879: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2880: }
2882: stat = cusparseSpMV(cusparsestruct->handle, opA,
2883: matstruct->alpha_one,
2884: matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2885: matstruct->cuSpMV[opA].vecXDescr,
2886: beta,
2887: matstruct->cuSpMV[opA].vecYDescr,
2888: cusparse_scalartype,
2889: cusparsestruct->spmvAlg,
2890: matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2891: #else
2892: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2893: stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2894: mat->num_rows, mat->num_cols,
2895: mat->num_entries, matstruct->alpha_one, matstruct->descr,
2896: mat->values->data().get(), mat->row_offsets->data().get(),
2897: mat->column_indices->data().get(), xptr, beta,
2898: dptr);CHKERRCUSPARSE(stat);
2899: #endif
2900: } else {
2901: if (cusparsestruct->nrows) {
2902: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2903: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2904: #else
2905: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2906: stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2907: matstruct->alpha_one, matstruct->descr, hybMat,
2908: xptr, beta,
2909: dptr);CHKERRCUSPARSE(stat);
2910: #endif
2911: }
2912: }
2913: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2914: PetscLogGpuTimeEnd();
2916: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2917: if (yy) { /* MatMultAdd: zz = A*xx + yy */
2918: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2919: VecCopy_SeqCUDA(yy,zz); /* zz = yy */
2920: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2921: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2922: }
2923: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2924: VecSet_SeqCUDA(zz,0);
2925: }
2927: /* ScatterAdd the result from work vector into the full vector when A is compressed */
2928: if (compressed) {
2929: PetscLogGpuTimeBegin();
2930: /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
2931: and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One would have to store all events to
2932: prevent that. So I just use a ScatterAdd kernel instead.
2933: */
2934: #if 0
2935: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2936: thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2937: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2938: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2939: VecCUDAPlusEquals());
2940: #else
2941: PetscInt n = matstruct->cprowIndices->size();
2942: ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2943: #endif
2944: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2945: PetscLogGpuTimeEnd();
2946: }
2947: } else {
2948: if (yy && yy != zz) {
2949: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2950: }
2951: }
2952: VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
2953: if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
2954: else {VecCUDARestoreArrayWrite(zz,&zarray);}
2955: } catch(char *ex) {
2956: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2957: }
2958: if (yy) {
2959: PetscLogGpuFlops(2.0*a->nz);
2960: } else {
2961: PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
2962: }
2963: return(0);
2964: }
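/*
   Illustrative sketch: the kernel above backs the MatMult variants registered in
   MatBindToCPU_SeqAIJCUSPARSE() below; assuming A is an assembled MATSEQAIJCUSPARSE matrix
   (error checking omitted):

     Vec x,y;
     MatCreateVecs(A,&x,&y);   // VECCUDA vectors compatible with A
     VecSet(x,1.0);
     MatMult(A,x,y);           // y = A*x via cusparseSpMV on the GPU
     MatMultAdd(A,x,y,y);      // y = A*x + y
*/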
2966: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2967: {
2971: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
2972: return(0);
2973: }
2975: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2976: {
2977: PetscErrorCode ierr;
2978: PetscSplitCSRDataStructure *d_mat = NULL;
2980: if (A->factortype == MAT_FACTOR_NONE) {
2981: d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2982: }
2983: MatAssemblyEnd_SeqAIJ(A,mode); // this does very little if assembled on GPU - call it?
2984: if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
2985: if (d_mat) {
2986: A->offloadmask = PETSC_OFFLOAD_GPU;
2987: }
2989: return(0);
2990: }
2992: /* --------------------------------------------------------------------------------*/
2993: /*@
2994: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2995: (the default parallel PETSc format). This matrix will ultimately be pushed down
2996: to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2997: assembly performance the user should preallocate the matrix storage by setting
2998: the parameter nz (or the array nnz). By setting these parameters accurately,
2999: performance during matrix assembly can be increased by more than a factor of 50.
3001: Collective
3003: Input Parameters:
3004: + comm - MPI communicator, set to PETSC_COMM_SELF
3005: . m - number of rows
3006: . n - number of columns
3007: . nz - number of nonzeros per row (same for all rows)
3008: - nnz - array containing the number of nonzeros in the various rows
3009: (possibly different for each row) or NULL
3011: Output Parameter:
3012: . A - the matrix
3014: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3015: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3016: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3018: Notes:
3019: If nnz is given then nz is ignored
3021: The AIJ format (also called the Yale sparse matrix format or
3022: compressed row storage), is fully compatible with standard Fortran 77
3023: storage. That is, the stored row and column indices can begin at
3024: either one (as in Fortran) or zero. See the users' manual for details.
3026: Specify the preallocated storage with either nz or nnz (not both).
3027: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3028: allocation. For large problems you MUST preallocate memory or you
3029: will get TERRIBLE performance, see the users' manual chapter on matrices.
3031: By default, this format uses inodes (identical nodes) when possible, to
3032: improve numerical efficiency of matrix-vector products and solves. We
3033: search for consecutive rows with the same nonzero structure, thereby
3034: reusing matrix information to achieve increased efficiency.
3036: Level: intermediate
3038: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3039: @*/
3040: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3041: {
3045: MatCreate(comm,A);
3046: MatSetSizes(*A,m,n,m,n);
3047: MatSetType(*A,MATSEQAIJCUSPARSE);
3048: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
3049: return(0);
3050: }
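/*
   Illustrative sketch: preallocating and filling a small matrix with the routine above
   (arbitrary values, error checking omitted):

     Mat         A;
     PetscInt    i;
     PetscScalar v = 2.0;
     MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,4,4,1,NULL,&A);           // at most 1 nonzero per row
     for (i=0; i<4; i++) MatSetValues(A,1,&i,1,&i,&v,INSERT_VALUES);   // diagonal entries
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
*/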
3052: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3053: {
3054: PetscErrorCode ierr;
3055: PetscSplitCSRDataStructure *d_mat = NULL;
3058: if (A->factortype == MAT_FACTOR_NONE) {
3059: d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
3060: ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
3061: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
3062: } else {
3063: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
3064: }
3065: if (d_mat) {
3066: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3067: cudaError_t err;
3068: PetscSplitCSRDataStructure h_mat;
3069: PetscInfo(A,"Have device matrix\n");
3070: err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
3071: if (a->compressedrow.use) {
3072: err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
3073: }
3074: err = cudaFree(d_mat);CHKERRCUDA(err);
3075: }
3076: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3077: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
3078: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3079: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3080: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3081: PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
3082: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3083: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3084: MatDestroy_SeqAIJ(A);
3085: return(0);
3086: }
3088: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3089: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3090: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3091: {
3095: MatDuplicate_SeqAIJ(A,cpvalues,B);
3096: MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
3097: return(0);
3098: }
3100: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3101: {
3102: PetscErrorCode ierr;
3103: Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3104: Mat_SeqAIJCUSPARSE *cy;
3105: Mat_SeqAIJCUSPARSE *cx;
3106: PetscScalar *ay;
3107: const PetscScalar *ax;
3108: CsrMatrix *csry,*csrx;
3109: cudaError_t cerr;
3112: cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3113: cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3114: if (X->ops->axpy != Y->ops->axpy) {
3115: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3116: MatAXPY_SeqAIJ(Y,a,X,str);
3117: return(0);
3118: }
3119: /* if we are here, it means both matrices are bound to GPU */
3120: MatSeqAIJCUSPARSECopyToGPU(Y);
3121: MatSeqAIJCUSPARSECopyToGPU(X);
3122: if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3123: if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3124: csry = (CsrMatrix*)cy->mat->mat;
3125: csrx = (CsrMatrix*)cx->mat->mat;
3126: /* see if we can turn this into a cublas axpy */
3127: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3128: bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3129: if (eq) {
3130: eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3131: }
3132: if (eq) str = SAME_NONZERO_PATTERN;
3133: }
3134: /* spgeam is buggy with one column */
3135: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3137: if (str == SUBSET_NONZERO_PATTERN) {
3138: cusparseStatus_t stat;
3139: PetscScalar b = 1.0;
3140: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3141: size_t bufferSize;
3142: void *buffer;
3143: #endif
3145: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3146: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3147: stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3148: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3149: stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3150: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3151: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3152: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3153: cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3154: PetscLogGpuTimeBegin();
3155: stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3156: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3157: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3158: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3159: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3160: PetscLogGpuFlops(x->nz + y->nz);
3161: PetscLogGpuTimeEnd();
3162: cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3163: #else
3164: PetscLogGpuTimeBegin();
3165: stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3166: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3167: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3168: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3169: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3170: PetscLogGpuFlops(x->nz + y->nz);
3171: PetscLogGpuTimeEnd();
3172: #endif
3173: stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3174: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3175: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3176: MatSeqAIJInvalidateDiagonal(Y);
3177: } else if (str == SAME_NONZERO_PATTERN) {
3178: cublasHandle_t cublasv2handle;
3179: cublasStatus_t berr;
3180: PetscBLASInt one = 1, bnz = 1;
3182: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3183: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3184: PetscCUBLASGetHandle(&cublasv2handle);
3185: PetscBLASIntCast(x->nz,&bnz);
3186: PetscLogGpuTimeBegin();
3187: berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3188: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3189: PetscLogGpuFlops(2.0*bnz);
3190: PetscLogGpuTimeEnd();
3191: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3192: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3193: MatSeqAIJInvalidateDiagonal(Y);
3194: } else {
3195: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3196: MatAXPY_SeqAIJ(Y,a,X,str);
3197: }
3198: return(0);
3199: }
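/*
   Illustrative sketch: the routine above computes Y = Y + a*X entirely on the GPU; with
   SAME_NONZERO_PATTERN it reduces to a single cublasXaxpy() on the value arrays, otherwise it
   uses csrgeam or falls back to the CPU kernel. Assuming X and Y share the sparsity pattern:

     MatAXPY(Y,2.0,X,SAME_NONZERO_PATTERN);  // Y <- Y + 2*X, executed with cuBLAS
*/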
3201: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3202: {
3204: Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data;
3205: PetscScalar *ay;
3206: cudaError_t cerr;
3207: cublasHandle_t cublasv2handle;
3208: cublasStatus_t berr;
3209: PetscBLASInt one = 1, bnz = 1;
3212: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3213: PetscCUBLASGetHandle(&cublasv2handle);
3214: PetscBLASIntCast(y->nz,&bnz);
3215: PetscLogGpuTimeBegin();
3216: berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3217: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3218: PetscLogGpuFlops(bnz);
3219: PetscLogGpuTimeEnd();
3220: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3221: MatSeqAIJInvalidateDiagonal(Y);
3222: return(0);
3223: }
3225: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3226: {
3227: PetscErrorCode ierr;
3228: PetscBool both = PETSC_FALSE;
3229: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3232: if (A->factortype == MAT_FACTOR_NONE) {
3233: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3234: if (spptr->mat) {
3235: CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3236: if (matrix->values) {
3237: both = PETSC_TRUE;
3238: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3239: }
3240: }
3241: if (spptr->matTranspose) {
3242: CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3243: if (matrix->values) {
3244: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3245: }
3246: }
3247: }
3248: //MatZeroEntries_SeqAIJ(A);
3249: PetscArrayzero(a->a,a->i[A->rmap->n]);
3250: MatSeqAIJInvalidateDiagonal(A);
3251: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3252: else A->offloadmask = PETSC_OFFLOAD_CPU;
3254: return(0);
3255: }
3257: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3258: {
3259: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3263: if (A->factortype != MAT_FACTOR_NONE) return(0);
3264: if (flg) {
3265: MatSeqAIJCUSPARSECopyFromGPU(A);
3267: A->ops->scale = MatScale_SeqAIJ;
3268: A->ops->axpy = MatAXPY_SeqAIJ;
3269: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3270: A->ops->mult = MatMult_SeqAIJ;
3271: A->ops->multadd = MatMultAdd_SeqAIJ;
3272: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3273: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3274: A->ops->multhermitiantranspose = NULL;
3275: A->ops->multhermitiantransposeadd = NULL;
3276: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3277: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3278: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3279: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3280: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3281: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3282: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
3283: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3284: } else {
3285: A->ops->scale = MatScale_SeqAIJCUSPARSE;
3286: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
3287: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
3288: A->ops->mult = MatMult_SeqAIJCUSPARSE;
3289: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
3290: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
3291: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
3292: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3293: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3294: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
3295: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3296: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3297: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3298: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
3299: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
3300: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);
3301: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3302: }
3303: A->boundtocpu = flg;
3304: a->inode.use = flg;
3305: return(0);
3306: }
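/*
   Illustrative sketch: MatBindToCPU() toggles the operation table set up above, switching between
   the CUSPARSE kernels and the plain SeqAIJ host kernels:

     MatBindToCPU(A,PETSC_TRUE);   // subsequent MatMult/MatAXPY/... run on the CPU
     MatBindToCPU(A,PETSC_FALSE);  // restore the GPU (CUSPARSE) implementations
*/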
3308: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3309: {
3310: PetscErrorCode ierr;
3311: cusparseStatus_t stat;
3312: Mat B;
3315: PetscCUDAInitializeCheck(); /* first use of CUSPARSE may be via MatConvert */
3316: if (reuse == MAT_INITIAL_MATRIX) {
3317: MatDuplicate(A,MAT_COPY_VALUES,newmat);
3318: } else if (reuse == MAT_REUSE_MATRIX) {
3319: MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
3320: }
3321: B = *newmat;
3323: PetscFree(B->defaultvectype);
3324: PetscStrallocpy(VECCUDA,&B->defaultvectype);
3326: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3327: if (B->factortype == MAT_FACTOR_NONE) {
3328: Mat_SeqAIJCUSPARSE *spptr;
3329: PetscNew(&spptr);
3330: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3331: stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3332: spptr->format = MAT_CUSPARSE_CSR;
3333: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3334: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3335: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3336: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3337: #endif
3338: B->spptr = spptr;
3339: } else {
3340: Mat_SeqAIJCUSPARSETriFactors *spptr;
3342: PetscNew(&spptr);
3343: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3344: stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3345: B->spptr = spptr;
3346: }
3347: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3348: }
3349: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
3350: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
3351: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
3352: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3353: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
3354: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
3356: MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
3357: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
3358: PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3359: return(0);
3360: }
3362: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3363: {
3367: MatCreate_SeqAIJ(B);
3368: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
3369: return(0);
3370: }
3372: /*MC
3373: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3375: A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3376: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are no longer supported since CUDA 11.
3377: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3379: Options Database Keys:
3380: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3381: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3382: - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3384: Level: beginner
3386: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3387: M*/
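/*
   Illustrative sketch: selecting this type at run time through the options database, as described
   in the manual page above (error checking omitted):

     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,m,n,m,n);
     MatSetFromOptions(A);   // with -mat_type aijcusparse this yields MATSEQAIJCUSPARSE

   run with, e.g.,  ./app -mat_type aijcusparse -mat_cusparse_storage_format csr
*/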
3389: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3391: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3392: {
3396: MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);
3397: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
3398: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
3399: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
3400: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
3402: return(0);
3403: }
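/*
   Illustrative sketch: the registrations above make the CUSPARSE factorizations selectable from
   the options database when used with a preconditioner, e.g.

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   The MATSOLVERCUSPARSEBAND entry provides the banded LU variant registered for MATSEQAIJ matrices.
*/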
3405: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3406: {
3407: PetscErrorCode ierr;
3408: cusparseStatus_t stat;
3411: if (*cusparsestruct) {
3412: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
3413: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
3414: delete (*cusparsestruct)->workVector;
3415: delete (*cusparsestruct)->rowoffsets_gpu;
3416: delete (*cusparsestruct)->cooPerm;
3417: delete (*cusparsestruct)->cooPerm_a;
3418: delete (*cusparsestruct)->csr2csc_i;
3419: if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3420: PetscFree(*cusparsestruct);
3421: }
3422: return(0);
3423: }
3425: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3426: {
3428: if (*mat) {
3429: delete (*mat)->values;
3430: delete (*mat)->column_indices;
3431: delete (*mat)->row_offsets;
3432: delete *mat;
3433: *mat = 0;
3434: }
3435: return(0);
3436: }
3438: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3439: {
3440: cusparseStatus_t stat;
3441: PetscErrorCode ierr;
3444: if (*trifactor) {
3445: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3446: if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3447: CsrMatrix_Destroy(&(*trifactor)->csrMat);
3448: if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3449: if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3450: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3451: if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3452: #endif
3453: PetscFree(*trifactor);
3454: }
3455: return(0);
3456: }
3458: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3459: {
3460: CsrMatrix *mat;
3461: cusparseStatus_t stat;
3462: cudaError_t err;
3465: if (*matstruct) {
3466: if ((*matstruct)->mat) {
3467: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3468: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3469: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3470: #else
3471: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3472: stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3473: #endif
3474: } else {
3475: mat = (CsrMatrix*)(*matstruct)->mat;
3476: CsrMatrix_Destroy(&mat);
3477: }
3478: }
3479: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3480: delete (*matstruct)->cprowIndices;
3481: if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3482: if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3483: if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3485: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3486: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3487: if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3488: for (int i=0; i<3; i++) {
3489: if (mdata->cuSpMV[i].initialized) {
3490: err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3491: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3492: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3493: }
3494: }
3495: #endif
3496: delete *matstruct;
3497: *matstruct = NULL;
3498: }
3499: return(0);
3500: }
3502: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3503: {
3507: if (*trifactors) {
3508: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
3509: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
3510: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
3511: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
3512: delete (*trifactors)->rpermIndices;
3513: delete (*trifactors)->cpermIndices;
3514: delete (*trifactors)->workVector;
3515: (*trifactors)->rpermIndices = NULL;
3516: (*trifactors)->cpermIndices = NULL;
3517: (*trifactors)->workVector = NULL;
3518: if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3519: if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3520: }
3521: return(0);
3522: }
3524: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3525: {
3526: PetscErrorCode ierr;
3527: cusparseHandle_t handle;
3528: cusparseStatus_t stat;
3531: if (*trifactors) {
3532: MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3533: if (handle = (*trifactors)->handle) {
3534: stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3535: }
3536: PetscFree(*trifactors);
3537: }
3538: return(0);
3539: }
3541: struct IJCompare
3542: {
3543: __host__ __device__
3544: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3545: {
3546: if (t1.get<0>() < t2.get<0>()) return true;
3547: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3548: return false;
3549: }
3550: };
3552: struct IJEqual
3553: {
3554: __host__ __device__
3555: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3556: {
3557: if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3558: return true;
3559: }
3560: };
3562: struct IJDiff
3563: {
3564: __host__ __device__
3565: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3566: {
3567: return t1 == t2 ? 0 : 1;
3568: }
3569: };
3571: struct IJSum
3572: {
3573: __host__ __device__
3574: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3575: {
3576: return t1||t2;
3577: }
3578: };
3580: #include <thrust/iterator/discard_iterator.h>
3581: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3582: {
3583: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3584: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3585: THRUSTARRAY *cooPerm_v = NULL;
3586: thrust::device_ptr<const PetscScalar> d_v;
3587: CsrMatrix *matrix;
3588: PetscErrorCode ierr;
3589: cudaError_t cerr;
3590: PetscInt n;
3593: if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3594: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3595: if (!cusp->cooPerm) {
3596: MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
3597: MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
3598: return(0);
3599: }
3600: matrix = (CsrMatrix*)cusp->mat->mat;
3601: if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3602: if (!v) {
3603: if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3604: goto finalize;
3605: }
3606: n = cusp->cooPerm->size();
3607: if (isCudaMem(v)) {
3608: d_v = thrust::device_pointer_cast(v);
3609: } else {
3610: cooPerm_v = new THRUSTARRAY(n);
3611: cooPerm_v->assign(v,v+n);
3612: d_v = cooPerm_v->data();
3613: PetscLogCpuToGpu(n*sizeof(PetscScalar));
3614: }
3615: PetscLogGpuTimeBegin();
3616: if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3617: if (cusp->cooPerm_a) {
3618: THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3619: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3620: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3621: thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3622: delete cooPerm_w;
3623: } else {
3624: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3625: matrix->values->begin()));
3626: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3627: matrix->values->end()));
3628: thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
3629: }
3630: } else {
3631: if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3632: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3633: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3634: } else {
3635: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3636: matrix->values->begin()));
3637: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3638: matrix->values->end()));
3639: thrust::for_each(zibit,zieit,VecCUDAEquals());
3640: }
3641: }
3642: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3643: PetscLogGpuTimeEnd();
3644: finalize:
3645: delete cooPerm_v;
3646: A->offloadmask = PETSC_OFFLOAD_GPU;
3647: PetscObjectStateIncrease((PetscObject)A);
3648: /* shorter version of MatAssemblyEnd_SeqAIJ */
3649: PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);
3650: PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");
3651: PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);
3652: a->reallocs = 0;
3653: A->info.mallocs += 0;
3654: A->info.nz_unneeded = 0;
3655: A->assembled = A->was_assembled = PETSC_TRUE;
3656: A->num_ass++;
3657: return(0);
3658: }
3660: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3661: {
3662: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3663: PetscErrorCode ierr;
3667: if (!cusp) return(0);
3668: if (destroy) {
3669: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3670: delete cusp->csr2csc_i;
3671: cusp->csr2csc_i = NULL;
3672: }
3673: A->transupdated = PETSC_FALSE;
3674: return(0);
3675: }
3677: #include <thrust/binary_search.h>
3678: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3679: {
3680: PetscErrorCode ierr;
3681: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3682: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3683: PetscInt cooPerm_n, nzr = 0;
3684: cudaError_t cerr;
3687: PetscLayoutSetUp(A->rmap);
3688: PetscLayoutSetUp(A->cmap);
3689: cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3690: if (n != cooPerm_n) {
3691: delete cusp->cooPerm;
3692: delete cusp->cooPerm_a;
3693: cusp->cooPerm = NULL;
3694: cusp->cooPerm_a = NULL;
3695: }
3696: if (n) {
3697: THRUSTINTARRAY d_i(n);
3698: THRUSTINTARRAY d_j(n);
3699: THRUSTINTARRAY ii(A->rmap->n);
3701: if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
3702: if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
3704: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
3705: d_i.assign(coo_i,coo_i+n);
3706: d_j.assign(coo_j,coo_j+n);
3707: auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3708: auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
3710: PetscLogGpuTimeBegin();
3711: thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3712: thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
3713: *cusp->cooPerm_a = d_i;
3714: THRUSTINTARRAY w = d_j;
3716: auto nekey = thrust::unique(fkey, ekey, IJEqual());
3717: if (nekey == ekey) { /* all entries are unique */
3718: delete cusp->cooPerm_a;
3719: cusp->cooPerm_a = NULL;
3720: } else { /* I couldn't come up with a more elegant algorithm */
3721: adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
3722: adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
3723: (*cusp->cooPerm_a)[0] = 0;
3724: w[0] = 0;
3725: thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
3726: thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
3727: }
3728: thrust::counting_iterator<PetscInt> search_begin(0);
3729: thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
3730: search_begin, search_begin + A->rmap->n,
3731: ii.begin());
3732: cerr = WaitForCUDA();CHKERRCUDA(cerr);
3733: PetscLogGpuTimeEnd();
3735: MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3736: a->singlemalloc = PETSC_FALSE;
3737: a->free_a = PETSC_TRUE;
3738: a->free_ij = PETSC_TRUE;
3739: PetscMalloc1(A->rmap->n+1,&a->i);
3740: a->i[0] = 0;
3741: cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3742: a->nz = a->maxnz = a->i[A->rmap->n];
3743: a->rmax = 0;
3744: PetscMalloc1(a->nz,&a->a);
3745: PetscMalloc1(a->nz,&a->j);
3746: cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3747: if (!a->ilen) { PetscMalloc1(A->rmap->n,&a->ilen); }
3748: if (!a->imax) { PetscMalloc1(A->rmap->n,&a->imax); }
3749: for (PetscInt i = 0; i < A->rmap->n; i++) {
3750: const PetscInt nnzr = a->i[i+1] - a->i[i];
3751: nzr += (PetscInt)!!(nnzr);
3752: a->ilen[i] = a->imax[i] = nnzr;
3753: a->rmax = PetscMax(a->rmax,nnzr);
3754: }
3755: a->nonzerorowcnt = nzr;
3756: A->preallocated = PETSC_TRUE;
3757: PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3758: MatMarkDiagonal_SeqAIJ(A);
3759: } else {
3760: MatSeqAIJSetPreallocation(A,0,NULL);
3761: }
3762: MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);
3764: /* We want to allocate the CUSPARSE struct for matvec now.
3765: The code is so convoluted now that I prefer to copy zeros */
3766: PetscArrayzero(a->a,a->nz);
3767: MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3768: A->offloadmask = PETSC_OFFLOAD_CPU;
3769: A->nonzerostate++;
3770: MatSeqAIJCUSPARSECopyToGPU(A);
3771: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
3773: A->assembled = PETSC_FALSE;
3774: A->was_assembled = PETSC_FALSE;
3775: return(0);
3776: }
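/*
   Illustrative sketch: the COO assembly path above is driven through the public
   MatSetPreallocationCOO()/MatSetValuesCOO() interface; for a 2x2 matrix with entries at
   (0,0), (0,1) and (1,1) (error checking omitted):

     PetscInt    coo_i[] = {0,0,1};
     PetscInt    coo_j[] = {0,1,1};
     PetscScalar v[]     = {1.0,2.0,3.0};
     MatSetPreallocationCOO(A,3,coo_i,coo_j);  // builds the CSR pattern and the cooPerm map
     MatSetValuesCOO(A,v,INSERT_VALUES);       // scatters/reduces the values directly on the GPU
*/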
3778: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
3779: {
3780: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3781: CsrMatrix *csr;
3782: PetscErrorCode ierr;
3788: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3789: MatSeqAIJCUSPARSECopyToGPU(A);
3790: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3791: csr = (CsrMatrix*)cusp->mat->mat;
3792: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3793: *a = csr->values->data().get();
3794: return(0);
3795: }
3797: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
3798: {
3803: *a = NULL;
3804: return(0);
3805: }
3807: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
3808: {
3809: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3810: CsrMatrix *csr;
3811: PetscErrorCode ierr;
3817: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3818: MatSeqAIJCUSPARSECopyToGPU(A);
3819: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3820: csr = (CsrMatrix*)cusp->mat->mat;
3821: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3822: *a = csr->values->data().get();
3823: A->offloadmask = PETSC_OFFLOAD_GPU;
3824: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
3825: return(0);
3826: }
3828: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
3829: {
3836: PetscObjectStateIncrease((PetscObject)A);
3837: *a = NULL;
3838: return(0);
3839: }
3841: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
3842: {
3843: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3844: CsrMatrix *csr;
3845: PetscErrorCode ierr;
3851: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3852: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3853: csr = (CsrMatrix*)cusp->mat->mat;
3854: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3855: *a = csr->values->data().get();
3856: A->offloadmask = PETSC_OFFLOAD_GPU;
3857: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
3858: return(0);
3859: }
3861: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
3862: {
3869: PetscObjectStateIncrease((PetscObject)A);
3870: *a = NULL;
3871: return(0);
3872: }
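/* Comparison functor for (row, col, value, origin-flag) tuples: orders entries first by row,
   then by column, so that thrust::merge below emits the concatenated entries of A and B in
   CSR (row-major) order. */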
3874: struct IJCompare4
3875: {
3876: __host__ __device__
3877: inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3878: {
3879: if (t1.get<0>() < t2.get<0>()) return true;
3880: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3881: return false;
3882: }
3883: };
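/* Functor adding a constant offset to an index; used below to shift B's column indices by
   A->cmap->n (and B's transposed row offsets by a->nz) when concatenating [A B]. */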
3885: struct Shift
3886: {
3887: int _shift;
3889: Shift(int shift) : _shift(shift) {}
3890: __host__ __device__
3891: inline int operator() (const int &c)
3892: {
3893: return c + _shift;
3894: }
3895: };
3897: /* merges two SeqAIJCUSPARSE matrices into C = [A B], i.e. the [A';B']' operation in MATLAB notation */
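/* Sketch of the MAT_INITIAL_MATRIX path: expand A and B from CSR to COO row indices with
   cusparseXcsr2coo, shift B's column indices by A->cmap->n, and merge the two entry streams
   on the device with thrust::merge using IJCompare4. An origin flag (1 for A, 0 for B)
   travels through the merge, and the resulting placement of A's and B's entries inside C is
   recorded in Ccusp->cooPerm so that a MAT_REUSE_MATRIX call only has to scatter the new
   values of A and B into C's value array. */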
3898: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3899: {
3900: PetscErrorCode ierr;
3901: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3902: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3903: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3904: CsrMatrix *Acsr,*Bcsr,*Ccsr;
3905: PetscInt Annz,Bnnz;
3906: cusparseStatus_t stat;
3907: PetscInt i,m,n,zero = 0;
3908: cudaError_t cerr;
3916: if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
3917: if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3918: if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3919: if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3920: if (reuse == MAT_INITIAL_MATRIX) {
3921: m = A->rmap->n;
3922: n = A->cmap->n + B->cmap->n;
3923: MatCreate(PETSC_COMM_SELF,C);
3924: MatSetSizes(*C,m,n,m,n);
3925: MatSetType(*C,MATSEQAIJCUSPARSE);
3926: c = (Mat_SeqAIJ*)(*C)->data;
3927: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3928: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3929: Ccsr = new CsrMatrix;
3930: Cmat->cprowIndices = NULL;
3931: c->compressedrow.use = PETSC_FALSE;
3932: c->compressedrow.nrows = 0;
3933: c->compressedrow.i = NULL;
3934: c->compressedrow.rindex = NULL;
3935: Ccusp->workVector = NULL;
3936: Ccusp->nrows = m;
3937: Ccusp->mat = Cmat;
3938: Ccusp->mat->mat = Ccsr;
3939: Ccsr->num_rows = m;
3940: Ccsr->num_cols = n;
3941: stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3942: stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3943: stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3944: cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3945: cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3946: cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3947: cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3948: cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3949: cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3950: MatSeqAIJCUSPARSECopyToGPU(A);
3951: MatSeqAIJCUSPARSECopyToGPU(B);
3952: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
3953: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
3954: if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3955: if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3957: Acsr = (CsrMatrix*)Acusp->mat->mat;
3958: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3959: Annz = (PetscInt)Acsr->column_indices->size();
3960: Bnnz = (PetscInt)Bcsr->column_indices->size();
3961: c->nz = Annz + Bnnz;
3962: Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3963: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3964: Ccsr->values = new THRUSTARRAY(c->nz);
3965: Ccsr->num_entries = c->nz;
3966: Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3967: if (c->nz) {
3968: auto Acoo = new THRUSTINTARRAY32(Annz);
3969: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3970: auto Ccoo = new THRUSTINTARRAY32(c->nz);
3971: THRUSTINTARRAY32 *Aroff,*Broff;
3973: if (a->compressedrow.use) { /* need full row offset */
3974: if (!Acusp->rowoffsets_gpu) {
3975: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
3976: Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3977: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
3978: }
3979: Aroff = Acusp->rowoffsets_gpu;
3980: } else Aroff = Acsr->row_offsets;
3981: if (b->compressedrow.use) { /* need full row offset */
3982: if (!Bcusp->rowoffsets_gpu) {
3983: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3984: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3985: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
3986: }
3987: Broff = Bcusp->rowoffsets_gpu;
3988: } else Broff = Bcsr->row_offsets;
3989: PetscLogGpuTimeBegin();
3990: stat = cusparseXcsr2coo(Acusp->handle,
3991: Aroff->data().get(),
3992: Annz,
3993: m,
3994: Acoo->data().get(),
3995: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3996: stat = cusparseXcsr2coo(Bcusp->handle,
3997: Broff->data().get(),
3998: Bnnz,
3999: m,
4000: Bcoo->data().get(),
4001: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4002: /* Issues when using bool with large matrices on SUMMIT (CUDA 10.2.89) */
4003: auto Aperm = thrust::make_constant_iterator(1);
4004: auto Bperm = thrust::make_constant_iterator(0);
4005: #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4006: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4007: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4008: #else
4009: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4010: auto Bcib = Bcsr->column_indices->begin();
4011: auto Bcie = Bcsr->column_indices->end();
4012: thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4013: #endif
4014: auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4015: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4016: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4017: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4018: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4019: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4020: auto p1 = Ccusp->cooPerm->begin();
4021: auto p2 = Ccusp->cooPerm->begin();
4022: thrust::advance(p2,Annz);
4023: PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4024: #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4025: thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4026: #endif
4027: auto cci = thrust::make_counting_iterator(zero);
4028: auto cce = thrust::make_counting_iterator(c->nz);
4029: #if 0 //Errors on SUMMIT cuda 11.1.0
4030: PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4031: #else
4032: auto pred = thrust::identity<int>();
4033: PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4034: PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4035: #endif
4036: stat = cusparseXcoo2csr(Ccusp->handle,
4037: Ccoo->data().get(),
4038: c->nz,
4039: m,
4040: Ccsr->row_offsets->data().get(),
4041: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4042: cerr = WaitForCUDA();CHKERRCUDA(cerr);
4043: PetscLogGpuTimeEnd();
4044: delete wPerm;
4045: delete Acoo;
4046: delete Bcoo;
4047: delete Ccoo;
4048: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4049: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4050: Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4051: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4052: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4053: #endif
4054: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have their transposes formed, generate the transpose of C too */
4055: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4056: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4057: CsrMatrix *CcsrT = new CsrMatrix;
4058: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4059: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4061: (*C)->form_explicit_transpose = PETSC_TRUE;
4062: (*C)->transupdated = PETSC_TRUE;
4063: Ccusp->rowoffsets_gpu = NULL;
4064: CmatT->cprowIndices = NULL;
4065: CmatT->mat = CcsrT;
4066: CcsrT->num_rows = n;
4067: CcsrT->num_cols = m;
4068: CcsrT->num_entries = c->nz;
4070: CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4071: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4072: CcsrT->values = new THRUSTARRAY(c->nz);
4074: PetscLogGpuTimeBegin();
4075: auto rT = CcsrT->row_offsets->begin();
4076: if (AT) {
4077: rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4078: thrust::advance(rT,-1);
4079: }
4080: if (BT) {
4081: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4082: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4083: thrust::copy(titb,tite,rT);
4084: }
4085: auto cT = CcsrT->column_indices->begin();
4086: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4087: if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4088: auto vT = CcsrT->values->begin();
4089: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4090: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4091: cerr = WaitForCUDA();CHKERRCUDA(cerr);
4092: PetscLogGpuTimeEnd();
4094: stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4095: stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4096: stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4097: cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4098: cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4099: cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4100: cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4101: cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4102: cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4103: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4104: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4105: CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4106: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4107: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4108: #endif
4109: Ccusp->matTranspose = CmatT;
4110: }
4111: }
4113: c->singlemalloc = PETSC_FALSE;
4114: c->free_a = PETSC_TRUE;
4115: c->free_ij = PETSC_TRUE;
4116: PetscMalloc1(m+1,&c->i);
4117: PetscMalloc1(c->nz,&c->j);
4118: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4119: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4120: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4121: ii = *Ccsr->row_offsets;
4122: jj = *Ccsr->column_indices;
4123: cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4124: cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4125: } else {
4126: cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4127: cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4128: }
4129: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
4130: PetscMalloc1(m,&c->ilen);
4131: PetscMalloc1(m,&c->imax);
4132: c->maxnz = c->nz;
4133: c->nonzerorowcnt = 0;
4134: c->rmax = 0;
4135: for (i = 0; i < m; i++) {
4136: const PetscInt nn = c->i[i+1] - c->i[i];
4137: c->ilen[i] = c->imax[i] = nn;
4138: c->nonzerorowcnt += (PetscInt)!!nn;
4139: c->rmax = PetscMax(c->rmax,nn);
4140: }
4141: MatMarkDiagonal_SeqAIJ(*C);
4142: PetscMalloc1(c->nz,&c->a);
4143: (*C)->nonzerostate++;
4144: PetscLayoutSetUp((*C)->rmap);
4145: PetscLayoutSetUp((*C)->cmap);
4146: Ccusp->nonzerostate = (*C)->nonzerostate;
4147: (*C)->preallocated = PETSC_TRUE;
4148: } else {
4149: if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4150: c = (Mat_SeqAIJ*)(*C)->data;
4151: if (c->nz) {
4152: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4153: if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4154: if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4155: if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4156: MatSeqAIJCUSPARSECopyToGPU(A);
4157: MatSeqAIJCUSPARSECopyToGPU(B);
4158: if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4159: if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4160: Acsr = (CsrMatrix*)Acusp->mat->mat;
4161: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4162: Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4163: if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4164: if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4165: if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4166: if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4167: if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4168: auto pmid = Ccusp->cooPerm->begin();
4169: thrust::advance(pmid,Acsr->num_entries);
4170: PetscLogGpuTimeBegin();
4171: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4172: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4173: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4174: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4175: thrust::for_each(zibait,zieait,VecCUDAEquals());
4176: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4177: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4178: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4179: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4180: thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4181: MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);
4182: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4183: if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4184: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4185: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4186: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4187: CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4188: auto vT = CcsrT->values->begin();
4189: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4190: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4191: (*C)->transupdated = PETSC_TRUE;
4192: }
4193: cerr = WaitForCUDA();CHKERRCUDA(cerr);
4194: PetscLogGpuTimeEnd();
4195: }
4196: }
4197: PetscObjectStateIncrease((PetscObject)*C);
4198: (*C)->assembled = PETSC_TRUE;
4199: (*C)->was_assembled = PETSC_FALSE;
4200: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4201: return(0);
4202: }
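/* Gathers selected entries of the device CSR value array of A into v: with idx != NULL the
   values at positions idx[0..n) of the nonzero array are collected (through a device work
   array and a final cudaMemcpy when v is a host pointer), otherwise the first n values are
   copied directly. */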
4204: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4205: {
4206: PetscErrorCode ierr;
4207: bool dmem;
4208: const PetscScalar *av;
4209: cudaError_t cerr;
4212: dmem = isCudaMem(v);
4213: MatSeqAIJCUSPARSEGetArrayRead(A,&av);
4214: if (n && idx) {
4215: THRUSTINTARRAY widx(n);
4216: widx.assign(idx,idx+n);
4217: PetscLogCpuToGpu(n*sizeof(PetscInt));
4219: THRUSTARRAY *w = NULL;
4220: thrust::device_ptr<PetscScalar> dv;
4221: if (dmem) {
4222: dv = thrust::device_pointer_cast(v);
4223: } else {
4224: w = new THRUSTARRAY(n);
4225: dv = w->data();
4226: }
4227: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4229: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4230: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4231: thrust::for_each(zibit,zieit,VecCUDAEquals());
4232: if (w) {
4233: cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4234: }
4235: delete w;
4236: } else {
4237: cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4238: }
4239: if (!dmem) { PetscLogCpuToGpu(n*sizeof(PetscScalar)); }
4240: MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
4241: return(0);
4242: }
4244: /*
4245: LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)
4247:    requires:
4248:      structurally symmetric matrices (could be relaxed with transpose/column metadata)
4249: */
4251: /*
4252:    The GPU band LU factorization kernels
4253: */
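/*
   Band storage used by the factorization: row r keeps its entries for columns
   max(0,r-bw) .. min(n-1,r+bw), i.e. 1 + min(r,bw) + min(n-1-r,bw) values, stored
   contiguously; bi_csr[] holds the running prefix sum of these row lengths (bi_csr[0] = 0).
   The closed form below (n1L + nug - clip + n2L + i) evaluates exactly this prefix sum,
   with the "ears" at the top-left and bottom-right corners of the band clipped off.
*/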
4254: __global__
4255: void __launch_bounds__(1024,1)
4256: mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
4257: {
4258: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4259: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
4260: const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
4262: // set i (row+1)
4263: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
4264: // for (int rowb = start_i + blkIdx*blockDim.y + threadIdx.y; rowb < end_i; rowb += Nblk*blockDim.y) { // rows in block
4265: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4266: if (rowb < end_i && threadIdx.x==0) {
4267: PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0;
4268: bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
4269: }
4270: }
4271: }
4272: // copy AIJ to AIJ_BAND
4273: __global__
4274: void __launch_bounds__(1024,1)
4275: mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
4276: const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
4277: const int bi_csr[], PetscScalar ba_csr[])
4278: {
4279: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4280: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
4281: const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
4283: // zero B
4284: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
4285: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4286: if (rowb < end_i) {
4287: PetscScalar *batmp = ba_csr + bi_csr[rowb];
4288: const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
4289: for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
4290: if (j<nzb) {
4291: batmp[j] = 0;
4292: }
4293: }
4294: }
4295: }
4297: // copy A into B with CSR format -- these two loops can be fused
4298: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
4299: if (rowb < end_i) {
4300: const PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
4301: const int *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
4302: const PetscScalar *av = aa_d + ai_d[rowa];
4303: PetscScalar *batmp = ba_csr + bi_csr[rowb];
4304: /* load in initial (unfactored row) */
4305: for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
4306: if (j<nza) {
4307: PetscInt colb = ic[ajtmp[j]], idx = colb - bjStart;
4308: PetscScalar vala = av[j];
4309: batmp[idx] = vala;
4310: }
4311: }
4312: }
4313: }
4314: }
4315: // print AIJ_BAND
4316: __global__
4317: void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
4318: {
4319: // debug
4320: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0){
4321: printf("B (AIJ) n=%d:\n",(int)n);
4322: for (int rowb=0;rowb<n;rowb++) {
4323: const PetscInt nz = bi_csr[rowb+1] - bi_csr[rowb];
4324: const PetscScalar *batmp = ba_csr + bi_csr[rowb];
4325: for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
4326: printf(" bi=%d\n",bi_csr[rowb+1]);
4327: }
4328: }
4329: }
4330: // Band LU kernel --- ba_csr bi_csr
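// Right-looking LU without pivoting on the band: for each diagonal glbDD the column below it
// is scaled, L(i,glbDD) = A(i,glbDD)/A(glbDD,glbDD), and the trailing band is updated with
// A(i,j) -= L(i,glbDD)*U(glbDD,j). Elimination steps are separated by a grid-wide sync
// (cooperative groups on CUDA >= 11, __syncthreads when each field uses a single block).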
4331: __global__
4332: void __launch_bounds__(1024,1)
4333: mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
4334: {
4335: extern __shared__ PetscInt smemInt[];
4336: PetscInt *sm_pkIdx = &smemInt[0];
4337: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
4338: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
4339: const PetscInt start = field*nloc, end = start + nloc;
4340: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4341: auto g = cooperative_groups::this_grid();
4342: #endif
4343: // A22 panel update for each row A(1,:) and col A(:,1)
4344: for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4345: PetscInt tnzUd = bw, maxU = end-1 - glbDD; // we are chopping off the inter ears
4346: const PetscInt nzUd = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
4347: const PetscInt nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y));
4348: PetscScalar *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
4349: const PetscScalar *baUd = pBdd + 1; // vector of data U(i,i+1:end)
4350: const PetscScalar Bdd = *pBdd;
4351: const PetscInt offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
4352: for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd_pad ; idx += inc, myi += inc) { /* assuming symmetric structure */
4353: if (idx < nzUd && threadIdx.x==0) { /* assuming symmetric structure */
4354: const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
4355: PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
4356: *Aid = *Aid/Bdd;
4357: sm_pkIdx[threadIdx.y] = kIdx;
4358: }
4359: __syncthreads(); // synch on threadIdx.x only
4360: if (idx < nzUd) { /* assuming symmetric structure */
4361: PetscInt kIdx = sm_pkIdx[threadIdx.y];
4362: PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
4363: PetscScalar *Aij = Aid + 1;
4364: PetscScalar Lid = *Aid;
4365: for (int jIdx=threadIdx.x ; jIdx<nzUd ; jIdx += blockDim.x) {
4366: if (jIdx<nzUd) {
4367: Aij[jIdx] -= Lid*baUd[jIdx];
4368: }
4369: }
4370: }
4371: }
4372: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4373: g.sync();
4374: #else
4375: __syncthreads();
4376: #endif
4377: } /* endof for (i=0; i<n; i++) { */
4378: }
4380: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
4381: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
4382: {
4383: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
4384: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4385: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
4386: Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
4387: Mat_SeqAIJCUSPARSEMultStruct *matstructA;
4388: CsrMatrix *matrixA;
4389: PetscErrorCode ierr;
4390: cudaError_t cerr;
4391: const PetscInt n=A->rmap->n, *ic, *r;
4392: const int *ai_d, *aj_d;
4393: const PetscScalar *aa_d;
4394: PetscScalar *ba_t = cusparseTriFactors->a_band_d;
4395: int *bi_t = cusparseTriFactors->i_band_d;
4396: PetscContainer container;
4397: int Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;
4400: if (A->rmap->n == 0) {
4401: return(0);
4402: }
4403: // cusparse setup
4404: if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
4405: matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; // matstruct->cprowIndices
4406: if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
4407: matrixA = (CsrMatrix*)matstructA->mat;
4408: if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");
4410: // factor: get Nf if available
4411: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4412: if (container) {
4413: PetscInt *pNf=NULL;
4414: PetscContainerGetPointer(container, (void **) &pNf);
4415: Nf = (*pNf)%1000;
4416: if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
4417: } else Nf = 1;
4418: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4420: // get data
4421: ic = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
4422: ai_d = thrust::raw_pointer_cast(matrixA->row_offsets->data());
4423: aj_d = thrust::raw_pointer_cast(matrixA->column_indices->data());
4424: aa_d = thrust::raw_pointer_cast(matrixA->values->data());
4425: r = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());
4427: cerr = WaitForCUDA();CHKERRCUDA(cerr);
4428: PetscLogGpuTimeBegin();
4429: {
4430: int bw = (2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
4431: int gpuid;
4432: cudaDeviceProp prop;
4433: cudaGetDevice(&gpuid);
4434: cudaGetDeviceProperties(&prop, gpuid);
4435: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
4436: Ni = 1; /* no cooperative kernel launch below CUDA 11: use a single thread block per field */
4438: #else
4439: nsm = prop.multiProcessorCount;
4440: Ni = nsm/Nf/nconcurrent;
4441: #endif
4442: team_size = bw/Ni + !!(bw%Ni);
4443: nVec = PetscMin(bw, 1024/team_size);
4444: PetscInfo5(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);
4445: {
4446: dim3 dimBlockTeam(nVec,team_size);
4447: dim3 dimBlockLeague(Nf,Ni);
4448: mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
4449: CHECK_LAUNCH_ERROR(); // does a sync
4450: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4451: void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t};
4452: cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL);
4453: #else
4454: mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t);
4455: #endif
4456: CHECK_LAUNCH_ERROR(); // does a sync
4457: #if defined(PETSC_USE_LOG)
4458: PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-bw)*bw*bw + nl*(nl+1)/2));
4459: #endif
4460: }
4461: }
4462: PetscLogGpuTimeEnd();
4464: /* determine which version of MatSolve needs to be used (adapted from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE) */
4465: B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
4466: B->ops->solvetranspose = NULL; // need transpose
4467: B->ops->matsolve = NULL;
4468: B->ops->matsolvetranspose = NULL;
4470: return(0);
4471: }
4473: static PetscErrorCode MatrixNfDestroy(void *ptr)
4474: {
4475: PetscInt *nf = (PetscInt *)ptr;
4476: PetscErrorCode ierr;
4478: PetscFree(nf);
4479: return(0);
4480: }
4482: PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4483: {
4484: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b;
4485: IS isicol;
4486: PetscErrorCode ierr;
4487: cudaError_t cerr;
4488: const PetscInt *ic,*ai=a->i,*aj=a->j;
4489: PetscScalar *ba_t;
4490: int *bi_t;
4491: PetscInt i,n=A->rmap->n,Nf;
4492: PetscInt nzBcsr,bwL,bwU;
4493: PetscBool missing;
4494: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4495: PetscContainer container;
4498: if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
4499: MatMissingDiagonal(A,&missing,&i);
4500: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
4501: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
4502: MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);
4503: if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices supported");
4505: // factor: get Nf if available
4506: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4507: if (container) {
4508: PetscInt *pNf=NULL;
4509: PetscContainerGetPointer(container, (void **) &pNf);
4510: Nf = (*pNf)%1000;
4511: PetscContainerCreate(PETSC_COMM_SELF, &container);
4512: PetscMalloc(sizeof(PetscInt), &pNf);
4513: *pNf = Nf;
4514: PetscContainerSetPointer(container, (void *)pNf);
4515: PetscContainerSetUserDestroy(container, MatrixNfDestroy);
4516: PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);
4517: PetscContainerDestroy(&container);
4518: } else Nf = 1;
4519: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4521: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
4522: ISGetIndices(isicol,&ic);
4524: MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);
4525: PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);
4526: b = (Mat_SeqAIJ*)(B)->data;
4528: /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
4529: bwL = bwU = 0;
4530: for (int rwb=0; rwb<n; rwb++) {
4531: const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
4532: for (int j=0;j<anz;j++) {
4533: PetscInt colb = ic[ajtmp[j]];
4534: if (colb<rwa) { // L
4535: if (rwa-colb > bwL) bwL = rwa-colb;
4536: } else {
4537: if (colb-rwa > bwU) bwU = colb-rwa;
4538: }
4539: }
4540: }
4541: ISRestoreIndices(isicol,&ic);
4542: /* only structurally symmetric matrices are supported for now, though others might work */
4543: if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
4544: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
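/* band nonzero count: each row stores 1 + min(r,bwU) + min(n-1-r,bwU) entries, which sums
   to n + (2*n-1)*bwU - bwU*bwU */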
4545: nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
4546: b->maxnz = b->nz = nzBcsr;
4547: cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
4548: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
4549: cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a place for flops
4550: cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
4551: cusparseTriFactors->a_band_d = ba_t;
4552: cusparseTriFactors->i_band_d = bi_t;
4553: /* In b structure: Free imax, ilen, old a, old j. Allocate solve_work, new a, new j */
4554: PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));
4555: {
4556: dim3 dimBlockTeam(1,128);
4557: dim3 dimBlockLeague(Nf,1);
4558: mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
4559: }
4560: CHECK_LAUNCH_ERROR(); // does a sync
4562: // setup data
4563: if (!cusparseTriFactors->rpermIndices) {
4564: const PetscInt *r;
4566: ISGetIndices(isrow,&r);
4567: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
4568: cusparseTriFactors->rpermIndices->assign(r, r+n);
4569: ISRestoreIndices(isrow,&r);
4570: PetscLogCpuToGpu(n*sizeof(PetscInt));
4571: }
4572: /* upper triangular indices */
4573: if (!cusparseTriFactors->cpermIndices) {
4574: const PetscInt *c;
4576: ISGetIndices(isicol,&c);
4577: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
4578: cusparseTriFactors->cpermIndices->assign(c, c+n);
4579: ISRestoreIndices(isicol,&c);
4580: PetscLogCpuToGpu(n*sizeof(PetscInt));
4581: }
4583: /* put together the new matrix */
4584: b->free_a = PETSC_FALSE;
4585: b->free_ij = PETSC_FALSE;
4586: b->singlemalloc = PETSC_FALSE;
4587: b->ilen = NULL;
4588: b->imax = NULL;
4589: b->row = isrow;
4590: b->col = iscol;
4591: PetscObjectReference((PetscObject)isrow);
4592: PetscObjectReference((PetscObject)iscol);
4593: b->icol = isicol;
4594: PetscMalloc1(n+1,&b->solve_work);
4596: B->factortype = MAT_FACTOR_LU;
4597: B->info.factor_mallocs = 0;
4598: B->info.fill_ratio_given = 0;
4600: if (ai[n]) {
4601: B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
4602: } else {
4603: B->info.fill_ratio_needed = 0.0;
4604: }
4605: #if defined(PETSC_USE_INFO)
4606: if (ai[n] != 0) {
4607: PetscReal af = B->info.fill_ratio_needed;
4608: PetscInfo1(A,"Band fill ratio %g\n",(double)af);
4609: } else {
4610: PetscInfo(A,"Empty matrix\n");
4611: }
4612: #endif
4613: if (a->inode.size) {
4614: PetscInfo(A,"Warning: using inodes in band solver.\n");
4615: }
4616: MatSeqAIJCheckInode_FactorLU(B);
4617: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
4618: B->offloadmask = PETSC_OFFLOAD_GPU;
4620: return(0);
4621: }
4623: /* Use -pc_factor_mat_solver_type cusparseband */
4624: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
4625: {
4627: *type = MATSOLVERCUSPARSEBAND;
4628: return(0);
4629: }
4631: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
4632: {
4634: PetscInt n = A->rmap->n;
4637: MatCreate(PetscObjectComm((PetscObject)A),B);
4638: MatSetSizes(*B,n,n,n,n);
4639: (*B)->factortype = ftype;
4640: (*B)->useordering = PETSC_TRUE;
4641: MatSetType(*B,MATSEQAIJCUSPARSE);
4643: if (ftype == MAT_FACTOR_LU) {
4644: MatSetBlockSizesFromMats(*B,A,A);
4645: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
4646: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
4647: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");
4649: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
4650: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);
4651: return(0);
4652: }
4654: #define WARP_SIZE 32
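// Warp-level reduction: sums a value across the 32 lanes of a warp with __shfl_down_sync;
// breduce below combines the per-warp partial sums through shared memory into a block-wide
// sum returned in thread 0, which the band solve kernel uses for its row dot products.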
4655: template <typename T>
4656: __forceinline__ __device__
4657: T wreduce(T a)
4658: {
4659: T b;
4660: #pragma unroll
4661: for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
4662: b = __shfl_down_sync(0xffffffff, a, i);
4663: a += b;
4664: }
4665: return a;
4666: }
4667: // reduce in a block, returns result in thread 0
4668: template <typename T, int BLOCK_SIZE>
4669: __device__
4670: T breduce(T a)
4671: {
4672: constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
4673: __shared__ double buf[NWARP];
4674: int wid = threadIdx.x / WARP_SIZE;
4675: int laneid = threadIdx.x % WARP_SIZE;
4676: T b = wreduce<T>(a);
4677: if (laneid == 0)
4678: buf[wid] = b;
4679: __syncthreads();
4680: if (wid == 0) {
4681: if (threadIdx.x < NWARP)
4682: a = buf[threadIdx.x];
4683: else
4684: a = 0;
4685: for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
4686: a += __shfl_down_sync(0xffffffff, a, i);
4687: }
4688: }
4689: return a;
4690: }
4693: // Band triangular solve kernel (forward/back substitution on the band factor) --- ba_csr, x
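// One thread block per field: forward substitution L y = (row-permuted) b followed by back
// substitution U x = y, marching along the band and reducing each row's dot product over the
// block with breduce; only thread 0 updates x[glbDD].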
4694: template <int BLOCK_SIZE>
4695: __global__
4696: void __launch_bounds__(256,1)
4697: mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
4698: {
4699: const PetscInt Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
4700: const PetscScalar *pLi;
4701: const int tid = threadIdx.x;
4703: /* Next, solve L */
4704: pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
4705: for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
4706: const PetscInt col = locDD<bw ? start : (glbDD-bw);
4707: PetscScalar t = 0;
4708: for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
4709: t += pLi[idx]*x[j];
4710: }
4711: #if defined(PETSC_USE_COMPLEX)
4712: PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4713: PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4714: t = tt;
4715: #else
4716: t = breduce<PetscReal,BLOCK_SIZE>(t);
4717: #endif
4718: if (threadIdx.x == 0)
4719: x[glbDD] -= t; // /1.0
4720: __syncthreads();
4721: // inc
4722: pLi += glbDD-col; // get to diagonal
4723: if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
4724: else pLi += bw;
4725: pLi += 1; // skip to next row
4726: if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
4727: }
4728: /* Then, solve U */
4729: pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
4730: if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
4731: for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
4732: const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
4733: PetscScalar t = 0;
4734: for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
4735: t += pLi[-idx]*x[j];
4736: }
4737: #if defined(PETSC_USE_COMPLEX)
4738: PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
4739: PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
4740: t = tt;
4741: #else
4742: t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
4743: #endif
4744: pLi -= col-glbDD; // diagonal
4745: if (threadIdx.x == 0) {
4746: x[glbDD] -= t;
4747: x[glbDD] /= pLi[0];
4748: }
4749: __syncthreads();
4750: // inc past L to start of previous U
4751: pLi -= bw+1;
4752: if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
4753: if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
4754: }
4755: }
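/* Band solve: apply the row permutation to b into the work vector, run mat_solve_band with
   one thread block per field, then scatter the result through the column permutation into x. */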
4757: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
4758: {
4759: const PetscScalar *barray;
4760: PetscScalar *xarray;
4761: thrust::device_ptr<const PetscScalar> bGPU;
4762: thrust::device_ptr<PetscScalar> xGPU;
4763: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4764: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
4765: PetscInt n=A->rmap->n, nz=cusparseTriFactors->nnz, bw=(2*n-1 - (int)(PetscSqrtReal(1+4*(n*n-nz))+PETSC_MACHINE_EPSILON))/2, Nf;
4766: PetscErrorCode ierr;
4767: cudaError_t cerr;
4768: PetscContainer container;
4771: if (A->rmap->n == 0) {
4772: return(0);
4773: }
4774: // factor: get Nf if available
4775: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
4776: if (container) {
4777: PetscInt *pNf=NULL;
4778: PetscContainerGetPointer(container, (void **) &pNf);
4779: Nf = (*pNf)%1000;
4780: } else Nf = 1;
4781: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4783: /* Get the GPU pointers */
4784: VecCUDAGetArrayWrite(xx,&xarray);
4785: VecCUDAGetArrayRead(bb,&barray);
4786: xGPU = thrust::device_pointer_cast(xarray);
4787: bGPU = thrust::device_pointer_cast(barray);
4789: PetscLogGpuTimeBegin();
4790: /* First, reorder with the row permutation */
4791: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
4792: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
4793: tempGPU->begin());
4794: constexpr int block = 128;
4795: mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
4796: CHECK_LAUNCH_ERROR(); // does a sync
4798: /* Last, reorder with the column permutation */
4799: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
4800: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
4801: xGPU);
4803: VecCUDARestoreArrayRead(bb,&barray);
4804: VecCUDARestoreArrayWrite(xx,&xarray);
4805: cerr = WaitForCUDA();CHKERRCUDA(cerr);
4806: PetscLogGpuTimeEnd();
4807: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
4808: return(0);
4809: }