Actual source code: baij2.c
petsc-3.15.0 2021-03-30
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/dense/seq/dense.h>
4: #include <petsc/private/kernels/blockinvert.h>
5: #include <petscbt.h>
6: #include <petscblaslapack.h>
8: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9: #include <immintrin.h>
10: #endif
12: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13: {
14: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
16: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
17: const PetscInt *idx;
18: PetscInt start,end,*ai,*aj,bs,*nidx2;
19: PetscBT table;
22: m = a->mbs;
23: ai = a->i;
24: aj = a->j;
25: bs = A->rmap->bs;
27: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29: PetscBTCreate(m,&table);
30: PetscMalloc1(m+1,&nidx);
31: PetscMalloc1(A->rmap->N+1,&nidx2);
33: for (i=0; i<is_max; i++) {
34: /* Initialise the two local arrays */
35: isz = 0;
36: PetscBTMemzero(m,table);
38: /* Extract the indices, assume there can be duplicate entries */
39: ISGetIndices(is[i],&idx);
40: ISGetLocalSize(is[i],&n);
42: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43: for (j=0; j<n; ++j) {
44: ival = idx[j]/bs; /* convert the indices into block indices */
45: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47: }
48: ISRestoreIndices(is[i],&idx);
49: ISDestroy(&is[i]);
51: k = 0;
52: for (j=0; j<ov; j++) { /* for each overlap*/
53: n = isz;
54: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
55: row = nidx[k];
56: start = ai[row];
57: end = ai[row+1];
58: for (l = start; l<end; l++) {
59: val = aj[l];
60: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61: }
62: }
63: }
64: /* expand the Index Set */
65: for (j=0; j<isz; j++) {
66: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67: }
68: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
69: }
70: PetscBTDestroy(&table);
71: PetscFree(nidx);
72: PetscFree(nidx2);
73: return(0);
74: }
76: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77: {
78: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
80: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
82: const PetscInt *irow,*icol;
83: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84: PetscInt *aj = a->j,*ai = a->i;
85: MatScalar *mat_a;
86: Mat C;
87: PetscBool flag;
90: ISGetIndices(isrow,&irow);
91: ISGetIndices(iscol,&icol);
92: ISGetLocalSize(isrow,&nrows);
93: ISGetLocalSize(iscol,&ncols);
95: PetscCalloc1(1+oldcols,&smap);
96: ssmap = smap;
97: PetscMalloc1(1+nrows,&lens);
98: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99: /* determine lens of each row */
100: for (i=0; i<nrows; i++) {
101: kstart = ai[irow[i]];
102: kend = kstart + a->ilen[irow[i]];
103: lens[i] = 0;
104: for (k=kstart; k<kend; k++) {
105: if (ssmap[aj[k]]) lens[i]++;
106: }
107: }
108: /* Create and fill new matrix */
109: if (scall == MAT_REUSE_MATRIX) {
110: c = (Mat_SeqBAIJ*)((*B)->data);
112: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113: PetscArraycmp(c->ilen,lens,c->mbs,&flag);
114: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115: PetscArrayzero(c->ilen,c->mbs);
116: C = *B;
117: } else {
118: MatCreate(PetscObjectComm((PetscObject)A),&C);
119: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
120: MatSetType(C,((PetscObject)A)->type_name);
121: MatSeqBAIJSetPreallocation(C,bs,0,lens);
122: }
123: c = (Mat_SeqBAIJ*)(C->data);
124: for (i=0; i<nrows; i++) {
125: row = irow[i];
126: kstart = ai[row];
127: kend = kstart + a->ilen[row];
128: mat_i = c->i[i];
129: mat_j = c->j + mat_i;
130: mat_a = c->a + mat_i*bs2;
131: mat_ilen = c->ilen + i;
132: for (k=kstart; k<kend; k++) {
133: if ((tcol=ssmap[a->j[k]])) {
134: *mat_j++ = tcol - 1;
135: PetscArraycpy(mat_a,a->a+k*bs2,bs2);
136: mat_a += bs2;
137: (*mat_ilen)++;
138: }
139: }
140: }
141: /* sort */
142: {
143: MatScalar *work;
144: PetscMalloc1(bs2,&work);
145: for (i=0; i<nrows; i++) {
146: PetscInt ilen;
147: mat_i = c->i[i];
148: mat_j = c->j + mat_i;
149: mat_a = c->a + mat_i*bs2;
150: ilen = c->ilen[i];
151: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
152: }
153: PetscFree(work);
154: }
156: /* Free work space */
157: ISRestoreIndices(iscol,&icol);
158: PetscFree(smap);
159: PetscFree(lens);
160: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
161: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
163: ISRestoreIndices(isrow,&irow);
164: *B = C;
165: return(0);
166: }
168: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169: {
170: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
171: IS is1,is2;
173: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174: const PetscInt *irow,*icol;
177: ISGetIndices(isrow,&irow);
178: ISGetIndices(iscol,&icol);
179: ISGetLocalSize(isrow,&nrows);
180: ISGetLocalSize(iscol,&ncols);
182: /* Verify if the indices corespond to each element in a block
183: and form the IS with compressed IS */
184: maxmnbs = PetscMax(a->mbs,a->nbs);
185: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
186: PetscArrayzero(vary,a->mbs);
187: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188: for (i=0; i<a->mbs; i++) {
189: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190: }
191: count = 0;
192: for (i=0; i<nrows; i++) {
193: j = irow[i] / bs;
194: if ((vary[j]--)==bs) iary[count++] = j;
195: }
196: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
198: PetscArrayzero(vary,a->nbs);
199: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200: for (i=0; i<a->nbs; i++) {
201: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202: }
203: count = 0;
204: for (i=0; i<ncols; i++) {
205: j = icol[i] / bs;
206: if ((vary[j]--)==bs) iary[count++] = j;
207: }
208: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
209: ISRestoreIndices(isrow,&irow);
210: ISRestoreIndices(iscol,&icol);
211: PetscFree2(vary,iary);
213: MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
214: ISDestroy(&is1);
215: ISDestroy(&is2);
216: return(0);
217: }
219: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220: {
222: Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data;
223: Mat_SubSppt *submatj = c->submatis1;
226: (*submatj->destroy)(C);
227: MatDestroySubMatrix_Private(submatj);
228: return(0);
229: }
231: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232: {
234: PetscInt i;
235: Mat C;
236: Mat_SeqBAIJ *c;
237: Mat_SubSppt *submatj;
240: for (i=0; i<n; i++) {
241: C = (*mat)[i];
242: c = (Mat_SeqBAIJ*)C->data;
243: submatj = c->submatis1;
244: if (submatj) {
245: (*submatj->destroy)(C);
246: MatDestroySubMatrix_Private(submatj);
247: PetscFree(C->defaultvectype);
248: PetscLayoutDestroy(&C->rmap);
249: PetscLayoutDestroy(&C->cmap);
250: PetscHeaderDestroy(&C);
251: } else {
252: MatDestroy(&C);
253: }
254: }
255: PetscFree(*mat);
256: return(0);
257: }
259: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260: {
262: PetscInt i;
265: if (scall == MAT_INITIAL_MATRIX) {
266: PetscCalloc1(n+1,B);
267: }
269: for (i=0; i<n; i++) {
270: MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
271: }
272: return(0);
273: }
275: /* -------------------------------------------------------*/
276: /* Should check that shapes of vectors and matrices match */
277: /* -------------------------------------------------------*/
279: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
280: {
281: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
282: PetscScalar *z,sum;
283: const PetscScalar *x;
284: const MatScalar *v;
285: PetscErrorCode ierr;
286: PetscInt mbs,i,n;
287: const PetscInt *idx,*ii,*ridx=NULL;
288: PetscBool usecprow=a->compressedrow.use;
291: VecGetArrayRead(xx,&x);
292: VecGetArrayWrite(zz,&z);
294: if (usecprow) {
295: mbs = a->compressedrow.nrows;
296: ii = a->compressedrow.i;
297: ridx = a->compressedrow.rindex;
298: PetscArrayzero(z,a->mbs);
299: } else {
300: mbs = a->mbs;
301: ii = a->i;
302: }
304: for (i=0; i<mbs; i++) {
305: n = ii[1] - ii[0];
306: v = a->a + ii[0];
307: idx = a->j + ii[0];
308: ii++;
309: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
310: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
311: sum = 0.0;
312: PetscSparseDensePlusDot(sum,x,v,idx,n);
313: if (usecprow) {
314: z[ridx[i]] = sum;
315: } else {
316: z[i] = sum;
317: }
318: }
319: VecRestoreArrayRead(xx,&x);
320: VecRestoreArrayWrite(zz,&z);
321: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
322: return(0);
323: }
325: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
326: {
327: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
328: PetscScalar *z = NULL,sum1,sum2,*zarray;
329: const PetscScalar *x,*xb;
330: PetscScalar x1,x2;
331: const MatScalar *v;
332: PetscErrorCode ierr;
333: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
334: PetscBool usecprow=a->compressedrow.use;
337: VecGetArrayRead(xx,&x);
338: VecGetArrayWrite(zz,&zarray);
340: idx = a->j;
341: v = a->a;
342: if (usecprow) {
343: mbs = a->compressedrow.nrows;
344: ii = a->compressedrow.i;
345: ridx = a->compressedrow.rindex;
346: PetscArrayzero(zarray,2*a->mbs);
347: } else {
348: mbs = a->mbs;
349: ii = a->i;
350: z = zarray;
351: }
353: for (i=0; i<mbs; i++) {
354: n = ii[1] - ii[0]; ii++;
355: sum1 = 0.0; sum2 = 0.0;
356: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
357: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358: for (j=0; j<n; j++) {
359: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
360: sum1 += v[0]*x1 + v[2]*x2;
361: sum2 += v[1]*x1 + v[3]*x2;
362: v += 4;
363: }
364: if (usecprow) z = zarray + 2*ridx[i];
365: z[0] = sum1; z[1] = sum2;
366: if (!usecprow) z += 2;
367: }
368: VecRestoreArrayRead(xx,&x);
369: VecRestoreArrayWrite(zz,&zarray);
370: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
371: return(0);
372: }
374: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
375: {
376: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
377: PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
378: const PetscScalar *x,*xb;
379: const MatScalar *v;
380: PetscErrorCode ierr;
381: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
382: PetscBool usecprow=a->compressedrow.use;
384: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
385: #pragma disjoint(*v,*z,*xb)
386: #endif
389: VecGetArrayRead(xx,&x);
390: VecGetArrayWrite(zz,&zarray);
392: idx = a->j;
393: v = a->a;
394: if (usecprow) {
395: mbs = a->compressedrow.nrows;
396: ii = a->compressedrow.i;
397: ridx = a->compressedrow.rindex;
398: PetscArrayzero(zarray,3*a->mbs);
399: } else {
400: mbs = a->mbs;
401: ii = a->i;
402: z = zarray;
403: }
405: for (i=0; i<mbs; i++) {
406: n = ii[1] - ii[0]; ii++;
407: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
408: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
409: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410: for (j=0; j<n; j++) {
411: xb = x + 3*(*idx++);
412: x1 = xb[0];
413: x2 = xb[1];
414: x3 = xb[2];
416: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
417: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
418: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
419: v += 9;
420: }
421: if (usecprow) z = zarray + 3*ridx[i];
422: z[0] = sum1; z[1] = sum2; z[2] = sum3;
423: if (!usecprow) z += 3;
424: }
425: VecRestoreArrayRead(xx,&x);
426: VecRestoreArrayWrite(zz,&zarray);
427: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
428: return(0);
429: }
431: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
432: {
433: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
434: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
435: const PetscScalar *x,*xb;
436: const MatScalar *v;
437: PetscErrorCode ierr;
438: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
439: PetscBool usecprow=a->compressedrow.use;
442: VecGetArrayRead(xx,&x);
443: VecGetArrayWrite(zz,&zarray);
445: idx = a->j;
446: v = a->a;
447: if (usecprow) {
448: mbs = a->compressedrow.nrows;
449: ii = a->compressedrow.i;
450: ridx = a->compressedrow.rindex;
451: PetscArrayzero(zarray,4*a->mbs);
452: } else {
453: mbs = a->mbs;
454: ii = a->i;
455: z = zarray;
456: }
458: for (i=0; i<mbs; i++) {
459: n = ii[1] - ii[0];
460: ii++;
461: sum1 = 0.0;
462: sum2 = 0.0;
463: sum3 = 0.0;
464: sum4 = 0.0;
466: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
467: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
468: for (j=0; j<n; j++) {
469: xb = x + 4*(*idx++);
470: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
471: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
472: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
473: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
474: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
475: v += 16;
476: }
477: if (usecprow) z = zarray + 4*ridx[i];
478: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
479: if (!usecprow) z += 4;
480: }
481: VecRestoreArrayRead(xx,&x);
482: VecRestoreArrayWrite(zz,&zarray);
483: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
484: return(0);
485: }
487: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
488: {
489: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
490: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
491: const PetscScalar *xb,*x;
492: const MatScalar *v;
493: PetscErrorCode ierr;
494: const PetscInt *idx,*ii,*ridx=NULL;
495: PetscInt mbs,i,j,n;
496: PetscBool usecprow=a->compressedrow.use;
499: VecGetArrayRead(xx,&x);
500: VecGetArrayWrite(zz,&zarray);
502: idx = a->j;
503: v = a->a;
504: if (usecprow) {
505: mbs = a->compressedrow.nrows;
506: ii = a->compressedrow.i;
507: ridx = a->compressedrow.rindex;
508: PetscArrayzero(zarray,5*a->mbs);
509: } else {
510: mbs = a->mbs;
511: ii = a->i;
512: z = zarray;
513: }
515: for (i=0; i<mbs; i++) {
516: n = ii[1] - ii[0]; ii++;
517: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
518: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
519: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
520: for (j=0; j<n; j++) {
521: xb = x + 5*(*idx++);
522: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
523: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
524: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
525: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
526: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
527: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
528: v += 25;
529: }
530: if (usecprow) z = zarray + 5*ridx[i];
531: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
532: if (!usecprow) z += 5;
533: }
534: VecRestoreArrayRead(xx,&x);
535: VecRestoreArrayWrite(zz,&zarray);
536: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
537: return(0);
538: }
540: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
541: {
542: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
543: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
544: const PetscScalar *x,*xb;
545: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
546: const MatScalar *v;
547: PetscErrorCode ierr;
548: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
549: PetscBool usecprow=a->compressedrow.use;
552: VecGetArrayRead(xx,&x);
553: VecGetArrayWrite(zz,&zarray);
555: idx = a->j;
556: v = a->a;
557: if (usecprow) {
558: mbs = a->compressedrow.nrows;
559: ii = a->compressedrow.i;
560: ridx = a->compressedrow.rindex;
561: PetscArrayzero(zarray,6*a->mbs);
562: } else {
563: mbs = a->mbs;
564: ii = a->i;
565: z = zarray;
566: }
568: for (i=0; i<mbs; i++) {
569: n = ii[1] - ii[0];
570: ii++;
571: sum1 = 0.0;
572: sum2 = 0.0;
573: sum3 = 0.0;
574: sum4 = 0.0;
575: sum5 = 0.0;
576: sum6 = 0.0;
578: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
579: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580: for (j=0; j<n; j++) {
581: xb = x + 6*(*idx++);
582: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
583: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
584: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
585: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
586: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
587: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
588: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
589: v += 36;
590: }
591: if (usecprow) z = zarray + 6*ridx[i];
592: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
593: if (!usecprow) z += 6;
594: }
596: VecRestoreArrayRead(xx,&x);
597: VecRestoreArrayWrite(zz,&zarray);
598: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
599: return(0);
600: }
602: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
603: {
604: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
605: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
606: const PetscScalar *x,*xb;
607: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
608: const MatScalar *v;
609: PetscErrorCode ierr;
610: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
611: PetscBool usecprow=a->compressedrow.use;
614: VecGetArrayRead(xx,&x);
615: VecGetArrayWrite(zz,&zarray);
617: idx = a->j;
618: v = a->a;
619: if (usecprow) {
620: mbs = a->compressedrow.nrows;
621: ii = a->compressedrow.i;
622: ridx = a->compressedrow.rindex;
623: PetscArrayzero(zarray,7*a->mbs);
624: } else {
625: mbs = a->mbs;
626: ii = a->i;
627: z = zarray;
628: }
630: for (i=0; i<mbs; i++) {
631: n = ii[1] - ii[0];
632: ii++;
633: sum1 = 0.0;
634: sum2 = 0.0;
635: sum3 = 0.0;
636: sum4 = 0.0;
637: sum5 = 0.0;
638: sum6 = 0.0;
639: sum7 = 0.0;
641: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
642: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
643: for (j=0; j<n; j++) {
644: xb = x + 7*(*idx++);
645: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
646: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
647: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
648: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
649: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
650: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
651: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
652: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
653: v += 49;
654: }
655: if (usecprow) z = zarray + 7*ridx[i];
656: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
657: if (!usecprow) z += 7;
658: }
660: VecRestoreArrayRead(xx,&x);
661: VecRestoreArrayWrite(zz,&zarray);
662: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
663: return(0);
664: }
666: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
667: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
668: {
669: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
670: PetscScalar *z = NULL,*work,*workt,*zarray;
671: const PetscScalar *x,*xb;
672: const MatScalar *v;
673: PetscErrorCode ierr;
674: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
675: const PetscInt *idx,*ii,*ridx=NULL;
676: PetscInt k;
677: PetscBool usecprow=a->compressedrow.use;
679: __m256d a0,a1,a2,a3,a4,a5;
680: __m256d w0,w1,w2,w3;
681: __m256d z0,z1,z2;
682: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
685: VecGetArrayRead(xx,&x);
686: VecGetArrayWrite(zz,&zarray);
688: idx = a->j;
689: v = a->a;
690: if (usecprow) {
691: mbs = a->compressedrow.nrows;
692: ii = a->compressedrow.i;
693: ridx = a->compressedrow.rindex;
694: PetscArrayzero(zarray,bs*a->mbs);
695: } else {
696: mbs = a->mbs;
697: ii = a->i;
698: z = zarray;
699: }
701: if (!a->mult_work) {
702: k = PetscMax(A->rmap->n,A->cmap->n);
703: PetscMalloc1(k+1,&a->mult_work);
704: }
706: work = a->mult_work;
707: for (i=0; i<mbs; i++) {
708: n = ii[1] - ii[0]; ii++;
709: workt = work;
710: for (j=0; j<n; j++) {
711: xb = x + bs*(*idx++);
712: for (k=0; k<bs; k++) workt[k] = xb[k];
713: workt += bs;
714: }
715: if (usecprow) z = zarray + bs*ridx[i];
717: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
719: for (j=0; j<n; j++) {
720: /* first column of a */
721: w0 = _mm256_set1_pd(work[j*9 ]);
722: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
723: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
724: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
726: /* second column of a */
727: w1 = _mm256_set1_pd(work[j*9+ 1]);
728: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
729: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
730: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
732: /* third column of a */
733: w2 = _mm256_set1_pd(work[j*9 +2]);
734: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
735: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
736: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
738: /* fourth column of a */
739: w3 = _mm256_set1_pd(work[j*9+ 3]);
740: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
741: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
742: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
744: /* fifth column of a */
745: w0 = _mm256_set1_pd(work[j*9+ 4]);
746: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
747: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
748: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
750: /* sixth column of a */
751: w1 = _mm256_set1_pd(work[j*9+ 5]);
752: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
753: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
754: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
756: /* seventh column of a */
757: w2 = _mm256_set1_pd(work[j*9+ 6]);
758: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
759: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
760: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
762: /* eigth column of a */
763: w3 = _mm256_set1_pd(work[j*9+ 7]);
764: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
765: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
766: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
768: /* ninth column of a */
769: w0 = _mm256_set1_pd(work[j*9+ 8]);
770: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
771: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
772: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
773: }
775: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
777: v += n*bs2;
778: if (!usecprow) z += bs;
779: }
780: VecRestoreArrayRead(xx,&x);
781: VecRestoreArrayWrite(zz,&zarray);
782: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
783: return(0);
784: }
785: #endif
787: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
788: {
789: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
790: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
791: const PetscScalar *x,*xb;
792: PetscScalar *zarray,xv;
793: const MatScalar *v;
794: PetscErrorCode ierr;
795: const PetscInt *ii,*ij=a->j,*idx;
796: PetscInt mbs,i,j,k,n,*ridx=NULL;
797: PetscBool usecprow=a->compressedrow.use;
800: VecGetArrayRead(xx,&x);
801: VecGetArrayWrite(zz,&zarray);
803: v = a->a;
804: if (usecprow) {
805: mbs = a->compressedrow.nrows;
806: ii = a->compressedrow.i;
807: ridx = a->compressedrow.rindex;
808: PetscArrayzero(zarray,11*a->mbs);
809: } else {
810: mbs = a->mbs;
811: ii = a->i;
812: z = zarray;
813: }
815: for (i=0; i<mbs; i++) {
816: n = ii[i+1] - ii[i];
817: idx = ij + ii[i];
818: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
819: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
821: for (j=0; j<n; j++) {
822: xb = x + 11*(idx[j]);
824: for (k=0; k<11; k++) {
825: xv = xb[k];
826: sum1 += v[0]*xv;
827: sum2 += v[1]*xv;
828: sum3 += v[2]*xv;
829: sum4 += v[3]*xv;
830: sum5 += v[4]*xv;
831: sum6 += v[5]*xv;
832: sum7 += v[6]*xv;
833: sum8 += v[7]*xv;
834: sum9 += v[8]*xv;
835: sum10 += v[9]*xv;
836: sum11 += v[10]*xv;
837: v += 11;
838: }
839: }
840: if (usecprow) z = zarray + 11*ridx[i];
841: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
842: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
844: if (!usecprow) z += 11;
845: }
847: VecRestoreArrayRead(xx,&x);
848: VecRestoreArrayWrite(zz,&zarray);
849: PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
850: return(0);
851: }
853: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
854: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
855: {
856: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
857: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
858: const PetscScalar *x,*xb;
859: PetscScalar *zarray,xv;
860: const MatScalar *v;
861: PetscErrorCode ierr;
862: const PetscInt *ii,*ij=a->j,*idx;
863: PetscInt mbs,i,j,k,n,*ridx=NULL;
864: PetscBool usecprow=a->compressedrow.use;
867: VecGetArrayRead(xx,&x);
868: VecGetArrayWrite(zz,&zarray);
870: v = a->a;
871: if (usecprow) {
872: mbs = a->compressedrow.nrows;
873: ii = a->compressedrow.i;
874: ridx = a->compressedrow.rindex;
875: PetscArrayzero(zarray,12*a->mbs);
876: } else {
877: mbs = a->mbs;
878: ii = a->i;
879: z = zarray;
880: }
882: for (i=0; i<mbs; i++) {
883: n = ii[i+1] - ii[i];
884: idx = ij + ii[i];
885: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
886: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;
888: for (j=0; j<n; j++) {
889: xb = x + 12*(idx[j]);
891: for (k=0; k<12; k++) {
892: xv = xb[k];
893: sum1 += v[0]*xv;
894: sum2 += v[1]*xv;
895: sum3 += v[2]*xv;
896: sum4 += v[3]*xv;
897: sum5 += v[4]*xv;
898: sum6 += v[5]*xv;
899: sum7 += v[6]*xv;
900: sum8 += v[7]*xv;
901: sum9 += v[8]*xv;
902: sum10 += v[9]*xv;
903: sum11 += v[10]*xv;
904: sum12 += v[11]*xv;
905: v += 12;
906: }
907: }
908: if (usecprow) z = zarray + 12*ridx[i];
909: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
910: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
911: if (!usecprow) z += 12;
912: }
913: VecRestoreArrayRead(xx,&x);
914: VecRestoreArrayWrite(zz,&zarray);
915: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
916: return(0);
917: }
919: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
920: {
921: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
922: PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
923: const PetscScalar *x,*xb;
924: PetscScalar *zarray,*yarray,xv;
925: const MatScalar *v;
926: PetscErrorCode ierr;
927: const PetscInt *ii,*ij=a->j,*idx;
928: PetscInt mbs = a->mbs,i,j,k,n,*ridx=NULL;
929: PetscBool usecprow=a->compressedrow.use;
932: VecGetArrayRead(xx,&x);
933: VecGetArrayPair(yy,zz,&yarray,&zarray);
935: v = a->a;
936: if (usecprow) {
937: if (zz != yy) {
938: PetscArraycpy(zarray,yarray,12*mbs);
939: }
940: mbs = a->compressedrow.nrows;
941: ii = a->compressedrow.i;
942: ridx = a->compressedrow.rindex;
943: } else {
944: ii = a->i;
945: y = yarray;
946: z = zarray;
947: }
949: for (i=0; i<mbs; i++) {
950: n = ii[i+1] - ii[i];
951: idx = ij + ii[i];
953: if (usecprow){
954: y = yarray + 12*ridx[i];
955: z = zarray + 12*ridx[i];
956: }
957: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
958: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
960: for (j=0; j<n; j++) {
961: xb = x + 12*(idx[j]);
963: for (k=0; k<12; k++) {
964: xv = xb[k];
965: sum1 += v[0]*xv;
966: sum2 += v[1]*xv;
967: sum3 += v[2]*xv;
968: sum4 += v[3]*xv;
969: sum5 += v[4]*xv;
970: sum6 += v[5]*xv;
971: sum7 += v[6]*xv;
972: sum8 += v[7]*xv;
973: sum9 += v[8]*xv;
974: sum10 += v[9]*xv;
975: sum11 += v[10]*xv;
976: sum12 += v[11]*xv;
977: v += 12;
978: }
979: }
981: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
982: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
983: if (!usecprow) {
984: y += 12;
985: z += 12;
986: }
987: }
988: VecRestoreArrayRead(xx,&x);
989: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
990: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
991: return(0);
992: }
994: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
995: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
996: {
997: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
998: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
999: const PetscScalar *x,*xb;
1000: PetscScalar x1,x2,x3,x4,*zarray;
1001: const MatScalar *v;
1002: PetscErrorCode ierr;
1003: const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL;
1004: PetscInt mbs,i,j,n;
1005: PetscBool usecprow=a->compressedrow.use;
1008: VecGetArrayRead(xx,&x);
1009: VecGetArrayWrite(zz,&zarray);
1011: v = a->a;
1012: if (usecprow) {
1013: mbs = a->compressedrow.nrows;
1014: ii = a->compressedrow.i;
1015: ridx = a->compressedrow.rindex;
1016: PetscArrayzero(zarray,12*a->mbs);
1017: } else {
1018: mbs = a->mbs;
1019: ii = a->i;
1020: z = zarray;
1021: }
1023: for (i=0; i<mbs; i++) {
1024: n = ii[i+1] - ii[i];
1025: idx = ij + ii[i];
1027: sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1028: for (j=0; j<n; j++) {
1029: xb = x + 12*(idx[j]);
1030: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1032: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1033: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1034: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1035: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1036: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1037: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1038: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1039: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1040: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1041: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1042: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1043: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1044: v += 48;
1046: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1048: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1049: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1050: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1051: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1052: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1053: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1054: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1055: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1056: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1057: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1058: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1059: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1060: v += 48;
1062: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1063: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1064: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1065: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1066: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1067: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1068: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1069: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1070: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1071: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1072: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1073: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1074: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1075: v += 48;
1077: }
1078: if (usecprow) z = zarray + 12*ridx[i];
1079: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1080: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1081: if (!usecprow) z += 12;
1082: }
1083: VecRestoreArrayRead(xx,&x);
1084: VecRestoreArrayWrite(zz,&zarray);
1085: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1086: return(0);
1087: }
1089: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1090: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1091: {
1092: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1093: PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1094: const PetscScalar *x,*xb;
1095: PetscScalar x1,x2,x3,x4,*zarray,*yarray;
1096: const MatScalar *v;
1097: PetscErrorCode ierr;
1098: const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL;
1099: PetscInt mbs = a->mbs,i,j,n;
1100: PetscBool usecprow=a->compressedrow.use;
1103: VecGetArrayRead(xx,&x);
1104: VecGetArrayPair(yy,zz,&yarray,&zarray);
1106: v = a->a;
1107: if (usecprow) {
1108: if (zz != yy) {
1109: PetscArraycpy(zarray,yarray,12*mbs);
1110: }
1111: mbs = a->compressedrow.nrows;
1112: ii = a->compressedrow.i;
1113: ridx = a->compressedrow.rindex;
1114: } else {
1115: ii = a->i;
1116: y = yarray;
1117: z = zarray;
1118: }
1120: for (i=0; i<mbs; i++) {
1121: n = ii[i+1] - ii[i];
1122: idx = ij + ii[i];
1124: if (usecprow) {
1125: y = yarray + 12*ridx[i];
1126: z = zarray + 12*ridx[i];
1127: }
1128: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1129: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
1131: for (j=0; j<n; j++) {
1132: xb = x + 12*(idx[j]);
1133: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1135: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1136: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1137: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1138: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1139: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1140: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1141: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1142: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1143: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1144: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1145: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1146: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1147: v += 48;
1149: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1151: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1152: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1153: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1154: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1155: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1156: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1157: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1158: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1159: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1160: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1161: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1162: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1163: v += 48;
1165: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1166: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1167: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1168: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1169: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1170: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1171: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1172: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1173: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1174: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1175: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1176: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1177: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1178: v += 48;
1180: }
1181: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1182: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1183: if (!usecprow) {
1184: y += 12;
1185: z += 12;
1186: }
1187: }
1188: VecRestoreArrayRead(xx,&x);
1189: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1190: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1191: return(0);
1192: }
1194: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1195: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1196: {
1197: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1198: PetscScalar *z = NULL,*zarray;
1199: const PetscScalar *x,*work;
1200: const MatScalar *v = a->a;
1201: PetscErrorCode ierr;
1202: PetscInt mbs,i,j,n;
1203: const PetscInt *idx = a->j,*ii,*ridx=NULL;
1204: PetscBool usecprow=a->compressedrow.use;
1205: const PetscInt bs = 12, bs2 = 144;
1207: __m256d a0,a1,a2,a3,a4,a5;
1208: __m256d w0,w1,w2,w3;
1209: __m256d z0,z1,z2;
1212: VecGetArrayRead(xx,&x);
1213: VecGetArrayWrite(zz,&zarray);
1215: if (usecprow) {
1216: mbs = a->compressedrow.nrows;
1217: ii = a->compressedrow.i;
1218: ridx = a->compressedrow.rindex;
1219: PetscArrayzero(zarray,bs*a->mbs);
1220: } else {
1221: mbs = a->mbs;
1222: ii = a->i;
1223: z = zarray;
1224: }
1226: for (i=0; i<mbs; i++) {
1227: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
1229: n = ii[1] - ii[0]; ii++;
1230: for (j=0; j<n; j++) {
1231: work = x + bs*(*idx++);
1233: /* first column of a */
1234: w0 = _mm256_set1_pd(work[0]);
1235: a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1236: a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1237: a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);
1239: /* second column of a */
1240: w1 = _mm256_set1_pd(work[1]);
1241: a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1242: a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1243: a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);
1245: /* third column of a */
1246: w2 = _mm256_set1_pd(work[2]);
1247: a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1248: a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1249: a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);
1251: /* fourth column of a */
1252: w3 = _mm256_set1_pd(work[3]);
1253: a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1254: a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1255: a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);
1257: /* fifth column of a */
1258: w0 = _mm256_set1_pd(work[4]);
1259: a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1260: a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1261: a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);
1263: /* sixth column of a */
1264: w1 = _mm256_set1_pd(work[5]);
1265: a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1266: a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1267: a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);
1269: /* seventh column of a */
1270: w2 = _mm256_set1_pd(work[6]);
1271: a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1272: a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1273: a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);
1275: /* eigth column of a */
1276: w3 = _mm256_set1_pd(work[7]);
1277: a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1278: a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1279: a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);
1281: /* ninth column of a */
1282: w0 = _mm256_set1_pd(work[8]);
1283: a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1284: a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1285: a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);
1287: /* tenth column of a */
1288: w1 = _mm256_set1_pd(work[9]);
1289: a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1290: a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1291: a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);
1293: /* eleventh column of a */
1294: w2 = _mm256_set1_pd(work[10]);
1295: a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1296: a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1297: a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);
1299: /* twelveth column of a */
1300: w3 = _mm256_set1_pd(work[11]);
1301: a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1302: a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1303: a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);
1305: v += bs2;
1306: }
1307: if (usecprow) z = zarray + bs*ridx[i];
1308: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1309: if (!usecprow) z += bs;
1310: }
1311: VecRestoreArrayRead(xx,&x);
1312: VecRestoreArrayWrite(zz,&zarray);
1313: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1314: return(0);
1315: }
1316: #endif
1318: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1319: /* Default MatMult for block size 15 */
1320: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1321: {
1322: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1323: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1324: const PetscScalar *x,*xb;
1325: PetscScalar *zarray,xv;
1326: const MatScalar *v;
1327: PetscErrorCode ierr;
1328: const PetscInt *ii,*ij=a->j,*idx;
1329: PetscInt mbs,i,j,k,n,*ridx=NULL;
1330: PetscBool usecprow=a->compressedrow.use;
1333: VecGetArrayRead(xx,&x);
1334: VecGetArrayWrite(zz,&zarray);
1336: v = a->a;
1337: if (usecprow) {
1338: mbs = a->compressedrow.nrows;
1339: ii = a->compressedrow.i;
1340: ridx = a->compressedrow.rindex;
1341: PetscArrayzero(zarray,15*a->mbs);
1342: } else {
1343: mbs = a->mbs;
1344: ii = a->i;
1345: z = zarray;
1346: }
1348: for (i=0; i<mbs; i++) {
1349: n = ii[i+1] - ii[i];
1350: idx = ij + ii[i];
1351: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1352: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1354: for (j=0; j<n; j++) {
1355: xb = x + 15*(idx[j]);
1357: for (k=0; k<15; k++) {
1358: xv = xb[k];
1359: sum1 += v[0]*xv;
1360: sum2 += v[1]*xv;
1361: sum3 += v[2]*xv;
1362: sum4 += v[3]*xv;
1363: sum5 += v[4]*xv;
1364: sum6 += v[5]*xv;
1365: sum7 += v[6]*xv;
1366: sum8 += v[7]*xv;
1367: sum9 += v[8]*xv;
1368: sum10 += v[9]*xv;
1369: sum11 += v[10]*xv;
1370: sum12 += v[11]*xv;
1371: sum13 += v[12]*xv;
1372: sum14 += v[13]*xv;
1373: sum15 += v[14]*xv;
1374: v += 15;
1375: }
1376: }
1377: if (usecprow) z = zarray + 15*ridx[i];
1378: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1379: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1381: if (!usecprow) z += 15;
1382: }
1384: VecRestoreArrayRead(xx,&x);
1385: VecRestoreArrayWrite(zz,&zarray);
1386: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1387: return(0);
1388: }
1390: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1391: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1392: {
1393: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1394: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1395: const PetscScalar *x,*xb;
1396: PetscScalar x1,x2,x3,x4,*zarray;
1397: const MatScalar *v;
1398: PetscErrorCode ierr;
1399: const PetscInt *ii,*ij=a->j,*idx;
1400: PetscInt mbs,i,j,n,*ridx=NULL;
1401: PetscBool usecprow=a->compressedrow.use;
1404: VecGetArrayRead(xx,&x);
1405: VecGetArrayWrite(zz,&zarray);
1407: v = a->a;
1408: if (usecprow) {
1409: mbs = a->compressedrow.nrows;
1410: ii = a->compressedrow.i;
1411: ridx = a->compressedrow.rindex;
1412: PetscArrayzero(zarray,15*a->mbs);
1413: } else {
1414: mbs = a->mbs;
1415: ii = a->i;
1416: z = zarray;
1417: }
1419: for (i=0; i<mbs; i++) {
1420: n = ii[i+1] - ii[i];
1421: idx = ij + ii[i];
1422: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1423: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1425: for (j=0; j<n; j++) {
1426: xb = x + 15*(idx[j]);
1427: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1429: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1430: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1431: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1432: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1433: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1434: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1435: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1436: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1437: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1438: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1439: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1440: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1441: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1442: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1443: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1445: v += 60;
1447: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1449: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1450: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1451: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1452: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1453: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1454: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1455: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1456: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1457: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1458: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1459: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1460: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1461: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1462: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1463: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1464: v += 60;
1466: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1467: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1468: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1469: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1470: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1471: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1472: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1473: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1474: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1475: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1476: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1477: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1478: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1479: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1480: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1481: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1482: v += 60;
1484: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
1485: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
1486: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
1487: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
1488: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
1489: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
1490: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
1491: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
1492: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
1493: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
1494: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1495: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1496: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1497: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1498: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1499: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1500: v += 45;
1501: }
1502: if (usecprow) z = zarray + 15*ridx[i];
1503: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1504: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1506: if (!usecprow) z += 15;
1507: }
1509: VecRestoreArrayRead(xx,&x);
1510: VecRestoreArrayWrite(zz,&zarray);
1511: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1512: return(0);
1513: }
1515: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1516: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1517: {
1518: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1519: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1520: const PetscScalar *x,*xb;
1521: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1522: const MatScalar *v;
1523: PetscErrorCode ierr;
1524: const PetscInt *ii,*ij=a->j,*idx;
1525: PetscInt mbs,i,j,n,*ridx=NULL;
1526: PetscBool usecprow=a->compressedrow.use;
1529: VecGetArrayRead(xx,&x);
1530: VecGetArrayWrite(zz,&zarray);
1532: v = a->a;
1533: if (usecprow) {
1534: mbs = a->compressedrow.nrows;
1535: ii = a->compressedrow.i;
1536: ridx = a->compressedrow.rindex;
1537: PetscArrayzero(zarray,15*a->mbs);
1538: } else {
1539: mbs = a->mbs;
1540: ii = a->i;
1541: z = zarray;
1542: }
1544: for (i=0; i<mbs; i++) {
1545: n = ii[i+1] - ii[i];
1546: idx = ij + ii[i];
1547: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1548: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1550: for (j=0; j<n; j++) {
1551: xb = x + 15*(idx[j]);
1552: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1553: x8 = xb[7];
1555: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1556: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1557: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1558: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1559: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1560: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1561: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1562: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1563: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1564: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1565: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1566: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1567: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1568: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1569: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1570: v += 120;
1572: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1574: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1575: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1576: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1577: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1578: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1579: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1580: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1581: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1582: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1583: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1584: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1585: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1586: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1587: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1588: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1589: v += 105;
1590: }
1591: if (usecprow) z = zarray + 15*ridx[i];
1592: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1593: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1595: if (!usecprow) z += 15;
1596: }
1598: VecRestoreArrayRead(xx,&x);
1599: VecRestoreArrayWrite(zz,&zarray);
1600: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1601: return(0);
1602: }
1604: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1605: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1606: {
1607: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1608: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1609: const PetscScalar *x,*xb;
1610: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1611: const MatScalar *v;
1612: PetscErrorCode ierr;
1613: const PetscInt *ii,*ij=a->j,*idx;
1614: PetscInt mbs,i,j,n,*ridx=NULL;
1615: PetscBool usecprow=a->compressedrow.use;
1618: VecGetArrayRead(xx,&x);
1619: VecGetArrayWrite(zz,&zarray);
1621: v = a->a;
1622: if (usecprow) {
1623: mbs = a->compressedrow.nrows;
1624: ii = a->compressedrow.i;
1625: ridx = a->compressedrow.rindex;
1626: PetscArrayzero(zarray,15*a->mbs);
1627: } else {
1628: mbs = a->mbs;
1629: ii = a->i;
1630: z = zarray;
1631: }
1633: for (i=0; i<mbs; i++) {
1634: n = ii[i+1] - ii[i];
1635: idx = ij + ii[i];
1636: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1637: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1639: for (j=0; j<n; j++) {
1640: xb = x + 15*(idx[j]);
1641: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1642: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1644: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1645: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1646: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1647: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1648: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1649: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1650: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1651: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1652: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1653: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1654: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1655: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1656: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1657: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1658: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1659: v += 225;
1660: }
1661: if (usecprow) z = zarray + 15*ridx[i];
1662: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1663: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1665: if (!usecprow) z += 15;
1666: }
1668: VecRestoreArrayRead(xx,&x);
1669: VecRestoreArrayWrite(zz,&zarray);
1670: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1671: return(0);
1672: }
1674: /*
1675: This will not work with MatScalar == float because it calls the BLAS
1676: */
1677: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1678: {
1679: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1680: PetscScalar *z = NULL,*work,*workt,*zarray;
1681: const PetscScalar *x,*xb;
1682: const MatScalar *v;
1683: PetscErrorCode ierr;
1684: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1685: const PetscInt *idx,*ii,*ridx=NULL;
1686: PetscInt ncols,k;
1687: PetscBool usecprow=a->compressedrow.use;
1690: VecGetArrayRead(xx,&x);
1691: VecGetArrayWrite(zz,&zarray);
1693: idx = a->j;
1694: v = a->a;
1695: if (usecprow) {
1696: mbs = a->compressedrow.nrows;
1697: ii = a->compressedrow.i;
1698: ridx = a->compressedrow.rindex;
1699: PetscArrayzero(zarray,bs*a->mbs);
1700: } else {
1701: mbs = a->mbs;
1702: ii = a->i;
1703: z = zarray;
1704: }
1706: if (!a->mult_work) {
1707: k = PetscMax(A->rmap->n,A->cmap->n);
1708: PetscMalloc1(k+1,&a->mult_work);
1709: }
1710: work = a->mult_work;
1711: for (i=0; i<mbs; i++) {
1712: n = ii[1] - ii[0]; ii++;
1713: ncols = n*bs;
1714: workt = work;
1715: for (j=0; j<n; j++) {
1716: xb = x + bs*(*idx++);
1717: for (k=0; k<bs; k++) workt[k] = xb[k];
1718: workt += bs;
1719: }
1720: if (usecprow) z = zarray + bs*ridx[i];
1721: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1722: v += n*bs2;
1723: if (!usecprow) z += bs;
1724: }
1725: VecRestoreArrayRead(xx,&x);
1726: VecRestoreArrayWrite(zz,&zarray);
1727: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1728: return(0);
1729: }
1731: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1732: {
1733: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1734: const PetscScalar *x;
1735: PetscScalar *y,*z,sum;
1736: const MatScalar *v;
1737: PetscErrorCode ierr;
1738: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1739: const PetscInt *idx,*ii;
1740: PetscBool usecprow=a->compressedrow.use;
1743: VecGetArrayRead(xx,&x);
1744: VecGetArrayPair(yy,zz,&y,&z);
1746: idx = a->j;
1747: v = a->a;
1748: if (usecprow) {
1749: if (zz != yy) {
1750: PetscArraycpy(z,y,mbs);
1751: }
1752: mbs = a->compressedrow.nrows;
1753: ii = a->compressedrow.i;
1754: ridx = a->compressedrow.rindex;
1755: } else {
1756: ii = a->i;
1757: }
1759: for (i=0; i<mbs; i++) {
1760: n = ii[1] - ii[0];
1761: ii++;
1762: if (!usecprow) {
1763: sum = y[i];
1764: } else {
1765: sum = y[ridx[i]];
1766: }
1767: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1768: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1769: PetscSparseDensePlusDot(sum,x,v,idx,n);
1770: v += n;
1771: idx += n;
1772: if (usecprow) {
1773: z[ridx[i]] = sum;
1774: } else {
1775: z[i] = sum;
1776: }
1777: }
1778: VecRestoreArrayRead(xx,&x);
1779: VecRestoreArrayPair(yy,zz,&y,&z);
1780: PetscLogFlops(2.0*a->nz);
1781: return(0);
1782: }
1784: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1785: {
1786: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1787: PetscScalar *y = NULL,*z = NULL,sum1,sum2;
1788: const PetscScalar *x,*xb;
1789: PetscScalar x1,x2,*yarray,*zarray;
1790: const MatScalar *v;
1791: PetscErrorCode ierr;
1792: PetscInt mbs = a->mbs,i,n,j;
1793: const PetscInt *idx,*ii,*ridx = NULL;
1794: PetscBool usecprow = a->compressedrow.use;
1797: VecGetArrayRead(xx,&x);
1798: VecGetArrayPair(yy,zz,&yarray,&zarray);
1800: idx = a->j;
1801: v = a->a;
1802: if (usecprow) {
1803: if (zz != yy) {
1804: PetscArraycpy(zarray,yarray,2*mbs);
1805: }
1806: mbs = a->compressedrow.nrows;
1807: ii = a->compressedrow.i;
1808: ridx = a->compressedrow.rindex;
1809: } else {
1810: ii = a->i;
1811: y = yarray;
1812: z = zarray;
1813: }
1815: for (i=0; i<mbs; i++) {
1816: n = ii[1] - ii[0]; ii++;
1817: if (usecprow) {
1818: z = zarray + 2*ridx[i];
1819: y = yarray + 2*ridx[i];
1820: }
1821: sum1 = y[0]; sum2 = y[1];
1822: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1823: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1824: for (j=0; j<n; j++) {
1825: xb = x + 2*(*idx++);
1826: x1 = xb[0];
1827: x2 = xb[1];
1829: sum1 += v[0]*x1 + v[2]*x2;
1830: sum2 += v[1]*x1 + v[3]*x2;
1831: v += 4;
1832: }
1833: z[0] = sum1; z[1] = sum2;
1834: if (!usecprow) {
1835: z += 2; y += 2;
1836: }
1837: }
1838: VecRestoreArrayRead(xx,&x);
1839: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1840: PetscLogFlops(4.0*a->nz);
1841: return(0);
1842: }
1844: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1845: {
1846: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1847: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1848: const PetscScalar *x,*xb;
1849: const MatScalar *v;
1850: PetscErrorCode ierr;
1851: PetscInt mbs = a->mbs,i,j,n;
1852: const PetscInt *idx,*ii,*ridx = NULL;
1853: PetscBool usecprow = a->compressedrow.use;
1856: VecGetArrayRead(xx,&x);
1857: VecGetArrayPair(yy,zz,&yarray,&zarray);
1859: idx = a->j;
1860: v = a->a;
1861: if (usecprow) {
1862: if (zz != yy) {
1863: PetscArraycpy(zarray,yarray,3*mbs);
1864: }
1865: mbs = a->compressedrow.nrows;
1866: ii = a->compressedrow.i;
1867: ridx = a->compressedrow.rindex;
1868: } else {
1869: ii = a->i;
1870: y = yarray;
1871: z = zarray;
1872: }
1874: for (i=0; i<mbs; i++) {
1875: n = ii[1] - ii[0]; ii++;
1876: if (usecprow) {
1877: z = zarray + 3*ridx[i];
1878: y = yarray + 3*ridx[i];
1879: }
1880: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1881: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1882: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1883: for (j=0; j<n; j++) {
1884: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1885: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1886: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1887: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1888: v += 9;
1889: }
1890: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1891: if (!usecprow) {
1892: z += 3; y += 3;
1893: }
1894: }
1895: VecRestoreArrayRead(xx,&x);
1896: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1897: PetscLogFlops(18.0*a->nz);
1898: return(0);
1899: }
1901: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1902: {
1903: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1904: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1905: const PetscScalar *x,*xb;
1906: const MatScalar *v;
1907: PetscErrorCode ierr;
1908: PetscInt mbs = a->mbs,i,j,n;
1909: const PetscInt *idx,*ii,*ridx=NULL;
1910: PetscBool usecprow=a->compressedrow.use;
1913: VecGetArrayRead(xx,&x);
1914: VecGetArrayPair(yy,zz,&yarray,&zarray);
1916: idx = a->j;
1917: v = a->a;
1918: if (usecprow) {
1919: if (zz != yy) {
1920: PetscArraycpy(zarray,yarray,4*mbs);
1921: }
1922: mbs = a->compressedrow.nrows;
1923: ii = a->compressedrow.i;
1924: ridx = a->compressedrow.rindex;
1925: } else {
1926: ii = a->i;
1927: y = yarray;
1928: z = zarray;
1929: }
1931: for (i=0; i<mbs; i++) {
1932: n = ii[1] - ii[0]; ii++;
1933: if (usecprow) {
1934: z = zarray + 4*ridx[i];
1935: y = yarray + 4*ridx[i];
1936: }
1937: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1938: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1939: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1940: for (j=0; j<n; j++) {
1941: xb = x + 4*(*idx++);
1942: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1943: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1944: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1945: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1946: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1947: v += 16;
1948: }
1949: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1950: if (!usecprow) {
1951: z += 4; y += 4;
1952: }
1953: }
1954: VecRestoreArrayRead(xx,&x);
1955: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1956: PetscLogFlops(32.0*a->nz);
1957: return(0);
1958: }
1960: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1961: {
1962: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1963: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1964: const PetscScalar *x,*xb;
1965: PetscScalar *yarray,*zarray;
1966: const MatScalar *v;
1967: PetscErrorCode ierr;
1968: PetscInt mbs = a->mbs,i,j,n;
1969: const PetscInt *idx,*ii,*ridx = NULL;
1970: PetscBool usecprow=a->compressedrow.use;
1973: VecGetArrayRead(xx,&x);
1974: VecGetArrayPair(yy,zz,&yarray,&zarray);
1976: idx = a->j;
1977: v = a->a;
1978: if (usecprow) {
1979: if (zz != yy) {
1980: PetscArraycpy(zarray,yarray,5*mbs);
1981: }
1982: mbs = a->compressedrow.nrows;
1983: ii = a->compressedrow.i;
1984: ridx = a->compressedrow.rindex;
1985: } else {
1986: ii = a->i;
1987: y = yarray;
1988: z = zarray;
1989: }
1991: for (i=0; i<mbs; i++) {
1992: n = ii[1] - ii[0]; ii++;
1993: if (usecprow) {
1994: z = zarray + 5*ridx[i];
1995: y = yarray + 5*ridx[i];
1996: }
1997: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1998: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1999: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2000: for (j=0; j<n; j++) {
2001: xb = x + 5*(*idx++);
2002: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
2003: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2004: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2005: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2006: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2007: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2008: v += 25;
2009: }
2010: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
2011: if (!usecprow) {
2012: z += 5; y += 5;
2013: }
2014: }
2015: VecRestoreArrayRead(xx,&x);
2016: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2017: PetscLogFlops(50.0*a->nz);
2018: return(0);
2019: }
2021: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
2022: {
2023: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2024: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
2025: const PetscScalar *x,*xb;
2026: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
2027: const MatScalar *v;
2028: PetscErrorCode ierr;
2029: PetscInt mbs = a->mbs,i,j,n;
2030: const PetscInt *idx,*ii,*ridx=NULL;
2031: PetscBool usecprow=a->compressedrow.use;
2034: VecGetArrayRead(xx,&x);
2035: VecGetArrayPair(yy,zz,&yarray,&zarray);
2037: idx = a->j;
2038: v = a->a;
2039: if (usecprow) {
2040: if (zz != yy) {
2041: PetscArraycpy(zarray,yarray,6*mbs);
2042: }
2043: mbs = a->compressedrow.nrows;
2044: ii = a->compressedrow.i;
2045: ridx = a->compressedrow.rindex;
2046: } else {
2047: ii = a->i;
2048: y = yarray;
2049: z = zarray;
2050: }
2052: for (i=0; i<mbs; i++) {
2053: n = ii[1] - ii[0]; ii++;
2054: if (usecprow) {
2055: z = zarray + 6*ridx[i];
2056: y = yarray + 6*ridx[i];
2057: }
2058: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2059: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2060: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2061: for (j=0; j<n; j++) {
2062: xb = x + 6*(*idx++);
2063: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2064: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2065: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2066: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2067: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2068: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2069: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2070: v += 36;
2071: }
2072: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2073: if (!usecprow) {
2074: z += 6; y += 6;
2075: }
2076: }
2077: VecRestoreArrayRead(xx,&x);
2078: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2079: PetscLogFlops(72.0*a->nz);
2080: return(0);
2081: }
2083: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2084: {
2085: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2086: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2087: const PetscScalar *x,*xb;
2088: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2089: const MatScalar *v;
2090: PetscErrorCode ierr;
2091: PetscInt mbs = a->mbs,i,j,n;
2092: const PetscInt *idx,*ii,*ridx = NULL;
2093: PetscBool usecprow=a->compressedrow.use;
2096: VecGetArrayRead(xx,&x);
2097: VecGetArrayPair(yy,zz,&yarray,&zarray);
2099: idx = a->j;
2100: v = a->a;
2101: if (usecprow) {
2102: if (zz != yy) {
2103: PetscArraycpy(zarray,yarray,7*mbs);
2104: }
2105: mbs = a->compressedrow.nrows;
2106: ii = a->compressedrow.i;
2107: ridx = a->compressedrow.rindex;
2108: } else {
2109: ii = a->i;
2110: y = yarray;
2111: z = zarray;
2112: }
2114: for (i=0; i<mbs; i++) {
2115: n = ii[1] - ii[0]; ii++;
2116: if (usecprow) {
2117: z = zarray + 7*ridx[i];
2118: y = yarray + 7*ridx[i];
2119: }
2120: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2121: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2122: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2123: for (j=0; j<n; j++) {
2124: xb = x + 7*(*idx++);
2125: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2126: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2127: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2128: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2129: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2130: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2131: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2132: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2133: v += 49;
2134: }
2135: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2136: if (!usecprow) {
2137: z += 7; y += 7;
2138: }
2139: }
2140: VecRestoreArrayRead(xx,&x);
2141: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2142: PetscLogFlops(98.0*a->nz);
2143: return(0);
2144: }
2146: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2147: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2148: {
2149: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2150: PetscScalar *z = NULL,*work,*workt,*zarray;
2151: const PetscScalar *x,*xb;
2152: const MatScalar *v;
2153: PetscErrorCode ierr;
2154: PetscInt mbs,i,j,n;
2155: PetscInt k;
2156: PetscBool usecprow=a->compressedrow.use;
2157: const PetscInt *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;
2159: __m256d a0,a1,a2,a3,a4,a5;
2160: __m256d w0,w1,w2,w3;
2161: __m256d z0,z1,z2;
2162: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
2165: VecCopy(yy,zz);
2166: VecGetArrayRead(xx,&x);
2167: VecGetArray(zz,&zarray);
2169: idx = a->j;
2170: v = a->a;
2171: if (usecprow) {
2172: mbs = a->compressedrow.nrows;
2173: ii = a->compressedrow.i;
2174: ridx = a->compressedrow.rindex;
2175: } else {
2176: mbs = a->mbs;
2177: ii = a->i;
2178: z = zarray;
2179: }
2181: if (!a->mult_work) {
2182: k = PetscMax(A->rmap->n,A->cmap->n);
2183: PetscMalloc1(k+1,&a->mult_work);
2184: }
2186: work = a->mult_work;
2187: for (i=0; i<mbs; i++) {
2188: n = ii[1] - ii[0]; ii++;
2189: workt = work;
2190: for (j=0; j<n; j++) {
2191: xb = x + bs*(*idx++);
2192: for (k=0; k<bs; k++) workt[k] = xb[k];
2193: workt += bs;
2194: }
2195: if (usecprow) z = zarray + bs*ridx[i];
2197: z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
2199: for (j=0; j<n; j++) {
2200: /* first column of a */
2201: w0 = _mm256_set1_pd(work[j*9 ]);
2202: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2203: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2204: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
2206: /* second column of a */
2207: w1 = _mm256_set1_pd(work[j*9+ 1]);
2208: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2209: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2210: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2212: /* third column of a */
2213: w2 = _mm256_set1_pd(work[j*9+ 2]);
2214: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2215: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2216: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
2218: /* fourth column of a */
2219: w3 = _mm256_set1_pd(work[j*9+ 3]);
2220: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2221: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2222: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
2224: /* fifth column of a */
2225: w0 = _mm256_set1_pd(work[j*9+ 4]);
2226: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2227: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2228: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
2230: /* sixth column of a */
2231: w1 = _mm256_set1_pd(work[j*9+ 5]);
2232: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2233: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2234: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2236: /* seventh column of a */
2237: w2 = _mm256_set1_pd(work[j*9+ 6]);
2238: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2239: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2240: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
2242: /* eigth column of a */
2243: w3 = _mm256_set1_pd(work[j*9+ 7]);
2244: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2245: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2246: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
2248: /* ninth column of a */
2249: w0 = _mm256_set1_pd(work[j*9+ 8]);
2250: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2251: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2252: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2253: }
2255: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
2257: v += n*bs2;
2258: if (!usecprow) z += bs;
2259: }
2260: VecRestoreArrayRead(xx,&x);
2261: VecRestoreArray(zz,&zarray);
2262: PetscLogFlops(162.0*a->nz);
2263: return(0);
2264: }
2265: #endif
2267: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2268: {
2269: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2270: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2271: const PetscScalar *x,*xb;
2272: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2273: const MatScalar *v;
2274: PetscErrorCode ierr;
2275: PetscInt mbs = a->mbs,i,j,n;
2276: const PetscInt *idx,*ii,*ridx = NULL;
2277: PetscBool usecprow=a->compressedrow.use;
2280: VecGetArrayRead(xx,&x);
2281: VecGetArrayPair(yy,zz,&yarray,&zarray);
2283: idx = a->j;
2284: v = a->a;
2285: if (usecprow) {
2286: if (zz != yy) {
2287: PetscArraycpy(zarray,yarray,7*mbs);
2288: }
2289: mbs = a->compressedrow.nrows;
2290: ii = a->compressedrow.i;
2291: ridx = a->compressedrow.rindex;
2292: } else {
2293: ii = a->i;
2294: y = yarray;
2295: z = zarray;
2296: }
2298: for (i=0; i<mbs; i++) {
2299: n = ii[1] - ii[0]; ii++;
2300: if (usecprow) {
2301: z = zarray + 11*ridx[i];
2302: y = yarray + 11*ridx[i];
2303: }
2304: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2305: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2306: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2307: PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2308: for (j=0; j<n; j++) {
2309: xb = x + 11*(*idx++);
2310: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2311: sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11;
2312: sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11;
2313: sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11;
2314: sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11;
2315: sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11;
2316: sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11;
2317: sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11;
2318: sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11;
2319: sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11;
2320: sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11;
2321: sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11;
2322: v += 121;
2323: }
2324: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2325: z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
2326: if (!usecprow) {
2327: z += 11; y += 11;
2328: }
2329: }
2330: VecRestoreArrayRead(xx,&x);
2331: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2332: PetscLogFlops(242.0*a->nz);
2333: return(0);
2334: }
2336: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2337: {
2338: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2339: PetscScalar *z = NULL,*work,*workt,*zarray;
2340: const PetscScalar *x,*xb;
2341: const MatScalar *v;
2342: PetscErrorCode ierr;
2343: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2344: PetscInt ncols,k;
2345: const PetscInt *ridx = NULL,*idx,*ii;
2346: PetscBool usecprow = a->compressedrow.use;
2349: VecCopy(yy,zz);
2350: VecGetArrayRead(xx,&x);
2351: VecGetArray(zz,&zarray);
2353: idx = a->j;
2354: v = a->a;
2355: if (usecprow) {
2356: mbs = a->compressedrow.nrows;
2357: ii = a->compressedrow.i;
2358: ridx = a->compressedrow.rindex;
2359: } else {
2360: mbs = a->mbs;
2361: ii = a->i;
2362: z = zarray;
2363: }
2365: if (!a->mult_work) {
2366: k = PetscMax(A->rmap->n,A->cmap->n);
2367: PetscMalloc1(k+1,&a->mult_work);
2368: }
2369: work = a->mult_work;
2370: for (i=0; i<mbs; i++) {
2371: n = ii[1] - ii[0]; ii++;
2372: ncols = n*bs;
2373: workt = work;
2374: for (j=0; j<n; j++) {
2375: xb = x + bs*(*idx++);
2376: for (k=0; k<bs; k++) workt[k] = xb[k];
2377: workt += bs;
2378: }
2379: if (usecprow) z = zarray + bs*ridx[i];
2380: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2381: v += n*bs2;
2382: if (!usecprow) z += bs;
2383: }
2384: VecRestoreArrayRead(xx,&x);
2385: VecRestoreArray(zz,&zarray);
2386: PetscLogFlops(2.0*a->nz*bs2);
2387: return(0);
2388: }
2390: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2391: {
2392: PetscScalar zero = 0.0;
2396: VecSet(zz,zero);
2397: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2398: return(0);
2399: }
2401: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2402: {
2403: PetscScalar zero = 0.0;
2407: VecSet(zz,zero);
2408: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2409: return(0);
2410: }
2412: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2413: {
2414: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2415: PetscScalar *z,x1,x2,x3,x4,x5;
2416: const PetscScalar *x,*xb = NULL;
2417: const MatScalar *v;
2418: PetscErrorCode ierr;
2419: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
2420: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2421: Mat_CompressedRow cprow = a->compressedrow;
2422: PetscBool usecprow = cprow.use;
2425: if (yy != zz) { VecCopy(yy,zz); }
2426: VecGetArrayRead(xx,&x);
2427: VecGetArray(zz,&z);
2429: idx = a->j;
2430: v = a->a;
2431: if (usecprow) {
2432: mbs = cprow.nrows;
2433: ii = cprow.i;
2434: ridx = cprow.rindex;
2435: } else {
2436: mbs=a->mbs;
2437: ii = a->i;
2438: xb = x;
2439: }
2441: switch (bs) {
2442: case 1:
2443: for (i=0; i<mbs; i++) {
2444: if (usecprow) xb = x + ridx[i];
2445: x1 = xb[0];
2446: ib = idx + ii[0];
2447: n = ii[1] - ii[0]; ii++;
2448: for (j=0; j<n; j++) {
2449: rval = ib[j];
2450: z[rval] += PetscConj(*v) * x1;
2451: v++;
2452: }
2453: if (!usecprow) xb++;
2454: }
2455: break;
2456: case 2:
2457: for (i=0; i<mbs; i++) {
2458: if (usecprow) xb = x + 2*ridx[i];
2459: x1 = xb[0]; x2 = xb[1];
2460: ib = idx + ii[0];
2461: n = ii[1] - ii[0]; ii++;
2462: for (j=0; j<n; j++) {
2463: rval = ib[j]*2;
2464: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2465: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2466: v += 4;
2467: }
2468: if (!usecprow) xb += 2;
2469: }
2470: break;
2471: case 3:
2472: for (i=0; i<mbs; i++) {
2473: if (usecprow) xb = x + 3*ridx[i];
2474: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2475: ib = idx + ii[0];
2476: n = ii[1] - ii[0]; ii++;
2477: for (j=0; j<n; j++) {
2478: rval = ib[j]*3;
2479: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2480: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2481: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2482: v += 9;
2483: }
2484: if (!usecprow) xb += 3;
2485: }
2486: break;
2487: case 4:
2488: for (i=0; i<mbs; i++) {
2489: if (usecprow) xb = x + 4*ridx[i];
2490: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2491: ib = idx + ii[0];
2492: n = ii[1] - ii[0]; ii++;
2493: for (j=0; j<n; j++) {
2494: rval = ib[j]*4;
2495: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
2496: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
2497: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2498: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2499: v += 16;
2500: }
2501: if (!usecprow) xb += 4;
2502: }
2503: break;
2504: case 5:
2505: for (i=0; i<mbs; i++) {
2506: if (usecprow) xb = x + 5*ridx[i];
2507: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2508: x4 = xb[3]; x5 = xb[4];
2509: ib = idx + ii[0];
2510: n = ii[1] - ii[0]; ii++;
2511: for (j=0; j<n; j++) {
2512: rval = ib[j]*5;
2513: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
2514: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
2515: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2516: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2517: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2518: v += 25;
2519: }
2520: if (!usecprow) xb += 5;
2521: }
2522: break;
2523: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2524: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2525: #if 0
2526: {
2527: PetscInt ncols,k,bs2=a->bs2;
2528: PetscScalar *work,*workt,zb;
2529: const PetscScalar *xtmp;
2530: if (!a->mult_work) {
2531: k = PetscMax(A->rmap->n,A->cmap->n);
2532: PetscMalloc1(k+1,&a->mult_work);
2533: }
2534: work = a->mult_work;
2535: xtmp = x;
2536: for (i=0; i<mbs; i++) {
2537: n = ii[1] - ii[0]; ii++;
2538: ncols = n*bs;
2539: PetscArrayzero(work,ncols);
2540: if (usecprow) xtmp = x + bs*ridx[i];
2541: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2542: v += n*bs2;
2543: if (!usecprow) xtmp += bs;
2544: workt = work;
2545: for (j=0; j<n; j++) {
2546: zb = z + bs*(*idx++);
2547: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2548: workt += bs;
2549: }
2550: }
2551: }
2552: #endif
2553: }
2554: VecRestoreArrayRead(xx,&x);
2555: VecRestoreArray(zz,&z);
2556: PetscLogFlops(2.0*a->nz*a->bs2);
2557: return(0);
2558: }
2560: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2561: {
2562: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2563: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
2564: const PetscScalar *x,*xb = NULL;
2565: const MatScalar *v;
2566: PetscErrorCode ierr;
2567: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2568: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2569: Mat_CompressedRow cprow = a->compressedrow;
2570: PetscBool usecprow=cprow.use;
2573: if (yy != zz) { VecCopy(yy,zz); }
2574: VecGetArrayRead(xx,&x);
2575: VecGetArray(zz,&z);
2577: idx = a->j;
2578: v = a->a;
2579: if (usecprow) {
2580: mbs = cprow.nrows;
2581: ii = cprow.i;
2582: ridx = cprow.rindex;
2583: } else {
2584: mbs=a->mbs;
2585: ii = a->i;
2586: xb = x;
2587: }
2589: switch (bs) {
2590: case 1:
2591: for (i=0; i<mbs; i++) {
2592: if (usecprow) xb = x + ridx[i];
2593: x1 = xb[0];
2594: ib = idx + ii[0];
2595: n = ii[1] - ii[0]; ii++;
2596: for (j=0; j<n; j++) {
2597: rval = ib[j];
2598: z[rval] += *v * x1;
2599: v++;
2600: }
2601: if (!usecprow) xb++;
2602: }
2603: break;
2604: case 2:
2605: for (i=0; i<mbs; i++) {
2606: if (usecprow) xb = x + 2*ridx[i];
2607: x1 = xb[0]; x2 = xb[1];
2608: ib = idx + ii[0];
2609: n = ii[1] - ii[0]; ii++;
2610: for (j=0; j<n; j++) {
2611: rval = ib[j]*2;
2612: z[rval++] += v[0]*x1 + v[1]*x2;
2613: z[rval++] += v[2]*x1 + v[3]*x2;
2614: v += 4;
2615: }
2616: if (!usecprow) xb += 2;
2617: }
2618: break;
2619: case 3:
2620: for (i=0; i<mbs; i++) {
2621: if (usecprow) xb = x + 3*ridx[i];
2622: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2623: ib = idx + ii[0];
2624: n = ii[1] - ii[0]; ii++;
2625: for (j=0; j<n; j++) {
2626: rval = ib[j]*3;
2627: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2628: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2629: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2630: v += 9;
2631: }
2632: if (!usecprow) xb += 3;
2633: }
2634: break;
2635: case 4:
2636: for (i=0; i<mbs; i++) {
2637: if (usecprow) xb = x + 4*ridx[i];
2638: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2639: ib = idx + ii[0];
2640: n = ii[1] - ii[0]; ii++;
2641: for (j=0; j<n; j++) {
2642: rval = ib[j]*4;
2643: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
2644: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
2645: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
2646: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2647: v += 16;
2648: }
2649: if (!usecprow) xb += 4;
2650: }
2651: break;
2652: case 5:
2653: for (i=0; i<mbs; i++) {
2654: if (usecprow) xb = x + 5*ridx[i];
2655: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2656: x4 = xb[3]; x5 = xb[4];
2657: ib = idx + ii[0];
2658: n = ii[1] - ii[0]; ii++;
2659: for (j=0; j<n; j++) {
2660: rval = ib[j]*5;
2661: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
2662: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
2663: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2664: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2665: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2666: v += 25;
2667: }
2668: if (!usecprow) xb += 5;
2669: }
2670: break;
2671: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
2672: PetscInt ncols,k;
2673: PetscScalar *work,*workt;
2674: const PetscScalar *xtmp;
2675: if (!a->mult_work) {
2676: k = PetscMax(A->rmap->n,A->cmap->n);
2677: PetscMalloc1(k+1,&a->mult_work);
2678: }
2679: work = a->mult_work;
2680: xtmp = x;
2681: for (i=0; i<mbs; i++) {
2682: n = ii[1] - ii[0]; ii++;
2683: ncols = n*bs;
2684: PetscArrayzero(work,ncols);
2685: if (usecprow) xtmp = x + bs*ridx[i];
2686: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2687: v += n*bs2;
2688: if (!usecprow) xtmp += bs;
2689: workt = work;
2690: for (j=0; j<n; j++) {
2691: zb = z + bs*(*idx++);
2692: for (k=0; k<bs; k++) zb[k] += workt[k];
2693: workt += bs;
2694: }
2695: }
2696: }
2697: }
2698: VecRestoreArrayRead(xx,&x);
2699: VecRestoreArray(zz,&z);
2700: PetscLogFlops(2.0*a->nz*a->bs2);
2701: return(0);
2702: }
2704: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2705: {
2706: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
2707: PetscInt totalnz = a->bs2*a->nz;
2708: PetscScalar oalpha = alpha;
2710: PetscBLASInt one = 1,tnz;
2713: PetscBLASIntCast(totalnz,&tnz);
2714: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2715: PetscLogFlops(totalnz);
2716: return(0);
2717: }
2719: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2720: {
2722: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2723: MatScalar *v = a->a;
2724: PetscReal sum = 0.0;
2725: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2728: if (type == NORM_FROBENIUS) {
2729: #if defined(PETSC_USE_REAL___FP16)
2730: PetscBLASInt one = 1,cnt = bs2*nz;
2731: PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2732: #else
2733: for (i=0; i<bs2*nz; i++) {
2734: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2735: }
2736: #endif
2737: *norm = PetscSqrtReal(sum);
2738: PetscLogFlops(2.0*bs2*nz);
2739: } else if (type == NORM_1) { /* maximum column sum */
2740: PetscReal *tmp;
2741: PetscInt *bcol = a->j;
2742: PetscCalloc1(A->cmap->n+1,&tmp);
2743: for (i=0; i<nz; i++) {
2744: for (j=0; j<bs; j++) {
2745: k1 = bs*(*bcol) + j; /* column index */
2746: for (k=0; k<bs; k++) {
2747: tmp[k1] += PetscAbsScalar(*v); v++;
2748: }
2749: }
2750: bcol++;
2751: }
2752: *norm = 0.0;
2753: for (j=0; j<A->cmap->n; j++) {
2754: if (tmp[j] > *norm) *norm = tmp[j];
2755: }
2756: PetscFree(tmp);
2757: PetscLogFlops(PetscMax(bs2*nz-1,0));
2758: } else if (type == NORM_INFINITY) { /* maximum row sum */
2759: *norm = 0.0;
2760: for (k=0; k<bs; k++) {
2761: for (j=0; j<a->mbs; j++) {
2762: v = a->a + bs2*a->i[j] + k;
2763: sum = 0.0;
2764: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2765: for (k1=0; k1<bs; k1++) {
2766: sum += PetscAbsScalar(*v);
2767: v += bs;
2768: }
2769: }
2770: if (sum > *norm) *norm = sum;
2771: }
2772: }
2773: PetscLogFlops(PetscMax(bs2*nz-1,0));
2774: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2775: return(0);
2776: }
2779: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2780: {
2781: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2785: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
2786: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2787: *flg = PETSC_FALSE;
2788: return(0);
2789: }
2791: /* if the a->i are the same */
2792: PetscArraycmp(a->i,b->i,a->mbs+1,flg);
2793: if (!*flg) return(0);
2795: /* if a->j are the same */
2796: PetscArraycmp(a->j,b->j,a->nz,flg);
2797: if (!*flg) return(0);
2799: /* if a->a are the same */
2800: PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);
2801: return(0);
2803: }
2805: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2806: {
2807: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2809: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2810: PetscScalar *x,zero = 0.0;
2811: MatScalar *aa,*aa_j;
2814: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2815: bs = A->rmap->bs;
2816: aa = a->a;
2817: ai = a->i;
2818: aj = a->j;
2819: ambs = a->mbs;
2820: bs2 = a->bs2;
2822: VecSet(v,zero);
2823: VecGetArray(v,&x);
2824: VecGetLocalSize(v,&n);
2825: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2826: for (i=0; i<ambs; i++) {
2827: for (j=ai[i]; j<ai[i+1]; j++) {
2828: if (aj[j] == i) {
2829: row = i*bs;
2830: aa_j = aa+j*bs2;
2831: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2832: break;
2833: }
2834: }
2835: }
2836: VecRestoreArray(v,&x);
2837: return(0);
2838: }
2840: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2841: {
2842: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2843: const PetscScalar *l,*r,*li,*ri;
2844: PetscScalar x;
2845: MatScalar *aa, *v;
2846: PetscErrorCode ierr;
2847: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2848: const PetscInt *ai,*aj;
2851: ai = a->i;
2852: aj = a->j;
2853: aa = a->a;
2854: m = A->rmap->n;
2855: n = A->cmap->n;
2856: bs = A->rmap->bs;
2857: mbs = a->mbs;
2858: bs2 = a->bs2;
2859: if (ll) {
2860: VecGetArrayRead(ll,&l);
2861: VecGetLocalSize(ll,&lm);
2862: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2863: for (i=0; i<mbs; i++) { /* for each block row */
2864: M = ai[i+1] - ai[i];
2865: li = l + i*bs;
2866: v = aa + bs2*ai[i];
2867: for (j=0; j<M; j++) { /* for each block */
2868: for (k=0; k<bs2; k++) {
2869: (*v++) *= li[k%bs];
2870: }
2871: }
2872: }
2873: VecRestoreArrayRead(ll,&l);
2874: PetscLogFlops(a->nz);
2875: }
2877: if (rr) {
2878: VecGetArrayRead(rr,&r);
2879: VecGetLocalSize(rr,&rn);
2880: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2881: for (i=0; i<mbs; i++) { /* for each block row */
2882: iai = ai[i];
2883: M = ai[i+1] - iai;
2884: v = aa + bs2*iai;
2885: for (j=0; j<M; j++) { /* for each block */
2886: ri = r + bs*aj[iai+j];
2887: for (k=0; k<bs; k++) {
2888: x = ri[k];
2889: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2890: v += bs;
2891: }
2892: }
2893: }
2894: VecRestoreArrayRead(rr,&r);
2895: PetscLogFlops(a->nz);
2896: }
2897: return(0);
2898: }
2901: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2902: {
2903: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2906: info->block_size = a->bs2;
2907: info->nz_allocated = a->bs2*a->maxnz;
2908: info->nz_used = a->bs2*a->nz;
2909: info->nz_unneeded = info->nz_allocated - info->nz_used;
2910: info->assemblies = A->num_ass;
2911: info->mallocs = A->info.mallocs;
2912: info->memory = ((PetscObject)A)->mem;
2913: if (A->factortype) {
2914: info->fill_ratio_given = A->info.fill_ratio_given;
2915: info->fill_ratio_needed = A->info.fill_ratio_needed;
2916: info->factor_mallocs = A->info.factor_mallocs;
2917: } else {
2918: info->fill_ratio_given = 0;
2919: info->fill_ratio_needed = 0;
2920: info->factor_mallocs = 0;
2921: }
2922: return(0);
2923: }
2925: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2926: {
2927: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2931: PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
2932: return(0);
2933: }
2935: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2936: {
2940: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
2941: C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2942: return(0);
2943: }
2945: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2946: {
2947: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2948: PetscScalar *z = NULL,sum1;
2949: const PetscScalar *xb;
2950: PetscScalar x1;
2951: const MatScalar *v,*vv;
2952: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2953: PetscBool usecprow=a->compressedrow.use;
2956: idx = a->j;
2957: v = a->a;
2958: if (usecprow) {
2959: mbs = a->compressedrow.nrows;
2960: ii = a->compressedrow.i;
2961: ridx = a->compressedrow.rindex;
2962: } else {
2963: mbs = a->mbs;
2964: ii = a->i;
2965: z = c;
2966: }
2968: for (i=0; i<mbs; i++) {
2969: n = ii[1] - ii[0]; ii++;
2970: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2971: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2972: if (usecprow) z = c + ridx[i];
2973: jj = idx;
2974: vv = v;
2975: for (k=0; k<cn; k++) {
2976: idx = jj;
2977: v = vv;
2978: sum1 = 0.0;
2979: for (j=0; j<n; j++) {
2980: xb = b + (*idx++); x1 = xb[0+k*bm];
2981: sum1 += v[0]*x1;
2982: v += 1;
2983: }
2984: z[0+k*cm] = sum1;
2985: }
2986: if (!usecprow) z += 1;
2987: }
2988: return(0);
2989: }
2991: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2992: {
2993: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2994: PetscScalar *z = NULL,sum1,sum2;
2995: const PetscScalar *xb;
2996: PetscScalar x1,x2;
2997: const MatScalar *v,*vv;
2998: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2999: PetscBool usecprow=a->compressedrow.use;
3002: idx = a->j;
3003: v = a->a;
3004: if (usecprow) {
3005: mbs = a->compressedrow.nrows;
3006: ii = a->compressedrow.i;
3007: ridx = a->compressedrow.rindex;
3008: } else {
3009: mbs = a->mbs;
3010: ii = a->i;
3011: z = c;
3012: }
3014: for (i=0; i<mbs; i++) {
3015: n = ii[1] - ii[0]; ii++;
3016: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3017: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3018: if (usecprow) z = c + 2*ridx[i];
3019: jj = idx;
3020: vv = v;
3021: for (k=0; k<cn; k++) {
3022: idx = jj;
3023: v = vv;
3024: sum1 = 0.0; sum2 = 0.0;
3025: for (j=0; j<n; j++) {
3026: xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
3027: sum1 += v[0]*x1 + v[2]*x2;
3028: sum2 += v[1]*x1 + v[3]*x2;
3029: v += 4;
3030: }
3031: z[0+k*cm] = sum1; z[1+k*cm] = sum2;
3032: }
3033: if (!usecprow) z += 2;
3034: }
3035: return(0);
3036: }
3038: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3039: {
3040: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3041: PetscScalar *z = NULL,sum1,sum2,sum3;
3042: const PetscScalar *xb;
3043: PetscScalar x1,x2,x3;
3044: const MatScalar *v,*vv;
3045: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3046: PetscBool usecprow=a->compressedrow.use;
3049: idx = a->j;
3050: v = a->a;
3051: if (usecprow) {
3052: mbs = a->compressedrow.nrows;
3053: ii = a->compressedrow.i;
3054: ridx = a->compressedrow.rindex;
3055: } else {
3056: mbs = a->mbs;
3057: ii = a->i;
3058: z = c;
3059: }
3061: for (i=0; i<mbs; i++) {
3062: n = ii[1] - ii[0]; ii++;
3063: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3064: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3065: if (usecprow) z = c + 3*ridx[i];
3066: jj = idx;
3067: vv = v;
3068: for (k=0; k<cn; k++) {
3069: idx = jj;
3070: v = vv;
3071: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
3072: for (j=0; j<n; j++) {
3073: xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
3074: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3075: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3076: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3077: v += 9;
3078: }
3079: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
3080: }
3081: if (!usecprow) z += 3;
3082: }
3083: return(0);
3084: }
3086: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3087: {
3088: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3089: PetscScalar *z = NULL,sum1,sum2,sum3,sum4;
3090: const PetscScalar *xb;
3091: PetscScalar x1,x2,x3,x4;
3092: const MatScalar *v,*vv;
3093: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3094: PetscBool usecprow=a->compressedrow.use;
3097: idx = a->j;
3098: v = a->a;
3099: if (usecprow) {
3100: mbs = a->compressedrow.nrows;
3101: ii = a->compressedrow.i;
3102: ridx = a->compressedrow.rindex;
3103: } else {
3104: mbs = a->mbs;
3105: ii = a->i;
3106: z = c;
3107: }
3109: for (i=0; i<mbs; i++) {
3110: n = ii[1] - ii[0]; ii++;
3111: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3112: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3113: if (usecprow) z = c + 4*ridx[i];
3114: jj = idx;
3115: vv = v;
3116: for (k=0; k<cn; k++) {
3117: idx = jj;
3118: v = vv;
3119: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3120: for (j=0; j<n; j++) {
3121: xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3122: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3123: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3124: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3125: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3126: v += 16;
3127: }
3128: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3129: }
3130: if (!usecprow) z += 4;
3131: }
3132: return(0);
3133: }
3135: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3136: {
3137: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3138: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5;
3139: const PetscScalar *xb;
3140: PetscScalar x1,x2,x3,x4,x5;
3141: const MatScalar *v,*vv;
3142: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3143: PetscBool usecprow=a->compressedrow.use;
3146: idx = a->j;
3147: v = a->a;
3148: if (usecprow) {
3149: mbs = a->compressedrow.nrows;
3150: ii = a->compressedrow.i;
3151: ridx = a->compressedrow.rindex;
3152: } else {
3153: mbs = a->mbs;
3154: ii = a->i;
3155: z = c;
3156: }
3158: for (i=0; i<mbs; i++) {
3159: n = ii[1] - ii[0]; ii++;
3160: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3161: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3162: if (usecprow) z = c + 5*ridx[i];
3163: jj = idx;
3164: vv = v;
3165: for (k=0; k<cn; k++) {
3166: idx = jj;
3167: v = vv;
3168: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3169: for (j=0; j<n; j++) {
3170: xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3171: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3172: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3173: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3174: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3175: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3176: v += 25;
3177: }
3178: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3179: }
3180: if (!usecprow) z += 5;
3181: }
3182: return(0);
3183: }
3185: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3186: {
3187: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3188: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
3189: Mat_SeqDense *cd = (Mat_SeqDense*)C->data;
3190: PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3191: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3192: PetscBLASInt bbs,bcn,bbm,bcm;
3193: PetscScalar *z = NULL;
3194: PetscScalar *c,*b;
3195: const MatScalar *v;
3196: const PetscInt *idx,*ii,*ridx=NULL;
3197: PetscScalar _DZero=0.0,_DOne=1.0;
3198: PetscBool usecprow=a->compressedrow.use;
3199: PetscErrorCode ierr;
3202: if (!cm || !cn) return(0);
3203: if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
3204: if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
3205: if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
3206: b = bd->v;
3207: if (a->nonzerorowcnt != A->rmap->n) {
3208: MatZeroEntries(C);
3209: }
3210: MatDenseGetArray(C,&c);
3211: switch (bs) {
3212: case 1:
3213: MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3214: break;
3215: case 2:
3216: MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3217: break;
3218: case 3:
3219: MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3220: break;
3221: case 4:
3222: MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3223: break;
3224: case 5:
3225: MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3226: break;
3227: default: /* block sizes larger than 5 by 5 are handled by BLAS */
3228: PetscBLASIntCast(bs,&bbs);
3229: PetscBLASIntCast(cn,&bcn);
3230: PetscBLASIntCast(bm,&bbm);
3231: PetscBLASIntCast(cm,&bcm);
3232: idx = a->j;
3233: v = a->a;
3234: if (usecprow) {
3235: mbs = a->compressedrow.nrows;
3236: ii = a->compressedrow.i;
3237: ridx = a->compressedrow.rindex;
3238: } else {
3239: mbs = a->mbs;
3240: ii = a->i;
3241: z = c;
3242: }
3243: for (i=0; i<mbs; i++) {
3244: n = ii[1] - ii[0]; ii++;
3245: if (usecprow) z = c + bs*ridx[i];
3246: if (n) {
3247: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3248: v += bs2;
3249: }
3250: for (j=1; j<n; j++) {
3251: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3252: v += bs2;
3253: }
3254: if (!usecprow) z += bs;
3255: }
3256: }
3257: MatDenseRestoreArray(C,&c);
3258: PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);
3259: return(0);
3260: }