Actual source code: agmresorthog.c
petsc-3.15.0 2021-03-30
#define PETSCKSP_DLL

#include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
/*
 * This file implements the RODDEC algorithm: it orthogonalizes a set of vectors distributed across several
 * processes, which are organized in a virtual ring.
 * Reference: [1] Sidje, Roger B. Alternatives for parallel Krylov subspace basis computation. Numer. Linear
 * Algebra Appl. 4 (1997), no. 4, 305-331.
 *
 * Initial author: R. B. SIDJE
 * Modified: G.-A. Atenekeng-Kahou, 2008
 * Modified: D. NUENTSA WAKAM, 2011
 */
/*
 * Take the processes that own the vectors and organize them on a virtual ring.
 */
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal*, PetscReal*, PetscReal*, PetscInt);
PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
{
  MPI_Comm    comm;
  KSP_AGMRES  *agmres = (KSP_AGMRES*)(ksp->data);
  PetscMPIInt First, Last, rank, size;

  PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm);
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm);
  MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm);
  if ((rank != Last) && (rank == First)) { /* first rank in the ring */
    agmres->Ileft  = Last;
    agmres->Iright = rank + 1;
  } else if (rank == Last) {               /* last rank in the ring */
    agmres->Ileft  = rank - 1;
    agmres->Iright = First;
  } else {                                 /* interior rank */
    agmres->Ileft  = rank - 1;
    agmres->Iright = rank + 1;
  }
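  /* Example (hypothetical communicator with 4 ranks, 0..3): rank 0 gets Ileft = 3, Iright = 1;
     ranks 1 and 2 get their immediate neighbors; rank 3 gets Ileft = 2, Iright = 0. */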
  agmres->rank  = rank;
  agmres->size  = size;
  agmres->First = First;
  agmres->Last  = Last;
  return(0);
}
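
/*
 * Build a plane (Givens) rotation from the pair (a,b) = (*c,*s) when make_r == 1, and compress it into the single
 * scalar *r (r = sign(c)*s/2 if |s| < |c|, r = sign(s)*2/c otherwise, and r = 1 if c = 0). The final block rebuilds
 * (c,s) from *r and is also used on its own (make_r == 0) to decode rotations stored in Qloc; the pair is recovered
 * up to a common sign. Illustration (not from the source): c = 0.8, s = 0.6 is stored as r = 0.3 and decoded as
 * s = 2*r = 0.6, c = sqrt(1 - s*s) = 0.8.
 */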
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;
  if (make_r == 1) {
    a = *c;
    b = *s;
    if (b == 0.e0) {
      *c = 1.e0;
      *s = 0.e0;
    } else {
      if (PetscAbsReal(b) > PetscAbsReal(a)) {
        t  = -a / b;
        *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *c = (*s) * t;
      } else {
        t  = -b / a;
        *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *s = (*c) * t;
      }
    }
    if (*c == 0.e0) {
      *r = 1.e0;
    } else {
      if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
        *r = PetscSign(*c) * (*s) / 2.e0;
      } else {
        *r = PetscSign(*s) * 2.e0 / (*c);
      }
    }
  }
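  /* rebuild (c,s) from the compact scalar r (used directly when make_r == 0) */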
  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else {
    if (PetscAbsReal(*r) < 1.e0) {
      *s = 2.e0 * (*r);
      *c = PetscSqrtReal(1.e0 - (*s) * (*s));
    } else {
      *c = 2.e0 / (*r);
      *s = PetscSqrtReal(1.e0 - (*c) * (*c));
    }
  }
  return(0);
}
/*
 * Compute the QR factorization of the Krylov basis vectors
 * Input:
 *  - the vectors are available in agmres->vecs (alias VEC_V)
 *  - nvec : the number of vectors
 * Output:
 *  - agmres->Qloc : product of elementary reflectors for the QR factorization of the local part of the vectors
 *  - agmres->sgn  : sign of the rotations
 *  - agmres->tloc : scalar factors of the elementary reflectors
 */
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
{
  KSP_AGMRES   *agmres  = (KSP_AGMRES*) ksp->data;
  MPI_Comm     comm;
  PetscScalar  *Qloc    = agmres->Qloc;
  PetscScalar  *sgn     = agmres->sgn;
  PetscScalar  *tloc    = agmres->tloc;
  PetscReal    *wbufptr = agmres->wbufptr;
  PetscMPIInt  rank     = agmres->rank;
  PetscMPIInt  First    = agmres->First;
  PetscMPIInt  Last     = agmres->Last;
  PetscBLASInt pas, len, bnloc, bpos;
  PetscInt     nloc, d, i, j, k;
  PetscInt     pos;
  PetscReal    c, s, rho, Ajj, val, tt, old;
  PetscScalar  *col;
  MPI_Status   status;
  PetscBLASInt N = MAXKSPSIZE + 1;

  PetscObjectGetComm((PetscObject)ksp,&comm);
  PetscLogEventBegin(KSP_AGMRESRoddec,ksp,0,0,0);
  PetscArrayzero(agmres->Rloc, N*N);
  /* check input arguments */
  if (nvec < 1) SETERRQ(PetscObjectComm((PetscObject)ksp),PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors should be positive");
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscBLASIntCast(nloc,&bnloc);
  if (nvec > nloc) SETERRQ(PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In QR factorization, the number of local rows must be greater than or equal to the number of columns");
  pas = 1;
  /* Copy the vectors of the basis */
  for (j = 0; j < nvec; j++) {
    VecGetArray(VEC_V(j), &col);
    PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnloc, col, &pas, &Qloc[j*nloc], &pas));
    VecRestoreArray(VEC_V(j), &col);
  }
  /* Each process performs a local QR on its own block */
  for (j = 0; j < nvec; j++) {
    len = nloc - j;
    Ajj = Qloc[j*nloc+j];
    PetscStackCallBLAS("BLASnrm2",rho = -PetscSign(Ajj) * BLASnrm2_(&len, &(Qloc[j*nloc+j]), &pas));
    if (rho == 0.0) tloc[j] = 0.0;
    else {
      tloc[j] = (Ajj - rho) / rho;
      len     = len - 1;
      val     = 1.0 / (Ajj - rho);
      PetscStackCallBLAS("BLASscal",BLASscal_(&len, &val, &(Qloc[j*nloc+j+1]), &pas));
      Qloc[j*nloc+j] = 1.0;
      len = len + 1;
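      /* apply the elementary reflector H_j = I + tloc[j] * v_j * v_j^T to the remaining columns
         (v_j is stored in column j of Qloc, with v_j[j] temporarily set to 1) */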
      for (k = j + 1; k < nvec; k++) {
        PetscStackCallBLAS("BLASdot",tt = tloc[j] * BLASdot_(&len, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
        PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&len, &tt, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
      }
      Qloc[j*nloc+j] = rho;
    }
  }
  /* annihilate undesirable Rloc, diagonal by diagonal */
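  /* Ring pipeline: First copies its d-th row into wbufptr and sends it to its right neighbor; each subsequent
     process eliminates its local entries against the incoming row with Givens rotations (stored compactly in Qloc)
     and forwards the updated row; Last accumulates the result into the global Rloc */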
  for (d = 0; d < nvec; d++) {
    len = nvec - d;
    if (rank == First) {
      PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(Qloc[d*nloc+d]), &bnloc, &(wbufptr[d]), &pas));
      MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
    } else {
      MPI_Recv(&(wbufptr[d]), len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status);
      /* Elimination of Rloc(1,d) */
      c = wbufptr[d];
      s = Qloc[d*nloc];
      KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
      /* Apply the Givens rotation */
      for (k = d; k < nvec; k++) {
        old          = wbufptr[k];
        wbufptr[k]   = c * old - s * Qloc[k*nloc];
        Qloc[k*nloc] = s * old + c * Qloc[k*nloc];
      }
      Qloc[d*nloc] = rho;
      if (rank != Last) {
        MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
      }
      /* zero-out the d-th diagonal of Rloc ... */
      for (j = d + 1; j < nvec; j++) {
        /* elimination of Rloc[i][j] */
        i = j - d;
        c = Qloc[j*nloc+i-1];
        s = Qloc[j*nloc+i];
        KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
        for (k = j; k < nvec; k++) {
          old              = Qloc[k*nloc+i-1];
          Qloc[k*nloc+i-1] = c * old - s * Qloc[k*nloc+i];
          Qloc[k*nloc+i]   = s * old + c * Qloc[k*nloc+i];
        }
        Qloc[j*nloc+i] = rho;
      }
      if (rank == Last) {
        PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(wbufptr[d]), &pas, RLOC(d,d), &N));
        for (k = d + 1; k < nvec; k++) *RLOC(k,d) = 0.0;
      }
    }
  }
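  /* On Last, make the diagonal of R nonnegative; the signs are kept in sgn[] and reapplied in KSPAGMRESRodvec */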
  if (rank == Last) {
    for (d = 0; d < nvec; d++) {
      pos = nvec - d;
      PetscBLASIntCast(pos,&bpos);
      sgn[d] = PetscSign(*RLOC(d,d));
      PetscStackCallBLAS("BLASscal",BLASscal_(&bpos, &(sgn[d]), RLOC(d,d), &N));
    }
  }
  /* Broadcast Rloc to all other processes
   * NWD : should not be needed
   */
  MPI_Bcast(agmres->Rloc,N*N,MPIU_SCALAR,Last,comm);
  PetscLogEventEnd(KSP_AGMRESRoddec,ksp,0,0,0);
  return(0);
}
/*
 * Computes Out <-- Q * In where Q is the orthogonal matrix from KSPAGMRESRoddec
 * Input:
 *  - Qloc, sgn, tloc, nvec (see KSPAGMRESRoddec above)
 *  - In : input vector (size nvec)
 * Output:
 *  - Out : PETSc vector (distributed like the basis vectors)
 */
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES   *agmres = (KSP_AGMRES*) ksp->data;
  MPI_Comm     comm;
  PetscScalar  *Qloc   = agmres->Qloc;
  PetscScalar  *sgn    = agmres->sgn;
  PetscScalar  *tloc   = agmres->tloc;
  PetscMPIInt  rank    = agmres->rank;
  PetscMPIInt  First   = agmres->First, Last = agmres->Last;
  PetscMPIInt  Iright  = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar  *y, *zloc;
  PetscInt     nloc, d, len, i, j;
  PetscBLASInt bnvec, pas, blen;
  PetscInt     dpt;
  PetscReal    c, s, rho, zp, zq, yd = 0.0, tt;
  MPI_Status   status;

  PetscBLASIntCast(nvec,&bnvec);
  PetscObjectGetComm((PetscObject)ksp,&comm);
  pas = 1;
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscMalloc1(nvec, &y);
  PetscArraycpy(y, In, nvec);
  VecGetArray(Out, &zloc);
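  /* On Last, reapply the signs saved in KSPAGMRESRoddec when the diagonal of R was made nonnegative */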
  if (rank == Last) {
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnvec, y, &pas, &(zloc[0]), &pas));
  else {
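    /* More than one process: replay the ring (Givens) stage in reverse, applying the inverse of each rotation
       stored compactly in Qloc; a scalar travels around the ring from Last toward First, which receives its
       contribution directly into zloc[d] */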
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        MPI_Recv(&(zloc[d]), 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
      } else {
        for (j = nvec - 1; j >= d + 1; j--) {
          i = j - d;
          KSPAGMRESRoddecGivens(&c, &s, &(Qloc[j * nloc + i]), 0);
          zp        = zloc[i-1];
          zq        = zloc[i];
          zloc[i-1] = c * zp + s * zq;
          zloc[i]   = -s * zp + c * zq;
        }
        KSPAGMRESRoddecGivens(&c, &s, &(Qloc[d * nloc]), 0);
        if (rank == Last) {
          zp      = y[d];
          zq      = zloc[0];
          y[d]    = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&(y[d]), 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        } else {
          MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
          zp      = yd;
          zq      = zloc[0];
          yd      = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        }
      }
    }
  }
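  /* Apply the local Householder reflectors H_j = I + tloc[j] * v_j * v_j^T in reverse order; v_j is stored below
     the diagonal of Qloc with v_j[j] = 1 (the diagonal entry is temporarily overwritten and then restored) */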
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) {
      len = nloc - j;
      PetscBLASIntCast(len,&blen);
      rho = Qloc[dpt];
      Qloc[dpt] = 1.0;
      tt  = tloc[j] * (BLASdot_(&blen, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&blen, &tt, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      Qloc[dpt] = rho;
    }
  }
  VecRestoreArray(Out, &zloc);
  PetscFree(y);
  return(0);
}