Actual source code: agmresorthog.c

#define PETSCKSP_DLL

#include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
/*
 * This file implements the RODDEC algorithm: its purpose is to orthogonalize a set of vectors
 * distributed across several processes. These processes are organized in a virtual ring.
 *
 * Reference: [1] Sidje, Roger B. Alternatives for parallel Krylov subspace basis computation.
 *                Numer. Linear Algebra Appl. 4 (1997), no. 4, 305-331.
 *
 * Initial author: R. B. SIDJE
 * Modified: G.-A. Atenekeng-Kahou, 2008
 * Modified: D. NUENTSA WAKAM, 2011
 */
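/*
 * Overview of the implementation below (see [1] for the full derivation):
 *   1. KSPAGMRESRoddecInitNeighboor places the processes that own the vectors on a virtual ring
 *      (First, ..., Last).
 *   2. KSPAGMRESRoddec computes the QR factorization of the Krylov basis: each process first
 *      reduces its local block with Householder reflectors (stored in Qloc/tloc), then the stacked
 *      local triangular factors are merged with Givens rotations pipelined around the ring, one
 *      diagonal at a time; the resulting R is assembled on process Last and broadcast.
 *   3. KSPAGMRESRodvec applies the implicitly stored Q to a vector of coefficients.
 */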

/*
 * Take the processes that own the vectors and organize them on a virtual ring.
 */
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *, PetscReal *, PetscReal *, PetscInt);

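/*
 * For example, with 4 processes (First = 0, Last = 3) the ring neighbors are
 *   rank 0: Ileft = 3, Iright = 1
 *   rank 1: Ileft = 0, Iright = 2
 *   rank 2: Ileft = 1, Iright = 3
 *   rank 3: Ileft = 2, Iright = 0
 * so that, in KSPAGMRESRodvec, data flows from Last down to First through Ileft.
 */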
PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
{
  MPI_Comm    comm;
  KSP_AGMRES *agmres = (KSP_AGMRES *)(ksp->data);
  PetscMPIInt First, Last, rank, size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm));
  PetscCall(MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm));

  if ((rank != Last) && (rank != First)) { /* interior process in the ring */
    agmres->Ileft  = rank - 1;
    agmres->Iright = rank + 1;
  } else if (rank == Last) {
    agmres->Ileft  = rank - 1;
    agmres->Iright = First;
  } else { /* rank == First */
    agmres->Ileft  = Last;
    agmres->Iright = rank + 1;
  }
  agmres->rank  = rank;
  agmres->size  = size;
  agmres->First = First;
  agmres->Last  = Last;
  PetscFunctionReturn(PETSC_SUCCESS);
}

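/*
 * Build and/or decode a compact Givens rotation (in the spirit of the LINPACK drotg storage
 * scheme). With make_r == 1, (*c, *s) holds the pair (a, b) on entry; the routine computes a
 * rotation that annihilates b against a and encodes it in the single value *r, which can then be
 * stored in place of the eliminated entry. With make_r == 0, the rotation (c, s) is reconstructed
 * from a previously stored *r. Only real arithmetic is used.
 */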
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;

  PetscFunctionBegin;
  if (make_r == 1) {
    a = *c;
    b = *s;
    if (b == 0.e0) {
      *c = 1.e0;
      *s = 0.e0;
    } else {
      if (PetscAbsReal(b) > PetscAbsReal(a)) {
        t  = -a / b;
        *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *c = (*s) * t;
      } else {
        t  = -b / a;
        *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *s = (*c) * t;
      }
    }
    if (*c == 0.e0) {
      *r = 1.e0;
    } else {
      if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
        *r = PetscSign(*c) * (*s) / 2.e0;
      } else {
        *r = PetscSign(*s) * 2.e0 / (*c);
      }
    }
  }

  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else {
    if (PetscAbsReal(*r) < 1.e0) {
      *s = 2.e0 * (*r);
      *c = PetscSqrtReal(1.e0 - (*s) * (*s));
    } else {
      *c = 2.e0 / (*r);
      *s = PetscSqrtReal(1.e0 - (*c) * (*c));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
 * Compute the QR factorization of the Krylov basis vectors.
 * Input:
 *  - the vectors are available in agmres->vecs (alias VEC_V)
 *  - nvec : the number of vectors
 * Output:
 *  - agmres->Qloc : product of elementary reflectors for the QR of the local part of the vectors
 *  - agmres->sgn  : signs used to make the diagonal of R nonnegative
 *  - agmres->tloc : scalar factors of the elementary reflectors
 *  - agmres->Rloc : the R factor, assembled on process Last and broadcast to all processes
 */
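/*
 * Storage convention used below: Qloc is the local nloc x nvec block stored column by column
 * (leading dimension nloc). After the local QR step, the upper triangular part of its first nvec
 * rows holds the local R factor, the strictly lower part of column j holds the tail of the j-th
 * Householder reflector, and tloc[j] holds its scalar factor. During the ring elimination, each
 * local R entry that is annihilated is overwritten by the compact encoding of the Givens rotation
 * that eliminated it, so that Q can be reapplied later by KSPAGMRESRodvec.
 */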
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
{
  KSP_AGMRES  *agmres = (KSP_AGMRES *)ksp->data;
  MPI_Comm     comm;
  PetscScalar *Qloc    = agmres->Qloc;
  PetscScalar *sgn     = agmres->sgn;
  PetscScalar *tloc    = agmres->tloc;
  PetscReal   *wbufptr = agmres->wbufptr;
  PetscMPIInt  rank    = agmres->rank;
  PetscMPIInt  First   = agmres->First;
  PetscMPIInt  Last    = agmres->Last;
  PetscBLASInt pas, len, bnloc, bpos;
  PetscInt     nloc, d, i, j, k;
  PetscInt     pos;
  PetscReal    c, s, rho, Ajj, val, tt, old;
  PetscScalar *col;
  MPI_Status   status;
  PetscBLASInt N = MAXKSPSIZE + 1;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
  PetscCall(PetscLogEventBegin(KSP_AGMRESRoddec, ksp, 0, 0, 0));
  PetscCall(PetscArrayzero(agmres->Rloc, N * N));
  /* check input arguments */
  PetscCheck(nvec >= 1, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors should be positive");
  PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
  PetscCall(PetscBLASIntCast(nloc, &bnloc));
  PetscCheck(nvec <= nloc, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In QR factorization, the number of local rows should be greater than or equal to the number of columns");
  pas = 1;
  /* Copy the basis vectors into the columns of Qloc */
  for (j = 0; j < nvec; j++) {
    PetscCall(VecGetArray(VEC_V(j), &col));
    PetscCallBLAS("BLAScopy", BLAScopy_(&bnloc, col, &pas, &Qloc[j * nloc], &pas));
    PetscCall(VecRestoreArray(VEC_V(j), &col));
  }
  /* Each process performs a local QR on its own block */
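  /* Column j is reduced with a Householder reflector: rho = -sign(A_jj) * ||A(j:nloc-1, j)||
     becomes the new diagonal entry, the reflector tail is stored below the diagonal of column j of
     Qloc (scaled so that its leading entry is 1), tloc[j] = (A_jj - rho) / rho is its scalar
     factor, and the trailing columns k > j are updated as col_k <- col_k + tloc[j] * (v_j . col_k) * v_j */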
  for (j = 0; j < nvec; j++) {
    len = nloc - j;
    Ajj = Qloc[j * nloc + j];
    PetscCallBLAS("BLASnrm2", rho = -PetscSign(Ajj) * BLASnrm2_(&len, &(Qloc[j * nloc + j]), &pas));
    if (rho == 0.0) tloc[j] = 0.0;
    else {
      tloc[j] = (Ajj - rho) / rho;
      len     = len - 1;
      val     = 1.0 / (Ajj - rho);
      PetscCallBLAS("BLASscal", BLASscal_(&len, &val, &(Qloc[j * nloc + j + 1]), &pas));
      Qloc[j * nloc + j] = 1.0;
      len                = len + 1;
      for (k = j + 1; k < nvec; k++) {
        PetscCallBLAS("BLASdot", tt = tloc[j] * BLASdot_(&len, &(Qloc[j * nloc + j]), &pas, &(Qloc[k * nloc + j]), &pas));
        PetscCallBLAS("BLASaxpy", BLASaxpy_(&len, &tt, &(Qloc[j * nloc + j]), &pas, &(Qloc[k * nloc + j]), &pas));
      }
      Qloc[j * nloc + j] = rho;
    }
  }
  /* Annihilate the unwanted entries of the local triangular factors, diagonal by diagonal */
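  /* The elimination is pipelined around the virtual ring: for each diagonal d, process First sends
     row d of its triangular factor; every other process receives the partially reduced row in
     wbufptr, annihilates its own entries against it with Givens rotations (each rotation is stored
     compactly in place of the entry it eliminates), and forwards the row to the next process.
     Process Last ends up with row d of the global R factor and stores it in Rloc. */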
  for (d = 0; d < nvec; d++) {
    len = nvec - d;
    if (rank == First) {
      PetscCallBLAS("BLAScopy", BLAScopy_(&len, &(Qloc[d * nloc + d]), &bnloc, &(wbufptr[d]), &pas));
      PetscCallMPI(MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
    } else {
      PetscCallMPI(MPI_Recv(&(wbufptr[d]), len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status));
      /* Elimination of Rloc(1,d) */
      c = wbufptr[d];
      s = Qloc[d * nloc];
      PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
      /* Apply the Givens rotation */
      for (k = d; k < nvec; k++) {
        old            = wbufptr[k];
        wbufptr[k]     = c * old - s * Qloc[k * nloc];
        Qloc[k * nloc] = s * old + c * Qloc[k * nloc];
      }
      Qloc[d * nloc] = rho;
      if (rank != Last) PetscCallMPI(MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
      /* Zero out the d-th diagonal of Rloc */
      for (j = d + 1; j < nvec; j++) {
        /* Elimination of Rloc[i][j] */
        i = j - d;
        c = Qloc[j * nloc + i - 1];
        s = Qloc[j * nloc + i];
        PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
        for (k = j; k < nvec; k++) {
          old                    = Qloc[k * nloc + i - 1];
          Qloc[k * nloc + i - 1] = c * old - s * Qloc[k * nloc + i];
          Qloc[k * nloc + i]     = s * old + c * Qloc[k * nloc + i];
        }
        Qloc[j * nloc + i] = rho;
      }
      if (rank == Last) {
        PetscCallBLAS("BLAScopy", BLAScopy_(&len, &(wbufptr[d]), &pas, RLOC(d, d), &N));
        for (k = d + 1; k < nvec; k++) *RLOC(k, d) = 0.0;
      }
    }
  }

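  /* Scale the rows of R so that its diagonal entries are nonnegative; the signs are kept in sgn[]
     and reapplied in KSPAGMRESRodvec when acting with Q */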
  if (rank == Last) {
    for (d = 0; d < nvec; d++) {
      pos = nvec - d;
      PetscCall(PetscBLASIntCast(pos, &bpos));
      sgn[d] = PetscSign(*RLOC(d, d));
      PetscCallBLAS("BLASscal", BLASscal_(&bpos, &(sgn[d]), RLOC(d, d), &N));
    }
  }
  /* Broadcast Rloc to all other processes
   * NWD: should not be needed
   */
  PetscCallMPI(MPI_Bcast(agmres->Rloc, N * N, MPIU_SCALAR, Last, comm));
  PetscCall(PetscLogEventEnd(KSP_AGMRESRoddec, ksp, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
 * Compute Out <-- Q * In, where Q is the orthogonal matrix implicitly built by KSPAGMRESRoddec.
 * Input:
 *  - Qloc, sgn, tloc, nvec (see KSPAGMRESRoddec above)
 *  - In : input coefficient vector (size nvec)
 * Output:
 *  - Out : PETSc vector (distributed like the basis vectors)
 */
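/*
 * The product is applied in two stages, mirroring KSPAGMRESRoddec in reverse order: first the
 * Givens rotations from the ring elimination are applied transposed and in reverse order, with the
 * rotated coefficients travelling from process Last down to process First around the ring; then
 * each process applies its local Householder reflectors, also in reverse order, to its part of Out.
 */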
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES  *agmres = (KSP_AGMRES *)ksp->data;
  MPI_Comm     comm;
  PetscScalar *Qloc  = agmres->Qloc;
  PetscScalar *sgn   = agmres->sgn;
  PetscScalar *tloc  = agmres->tloc;
  PetscMPIInt  rank  = agmres->rank;
  PetscMPIInt  First = agmres->First, Last = agmres->Last;
  PetscMPIInt  Iright = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar *y, *zloc;
  PetscInt     nloc, d, len, i, j;
  PetscBLASInt bnvec, pas, blen;
  PetscInt     dpt;
  PetscReal    c, s, rho, zp, zq, yd = 0.0, tt;
  MPI_Status   status;

  PetscFunctionBegin;
  PetscCall(PetscBLASIntCast(nvec, &bnvec));
  PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
  pas = 1;
  PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
  PetscCall(PetscMalloc1(nvec, &y));
  PetscCall(PetscArraycpy(y, In, nvec));
  PetscCall(VecGetArray(Out, &zloc));

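  /* Reapply the signs that made the diagonal of R nonnegative in KSPAGMRESRoddec */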
  if (rank == Last) {
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscCallBLAS("BLAScopy", BLAScopy_(&bnvec, y, &pas, &(zloc[0]), &pas));
  else {
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        PetscCallMPI(MPI_Recv(&(zloc[d]), 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
      } else {
        for (j = nvec - 1; j >= d + 1; j--) {
          i = j - d;
          PetscCall(KSPAGMRESRoddecGivens(&c, &s, &(Qloc[j * nloc + i]), 0));
          zp          = zloc[i - 1];
          zq          = zloc[i];
          zloc[i - 1] = c * zp + s * zq;
          zloc[i]     = -s * zp + c * zq;
        }
        PetscCall(KSPAGMRESRoddecGivens(&c, &s, &(Qloc[d * nloc]), 0));
        if (rank == Last) {
          zp      = y[d];
          zq      = zloc[0];
          y[d]    = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          PetscCallMPI(MPI_Send(&(y[d]), 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        } else {
          PetscCallMPI(MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
          zp      = yd;
          zq      = zloc[0];
          yd      = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          PetscCallMPI(MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        }
      }
    }
  }
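  /* Apply the local Householder reflectors in reverse order: for each column j, the reflector tail
     stored below the diagonal of Qloc (with an implicit leading 1) and the factor tloc[j] update
     the local part of Out in place */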
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) {
      len = nloc - j;
      PetscCall(PetscBLASIntCast(len, &blen));
      rho       = Qloc[dpt];
      Qloc[dpt] = 1.0;
      PetscCallBLAS("BLASdot", tt = tloc[j] * BLASdot_(&blen, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      PetscCallBLAS("BLASaxpy", BLASaxpy_(&blen, &tt, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      Qloc[dpt] = rho;
    }
  }
  PetscCall(VecRestoreArray(Out, &zloc));
  PetscCall(PetscFree(y));
  PetscFunctionReturn(PETSC_SUCCESS);
}