Actual source code: agmresorthog.c
#define PETSCKSP_DLL

#include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
/*
 * This file implements the RODDEC algorithm: it orthogonalizes a set of vectors distributed across several processes,
 * which are organized in a virtual ring.
 * Reference: [1] Sidje, Roger B. Alternatives for parallel Krylov subspace basis computation. Numer. Linear Algebra Appl. 4 (1997), no. 4, 305-331.
 *
 * Initial author: R. B. SIDJE
 * Modified: G.-A. Atenekeng-Kahou, 2008
 * Modified: D. NUENTSA WAKAM, 2011
 */
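/*
 * Outline of the routines below:
 *  1. KSPAGMRESRoddecInitNeighboor places the processes that own the basis vectors on the ring and records each
 *     process's neighbors.
 *  2. KSPAGMRESRoddec computes a Householder QR of the local rows on every process, then merges the local triangular
 *     factors with Givens rotations passed along the ring; the assembled factor R ends up in agmres->Rloc.
 *  3. KSPAGMRESRodvec applies the implicitly stored orthogonal factor Q to a vector of coefficients.
 */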
/*
 * Take the processes that own the vectors and organize them on a virtual ring.
 */
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *, PetscReal *, PetscReal *, PetscInt);

PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
{
  MPI_Comm    comm;
  KSP_AGMRES *agmres = (KSP_AGMRES *)(ksp->data);
  PetscMPIInt First, Last, rank, size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCall(MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm));
  PetscCall(MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm));

  if ((rank != Last) && (rank == 0)) {
    agmres->Ileft  = rank - 1;
    agmres->Iright = rank + 1;
  } else if (rank == Last) {
    agmres->Ileft  = rank - 1;
    agmres->Iright = First;
  } else {
    agmres->Ileft  = Last;
    agmres->Iright = rank + 1;
  }
  agmres->rank  = rank;
  agmres->size  = size;
  agmres->First = First;
  agmres->Last  = Last;
  PetscFunctionReturn(PETSC_SUCCESS);
}
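/*
 * The cached fields are consumed by the routines below: First and Last identify the two ends of the ring in
 * KSPAGMRESRoddec, while Iright and Ileft are the receive/send partners used by KSPAGMRESRodvec.
 */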
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;

  PetscFunctionBegin;
  if (make_r == 1) {
    /* Compute the rotation (c, s) that annihilates b in the pair (a, b) = (*c, *s) given on entry */
    a = *c;
    b = *s;
    if (b == 0.e0) {
      *c = 1.e0;
      *s = 0.e0;
    } else if (PetscAbsReal(b) > PetscAbsReal(a)) {
      t  = -a / b;
      *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
      *c = (*s) * t;
    } else {
      t  = -b / a;
      *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
      *s = (*c) * t;
    }
    /* Encode the rotation in the single scalar *r so that (c, s) can be regenerated later */
    if (*c == 0.e0) {
      *r = 1.e0;
    } else if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
      *r = PetscSign(*c) * (*s) / 2.e0;
    } else {
      *r = PetscSign(*s) * 2.e0 / (*c);
    }
  }
  /* Regenerate (c, s) from the stored scalar *r */
  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else if (PetscAbsReal(*r) < 1.e0) {
    *s = 2.e0 * (*r);
    *c = PetscSqrtReal(1.e0 - (*s) * (*s));
  } else {
    *c = 2.e0 / (*r);
    *s = PetscSqrtReal(1.e0 - (*c) * (*c));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
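/*
 * The single scalar r written when make_r == 1 is a compact encoding of the rotation (reminiscent of the LINPACK
 * drotg "z" trick, though with its own scaling): r = 1 when c == 0, r = sign(c)*s/2 when |s| < |c|, and
 * r = 2*sign(s)/c otherwise, so that |r| < 1 selects the "recover s first" branch and |r| >= 1 the "recover c first"
 * branch when the rotation is rebuilt with make_r == 0.
 *
 * Worked illustration (plain arithmetic, not output from a run): for (a, b) = (3, 4) the routine returns c = -0.6,
 * s = 0.8 and stores r = 2/(-0.6) = -10/3; rebuilding from r gives c = 2/r = -0.6 and s = sqrt(1 - c*c) = 0.8, and
 * applying the rotation as (c*a - s*b, s*a + c*b) = (-5, 0) annihilates b while preserving the 2-norm.
 */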
/*
 * Compute the QR factorization of the Krylov basis vectors
 * Input:
 *  - the vectors are available in agmres->vecs (alias VEC_V)
 *  - nvec : the number of vectors
 * Output:
 *  - agmres->Qloc : elementary reflectors (Householder vectors) for the QR factorization of the local part of the
 *                   vectors, together with the compactly stored Givens rotations used along the ring
 *  - agmres->sgn  : signs of the diagonal entries of R, factored out so that R has a nonnegative diagonal
 *  - agmres->tloc : scalar factors of the elementary reflectors
 *  - agmres->Rloc : the triangular factor R, assembled on the Last process and broadcast to every process
 */
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
{
  KSP_AGMRES  *agmres  = (KSP_AGMRES *)ksp->data;
  MPI_Comm     comm;
  PetscScalar *Qloc    = agmres->Qloc;
  PetscScalar *sgn     = agmres->sgn;
  PetscScalar *tloc    = agmres->tloc;
  PetscReal   *wbufptr = agmres->wbufptr;
  PetscMPIInt  rank    = agmres->rank;
  PetscMPIInt  First   = agmres->First;
  PetscMPIInt  Last    = agmres->Last;
  PetscBLASInt pas, len, bnloc, bpos;
  PetscInt     nloc, d, i, j, k;
  PetscInt     pos;
  PetscReal    c, s, rho, Ajj, val, tt, old;
  PetscScalar *col;
  MPI_Status   status;
  PetscBLASInt N = MAXKSPSIZE + 1;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
  PetscCall(PetscLogEventBegin(KSP_AGMRESRoddec, ksp, 0, 0, 0));
  PetscCall(PetscArrayzero(agmres->Rloc, N * N));
  /* check input arguments */
  PetscCheck(nvec >= 1, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors must be positive");
  PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
  PetscCall(PetscBLASIntCast(nloc, &bnloc));
  PetscCheck(nvec <= nloc, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In the QR factorization, the number of local rows must be greater than or equal to the number of columns");
  pas = 1;
  /* Copy the basis vectors into Qloc (column major, one column of length nloc per vector) */
  for (j = 0; j < nvec; j++) {
    PetscCall(VecGetArray(VEC_V(j), &col));
    PetscCallBLAS("BLAScopy", BLAScopy_(&bnloc, col, &pas, &Qloc[j * nloc], &pas));
    PetscCall(VecRestoreArray(VEC_V(j), &col));
  }
  /* Each process performs a local (Householder) QR on its own block */
  for (j = 0; j < nvec; j++) {
    len = nloc - j;
    Ajj = Qloc[j * nloc + j];
    PetscCallBLAS("BLASnrm2", rho = -PetscSign(Ajj) * BLASnrm2_(&len, &(Qloc[j * nloc + j]), &pas));
    if (rho == 0.0) tloc[j] = 0.0;
    else {
      tloc[j] = (Ajj - rho) / rho;
      len     = len - 1;
      val     = 1.0 / (Ajj - rho);
      PetscCallBLAS("BLASscal", BLASscal_(&len, &val, &(Qloc[j * nloc + j + 1]), &pas));
      Qloc[j * nloc + j] = 1.0;
      len                = len + 1;
      /* Apply the reflector H_j = I + tloc[j] * v_j * v_j^T to the remaining columns */
      for (k = j + 1; k < nvec; k++) {
        PetscCallBLAS("BLASdot", tt = tloc[j] * BLASdot_(&len, &(Qloc[j * nloc + j]), &pas, &(Qloc[k * nloc + j]), &pas));
        PetscCallBLAS("BLASaxpy", BLASaxpy_(&len, &tt, &(Qloc[j * nloc + j]), &pas, &(Qloc[k * nloc + j]), &pas));
      }
      Qloc[j * nloc + j] = rho;
    }
  }
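  /* After this loop, column j of Qloc holds the entries of the local triangular factor in rows 0..j (row j holding
     the diagonal value rho), and the scaled Householder vector v_j in rows j+1..nloc-1 with its unit leading entry
     implicit; the scalar factor of the reflector H_j = I + tloc[j]*v_j*v_j^T is kept in tloc[j]. */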
  /* Annihilate the unwanted entries of the stacked local R factors, diagonal by diagonal, with Givens rotations
     passed along the ring */
  for (d = 0; d < nvec; d++) {
    len = nvec - d;
    if (rank == First) {
      PetscCallBLAS("BLAScopy", BLAScopy_(&len, &(Qloc[d * nloc + d]), &bnloc, &(wbufptr[d]), &pas));
      PetscCallMPI(MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
    } else {
      PetscCallMPI(MPI_Recv(&(wbufptr[d]), len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status));
      /* Elimination of Rloc(1,d) */
      c = wbufptr[d];
      s = Qloc[d * nloc];
      PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
      /* Apply the Givens rotation */
      for (k = d; k < nvec; k++) {
        old            = wbufptr[k];
        wbufptr[k]     = c * old - s * Qloc[k * nloc];
        Qloc[k * nloc] = s * old + c * Qloc[k * nloc];
      }
      Qloc[d * nloc] = rho; /* store the rotation compactly so KSPAGMRESRodvec can regenerate it */
      if (rank != Last) PetscCallMPI(MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
      /* zero out the d-th diagonal of Rloc ... */
      for (j = d + 1; j < nvec; j++) {
        /* elimination of Rloc[i][j] */
        i = j - d;
        c = Qloc[j * nloc + i - 1];
        s = Qloc[j * nloc + i];
        PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
        for (k = j; k < nvec; k++) {
          old                    = Qloc[k * nloc + i - 1];
          Qloc[k * nloc + i - 1] = c * old - s * Qloc[k * nloc + i];
          Qloc[k * nloc + i]     = s * old + c * Qloc[k * nloc + i];
        }
        Qloc[j * nloc + i] = rho;
      }
      if (rank == Last) {
        PetscCallBLAS("BLAScopy", BLAScopy_(&len, &(wbufptr[d]), &pas, RLOC(d, d), &N));
        for (k = d + 1; k < nvec; k++) *RLOC(k, d) = 0.0;
      }
    }
  }
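  /* On exit from this loop, process Last holds the assembled triangular factor in agmres->Rloc (up to the sign
     normalization applied just below), and on every process other than First the scalars written back into Qloc
     encode, in the compact form of KSPAGMRESRoddecGivens, the rotations that were applied. */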
  if (rank == Last) {
    /* Make the diagonal of R nonnegative; keep the signs in sgn so that KSPAGMRESRodvec can restore them */
    for (d = 0; d < nvec; d++) {
      pos = nvec - d;
      PetscCall(PetscBLASIntCast(pos, &bpos));
      sgn[d] = PetscSign(*RLOC(d, d));
      PetscCallBLAS("BLASscal", BLASscal_(&bpos, &(sgn[d]), RLOC(d, d), &N));
    }
  }
  /* Broadcast Rloc to all other processes
   * NWD : should not be needed
   */
  PetscCallMPI(MPI_Bcast(agmres->Rloc, N * N, MPIU_SCALAR, Last, comm));
  PetscCall(PetscLogEventEnd(KSP_AGMRESRoddec, ksp, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
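/*
 * A minimal sketch (illustration only, not part of the solver; the helper name is made up) of how one elementary
 * reflector stored in the compact form above would be applied to a local vector y of length m = nloc - j, assuming
 * v[0] = 1 is implicit and v[1..m-1] is what sits below the diagonal of column j of Qloc:
 *
 *   static void ApplyCompactReflector(PetscInt m, PetscReal tau, const PetscReal *v, PetscReal *y)
 *   {
 *     PetscReal vty = y[0];                                // v[0] = 1 is implicit
 *     for (PetscInt i = 1; i < m; i++) vty += v[i] * y[i];
 *     vty *= tau;                                          // tau = tloc[j]
 *     y[0] += vty;
 *     for (PetscInt i = 1; i < m; i++) y[i] += vty * v[i]; // y <- (I + tau*v*v^T) y
 *   }
 *
 * This is what the BLASdot/BLASaxpy pairs above and in KSPAGMRESRodvec below compute, with the diagonal entry of
 * Qloc temporarily set to 1.0 to play the role of v[0].
 */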
/*
 * Computes Out <-- Q * In, where Q is the orthogonal factor built by KSPAGMRESRoddec
 * Input:
 *  - Qloc, sgn, tloc, nvec (see KSPAGMRESRoddec above)
 *  - In : input vector (of size nvec)
 * Output:
 *  - Out : PETSc vector (distributed like the basis vectors)
 */
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES  *agmres = (KSP_AGMRES *)ksp->data;
  MPI_Comm     comm;
  PetscScalar *Qloc   = agmres->Qloc;
  PetscScalar *sgn    = agmres->sgn;
  PetscScalar *tloc   = agmres->tloc;
  PetscMPIInt  rank   = agmres->rank;
  PetscMPIInt  First  = agmres->First, Last = agmres->Last;
  PetscMPIInt  Iright = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar *y, *zloc;
  PetscInt     nloc, d, len, i, j;
  PetscBLASInt bnvec, pas, blen;
  PetscInt     dpt;
  PetscReal    c, s, rho, zp, zq, yd = 0.0, tt;
  MPI_Status   status;

  PetscFunctionBegin;
  PetscCall(PetscBLASIntCast(nvec, &bnvec));
  PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
  pas = 1;
  PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
  PetscCall(PetscMalloc1(nvec, &y));
  PetscCall(PetscArraycpy(y, In, nvec));
  PetscCall(VecGetArray(Out, &zloc));
  /* Undo the sign normalization applied to R in KSPAGMRESRoddec */
  if (rank == Last) {
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscCallBLAS("BLAScopy", BLAScopy_(&bnvec, y, &pas, &(zloc[0]), &pas));
  else {
    /* Apply the ring Givens rotations in reverse, propagating the coefficients from Last toward First */
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        PetscCallMPI(MPI_Recv(&(zloc[d]), 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
      } else {
        for (j = nvec - 1; j >= d + 1; j--) {
          i = j - d;
          PetscCall(KSPAGMRESRoddecGivens(&c, &s, &(Qloc[j * nloc + i]), 0));
          zp          = zloc[i - 1];
          zq          = zloc[i];
          zloc[i - 1] = c * zp + s * zq;
          zloc[i]     = -s * zp + c * zq;
        }
        PetscCall(KSPAGMRESRoddecGivens(&c, &s, &(Qloc[d * nloc]), 0));
        if (rank == Last) {
          zp      = y[d];
          zq      = zloc[0];
          y[d]    = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          PetscCallMPI(MPI_Send(&(y[d]), 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        } else {
          PetscCallMPI(MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
          zp      = yd;
          zq      = zloc[0];
          yd      = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          PetscCallMPI(MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        }
      }
    }
  }
  /* Apply the local elementary reflectors H_j = I + tloc[j]*v_j*v_j^T, for j = nvec-1, ..., 0 */
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) {
      len = nloc - j;
      PetscCall(PetscBLASIntCast(len, &blen));
      rho       = Qloc[dpt];
      Qloc[dpt] = 1.0; /* the leading entry of v_j is an implicit 1 */
      tt        = tloc[j] * (BLASdot_(&blen, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      PetscCallBLAS("BLASaxpy", BLASaxpy_(&blen, &tt, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      Qloc[dpt] = rho;
    }
  }
  PetscCall(VecRestoreArray(Out, &zloc));
  PetscCall(PetscFree(y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
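/*
 * Typical usage pattern (a sketch; the names coefs and X are illustrative, see agmres.c for the actual calls):
 *
 *   PetscCall(KSPAGMRESRoddec(ksp, nvec));           // QR factorization of the nvec basis vectors; R is left in agmres->Rloc
 *   // ... compute nvec coefficients coefs[] in the orthogonalized basis, e.g. via a triangular solve with agmres->Rloc ...
 *   PetscCall(KSPAGMRESRodvec(ksp, nvec, coefs, X)); // X <- Q * coefs, distributed like the basis vectors
 */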