Actual source code: mpiptap.c
2: /*
3: Defines projective product routines where A is an MPIAIJ matrix
4: C = P^T * A * P
5: */
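/*
  These routines are reached through the generic MatPtAP()/MatProduct interface.
  A minimal usage sketch (illustrative only; fill is an estimate of nnz(C)/(nnz(A)+nnz(P))):

    Mat C;
    PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, 2.0, &C));    symbolic + numeric C = P^T A P
       ... change the numerical values (not the nonzero pattern) of A ...
    PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, 2.0, &C));      reuse the symbolic data cached in C->product
*/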
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
10: #include <petscbt.h>
11: #include <petsctime.h>
12: #include <petsc/private/hashmapiv.h>
13: #include <petsc/private/hashseti.h>
14: #include <petscsf.h>
16: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
17: {
18: PetscBool iascii;
19: PetscViewerFormat format;
20: Mat_APMPI *ptap;
22: PetscFunctionBegin;
23: MatCheckProduct(A, 1);
24: ptap = (Mat_APMPI *)A->product->data;
25: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
26: if (iascii) {
27: PetscCall(PetscViewerGetFormat(viewer, &format));
28: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
29: if (ptap->algType == 0) {
30: PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
31: } else if (ptap->algType == 1) {
32: PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
33: } else if (ptap->algType == 2) {
34: PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
35: } else if (ptap->algType == 3) {
36: PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
37: }
38: }
39: }
40: PetscFunctionReturn(PETSC_SUCCESS);
41: }
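/* Note: this viewer is attached to C->product->view by the symbolic routines below; it is invoked
   when the product matrix is viewed with an info format (e.g., -mat_view ::ascii_info_detail) */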
43: PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
44: {
45: Mat_APMPI *ptap = (Mat_APMPI *)data;
46: Mat_Merge_SeqsToMPI *merge;
48: PetscFunctionBegin;
49: PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
50: PetscCall(PetscFree(ptap->bufa));
51: PetscCall(MatDestroy(&ptap->P_loc));
52: PetscCall(MatDestroy(&ptap->P_oth));
53: PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
54: PetscCall(MatDestroy(&ptap->Rd));
55: PetscCall(MatDestroy(&ptap->Ro));
56: if (ptap->AP_loc) { /* used by alg_rap */
57: Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
58: PetscCall(PetscFree(ap->i));
59: PetscCall(PetscFree2(ap->j, ap->a));
60: PetscCall(MatDestroy(&ptap->AP_loc));
61: } else { /* used by alg_ptap */
62: PetscCall(PetscFree(ptap->api));
63: PetscCall(PetscFree(ptap->apj));
64: }
65: PetscCall(MatDestroy(&ptap->C_loc));
66: PetscCall(MatDestroy(&ptap->C_oth));
67: if (ptap->apa) PetscCall(PetscFree(ptap->apa));
69: PetscCall(MatDestroy(&ptap->Pt));
71: merge = ptap->merge;
72: if (merge) { /* used by alg_ptap */
73: PetscCall(PetscFree(merge->id_r));
74: PetscCall(PetscFree(merge->len_s));
75: PetscCall(PetscFree(merge->len_r));
76: PetscCall(PetscFree(merge->bi));
77: PetscCall(PetscFree(merge->bj));
78: PetscCall(PetscFree(merge->buf_ri[0]));
79: PetscCall(PetscFree(merge->buf_ri));
80: PetscCall(PetscFree(merge->buf_rj[0]));
81: PetscCall(PetscFree(merge->buf_rj));
82: PetscCall(PetscFree(merge->coi));
83: PetscCall(PetscFree(merge->coj));
84: PetscCall(PetscFree(merge->owners_co));
85: PetscCall(PetscLayoutDestroy(&merge->rowmap));
86: PetscCall(PetscFree(ptap->merge));
87: }
88: PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
90: PetscCall(PetscSFDestroy(&ptap->sf));
91: PetscCall(PetscFree(ptap->c_othi));
92: PetscCall(PetscFree(ptap->c_rmti));
93: PetscCall(PetscFree(ptap));
94: PetscFunctionReturn(PETSC_SUCCESS);
95: }
97: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
98: {
99: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
100: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
101: Mat_SeqAIJ *ap, *p_loc, *p_oth = NULL, *c_seq;
102: Mat_APMPI *ptap;
103: Mat AP_loc, C_loc, C_oth;
104: PetscInt i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
105: PetscScalar *apa;
106: const PetscInt *cols;
107: const PetscScalar *vals;
109: PetscFunctionBegin;
110: MatCheckProduct(C, 3);
111: ptap = (Mat_APMPI *)C->product->data;
112: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
113: PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
115: PetscCall(MatZeroEntries(C));
117: /* 1) get Rd = Pd^T, Ro = Po^T */
118: if (ptap->reuse == MAT_REUSE_MATRIX) {
119: PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
120: PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
121: }
123: /* 2) get AP_loc */
124: AP_loc = ptap->AP_loc;
125: ap = (Mat_SeqAIJ *)AP_loc->data;
127: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
128: if (ptap->reuse == MAT_REUSE_MATRIX) {
129: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
130: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
131: PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
132: }
134: /* 2-2) compute numeric A_loc*P - dominating part */
135: /* get data from symbolic products */
136: p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
137: if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
139: api = ap->i;
140: apj = ap->j;
141: PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
142: for (i = 0; i < am; i++) {
143: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
144: apnz = api[i + 1] - api[i];
145: apa = ap->a + api[i];
146: PetscCall(PetscArrayzero(apa, apnz));
147: AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
148: }
149: PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
150: PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);
152: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
153: /* Always use the scalable version since we are in the scalable MPI algorithm */
154: PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
155: PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));
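  /* C = P^T*A*P is assembled from C_loc = Pd^T*(A P), whose rows are owned by this rank, and
     C_oth = Po^T*(A P), whose rows belong to other ranks (global rows p->garray[]) and are
     added into C below with ADD_VALUES */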
157: C_loc = ptap->C_loc;
158: C_oth = ptap->C_oth;
160: /* add C_loc and C_oth to C */
161: PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
163: /* C_loc -> C */
164: cm = C_loc->rmap->N;
165: c_seq = (Mat_SeqAIJ *)C_loc->data;
166: cols = c_seq->j;
167: vals = c_seq->a;
168: PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));
170: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
171: /* when there are no off-processor parts. */
172: /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
173: /* is no longer valid. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
174: /* a table, and additional bookkeeping is required. */
175: if (C->assembled) {
176: C->was_assembled = PETSC_TRUE;
177: C->assembled = PETSC_FALSE;
178: }
179: if (C->was_assembled) {
180: for (i = 0; i < cm; i++) {
181: ncols = c_seq->i[i + 1] - c_seq->i[i];
182: row = rstart + i;
183: PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
184: cols += ncols;
185: vals += ncols;
186: }
187: } else {
188: PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
189: }
190: PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
191: PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);
193: /* Co -> C, off-processor part */
194: cm = C_oth->rmap->N;
195: c_seq = (Mat_SeqAIJ *)C_oth->data;
196: cols = c_seq->j;
197: vals = c_seq->a;
198: PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
199: for (i = 0; i < cm; i++) {
200: ncols = c_seq->i[i + 1] - c_seq->i[i];
201: row = p->garray[i];
202: PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
203: cols += ncols;
204: vals += ncols;
205: }
206: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
207: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
209: ptap->reuse = MAT_REUSE_MATRIX;
211: PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
212: PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_oth->rmap->n], nout);
213: PetscFunctionReturn(PETSC_SUCCESS);
214: }
216: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
217: {
218: Mat_APMPI *ptap;
219: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
220: MPI_Comm comm;
221: PetscMPIInt size, rank;
222: Mat P_loc, P_oth;
223: PetscFreeSpaceList free_space = NULL, current_space = NULL;
224: PetscInt am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
225: PetscInt *lnk, i, k, pnz, row, nsend;
226: PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
227: PETSC_UNUSED PetscMPIInt icompleted = 0;
228: PetscInt **buf_rj, **buf_ri, **buf_ri_k;
229: const PetscInt *owners;
230: PetscInt len, proc, *dnz, *onz, nzi, nspacedouble;
231: PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
232: MPI_Request *swaits, *rwaits;
233: MPI_Status *sstatus, rstatus;
234: PetscLayout rowmap;
235: PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
236: PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */
237: PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
238: Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
239: PetscScalar *apv;
240: PetscHMapI ta;
241: MatType mtype;
242: const char *prefix;
243: #if defined(PETSC_USE_INFO)
244: PetscReal apfill;
245: #endif
247: PetscFunctionBegin;
248: MatCheckProduct(Cmpi, 4);
249: PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
250: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
251: PetscCallMPI(MPI_Comm_size(comm, &size));
252: PetscCallMPI(MPI_Comm_rank(comm, &rank));
254: if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
256: /* create symbolic parallel matrix Cmpi */
257: PetscCall(MatGetType(A, &mtype));
258: PetscCall(MatSetType(Cmpi, mtype));
260: /* create struct Mat_APMPI and attach it to C later */
261: PetscCall(PetscNew(&ptap));
262: ptap->reuse = MAT_INITIAL_MATRIX;
263: ptap->algType = 0;
265: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
266: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
267: /* get P_loc by taking all local rows of P */
268: PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));
270: ptap->P_loc = P_loc;
271: ptap->P_oth = P_oth;
273: /* (0) compute Rd = Pd^T, Ro = Po^T */
274: PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
275: PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
277: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
278: p_loc = (Mat_SeqAIJ *)P_loc->data;
279: if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;
281: /* create and initialize a linked list */
282: PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for computing AP_loc and Cmpi */
283: MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
284: MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
285: PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
287: PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
289: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
290: if (ao) {
291: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
292: } else {
293: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
294: }
295: current_space = free_space;
296: nspacedouble = 0;
298: PetscCall(PetscMalloc1(am + 1, &api));
299: api[0] = 0;
300: for (i = 0; i < am; i++) {
301: /* diagonal portion: Ad[i,:]*P */
302: ai = ad->i;
303: pi = p_loc->i;
304: nzi = ai[i + 1] - ai[i];
305: aj = ad->j + ai[i];
306: for (j = 0; j < nzi; j++) {
307: row = aj[j];
308: pnz = pi[row + 1] - pi[row];
309: Jptr = p_loc->j + pi[row];
310: /* add non-zero cols of P into the sorted linked list lnk */
311: PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
312: }
313: /* off-diagonal portion: Ao[i,:]*P */
314: if (ao) {
315: ai = ao->i;
316: pi = p_oth->i;
317: nzi = ai[i + 1] - ai[i];
318: aj = ao->j + ai[i];
319: for (j = 0; j < nzi; j++) {
320: row = aj[j];
321: pnz = pi[row + 1] - pi[row];
322: Jptr = p_oth->j + pi[row];
323: PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
324: }
325: }
326: apnz = lnk[0];
327: api[i + 1] = api[i] + apnz;
329: /* if free space is not available, double the total space in the list */
330: if (current_space->local_remaining < apnz) {
331: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
332: nspacedouble++;
333: }
335: /* Copy data into free space, then initialize lnk */
336: PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));
338: current_space->array += apnz;
339: current_space->local_used += apnz;
340: current_space->local_remaining -= apnz;
341: }
342: /* Allocate space for apj and apv, initialize apj, and */
343: /* destroy list of free space and other temporary array(s) */
344: PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
345: PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
346: PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
348: /* Create AP_loc for reuse */
349: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
350: PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));
352: #if defined(PETSC_USE_INFO)
353: if (ao) {
354: apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
355: } else {
356: apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
357: }
358: ptap->AP_loc->info.mallocs = nspacedouble;
359: ptap->AP_loc->info.fill_ratio_given = fill;
360: ptap->AP_loc->info.fill_ratio_needed = apfill;
362: if (api[am]) {
363: PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
364: PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
365: } else {
366: PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n"));
367: }
368: #endif
370: /* (2-1) compute symbolic Co = Ro*AP_loc */
371: PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
372: PetscCall(MatGetOptionsPrefix(A, &prefix));
373: PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
374: PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));
376: PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
377: PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
378: PetscCall(MatProductSetFill(ptap->C_oth, fill));
379: PetscCall(MatProductSetFromOptions(ptap->C_oth));
380: PetscCall(MatProductSymbolic(ptap->C_oth));
382: /* (3) send coj of C_oth to other processors */
383: /* determine row ownership */
384: PetscCall(PetscLayoutCreate(comm, &rowmap));
385: PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
386: PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
387: PetscCall(PetscLayoutSetUp(rowmap));
388: PetscCall(PetscLayoutGetRanges(rowmap, &owners));
390: /* determine the number of messages to send, their lengths */
391: PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
392: PetscCall(PetscArrayzero(len_s, size));
393: PetscCall(PetscArrayzero(len_si, size));
395: c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
396: coi = c_oth->i;
397: coj = c_oth->j;
398: con = ptap->C_oth->rmap->n;
399: proc = 0;
400: PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
401: for (i = 0; i < con; i++) {
402: while (prmap[i] >= owners[proc + 1]) proc++;
403: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
404: len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
405: }
407: len = 0; /* total length of buf_s[], see (4) */
408: owners_co[0] = 0;
409: nsend = 0;
410: for (proc = 0; proc < size; proc++) {
411: owners_co[proc + 1] = owners_co[proc] + len_si[proc];
412: if (len_s[proc]) {
413: nsend++;
414: len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
415: len += len_si[proc];
416: }
417: }
419: /* determine the number and length of messages to receive for coi and coj */
420: PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
421: PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
423: /* post the Irecv and Isend of coj */
424: PetscCall(PetscCommGetNewTag(comm, &tagj));
425: PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
426: PetscCall(PetscMalloc1(nsend + 1, &swaits));
427: for (proc = 0, k = 0; proc < size; proc++) {
428: if (!len_s[proc]) continue;
429: i = owners_co[proc];
430: PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
431: k++;
432: }
434: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
435: PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
436: PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
437: PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
438: PetscCall(MatProductSetFill(ptap->C_loc, fill));
440: PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
441: PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));
443: PetscCall(MatProductSetFromOptions(ptap->C_loc));
444: PetscCall(MatProductSymbolic(ptap->C_loc));
446: c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
447: PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));
449: /* wait until the receives of coj are complete */
450: for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
451: PetscCall(PetscFree(rwaits));
452: if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
454: /* add received column indices into ta to update Crmax */
455: for (k = 0; k < nrecv; k++) { /* k-th received message */
456: Jptr = buf_rj[k];
457: for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
458: }
459: PetscCall(PetscHMapIGetSize(ta, &Crmax));
460: PetscCall(PetscHMapIDestroy(&ta));
462: /* (4) send and recv coi */
463: PetscCall(PetscCommGetNewTag(comm, &tagi));
464: PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
465: PetscCall(PetscMalloc1(len + 1, &buf_s));
466: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
467: for (proc = 0, k = 0; proc < size; proc++) {
468: if (!len_s[proc]) continue;
469: /* form outgoing message for i-structure:
470: buf_si[0]: nrows to be sent
471: [1:nrows]: row index (local to the receiving process)
472: [nrows+1:2*nrows+1]: i-structure index
473: */
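    /* e.g., sending 2 rows with 3 and 5 nonzeros to [proc] gives buf_si = {2, row0, row1, 0, 3, 8} and len_si[proc] = 6 */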
474: nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
475: buf_si_i = buf_si + nrows + 1;
476: buf_si[0] = nrows;
477: buf_si_i[0] = 0;
478: nrows = 0;
479: for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
480: nzi = coi[i + 1] - coi[i];
481: buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */
482: buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */
483: nrows++;
484: }
485: PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
486: k++;
487: buf_si += len_si[proc];
488: }
489: for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
490: PetscCall(PetscFree(rwaits));
491: if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
493: PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
494: PetscCall(PetscFree(len_ri));
495: PetscCall(PetscFree(swaits));
496: PetscCall(PetscFree(buf_s));
498: /* (5) compute the local portion of Cmpi */
499: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
500: PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
501: current_space = free_space;
503: PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
504: for (k = 0; k < nrecv; k++) {
505: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
506: nrows = *buf_ri_k[k];
507: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
508: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
509: }
511: MatPreallocateBegin(comm, pn, pn, dnz, onz);
512: PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
513: for (i = 0; i < pn; i++) {
514: /* add C_loc into Cmpi */
515: nzi = c_loc->i[i + 1] - c_loc->i[i];
516: Jptr = c_loc->j + c_loc->i[i];
517: PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
519: /* add received col data into lnk */
520: for (k = 0; k < nrecv; k++) { /* k-th received message */
521: if (i == *nextrow[k]) { /* i-th row */
522: nzi = *(nextci[k] + 1) - *nextci[k];
523: Jptr = buf_rj[k] + *nextci[k];
524: PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
525: nextrow[k]++;
526: nextci[k]++;
527: }
528: }
529: nzi = lnk[0];
531: /* copy data into free space, then initialize lnk */
532: PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
533: PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
534: }
535: PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
536: PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
537: PetscCall(PetscFreeSpaceDestroy(free_space));
539: /* local sizes and preallocation */
540: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
541: if (P->cmap->bs > 0) {
542: PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
543: PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
544: }
545: PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
546: MatPreallocateEnd(dnz, onz);
548: /* members in merge */
549: PetscCall(PetscFree(id_r));
550: PetscCall(PetscFree(len_r));
551: PetscCall(PetscFree(buf_ri[0]));
552: PetscCall(PetscFree(buf_ri));
553: PetscCall(PetscFree(buf_rj[0]));
554: PetscCall(PetscFree(buf_rj));
555: PetscCall(PetscLayoutDestroy(&rowmap));
557: nout = 0;
558: PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
559: PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
560: PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
561: PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);
563: /* attach the supporting struct to Cmpi for reuse */
564: Cmpi->product->data = ptap;
565: Cmpi->product->view = MatView_MPIAIJ_PtAP;
566: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
568: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
569: Cmpi->assembled = PETSC_FALSE;
570: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
571: PetscFunctionReturn(PETSC_SUCCESS);
572: }
574: static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
575: {
576: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
577: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
578: PetscInt *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
579: PetscInt pcstart, pcend, column, offset;
581: PetscFunctionBegin;
582: pcstart = P->cmap->rstart;
583: pcstart *= dof;
584: pcend = P->cmap->rend;
585: pcend *= dof;
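  /* dof > 1 convention: A acts on the dof-interleaved system while P is stored per node, so a local
     column r of A corresponds to node r/dof and component r%dof, and a column c of P expands to the
     interleaved index c*dof + r%dof (shifted by pcstart for the diagonal block of P) */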
586: /* diagonal portion: Ad[i,:]*P */
587: ai = ad->i;
588: nzi = ai[i + 1] - ai[i];
589: aj = ad->j + ai[i];
590: for (j = 0; j < nzi; j++) {
591: row = aj[j];
592: offset = row % dof;
593: row /= dof;
594: nzpi = pd->i[row + 1] - pd->i[row];
595: pj = pd->j + pd->i[row];
596: for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
597: }
598: /* off diag P */
599: for (j = 0; j < nzi; j++) {
600: row = aj[j];
601: offset = row % dof;
602: row /= dof;
603: nzpi = po->i[row + 1] - po->i[row];
604: pj = po->j + po->i[row];
605: for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
606: }
608: /* off diagonal part: Ao[i, :]*P_oth */
609: if (ao) {
610: ai = ao->i;
611: pi = p_oth->i;
612: nzi = ai[i + 1] - ai[i];
613: aj = ao->j + ai[i];
614: for (j = 0; j < nzi; j++) {
615: row = aj[j];
616: offset = a->garray[row] % dof;
617: row = map[row];
618: pnz = pi[row + 1] - pi[row];
619: p_othcols = p_oth->j + pi[row];
620: for (col = 0; col < pnz; col++) {
621: column = p_othcols[col] * dof + offset;
622: if (column >= pcstart && column < pcend) {
623: PetscCall(PetscHSetIAdd(dht, column));
624: } else {
625: PetscCall(PetscHSetIAdd(oht, column));
626: }
627: }
628: }
629: } /* end if (ao) */
630: PetscFunctionReturn(PETSC_SUCCESS);
631: }
633: static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
634: {
635: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
636: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
637: PetscInt *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
638: PetscScalar ra, *aa, *pa;
640: PetscFunctionBegin;
641: pcstart = P->cmap->rstart;
642: pcstart *= dof;
644: /* diagonal portion: Ad[i,:]*P */
645: ai = ad->i;
646: nzi = ai[i + 1] - ai[i];
647: aj = ad->j + ai[i];
648: aa = ad->a + ai[i];
649: for (j = 0; j < nzi; j++) {
650: ra = aa[j];
651: row = aj[j];
652: offset = row % dof;
653: row /= dof;
654: nzpi = pd->i[row + 1] - pd->i[row];
655: pj = pd->j + pd->i[row];
656: pa = pd->a + pd->i[row];
657: for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
658: PetscCall(PetscLogFlops(2.0 * nzpi));
659: }
660: for (j = 0; j < nzi; j++) {
661: ra = aa[j];
662: row = aj[j];
663: offset = row % dof;
664: row /= dof;
665: nzpi = po->i[row + 1] - po->i[row];
666: pj = po->j + po->i[row];
667: pa = po->a + po->i[row];
668: for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
669: PetscCall(PetscLogFlops(2.0 * nzpi));
670: }
672: /* off diagonal part: Ao[i, :]*P_oth */
673: if (ao) {
674: ai = ao->i;
675: pi = p_oth->i;
676: nzi = ai[i + 1] - ai[i];
677: aj = ao->j + ai[i];
678: aa = ao->a + ai[i];
679: for (j = 0; j < nzi; j++) {
680: row = aj[j];
681: offset = a->garray[row] % dof;
682: row = map[row];
683: ra = aa[j];
684: pnz = pi[row + 1] - pi[row];
685: p_othcols = p_oth->j + pi[row];
686: pa = p_oth->a + pi[row];
687: for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
688: PetscCall(PetscLogFlops(2.0 * pnz));
689: }
690: } /* end if (ao) */
692: PetscFunctionReturn(PETSC_SUCCESS);
693: }
695: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
697: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
698: {
699: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
700: Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
701: Mat_APMPI *ptap;
702: PetscHMapIV hmap;
703: PetscInt i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
704: PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
705: PetscInt offset, ii, pocol;
706: const PetscInt *mappingindices;
707: IS map;
709: PetscFunctionBegin;
710: MatCheckProduct(C, 4);
711: ptap = (Mat_APMPI *)C->product->data;
712: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
713: PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
715: PetscCall(MatZeroEntries(C));
717: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
718: if (ptap->reuse == MAT_REUSE_MATRIX) {
719: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
720: PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
721: }
722: PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
724: PetscCall(MatGetLocalSize(p->B, NULL, &pon));
725: pon *= dof;
726: PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
727: PetscCall(MatGetLocalSize(A, &am, NULL));
728: cmaxr = 0;
729: for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
730: PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
731: PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
732: PetscCall(ISGetIndices(map, &mappingindices));
733: for (i = 0; i < am && pon; i++) {
734: PetscCall(PetscHMapIVClear(hmap));
735: offset = i % dof;
736: ii = i / dof;
737: nzi = po->i[ii + 1] - po->i[ii];
738: if (!nzi) continue;
739: PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
740: voff = 0;
741: PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
742: if (!voff) continue;
744: /* Form C(ii, :) */
745: poj = po->j + po->i[ii];
746: poa = po->a + po->i[ii];
747: for (j = 0; j < nzi; j++) {
748: pocol = poj[j] * dof + offset;
749: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
750: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
751: for (jj = 0; jj < voff; jj++) {
752: apvaluestmp[jj] = apvalues[jj] * poa[j];
753: /* If the row is empty */
754: if (!c_rmtc[pocol]) {
755: c_rmtjj[jj] = apindices[jj];
756: c_rmtaa[jj] = apvaluestmp[jj];
757: c_rmtc[pocol]++;
758: } else {
759: PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
760: if (loc >= 0) { /* hit */
761: c_rmtaa[loc] += apvaluestmp[jj];
762: PetscCall(PetscLogFlops(1.0));
763: } else { /* new element */
764: loc = -(loc + 1);
765: /* Move data backward */
766: for (kk = c_rmtc[pocol]; kk > loc; kk--) {
767: c_rmtjj[kk] = c_rmtjj[kk - 1];
768: c_rmtaa[kk] = c_rmtaa[kk - 1];
769: } /* End kk */
770: c_rmtjj[loc] = apindices[jj];
771: c_rmtaa[loc] = apvaluestmp[jj];
772: c_rmtc[pocol]++;
773: }
774: }
775: PetscCall(PetscLogFlops(voff));
776: } /* End jj */
777: } /* End j */
778: } /* End i */
780: PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
781: PetscCall(PetscHMapIVDestroy(&hmap));
783: PetscCall(MatGetLocalSize(P, NULL, &pn));
784: pn *= dof;
785: PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
787: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
788: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
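  /* the reduction of the remote contributions starts here and is completed only after the local rows of C
     are formed below, overlapping communication with computation (hence the "overlapping" algorithm name) */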
789: PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
790: pcstart = pcstart * dof;
791: pcend = pcend * dof;
792: cd = (Mat_SeqAIJ *)(c->A)->data;
793: co = (Mat_SeqAIJ *)(c->B)->data;
795: cmaxr = 0;
796: for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
797: PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
798: PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
799: for (i = 0; i < am && pn; i++) {
800: PetscCall(PetscHMapIVClear(hmap));
801: offset = i % dof;
802: ii = i / dof;
803: nzi = pd->i[ii + 1] - pd->i[ii];
804: if (!nzi) continue;
805: PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
806: voff = 0;
807: PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
808: if (!voff) continue;
809: /* Form C(ii, :) */
810: pdj = pd->j + pd->i[ii];
811: pda = pd->a + pd->i[ii];
812: for (j = 0; j < nzi; j++) {
813: row = pcstart + pdj[j] * dof + offset;
814: for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
815: PetscCall(PetscLogFlops(voff));
816: PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
817: }
818: }
819: PetscCall(ISRestoreIndices(map, &mappingindices));
820: PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
821: PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
822: PetscCall(PetscHMapIVDestroy(&hmap));
823: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
824: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
825: PetscCall(PetscFree2(c_rmtj, c_rmta));
827: /* Add contributions from remote */
828: for (i = 0; i < pn; i++) {
829: row = i + pcstart;
830: PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
831: }
832: PetscCall(PetscFree2(c_othj, c_otha));
834: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
835: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
837: ptap->reuse = MAT_REUSE_MATRIX;
838: PetscFunctionReturn(PETSC_SUCCESS);
839: }
841: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
842: {
843: PetscFunctionBegin;
845: PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
846: PetscFunctionReturn(PETSC_SUCCESS);
847: }
849: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
850: {
851: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
852: Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
853: Mat_APMPI *ptap;
854: PetscHMapIV hmap;
855: PetscInt i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
856: PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
857: PetscInt offset, ii, pocol;
858: const PetscInt *mappingindices;
859: IS map;
861: PetscFunctionBegin;
862: MatCheckProduct(C, 4);
863: ptap = (Mat_APMPI *)C->product->data;
864: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
865: PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
867: PetscCall(MatZeroEntries(C));
869: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
870: if (ptap->reuse == MAT_REUSE_MATRIX) {
871: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
872: PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
873: }
874: PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
875: PetscCall(MatGetLocalSize(p->B, NULL, &pon));
876: pon *= dof;
877: PetscCall(MatGetLocalSize(P, NULL, &pn));
878: pn *= dof;
880: PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
881: PetscCall(MatGetLocalSize(A, &am, NULL));
882: PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
883: pcstart *= dof;
884: pcend *= dof;
885: cmaxr = 0;
886: for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
887: cd = (Mat_SeqAIJ *)(c->A)->data;
888: co = (Mat_SeqAIJ *)(c->B)->data;
889: for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
890: PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
891: PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
892: PetscCall(ISGetIndices(map, &mappingindices));
893: for (i = 0; i < am && (pon || pn); i++) {
894: PetscCall(PetscHMapIVClear(hmap));
895: offset = i % dof;
896: ii = i / dof;
897: nzi = po->i[ii + 1] - po->i[ii];
898: dnzi = pd->i[ii + 1] - pd->i[ii];
899: if (!nzi && !dnzi) continue;
900: PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
901: voff = 0;
902: PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
903: if (!voff) continue;
905: /* Form remote C(ii, :) */
906: poj = po->j + po->i[ii];
907: poa = po->a + po->i[ii];
908: for (j = 0; j < nzi; j++) {
909: pocol = poj[j] * dof + offset;
910: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
911: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
912: for (jj = 0; jj < voff; jj++) {
913: apvaluestmp[jj] = apvalues[jj] * poa[j];
914: /* If the row is empty */
915: if (!c_rmtc[pocol]) {
916: c_rmtjj[jj] = apindices[jj];
917: c_rmtaa[jj] = apvaluestmp[jj];
918: c_rmtc[pocol]++;
919: } else {
920: PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
921: if (loc >= 0) { /* hit */
922: c_rmtaa[loc] += apvaluestmp[jj];
923: PetscCall(PetscLogFlops(1.0));
924: } else { /* new element */
925: loc = -(loc + 1);
926: /* Move data backward */
927: for (kk = c_rmtc[pocol]; kk > loc; kk--) {
928: c_rmtjj[kk] = c_rmtjj[kk - 1];
929: c_rmtaa[kk] = c_rmtaa[kk - 1];
930: } /* End kk */
931: c_rmtjj[loc] = apindices[jj];
932: c_rmtaa[loc] = apvaluestmp[jj];
933: c_rmtc[pocol]++;
934: }
935: }
936: } /* End jj */
937: PetscCall(PetscLogFlops(voff));
938: } /* End j */
940: /* Form local C(ii, :) */
941: pdj = pd->j + pd->i[ii];
942: pda = pd->a + pd->i[ii];
943: for (j = 0; j < dnzi; j++) {
944: row = pcstart + pdj[j] * dof + offset;
945: for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
946: PetscCall(PetscLogFlops(voff));
947: PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
948: } /* End j */
949: } /* End i */
951: PetscCall(ISRestoreIndices(map, &mappingindices));
952: PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
953: PetscCall(PetscHMapIVDestroy(&hmap));
954: PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
956: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
957: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
958: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
959: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
960: PetscCall(PetscFree2(c_rmtj, c_rmta));
962: /* Add contributions from remote */
963: for (i = 0; i < pn; i++) {
964: row = i + pcstart;
965: PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
966: }
967: PetscCall(PetscFree2(c_othj, c_otha));
969: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
970: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
972: ptap->reuse = MAT_REUSE_MATRIX;
973: PetscFunctionReturn(PETSC_SUCCESS);
974: }
976: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
977: {
978: PetscFunctionBegin;
980: PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
981: PetscFunctionReturn(PETSC_SUCCESS);
982: }
984: /* TODO: move algorithm selection to MatProductSetFromOptions */
985: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
986: {
987: Mat_APMPI *ptap;
988: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
989: MPI_Comm comm;
990: Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
991: MatType mtype;
992: PetscSF sf;
993: PetscSFNode *iremote;
994: PetscInt rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
995: const PetscInt *rootdegrees;
996: PetscHSetI ht, oht, *hta, *hto;
997: PetscInt pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
998: PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
999: PetscInt nalg = 2, alg = 0, offset, ii;
1000: PetscMPIInt owner;
1001: const PetscInt *mappingindices;
1002: PetscBool flg;
1003: const char *algTypes[2] = {"overlapping", "merged"};
1004: IS map;
1006: PetscFunctionBegin;
1007: MatCheckProduct(Cmpi, 5);
1008: PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1009: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1011: /* Create symbolic parallel matrix Cmpi */
1012: PetscCall(MatGetLocalSize(P, NULL, &pn));
1013: pn *= dof;
1014: PetscCall(MatGetType(A, &mtype));
1015: PetscCall(MatSetType(Cmpi, mtype));
1016: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1018: PetscCall(PetscNew(&ptap));
1019: ptap->reuse = MAT_INITIAL_MATRIX;
1020: ptap->algType = 2;
1022: /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1023: PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1024: PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1025: /* This equals the number of off-diagonal columns in P */
1026: PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1027: pon *= dof;
1028: /* offsets */
1029: PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1030: /* The number of columns we will send to remote ranks */
1031: PetscCall(PetscMalloc1(pon, &c_rmtc));
1032: PetscCall(PetscMalloc1(pon, &hta));
1033: for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1034: PetscCall(MatGetLocalSize(A, &am, NULL));
1035: PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1036: /* Create hash table to merge all columns for C(i, :) */
1037: PetscCall(PetscHSetICreate(&ht));
1039: PetscCall(ISGetIndices(map, &mappingindices));
1040: ptap->c_rmti[0] = 0;
1041: /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1042: for (i = 0; i < am && pon; i++) {
1043: /* Form one row of AP */
1044: PetscCall(PetscHSetIClear(ht));
1045: offset = i % dof;
1046: ii = i / dof;
1047: /* If the off diag is empty, we should not do any calculation */
1048: nzi = po->i[ii + 1] - po->i[ii];
1049: if (!nzi) continue;
1051: PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
1052: PetscCall(PetscHSetIGetSize(ht, &htsize));
1053: /* If AP is empty, just continue */
1054: if (!htsize) continue;
1055: /* Form C(ii, :) */
1056: poj = po->j + po->i[ii];
1057: for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1058: }
1060: for (i = 0; i < pon; i++) {
1061: PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1062: ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1063: c_rmtc[i] = htsize;
1064: }
1066: PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1068: for (i = 0; i < pon; i++) {
1069: off = 0;
1070: PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1071: PetscCall(PetscHSetIDestroy(&hta[i]));
1072: }
1073: PetscCall(PetscFree(hta));
1075: PetscCall(PetscMalloc1(pon, &iremote));
1076: for (i = 0; i < pon; i++) {
1077: owner = 0;
1078: lidx = 0;
1079: offset = i % dof;
1080: ii = i / dof;
1081: PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1082: iremote[i].index = lidx * dof + offset;
1083: iremote[i].rank = owner;
1084: }
1086: PetscCall(PetscSFCreate(comm, &sf));
1087: PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1088: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1089: PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1090: PetscCall(PetscSFSetFromOptions(sf));
1091: PetscCall(PetscSFSetUp(sf));
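  /* in sf each leaf is one of the pon off-diagonal columns of P on this rank (a row of C owned by another
     rank); its root is the corresponding locally numbered row on the owning rank */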
1092: /* How many neighbors have contributions to my rows? */
1093: PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1094: PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1095: rootspacesize = 0;
1096: for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1097: PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1098: PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1099: /* Get information from leaves
1100: * Number of columns other ranks contribute to my rows
1101: * */
1102: PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1103: PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1104: PetscCall(PetscFree(c_rmtc));
1105: PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
1106: /* Count the number of columns to be received for each row */
1107: ptap->c_othi[0] = 0;
1108: rootspacesize = 0;
1109: rootspaceoffsets[0] = 0;
1110: for (i = 0; i < pn; i++) {
1111: rcvncols = 0;
1112: for (j = 0; j < rootdegrees[i]; j++) {
1113: rcvncols += rootspace[rootspacesize];
1114: rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1115: rootspacesize++;
1116: }
1117: ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1118: }
1119: PetscCall(PetscFree(rootspace));
1121: PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1122: PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1123: PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1124: PetscCall(PetscSFDestroy(&sf));
1125: PetscCall(PetscFree(rootspaceoffsets));
1127: PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1128: nleaves = 0;
1129: for (i = 0; i < pon; i++) {
1130: owner = 0;
1131: ii = i / dof;
1132: PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1133: sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1134: for (j = 0; j < sendncols; j++) {
1135: iremote[nleaves].rank = owner;
1136: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1137: }
1138: }
1139: PetscCall(PetscFree(c_rmtoffsets));
1140: PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1142: PetscCall(PetscSFCreate(comm, &ptap->sf));
1143: PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1144: PetscCall(PetscSFSetFromOptions(ptap->sf));
1145: /* One to one map */
1146: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1148: PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1149: PetscCall(PetscHSetICreate(&oht));
1150: PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1151: pcstart *= dof;
1152: pcend *= dof;
1153: PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
1154: for (i = 0; i < pn; i++) {
1155: PetscCall(PetscHSetICreate(&hta[i]));
1156: PetscCall(PetscHSetICreate(&hto[i]));
1157: }
1158: /* Work on local part */
1159: /* 4) Pass 1: Estimate memory for C_loc */
1160: for (i = 0; i < am && pn; i++) {
1161: PetscCall(PetscHSetIClear(ht));
1162: PetscCall(PetscHSetIClear(oht));
1163: offset = i % dof;
1164: ii = i / dof;
1165: nzi = pd->i[ii + 1] - pd->i[ii];
1166: if (!nzi) continue;
1168: PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1169: PetscCall(PetscHSetIGetSize(ht, &htsize));
1170: PetscCall(PetscHSetIGetSize(oht, &htosize));
1171: if (!(htsize + htosize)) continue;
1172: /* Form C(ii, :) */
1173: pdj = pd->j + pd->i[ii];
1174: for (j = 0; j < nzi; j++) {
1175: PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
1176: PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1177: }
1178: }
1180: PetscCall(ISRestoreIndices(map, &mappingindices));
1182: PetscCall(PetscHSetIDestroy(&ht));
1183: PetscCall(PetscHSetIDestroy(&oht));
1185: /* Get remote data */
1186: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1187: PetscCall(PetscFree(c_rmtj));
1189: for (i = 0; i < pn; i++) {
1190: nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1191: rdj = c_othj + ptap->c_othi[i];
1192: for (j = 0; j < nzi; j++) {
1193: col = rdj[j];
1194: /* diag part */
1195: if (col >= pcstart && col < pcend) {
1196: PetscCall(PetscHSetIAdd(hta[i], col));
1197: } else { /* off diag */
1198: PetscCall(PetscHSetIAdd(hto[i], col));
1199: }
1200: }
1201: PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1202: dnz[i] = htsize;
1203: PetscCall(PetscHSetIDestroy(&hta[i]));
1204: PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1205: onz[i] = htsize;
1206: PetscCall(PetscHSetIDestroy(&hto[i]));
1207: }
1209: PetscCall(PetscFree2(hta, hto));
1210: PetscCall(PetscFree(c_othj));
1212: /* local sizes and preallocation */
1213: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1214: PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1215: PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1216: PetscCall(MatSetUp(Cmpi));
1217: PetscCall(PetscFree2(dnz, onz));
1219: /* attach the supporting struct to Cmpi for reuse */
1220: Cmpi->product->data = ptap;
1221: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1222: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1224: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1225: Cmpi->assembled = PETSC_FALSE;
1226: /* pick an algorithm */
1227: PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1228: alg = 0;
1229: PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1230: PetscOptionsEnd();
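  /* e.g., run with -matptap_allatonce_via merged (prepended with A's options prefix, if any) to select
     the merged numeric kernel */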
1231: switch (alg) {
1232: case 0:
1233: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1234: break;
1235: case 1:
1236: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1237: break;
1238: default:
1239: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1240: }
1241: PetscFunctionReturn(PETSC_SUCCESS);
1242: }
1244: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
1245: {
1246: PetscFunctionBegin;
1247: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
1248: PetscFunctionReturn(PETSC_SUCCESS);
1249: }
1251: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
1252: {
1253: Mat_APMPI *ptap;
1254: Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
1255: MPI_Comm comm;
1256: Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
1257: MatType mtype;
1258: PetscSF sf;
1259: PetscSFNode *iremote;
1260: PetscInt rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
1261: const PetscInt *rootdegrees;
1262: PetscHSetI ht, oht, *hta, *hto, *htd;
1263: PetscInt pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
1264: PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
1265: PetscInt nalg = 2, alg = 0, offset, ii;
1266: PetscMPIInt owner;
1267: PetscBool flg;
1268: const char *algTypes[2] = {"merged", "overlapping"};
1269: const PetscInt *mappingindices;
1270: IS map;
1272: PetscFunctionBegin;
1273: MatCheckProduct(Cmpi, 5);
1274: PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1275: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1277: /* Create symbolic parallel matrix Cmpi */
1278: PetscCall(MatGetLocalSize(P, NULL, &pn));
1279: pn *= dof;
1280: PetscCall(MatGetType(A, &mtype));
1281: PetscCall(MatSetType(Cmpi, mtype));
1282: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1284: PetscCall(PetscNew(&ptap));
1285: ptap->reuse = MAT_INITIAL_MATRIX;
1286: ptap->algType = 3;
1288: /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1289: PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
1290: PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
1292: /* This equals the number of off-diagonal columns in P */
1293: PetscCall(MatGetLocalSize(p->B, NULL, &pon));
1294: pon *= dof;
1295: /* offsets */
1296: PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
1297: /* The number of columns we will send to remote ranks */
1298: PetscCall(PetscMalloc1(pon, &c_rmtc));
1299: PetscCall(PetscMalloc1(pon, &hta));
1300: for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
1301: PetscCall(MatGetLocalSize(A, &am, NULL));
1302: PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
1303: /* Create hash table to merge all columns for C(i, :) */
1304: PetscCall(PetscHSetICreate(&ht));
1305: PetscCall(PetscHSetICreate(&oht));
1306: PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
1307: for (i = 0; i < pn; i++) {
1308: PetscCall(PetscHSetICreate(&htd[i]));
1309: PetscCall(PetscHSetICreate(&hto[i]));
1310: }
1312: PetscCall(ISGetIndices(map, &mappingindices));
1313: ptap->c_rmti[0] = 0;
1314: /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1315: for (i = 0; i < am && (pon || pn); i++) {
1316: /* Form one row of AP */
1317: PetscCall(PetscHSetIClear(ht));
1318: PetscCall(PetscHSetIClear(oht));
1319: offset = i % dof;
1320: ii = i / dof;
1321: /* If the off diag is empty, we should not do any calculation */
1322: nzi = po->i[ii + 1] - po->i[ii];
1323: dnzi = pd->i[ii + 1] - pd->i[ii];
1324: if (!nzi && !dnzi) continue;
1326: PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
1327: PetscCall(PetscHSetIGetSize(ht, &htsize));
1328: PetscCall(PetscHSetIGetSize(oht, &htosize));
1329: /* If AP is empty, just continue */
1330: if (!(htsize + htosize)) continue;
1332: /* Form remote C(ii, :) */
1333: poj = po->j + po->i[ii];
1334: for (j = 0; j < nzi; j++) {
1335: PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
1336: PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
1337: }
1339: /* Form local C(ii, :) */
1340: pdj = pd->j + pd->i[ii];
1341: for (j = 0; j < dnzi; j++) {
1342: PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
1343: PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
1344: }
1345: }
1347: PetscCall(ISRestoreIndices(map, &mappingindices));
1349: PetscCall(PetscHSetIDestroy(&ht));
1350: PetscCall(PetscHSetIDestroy(&oht));
1352: for (i = 0; i < pon; i++) {
1353: PetscCall(PetscHSetIGetSize(hta[i], &htsize));
1354: ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
1355: c_rmtc[i] = htsize;
1356: }
1358: PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
1360: for (i = 0; i < pon; i++) {
1361: off = 0;
1362: PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
1363: PetscCall(PetscHSetIDestroy(&hta[i]));
1364: }
1365: PetscCall(PetscFree(hta));
1367: PetscCall(PetscMalloc1(pon, &iremote));
1368: for (i = 0; i < pon; i++) {
1369: owner = 0;
1370: lidx = 0;
1371: offset = i % dof;
1372: ii = i / dof;
1373: PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
1374: iremote[i].index = lidx * dof + offset;
1375: iremote[i].rank = owner;
1376: }
1378: PetscCall(PetscSFCreate(comm, &sf));
1379: PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1380: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1381: PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
1382: PetscCall(PetscSFSetFromOptions(sf));
1383: PetscCall(PetscSFSetUp(sf));
1384: /* How many neighbors have contributions to my rows? */
1385: PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
1386: PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
1387: rootspacesize = 0;
1388: for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
1389: PetscCall(PetscMalloc1(rootspacesize, &rootspace));
1390: PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
1391: /* Gather from the leaves the number of columns
1392: * that other processes contribute to my rows
1393: */
1394: PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
1395: PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
1396: PetscCall(PetscFree(c_rmtc));
1397: PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
1398: /* Compute, for each row, the total number of columns received */
1399: ptap->c_othi[0] = 0;
1400: rootspacesize = 0;
1401: rootspaceoffsets[0] = 0;
1402: for (i = 0; i < pn; i++) {
1403: rcvncols = 0;
1404: for (j = 0; j < rootdegrees[i]; j++) {
1405: rcvncols += rootspace[rootspacesize];
1406: rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1407: rootspacesize++;
1408: }
1409: ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
1410: }
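/* c_othi[] is now a prefix sum of the number of remote columns to be received for each local
   row of C, while rootspaceoffsets[] records where each individual contribution starts in the
   receive buffer; the offsets are scattered back to the leaves below so that every sender knows
   where its column indices land on the receiving side. */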
1411: PetscCall(PetscFree(rootspace));
1413: PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
1414: PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1415: PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
1416: PetscCall(PetscSFDestroy(&sf));
1417: PetscCall(PetscFree(rootspaceoffsets));
1419: PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
1420: nleaves = 0;
1421: for (i = 0; i < pon; i++) {
1422: owner = 0;
1423: ii = i / dof;
1424: PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
1425: sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
1426: for (j = 0; j < sendncols; j++) {
1427: iremote[nleaves].rank = owner;
1428: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1429: }
1430: }
1431: PetscCall(PetscFree(c_rmtoffsets));
1432: PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));
1434: PetscCall(PetscSFCreate(comm, &ptap->sf));
1435: PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
1436: PetscCall(PetscSFSetFromOptions(ptap->sf));
1437: /* One to one map */
1438: PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1439: /* Get remote data */
1440: PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
1441: PetscCall(PetscFree(c_rmtj));
1442: PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
1443: PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
1444: pcstart *= dof;
1445: pcend *= dof;
1446: for (i = 0; i < pn; i++) {
1447: nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
1448: rdj = c_othj + ptap->c_othi[i];
1449: for (j = 0; j < nzi; j++) {
1450: col = rdj[j];
1451: /* diag part */
1452: if (col >= pcstart && col < pcend) {
1453: PetscCall(PetscHSetIAdd(htd[i], col));
1454: } else { /* off diag */
1455: PetscCall(PetscHSetIAdd(hto[i], col));
1456: }
1457: }
1458: PetscCall(PetscHSetIGetSize(htd[i], &htsize));
1459: dnz[i] = htsize;
1460: PetscCall(PetscHSetIDestroy(&htd[i]));
1461: PetscCall(PetscHSetIGetSize(hto[i], &htsize));
1462: onz[i] = htsize;
1463: PetscCall(PetscHSetIDestroy(&hto[i]));
1464: }
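/* dnz[i] and onz[i] now count the distinct diagonal and off-diagonal columns of row i of C,
   combining the locally computed pattern with the remote contributions delivered through
   ptap->sf; they feed MatMPIAIJSetPreallocation() below. */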
1466: PetscCall(PetscFree2(htd, hto));
1467: PetscCall(PetscFree(c_othj));
1469: /* local sizes and preallocation */
1470: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1471: PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
1472: PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1473: PetscCall(PetscFree2(dnz, onz));
1475: /* attach the supporting struct to Cmpi for reuse */
1476: Cmpi->product->data = ptap;
1477: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1478: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1480: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1481: Cmpi->assembled = PETSC_FALSE;
1482: /* pick an algorithm */
1483: PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
1484: alg = 0;
1485: PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
1486: PetscOptionsEnd();
1487: switch (alg) {
1488: case 0:
1489: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1490: break;
1491: case 1:
1492: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1493: break;
1494: default:
1495: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
1496: }
1497: PetscFunctionReturn(PETSC_SUCCESS);
1498: }
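/* The numeric kernel selected by the switch above can also be chosen at run time with
   -matptap_allatonce_via <type>, where the valid values are the entries of algTypes[]
   defined earlier in this file (not shown here). */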
1500: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
1501: {
1502: PetscFunctionBegin;
1503: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
1504: PetscFunctionReturn(PETSC_SUCCESS);
1505: }
1507: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
1508: {
1509: Mat_APMPI *ptap;
1510: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1511: MPI_Comm comm;
1512: PetscMPIInt size, rank;
1513: PetscFreeSpaceList free_space = NULL, current_space = NULL;
1514: PetscInt am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
1515: PetscInt *lnk, i, k, pnz, row, nsend;
1516: PetscBT lnkbt;
1517: PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
1518: PETSC_UNUSED PetscMPIInt icompleted = 0;
1519: PetscInt **buf_rj, **buf_ri, **buf_ri_k;
1520: PetscInt len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
1521: PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
1522: MPI_Request *swaits, *rwaits;
1523: MPI_Status *sstatus, rstatus;
1524: PetscLayout rowmap;
1525: PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1526: PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */
1527: PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
1528: Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
1529: PetscScalar *apv;
1530: PetscHMapI ta;
1531: MatType mtype;
1532: const char *prefix;
1533: #if defined(PETSC_USE_INFO)
1534: PetscReal apfill;
1535: #endif
1537: PetscFunctionBegin;
1538: MatCheckProduct(Cmpi, 4);
1539: PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
1540: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1541: PetscCallMPI(MPI_Comm_size(comm, &size));
1542: PetscCallMPI(MPI_Comm_rank(comm, &rank));
1544: if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;
1546: /* create symbolic parallel matrix Cmpi */
1547: PetscCall(MatGetType(A, &mtype));
1548: PetscCall(MatSetType(Cmpi, mtype));
1550: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1551: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1553: /* create struct Mat_APMPI and attach it to C later */
1554: PetscCall(PetscNew(&ptap));
1555: ptap->reuse = MAT_INITIAL_MATRIX;
1556: ptap->algType = 1;
1558: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1559: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1560: /* get P_loc by taking all local rows of P */
1561: PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));
1563: /* (0) compute Rd = Pd^T, Ro = Po^T */
1564: PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
1565: PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));
1567: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1568: p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1569: if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1571: /* create and initialize a linked list */
1572: PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */
1573: MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
1574: MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
1575: PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
1576: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1578: PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1580: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1581: if (ao) {
1582: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
1583: } else {
1584: PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
1585: }
1586: current_space = free_space;
1587: nspacedouble = 0;
1589: PetscCall(PetscMalloc1(am + 1, &api));
1590: api[0] = 0;
1591: for (i = 0; i < am; i++) {
1592: /* diagonal portion: Ad[i,:]*P */
1593: ai = ad->i;
1594: pi = p_loc->i;
1595: nzi = ai[i + 1] - ai[i];
1596: aj = ad->j + ai[i];
1597: for (j = 0; j < nzi; j++) {
1598: row = aj[j];
1599: pnz = pi[row + 1] - pi[row];
1600: Jptr = p_loc->j + pi[row];
1601: /* add non-zero cols of P into the sorted linked list lnk */
1602: PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1603: }
1604: /* off-diagonal portion: Ao[i,:]*P */
1605: if (ao) {
1606: ai = ao->i;
1607: pi = p_oth->i;
1608: nzi = ai[i + 1] - ai[i];
1609: aj = ao->j + ai[i];
1610: for (j = 0; j < nzi; j++) {
1611: row = aj[j];
1612: pnz = pi[row + 1] - pi[row];
1613: Jptr = p_oth->j + pi[row];
1614: PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
1615: }
1616: }
1617: apnz = lnk[0];
1618: api[i + 1] = api[i] + apnz;
1619: if (ap_rmax < apnz) ap_rmax = apnz;
1621: /* if free space is not available, double the total space in the list */
1622: if (current_space->local_remaining < apnz) {
1623: PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
1624: nspacedouble++;
1625: }
1627: /* Copy data into free space, then initialize lnk */
1628: PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));
1630: current_space->array += apnz;
1631: current_space->local_used += apnz;
1632: current_space->local_remaining -= apnz;
1633: }
1634: /* Allocate space for apj and apv, initialize apj, and */
1635: /* destroy list of free space and other temporary array(s) */
1636: PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
1637: PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
1638: PetscCall(PetscLLDestroy(lnk, lnkbt));
1640: /* Create AP_loc for reuse */
1641: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
1642: PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
1643: #if defined(PETSC_USE_INFO)
1644: if (ao) {
1645: apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
1646: } else {
1647: apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
1648: }
1649: ptap->AP_loc->info.mallocs = nspacedouble;
1650: ptap->AP_loc->info.fill_ratio_given = fill;
1651: ptap->AP_loc->info.fill_ratio_needed = apfill;
1653: if (api[am]) {
1654: PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
1655: PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
1656: } else {
1657: PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty\n"));
1658: }
1659: #endif
1661: /* (2-1) compute symbolic Co = Ro*AP_loc */
1662: PetscCall(MatGetOptionsPrefix(A, &prefix));
1663: PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
1664: PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
1665: PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
1666: PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1667: PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
1668: PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
1669: PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
1670: PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
1671: PetscCall(MatProductSetFill(ptap->C_oth, fill));
1672: PetscCall(MatProductSetFromOptions(ptap->C_oth));
1673: PetscCall(MatProductSymbolic(ptap->C_oth));
1675: /* (3) send coj of C_oth to other processors */
1676: /* determine row ownership */
1677: PetscCall(PetscLayoutCreate(comm, &rowmap));
1678: rowmap->n = pn;
1679: rowmap->bs = 1;
1680: PetscCall(PetscLayoutSetUp(rowmap));
1681: owners = rowmap->range;
1683: /* determine the number of messages to send, their lengths */
1684: PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
1685: PetscCall(PetscArrayzero(len_s, size));
1686: PetscCall(PetscArrayzero(len_si, size));
1688: c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
1689: coi = c_oth->i;
1690: coj = c_oth->j;
1691: con = ptap->C_oth->rmap->n;
1692: proc = 0;
1693: for (i = 0; i < con; i++) {
1694: while (prmap[i] >= owners[proc + 1]) proc++;
1695: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1696: len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1697: }
1699: len = 0; /* total length of buf_s[], see (4) */
1700: owners_co[0] = 0;
1701: nsend = 0;
1702: for (proc = 0; proc < size; proc++) {
1703: owners_co[proc + 1] = owners_co[proc] + len_si[proc];
1704: if (len_s[proc]) {
1705: nsend++;
1706: len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1707: len += len_si[proc];
1708: }
1709: }
1711: /* determine the number and length of messages to receive for coi and coj */
1712: PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
1713: PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));
1715: /* post the Irecv and Isend of coj */
1716: PetscCall(PetscCommGetNewTag(comm, &tagj));
1717: PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
1718: PetscCall(PetscMalloc1(nsend + 1, &swaits));
1719: for (proc = 0, k = 0; proc < size; proc++) {
1720: if (!len_s[proc]) continue;
1721: i = owners_co[proc];
1722: PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
1723: k++;
1724: }
1726: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1727: PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
1728: PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
1729: PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
1730: PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
1731: PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
1732: PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
1733: PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
1734: PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
1735: PetscCall(MatProductSetFill(ptap->C_loc, fill));
1736: PetscCall(MatProductSetFromOptions(ptap->C_loc));
1737: PetscCall(MatProductSymbolic(ptap->C_loc));
1739: c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
1741: /* wait until the receives of coj are complete */
1742: for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1743: PetscCall(PetscFree(rwaits));
1744: if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
1746: /* add received column indices into ta to update Crmax */
1747: for (k = 0; k < nrecv; k++) { /* k-th received message */
1748: Jptr = buf_rj[k];
1749: for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
1750: }
1751: PetscCall(PetscHMapIGetSize(ta, &Crmax));
1752: PetscCall(PetscHMapIDestroy(&ta));
1754: /* (4) send and recv coi */
1755: PetscCall(PetscCommGetNewTag(comm, &tagi));
1756: PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
1757: PetscCall(PetscMalloc1(len + 1, &buf_s));
1758: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1759: for (proc = 0, k = 0; proc < size; proc++) {
1760: if (!len_s[proc]) continue;
1761: /* form outgoing message for i-structure:
1762: buf_si[0]: nrows to be sent
1763: [1:nrows]: row index (global)
1764: [nrows+1:2*nrows+1]: i-structure index
1765: */
1766: nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
1767: buf_si_i = buf_si + nrows + 1;
1768: buf_si[0] = nrows;
1769: buf_si_i[0] = 0;
1770: nrows = 0;
1771: for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
1772: nzi = coi[i + 1] - coi[i];
1773: buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */
1774: buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */
1775: nrows++;
1776: }
1777: PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
1778: k++;
1779: buf_si += len_si[proc];
1780: }
1781: for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
1782: PetscCall(PetscFree(rwaits));
1783: if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));
1785: PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
1786: PetscCall(PetscFree(len_ri));
1787: PetscCall(PetscFree(swaits));
1788: PetscCall(PetscFree(buf_s));
1790: /* (5) compute the local portion of Cmpi */
1791: /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
1792: PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
1793: current_space = free_space;
1795: PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
1796: for (k = 0; k < nrecv; k++) {
1797: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1798: nrows = *buf_ri_k[k];
1799: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
1800: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1801: }
1803: MatPreallocateBegin(comm, pn, pn, dnz, onz);
1804: PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
1805: for (i = 0; i < pn; i++) {
1806: /* add C_loc into Cmpi */
1807: nzi = c_loc->i[i + 1] - c_loc->i[i];
1808: Jptr = c_loc->j + c_loc->i[i];
1809: PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1811: /* add received col data into lnk */
1812: for (k = 0; k < nrecv; k++) { /* k-th received message */
1813: if (i == *nextrow[k]) { /* i-th row */
1814: nzi = *(nextci[k] + 1) - *nextci[k];
1815: Jptr = buf_rj[k] + *nextci[k];
1816: PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
1817: nextrow[k]++;
1818: nextci[k]++;
1819: }
1820: }
1821: nzi = lnk[0];
1823: /* copy data into free space, then initialize lnk */
1824: PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
1825: PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
1826: }
1827: PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
1828: PetscCall(PetscLLDestroy(lnk, lnkbt));
1829: PetscCall(PetscFreeSpaceDestroy(free_space));
1831: /* local sizes and preallocation */
1832: PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
1833: if (P->cmap->bs > 0) {
1834: PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
1835: PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
1836: }
1837: PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
1838: MatPreallocateEnd(dnz, onz);
1840: /* members in merge */
1841: PetscCall(PetscFree(id_r));
1842: PetscCall(PetscFree(len_r));
1843: PetscCall(PetscFree(buf_ri[0]));
1844: PetscCall(PetscFree(buf_ri));
1845: PetscCall(PetscFree(buf_rj[0]));
1846: PetscCall(PetscFree(buf_rj));
1847: PetscCall(PetscLayoutDestroy(&rowmap));
1849: PetscCall(PetscCalloc1(pN, &ptap->apa));
1851: /* attach the supporting struct to Cmpi for reuse */
1852: Cmpi->product->data = ptap;
1853: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1854: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1856: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1857: Cmpi->assembled = PETSC_FALSE;
1858: PetscFunctionReturn(PETSC_SUCCESS);
1859: }
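/* Caller-side sketch of the symbolic/numeric split used throughout this file (a minimal example;
   assumes A and P are assembled MPIAIJ matrices and 2.0 is only an example fill estimate; which
   of the symbolic routines in this file is reached depends on the selected product algorithm):

     Mat C;
     PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, 2.0, &C)); // symbolic pass + numeric pass
     // ... update the numerical values of A, keeping its nonzero pattern ...
     PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, 2.0, &C));   // numeric pass only, reusing C->product->data
*/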
1861: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
1862: {
1863: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
1864: Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
1865: Mat_SeqAIJ *ap, *p_loc, *p_oth = NULL, *c_seq;
1866: Mat_APMPI *ptap;
1867: Mat AP_loc, C_loc, C_oth;
1868: PetscInt i, rstart, rend, cm, ncols, row;
1869: PetscInt *api, *apj, am = A->rmap->n, j, col, apnz;
1870: PetscScalar *apa;
1871: const PetscInt *cols;
1872: const PetscScalar *vals;
1874: PetscFunctionBegin;
1875: MatCheckProduct(C, 3);
1876: ptap = (Mat_APMPI *)C->product->data;
1877: PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
1878: PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");
1880: PetscCall(MatZeroEntries(C));
1881: /* 1) get R = Pd^T,Ro = Po^T */
1882: if (ptap->reuse == MAT_REUSE_MATRIX) {
1883: PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
1884: PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
1885: }
1887: /* 2) get AP_loc */
1888: AP_loc = ptap->AP_loc;
1889: ap = (Mat_SeqAIJ *)AP_loc->data;
1891: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
1892: if (ptap->reuse == MAT_REUSE_MATRIX) {
1893: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1894: PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
1895: PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
1896: }
1898: /* 2-2) compute numeric A_loc*P - dominating part */
1899: /* get data from symbolic products */
1900: p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
1901: if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
1902: apa = ptap->apa;
1903: api = ap->i;
1904: apj = ap->j;
1905: for (i = 0; i < am; i++) {
1906: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1907: AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
1908: apnz = api[i + 1] - api[i];
1909: for (j = 0; j < apnz; j++) {
1910: col = apj[j + api[i]];
1911: ap->a[j + ap->i[i]] = apa[col];
1912: apa[col] = 0.0;
1913: }
1914: }
1915: /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
1916: PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));
1918: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1919: PetscCall(MatProductNumeric(ptap->C_loc));
1920: PetscCall(MatProductNumeric(ptap->C_oth));
1921: C_loc = ptap->C_loc;
1922: C_oth = ptap->C_oth;
1924: /* add C_loc and C_oth to C */
1925: PetscCall(MatGetOwnershipRange(C, &rstart, &rend));
1927: /* C_loc -> C */
1928: cm = C_loc->rmap->N;
1929: c_seq = (Mat_SeqAIJ *)C_loc->data;
1930: cols = c_seq->j;
1931: vals = c_seq->a;
1933: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1934: /* when there are no off-processor parts. */
1935: /* If was_assembled is true, the assumption behind the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1936: /* no longer holds. In that case the more general MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1937: /* a table and additional bookkeeping is performed. */
1938: if (C->assembled) {
1939: C->was_assembled = PETSC_TRUE;
1940: C->assembled = PETSC_FALSE;
1941: }
1942: if (C->was_assembled) {
1943: for (i = 0; i < cm; i++) {
1944: ncols = c_seq->i[i + 1] - c_seq->i[i];
1945: row = rstart + i;
1946: PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1947: cols += ncols;
1948: vals += ncols;
1949: }
1950: } else {
1951: PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
1952: }
1954: /* Co -> C, off-processor part */
1955: cm = C_oth->rmap->N;
1956: c_seq = (Mat_SeqAIJ *)C_oth->data;
1957: cols = c_seq->j;
1958: vals = c_seq->a;
1959: for (i = 0; i < cm; i++) {
1960: ncols = c_seq->i[i + 1] - c_seq->i[i];
1961: row = p->garray[i];
1962: PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
1963: cols += ncols;
1964: vals += ncols;
1965: }
1967: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1968: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1970: ptap->reuse = MAT_REUSE_MATRIX;
1971: PetscFunctionReturn(PETSC_SUCCESS);
1972: }
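/* Setting ptap->reuse = MAT_REUSE_MATRIX above means that on subsequent numeric calls the
   transposes Rd, Ro and the gathered P_loc, P_oth are refreshed in place rather than rebuilt. */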
1974: PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
1975: {
1976: Mat_Product *product = C->product;
1977: Mat A = product->A, P = product->B;
1978: MatProductAlgorithm alg = product->alg;
1979: PetscReal fill = product->fill;
1980: PetscBool flg;
1982: PetscFunctionBegin;
1983: /* scalable: do R=P^T locally, then C=R*A*P */
1984: PetscCall(PetscStrcmp(alg, "scalable", &flg));
1985: if (flg) {
1986: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
1987: C->ops->productnumeric = MatProductNumeric_PtAP;
1988: goto next;
1989: }
1991: /* nonscalable: do R=P^T locally, then C=R*A*P */
1992: PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
1993: if (flg) {
1994: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
1995: goto next;
1996: }
1998: /* allatonce */
1999: PetscCall(PetscStrcmp(alg, "allatonce", &flg));
2000: if (flg) {
2001: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
2002: goto next;
2003: }
2005: /* allatonce_merged */
2006: PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
2007: if (flg) {
2008: PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
2009: goto next;
2010: }
2012: /* backend general code */
2013: PetscCall(PetscStrcmp(alg, "backend", &flg));
2014: if (flg) {
2015: PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
2016: PetscFunctionReturn(PETSC_SUCCESS);
2017: }
2019: /* hypre */
2020: #if defined(PETSC_HAVE_HYPRE)
2021: PetscCall(PetscStrcmp(alg, "hypre", &flg));
2022: if (flg) {
2023: PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
2024: C->ops->productnumeric = MatProductNumeric_PtAP;
2025: PetscFunctionReturn(PETSC_SUCCESS);
2026: }
2027: #endif
2028: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
2030: next:
2031: C->ops->productnumeric = MatProductNumeric_PtAP;
2032: PetscFunctionReturn(PETSC_SUCCESS);
2033: }
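/* Sketch of reaching this dispatcher through the MatProduct API (a minimal example; the
   algorithm string must be one of those tested above, e.g. "scalable"):

     Mat C;
     PetscCall(MatProductCreate(A, P, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_PtAP));
     PetscCall(MatProductSetAlgorithm(C, "scalable"));
     PetscCall(MatProductSetFill(C, PETSC_DEFAULT));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C)); // dispatched here for MPIAIJ A and P
     PetscCall(MatProductNumeric(C));
*/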