Actual source code: gs.c
2: /***********************************gs.c***************************************
4: Author: Henry M. Tufo III
6: e-mail: hmt@cs.brown.edu
8: snail-mail:
9: Division of Applied Mathematics
10: Brown University
11: Providence, RI 02912
13: Last Modification:
14: 6.21.97
15: ************************************gs.c**************************************/
17: /***********************************gs.c***************************************
18: File Description:
19: -----------------
21: ************************************gs.c**************************************/
23: #include <../src/ksp/pc/impls/tfs/tfs.h>
25: /* default length of number of items via tree - doubles if exceeded */
26: #define TREE_BUF_SZ 2048;
27: #define GS_VEC_SZ 1
29: /***********************************gs.c***************************************
30: Type: struct gather_scatter_id
31: ------------------------------
33: ************************************gs.c**************************************/
34: typedef struct gather_scatter_id {
35: PetscInt id;
36: PetscInt nel_min;
37: PetscInt nel_max;
38: PetscInt nel_sum;
39: PetscInt negl;
40: PetscInt gl_max;
41: PetscInt gl_min;
42: PetscInt repeats;
43: PetscInt ordered;
44: PetscInt positive;
45: PetscScalar *vals;
47: /* bit mask info */
48: PetscInt *my_proc_mask;
49: PetscInt mask_sz;
50: PetscInt *ngh_buf;
51: PetscInt ngh_buf_sz;
52: PetscInt *nghs;
53: PetscInt num_nghs;
54: PetscInt max_nghs;
55: PetscInt *pw_nghs;
56: PetscInt num_pw_nghs;
57: PetscInt *tree_nghs;
58: PetscInt num_tree_nghs;
60: PetscInt num_loads;
62: /* repeats == true -> local info */
63: PetscInt nel; /* number of unique elememts */
64: PetscInt *elms; /* of size nel */
65: PetscInt nel_total;
66: PetscInt *local_elms; /* of size nel_total */
67: PetscInt *companion; /* of size nel_total */
69: /* local info */
70: PetscInt num_local_total;
71: PetscInt local_strength;
72: PetscInt num_local;
73: PetscInt *num_local_reduce;
74: PetscInt **local_reduce;
75: PetscInt num_local_gop;
76: PetscInt *num_gop_local_reduce;
77: PetscInt **gop_local_reduce;
79: /* pairwise info */
80: PetscInt level;
81: PetscInt num_pairs;
82: PetscInt max_pairs;
83: PetscInt loc_node_pairs;
84: PetscInt max_node_pairs;
85: PetscInt min_node_pairs;
86: PetscInt avg_node_pairs;
87: PetscInt *pair_list;
88: PetscInt *msg_sizes;
89: PetscInt **node_list;
90: PetscInt len_pw_list;
91: PetscInt *pw_elm_list;
92: PetscScalar *pw_vals;
94: MPI_Request *msg_ids_in;
95: MPI_Request *msg_ids_out;
97: PetscScalar *out;
98: PetscScalar *in;
99: PetscInt msg_total;
101: /* tree - crystal accumulator info */
102: PetscInt max_left_over;
103: PetscInt *pre;
104: PetscInt *in_num;
105: PetscInt *out_num;
106: PetscInt **in_list;
107: PetscInt **out_list;
109: /* new tree work*/
110: PetscInt tree_nel;
111: PetscInt *tree_elms;
112: PetscScalar *tree_buf;
113: PetscScalar *tree_work;
115: PetscInt tree_map_sz;
116: PetscInt *tree_map_in;
117: PetscInt *tree_map_out;
119: /* current memory status */
120: PetscInt gl_bss_min;
121: PetscInt gl_perm_min;
123: /* max segment size for PCTFS_gs_gop_vec() */
124: PetscInt vec_sz;
126: /* hack to make paul happy */
127: MPI_Comm PCTFS_gs_comm;
129: } PCTFS_gs_id;
131: static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
132: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
133: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
134: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
135: static PCTFS_gs_id *gsi_new(void);
136: static PetscErrorCode set_tree(PCTFS_gs_id *gs);
138: /* same for all but vector flavor */
139: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
140: /* vector flavor */
141: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
143: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
145: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
149: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
152: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
154: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
156: /* global vars */
157: /* from comm.c module */
159: static PetscInt num_gs_ids = 0;
161: /* should make this dynamic ... later */
162: static PetscInt msg_buf = MAX_MSG_BUF;
163: static PetscInt vec_sz = GS_VEC_SZ;
164: static PetscInt *tree_buf = NULL;
165: static PetscInt tree_buf_sz = 0;
166: static PetscInt ntree = 0;
168: /***************************************************************************/
169: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
170: {
171: PetscFunctionBegin;
172: vec_sz = size;
173: PetscFunctionReturn(PETSC_SUCCESS);
174: }
176: /******************************************************************************/
177: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
178: {
179: PetscFunctionBegin;
180: msg_buf = buf_size;
181: PetscFunctionReturn(PETSC_SUCCESS);
182: }
184: /******************************************************************************/
185: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
186: {
187: PCTFS_gs_id *gs;
188: MPI_Group PCTFS_gs_group;
189: MPI_Comm PCTFS_gs_comm;
191: /* ensure that communication package has been initialized */
192: PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init());
194: /* determines if we have enough dynamic/semi-static memory */
195: /* checks input, allocs and sets gd_id template */
196: gs = gsi_check_args(elms, nel, level);
198: /* only bit mask version up and working for the moment */
199: /* LATER :: get int list version working for sparse pblms */
200: PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));
202: PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
203: PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS);
204: PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
206: gs->PCTFS_gs_comm = PCTFS_gs_comm;
208: return (gs);
209: }
211: /******************************************************************************/
212: static PCTFS_gs_id *gsi_new(void)
213: {
214: PCTFS_gs_id *gs;
215: gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
216: PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
217: return (gs);
218: }
220: /******************************************************************************/
221: static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
222: {
223: PetscInt i, j, k, t2;
224: PetscInt *companion, *elms, *unique, *iptr;
225: PetscInt num_local = 0, *num_to_reduce, **local_reduce;
226: PetscInt oprs[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
227: PetscInt vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
228: PetscInt work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
229: PCTFS_gs_id *gs;
231: if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
232: if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");
234: if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));
236: /* get space for gs template */
237: gs = gsi_new();
238: gs->id = ++num_gs_ids;
240: /* hmt 6.4.99 */
241: /* caller can set global ids that don't participate to 0 */
242: /* PCTFS_gs_init ignores all zeros in elm list */
243: /* negative global ids are still invalid */
244: for (i = j = 0; i < nel; i++) {
245: if (in_elms[i] != 0) j++;
246: }
248: k = nel;
249: nel = j;
251: /* copy over in_elms list and create inverse map */
252: elms = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
253: companion = (PetscInt *)malloc(nel * sizeof(PetscInt));
255: for (i = j = 0; i < k; i++) {
256: if (in_elms[i] != 0) {
257: elms[j] = in_elms[i];
258: companion[j++] = i;
259: }
260: }
262: if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");
264: /* pre-pass ... check to see if sorted */
265: elms[nel] = INT_MAX;
266: iptr = elms;
267: unique = elms + 1;
268: j = 0;
269: while (*iptr != INT_MAX) {
270: if (*iptr++ > *unique++) {
271: j = 1;
272: break;
273: }
274: }
276: /* set up inverse map */
277: if (j) {
278: PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
279: PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
280: } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
281: elms[nel] = INT_MIN;
283: /* first pass */
284: /* determine number of unique elements, check pd */
285: for (i = k = 0; i < nel; i += j) {
286: t2 = elms[i];
287: j = ++i;
289: /* clump 'em for now */
290: while (elms[j] == t2) j++;
292: /* how many together and num local */
293: if (j -= i) {
294: num_local++;
295: k += j;
296: }
297: }
299: /* how many unique elements? */
300: gs->repeats = k;
301: gs->nel = nel - k;
303: /* number of repeats? */
304: gs->num_local = num_local;
305: num_local += 2;
306: gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
307: gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));
309: unique = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
310: gs->elms = unique;
311: gs->nel_total = nel;
312: gs->local_elms = elms;
313: gs->companion = companion;
315: /* compess map as well as keep track of local ops */
316: for (num_local = i = j = 0; i < gs->nel; i++) {
317: k = j;
318: t2 = unique[i] = elms[j];
319: companion[i] = companion[j];
321: while (elms[j] == t2) j++;
323: if ((t2 = (j - k)) > 1) {
324: /* number together */
325: num_to_reduce[num_local] = t2++;
327: iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));
329: /* to use binary searching don't remap until we check intersection */
330: *iptr++ = i;
332: /* note that we're skipping the first one */
333: while (++k < j) *(iptr++) = companion[k];
334: *iptr = -1;
335: }
336: }
338: /* sentinel for ngh_buf */
339: unique[gs->nel] = INT_MAX;
341: /* for two partition sort hack */
342: num_to_reduce[num_local] = 0;
343: local_reduce[num_local] = NULL;
344: num_to_reduce[++num_local] = 0;
345: local_reduce[num_local] = NULL;
347: /* load 'em up */
348: /* note one extra to hold NON_UNIFORM flag!!! */
349: vals[2] = vals[1] = vals[0] = nel;
350: if (gs->nel > 0) {
351: vals[3] = unique[0];
352: vals[4] = unique[gs->nel - 1];
353: } else {
354: vals[3] = INT_MAX;
355: vals[4] = INT_MIN;
356: }
357: vals[5] = level;
358: vals[6] = num_gs_ids;
360: /* GLOBAL: send 'em out */
361: PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));
363: /* must be semi-pos def - only pairwise depends on this */
364: /* LATER - remove this restriction */
365: if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
366: if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");
368: gs->nel_min = vals[0];
369: gs->nel_max = vals[1];
370: gs->nel_sum = vals[2];
371: gs->gl_min = vals[3];
372: gs->gl_max = vals[4];
373: gs->negl = vals[4] - vals[3] + 1;
375: if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);
377: /* LATER :: add level == -1 -> program selects level */
378: if (vals[5] < 0) vals[5] = 0;
379: else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
380: gs->level = vals[5];
382: return (gs);
383: }
385: /******************************************************************************/
386: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
387: {
388: PetscInt i, nel, *elms;
389: PetscInt t1;
390: PetscInt **reduce;
391: PetscInt *map;
393: PetscFunctionBegin;
394: /* totally local removes ... PCTFS_ct_bits == 0 */
395: PetscCall(get_ngh_buf(gs));
397: if (gs->level) PetscCall(set_pairwise(gs));
398: if (gs->max_left_over) PetscCall(set_tree(gs));
400: /* intersection local and pairwise/tree? */
401: gs->num_local_total = gs->num_local;
402: gs->gop_local_reduce = gs->local_reduce;
403: gs->num_gop_local_reduce = gs->num_local_reduce;
405: map = gs->companion;
407: /* is there any local compression */
408: if (!gs->num_local) {
409: gs->local_strength = NONE;
410: gs->num_local_gop = 0;
411: } else {
412: /* ok find intersection */
413: map = gs->companion;
414: reduce = gs->local_reduce;
415: for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
416: if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
417: t1++;
418: PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
419: gs->num_local_reduce[i] *= -1;
420: }
421: **reduce = map[**reduce];
422: }
424: /* intersection is empty */
425: if (!t1) {
426: gs->local_strength = FULL;
427: gs->num_local_gop = 0;
428: } else { /* intersection not empty */
429: gs->local_strength = PARTIAL;
431: PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));
433: gs->num_local_gop = t1;
434: gs->num_local_total = gs->num_local;
435: gs->num_local -= t1;
436: gs->gop_local_reduce = gs->local_reduce;
437: gs->num_gop_local_reduce = gs->num_local_reduce;
439: for (i = 0; i < t1; i++) {
440: PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
441: gs->num_gop_local_reduce[i] *= -1;
442: gs->local_reduce++;
443: gs->num_local_reduce++;
444: }
445: gs->local_reduce++;
446: gs->num_local_reduce++;
447: }
448: }
450: elms = gs->pw_elm_list;
451: nel = gs->len_pw_list;
452: for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
454: elms = gs->tree_map_in;
455: nel = gs->tree_map_sz;
456: for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
458: /* clean up */
459: free((void *)gs->local_elms);
460: free((void *)gs->companion);
461: free((void *)gs->elms);
462: free((void *)gs->ngh_buf);
463: gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
464: PetscFunctionReturn(PETSC_SUCCESS);
465: }
467: /******************************************************************************/
468: static PetscErrorCode place_in_tree(PetscInt elm)
469: {
470: PetscInt *tp, n;
472: PetscFunctionBegin;
473: if (ntree == tree_buf_sz) {
474: if (tree_buf_sz) {
475: tp = tree_buf;
476: n = tree_buf_sz;
477: tree_buf_sz <<= 1;
478: tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
479: PCTFS_ivec_copy(tree_buf, tp, n);
480: free(tp);
481: } else {
482: tree_buf_sz = TREE_BUF_SZ;
483: tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
484: }
485: }
487: tree_buf[ntree++] = elm;
488: PetscFunctionReturn(PETSC_SUCCESS);
489: }
491: /******************************************************************************/
492: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
493: {
494: PetscInt i, j, npw = 0, ntree_map = 0;
495: PetscInt p_mask_size, ngh_buf_size, buf_size;
496: PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
497: PetscInt *ngh_buf, *buf1, *buf2;
498: PetscInt offset, per_load, num_loads, or_ct, start, end;
499: PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
500: PetscInt oper = GL_B_OR;
501: PetscInt *ptr3, *t_mask, level, ct1, ct2;
503: PetscFunctionBegin;
504: /* to make life easier */
505: nel = gs->nel;
506: elms = gs->elms;
507: level = gs->level;
509: /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
510: p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
511: PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
513: /* allocate space for masks and info bufs */
514: gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
515: gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
516: gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
517: t_mask = (PetscInt *)malloc(p_mask_size);
518: gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);
520: /* comm buffer size ... memory usage bounded by ~2*msg_buf */
521: /* had thought I could exploit rendezvous threshold */
523: /* default is one pass */
524: per_load = negl = gs->negl;
525: gs->num_loads = num_loads = 1;
526: i = p_mask_size * negl;
528: /* possible overflow on buffer size */
529: /* overflow hack */
530: if (i < 0) i = INT_MAX;
532: buf_size = PetscMin(msg_buf, i);
534: /* can we do it? */
535: PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);
537: /* get PCTFS_giop buf space ... make *only* one malloc */
538: buf1 = (PetscInt *)malloc(buf_size << 1);
540: /* more than one gior exchange needed? */
541: if (buf_size != i) {
542: per_load = buf_size / p_mask_size;
543: buf_size = per_load * p_mask_size;
544: gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
545: }
547: /* convert buf sizes from #bytes to #ints - 32-bit only! */
548: p_mask_size /= sizeof(PetscInt);
549: ngh_buf_size /= sizeof(PetscInt);
550: buf_size /= sizeof(PetscInt);
552: /* find PCTFS_giop work space */
553: buf2 = buf1 + buf_size;
555: /* hold #ints needed for processor masks */
556: gs->mask_sz = p_mask_size;
558: /* init buffers */
559: PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
560: PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
561: PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));
563: /* HACK reset tree info */
564: tree_buf = NULL;
565: tree_buf_sz = ntree = 0;
567: /* ok do it */
568: for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
569: /* identity for bitwise or is 000...000 */
570: PetscCall(PCTFS_ivec_zero(buf1, buf_size));
572: /* load msg buffer */
573: for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
574: offset = (offset - start) * p_mask_size;
575: PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
576: }
578: /* GLOBAL: pass buffer */
579: PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));
581: /* unload buffer into ngh_buf */
582: ptr2 = (elms + i_start);
583: for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
584: /* I own it ... may have to pairwise it */
585: if (j == *ptr2) {
586: /* do i share it w/anyone? */
587: ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
588: /* guess not */
589: if (ct1 < 2) {
590: ptr2++;
591: ptr1 += p_mask_size;
592: continue;
593: }
595: /* i do ... so keep info and turn off my bit */
596: PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
597: PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
598: PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));
600: /* is it to be done pairwise? */
601: if (--ct1 <= level) {
602: npw++;
604: /* turn on high bit to indicate pw need to process */
605: *ptr2++ |= TOP_BIT;
606: PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
607: ptr1 += p_mask_size;
608: continue;
609: }
611: /* get set for next and note that I have a tree contribution */
612: /* could save exact elm index for tree here -> save a search */
613: ptr2++;
614: ptr1 += p_mask_size;
615: ntree_map++;
616: } else { /* i don't but still might be involved in tree */
618: /* shared by how many? */
619: ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
621: /* none! */
622: if (ct1 < 2) continue;
624: /* is it going to be done pairwise? but not by me of course!*/
625: if (--ct1 <= level) continue;
626: }
627: /* LATER we're going to have to process it NOW */
628: /* nope ... tree it */
629: PetscCall(place_in_tree(j));
630: }
631: }
633: free((void *)t_mask);
634: free((void *)buf1);
636: gs->len_pw_list = npw;
637: gs->num_nghs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
639: /* expand from bit mask list to int list and save ngh list */
640: gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
641: PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));
643: gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));
645: oper = GL_MAX;
646: ct1 = gs->num_nghs;
647: PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
648: gs->max_nghs = ct1;
650: gs->tree_map_sz = ntree_map;
651: gs->max_left_over = ntree;
653: free((void *)p_mask);
654: free((void *)sh_proc_mask);
655: PetscFunctionReturn(PETSC_SUCCESS);
656: }
658: /******************************************************************************/
659: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
660: {
661: PetscInt i, j;
662: PetscInt p_mask_size;
663: PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
664: PetscInt *ngh_buf, *buf2;
665: PetscInt offset;
666: PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
667: PetscInt *pairwise_elm_list, len_pair_list = 0;
668: PetscInt *iptr, t1, i_start, nel, *elms;
669: PetscInt ct;
671: PetscFunctionBegin;
672: /* to make life easier */
673: nel = gs->nel;
674: elms = gs->elms;
675: ngh_buf = gs->ngh_buf;
676: sh_proc_mask = gs->pw_nghs;
678: /* need a few temp masks */
679: p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes);
680: p_mask = (PetscInt *)malloc(p_mask_size);
681: tmp_proc_mask = (PetscInt *)malloc(p_mask_size);
683: /* set mask to my PCTFS_my_id's bit mask */
684: PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
686: p_mask_size /= sizeof(PetscInt);
688: len_pair_list = gs->len_pw_list;
689: gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));
691: /* how many processors (nghs) do we have to exchange with? */
692: nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
694: /* allocate space for PCTFS_gs_gop() info */
695: gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
696: gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
697: gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));
699: /* init msg_size list */
700: PetscCall(PCTFS_ivec_zero(msg_size, nprs));
702: /* expand from bit mask list to int list */
703: PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));
705: /* keep list of elements being handled pairwise */
706: for (i = j = 0; i < nel; i++) {
707: if (elms[i] & TOP_BIT) {
708: elms[i] ^= TOP_BIT;
709: pairwise_elm_list[j++] = i;
710: }
711: }
712: pairwise_elm_list[j] = -1;
714: gs->msg_ids_out = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
715: gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
716: gs->msg_ids_in = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
717: gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
718: gs->pw_vals = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);
720: /* find who goes to each processor */
721: for (i_start = i = 0; i < nprs; i++) {
722: /* processor i's mask */
723: PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));
725: /* det # going to processor i */
726: for (ct = j = 0; j < len_pair_list; j++) {
727: buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
728: PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
729: if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
730: }
731: msg_size[i] = ct;
732: i_start = PetscMax(i_start, ct);
734: /*space to hold nodes in message to first neighbor */
735: msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));
737: for (j = 0; j < len_pair_list; j++) {
738: buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
739: PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
740: if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
741: }
742: *iptr = -1;
743: }
744: msg_nodes[nprs] = NULL;
746: j = gs->loc_node_pairs = i_start;
747: t1 = GL_MAX;
748: PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
749: gs->max_node_pairs = i_start;
751: i_start = j;
752: t1 = GL_MIN;
753: PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
754: gs->min_node_pairs = i_start;
756: i_start = j;
757: t1 = GL_ADD;
758: PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
759: gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;
761: i_start = nprs;
762: t1 = GL_MAX;
763: PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
764: gs->max_pairs = i_start;
766: /* remap pairwise in tail of gsi_via_bit_mask() */
767: gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
768: gs->out = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
769: gs->in = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
771: /* reset malloc pool */
772: free((void *)p_mask);
773: free((void *)tmp_proc_mask);
774: PetscFunctionReturn(PETSC_SUCCESS);
775: }
777: /* to do pruned tree just save ngh buf copy for each one and decode here!
778: ******************************************************************************/
779: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
780: {
781: PetscInt i, j, n, nel;
782: PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
784: PetscFunctionBegin;
785: /* local work ptrs */
786: elms = gs->elms;
787: nel = gs->nel;
789: /* how many via tree */
790: gs->tree_nel = n = ntree;
791: gs->tree_elms = tree_elms = iptr_in = tree_buf;
792: gs->tree_buf = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
793: gs->tree_work = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
794: j = gs->tree_map_sz;
795: gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
796: gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
798: /* search the longer of the two lists */
799: /* note ... could save this info in get_ngh_buf and save searches */
800: if (n <= nel) {
801: /* bijective fct w/remap - search elm list */
802: for (i = 0; i < n; i++) {
803: if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
804: *iptr_in++ = j;
805: *iptr_out++ = i;
806: }
807: }
808: } else {
809: for (i = 0; i < nel; i++) {
810: if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
811: *iptr_in++ = i;
812: *iptr_out++ = j;
813: }
814: }
815: }
817: /* sentinel */
818: *iptr_in = *iptr_out = -1;
819: PetscFunctionReturn(PETSC_SUCCESS);
820: }
822: /******************************************************************************/
823: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
824: {
825: PetscInt *num, *map, **reduce;
826: PetscScalar tmp;
828: PetscFunctionBegin;
829: num = gs->num_gop_local_reduce;
830: reduce = gs->gop_local_reduce;
831: while ((map = *reduce++)) {
832: /* wall */
833: if (*num == 2) {
834: num++;
835: vals[map[1]] = vals[map[0]];
836: } else if (*num == 3) { /* corner shared by three elements */
837: num++;
838: vals[map[2]] = vals[map[1]] = vals[map[0]];
839: } else if (*num == 4) { /* corner shared by four elements */
840: num++;
841: vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
842: } else { /* general case ... odd geoms ... 3D*/
843: num++;
844: tmp = *(vals + *map++);
845: while (*map >= 0) *(vals + *map++) = tmp;
846: }
847: }
848: PetscFunctionReturn(PETSC_SUCCESS);
849: }
851: /******************************************************************************/
852: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
853: {
854: PetscInt *num, *map, **reduce;
855: PetscScalar tmp;
857: PetscFunctionBegin;
858: num = gs->num_local_reduce;
859: reduce = gs->local_reduce;
860: while ((map = *reduce)) {
861: /* wall */
862: if (*num == 2) {
863: num++;
864: reduce++;
865: vals[map[1]] = vals[map[0]] += vals[map[1]];
866: } else if (*num == 3) { /* corner shared by three elements */
867: num++;
868: reduce++;
869: vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
870: } else if (*num == 4) { /* corner shared by four elements */
871: num++;
872: reduce++;
873: vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
874: } else { /* general case ... odd geoms ... 3D*/
875: num++;
876: tmp = 0.0;
877: while (*map >= 0) tmp += *(vals + *map++);
879: map = *reduce++;
880: while (*map >= 0) *(vals + *map++) = tmp;
881: }
882: }
883: PetscFunctionReturn(PETSC_SUCCESS);
884: }
886: /******************************************************************************/
887: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
888: {
889: PetscInt *num, *map, **reduce;
890: PetscScalar *base;
892: PetscFunctionBegin;
893: num = gs->num_gop_local_reduce;
894: reduce = gs->gop_local_reduce;
895: while ((map = *reduce++)) {
896: /* wall */
897: if (*num == 2) {
898: num++;
899: vals[map[0]] += vals[map[1]];
900: } else if (*num == 3) { /* corner shared by three elements */
901: num++;
902: vals[map[0]] += (vals[map[1]] + vals[map[2]]);
903: } else if (*num == 4) { /* corner shared by four elements */
904: num++;
905: vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
906: } else { /* general case ... odd geoms ... 3D*/
907: num++;
908: base = vals + *map++;
909: while (*map >= 0) *base += *(vals + *map++);
910: }
911: }
912: PetscFunctionReturn(PETSC_SUCCESS);
913: }
915: /******************************************************************************/
916: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
917: {
918: PetscInt i;
920: PetscFunctionBegin;
921: PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
922: if (gs->nghs) free((void *)gs->nghs);
923: if (gs->pw_nghs) free((void *)gs->pw_nghs);
925: /* tree */
926: if (gs->max_left_over) {
927: if (gs->tree_elms) free((void *)gs->tree_elms);
928: if (gs->tree_buf) free((void *)gs->tree_buf);
929: if (gs->tree_work) free((void *)gs->tree_work);
930: if (gs->tree_map_in) free((void *)gs->tree_map_in);
931: if (gs->tree_map_out) free((void *)gs->tree_map_out);
932: }
934: /* pairwise info */
935: if (gs->num_pairs) {
936: /* should be NULL already */
937: if (gs->ngh_buf) free((void *)gs->ngh_buf);
938: if (gs->elms) free((void *)gs->elms);
939: if (gs->local_elms) free((void *)gs->local_elms);
940: if (gs->companion) free((void *)gs->companion);
942: /* only set if pairwise */
943: if (gs->vals) free((void *)gs->vals);
944: if (gs->in) free((void *)gs->in);
945: if (gs->out) free((void *)gs->out);
946: if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
947: if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
948: if (gs->pw_vals) free((void *)gs->pw_vals);
949: if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
950: if (gs->node_list) {
951: for (i = 0; i < gs->num_pairs; i++) {
952: if (gs->node_list[i]) free((void *)gs->node_list[i]);
953: }
954: free((void *)gs->node_list);
955: }
956: if (gs->msg_sizes) free((void *)gs->msg_sizes);
957: if (gs->pair_list) free((void *)gs->pair_list);
958: }
960: /* local info */
961: if (gs->num_local_total >= 0) {
962: for (i = 0; i < gs->num_local_total + 1; i++) {
963: if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
964: }
965: }
967: /* if intersection tree/pairwise and local isn't empty */
968: if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
969: if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);
971: free((void *)gs);
972: PetscFunctionReturn(PETSC_SUCCESS);
973: }
975: /******************************************************************************/
976: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
977: {
978: PetscFunctionBegin;
979: switch (*op) {
980: case '+':
981: PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
982: break;
983: default:
984: PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
985: PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
986: PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
987: break;
988: }
989: PetscFunctionReturn(PETSC_SUCCESS);
990: }
992: /******************************************************************************/
993: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
994: {
995: PetscFunctionBegin;
996: PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");
998: /* local only operations!!! */
999: if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step));
1001: /* if intersection tree/pairwise and local isn't empty */
1002: if (gs->num_local_gop) {
1003: PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step));
1005: /* pairwise */
1006: if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1008: /* tree */
1009: else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1011: PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step));
1012: } else { /* if intersection tree/pairwise and local is empty */
1013: /* pairwise */
1014: if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1016: /* tree */
1017: else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1018: }
1019: PetscFunctionReturn(PETSC_SUCCESS);
1020: }
1022: /******************************************************************************/
1023: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1024: {
1025: PetscInt *num, *map, **reduce;
1026: PetscScalar *base;
1028: PetscFunctionBegin;
1029: num = gs->num_local_reduce;
1030: reduce = gs->local_reduce;
1031: while ((map = *reduce)) {
1032: base = vals + map[0] * step;
1034: /* wall */
1035: if (*num == 2) {
1036: num++;
1037: reduce++;
1038: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1039: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1040: } else if (*num == 3) { /* corner shared by three elements */
1041: num++;
1042: reduce++;
1043: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1044: PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1045: PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1046: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1047: } else if (*num == 4) { /* corner shared by four elements */
1048: num++;
1049: reduce++;
1050: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1051: PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1052: PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1053: PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1054: PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1055: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1056: } else { /* general case ... odd geoms ... 3D */
1057: num++;
1058: while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1060: map = *reduce;
1061: while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1063: reduce++;
1064: }
1065: }
1066: PetscFunctionReturn(PETSC_SUCCESS);
1067: }
1069: /******************************************************************************/
1070: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1071: {
1072: PetscInt *num, *map, **reduce;
1073: PetscScalar *base;
1075: PetscFunctionBegin;
1076: num = gs->num_gop_local_reduce;
1077: reduce = gs->gop_local_reduce;
1078: while ((map = *reduce++)) {
1079: base = vals + map[0] * step;
1081: /* wall */
1082: if (*num == 2) {
1083: num++;
1084: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1085: } else if (*num == 3) { /* corner shared by three elements */
1086: num++;
1087: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1088: PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1089: } else if (*num == 4) { /* corner shared by four elements */
1090: num++;
1091: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1092: PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1093: PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1094: } else { /* general case ... odd geoms ... 3D*/
1095: num++;
1096: while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1097: }
1098: }
1099: PetscFunctionReturn(PETSC_SUCCESS);
1100: }
1102: /******************************************************************************/
1103: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1104: {
1105: PetscInt *num, *map, **reduce;
1106: PetscScalar *base;
1108: PetscFunctionBegin;
1109: num = gs->num_gop_local_reduce;
1110: reduce = gs->gop_local_reduce;
1111: while ((map = *reduce++)) {
1112: base = vals + map[0] * step;
1114: /* wall */
1115: if (*num == 2) {
1116: num++;
1117: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1118: } else if (*num == 3) { /* corner shared by three elements */
1119: num++;
1120: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1121: PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1122: } else if (*num == 4) { /* corner shared by four elements */
1123: num++;
1124: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1125: PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1126: PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1127: } else { /* general case ... odd geoms ... 3D*/
1128: num++;
1129: while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1130: }
1131: }
1132: PetscFunctionReturn(PETSC_SUCCESS);
1133: }
1135: /******************************************************************************/
1136: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1137: {
1138: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1139: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1140: PetscInt *pw, *list, *size, **nodes;
1141: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1142: MPI_Status status;
1143: PetscBLASInt i1 = 1, dstep;
1145: PetscFunctionBegin;
1146: /* strip and load s */
1147: msg_list = list = gs->pair_list;
1148: msg_size = size = gs->msg_sizes;
1149: msg_nodes = nodes = gs->node_list;
1150: iptr = pw = gs->pw_elm_list;
1151: dptr1 = dptr3 = gs->pw_vals;
1152: msg_ids_in = ids_in = gs->msg_ids_in;
1153: msg_ids_out = ids_out = gs->msg_ids_out;
1154: dptr2 = gs->out;
1155: in1 = in2 = gs->in;
1157: /* post the receives */
1158: /* msg_nodes=nodes; */
1159: do {
1160: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1161: second one *list and do list++ afterwards */
1162: PetscCallMPI(MPI_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1163: list++;
1164: msg_ids_in++;
1165: in1 += *size++ * step;
1166: } while (*++msg_nodes);
1167: msg_nodes = nodes;
1169: /* load gs values into in out gs buffers */
1170: while (*iptr >= 0) {
1171: PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step));
1172: dptr3 += step;
1173: iptr++;
1174: }
1176: /* load out buffers and post the sends */
1177: while ((iptr = *msg_nodes++)) {
1178: dptr3 = dptr2;
1179: while (*iptr >= 0) {
1180: PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step));
1181: dptr2 += step;
1182: iptr++;
1183: }
1184: PetscCallMPI(MPI_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1185: msg_size++;
1186: msg_list++;
1187: msg_ids_out++;
1188: }
1190: /* tree */
1191: if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step));
1193: /* process the received data */
1194: msg_nodes = nodes;
1195: while ((iptr = *nodes++)) {
1196: PetscScalar d1 = 1.0;
1198: /* Should I check the return value of MPI_Wait() or status? */
1199: /* Can this loop be replaced by a call to MPI_Waitall()? */
1200: PetscCallMPI(MPI_Wait(ids_in, &status));
1201: ids_in++;
1202: while (*iptr >= 0) {
1203: PetscCall(PetscBLASIntCast(step, &dstep));
1204: PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
1205: in2 += step;
1206: iptr++;
1207: }
1208: }
1210: /* replace vals */
1211: while (*pw >= 0) {
1212: PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step));
1213: dptr1 += step;
1214: pw++;
1215: }
1217: /* clear isend message handles */
1218: /* This changed for clarity though it could be the same */
1220: /* Should I check the return value of MPI_Wait() or status? */
1221: /* Can this loop be replaced by a call to MPI_Waitall()? */
1222: while (*msg_nodes++) {
1223: PetscCallMPI(MPI_Wait(ids_out, &status));
1224: ids_out++;
1225: }
1226: PetscFunctionReturn(PETSC_SUCCESS);
1227: }
1229: /******************************************************************************/
1230: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1231: {
1232: PetscInt size, *in, *out;
1233: PetscScalar *buf, *work;
1234: PetscInt op[] = {GL_ADD, 0};
1235: PetscBLASInt i1 = 1;
1236: PetscBLASInt dstep;
1238: PetscFunctionBegin;
1239: /* copy over to local variables */
1240: in = gs->tree_map_in;
1241: out = gs->tree_map_out;
1242: buf = gs->tree_buf;
1243: work = gs->tree_work;
1244: size = gs->tree_nel * step;
1246: /* zero out collection buffer */
1247: PetscCall(PCTFS_rvec_zero(buf, size));
1249: /* copy over my contributions */
1250: while (*in >= 0) {
1251: PetscCall(PetscBLASIntCast(step, &dstep));
1252: PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
1253: }
1255: /* perform fan in/out on full buffer */
1256: /* must change PCTFS_grop to handle the blas */
1257: PetscCall(PCTFS_grop(buf, work, size, op));
1259: /* reset */
1260: in = gs->tree_map_in;
1261: out = gs->tree_map_out;
1263: /* get the portion of the results I need */
1264: while (*in >= 0) {
1265: PetscCall(PetscBLASIntCast(step, &dstep));
1266: PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
1267: }
1268: PetscFunctionReturn(PETSC_SUCCESS);
1269: }
1271: /******************************************************************************/
1272: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1273: {
1274: PetscFunctionBegin;
1275: switch (*op) {
1276: case '+':
1277: PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1278: break;
1279: default:
1280: PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
1281: PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
1282: PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1283: break;
1284: }
1285: PetscFunctionReturn(PETSC_SUCCESS);
1286: }
1288: /******************************************************************************/
1289: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1290: {
1291: PetscFunctionBegin;
1292: /* if there's nothing to do return */
1293: if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS);
1295: /* can't do more dimensions then exist */
1296: dim = PetscMin(dim, PCTFS_i_log2_num_nodes);
1298: /* local only operations!!! */
1299: if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals));
1301: /* if intersection tree/pairwise and local isn't empty */
1302: if (gs->num_local_gop) {
1303: PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals));
1305: /* pairwise will do tree inside ... */
1306: if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */
1307: else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1309: PetscCall(PCTFS_gs_gop_local_out(gs, vals));
1310: } else { /* if intersection tree/pairwise and local is empty */
1311: /* pairwise will do tree inside */
1312: if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */
1313: else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1314: }
1315: PetscFunctionReturn(PETSC_SUCCESS);
1316: }
1318: /******************************************************************************/
1319: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1320: {
1321: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1322: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1323: PetscInt *pw, *list, *size, **nodes;
1324: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1325: MPI_Status status;
1326: PetscInt i, mask = 1;
1328: PetscFunctionBegin;
1329: for (i = 1; i < dim; i++) {
1330: mask <<= 1;
1331: mask++;
1332: }
1334: /* strip and load s */
1335: msg_list = list = gs->pair_list;
1336: msg_size = size = gs->msg_sizes;
1337: msg_nodes = nodes = gs->node_list;
1338: iptr = pw = gs->pw_elm_list;
1339: dptr1 = dptr3 = gs->pw_vals;
1340: msg_ids_in = ids_in = gs->msg_ids_in;
1341: msg_ids_out = ids_out = gs->msg_ids_out;
1342: dptr2 = gs->out;
1343: in1 = in2 = gs->in;
1345: /* post the receives */
1346: /* msg_nodes=nodes; */
1347: do {
1348: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1349: second one *list and do list++ afterwards */
1350: if ((PCTFS_my_id | mask) == (*list | mask)) {
1351: PetscCallMPI(MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1352: list++;
1353: msg_ids_in++;
1354: in1 += *size++;
1355: } else {
1356: list++;
1357: size++;
1358: }
1359: } while (*++msg_nodes);
1361: /* load gs values into in out gs buffers */
1362: while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1364: /* load out buffers and post the sends */
1365: msg_nodes = nodes;
1366: list = msg_list;
1367: while ((iptr = *msg_nodes++)) {
1368: if ((PCTFS_my_id | mask) == (*list | mask)) {
1369: dptr3 = dptr2;
1370: while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1371: /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1372: /* is msg_ids_out++ correct? */
1373: PetscCallMPI(MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1374: msg_size++;
1375: list++;
1376: msg_ids_out++;
1377: } else {
1378: list++;
1379: msg_size++;
1380: }
1381: }
1383: /* do the tree while we're waiting */
1384: if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim));
1386: /* process the received data */
1387: msg_nodes = nodes;
1388: list = msg_list;
1389: while ((iptr = *nodes++)) {
1390: if ((PCTFS_my_id | mask) == (*list | mask)) {
1391: /* Should I check the return value of MPI_Wait() or status? */
1392: /* Can this loop be replaced by a call to MPI_Waitall()? */
1393: PetscCallMPI(MPI_Wait(ids_in, &status));
1394: ids_in++;
1395: while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1396: }
1397: list++;
1398: }
1400: /* replace vals */
1401: while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1403: /* clear isend message handles */
1404: /* This changed for clarity though it could be the same */
1405: while (*msg_nodes++) {
1406: if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
1407: /* Should I check the return value of MPI_Wait() or status? */
1408: /* Can this loop be replaced by a call to MPI_Waitall()? */
1409: PetscCallMPI(MPI_Wait(ids_out, &status));
1410: ids_out++;
1411: }
1412: msg_list++;
1413: }
1414: PetscFunctionReturn(PETSC_SUCCESS);
1415: }
1417: /******************************************************************************/
1418: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1419: {
1420: PetscInt size;
1421: PetscInt *in, *out;
1422: PetscScalar *buf, *work;
1423: PetscInt op[] = {GL_ADD, 0};
1425: PetscFunctionBegin;
1426: in = gs->tree_map_in;
1427: out = gs->tree_map_out;
1428: buf = gs->tree_buf;
1429: work = gs->tree_work;
1430: size = gs->tree_nel;
1432: PetscCall(PCTFS_rvec_zero(buf, size));
1434: while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1436: in = gs->tree_map_in;
1437: out = gs->tree_map_out;
1439: PetscCall(PCTFS_grop_hc(buf, work, size, op, dim));
1441: while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1442: PetscFunctionReturn(PETSC_SUCCESS);
1443: }