Actual source code: xmllogevent.c

  1: /*************************************************************************************
  2:  *    M A R I T I M E  R E S E A R C H  I N S T I T U T E  N E T H E R L A N D S     *
  3:  *************************************************************************************
  4:  *    authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij                    *
  5:  *************************************************************************************
  6:  *    content: Support for nested PetscTimers                                        *
  7:  *************************************************************************************/
  8: #include <petsclog.h>
  9: #include <petsc/private/logimpl.h>
 10: #include <petsctime.h>
 11: #include <petscviewer.h>
 12: #include "../src/sys/logging/xmlviewer.h"

 14: #if defined(PETSC_USE_LOG)

 16: /*
 17:  * Support for nested PetscTimers
 18:  *
 19:  * PetscTimers keep track of a lot of useful information: Wall clock times,
 20:  * message passing statistics, flop counts.  Information about the nested structure
 21:  * of the timers is lost. Example:
 22:  *
 23:  * 7:30   Start: awake
 24:  * 7:30      Start: morning routine
 25:  * 7:40         Start: eat
 26:  * 7:49         Done:  eat
 27:  * 7:43      Done:  morning routine
 28:  * 8:15      Start: work
 29:  * 12:15        Start: eat
 30:  * 12:45        Done:  eat
 31:  * 16:00     Done:  work
 32:  * 16:30     Start: evening routine
 33:  * 18:30        Start: eat
 34:  * 19:15        Done:  eat
 35:  * 22:00     Done:  evening routine
 36:  * 22:00  Done:  awake
 37:  *
 38:  * Petsc timers provide the following timer results:
 39:  *
 40:  *    awake:              1 call    14:30 hours
 41:  *    morning routine:    1 call     0:13 hours
 42:  *    eat:                3 calls    1:24 hours
 43:  *    work:               1 call     7:45 hours
 44:  *    evening routine     1 call     5:30 hours
 45:  *
 46:  * Nested timers can be used to get the following table:
 47:  *
 48:  *   [1 call]: awake                14:30 hours
 49:  *   [1 call]:    morning routine         0:13 hours         ( 2 % of awake)
 50:  *   [1 call]:       eat                       0:09 hours         (69 % of morning routine)
 51:  *                   rest (morning routine)    0:04 hours         (31 % of morning routine)
 52:  *   [1 call]:    work                    7:45 hours         (53 % of awake)
 53:  *   [1 call]:       eat                       0:30 hours         ( 6 % of work)
 54:  *                   rest (work)               7:15 hours         (94 % of work)
 55:  *   [1 call]:    evening routine         5:30 hours         (38 % of awake)
 56:  *   [1 call]:       eat                       0:45 hours         (14 % of evening routine)
 57:  *                   rest (evening routine)    4:45 hours         (86 % of morning routine)
 58:  *
 59:  * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least,
 60:  * the nested timers make the stages unnecessary.
 61:  *
 62:  */

 64: /*
 65:  * Data structures for keeping track of nested timers:
 66:  *
 67:  *   nestedEvents: information about the timers that have actually been activated
 68:  *   dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
 69:  *
 70:  * The Default-timers are used to time the nested timers. Every nested timer corresponds to
 71:  * (one or more) default timers, where one of the default timers has the same event-id as the
 72:  * nested one.
 73:  *
 74:  * Because of the risk of confusion between nested timer ids and default timer ids, we
 75:  * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent
 76:  * only for default events. Also, all nested event variables are prepended with 'nst', and
 77:  * default timers with 'dft'.
 78:  */

 80:   #define DFT_ID_AWAKE -1

 82: typedef PetscLogEvent NestedEventId;
 83: typedef struct {
 84:   NestedEventId  nstEvent;         /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
 85:   int            nParents;         /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
 86:   PetscLogEvent *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
 87:   PetscLogEvent *dftEvents;        /* The default timers which represent the different 'instances' of this nested event */

 89:   PetscLogEvent *dftParents;      /* The default timers which were the dftParentActive when this nested event was started */
 90:   PetscLogEvent *dftEventsSorted; /* The default timers which represent the different 'instances' of this nested event */
 91: } PetscNestedEvent;

 93: static PetscLogEvent     dftParentActive        = DFT_ID_AWAKE;
 94: static int               nNestedEvents          = 0;
 95: static int               nNestedEventsAllocated = 0;
 96: static PetscNestedEvent *nestedEvents           = NULL;
 97: static PetscLogDouble    thresholdTime          = 0.01; /* initial value was 0.1 */

 99:   #define THRESHOLD (thresholdTime / 100.0 + 1e-12)

101: static PetscErrorCode       PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
102: static PetscErrorCode       PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
103: PETSC_INTERN PetscErrorCode PetscLogView_Nested(PetscViewer);
104: PETSC_INTERN PetscErrorCode PetscLogView_Flamegraph(PetscViewer);

106: /*@C
107:   PetscLogNestedBegin - Turns on nested logging of objects and events. This logs flop
108:   rates and object creation and should not slow programs down too much.

110:   Logically Collective over `PETSC_COMM_WORLD`

112:   Options Database Keys:
113: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

115:   Usage:
116: .vb
117:       PetscInitialize(...);
118:       PetscLogNestedBegin();
119:        ... code ...
120:       PetscLogView(viewer);
121:       PetscFinalize();
122: .ve

124:   Level: advanced

126: .seealso: `PetscLogDump()`, `PetscLogAllBegin()`, `PetscLogView()`, `PetscLogTraceBegin()`, `PetscLogDefaultBegin()`
127: @*/
128: PetscErrorCode PetscLogNestedBegin(void)
129: {
130:   PetscFunctionBegin;
131:   PetscCheck(!nestedEvents, PETSC_COMM_SELF, PETSC_ERR_COR, "nestedEvents already allocated");

133:   nNestedEventsAllocated = 10;
134:   PetscCall(PetscMalloc1(nNestedEventsAllocated, &nestedEvents));
135:   dftParentActive = DFT_ID_AWAKE;
136:   nNestedEvents   = 1;

138:   /* 'Awake' is nested event 0. It has no parents */
139:   nestedEvents[0].nstEvent         = 0;
140:   nestedEvents[0].nParents         = 0;
141:   nestedEvents[0].dftParentsSorted = NULL;
142:   nestedEvents[0].dftEvents        = NULL;
143:   nestedEvents[0].dftParents       = NULL;
144:   nestedEvents[0].dftEventsSorted  = NULL;

146:   PetscCall(PetscLogSet(PetscLogEventBeginNested, PetscLogEventEndNested));
147:   PetscFunctionReturn(PETSC_SUCCESS);
148: }

150: /* Delete the data structures for the nested timers */
151: PetscErrorCode PetscLogNestedEnd(void)
152: {
153:   int i;

155:   PetscFunctionBegin;
156:   if (!nestedEvents) PetscFunctionReturn(PETSC_SUCCESS);
157:   for (i = 0; i < nNestedEvents; i++) PetscCall(PetscFree4(nestedEvents[i].dftParentsSorted, nestedEvents[i].dftEventsSorted, nestedEvents[i].dftParents, nestedEvents[i].dftEvents));
158:   PetscCall(PetscFree(nestedEvents));
159:   nestedEvents           = NULL;
160:   nNestedEvents          = 0;
161:   nNestedEventsAllocated = 0;
162:   PetscFunctionReturn(PETSC_SUCCESS);
163: }

165: /*
166:  UTILITIES: FIND STUFF IN SORTED ARRAYS

168:     dftIndex - index to be found
169:     dftArray - sorted array of PetscLogEvent-ids
170:     narray - dimension of dftArray
171:     entry - entry in the array where dftIndex may be found;

173:      if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray
174:      In that case, the dftIndex can be inserted at this entry.
175: */
176: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex, const PetscLogEvent *dftArray, int narray, int *entry)
177: {
178:   PetscFunctionBegin;
179:   if (narray == 0 || dftIndex <= dftArray[0]) {
180:     *entry = 0;
181:   } else if (dftIndex > dftArray[narray - 1]) {
182:     *entry = narray;
183:   } else {
184:     int ihigh = narray - 1, ilow = 0;
185:     while (ihigh > ilow) {
186:       const int imiddle = (ihigh + ilow) / 2;
187:       if (dftArray[imiddle] > dftIndex) {
188:         ihigh = imiddle;
189:       } else if (dftArray[imiddle] < dftIndex) {
190:         ilow = imiddle + 1;
191:       } else {
192:         ihigh = imiddle;
193:         ilow  = imiddle;
194:       }
195:     }
196:     *entry = ihigh;
197:   }
198:   PetscFunctionReturn(PETSC_SUCCESS);
199: }

201: /*
202:     Utility: find the nested event with given identification

204:     nstEvent - Nested event to be found
205:     entry - entry in the nestedEvents where nstEvent may be found;

207:     if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray
208: */
209: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent, int *entry)
210: {
211:   PetscFunctionBegin;
212:   if (nNestedEvents == 0 || nstEvent <= nestedEvents[0].nstEvent) {
213:     *entry = 0;
214:   } else if (nstEvent > nestedEvents[nNestedEvents - 1].nstEvent) {
215:     *entry = nNestedEvents;
216:   } else {
217:     int ihigh = nNestedEvents - 1, ilow = 0;
218:     while (ihigh > ilow) {
219:       const int imiddle = (ihigh + ilow) / 2;
220:       if (nestedEvents[imiddle].nstEvent > nstEvent) {
221:         ihigh = imiddle;
222:       } else if (nestedEvents[imiddle].nstEvent < nstEvent) {
223:         ilow = imiddle + 1;
224:       } else {
225:         ihigh = imiddle;
226:         ilow  = imiddle;
227:       }
228:     }
229:     *entry = ihigh;
230:   }
231:   PetscFunctionReturn(PETSC_SUCCESS);
232: }

234: /*
235:  Nested logging is not prepared yet to support user-defined logging stages, so for now we force logging on the main stage.
236:  Using PetscLogStage{Push/Pop}() would be more appropriate, but these two calls do extra bookkeeping work we don't need.
237: */

239:   #define MAINSTAGE 0

241: static PetscLogStage savedStage = 0;

243: static inline PetscErrorCode PetscLogStageOverride(void)
244: {
245:   PetscStageLog stageLog = petsc_stageLog;

247:   PetscFunctionBegin;
248:   if (stageLog->curStage == MAINSTAGE) PetscFunctionReturn(PETSC_SUCCESS);
249:   savedStage         = stageLog->curStage;
250:   stageLog->curStage = MAINSTAGE;
251:   PetscCall(PetscIntStackPush(stageLog->stack, MAINSTAGE));
252:   PetscFunctionReturn(PETSC_SUCCESS);
253: }

255: static inline PetscErrorCode PetscLogStageRestore(void)
256: {
257:   PetscStageLog stageLog = petsc_stageLog;

259:   PetscFunctionBegin;
260:   if (savedStage == MAINSTAGE) PetscFunctionReturn(PETSC_SUCCESS);
261:   stageLog->curStage = savedStage;
262:   PetscCall(PetscIntStackPop(stageLog->stack, &savedStage));
263:   PetscFunctionReturn(PETSC_SUCCESS);
264: }

266: /******************************************************************************************/
267: /* Start a nested event */
268: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
269: {
270:   int           entry, pentry, tentry, i;
271:   PetscLogEvent dftEvent;

273:   PetscFunctionBegin;
274:   PetscCall(PetscLogEventFindNestedTimer(nstEvent, &entry));
275:   if (entry >= nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
276:     /* Nested event doesn't exist yet: create it */

278:     if (nNestedEvents == nNestedEventsAllocated) {
279:       /* Enlarge and re-allocate nestedEvents if needed */
280:       PetscNestedEvent *tmp = nestedEvents;
281:       PetscCall(PetscMalloc1(2 * nNestedEvents, &nestedEvents));
282:       nNestedEventsAllocated *= 2;
283:       PetscCall(PetscArraycpy(nestedEvents, tmp, nNestedEvents));
284:       PetscCall(PetscFree(tmp));
285:     }

287:     /* Clear space in nestedEvents for new nested event */
288:     nNestedEvents++;
289:     for (i = nNestedEvents - 1; i > entry; i--) nestedEvents[i] = nestedEvents[i - 1];

291:     /* Create event in nestedEvents */
292:     nestedEvents[entry].nstEvent = nstEvent;
293:     nestedEvents[entry].nParents = 1;
294:     PetscCall(PetscMalloc4(1, &nestedEvents[entry].dftParentsSorted, 1, &nestedEvents[entry].dftEventsSorted, 1, &nestedEvents[entry].dftParents, 1, &nestedEvents[entry].dftEvents));

296:     /* Fill in new event */
297:     pentry   = 0;
298:     dftEvent = (PetscLogEvent)nstEvent;

300:     nestedEvents[entry].nstEvent                 = nstEvent;
301:     nestedEvents[entry].dftParents[pentry]       = dftParentActive;
302:     nestedEvents[entry].dftEvents[pentry]        = dftEvent;
303:     nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
304:     nestedEvents[entry].dftEventsSorted[pentry]  = dftEvent;

306:   } else {
307:     /* Nested event exists: find current dftParentActive among parents */
308:     PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
309:     PetscLogEvent *dftEvents        = nestedEvents[entry].dftEvents;
310:     int            nParents         = nestedEvents[entry].nParents;

312:     PetscCall(PetscLogEventFindDefaultTimer(dftParentActive, dftParentsSorted, nParents, &pentry));

314:     if (pentry >= nParents || dftParentActive != dftParentsSorted[pentry]) {
315:       /* dftParentActive not in the list: add it to the list */
316:       int            i;
317:       PetscLogEvent *dftParents      = nestedEvents[entry].dftParents;
318:       PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
319:       char           name[100];

321:       /* Register a new default timer */
322:       PetscCall(PetscSNPrintf(name, PETSC_STATIC_ARRAY_LENGTH(name), "%d -> %d", (int)dftParentActive, (int)nstEvent));
323:       PetscCall(PetscLogEventRegister(name, 0, &dftEvent));
324:       PetscCall(PetscLogEventFindDefaultTimer(dftEvent, dftEventsSorted, nParents, &tentry));

326:       /* Reallocate parents and dftEvents to make space for new parent */
327:       PetscCall(PetscMalloc4(1 + nParents, &nestedEvents[entry].dftParentsSorted, 1 + nParents, &nestedEvents[entry].dftEventsSorted, 1 + nParents, &nestedEvents[entry].dftParents, 1 + nParents, &nestedEvents[entry].dftEvents));
328:       PetscCall(PetscArraycpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents));
329:       PetscCall(PetscArraycpy(nestedEvents[entry].dftEventsSorted, dftEventsSorted, nParents));
330:       PetscCall(PetscArraycpy(nestedEvents[entry].dftParents, dftParents, nParents));
331:       PetscCall(PetscArraycpy(nestedEvents[entry].dftEvents, dftEvents, nParents));
332:       PetscCall(PetscFree4(dftParentsSorted, dftEventsSorted, dftParents, dftEvents));

334:       dftParents       = nestedEvents[entry].dftParents;
335:       dftEvents        = nestedEvents[entry].dftEvents;
336:       dftParentsSorted = nestedEvents[entry].dftParentsSorted;
337:       dftEventsSorted  = nestedEvents[entry].dftEventsSorted;

339:       nestedEvents[entry].nParents++;
340:       nParents++;

342:       for (i = nParents - 1; i > pentry; i--) {
343:         dftParentsSorted[i] = dftParentsSorted[i - 1];
344:         dftEvents[i]        = dftEvents[i - 1];
345:       }
346:       for (i = nParents - 1; i > tentry; i--) {
347:         dftParents[i]      = dftParents[i - 1];
348:         dftEventsSorted[i] = dftEventsSorted[i - 1];
349:       }

351:       /* Fill in the new default timer */
352:       dftParentsSorted[pentry] = dftParentActive;
353:       dftEvents[pentry]        = dftEvent;
354:       dftParents[tentry]       = dftParentActive;
355:       dftEventsSorted[tentry]  = dftEvent;

357:     } else {
358:       /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
359:       dftEvent = nestedEvents[entry].dftEvents[pentry];
360:     }
361:   }

363:   /* Start the default 'dftEvent'-timer and update the dftParentActive */
364:   PetscCall(PetscLogStageOverride());
365:   PetscCall(PetscLogEventBeginDefault(dftEvent, t, o1, o2, o3, o4));
366:   PetscCall(PetscLogStageRestore());
367:   dftParentActive = dftEvent;
368:   PetscFunctionReturn(PETSC_SUCCESS);
369: }

371: /* End a nested event */
372: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
373: {
374:   int            entry, pentry, nParents;
375:   PetscLogEvent *dftEventsSorted;

377:   PetscFunctionBegin;
378:   /* Find the nested event */
379:   PetscCall(PetscLogEventFindNestedTimer(nstEvent, &entry));
380:   PetscCheck(entry < nNestedEvents, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Logging event %d larger than number of events %d", entry, nNestedEvents);
381:   PetscCheck(nestedEvents[entry].nstEvent == nstEvent, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Logging event %d had unbalanced begin/end pairs does not match %d", entry, nstEvent);
382:   dftEventsSorted = nestedEvents[entry].dftEventsSorted;
383:   nParents        = nestedEvents[entry].nParents;

385:   /* Find the current default timer among the 'dftEvents' of this event */
386:   PetscCall(PetscLogEventFindDefaultTimer(dftParentActive, dftEventsSorted, nParents, &pentry));

388:   PetscCheck(pentry < nParents, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Entry %d is larger than number of parents %d", pentry, nParents);
389:   PetscCheck(dftEventsSorted[pentry] == dftParentActive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Active parent is %d, but we seem to be closing %d", dftParentActive, dftEventsSorted[pentry]);

391:   /* Stop the default timer and update the dftParentActive */
392:   PetscCall(PetscLogStageOverride());
393:   PetscCall(PetscLogEventEndDefault(dftParentActive, t, o1, o2, o3, o4));
394:   PetscCall(PetscLogStageRestore());
395:   dftParentActive = nestedEvents[entry].dftParents[pentry];
396:   PetscFunctionReturn(PETSC_SUCCESS);
397: }

399: /*@
400:    PetscLogSetThreshold - Set the threshold time for logging the events; this is a percentage out of 100, so 1. means any event
401:           that takes 1 or more percent of the time.

403:   Logically Collective over `PETSC_COMM_WORLD`

405:   Input Parameter:
406: .   newThresh - the threshold to use

408:   Output Parameter:
409: .   oldThresh - the previously set threshold value

411:   Options Database Keys:
412: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

414:   Usage:
415: .vb
416:       PetscInitialize(...);
417:       PetscLogNestedBegin();
418:       PetscLogSetThreshold(0.1,&oldthresh);
419:        ... code ...
420:       PetscLogView(viewer);
421:       PetscFinalize();
422: .ve

424:   Level: advanced

426: .seealso: `PetscLogDump()`, `PetscLogAllBegin()`, `PetscLogView()`, `PetscLogTraceBegin()`, `PetscLogDefaultBegin()`,
427:           `PetscLogNestedBegin()`
428: @*/
429: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
430: {
431:   PetscFunctionBegin;
432:   if (oldThresh) *oldThresh = thresholdTime;
433:   if (newThresh == (PetscLogDouble)PETSC_DECIDE) newThresh = 0.01;
434:   if (newThresh == (PetscLogDouble)PETSC_DEFAULT) newThresh = 0.01;
435:   thresholdTime = PetscMax(newThresh, 0.0);
436:   PetscFunctionReturn(PETSC_SUCCESS);
437: }

439: static PetscErrorCode PetscPrintExeSpecs(PetscViewer viewer)
440: {
441:   char        arch[128], hostname[128], username[128], pname[PETSC_MAX_PATH_LEN], date[128];
442:   char        version[256], buildoptions[128] = "";
443:   PetscMPIInt size;
444:   size_t      len;

446:   PetscFunctionBegin;
447:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)viewer), &size));
448:   PetscCall(PetscGetArchType(arch, sizeof(arch)));
449:   PetscCall(PetscGetHostName(hostname, sizeof(hostname)));
450:   PetscCall(PetscGetUserName(username, sizeof(username)));
451:   PetscCall(PetscGetProgramName(pname, sizeof(pname)));
452:   PetscCall(PetscGetDate(date, sizeof(date)));
453:   PetscCall(PetscGetVersion(version, sizeof(version)));

455:   PetscCall(PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification"));
456:   PetscCall(PetscViewerXMLPutString(viewer, "executable", "Executable", pname));
457:   PetscCall(PetscViewerXMLPutString(viewer, "architecture", "Architecture", arch));
458:   PetscCall(PetscViewerXMLPutString(viewer, "hostname", "Host", hostname));
459:   PetscCall(PetscViewerXMLPutInt(viewer, "nprocesses", "Number of processes", size));
460:   PetscCall(PetscViewerXMLPutString(viewer, "user", "Run by user", username));
461:   PetscCall(PetscViewerXMLPutString(viewer, "date", "Started at", date));
462:   PetscCall(PetscViewerXMLPutString(viewer, "petscrelease", "Petsc Release", version));

464:   if (PetscDefined(USE_DEBUG)) PetscCall(PetscStrlcat(buildoptions, "Debug ", sizeof(buildoptions)));
465:   if (PetscDefined(USE_COMPLEX)) PetscCall(PetscStrlcat(buildoptions, "Complex ", sizeof(buildoptions)));
466:   if (PetscDefined(USE_REAL_SINGLE)) {
467:     PetscCall(PetscStrlcat(buildoptions, "Single ", sizeof(buildoptions)));
468:   } else if (PetscDefined(USE_REAL___FLOAT128)) {
469:     PetscCall(PetscStrlcat(buildoptions, "Quadruple ", sizeof(buildoptions)));
470:   } else if (PetscDefined(USE_REAL___FP16)) {
471:     PetscCall(PetscStrlcat(buildoptions, "Half ", sizeof(buildoptions)));
472:   }
473:   if (PetscDefined(USE_64BIT_INDICES)) PetscCall(PetscStrlcat(buildoptions, "Int64 ", sizeof(buildoptions)));
474:   #if defined(__cplusplus)
475:   PetscCall(PetscStrlcat(buildoptions, "C++ ", sizeof(buildoptions)));
476:   #endif
477:   PetscCall(PetscStrlen(buildoptions, &len));
478:   if (len) PetscCall(PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions));
479:   PetscCall(PetscViewerXMLEndSection(viewer, "runspecification"));
480:   PetscFunctionReturn(PETSC_SUCCESS);
481: }

483: /* Print the global performance: max, max/min, average and total of
484:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
485:  */
486: static PetscErrorCode PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble local_val, const PetscBool print_average, const PetscBool print_total)
487: {
488:   PetscLogDouble min, tot, ratio, avg;
489:   MPI_Comm       comm;
490:   PetscMPIInt    rank, size;
491:   PetscLogDouble valrank[2], max[2];

493:   PetscFunctionBegin;
494:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));
495:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)viewer), &size));
496:   PetscCallMPI(MPI_Comm_rank(comm, &rank));

498:   valrank[0] = local_val;
499:   valrank[1] = (PetscLogDouble)rank;
500:   PetscCall(MPIU_Allreduce(&local_val, &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm));
501:   PetscCall(MPIU_Allreduce(valrank, &max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm));
502:   PetscCall(MPIU_Allreduce(&local_val, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm));
503:   avg = tot / ((PetscLogDouble)size);
504:   if (min != 0.0) ratio = max[0] / min;
505:   else ratio = 0.0;

507:   PetscCall(PetscViewerXMLStartSection(viewer, name, desc));
508:   PetscCall(PetscViewerXMLPutDouble(viewer, "max", NULL, max[0], "%e"));
509:   PetscCall(PetscViewerXMLPutInt(viewer, "maxrank", "rank at which max was found", (PetscMPIInt)max[1]));
510:   PetscCall(PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f"));
511:   if (print_average) PetscCall(PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e"));
512:   if (print_total) PetscCall(PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e"));
513:   PetscCall(PetscViewerXMLEndSection(viewer, name));
514:   PetscFunctionReturn(PETSC_SUCCESS);
515: }

517: /* Print the global performance: max, max/min, average and total of
518:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
519:  */
520: static PetscErrorCode PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
521: {
522:   PetscLogDouble  flops, mem, red, mess;
523:   const PetscBool print_total_yes = PETSC_TRUE, print_total_no = PETSC_FALSE, print_average_no = PETSC_FALSE, print_average_yes = PETSC_TRUE;

525:   PetscFunctionBegin;
526:   /* Must preserve reduction count before we go on */
527:   red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;

529:   /* Calculate summary information */
530:   PetscCall(PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance"));

532:   /*   Time */
533:   PetscCall(PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", locTotalTime, print_average_yes, print_total_no));

535:   /*   Objects */
536:   PetscCall(PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", (PetscLogDouble)petsc_numObjects, print_average_yes, print_total_no));

538:   /*   Flop */
539:   PetscCall(PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", petsc_TotalFlops / 1.0E6, print_average_yes, print_total_yes));

541:   /*   Flop/sec -- Must talk to Barry here */
542:   if (locTotalTime != 0.0) flops = petsc_TotalFlops / locTotalTime;
543:   else flops = 0.0;
544:   PetscCall(PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", flops / 1.0E6, print_average_yes, print_total_yes));

546:   /*   Memory */
547:   PetscCall(PetscMallocGetMaximumUsage(&mem));
548:   if (mem > 0.0) PetscCall(PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", mem / 1024.0 / 1024.0, print_average_yes, print_total_yes));
549:   /*   Messages */
550:   mess = 0.5 * (petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
551:   PetscCall(PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", mess, print_average_yes, print_total_yes));

553:   /*   Message Volume */
554:   mess = 0.5 * (petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
555:   PetscCall(PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", mess / 1024.0 / 1024.0, print_average_yes, print_total_yes));

557:   /*   Reductions */
558:   PetscCall(PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", red, print_average_no, print_total_no));
559:   PetscCall(PetscViewerXMLEndSection(viewer, "globalperformance"));
560:   PetscFunctionReturn(PETSC_SUCCESS);
561: }

563: typedef struct {
564:   PetscLogEvent  dftEvent;
565:   NestedEventId  nstEvent;
566:   PetscLogEvent  dftParent;
567:   NestedEventId  nstParent;
568:   PetscBool      own;
569:   int            depth;
570:   NestedEventId *nstPath;
571: } PetscNestedEventTree;

573: /* Compare timers to sort them in the tree */
574: static int compareTreeItems(const void *item1_, const void *item2_)
575: {
576:   int                   i;
577:   PetscNestedEventTree *item1 = (PetscNestedEventTree *)item1_;
578:   PetscNestedEventTree *item2 = (PetscNestedEventTree *)item2_;

580:   for (i = 0; i < PetscMin(item1->depth, item2->depth); i++) {
581:     if (item1->nstPath[i] < item2->nstPath[i]) return -1;
582:     if (item1->nstPath[i] > item2->nstPath[i]) return +1;
583:   }
584:   if (item1->depth < item2->depth) return -1;
585:   if (item1->depth > item2->depth) return 1;
586:   return 0;
587: }
588: /*
589:  * Do MPI communication to get the complete, nested calling tree for all processes: there may be
590:  * calls that happen in some processes, but not in others.
591:  *
592:  * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
593:  * The tree is sorted so that the timers can be printed in the order of appearance.
594:  *
595:  * For tree-items which appear in the trees of multiple processes (which will be most items), the
596:  * following rule is followed:
597:  * + if information from my own process is available, then that is the information stored in tree.
598:  *   otherwise it is some other process's information.
599:  */
600: static PetscErrorCode PetscLogNestedTreeCreate(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
601: {
602:   PetscNestedEventTree *tree = NULL, *newTree;
603:   int                  *treeIndices;
604:   int                   nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
605:   int                   yesno;
606:   PetscBool             done;
607:   int                   maxdepth;
608:   int                   depth;
609:   int                   illegalEvent;
610:   int                   iextra;
611:   NestedEventId        *nstPath, *nstMyPath;
612:   MPI_Comm              comm;

614:   PetscFunctionBegin;
615:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));

617:   /* Calculate memory needed to store everybody's information and allocate tree */
618:   nTimers = 0;
619:   for (i = 0; i < nNestedEvents; i++) nTimers += nestedEvents[i].nParents;

621:   PetscCall(PetscMalloc1(nTimers, &tree));

623:   /* Fill tree with readily available information */
624:   iTimer0         = 0;
625:   maxDefaultTimer = 0;
626:   for (i = 0; i < nNestedEvents; i++) {
627:     int            nParents         = nestedEvents[i].nParents;
628:     NestedEventId  nstEvent         = nestedEvents[i].nstEvent;
629:     PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
630:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
631:     for (j = 0; j < nParents; j++) {
632:       maxDefaultTimer = PetscMax(dftEvents[j], maxDefaultTimer);

634:       tree[iTimer0 + j].dftEvent  = dftEvents[j];
635:       tree[iTimer0 + j].nstEvent  = nstEvent;
636:       tree[iTimer0 + j].dftParent = dftParentsSorted[j];
637:       tree[iTimer0 + j].own       = PETSC_TRUE;

639:       tree[iTimer0 + j].nstParent = 0;
640:       tree[iTimer0 + j].depth     = 0;
641:       tree[iTimer0 + j].nstPath   = NULL;
642:     }
643:     iTimer0 += nParents;
644:   }

646:   /* Calculate the global maximum for the default timer index, so array treeIndices can
647:    * be allocated only once */
648:   PetscCall(MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm));
649:   maxDefaultTimer = j;

651:   /* Find default timer's place in the tree */
652:   PetscCall(PetscCalloc1(maxDefaultTimer + 1, &treeIndices));
653:   treeIndices[0] = 0;
654:   for (i = 0; i < nTimers; i++) {
655:     PetscLogEvent dftEvent = tree[i].dftEvent;
656:     treeIndices[dftEvent]  = i;
657:   }

659:   /* Find each dftParent's nested identification */
660:   for (i = 0; i < nTimers; i++) {
661:     PetscLogEvent dftParent = tree[i].dftParent;
662:     if (dftParent != DFT_ID_AWAKE) {
663:       int j             = treeIndices[dftParent];
664:       tree[i].nstParent = tree[j].nstEvent;
665:     }
666:   }

668:   /* Find depths for each timer path */
669:   done     = PETSC_FALSE;
670:   maxdepth = 1;
671:   while (!done) {
672:     done = PETSC_TRUE;
673:     for (i = 0; i < nTimers; i++) {
674:       if (tree[i].dftParent == DFT_ID_AWAKE) {
675:         tree[i].depth = 1;
676:         maxdepth      = PetscMax(1, maxdepth);
677:       } else {
678:         int j = treeIndices[tree[i].dftParent];
679:         depth = 1 + tree[j].depth;
680:         if (depth > tree[i].depth) {
681:           done          = PETSC_FALSE;
682:           tree[i].depth = depth;
683:           maxdepth      = PetscMax(depth, maxdepth);
684:         }
685:       }
686:     }
687:   }

689:   /* Allocate the paths in the entire tree */
690:   for (i = 0; i < nTimers; i++) {
691:     depth = tree[i].depth;
692:     PetscCall(PetscCalloc1(depth, &tree[i].nstPath));
693:   }

695:   /* Calculate the paths for all timers */
696:   for (depth = 1; depth <= maxdepth; depth++) {
697:     for (i = 0; i < nTimers; i++) {
698:       if (tree[i].depth == depth) {
699:         if (depth > 1) {
700:           int j = treeIndices[tree[i].dftParent];
701:           PetscCall(PetscArraycpy(tree[i].nstPath, tree[j].nstPath, depth - 1));
702:         }
703:         tree[i].nstPath[depth - 1] = tree[i].nstEvent;
704:       }
705:     }
706:   }
707:   PetscCall(PetscFree(treeIndices));

709:   /* Sort the tree on basis of the paths */
710:   qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);

712:   /* Allocate an array to store paths */
713:   depth = maxdepth;
714:   PetscCall(MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm));
715:   PetscCall(PetscMalloc1(maxdepth + 1, &nstPath));
716:   PetscCall(PetscMalloc1(maxdepth + 1, &nstMyPath));

718:   /* Find an illegal nested event index (1+largest nested event index) */
719:   illegalEvent = 1 + nestedEvents[nNestedEvents - 1].nstEvent;
720:   i            = illegalEvent;
721:   PetscCall(MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm));

723:   /* First, detect timers which are not available in this process, but are available in others
724:    *        Allocate a new tree, that can contain all timers
725:    * Then,  fill the new tree with all (own and not-own) timers */
726:   newTree = NULL;
727:   for (yesno = 0; yesno <= 1; yesno++) {
728:     depth  = 1;
729:     i      = 0;
730:     iextra = 0;
731:     while (depth > 0) {
732:       int       j;
733:       PetscBool same;

735:       /* Construct the next path in this process's tree:
736:        * if necessary, supplement with invalid path entries */
737:       depth++;
738:       PetscCheck(depth <= maxdepth + 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Depth %d > maxdepth+1 %d", depth, maxdepth + 1);
739:       if (i < nTimers) {
740:         for (j = 0; j < tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
741:         for (j = tree[i].depth; j < depth; j++) nstMyPath[j] = illegalEvent;
742:       } else {
743:         for (j = 0; j < depth; j++) nstMyPath[j] = illegalEvent;
744:       }

746:       /* Communicate with other processes to obtain the next path and its depth */
747:       PetscCall(MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm));
748:       for (j = depth - 1; (int)j >= 0; j--) {
749:         if (nstPath[j] == illegalEvent) depth = j;
750:       }

752:       if (depth > 0) {
753:         /* If the path exists */

755:         /* check whether the next path is the same as this process's next path */
756:         same = PETSC_TRUE;
757:         for (j = 0; same && j < depth; j++) same = (same && nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;

759:         if (same) {
760:           /* Register 'own path' */
761:           if (newTree) newTree[i + iextra] = tree[i];
762:           i++;
763:         } else {
764:           /* Register 'not an own path' */
765:           if (newTree) {
766:             newTree[i + iextra].nstEvent = nstPath[depth - 1];
767:             newTree[i + iextra].own      = PETSC_FALSE;
768:             newTree[i + iextra].depth    = depth;
769:             PetscCall(PetscMalloc1(depth, &newTree[i + iextra].nstPath));
770:             for (j = 0; j < depth; j++) newTree[i + iextra].nstPath[j] = nstPath[j];

772:             newTree[i + iextra].dftEvent  = 0;
773:             newTree[i + iextra].dftParent = 0;
774:             newTree[i + iextra].nstParent = 0;
775:           }
776:           iextra++;
777:         }
778:       }
779:     }

781:     /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
782:     totalNTimers = nTimers + iextra;
783:     if (!newTree) PetscCall(PetscMalloc1(totalNTimers, &newTree));
784:   }
785:   PetscCall(PetscFree(nstPath));
786:   PetscCall(PetscFree(nstMyPath));
787:   PetscCall(PetscFree(tree));
788:   tree    = newTree;
789:   newTree = NULL;

791:   /* Set return value and return */
792:   *p_tree    = tree;
793:   *p_nTimers = totalNTimers;
794:   PetscFunctionReturn(PETSC_SUCCESS);
795: }

797: /*
798:  * Delete the nested timer tree
799:  */
800: static PetscErrorCode PetscLogNestedTreeDestroy(PetscNestedEventTree *tree, int nTimers)
801: {
802:   int i;

804:   PetscFunctionBegin;
805:   for (i = 0; i < nTimers; i++) PetscCall(PetscFree(tree[i].nstPath));
806:   PetscCall(PetscFree(tree));
807:   PetscFunctionReturn(PETSC_SUCCESS);
808: }

810: /* Print the global performance: max, max/min, average and total of
811:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
812:  */
813: static PetscErrorCode PetscPrintXMLNestedLinePerfResults(PetscViewer viewer, const char *name, PetscLogDouble value, PetscLogDouble minthreshold, PetscLogDouble maxthreshold, PetscLogDouble minmaxtreshold)
814: {
815:   MPI_Comm       comm; /* MPI communicator in reduction */
816:   PetscMPIInt    rank; /* rank of this process */
817:   PetscLogDouble val_in[2], max[2], min[2];
818:   PetscLogDouble minvalue, maxvalue, tot;
819:   PetscMPIInt    size;
820:   PetscMPIInt    minLoc, maxLoc;

822:   PetscFunctionBegin;
823:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));
824:   PetscCallMPI(MPI_Comm_size(comm, &size));
825:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
826:   val_in[0] = value;
827:   val_in[1] = (PetscLogDouble)rank;
828:   PetscCall(MPIU_Allreduce(val_in, max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm));
829:   PetscCall(MPIU_Allreduce(val_in, min, 1, MPIU_2PETSCLOGDOUBLE, MPI_MINLOC, comm));
830:   maxvalue = max[0];
831:   maxLoc   = (PetscMPIInt)max[1];
832:   minvalue = min[0];
833:   minLoc   = (PetscMPIInt)min[1];
834:   PetscCall(MPIU_Allreduce(&value, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm));

836:   if (maxvalue < maxthreshold && minvalue >= minthreshold) {
837:     /* One call per parent or NO value: don't print */
838:   } else {
839:     PetscCall(PetscViewerXMLStartSection(viewer, name, NULL));
840:     if (maxvalue > minvalue * minmaxtreshold) {
841:       PetscCall(PetscViewerXMLPutDouble(viewer, "avgvalue", NULL, tot / size, "%g"));
842:       PetscCall(PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%g"));
843:       PetscCall(PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%g"));
844:       PetscCall(PetscViewerXMLPutInt(viewer, "minloc", NULL, minLoc));
845:       PetscCall(PetscViewerXMLPutInt(viewer, "maxloc", NULL, maxLoc));
846:     } else {
847:       PetscCall(PetscViewerXMLPutDouble(viewer, "value", NULL, tot / size, "%g"));
848:     }
849:     PetscCall(PetscViewerXMLEndSection(viewer, name));
850:   }
851:   PetscFunctionReturn(PETSC_SUCCESS);
852: }

854:   #define N_COMM 8
855: static PetscErrorCode PetscLogNestedTreePrintLine(PetscViewer viewer, PetscEventPerfInfo perfInfo, PetscLogDouble countsPerCall, int parentCount, int depth, const char *name, PetscLogDouble totalTime, PetscBool *isPrinted)
856: {
857:   PetscLogDouble time = perfInfo.time;
858:   PetscLogDouble timeMx;
859:   MPI_Comm       comm;

861:   PetscFunctionBegin;
862:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));
863:   PetscCall(MPIU_Allreduce(&time, &timeMx, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm));
864:   *isPrinted = ((timeMx / totalTime) >= THRESHOLD) ? PETSC_TRUE : PETSC_FALSE;
865:   if (*isPrinted) {
866:     PetscCall(PetscViewerXMLStartSection(viewer, "event", NULL));
867:     PetscCall(PetscViewerXMLPutString(viewer, "name", NULL, name));
868:     PetscCall(PetscPrintXMLNestedLinePerfResults(viewer, "time", time / totalTime * 100.0, 0, 0, 1.02));
869:     PetscCall(PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", parentCount > 0 ? countsPerCall : 0, 0.99, 1.01, 1.02));
870:     PetscCall(PetscPrintXMLNestedLinePerfResults(viewer, "mflops", time >= timeMx * 0.001 ? 1e-6 * perfInfo.flops / time : 0, 0, 0.01, 1.05));
871:     PetscCall(PetscPrintXMLNestedLinePerfResults(viewer, "mbps", time >= timeMx * 0.001 ? perfInfo.messageLength / (1024 * 1024 * time) : 0, 0, 0.01, 1.05));
872:     PetscCall(PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", time >= timeMx * 0.001 ? perfInfo.numReductions / time : 0, 0, 0.01, 1.05));
873:   }
874:   PetscFunctionReturn(PETSC_SUCCESS);
875: }

877: /* Count the number of times the parent event was called */

879: static int countParents(const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
880: {
881:   if (tree[i].depth <= 1) {
882:     return 1; /* Main event: only once */
883:   } else if (!tree[i].own) {
884:     return 1; /* This event didn't happen in this process, but did in another */
885:   } else {
886:     int iParent;
887:     for (iParent = i - 1; iParent >= 0; iParent--) {
888:       if (tree[iParent].depth == tree[i].depth - 1) break;
889:     }
890:     if (tree[iParent].depth != tree[i].depth - 1) {
891:       /* *****  Internal error: cannot find parent */
892:       return -2;
893:     } else {
894:       PetscLogEvent dftEvent = tree[iParent].dftEvent;
895:       return eventPerfInfo[dftEvent].count;
896:     }
897:   }
898: }

900: typedef struct {
901:   int            id;
902:   PetscLogDouble val;
903: } PetscSortItem;

905: static int compareSortItems(const void *item1_, const void *item2_)
906: {
907:   PetscSortItem *item1 = (PetscSortItem *)item1_;
908:   PetscSortItem *item2 = (PetscSortItem *)item2_;
909:   if (item1->val > item2->val) return -1;
910:   if (item1->val < item2->val) return +1;
911:   return 0;
912: }

914: /*
915:  * Find the number of child events.
916:  */
917: static PetscErrorCode PetscLogNestedTreeGetChildrenCount(const PetscNestedEventTree *tree, int nTimers, int iStart, int depth, int *nChildren)
918: {
919:   int n = 0;

921:   PetscFunctionBegin;
922:   for (int i = iStart + 1; i < nTimers; i++) {
923:     if (tree[i].depth <= depth) break;
924:     if (tree[i].depth == depth + 1) n++;
925:   }
926:   *nChildren = n;
927:   PetscFunctionReturn(PETSC_SUCCESS);
928: }

930: /*
931:  * Initialize child event sort items with ID and times.
932:  */
933: static PetscErrorCode PetscLogNestedTreeSetChildrenSortItems(const PetscViewer viewer, const PetscNestedEventTree *tree, int nTimers, int iStart, int depth, int nChildren, PetscSortItem **children)
934: {
935:   MPI_Comm            comm;
936:   PetscLogDouble     *times, *maxTimes;
937:   PetscStageLog       stageLog;
938:   PetscEventPerfInfo *eventPerfInfo;
939:   const int           stage = MAINSTAGE;

941:   PetscFunctionBegin;
942:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));
943:   PetscCall(PetscLogGetStageLog(&stageLog));
944:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

946:   if (nChildren > 0) {
947:     /* Create an array for the id-s and maxTimes of the children,
948:      *  leaving 2 spaces for self-time and other-time */

950:     PetscCall(PetscMalloc1(nChildren + 2, children));
951:     nChildren = 0;
952:     for (int i = iStart + 1; i < nTimers; i++) {
953:       if (tree[i].depth <= depth) break;
954:       if (tree[i].depth == depth + 1) {
955:         (*children)[nChildren].id  = i;
956:         (*children)[nChildren].val = eventPerfInfo[tree[i].dftEvent].time;
957:         nChildren++;
958:       }
959:     }

961:     /* Calculate the children's maximum times, to see whether children will be ignored or printed */
962:     PetscCall(PetscMalloc1(nChildren, &times));
963:     for (int i = 0; i < nChildren; i++) times[i] = (*children)[i].val;

965:     PetscCall(PetscMalloc1(nChildren, &maxTimes));
966:     PetscCall(MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm));
967:     PetscCall(PetscFree(times));

969:     for (int i = 0; i < nChildren; i++) (*children)[i].val = maxTimes[i];
970:     PetscCall(PetscFree(maxTimes));
971:   }
972:   PetscFunctionReturn(PETSC_SUCCESS);
973: }

975: /*
976:  * Set 'self' and 'other' performance info.
977:  */
978: static PetscErrorCode PetscLogNestedTreeSetSelfOtherPerfInfo(const PetscNestedEventTree *tree, int iStart, PetscLogDouble totalTime, const PetscSortItem *children, int nChildren, PetscEventPerfInfo *myPerfInfo, PetscEventPerfInfo *selfPerfInfo, PetscEventPerfInfo *otherPerfInfo, int *parentCount, PetscLogDouble *countsPerCall)
979: {
980:   const int           stage = MAINSTAGE;
981:   PetscStageLog       stageLog;
982:   PetscEventPerfInfo *eventPerfInfo;

984:   PetscFunctionBegin;
985:   PetscCall(PetscLogGetStageLog(&stageLog));
986:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
987:   if (!tree[iStart].own) {
988:     /* Set values for a timer that was not activated in this process
989:    * (but was, in other processes of this run) */
990:     PetscCall(PetscMemzero(myPerfInfo, sizeof(*myPerfInfo)));

992:     *selfPerfInfo  = *myPerfInfo;
993:     *otherPerfInfo = *myPerfInfo;

995:     *parentCount   = 1;
996:     *countsPerCall = 0;
997:   } else {
998:     /* Set the values for a timer that was activated in this process */
999:     PetscLogEvent dftEvent = tree[iStart].dftEvent;

1001:     *parentCount   = countParents(tree, eventPerfInfo, iStart);
1002:     *myPerfInfo    = eventPerfInfo[dftEvent];
1003:     *countsPerCall = (PetscLogDouble)myPerfInfo->count / (PetscLogDouble)*parentCount;

1005:     *selfPerfInfo                = *myPerfInfo;
1006:     otherPerfInfo->time          = 0;
1007:     otherPerfInfo->flops         = 0;
1008:     otherPerfInfo->numMessages   = 0;
1009:     otherPerfInfo->messageLength = 0;
1010:     otherPerfInfo->numReductions = 0;

1012:     for (int i = 0; i < nChildren; i++) {
1013:       /* For all child counters: subtract the child values from self-timers */

1015:       PetscLogEvent      dftChild      = tree[children[i].id].dftEvent;
1016:       PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];

1018:       selfPerfInfo->time -= childPerfInfo.time;
1019:       selfPerfInfo->flops -= childPerfInfo.flops;
1020:       selfPerfInfo->numMessages -= childPerfInfo.numMessages;
1021:       selfPerfInfo->messageLength -= childPerfInfo.messageLength;
1022:       selfPerfInfo->numReductions -= childPerfInfo.numReductions;

1024:       if ((children[i].val / totalTime) < THRESHOLD) {
1025:         /* Add them to 'other' if the time is ignored in the output */
1026:         otherPerfInfo->time += childPerfInfo.time;
1027:         otherPerfInfo->flops += childPerfInfo.flops;
1028:         otherPerfInfo->numMessages += childPerfInfo.numMessages;
1029:         otherPerfInfo->messageLength += childPerfInfo.messageLength;
1030:         otherPerfInfo->numReductions += childPerfInfo.numReductions;
1031:       }
1032:     }
1033:   }
1034:   PetscFunctionReturn(PETSC_SUCCESS);
1035: }

1037: /*
1038:  * Set max times across ranks for 'self' and 'other'.
1039:  */
1040: static PetscErrorCode PetscLogNestedTreeSetMaxTimes(MPI_Comm comm, int nChildren, const PetscEventPerfInfo selfPerfInfo, const PetscEventPerfInfo otherPerfInfo, PetscSortItem *children)
1041: {
1042:   PetscLogDouble times[2], maxTimes[2];

1044:   PetscFunctionBegin;
1045:   times[0] = selfPerfInfo.time;
1046:   times[1] = otherPerfInfo.time;

1048:   PetscCall(MPIU_Allreduce(times, maxTimes, 2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm));
1049:   children[nChildren + 0].id  = -1;
1050:   children[nChildren + 0].val = maxTimes[0];
1051:   children[nChildren + 1].id  = -2;
1052:   children[nChildren + 1].val = maxTimes[1];
1053:   PetscFunctionReturn(PETSC_SUCCESS);
1054: }

1056: static PetscErrorCode PetscLogNestedTreePrint(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime)
1057: {
1058:   int                depth = tree[iStart].depth;
1059:   const char        *name;
1060:   int                parentCount = 1, nChildren;
1061:   PetscSortItem     *children;
1062:   PetscStageLog      stageLog;
1063:   PetscEventRegInfo *eventRegInfo;
1064:   PetscEventPerfInfo myPerfInfo = {0}, selfPerfInfo = {0}, otherPerfInfo = {0};
1065:   PetscLogDouble     countsPerCall = 0;
1066:   PetscBool          wasPrinted;
1067:   PetscBool          childWasPrinted;
1068:   MPI_Comm           comm;

1070:   PetscFunctionBegin;
1071:   /* Look up the name of the event and its PerfInfo */
1072:   PetscCall(PetscLogGetStageLog(&stageLog));
1073:   eventRegInfo = stageLog->eventLog->eventInfo;
1074:   name         = eventRegInfo[(PetscLogEvent)tree[iStart].nstEvent].name;
1075:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));

1077:   PetscCall(PetscLogNestedTreeGetChildrenCount(tree, nTimers, iStart, depth, &nChildren));
1078:   PetscCall(PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, iStart, depth, nChildren, &children));
1079:   PetscCall(PetscLogNestedTreeSetSelfOtherPerfInfo(tree, iStart, totalTime, children, nChildren, &myPerfInfo, &selfPerfInfo, &otherPerfInfo, &parentCount, &countsPerCall));

1081:   /* Main output for this timer */
1082:   PetscCall(PetscLogNestedTreePrintLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted));

1084:   /* Now print the lines for the children */
1085:   if (nChildren > 0) {
1086:     int i;

1088:     /* Calculate max-times for 'self' and 'other' */
1089:     PetscCall(PetscLogNestedTreeSetMaxTimes(comm, nChildren, selfPerfInfo, otherPerfInfo, children));

1091:     /* Now sort the children (including 'self' and 'other') on total time */
1092:     qsort(children, nChildren + 2, sizeof(PetscSortItem), compareSortItems);

1094:     /* Print (or ignore) the children in ascending order of total time */
1095:     PetscCall(PetscViewerXMLStartSection(viewer, "events", NULL));
1096:     for (i = 0; i < nChildren + 2; i++) {
1097:       if ((children[i].val / totalTime) < THRESHOLD) {
1098:         /* ignored: no output */
1099:       } else if (children[i].id == -1) {
1100:         PetscCall(PetscLogNestedTreePrintLine(viewer, selfPerfInfo, 1, parentCount, depth + 1, "self", totalTime, &childWasPrinted));
1101:         if (childWasPrinted) PetscCall(PetscViewerXMLEndSection(viewer, "event"));
1102:       } else if (children[i].id == -2) {
1103:         size_t len;
1104:         char  *otherName;

1106:         PetscCall(PetscStrlen(name, &len));
1107:         PetscCall(PetscMalloc1(len + 16, &otherName));
1108:         PetscCall(PetscSNPrintf(otherName, len + 16, "%s: other-timed", name));
1109:         PetscCall(PetscLogNestedTreePrintLine(viewer, otherPerfInfo, 1, 1, depth + 1, otherName, totalTime, &childWasPrinted));
1110:         PetscCall(PetscFree(otherName));
1111:         if (childWasPrinted) PetscCall(PetscViewerXMLEndSection(viewer, "event"));
1112:       } else {
1113:         /* Print the child with a recursive call to this function */
1114:         PetscCall(PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime));
1115:       }
1116:     }
1117:     PetscCall(PetscViewerXMLEndSection(viewer, "events"));
1118:     PetscCall(PetscFree(children));
1119:   }

1121:   if (wasPrinted) PetscCall(PetscViewerXMLEndSection(viewer, "event"));
1122:   PetscFunctionReturn(PETSC_SUCCESS);
1123: }

1125: static PetscErrorCode PetscLogNestedTreePrintTop(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, PetscLogDouble totalTime)
1126: {
1127:   int            i, nChildren;
1128:   PetscSortItem *children;
1129:   MPI_Comm       comm;

1131:   PetscFunctionBegin;
1132:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));

1134:   PetscCall(PetscLogNestedTreeGetChildrenCount(tree, nTimers, -1, 0, &nChildren));
1135:   PetscCall(PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, -1, 0, nChildren, &children));

1137:   if (nChildren > 0) {
1138:     /* Now sort the children on total time */
1139:     qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1140:     /* Print (or ignore) the children in ascending order of total time */
1141:     PetscCall(PetscViewerXMLStartSection(viewer, "timertree", "Timings tree"));
1142:     PetscCall(PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f"));
1143:     PetscCall(PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, thresholdTime, "%f"));

1145:     for (i = 0; i < nChildren; i++) {
1146:       if ((children[i].val / totalTime) < THRESHOLD) {
1147:         /* ignored: no output */
1148:       } else {
1149:         /* Print the child with a recursive call to this function */
1150:         PetscCall(PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime));
1151:       }
1152:     }
1153:     PetscCall(PetscViewerXMLEndSection(viewer, "timertree"));
1154:     PetscCall(PetscFree(children));
1155:   }
1156:   PetscFunctionReturn(PETSC_SUCCESS);
1157: }

1159: typedef struct {
1160:   char          *name;
1161:   PetscLogDouble time;
1162:   PetscLogDouble flops;
1163:   PetscLogDouble numMessages;
1164:   PetscLogDouble messageLength;
1165:   PetscLogDouble numReductions;
1166: } PetscSelfTimer;

1168: static PetscErrorCode PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1169: {
1170:   const int           stage = MAINSTAGE;
1171:   PetscStageLog       stageLog;
1172:   PetscEventRegInfo  *eventRegInfo;
1173:   PetscEventPerfInfo *eventPerfInfo;
1174:   PetscSelfTimer     *selftimes;
1175:   PetscSelfTimer     *totaltimes;
1176:   NestedEventId      *nstEvents;
1177:   int                 i, j, maxDefaultTimer;
1178:   NestedEventId       nst;
1179:   PetscLogEvent       dft;
1180:   int                 nstMax, nstMax_local;
1181:   MPI_Comm            comm;

1183:   PetscFunctionBegin;
1184:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));
1185:   PetscCall(PetscLogGetStageLog(&stageLog));
1186:   eventRegInfo  = stageLog->eventLog->eventInfo;
1187:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

1189:   /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1190:   maxDefaultTimer = 0;
1191:   for (i = 0; i < nNestedEvents; i++) {
1192:     int            nParents  = nestedEvents[i].nParents;
1193:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1194:     for (j = 0; j < nParents; j++) maxDefaultTimer = PetscMax(dftEvents[j], maxDefaultTimer);
1195:   }
1196:   PetscCall(PetscMalloc1(maxDefaultTimer + 1, &nstEvents));
1197:   for (dft = 0; dft < maxDefaultTimer; dft++) nstEvents[dft] = 0;
1198:   for (i = 0; i < nNestedEvents; i++) {
1199:     int            nParents  = nestedEvents[i].nParents;
1200:     NestedEventId  nstEvent  = nestedEvents[i].nstEvent;
1201:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1202:     for (j = 0; j < nParents; j++) nstEvents[dftEvents[j]] = nstEvent;
1203:   }

1205:   /* Calculate largest nested event-ID */
1206:   nstMax_local = 0;
1207:   for (i = 0; i < nNestedEvents; i++) nstMax_local = PetscMax(nestedEvents[i].nstEvent, nstMax_local);
1208:   PetscCall(MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm));

1210:   /* Initialize all total-times with zero */
1211:   PetscCall(PetscMalloc1(nstMax + 1, &selftimes));
1212:   PetscCall(PetscMalloc1(nstMax + 1, &totaltimes));
1213:   for (nst = 0; nst <= nstMax; nst++) {
1214:     totaltimes[nst].time          = 0;
1215:     totaltimes[nst].flops         = 0;
1216:     totaltimes[nst].numMessages   = 0;
1217:     totaltimes[nst].messageLength = 0;
1218:     totaltimes[nst].numReductions = 0;
1219:     totaltimes[nst].name          = NULL;
1220:   }

1222:   /* Calculate total-times */
1223:   for (i = 0; i < nNestedEvents; i++) {
1224:     const int            nParents  = nestedEvents[i].nParents;
1225:     const NestedEventId  nstEvent  = nestedEvents[i].nstEvent;
1226:     const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1227:     for (j = 0; j < nParents; j++) {
1228:       const PetscLogEvent dftEvent = dftEvents[j];
1229:       totaltimes[nstEvent].time += eventPerfInfo[dftEvent].time;
1230:       totaltimes[nstEvent].flops += eventPerfInfo[dftEvent].flops;
1231:       totaltimes[nstEvent].numMessages += eventPerfInfo[dftEvent].numMessages;
1232:       totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1233:       totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1234:     }
1235:     totaltimes[nstEvent].name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1236:   }

1238:   /* Initialize: self-times := totaltimes */
1239:   for (nst = 0; nst <= nstMax; nst++) selftimes[nst] = totaltimes[nst];

1241:   /* Subtract timed subprocesses from self-times */
1242:   for (i = 0; i < nNestedEvents; i++) {
1243:     const int            nParents         = nestedEvents[i].nParents;
1244:     const PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1245:     const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1246:     for (j = 0; j < nParents; j++) {
1247:       if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1248:         const PetscLogEvent dftEvent  = dftEvents[j];
1249:         const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1250:         selftimes[nstParent].time -= eventPerfInfo[dftEvent].time;
1251:         selftimes[nstParent].flops -= eventPerfInfo[dftEvent].flops;
1252:         selftimes[nstParent].numMessages -= eventPerfInfo[dftEvent].numMessages;
1253:         selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1254:         selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1255:       }
1256:     }
1257:   }

1259:   PetscCall(PetscFree(nstEvents));
1260:   PetscCall(PetscFree(totaltimes));

1262:   /* Set outputs */
1263:   *p_self   = selftimes;
1264:   *p_nstMax = nstMax;
1265:   PetscFunctionReturn(PETSC_SUCCESS);
1266: }

1268: static PetscErrorCode PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1269: {
1270:   int                i;
1271:   NestedEventId      nst;
1272:   PetscSortItem     *sortSelfTimes;
1273:   PetscLogDouble    *times, *maxTimes;
1274:   PetscStageLog      stageLog;
1275:   PetscEventRegInfo *eventRegInfo;
1276:   const int          dum_depth = 1, dum_count = 1, dum_parentcount = 1;
1277:   PetscBool          wasPrinted;
1278:   MPI_Comm           comm;

1280:   PetscFunctionBegin;
1281:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));
1282:   PetscCall(PetscLogGetStageLog(&stageLog));
1283:   eventRegInfo = stageLog->eventLog->eventInfo;

1285:   PetscCall(PetscMalloc1(nstMax + 1, &times));
1286:   PetscCall(PetscMalloc1(nstMax + 1, &maxTimes));
1287:   for (nst = 0; nst <= nstMax; nst++) times[nst] = selftimes[nst].time;
1288:   PetscCall(MPIU_Allreduce(times, maxTimes, nstMax + 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm));
1289:   PetscCall(PetscFree(times));

1291:   PetscCall(PetscMalloc1(nstMax + 1, &sortSelfTimes));

1293:   /* Sort the self-timers on basis of the largest time needed */
1294:   for (nst = 0; nst <= nstMax; nst++) {
1295:     sortSelfTimes[nst].id  = nst;
1296:     sortSelfTimes[nst].val = maxTimes[nst];
1297:   }
1298:   PetscCall(PetscFree(maxTimes));
1299:   qsort(sortSelfTimes, nstMax + 1, sizeof(PetscSortItem), compareSortItems);

1301:   PetscCall(PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings"));
1302:   PetscCall(PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f"));

1304:   for (i = 0; i <= nstMax; i++) {
1305:     if ((sortSelfTimes[i].val / totalTime) >= THRESHOLD) {
1306:       NestedEventId      nstEvent = sortSelfTimes[i].id;
1307:       const char        *name     = eventRegInfo[(PetscLogEvent)nstEvent].name;
1308:       PetscEventPerfInfo selfPerfInfo;

1310:       selfPerfInfo.time          = selftimes[nstEvent].time;
1311:       selfPerfInfo.flops         = selftimes[nstEvent].flops;
1312:       selfPerfInfo.numMessages   = selftimes[nstEvent].numMessages;
1313:       selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1314:       selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;

1316:       PetscCall(PetscLogNestedTreePrintLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted));
1317:       if (wasPrinted) PetscCall(PetscViewerXMLEndSection(viewer, "event"));
1318:     }
1319:   }
1320:   PetscCall(PetscViewerXMLEndSection(viewer, "selftimertable"));
1321:   PetscCall(PetscFree(sortSelfTimes));
1322:   PetscFunctionReturn(PETSC_SUCCESS);
1323: }

1325: PetscErrorCode PetscLogView_Nested(PetscViewer viewer)
1326: {
1327:   PetscLogDouble        locTotalTime, globTotalTime;
1328:   PetscNestedEventTree *tree       = NULL;
1329:   PetscSelfTimer       *selftimers = NULL;
1330:   int                   nTimers = 0, nstMax = 0;
1331:   MPI_Comm              comm;

1333:   PetscFunctionBegin;
1334:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));
1335:   PetscCall(PetscViewerInitASCII_XML(viewer));
1336:   PetscCall(PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n"));
1337:   PetscCall(PetscViewerXMLStartSection(viewer, "petscroot", NULL));

1339:   /* Get the total elapsed time, local and global maximum */
1340:   PetscCall(PetscTime(&locTotalTime));
1341:   locTotalTime -= petsc_BaseTime;
1342:   PetscCall(MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm));

1344:   /* Print global information about this run */
1345:   PetscCall(PetscPrintExeSpecs(viewer));
1346:   PetscCall(PetscPrintGlobalPerformance(viewer, locTotalTime));
1347:   /* Collect nested timer tree info from all processes */
1348:   PetscCall(PetscLogNestedTreeCreate(viewer, &tree, &nTimers));
1349:   PetscCall(PetscLogNestedTreePrintTop(viewer, tree, nTimers, globTotalTime));
1350:   PetscCall(PetscLogNestedTreeDestroy(tree, nTimers));

1352:   /* Calculate self-time for all (not-nested) events */
1353:   PetscCall(PetscCalcSelfTime(viewer, &selftimers, &nstMax));
1354:   PetscCall(PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime));
1355:   PetscCall(PetscFree(selftimers));

1357:   PetscCall(PetscViewerXMLEndSection(viewer, "petscroot"));
1358:   PetscCall(PetscViewerFinalASCII_XML(viewer));
1359:   PetscFunctionReturn(PETSC_SUCCESS);
1360: }

1362: /*
1363:  * Get the name of a nested event.
1364:  */
1365: static PetscErrorCode PetscGetNestedEventName(const PetscNestedEventTree *tree, int id, char **name)
1366: {
1367:   PetscStageLog stageLog;

1369:   PetscFunctionBegin;
1370:   PetscCall(PetscLogGetStageLog(&stageLog));
1371:   *name = stageLog->eventLog->eventInfo[(PetscLogEvent)tree[id].nstEvent].name;
1372:   PetscFunctionReturn(PETSC_SUCCESS);
1373: }

1375: /*
1376:  * Get the total time elapsed.
1377:  */
1378: static PetscErrorCode PetscGetTotalTime(const PetscViewer viewer, PetscLogDouble *totalTime)
1379: {
1380:   PetscLogDouble locTotalTime;
1381:   MPI_Comm       comm;

1383:   PetscFunctionBegin;
1384:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));
1385:   PetscCall(PetscTime(&locTotalTime));
1386:   locTotalTime -= petsc_BaseTime;
1387:   PetscCall(MPIU_Allreduce(&locTotalTime, totalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm));
1388:   PetscFunctionReturn(PETSC_SUCCESS);
1389: }

1391: /*
1392:  * Write a line to the flame graph output and then recurse into child events.
1393:  */
1394: static PetscErrorCode PetscLogNestedTreePrintFlamegraph(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime, PetscIntStack eventStack)
1395: {
1396:   int                depth = tree[iStart].depth, parentCount = 1, i, nChildren;
1397:   char              *name       = NULL;
1398:   PetscEventPerfInfo myPerfInfo = {0}, selfPerfInfo = {0}, otherPerfInfo = {0};
1399:   PetscLogDouble     countsPerCall = 0, locTime, globTime;
1400:   PetscSortItem     *children;
1401:   PetscStageLog      stageLog;
1402:   MPI_Comm           comm;

1404:   PetscFunctionBegin;
1405:   PetscCall(PetscLogGetStageLog(&stageLog));
1406:   PetscCall(PetscObjectGetComm((PetscObject)viewer, &comm));

1408:   /* Determine information about the child events as well as 'self' and 'other' */
1409:   PetscCall(PetscLogNestedTreeGetChildrenCount(tree, nTimers, iStart, depth, &nChildren));
1410:   PetscCall(PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, iStart, depth, nChildren, &children));
1411:   PetscCall(PetscLogNestedTreeSetSelfOtherPerfInfo(tree, iStart, totalTime, children, nChildren, &myPerfInfo, &selfPerfInfo, &otherPerfInfo, &parentCount, &countsPerCall));

1413:   /* Write line to the file. The time shown is 'self' + 'other' because each entry in the output
1414:    * is the total time spent in the event minus the amount spent in child events. */
1415:   locTime = selfPerfInfo.time + otherPerfInfo.time;
1416:   PetscCall(MPIU_Allreduce(&locTime, &globTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm));
1417:   if (globTime / totalTime > THRESHOLD && tree[iStart].own) {
1418:     /* Iterate over parent events in the stack and write them */
1419:     for (i = 0; i <= eventStack->top; i++) {
1420:       PetscCall(PetscGetNestedEventName(tree, eventStack->stack[i], &name));
1421:       PetscCall(PetscViewerASCIIPrintf(viewer, "%s;", name));
1422:     }
1423:     PetscCall(PetscGetNestedEventName(tree, iStart, &name));
1424:     /* The output is given as an integer in microseconds because otherwise the file cannot be read
1425:      * by apps such as speedscope (https://speedscope.app/). */
1426:     PetscCall(PetscViewerASCIIPrintf(viewer, "%s %" PetscInt64_FMT "\n", name, (PetscInt64)(globTime * 1e6)));
1427:   }

1429:   /* Add the current event to the parent stack and write the child events */
1430:   PetscCall(PetscIntStackPush(eventStack, iStart));
1431:   for (i = 0; i < nChildren; i++) PetscCall(PetscLogNestedTreePrintFlamegraph(viewer, tree, nTimers, children[i].id, totalTime, eventStack));
1432:   /* Pop the top item from the stack and immediately discard it */
1433:   {
1434:     int tmp;
1435:     PetscCall(PetscIntStackPop(eventStack, &tmp));
1436:   }
1437:   PetscFunctionReturn(PETSC_SUCCESS);
1438: }

1440: /*
1441:  * Print nested logging information to a file suitable for reading into a Flame Graph.
1442:  *
1443:  * The format consists of a semicolon-separated list of events and the event duration in microseconds (which must be an integer).
1444:  * An example output would look like:
1445:  *   MatAssemblyBegin 1
1446:  *   MatAssemblyEnd 10
1447:  *   MatView 302
1448:  *   KSPSetUp 98
1449:  *   KSPSetUp;VecSet 5
1450:  *   KSPSolve 150
1451:  *
1452:  * This option may be requested from the command line by passing in the flag `-log_view :<somefile>.txt:ascii_flamegraph`.
1453:  */
1454: PetscErrorCode PetscLogView_Flamegraph(PetscViewer viewer)
1455: {
1456:   int                   nTimers = 0, i, nChildren;
1457:   PetscIntStack         eventStack;
1458:   PetscLogDouble        totalTime;
1459:   PetscNestedEventTree *tree = NULL;
1460:   PetscSortItem        *children;

1462:   PetscFunctionBegin;
1463:   PetscCall(PetscGetTotalTime(viewer, &totalTime));
1464:   PetscCall(PetscLogNestedTreeCreate(viewer, &tree, &nTimers));
1465:   /* We use an integer stack to keep track of parent event IDs */
1466:   PetscCall(PetscIntStackCreate(&eventStack));

1468:   /* Initialize the child events and write them recursively */
1469:   PetscCall(PetscLogNestedTreeGetChildrenCount(tree, nTimers, -1, 0, &nChildren));
1470:   PetscCall(PetscLogNestedTreeSetChildrenSortItems(viewer, tree, nTimers, -1, 0, nChildren, &children));
1471:   for (i = 0; i < nChildren; i++) PetscCall(PetscLogNestedTreePrintFlamegraph(viewer, tree, nTimers, children[i].id, totalTime, eventStack));

1473:   PetscCall(PetscLogNestedTreeDestroy(tree, nTimers));
1474:   PetscCall(PetscIntStackDestroy(eventStack));
1475:   PetscFunctionReturn(PETSC_SUCCESS);
1476: }

1478: PETSC_EXTERN PetscErrorCode PetscASend(int count, int datatype)
1479: {
1480:   PetscFunctionBegin;
1481:   PetscCall(PetscAddLogDouble(&petsc_send_ct, &petsc_send_ct_th, 1));
1482:   #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO)
1483:   PetscCall(PetscMPITypeSize(count, MPI_Type_f2c((MPI_Fint)datatype), &petsc_send_len, &petsc_send_len_th));
1484:   #endif
1485:   PetscFunctionReturn(PETSC_SUCCESS);
1486: }

1488: PETSC_EXTERN PetscErrorCode PetscARecv(int count, int datatype)
1489: {
1490:   PetscFunctionBegin;
1491:   PetscCall(PetscAddLogDouble(&petsc_recv_ct, &petsc_recv_ct_th, 1));
1492:   #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO)
1493:   PetscCall(PetscMPITypeSize(count, MPI_Type_f2c((MPI_Fint)datatype), &petsc_recv_len, &petsc_recv_len_th));
1494:   #endif
1495:   PetscFunctionReturn(PETSC_SUCCESS);
1496: }

1498: PETSC_EXTERN PetscErrorCode PetscAReduce(void)
1499: {
1500:   PetscFunctionBegin;
1501:   PetscCall(PetscAddLogDouble(&petsc_allreduce_ct, &petsc_allreduce_ct_th, 1));
1502:   PetscFunctionReturn(PETSC_SUCCESS);
1503: }

1505: #endif