Actual source code: xmllogevent.c
petsc-3.15.0 2021-03-30
1: /*************************************************************************************
2: * M A R I T I M E R E S E A R C H I N S T I T U T E N E T H E R L A N D S *
3: *************************************************************************************
4: * authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij *
5: *************************************************************************************
6: * content: Support for nested PetscTimers *
7: *************************************************************************************/
8: #include <petsclog.h>
9: #include <petsc/private/logimpl.h>
10: #include <petsctime.h>
11: #include <petscviewer.h>
12: #include "../src/sys/logging/xmlviewer.h"
14: #if defined(PETSC_USE_LOG)
16: /*
17: * Support for nested PetscTimers
18: *
19: * PetscTimers keep track of a lot of useful information: Wall clock times,
20: * message passing statistics, flop counts. Information about the nested structure
21: * of the timers is lost. Example:
22: *
23: * 7:30 Start: awake
24: * 7:30 Start: morning routine
25: * 7:40 Start: eat
26: * 7:49 Done: eat
27: * 7:43 Done: morning routine
28: * 8:15 Start: work
29: * 12:15 Start: eat
30: * 12:45 Done: eat
31: * 16:00 Done: work
32: * 16:30 Start: evening routine
33: * 18:30 Start: eat
34: * 19:15 Done: eat
35: * 22:00 Done: evening routine
36: * 22:00 Done: awake
37: *
38: * Petsc timers provide the following timer results:
39: *
40: * awake: 1 call 14:30 hours
41: * morning routine: 1 call 0:13 hours
42: * eat: 3 calls 1:24 hours
43: * work: 1 call 7:45 hours
44: * evening routine 1 call 5:30 hours
45: *
46: * Nested timers can be used to get the following table:
47: *
48: * [1 call]: awake 14:30 hours
49: * [1 call]: morning routine 0:13 hours ( 2 % of awake)
50: * [1 call]: eat 0:09 hours (69 % of morning routine)
51: * rest (morning routine) 0:04 hours (31 % of morning routine)
52: * [1 call]: work 7:45 hours (53 % of awake)
53: * [1 call]: eat 0:30 hours ( 6 % of work)
54: * rest (work) 7:15 hours (94 % of work)
55: * [1 call]: evening routine 5:30 hours (38 % of awake)
56: * [1 call]: eat 0:45 hours (14 % of evening routine)
57: * rest (evening routine) 4:45 hours (86 % of morning routine)
58: *
59: * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least,
60: * the nested timers make the stages unnecessary.
61: *
62: */
64: /*
65: * Data structures for keeping track of nested timers:
66: *
67: * nestedEvents: information about the timers that have actually been activated
68: * dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
69: *
70: * The Default-timers are used to time the nested timers. Every nested timer corresponds to
71: * (one or more) default timers, where one of the default timers has the same event-id as the
72: * nested one.
73: *
74: * Because of the risk of confusion between nested timer ids and default timer ids, we
75: * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent
76: * only for default events. Also, all nested event variables are prepended with 'nst', and
77: * default timers with 'dft'.
78: */
80: #define DFT_ID_AWAKE -1
82: typedef PetscLogEvent NestedEventId;
83: typedef struct {
84: NestedEventId nstEvent; /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
85: int nParents; /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
86: PetscLogEvent *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
87: PetscLogEvent *dftEvents; /* The default timers which represent the different 'instances' of this nested event */
89: PetscLogEvent *dftParents; /* The default timers which were the dftParentActive when this nested event was started */
90: PetscLogEvent *dftEventsSorted; /* The default timers which represent the different 'instances' of this nested event */
91: } PetscNestedEvent;
93: static PetscLogEvent dftParentActive = DFT_ID_AWAKE;
94: static int nNestedEvents = 0;
95: static int nNestedEventsAllocated = 0;
96: static PetscNestedEvent *nestedEvents = NULL;
97: static PetscLogDouble thresholdTime = 0.01; /* initial value was 0.1 */
99: #define THRESHOLD (thresholdTime/100.0+1e-12)
101: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
102: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
103: PETSC_INTERN PetscErrorCode PetscLogView_Nested(PetscViewer);
104: PETSC_INTERN PetscErrorCode PetscLogView_Flamegraph(PetscViewer);
107: /*@C
108: PetscLogNestedBegin - Turns on nested logging of objects and events. This logs flop
109: rates and object creation and should not slow programs down too much.
111: Logically Collective over PETSC_COMM_WORLD
113: Options Database Keys:
114: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file
116: Usage:
117: .vb
118: PetscInitialize(...);
119: PetscLogNestedBegin();
120: ... code ...
121: PetscLogView(viewer);
122: PetscFinalize();
123: .ve
125: Level: advanced
127: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin()
128: @*/
129: PetscErrorCode PetscLogNestedBegin(void)
130: {
134: if (nestedEvents) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"nestedEvents already allocated");
136: nNestedEventsAllocated = 10;
137: PetscMalloc1(nNestedEventsAllocated,&nestedEvents);
138: dftParentActive = DFT_ID_AWAKE;
139: nNestedEvents =1;
141: /* 'Awake' is nested event 0. It has no parents */
142: nestedEvents[0].nstEvent = 0;
143: nestedEvents[0].nParents = 0;
144: nestedEvents[0].dftParentsSorted = NULL;
145: nestedEvents[0].dftEvents = NULL;
146: nestedEvents[0].dftParents = NULL;
147: nestedEvents[0].dftEventsSorted = NULL;
149: PetscLogSet(PetscLogEventBeginNested,PetscLogEventEndNested);
150: return(0);
151: }
153: /* Delete the data structures for the nested timers */
154: PetscErrorCode PetscLogNestedEnd(void)
155: {
157: int i;
160: if (!nestedEvents) return(0);
161: for (i=0; i<nNestedEvents; i++) {
162: PetscFree4(nestedEvents[i].dftParentsSorted,nestedEvents[i].dftEventsSorted,nestedEvents[i].dftParents,nestedEvents[i].dftEvents);
163: }
164: PetscFree(nestedEvents);
165: nestedEvents = NULL;
166: nNestedEvents = 0;
167: nNestedEventsAllocated = 0;
168: return(0);
169: }
172: /*
173: UTILITIES: FIND STUFF IN SORTED ARRAYS
175: dftIndex - index to be found
176: dftArray - sorted array of PetscLogEvent-ids
177: narray - dimension of dftArray
178: entry - entry in the array where dftIndex may be found;
180: if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray
181: In that case, the dftIndex can be inserted at this entry.
182: */
183: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex,const PetscLogEvent *dftArray,int narray,int *entry)
184: {
186: if (narray==0 || dftIndex <= dftArray[0]) {
187: *entry = 0;
188: } else if (dftIndex > dftArray[narray-1]) {
189: *entry = narray;
190: } else {
191: int ihigh = narray-1, ilow=0;
192: while (ihigh>ilow) {
193: const int imiddle = (ihigh+ilow)/2;
194: if (dftArray[imiddle] > dftIndex) {
195: ihigh = imiddle;
196: } else if (dftArray[imiddle]<dftIndex) {
197: ilow = imiddle+1;
198: } else {
199: ihigh = imiddle;
200: ilow = imiddle;
201: }
202: }
203: *entry = ihigh;
204: }
205: return(0);
206: }
208: /*
209: Utility: find the nested event with given identification
211: nstEvent - Nested event to be found
212: entry - entry in the nestedEvents where nstEvent may be found;
214: if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray
215: */
216: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent,int *entry)
217: {
219: if (nNestedEvents==0 || nstEvent <= nestedEvents[0].nstEvent) {
220: *entry = 0;
221: } else if (nstEvent > nestedEvents[nNestedEvents-1].nstEvent) {
222: *entry = nNestedEvents;
223: } else {
224: int ihigh = nNestedEvents-1, ilow = 0;
225: while (ihigh>ilow) {
226: const int imiddle = (ihigh+ilow)/2;
227: if (nestedEvents[imiddle].nstEvent > nstEvent) {
228: ihigh = imiddle;
229: } else if (nestedEvents[imiddle].nstEvent<nstEvent) {
230: ilow = imiddle+1;
231: } else {
232: ihigh = imiddle;
233: ilow = imiddle;
234: }
235: }
236: *entry = ihigh;
237: }
238: return(0);
239: }
241: /*
242: Nested logging is not prepared yet to support user-defined logging stages, so for now we force logging on the main stage.
243: Using PetscLogStage{Push/Pop}() would be more appropriate, but these two calls do extra bookkeeping work we don't need.
244: */
246: #define MAINSTAGE 0
248: static PetscLogStage savedStage = 0;
250: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageOverride(void)
251: {
252: PetscStageLog stageLog = petsc_stageLog;
256: if (stageLog->curStage == MAINSTAGE) return(0);
257: savedStage = stageLog->curStage;
258: stageLog->curStage = MAINSTAGE;
259: PetscIntStackPush(stageLog->stack, MAINSTAGE);
260: return(0);
261: }
263: PETSC_STATIC_INLINE PetscErrorCode PetscLogStageRestore(void)
264: {
265: PetscStageLog stageLog = petsc_stageLog;
269: if (savedStage == MAINSTAGE) return(0);
270: stageLog->curStage = savedStage;
271: PetscIntStackPop(stageLog->stack, &savedStage);
272: return(0);
273: }
275: /******************************************************************************************/
276: /* Start a nested event */
277: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
278: {
279: PetscErrorCode ierr;
280: int entry, pentry, tentry,i;
281: PetscLogEvent dftEvent;
284: PetscLogEventFindNestedTimer(nstEvent, &entry);
285: if (entry>=nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
286: /* Nested event doesn't exist yet: create it */
288: if (nNestedEvents==nNestedEventsAllocated) {
289: /* Enlarge and re-allocate nestedEvents if needed */
290: PetscNestedEvent *tmp = nestedEvents;
291: PetscMalloc1(2*nNestedEvents,&nestedEvents);
292: nNestedEventsAllocated*=2;
293: PetscArraycpy(nestedEvents, tmp, nNestedEvents);
294: PetscFree(tmp);
295: }
297: /* Clear space in nestedEvents for new nested event */
298: nNestedEvents++;
299: for (i = nNestedEvents-1; i>entry; i--) {
300: nestedEvents[i] = nestedEvents[i-1];
301: }
303: /* Create event in nestedEvents */
304: nestedEvents[entry].nstEvent = nstEvent;
305: nestedEvents[entry].nParents=1;
306: PetscMalloc4(1,&nestedEvents[entry].dftParentsSorted,1,&nestedEvents[entry].dftEventsSorted,1,&nestedEvents[entry].dftParents,1,&nestedEvents[entry].dftEvents);
308: /* Fill in new event */
309: pentry = 0;
310: dftEvent = (PetscLogEvent) nstEvent;
312: nestedEvents[entry].nstEvent = nstEvent;
313: nestedEvents[entry].dftParents[pentry] = dftParentActive;
314: nestedEvents[entry].dftEvents[pentry] = dftEvent;
315: nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
316: nestedEvents[entry].dftEventsSorted[pentry] = dftEvent;
318: } else {
319: /* Nested event exists: find current dftParentActive among parents */
320: PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
321: PetscLogEvent *dftEvents = nestedEvents[entry].dftEvents;
322: int nParents = nestedEvents[entry].nParents;
324: PetscLogEventFindDefaultTimer( dftParentActive, dftParentsSorted, nParents, &pentry);
326: if (pentry>=nParents || dftParentActive != dftParentsSorted[pentry]) {
327: /* dftParentActive not in the list: add it to the list */
328: int i;
329: PetscLogEvent *dftParents = nestedEvents[entry].dftParents;
330: PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
331: char name[100];
333: /* Register a new default timer */
334: sprintf(name, "%d -> %d", (int) dftParentActive, (int) nstEvent);
335: PetscLogEventRegister(name, 0, &dftEvent);
336: PetscLogEventFindDefaultTimer( dftEvent, dftEventsSorted, nParents, &tentry);
338: /* Reallocate parents and dftEvents to make space for new parent */
339: PetscMalloc4(1+nParents,&nestedEvents[entry].dftParentsSorted,1+nParents,&nestedEvents[entry].dftEventsSorted,1+nParents,&nestedEvents[entry].dftParents,1+nParents,&nestedEvents[entry].dftEvents);
340: PetscArraycpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents);
341: PetscArraycpy(nestedEvents[entry].dftEventsSorted, dftEventsSorted, nParents);
342: PetscArraycpy(nestedEvents[entry].dftParents, dftParents, nParents);
343: PetscArraycpy(nestedEvents[entry].dftEvents, dftEvents, nParents);
344: PetscFree4(dftParentsSorted,dftEventsSorted,dftParents,dftEvents);
346: dftParents = nestedEvents[entry].dftParents;
347: dftEvents = nestedEvents[entry].dftEvents;
348: dftParentsSorted = nestedEvents[entry].dftParentsSorted;
349: dftEventsSorted = nestedEvents[entry].dftEventsSorted;
351: nestedEvents[entry].nParents++;
352: nParents++;
354: for (i = nParents-1; i>pentry; i--) {
355: dftParentsSorted[i] = dftParentsSorted[i-1];
356: dftEvents[i] = dftEvents[i-1];
357: }
358: for (i = nParents-1; i>tentry; i--) {
359: dftParents[i] = dftParents[i-1];
360: dftEventsSorted[i] = dftEventsSorted[i-1];
361: }
363: /* Fill in the new default timer */
364: dftParentsSorted[pentry] = dftParentActive;
365: dftEvents[pentry] = dftEvent;
366: dftParents[tentry] = dftParentActive;
367: dftEventsSorted[tentry] = dftEvent;
369: } else {
370: /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
371: dftEvent = nestedEvents[entry].dftEvents[pentry];
372: }
373: }
375: /* Start the default 'dftEvent'-timer and update the dftParentActive */
376: PetscLogStageOverride();
377: PetscLogEventBeginDefault(dftEvent,t,o1,o2,o3,o4);
378: PetscLogStageRestore();
379: dftParentActive = dftEvent;
380: return(0);
381: }
383: /* End a nested event */
384: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
385: {
386: PetscErrorCode ierr;
387: int entry, pentry, nParents;
388: PetscLogEvent *dftEventsSorted;
391: /* Find the nested event */
392: PetscLogEventFindNestedTimer(nstEvent, &entry);
393: if (entry>=nNestedEvents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d larger than number of events %d",entry,nNestedEvents);
394: if (nestedEvents[entry].nstEvent != nstEvent) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event %d had unbalanced begin/end pairs does not match %d",entry,nstEvent);
395: dftEventsSorted = nestedEvents[entry].dftEventsSorted;
396: nParents = nestedEvents[entry].nParents;
398: /* Find the current default timer among the 'dftEvents' of this event */
399: PetscLogEventFindDefaultTimer( dftParentActive, dftEventsSorted, nParents, &pentry);
401: if (pentry>=nParents) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Entry %d is larger than number of parents %d",pentry,nParents);
402: if (dftEventsSorted[pentry] != dftParentActive) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Active parent is %d, but we seem to be closing %d",dftParentActive,dftEventsSorted[pentry]);
404: /* Stop the default timer and update the dftParentActive */
405: PetscLogStageOverride();
406: PetscLogEventEndDefault(dftParentActive,t,o1,o2,o3,o4);
407: PetscLogStageRestore();
408: dftParentActive = nestedEvents[entry].dftParents[pentry];
409: return(0);
410: }
412: /*@
413: PetscLogSetThreshold - Set the threshold time for logging the events; this is a percentage out of 100, so 1. means any event
414: that takes 1 or more percent of the time.
416: Logically Collective over PETSC_COMM_WORLD
418: Input Parameter:
419: . newThresh - the threshold to use
421: Output Parameter:
422: . oldThresh - the previously set threshold value
424: Options Database Keys:
425: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file
427: Usage:
428: .vb
429: PetscInitialize(...);
430: PetscLogNestedBegin();
431: PetscLogSetThreshold(0.1,&oldthresh);
432: ... code ...
433: PetscLogView(viewer);
434: PetscFinalize();
435: .ve
437: Level: advanced
439: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin(),
440: PetscLogNestedBegin()
441: @*/
442: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
443: {
445: if (oldThresh) *oldThresh = thresholdTime;
446: if (newThresh == PETSC_DECIDE) newThresh = 0.01;
447: if (newThresh == PETSC_DEFAULT) newThresh = 0.01;
448: thresholdTime = PetscMax(newThresh, 0.0);
449: return(0);
450: }
452: static PetscErrorCode PetscPrintExeSpecs(PetscViewer viewer)
453: {
454: PetscErrorCode ierr;
455: char arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128];
456: char version[256], buildoptions[128] = "";
457: PetscMPIInt size;
458: size_t len;
461: MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
462: PetscGetArchType(arch,sizeof(arch));
463: PetscGetHostName(hostname,sizeof(hostname));
464: PetscGetUserName(username,sizeof(username));
465: PetscGetProgramName(pname,sizeof(pname));
466: PetscGetDate(date,sizeof(date));
467: PetscGetVersion(version,sizeof(version));
469: PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
470: PetscViewerXMLPutString( viewer, "executable" , "Executable" , pname);
471: PetscViewerXMLPutString( viewer, "architecture", "Architecture" , arch);
472: PetscViewerXMLPutString( viewer, "hostname" , "Host" , hostname);
473: PetscViewerXMLPutInt( viewer, "nprocesses" , "Number of processes", size);
474: PetscViewerXMLPutString( viewer, "user" , "Run by user" , username);
475: PetscViewerXMLPutString( viewer, "date" , "Started at" , date);
476: PetscViewerXMLPutString( viewer, "petscrelease", "Petsc Release", version);
478: if (PetscDefined(USE_DEBUG)) {
479: PetscStrlcat(buildoptions, "Debug ", sizeof(buildoptions));
480: }
481: if (PetscDefined(USE_COMPLEX)) {
482: PetscStrlcat(buildoptions, "Complex ", sizeof(buildoptions));
483: }
484: if (PetscDefined(USE_REAL_SINGLE)) {
485: PetscStrlcat(buildoptions, "Single ", sizeof(buildoptions));
486: } else if (PetscDefined(USE_REAL___FLOAT128)) {
487: PetscStrlcat(buildoptions, "Quadruple ", sizeof(buildoptions));
488: } else if (PetscDefined(USE_REAL___FP16)) {
489: PetscStrlcat(buildoptions, "Half ", sizeof(buildoptions));
490: }
491: if (PetscDefined(USE_64BIT_INDICES)) {
492: PetscStrlcat(buildoptions, "Int64 ", sizeof(buildoptions));
493: }
494: #if defined(__cplusplus)
495: PetscStrlcat(buildoptions, "C++ ", sizeof(buildoptions));
496: #endif
497: PetscStrlen(buildoptions,&len);
498: if (len) {
499: PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
500: }
501: PetscViewerXMLEndSection(viewer, "runspecification");
502: return(0);
503: }
505: /* Print the global performance: max, max/min, average and total of
506: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
507: */
508: static PetscErrorCode PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble local_val, const PetscBool print_average, const PetscBool print_total)
509: {
510: PetscErrorCode ierr;
511: PetscLogDouble min, tot, ratio, avg;
512: MPI_Comm comm;
513: PetscMPIInt rank, size;
514: PetscLogDouble valrank[2], max[2];
517: PetscObjectGetComm((PetscObject)viewer,&comm);
518: MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
519: MPI_Comm_rank(comm, &rank);
521: valrank[0] = local_val;
522: valrank[1] = (PetscLogDouble) rank;
523: MPIU_Allreduce(&local_val, &min, 1, MPIU_PETSCLOGDOUBLE, MPI_MIN, comm);
524: MPIU_Allreduce(valrank, &max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
525: MPIU_Allreduce(&local_val, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
526: avg = tot/((PetscLogDouble) size);
527: if (min != 0.0) ratio = max[0]/min;
528: else ratio = 0.0;
530: PetscViewerXMLStartSection(viewer, name, desc);
531: PetscViewerXMLPutDouble(viewer, "max", NULL, max[0], "%e");
532: PetscViewerXMLPutInt( viewer, "maxrank" , "rank at which max was found" , (PetscMPIInt) max[1]);
533: PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
534: if (print_average) {
535: PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
536: }
537: if (print_total) {
538: PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
539: }
540: PetscViewerXMLEndSection(viewer, name);
541: return(0);
542: }
544: /* Print the global performance: max, max/min, average and total of
545: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
546: */
547: static PetscErrorCode PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
548: {
549: PetscErrorCode ierr;
550: PetscLogDouble flops, mem, red, mess;
551: const PetscBool print_total_yes = PETSC_TRUE,
552: print_total_no = PETSC_FALSE,
553: print_average_no = PETSC_FALSE,
554: print_average_yes = PETSC_TRUE;
557: /* Must preserve reduction count before we go on */
558: red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;
560: /* Calculate summary information */
561: PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");
563: /* Time */
564: PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", locTotalTime, print_average_yes, print_total_no);
566: /* Objects */
567: PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", (PetscLogDouble) petsc_numObjects, print_average_yes, print_total_no);
569: /* Flop */
570: PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", petsc_TotalFlops/1.0E6, print_average_yes, print_total_yes);
572: /* Flop/sec -- Must talk to Barry here */
573: if (locTotalTime != 0.0) flops = petsc_TotalFlops/locTotalTime;
574: else flops = 0.0;
575: PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", flops/1.0E6, print_average_yes, print_total_yes);
577: /* Memory */
578: PetscMallocGetMaximumUsage(&mem);
579: if (mem > 0.0) {
580: PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", mem/1024.0/1024.0, print_average_yes, print_total_yes);
581: }
582: /* Messages */
583: mess = 0.5*(petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
584: PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", mess, print_average_yes, print_total_yes);
586: /* Message Volume */
587: mess = 0.5*(petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
588: PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", mess/1024.0/1024.0, print_average_yes, print_total_yes);
590: /* Reductions */
591: PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", red , print_average_no, print_total_no);
592: PetscViewerXMLEndSection(viewer, "globalperformance");
593: return(0);
594: }
596: typedef struct {
597: PetscLogEvent dftEvent;
598: NestedEventId nstEvent;
599: PetscLogEvent dftParent;
600: NestedEventId nstParent;
601: PetscBool own;
602: int depth;
603: NestedEventId* nstPath;
604: } PetscNestedEventTree;
606: /* Compare timers to sort them in the tree */
607: static int compareTreeItems(const void *item1_, const void *item2_)
608: {
609: int i;
610: PetscNestedEventTree *item1 = (PetscNestedEventTree *) item1_;
611: PetscNestedEventTree *item2 = (PetscNestedEventTree *) item2_;
613: for (i=0; i<PetscMin(item1->depth,item2->depth); i++) {
614: if (item1->nstPath[i]<item2->nstPath[i]) return -1;
615: if (item1->nstPath[i]>item2->nstPath[i]) return +1;
616: }
617: if (item1->depth < item2->depth) return -1;
618: if (item1->depth > item2->depth) return 1;
619: return 0;
620: }
621: /*
622: * Do MPI communication to get the complete, nested calling tree for all processes: there may be
623: * calls that happen in some processes, but not in others.
624: *
625: * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
626: * The tree is sorted so that the timers can be printed in the order of appearance.
627: *
628: * For tree-items which appear in the trees of multiple processes (which will be most items), the
629: * following rule is followed:
630: * + if information from my own process is available, then that is the information stored in tree.
631: * otherwise it is some other process's information.
632: */
633: static PetscErrorCode PetscLogNestedTreeCreate(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
634: {
635: PetscNestedEventTree *tree = NULL, *newTree;
636: int *treeIndices;
637: int nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
638: int yesno;
639: PetscBool done;
640: PetscErrorCode ierr;
641: int maxdepth;
642: int depth;
643: int illegalEvent;
644: int iextra;
645: NestedEventId *nstPath, *nstMyPath;
646: MPI_Comm comm;
649: PetscObjectGetComm((PetscObject)viewer,&comm);
651: /* Calculate memory needed to store everybody's information and allocate tree */
652: nTimers = 0;
653: for (i=0; i<nNestedEvents; i++) nTimers += nestedEvents[i].nParents;
655: PetscMalloc1(nTimers,&tree);
657: /* Fill tree with readily available information */
658: iTimer0 = 0;
659: maxDefaultTimer =0;
660: for (i=0; i<nNestedEvents; i++) {
661: int nParents = nestedEvents[i].nParents;
662: NestedEventId nstEvent = nestedEvents[i].nstEvent;
663: PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
664: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
665: for (j=0; j<nParents; j++) {
666: maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
668: tree[iTimer0+j].dftEvent = dftEvents[j];
669: tree[iTimer0+j].nstEvent = nstEvent;
670: tree[iTimer0+j].dftParent = dftParentsSorted[j];
671: tree[iTimer0+j].own = PETSC_TRUE;
673: tree[iTimer0+j].nstParent = 0;
674: tree[iTimer0+j].depth = 0;
675: tree[iTimer0+j].nstPath = NULL;
676: }
677: iTimer0 += nParents;
678: }
680: /* Calculate the global maximum for the default timer index, so array treeIndices can
681: * be allocated only once */
682: MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
683: maxDefaultTimer = j;
685: /* Find default timer's place in the tree */
686: PetscCalloc1(maxDefaultTimer+1,&treeIndices);
687: treeIndices[0] = 0;
688: for (i=0; i<nTimers; i++) {
689: PetscLogEvent dftEvent = tree[i].dftEvent;
690: treeIndices[dftEvent] = i;
691: }
693: /* Find each dftParent's nested identification */
694: for (i=0; i<nTimers; i++) {
695: PetscLogEvent dftParent = tree[i].dftParent;
696: if (dftParent!= DFT_ID_AWAKE) {
697: int j = treeIndices[dftParent];
698: tree[i].nstParent = tree[j].nstEvent;
699: }
700: }
702: /* Find depths for each timer path */
703: done = PETSC_FALSE;
704: maxdepth = 0;
705: while (!done) {
706: done = PETSC_TRUE;
707: for (i=0; i<nTimers; i++) {
708: if (tree[i].dftParent == DFT_ID_AWAKE) {
709: tree[i].depth = 1;
710: maxdepth = PetscMax(1,maxdepth);
711: } else {
712: int j = treeIndices[tree[i].dftParent];
713: depth = 1+tree[j].depth;
714: if (depth>tree[i].depth) {
715: done = PETSC_FALSE;
716: tree[i].depth = depth;
717: maxdepth = PetscMax(depth,maxdepth);
718: }
719: }
720: }
721: }
723: /* Allocate the paths in the entire tree */
724: for (i=0; i<nTimers; i++) {
725: depth = tree[i].depth;
726: PetscCalloc1(depth,&tree[i].nstPath);
727: }
729: /* Calculate the paths for all timers */
730: for (depth=1; depth<=maxdepth; depth++) {
731: for (i=0; i<nTimers; i++) {
732: if (tree[i].depth==depth) {
733: if (depth>1) {
734: int j = treeIndices[tree[i].dftParent];
735: PetscArraycpy(tree[i].nstPath,tree[j].nstPath,depth-1);
736: }
737: tree[i].nstPath[depth-1] = tree[i].nstEvent;
738: }
739: }
740: }
741: PetscFree(treeIndices);
743: /* Sort the tree on basis of the paths */
744: qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);
746: /* Allocate an array to store paths */
747: depth = maxdepth;
748: MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
749: PetscMalloc1(maxdepth+1, &nstPath);
750: PetscMalloc1(maxdepth+1, &nstMyPath);
752: /* Find an illegal nested event index (1+largest nested event index) */
753: illegalEvent = 1+nestedEvents[nNestedEvents-1].nstEvent;
754: i = illegalEvent;
755: MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);
757: /* First, detect timers which are not available in this process, but are available in others
758: * Allocate a new tree, that can contain all timers
759: * Then, fill the new tree with all (own and not-own) timers */
760: newTree= NULL;
761: for (yesno=0; yesno<=1; yesno++) {
762: depth = 1;
763: i = 0;
764: iextra = 0;
765: while (depth>0) {
766: int j;
767: PetscBool same;
769: /* Construct the next path in this process's tree:
770: * if necessary, supplement with invalid path entries */
771: depth++;
772: if (depth > maxdepth + 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Depth %d > maxdepth+1 %d",depth,maxdepth+1);
773: if (i<nTimers) {
774: for (j=0; j<tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
775: for (j=tree[i].depth; j<depth; j++) nstMyPath[j] = illegalEvent;
776: } else {
777: for (j=0; j<depth; j++) nstMyPath[j] = illegalEvent;
778: }
780: /* Communicate with other processes to obtain the next path and its depth */
781: MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm);
782: for (j=depth-1; (int) j>=0; j--) {
783: if (nstPath[j]==illegalEvent) depth=j;
784: }
786: if (depth>0) {
787: /* If the path exists */
789: /* check whether the next path is the same as this process's next path */
790: same = PETSC_TRUE;
791: for (j=0; same && j<depth; j++) { same = (same && nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;}
793: if (same) {
794: /* Register 'own path' */
795: if (newTree) newTree[i+iextra] = tree[i];
796: i++;
797: } else {
798: /* Register 'not an own path' */
799: if (newTree) {
800: newTree[i+iextra].nstEvent = nstPath[depth-1];
801: newTree[i+iextra].own = PETSC_FALSE;
802: newTree[i+iextra].depth = depth;
803: PetscMalloc1(depth, &newTree[i+iextra].nstPath);
804: for (j=0; j<depth; j++) {newTree[i+iextra].nstPath[j] = nstPath[j];}
806: newTree[i+iextra].dftEvent = 0;
807: newTree[i+iextra].dftParent = 0;
808: newTree[i+iextra].nstParent = 0;
809: }
810: iextra++;
811: }
813: }
814: }
816: /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
817: totalNTimers = nTimers + iextra;
818: if (!newTree) {
819: PetscMalloc1(totalNTimers, &newTree);
820: }
821: }
822: PetscFree(nstPath);
823: PetscFree(nstMyPath);
824: PetscFree(tree);
825: tree = newTree;
826: newTree = NULL;
828: /* Set return value and return */
829: *p_tree = tree;
830: *p_nTimers = totalNTimers;
831: return(0);
832: }
834: /*
835: * Delete the nested timer tree
836: */
837: static PetscErrorCode PetscLogNestedTreeDestroy(PetscNestedEventTree *tree, int nTimers)
838: {
839: int i;
840: PetscErrorCode ierr;
843: for (i=0; i<nTimers; i++) {
844: PetscFree(tree[i].nstPath);
845: }
846: PetscFree(tree);
847: return(0);
848: }
850: /* Print the global performance: max, max/min, average and total of
851: * time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
852: */
853: static PetscErrorCode PetscPrintXMLNestedLinePerfResults(PetscViewer viewer,const char *name,PetscLogDouble value,PetscLogDouble minthreshold,PetscLogDouble maxthreshold,PetscLogDouble minmaxtreshold)
854: {
855: MPI_Comm comm; /* MPI communicator in reduction */
856: PetscMPIInt rank; /* rank of this process */
857: PetscLogDouble val_in[2], max[2], min[2];
858: PetscLogDouble minvalue, maxvalue, tot;
859: PetscMPIInt size;
860: PetscMPIInt minLoc, maxLoc;
864: PetscObjectGetComm((PetscObject)viewer,&comm);
865: MPI_Comm_size(comm, &size);
866: MPI_Comm_rank(comm, &rank);
867: val_in[0] = value;
868: val_in[1] = (PetscLogDouble) rank;
869: MPIU_Allreduce(val_in, max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
870: MPIU_Allreduce(val_in, min, 1, MPIU_2PETSCLOGDOUBLE, MPI_MINLOC, comm);
871: maxvalue = max[0];
872: maxLoc = (PetscMPIInt) max[1];
873: minvalue = min[0];
874: minLoc = (PetscMPIInt) min[1];
875: MPIU_Allreduce(&value, &tot, 1, MPIU_PETSCLOGDOUBLE, MPI_SUM, comm);
877: if (maxvalue<maxthreshold && minvalue>=minthreshold) {
878: /* One call per parent or NO value: don't print */
879: } else {
880: PetscViewerXMLStartSection(viewer, name, NULL);
881: if (maxvalue>minvalue*minmaxtreshold) {
882: PetscViewerXMLPutDouble(viewer, "avgvalue", NULL, tot/size, "%g");
883: PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%g");
884: PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%g");
885: PetscViewerXMLPutInt( viewer, "minloc" , NULL, minLoc);
886: PetscViewerXMLPutInt( viewer, "maxloc" , NULL, maxLoc);
887: } else {
888: PetscViewerXMLPutDouble(viewer, "value", NULL, tot/size, "%g");
889: }
890: PetscViewerXMLEndSection(viewer, name);
891: }
892: return(0);
893: }
895: #define N_COMM 8
896: static PetscErrorCode PetscLogNestedTreePrintLine(PetscViewer viewer,PetscEventPerfInfo perfInfo,PetscLogDouble countsPerCall,int parentCount,int depth,const char *name,PetscLogDouble totalTime,PetscBool *isPrinted)
897: {
898: PetscLogDouble time = perfInfo.time;
899: PetscLogDouble timeMx;
901: MPI_Comm comm;
904: PetscObjectGetComm((PetscObject)viewer,&comm);
905: MPIU_Allreduce(&time, &timeMx, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
906: *isPrinted = ((timeMx/totalTime) >= THRESHOLD) ? PETSC_TRUE : PETSC_FALSE;
907: if (*isPrinted) {
908: PetscViewerXMLStartSection(viewer, "event", NULL);
909: PetscViewerXMLPutString(viewer, "name", NULL, name);
910: PetscPrintXMLNestedLinePerfResults(viewer, "time", time/totalTime*100.0, 0, 0, 1.02);
911: PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", parentCount>0 ? countsPerCall : 0, 0.99, 1.01, 1.02);
912: PetscPrintXMLNestedLinePerfResults(viewer, "mflops", time>=timeMx*0.001 ? 1e-6*perfInfo.flops/time : 0, 0, 0.01, 1.05);
913: PetscPrintXMLNestedLinePerfResults(viewer, "mbps",time>=timeMx*0.001 ? perfInfo.messageLength/(1024*1024*time) : 0, 0, 0.01, 1.05);
914: PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", time>=timeMx*0.001 ? perfInfo.numReductions/time : 0, 0, 0.01, 1.05);
915: }
916: return(0);
917: }
919: /* Count the number of times the parent event was called */
921: static int countParents( const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
922: {
923: if (tree[i].depth<=1) {
924: return 1; /* Main event: only once */
925: } else if (!tree[i].own) {
926: return 1; /* This event didn't happen in this process, but did in another */
927: } else {
928: int iParent;
929: for (iParent=i-1; iParent>=0; iParent--) {
930: if (tree[iParent].depth == tree[i].depth-1) break;
931: }
932: if (tree[iParent].depth != tree[i].depth-1) {
933: /* ***** Internal error: cannot find parent */
934: return -2;
935: } else {
936: PetscLogEvent dftEvent = tree[iParent].dftEvent;
937: return eventPerfInfo[dftEvent].count;
938: }
939: }
940: }
942: typedef struct {
943: int id;
944: PetscLogDouble val;
945: } PetscSortItem;
947: static int compareSortItems(const void *item1_, const void *item2_)
948: {
949: PetscSortItem *item1 = (PetscSortItem *) item1_;
950: PetscSortItem *item2 = (PetscSortItem *) item2_;
951: if (item1->val > item2->val) return -1;
952: if (item1->val < item2->val) return +1;
953: return 0;
954: }
956: static PetscErrorCode PetscLogNestedTreePrint(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime)
957: {
958: int depth = tree[iStart].depth;
959: const char *name;
960: int parentCount, nChildren;
961: PetscSortItem *children;
962: PetscErrorCode ierr;
963: const int stage = MAINSTAGE;
964: PetscStageLog stageLog;
965: PetscEventRegInfo *eventRegInfo;
966: PetscEventPerfInfo *eventPerfInfo;
967: PetscEventPerfInfo myPerfInfo, otherPerfInfo, selfPerfInfo;
968: PetscLogDouble countsPerCall;
969: PetscBool wasPrinted;
970: PetscBool childWasPrinted;
971: MPI_Comm comm;
974: /* Look up the name of the event and its PerfInfo */
975: PetscLogGetStageLog(&stageLog);
976: eventRegInfo = stageLog->eventLog->eventInfo;
977: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
978: name = eventRegInfo[(PetscLogEvent)tree[iStart].nstEvent].name;
979: PetscObjectGetComm((PetscObject)viewer,&comm);
981: /* Count the number of child processes */
982: nChildren = 0;
983: {
984: int i;
985: for (i=iStart+1; i<nTimers; i++) {
986: if (tree[i].depth <= depth) break;
987: if (tree[i].depth == depth + 1) nChildren++;
988: }
989: }
991: if (nChildren>0) {
992: /* Create an array for the id-s and maxTimes of the children,
993: * leaving 2 spaces for self-time and other-time */
994: int i;
995: PetscLogDouble *times, *maxTimes;
997: PetscMalloc1(nChildren+2,&children);
998: nChildren = 0;
999: for (i=iStart+1; i<nTimers; i++) {
1000: if (tree[i].depth<=depth) break;
1001: if (tree[i].depth == depth + 1) {
1002: children[nChildren].id = i;
1003: children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1004: nChildren++;
1005: }
1006: }
1008: /* Calculate the children's maximum times, to see whether children will be ignored or printed */
1009: PetscMalloc1(nChildren,×);
1010: for (i=0; i<nChildren; i++) { times[i] = children[i].val; }
1012: PetscMalloc1(nChildren,&maxTimes);
1013: MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1014: PetscFree(times);
1016: for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1017: PetscFree(maxTimes);
1018: }
1020: if (!tree[iStart].own) {
1021: /* Set values for a timer that was not activated in this process
1022: * (but was, in other processes of this run) */
1023: PetscMemzero(&myPerfInfo,sizeof(myPerfInfo));
1025: selfPerfInfo = myPerfInfo;
1026: otherPerfInfo = myPerfInfo;
1028: parentCount = 1;
1029: countsPerCall = 0;
1030: } else {
1031: /* Set the values for a timer that was activated in this process */
1032: int i;
1033: PetscLogEvent dftEvent = tree[iStart].dftEvent;
1035: parentCount = countParents( tree, eventPerfInfo, iStart);
1036: myPerfInfo = eventPerfInfo[dftEvent];
1037: countsPerCall = (PetscLogDouble) myPerfInfo.count / (PetscLogDouble) parentCount;
1039: selfPerfInfo = myPerfInfo;
1040: otherPerfInfo.time = 0;
1041: otherPerfInfo.flops = 0;
1042: otherPerfInfo.numMessages = 0;
1043: otherPerfInfo.messageLength = 0;
1044: otherPerfInfo.numReductions = 0;
1046: for (i=0; i<nChildren; i++) {
1047: /* For all child counters: subtract the child values from self-timers */
1049: PetscLogEvent dftChild = tree[children[i].id].dftEvent;
1050: PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];
1052: selfPerfInfo.time -= childPerfInfo.time;
1053: selfPerfInfo.flops -= childPerfInfo.flops;
1054: selfPerfInfo.numMessages -= childPerfInfo.numMessages;
1055: selfPerfInfo.messageLength -= childPerfInfo.messageLength;
1056: selfPerfInfo.numReductions -= childPerfInfo.numReductions;
1058: if ((children[i].val/totalTime) < THRESHOLD) {
1059: /* Add them to 'other' if the time is ignored in the output */
1060: otherPerfInfo.time += childPerfInfo.time;
1061: otherPerfInfo.flops += childPerfInfo.flops;
1062: otherPerfInfo.numMessages += childPerfInfo.numMessages;
1063: otherPerfInfo.messageLength += childPerfInfo.messageLength;
1064: otherPerfInfo.numReductions += childPerfInfo.numReductions;
1065: }
1066: }
1067: }
1069: /* Main output for this timer */
1070: PetscLogNestedTreePrintLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);
1072: /* Now print the lines for the children */
1073: if (nChildren > 0) {
1074: /* Calculate max-times for 'self' and 'other' */
1075: int i;
1076: PetscLogDouble times[2], maxTimes[2];
1077: times[0] = selfPerfInfo.time; times[1] = otherPerfInfo.time;
1078: MPIU_Allreduce(times, maxTimes, 2, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1079: children[nChildren+0].id = -1;
1080: children[nChildren+0].val = maxTimes[0];
1081: children[nChildren+1].id = -2;
1082: children[nChildren+1].val = maxTimes[1];
1084: /* Now sort the children (including 'self' and 'other') on total time */
1085: qsort(children, nChildren+2, sizeof(PetscSortItem), compareSortItems);
1087: /* Print (or ignore) the children in ascending order of total time */
1088: PetscViewerXMLStartSection(viewer,"events", NULL);
1089: for (i=0; i<nChildren+2; i++) {
1090: if ((children[i].val/totalTime) < THRESHOLD) {
1091: /* ignored: no output */
1092: } else if (children[i].id==-1) {
1093: PetscLogNestedTreePrintLine(viewer, selfPerfInfo, 1, parentCount, depth+1, "self", totalTime, &childWasPrinted);
1094: if (childWasPrinted) {
1095: PetscViewerXMLEndSection(viewer,"event");
1096: }
1097: } else if (children[i].id==-2) {
1098: size_t len;
1099: char *otherName;
1101: PetscStrlen(name,&len);
1102: PetscMalloc1(len+16,&otherName);
1103: PetscSNPrintf(otherName,len+16,"%s: other-timed",name);
1104: PetscLogNestedTreePrintLine(viewer, otherPerfInfo, 1, 1, depth+1, otherName, totalTime, &childWasPrinted);
1105: PetscFree(otherName);
1106: if (childWasPrinted) {
1107: PetscViewerXMLEndSection(viewer,"event");
1108: }
1109: } else {
1110: /* Print the child with a recursive call to this function */
1111: PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1112: }
1113: }
1114: PetscViewerXMLEndSection(viewer,"events");
1115: PetscFree(children);
1116: }
1118: if (wasPrinted) {
1119: PetscViewerXMLEndSection(viewer, "event");
1120: }
1121: return(0);
1122: }
1124: static PetscErrorCode PetscLogNestedTreePrintTop(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, PetscLogDouble totalTime)
1125: {
1126: int i, nChildren;
1127: PetscSortItem *children;
1128: PetscErrorCode ierr;
1129: const int stage = MAINSTAGE;
1130: PetscStageLog stageLog;
1131: PetscEventPerfInfo *eventPerfInfo;
1132: MPI_Comm comm;
1135: PetscObjectGetComm((PetscObject)viewer,&comm);
1137: /* Look up the PerfInfo */
1138: PetscLogGetStageLog(&stageLog);
1139: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
1141: /* Count the number of child processes, and count total time */
1142: nChildren = 0;
1143: for (i=0; i<nTimers; i++)
1144: if (tree[i].depth==1) nChildren++;
1146: if (nChildren>0) {
1147: /* Create an array for the id-s and maxTimes of the children,
1148: * leaving 2 spaces for self-time and other-time */
1149: PetscLogDouble *times, *maxTimes;
1151: PetscMalloc1(nChildren,&children);
1152: nChildren = 0;
1153: for (i=0; i<nTimers; i++) {
1154: if (tree[i].depth == 1) {
1155: children[nChildren].id = i;
1156: children[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
1157: nChildren++;
1158: }
1159: }
1161: /* Calculate the children's maximum times, to sort them */
1162: PetscMalloc1(nChildren,×);
1163: for (i=0; i<nChildren; i++) { times[i] = children[i].val; }
1165: PetscMalloc1(nChildren,&maxTimes);
1166: MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1167: PetscFree(times);
1169: for (i=0; i<nChildren; i++) { children[i].val = maxTimes[i]; }
1170: PetscFree(maxTimes);
1172: /* Now sort the children on total time */
1173: qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1174: /* Print (or ignore) the children in ascending order of total time */
1175: PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1176: PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1177: PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, thresholdTime, "%f");
1179: for (i=0; i<nChildren; i++) {
1180: if ((children[i].val/totalTime) < THRESHOLD) {
1181: /* ignored: no output */
1182: } else {
1183: /* Print the child with a recursive call to this function */
1184: PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1185: }
1186: }
1187: PetscViewerXMLEndSection(viewer, "timertree");
1188: PetscFree(children);
1189: }
1190: return(0);
1191: }
1193: typedef struct {
1194: char *name;
1195: PetscLogDouble time;
1196: PetscLogDouble flops;
1197: PetscLogDouble numMessages;
1198: PetscLogDouble messageLength;
1199: PetscLogDouble numReductions;
1200: } PetscSelfTimer;
1202: static PetscErrorCode PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1203: {
1204: PetscErrorCode ierr;
1205: const int stage = MAINSTAGE;
1206: PetscStageLog stageLog;
1207: PetscEventRegInfo *eventRegInfo;
1208: PetscEventPerfInfo *eventPerfInfo;
1209: PetscSelfTimer *selftimes;
1210: PetscSelfTimer *totaltimes;
1211: NestedEventId *nstEvents;
1212: int i, j, maxDefaultTimer;
1213: NestedEventId nst;
1214: PetscLogEvent dft;
1215: int nstMax, nstMax_local;
1216: MPI_Comm comm;
1219: PetscObjectGetComm((PetscObject)viewer,&comm);
1220: PetscLogGetStageLog(&stageLog);
1221: eventRegInfo = stageLog->eventLog->eventInfo;
1222: eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
1224: /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1225: maxDefaultTimer =0;
1226: for (i=0; i<nNestedEvents; i++) {
1227: int nParents = nestedEvents[i].nParents;
1228: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1229: for (j=0; j<nParents; j++) maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
1230: }
1231: PetscMalloc1(maxDefaultTimer+1,&nstEvents);
1232: for (dft=0; dft<maxDefaultTimer; dft++) {nstEvents[dft] = 0;}
1233: for (i=0; i<nNestedEvents; i++) {
1234: int nParents = nestedEvents[i].nParents;
1235: NestedEventId nstEvent = nestedEvents[i].nstEvent;
1236: PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1237: for (j=0; j<nParents; j++) nstEvents[dftEvents[j]] = nstEvent;
1238: }
1240: /* Calculate largest nested event-ID */
1241: nstMax_local = 0;
1242: for (i=0; i<nNestedEvents; i++) nstMax_local = PetscMax(nestedEvents[i].nstEvent,nstMax_local);
1243: MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);
1245: /* Initialize all total-times with zero */
1246: PetscMalloc1(nstMax+1,&selftimes);
1247: PetscMalloc1(nstMax+1,&totaltimes);
1248: for (nst=0; nst<=nstMax; nst++) {
1249: totaltimes[nst].time = 0;
1250: totaltimes[nst].flops = 0;
1251: totaltimes[nst].numMessages = 0;
1252: totaltimes[nst].messageLength = 0;
1253: totaltimes[nst].numReductions = 0;
1254: totaltimes[nst].name = NULL;
1255: }
1257: /* Calculate total-times */
1258: for (i=0; i<nNestedEvents; i++) {
1259: const int nParents = nestedEvents[i].nParents;
1260: const NestedEventId nstEvent = nestedEvents[i].nstEvent;
1261: const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1262: for (j=0; j<nParents; j++) {
1263: const PetscLogEvent dftEvent = dftEvents[j];
1264: totaltimes[nstEvent].time += eventPerfInfo[dftEvent].time;
1265: totaltimes[nstEvent].flops += eventPerfInfo[dftEvent].flops;
1266: totaltimes[nstEvent].numMessages += eventPerfInfo[dftEvent].numMessages;
1267: totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1268: totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1269: }
1270: totaltimes[nstEvent].name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1271: }
1273: /* Initialize: self-times := totaltimes */
1274: for (nst=0; nst<=nstMax; nst++) { selftimes[nst] = totaltimes[nst]; }
1276: /* Subtract timed subprocesses from self-times */
1277: for (i=0; i<nNestedEvents; i++) {
1278: const int nParents = nestedEvents[i].nParents;
1279: const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1280: const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1281: for (j=0; j<nParents; j++) {
1282: if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1283: const PetscLogEvent dftEvent = dftEvents[j];
1284: const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1285: selftimes[nstParent].time -= eventPerfInfo[dftEvent].time;
1286: selftimes[nstParent].flops -= eventPerfInfo[dftEvent].flops;
1287: selftimes[nstParent].numMessages -= eventPerfInfo[dftEvent].numMessages;
1288: selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1289: selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1290: }
1291: }
1292: }
1294: PetscFree(nstEvents);
1295: PetscFree(totaltimes);
1297: /* Set outputs */
1298: *p_self = selftimes;
1299: *p_nstMax = nstMax;
1300: return(0);
1301: }
1303: static PetscErrorCode PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1304: {
1305: PetscErrorCode ierr;
1306: int i;
1307: NestedEventId nst;
1308: PetscSortItem *sortSelfTimes;
1309: PetscLogDouble *times, *maxTimes;
1310: PetscStageLog stageLog;
1311: PetscEventRegInfo *eventRegInfo;
1312: const int dum_depth = 1, dum_count=1, dum_parentcount=1;
1313: PetscBool wasPrinted;
1314: MPI_Comm comm;
1317: PetscObjectGetComm((PetscObject)viewer,&comm);
1318: PetscLogGetStageLog(&stageLog);
1319: eventRegInfo = stageLog->eventLog->eventInfo;
1321: PetscMalloc1(nstMax+1,×);
1322: PetscMalloc1(nstMax+1,&maxTimes);
1323: for (nst=0; nst<=nstMax; nst++) { times[nst] = selftimes[nst].time;}
1324: MPIU_Allreduce(times, maxTimes, nstMax+1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1325: PetscFree(times);
1327: PetscMalloc1(nstMax+1,&sortSelfTimes);
1329: /* Sort the self-timers on basis of the largest time needed */
1330: for (nst=0; nst<=nstMax; nst++) {
1331: sortSelfTimes[nst].id = nst;
1332: sortSelfTimes[nst].val = maxTimes[nst];
1333: }
1334: PetscFree(maxTimes);
1335: qsort(sortSelfTimes, nstMax+1, sizeof(PetscSortItem), compareSortItems);
1337: PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1338: PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1340: for (i=0; i<=nstMax; i++) {
1341: if ((sortSelfTimes[i].val/totalTime) >= THRESHOLD) {
1342: NestedEventId nstEvent = sortSelfTimes[i].id;
1343: const char *name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1344: PetscEventPerfInfo selfPerfInfo;
1346: selfPerfInfo.time = selftimes[nstEvent].time ;
1347: selfPerfInfo.flops = selftimes[nstEvent].flops;
1348: selfPerfInfo.numMessages = selftimes[nstEvent].numMessages;
1349: selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1350: selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;
1352: PetscLogNestedTreePrintLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1353: if (wasPrinted){
1354: PetscViewerXMLEndSection(viewer, "event");
1355: }
1356: }
1357: }
1358: PetscViewerXMLEndSection(viewer, "selftimertable");
1359: PetscFree(sortSelfTimes);
1360: return(0);
1361: }
1363: PetscErrorCode PetscLogView_Nested(PetscViewer viewer)
1364: {
1365: PetscErrorCode ierr;
1366: PetscLogDouble locTotalTime, globTotalTime;
1367: PetscNestedEventTree *tree = NULL;
1368: PetscSelfTimer *selftimers = NULL;
1369: int nTimers = 0, nstMax = 0;
1370: MPI_Comm comm;
1373: PetscObjectGetComm((PetscObject)viewer,&comm);
1374: PetscViewerInitASCII_XML(viewer);
1375: PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1376: PetscViewerXMLStartSection(viewer, "petscroot", NULL);
1378: /* Get the total elapsed time, local and global maximum */
1379: PetscTime(&locTotalTime); locTotalTime -= petsc_BaseTime;
1380: MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1382: /* Print global information about this run */
1383: PetscPrintExeSpecs(viewer);
1384: PetscPrintGlobalPerformance(viewer, locTotalTime);
1385: /* Collect nested timer tree info from all processes */
1386: PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1387: PetscLogNestedTreePrintTop(viewer, tree, nTimers, globTotalTime);
1388: PetscLogNestedTreeDestroy(tree, nTimers);
1390: /* Calculate self-time for all (not-nested) events */
1391: PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1392: PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1393: PetscFree(selftimers);
1395: PetscViewerXMLEndSection(viewer, "petscroot");
1396: PetscViewerFinalASCII_XML(viewer);
1397: return(0);
1398: }
1400: /*
1401: * Populate an array containing the ancestors (callers) of an event.
1402: */
1403: static PetscErrorCode GetEventAncestors(const PetscNestedEventTree *tree,int nEvents,const PetscNestedEventTree event,PetscNestedEventTree **ancestors,int *nAncestors)
1404: {
1405: PetscErrorCode ierr;
1406: int pos=0;
1407: PetscNestedEventTree currentEvent=event;
1410: // Find the number of ancestors.
1411: *nAncestors = 0;
1412: while (currentEvent.dftParent != -1) {
1413: for (int i=0; i<nEvents; i++) {
1414: if (tree[i].dftEvent == currentEvent.dftParent) {
1415: currentEvent = tree[i];
1416: (*nAncestors)++;
1417: break;
1418: }
1419: }
1420: }
1421: // Now populate the array.
1422: PetscMalloc1(*nAncestors, ancestors);
1423: currentEvent = event;
1424: while (currentEvent.dftParent != -1) {
1425: for (int i=0; i<nEvents; i++) {
1426: if (tree[i].dftEvent == currentEvent.dftParent) {
1427: currentEvent = tree[i];
1428: (*ancestors)[pos] = currentEvent;
1429: pos++;
1430: break;
1431: }
1432: }
1433: }
1434: return(0);
1435: }
1437: /*
1438: * Get the name of an event.
1439: */
1440: static PetscErrorCode GetEventName(const PetscNestedEventTree event,char **name)
1441: {
1443: PetscStageLog stageLog;
1446: PetscLogGetStageLog(&stageLog);
1447: *name = stageLog->eventLog->eventInfo[event.nstEvent].name;
1448: return(0);
1449: }
1451: /*
1452: * Return the duration of an event in seconds.
1453: */
1454: static PetscErrorCode GetEventTime(const PetscNestedEventTree event,PetscLogDouble *eventTime)
1455: {
1457: PetscStageLog stageLog;
1460: PetscLogGetStageLog(&stageLog);
1461: *eventTime = stageLog->stageInfo[MAINSTAGE].eventLog->eventInfo[event.dftEvent].time;
1462: return(0);
1463: }
1465: /*
1466: * Print nested logging information to a file suitable for reading into a Flame Graph.
1467: *
1468: * The format consists of a semicolon-separated list of events and the event duration in microseconds (which must be an integer).
1469: * An example output would look like:
1470: * MatAssemblyBegin 1
1471: * MatAssemblyEnd 10
1472: * MatView 302
1473: * KSPSetUp 98
1474: * KSPSetUp;VecSet 5
1475: * KSPSolve 150
1476: *
1477: * This option may be requested from the command line by passing in the flag `-log_view :<somefile>.txt:ascii_flamegraph`.
1478: */
1479: PetscErrorCode PetscLogView_Flamegraph(PetscViewer viewer)
1480: {
1481: int nEvents,nAncestors;
1482: char *eventName=NULL;
1483: PetscErrorCode ierr;
1484: PetscNestedEventTree *tree=NULL,*ancestors=NULL,event;
1485: PetscLogDouble eventTime=0,locTotalTime,globTotalTime;
1486: MPI_Comm comm;
1489: // Determine the overall time for the program.
1490: PetscTime(&locTotalTime);
1491: locTotalTime -= petsc_BaseTime;
1492: PetscObjectGetComm((PetscObject)viewer,&comm);
1493: MPIU_Allreduce(&locTotalTime,&globTotalTime,1,MPIU_PETSCLOGDOUBLE,MPI_MAX,comm);
1495: PetscLogNestedTreeCreate(viewer, &tree, &nEvents);
1497: // Now write the events to the file.
1498: for (int i=0; i<nEvents; i++) {
1499: event = tree[i];
1500: GetEventTime(event,&eventTime);
1501: if (eventTime/globTotalTime < THRESHOLD) continue;
1503: // Print out the ancestor events in reverse order, starting with the oldest.
1504: GetEventAncestors(tree,nEvents,event,&ancestors,&nAncestors);
1505: for (int j=nAncestors-1; j>=0; j--) {
1506: GetEventName(ancestors[j],&eventName);
1507: PetscViewerASCIIPrintf(viewer,"%s;",eventName);
1508: }
1509: PetscFree(ancestors);
1510: // Now print the actual event and duration.
1511: // The time is written as an integer (in microseconds) so the file can be understood by tools such as Speedscope.
1512: GetEventName(event,&eventName);
1513: PetscViewerASCIIPrintf(viewer,"%s %" PetscInt64_FMT "\n",eventName,(PetscInt64)(eventTime*1e6));
1514: }
1516: PetscLogNestedTreeDestroy(tree, nEvents);
1517: return(0);
1518: }
1520: PETSC_EXTERN PetscErrorCode PetscASend(int count, int datatype)
1521: {
1522: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1524: #endif
1527: petsc_send_ct++;
1528: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1529: PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_send_len);
1530: #endif
1531: return(0);
1532: }
1534: PETSC_EXTERN PetscErrorCode PetscARecv(int count, int datatype)
1535: {
1536: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1538: #endif
1541: petsc_recv_ct++;
1542: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1543: PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_recv_len);
1544: #endif
1545: return(0);
1546: }
1548: PETSC_EXTERN PetscErrorCode PetscAReduce()
1549: {
1551: petsc_allreduce_ct++;
1552: return(0);
1553: }
1555: #endif