/**************************************************************************** ** TAU Portable Profiling Package ** ** http://www.cs.uoregon.edu/research/tau ** ***************************************************************************** ** Copyright 2010 ** ** Department of Computer and Information Science, University of Oregon ** ** Advanced Computing Laboratory, Los Alamos National Laboratory ** ****************************************************************************/ /**************************************************************************** ** File : TauCollate.cpp ** ** Description : TAU Profiling Package ** ** Contact : tau-bugs@cs.uoregon.edu ** ** Documentation : See http://www.cs.uoregon.edu/research/tau ** ** ** ** Description : Profile merging code ** ** ** ****************************************************************************/ #ifdef TAU_MPI #ifdef TAU_EXP_UNIFY #include #include #include #include #include #include #include #include #include #include #include #include #include // #include //#define DEBUG #ifdef DEBUG void TAU_MPI_DEBUG0(const char *format, ...) { int rank; PMPI_Comm_rank(MPI_COMM_WORLD, &rank); if (rank != 0) { return; } va_list args; va_start(args, format); vfprintf(stderr, format, args); va_end(args); } #else void TAU_MPI_DEBUG0(const char *format, ...) { return; } #endif typedef enum { step_min, step_max, step_sum, step_sumsqr } collate_step; const int num_collate_steps = 4; char *collate_step_name[4] = {"min", "max", "sum", "sumsqr"}; MPI_Op collate_op[4] = {MPI_MIN, MPI_MAX, MPI_SUM, MPI_SUM}; /********************************************************************* * getStepValue returns the incremental thread-combined value, similar * to how MPI_Reduce will combine values across ranks, we need to do this for threads. 
********************************************************************/ static double getStepValue(collate_step step, double prevValue, double nextValue) { if (step == step_sumsqr) { nextValue = nextValue * nextValue; } if (prevValue == -1) { return nextValue; } if (nextValue == -1) { return prevValue; } if (step == step_min) { if (nextValue < prevValue) { return nextValue; } } else if (step == step_max) { if (nextValue > prevValue) { return nextValue; } } else if (step == step_sum) { prevValue += nextValue; } else if (step == step_sumsqr) { prevValue += nextValue; } return prevValue; } /********************************************************************* * getStepValue returns the incremental thread-combined value, similar * to how MPI_Reduce will combine values across ranks, we need to do this for threads. ********************************************************************/ static int getStepValue(collate_step step, int prevValue, int nextValue) { if (step == step_sumsqr) { nextValue = nextValue * nextValue; } if (prevValue == -1) { return nextValue; } if (nextValue == -1) { return prevValue; } if (step == step_min) { if (nextValue < prevValue) { return nextValue; } } else if (step == step_max) { if (nextValue > prevValue) { return nextValue; } } else if (step == step_sum) { prevValue += nextValue; } else if (step == step_sumsqr) { prevValue += nextValue; } return prevValue; } /********************************************************************* * An MPI_Reduce operator similar to MPI_MIN, but it allows for -1 values * to represent "non-existent" ********************************************************************/ static void stat_min (void *i, void *o, int *len, MPI_Datatype *type) { if (*type == MPI_INT) { int *in = (int *) i; int *inout = (int *) o; for (int i=0; i<*len; i++) { if (inout[i] == -1) { inout[i] = in[i]; } else if (in[i] != -1) { if (in[i] < inout[i]) { inout[i] = in[i]; } } } } else { double *in = (double *) i; double *inout = (double *) o; for 
/* NOTE(review): this file is collapsed onto a few very long lines; the tokens below continue the double branch of stat_min() begun on the previous line. */ (int i=0; i<*len; i++) { if (inout[i] == -1) { inout[i] = in[i]; } else if (in[i] != -1) { if (in[i] < inout[i]) { inout[i] = in[i]; } } } } } /********************************************************************* * An MPI_Reduce operator similar to MPI_SUM, but it allows for -1 values * to represent "non-existent" ********************************************************************/ static void stat_sum (void *i, void *o, int *len, MPI_Datatype *type) { if (*type == MPI_INT) { int *in = (int *) i; int *inout = (int *) o; for (int i=0; i<*len; i++) { if (inout[i] == -1) { inout[i] = in[i]; } else if (in[i] != -1) { inout[i] += in[i]; } } } else { double *in = (double *) i; double *inout = (double *) o; for (int i=0; i<*len; i++) { if (inout[i] == -1) { inout[i] = in[i]; } else if (in[i] != -1) { inout[i] += in[i]; } } } } /* NOTE(review): a span of source was lost here during text mangling (apparently everything between the "m" after "for (int m=0; " and the ">= numBins" further on was stripped, as if it were an HTML tag): the tail of Tau_collate_allocateBuffers() and most of Tau_collate_incrementHistogram() are missing. Recover the originals from the TAU source tree before editing this region. */ static void Tau_collate_allocateBuffers(double ***excl, double ***incl, int **numCalls, int **numSubr, int numItems) { *excl = (double **) TAU_UTIL_MALLOC(sizeof(double *) * Tau_Global_numCounters); *incl = (double **) TAU_UTIL_MALLOC(sizeof(double *) * Tau_Global_numCounters); for (int m=0; m= numBins) { TAU_ABORT("TAU: Error computing histogram, non-existent bin=%d\n", mybin); } histogram[mybin]++; } /********************************************************************* * Write a profile with data from all nodes/threads ********************************************************************/ /* NOTE(review): Tau_collate_writeProfile() begins here; its body continues (garbled and truncated) on the following lines of this chunk. */ extern "C" int Tau_collate_writeProfile() { static int invocationIndex = -1; invocationIndex++; int rank, size; PMPI_Comm_rank(MPI_COMM_WORLD, &rank); PMPI_Comm_size(MPI_COMM_WORLD, &size); // timing info x_uint64 start, end; if (rank == 0) { TAU_VERBOSE("TAU: Collating...\n"); start = TauMetrics_getTimeOfDay(); } // create and assign specialized min and sum operators MPI_Op min_op, sum_op; PMPI_Op_create (stat_min, 1, &min_op); PMPI_Op_create (stat_sum, 1, &sum_op); collate_op[step_min] = min_op; collate_op[step_sum] = sum_op; collate_op[step_sumsqr] = sum_op; 
// Dump out all thread data with present values int numThreads = RtsLayer::getNumThreads(); for (int tid = 0; tidglobalNumItems); x_uint64 start_aggregate, end_aggregate; if (rank == 0) { start_aggregate = TauMetrics_getTimeOfDay(); } int globalNumThreads; PMPI_Reduce(&numThreads, &globalNumThreads, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); // create a reverse mapping, not strictly necessary, but it makes things easier int *globalmap = (int*)TAU_UTIL_MALLOC(functionUnifier->globalNumItems * sizeof(int)); for (int i=0; iglobalNumItems; i++) { // initialize all to -1 globalmap[i] = -1; // -1 indicates that the event did not occur for this rank } for (int i=0; ilocalNumItems; i++) { globalmap[functionUnifier->mapping[i]] = i; // set reserse mapping } // the global number of events int numItems = functionUnifier->globalNumItems; // allocate memory for values to compute/send double **excl, **incl; int *numCalls, *numSubr; Tau_collate_allocateBuffers(&excl, &incl, &numCalls, &numSubr, numItems); double ***gExcl, ***gIncl; int **gNumCalls, **gNumSubr; gExcl = (double ***) TAU_UTIL_MALLOC(sizeof(double **) * num_collate_steps); gIncl = (double ***) TAU_UTIL_MALLOC(sizeof(double **) * num_collate_steps); gNumCalls = (int **) TAU_UTIL_MALLOC(sizeof(int *) * num_collate_steps); gNumSubr = (int **) TAU_UTIL_MALLOC(sizeof(int *) * num_collate_steps); // we generate statistics in 4 steps for (int s=0; s < num_collate_steps; s++) { Tau_collate_allocateBuffers(&(gExcl[s]), &(gIncl[s]), &(gNumCalls[s]), &(gNumSubr[s]), numItems); // reset intermediate data for (int m=0; msortMap[globalmap[i]]; FunctionInfo *fi = TheFunctionDB()[local_index]; for (int m=0; mgetDumpInclusiveValues(tid)[m]); excl[m][i] = getStepValue((collate_step)s, excl[m][i], fi->getDumpExclusiveValues(tid)[m]); } numCalls[i] = getStepValue((collate_step)s, numCalls[i], fi->GetCalls(tid)); numSubr[i] = getStepValue((collate_step)s, numSubr[i], fi->GetSubrs(tid)); } } } // reduce data to rank 0 for (int m=0; 
mglobalStrings[i]); // } // } } if (rank == 0) { char profileName[512], profileNameTmp[512]; sprintf (profileNameTmp, ".temp.mean.%d.0.0", invocationIndex); sprintf (profileName, "mean.%d.0.0", invocationIndex); FILE *profile = fopen(profileNameTmp, "w"); fprintf (profile, "%d templated_functions_MULTI_TIME\n", numItems); fprintf (profile, "# Name Calls Subrs Excl Incl ProfileCalls % TAU Internal Profile Attributecollate_dump_dagstuhl \n"); for (int i=0; iglobalStrings[i], numCalls, numSubr, exclusive, inclusive); } fprintf (profile, "0 aggregates\n"); fclose (profile); rename (profileNameTmp, profileName); } if (rank == 0) { end_aggregate = TauMetrics_getTimeOfDay(); TAU_VERBOSE("TAU: Collate: Aggregation Complete, duration = %.4G seconds\n", ((double)((double)end_aggregate-start_aggregate))/1000000.0f); } // fflush(stderr); // PMPI_Barrier(MPI_COMM_WORLD); // now compute histograms x_uint64 start_hist, end_hist; if (rank == 0) { start_hist = TauMetrics_getTimeOfDay(); } int numBins = 20; FILE *histoFile; char histFileNameTmp[512]; char histFileName[512]; if (rank == 0) { sprintf (histFileName, "tau.histograms.%d", invocationIndex); sprintf (histFileNameTmp, ".temp.tau.histograms.%d", invocationIndex); histoFile = fopen(histFileNameTmp, "w"); fprintf (histoFile, "%d\n", numItems); fprintf (histoFile, "%d\n", (Tau_Global_numCounters*2)+2); fprintf (histoFile, "%d\n", numBins); for (int m=0; msortMap[globalmap[e]]; FunctionInfo *fi = TheFunctionDB()[local_index]; double min, max; for (int tid = 0; tidgetDumpExclusiveValues(tid)[m], numBins); Tau_collate_incrementHistogram(&(histogram[(m*2+1)*numBins]), gIncl[step_min][m][e], gIncl[step_max][m][e], fi->getDumpInclusiveValues(tid)[m], numBins); // // debugging the histogram // if (e == 0) { // int *ptr = &(histogram[(m*2)*numBins]); // std::stringstream stream; // stream << "rank[" << rank << "] = "; // for (int j=0;jGetCalls(tid), numBins); 
Tau_collate_incrementHistogram(&(histogram[(Tau_Global_numCounters*2+1)*numBins]), gNumSubr[step_min][e], gNumSubr[step_max][e], fi->GetSubrs(tid), numBins); } } PMPI_Reduce (histogram, outHistogram, numBins * numHistoGrams, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0) { fprintf (histoFile, "%s\n", functionUnifier->globalStrings[e]); for (int m=0; m