/home/users/khuck/src/hpx-lsu/apex/src/apex/profiler_listener.cpp


//  Copyright (c) 2014 University of Oregon
//

#ifdef APEX_HAVE_HPX
#include <hpx/config.hpp>
#ifdef APEX_HAVE_OTF2
#define APEX_TRACE_APEX
#endif // APEX_HAVE_OTF2
#endif // APEX_HAVE_HPX

#include "profiler_listener.hpp"
#include "profiler.hpp"
#include "thread_instance.hpp"
#include <iostream>
#include <iomanip>
#include <fstream>
#include <math.h>
#include "apex_options.hpp"
#include "profiler.hpp"
#include "profile.hpp"
#include "apex.hpp"

#include <atomic>
#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || (defined(__APPLE__) && defined(__MACH__)))
#include <unistd.h>
#include <sched.h>
#endif
#include <cstdio>
#include <vector>
#include <string>
#include <unordered_set>
#include <algorithm>
#include <iterator>

#include <functional>
#include <thread>
#include <future>

#if defined(APEX_THROTTLE)
#include "apex_cxx_shared_lock.hpp"
apex::shared_mutex_type throttled_event_set_mutex;
#define APEX_THROTTLE_CALLS 1000
#ifdef APEX_USE_CLOCK_TIMESTAMP
#define APEX_THROTTLE_PERCALL 0.00001 // 10 microseconds.
#else
#define APEX_THROTTLE_PERCALL 50000 // 50k cycles.
#endif
#endif

#if APEX_HAVE_PAPI
#include "papi.h"
#include <mutex>
std::mutex event_set_mutex;
#endif

#ifdef APEX_HAVE_HPX
#include <boost/assign.hpp>
#include <boost/cstdint.hpp>
#include <hpx/include/performance_counters.hpp>
#include <hpx/include/actions.hpp>
#include <hpx/include/util.hpp>
#include <hpx/lcos/local/composable_guard.hpp>
static void apex_schedule_process_profiles(void); // not in apex namespace
const int num_non_worker_threads_registered = 0;
#endif

#define APEX_MAIN "APEX MAIN"

#ifdef APEX_HAVE_TAU
#define PROFILING_ON
#define TAU_DOT_H_LESS_HEADERS
#include <TAU.h>
#endif

#include "utils.hpp"

#include <cstdlib>
#include <ctime>

using namespace std;
using namespace apex;

APEX_NATIVE_TLS unsigned int my_tid = 0; // the current thread's TID in APEX

namespace apex {

#ifdef APEX_MULTIPLE_QUEUES
  /* this is a thread-local pointer to a concurrent queue for each worker thread. */
  __thread profiler_queue_t * thequeue;
#endif

  /* THis is a special profiler, indicating that the timer requested is
     throttled, and shouldn't be processed. */
  profiler* profiler::disabled_profiler = new profiler();

#ifdef APEX_HAVE_HPX
  /* Flag indicating whether a consumer task is currently running */
  std::atomic_flag consumer_task_running = ATOMIC_FLAG_INIT;
  bool hpx_shutdown = false;
#endif

  double profiler_listener::get_non_idle_time() {
    double non_idle_time = 0.0;
    /* Iterate over all timers and accumulate the time spent in them */
    unordered_map<task_identifier, profile*>::const_iterator it2;
    std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);
    for(it2 = task_map.begin(); it2 != task_map.end(); it2++) {
      profile * p = it2->second;
#if defined(APEX_THROTTLE)
      task_identifier id = it2->first;
      unordered_set<task_identifier>::const_iterator it4;
	  {
      	read_lock_type l(throttled_event_set_mutex);
      	it4 = throttled_tasks.find(id);
	  }
      if (it4!= throttled_tasks.end()) {
        continue;
      }
#endif
      if (p->get_type() == APEX_TIMER) {
        non_idle_time += p->get_accumulated();
      }
    }
    return non_idle_time*profiler::get_cpu_mhz();
  }

  profile * profiler_listener::get_idle_time() {
    double non_idle_time = get_non_idle_time();
    /* Subtract the accumulated time from the main time span. */
	int num_worker_threads = thread_instance::get_num_threads();
#ifdef APEX_HAVE_HPX
    num_worker_threads = num_worker_threads - num_non_worker_threads_registered;
#endif
    std::chrono::duration<double> time_span =
        std::chrono::duration_cast<std::chrono::duration<double>>
           (MYCLOCK::now() - main_timer->start);
    double total_main = time_span.count() *
                fmin(hardware_concurrency(), num_worker_threads);
    double elapsed = total_main - non_idle_time;
    elapsed = elapsed > 0.0 ? elapsed : 0.0;
    profile * theprofile = new profile(elapsed*profiler::get_cpu_mhz(), 0, NULL, false);
    return theprofile;
  }

  profile * profiler_listener::get_idle_rate() {
    double non_idle_time = get_non_idle_time();
    /* Subtract the accumulated time from the main time span. */
	int num_worker_threads = thread_instance::get_num_threads();
#ifdef APEX_HAVE_HPX
    num_worker_threads = num_worker_threads - num_non_worker_threads_registered;
#endif
    std::chrono::duration<double> time_span =
        std::chrono::duration_cast<std::chrono::duration<double>>
           (MYCLOCK::now() - main_timer->start);
    double total_main = time_span.count() *
                fmin(hardware_concurrency(), num_worker_threads);
    double elapsed = total_main - non_idle_time;
    double rate = elapsed > 0.0 ? ((elapsed/total_main)) : 0.0;
    profile * theprofile = new profile(rate, 0, NULL, false);
    return theprofile;
  }

  /* Return the requested profile object to the user.
   * Return nullptr if doesn't exist. */
  profile * profiler_listener::get_profile(task_identifier &id) {
    if (id.name == string(APEX_IDLE_RATE)) {
        return get_idle_rate();
    } else if (id.name == string(APEX_IDLE_TIME)) {
        return get_idle_time();
    } else if (id.name == string(APEX_NON_IDLE_TIME)) {
        profile * theprofile = new profile(get_non_idle_time(), 0, NULL, false);
        return theprofile;
    }
    std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);
    unordered_map<task_identifier, profile*>::const_iterator it = task_map.find(id);
    if (it != task_map.end()) {
      return (*it).second;
    }
    return nullptr;
  }

  void profiler_listener::reset_all(void) {
    std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);
    for(auto &it : task_map) {
        it.second->reset();
    }
  }

  /* After the consumer thread pulls a profiler off of the queue,
   * process it by updating its profile object in the map of profiles. */
  // TODO The name-based timer and address-based timer paths through
  // the code involve a lot of duplication -- this should be refactored
  // to remove the duplication so it's easier to maintain.
  unsigned int profiler_listener::process_profile(std::shared_ptr<profiler> &p, unsigned int tid)
  {
    if(p == nullptr) return 0;
    profile * theprofile;
    if(p->is_reset == reset_type::ALL) {
        reset_all();
        return 0;
    }
    double values[8] = {0};
    double tmp_num_counters = 0;
#if APEX_HAVE_PAPI
    tmp_num_counters = num_papi_counters;
    for (int i = 0 ; i < num_papi_counters ; i++) {
        if (p->papi_stop_values[i] > p->papi_start_values[i]) {
            values[i] = p->papi_stop_values[i] - p->papi_start_values[i];
        } else {
            values[i] = 0.0;
        }
    }
#endif
    std::unique_lock<std::mutex> task_map_lock(_task_map_mutex, std::defer_lock);
    // There is only one consumer thread except during shutdown, so we only need
    // to lock during shutdown.
    bool did_lock = false;
    if(_done) {
        task_map_lock.lock();
        did_lock = true;
    }
    unordered_map<task_identifier, profile*>::const_iterator it = task_map.find(*(p->task_id));
    if (it != task_map.end()) {
      	// A profile for this ID already exists.
        theprofile = (*it).second;
        if(_done && did_lock) {
            task_map_lock.unlock();
        }
        if(p->is_reset == reset_type::CURRENT) {
            theprofile->reset();
        } else {
            theprofile->increment(p->elapsed(), tmp_num_counters, values, p->is_resume);
        }
#if defined(APEX_THROTTLE)
        // Is this a lightweight task? If so, we shouldn't measure it any more,
        // in order to reduce overhead.
        if (theprofile->get_calls() > APEX_THROTTLE_CALLS &&
            theprofile->get_mean() < APEX_THROTTLE_PERCALL) {
            unordered_set<task_identifier>::const_iterator it2;
	        {
      	      read_lock_type l(throttled_event_set_mutex);
              it2 = throttled_tasks.find(*(p->task_id));
			}
            if (it2 == throttled_tasks.end()) {
                // lock the set for insert
	            {
      	            write_lock_type l(throttled_event_set_mutex);
                    // was it inserted when we were waiting?
                    it2 = throttled_tasks.find(*(p->task_id));
                    // no? OK - insert it.
                    if (it2 == throttled_tasks.end()) {
                        throttled_tasks.insert(*(p->task_id));
                    }
                }
                if (apex_options::use_screen_output()) {
                    cout << "APEX: disabling lightweight timer "
                         << p->task_id->get_name()
                          << endl;
                    fflush(stdout);
                }
            }
        }
#endif
      } else {
        // Create a new profile for this name.
        theprofile = new profile(p->is_reset == reset_type::CURRENT ? 0.0 : p->elapsed(), tmp_num_counters, values, p->is_resume, p->is_counter ? APEX_COUNTER : APEX_TIMER);
        task_map[*(p->task_id)] = theprofile;
        if(_done && did_lock) {
            task_map_lock.unlock();
        }
#ifdef APEX_HAVE_HPX
#ifdef APEX_REGISTER_HPX3_COUNTERS
        if(!_done) {
            if(get_hpx_runtime_ptr() != nullptr && p->task_id->has_name()) {
                std::string timer_name(p->task_id->get_name());
                //Don't register timers containing "/"
                if(timer_name.find("/") == std::string::npos) {
                    hpx::performance_counters::install_counter_type(
                    std::string("/apex/") + timer_name,
                    [p](bool r)->boost::int64_t{
                        boost::int64_t value(p->elapsed());
                        return value;
                    },
                    std::string("APEX counter ") + timer_name,
                    ""
                    );
                }
            } else {
                std::cerr << "HPX runtime not initialized yet." << std::endl;
            }
        }
#endif
#endif
      }
#if !defined(_MSC_VER)
      /* write the sample to the file */
      if (apex_options::task_scatterplot()) {
        if (!p->is_counter) {
            static int thresh = RAND_MAX/100;
            if (std::rand() < thresh) {
                std::unique_lock<std::mutex> task_map_lock(_mtx);
                task_scatterplot_samples << p->normalized_timestamp() << " "
                            << p->elapsed()*profiler::get_cpu_mhz()*1000000 << " "
                            << "'" << p->task_id->get_name() << "'" << endl;
                int loc0 = task_scatterplot_samples.tellp();
                if (loc0 > 32768) {
                    // lock access to the file
        			// write using low-level file locking!
        			struct flock fl;
        			fl.l_type   = F_WRLCK;  /* F_RDLCK, F_WRLCK, F_UNLCK    */
        			fl.l_whence = SEEK_SET; /* SEEK_SET, SEEK_CUR, SEEK_END */
        			fl.l_start  = 0;        /* Offset from l_whence         */
        			fl.l_len    = 0;        /* length, 0 = to EOF           */
        			fl.l_pid    = getpid();      /* our PID                      */
        			fcntl(task_scatterplot_sample_file, F_SETLKW, &fl);  /* F_GETLK, F_SETLK, F_SETLKW */
                    // flush the string stream to the file
                    //lseek(task_scatterplot_sample_file, 0, SEEK_END);
        			ssize_t bytes_written = write(task_scatterplot_sample_file,
						  task_scatterplot_samples.str().c_str(), loc0);
                    if (bytes_written < 0) {
                        int errsv = errno;
                        perror("Error writing to scatterplot!");
                        fprintf(stderr, "Error writing scatterplot:\n%s\n",
                                strerror(errsv));
                    }
        			fl.l_type   = F_UNLCK;   /* tell it to unlock the region */
        			fcntl(task_scatterplot_sample_file, F_SETLK, &fl); /* set the region to unlocked */
                    // reset the stringstream
                    task_scatterplot_samples.str("");
                }
            }
        }
      }
#endif
    return 1;
  }

  inline unsigned int profiler_listener::process_dependency(task_dependency* td)
  {
      unordered_map<task_identifier, unordered_map<task_identifier, int>* >::const_iterator it = task_dependencies.find(td->parent);
      unordered_map<task_identifier, int> * depend;
      // if this is a new dependency for this parent?
      if (it == task_dependencies.end()) {
          depend = new unordered_map<task_identifier, int>();
          (*depend)[td->child] = 1;
          task_dependencies[td->parent] = depend;
      // otherwise, see if this parent has seen this child
      } else {
          depend = it->second;
          unordered_map<task_identifier, int>::const_iterator it2 = depend->find(td->child);
          // first time for this child
          if (it2 == depend->end()) {
              (*depend)[td->child] = 1;
          // not the first time for this child
          } else {
              int tmp = it2->second;
              (*depend)[td->child] = tmp + 1;
          }
      }
      delete(td);
      return 1;
  }

  /* Cleaning up memory. Not really necessary, because it only gets
   * called at shutdown. But a good idea to do regardless. */
  void profiler_listener::delete_profiles(void) {
    // iterate over the map and free the objects in the map
    unordered_map<task_identifier, profile*>::const_iterator it;
    std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);
    for(it = task_map.begin(); it != task_map.end(); it++) {
      delete it->second;
    }
    // clear the map.
    task_map.clear();

  }

#define PAD_WITH_SPACES "%8s"
#define FORMAT_PERCENT "%8.3f"
#define FORMAT_SCIENTIFIC "%1.2e"

  template<typename ... Args>
  string string_format( const std::string& format, Args ... args )
  {
      size_t size = snprintf( nullptr, 0, format.c_str(), args ... ) + 1; // Extra space for '\0'
      unique_ptr<char[]> buf( new char[ size ] );
      snprintf( buf.get(), size, format.c_str(), args ... );
      return string( buf.get(), buf.get() + size - 1 ); // We don't want the '\0' inside
  }

  void profiler_listener::write_one_timer(task_identifier &task_id,
          profile * p, stringstream &screen_output,
          stringstream &csv_output, double &total_accumulated,
          double &total_main) {
      string action_name = task_id.get_name();
      string shorter(action_name);
      // to keep formatting pretty, trim any long timer names
      if (shorter.size() > 30) {
        shorter.resize(27);
        shorter.resize(30, '.');
      }
      //screen_output << "\"" << shorter << "\", " ;
      screen_output << string_format("%30s", shorter.c_str()) << " : ";
#if defined(APEX_THROTTLE)
      // if this profile was throttled, don't output the measurements.
      // they are limited and bogus, anyway.
      unordered_set<task_identifier>::const_iterator it4;
	  {
      	read_lock_type l(throttled_event_set_mutex);
        it4 = throttled_tasks.find(task_id);
  	  }
      if (it4!= throttled_tasks.end()) {
        screen_output << "DISABLED (high frequency, short duration)" << endl;
        return;
      }
#endif
      if(p->get_calls() < 1) {
        p->get_profile()->calls = 1;
      }
      if (p->get_calls() < 999999) {
          screen_output << string_format(PAD_WITH_SPACES, to_string((int)p->get_calls()).c_str()) << "   " ;
      } else {
          screen_output << string_format(FORMAT_SCIENTIFIC, p->get_calls()) << "   " ;
      }
      if (p->get_type() == APEX_TIMER) {
        csv_output << "\"" << action_name << "\",";
        csv_output << llround(p->get_calls()) << ",";
        // convert MHz to Hz
        csv_output << std::llround(p->get_accumulated()) << ",";
        // convert MHz to microseconds
        csv_output << std::llround(p->get_accumulated()*profiler::get_cpu_mhz()*1000000);
        screen_output << " --n/a--   " ;
        screen_output << string_format(FORMAT_SCIENTIFIC, (p->get_mean()*profiler::get_cpu_mhz())) << "   " ;
        screen_output << " --n/a--   " ;
        screen_output << string_format(FORMAT_SCIENTIFIC, (p->get_accumulated()*profiler::get_cpu_mhz())) << "   " ;
        screen_output << " --n/a--   " ;
        screen_output << string_format(FORMAT_PERCENT, (((p->get_accumulated()*profiler::get_cpu_mhz())/total_main)*100));
#if APEX_HAVE_PAPI
        for (int i = 0 ; i < num_papi_counters ; i++) {
            screen_output  << "   " << string_format(FORMAT_SCIENTIFIC, (p->get_papi_metrics()[i]));
            csv_output << "," << std::llround(p->get_papi_metrics()[i]);
        }
#endif
        screen_output << endl;
        total_accumulated += p->get_accumulated();
        csv_output << endl;
      } else {
        if (action_name.find('%') == string::npos) {
          screen_output << string_format(FORMAT_SCIENTIFIC, p->get_minimum()) << "   " ;
          screen_output << string_format(FORMAT_SCIENTIFIC, p->get_mean()) << "   " ;
          screen_output << string_format(FORMAT_SCIENTIFIC, p->get_maximum()) << "   " ;
          screen_output << string_format(FORMAT_SCIENTIFIC, p->get_accumulated()) << "   " ;
          screen_output << string_format(FORMAT_SCIENTIFIC, p->get_stddev()) << "   " ;
        } else {
          screen_output << string_format(FORMAT_PERCENT, p->get_minimum()) << "   " ;
          screen_output << string_format(FORMAT_PERCENT, p->get_mean()) << "   " ;
          screen_output << string_format(FORMAT_PERCENT, p->get_maximum()) << "   " ;
          screen_output << string_format(FORMAT_PERCENT, p->get_accumulated()) << "   " ;
          screen_output << string_format(FORMAT_PERCENT, p->get_stddev()) << "   " ;
        }
        screen_output << " --n/a-- "  << endl;
      }
  }

  /* At program termination, write the measurements to the screen, or to CSV file, or both. */
  void profiler_listener::finalize_profiles(void) {
    // our TOTAL available time is the elapsed * the number of threads, or cores
	int num_worker_threads = thread_instance::get_num_threads();
    double wall_clock_main = main_timer->elapsed() * profiler::get_cpu_mhz();
#ifdef APEX_HAVE_HPX
    num_worker_threads = num_worker_threads - num_non_worker_threads_registered;
#endif
    double total_main = wall_clock_main *
        fmin(hardware_concurrency(), num_worker_threads);
    // create a stringstream to hold all the screen output - we may not
    // want to write it out
    stringstream screen_output;
    // create a stringstream to hold all the CSV output - we may not
    // want to write it out
    stringstream csv_output;
    // iterate over the profiles in the address map
    screen_output << "Elapsed time: " << wall_clock_main << endl;
    screen_output << "Cores detected: " << hardware_concurrency() << endl;
    screen_output << "Worker Threads observed: " << num_worker_threads << endl;
    screen_output << "Available CPU time: " << total_main << endl;
    map<apex_function_address, profile*>::const_iterator it;
    screen_output << "Action                         :  #calls  |  minimum |    mean  |  maximum |   total  |  stddev  |  % total  " << apex_options::papi_metrics() << endl;
    screen_output << "------------------------------------------------------------------------------------------------------------" << endl;
    csv_output << "\"task\",\"num calls\",\"total cycles\",\"total microseconds\"";
#if APEX_HAVE_PAPI
    for (int i = 0 ; i < num_papi_counters ; i++) {
       csv_output << ",\"" << metric_names[i] << "\"";
    }
#endif
    csv_output << endl;
    double total_accumulated = 0.0;
    unordered_map<task_identifier, profile*>::const_iterator it2;
    std::vector<task_identifier> id_vector;
    // iterate over the counters, and sort their names
    std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);
    for(it2 = task_map.begin(); it2 != task_map.end(); it2++) {
        task_identifier task_id = it2->first;
        profile * p = it2->second;
        if (p->get_type() != APEX_TIMER) {
            id_vector.push_back(task_id);
        }
    }
    std::sort(id_vector.begin(), id_vector.end());
    // iterate over the counters
    for(task_identifier task_id : id_vector) {
        profile * p = task_map[task_id];
        if (p) {
            write_one_timer(task_id, p, screen_output, csv_output, total_accumulated, total_main);
        }
    }
    id_vector.clear();
    // iterate over the timers, and sort their names
    for(it2 = task_map.begin(); it2 != task_map.end(); it2++) {
        profile * p = it2->second;
        task_identifier task_id = it2->first;
        if (p->get_type() == APEX_TIMER) {
            id_vector.push_back(task_id);
        }
    }
    // iterate over the timers
    std::sort(id_vector.begin(), id_vector.end());
    // iterate over the counters
    for(task_identifier task_id : id_vector) {
        profile * p = task_map[task_id];
        if (p) {
            write_one_timer(task_id, p, screen_output, csv_output, total_accumulated, total_main);
        }
    }
    double idle_rate = total_main - (total_accumulated*profiler::get_cpu_mhz());
    screen_output << string_format("%30s", APEX_IDLE_TIME) << " : ";
    screen_output << " --n/a--   " ;
    screen_output << " --n/a--   " ;
    screen_output << " --n/a--   " ;
    screen_output << " --n/a--   " ;
    if (idle_rate < 0.0) {
      screen_output << " --n/a--   " ;
    } else {
      screen_output << string_format(FORMAT_SCIENTIFIC, idle_rate) << "   " ;
    }
    screen_output << " --n/a--   " ;
    if (idle_rate < 0.0) {
      screen_output << " --n/a--   " << endl;
    } else {
      screen_output << string_format(FORMAT_PERCENT, ((idle_rate/total_main)*100)) << endl;
    }
    screen_output << "------------------------------------------------------------------------------------------------------------" << endl;
    if (apex_options::use_screen_output()) {
        cout << screen_output.str();
    }
    if (apex_options::use_csv_output()) {
        ofstream csvfile;
        stringstream csvname;
        csvname << "apex." << node_id << ".csv";
        csvfile.open(csvname.str(), ios::out);
        csvfile << csv_output.str();
        csvfile.close();
    }
  }

/* The following code is from:
   http://stackoverflow.com/questions/7706339/grayscale-to-red-green-blue-matlab-jet-color-scale */
class node_color {
public:
    double red;
    double green;
    double blue;
    node_color() : red(1.0), green(1.0), blue(1.0) {}
    int convert(double in) { return (int)(in * 255.0); }
} ;

node_color * get_node_color(double v,double vmin,double vmax)
{
   node_color * c = new node_color();
   double dv;

   if (v < vmin)
      v = vmin;
   if (v > vmax)
      v = vmax;
   dv = vmax - vmin;

   if (v < (vmin + 0.25 * dv)) {
      c->red = 0;
      c->green = 4 * (v - vmin) / dv;
   } else if (v < (vmin + 0.5 * dv)) {
      c->red = 0;
      c->blue = 1 + 4 * (vmin + 0.25 * dv - v) / dv;
   } else if (v < (vmin + 0.75 * dv)) {
      c->red = 4 * (v - vmin - 0.5 * dv) / dv;
      c->blue = 0;
   } else {
      c->green = 1 + 4 * (vmin + 0.75 * dv - v) / dv;
      c->blue = 0;
   }

   return(c);
}

  void profiler_listener::write_taskgraph(void) {
    ofstream myfile;
    stringstream dotname;
    dotname << "taskgraph." << node_id << ".dot";
    myfile.open(dotname.str().c_str());

    myfile << "digraph prof {\n rankdir=\"LR\";\n node [shape=box];\n";
    for(auto dep = task_dependencies.begin(); dep != task_dependencies.end(); dep++) {
        task_identifier parent = dep->first;
        auto children = dep->second;
        string parent_name = parent.get_name();
        for(auto offspring = children->begin(); offspring != children->end(); offspring++) {
            task_identifier child = offspring->first;
            int count = offspring->second;
            string child_name = child.get_name();
            myfile << "  \"" << parent_name << "\" -> \"" << child_name << "\"";
            myfile << " [ label=\"  count: " << count << "\" ]; " << std::endl;

        }
    }

    // our TOTAL available time is the elapsed * the number of threads, or cores
	int num_worker_threads = thread_instance::get_num_threads();
#ifdef APEX_HAVE_HPX
    num_worker_threads = num_worker_threads - num_non_worker_threads_registered;
#endif
    double total_main = main_timer->elapsed() *
        fmin(hardware_concurrency(), num_worker_threads);

    // output nodes with  "main" [shape=box; style=filled; fillcolor="#ff0000" ];
    unordered_map<task_identifier, profile*>::const_iterator it;
    std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);
    for(it = task_map.begin(); it != task_map.end(); it++) {
      profile * p = it->second;
      if (p->get_type() == APEX_TIMER) {
        node_color * c = get_node_color((p->get_accumulated()*profiler::get_cpu_mhz()), 0.0, total_main);
        task_identifier task_id = it->first;
        myfile << "  \"" << task_id.get_name() << "\" [shape=box; style=filled; fillcolor=\"#" <<
            setfill('0') << setw(2) << hex << c->convert(c->red) <<
            setfill('0') << setw(2) << hex << c->convert(c->green) <<
            setfill('0') << setw(2) << hex << c->convert(c->blue) << "\"" <<
            "; label=\"" << task_id.get_name() << ":\\n" << (p->get_accumulated()*profiler::get_cpu_mhz()) << "s\" ];" << std::endl;
      }
    }
    myfile << "}\n";
    myfile.close();
  }

  /* When writing a TAU profile, write out a timer line */
  void format_line(ofstream &myfile, profile * p) {
    myfile << p->get_calls() << " ";
    myfile << 0 << " ";
    myfile << ((p->get_accumulated()*profiler::get_cpu_mhz())) << " ";
    myfile << ((p->get_accumulated()*profiler::get_cpu_mhz())) << " ";
    myfile << 0 << " ";
    myfile << "GROUP=\"TAU_USER\" ";
    myfile << endl;
  }

  /* When writing a TAU profile, write out the main timer line */
  void format_line(ofstream &myfile, profile * p, double not_main) {
    myfile << p->get_calls() << " ";
    myfile << 0 << " ";
    myfile << (max(((p->get_accumulated()*profiler::get_cpu_mhz()) - not_main),0.0)) << " ";
    myfile << ((p->get_accumulated()*profiler::get_cpu_mhz())) << " ";
    myfile << 0 << " ";
    myfile << "GROUP=\"TAU_USER\" ";
    myfile << endl;
  }

  /* When writing a TAU profile, write out a counter line */
  void format_counter_line(ofstream &myfile, profile * p) {
    myfile << p->get_calls() << " ";       // numevents
    myfile << p->get_maximum() << " ";     // max
    myfile << p->get_minimum() << " ";     // min
    myfile << p->get_mean() << " ";        // mean
    myfile << p->get_sum_squares() << " ";
    myfile << endl;
  }

  /* Write TAU profiles from the collected data. */
  void profiler_listener::write_profile() {
    ofstream myfile;
    stringstream datname;
    // name format: profile.nodeid.contextid.threadid
    // We only write one profile per process
    datname << "profile." << node_id << ".0.0";

    // name format: profile.nodeid.contextid.threadid
    myfile.open(datname.str().c_str());
    int counter_events = 0;

    // Determine number of counter events, as these need to be
    // excluded from the number of normal timers
    unordered_map<task_identifier, profile*>::const_iterator it2;
    std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);
    for(it2 = task_map.begin(); it2 != task_map.end(); it2++) {
      profile * p = it2->second;
      if(p->get_type() == APEX_COUNTER) {
        counter_events++;
      }
    }
    int function_count = task_map.size() - counter_events;

    // Print the normal timers to the profile file
    // 1504 templated_functions_MULTI_TIME
    myfile << function_count << " templated_functions_MULTI_TIME" << endl;
    // # Name Calls Subrs Excl Incl ProfileCalls #
    myfile << "# Name Calls Subrs Excl Incl ProfileCalls #" << endl;
    thread_instance ti = thread_instance::instance();

    // Iterate over the profiles which are associated to a function
    // by name. Only output the regular timers now. Counters are
    // in a separate section, below.
    profile * mainp = nullptr;
    double not_main = 0.0;
    for(it2 = task_map.begin(); it2 != task_map.end(); it2++) {
      profile * p = it2->second;
	  task_identifier task_id = it2->first;
      if(p->get_type() == APEX_TIMER) {
        string action_name = task_id.get_name();
        if(strcmp(action_name.c_str(), APEX_MAIN) == 0) {
          mainp = p;
        } else {
          myfile << "\"" << action_name << "\" ";
          format_line (myfile, p);
          not_main += (p->get_accumulated()*profiler::get_cpu_mhz());
        }
      }
    }
    if (mainp != nullptr) {
      myfile << "\"" << APEX_MAIN << "\" ";
      format_line (myfile, mainp, not_main);
    }

    // 0 aggregates
    myfile << "0 aggregates" << endl;

    // Now process the counters, if there are any.
    if(counter_events > 0) {
      myfile << counter_events << " userevents" << endl;
      myfile << "# eventname numevents max min mean sumsqr" << endl;
      for(it2 = task_map.begin(); it2 != task_map.end(); it2++) {
        profile * p = it2->second;
        if(p->get_type() == APEX_COUNTER) {
          task_identifier task_id = it2->first;
          myfile << "\"" << task_id.get_name() << "\" ";
          format_counter_line (myfile, p);
        }
      }
    }
    myfile.close();
  }

  /*
   * The main function for the consumer thread has to be static, but
   * the processing needs access to member variables, so get the
   * profiler_listener instance, and call it's proper function.
   *
   * This is a wrapper, so that we can launch the thread and set
   * affinity. However, process_profiles_wrapper is also used by the
   * last worker that calls apex_finalize(), so we don't want to change
   * that thread's affinity. So this wrapper is only for the consumer
   * thread.
   */
  void profiler_listener::consumer_process_profiles_wrapper(void) {
      if (apex_options::pin_apex_threads()) {
      	  set_thread_affinity();
	  }
      process_profiles_wrapper();
  }

  /*
   * The main function for the consumer thread has to be static, but
   * the processing needs access to member variables, so get the
   * profiler_listener instance, and call it's proper function.
   */
  void profiler_listener::process_profiles_wrapper(void) {
      apex * inst = apex::instance();
      if (inst != nullptr) {
          profiler_listener * pl = inst->the_profiler_listener;
          if (pl != nullptr) {
#ifdef APEX_TRACE_APEX
      profiler * p = start((apex_function_address)&profiler_listener::process_profiles_wrapper);
              pl->process_profiles();
      stop(p);
#else
              pl->process_profiles();
#endif
          }
      }
  }

  bool profiler_listener::concurrent_cleanup(void){
      std::shared_ptr<profiler> p;
#ifdef APEX_MULTIPLE_QUEUES
      std::unique_lock<std::mutex> queue_lock(queue_mtx);
	  std::vector<profiler_queue_t*>::const_iterator a_queue;
	  for (a_queue = allqueues.begin() ; a_queue != allqueues.end() ; ++a_queue) {
	    thequeue = *a_queue;
      	while(thequeue->try_dequeue(p)) {
        	process_profile(p,0);
      	}
	  }
#else
      while(thequeue.try_dequeue(p)) {
       	process_profile(p,0);
      }
#endif
      return true;
  }

  /* This is the main function for the consumer thread.
   * It will wait at a semaphore for pending work. When there is
   * work on one or more queues, it will iterate over the queues
   * and process the pending profiler objects, updating the profiles
   * as it goes. */
  void profiler_listener::process_profiles(void)
  {
    if (!_initialized) {
      initialize_worker_thread_for_TAU();
      _initialized = true;
    }
#ifdef APEX_HAVE_TAU
    if (apex_options::use_tau()) {
      TAU_START("profiler_listener::process_profiles");
    }
#endif

    std::shared_ptr<profiler> p;
    task_dependency* td;
    // Main loop. Stay in this loop unless "done".
#ifndef APEX_HAVE_HPX
    while (!_done) {
      queue_signal.wait();
#endif
#ifdef APEX_HAVE_TAU
      /*
    if (apex_options::use_tau()) {
      TAU_START("profiler_listener::process_profiles: main loop");
    }
    */
#endif
#ifdef APEX_MULTIPLE_QUEUES
	  std::vector<profiler_queue_t*>::const_iterator a_queue;
      std::unique_lock<std::mutex> queue_lock(queue_mtx);
	  int i = 0;
	  for (a_queue = allqueues.begin() ; a_queue != allqueues.end() ; ++a_queue) {
	    thequeue = *a_queue;
        while(!_done && thequeue->try_dequeue(p)) {
          process_profile(p, 0);
#ifdef APEX_HAVE_HPX // don't hang out in this task too long.
		  if (++i > 1000) break;
#endif
        }
	  }
#else
        while(!_done && thequeue.try_dequeue(p)) {
             process_profile(p, 0);
        }
#endif
      if (apex_options::use_taskgraph_output()) {
        while(!_done && dependency_queue.try_dequeue(td)) {
          process_dependency(td);
        }
      }
      /*
       * I want to process the tasks concurrently, but this loop
       * is too much overhead. Maybe dequeue them in batches?
       */
      /*
      std::vector<std::future<void>> pending_futures;
      while(!_done && thequeue.try_dequeue(p)) {
          auto f = std::async(my_stupid_wrapper,p);
          // transfer the future's shared state to a longer-lived future
          pending_futures.push_back(std::move(f));
      }
      for (auto iter = pending_futures.begin() ; iter < pending_futures.end() ; iter++ ) {
          iter->get();
      }
      */

#ifdef APEX_HAVE_TAU
      /*
      if (apex_options::use_tau()) {
        TAU_STOP("profiler_listener::process_profiles: main loop");
      }
      */
#endif
#ifndef APEX_HAVE_HPX
    }

    if (apex_options::use_taskgraph_output()) {
      // process the task dependencies
      while(dependency_queue.try_dequeue(td)) {
        process_dependency(td);
      }
    }

#endif // NOT DEFINED APEX_HAVE_HPX

#ifdef APEX_HAVE_HPX
    consumer_task_running.clear(memory_order_release);
#endif

#ifdef APEX_HAVE_TAU
    if (apex_options::use_tau()) {
      TAU_STOP("profiler_listener::process_profiles");
    }
#endif
  }

#if APEX_HAVE_PAPI
APEX_NATIVE_TLS int EventSet = PAPI_NULL;
enum papi_state { papi_running, papi_suspended };
APEX_NATIVE_TLS papi_state thread_papi_state = papi_suspended;
#define PAPI_ERROR_CHECK(name) \
if (rc != 0) cout << "name: " << rc << ": " << PAPI_strerror(rc) << endl;

  void profiler_listener::initialize_PAPI(bool first_time) {
      int rc = 0;
      if (first_time) {
        PAPI_library_init( PAPI_VER_CURRENT );
        //rc = PAPI_multiplex_init(); // use more counters than allowed
        //PAPI_ERROR_CHECK(PAPI_multiplex_init);
        PAPI_thread_init( thread_instance::get_id );
        // default
        //rc = PAPI_set_domain(PAPI_DOM_ALL);
        //PAPI_ERROR_CHECK(PAPI_set_domain);
      } else {
        PAPI_register_thread();
      }
      rc = PAPI_create_eventset(&EventSet);
      PAPI_ERROR_CHECK(PAPI_create_eventset);
      // default
      //rc = PAPI_assign_eventset_component (EventSet, 0);
      //PAPI_ERROR_CHECK(PAPI_assign_eventset_component);
      // default
      //rc = PAPI_set_granularity(PAPI_GRN_THR);
      //PAPI_ERROR_CHECK(PAPI_set_granularity);
      // unnecessary complexity
      //rc = PAPI_set_multiplex(EventSet);
      //PAPI_ERROR_CHECK(PAPI_set_multiplex);
      // parse the requested set of papi counters
      // The string is modified by strtok, so copy it.
      if (strlen(apex_options::papi_metrics()) > 0) {
        std::stringstream tmpstr(apex_options::papi_metrics());
        // use stream iterators to copy the stream to the vector as whitespace separated strings
        std::istream_iterator<std::string> tmpstr_it(tmpstr);
        std::istream_iterator<std::string> tmpstr_end;
        std::vector<std::string> tmpstr_results(tmpstr_it, tmpstr_end);
        int code;
        // iterate over the counter names in the vector
        for (auto p : tmpstr_results) {
          int rc = PAPI_event_name_to_code(const_cast<char*>(p.c_str()), &code);
          if (PAPI_query_event (code) == PAPI_OK) {
            rc = PAPI_add_event(EventSet, code);
            PAPI_ERROR_CHECK(PAPI_add_event);
            if (rc != 0) { printf ("Event that failed: %s\n", p.c_str()); }
            if (first_time) {
              metric_names.push_back(string(p.c_str()));
              num_papi_counters++;
            }
          }
        }
        if (!apex_options::papi_suspend()) {
            rc = PAPI_start( EventSet );
            PAPI_ERROR_CHECK(PAPI_start);
            thread_papi_state = papi_running;
        }
      }
  }

#endif

  /* When APEX gets a STARTUP event, do some initialization. */
  void profiler_listener::on_startup(startup_event_data &data) {
    if (!_done) {
      my_tid = (unsigned int)thread_instance::get_id();
#ifndef APEX_HAVE_HPX
      // Start the consumer thread, to process profiler objects.
      consumer_thread = new std::thread(consumer_process_profiles_wrapper);
#endif

#if APEX_HAVE_PAPI
      initialize_PAPI(true);
      event_sets[0] = EventSet;
#endif

      /* This commented out code is to change the priority of the consumer thread.
       * IDEALLY, I would like to make this a low priority thread, but that is as
       * yet unsuccessful. */
#if 0
      int retcode;
      int policy;

      pthread_t threadID = (pthread_t) consumer_thread->native_handle();

      struct sched_param param;

      if ((retcode = pthread_getschedparam(threadID, &policy, &param)) != 0)
      {
        errno = retcode;
        perror("pthread_getschedparam");
        exit(EXIT_FAILURE);
      }
      std::cout << "INHERITED: ";
      std::cout << "policy=" << ((policy == SCHED_FIFO)  ? "SCHED_FIFO" :
          (policy == SCHED_RR)    ? "SCHED_RR" :
          (policy == SCHED_OTHER) ? "SCHED_OTHER" :
          "???")
        << ", priority=" << param.sched_priority << " of " << sched_get_priority_min(policy) << "," << sched_get_priority_max(policy)  << std::endl;
      //param.sched_priority = 10;
      if ((retcode = pthread_setschedparam(threadID, policy, &param)) != 0)
      {
        errno = retcode;
        perror("pthread_setschedparam");
        exit(EXIT_FAILURE);
      }
#endif

      // time the whole application.
      main_timer = std::make_shared<profiler>(new task_identifier(string(APEX_MAIN)));
#if APEX_HAVE_PAPI
      if (num_papi_counters > 0 && !apex_options::papi_suspend() && thread_papi_state == papi_running) {
        int rc = PAPI_read( EventSet, main_timer->papi_start_values );
        PAPI_ERROR_CHECK(PAPI_read);
      }
#endif
    }
    APEX_UNUSED(data);
  }

  /* On the shutdown event, notify the consumer thread that we are done
   * and set the "terminate" flag. */
  void profiler_listener::on_shutdown(shutdown_event_data &data) {
    if (_done) { return; }
    if (!_done) {
      _done = true;
      node_id = data.node_id;
      //sleep(1);
#ifndef APEX_HAVE_HPX
      queue_signal.post();
      if (consumer_thread != nullptr) {
          consumer_thread->join();
      }
#endif

      // stop the main timer, and process that profile?
      main_timer->stop();
#if APEX_HAVE_PAPI
      if (num_papi_counters > 0 && !apex_options::papi_suspend() && thread_papi_state == papi_running) {
        int rc = PAPI_read( EventSet, main_timer->papi_stop_values );
        PAPI_ERROR_CHECK(PAPI_read);
      }
#endif
      // if this profile is processed, it will get deleted. so don't process it!
      // It also clutters up the final profile, if generated.
      //process_profile(main_timer.get(), my_tid);

      // output to screen?
      if ((apex_options::use_screen_output() ||
           apex_options::use_csv_output()) && node_id == 0)
      {
#ifdef APEX_MULTIPLE_QUEUES
        size_t ignored = 0;
        std::unique_lock<std::mutex> queue_lock(queue_mtx);
	    std::vector<profiler_queue_t*>::const_iterator a_queue;
	    for (a_queue = allqueues.begin() ; a_queue != allqueues.end() ; ++a_queue) {
	      thequeue = *a_queue;
          ignored += thequeue->size_approx();
		}
#else
        size_t ignored = thequeue.size_approx();
#endif
        if (ignored > 0) {
          std::cerr << "Info: " << ignored << " items remaining on on the profiler_listener queue...";
        }
#ifndef APEX_HAVE_HPX
        // We might be done, but check to make sure the queue is empty
        std::vector<std::future<bool>> pending_futures;
        for (unsigned int i=0; i<hardware_concurrency(); ++i) {
#ifdef APEX_STATIC
            /* Static libC++ doesn't do async very well. In fact, it crashes. */
            auto f = std::async(&profiler_listener::concurrent_cleanup,this);
#else // APEX_STATIC
            auto f = std::async(std::launch::async,&profiler_listener::concurrent_cleanup,this);
#endif // APEX_STATIC
            // transfer the future's shared state to a longer-lived future
            pending_futures.push_back(std::move(f));
        }
        for (auto iter = pending_futures.begin() ; iter < pending_futures.end() ; iter++ ) {
            iter->get();
        }
#endif // APEX_HAVE_HPX
        if (ignored > 0) {
          std::cerr << "done." << std::endl;
        }
        if (apex_options::use_screen_output() || apex_options::use_csv_output()) {
            finalize_profiles();
        }
      }
      if (apex_options::use_taskgraph_output() && node_id == 0)
      {
        write_taskgraph();
      }

      // output to 1 TAU profile per process?
      if (apex_options::use_profile_output() && !apex_options::use_tau()) {
        write_profile();
      }
#if !defined(_MSC_VER)
      if (apex_options::task_scatterplot()) {
          // get the length of the stream
          int loc0 = task_scatterplot_samples.tellp();
          // lock access to the file
          // write using low-level file locking!
          struct flock fl;
          fl.l_type   = F_WRLCK;  /* F_RDLCK, F_WRLCK, F_UNLCK    */
          fl.l_whence = SEEK_SET; /* SEEK_SET, SEEK_CUR, SEEK_END */
          fl.l_start  = 0;        /* Offset from l_whence         */
          fl.l_len    = 0;        /* length, 0 = to EOF           */
          fl.l_pid    = getpid();      /* our PID                      */
          fcntl(task_scatterplot_sample_file, F_SETLKW, &fl);  /* F_GETLK, F_SETLK, F_SETLKW */
          // flush the string stream to the file
          //lseek(task_scatterplot_sample_file, 0, SEEK_END);
          ssize_t bytes_written = write(task_scatterplot_sample_file,
                task_scatterplot_samples.str().c_str(), loc0);
          if (bytes_written < 0) {
              int errsv = errno;
              perror("Error writing to scatterplot!");
              fprintf(stderr, "Error writing scatterplot:\n%s\n",
                      strerror(errsv));
          }
          fl.l_type   = F_UNLCK;   /* tell it to unlock the region */
          fcntl(task_scatterplot_sample_file, F_SETLK, &fl); /* set the region to unlocked */
          close(task_scatterplot_sample_file);
      }
#endif

    }
    /* The cleanup is disabled for now. Why? Because we want to be able
     * to access the profiles at the end of the run, after APEX has
     * finalized. */
    // cleanup.
    // delete_profiles();
  }

  /* When a new node is created */
  void profiler_listener::on_new_node(node_event_data &data) {
    if (!_done) {
    }
    APEX_UNUSED(data);
  }

  /* When a new thread is registered, expand all of our storage as necessary
   * to handle the new thread */
  void profiler_listener::on_new_thread(new_thread_event_data &data) {
    if (!_done) {
      my_tid = (unsigned int)thread_instance::get_id();
	  async_thread_setup();
#if APEX_HAVE_PAPI
      initialize_PAPI(false);
      event_set_mutex.lock();
      if (my_tid >= event_sets.size()) {
        if (my_tid >= event_sets.size()) {
          event_sets.resize(my_tid + 1);
        }
      }
      event_sets[my_tid] = EventSet;
      event_set_mutex.unlock();
#endif
    }
    APEX_UNUSED(data);
  }

  extern "C" int main (int, char**);

  /* When a start event happens, create a profiler object. Unless this
   * named event is throttled, in which case do nothing, as quickly as possible */
  inline bool profiler_listener::_common_start(task_identifier * id, bool is_resume) {
    if (!_done) {
#if defined(APEX_THROTTLE)
      // if this timer is throttled, return without doing anything
      unordered_set<task_identifier>::const_iterator it;
	  {
      	read_lock_type l(throttled_event_set_mutex);
        it = throttled_tasks.find(*id);
      }
      if (it != throttled_tasks.end()) {
          /*
           * The throw is removed, because it is a performance penalty on some systems
           * on_start now returns a boolean
           */
        //throw disabled_profiler_exception(); // to be caught by apex::start/resume
        return false;
      }
#endif
      // start the profiler object, which starts our timers
      //std::shared_ptr<profiler> p = std::make_shared<profiler>(id, is_resume);
      profiler * p = new profiler(id, is_resume);
      thread_instance::instance().set_current_profiler(p);
#if APEX_HAVE_PAPI
      if (num_papi_counters > 0 && !apex_options::papi_suspend()) {
          // if papi was previously suspended, we need to start the counters
          if (thread_papi_state == papi_suspended) {
            int rc = PAPI_start( EventSet );
            PAPI_ERROR_CHECK(PAPI_start);
            thread_papi_state = papi_running;
          }
          int rc = PAPI_read( EventSet, p->papi_start_values );
          PAPI_ERROR_CHECK(PAPI_read);
      } else {
          // if papi is still running, stop the counters
          if (thread_papi_state == papi_running) {
            long long dummy[8];
            int rc = PAPI_stop( EventSet, dummy );
            PAPI_ERROR_CHECK(PAPI_stop);
            thread_papi_state = papi_suspended;
          }
      }
#endif
    } else {
        return false;
    }
    return true;
  }

  inline void profiler_listener::push_profiler(int my_tid, std::shared_ptr<profiler> &p) {
  	  // if we aren't processing profiler objects, just return.
  	  if (!apex_options::process_async_state()) { return; }
#ifdef APEX_TRACE_APEX
  	  if (p->task_id->address == (uint64_t)&profiler_listener::process_profiles_wrapper) { return; }
#endif
      // we have to make a local copy, because lockfree queues DO NOT SUPPORT shared_ptrs!
#ifdef APEX_MULTIPLE_QUEUES
      thequeue->enqueue(p);
#else
      thequeue.enqueue(p);
#endif
	  /*
      bool worked = thequeue.enqueue(p);
      if (!worked) {
          static std::atomic<bool> issued(false);
          if (!issued) {
              issued = true;
              cout << "APEX Warning : failed to push " << p->task_id->get_name() << endl;
              cout << "One or more frequently-called, lightweight functions is being timed." << endl;
          }
      }
	  */
#ifndef APEX_HAVE_HPX
      queue_signal.post();
#endif
#ifdef APEX_HAVE_HPX
      apex_schedule_process_profiles();
#endif
  }

  /* Stop the timer, if applicable, and queue the profiler object */
  inline void profiler_listener::_common_stop(std::shared_ptr<profiler> &p, bool is_yield) {
    if (!_done) {
      if (p) {
        p->stop(is_yield);
#if APEX_HAVE_PAPI
        if (num_papi_counters > 0 && !apex_options::papi_suspend() && thread_papi_state == papi_running) {
            int rc = PAPI_read( EventSet, p->papi_stop_values );
            PAPI_ERROR_CHECK(PAPI_read);
        }
#endif
        // Why is this happening now?  Why not at start? Why not at create?
        /*
        if (apex_options::use_taskgraph_output()) {
          if (!p->is_resume) {
            // get the PARENT profiler
            profiler * parent_profiler = nullptr;
            try {
              parent_profiler = thread_instance::instance().get_parent_profiler();
              if (parent_profiler != NULL) {
                task_identifier * parent = parent_profiler->task_id;
                task_identifier * child = p->task_id;
                dependency_queue.enqueue(new task_dependency(parent, child));
              }
            } catch (empty_stack_exception& e) { }
          }
        }
        */
        push_profiler(my_tid, p);
      }
    }
  }

  /* Start the timer */
  bool profiler_listener::on_start(task_identifier * id) {
    return _common_start(id, false);
  }

  /* This is just like starting a timer, but don't increment the number of calls
   * value. That is because we are restarting an existing timer. */
  bool profiler_listener::on_resume(task_identifier * id) {
    return _common_start(id, true);
  }

   /* Stop the timer */
  void profiler_listener::on_stop(std::shared_ptr<profiler> &p) {
    _common_stop(p, p->is_resume); // don't change the yield/resume value!
  }

  /* Stop the timer, but don't increment the number of calls */
  void profiler_listener::on_yield(std::shared_ptr<profiler> &p) {
    _common_stop(p, true);
  }

  /* When a thread exits, pop and stop all timers. */
  void profiler_listener::on_exit_thread(event_data &data) {
    APEX_UNUSED(data);
  }

  /* When an asynchronous thread is launched, they should
   * call apex::async_thread_setup() which will end up here.*/
  void profiler_listener::async_thread_setup(void) {
#ifdef APEX_MULTIPLE_QUEUES
	  // for asynchronous threads, check to make sure there is a queue!
	  if (thequeue == nullptr) {
	  	thequeue = new profiler_queue_t();
	  	{
        	std::unique_lock<std::mutex> queue_lock(queue_mtx);
	    	allqueues.push_back(thequeue);
	  	}
	  }
#endif
  }

  /* When a sample value is processed, save it as a profiler object, and queue it. */
  void profiler_listener::on_sample_value(sample_value_event_data &data) {
    if (!_done) {
      std::shared_ptr<profiler> p = std::make_shared<profiler>(new task_identifier(*data.counter_name), data.counter_value);
      p->is_counter = data.is_counter;
      push_profiler(my_tid, p);
    }
  }

  void profiler_listener::on_new_task(task_identifier * id, uint64_t task_id) {
    //cout << "New task: " << task_id << endl;
    if (!apex_options::use_taskgraph_output()) { return; }
    // get the current profiler
    profiler * p = thread_instance::instance().get_current_profiler();
    if (p != NULL) {
        dependency_queue.enqueue(new task_dependency(p->task_id, id));
    } else {
        task_identifier * parent = new task_identifier(string("__start"));
        dependency_queue.enqueue(new task_dependency(parent, id));
    }
  }

  /* Communication send event. Save the number of bytes. */
  void profiler_listener::on_send(message_event_data &data) {
    if (!_done) {
      std::shared_ptr<profiler> p = std::make_shared<profiler>(new task_identifier("Bytes Sent"), (double)data.size);
      push_profiler(0, p);
    }
  }

  /* Communication recv event. Save the number of bytes. */
  void profiler_listener::on_recv(message_event_data &data) {
    if (!_done) {
      std::shared_ptr<profiler> p = std::make_shared<profiler>(new task_identifier("Bytes Received"), (double)data.size);
      push_profiler(0, p);
    }
  }

  /* For periodic stuff. Do something? */
  void profiler_listener::on_periodic(periodic_event_data &data) {
    if (!_done) {
    }
    APEX_UNUSED(data);
  }

  /* For custom event stuff. Do something? */
  void profiler_listener::on_custom_event(custom_event_data &data) {
    if (!_done) {
    }
    APEX_UNUSED(data);
  }

  void profiler_listener::reset(task_identifier * id) {
    std::shared_ptr<profiler> p;
    p = std::make_shared<profiler>(id, false, reset_type::CURRENT);
    push_profiler(my_tid, p);
  }

  profiler_listener::~profiler_listener (void) {
      _done = true; // yikes!
      finalize();
      delete_profiles();
#ifndef APEX_HAVE_HPX
      delete consumer_thread;
#endif
  };

}

#ifdef APEX_HAVE_HPX
HPX_DECLARE_ACTION(::apex::profiler_listener::process_profiles_wrapper, apex_internal_process_profiles_action);
HPX_ACTION_HAS_CRITICAL_PRIORITY(apex_internal_process_profiles_action);
HPX_PLAIN_ACTION(::apex::profiler_listener::process_profiles_wrapper, apex_internal_process_profiles_action);

void apex_schedule_process_profiles() {
    if(get_hpx_runtime_ptr() == nullptr) return;
    if(hpx_shutdown) {
        ::apex::profiler_listener::process_profiles_wrapper();
    } else {
		if(!consumer_task_running.test_and_set(memory_order_acq_rel)) {
        	apex_internal_process_profiles_action act;
        	try {
           		hpx::apply(act, hpx::find_here());
        	} catch(...) {
           		// During shutdown, we can't schedule a new task,
           		// so we process profiles ourselves.
           		profiler_listener::process_profiles_wrapper();
			}
		}
    }
}

#endif




Line	% of fetches	Source
1		// Copyright (c) 2014 University of Oregon
2		//
3
4		#ifdef APEX_HAVE_HPX
5		#include <hpx/config.hpp>
6		#ifdef APEX_HAVE_OTF2
7		#define APEX_TRACE_APEX
8		#endif // APEX_HAVE_OTF2
9		#endif // APEX_HAVE_HPX
10
11		#include "profiler_listener.hpp"
12		#include "profiler.hpp"
13		#include "thread_instance.hpp"
14		#include <iostream>
15		#include <iomanip>
16		#include <fstream>
17		#include <math.h>
18		#include "apex_options.hpp"
19		#include "profiler.hpp"
20		#include "profile.hpp"
21		#include "apex.hpp"
22
23		#include <atomic>
24		#if !defined(_WIN32) && (defined(__unix__) \|\| defined(__unix) \|\| (defined(__APPLE__) && defined(__MACH__)))
25		#include <unistd.h>
26		#include <sched.h>
27		#endif
28		#include <cstdio>
29		#include <vector>
30		#include <string>
31		#include <unordered_set>
32		#include <algorithm>
33		#include <iterator>
34
35		#include <functional>
36		#include <thread>
37		#include <future>
38
39		#if defined(APEX_THROTTLE)
40		#include "apex_cxx_shared_lock.hpp"
41		apex::shared_mutex_type throttled_event_set_mutex;
42		#define APEX_THROTTLE_CALLS 1000
43		#ifdef APEX_USE_CLOCK_TIMESTAMP
44		#define APEX_THROTTLE_PERCALL 0.00001 // 10 microseconds.
45		#else
46		#define APEX_THROTTLE_PERCALL 50000 // 50k cycles.
47		#endif
48		#endif
49
50		#if APEX_HAVE_PAPI
51		#include "papi.h"
52		#include <mutex>
53		std::mutex event_set_mutex;
54		#endif
55
56		#ifdef APEX_HAVE_HPX
57		#include <boost/assign.hpp>
58		#include <boost/cstdint.hpp>
59		#include <hpx/include/performance_counters.hpp>
60		#include <hpx/include/actions.hpp>
61		#include <hpx/include/util.hpp>
62		#include <hpx/lcos/local/composable_guard.hpp>
63		static void apex_schedule_process_profiles(void); // not in apex namespace
64		const int num_non_worker_threads_registered = 0;
65		#endif
66
67		#define APEX_MAIN "APEX MAIN"
68
69		#ifdef APEX_HAVE_TAU
70		#define PROFILING_ON
71		#define TAU_DOT_H_LESS_HEADERS
72		#include <TAU.h>
73		#endif
74
75		#include "utils.hpp"
76
77		#include <cstdlib>
78		#include <ctime>
79
80		using namespace std;
81		using namespace apex;
82
83		APEX_NATIVE_TLS unsigned int my_tid = 0; // the current thread's TID in APEX
84
85		namespace apex {
86
87		#ifdef APEX_MULTIPLE_QUEUES
88		/* this is a thread-local pointer to a concurrent queue for each worker thread. */
89		__thread profiler_queue_t * thequeue;
90		#endif
91
92		/* THis is a special profiler, indicating that the timer requested is
93		throttled, and shouldn't be processed. */
94		profiler* profiler::disabled_profiler = new profiler();
95
96		#ifdef APEX_HAVE_HPX
97		/* Flag indicating whether a consumer task is currently running */
98		std::atomic_flag consumer_task_running = ATOMIC_FLAG_INIT;
99		bool hpx_shutdown = false;
100		#endif
101
102		double profiler_listener::get_non_idle_time() {
103		double non_idle_time = 0.0;
104		/* Iterate over all timers and accumulate the time spent in them */
105		unordered_map<task_identifier, profile*>::const_iterator it2;
106		std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);
107		for(it2 = task_map.begin(); it2 != task_map.end(); it2++) {
108		profile * p = it2->second;
109		#if defined(APEX_THROTTLE)
110		task_identifier id = it2->first;
111		unordered_set<task_identifier>::const_iterator it4;
112		{
113		read_lock_type l(throttled_event_set_mutex);
114		it4 = throttled_tasks.find(id);
115		}
116		if (it4!= throttled_tasks.end()) {
117		continue;
118		}
119		#endif
120		if (p->get_type() == APEX_TIMER) {
121		non_idle_time += p->get_accumulated();
122		}
123		}
124		return non_idle_time*profiler::get_cpu_mhz();
125		}
126
127		profile * profiler_listener::get_idle_time() {
128		double non_idle_time = get_non_idle_time();
129		/* Subtract the accumulated time from the main time span. */
130		int num_worker_threads = thread_instance::get_num_threads();
131		#ifdef APEX_HAVE_HPX
132		num_worker_threads = num_worker_threads - num_non_worker_threads_registered;
133		#endif
134		std::chrono::duration<double> time_span =
135		std::chrono::duration_cast<std::chrono::duration<double>>
136		(MYCLOCK::now() - main_timer->start);
137		double total_main = time_span.count() *
138		fmin(hardware_concurrency(), num_worker_threads);
139		double elapsed = total_main - non_idle_time;
140		elapsed = elapsed > 0.0 ? elapsed : 0.0;
141		profile * theprofile = new profile(elapsed*profiler::get_cpu_mhz(), 0, NULL, false);
142		return theprofile;
143		}
144
145		profile * profiler_listener::get_idle_rate() {
146		double non_idle_time = get_non_idle_time();
147		/* Subtract the accumulated time from the main time span. */
148		int num_worker_threads = thread_instance::get_num_threads();
149		#ifdef APEX_HAVE_HPX
150		num_worker_threads = num_worker_threads - num_non_worker_threads_registered;
151		#endif
152		std::chrono::duration<double> time_span =
153		std::chrono::duration_cast<std::chrono::duration<double>>
154		(MYCLOCK::now() - main_timer->start);
155		double total_main = time_span.count() *
156		fmin(hardware_concurrency(), num_worker_threads);
157		double elapsed = total_main - non_idle_time;
158		double rate = elapsed > 0.0 ? ((elapsed/total_main)) : 0.0;
159		profile * theprofile = new profile(rate, 0, NULL, false);
160		return theprofile;
161		}
162
163		/* Return the requested profile object to the user.
164		* Return nullptr if doesn't exist. */
165		profile * profiler_listener::get_profile(task_identifier &id) {
166		if (id.name == string(APEX_IDLE_RATE)) {
167		return get_idle_rate();
168		} else if (id.name == string(APEX_IDLE_TIME)) {
169		return get_idle_time();
170		} else if (id.name == string(APEX_NON_IDLE_TIME)) {
171		profile * theprofile = new profile(get_non_idle_time(), 0, NULL, false);
172		return theprofile;
173		}
174		std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);
175		unordered_map<task_identifier, profile*>::const_iterator it = task_map.find(id);
176		if (it != task_map.end()) {
177		return (*it).second;
178		}
179		return nullptr;
180		}
181
182		void profiler_listener::reset_all(void) {
183		std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);
184		for(auto &it : task_map) {
185		it.second->reset();
186		}
187		}
188
189		/* After the consumer thread pulls a profiler off of the queue,
190		* process it by updating its profile object in the map of profiles. */
191		// TODO The name-based timer and address-based timer paths through
192		// the code involve a lot of duplication -- this should be refactored
193		// to remove the duplication so it's easier to maintain.
194		unsigned int profiler_listener::process_profile(std::shared_ptr<profiler> &p, unsigned int tid)
195		{
196		if(p == nullptr) return 0;
197		profile * theprofile;
198		if(p->is_reset == reset_type::ALL) {
199		reset_all();
200		return 0;
201		}
202		double values[8] = {0};
203		double tmp_num_counters = 0;
204		#if APEX_HAVE_PAPI
205		tmp_num_counters = num_papi_counters;
206		for (int i = 0 ; i < num_papi_counters ; i++) {
207		if (p->papi_stop_values[i] > p->papi_start_values[i]) {
208		values[i] = p->papi_stop_values[i] - p->papi_start_values[i];
209		} else {
210		values[i] = 0.0;
211		}
212		}
213		#endif
214		std::unique_lock<std::mutex> task_map_lock(_task_map_mutex, std::defer_lock);
215		// There is only one consumer thread except during shutdown, so we only need
216		// to lock during shutdown.
217		bool did_lock = false;
218		if(_done) {
219		task_map_lock.lock();
220		did_lock = true;
221		}
222		unordered_map<task_identifier, profile>::const_iterator it = task_map.find((p->task_id));
223		if (it != task_map.end()) {
224		// A profile for this ID already exists.
225		theprofile = (*it).second;
226		if(_done && did_lock) {
227		task_map_lock.unlock();
228		}
229		if(p->is_reset == reset_type::CURRENT) {
230		theprofile->reset();
231		} else {
232		theprofile->increment(p->elapsed(), tmp_num_counters, values, p->is_resume);
233		}
234		#if defined(APEX_THROTTLE)
235		// Is this a lightweight task? If so, we shouldn't measure it any more,
236		// in order to reduce overhead.
237		if (theprofile->get_calls() > APEX_THROTTLE_CALLS &&
238		theprofile->get_mean() < APEX_THROTTLE_PERCALL) {
239		unordered_set<task_identifier>::const_iterator it2;
240		{
241		read_lock_type l(throttled_event_set_mutex);
242		it2 = throttled_tasks.find(*(p->task_id));
243		}
244		if (it2 == throttled_tasks.end()) {
245		// lock the set for insert
246		{
247		write_lock_type l(throttled_event_set_mutex);
248		// was it inserted when we were waiting?
249		it2 = throttled_tasks.find(*(p->task_id));
250		// no? OK - insert it.
251		if (it2 == throttled_tasks.end()) {
252		throttled_tasks.insert(*(p->task_id));
253		}
254		}
255		if (apex_options::use_screen_output()) {
256		cout << "APEX: disabling lightweight timer "
257		<< p->task_id->get_name()
258		<< endl;
259		fflush(stdout);
260		}
261		}
262		}
263		#endif
264		} else {
265		// Create a new profile for this name.
266		theprofile = new profile(p->is_reset == reset_type::CURRENT ? 0.0 : p->elapsed(), tmp_num_counters, values, p->is_resume, p->is_counter ? APEX_COUNTER : APEX_TIMER);
267		task_map[*(p->task_id)] = theprofile;
268		if(_done && did_lock) {
269		task_map_lock.unlock();
270		}
271		#ifdef APEX_HAVE_HPX
272		#ifdef APEX_REGISTER_HPX3_COUNTERS
273		if(!_done) {
274		if(get_hpx_runtime_ptr() != nullptr && p->task_id->has_name()) {
275		std::string timer_name(p->task_id->get_name());
276		//Don't register timers containing "/"
277		if(timer_name.find("/") == std::string::npos) {
278		hpx::performance_counters::install_counter_type(
279		std::string("/apex/") + timer_name,
280		[p](bool r)->boost::int64_t{
281		boost::int64_t value(p->elapsed());
282		return value;
283		},
284		std::string("APEX counter ") + timer_name,
285		""
286		);
287		}
288		} else {
289		std::cerr << "HPX runtime not initialized yet." << std::endl;
290		}
291		}
292		#endif
293		#endif
294		}
295		#if !defined(_MSC_VER)
296		/* write the sample to the file */
297		if (apex_options::task_scatterplot()) {
298		if (!p->is_counter) {
299		static int thresh = RAND_MAX/100;
300		if (std::rand() < thresh) {
301		std::unique_lock<std::mutex> task_map_lock(_mtx);
302		task_scatterplot_samples << p->normalized_timestamp() << " "
303		<< p->elapsed()profiler::get_cpu_mhz()1000000 << " "
304		<< "'" << p->task_id->get_name() << "'" << endl;
305		int loc0 = task_scatterplot_samples.tellp();
306		if (loc0 > 32768) {
307		// lock access to the file
308		// write using low-level file locking!
309		struct flock fl;
310		fl.l_type = F_WRLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */
311		fl.l_whence = SEEK_SET; /* SEEK_SET, SEEK_CUR, SEEK_END */
312		fl.l_start = 0; /* Offset from l_whence */
313		fl.l_len = 0; /* length, 0 = to EOF */
314		fl.l_pid = getpid(); /* our PID */
315		fcntl(task_scatterplot_sample_file, F_SETLKW, &fl); /* F_GETLK, F_SETLK, F_SETLKW */
316		// flush the string stream to the file
317		//lseek(task_scatterplot_sample_file, 0, SEEK_END);
318		ssize_t bytes_written = write(task_scatterplot_sample_file,
319		task_scatterplot_samples.str().c_str(), loc0);
320		if (bytes_written < 0) {
321		int errsv = errno;
322		perror("Error writing to scatterplot!");
323		fprintf(stderr, "Error writing scatterplot:\n%s\n",
324		strerror(errsv));
325		}
326		fl.l_type = F_UNLCK; /* tell it to unlock the region */
327		fcntl(task_scatterplot_sample_file, F_SETLK, &fl); /* set the region to unlocked */
328		// reset the stringstream
329		task_scatterplot_samples.str("");
330		}
331		}
332		}
333		}
334		#endif
335		return 1;
336		}
337
338		inline unsigned int profiler_listener::process_dependency(task_dependency* td)
339		{
340		unordered_map<task_identifier, unordered_map<task_identifier, int>* >::const_iterator it = task_dependencies.find(td->parent);
341		unordered_map<task_identifier, int> * depend;
342		// if this is a new dependency for this parent?
343		if (it == task_dependencies.end()) {
344		depend = new unordered_map<task_identifier, int>();
345		(*depend)[td->child] = 1;
346		task_dependencies[td->parent] = depend;
347		// otherwise, see if this parent has seen this child
348		} else {
349		depend = it->second;
350		unordered_map<task_identifier, int>::const_iterator it2 = depend->find(td->child);
351		// first time for this child
352		if (it2 == depend->end()) {
353		(*depend)[td->child] = 1;
354		// not the first time for this child
355		} else {
356		int tmp = it2->second;
357		(*depend)[td->child] = tmp + 1;
358		}
359		}
360		delete(td);
361		return 1;
362		}
363
364		/* Cleaning up memory. Not really necessary, because it only gets
365		* called at shutdown. But a good idea to do regardless. */
366		void profiler_listener::delete_profiles(void) {
367		// iterate over the map and free the objects in the map
368		unordered_map<task_identifier, profile*>::const_iterator it;
369		std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);
370		for(it = task_map.begin(); it != task_map.end(); it++) {
371		delete it->second;
372		}
373		// clear the map.
374		task_map.clear();
375
376		}
377
378		#define PAD_WITH_SPACES "%8s"
379		#define FORMAT_PERCENT "%8.3f"
380		#define FORMAT_SCIENTIFIC "%1.2e"
381
382		template<typename ... Args>
383		string string_format( const std::string& format, Args ... args )
384		{
385		size_t size = snprintf( nullptr, 0, format.c_str(), args ... ) + 1; // Extra space for '\0'
386		unique_ptr<char[]> buf( new char[ size ] );
387		snprintf( buf.get(), size, format.c_str(), args ... );
388		return string( buf.get(), buf.get() + size - 1 ); // We don't want the '\0' inside
389		}
390
391		void profiler_listener::write_one_timer(task_identifier &task_id,
392		profile * p, stringstream &screen_output,
393		stringstream &csv_output, double &total_accumulated,
394		double &total_main) {
395		string action_name = task_id.get_name();
396		string shorter(action_name);
397		// to keep formatting pretty, trim any long timer names
398		if (shorter.size() > 30) {
399		shorter.resize(27);
400		shorter.resize(30, '.');
401		}
402		//screen_output << "\"" << shorter << "\", " ;
403		screen_output << string_format("%30s", shorter.c_str()) << " : ";
404		#if defined(APEX_THROTTLE)
405		// if this profile was throttled, don't output the measurements.
406		// they are limited and bogus, anyway.
407		unordered_set<task_identifier>::const_iterator it4;
408		{
409		read_lock_type l(throttled_event_set_mutex);
410		it4 = throttled_tasks.find(task_id);
411		}
412		if (it4!= throttled_tasks.end()) {
413		screen_output << "DISABLED (high frequency, short duration)" << endl;
414		return;
415		}
416		#endif
417		if(p->get_calls() < 1) {
418		p->get_profile()->calls = 1;
419		}
420		if (p->get_calls() < 999999) {
421		screen_output << string_format(PAD_WITH_SPACES, to_string((int)p->get_calls()).c_str()) << " " ;
422		} else {
423		screen_output << string_format(FORMAT_SCIENTIFIC, p->get_calls()) << " " ;
424		}
425		if (p->get_type() == APEX_TIMER) {
426		csv_output << "\"" << action_name << "\",";
427		csv_output << llround(p->get_calls()) << ",";
428		// convert MHz to Hz
429		csv_output << std::llround(p->get_accumulated()) << ",";
430		// convert MHz to microseconds
431		csv_output << std::llround(p->get_accumulated()profiler::get_cpu_mhz()1000000);
432		screen_output << " --n/a-- " ;
433		screen_output << string_format(FORMAT_SCIENTIFIC, (p->get_mean()*profiler::get_cpu_mhz())) << " " ;
434		screen_output << " --n/a-- " ;
435		screen_output << string_format(FORMAT_SCIENTIFIC, (p->get_accumulated()*profiler::get_cpu_mhz())) << " " ;
436		screen_output << " --n/a-- " ;
437		screen_output << string_format(FORMAT_PERCENT, (((p->get_accumulated()profiler::get_cpu_mhz())/total_main)100));
438		#if APEX_HAVE_PAPI
439		for (int i = 0 ; i < num_papi_counters ; i++) {
440		screen_output << " " << string_format(FORMAT_SCIENTIFIC, (p->get_papi_metrics()[i]));
441		csv_output << "," << std::llround(p->get_papi_metrics()[i]);
442		}
443		#endif
444		screen_output << endl;
445		total_accumulated += p->get_accumulated();
446		csv_output << endl;
447		} else {
448		if (action_name.find('%') == string::npos) {
449		screen_output << string_format(FORMAT_SCIENTIFIC, p->get_minimum()) << " " ;
450		screen_output << string_format(FORMAT_SCIENTIFIC, p->get_mean()) << " " ;
451		screen_output << string_format(FORMAT_SCIENTIFIC, p->get_maximum()) << " " ;
452		screen_output << string_format(FORMAT_SCIENTIFIC, p->get_accumulated()) << " " ;
453		screen_output << string_format(FORMAT_SCIENTIFIC, p->get_stddev()) << " " ;
454		} else {
455		screen_output << string_format(FORMAT_PERCENT, p->get_minimum()) << " " ;
456		screen_output << string_format(FORMAT_PERCENT, p->get_mean()) << " " ;
457		screen_output << string_format(FORMAT_PERCENT, p->get_maximum()) << " " ;
458		screen_output << string_format(FORMAT_PERCENT, p->get_accumulated()) << " " ;
459		screen_output << string_format(FORMAT_PERCENT, p->get_stddev()) << " " ;
460		}
461		screen_output << " --n/a-- " << endl;
462		}
463		}
464
465		/* At program termination, write the measurements to the screen, or to CSV file, or both. */
466		void profiler_listener::finalize_profiles(void) {
467		// our TOTAL available time is the elapsed * the number of threads, or cores
468		int num_worker_threads = thread_instance::get_num_threads();
469		double wall_clock_main = main_timer->elapsed() * profiler::get_cpu_mhz();
470		#ifdef APEX_HAVE_HPX
471		num_worker_threads = num_worker_threads - num_non_worker_threads_registered;
472		#endif
473		double total_main = wall_clock_main *
474		fmin(hardware_concurrency(), num_worker_threads);
475		// create a stringstream to hold all the screen output - we may not
476		// want to write it out
477		stringstream screen_output;
478		// create a stringstream to hold all the CSV output - we may not
479		// want to write it out
480		stringstream csv_output;
481		// iterate over the profiles in the address map
482		screen_output << "Elapsed time: " << wall_clock_main << endl;
483		screen_output << "Cores detected: " << hardware_concurrency() << endl;
484		screen_output << "Worker Threads observed: " << num_worker_threads << endl;
485		screen_output << "Available CPU time: " << total_main << endl;
486		map<apex_function_address, profile*>::const_iterator it;
487		screen_output << "Action : #calls \| minimum \| mean \| maximum \| total \| stddev \| % total " << apex_options::papi_metrics() << endl;
488		screen_output << "------------------------------------------------------------------------------------------------------------" << endl;
489		csv_output << "\"task\",\"num calls\",\"total cycles\",\"total microseconds\"";
490		#if APEX_HAVE_PAPI
491		for (int i = 0 ; i < num_papi_counters ; i++) {
492		csv_output << ",\"" << metric_names[i] << "\"";
493		}
494		#endif
495		csv_output << endl;
496		double total_accumulated = 0.0;
497		unordered_map<task_identifier, profile*>::const_iterator it2;
498		std::vector<task_identifier> id_vector;
499		// iterate over the counters, and sort their names
500		std::unique_lock<std::mutex> task_map_lock(_task_map_mutex);