/* This demonstrates how data cache misses can affect the performance of an application. We show how the time/counts for a simple matrix multiplication algorithm dramatically reduce when we employ a strip mining optimization. */ #include #include #include #define SIZE 512 #define CACHE 64 double A[SIZE][SIZE], B[SIZE][SIZE], C[SIZE][SIZE]; double multiply(void) { int i, j, k, n, m; int vl, sz, strip; TAU_PROFILE("multiply", "void (void)", TAU_USER); TAU_PROFILE_TIMER(t1,"regular", "", TAU_USER); TAU_PROFILE_TIMER(strip_timer,"stripmine", "", TAU_USER); for (n = 0; n < SIZE; n++) for (m = 0; m < SIZE; m++) { A[n][m] = B[n][m] = n + m ; C[n][m] = 0; } TAU_PROFILE_START(t1); sleep(2); for (i = 0; i < SIZE; i ++) { for (j = 0; j < SIZE; j++) { for (k = 0; k < SIZE; k++) C[i][j] += A[i][k] * B[k][j]; } } TAU_PROFILE_STOP(t1); /* Now we employ the strip mining optimization */ for(n = 0; n < SIZE; n++) for(m = 0; m < SIZE; m++) C[n][m] = 0; TAU_PROFILE_START(strip_timer); sleep(1); for(i=0; i < SIZE; i++) for(k=0; k < SIZE; k++) for(sz = 0; sz < SIZE; sz+=CACHE) { //vl = min(SIZE-sz, CACHE); vl = (SIZE - sz < CACHE ? SIZE - sz : CACHE); for(strip = sz; strip < sz+vl; strip++) C[i][strip] += A[i][k]*B[k][strip]; } TAU_PROFILE_STOP(strip_timer); return C[SIZE-10][SIZE-10]; // So KCC doesn't optimize this loop away. } int checkResults() { const char **inFuncs; /* The first dimension is functions, and the second dimension is counters */ double **counterExclusiveValues; double **counterInclusiveValues; int *numOfCalls; int *numOfSubRoutines; const char **counterNames; int numOfCouns; int numFunctions; const char **functionList; TAU_GET_FUNC_NAMES(functionList, numFunctions); int numCounters; const char ** counterList; TAU_GET_COUNTER_NAMES(counterList, numCounters); int WALL = -1; int VIRT = -1; int L1_DCM = -1; for(int j=0;j regularL1) { printf ("Uh oh, strip mining optimization increased L1_DCM!!!\n"); error = 1; } // OptixICC7 usually fails if we only check that strip > regular) if (stripVIRT > (regularVIRT * 1.5)) { printf ("Hmm, stripmining took longer, please investigate!\n"); error = 1; } if (regularVIRT > regularWALL) { printf ("Hmm, regular virtual time is more than wall clock!\n"); error = 1; } if (stripVIRT > stripWALL) { printf ("Hmm, strip virtual time is more than wall clock!\n"); error = 1; } if (error == 0) { printf ("\nMultiple Counters and PAPI seem to be working!\n"); } return error; } int main(int argc, char **argv) { TAU_PROFILE("main()", "int (int, char **)", TAU_DEFAULT); TAU_PROFILE_SET_NODE(0); multiply(); // return checkResults(); checkResults(); return 0; }