/* This demonstrates how data cache misses can affect the performance of an 
application. We show how the time/counts for a simple matrix multiplication
algorithm dramatically reduce when we employ a strip mining optimization. */

#define SIZE 512
#define CACHE 64

double A[SIZE][SIZE], B[SIZE][SIZE], C[SIZE][SIZE];

double multiply(void)
{
  int i, j, k, n, m;
  int vl, sz, strip;


/*begin*/for (n = 0; n < SIZE; n++)
           for (m = 0; m < SIZE; m++)
           { /* this belongs to the second loop */
             A[n][m] = B[n][m] = n + m ;
             C[n][m] = 0;
/*ending */}/*for the second loop, not the first!*/

  for (i = 0; i < SIZE; i ++)
  { 
    for (j = 0; j < SIZE; j++)
    {
      for (k = 0; k < SIZE; k++)
  	C[i][j] += A[i][k] * B[k][j];
    }
  }

  /* Now we employ the strip mining optimization */

  for(n = 0; n < SIZE; n++)
    for(m = 0; m < SIZE; m++)
      C[n][m] = 0; 
  
  for(i=0; i < SIZE; i++)
    for(k=0; k < SIZE; k++)
      for(sz = 0; sz < SIZE; sz+=CACHE)
      {
        /*vl = min(SIZE-sz, CACHE); */
  	vl = (SIZE - sz < CACHE ? SIZE - sz : CACHE); 
        for(strip = sz; strip < sz+vl; strip++)
          C[i][strip] += A[i][k]*B[k][strip];
      }

 
  return C[SIZE-10][SIZE-10];
}
       
       
int main(int argc, char **argv)
{
  multiply();
  return 0;
}