Notes on "Computer Systems: A Programmer's Perspective" (CS:APP): Optimizing Program Performance

Keywords: Programming

  • Write efficient programs
    • Choosing the Right Algorithms and Data Structures
    • Write source code that the compiler can effectively optimize into efficient executable code
    • For computationally intensive tasks, divide the work into subtasks that execute in parallel on multiple processors or cores

Ability and limitations of optimizing compilers

Factors hampering optimization

  • The compiler must assume that two pointers may point to the same address (memory aliasing)
    • When px and py point to different addresses, the values *px and *py are exchanged correctly
    • When px and py point to the same address, the value stored there ends up as zero
    • A correct implementation should first check whether px and py are equal
/*
 * Exchange *px and *py arithmetically, without a temporary variable.
 *
 * Aliasing hazard (the point of this example): when px == py the first
 * statement doubles the shared value, the second subtracts it from itself,
 * and the location is left holding 0 rather than its original value.
 */
void swap(int* px, int* py)
{
  *px += *py;        /* *px now holds the sum of both values */
  *py = *px - *py;   /* sum minus old *py  ->  old *px       */
  *px -= *py;        /* sum minus old *px  ->  old *py       */
}
  • Function calls
    • func1() and func2() are not equivalent when f() has side effects (such as modifying a global variable or printing output), so the compiler cannot replace the four calls with one
int f();

/* Calls f() four times and returns the sum of the four results. */
int func1()
{
  int total = f();
  total += f();
  total += f();
  total += f();
  return total;
}

/* Calls f() exactly once and returns four times its result. */
int func2()
{
  int once = f();
  return 4 * once;
}

Optimizer performance example

Loop Unrolling

/*
 * Compute the prefix sums of a[0..n-1] into p[0..n-1]:
 *   p[i] = a[0] + a[1] + ... + a[i]
 *
 * a: input array with at least n elements
 * p: output array with room for at least n elements
 * n: number of elements; nothing is read or written when n <= 0
 */
void prefix_sum1(double a[], double p[], long n)
{
  long i;

  /* Empty input: a[0] and p[0] do not exist, so touch nothing. */
  if (n <= 0) return;

  p[0] = a[0];
  for (i = 1; i < n; i++)
    p[i] = p[i-1] + a[i];
}

/*
 * Prefix sums of a[0..n-1] into p[0..n-1], with the loop unrolled by two:
 * each pass of the main loop produces p[i] and p[i+1].
 *
 * a: input array with at least n elements
 * p: output array with room for at least n elements
 * n: number of elements; nothing is read or written when n <= 0
 */
void prefix_sum2(double a[], double p[], long n)
{
  long i;

  /* Empty input: a[0] and p[0] do not exist, so touch nothing. */
  if (n <= 0) return;

  p[0] = a[0];

  /* Stop at n-1 so that the pair (i, i+1) always lies inside the arrays. */
  for (i = 1; i < n - 1; i += 2) {
    p[i] = p[i-1] + a[i];
    p[i+1] = p[i] + a[i+1];
  }

  /* At most one element is left over after the unrolled loop. */
  if (i < n) p[i] = p[i-1] + a[i];
}

Example

  • Implementation of Vector Abstract Data Type vec_rec
// Element type stored in the vector
typedef int data_t;

// Vector record: an element count plus a pointer to the element storage.
// vec_ptr is a pointer to such a record.
typedef struct {
  long len;      // number of elements
  data_t* data;  // element storage; new_vec leaves it NULL when len <= 0
} vec_rec, *vec_ptr;

vec_ptr new_vec(long len)
{
  vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec));

  if (!result) return NULL;

  result->len = len;

  if (len > 0) {
    data_t* data = (data_t *)calloc(len, sizeof(data_t));
    if (!data) {
      free((void*) result);
      return NULL;
    }
    return->data = data;
  }
  else
    result->data = NULL;

  return result;
}

/*
 * Fetch element `index` of v into *dest.
 * Returns 1 on success; returns 0 and leaves *dest untouched when index is
 * outside [0, v->len).
 */
int get_vec_element(vec_ptr v, long index, data_t* dest)
{
  int in_range = (index >= 0) && (index < v->len);

  if (in_range) {
    *dest = v->data[index];
    return 1;
  }
  return 0;
}

/* Report how many elements the vector v holds. */
long vec_length(vec_ptr v)
{
  long count = v->len;
  return count;
}
  • Test case: combining operation
// Combining operation used by the combineN functions: OP is the binary
// operator and IDENT is the starting value of the accumulation.
// Active configuration: sum (IDENT 0, OP +).
#define IDENT 0
#define OP    +
// Alternative configuration: product (IDENT 1, OP *).
// #define IDENT 1
// #define OP    *

/*
 * Baseline version: combine every element of v with OP, starting from
 * IDENT, and leave the result in *dest.
 *
 * Deliberately unoptimized: vec_length() is re-evaluated in the loop test on
 * every iteration even though it returns the same value each time, and each
 * element is fetched through the bounds-checked get_vec_element().
 */
void combine1(vec_ptr v, data_t* dest)
{
  long idx = 0;

  *dest = IDENT;
  while (idx < vec_length(v)) {
    data_t elem;
    get_vec_element(v, idx, &elem);
    *dest = *dest OP elem;
    idx++;
  }
}

Code Motion

  • Identify computations that are performed repeatedly but whose result never changes, and move them out of the loop
/*
 * Same result as combine1(), but the vector length is read once before the
 * loop instead of being recomputed in every loop test.
 */
void combine2(vec_ptr v, data_t* dest)
{
  long count = vec_length(v);
  long idx;

  *dest = IDENT;
  for (idx = 0; idx < count; idx++) {
    data_t elem;
    get_vec_element(v, idx, &elem);
    *dest = *dest OP elem;
  }
}

Reduce function calls

/* Expose the element array of v so that callers can index it directly. */
data_t* get_vec_start(vec_ptr v)
{
  data_t* first = v->data;
  return first;
}

/*
 * Same result as combine2(), but elements are read straight from the
 * underlying array rather than through get_vec_element(), removing one
 * function call (and its bounds check) per iteration.
 */
void combine3(vec_ptr v, data_t* dest)
{
  long count = vec_length(v);
  data_t* elems = get_vec_start(v);
  long idx = 0;

  *dest = IDENT;
  while (idx < count) {
    *dest = *dest OP elems[idx];
    idx++;
  }
}

Accumulate the result in a local temporary variable instead of reading and writing the pointer parameter on every iteration

/*
 * Same result as combine3(), but the running value is kept in the local
 * variable acc and written to *dest once, after the loop, instead of
 * reading and writing *dest on every iteration.
 */
void combine4(vec_ptr v, data_t* dest)
{
  long i;
  long length = vec_length(v);
  data_t* data = get_vec_start(v);
  /* The accumulator must start from the identity element; reading it
     uninitialized would make the result indeterminate. */
  data_t acc = IDENT;

  for (i = 0; i < length; i++) { // Improvement of combine3()
    acc = acc OP data[i];
  }

  *dest = acc;
}

Loop unrolling

  • Increase the number of elements computed in each iteration, reducing the number of loop iterations
    • Reduces loop index computation and conditional branching
    • Reduces the number of operations on the critical path of the computation
/*
 * 3x loop unrolling: the main loop combines three consecutive elements per
 * iteration, and a second loop finishes any elements left over.
 */
void combine5(vec_ptr v, data_t* dest)
{
  long count = vec_length(v);
  long unrolled_end = count - 2;   /* main loop needs idx, idx+1, idx+2 in range */
  data_t* elems = get_vec_start(v);
  data_t total = IDENT;
  long idx = 0;

  /* Main loop: three elements at a time, combined left to right. */
  while (idx < unrolled_end) {
    total = total OP elems[idx];
    total = total OP elems[idx + 1];
    total = total OP elems[idx + 2];
    idx += 3;
  }

  /* Tail: whatever the unrolled loop did not cover. */
  while (idx < count) {
    total = total OP elems[idx];
    idx++;
  }

  *dest = total;
}

Reassociation transformation

/*
 * 3x unrolling with reassociation: the three array elements of each group
 * are combined with one another first, and only that partial result is
 * then combined with the accumulator.
 */
void combine7(vec_ptr v, data_t* dest)
{
  long count = vec_length(v);
  long unrolled_end = count - 2;   /* main loop needs idx, idx+1, idx+2 in range */
  data_t* elems = get_vec_start(v);
  data_t total = IDENT;
  long idx;

  for (idx = 0; idx < unrolled_end; idx += 3) {
    data_t last_two = elems[idx + 1] OP elems[idx + 2];
    data_t group = elems[idx] OP last_two;
    total = total OP group;
  }

  /* Tail: whatever the unrolled loop did not cover. */
  for (; idx < count; idx++) {
    total = total OP elems[idx];
  }

  *dest = total;
}
  • Invoking GCC with the command-line option -funroll-loops lets the compiler perform loop unrolling itself

Improving Parallelism

/*
 * 2x unrolling with two accumulators: even-indexed elements are folded into
 * acc0 and odd-indexed ones into acc1, so neither chain of operations
 * depends on the other; the two partial results are merged with OP at the
 * end.
 */
void combine6(vec_ptr v, data_t* dest)
{
  long i;
  long length = vec_length(v);
  long limit = length - 1;
  data_t* data = get_vec_start(v);
  data_t acc0 = IDENT;
  data_t acc1 = IDENT;

  for (i = 0; i < limit; i += 2) {
    acc0 = acc0 OP data[i];
    acc1 = acc1 OP data[i+1];
  }

  /* Leftover element when length is odd. */
  for (; i < length; i++)
    acc0 = acc0 OP data[i];

  /* Merge with OP rather than a hard-coded +, so that the product
     configuration (IDENT 1, OP *) also gives the right answer. */
  *dest = acc0 OP acc1;
}

Other ways

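  • For random arrays, minmax2() is more efficient than minmax1(): the branch in minmax1() is hard to predict on random data, whereas the conditional expressions in minmax2() can be compiled into conditional moves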
/*
 * For every index i < n, reorder the pair so that a[i] <= b[i].
 * Uses a conditional swap: the exchange is performed only when the pair is
 * out of order.
 */
void minmax1(int a[], int b[], int n)
{
  int k = 0;

  while (k < n) {
    int left = a[k];
    int right = b[k];

    if (left > right) {
      a[k] = right;
      b[k] = left;
    }
    k++;
  }
}

/*
 * For every index i < n, store the smaller of the pair in a[i] and the
 * larger in b[i]. Both values are always computed with conditional
 * expressions and always written back; there is no conditional swap.
 */
void minmax2(int a[], int b[], int n)
{
  int k;

  for (k = 0; k < n; ++k) {
    const int x = a[k];
    const int y = b[k];
    const int x_is_less = x < y;

    a[k] = x_is_less ? x : y;
    b[k] = x_is_less ? y : x;
  }
}

Write cache-friendly code

  • Programs usually spend most of their time on a small number of core functions
  • Core functions usually spend most of their time on core loops

Locality

  • Programs tend to access data items that were accessed recently or that lie near recently accessed items
/* Sum the N elements of v, visiting them one after another in index order. */
int sumvec(int v[N])
{
  int total = 0;
  int k = 0;

  while (k < N) {
    total += v[k];
    k++;
  }
  return total;
}

// Row-major traversal: the inner loop walks along a row, so the array is
// referenced with stride 1 and has good spatial locality.
// Named sumarrayrows to keep it distinct from the column-wise
// sumarraycols(); two functions cannot share one name in C.
int sumarrayrows(int a[M][N])
{
  int i, j, sum = 0;
  for (i = 0; i < M; i++)
    for (j = 0; j < N; j++)
      sum += a[i][j];
  return sum;
}

// Column-major traversal: the inner loop walks down a column, so
// consecutive references are N elements apart (stride N) and spatial
// locality is poor.
int sumarraycols(int a[M][N])
{
  int total = 0;
  int col, row;

  for (col = 0; col < N; col++) {
    for (row = 0; row < M; row++) {
      total += a[row][col];
    }
  }
  return total;
}

Temporal locality

  • Repeated references to local variables are good

Spatial locality

  • Rearrange loops to improve spatial locality
  • Example 1: Multiplying two $n \times n$ square matrices: $C_{n\times n} = A_{n\times n} B_{n\times n}$
// ijk order: the innermost loop varies k, so A[i][k] is read along a row
// while B[k][j] is read down a column (assuming ordinary row-major 2-D
// arrays; the declarations are not shown here).
for (i = 0; i < n; i++) {
  for (j = 0; j < n; j++) {
    sum = 0.0;  // accumulates row i of A times column j of B
    for (k = 0; k < n; k++)
      sum += A[i][k] * B[k][j];
    C[i][j] += sum;
  }
}

// ikj order, stride-1 access pattern: the innermost loop varies j, so both
// C[i][j] and B[k][j] are walked along a row.
for (i = 0; i < n; i++) {
  for (k = 0; k < n; k++) {
    r = A[i][k];  // read once outside the inner loop and reused for every j
    for (j = 0; j < n; j++)
      C[i][j] += r * B[k][j];
  }
}

// kij order, stride-1 access pattern: same inner loop as ikj (j varies
// fastest, so C[i][j] and B[k][j] are walked along a row); only the two
// outer loops are exchanged.
for (k = 0; k < n; k++) {
  for (i = 0; i < n; i++) {
    r = A[i][k];  // read once outside the inner loop and reused for every j
    for (j = 0; j < n; j++)
      C[i][j] += r * B[k][j];
  }
}
  • Example 2: Traversing three-dimensional matrices
    • When the rightmost index changes fastest, the reference pattern has stride 1
/*
 * Sum every element of the N x N x N array a. The rightmost index is
 * driven by the innermost loop, so it is the one that changes fastest.
 */
int sumarray3d(int a[N][N][N])
{
  int total = 0;
  int x, y, z;

  for (x = 0; x < N; x++) {
    for (y = 0; y < N; y++) {
      for (z = 0; z < N; z++) {
        total += a[x][y][z];
      }
    }
  }
  return total;
}
  • Example 3: Traversing an array of structures that contain arrays
    • Access the members in the order in which they are laid out in the structure
// N: number of elements in the global array p
#define N 1000
// M: number of elements in each of the vel and acc member arrays
#define M 3
// A point holds two M-element int arrays, vel followed by acc.
typedef struct {
  int vel[M];
  int acc[M];
} point;

// Global array of N points
point p[N];

// Zero the first N points of p. For each point, all of vel is cleared and
// then all of acc, which follows the member order of the structure
// (stride-1 access pattern).
void clear1(point* p)
{
  int idx, comp;

  for (idx = 0; idx < N; idx++) {
    point* cur = &p[idx];

    for (comp = 0; comp < M; comp++)
      cur->vel[comp] = 0;
    for (comp = 0; comp < M; comp++)
      cur->acc[comp] = 0;
  }
}

// Zero the first N points of p, clearing vel[j] and acc[j] together for
// each j, so the writes alternate between the two member arrays of a point.
void clear2(point* p)
{
  int idx;

  for (idx = 0; idx < N; idx++) {
    point* cur = p + idx;
    int comp = 0;

    while (comp < M) {
      cur->vel[comp] = 0;
      cur->acc[comp] = 0;
      comp++;
    }
  }
}

Posted by dhodge on Thu, 31 Jan 2019 20:00:15 -0800