Notes on "Computer Systems: A Programmer's Perspective" (CS:APP): Optimizing Program Performance

Write efficient programs
- Choosing the Right Algorithms and Data Structures
- Write source code that the compiler can effectively optimize and translate into efficient executable code
- For very computationally intensive tasks, decompose the work into multiple subtasks that execute in parallel on multiple processors or cores.

Ability and limitations of optimizing compilers

Factors hampering optimization

The compiler must assume that two pointers may point to the same address (memory aliasing)
- When px and py point to different addresses, an arithmetic swap (add, then subtract twice) exchanges *px and *py correctly
- When px and py point to the same address, the same sequence leaves the shared value at zero
- A correct implementation should first check whether px and py are equal

// Exchange the integers stored at *px and *py.
//
// The add/subtract form of this swap (x = x + y; y = x - y; x = x - y)
// zeroes the value when px and py refer to the same object, and it can
// overflow signed int for large operands. Both problems are avoided here:
// aliasing pointers are detected up front and the exchange goes through a
// temporary, so no arithmetic is performed on the values.
void swap(int* px, int* py)
{
  int tmp;

  if (px == py) return;  // same object: nothing to exchange

  tmp = *px;
  *px = *py;
  *py = tmp;
}

Function calls
- func1() and func2() are not equivalent when f() has side effects, such as modifying global variables or printing output, so the compiler cannot replace four calls with one.

int f();

// Invokes f() four separate times and returns the sum of the four results.
// Any side effect of f() therefore happens four times.
int func1()
{
  int total = f();  // call 1
  total += f();     // call 2
  total += f();     // call 3
  total += f();     // call 4
  return total;
}

// Invokes f() exactly once and returns four times its result.
// Any side effect of f() happens only once.
int func2()
{
  int once = f();  // the single call
  return 4 * once;
}

Optimizer performance example

Loop unrolling

// Compute the prefix sums of a[0..n-1] into p[0..n-1]:
// p[i] = a[0] + a[1] + ... + a[i].
// One element is produced per loop iteration.
// Nothing is read or written when n <= 0.
void prefix_sum1(double a[], double p[], long n)
{
  long i;

  if (n <= 0) return;  // empty input: a[0] and p[0] must not be touched

  p[0] = a[0];
  for (i = 1; i < n; i++)
    p[i] = p[i-1] + a[i];
}

// Compute the prefix sums of a[0..n-1] into p[0..n-1], unrolled by two:
// each iteration of the main loop produces p[i] and p[i+1].
// Nothing is read or written when n <= 0.
void prefix_sum2(double a[], double p[], long n)
{
  long i;

  if (n <= 0) return;  // empty input: a[0] and p[0] must not be touched

  p[0] = a[0];

  // The bound is n-1, not n: the body touches index i+1, which must stay
  // below n.
  for (i = 1; i < n - 1; i += 2) {
    double mid = p[i-1] + a[i];
    p[i] = mid;
    p[i+1] = mid + a[i+1];
  }

  // At most one element is left over after the unrolled loop.
  if (i < n) p[i] = p[i-1] + a[i];
}

Example

Implementation of Vector Abstract Data Type vec_rec

// Element type stored in a vector.
typedef int data_t;

// Vector record: an element count plus a pointer to the element storage.
typedef struct {
  long len;      // number of elements
  data_t* data;  // element array (set to NULL by new_vec when len <= 0)
} vec_rec;

// Handle through which vectors are passed around.
typedef vec_rec* vec_ptr;

vec_ptr new_vec(long len)
{
  vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec));

  if (!result) return NULL;

  result->len = len;

  if (len > 0) {
    data_t* data = (data_t *)calloc(len, sizeof(data_t));
    if (!data) {
      free((void*) result);
      return NULL;
    }
    return->data = data;
  }
  else
    result->data = NULL;

  return result;
}

// Bounds-checked read of v[index].
// On success the element is copied to *dest and 1 is returned; if index is
// outside [0, v->len) nothing is written and 0 is returned.
int get_vec_element(vec_ptr v, long index, data_t* dest)
{
  if (index >= 0 && index < v->len) {
    *dest = v->data[index];
    return 1;
  }

  return 0;
}

// Return the element count recorded in the vector header.
long vec_length(vec_ptr v)
{
  long count = v->len;
  return count;
}

Test case: the combining operation

// Identity element (IDENT) and binary operator (OP) used by the combine
// functions. Active configuration: summation (identity 0, operator +).
#define IDENT 0
#define OP    +
// Alternative configuration, currently disabled: product (identity 1,
// operator *).
// #define IDENT 1
// #define OP    *

// Baseline version: fold every element of v into *dest using OP, starting
// from IDENT.
// Deliberately unoptimized: vec_length() is re-evaluated on every loop test
// even though it returns the same value each time, and every element is
// fetched through the bounds-checked get_vec_element().
void combine1(vec_ptr v, data_t* dest)
{
  long idx = 0;

  *dest = IDENT;

  while (idx < vec_length(v)) {  // vec_length() called once per iteration
    data_t elem;
    get_vec_element(v, idx, &elem);
    *dest = *dest OP elem;
    idx++;
  }
}

Code motion

Identify computations that are performed multiple times but whose result does not change, and move them out of the loop so they are computed only once

// Improvement over combine1(): the vector length is read once before the
// loop instead of on every loop test. Elements are still fetched through
// get_vec_element() and accumulated directly in *dest.
void combine2(vec_ptr v, data_t* dest)
{
  long count = vec_length(v);  // hoisted out of the loop
  long idx = 0;

  *dest = IDENT;

  while (idx < count) {
    data_t elem;
    get_vec_element(v, idx, &elem);
    *dest = *dest OP elem;
    idx++;
  }
}

Reduce function calls

// Return the vector's element array so callers can index it directly.
data_t* get_vec_start(vec_ptr v)
{
  data_t* first = v->data;
  return first;
}

// Improvement over combine2(): the per-element get_vec_element() call is
// replaced by direct indexing into the array from get_vec_start().
// The running result is still read from and written to *dest on every
// iteration.
void combine3(vec_ptr v, data_t* dest)
{
  long count = vec_length(v);
  data_t* elems = get_vec_start(v);
  long idx = 0;

  *dest = IDENT;

  while (idx < count) {
    *dest = *dest OP elems[idx];
    idx++;
  }
}

Accumulate the result in a local temporary variable instead of reading and writing through the pointer parameter on every iteration

// Improvement over combine3(): the running result is kept in the local
// variable acc and stored to *dest only once, after the loop, instead of
// reading and writing *dest on every iteration.
void combine4(vec_ptr v, data_t* dest)
{
  long i;
  long length = vec_length(v);
  data_t* data = get_vec_start(v);
  data_t acc = IDENT;  // the accumulator must start from the identity element

  for (i = 0; i < length; i++) { // Improvement of combine3()
    acc = acc OP data[i];
  }

  *dest = acc;
}

Loop unrolling

Increase the number of elements processed in each iteration, which reduces the number of loop iterations
- Reduces loop-index computation and conditional branching
- Makes it possible to reduce the number of operations on the critical path of the computation

// Loop unrolled by three: each iteration of the main loop folds three
// consecutive elements into the accumulator, associating left to right.
// A second loop handles the elements left over at the end.
void combine5(vec_ptr v, data_t* dest)
{
  long count = vec_length(v);
  long limit = count - 2;  // last start index for which idx+2 is still valid
  data_t* elems = get_vec_start(v);
  data_t acc = IDENT;
  long idx = 0;

  while (idx < limit) {
    acc = ((acc OP elems[idx]) OP elems[idx+1]) OP elems[idx+2];
    idx += 3;
  }

  // Remaining elements, one at a time.
  while (idx < count) {
    acc = acc OP elems[idx];
    idx++;
  }

  *dest = acc;
}

Re-associative transformation

// Loop unrolled by three with reassociation: the three elements of each
// group are combined with one another first, and only that partial result
// is combined with the accumulator.
// A second loop handles the elements left over at the end.
void combine7(vec_ptr v, data_t* dest)
{
  long count = vec_length(v);
  long limit = count - 2;  // last start index for which idx+2 is still valid
  data_t* elems = get_vec_start(v);
  data_t acc = IDENT;
  long idx = 0;

  while (idx < limit) {
    acc = acc OP (elems[idx] OP (elems[idx+1] OP elems[idx+2]));
    idx += 3;
  }

  // Remaining elements, one at a time.
  while (idx < count) {
    acc = acc OP elems[idx];
    idx++;
  }

  *dest = acc;
}

Invoking GCC with the command-line option -funroll-loops lets the compiler perform loop unrolling itself

Improving Parallelism

// Loop unrolled by two with two independent accumulators: elements at even
// offsets are folded into acc0 and elements at odd offsets into acc1, so the
// two chains of OP operations do not depend on each other.
// A second loop handles a possible leftover element, and the two partial
// results are merged with OP at the end.
void combine6(vec_ptr v, data_t* dest)
{
  long i;
  long length = vec_length(v);
  long limit = length - 1;  // last start index for which i+1 is still valid
  data_t* data = get_vec_start(v);
  data_t acc0 = IDENT;
  data_t acc1 = IDENT;

  for (i = 0; i < limit; i += 2) {
    acc0 = acc0 OP data[i];
    acc1 = acc1 OP data[i+1];
  }

  for (; i < length; i++)
    acc0 = acc0 OP data[i];

  // Merge with OP, not a hard-coded +, so the product configuration
  // (IDENT 1, OP *) also yields the right result.
  *dest = acc0 OP acc1;
}

Other ways

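For random arrays, minmax2() is more efficient than minmax1(): the branch in minmax1() is unpredictable on random data and is frequently mispredicted, whereas the conditional expressions in minmax2() can be compiled to conditional moves.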

// For every index below n, make a[i] the smaller and b[i] the larger of the
// pair (a[i], b[i]). Written with an explicit branch: the pair is swapped
// only when it is out of order.
void minmax1(int a[], int b[], int n)
{
  int k = 0;

  while (k < n) {
    if (a[k] > b[k]) {
      int held = b[k];
      b[k] = a[k];
      a[k] = held;
    }
    k++;
  }
}

// For every index below n, make a[i] the smaller and b[i] the larger of the
// pair (a[i], b[i]). Written without an if statement: both results are
// selected with conditional expressions and stored unconditionally.
void minmax2(int a[], int b[], int n)
{
  int k;

  for (k = 0; k < n; k++) {
    int x = a[k];
    int y = b[k];
    int lo = x < y ? x : y;
    int hi = x < y ? y : x;
    a[k] = lo;
    b[k] = hi;
  }
}

Write cache-friendly code

Programs usually spend most of their time on a small number of core functions
Core functions usually spend most of their time on core loops

Locality

Programs always tend to access recently accessed data items or nearby data items

// Sum the N elements of v, visiting them in index order (stride 1).
// The running total lives in a local variable that is reused on every
// iteration.
int sumvec(int v[N])
{
  int total = 0;
  int idx = 0;

  while (idx < N) {
    total += v[idx];
    idx++;
  }

  return total;
}

// Row-major traversal: the column index j varies fastest, so consecutive
// accesses touch adjacent elements (stride-1 reference pattern, good spatial
// locality). Returns the sum of all M*N elements.
// Named sumarrayrows so that it does not collide with the column-major
// sumarraycols().
int sumarrayrows(int a[M][N])
{
  int i, j, sum = 0;
  for (i = 0; i < M; i++)
    for (j = 0; j < N; j++)
      sum += a[i][j];
  return sum;
}

// Column-major traversal: the row index varies fastest, so consecutive
// accesses are N elements apart (stride-N reference pattern, poor spatial
// locality). Returns the sum of all M*N elements.
int sumarraycols(int a[M][N])
{
  int total = 0;
  int col, row;

  for (col = 0; col < N; col++) {
    for (row = 0; row < M; row++) {
      total += a[row][col];
    }
  }

  return total;
}

Temporal locality

Repeated references to local variables are good

Spatial locality

Rearrange loops to improve spatial locality
Example 1: Multiplication of two n×n square matrices, $C_{n\times n} = A_{n\times n} B_{n\times n}$

// ijk loop order: for each (i, j), accumulate the dot product of row i of A
// and column j of B in sum, then add it to C[i][j].
// The innermost loop varies k, so A[i][k] is read along a row while
// B[k][j] is read down a column.
// NOTE(review): i, j, k, n, sum, A, B and C are declared outside this
// fragment; presumably A, B and C are n x n arrays -- confirm.
for (i = 0; i < n; i++) {
  for (j = 0; j < n; j++) {
    sum = 0.0;
    for (k = 0; k < n; k++)
      sum += A[i][k] * B[k][j];
    C[i][j] += sum;
  }
}

// ikj loop order, stride-1 access pattern: the innermost loop varies j, so
// both C[i][j] and B[k][j] are traversed along a row.
// NOTE(review): i, j, k, n, r, A, B and C are declared outside this
// fragment; presumably A, B and C are n x n arrays -- confirm.
for (i = 0; i < n; i++) {
  for (k = 0; k < n; k++) {
    r = A[i][k];  // read once here and reused for the whole inner loop
    for (j = 0; j < n; j++)
      C[i][j] += r * B[k][j];
  }
}

// kij loop order, stride-1 access pattern: same inner loop as the ikj
// version (j varies fastest, so C[i][j] and B[k][j] are traversed along a
// row); only the two outer loops are exchanged.
// NOTE(review): i, j, k, n, r, A, B and C are declared outside this
// fragment; presumably A, B and C are n x n arrays -- confirm.
for (k = 0; k < n; k++) {
  for (i = 0; i < n; i++) {
    r = A[i][k];  // read once here and reused for the whole inner loop
    for (j = 0; j < n; j++)
      C[i][j] += r * B[k][j];
  }
}

Example 2: Traversing three-dimensional matrices
- When the rightmost index changes fastest, the reference pattern has stride 1

// Sum every element of an N x N x N array. The rightmost index varies
// fastest, so consecutive accesses touch adjacent elements (stride 1).
int sumarray3d(int a[N][N][N])
{
  int total = 0;
  int x, y, z;

  for (x = 0; x < N; x++) {
    for (y = 0; y < N; y++) {
      for (z = 0; z < N; z++) {
        total += a[x][y][z];
      }
    }
  }

  return total;
}

Example 3: Traversing an array of structures that contain arrays
- The access order should follow the layout of the members within the structure

// N: number of elements in the global array p declared below.
#define N 1000
// M: number of components in each of the vel and acc arrays.
#define M 3
// One record holding two M-element arrays; vel is declared before acc.
typedef struct {
  int vel[M];
  int acc[M];
} point;

// Global array of N point records.
point p[N];

// Zero every field of the first N points, stride-1 access pattern: for each
// point, all of vel is cleared and then all of acc, which is the order in
// which the members are declared in the structure.
void clear1(point* p)
{
  int idx, k;

  for (idx = 0; idx < N; idx++) {
    point* cur = &p[idx];

    for (k = 0; k < M; k++) {
      cur->vel[k] = 0;
    }
    for (k = 0; k < M; k++) {
      cur->acc[k] = 0;
    }
  }
}

// Zero every field of the first N points, but alternate between the two
// member arrays: vel[k] and acc[k] are cleared together for each k, so
// successive writes jump back and forth between vel and acc instead of
// walking the structure in declaration order.
void clear2(point* p)
{
  int idx, k;

  for (idx = 0; idx < N; idx++) {
    point* cur = &p[idx];

    for (k = 0; k < M; k++) {
      cur->vel[k] = 0;
      cur->acc[k] = 0;
    }
  }
}

Posted by dhodge on Thu, 31 Jan 2019 20:00:15 -0800

Programmer Group

Notes on "Computer Systems: A Programmer's Perspective" (CS:APP): Optimizing Program Performance

Ability and limitations of optimizing compilers

Factors hampering optimization

Optimizer performance example

Loop unrolling

Example

Code motion

Reduce function calls

Accumulate the result in a local temporary variable instead of reading and writing through the pointer parameter on every iteration

Loop unrolling

Re-associative transformation

Improving Parallelism

Other ways

Write cache-friendly code

Locality

Temporal locality

Spatial locality

Hot Keywords