- Write efficient programs
- Choosing the Right Algorithms and Data Structures
- Write source code that the compiler can effectively optimize and translate into efficient executable code
- For computationally very heavy tasks, decompose the work into multiple subtasks that execute in parallel on multiple processors or cores.
Capabilities and limitations of optimizing compilers
Factors hampering optimization
- The compiler must assume that two pointers may refer to the same address (memory aliasing)
- When px and py point to different addresses, the values *px and *py are exchanged correctly
- When px and py point to the same address, the add/subtract sequence leaves 0 at that address instead of the original value
- A correct implementation must first check whether px and py are equal
/*
 * Exchange the two ints referenced by px and py without a temporary.
 *
 * px, py: pointers to the values to exchange; they are allowed to alias.
 *
 * If both pointers name the same object, the add/subtract sequence below
 * would compute x - x and leave 0 behind, so that case is handled up front
 * (exchanging an object with itself is a no-op).
 */
void swap(int* px, int* py)
{
    if (px == py)
        return;

    *px = *px + *py;    /* *px now holds the sum of both values */
    *py = *px - *py;    /* sum - old *py == old *px             */
    *px = *px - *py;    /* sum - old *px == old *py             */
}
- Function calls
- func1() and func2() are not equivalent when f() has side effects (such as modifying a global variable or printing output), so the compiler cannot replace the four calls with a single call
int f();

/*
 * Sum of four separate invocations of f().
 * f() is called four times, so any side effect it has happens four times.
 */
int func1()
{
    int total = f();

    total += f();
    total += f();
    total += f();
    return total;
}
/*
 * Four times the result of a single invocation of f().
 * f() is called exactly once here.
 */
int func2()
{
    int once = f();

    return once * 4;
}
Optimizer performance example
Loop unrolling technique
/*
 * Prefix sum: p[i] = a[0] + a[1] + ... + a[i] for 0 <= i < n.
 *
 * a: input array with at least n elements.
 * p: output array with at least n elements.
 * n: number of elements; nothing is read or written when n <= 0.
 *
 * One element is produced per loop iteration.
 */
void prefix_sum1(double a[], double p[], long n)
{
    long i;

    if (n <= 0)
        return;     /* empty input: a[0] and p[0] do not exist */

    p[0] = a[0];
    for (i = 1; i < n; i++)
        p[i] = p[i-1] + a[i];
}
/*
 * Prefix sum, unrolled by two: p[i] = a[0] + ... + a[i] for 0 <= i < n.
 *
 * a: input array with at least n elements.
 * p: output array with at least n elements.
 * n: number of elements; nothing is read or written when n <= 0.
 */
void prefix_sum2(double a[], double p[], long n)
{
    long i;

    if (n <= 0)
        return;     /* empty input: a[0] and p[0] do not exist */

    p[0] = a[0];
    /*
     * Each iteration writes p[i] and p[i+1], so the loop may only run while
     * i + 1 is still a valid index, i.e. while i < n - 1.
     */
    for (i = 1; i < n - 1; i += 2) {
        p[i] = p[i-1] + a[i];
        p[i+1] = p[i] + a[i+1];
    }
    /* For even n one element is left over after the unrolled loop. */
    if (i < n)
        p[i] = p[i-1] + a[i];
}
Example
- Implementation of Vector Abstract Data Type vec_rec
/* Element type stored in a vector. */
typedef int data_t;

/* Vector record: an element count plus a pointer to the element storage. */
typedef struct {
    long len;        /* number of elements */
    data_t *data;    /* element array */
} vec_rec;

/* Pointer type through which vectors are passed to the vector functions. */
typedef vec_rec *vec_ptr;
vec_ptr new_vec(long len)
{
vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec));
if (!result) return NULL;
result->len = len;
if (len > 0) {
data_t* data = (data_t *)calloc(len, sizeof(data_t));
if (!data) {
free((void*) result);
return NULL;
}
return->data = data;
}
else
result->data = NULL;
return result;
}
/*
 * Copy element `index` of v into *dest.
 *
 * Returns 1 on success. Returns 0, leaving *dest untouched, when index is
 * negative or not less than v->len.
 */
int get_vec_element(vec_ptr v, long index, data_t* dest)
{
    int in_range = (index >= 0) && (index < v->len);

    if (in_range) {
        *dest = v->data[index];
        return 1;
    }
    return 0;
}
/* Report the element count recorded in v. */
long vec_length(vec_ptr v)
{
    long count = (*v).len;

    return count;
}
- Test case: the combine operation
/* Starting value of every accumulation; 0 is the identity element of +. */
#define IDENT 0
/* Binary operator the combine functions use to fold elements together. */
#define OP +
/*
 * Fold every element of v into *dest with OP, starting from IDENT.
 *
 * Baseline version: the length is queried again on every loop test, each
 * element is fetched through get_vec_element, and the running result is
 * read from and written to *dest on every iteration.
 */
void combine1(vec_ptr v, data_t* dest)
{
    long idx = 0;

    *dest = IDENT;
    while (idx < vec_length(v)) {
        data_t elem;

        get_vec_element(v, idx, &elem);
        *dest = *dest OP elem;
        idx++;
    }
}
Code motion
- Identify computations that are performed multiple times but whose results do not change, and move them out of the loop
/*
 * Fold every element of v into *dest with OP, starting from IDENT.
 *
 * Same as combine1 except that vec_length is called once, before the loop,
 * instead of on every loop test.
 */
void combine2(vec_ptr v, data_t* dest)
{
    const long count = vec_length(v);   /* queried once, before the loop */
    long idx = 0;

    *dest = IDENT;
    while (idx < count) {
        data_t elem;

        get_vec_element(v, idx, &elem);
        *dest = *dest OP elem;
        idx++;
    }
}
Reduce function calls
/* Hand back v's element array so callers can index it directly. */
data_t* get_vec_start(vec_ptr v)
{
    data_t *first = (*v).data;

    return first;
}
/*
 * Fold every element of v into *dest with OP, starting from IDENT.
 *
 * The element array is obtained once through get_vec_start and indexed
 * directly, so no per-element accessor call is made. The running result is
 * still read from and written to *dest on every iteration.
 */
void combine3(vec_ptr v, data_t* dest)
{
    const long count = vec_length(v);
    data_t *elems = get_vec_start(v);
    long idx = 0;

    *dest = IDENT;
    while (idx < count) {
        *dest = *dest OP elems[idx];
        idx++;
    }
}
Accumulate the result in a local temporary variable instead of reading and writing through the pointer parameter on every iteration
/*
 * Fold every element of v into *dest with OP, starting from IDENT.
 *
 * The running result is kept in the local variable acc and stored to *dest
 * exactly once, after the loop, so the loop body does not touch *dest.
 */
void combine4(vec_ptr v, data_t* dest)
{
    long i;
    long length = vec_length(v);
    data_t* data = get_vec_start(v);
    data_t acc = IDENT;     /* the accumulator, not *dest, starts at IDENT */

    for (i = 0; i < length; i++) {
        acc = acc OP data[i];
    }
    *dest = acc;
}
Loop unrolling
- Increase the number of elements computed in each iteration and reduce the number of iterations of the loop
- Reduce loop-index computation and conditional branching
- Reduce the number of operations on the critical path of the computation
/*
 * Fold every element of v into *dest with OP, starting from IDENT.
 *
 * The main loop consumes three elements per iteration, each one combined
 * into the accumulator in turn; a second loop handles the elements left
 * over when the length is not a multiple of three.
 */
void combine5(vec_ptr v, data_t* dest)
{
    const long count = vec_length(v);
    const long unrolled_end = count - 2;    /* last start index + 1 for a full triple */
    data_t *elems = get_vec_start(v);
    data_t total = IDENT;
    long idx = 0;

    while (idx < unrolled_end) {
        total = ((total OP elems[idx]) OP elems[idx+1]) OP elems[idx+2];
        idx += 3;
    }
    while (idx < count) {
        total = total OP elems[idx];
        idx++;
    }
    *dest = total;
}
Reassociation transformation
/*
 * Fold every element of v into *dest with OP, starting from IDENT.
 *
 * Three elements are consumed per main-loop iteration. They are combined
 * with each other first, and only that partial result is combined with the
 * accumulator. A second loop handles the leftover elements.
 */
void combine7(vec_ptr v, data_t* dest)
{
    const long count = vec_length(v);
    const long unrolled_end = count - 2;    /* last start index + 1 for a full triple */
    data_t *elems = get_vec_start(v);
    data_t total = IDENT;
    long idx = 0;

    while (idx < unrolled_end) {
        total = total OP (elems[idx] OP (elems[idx+1] OP elems[idx+2]));
        idx += 3;
    }
    while (idx < count) {
        total = total OP elems[idx];
        idx++;
    }
    *dest = total;
}
- Invoking GCC with the command-line option -funroll-loops lets the compiler perform loop unrolling automatically
Improving Parallelism
/*
 * Fold every element of v into *dest with OP, using two accumulators.
 *
 * The main loop consumes two elements per iteration: even-indexed elements
 * go into acc0 and odd-indexed elements into acc1, so the two chains do not
 * depend on each other. A second loop adds the leftover element (odd
 * length) to acc0, and the two partial results are merged with OP.
 *
 * The result equals the sequential fold only when OP is associative and
 * commutative.
 */
void combine6(vec_ptr v, data_t* dest)
{
    long i;
    long length = vec_length(v);
    long limit = length - 1;
    data_t* data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;

    for (i = 0; i < limit; i += 2) {
        acc0 = acc0 OP data[i];
        acc1 = acc1 OP data[i+1];
    }
    for (; i < length; i++)
        acc0 = acc0 OP data[i];

    /* Merge with OP, not a hard-coded '+', so other operators work too. */
    *dest = acc0 OP acc1;
}
Other ways
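- For random arrays, minmax2() is more efficient than minmax1(): the comparison in minmax1() is a hard-to-predict branch, whereas the conditional expressions in minmax2() can be compiled to conditional moves.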
/*
 * For each index below n, make a[i] the smaller and b[i] the larger of the
 * pair (a[i], b[i]).
 *
 * Branching version: the pair is exchanged only when it is out of order.
 */
void minmax1(int a[], int b[], int n)
{
    int k = 0;

    while (k < n) {
        if (b[k] < a[k]) {
            int held = b[k];

            b[k] = a[k];
            a[k] = held;
        }
        k++;
    }
}
/*
 * For each index below n, make a[i] the smaller and b[i] the larger of the
 * pair (a[i], b[i]).
 *
 * Both results are computed with conditional expressions and then stored
 * unconditionally on every iteration.
 */
void minmax2(int a[], int b[], int n)
{
    int k;

    for (k = 0; k < n; k++) {
        const int x = a[k];
        const int y = b[k];
        const int lo = x < y ? x : y;
        const int hi = x < y ? y : x;

        a[k] = lo;
        b[k] = hi;
    }
}
Write cache-friendly code
- Programs usually spend most of their time on a small number of core functions
- Core functions usually spend most of their time on core loops
Locality
- Programs always tend to access recently accessed data items or nearby data items
/*
 * Add up the N elements of v, visiting them in index order.
 * Returns the total.
 */
int sumvec(int v[N])
{
    int total = 0;
    int k = 0;

    while (k < N) {
        total += v[k];
        k++;
    }
    return total;
}
/*
 * Add up all M*N elements of a, row by row.
 *
 * The inner loop varies the column index j, so consecutive accesses touch
 * adjacent ints in memory (stride 1).
 *
 * Named sumarrayrows so that it does not clash with the column-wise
 * sumarraycols, which would otherwise be a second definition of one name.
 */
int sumarrayrows(int a[M][N])
{
    int i, j, sum = 0;

    for (i = 0; i < M; i++)
        for (j = 0; j < N; j++)
            sum += a[i][j];
    return sum;
}
/*
 * Add up all M*N elements of a, column by column.
 *
 * The inner loop varies the row index, so consecutive accesses are a full
 * row (N ints) apart in memory.
 */
int sumarraycols(int a[M][N])
{
    int total = 0;
    int col = 0;

    while (col < N) {
        int row = 0;

        while (row < M) {
            total += a[row][col];
            row++;
        }
        col++;
    }
    return total;
}
Temporal locality
- Repeated references to local variables are good
Spatial locality
- Rearrange loops to improve spatial locality
- Example 1: Multiplication of two $n \times n$ square matrices: $C_{n \times n} = A_{n \times n} B_{n \times n}$
/*
 * Loop order i-j-k: for each (i, j) the inner k loop accumulates the
 * products A[i][k] * B[k][j] in sum, which is then added to C[i][j].
 * In the inner loop the second index of A and the first index of B change.
 * NOTE(review): A, B, C, n, sum and the loop counters are declared outside
 * this fragment; presumably n-by-n arrays of a floating-point type - confirm.
 */
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
sum = 0.0;
for (k = 0; k < n; k++)
sum += A[i][k] * B[k][j];
C[i][j] += sum;
}
}
/*
 * Loop order i-k-j: r holds A[i][k] while the inner j loop adds r * B[k][j]
 * to every C[i][j]. In the inner loop only the second index of B and of C
 * changes.
 * NOTE(review): A, B, C, n, r and the loop counters are declared outside
 * this fragment, and C is only ever added to here - presumably it is
 * zeroed beforehand; confirm.
 */
for (i = 0; i < n; i++) {
for (k = 0; k < n; k++) {
r = A[i][k];
for (j = 0; j < n; j++)
C[i][j] += r * B[k][j];
}
}
/*
 * Loop order k-i-j: the two outer loops are k then i; r holds A[i][k] while
 * the inner j loop adds r * B[k][j] to every C[i][j]. In the inner loop
 * only the second index of B and of C changes.
 * NOTE(review): A, B, C, n, r and the loop counters are declared outside
 * this fragment, and C is only ever added to here - presumably it is
 * zeroed beforehand; confirm.
 */
for (k = 0; k < n; k++) {
for (i = 0; i < n; i++) {
r = A[i][k];
for (j = 0; j < n; j++)
C[i][j] += r * B[k][j];
}
}
- Example 2: Traversing three-dimensional matrices
- When the rightmost index changes the fastest, the reference pattern has a stride of 1
/*
 * Add up all N*N*N elements of a.
 *
 * The loops are nested so that the rightmost index changes fastest and the
 * leftmost index changes slowest.
 */
int sumarray3d(int a[N][N][N])
{
    int total = 0;
    int outer = 0;

    while (outer < N) {
        int mid = 0;

        while (mid < N) {
            int inner = 0;

            while (inner < N) {
                total += a[outer][mid][inner];
                inner++;
            }
            mid++;
        }
        outer++;
    }
    return total;
}
- Example 3: Traversing an array of structures that contain arrays
- Pay attention to how the members are arranged within the structure
/* Number of elements in the global array p (also used as an array dimension). */
#define N 1000
/* Number of entries in each of the per-point arrays vel and acc. */
#define M 3
/*
 * One point: two M-element int arrays, with vel declared before acc.
 * NOTE(review): vel/acc presumably mean velocity and acceleration - confirm.
 */
typedef struct {
int vel[M];
int acc[M];
} point;
/* Global array of N points. */
point p[N];
/*
 * Zero every vel and acc entry of the N points starting at p.
 *
 * Per point, all of vel is cleared first and then all of acc.
 */
void clear1(point* p)
{
    int idx, comp;

    for (idx = 0; idx < N; idx++) {
        point *cur = &p[idx];

        for (comp = 0; comp < M; comp++) {
            cur->vel[comp] = 0;
        }
        for (comp = 0; comp < M; comp++) {
            cur->acc[comp] = 0;
        }
    }
}
/*
 * Zero every vel and acc entry of the N points starting at p.
 *
 * Per point, a single loop alternates between the two arrays: vel[j] is
 * cleared, then acc[j], for each j in turn.
 */
void clear2(point* p)
{
    int idx = 0;

    while (idx < N) {
        point *cur = &p[idx];
        int comp = 0;

        while (comp < M) {
            cur->vel[comp] = 0;
            cur->acc[comp] = 0;
            comp++;
        }
        idx++;
    }
}