Loop Splitting makes code slower

So, I am optimizing a loop (like homework) that adds 10,000 elements 600,000 times. The time without optimization is 23.34s ~, and my goal is to reach less than 7 seconds for B and less than 5 for A.

So, I started my optimizations by first expanding the loop this way.

int     j;

        for (j = 0; j < ARRAY_SIZE; j += 8) {
            sum += array[j] + array[j+1] + array[j+2] + array[j+3] + array[j+4] + array[j+5] +  array[j+6] + array[j+7];

This reduces the runtime to about 6.4 ~ seconds (I can press about 6 if I expand further).

So, I decided that I would try to add subavs and make the final amount at the end, to save time on the read and write dependencies, and I came up with a code that looks like this.

int     j;

    for (j = 0; j < ARRAY_SIZE; j += 8) {
        sum0 += array[j] + array[j+1]; 
        sum1 += array[j+2] + array[j+3];
        sum2 += array[j+4] + array[j+5]; 
        sum3 += array[j+6] + array[j+7];

However, this increases the runtime to 6.8 seconds

I tried a similar technique with pointers, and the best I could do was about 15 seconds.

, , ( , ), 32-, Linux, , , Red Hat.

, , , . - , ? , ? , , 4,8 .

50 , - , , .

    #include <stdio.h>
#include <stdlib.h>

// You are only allowed to make changes to this code as specified by the comments in it.

// The code you submit must have these two values.
#define N_TIMES     600000
#define ARRAY_SIZE   10000

int main(void)
{
    double  *array = calloc(ARRAY_SIZE, sizeof(double));
    double  sum = 0;
    int     i;

    // You can add variables between this comment ...

//  double sum0 = 0;
//  double sum1 = 0;
//  double sum2 = 0;
//  double sum3 = 0;

    // ... and this one.

    // Please change 'your name' to your actual name.
    printf("CS201 - Asgmt 4 - ACTUAL NAME\n");

    for (i = 0; i < N_TIMES; i++) {

        // You can change anything between this comment ...

        int     j;

        for (j = 0; j < ARRAY_SIZE; j += 8) {
            sum += array[j] + array[j+1] + array[j+2] + array[j+3] + array[j+4] + array[j+5] +  array[j+6] + array[j+7];
        }

        // ... and this one. But your inner loop must do the same
        // number of additions as this one does.

        }

    // You can add some final code between this comment ...
//  sum = sum0 + sum1 + sum2 + sum3;
    // ... and this one.

    return 0;
}

    #include <stdio.h>
#include <stdlib.h>

// You are only allowed to make changes to this code as specified by the comments in it.

// The code you submit must have these two values.
#define N_TIMES     600000
#define ARRAY_SIZE   10000

int main(void)
{
    double  *array = calloc(ARRAY_SIZE, sizeof(double));
    double  sum = 0;
    int     i;

    // You can add variables between this comment ...

    double sum0 = 0;
    double sum1 = 0;
    double sum2 = 0;
    double sum3 = 0;

    // ... and this one.

    // Please change 'your name' to your actual name.
    printf("CS201 - Asgmt 4 - ACTUAL NAME\n");

    for (i = 0; i < N_TIMES; i++) {

        // You can change anything between this comment ...

        int     j;

        for (j = 0; j < ARRAY_SIZE; j += 8) {
            sum0 += array[j] + array[j+1]; 
            sum1 += array[j+2] + array[j+3];
            sum2 += array[j+4] + array[j+5]; 
            sum3 += array[j+6] + array[j+7];
        }

        // ... and this one. But your inner loop must do the same
        // number of additions as this one does.

        }

    // You can add some final code between this comment ...
    sum = sum0 + sum1 + sum2 + sum3;
    // ... and this one.

    return 0;
}

ANSWER

"", , . , , 4.9 ~ 50 , , TomKarzes.

int     j;
        for (j = 0; j < ARRAY_SIZE; j += 50) {
            sum +=(((((((array[j] + array[j+1]) + (array[j+2] + array[j+3])) +
                    ((array[j+4] + array[j+5]) + (array[j+6] + array[j+7]))) + 
                    (((array[j+8] + array[j+9]) + (array[j+10] + array[j+11])) +
                    ((array[j+12] + array[j+13]) + (array[j+14] + array[j+15])))) +
                    ((((array[j+16] + array[j+17]) + (array[j+18] + array[j+19]))))) +
                    (((((array[j+20] + array[j+21]) + (array[j+22] + array[j+23])) +
                    ((array[j+24] + array[j+25]) + (array[j+26] + array[j+27]))) + 
                    (((array[j+28] + array[j+29]) + (array[j+30] + array[j+31])) +
                    ((array[j+32] + array[j+33]) + (array[j+34] + array[j+35])))) +
                    ((((array[j+36] + array[j+37]) + (array[j+38] + array[j+39])))))) + 
                    ((((array[j+40] + array[j+41]) + (array[j+42] + array[j+43])) +
                    ((array[j+44] + array[j+45]) + (array[j+46] + array[j+47]))) + 
                    (array[j+48] + array[j+49])));
        }
+4
2

. , gcc, , :

    for (j = 0; j < ARRAY_SIZE; j += 16) {
        sum = sum +
              (array[j   ] + array[j+ 1]) +
              (array[j+ 2] + array[j+ 3]) +
              (array[j+ 4] + array[j+ 5]) +
              (array[j+ 6] + array[j+ 7]) +
              (array[j+ 8] + array[j+ 9]) +
              (array[j+10] + array[j+11]) +
              (array[j+12] + array[j+13]) +
              (array[j+14] + array[j+15]);
    }

, 16 , , . +=, , sum .

, , , , - , .

, .

: ( , ):

    int     j1, j2;

    j1 = 0;
    do {
        j2 = j1 + 20;
        sum = sum +
              (array[j1   ] + array[j1+ 1]) +
              (array[j1+ 2] + array[j1+ 3]) +
              (array[j1+ 4] + array[j1+ 5]) +
              (array[j1+ 6] + array[j1+ 7]) +
              (array[j1+ 8] + array[j1+ 9]) +
              (array[j1+10] + array[j1+11]) +
              (array[j1+12] + array[j1+13]) +
              (array[j1+14] + array[j1+15]) +
              (array[j1+16] + array[j1+17]) +
              (array[j1+18] + array[j1+19]);
        j1 = j2 + 20;
        sum = sum +
              (array[j2   ] + array[j2+ 1]) +
              (array[j2+ 2] + array[j2+ 3]) +
              (array[j2+ 4] + array[j2+ 5]) +
              (array[j2+ 6] + array[j2+ 7]) +
              (array[j2+ 8] + array[j2+ 9]) +
              (array[j2+10] + array[j2+11]) +
              (array[j2+12] + array[j2+13]) +
              (array[j2+14] + array[j2+15]) +
              (array[j2+16] + array[j2+17]) +
              (array[j2+18] + array[j2+19]);
    }
    while (j1 < ARRAY_SIZE);

40, 20, , , . , , .

+2

:

  • , 1, sum +=. 16,4 64- MacBook Pro.

  • gcc -O2, , 5.46 .

  • gcc -O3, , 5.45 .

  • 8- . 2,03 .

  • 16-way additon , 1,91 .

  • 32- . WENT UP 2.08 .

  • , @kcraigie. -O3 6,01 . ( !)

    register double * p;
    for (p = array; p < array + ARRAY_SIZE; ++p) {
        sum += *p;
    }
    
  • for while, sum += *p++ 5.64 .

  • while up, 5.88 .

  • for integer-by-8 integer, 8 [0-7] _array [j + N] sumN N [0, 7]. _array double * const, array, , . 1,86 .

  • , 10 000 + _array [n], N . sum = tnKX(addsum), . , .

  • , 10000 sum += _array[n] N . 6,63 ! -, .

  • static double _array[ARRAY_SIZE];, __builtin_memcpy . 8- 2,96 . , - . ( - , .)

, 16- 8- . , - , .

Edit:

@pvg, :

int ntimes = 0;

// ... and this one.
...
    // You can change anything between this comment ...

            if (ntimes++ == 0) {

< 0,01 .;-) , F-stick.

-1

All Articles