Hello, I'm trying to run a program that finds the closest pair, using brute force using caching methods such as pdf here: Stanford Performance CachingMy source code:
float compare_points_BF(int N,point *P){
int i,j;
float distance=0, min_dist=FLT_MAX;
point *p1, *p2;
unsigned long long calc = 0;
for (i=0;i<(N-1);i++){
for (j=i+1;j<N;j++){
if ((distance = (P[i].x - P[j].x) * (P[i].x - P[j].x) +
(P[i].y - P[j].y) * (P[i].y - P[j].y)) < min_dist){
min_dist = distance;
p1 = &P[i];
p2 = &P[j];
}
}
}
return sqrt(min_dist);
}
This program gives approximately this time:
N 8192 16384 32768 65536 131072 262144 524288 1048576
seconds 0,070 0,280 1,130 5,540 18,080 72,838 295,660 1220,576
0,080 0,330 1,280 5,190 20,290 80,880 326,460 1318,631
Cache version of the above program:
float compare_points_BF(register int N, register int B, point *P){
register int i, j, ib, jb, num_blocks = (N + (B-1)) / B;
register point *p1, *p2;
register float distance=0, min_dist=FLT_MAX, regx, regy;
for (i = 0; i < num_blocks; i++){
for (j = i; j < num_blocks; j++){
for (jb = j * B; jb < ( ((j+1)*B) < N ? ((j+1)*B) : N); jb++){
regx = P[jb].x;
regy = P[jb].y;
for (i == j ? (ib = jb + 1) : (ib = i * B); ib < ( ((i+1)*B) < N ? ((i+1)*B) : N); ib++){
if((distance = (P[ib].x - regx) * (P[ib].x - regx) +
(P[ib].y - regy) * (P[ib].y - regy)) < min_dist){
min_dist = distance;
p1 = &P[ib];
p2 = &P[jb];
}
}
}
}
}
return sqrt(min_dist);
}
and some results:
Block_size = 256 N = 8192 Run time: 0.090 sec
Block_size = 512 N = 8192 Run time: 0.090 sec
Block_size = 1024 N = 8192 Run time: 0.090 sec
Block_size = 2048 N = 8192 Run time: 0.100 sec
Block_size = 4096 N = 8192 Run time: 0.090 sec
Block_size = 8192 N = 8192 Run time: 0.090 sec
Block_size = 256 N = 16384 Run time: 0.357 sec
Block_size = 512 N = 16384 Run time: 0.353 sec
Block_size = 1024 N = 16384 Run time: 0.360 sec
Block_size = 2048 N = 16384 Run time: 0.360 sec
Block_size = 4096 N = 16384 Run time: 0.370 sec
Block_size = 8192 N = 16384 Run time: 0.350 sec
Block_size = 16384 N = 16384 Run time: 0.350 sec
Block_size = 128 N = 32768 Run time: 1.420 sec
Block_size = 256 N = 32768 Run time: 1.420 sec
Block_size = 512 N = 32768 Run time: 1.390 sec
Block_size = 1024 N = 32768 Run time: 1.410 sec
Block_size = 2048 N = 32768 Run time: 1.430 sec
Block_size = 4096 N = 32768 Run time: 1.430 sec
Block_size = 8192 N = 32768 Run time: 1.400 sec
Block_size = 16384 N = 32768 Run time: 1.380 sec
Block_size = 256 N = 65536 Run time: 5.760 sec
Block_size = 512 N = 65536 Run time: 5.790 sec
Block_size = 1024 N = 65536 Run time: 5.720 sec
Block_size = 2048 N = 65536 Run time: 5.720 sec
Block_size = 4096 N = 65536 Run time: 5.720 sec
Block_size = 8192 N = 65536 Run time: 5.530 sec
Block_size = 16384 N = 65536 Run time: 5.550 sec
Block_size = 256 N = 131072 Run time: 22.750 sec
Block_size = 512 N = 131072 Run time: 23.130 sec
Block_size = 1024 N = 131072 Run time: 22.810 sec
Block_size = 2048 N = 131072 Run time: 22.690 sec
Block_size = 4096 N = 131072 Run time: 22.710 sec
Block_size = 8192 N = 131072 Run time: 21.970 sec
Block_size = 16384 N = 131072 Run time: 22.010 sec
Block_size = 256 N = 262144 Run time: 90.220 sec
Block_size = 512 N = 262144 Run time: 92.140 sec
Block_size = 1024 N = 262144 Run time: 91.181 sec
Block_size = 2048 N = 262144 Run time: 90.681 sec
Block_size = 4096 N = 262144 Run time: 90.760 sec
Block_size = 8192 N = 262144 Run time: 87.660 sec
Block_size = 16384 N = 262144 Run time: 87.760 sec
Block_size = 256 N = 524288 Run time: 361.151 sec
Block_size = 512 N = 524288 Run time: 379.521 sec
Block_size = 1024 N = 524288 Run time: 379.801 sec
From what we see, runtime is slower than non-cached code. Is this related to compiler optimization? Is the code bad or is it just because of an algorithm that does not work with tiles? I am using VS 2010 compiled with a 32 bit executable. Thanks in advance!