I have a nested loop that generates the following assembly:
# branch target labels manually added for readability 002E20F8 mov ebx,esi 002E20FA mov dword ptr [ebp-10h],3B9ACA00h 002E2101 sub ebx,edi 002E2103 add ebx,7 002E2106 shr ebx,3 002E2109 nop dword ptr [eax] outer_loop: 002E2110 xor eax,eax 002E2112 xor ecx,ecx 002E2114 cmp edi,esi 002E2116 mov edx,ebx 002E2118 cmova edx,eax 002E211B mov eax,edi 002E211D test edx,edx 002E211F je main+107h (02E2137h) ;end_innerloop inner_loop: 002E2121 movsd xmm0,mmword ptr [eax] 002E2125 inc ecx ; inc/addsd swapped 002E2126 addsd xmm0,mmword ptr [k] 002E212B add eax,8 002E212E movsd mmword ptr [k],xmm0 002E2133 cmp ecx,edx 002E2135 jne main+0F1h (02E2121h) ;inner_loop end_innerloop: 002E2137 sub dword ptr [ebp-10h],1 002E213B jne main+0E0h (02E2110h) ;outer_loop
If I change the line of code before the nested loop to just declare an `int`, and then print it after the for loop, the compiler pulls the store / reload of `k` out of the inner loop.
The first version of the question described it as "generate instructions in a slightly different order". (Editor's note: maybe I should leave this analysis / correction for an answer?)
003520F8 mov ebx,esi 003520FA mov dword ptr [ebp-10h],3B9ACA00h 00352101 sub ebx,edi 00352103 add ebx,7 00352106 shr ebx,3 00352109 nop dword ptr [eax] outer_loop: 00352110 xor eax,eax 00352112 xor ecx,ecx 00352114 cmp edi,esi 00352116 mov edx,ebx 00352118 cmova edx,eax 0035211B mov eax,edi 0035211D test edx,edx 0035211F je main+107h (0352137h) ;end_innerloop 00352121 movsd xmm0,mmword ptr [k] ; load of k hoisted out of the loop. Strangely not optimized to xorpd xmm0,xmm0 inner_loop: 00352126 addsd xmm0,mmword ptr [eax] 0035212A inc ecx 0035212B add eax,8 0035212E cmp ecx,edx 00352130 jne main+0F6h (0352126h) ;inner_loop 00352132 movsd mmword ptr [k],xmm0 ; movsd in different place. end_innerloop: 00352137 sub dword ptr [ebp-10h],1 0035213B jne main+0E0h (0352110h) ;outer_loop
The second code layout generated by the compiler is 3 times faster. I am a little shocked by this. Does anyone know what is going on?
This was compiled using Visual Studio 2015.
Compiler flags (I can add more if necessary):
Optimization: Maximum Speed /O2
The code:
#include <iostream> #include <vector> #include "Stopwatch.h" static constexpr int N = 1000000000; int main() { std::vector<double> buffer; buffer.resize(10); for (auto& i : buffer) { i = 1e-100; } double k = 0; int h = 0; // removing this line and swapping the lines std::cout << "time = "... results in 3x slower code??!! Stopwatch watch; for (int i = 0; i < N; i++) { for (auto& j : buffer) { k += j; } } //std::cout << "time = " << watch.ElapsedMilliseconds() << " / " << k << std::endl; std::cout << "time = " << watch.ElapsedMilliseconds() << " / " << k << " / " << h << std::endl; std::cout << "Done..."; std::getchar(); return EXIT_SUCCESS; }
Stopwatch:
#pragma once

#include <chrono>

// Simple interval timer.
//
// The timer starts when the object is constructed and can be reset with
// Restart(). The Elapsed*() accessors report the time since the last
// (re)start without stopping or modifying the timer, so they can be called
// repeatedly and through a const reference. The class is non-copyable.
class Stopwatch
{
private:
    // steady_clock is guaranteed to be monotonic, which is what interval
    // measurement needs; high_resolution_clock may alias a non-steady clock
    // on some standard library implementations.
    using Clock = std::chrono::steady_clock;
    using microseconds = std::chrono::microseconds;

    Clock::time_point _start; // moment of construction or of the last Restart()

public:
    // Starts timing immediately.
    Stopwatch()
    {
        Restart();
    }

    // Resets the start point to "now".
    void Restart()
    {
        _start = Clock::now();
    }

    // Time since the last (re)start, in milliseconds (microsecond granularity).
    double ElapsedMilliseconds() const
    {
        return ElapsedMicroseconds() * 1E-3;
    }

    // Time since the last (re)start, in seconds (microsecond granularity).
    double ElapsedSeconds() const
    {
        return ElapsedMicroseconds() * 1E-6;
    }

    Stopwatch(const Stopwatch&) = delete;
    Stopwatch& operator=(const Stopwatch&) = delete;

private:
    // Whole microseconds since the last (re)start, returned as a double.
    // duration_cast truncates any sub-microsecond remainder.
    double ElapsedMicroseconds() const
    {
        const auto elapsed = Clock::now() - _start;
        return static_cast<double>(std::chrono::duration_cast<microseconds>(elapsed).count());
    }
};