Today I read the code using the lookup table instead of if-else to trim the two summarized uint8 values. The map i is in i={0...255} and 255 in i={256...511} . I wondered how big a gain could be, and tried to figure it out using gprof,
g++ -std=c++0x -pg perfLookup.cpp -O2 -o perfLookup && ./perfLookup && gprof perfLookup |less
with the code below. Now without the -O2 flag, gprof says lookup () takes 45% and ifelse () takes 48% of the execution time. With -O2, although this is 56% for search () and 43% for ifelse (). But is this test correct? Perhaps most of the code is optimized since dst is never read?
#include <iostream> #include <cstdint> #include <vector> void lookup(std::vector<uint8_t> src, int repeat) { uint8_t lookup[511]; for (int i = 0; i < 256; i++) { lookup[i] = i; } for (int i = 256; i < 512; i++) { lookup[i] = 255; } std::vector<uint8_t> dst(src.size()); for (int i = 0; i < repeat; i++) { for (int i = 0; i < src.size(); i++) { dst[i] = lookup[src[i]]; } } } void ifelse(std::vector<uint8_t> src, int repeat) { std::vector<uint8_t> dst(src.size()); for (int i = 0; i < repeat; i++) { for (int i = 0; i < src.size(); i++) { dst[i] = (src[i] > 255) ? 255 : src[i]; } } } int main() { int n = 10000; std::vector<uint8_t> src(n); for (int i = 0; i < src.size(); i++) { src[i] = rand() % 510; } lookup(src, 10000); ifelse(src, 10000); }
Updated code:
#include <iostream> #include <cstdint> #include <cstring> #include <vector> #include <algorithm> // g++ -std=c++0x -pg perfLookup.cpp -O2 -o perfLookup && ./perfLookup && gprof perfLookup |less std::vector<uint16_t> lookup(std::vector<uint16_t> src, int repeat) { uint16_t lookup[511]; for (int i = 0; i < 256; i++) { lookup[i] = i; } for (int i = 256; i < 511; i++) { lookup[i] = 255; } std::vector<uint16_t> dst(src.size()); for (int i = 0; i < repeat; i++) { for (int k = 0; k < src.size(); k++) { dst[k] = lookup[src[k]]; } } return dst; } std::vector<uint16_t> ifelse(std::vector<uint16_t> src, int repeat) { std::vector<uint16_t> dst(src.size()); for (int i = 0; i < repeat; i++) { for (int k = 0; k < src.size(); k++) { dst[k] = (src[k] > 255) ? 255 : src[k]; } } return dst; } std::vector<uint16_t> copyv(std::vector<uint16_t> src, int repeat) { std::vector<uint16_t> dst(src.size()); for (int i = 0; i < repeat; i++) { dst = src; for (int k = 0; k < src.size(); k++) { if (dst[k] > 255) { dst[k] = 255; } } } return dst; } std::vector<uint16_t> copyC(std::vector<uint16_t> src, int repeat) { uint16_t* dst = (uint16_t *) malloc(sizeof(uint16_t) * src.size()); // Alloc array for dst for (int i = 0; i < repeat; i++) { std::memcpy(dst, &src[0], sizeof(uint16_t) * src.size()); // copy src into array for (int k = 0; k < src.size(); k++) { if ((dst[k] & 0xFF00) != 0) dst[k] = 0x00FF; } } free(dst); return std::vector<uint16_t>(); } int main() { int n = 10000; std::vector<uint16_t> src(n); for (int i = 0; i < src.size(); i++) { src[i] = rand() % 510; } std::vector<uint16_t> dst; dst = lookup(src, 10000); dst = ifelse(src, 10000); dst = copyv(src, 10000); }
Simon A. Eugster
source share