The fastest way is to use the CPU popcnt command, and the second is SSSE3 code. The fastest transfer is the Beatleys method, followed by the lookup table: http://www.dalkescientific.com/writings/diary/archive/2011/11/02/faster_popcount_update.html
As with everything, you must compare your workload. Then optimize if it is too slow.
For AMD Phenom II X2 550 with gcc 4.7.1 (using g++ -O3 popcnt.cpp -o popcnt -mpopcnt -msse2 ):
Bitslice (7) 1,462,142 us; cnt = 32500610
Bitslice (24) 1221985 us; cnt = 32500610
Lauradoux 2347749 us; cnt = 32500610
SSE2 8-bit 790898 us; cnt = 32500610
SSE2 16-bit 825568 us; cnt = 32500610
SSE2 32-bit 864665 us; cnt = 32500610
16-bit LUT 1236739 us; cnt = 32500610
8-bit LUT 1951629 us; cnt = 32500610
gcc popcount 803173 us; cnt = 32500610
gcc popcountll 7678479 us; cnt = 32500610
FreeBSD version 1 2802681 us; cnt = 32500610
FreeBSD version 2 2167031 us; cnt = 32500610
Wikipedia # 2 4927947 us; cnt = 32500610
Wikipedia # 3 4212143 us; cnt = 32500610
HAKMEM 169 / X11 3559245 us; cnt = 32500610
naive 16182699 us; cnt = 32500610
Wegner / Kernigan 12115119 us; cnt = 32500610
Anderson 61045764 us; cnt = 32500610
8x shift and add 6712049 us; cnt = 32500610
32x shift and add 6662200 us; cnt = 32500610
For Intel Core2 Duo E8400 with gcc 4.7.1 ( g++ -O3 popcnt.cpp -o popcnt -mssse3 , -mpopcnt not supported on this CPU)
Bitslice (7) 1353007 us; cnt = 32500610
Bitslice (24) 953,044 us; cnt = 32500610
Lauradoux 534697 us; cnt = 32500610
SSE2 8-bit 458277 us; cnt = 32500610
SSE2 16-bit 555278 us; cnt = 32500610
SSE2 32-bit 634897 us; cnt = 32500610
SSSE3 414542 us; cnt = 32500610
16-bit LUT 1208412 us; cnt = 32500610
8-bit LUT 1400175 us; cnt = 32500610
gcc popcount 5428396 us; cnt = 32500610
gcc popcountll 2743358 us; cnt = 32500610
FreeBSD version 1 3025944 us; cnt = 32500610
FreeBSD version 2 2313264 us; cnt = 32500610
Wikipedia # 2 1570519 us; cnt = 32500610
Wikipedia # 3 1051828 us; cnt = 32500610
HAKMEM 169 / X11 3982779 us; cnt = 32500610
naive 20951420 us; cnt = 32500610
Wegner / Kernigan 13665630 us; cnt = 32500610
Anderson 6771549 us; cnt = 32500610
8x shift and add 14917323 us; cnt = 32500610
32x shift and add 14494482 us; cnt = 32500610
The Bitslice method is a parallel mechanism that takes into account multiple (7 or 24) machine words at a time, so it has extreme usability for a common function. After http://dalkescientific.com/writings/diary/popcnt.cpp :
static inline int popcount_fbsd2(unsigned *buf, int n) { int cnt=0; do { unsigned v = *buf++; v -= ((v >> 1) & 0x55555555); v = (v & 0x33333333) + ((v >> 2) & 0x33333333); v = (v + (v >> 4)) & 0x0F0F0F0F; v = (v * 0x01010101) >> 24; cnt += v; } while(--n); return cnt; } static inline int merging2(const unsigned *data) { unsigned count1,count2,half1,half2; count1=data[0]; count2=data[1]; half1=data[2]&0x55555555; half2=(data[2]>>1)&0x55555555; count1 = count1 - ((count1 >> 1) & 0x55555555); count2 = count2 - ((count2 >> 1) & 0x55555555); count1+=half1; count2+=half2; count1 = (count1 & 0x33333333) + ((count1 >> 2) & 0x33333333); count2 = (count2 & 0x33333333) + ((count2 >> 2) & 0x33333333); count1+=count2; count1 = (count1&0x0F0F0F0F)+ ((count1 >> 4) & 0x0F0F0F0F); count1 = count1 + (count1 >> 8); count1 = count1 + (count1 >> 16); return count1 & 0x000000FF; } static inline int merging3(const unsigned *data) { unsigned count1,count2,half1,half2,acc=0; int i; for(i=0;i<24;i+=3) { count1=data[i]; count2=data[i+1]; //w = data[i+2]; half1=data[i+2]&0x55555555; half2=(data[i+2]>>1)&0x55555555; count1 = count1 - ((count1 >> 1) & 0x55555555); count2 = count2 - ((count2 >> 1) & 0x55555555); count1+=half1; count2+=half2; count1 = (count1 & 0x33333333) + ((count1 >> 2) & 0x33333333); count1 += (count2 & 0x33333333) + ((count2 >> 2) & 0x33333333); acc += (count1 & 0x0F0F0F0F)+ ((count1>>4) &0x0F0F0F0F); } acc = (acc & 0x00FF00FF)+ ((acc>>8)&0x00FF00FF); acc = acc + (acc >> 16); return acc & 0x00000FFFF; } /* count 24 words at a time, then 3 at a time, then 1 at a time */ static inline int popcount_24words(unsigned *buf, int n) { int cnt=0, i; for (i=0; i<nn%24; i+=24) { cnt += merging3(buf+i); } for (;i<nn%3; i+=3) { cnt += merging2(buf+i); } cnt += popcount_fbsd2(buf+i, ni); return cnt; }
Hubert kario
source share