Why is adding an extra mask and bitshift more optimized?

While writing a function that converts an integer to a hexadecimal string, I noticed it contained a redundant mask and bit shift; but when I deleted them, the generated code got much bigger (about 8 times larger).

char *i2s(int n){
    static char buf[(sizeof(int)<<1)+1]={0};
    int i=0;
    while(i<(sizeof(int)<<1)+1){    /* mask the ith hex, shift it to lsb */
//      buf[i++]='0'+(0xf&(n>>((sizeof(int)<<3)-i<<2))); /* less optimizable ??? */
        buf[i++]='0'+(0xf&((n&(0xf<<((sizeof(int)<<3)-i<<2)))>>((sizeof(int)<<3)-i<<2)));
        if(buf[i-1]>'9')buf[i-1]+=('A'-'0'-10); /* handle A-F */
    }
    for(i=0;buf[i++]=='0';)
        /*find first non-zero*/;
    return (char *)buf+i;
}

With the additional bit shift and mask, compiled with gcc -S -O3, the loops are unrolled and the whole function reduces to:

    movb    $48, buf.1247
    xorl    %eax, %eax
    movb    $48, buf.1247+1
    movb    $48, buf.1247+2
    movb    $48, buf.1247+3
    movb    $48, buf.1247+4
    movb    $48, buf.1247+5
    movb    $48, buf.1247+6
    movb    $48, buf.1247+7
    movb    $48, buf.1247+8
    .p2align 4,,7
    .p2align 3
.L26:
    movzbl  buf.1247(%eax), %edx
    addl    $1, %eax
    cmpb    $48, %dl
    je  .L26
    addl    $buf.1247, %eax
    ret

This is what I expected for 32-bit x86 (64-bit should be similar, just with about twice as many movb-like stores); however, without the seemingly redundant mask and bit shift, gcc does not seem to unroll and optimize it the same way.

What's going on here? Why does gcc optimize the version with the extra mask so much better (a gcc quirk?), and not the simpler one? (For reference, the C is meant to mask off each hex digit and shift it down with >> to the low bits, starting from the MSB end, so that leading 0s can then be skipped.)

+4
2

I can reproduce this with gcc4.7, and newer gcc behaves the same way.

gcc isn't unrolling and optimizing your loop; it has proved that the value stored is always '0' + 0, a compile-time constant, so the loop is gone entirely.

clang treats the arg n the same way, though it structures the output a little differently from gcc. Notice that in both the gcc and clang asm, the arg is never even read: nothing in the output depends on n.

So no: the extra mask doesn't make the code more optimizable. It puts the out-of-range shift onto the constant 0xf, which gcc folds to 0 at compile time, and that guts the whole computation! Without the mask, the out-of-range shift applies to the runtime value n instead, which can't be folded, so you get real shift and mask instructions and a lot more code.
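To spell out what gcc sees (my annotation; the exact shift counts depend on which side of the unsequenced i++ gets evaluated first, but they are out of range either way):

/* The store in the masked version, with sizeof(int) == 4:
 *
 *   buf[i++] = '0' + (0xf & ((n & (0xf << S)) >> S));
 *   with  S = ((sizeof(int)<<3) - i) << 2 = (32 - i) * 4,
 *   so    S = 92..124 (or 96..128)    -- always >= 32: undefined
 *
 * gcc constant-folds the undefined constant shift at compile time:
 *   0xf << S  ->  0      (the mask vanishes)
 *   n & 0     ->  0      (n no longer matters)
 *   0 >> S    ->  0
 * leaving   buf[i++] = '0';   i.e. the movb $48 stores in your asm.
 */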

There are more bugs here, too:

  • When all 9 bytes end up '0', the final scan loop runs right off the end of the buffer looking for something that isn't '0'. (It happens not to crash only because the static buffer sits in zeroed storage, so the scan stops at the first 0 (not '0') byte just past it; clang and gcc both happen to lay it out that way.)
  • It returns a pointer one past the first non-'0' byte, dropping a significant digit, because buf[i++] in the for() increments i once more on the iteration where the test fails.
  • The behaviour is undefined anyway: i++ and i are both used in the same expression with no sequence point between them.
  • It's not portable to systems where CHAR_BIT isn't 8. (static char buf[CHAR_BIT*sizeof(n)/4 + 1] would fix that part, at least.)

Unfortunately neither clang nor gcc warns about the out-of-range shift counts here, since they're built with << rather than written as plain constants. The sane way to grab the i-th nibble is to shift the value, not the mask: buf[i]='0'+ (0x0f & (n >> (4*i))); taking the low nibble first. With gcc, it's just as good to follow @Fabio's suggestion of a tmp >>= 4 inside the loop instead. Either way, after unrolling the shift counts are constants, so it's all cheap shr reg, imm8 instructions. (And clang and gcc know how to strength-reduce n>>(4*i) into a running shift by 4 anyway.)
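A minimal sketch (my example, with an arbitrary test value) showing that the two loop shapes extract the same nibbles:

#include <assert.h>

int main(void)
{
    unsigned n = 0x1234ABCDu;               /* arbitrary test value */
    unsigned tmp = n;
    for (int i = 0; i < 8; i++) {
        unsigned a = 0x0f & (n >> (4 * i)); /* index each nibble directly */
        unsigned b = 0x0f & tmp;            /* @Fabio: running shift */
        tmp >>= 4;
        assert(a == b);                     /* same nibble either way */
    }
    return 0;
}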


gcc compiles the fixed version nicely, too: the if() that moves digits above 9 up into the A to F range becomes branchless code.

By the way, if the constants in the asm look odd at first: 48 == 0x30 == '0', so those movb $48 stores are just filling the buffer with ASCII '0' characters. (That is, all 9 bytes are being set to '0' unconditionally; clang's output shows the same thing.)

I put a bugfixed version of the function on godbolt.
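Since the link itself isn't preserved here, the following is only my sketch of what a bug-fixed version could look like (i2s_fixed is a hypothetical name); it addresses the bullets above and returns "0" rather than an empty string for n == 0:

char *i2s_fixed(int n)
{
    static char buf[sizeof(n) * 2 + 1];
    unsigned un = (unsigned)n;      /* unsigned copy: >> is SRL, no UB */
    int i;
    for (i = 0; i < (int)(sizeof(n) * 2); i++) {
        /* take nibbles from the MSB end, with in-range shift counts */
        unsigned nib = 0xf & (un >> (4 * (sizeof(n) * 2 - 1 - i)));
        buf[i] = (char)(nib < 10 ? '0' + nib : 'A' + (nib - 10));
    }
    buf[i] = '\0';
    for (i = 0; buf[i] == '0' && buf[i + 1] != '\0'; i++)
        ;                           /* skip leading zeros, keep the last digit */
    return buf + i;
}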

Fabio's answer looks good. As often happens with gcc, Fabio's loop and mine compile to essentially the same asm. (gcc strength-reduces the n>>(4*i) form into an n>>=4 running shift on its own.)


By the way, gcc6.3 compiles your original version into something pretty funny. Look at this!

i2s_orig:
    mov     BYTE PTR buf.1406+3, 48
    mov     BYTE PTR buf.1406, 48
    cmp     BYTE PTR buf.1406+3, 48
    mov     BYTE PTR buf.1406+1, 48
    mov     BYTE PTR buf.1406+2, 48
    mov     BYTE PTR buf.1406+4, 48
    mov     BYTE PTR buf.1406+5, 48
    mov     BYTE PTR buf.1406+6, 48
    mov     BYTE PTR buf.1406+7, 48
    mov     BYTE PTR buf.1406+8, 48
    mov     BYTE PTR buf.1406+9, 0
    jne     .L7    # testing flags from the compare earlier
    jne     .L8
    jne     .L9
    jne     .L10
    jne     .L11
    sete    al
    movzx   eax, al
    add     eax, 8
.L3:
    add     eax, OFFSET FLAT:buf.1406
    ret
.L7:
    mov     eax, 3
    jmp     .L3
 ... more of the same, setting eax to 4, or 5, etc.

Those jne instructions all test the flags from that single cmp, which compared a byte against the 48 that had just been stored into it, so they can obviously never be taken; gcc just doesn't manage to prove that and delete them.

+2

First, look at the shift count: ((sizeof(int)<<3)-i<<2) parses as (((sizeof(int)<<3)-i)<<2, because subtraction binds tighter than <<. With a 4-byte int that is (32-i)*4, far more than the width of the type, so every one of those shifts is out of range.
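A quick check (my throwaway example) that prints the actual shift counts:

#include <stdio.h>

int main(void)
{
    /* i as it is after the i++ in the loop body, i.e. 1..9 */
    for (int i = 1; i <= 9; i++)
        printf("i=%d  shift count=%d\n", i, (int)((sizeof(int)<<3)-i<<2));
    /* prints 124 down to 92: every shift of a 32-bit int is out of range */
    return 0;
}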

Second, C and C++ are not Java: what >> means depends on the type of the left operand. In [GNU] C and C++, "x >> y" is only fully defined when y is in range: if x is signed, you get an arithmetic shift right (SRA, sign-extending); if x is unsigned, you get a logical shift right (SRL, zero-extend). And a shift count >= the width of x is undefined behaviour, not something that conveniently yields 0.
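A small illustration (my example; the SRA result for signed x is what gcc and clang provide, since right-shifting negative values is implementation-defined in ISO C):

#include <stdio.h>

int main(void)
{
    int      s = -16;           /* bit pattern 0xFFFFFFF0 in two's complement */
    unsigned u = 0xFFFFFFF0u;
    printf("%d\n",  s >> 4);    /* SRA: sign bits shift in, prints -1 */
    printf("%#x\n", u >> 4);    /* SRL: zeros shift in, prints 0xfffffff */
    return 0;
}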

Two things to keep in mind when chasing "optimized" output: 1) branch-free code avoids misprediction penalties, and 2) aggressively unrolled code is bigger, which costs space in L1i. Compact, branch-free loops are often the best trade-off.

One more note: because the code strips the leading '0' digits, converting 0 to base 16 returns an empty string instead of "0".

algo, , ( , ). , .

, , , (15-20 ), . algo: , 3 4 .

const char* i2s_brcfree(int n)
{
  static char buf[ sizeof(n)*2+1] = {0};
  unsigned int nibble_shifter = n;
  for(char* p = buf+sizeof(buf)-2; p >= buf; --p, nibble_shifter>>=4){
    const char curr_nibble = nibble_shifter & 0xF; // look only at lowest 4 bits
    char digit = '0' + curr_nibble;
    // "promote" to hex if nibble is over 9, 
    // conditionally adding the difference between ('0'+nibble) and 'A' 
    enum{ dec2hex_offset = ('A'-'0'-0xA) }; // compile time constant
    digit += dec2hex_offset & -(curr_nibble > 9); // conditional add
    *p = digit;
  }
  return buf;
}

Note: it's written as C++, but it is also valid C99. I checked the generated asm from both GCC and Visual Studio targeting x86.
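For example (my usage sketch, assuming a 32-bit int and the function above in scope):

#include <stdio.h>

int main(void)
{
    printf("%s\n", i2s_brcfree(0xBEEF)); /* prints 0000BEEF */
    printf("%s\n", i2s_brcfree(0));      /* prints 00000000, not an empty string */
    return 0;
}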

+1
