GCC does not optimize away a struct copy from an uninitialized static const

Question

GCC does not optimize away a struct copy from an uninitialized static const

First of all, I am developing for a microcontroller, so RAM and ROM usage are priorities.

I understand that this may read like a bug report, or as not specific enough. If I do not get any answers here, I will file it as a bug report.

I like to use static const structures to initialize stack structures to their defaults. In most cases, the default structure is all zeros. I prefer to do this with static const structs rather than memset (memset or struct assignment, static const assignment).

My current toolchain is arm-none-eabi-gcc-4_7_3, compiling for a Cortex-M4 target with -Os optimization.

I noticed the following: GCC generates different code if I explicitly initialize my static const structure to zero than if I do not (static const struct foo; vs. static const struct foo = {0};). In particular, it allocates the uninitialized static const structures in memory and performs actual copy operations from them.

Here is a sample code:

 /* Two sample aggregate types used to compare the generated code. */
 struct foo {int foo; int bar;};
 struct bar {int bar[20];};

 /*
  * File-scope constant "default" objects.
  * foo1_init / bar1_init are declared WITHOUT an initializer;
  * foo2_init / bar2_init are explicitly initialized with {0}.
  * This difference is the only thing the question is about.
  */
 static const struct foo foo1_init, foo2_init = {0};
 static const struct bar bar1_init, bar2_init = {0};

 /* Destination objects, defined in some other translation unit. */
 extern struct foo foo1, foo2;
 extern struct bar bar1, bar2;

 /* Struct assignment from the constant declared without an initializer. */
 void init_foo1(void) { foo1 = foo1_init; }

 /* Struct assignment from the constant initialized with {0}. */
 void init_foo2(void) { foo2 = foo2_init; }

 /* Same as init_foo1, but for the array-carrying struct bar. */
 void init_bar1(void) { bar1 = bar1_init; }

 /* Same as init_foo2, but for the array-carrying struct bar. */
 void init_bar2(void) { bar2 = bar2_init; }

Compiled, this leads to the following assembly listing (reordered and trimmed for brevity):

  396 .section .bss.foo1_init,"aw",%nobits 397 .align 2 398 .set .LANCHOR0,. + 0 401 foo1_init: 402 0000 00000000 .space 8 402 00000000 40 .L2: 41 0010 00000000 .word .LANCHOR0 42 0014 00000000 .word foo1 55: **** foo1 = foo1_init; 32 .loc 1 55 0 33 0000 034A ldr r2, .L2 34 0002 044B ldr r3, .L2+4 35 0004 92E80300 ldmia r2, {r0, r1} 36 0008 83E80300 stmia r3, {r0, r1} 67 .L5: 68 000c 00000000 .word foo2 60: **** foo2 = foo2_init; 60 0000 024B ldr r3, .L5 61 0002 0022 movs r2, #0 62 0004 1A60 str r2, [r3, #0] 63 0006 5A60 str r2, [r3, #4] 389 .section .bss.bar1_init,"aw",%nobits 390 .align 2 391 .set .LANCHOR1,. + 0 394 bar1_init: 395 0000 00000000 .space 80 395 00000000 395 00000000 395 00000000 395 00000000 98 .L8: 99 0010 00000000 .word .LANCHOR1 100 0014 00000000 .word bar1 65: **** bar1 = bar1_init; 89 .loc 1 65 0 90 0002 0349 ldr r1, .L8 91 0004 0348 ldr r0, .L8+4 92 0006 5022 movs r2, #80 93 0008 FFF7FEFF bl memcpy 130 .L11: 131 0010 00000000 .word bar2 70: **** bar2 = bar2_init; 121 .loc 1 70 0 122 0002 0021 movs r1, #0 123 0004 5022 movs r2, #80 124 0006 0248 ldr r0, .L11 125 0008 FFF7FEFF bl memset

We can see that for foo2 = foo2_init and bar2 = bar2_init the compiler has optimized the copies away, reducing them to storing zeros to foo2 directly and to calling memset for bar2.

We can see that for foo1 = foo1_init and bar1 = bar1_init the compiler performs explicit copies: loading into and storing from registers for foo1, and calling memcpy for bar1.

I have a few questions:

Is this expected GCC behavior? I would expect uninitialized static const structures to follow the same path inside GCC as initialized static const structures, and therefore to produce the same output.
Does this happen with other versions of ARM GCC? I have no other versions at hand, and all of the online C compilers I could find are actually C++ compilers.
Does this happen with other GCC target architectures? Again, I have no other versions at hand.

+7

optimization with gcc struct

Iain rist Feb 05 '16 at 13:01

source share

1 answer

hdante · Answer 1 · 2017-05-10T06:45:19+0000

I tested on amd64 and, to my surprise, this seems to be consistent behavior (but I don't know whether it is a bug). gcc places foo1_init and bar1_init in the common data segment, i.e. the segment of values that the operating system zero-initializes (.bss). foo2_init and bar2_init are placed in the read-only segment (.rodata), as if they were non-zero initialized values. This can be seen using -O0. Since you are not using an OS, the section that the OS would initialize is manually initialized by gcc and/or the linker and then copied. gcc optimizes the .rodata values by emitting a direct memset and eliminating the dead *2_init variables. clang, however, optimizes both cases equally.

Here follows the output of gcc (-O0):

  .file "defs.c" .local foo1_init .comm foo1_init,8,8 .section .rodata .align 8 .type foo2_init, @object .size foo2_init, 8 foo2_init: .zero 8 .local bar1_init .comm bar1_init,80,32 .align 32 .type bar2_init, @object .size bar2_init, 80 bar2_init: .zero 80 .text .globl init_foo1 .type init_foo1, @function init_foo1: .LFB0: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movq foo1_init(%rip), %rax movq %rax, foo1(%rip) nop popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE0: .size init_foo1, .-init_foo1 .globl init_foo2 .type init_foo2, @function init_foo2: .LFB1: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movq $0, foo2(%rip) nop popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE1: .size init_foo2, .-init_foo2 .globl init_bar1 .type init_bar1, @function init_bar1: .LFB2: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movq bar1_init(%rip), %rax movq %rax, bar1(%rip) movq bar1_init+8(%rip), %rax movq %rax, bar1+8(%rip) movq bar1_init+16(%rip), %rax movq %rax, bar1+16(%rip) movq bar1_init+24(%rip), %rax movq %rax, bar1+24(%rip) movq bar1_init+32(%rip), %rax movq %rax, bar1+32(%rip) movq bar1_init+40(%rip), %rax movq %rax, bar1+40(%rip) movq bar1_init+48(%rip), %rax movq %rax, bar1+48(%rip) movq bar1_init+56(%rip), %rax movq %rax, bar1+56(%rip) movq bar1_init+64(%rip), %rax movq %rax, bar1+64(%rip) movq bar1_init+72(%rip), %rax movq %rax, bar1+72(%rip) nop popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE2: .size init_bar1, .-init_bar1 .globl init_bar2 .type init_bar2, @function init_bar2: .LFB3: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movl $bar2, %eax movl $80, %ecx movl $0, %esi movq %rsi, (%rax) movl %ecx, %edx addq %rax, %rdx addq $8, %rdx movq %rsi, -16(%rdx) leaq 8(%rax), %rdx andq $-8, %rdx subq %rdx, %rax addl 
%eax, %ecx andl $-8, %ecx movl %ecx, %eax shrl $3, %eax movl %eax, %ecx movq %rdx, %rdi movq %rsi, %rax rep stosq nop popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE3: .size init_bar2, .-init_bar2 .ident "GCC: (GNU) 6.3.1 20170306" .section .note.GNU-stack,"",@progbits

GCC does not optimize away a struct copy from an uninitialized static const

More articles: