Why does gcc perform tail call optimization for one version, but not for another?

While experimenting with tail call optimization (tco), I came across the following curious example:

unsigned long long int fac1(unsigned long long int n){
  if (n==0)
    return 1;
  return n*fac1(n-1);
}

Actually, I was impressed that gcc was able to execute tco (with flag -O2) here, because it is not that direct:

fac1(unsigned long long):
        testq   %rdi, %rdi
        movl    $1, %eax
        je      .L4
.L3:
        imulq   %rdi, %rax
        subq    $1, %rdi
        jne     .L3
        rep ret
.L4:
        rep ret

However, after changing the return type from unsigned long long intto unsigned intgcc, I could not execute tlo:

unsigned int fac2(unsigned long long int n){
  if (n==0)
    return 1;
  return n*fac2(n-1);
}

we can clearly see the recursive call in the resulting assembly :

fac2(unsigned long long):
        testq   %rdi, %rdi
        jne     .L16
        movl    $1, %eax
        ret
.L16:
        pushq   %rbx
        movq    %rdi, %rbx
        leaq    -1(%rdi), %rdi
        call    fac2(unsigned long long)
        imull   %ebx, %eax
        popq    %rbx
        ret

At first, I rejected this as a missed optimization, but now I'm not sure because clang cannot perform this optimization. So, perhaps, there are subtleties of a language that I do not know about, which prevents this optimization.

gcc fac2, fac1?


, . , . tlo?

, ( fac2):

unsigned int tlo_fac(unsigned long long int n, unsigned long long int cur){
  if (n==0)
    return cur;
  return tlo_fac(n-1, n*cur);
}

unsigned int fac(unsigned long long int n){
  return tlo_fac(n,1);
}

tlo- , fac1 ( 32bit , imulq ):

fac(unsigned long long):
        testq   %rdi, %rdi
        movl    $1, %eax
        je      .L10
.L11:
        imulq   %rdi, %rax
        subq    $1, %rdi
        jne     .L11
.L10:
        rep ret
+6
1

fact2(), unsigned long long int unsigned int

unsigned int fac2(unsigned int n) ,

fac2(unsigned int):
        testl   %edi, %edi
        movl    $1, %eax
        je      .L10
.L9:
        imull   %edi, %eax
        subl    $1, %edi
        jne     .L9
        rep ret
.L10:
        rep ret
+2

All Articles