A function clone may prevent GCC from inlining a callee into the caller. I noticed this when testing target_clones features for performance tuning. Tested GCC versions: GCC 14.2 and GCC master commit 1de156eb2bb445cd0e0a582944dcd75d085f30c9 on both x86-64 and RISC-V targets. An example is shown below: ```shell git clone https://github.com/eembc/coremark.git cd coremark git reset d5fad6bd094899101a4e5fd53af7298160ced6ab --hard cat > crcu32_1.patch << EOF diff a/core_util.c b/core_util.c --- a/core_util.c +++ b/core_util.c @@ -201,6 +201,13 @@ crcu32(ee_u32 newval, ee_u16 crc) return crc; } ee_u16 +crcu32_2(ee_u32 newval, ee_u16 crc) +{ + crc = crc16((ee_s16)newval, crc); + crc = crc16((ee_s16)(newval >> 16), crc); + return crc; +} +ee_u16 crc16(ee_s16 newval, ee_u16 crc) { return crcu16((ee_u16)newval, crc); EOF patch -p1 < crcu32_1.patch gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S ``` Then, look at the generated `core_util.s` file; you will find that the whole call chain (crcu32|crcu32_2)->crc16->crcu16->crcu8 has been inlined. ```asm crcu32: .LFB15: .cfi_startproc movl %esi, %eax movl %edi, %ecx movl $8, %esi .p2align 5 .p2align 4 .p2align 3 .L33: movl %edi, %edx shrb %dil xorl %eax, %edx shrw %ax andl $1, %edx negl %edx andw $-24575, %dx xorl %edx, %eax subb $1, %sil jne .L33 movzbl %ch, %edi movl $8, %esi ``` However, when the copied function differs slightly from the original (here, the shift amount is changed from 16 to 15), for instance: ```shell cat > crcu32_2.patch << EOF diff a/core_util.c b/core_util.c --- a/core_util.c +++ b/core_util.c @@ -204,7 +204,7 @@ ee_u16 crcu32_2(ee_u32 newval, ee_u16 crc) { crc = crc16((ee_s16)newval, crc); - crc = crc16((ee_s16)(newval >> 16), crc); + crc = crc16((ee_s16)(newval >> 15), crc); return crc; } ee_u16 EOF patch -p1 < crcu32_2.patch gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S ``` You will notice that the `crcu32` and `crcu32_2` functions are now generated as machine code without inlining. 
```asm crcu32: .LFB15: .cfi_startproc movl %edi, %r8d movzwl %si, %esi movzwl %di, %edi call crcu16 movl %r8d, %edi movzwl %ax, %esi shrl $16, %edi jmp crcu16 .cfi_endproc ``` However, if we duplicate the whole crc16->crcu16->crcu8 chain as well, inlining still works for the first crc16 call, but not for the second. ```bash cat > crcu16_1.patch << EOF diff a/core_util.c b/core_util.c --- a/core_util.c +++ b/core_util.c @@ -187,6 +187,31 @@ crcu8(ee_u8 data, ee_u16 crc) return crc; } ee_u16 +crcu8_2(ee_u8 data, ee_u16 crc) +{ + ee_u8 i = 0, x16 = 0, carry = 0; + + for (i = 0; i < 8; i++) + { + x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1)); + data >>= 1; + + if (x16 == 1) + { + crc ^= 0x4002; + carry = 1; + } + else + carry = 0; + crc >>= 1; + if (carry) + crc |= 0x8000; + else + crc &= 0x7fff; + } + return crc; +} +ee_u16 crcu16(ee_u16 newval, ee_u16 crc) { crc = crcu8((ee_u8)(newval), crc); @@ -194,6 +219,13 @@ crcu16(ee_u16 newval, ee_u16 crc) return crc; } ee_u16 +crcu16_2(ee_u16 newval, ee_u16 crc) +{ + crc = crcu8_2((ee_u8)(newval), crc); + crc = crcu8_2((ee_u8)((newval) >> 8), crc); + return crc; +} +ee_u16 crcu32(ee_u32 newval, ee_u16 crc) { crc = crc16((ee_s16)newval, crc); @@ -201,10 +233,15 @@ crcu32(ee_u32 newval, ee_u16 crc) return crc; } ee_u16 +crc16_2(ee_s16 newval, ee_u16 crc) +{ + return crcu16_2((ee_u16)newval, crc); +} +ee_u16 crcu32_2(ee_u32 newval, ee_u16 crc) { - crc = crc16((ee_s16)newval, crc); - crc = crc16((ee_s16)(newval >> 15), crc); + crc = crc16_2((ee_s16)newval, crc); + crc = crc16_2((ee_s16)(newval >> 15), crc); return crc; } ee_u16 EOF patch -p1 < crcu16_1.patch gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S ``` In this case, both functions are inlined again. 
```asm crcu32: .LFB17: .cfi_startproc movl %edi, %edx movl $8, %ecx .p2align 5 .p2align 4 .p2align 3 .L41: movl %edi, %eax shrb %dil xorl %esi, %eax shrw %si andl $1, %eax negl %eax andw $-24575, %ax xorl %eax, %esi subb $1, %cl jne .L41 .... movzwl %si, %esi movzwl %dx, %edi jmp crcu16 .cfi_endproc .LFE19: .size crcu32_2, .-crcu32_2 .p2align 4 .globl crc16 .type crc16, @function .... crcu32_2: .LFB19: .cfi_startproc movl %edi, %edx movl $8, %ecx .p2align 5 .p2align 4 .p2align 3 .L51: movl %edi, %eax shrb %dil xorl %esi, %eax shrw %si andl $1, %eax negl %eax andw $-24575, %ax xorl %eax, %esi subb $1, %cl jne .L51 movzbl %dh, %edi movl $8, %ecx .... subb $1, %cl jne .L52 shrl $15, %edx movzwl %si, %esi movzwl %dx, %edi jmp crcu16 .cfi_endproc .LFE19: .size crcu32_2, .-crcu32_2 .p2align 4 .globl crc16 .type crc16, @function ``` Why?