cyyself · November 1, 2024 06:10 · Nov 1, 2024 · Nov 1, 2024 · Nov 1, 2024 · Nov 1, 2024
diff --git a/gcc-inline-copy-function.md b/gcc-inline-copy-function.md
@@ -98,7 +98,7 @@ crcu32:
 	.cfi_endproc
 ```
 
-However, if we copy the crc16->crcu16->crcu8 sequence twice, the function inline will still work.
+However, if we duplicate the whole crc16->crcu16->crcu8 chain, inlining still works for the first crc16 call but not for the second.
 
 ```bash
 cat > crcu16_1.patch << EOF
@@ -196,6 +196,17 @@ crcu32:
 	xorl	%eax, %esi
 	subb	$1, %cl
 	jne	.L41
+....
+	movzwl	%si, %esi
+	movzwl	%dx, %edi
+	jmp	crcu16
+	.cfi_endproc
+.LFE19:
+	.size	crcu32_2, .-crcu32_2
+	.p2align 4
+	.globl	crc16
+	.type	crc16, @function
+
 ....
 
 crcu32_2:
@@ -219,6 +230,20 @@ crcu32_2:
 	jne	.L51
 	movzbl	%dh, %edi
 	movl	$8, %ecx
+
+....
+	subb	$1, %cl
+	jne	.L52
+	shrl	$15, %edx
+	movzwl	%si, %esi
+	movzwl	%dx, %edi
+	jmp	crcu16
+	.cfi_endproc
+.LFE19:
+	.size	crcu32_2, .-crcu32_2
+	.p2align 4
+	.globl	crc16
+	.type	crc16, @function
 ```
 
 Why?
diff --git a/gcc-inline-copy-function.md b/gcc-inline-copy-function.md
@@ -33,7 +33,7 @@ patch -p1 < crcu32_1.patch
 gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2   -lrt"\" -DITERATIONS=0 core_util.c -S
 ```
 
-Then, look at the generated `core_util.s` file; you will find the inline chain: (crcu32|crcu32_2)->crc16->crcu16->crcu18.
+Then, look at the generated `core_util.s` file; you will find the inline chain: (crcu32|crcu32_2)->crc16->crcu16->crcu8.
 
 ```asm
 crcu32:
@@ -98,7 +98,7 @@ crcu32:
 	.cfi_endproc
 ```
 
-But if we copy the crcu16 twice, the inline will still works:
+However, if we copy the crc16->crcu16->crcu8 sequence twice, the function inline will still work.
 
 ```bash
 cat > crcu16_1.patch << EOF

diff --git a/gcc-inline-copy-function.md b/gcc-inline-copy-function.md
@@ -105,13 +105,59 @@ cat > crcu16_1.patch << EOF
 diff a/core_util.c b/core_util.c
 --- a/core_util.c
 +++ b/core_util.c
-@@ -201,10 +201,15 @@ crcu32(ee_u32 newval, ee_u16 crc)
+@@ -187,6 +187,31 @@ crcu8(ee_u8 data, ee_u16 crc)
+     return crc;
+ }
+ ee_u16
++crcu8_2(ee_u8 data, ee_u16 crc)
++{
++    ee_u8 i = 0, x16 = 0, carry = 0;
++
++    for (i = 0; i < 8; i++)
++    {
++        x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1));
++        data >>= 1;
++
++        if (x16 == 1)
++        {
++            crc ^= 0x4002;
++            carry = 1;
++        }
++        else
++            carry = 0;
++        crc >>= 1;
++        if (carry)
++            crc |= 0x8000;
++        else
++            crc &= 0x7fff;
++    }
++    return crc;
++}
++ee_u16
+ crcu16(ee_u16 newval, ee_u16 crc)
+ {
+     crc = crcu8((ee_u8)(newval), crc);
+@@ -194,6 +219,13 @@ crcu16(ee_u16 newval, ee_u16 crc)
+     return crc;
+ }
+ ee_u16
++crcu16_2(ee_u16 newval, ee_u16 crc)
++{
++    crc = crcu8_2((ee_u8)(newval), crc);
++    crc = crcu8_2((ee_u8)((newval) >> 8), crc);
++    return crc;
++}
++ee_u16
+ crcu32(ee_u32 newval, ee_u16 crc)
+ {
+     crc = crc16((ee_s16)newval, crc);
+@@ -201,10 +233,15 @@ crcu32(ee_u32 newval, ee_u16 crc)
      return crc;
  }
  ee_u16
 +crc16_2(ee_s16 newval, ee_u16 crc)
 +{
-+    return crcu16((ee_u16)newval, crc);
++    return crcu16_2((ee_u16)newval, crc);
 +}
 +ee_u16
  crcu32_2(ee_u32 newval, ee_u16 crc)
@@ -128,6 +174,51 @@ patch -p1 < crcu16_1.patch
 gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2   -lrt"\" -DITERATIONS=0 core_util.c -S
 ```
 
-In this case, the `crcu32_2` will contain an inlined version of `crcu8`, whereas the `crcu32` will not.
+In this case, inlining is restored in both functions.
+
+```asm
+crcu32:
+.LFB17:
+	.cfi_startproc
+	movl	%edi, %edx
+	movl	$8, %ecx
+	.p2align 5
+	.p2align 4
+	.p2align 3
+.L41:
+	movl	%edi, %eax
+	shrb	%dil
+	xorl	%esi, %eax
+	shrw	%si
+	andl	$1, %eax
+	negl	%eax
+	andw	$-24575, %ax
+	xorl	%eax, %esi
+	subb	$1, %cl
+	jne	.L41
+....
+
+crcu32_2:
+.LFB19:
+	.cfi_startproc
+	movl	%edi, %edx
+	movl	$8, %ecx
+	.p2align 5
+	.p2align 4
+	.p2align 3
+.L51:
+	movl	%edi, %eax
+	shrb	%dil
+	xorl	%esi, %eax
+	shrw	%si
+	andl	$1, %eax
+	negl	%eax
+	andw	$-24575, %ax
+	xorl	%eax, %esi
+	subb	$1, %cl
+	jne	.L51
+	movzbl	%dh, %edi
+	movl	$8, %ecx
+```
 
 Why?
diff --git a/gcc-inline-copy-function.md b/gcc-inline-copy-function.md
@@ -0,0 +1,133 @@
+A function clone may prevent GCC from inlining a callee into the caller.
+
+I noticed this when testing target_clones features for performance tuning.
+
+Tested GCC versions: GCC 14.2 and GCC master commit 1de156eb2bb445cd0e0a582944dcd75d085f30c9, on both x86-64 and RISC-V targets.
+
+An example is shown below:
+
+```shell
+git clone https://github.com/eembc/coremark.git
+cd coremark
+git reset d5fad6bd094899101a4e5fd53af7298160ced6ab --hard
+cat > crcu32_1.patch << EOF
+diff a/core_util.c b/core_util.c
+--- a/core_util.c
++++ b/core_util.c
+@@ -201,6 +201,13 @@ crcu32(ee_u32 newval, ee_u16 crc)
+     return crc;
+ }
+ ee_u16
++crcu32_2(ee_u32 newval, ee_u16 crc)
++{
++    crc = crc16((ee_s16)newval, crc);
++    crc = crc16((ee_s16)(newval >> 16), crc);
++    return crc;
++}
++ee_u16
+ crc16(ee_s16 newval, ee_u16 crc)
+ {
+     return crcu16((ee_u16)newval, crc);
+EOF
+patch -p1 < crcu32_1.patch
+gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2   -lrt"\" -DITERATIONS=0 core_util.c -S
+```
+
+Then, look at the generated `core_util.s` file; you will find the inline chain: (crcu32|crcu32_2)->crc16->crcu16->crcu18.
+
+```asm
+crcu32:
+.LFB15:
+	.cfi_startproc
+	movl	%esi, %eax
+	movl	%edi, %ecx
+	movl	$8, %esi
+	.p2align 5
+	.p2align 4
+	.p2align 3
+.L33:
+	movl	%edi, %edx
+	shrb	%dil
+	xorl	%eax, %edx
+	shrw	%ax
+	andl	$1, %edx
+	negl	%edx
+	andw	$-24575, %dx
+	xorl	%edx, %eax
+	subb	$1, %sil
+	jne	.L33
+	movzbl	%ch, %edi
+	movl	$8, %esi
+```
+
+However, when you copy the function with some parameters changed, for instance:
+
+```shell
+cat > crcu32_2.patch << EOF
+diff a/core_util.c b/core_util.c
+--- a/core_util.c
++++ b/core_util.c
+@@ -204,7 +204,7 @@ ee_u16
+ crcu32_2(ee_u32 newval, ee_u16 crc)
+ {
+     crc = crc16((ee_s16)newval, crc);
+-    crc = crc16((ee_s16)(newval >> 16), crc);
++    crc = crc16((ee_s16)(newval >> 15), crc);
+     return crc;
+ }
+ ee_u16
+EOF
+patch -p1 < crcu32_2.patch
+gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2   -lrt"\" -DITERATIONS=0 core_util.c -S
+```
+
+You will notice that the `crcu32` and `crcu32_2` functions are now generated without inlining.
+
+```asm
+crcu32:
+.LFB15:
+	.cfi_startproc
+	movl	%edi, %r8d
+	movzwl	%si, %esi
+	movzwl	%di, %edi
+	call	crcu16
+	movl	%r8d, %edi
+	movzwl	%ax, %esi
+	shrl	$16, %edi
+	jmp	crcu16
+	.cfi_endproc
+```
+
+But if we copy the crcu16 twice, the inline will still works:
+
+```bash
+cat > crcu16_1.patch << EOF
+diff a/core_util.c b/core_util.c
+--- a/core_util.c
++++ b/core_util.c
+@@ -201,10 +201,15 @@ crcu32(ee_u32 newval, ee_u16 crc)
+     return crc;
+ }
+ ee_u16
++crc16_2(ee_s16 newval, ee_u16 crc)
++{
++    return crcu16((ee_u16)newval, crc);
++}
++ee_u16
+ crcu32_2(ee_u32 newval, ee_u16 crc)
+ {
+-    crc = crc16((ee_s16)newval, crc);
+-    crc = crc16((ee_s16)(newval >> 15), crc);
++    crc = crc16_2((ee_s16)newval, crc);
++    crc = crc16_2((ee_s16)(newval >> 15), crc);
+     return crc;
+ }
+ ee_u16
+EOF
+patch -p1 < crcu16_1.patch
+gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2   -lrt"\" -DITERATIONS=0 core_util.c -S
+```
+
+In this case, the `crcu32_2` will contain an inlined version of `crcu8`, whereas the `crcu32` will not.
+
+Why?