Last active
November 1, 2024 06:10
-
-
Save cyyself/2e9dd7eba9a46d6dae791e9283488d9c to your computer and use it in GitHub Desktop.
Revisions
-
cyyself revised this gist
Nov 1, 2024 . 1 changed file with 26 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -98,7 +98,7 @@ crcu32: .cfi_endproc ``` However, if we copy the crc16->crcu16->crcu8 sequence twice, inlining still works for the first crc16 call, but not for the second. ```bash cat > crcu16_1.patch << EOF @@ -196,6 +196,17 @@ crcu32: xorl %eax, %esi subb $1, %cl jne .L41 .... movzwl %si, %esi movzwl %dx, %edi jmp crcu16 .cfi_endproc .LFE19: .size crcu32_2, .-crcu32_2 .p2align 4 .globl crc16 .type crc16, @function .... crcu32_2: @@ -219,6 +230,20 @@ crcu32_2: jne .L51 movzbl %dh, %edi movl $8, %ecx .... subb $1, %cl jne .L52 shrl $15, %edx movzwl %si, %esi movzwl %dx, %edi jmp crcu16 .cfi_endproc .LFE19: .size crcu32_2, .-crcu32_2 .p2align 4 .globl crc16 .type crc16, @function ``` Why? -
cyyself revised this gist
Nov 1, 2024 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -33,7 +33,7 @@ patch -p1 < crcu32_1.patch gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S ``` Then, look at the generated `core_util.s` file; you will find the inline chain: (crcu32|crcu32_2)->crc16->crcu16->crcu8. ```asm crcu32: @@ -98,7 +98,7 @@ crcu32: .cfi_endproc ``` However, if we copy the crc16->crcu16->crcu8 sequence twice, function inlining will still work. ```bash cat > crcu16_1.patch << EOF -
cyyself revised this gist
Nov 1, 2024 . 1 changed file with 94 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -105,13 +105,59 @@ cat > crcu16_1.patch << EOF diff a/core_util.c b/core_util.c --- a/core_util.c +++ b/core_util.c @@ -187,6 +187,31 @@ crcu8(ee_u8 data, ee_u16 crc) return crc; } ee_u16 +crcu8_2(ee_u8 data, ee_u16 crc) +{ + ee_u8 i = 0, x16 = 0, carry = 0; + + for (i = 0; i < 8; i++) + { + x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1)); + data >>= 1; + + if (x16 == 1) + { + crc ^= 0x4002; + carry = 1; + } + else + carry = 0; + crc >>= 1; + if (carry) + crc |= 0x8000; + else + crc &= 0x7fff; + } + return crc; +} +ee_u16 crcu16(ee_u16 newval, ee_u16 crc) { crc = crcu8((ee_u8)(newval), crc); @@ -194,6 +219,13 @@ crcu16(ee_u16 newval, ee_u16 crc) return crc; } ee_u16 +crcu16_2(ee_u16 newval, ee_u16 crc) +{ + crc = crcu8_2((ee_u8)(newval), crc); + crc = crcu8_2((ee_u8)((newval) >> 8), crc); + return crc; +} +ee_u16 crcu32(ee_u32 newval, ee_u16 crc) { crc = crc16((ee_s16)newval, crc); @@ -201,10 +233,15 @@ crcu32(ee_u32 newval, ee_u16 crc) return crc; } ee_u16 +crc16_2(ee_s16 newval, ee_u16 crc) +{ + return crcu16_2((ee_u16)newval, crc); +} +ee_u16 crcu32_2(ee_u32 newval, ee_u16 crc) @@ -128,6 +174,51 @@ patch -p1 < crcu16_1.patch gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S ``` In this case, both functions are inlined again. ```asm crcu32: .LFB17: .cfi_startproc movl %edi, %edx movl $8, %ecx .p2align 5 .p2align 4 .p2align 3 .L41: movl %edi, %eax shrb %dil xorl %esi, %eax shrw %si andl $1, %eax negl %eax andw $-24575, %ax xorl %eax, %esi subb $1, %cl jne .L41 .... 
crcu32_2: .LFB19: .cfi_startproc movl %edi, %edx movl $8, %ecx .p2align 5 .p2align 4 .p2align 3 .L51: movl %edi, %eax shrb %dil xorl %esi, %eax shrw %si andl $1, %eax negl %eax andw $-24575, %ax xorl %eax, %esi subb $1, %cl jne .L51 movzbl %dh, %edi movl $8, %ecx ``` Why? -
cyyself created this gist
Nov 1, 2024 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,133 @@ A function clone may prevent GCC from inlining a callee into the caller. I noticed this when testing target_clones features for performance tuning. Tested GCC version: GCC 14.2 and GCC master commit 1de156eb2bb445cd0e0a582944dcd75d085f30c9 on both x86-64 and RISC-V targets. An example is shown below: ```shell git clone https://github.com/eembc/coremark.git cd coremark git reset d5fad6bd094899101a4e5fd53af7298160ced6ab --hard cat > crcu32_1.patch << EOF diff a/core_util.c b/core_util.c --- a/core_util.c +++ b/core_util.c @@ -201,6 +201,13 @@ crcu32(ee_u32 newval, ee_u16 crc) return crc; } ee_u16 +crcu32_2(ee_u32 newval, ee_u16 crc) +{ + crc = crc16((ee_s16)newval, crc); + crc = crc16((ee_s16)(newval >> 16), crc); + return crc; +} +ee_u16 crc16(ee_s16 newval, ee_u16 crc) { return crcu16((ee_u16)newval, crc); EOF patch -p1 < crcu32_1.patch gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S ``` Then, look at the generated `core_util.s` file; you will find the inline chain: (crcu32|crcu32_2)->crc16->crcu16->crcu8. 
```asm crcu32: .LFB15: .cfi_startproc movl %esi, %eax movl %edi, %ecx movl $8, %esi .p2align 5 .p2align 4 .p2align 3 .L33: movl %edi, %edx shrb %dil xorl %eax, %edx shrw %ax andl $1, %edx negl %edx andw $-24575, %dx xorl %edx, %eax subb $1, %sil jne .L33 movzbl %ch, %edi movl $8, %esi ``` However, when you copy the function with some parameters changed, for instance: ```shell cat > crcu32_2.patch << EOF diff a/core_util.c b/core_util.c --- a/core_util.c +++ b/core_util.c @@ -204,7 +204,7 @@ ee_u16 crcu32_2(ee_u32 newval, ee_u16 crc) { crc = crc16((ee_s16)newval, crc); - crc = crc16((ee_s16)(newval >> 16), crc); + crc = crc16((ee_s16)(newval >> 15), crc); return crc; } ee_u16 EOF patch -p1 < crcu32_2.patch gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S ``` You will notice that the `crcu32` and `crcu32_2` functions are now generated as machine code without inlining. ```asm crcu32: .LFB15: .cfi_startproc movl %edi, %r8d movzwl %si, %esi movzwl %di, %edi call crcu16 movl %r8d, %edi movzwl %ax, %esi shrl $16, %edi jmp crcu16 .cfi_endproc ``` But if we copy the crcu16 twice, inlining will still work: ```bash cat > crcu16_1.patch << EOF diff a/core_util.c b/core_util.c --- a/core_util.c +++ b/core_util.c @@ -201,10 +201,15 @@ crcu32(ee_u32 newval, ee_u16 crc) return crc; } ee_u16 +crc16_2(ee_s16 newval, ee_u16 crc) +{ + return crcu16((ee_u16)newval, crc); +} +ee_u16 crcu32_2(ee_u32 newval, ee_u16 crc) { - crc = crc16((ee_s16)newval, crc); - crc = crc16((ee_s16)(newval >> 15), crc); + crc = crc16_2((ee_s16)newval, crc); + crc = crc16_2((ee_s16)(newval >> 15), crc); return crc; } ee_u16 EOF patch -p1 < crcu16_1.patch gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S ``` In this case, the `crcu32_2` will contain an inlined version of `crcu8`, whereas the `crcu32` will not. Why?