Skip to content

Instantly share code, notes, and snippets.

@cyyself
Last active November 1, 2024 06:10
Show Gist options
  • Save cyyself/2e9dd7eba9a46d6dae791e9283488d9c to your computer and use it in GitHub Desktop.
Save cyyself/2e9dd7eba9a46d6dae791e9283488d9c to your computer and use it in GitHub Desktop.

Revisions

  1. cyyself revised this gist Nov 1, 2024. 1 changed file with 26 additions and 1 deletion.
    27 changes: 26 additions & 1 deletion gcc-inline-copy-function.md
    Original file line number Diff line number Diff line change
    @@ -98,7 +98,7 @@ crcu32:
    .cfi_endproc
    ```

    However, if we copy the crc16->crcu16->crcu8 sequence twice, the function inline will still work.
    However, if we duplicate the whole crc16->crcu16->crcu8 chain, inlining still works for the first crc16 call, but not for the second.

    ```bash
    cat > crcu16_1.patch << EOF
    @@ -196,6 +196,17 @@ crcu32:
    xorl %eax, %esi
    subb $1, %cl
    jne .L41
    ....
    movzwl %si, %esi
    movzwl %dx, %edi
    jmp crcu16
    .cfi_endproc
    .LFE19:
    .size crcu32_2, .-crcu32_2
    .p2align 4
    .globl crc16
    .type crc16, @function
    ....
    crcu32_2:
    @@ -219,6 +230,20 @@ crcu32_2:
    jne .L51
    movzbl %dh, %edi
    movl $8, %ecx
    ....
    subb $1, %cl
    jne .L52
    shrl $15, %edx
    movzwl %si, %esi
    movzwl %dx, %edi
    jmp crcu16
    .cfi_endproc
    .LFE19:
    .size crcu32_2, .-crcu32_2
    .p2align 4
    .globl crc16
    .type crc16, @function
    ```

    Why?
  2. cyyself revised this gist Nov 1, 2024. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions gcc-inline-copy-function.md
    Original file line number Diff line number Diff line change
    @@ -33,7 +33,7 @@ patch -p1 < crcu32_1.patch
    gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S
    ```

    Then, look at the generated `core_util.s` file; you will find the inline chain: (crcu32|crcu32_2)->crc16->crcu16->crcu18.
    Then, look at the generated `core_util.s` file; you will find the inline chain: (crcu32|crcu32_2)->crc16->crcu16->crcu8.

    ```asm
    crcu32:
    @@ -98,7 +98,7 @@ crcu32:
    .cfi_endproc
    ```

    But if we copy the crcu16 twice, the inline will still works:
    However, if we copy the crc16->crcu16->crcu8 sequence twice, the function inline will still work.

    ```bash
    cat > crcu16_1.patch << EOF
  3. cyyself revised this gist Nov 1, 2024. 1 changed file with 94 additions and 3 deletions.
    97 changes: 94 additions & 3 deletions gcc-inline-copy-function.md
    Original file line number Diff line number Diff line change
    @@ -105,13 +105,59 @@ cat > crcu16_1.patch << EOF
    diff a/core_util.c b/core_util.c
    --- a/core_util.c
    +++ b/core_util.c
    @@ -201,10 +201,15 @@ crcu32(ee_u32 newval, ee_u16 crc)
    @@ -187,6 +187,31 @@ crcu8(ee_u8 data, ee_u16 crc)
    return crc;
    }
    ee_u16
    +crcu8_2(ee_u8 data, ee_u16 crc)
    +{
    + ee_u8 i = 0, x16 = 0, carry = 0;
    +
    + for (i = 0; i < 8; i++)
    + {
    + x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1));
    + data >>= 1;
    +
    + if (x16 == 1)
    + {
    + crc ^= 0x4002;
    + carry = 1;
    + }
    + else
    + carry = 0;
    + crc >>= 1;
    + if (carry)
    + crc |= 0x8000;
    + else
    + crc &= 0x7fff;
    + }
    + return crc;
    +}
    +ee_u16
    crcu16(ee_u16 newval, ee_u16 crc)
    {
    crc = crcu8((ee_u8)(newval), crc);
    @@ -194,6 +219,13 @@ crcu16(ee_u16 newval, ee_u16 crc)
    return crc;
    }
    ee_u16
    +crcu16_2(ee_u16 newval, ee_u16 crc)
    +{
    + crc = crcu8_2((ee_u8)(newval), crc);
    + crc = crcu8_2((ee_u8)((newval) >> 8), crc);
    + return crc;
    +}
    +ee_u16
    crcu32(ee_u32 newval, ee_u16 crc)
    {
    crc = crc16((ee_s16)newval, crc);
    @@ -201,10 +233,15 @@ crcu32(ee_u32 newval, ee_u16 crc)
    return crc;
    }
    ee_u16
    +crc16_2(ee_s16 newval, ee_u16 crc)
    +{
    + return crcu16((ee_u16)newval, crc);
    + return crcu16_2((ee_u16)newval, crc);
    +}
    +ee_u16
    crcu32_2(ee_u32 newval, ee_u16 crc)
    @@ -128,6 +174,51 @@ patch -p1 < crcu16_1.patch
    gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S
    ```

    In this case, the `crcu32_2` will contain an inlined version of `crcu8`, whereas the `crcu32` will not.
    In this case, inlining works again in both functions.

    ```asm
    crcu32:
    .LFB17:
    .cfi_startproc
    movl %edi, %edx
    movl $8, %ecx
    .p2align 5
    .p2align 4
    .p2align 3
    .L41:
    movl %edi, %eax
    shrb %dil
    xorl %esi, %eax
    shrw %si
    andl $1, %eax
    negl %eax
    andw $-24575, %ax
    xorl %eax, %esi
    subb $1, %cl
    jne .L41
    ....
    crcu32_2:
    .LFB19:
    .cfi_startproc
    movl %edi, %edx
    movl $8, %ecx
    .p2align 5
    .p2align 4
    .p2align 3
    .L51:
    movl %edi, %eax
    shrb %dil
    xorl %esi, %eax
    shrw %si
    andl $1, %eax
    negl %eax
    andw $-24575, %ax
    xorl %eax, %esi
    subb $1, %cl
    jne .L51
    movzbl %dh, %edi
    movl $8, %ecx
    ```

    Why?
  4. cyyself created this gist Nov 1, 2024.
    133 changes: 133 additions & 0 deletions gcc-inline-copy-function.md
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,133 @@
    A function clone may prevent GCC from inlining a callee into the caller.

    I noticed this when testing target_clones features for performance tuning.

    Tested GCC versions: GCC 14.2 and GCC master commit 1de156eb2bb445cd0e0a582944dcd75d085f30c9, on both x86-64 and RISC-V targets.

    An example is shown below:

    ```shell
    git clone https://github.com/eembc/coremark.git
    cd coremark
    git reset d5fad6bd094899101a4e5fd53af7298160ced6ab --hard
    cat > crcu32_1.patch << EOF
    diff a/core_util.c b/core_util.c
    --- a/core_util.c
    +++ b/core_util.c
    @@ -201,6 +201,13 @@ crcu32(ee_u32 newval, ee_u16 crc)
    return crc;
    }
    ee_u16
    +crcu32_2(ee_u32 newval, ee_u16 crc)
    +{
    + crc = crc16((ee_s16)newval, crc);
    + crc = crc16((ee_s16)(newval >> 16), crc);
    + return crc;
    +}
    +ee_u16
    crc16(ee_s16 newval, ee_u16 crc)
    {
    return crcu16((ee_u16)newval, crc);
    EOF
    patch -p1 < crcu32_1.patch
    gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S
    ```

    Then, look at the generated `core_util.s` file; you will find the inline chain: (crcu32|crcu32_2)->crc16->crcu16->crcu18.

    ```asm
    crcu32:
    .LFB15:
    .cfi_startproc
    movl %esi, %eax
    movl %edi, %ecx
    movl $8, %esi
    .p2align 5
    .p2align 4
    .p2align 3
    .L33:
    movl %edi, %edx
    shrb %dil
    xorl %eax, %edx
    shrw %ax
    andl $1, %edx
    negl %edx
    andw $-24575, %dx
    xorl %edx, %eax
    subb $1, %sil
    jne .L33
    movzbl %ch, %edi
    movl $8, %esi
    ```

    However, when the copied function differs slightly from the original (here, one shift amount is changed from 16 to 15), for instance:

    ```shell
    cat > crcu32_2.patch << EOF
    diff a/core_util.c b/core_util.c
    --- a/core_util.c
    +++ b/core_util.c
    @@ -204,7 +204,7 @@ ee_u16
    crcu32_2(ee_u32 newval, ee_u16 crc)
    {
    crc = crc16((ee_s16)newval, crc);
    - crc = crc16((ee_s16)(newval >> 16), crc);
    + crc = crc16((ee_s16)(newval >> 15), crc);
    return crc;
    }
    ee_u16
    EOF
    patch -p1 < crcu32_2.patch
    gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S
    ```

    You will notice that both `crcu32` and `crcu32_2` are now generated without inlining; they call `crcu16` instead.

    ```asm
    crcu32:
    .LFB15:
    .cfi_startproc
    movl %edi, %r8d
    movzwl %si, %esi
    movzwl %di, %edi
    call crcu16
    movl %r8d, %edi
    movzwl %ax, %esi
    shrl $16, %edi
    jmp crcu16
    .cfi_endproc
    ```

    But if we copy the crcu16 twice, the inline will still works:

    ```bash
    cat > crcu16_1.patch << EOF
    diff a/core_util.c b/core_util.c
    --- a/core_util.c
    +++ b/core_util.c
    @@ -201,10 +201,15 @@ crcu32(ee_u32 newval, ee_u16 crc)
    return crc;
    }
    ee_u16
    +crc16_2(ee_s16 newval, ee_u16 crc)
    +{
    + return crcu16((ee_u16)newval, crc);
    +}
    +ee_u16
    crcu32_2(ee_u32 newval, ee_u16 crc)
    {
    - crc = crc16((ee_s16)newval, crc);
    - crc = crc16((ee_s16)(newval >> 15), crc);
    + crc = crc16_2((ee_s16)newval, crc);
    + crc = crc16_2((ee_s16)(newval >> 15), crc);
    return crc;
    }
    ee_u16
    EOF
    patch -p1 < crcu16_1.patch
    gcc -O2 -Ilinux -Iposix -I. -DFLAGS_STR=\""-O2 -lrt"\" -DITERATIONS=0 core_util.c -S
    ```

    In this case, the `crcu32_2` will contain an inlined version of `crcu8`, whereas the `crcu32` will not.

    Why?