@SmoothHacker
Forked from dougallj/aarch64_amx.py
Created December 7, 2022 04:24

Revisions

  1. @dougallj revised this gist Sep 5, 2022. 1 changed file with 6 additions and 0 deletions.
    6 changes: 6 additions & 0 deletions aarch64_amx.py
    @@ -27,6 +27,12 @@
    # APIs you can get the performance benefits (fast multiplication of big
    # matrices). This is separate from the Apple Neural Engine.
    #
    #############################################################################
    # UPDATE: See Pete Cawley's complete documentation of the AMX instructions #
    # at https://github.com/corsix/amx/blob/main/README.md - this covers a lot #
    # more than my notes (except possibly for performance details, for now). #
    #############################################################################
    #
    # Warning: This is a work in progress, some of this is going to be incorrect.
    #
    # This may actually be very similar to Intel Advanced Matrix Extension (AMX),
  2. @dougallj revised this gist Oct 7, 2021. 1 changed file with 8 additions and 0 deletions.
    8 changes: 8 additions & 0 deletions aarch64_amx.py
    @@ -321,6 +321,14 @@
    # str before AMXSTX (aliasing): 115 cycles/iter
    # ldr before AMXSTX (no-aliasing): 31 cycles/iter
    # ldr before AMXSTX (aliasing): 112 cycles/iter
    #
    #
    #
    # Hardware
    #
    # I know even less about this, but my guesses at the floorplan
    # locations of the AMX coprocessors are in this Twitter thread:
    # https://twitter.com/dougallj/status/1446097016166051848

    import idaapi
    import ida_hexrays
  3. @dougallj revised this gist Oct 7, 2021. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions aarch64_amx.py
    @@ -265,9 +265,9 @@
    # FMAs, giving a throughput of one FMA32 or FMA64 instruction per cycle, but
    # only one FMA16 instruction every two cycles.
    #
    # The efficiency-variant still has 4-cycle FMA32/FMA64 latency, but it is not
    # pipelined, so can only perform one FMA32 or FMA64 instruction every four
    # cycles, or one FMA16 instruction every eight cycles.
    # The efficiency-variant still has 4-cycle FMA32/FMA64 latency, but it can
    # only perform one FMA32 or FMA64 instruction every four cycles, or one
    # FMA16 instruction every eight cycles.
    #
    # To achieve 1-cycle throughput from a single core, the destinations must be
    # independent (using a Z offset). Operations which use too much of the Z
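
    As a rough sense of scale for these throughput figures, here is a
    back-of-the-envelope sketch (the 3.2 GHz clock is an assumed value, not a
    measurement from these notes, and one fused multiply-add is counted as
    2 FLOPs):

        # Peak FP32 rates implied by the figures above.
        # Assumption: a ~3.2 GHz clock for both variants (the notes only say
        # "around the same clock speed as the cores", and the efficiency
        # cores likely run slower than this).
        CLOCK_HZ = 3.2e9
        FMAS_PER_FMA32 = 16 * 16  # one FMA32 instruction = 256 FP32 FMAs

        perf = CLOCK_HZ * 1.0 * FMAS_PER_FMA32 * 2         # 1 instr/cycle
        eff = CLOCK_HZ * (1.0 / 4.0) * FMAS_PER_FMA32 * 2  # 1 per 4 cycles
        print("performance-variant: %.2f TFLOPS" % (perf / 1e12))  # ~1.64
        print("efficiency-variant:  %.2f TFLOPS" % (eff / 1e12))   # ~0.41
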
  4. @dougallj revised this gist Oct 5, 2021. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion aarch64_amx.py
    @@ -277,7 +277,7 @@
    # For example, an expanding 16-bit -> 32-bit FMA uses the full Z register
    # (for 1024 32-bit results), so has a throughput of one every four cycles,
    # but this can be doubled to one every two cycles by running on two
    # perfomance cores.
    # performance cores.
    #
    # There is some out-of-order execution capability on the co-processor (currently
    # estimated at a 28 to 32 operation buffer, with very low confidence).
  5. @dougallj revised this gist Oct 5, 2021. 1 changed file with 5 additions and 4 deletions.
    9 changes: 5 additions & 4 deletions aarch64_amx.py
    @@ -255,10 +255,11 @@
    #
    # The M1 has two AMX coprocessors, one used by the four Firestorm cores, and
    # another used by the four Icestorm cores. Each coprocessor stores four copies
    # of the architectural register state, one for each core. These access data
    # of the architectural register state, one for each core. These access memory
    # via the same L2 cache as the cores, and seem to run at around the same clock
    # speed as the cores. Much like the cores themselves, the coprocessors are
    # different, and have different performance characteristics.
    # speed as the cores. Much like the power and efficiency cores, the two
    # coprocessors are designed differently, and have different performance
    # characteristics.
    #
    # The performance-variant consists of an array of 4-cycle latency, pipelined
    # FMAs, giving a throughput of one FMA32 or FMA64 instruction per cycle, but
    @@ -275,7 +276,7 @@
    #
    # For example, an expanding 16-bit -> 32-bit FMA uses the full Z register
    # (for 1024 32-bit results), so has a throughput of one every four cycles,
    # but this can be doubled to one every two cycles by running on two (or more)
    # but this can be doubled to one every two cycles by running on two
    # perfomance cores.
    #
    # There is some out-of-order execution capability on the co-processor (currently
  6. @dougallj revised this gist Oct 5, 2021. 1 changed file with 4 additions and 4 deletions.
    8 changes: 4 additions & 4 deletions aarch64_amx.py
    @@ -261,12 +261,12 @@
    # different, and have different performance characteristics.
    #
    # The performance-variant consists of an array of 4-cycle latency, pipelined
    # FMAs, giving a throughput of one FMA32/FMA64-instruction per cycle, but only
    # one FMA16-instruction every two cycles for 16-bit.
    # FMAs, giving a throughput of one FMA32 or FMA64 instruction per cycle, but
    # only one FMA16 instruction every two cycles.
    #
    # The efficiency-variant still has 4-cycle FMA32/FMA64 latency, but it is not
    # pipelined, so can only perform one FMA32/FMA64-instruction every four cycles,
    # or one FMA16-instruction every eight cycles.
    # pipelined, so can only perform one FMA32 or FMA64 instruction every four
    # cycles, or one FMA16 instruction every eight cycles.
    #
    # To achieve 1-cycle throughput from a single core, the destinations must be
    # independent (using a Z offset). Operations which use too much of the Z
  7. @dougallj revised this gist Oct 5, 2021. 1 changed file with 31 additions and 17 deletions.
    48 changes: 31 additions & 17 deletions aarch64_amx.py
    @@ -244,35 +244,49 @@
    #
    # Performance characteristics:
    #
    # This is trickier and still needs some work, but it appears to function as a
    # co-processor, with operations being posted to it from the main processor.
    # AMX functions as a non-speculative coprocessor, with operations posted to it
    # via the store units of the CPU cores. Non-load/store AMX instructions can be
    # fused with other non-load/store AMX instructions, to only use one store
    # port.
    #
    # Because it doesn't go through the main processor's out-of-order execution, it
    # can be beneficial to add prefetch instructions (which do) immediately before
    # AMX stores (and loads).
    #
    # All AMX instructions are sent via the store ports. Non-load/store AMX
    # instructions can be fused with other non-load/store AMX instructions,
    # to only use one store port.
    #
    # There is probably only one AMX coprocessor on the M1, as multiple threads
    # trying to use AMX at the same time cause a slow-down. (It presumably must
    # store the registers for each core?)
    #
    # The M1 has two AMX coprocessors, one used by the four Firestorm cores, and
    # another used by the four Icestorm cores. Each coprocessor stores four copies
    # of the architectural register state, one for each core. These access data
    # via the same L2 cache as the cores, and seem to run at around the same clock
    # speed as the cores. Much like the cores themselves, the coprocessors are
    # different, and have different performance characteristics.
    #
    # The performance-variant consists of an array of 4-cycle latency, pipelined
    # FMAs, giving a throughput of one FMA32/FMA64-instruction per cycle, but only
    # one FMA16-instruction every two cycles for 16-bit.
    #
    # The efficiency-variant still has 4-cycle FMA32/FMA64 latency, but it is not
    # pipelined, so can only perform one FMA32/FMA64-instruction every four cycles,
    # or one FMA16-instruction every eight cycles.
    #
    # To achieve 1-cycle throughput from a single core, the destinations must be
    # independent (using a Z offset). Operations which use too much of the Z
    # register will have lower throughput. Throughput can also be improved using
    # different cores (and therefore entirely different Z registers).
    #
    # For example, an expanding 16-bit -> 32-bit FMA uses the full Z register
    # (for 1024 32-bit results), so has a throughput of one every four cycles,
    # but this can be doubled to one every two cycles by running on two (or more)
    # perfomance cores.
    #
    # There is some out-of-order execution capability on the co-processor (currently
    # estimated at a 28 to 32 operation buffer, with very low confidence).
    #
    # An FMA typically has a 4-cycle latency, but a throughput of 1 per cycle
    # (e.g. 16*16 = 256 32-bit float fused multiply-adds per cycle). This throughput
    # is only possible if the destinations can be independent (using a z offset), so
    # operations which use too much of the Z register will have lower throughput, and
    # operations which use less may have higher throughput (TODO: test).
    #
    # Loads and stores seem to be the bottleneck. They can generate faults on bad
    # addresses/alignments as expected, but appear to go via L2, with slight penalties
    # for data in L1.
    #
    # Mixing loads and stores on the main processor with co-processor loads and stores
    # causes big slowdowns, presumably as some kinds of barriers are needed to ensure a
    # causes big slowdowns, presumably due to the synchronization needed to ensure a
    # consistent view of memory.
    #
    # At best I've seen loads of 0x80 bytes to x and 0x80 bytes to y in 9 cycles
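
    The "uses too much of the Z register" effect above can be made concrete
    with a little arithmetic over the register sizes (Z is 0x1000 bytes);
    this sketch just restates the notes and adds no new measurements:

        # Fraction of the 0x1000-byte Z register occupied by one operation's
        # result tile; anything below 1.0 leaves room for independent
        # destinations at other Z offsets, which is what lets back-to-back
        # operations pipeline.
        Z_BYTES = 0x1000

        def z_fraction(num_results, bytes_per_result):
            return num_results * bytes_per_result / Z_BYTES

        print(z_fraction(16 * 16, 4))  # FMA32: 0.25 -> 4 independent tiles
        print(z_fraction(8 * 8, 8))    # FMA64: 0.125 -> 8 independent tiles
        print(z_fraction(32 * 32, 4))  # expanding 16->32 FMA: 1.0, fills Z
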
  8. @dougallj revised this gist Mar 18, 2021. 1 changed file with 8 additions and 0 deletions.
    8 changes: 8 additions & 0 deletions aarch64_amx.py
    @@ -250,6 +250,14 @@
    # can be beneficial to add prefetch instructions (which do) immediately before
    # AMX stores (and loads).
    #
    # All AMX instructions are sent via the store ports. Non-load/store AMX
    # instructions can be fused with other non-load/store AMX instructions,
    # to only use one store port.
    #
    # There is probably only one AMX coprocessor on the M1, as multiple threads
    # trying to use AMX at the same time cause a slow-down. (It presumably must
    # store the registers for each core?)
    #
    # There is some out-of-order execution capability on the co-processor (currently
    # estimated at a 28 to 32 operation buffer, with very low confidence).
    #
  9. @dougallj revised this gist Feb 5, 2021. 1 changed file with 13 additions and 2 deletions.
    15 changes: 13 additions & 2 deletions aarch64_amx.py
    @@ -165,6 +165,8 @@
    # clear "z" flag (don't add): (argument >> 27) & 1
    # skip y input (don't mul): (argument >> 28) & 1
    # skip x input (don't mul): (argument >> 29) & 1
    # row disable: (argument >> 32) & 0x7F
    # col disable: (argument >> 41) & 0x7F
    # 32-bit mode: (argument >> 62) & 1
    # vector (non-matrix) multiply add (16x16->16 in one row): (argument >> 63) & 1
    # TODO: there are operands in other bits that still need to be reversed
    @@ -173,7 +175,15 @@
    #
    # if bit 62 is zero, the output is in every second row, and if bit 27 is also
    # set, only every second row gets zeroed (old values remain in the other rows)
    #
    #
    # row/column disable skips the operation for certain entries in the row/column:
    # if disable is 0: process all entries
    # if disable is 1: process only every second entry (starting from the index 1)
    # if disable is 2: process only every second entry (starting from the index 0)
    # if (disable & 0x60) is 0x20: process only the entry at index "disable & 0x1F"
    # if (disable & 0x60) is 0x40: process only the first "disable & 0x1F" entries
    # if (disable & 0x60) is 0x60: process only the last "disable & 0x1F" entries
    #
    # for 32-bit output (sign extend all inputs to 32-bit):
    # z += [
    # [x0, x2, x4, x6, x8, x10, x12, x14, x16, x18, x20, x22, x24, x26, x28, x30] * y0,
    @@ -198,7 +208,8 @@
    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # ...
    # ]
    #
    #
    #
    # op15: multiply and add 16-bit floats (amxfma16)
    # (same as op14, but for 16-bit floats)
    # bit 62 makes output 32-bit floats, rather than 16-bit floats
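
    The row/column disable semantics above transcribe directly into a
    predicate. This is a sketch of one reading of the notes: the entry count
    n (32 for 16-bit elements) and the fallback for encodings not listed are
    assumptions.

        def is_processed(disable, index, n=32):
            # disable: the 7-bit row/col disable field; index: 0..n-1
            count = disable & 0x1F
            if disable == 0:
                return True                # process all entries
            if disable == 1:
                return index % 2 == 1      # every second, starting at 1
            if disable == 2:
                return index % 2 == 0      # every second, starting at 0
            if disable & 0x60 == 0x20:
                return index == count      # only the entry at that index
            if disable & 0x60 == 0x40:
                return index < count       # only the first count entries
            if disable & 0x60 == 0x60:
                return index >= n - count  # only the last count entries
            return True                    # assumed: treat others as "all"
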
  10. @dougallj revised this gist Dec 29, 2020. 1 changed file with 146 additions and 31 deletions.
    177 changes: 146 additions & 31 deletions aarch64_amx.py
    @@ -1,29 +1,43 @@
    # TODO: XZR can be an operand, but I don't handle that correctly yet


    # IDA (disassembler) and Hex-Rays (decompiler) plugin for Apple AMX
    #
    # WIP research. (This was edited to add more info after someone posted it to
    # Hacker News. Click "Revisions" to see full changes.)
    #
    # Copyright (c) 2020 dougallj


    # Based on Python port of VMX intrinsics plugin:
    # Copyright (c) 2019 w4kfu - Synacktiv

    # Based on AArch64 8.3-A Pointer Authentication plugin:
    # Copyright (c) 2018 Eloi Benoist-Vanderbeken - Synacktiv
    # Copyright (c) 2018 xerub

    # TODO: XZR can be an operand, but I don't handle that correctly in
    # the decompiler yet.


    # AMX: Apple Matrix coprocessor
    #
    # This is an undocumented arm64 ISA extension present on the Apple M1. These
    # instructions have been reversed from Accelerate (vImage, libBLAS, libBNNS,
    # libvDSP and libLAPACK all use them), and by experimenting with their
    # behaviour on the M1. Some of this is probably wrong - I haven't reversed
    # a ton.
    # behaviour on the M1. Apple has not published a compiler, assembler, or
    # disassembler, but by calling into the public Accelerate framework
    # APIs you can get the performance benefits (fast multiplication of big
    # matrices). This is separate from the Apple Neural Engine.
    #
    # Warning: This is a work in progress, some of this is going to be incorrect.
    #
    # This may actually be very similar to Intel Advanced Matrix Extension (AMX),
    # making the name collision even more confusing, but it's not a bad place to
    # look for some idea of what's probably going on.
    #
    #
    # WIP simulator/hardware tests are at:
    # https://gist.github.com/dougallj/7cba721da1a94da725ee37c1e9cd1f21
    #
    #
    # The coprocessor state consists of two 0x200 byte "registers", amx0 ("x")
    # and amx1 ("y"), and one 0x1000 byte register amx2 ("z"). (Apple headers
    # describe x, y, and z as register groups, where each row of 64-bytes is a
    @@ -54,8 +68,8 @@
    # This register is typically a bitfield containing further parameters to the
    # operation. For example, loads and stores have a 56-bit address in bits 0
    # through 55, a 5-bit register offset (in units of 0x40) in bits 56
    # through 61, and a 1-bit flag in bit 61 (acting as an 0x40 byte load/store
    # when zero, or an 0x80 byte load/store when one).
    # through 61, and a 1-bit flag in bit 62 (acting as an 0x40 byte load/store
    # when zero, or an 0x80 byte (but aligned) load/store when one).
    #
    # My best guess at the names is based on:
    # https://www.realworldtech.com/forum/?threadid=187087&curpostid=187120
    @@ -85,14 +99,38 @@
    # amx2 (z), but 8 and 9 have their result in amx0 and amx1 (x/y), and 22 seems
    # to have its result in row 0 (bytes 0 through 0x3F) of amx0.
    #
    # 8 does an operation, result in amx0 (amxextrx?)
    # 9 does an operation, result in amx1 (amxextry?)
    # op8: extract row or move to x, result in amx0 (amxextrx)
    #
    # move a 64-byte row from z or y to x
    # operands:
    # x offset in bytes = (argument >> 10) & 0x1FF
    # z offset in rows = (argument >> 20) & 63
    # move from y = (argument >> 27) & 1
    # if moving from y, the x offset is rounded down to 0x40 bytes (so it can only
    # store to a row, rather than an arbitrary byte offset in x)
    #
    # TODO: other bits
    #
    # op9: extract column or move to y, result in amx1/amx0 (amxextry)
    #
    # move a 64-byte column from z to x or y, or move a 64-byte row from x to y
    #
    # y offset in bytes = argument & 0x1FF
    # z offset in columns = (argument >> 20) & 63
    # move from x = (argument >> 27) & 1
    #
    # TODO: many other bits factor into how the layout and order of columns is
    # determined, and which register is the destination. i'd like to finish
    # reversing it before trying to specify it, but my current understanding
    # is recorded in amx_state_extry at:
    #
    # https://gist.github.com/dougallj/7cba721da1a94da725ee37c1e9cd1f21
    #
    # op10: multiply and add 64-bit floats (amxfma64)
    #
    # similar to op14, but 8x8 double multiplies for 64 results, added
    # (in groups of 8) to every 8th row of register "z" (z0, z8, z16).
    # no "32-bit mode" operand.
    # no "32-bit mode" flag (?)
    #
    # op11: multiply and subtract 64-bit floats (amxfms64)
    #
    @@ -102,7 +140,7 @@
    #
    # similar to op14, but 16x16 float multiplies for 256 results, added
    # (in groups of 16) to every 4th row of register "z" (z0, z4, z8).
    # no "32-bit mode" operand.
    # no "32-bit mode" flag (?)
    #
    # op13: multiply and subtract 32-bit floats (amxfms32)
    #
    @@ -123,8 +161,12 @@
    # operands:
    # input offset in x (byte): (argument & 0x1FF)
    # input offset in y (byte): ((argument >> 10) & 0x1FF)
    # row offset in z: (argument >> 20) & 63
    # clear "z" flag (don't add): (argument >> 27) & 1
    # skip y input (don't mul): (argument >> 28) & 1
    # skip x input (don't mul): (argument >> 29) & 1
    # 32-bit mode: (argument >> 62) & 1
    # vector (non-matrix) multiply add (16x16->16 in one row): (argument >> 63) & 1
    # TODO: there are operands in other bits that still need to be reversed
    #
    # bit 62 makes output 32-bit ints, rather than 16-bit ints
    @@ -165,13 +207,86 @@
    # (same as op15, but subtracting from register "z" instead of adding)
    #
    # 17 is enable/disable
    # 18 does an operation, result in amx2
    # 19 does an operation, result in amx2
    # 20 does an operation, result in amx2
    # 21 does an operation, result in amx2
    # 22 does an operation, result in amx0[0]
    # 18 does an operation, result in amx2 (vecint)
    # vector multiply 16-bit integers? (doesn't mac16 have a flag for this?)
    # z0[i] += x0[i] * y0[i]
    #

    # 19 does an operation, result in amx2 (vecfp)
    # vector multiply 16-bit floats? (doesn't mac16 have a flag for this?)
    # z0[i] += x0[i] * y0[i]
    #
    # 20 does an operation, result in amx2 (matint)
    # 16-bit integer matrix multiply? (doesn't fma16 do this?)
    #
    # 21 does an operation, result in amx2 (matfp)
    # 16-bit float matrix multiply? (doesn't fma16 do this?)
    #
    # 22 does an operation, result in amx0[0] (genlut)
    #
    # with xzr as input it takes 16 signed 32-bit integers from amx0[0] as input,
    # generates a 64-bit output:
    # [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] -> 0xffffffffffffffff
    # [0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] -> 0xf0
    # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] -> 0xfedcba9876543210
    #
    #
    #
    # Performance characteristics:
    #
    # This is trickier and still needs some work, but it appears to function as a
    # co-processor, with operations being posted to it from the main processor.
    # Because it doesn't go through the main processor's out-of-order execution, it
    # can be beneficial to add prefetch instructions (which do) immediately before
    # AMX stores (and loads).
    #
    # There is some out-of-order execution capability on the co-processor (currently
    # estimated at a 28 to 32 operation buffer, with very low confidence).
    #
    # An FMA typically has a 4-cycle latency, but a throughput of 1 per cycle
    # (e.g. 16*16 = 256 32-bit float fused multiply-adds per cycle). This throughput
    # is only possible if the destinations can be independent (using a z offset), so
    # operations which use too much of the Z register will have lower throughput, and
    # operations which use less may have higher throughput (TODO: test).
    #
    # Loads and stores seem to be the bottleneck. They can generate faults on bad
    # addresses/alignments as expected, but appear to go via L2, with slight penalties
    # for data in L1.
    #
    # Mixing loads and stores on the main processor with co-processor loads and stores
    # causes big slowdowns, presumably as some kinds of barriers are needed to ensure a
    # consistent view of memory.
    #
    # At best I've seen loads of 0x80 bytes to x and 0x80 bytes to y in 9 cycles
    # (running in a loop).
    #
    # Because of the out-of-order capabilities, performing four fmas fits within
    # this 9-cycle window at essentially no extra cost, so the following can run in
    # a loop at 9 cycles per iteration:
    #
    # AMX_LDX(load_addr | 0x4000000000000000);
    # AMX_LDY(load_addr | 0x4000000000000080);
    # AMX_FMA32(0x000000);
    # AMX_FMA32(0x110000);
    # AMX_FMA32(0x200040);
    # AMX_FMA32(0x310040);
    #
    # (this is accumulating a 32x32 tile within a larger matrix multiply)
    #
    #
    #
    # Slowdowns from mixing loads/stores:
    #
    # nop/add before AMXLDX: 9 cycles/iter
    # str before AMXLDX (no-aliasing): 47 cycles/iter
    # str before AMXLDX (aliasing): 93 or 103 cycles/iter
    # ldr before AMXLDX (no-"aliasing"): 11 cycles/iter
    # ldr before AMXLDX ("aliasing"): 66 cycles/iter
    #
    # nop/add before AMXSTX: 28 cycles/iter
    # str before AMXSTX (no-aliasing): 48 cycles/iter
    # str before AMXSTX (aliasing): 115 cycles/iter
    # ldr before AMXSTX (no-aliasing): 31 cycles/iter
    # ldr before AMXSTX (aliasing): 112 cycles/iter

    import idaapi
    import ida_hexrays
    @@ -211,8 +326,8 @@
    AMX_OP5: "AMXSTZ",
    AMX_OP6: "AMXLDZI",
    AMX_OP7: "AMXSTZI",
    AMX_OP8: "AMX8", # amxextrx?
    AMX_OP9: "AMX9", # amxextry?
    AMX_OP8: "AMXEXTRX", # amxextrx?
    AMX_OP9: "AMXEXTRY", # amxextry?
    AMX_OP10: "AMXFMA64",
    AMX_OP11: "AMXFMS64",
    AMX_OP12: "AMXFMA32",
    @@ -221,11 +336,11 @@
    AMX_OP15: "AMXFMA16",
    AMX_OP16: "AMXFMS16",
    AMX_OP17: "AMX17", # amxset / amxclr
    AMX_OP18: "AMX18",
    AMX_OP19: "AMX19",
    AMX_OP20: "AMX20",
    AMX_OP21: "AMX21",
    AMX_OP22: "AMX22",
    AMX_OP18: "AMXVECINT",
    AMX_OP19: "AMXVECFP",
    AMX_OP20: "AMXMATINT",
    AMX_OP21: "AMXMATFP",
    AMX_OP22: "AMXGENLUT",
    }

    OP_INTRINSIC_NAMES = {
    @@ -237,8 +352,8 @@
    AMX_OP5: "__amx_stz",
    AMX_OP6: "__amx_ldzi",
    AMX_OP7: "__amx_stzi",
    AMX_OP8: "__amx_op8_to_x", # amxextrx?
    AMX_OP9: "__amx_op9_to_y", # amxextry?
    AMX_OP8: "__amx_extrx",
    AMX_OP9: "__amx_extry",
    AMX_OP10: "__amx_fma64",
    AMX_OP11: "__amx_fms64",
    AMX_OP12: "__amx_fma32",
    @@ -247,11 +362,11 @@
    AMX_OP15: "__amx_fma16",
    AMX_OP16: "__amx_fms16",
    AMX_OP17: "__amx_op17", # amxset / amxclr
    AMX_OP18: "__amx_op18_to_z",
    AMX_OP19: "__amx_op19_to_z",
    AMX_OP20: "__amx_op20_to_z",
    AMX_OP21: "__amx_op21_to_z",
    AMX_OP22: "__amx_op22_to_x0",
    AMX_OP18: "__amx_vecint",
    AMX_OP19: "__amx_vecfp",
    AMX_OP20: "__amx_matint",
    AMX_OP21: "__amx_matfp",
    AMX_OP22: "__amx_genlut",
    }

    def decode_AMX(d, insn):
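
    As a cross-check, the operand words in the 9-cycle loop example above
    unpack with the mac16-style field layout (x offset in bits 0-8, y offset
    in bits 10-18, Z row in bits 20-25); this sketch assumes FMA32 shares
    that layout, as the "similar to op14" notes suggest:

        def decode_fma_operand(arg):
            # field layout assumed shared with op14 (amxmac16)
            return {
                "x_offset": arg & 0x1FF,          # byte offset into x
                "y_offset": (arg >> 10) & 0x1FF,  # byte offset into y
                "z_row": (arg >> 20) & 63,        # row offset into z
            }

        for arg in (0x000000, 0x110000, 0x200040, 0x310040):
            print(hex(arg), decode_fma_operand(arg))

    Each FMA32 pairs a different 0x40-byte half of x and y and accumulates
    into its own Z row (0 to 3), so the four 16x16 quadrants of the 32x32
    tile stay independent and can pipeline.
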
  11. @dougallj revised this gist Dec 20, 2020. 1 changed file with 8 additions and 3 deletions.
    11 changes: 8 additions & 3 deletions aarch64_amx.py
    @@ -25,7 +25,12 @@
    #
    #
    # The coprocessor state consists of two 0x200 byte "registers", amx0 ("x")
    # and amx1 ("y"), and one 0x1000 byte register amx2 ("z").
    # and amx1 ("y"), and one 0x1000 byte register amx2 ("z"). (Apple headers
    # describe x, y, and z as register groups, where each row of 64-bytes is a
    # "register", and describe only "z" as being "64 registers in an M-by-N
    # matrix". They also describe a 64-bit AMX_STATE_T_EL1 register, which
    # presumably records if AMX is enabled or not, but possibly other state
    # too.)
    #
    # Each is typically loaded/stored from memory in rows of 0x40 bytes,
    # although in some operations the registers can be indexed by byte offsets.
    @@ -76,7 +81,7 @@
    # row index 3 = amx2[2].high and amx2[3].high interleaved
    # etc.
    #
    # Other operands do not touch memory, and usually have their result in
    # Other operations do not touch memory, and usually have their result in
    # amx2 (z), but 8 and 9 have their result in amx0 and amx1 (x/y), and 22 seems
    # to have its result in row 0 (bytes 0 through 0x3F) of amx0.
    #
    @@ -127,7 +132,7 @@
    # if bit 62 is zero, the output is in every second row, and if bit 27 is also
    # set, only every second row gets zeroed (old values remain in the other rows)
    #
    # for 32-bit output (sign extent all values to 32-bit):
    # for 32-bit output (sign extend all inputs to 32-bit):
    # z += [
    # [x0, x2, x4, x6, x8, x10, x12, x14, x16, x18, x20, x22, x24, x26, x28, x30] * y0,
    # [x1, x3, x5, x7, x9, x11, x13, x15, x17, x19, x21, x23, x25, x27, x29, x31] * y0,
  12. @dougallj revised this gist Dec 19, 2020. 1 changed file with 91 additions and 21 deletions.
    112 changes: 91 additions & 21 deletions aarch64_amx.py
    @@ -82,13 +82,83 @@
    #
    # 8 does an operation, result in amx0 (amxextrx?)
    # 9 does an operation, result in amx1 (amxextry?)
    # 10 does an operation, result in amx2
    # 11 does an operation, result in amx2
    # 12 does an operation, result in amx2
    # 13 does an operation, result in amx2
    # 14 does an operation, result in amx2
    # 15 does an operation, result in amx2
    # 16 does an operation, result in amx2
    #
    # op10: multiply and add 64-bit floats (amxfma64)
    #
    # similar to op14, but 8x8 double multiplies for 64 results, added
    # (in groups of 8) to every 8th row of register "z" (z0, z8, z16).
    # no "32-bit mode" operand.
    #
    # op11: multiply and subtract 64-bit floats (amxfms64)
    #
    # same as op10, but subtracting
    #
    # op12: multiply and add 32-bit floats (amxfma32)
    #
    # similar to op14, but 16x16 float multiplies for 256 results, added
    # (in groups of 16) to every 4th row of register "z" (z0, z4, z8).
    # no "32-bit mode" operand.
    #
    # op13: multiply and subtract 32-bit floats (amxfms32)
    #
    # same as op12, but subtracting
    #
    # op14: multiply and add 16-bit signed integers (amxmac16)
    #
    # input two vectors of 32 16-bit values, one from register "x" and the other
    # from register "y". register "z" is the output, but may also be considered an
    # input for "add" operations.
    #
    # each value in the first vector is multiplied with each value in the second
    # vector (giving 32 * 32 = 1024 results), and each result is added to the value
    # in register "z". (although a bit in the input register may be set to skip
    # the addition, and simply store the result, which is typically used on the
    # first iteration of an accumulating loop.)
    #
    # operands:
    # input offset in x (byte): (argument & 0x1FF)
    # input offset in y (byte): ((argument >> 10) & 0x1FF)
    # clear "z" flag (don't add): (argument >> 27) & 1
    # 32-bit mode: (argument >> 62) & 1
    # TODO: there are operands in other bits that still need to be reversed
    #
    # bit 62 makes output 32-bit ints, rather than 16-bit ints
    #
    # if bit 62 is zero, the output is in every second row, and if bit 27 is also
    # set, only every second row gets zeroed (old values remain in the other rows)
    #
    # for 32-bit output (sign extent all values to 32-bit):
    # z += [
    # [x0, x2, x4, x6, x8, x10, x12, x14, x16, x18, x20, x22, x24, x26, x28, x30] * y0,
    # [x1, x3, x5, x7, x9, x11, x13, x15, x17, x19, x21, x23, x25, x27, x29, x31] * y0,
    # [x0, x2, x4, x6, x8, x10, x12, x14, x16, x18, x20, x22, x24, x26, x28, x30] * y1,
    # [x1, x3, x5, x7, x9, x11, x13, x15, x17, x19, x21, x23, x25, x27, x29, x31] * y1,
    # [x0, x2, x4, x6, x8, x10, x12, x14, x16, x18, x20, x22, x24, x26, x28, x30] * y2,
    # [x1, x3, x5, x7, x9, x11, x13, x15, x17, x19, x21, x23, x25, x27, x29, x31] * y2,
    # ...
    # ]
    #
    # note that this works well with the "store z interleaved operation" to get the values out
    # in order.
    #
    # for 16-bit output (although the zeroes aren't really "added" just skipped):
    # z += [
    # [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31] * y0,
    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31] * y1,
    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31] * y2,
    # 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # ...
    # ]
    #
    # op15: multiply and add 16-bit floats (amxfma16)
    # (same as op14, but for 16-bit floats)
    # bit 62 makes output 32-bit floats, rather than 16-bit floats
    #
    # op16: multiply and subtract 16-bit floats (amxfms16)
    # (same as op15, but subtracting from register "z" instead of adding)
    #
    # 17 is enable/disable
    # 18 does an operation, result in amx2
    # 19 does an operation, result in amx2
    @@ -138,13 +208,13 @@
    AMX_OP7: "AMXSTZI",
    AMX_OP8: "AMX8", # amxextrx?
    AMX_OP9: "AMX9", # amxextry?
    AMX_OP10: "AMX10",
    AMX_OP11: "AMX11",
    AMX_OP12: "AMX12",
    AMX_OP13: "AMX13",
    AMX_OP14: "AMX14",
    AMX_OP15: "AMX15",
    AMX_OP16: "AMX16",
    AMX_OP10: "AMXFMA64",
    AMX_OP11: "AMXFMS64",
    AMX_OP12: "AMXFMA32",
    AMX_OP13: "AMXFMS32",
    AMX_OP14: "AMXMAC16",
    AMX_OP15: "AMXFMA16",
    AMX_OP16: "AMXFMS16",
    AMX_OP17: "AMX17", # amxset / amxclr
    AMX_OP18: "AMX18",
    AMX_OP19: "AMX19",
    @@ -164,13 +234,13 @@
    AMX_OP7: "__amx_stzi",
    AMX_OP8: "__amx_op8_to_x", # amxextrx?
    AMX_OP9: "__amx_op9_to_y", # amxextry?
    AMX_OP10: "__amx_op10_to_z",
    AMX_OP11: "__amx_op11_to_z",
    AMX_OP12: "__amx_op12_to_z",
    AMX_OP13: "__amx_op13_to_z",
    AMX_OP14: "__amx_op14_to_z",
    AMX_OP15: "__amx_op15_to_z",
    AMX_OP16: "__amx_op16_to_z",
    AMX_OP10: "__amx_fma64",
    AMX_OP11: "__amx_fms64",
    AMX_OP12: "__amx_fma32",
    AMX_OP13: "__amx_fms32",
    AMX_OP14: "__amx_mac16",
    AMX_OP15: "__amx_fma16",
    AMX_OP16: "__amx_fms16",
    AMX_OP17: "__amx_op17", # amxset / amxclr
    AMX_OP18: "__amx_op18_to_z",
    AMX_OP19: "__amx_op19_to_z",
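
    The 32-bit-mode layout of op14 above can be restated as a tiny model (a
    sketch of my reading of the z += [...] comment, not a verified
    simulation):

        def mac16_32bit(x, y, z, clear_z=False):
            # x, y: 32 int16 values each; z: 64 rows of 16 int32 values.
            # Even x entries land in even rows, odd entries in odd rows,
            # which is why the "store z interleaved" ops recover plain row
            # order.
            for j in range(32):
                for i in range(32):
                    row = 2 * j + (i & 1)
                    col = i >> 1
                    prod = x[i] * y[j]
                    z[row][col] = prod if clear_z else z[row][col] + prod
            return z
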
  13. @dougallj revised this gist Dec 19, 2020. 1 changed file with 15 additions and 14 deletions.
    29 changes: 15 additions & 14 deletions aarch64_amx.py
    @@ -14,7 +14,7 @@
    # AMX: Apple Matrix coprocessor
    #
    # This is an undocumented arm64 ISA extension present on the Apple M1. These
    # instructions has been reversed from Accelerate (vImage, libBLAS, libBNNS,
    # instructions have been reversed from Accelerate (vImage, libBLAS, libBNNS,
    # libvDSP and libLAPACK all use them), and by experimenting with their
    # behaviour on the M1. Some of this is probably wrong - I haven't reversed
    # a ton.
    @@ -24,12 +24,11 @@
    # look for some idea of what's probably going on.
    #
    #
    # The coprocessor state consists of three 0x200 byte "registers", amx0 ("x"),
    # amx1 ("y") and amx2 ("z").
    # The coprocessor state consists of two 0x200 byte "registers", amx0 ("x")
    # and amx1 ("y"), and one 0x1000 byte register amx2 ("z").
    #
    # Each is typically loaded/stored from memory as 8 rows of 0x40 bytes,
    # although in some operations the 0x200 byte registers can be indexed by
    # bytes.
    # Each is typically loaded/stored from memory in rows of 0x40 bytes,
    # although in some operations the registers can be indexed by byte offsets.
    #
    #
    # AMX instructions are of the form:
    @@ -40,7 +39,7 @@
    # op=17, operand=1. In Accelerate, these instructions are always prefixed
    # by three nops. What could go wrong?
    #
    # If instructions other than "enable" are issued when AMX is not enabled,
    # If instructions other than "enable" are executed when AMX is not enabled,
    # they are treated as illegal instructions.
    #
    #
    @@ -49,8 +48,9 @@
    #
    # This register is typically a bitfield containing further parameters to the
    # operation. For example, loads and stores have a 56-bit address in bits 0
    # through 55, and a 3-bit register offset (in units of 0x40) in bits 56
    # through 58.
    # through 55, a 5-bit register offset (in units of 0x40) in bits 56
    # through 61, and a 1-bit flag in bit 61 (acting as an 0x40 byte load/store
    # when zero, or an 0x80 byte load/store when one).
    #
    # My best guess at the names is based on:
    # https://www.realworldtech.com/forum/?threadid=187087&curpostid=187120
    @@ -64,15 +64,16 @@
    # 4 is load amx2 (amxldz)
    # 5 is store amx2 (amxstz)
    #
    # 6 and 7 load and store amx2, but in a different order.
    # 6 and 7 load and store amx2, but in a different order, and
    # always as 0x40 bytes (bit 62 is ignored)
    #
    # 6 also loads amx2 (amxldzi)
    # 7 also stores amx2 (amxstzi)
    # but they use halves of two registers in amx2
    # row index 0 = amx2[0].low and amx2[1].low
    # row index 1 = amx2[0].high and amx2[1].high
    # row index 2 = amx2[2].low and amx2[3].low
    # row index 3 = amx2[2].high and amx2[3].high
    # row index 0 = amx2[0].low and amx2[1].low interleaved
    # row index 1 = amx2[0].high and amx2[1].high interleaved
    # row index 2 = amx2[2].low and amx2[3].low interleaved
    # row index 3 = amx2[2].high and amx2[3].high interleaved
    # etc.
    #
    # Other operands do not touch memory, and usually have their result in
  14. @dougallj created this gist Dec 18, 2020.
    347 changes: 347 additions & 0 deletions aarch64_amx.py
    @@ -0,0 +1,347 @@
    # TODO: XZR can be an operand, but I don't handle that correctly yet


    # Copyright (c) 2020 dougallj

    # Based on Python port of VMX intrinsics plugin:
    # Copyright (c) 2019 w4kfu - Synacktiv

    # Based on AArch64 8.3-A Pointer Authentication plugin:
    # Copyright (c) 2018 Eloi Benoist-Vanderbeken - Synacktiv
    # Copyright (c) 2018 xerub


    # AMX: Apple Matrix coprocessor
    #
    # This is an undocumented arm64 ISA extension present on the Apple M1. These
    # instructions has been reversed from Accelerate (vImage, libBLAS, libBNNS,
    # libvDSP and libLAPACK all use them), and by experimenting with their
    # behaviour on the M1. Some of this is probably wrong - I haven't reversed
    # a ton.
    #
    # This may actually be very similar to Intel Advanced Matrix Extension (AMX),
    # making the name collision even more confusing, but it's not a bad place to
    # look for some idea of what's probably going on.
    #
    #
    # The coprocessor state consists of three 0x200 byte "registers", amx0 ("x"),
    # amx1 ("y") and amx2 ("z").
    #
    # Each is typically loaded/stored from memory as 8 rows of 0x40 bytes,
    # although in some operations the 0x200 byte registers can be indexed by
    # bytes.
    #
    #
    # AMX instructions are of the form:
    #
    # 0x00201000 | ((op & 0x1F) << 5) | (operand & 0x1F)
    #
    # AMX must be explicitly enabled using op=17, operand=0 and disabled using
    # op=17, operand=1. In Accelerate, these instructions are always prefixed
    # by three nops. What could go wrong?
    #
    # If instructions other than "enable" are issued when AMX is not enabled,
    # they are treated as illegal instructions.
    #
    #
    # All other operations (op=0-16 and op=18-22) seem to take a 64-bit register
    # number (X0-X30 or 31=XZR) as the operand.
    #
    # This register is typically a bitfield containing further parameters to the
    # operation. For example, loads and stores have a 56-bit address in bits 0
    # through 55, and a 3-bit register offset (in units of 0x40) in bits 56
    # through 58.
    #
    # My best guess at the names is based on:
    # https://www.realworldtech.com/forum/?threadid=187087&curpostid=187120
    #
    # ops 0 through 7 are loads/stores:
    #
    # 0 is load amx0 (amxldx)
    # 1 is load amx1 (amxldy)
    # 2 is store amx0 (amxstx)
    # 3 is store amx1 (amxsty)
    # 4 is load amx2 (amxldz)
    # 5 is store amx2 (amxstz)
    #
    # 6 and 7 load and store amx2, but in a different order.
    #
    # 6 also loads amx2 (amxldzi)
    # 7 also stores amx2 (amxstzi)
    # but they use halves of two registers in amx2
    # row index 0 = amx2[0].low and amx2[1].low
    # row index 1 = amx2[0].high and amx2[1].high
    # row index 2 = amx2[2].low and amx2[3].low
    # row index 3 = amx2[2].high and amx2[3].high
    # etc.
    #
    # Other operands do not touch memory, and usually have their result in
    # amx2 (z), but 8 and 9 have their result in amx0 and amx1 (x/y), and 22 seems
    # to have its result in row 0 (bytes 0 through 0x3F) of amx0.
    #
    # 8 does an operation, result in amx0 (amxextrx?)
    # 9 does an operation, result in amx1 (amxextry?)
    # 10 does an operation, result in amx2
    # 11 does an operation, result in amx2
    # 12 does an operation, result in amx2
    # 13 does an operation, result in amx2
    # 14 does an operation, result in amx2
    # 15 does an operation, result in amx2
    # 16 does an operation, result in amx2
    # 17 is enable/disable
    # 18 does an operation, result in amx2
    # 19 does an operation, result in amx2
    # 20 does an operation, result in amx2
    # 21 does an operation, result in amx2
    # 22 does an operation, result in amx0[0]
    #


    import idaapi
    import ida_hexrays


    AMX_NONE = 0
    AMX_OP0 = 1
    AMX_OP1 = 2
    AMX_OP2 = 3
    AMX_OP3 = 4
    AMX_OP4 = 5
    AMX_OP5 = 6
    AMX_OP6 = 7
    AMX_OP7 = 8
    AMX_OP8 = 9
    AMX_OP9 = 10
    AMX_OP10 = 11
    AMX_OP11 = 12
    AMX_OP12 = 13
    AMX_OP13 = 14
    AMX_OP14 = 15
    AMX_OP15 = 16
    AMX_OP16 = 17
    AMX_OP17 = 18
    AMX_OP18 = 19
    AMX_OP19 = 20
    AMX_OP20 = 21
    AMX_OP21 = 22
    AMX_OP22 = 23

    OP_NAMES = {
        AMX_OP0: "AMXLDX",
        AMX_OP1: "AMXLDY",
        AMX_OP2: "AMXSTX",
        AMX_OP3: "AMXSTY",
        AMX_OP4: "AMXLDZ",
        AMX_OP5: "AMXSTZ",
        AMX_OP6: "AMXLDZI",
        AMX_OP7: "AMXSTZI",
        AMX_OP8: "AMX8", # amxextrx?
        AMX_OP9: "AMX9", # amxextry?
        AMX_OP10: "AMX10",
        AMX_OP11: "AMX11",
        AMX_OP12: "AMX12",
        AMX_OP13: "AMX13",
        AMX_OP14: "AMX14",
        AMX_OP15: "AMX15",
        AMX_OP16: "AMX16",
        AMX_OP17: "AMX17", # amxset / amxclr
        AMX_OP18: "AMX18",
        AMX_OP19: "AMX19",
        AMX_OP20: "AMX20",
        AMX_OP21: "AMX21",
        AMX_OP22: "AMX22",
    }

    OP_INTRINSIC_NAMES = {
        AMX_OP0: "__amx_ldx",
        AMX_OP1: "__amx_ldy",
        AMX_OP2: "__amx_stx",
        AMX_OP3: "__amx_sty",
        AMX_OP4: "__amx_ldz",
        AMX_OP5: "__amx_stz",
        AMX_OP6: "__amx_ldzi",
        AMX_OP7: "__amx_stzi",
        AMX_OP8: "__amx_op8_to_x", # amxextrx?
        AMX_OP9: "__amx_op9_to_y", # amxextry?
        AMX_OP10: "__amx_op10_to_z",
        AMX_OP11: "__amx_op11_to_z",
        AMX_OP12: "__amx_op12_to_z",
        AMX_OP13: "__amx_op13_to_z",
        AMX_OP14: "__amx_op14_to_z",
        AMX_OP15: "__amx_op15_to_z",
        AMX_OP16: "__amx_op16_to_z",
        AMX_OP17: "__amx_op17", # amxset / amxclr
        AMX_OP18: "__amx_op18_to_z",
        AMX_OP19: "__amx_op19_to_z",
        AMX_OP20: "__amx_op20_to_z",
        AMX_OP21: "__amx_op21_to_z",
        AMX_OP22: "__amx_op22_to_x0",
    }

    def decode_AMX(d, insn):
        if (d & 0xfffffC00) == 0x00201000:
            Xr = d & 31
            m = (d >> 5) & 31
            if m <= AMX_OP22 - AMX_OP0:
                #insn.itype = idaapi.ARM_nop
                insn.itype = idaapi.ARM_hlt
                insn.segpref = 14
                if m == 17:
                    insn.Op1.type = idaapi.o_imm
                    insn.Op1.value = Xr
                    insn.Op1.dtype = idaapi.dt_byte
                else:
                    insn.Op1.type = idaapi.o_reg
                    insn.Op1.reg = Xr + 129
                    insn.Op1.dtype = idaapi.dt_qword
                insn.insnpref = AMX_OP0 + m
                insn.size = 4
                return True
        return False

    class Aarch64AMXHook(idaapi.IDP_Hooks):
        CUSTOM_INSTRUCTIONS = {idaapi.ARM_hlt}
        INDENT = 16

        def ev_ana_insn(self, outctx):
            return outctx.size if decode_AMX(idaapi.get_dword(outctx.ea), outctx) else 0

        def ev_emu_insn(self, insn):
            if insn.itype != idaapi.ARM_brk:
                return False
            return True

        def ev_out_mnem(self, outctx):
            if outctx.insn.itype in self.CUSTOM_INSTRUCTIONS:
                mnem = OP_NAMES.get(ord(outctx.insn.insnpref), None)
                if mnem is not None:
                    outctx.out_custom_mnem(mnem, self.INDENT)
                    return 1
            return 0

    class MicroInstruction(ida_hexrays.minsn_t):

        def __init__(self, opcode, ea):
            ida_hexrays.minsn_t.__init__(self, ea)
            self.opcode = opcode
            self.l.zero()
            self.r.zero()
            self.d.zero()

    class CallBuilder():

        def __init__(self, cdg, name, return_type=idaapi.tinfo_t(idaapi.BT_VOID)):
            self.emitted = False
            self.cdg = cdg
            self.callinfo = ida_hexrays.mcallinfo_t()
            self.callinfo.callee = idaapi.BADADDR
            self.callinfo.solid_args = 0
            self.callinfo.call_spd = 0
            self.callinfo.stkargs_top = 0
            self.callinfo.cc = idaapi.CM_CC_FASTCALL
            self.callinfo.return_type = return_type
            self.callinfo.flags = idaapi.FCI_SPLOK | idaapi.FCI_FINAL | idaapi.FCI_PROP
            self.callinfo.role = idaapi.ROLE_UNK

            glbhigh_off = cdg.mba.get_stack_region().off + cdg.mba.get_stack_region().size
            # what memory is visible to the call : GLBLOW - GLBHIGH
            self.callinfo.visible_memory.add(ida_hexrays.ivl_t(0x00, 0x100000))
            self.callinfo.visible_memory.add(ida_hexrays.ivl_t(glbhigh_off, 0xFFFFFFFFFFFFFFFF - glbhigh_off))
            # spoiled locations : GLBLOW - GLBHIGH
            self.callinfo.spoiled.mem.add(ida_hexrays.ivl_t(0x00, 0x100000))
            self.callinfo.spoiled.mem.add(ida_hexrays.ivl_t(glbhigh_off, 0xFFFFFFFFFFFFFFFF - glbhigh_off))

            self.callins = MicroInstruction(ida_hexrays.m_call, self.cdg.insn.ea)
            self.callins.l.make_helper(name)
            self.callins.d.t = ida_hexrays.mop_f
            self.callins.d.size = 0
            self.callins.d.f = self.callinfo

            if (return_type.is_void()):
                self.ins = self.callins
            else:
                self.callins.d.size = return_type.get_size()
                self.ins = MicroInstruction(ida_hexrays.m_mov, self.cdg.insn.ea)
                self.ins.l.t = ida_hexrays.mop_d
                self.ins.l.d = self.callins
                self.ins.l.size = self.callins.d.size
                self.ins.d.t = ida_hexrays.mop_r
                self.ins.d.r = 0x00
                self.ins.d.size = self.callins.d.size

        def add_register_argument(self, t, operand):
            ca = ida_hexrays.mcallarg_t()
            ca.t = idaapi.mop_r
            ca.r = operand
            ca.type = t
            ca.size = t.get_size()
            self.callinfo.args.push_back(ca)
            self.callinfo.solid_args += 1

        def set_return_register(self, reg):
            self.ins.d.r = reg

        def emit(self):
            if self.emitted == False:
                self.cdg.mb.insert_into_block(self.ins, self.cdg.mb.tail)
                self.emitted = True

    class AMXFilter(ida_hexrays.microcode_filter_t):
        def __init__(self):
            ida_hexrays.microcode_filter_t.__init__(self)
            ida_hexrays.install_microcode_filter(self, True)

        def match(self, cdg):
            return cdg.insn.itype == idaapi.ARM_hlt and cdg.insn.insnpref != AMX_NONE

        def apply(self, cdg):
            op = ord(cdg.insn.insnpref)
            intrinsic_name = OP_INTRINSIC_NAMES.get(op, '__amx%d' % op)
            if cdg.insn.Op1.type == idaapi.o_reg:
                builder = CallBuilder(cdg, intrinsic_name)
                builder.add_register_argument(idaapi.tinfo_t(idaapi.BT_INT64 | idaapi.BTMT_UNSIGNED), cdg.load_operand(0))
                builder.emit()
            elif cdg.insn.Op1.type == idaapi.o_imm:
                if op == AMX_OP17 and cdg.insn.Op1.value == 0:
                    builder = CallBuilder(cdg, '__amx_begin')
                    builder.emit()
                elif op == AMX_OP17 and cdg.insn.Op1.value == 1:
                    builder = CallBuilder(cdg, '__amx_end')
                    builder.emit()
                else:
                    builder = CallBuilder(cdg, '%s_%d' % (intrinsic_name, cdg.insn.Op1.value))
                    builder.emit()

            return idaapi.MERR_OK


    class Aarch64AMXPlugin(idaapi.plugin_t):
        flags = idaapi.PLUGIN_PROC | idaapi.PLUGIN_HIDE
        comment = "Aarch64 Apple AMX extension"
        wanted_hotkey = ""
        help = "Runs transparently"
        wanted_name = "Aarch64 AMX"
        hook = None
        enabled = 1

        def init(self):
            if idaapi.ph_get_id() != idaapi.PLFM_ARM or idaapi.BADADDR <= 0xFFFFFFFF:
                return idaapi.PLUGIN_SKIP
            if not ida_hexrays.init_hexrays_plugin():
                print("[-] {0} : no decompiler available, skipping".format(self.wanted_name))
                return idaapi.PLUGIN_SKIP
            print("%s init" % self.comment)
            self.hook = Aarch64AMXHook()
            self.hook.hook()
            self.filter = AMXFilter()
            return idaapi.PLUGIN_KEEP

        def run(self, arg):
            pass

        def term(self):
            if self.hook is not None:
                self.hook.unhook()
            print("%s unloaded" % self.comment)

    def PLUGIN_ENTRY():
        return Aarch64AMXPlugin()
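
    For reference, the raw instruction-word and load/store-operand encodings
    described in the comments pack into two small helpers (a sketch; the
    AMX_SET/AMX_CLR names follow the amxset/amxclr guesses above):

        def amx_word(op, operand):
            # 0x00201000 | ((op & 0x1F) << 5) | (operand & 0x1F)
            assert 0 <= op <= 22 and 0 <= operand <= 31
            return 0x00201000 | (op << 5) | operand

        AMX_SET = amx_word(17, 0)    # enable AMX  -> 0x201220
        AMX_CLR = amx_word(17, 1)    # disable AMX -> 0x201221
        AMX_LDX_X0 = amx_word(0, 0)  # amxldx, bitfield operand in X0

        def amx_ldst_operand(addr, reg_offset=0, pair=False):
            # 56-bit address in bits 0-55, register offset (in units of
            # 0x40) in the bits above that, and the 0x80-byte flag in
            # bit 62, matching the "load_addr | 0x4000000000000000" idiom
            # in the loop example above.
            return (addr & 0x00FFFFFFFFFFFFFF) | (reg_offset << 56) | (int(pair) << 62)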