/*
Build commands (Windows batch syntax) used to compile this shader with the
AMD Radeon GPU Analyzer command line and dump its ISA:

set isa_file=%~1.isa
set analysis_file=%~1.a
set isa_file
rga --define COMPILER_AMD_RGA=1 --source-kind hlsl --asic Pitcairn --profile cs_5_0 --function %2 --intrinsics --isa %isa_file% %1
*/

#if COMPILER_AMD_RGA

#include "ags_shader_intrinsics_dx11.hlsl"

// Thin wrappers that give the AMD DX11 shader-extension intrinsics shorter
// names. They are only available when COMPILER_AMD_RGA is defined (see the
// rga command line above).

// Forwards to AmdDxExtShaderIntrinsics_Ballot and returns its uint2 result.
uint2 ballot(bool pred)
{
    return AmdDxExtShaderIntrinsics_Ballot(pred);
}

// Forwards to AmdDxExtShaderIntrinsics_ReadfirstlaneU (uint overload).
uint ReadFirstLane(uint x)
{
    return AmdDxExtShaderIntrinsics_ReadfirstlaneU(x);
}

// Forwards to AmdDxExtShaderIntrinsics_ReadfirstlaneF (float overload).
float ReadFirstLane(float x)
{
    return AmdDxExtShaderIntrinsics_ReadfirstlaneF(x);
}

// Forwards to AmdDxExtShaderIntrinsics_ReadlaneU for the given lane index.
uint ReadLane(uint x, uint laneId)
{
    return AmdDxExtShaderIntrinsics_ReadlaneU(x, laneId);
}

// True when at least one component of ballot(true) is non-zero.
bool AnyExecSet()
{
    return any(ballot(true));
}

#endif

// Resource element types are uint: RWTexture2D has no default element type,
// and the disassembly at the end of this file loads a single dword and
// compares it as an integer without any float conversion.
Texture2D<uint> InTexture : register(t0);
RWTexture2D<uint> OutTexture : register(u0);

// Compute entry point (8x8x1 threads per group).
//
// Each thread loads one texel from InTexture, then repeatedly compares its
// own value against the value returned by ReadFirstLane. A thread leaves the
// loop on the first iteration where the two values match; the number of
// iterations it spent before matching is written to OutTexture at the same
// coordinate.
//
// TexelId: dispatch-thread id, used as the texel coordinate for both the
//          load and the store.
[numthreads(8, 8, 1)]
void main(uint2 TexelId : SV_DispatchThreadId)
{
    const uint VgprData = InTexture[TexelId];
    uint NumDivergentGroups = 0;

#if 1
    // The counter is advanced only for threads that did not break out in the
    // current iteration.
    for (; ; ++NumDivergentGroups)
    {
        const uint SgprData = ReadFirstLane(VgprData);

        // This thread holds the same value as the one just read back: done.
        if (SgprData == VgprData)
        {
            break;
        }

        // NOTE(review): a thread evaluating this test is itself active, so
        // ballot(true) would be expected to be non-zero here. The check looks
        // like it exists to influence the generated ISA (see the exec-mask
        // test before label_001C below) -- confirm before removing.
        if (AnyExecSet() == false)
        {
            break;
        }
    }
#endif

    OutTexture[TexelId] = NumDivergentGroups;
}

/*
; -------- Disassembly --------------------
shader main
  asic(SI)
  type(CS)

  v_mad_u32_u24 v0, s12, 8, v0                            // 00000000: D2860000 0401100C
  v_mad_u32_u24 v1, s13, 8, v1                            // 00000008: D2860001 0405100D
  s_load_dwordx8 s[12:19], s[2:3], 0x00                   // 00000010: C0C60300
  s_waitcnt lgkmcnt(0)                                    // 00000014: BF8C007F
  image_load v2, v[0:3], s[12:19] unorm                   // 00000018: F0001100 00030200
  s_mov_b64 s[0:1], exec                                  // 00000020: BE80047E
  s_mov_b64 s[2:3], exec                                  // 00000024: BE82047E
  v_mov_b32 v3, 0                                         // 00000028: 7E060280
  s_nop 0x0000                                            // 0000002C: BF800000
  s_nop 0x0000                                            // 00000030: BF800000
  s_nop 0x0000                                            // 00000034: BF800000
  s_nop 0x0000                                            // 00000038: BF800000
  s_nop 0x0000                                            // 0000003C: BF800000
label_0010:
  s_waitcnt vmcnt(0)                                      // 00000040: BF8C0F70
  v_readfirstlane_b32 s12, v2                             // 00000044: 7E180502
  v_cmp_eq_i32 vcc, s12, v2                               // 00000048: 7D04040C
  s_and_saveexec_b64 s[12:13], vcc                        // 0000004C: BE8C246A
  s_andn2_b64 s[2:3], s[2:3], exec                        // 00000050: 8A827E02
  s_cbranch_scc0 label_001C                               // 00000054: BF840006
  s_and_b64 exec, s[12:13], s[2:3]                        // 00000058: 87FE020C
  s_or_b32 s12, exec_lo, exec_hi                          // 0000005C: 880C7F7E
  s_cmp_eq_i32 s12, 0                                     // 00000060: BF00800C
  s_cbranch_scc1 label_001C                               // 00000064: BF850002
  v_add_i32 v3, vcc, 1, v3                                // 00000068: 4A060681
  s_branch label_0010                                     // 0000006C: BF82FFF4
label_001C:
  s_mov_b64 exec, s[0:1]                                  // 00000070: BEFE0400
  v_mov_b32 v2, v3                                        // 00000074: 7E040303
  v_mov_b32 v4, v3                                        // 00000078: 7E080303
  v_mov_b32 v5, v3                                        // 0000007C: 7E0A0303
  image_store v[2:5], v[0:3], s[4:11] dmask:0xf unorm glc // 00000080: F0203F00 00010200
  s_endpgm                                                // 00000088: BF810000
end
*/