From f707e1255e2f0a46c7a23271f594a9a4e5ec8f08 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 22 Jun 2021 13:06:02 +0100 Subject: [PATCH] [AMDGPU] Select d16 stores even when sramecc is enabled The sramecc feature changes the behaviour of d16 loads so they do not preserve the unused 16 bits of the result register, but it has no impact on d16 stores, so we should make use of them even when the feature is enabled. Differential Revision: https://reviews.llvm.org/D104912 --- llvm/lib/Target/AMDGPU/BUFInstructions.td | 2 +- llvm/lib/Target/AMDGPU/DSInstructions.td | 2 +- llvm/lib/Target/AMDGPU/FLATInstructions.td | 12 +- llvm/test/CodeGen/AMDGPU/store-hi16.ll | 147 +++++++++------------ 4 files changed, 70 insertions(+), 93 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 133ee4742e5c..4e7efef3f9b1 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1851,7 +1851,7 @@ defm : MUBUFScratchStorePat ; -let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, DisableFlatScratch] in { // Hiding the extract high pattern in the PatFrag seems to not // automatically increase the complexity. 
let AddedComplexity = 1 in { diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 4e7a9b5a65cd..4d78e3dae2ec 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -791,7 +791,7 @@ defm : DSAtomicWritePat_mc ; defm : DSAtomicWritePat_mc ; defm : DSAtomicWritePat_mc ; -let OtherPredicates = [D16PreservesUnusedBits] in { +let OtherPredicates = [HasD16LoadStore] in { def : DSWritePat ; def : DSWritePat ; } diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 7ea39d5c51dd..3f0c42578a11 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1169,10 +1169,12 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>; def : FlatStorePat ; def : FlatStorePat ; -let OtherPredicates = [D16PreservesUnusedBits] in { +let OtherPredicates = [HasD16LoadStore] in { def : FlatStorePat ; def : FlatStorePat ; +} +let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatLoadPat_D16 ; def : FlatLoadPat_D16 ; def : FlatLoadPat_D16 ; @@ -1363,10 +1365,12 @@ defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; -let OtherPredicates = [D16PreservesUnusedBits] in { +let OtherPredicates = [HasD16LoadStore] in { defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; +} +let OtherPredicates = [D16PreservesUnusedBits] in { defm : GlobalFLATLoadPats_D16 ; defm : GlobalFLATLoadPats_D16 ; defm : GlobalFLATLoadPats_D16 ; @@ -1489,10 +1493,12 @@ defm : ScratchFLATStorePats ; defm : ScratchFLATStorePats ; defm : ScratchFLATStorePats ; -let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, EnableFlatScratch] in { defm : ScratchFLATStorePats ; defm : ScratchFLATStorePats ; +} +let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, 
EnableFlatScratch] in { defm : ScratchFLATLoadPats_D16 ; defm : ScratchFLATLoadPats_D16 ; defm : ScratchFLATLoadPats_D16 ; diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll index dd32021532f5..e4699f3f926f 100644 --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -1,16 +1,15 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-MUBUF %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,GFX9,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-MUBUF %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-FLATSCR %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s ; GCN-LABEL: {{^}}store_global_hi_v2i16: ; GCN: s_waitcnt -; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: 
flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -26,11 +25,10 @@ entry: ; GCN-LABEL: {{^}}store_global_hi_v2f16: ; GCN: s_waitcnt -; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -46,11 +44,10 @@ entry: ; GCN-LABEL: {{^}}store_global_hi_i32_shift: ; GCN: s_waitcnt -; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_short v[0:1], v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -65,11 +62,10 @@ entry: ; GCN-LABEL: {{^}}store_global_hi_v2i16_i8: ; GCN: s_waitcnt -; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off +; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_byte v[0:1], v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -85,11 +81,10 @@ entry: ; GCN-LABEL: {{^}}store_global_hi_i8_shift: ; GCN: s_waitcnt -; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off +; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: flat_store_byte v[0:1], v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -103,16 +98,13 @@ entry: ; GCN-LABEL: {{^}}store_global_hi_v2i16_max_offset: ; GCN: s_waitcnt -; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:4094 +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:4094 ; GFX803-DAG: v_add_u32_e32 ; GFX803-DAG: v_addc_u32_e32 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 
16, v2 ; GFX803: flat_store_short v[0:1], v2{{$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off - ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_max_offset(i16 addrspace(1)* %out, i32 %arg) #0 { @@ -127,16 +119,13 @@ entry: ; GCN-LABEL: {{^}}store_global_hi_v2i16_min_offset: ; GCN: s_waitcnt -; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:-4096{{$}} +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:-4096{{$}} ; GFX803-DAG: v_add_u32_e32 ; GFX803-DAG: v_addc_u32_e32 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_short v[0:1], v{{[0-9]$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_short v[0:1], v2, off - ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_min_offset(i16 addrspace(1)* %out, i32 %arg) #0 { @@ -150,16 +139,13 @@ entry: ; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_max_offset: ; GCN: s_waitcnt -; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:4095 +; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:4095 ; GFX803-DAG: v_add_u32_e32 ; GFX803-DAG: v_addc_u32_e32 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_byte v[0:1], v{{[0-9]$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off - ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_i8_max_offset(i8 addrspace(1)* %out, i32 %arg) #0 { @@ -174,16 +160,13 @@ entry: ; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_min_offset: ; GCN: s_waitcnt -; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:-4095 +; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:-4095 ; GFX803-DAG: v_add_u32_e32 ; GFX803-DAG: v_addc_u32_e32 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_byte v[0:1], v{{[0-9]$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: global_store_byte v[0:1], v2, off - ; 
GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_i8_min_offset(i8 addrspace(1)* %out, i32 %arg) #0 { @@ -199,7 +182,7 @@ entry: ; GCN-LABEL: {{^}}store_flat_hi_v2i16: ; GCN: s_waitcnt -; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} +; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NO-D16-HI-NEXT: flat_store_short v[0:1], v2 @@ -217,7 +200,7 @@ entry: ; GCN-LABEL: {{^}}store_flat_hi_v2f16: ; GCN: s_waitcnt -; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} +; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NO-D16-HI-NEXT: flat_store_short v[0:1], v2 @@ -235,7 +218,7 @@ entry: ; GCN-LABEL: {{^}}store_flat_hi_i32_shift: ; GCN: s_waitcnt -; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} +; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NO-D16-HI-NEXT: flat_store_short v[0:1], v2 @@ -253,7 +236,7 @@ entry: ; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8: ; GCN: s_waitcnt -; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} +; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2 @@ -272,7 +255,7 @@ entry: ; GCN-LABEL: {{^}}store_flat_hi_i8_shift: ; GCN: s_waitcnt -; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} +; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2 @@ -289,10 +272,7 @@ entry: ; GCN-LABEL: {{^}}store_flat_hi_v2i16_max_offset: ; GCN: s_waitcnt -; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}} - -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: flat_store_short v[0:1], v2 offset:4094 +; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}} ; GFX803-DAG: v_add_u32_e32 ; GFX803-DAG: v_addc_u32_e32 @@ -318,10 +298,7 @@ entry: ; GFX9-DAG: 
v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff802, v ; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v -; GFX906-DAG: v_lshrrev_b32_e32 -; GFX906: flat_store_short v[0:1], v2{{$}} - -; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} +; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} ; GFX803: flat_store_short v[0:1], v2{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -336,16 +313,13 @@ entry: ; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_max_offset: ; GCN: s_waitcnt -; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:4095{{$}} +; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:4095{{$}} ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-DAG: v_add_u32_e32 ; GFX803-DAG: v_addc_u32_e32 ; GFX803: flat_store_byte v[0:1], v2{{$}} -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}} - ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_flat_hi_v2i16_i8_max_offset(i8* %out, i32 %arg) #0 { @@ -367,10 +341,7 @@ entry: ; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff001, v ; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v{{[0-9]+}}, vcc -; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} - -; GFX906-DAG: v_lshrrev_b32_e32 v2, 16, v2 -; GFX906: flat_store_byte v[0:1], v2{{$}} +; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_byte v[0:1], v2{{$}} @@ -390,8 +361,8 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16: ; GCN: s_waitcnt -; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} -; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off +; GFX9-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX9-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -410,8 +381,8 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2f16: ; GCN: s_waitcnt -; GFX900-MUBUF-NEXT: 
buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} -; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}} +; GFX9-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX9-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -430,8 +401,8 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_i32_shift: ; GCN: s_waitcnt -; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} -; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}} +; GFX9-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX9-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -449,8 +420,8 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8: ; GCN: s_waitcnt -; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} -; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}} +; GFX9-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX9-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} @@ -469,8 +440,8 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_i8_shift: ; GCN: s_waitcnt -; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} -; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}} +; GFX9-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX9-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} @@ -487,8 +458,8 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset: ; GCN: s_waitcnt -; GFX900-MUBUF: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} 
-; GFX900-FLATSCR: scratch_store_short_d16_hi off, v0, s32 offset:4094{{$}} +; GFX9-MUBUF: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX9-FLATSCR: scratch_store_short_d16_hi off, v0, s32 offset:4094{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}} @@ -509,9 +480,9 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff: ; GCN: s_waitcnt -; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}} -; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0 -; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, [[SOFF]]{{$}} +; GFX9-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}} +; GFX9-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0 +; GFX9-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, [[SOFF]]{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], 0{{$}} @@ -531,9 +502,9 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff: ; GCN: s_waitcnt -; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}} -; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0 -; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, [[SOFF]]{{$}} +; GFX9-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}} +; GFX9-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0 +; GFX9-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, [[SOFF]]{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}} @@ -552,7 +523,7 @@ entry: ; GCN-LABEL: {{^}}store_local_hi_v2i16: ; GCN: s_waitcnt -; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}} +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: ds_write_b16 v0, v1 @@ -571,7 +542,7 @@ entry: ; GCN-LABEL: {{^}}store_local_hi_v2f16: ; GCN: s_waitcnt -; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}} +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; 
NO-D16-HI: ds_write_b16 v0, v1 @@ -590,7 +561,7 @@ entry: ; GCN-LABEL: {{^}}store_local_hi_i32_shift: ; GCN: s_waitcnt -; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}} +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: ds_write_b16 v0, v1 @@ -608,7 +579,7 @@ entry: ; GCN-LABEL: {{^}}store_local_hi_v2i16_i8: ; GCN: s_waitcnt -; GFX900-NEXT: ds_write_b8_d16_hi v0, v1{{$}} +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: ds_write_b8 v0, v1 @@ -626,7 +597,7 @@ entry: ; GCN-LABEL: {{^}}store_local_hi_v2i16_max_offset: ; GCN: s_waitcnt -; GFX900-NEXT: ds_write_b16_d16_hi v0, v1 offset:65534{{$}} +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:65534{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: ds_write_b16 v0, v1 offset:65534{{$}} @@ -645,14 +616,14 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset: ; GCN: s_waitcnt -; GFX900-MUBUF: buffer_store_dword -; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4058 -; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR: scratch_store_dword -; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4058 -; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF: buffer_store_dword +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4058 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR: scratch_store_dword +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4058 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) define void @store_private_hi_v2i16_to_offset(i32 %arg, [10 x i32] addrspace(5)* %obj0) #0 { entry: %obj1 = alloca [4096 x i16], align 2, addrspace(5) @@ -667,13 +638,13 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset: ; GCN: s_waitcnt -; GFX900-MUBUF: 
buffer_store_dword -; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4059 -; GFX900-FLATSCR: scratch_store_dword -; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4059 -; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF: buffer_store_dword +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4059 +; GFX9-FLATSCR: scratch_store_dword +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4059 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) define void @store_private_hi_v2i16_i8_to_offset(i32 %arg, [10 x i32] addrspace(5)* %obj0) #0 { entry: %obj1 = alloca [4096 x i8], align 2, addrspace(5)