[AMDGPU] Select d16 stores even when sramecc is enabled

The sramecc feature changes the behaviour of d16 loads so they do not
preserve the unused 16 bits of the result register, but it has no impact
on d16 stores, so we should make use of them even when the feature is
enabled.

Differential Revision: https://reviews.llvm.org/D104912
This commit is contained in:
Jay Foad 2021-06-22 13:06:02 +01:00
parent 33ec653055
commit f707e1255e
4 changed files with 70 additions and 93 deletions

View file

@ -1851,7 +1851,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OF
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>;
let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in {
let OtherPredicates = [HasD16LoadStore, DisableFlatScratch] in {
// Hiding the extract high pattern in the PatFrag seems to not
// automatically increase the complexity.
let AddedComplexity = 1 in {

View file

@ -791,7 +791,7 @@ defm : DSAtomicWritePat_mc <DS_WRITE_B16, i32, "atomic_store_local_16">;
defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local_32">;
defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local_64">;
let OtherPredicates = [D16PreservesUnusedBits] in {
let OtherPredicates = [HasD16LoadStore] in {
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>;
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>;
}

View file

@ -1169,10 +1169,12 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
let OtherPredicates = [D16PreservesUnusedBits] in {
let OtherPredicates = [HasD16LoadStore] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
}
let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
@ -1363,10 +1365,12 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, store_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX3, store_global, v3i32>;
let OtherPredicates = [D16PreservesUnusedBits] in {
let OtherPredicates = [HasD16LoadStore] in {
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
}
let OtherPredicates = [D16PreservesUnusedBits] in {
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
@ -1489,10 +1493,12 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, truncstorei16_private, i32>;
defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX3, store_private, v3i32>;
let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in {
let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, EnableFlatScratch] in {
defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT_D16_HI, truncstorei16_hi16_private, i32>;
defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE_D16_HI, truncstorei8_hi16_private, i32>;
}
let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in {
defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2i16>;
defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2f16>;
defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2i16>;

View file

@ -1,16 +1,15 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-MUBUF %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,GFX9,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-MUBUF %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-FLATSCR %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
; GCN-LABEL: {{^}}store_global_hi_v2i16:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_short v[0:1], v2
; GFX906-NEXT: global_store_short v[0:1], v2, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -26,11 +25,10 @@ entry:
; GCN-LABEL: {{^}}store_global_hi_v2f16:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_short v[0:1], v2
; GFX906-NEXT: global_store_short v[0:1], v2, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -46,11 +44,10 @@ entry:
; GCN-LABEL: {{^}}store_global_hi_i32_shift:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_short v[0:1], v2
; GFX906-NEXT: global_store_short v[0:1], v2, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -65,11 +62,10 @@ entry:
; GCN-LABEL: {{^}}store_global_hi_v2i16_i8:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off
; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_byte v[0:1], v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -85,11 +81,10 @@ entry:
; GCN-LABEL: {{^}}store_global_hi_i8_shift:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off
; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: flat_store_byte v[0:1], v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -103,16 +98,13 @@ entry:
; GCN-LABEL: {{^}}store_global_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:4094
; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:4094
; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_short v[0:1], v2{{$}}
; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_short v[0:1], v2, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_max_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
@ -127,16 +119,13 @@ entry:
; GCN-LABEL: {{^}}store_global_hi_v2i16_min_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:-4096{{$}}
; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:-4096{{$}}
; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_short v[0:1], v{{[0-9]$}}
; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_short v[0:1], v2, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_min_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
@ -150,16 +139,13 @@ entry:
; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:4095
; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:4095
; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}
; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_i8_max_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
@ -174,16 +160,13 @@ entry:
; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_min_offset:
; GCN: s_waitcnt
; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:-4095
; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:-4095
; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}
; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: global_store_byte v[0:1], v2, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_global_hi_v2i16_i8_min_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
@ -199,7 +182,7 @@ entry:
; GCN-LABEL: {{^}}store_flat_hi_v2i16:
; GCN: s_waitcnt
; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
@ -217,7 +200,7 @@ entry:
; GCN-LABEL: {{^}}store_flat_hi_v2f16:
; GCN: s_waitcnt
; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
@ -235,7 +218,7 @@ entry:
; GCN-LABEL: {{^}}store_flat_hi_i32_shift:
; GCN: s_waitcnt
; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
@ -253,7 +236,7 @@ entry:
; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2
@ -272,7 +255,7 @@ entry:
; GCN-LABEL: {{^}}store_flat_hi_i8_shift:
; GCN: s_waitcnt
; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2
@ -289,10 +272,7 @@ entry:
; GCN-LABEL: {{^}}store_flat_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}}
; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: flat_store_short v[0:1], v2 offset:4094
; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}}
; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
@ -318,10 +298,7 @@ entry:
; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff802, v
; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v
; GFX906-DAG: v_lshrrev_b32_e32
; GFX906: flat_store_short v[0:1], v2{{$}}
; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
; GFX803: flat_store_short v[0:1], v2{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -336,16 +313,13 @@ entry:
; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:4095{{$}}
; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:4095{{$}}
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-DAG: v_add_u32_e32
; GFX803-DAG: v_addc_u32_e32
; GFX803: flat_store_byte v[0:1], v2{{$}}
; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX906-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_flat_hi_v2i16_i8_max_offset(i8* %out, i32 %arg) #0 {
@ -367,10 +341,7 @@ entry:
; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff001, v
; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v{{[0-9]+}}, vcc
; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
; GFX906-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX906: flat_store_byte v[0:1], v2{{$}}
; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
; GFX803: flat_store_byte v[0:1], v2{{$}}
@ -390,8 +361,8 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2i16:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off
; GFX9-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX9-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off
; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
@ -410,8 +381,8 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2f16:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}
; GFX9-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX9-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}
; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
@ -430,8 +401,8 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_i32_shift:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}
; GFX9-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX9-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}}
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}}
@ -449,8 +420,8 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}}
; GFX9-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX9-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}}
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}}
@ -469,8 +440,8 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_i8_shift:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}}
; GFX9-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}}
; GFX9-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}}
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}}
@ -487,8 +458,8 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-FLATSCR: scratch_store_short_d16_hi off, v0, s32 offset:4094{{$}}
; GFX9-MUBUF: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
; GFX9-FLATSCR: scratch_store_short_d16_hi off, v0, s32 offset:4094{{$}}
; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}}
@ -509,9 +480,9 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}}
; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, [[SOFF]]{{$}}
; GFX9-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}}
; GFX9-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0
; GFX9-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, [[SOFF]]{{$}}
; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], 0{{$}}
@ -531,9 +502,9 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}}
; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, [[SOFF]]{{$}}
; GFX9-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}}
; GFX9-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0
; GFX9-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, [[SOFF]]{{$}}
; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}}
@ -552,7 +523,7 @@ entry:
; GCN-LABEL: {{^}}store_local_hi_v2i16:
; GCN: s_waitcnt
; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1
@ -571,7 +542,7 @@ entry:
; GCN-LABEL: {{^}}store_local_hi_v2f16:
; GCN: s_waitcnt
; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1
@ -590,7 +561,7 @@ entry:
; GCN-LABEL: {{^}}store_local_hi_i32_shift:
; GCN: s_waitcnt
; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1
@ -608,7 +579,7 @@ entry:
; GCN-LABEL: {{^}}store_local_hi_v2i16_i8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_write_b8_d16_hi v0, v1{{$}}
; GFX9-NEXT: ds_write_b8_d16_hi v0, v1{{$}}
; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b8 v0, v1
@ -626,7 +597,7 @@ entry:
; GCN-LABEL: {{^}}store_local_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX900-NEXT: ds_write_b16_d16_hi v0, v1 offset:65534{{$}}
; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:65534{{$}}
; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
; NO-D16-HI: ds_write_b16 v0, v1 offset:65534{{$}}
@ -645,14 +616,14 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4058
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4058
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF: buffer_store_dword
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4058
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR: scratch_store_dword
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4058
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
define void @store_private_hi_v2i16_to_offset(i32 %arg, [10 x i32] addrspace(5)* %obj0) #0 {
entry:
%obj1 = alloca [4096 x i16], align 2, addrspace(5)
@ -667,13 +638,13 @@ entry:
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4059
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4059
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF: buffer_store_dword
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4059
; GFX9-FLATSCR: scratch_store_dword
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4059
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
define void @store_private_hi_v2i16_i8_to_offset(i32 %arg, [10 x i32] addrspace(5)* %obj0) #0 {
entry:
%obj1 = alloca [4096 x i8], align 2, addrspace(5)