AMDGPU: Select i8/i16 global and flat atomic load/store

As far as I know these accesses are naturally atomic anyway, as long as the
address is aligned. Unaligned atomics currently hit an ugly error in AtomicExpand.
This commit is contained in:
Matt Arsenault 2022-04-12 18:10:02 -04:00
parent 7c71ce97e7
commit df29ec2f54
4 changed files with 292 additions and 0 deletions

View file

@ -898,6 +898,10 @@ defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
"buffer_load_dwordx4", v4i32
>;
// Select i8/i16 global atomic loads to the zero-extending byte/short buffer
// loads, for both i32 and i16 result types. The loads are naturally atomic
// when the address is aligned, so no special atomic instruction is needed.
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
@ -1794,6 +1798,10 @@ multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo In
>;
}
// GFX6/GFX7-only atomic store selection: map 8/16/32/64-bit atomic global
// stores to the corresponding buffer store (ADDR64 and OFFSET addressing
// forms). The 8- and 16-bit patterns are registered for both i32 and i16
// source types.
let SubtargetPredicate = isGFX6GFX7 in {
defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_BYTE_ADDR64, BUFFER_STORE_BYTE_OFFSET, i32, atomic_store_global_8>;
defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_BYTE_ADDR64, BUFFER_STORE_BYTE_OFFSET, i16, atomic_store_global_8>;
defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_SHORT_ADDR64, BUFFER_STORE_SHORT_OFFSET, i32, atomic_store_global_16>;
defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_SHORT_ADDR64, BUFFER_STORE_SHORT_OFFSET, i16, atomic_store_global_16>;
defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, atomic_store_global_32>;
defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, atomic_store_global_64>;
} // End Predicates = isGFX6GFX7

View file

@ -1089,6 +1089,10 @@ class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
let OtherPredicates = [HasFlatAddressSpace] in {
// Select i8/i16 flat atomic loads to the zero-extending flat byte/short
// loads, for both i32 and i16 result types.
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_16_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
@ -1126,6 +1130,11 @@ def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>;
// Flat atomic store selection: 32/64-bit stores plus the new 8/16-bit
// stores (registered for both i32 and i16 source types).
def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat_32, i32>;
def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64>;
def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_flat_8, i32>;
def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_flat_8, i16>;
def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_flat_16, i32>;
def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_flat_16, i16>;
foreach as = [ "flat", "global" ] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
@ -1310,6 +1319,10 @@ multiclass ScratchFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, Val
let OtherPredicates = [HasFlatGlobalInsts] in {
// Select i8/i16 global atomic loads (global_* addressing) to the
// zero-extending byte/short loads, for both i32 and i16 result types.
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_16_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>;
@ -1369,6 +1382,10 @@ defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
}
// Global atomic store selection: 32/64-bit stores plus the new 8/16-bit
// stores (registered for both i32 and i16 source types).
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_global_8, i32>;
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_global_8, i16>;
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_global_16, i32>;
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_global_16, i16>;
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORD, atomic_store_global_32, i32>;
defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORDX2, atomic_store_global_64, i64>;

View file

@ -1128,3 +1128,149 @@ entry:
store atomic float %in, float* %ptr seq_cst, align 4
ret void
}
; seq_cst atomic i8 flat load with constant offset selects flat_load_ubyte
; (glc); the offset is folded into the instruction only on GFX9 (the CIVI
; check expects no offset operand).
; GCN-LABEL: {{^}}atomic_load_i8_offset:
; CIVI: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GFX9: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
; GCN: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_load_i8_offset(i8* %in, i8* %out) {
entry:
%gep = getelementptr i8, i8* %in, i64 16
%val = load atomic i8, i8* %gep seq_cst, align 1
store i8 %val, i8* %out
ret void
}
; seq_cst atomic i8 flat load with no offset selects flat_load_ubyte glc on
; all subtargets.
; GCN-LABEL: {{^}}atomic_load_i8:
; GCN: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
; GCN: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_load_i8(i8* %in, i8* %out) {
entry:
%val = load atomic i8, i8* %in seq_cst, align 1
store i8 %val, i8* %out
ret void
}
; seq_cst atomic i8 flat load through a variable index plus constant offset;
; only GFX9 folds the constant part into the instruction's offset field.
; GCN-LABEL: {{^}}atomic_load_i8_addr64_offset:
; CIVI: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GFX9: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
; GCN: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_load_i8_addr64_offset(i8* %in, i8* %out, i64 %index) {
entry:
%ptr = getelementptr i8, i8* %in, i64 %index
%gep = getelementptr i8, i8* %ptr, i64 16
%val = load atomic i8, i8* %gep seq_cst, align 1
store i8 %val, i8* %out
ret void
}
; seq_cst atomic i8 flat store with constant offset selects flat_store_byte;
; the offset is folded only on GFX9.
; GCN-LABEL: {{^}}atomic_store_i8_offset:
; CIVI: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX9: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, i8* %out) {
entry:
%gep = getelementptr i8, i8* %out, i64 16
store atomic i8 %in, i8* %gep seq_cst, align 1
ret void
}
; seq_cst atomic i8 flat store with no offset selects flat_store_byte on all
; subtargets.
; GCN-LABEL: {{^}}atomic_store_i8:
; GCN: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
define amdgpu_kernel void @atomic_store_i8(i8 %in, i8* %out) {
entry:
store atomic i8 %in, i8* %out seq_cst, align 1
ret void
}
; seq_cst atomic i8 flat store through a variable index plus constant offset;
; only GFX9 folds the constant part into the offset field.
; GCN-LABEL: {{^}}atomic_store_i8_addr64_offset:
; CIVI: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX9: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, i8* %out, i64 %index) {
entry:
%ptr = getelementptr i8, i8* %out, i64 %index
%gep = getelementptr i8, i8* %ptr, i64 16
store atomic i8 %in, i8* %gep seq_cst, align 1
ret void
}
; seq_cst atomic i16 flat load with constant offset (8 x i16 = 16 bytes)
; selects flat_load_ushort glc; offset folded only on GFX9.
; GCN-LABEL: {{^}}atomic_load_i16_offset:
; CIVI: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GFX9: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_load_i16_offset(i16* %in, i16* %out) {
entry:
%gep = getelementptr i16, i16* %in, i64 8
%val = load atomic i16, i16* %gep seq_cst, align 2
store i16 %val, i16* %out
ret void
}
; seq_cst atomic i16 flat load with no offset selects flat_load_ushort glc
; on all subtargets.
; GCN-LABEL: {{^}}atomic_load_i16:
; GCN: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_load_i16(i16* %in, i16* %out) {
entry:
%val = load atomic i16, i16* %in seq_cst, align 2
store i16 %val, i16* %out
ret void
}
; seq_cst atomic i16 flat load through a variable index plus constant offset;
; only GFX9 folds the constant part into the offset field.
; GCN-LABEL: {{^}}atomic_load_i16_addr64_offset:
; CIVI: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
; GFX9: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @atomic_load_i16_addr64_offset(i16* %in, i16* %out, i64 %index) {
entry:
%ptr = getelementptr i16, i16* %in, i64 %index
%gep = getelementptr i16, i16* %ptr, i64 8
%val = load atomic i16, i16* %gep seq_cst, align 2
store i16 %val, i16* %out
ret void
}
; seq_cst atomic i16 flat store with constant offset selects
; flat_store_short; offset folded only on GFX9.
; GCN-LABEL: {{^}}atomic_store_i16_offset:
; CIVI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX9: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, i16* %out) {
entry:
%gep = getelementptr i16, i16* %out, i64 8
store atomic i16 %in, i16* %gep seq_cst, align 2
ret void
}
; seq_cst atomic i16 flat store with no offset selects flat_store_short on
; all subtargets.
; GCN-LABEL: {{^}}atomic_store_i16:
; GCN: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
define amdgpu_kernel void @atomic_store_i16(i16 %in, i16* %out) {
entry:
store atomic i16 %in, i16* %out seq_cst, align 2
ret void
}
; seq_cst atomic i16 flat store through a variable index plus constant
; offset; only GFX9 folds the constant part into the offset field.
; GCN-LABEL: {{^}}atomic_store_i16_addr64_offset:
; CIVI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX9: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, i16* %out, i64 %index) {
entry:
%ptr = getelementptr i16, i16* %out, i64 %index
%gep = getelementptr i16, i16* %ptr, i64 8
store atomic i16 %in, i16* %gep seq_cst, align 2
ret void
}
; seq_cst atomic half (f16) flat store with constant offset also selects
; flat_store_short, same as the i16 case.
; GCN-LABEL: {{^}}atomic_store_f16_offset:
; CIVI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
; GFX9: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}}
define amdgpu_kernel void @atomic_store_f16_offset(half %in, half* %out) {
entry:
%gep = getelementptr half, half* %out, i64 8
store atomic half %in, half* %gep seq_cst, align 2
ret void
}
; seq_cst atomic half (f16) flat store with no offset selects
; flat_store_short on all subtargets.
; GCN-LABEL: {{^}}atomic_store_f16:
; GCN: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
define amdgpu_kernel void @atomic_store_f16(half %in, half* %out) {
entry:
store atomic half %in, half* %out seq_cst, align 2
ret void
}

View file

@ -1316,3 +1316,124 @@ entry:
store atomic float %in, float addrspace(1)* %ptr seq_cst, align 4
ret void
}
; seq_cst atomic i8 load from addrspace(1) with constant offset: SI/VI select
; buffer_load_ubyte with an immediate offset, GFX9 selects global_load_ubyte.
; GCN-LABEL: {{^}}atomic_load_i8_offset:
; SIVI: buffer_load_ubyte [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_byte [[RET]]
; GFX9: global_load_ubyte [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_load_i8_offset(i8 addrspace(1)* %in, i8 addrspace(1)* %out) {
entry:
%gep = getelementptr i8, i8 addrspace(1)* %in, i64 16
%val = load atomic i8, i8 addrspace(1)* %gep seq_cst, align 1
store i8 %val, i8 addrspace(1)* %out
ret void
}
; seq_cst atomic i8 global load with a negative offset (-512 bytes): SI uses
; buffer_load_ubyte addr64, VI materializes the offset with a 64-bit scalar
; add (s_add_u32/s_addc_u32) before a flat load, and GFX9 folds the negative
; offset directly into global_load_ubyte.
; GCN-LABEL: {{^}}atomic_load_i8_negoffset:
; SI: buffer_load_ubyte [[RET:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00
; VI-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, -1
; VI: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GFX9: global_load_ubyte [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-512 glc{{$}}
define amdgpu_kernel void @atomic_load_i8_negoffset(i8 addrspace(1)* %in, i8 addrspace(1)* %out) {
entry:
%gep = getelementptr i8, i8 addrspace(1)* %in, i64 -512
%val = load atomic i8, i8 addrspace(1)* %gep seq_cst, align 1
store i8 %val, i8 addrspace(1)* %out
ret void
}
; seq_cst atomic i8 global store with constant offset: SI selects
; buffer_store_byte with the immediate offset, VI selects flat_store_byte
; (no offset folding), GFX9 selects global_store_byte with the offset.
; GCN-LABEL: {{^}}atomic_store_i8_offset:
; SI: buffer_store_byte {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
; VI: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
; GFX9: global_store_byte {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, i8 addrspace(1)* %out) {
entry:
%gep = getelementptr i8, i8 addrspace(1)* %out, i64 16
store atomic i8 %in, i8 addrspace(1)* %gep seq_cst, align 1
ret void
}
; seq_cst atomic i8 global store with no offset: buffer_store_byte on SI,
; flat_store_byte on VI, global_store_byte on GFX9.
; GCN-LABEL: {{^}}atomic_store_i8:
; SI: buffer_store_byte {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
; VI: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
; GFX9: global_store_byte {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_store_i8(i8 %in, i8 addrspace(1)* %out) {
entry:
store atomic i8 %in, i8 addrspace(1)* %out seq_cst, align 1
ret void
}
; seq_cst atomic i16 global load with constant offset (8 x i16 = 16 bytes):
; buffer_load_ushort on SI/VI, global_load_ushort on GFX9.
; GCN-LABEL: {{^}}atomic_load_i16_offset:
; SIVI: buffer_load_ushort [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_short [[RET]]
; GFX9: global_load_ushort [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_load_i16_offset(i16 addrspace(1)* %in, i16 addrspace(1)* %out) {
entry:
%gep = getelementptr i16, i16 addrspace(1)* %in, i64 8
%val = load atomic i16, i16 addrspace(1)* %gep seq_cst, align 2
store i16 %val, i16 addrspace(1)* %out
ret void
}
; seq_cst atomic i16 global load with a negative offset (-256 x i16 = -512
; bytes): SI uses buffer_load_ushort addr64, VI materializes the offset with
; a 64-bit scalar add before a flat load, GFX9 folds offset:-512 into
; global_load_ushort.
; GCN-LABEL: {{^}}atomic_load_i16_negoffset:
; SI: buffer_load_ushort [[RET:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00
; VI-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, -1
; VI: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GFX9: global_load_ushort [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-512 glc{{$}}
define amdgpu_kernel void @atomic_load_i16_negoffset(i16 addrspace(1)* %in, i16 addrspace(1)* %out) {
entry:
%gep = getelementptr i16, i16 addrspace(1)* %in, i64 -256
%val = load atomic i16, i16 addrspace(1)* %gep seq_cst, align 2
store i16 %val, i16 addrspace(1)* %out
ret void
}
; seq_cst atomic i16 global store with constant offset: buffer_store_short
; on SI, flat_store_short on VI, global_store_short (offset folded) on GFX9.
; GCN-LABEL: {{^}}atomic_store_i16_offset:
; SI: buffer_store_short {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
; VI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
; GFX9: global_store_short {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, i16 addrspace(1)* %out) {
entry:
%gep = getelementptr i16, i16 addrspace(1)* %out, i64 8
store atomic i16 %in, i16 addrspace(1)* %gep seq_cst, align 2
ret void
}
; seq_cst atomic i16 global store with no offset: buffer_store_short on SI,
; flat_store_short on VI, global_store_short on GFX9.
; GCN-LABEL: {{^}}atomic_store_i16:
; SI: buffer_store_short {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
; VI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
; GFX9: global_store_short {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_store_i16(i16 %in, i16 addrspace(1)* %out) {
entry:
store atomic i16 %in, i16 addrspace(1)* %out seq_cst, align 2
ret void
}
; seq_cst atomic half (f16) global store with constant offset selects the
; same short-store instructions as the i16 case on each subtarget.
; GCN-LABEL: {{^}}atomic_store_f16_offset:
; SI: buffer_store_short {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
; VI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
; GFX9: global_store_short {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
define amdgpu_kernel void @atomic_store_f16_offset(half %in, half addrspace(1)* %out) {
entry:
%gep = getelementptr half, half addrspace(1)* %out, i64 8
store atomic half %in, half addrspace(1)* %gep seq_cst, align 2
ret void
}
; seq_cst atomic half (f16) global store with no offset selects the same
; short-store instructions as the i16 case on each subtarget.
; GCN-LABEL: {{^}}atomic_store_f16:
; SI: buffer_store_short {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
; VI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
; GFX9: global_store_short {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_store_f16(half %in, half addrspace(1)* %out) {
entry:
store atomic half %in, half addrspace(1)* %out seq_cst, align 2
ret void
}