[AMDGPU] Narrow lshl from 64 to 32 bit if possible
Turn an expensive 64-bit shift into a 32-bit shift when the shift cannot overflow the 32-bit value:

    shl (ext x) => zext (shl x)

Differential Revision: https://reviews.llvm.org/D33367
llvm-svn: 303569
This commit is contained in:
parent
80cb549c2f
commit
5fa289f0d8
|
@ -2595,27 +2595,49 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
|
|||
|
||||
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
if (N->getValueType(0) != MVT::i64)
|
||||
EVT VT = N->getValueType(0);
|
||||
if (VT != MVT::i64)
|
||||
return SDValue();
|
||||
|
||||
ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
|
||||
if (!RHS)
|
||||
return SDValue();
|
||||
|
||||
SDValue LHS = N->getOperand(0);
|
||||
unsigned RHSVal = RHS->getZExtValue();
|
||||
if (!RHSVal)
|
||||
return LHS;
|
||||
|
||||
SDLoc SL(N);
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
|
||||
switch (LHS->getOpcode()) {
|
||||
default:
|
||||
break;
|
||||
case ISD::ZERO_EXTEND:
|
||||
case ISD::SIGN_EXTEND:
|
||||
case ISD::ANY_EXTEND: {
|
||||
// shl (ext x) => zext (shl x), if shift does not overflow int
|
||||
KnownBits Known;
|
||||
SDValue X = LHS->getOperand(0);
|
||||
DAG.computeKnownBits(X, Known);
|
||||
unsigned LZ = Known.countMinLeadingZeros();
|
||||
if (LZ < RHSVal)
|
||||
break;
|
||||
EVT XVT = X.getValueType();
|
||||
SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
|
||||
return DAG.getZExtOrTrunc(Shl, SL, VT);
|
||||
}
|
||||
}
|
||||
|
||||
// i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
|
||||
|
||||
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
|
||||
// common case, splitting this into a move and a 32-bit shift is faster and
|
||||
// the same code size.
|
||||
const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
|
||||
if (!RHS)
|
||||
return SDValue();
|
||||
|
||||
unsigned RHSVal = RHS->getZExtValue();
|
||||
if (RHSVal < 32)
|
||||
return SDValue();
|
||||
|
||||
SDValue LHS = N->getOperand(0);
|
||||
|
||||
SDLoc SL(N);
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
|
||||
SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
|
||||
|
||||
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
|
||||
|
|
|
@ -84,11 +84,10 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1
|
|||
|
||||
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
||||
; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
|
||||
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
|
||||
; VI: flat_load_ushort [[A:v[0-9]+]]
|
||||
; VI: flat_load_ushort [[B:v[0-9]+]]
|
||||
; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
|
||||
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
|
||||
|
|
|
@ -202,10 +202,10 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
|
|||
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
|
||||
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
|
||||
|
||||
; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
||||
; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
||||
; VI: v_add_u16_e32
|
||||
; VI: v_add_u16_e32
|
||||
; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
||||
; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
||||
; VI-DAG: v_add_u16_e32
|
||||
; VI-DAG: v_add_u16_e32
|
||||
|
||||
; VI: buffer_store_dwordx4
|
||||
define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||
|
|
|
@ -50,7 +50,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
|
|||
; GCN-LABEL: {{^}}s_ubfe_sub_i32:
|
||||
; GCN: s_load_dword [[SRC:s[0-9]+]]
|
||||
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
|
||||
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
|
||||
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
|
||||
; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
|
||||
define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
@ -128,7 +128,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
|
|||
; GCN-LABEL: {{^}}s_sbfe_sub_i32:
|
||||
; GCN: s_load_dword [[SRC:s[0-9]+]]
|
||||
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
|
||||
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
|
||||
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
|
||||
; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
|
||||
define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
|
|
@ -135,7 +135,6 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}v_ctlz_i64:
|
||||
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
|
||||
; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
|
||||
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
|
||||
|
@ -145,7 +144,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
|
|||
; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
|
||||
; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
|
||||
; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
|
||||
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
|
||||
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI:[0-9]+]]{{\]}}
|
||||
define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
|
||||
|
|
|
@ -121,8 +121,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
|
|||
; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
|
||||
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
|
||||
; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
|
||||
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
|
||||
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}}
|
||||
define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
|
||||
|
|
|
@ -266,8 +266,8 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)*
|
|||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_one_val_f64
|
||||
; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
|
||||
; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
|
||||
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
|
||||
; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
|
||||
; SI: s_endpgm
|
||||
define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
|
||||
|
|
|
@ -845,10 +845,10 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(
|
|||
; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
|
||||
; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
|
||||
; GCN: v_min_f32
|
||||
; GCN: v_max_f32
|
||||
; GCN: v_min_f32
|
||||
; GCN: v_max_f32
|
||||
; GCN-DAG: v_min_f32
|
||||
; GCN-DAG: v_max_f32
|
||||
; GCN-DAG: v_min_f32
|
||||
; GCN-DAG: v_max_f32
|
||||
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
|
||||
|
|
|
@ -356,6 +356,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)*
|
|||
|
||||
; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
|
||||
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
|
||||
; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
||||
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
|
||||
; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
|
||||
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
|
||||
|
@ -371,6 +372,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
|
|||
|
||||
; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
|
||||
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
|
||||
; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
||||
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
|
||||
; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
|
||||
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
|
||||
|
|
|
@ -207,6 +207,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)*
|
|||
|
||||
; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
|
||||
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
|
||||
; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
||||
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
|
||||
; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
|
||||
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
|
||||
|
@ -222,6 +223,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
|
|||
|
||||
; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
|
||||
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
|
||||
; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
||||
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
|
||||
; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
|
||||
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
|
||||
|
|
45
llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
Normal file
45
llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll
Normal file
|
@ -0,0 +1,45 @@
|
|||
; RUN: llc -march=amdgcn < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: {{^}}zext_shl64_to_32:
|
||||
; CHECK: s_lshl_b32
|
||||
; CHECK-NOT: s_lshl_b64
|
||||
define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
|
||||
%and = and i32 %x, 1073741823
|
||||
%ext = zext i32 %and to i64
|
||||
%shl = shl i64 %ext, 2
|
||||
store i64 %shl, i64 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}sext_shl64_to_32:
|
||||
; CHECK: s_lshl_b32
|
||||
; CHECK-NOT: s_lshl_b64
|
||||
define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
|
||||
%and = and i32 %x, 536870911
|
||||
%ext = sext i32 %and to i64
|
||||
%shl = shl i64 %ext, 2
|
||||
store i64 %shl, i64 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}zext_shl64_overflow:
|
||||
; CHECK: s_lshl_b64
|
||||
; CHECK-NOT: s_lshl_b32
|
||||
define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
|
||||
%and = and i32 %x, 2147483647
|
||||
%ext = zext i32 %and to i64
|
||||
%shl = shl i64 %ext, 2
|
||||
store i64 %shl, i64 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}sext_shl64_overflow:
|
||||
; CHECK: s_lshl_b64
|
||||
; CHECK-NOT: s_lshl_b32
|
||||
define amdgpu_kernel void @sext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
|
||||
%and = and i32 %x, 2147483647
|
||||
%ext = sext i32 %and to i64
|
||||
%shl = shl i64 %ext, 2
|
||||
store i64 %shl, i64 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
|
@ -299,10 +299,10 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)*
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}and_not_mask_i64:
|
||||
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
|
||||
; GCN: v_mov_b32_e32 v[[SHRHI]], 0{{$}}
|
||||
; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
|
||||
; GCN: v_mov_b32_e32 v[[SHRHI:[0-9]+]], 0{{$}}
|
||||
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
|
||||
; GCN-DAG: v_and_b32_e32 v[[SHRLO]], 4, [[SHR]]
|
||||
; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
|
||||
; GCN-NOT: v[[SHRLO]]
|
||||
; GCN-NOT: v[[SHRHI]]
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
|
||||
|
@ -360,10 +360,9 @@ define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspac
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
|
||||
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
|
||||
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}}
|
||||
; GCN: buffer_store_dword v[[ZERO]]
|
||||
define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
|
||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
|
|
@ -201,7 +201,8 @@ define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
|
|||
|
||||
; GCN-LABEL: {{^}}v_lshr_32_i64:
|
||||
; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VHI1:[0-9]+]], 0{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], v[[VHI1]]{{$}}
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
|
||||
define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x() #0
|
||||
|
|
|
@ -85,9 +85,9 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1
|
|||
|
||||
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
||||
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
|
||||
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
|
||||
; VI: flat_load_ushort [[A:v[0-9]+]]
|
||||
; VI: flat_load_ushort [[B:v[0-9]+]]
|
||||
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
|
||||
; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
|
||||
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
|
||||
|
|
Loading…
Reference in a new issue