[AMDGPU] gfx940 VALU hazard recognizer

Differential Revision: https://reviews.llvm.org/D122339
This commit is contained in:
Stanislav Mekhanoshin 2022-03-23 11:59:08 -07:00
parent 267d1873fa
commit f311f934e1
3 changed files with 354 additions and 3 deletions

View file

@ -813,13 +813,136 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
}
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
int WaitStatesNeeded = 0;
if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
const int TransDefWaitstates = 1;
auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
if (!SIInstrInfo::isTRANS(MI))
return false;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
for (const MachineOperand &Use : VALU->explicit_uses()) {
if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
return true;
}
return false;
};
int WaitStatesNeededForDef =
TransDefWaitstates -
getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
}
if (ST.hasDstSelForwardingHazard()) {
const int Shift16DefWaitstates = 1;
auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
if (!SIInstrInfo::isVALU(MI))
return false;
const SIInstrInfo *TII = ST.getInstrInfo();
if (SIInstrInfo::isSDWA(MI)) {
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
return false;
} else {
if ((AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::op_sel) == -1) ||
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
->getImm() &
SISrcMods::DST_OP_SEL))
return false;
}
const SIRegisterInfo *TRI = ST.getRegisterInfo();
if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
Register Def = Dst->getReg();
for (const MachineOperand &Use : VALU->explicit_uses()) {
if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
return true;
}
}
return false;
};
int WaitStatesNeededForDef =
Shift16DefWaitstates -
getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
}
if (ST.hasVDecCoExecHazard()) {
const int VALUWriteSGPRVALUReadWaitstates = 2;
const int VALUWriteEXECRWLane = 4;
const int VALUWriteVGPRReadlaneRead = 1;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
Register UseReg;
auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
if (!SIInstrInfo::isVALU(MI))
return false;
return MI.modifiesRegister(UseReg, TRI);
};
for (const MachineOperand &Use : VALU->explicit_uses()) {
if (!Use.isReg())
continue;
UseReg = Use.getReg();
if (TRI->isSGPRReg(MRI, UseReg)) {
int WaitStatesNeededForDef =
VALUWriteSGPRVALUReadWaitstates -
getWaitStatesSince(IsVALUDefSGPRFn,
VALUWriteSGPRVALUReadWaitstates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
}
}
if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
UseReg = AMDGPU::VCC;
int WaitStatesNeededForDef =
VALUWriteSGPRVALUReadWaitstates -
getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
}
switch (VALU->getOpcode()) {
case AMDGPU::V_READLANE_B32:
case AMDGPU::V_READFIRSTLANE_B32: {
MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
UseReg = Src->getReg();
int WaitStatesNeededForDef =
VALUWriteVGPRReadlaneRead -
getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
}
LLVM_FALLTHROUGH;
case AMDGPU::V_WRITELANE_B32: {
UseReg = AMDGPU::EXEC;
int WaitStatesNeededForDef =
VALUWriteEXECRWLane -
getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
break;
}
default:
break;
}
}
// This checks for the hazard where VMEM instructions that store more than
// 8 bytes can have their store data overwritten by the next instruction.
if (!ST.has12DWordStoreHazard())
return 0;
return WaitStatesNeeded;
const MachineRegisterInfo &MRI = MF.getRegInfo();
int WaitStatesNeeded = 0;
for (const MachineOperand &Def : VALU->defs()) {
WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));

View file

@ -966,8 +966,19 @@ public:
return HasLdsBranchVmemWARHazard;
}
// Has a one-cycle hazard on a transcendental instruction feeding a
// non-transcendental VALU instruction. Reported whenever GFX940Insts is set.
bool hasTransForwardingHazard() const { return GFX940Insts; }
// Has a one-cycle hazard on a VALU instruction partially writing dst with
// a shift of result bits feeding another VALU instruction. Reported whenever
// GFX940Insts is set.
bool hasDstSelForwardingHazard() const { return GFX940Insts; }
// Cannot use op_sel with v_dot instructions.
bool hasDOTOpSelHazard() const {
bool hasDOTOpSelHazard() const { return GFX940Insts; }
// Does not have HW interlocks for VALU writing and then reading SGPRs.
// Reported whenever GFX940Insts is set.
bool hasVDecCoExecHazard() const {
return GFX940Insts;
}

View file

@ -0,0 +1,217 @@
# RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
# GCN-LABEL: name: trans32_write_non_trans32_read
# GCN: V_RCP_F32
# GCN-NEXT: S_NOP 0
# GCN-NEXT: V_MUL_F32
name: trans32_write_non_trans32_read
body: |
bb.0:
$vgpr1 = V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
$vgpr2 = V_MUL_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
...
# GCN-LABEL: name: trans32_write_trans_read
# GCN: V_SIN_F32
# GCN-NEXT: V_COS_F32
name: trans32_write_trans_read
body: |
bb.0:
$vgpr0 = V_SIN_F32_e32 $vgpr1, implicit $mode, implicit $exec
$vgpr2 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: trans64_write_non_trans_read
# GCN: V_RCP_F64
# GCN-NEXT: S_NOP 0
# GCN-NEXT: V_OR_B32
name: trans64_write_non_trans_read
body: |
bb.0:
$vgpr0_vgpr1 = V_RCP_F64_e32 $vgpr2_vgpr3, implicit $mode, implicit $exec
$vgpr4 = V_OR_B32_e32 $vgpr1, $vgpr5, implicit $mode, implicit $exec
...
# GCN-LABEL: name: trans32_write_non_trans64_read
# GCN: V_EXP_F32
# GCN-NEXT: S_NOP 0
# GCN-NEXT: V_MUL_F64
name: trans32_write_non_trans64_read
body: |
bb.0:
$vgpr1 = V_EXP_F32_e32 $vgpr0, implicit $mode, implicit $exec
$vgpr2_vgpr3 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: opsel_hi16_write_valu_read
# GCN: V_ADD_I16
# GCN-NEXT: S_NOP 0
# GCN-NEXT: V_MUL_F64
name: opsel_hi16_write_valu_read
body: |
bb.0:
$vgpr0 = V_ADD_I16_e64 8, $vgpr1, 0, $vgpr2, 0, 0, implicit $exec
$vgpr4_vgpr5 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: opsel_lo16_write_valu_read
# GCN: V_ADD_I16
# GCN-NEXT: V_MUL_F64
name: opsel_lo16_write_valu_read
body: |
bb.0:
$vgpr0 = V_ADD_I16_e64 0, $vgpr1, 0, $vgpr2, 0, 0, implicit $exec
$vgpr4_vgpr5 = V_MUL_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec
...
# GCN-LABEL: name: sdwa_hi16_write_valu_read
# GCN: V_MOV_B32_sdwa
# GCN-NEXT: S_NOP 0
# GCN-NEXT: V_MOV_B32_e32
name: sdwa_hi16_write_valu_read
body: |
bb.0:
$vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 5, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec
$vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
...
# GCN-LABEL: name: sdwa_lo16_write_valu_read
# GCN: V_MOV_B32_sdwa
# GCN-NEXT: S_NOP 0
# GCN-NEXT: V_MOV_B32_e32
name: sdwa_lo16_write_valu_read
body: |
bb.0:
$vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 4, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec
$vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
...
# GCN-LABEL: name: sdwa_dword_write_valu_read
# GCN: V_MOV_B32_sdwa
# GCN-NEXT: V_MOV_B32_e32
name: sdwa_dword_write_valu_read
body: |
bb.0:
$vgpr0 = V_MOV_B32_sdwa 0, $vgpr1, 0, 6, 2, 5, implicit $vgpr0(tied-def 0), implicit $exec
$vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
...
# GCN-LABEL: name: sdwa_lo16_no_write_valu_read
# GCN: V_CMP_EQ_U32_sdwa
# GCN-NEXT: V_MOV_B32_e32
name: sdwa_lo16_no_write_valu_read
body: |
bb.0:
$vcc = V_CMP_EQ_U32_sdwa 0, $vgpr1, 0, $vgpr0, 0, 5, 2, implicit $exec
$vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec
...
# GCN-LABEL: name: valu_write_sgpr_valu_read_as_constant
# GCN: V_READFIRSTLANE_B32
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_MOV_B32_e32
name: valu_write_sgpr_valu_read_as_constant
body: |
bb.0:
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
$vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec
...
# GCN-LABEL: name: valu_write_vcc_valu_read_as_constant
# GCN: V_CMP_NE_U32_e32
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_ADDC_U32_e32
name: valu_write_vcc_valu_read_as_constant
body: |
bb.0:
V_CMP_NE_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $exec
$vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
...
# GCN-LABEL: name: valu_write_sgpr_readlane_read_as_laneselect
# GCN: V_READFIRSTLANE_B32
# GCN-NEXT: S_NOP 3
# GCN-NEXT: V_READLANE_B32
name: valu_write_sgpr_readlane_read_as_laneselect
body: |
bb.0:
$sgpr0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
...
# GCN-LABEL: name: valu_write_sgpr_writelane_read_as_laneselect
# GCN: V_ADD_CO_U32_e64
# GCN-NEXT: S_NOP 3
# GCN-NEXT: V_WRITELANE_B32
name: valu_write_sgpr_writelane_read_as_laneselect
body: |
bb.0:
$vgpr0, $sgpr0_sgpr1 = V_ADD_CO_U32_e64 $vgpr0, 1, 0, implicit $exec
$vgpr1 = V_WRITELANE_B32 0, $sgpr0, $vgpr1, implicit $exec
...
# GCN-LABEL: name: vcmpx_write_exec_valu_read_as_constant
# GCN: V_CMPX_EQ_I32_e32
# GCN-NEXT: S_NOP 1
# GCN-NEXT: V_MOV_B32_e32
name: vcmpx_write_exec_valu_read_as_constant
body: |
bb.0:
implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
$vgpr1 = V_MOV_B32_e32 $exec_lo, implicit $exec
...
# GCN-LABEL: name: vcmpx_write_exec_readlane
# GCN: V_CMPX_EQ_I32_e32
# GCN-NEXT: S_NOP 3
# GCN-NEXT: V_READLANE_B32
name: vcmpx_write_exec_readlane
body: |
bb.0:
implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, 0, implicit $exec
...
# GCN-LABEL: name: vcmpx_write_exec_readfirstlane
# GCN: V_CMPX_EQ_I32_e32
# GCN-NEXT: S_NOP 3
# GCN-NEXT: V_READFIRSTLANE_B32
name: vcmpx_write_exec_readfirstlane
body: |
bb.0:
implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
$sgpr1 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
...
# GCN-LABEL: name: vcmpx_write_exec_writelane
# GCN: V_CMPX_EQ_I32_e32
# GCN-NEXT: S_NOP 3
# GCN-NEXT: V_WRITELANE_B32
name: vcmpx_write_exec_writelane
body: |
bb.0:
implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
$vgpr1 = V_WRITELANE_B32 0, $sgpr0, $vgpr1, implicit $exec
...
# GCN-LABEL: name: valu_write_vgpr_readlane_read
# GCN: V_ADD_CO_U32_e32
# GCN-NEXT: S_NOP 0
# GCN-NEXT: V_READLANE_B32
name: valu_write_vgpr_readlane_read
body: |
bb.0:
$vgpr1 = V_ADD_CO_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $exec
$sgpr1 = V_READLANE_B32 $vgpr1, $sgpr0, implicit $exec
...
# GCN-LABEL: name: valu_write_vgpr_readfirstlane_read
# GCN: V_ADD_CO_U32_e32
# GCN-NEXT: S_NOP 0
# GCN-NEXT: V_READFIRSTLANE_B32
name: valu_write_vgpr_readfirstlane_read
body: |
bb.0:
$vgpr1 = V_ADD_CO_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $exec
$sgpr1 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
...