[AMDGPU] Allow for MFMA Inst Clustering

This patch adds cluster edges between independent MFMA instructions. Additionally, it propagates all predecessors of cluster insts to the root of the cluster(s), and all successors to the leaf(ves) of the cluster(s) -- this is done to remove the possibility that those insts will be interspersed within the cluster.

Reviewed By: kerbowa

Differential Revision: https://reviews.llvm.org/D124678
This commit is contained in:
jeff 2022-04-26 11:23:13 -07:00
parent b6572ad504
commit f822db7670
6 changed files with 625 additions and 0 deletions

View file

@ -0,0 +1,175 @@
//===--- AMDGPUMFMAClustering.cpp - AMDGPU MFMA Clustering ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains a DAG scheduling mutation to cluster MFMA
/// instructions.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUMFMAClustering.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-mfma-clustering"
namespace {
// Command-line switch that decides whether createMFMAClusterDAGMutation()
// hands out a mutation at all; it is off by default.
static cl::opt<bool> EnableMFMACluster("amdgpu-mfma-cluster", cl::init(false),
                                       cl::desc("Enable MFMA clustering"));

// Cap on the number of MFMA instructions linked into one cluster chain.
static cl::opt<unsigned> MaxMFMAClusterSize(
    "amdgpu-mfma-cluster-size", cl::Hidden, cl::init(5),
    cl::desc("The maximum number of MFMA instructions to "
             "attempt to cluster together."));
/// Scheduling DAG mutation that adds SDep::Cluster edges between independent
/// MFMA instructions.
class MFMAClusterDAGMutation : public ScheduleDAGMutation {
  // Both pointers are assigned by apply(). They start out null so that a
  // freshly constructed mutation never holds indeterminate pointer values.
  const SIInstrInfo *TII = nullptr;
  ScheduleDAGMI *DAG = nullptr;

public:
  MFMAClusterDAGMutation() = default;

  /// Apply the mutation to the scheduling DAG \p DAGInstrs.
  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};
/// Append every MFMA scheduling unit of \p DAG to \p MFMASUnits and leave the
/// list sorted by ascending NodeNum.
///
/// A unit is collected when \p TII reports its instruction as MAI and its
/// opcode is neither V_ACCVGPR_WRITE_B32_e64 nor V_ACCVGPR_READ_B32_e64.
static void collectMFMASUnits(SmallVectorImpl<SUnit *> &MFMASUnits,
                              const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) {
  for (SUnit &Unit : DAG->SUnits) {
    MachineInstr &MI = *Unit.getInstr();
    if (!TII->isMAI(MI))
      continue;

    // Skip the AGPR read/write opcodes even when isMAI() accepts them.
    const unsigned Opcode = MI.getOpcode();
    const bool IsAccVGPRCopy = Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
                               Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64;
    if (IsAccVGPRCopy)
      continue;

    MFMASUnits.push_back(&Unit);
    LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(Unit););
  }

  // Sorting the MFMAs in NodeNum order results in a good clustering order
  const auto ByNodeNum = [](SUnit *LHS, SUnit *RHS) {
    return LHS->NodeNum < RHS->NodeNum;
  };
  std::sort(MFMASUnits.begin(), MFMASUnits.end(), ByNodeNum);
}
/// Copy the aggregated dependencies of a cluster onto each of its members.
///
/// For every entry of \p SUnit2ClusterInfo whose value equals \p ClusterNum,
/// artificial edges are requested so that
///  - each node in \p ClusterSuccs gets the member as a predecessor, and
///  - the member gets each node in \p ClusterPreds as a predecessor, except a
///    predecessor whose NodeNum equals \p ClusterNum.
static void propagateDeps(DenseMap<unsigned, unsigned> &SUnit2ClusterInfo,
                          llvm::ArrayRef<SDep> ClusterPreds,
                          llvm::ArrayRef<SDep> ClusterSuccs,
                          unsigned ClusterNum, ScheduleDAGInstrs *DAG) {
  for (const auto &Entry : SUnit2ClusterInfo) {
    // Entries that belong to other clusters are left untouched.
    if (Entry.second != ClusterNum)
      continue;

    const unsigned MemberNum = Entry.first;
    SUnit *Member = &DAG->SUnits[MemberNum];
    LLVM_DEBUG(dbgs() << "Copying Deps To SU(" << MemberNum << ")\n");

    for (const SDep &Succ : ClusterSuccs) {
      SUnit *SuccSU = Succ.getSUnit();
      LLVM_DEBUG(dbgs() << "Copying Succ SU(" << SuccSU->NodeNum << ")\n");
      DAG->addEdge(SuccSU, SDep(Member, SDep::Artificial));
    }

    for (const SDep &Pred : ClusterPreds) {
      SUnit *PredSU = Pred.getSUnit();
      LLVM_DEBUG(dbgs() << "Copying Pred SU(" << PredSU->NodeNum << ")\n");
      if (PredSU->NodeNum != ClusterNum)
        DAG->addEdge(Member, SDep(PredSU, SDep::Artificial));
    }
  }
}
/// Link independent MFMA instructions into cluster chains.
///
/// \p MFMASUnits is walked front to back. Every unit that is not yet part of
/// a cluster becomes the base of a new chain. Later units are appended to the
/// chain while it holds fewer than MaxMFMAClusterSize units, provided the
/// candidate is unclustered and DAG->IsReachable() reports no path between it
/// and the base in either direction. Once a chain is complete, the
/// predecessors and successors gathered from its members are handed to
/// propagateDeps().
///
/// \param MFMASUnits candidate units, in the order they should be chained.
/// \param DAG        scheduling DAG that receives the new edges.
static void clusterNeighboringMFMAs(llvm::ArrayRef<SUnit *> MFMASUnits,
                                    ScheduleDAGInstrs *DAG) {
  // Maps the NodeNum of a clustered unit to the NodeNum of its cluster base.
  DenseMap<unsigned, unsigned> SUnit2ClusterInfo;

  // Written as `Idx + 1 < End` because `End - 1` would wrap around for an
  // empty list.
  for (unsigned Idx = 0, End = MFMASUnits.size(); Idx + 1 < End; ++Idx) {
    if (SUnit2ClusterInfo.count(MFMASUnits[Idx]->NodeNum))
      continue; // We don't want to cluster against a different cluster

    SUnit *MFMAOpa = MFMASUnits[Idx];
    SUnit *ClusterBase = MFMAOpa;
    const unsigned ClusterNum = ClusterBase->NodeNum;
    SmallVector<SDep, 4> ClusterSuccs(MFMAOpa->Succs);
    SmallVector<SDep, 4> ClusterPreds(MFMAOpa->Preds);
    unsigned ClusterSize = 1;

    // Attempt to cluster all the remaining MFMASUnits in a chain
    // starting at ClusterBase/MFMAOpa.
    for (unsigned NextIdx = Idx + 1;
         NextIdx < End && ClusterSize < MaxMFMAClusterSize; ++NextIdx) {
      SUnit *MFMAOpb = MFMASUnits[NextIdx];

      // Only add independent MFMAs that have not been previously clustered
      if (SUnit2ClusterInfo.count(MFMAOpb->NodeNum) ||
          DAG->IsReachable(MFMAOpb, ClusterBase) ||
          DAG->IsReachable(ClusterBase, MFMAOpb))
        continue;

      // Snapshot the candidate's predecessors before the cluster edge is
      // added, so the new edge is not mistaken for an external dependency.
      SmallVector<SDep, 4> CandidatePreds(MFMAOpb->Preds);

      if (!DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Cluster)))
        continue;

      // Aggregate the cluster inst dependencies for dep propagation. This is
      // done only once the candidate has really joined the chain; a rejected
      // candidate must not contribute dependencies to the cluster.
      ClusterPreds.append(CandidatePreds.begin(), CandidatePreds.end());
      ClusterSuccs.append(MFMAOpb->Succs.begin(), MFMAOpb->Succs.end());

      // Enforce ordering to ensure root/leaf of cluster chain gets
      // scheduled first/last
      // NOTE(review): this artificial edge targets the same predecessor as
      // the cluster edge added just above -- confirm the DAG keeps it rather
      // than treating it as redundant.
      DAG->addEdge(MFMAOpb, SDep(MFMAOpa, SDep::Artificial));
      LLVM_DEBUG(dbgs() << "Cluster MFMA SU(" << MFMAOpa->NodeNum << ") - SU("
                        << MFMAOpb->NodeNum << ")\n");
      SUnit2ClusterInfo[MFMAOpb->NodeNum] = ClusterNum;
      SUnit2ClusterInfo[MFMAOpa->NodeNum] = ClusterNum;
      ++ClusterSize;
      MFMAOpa = MFMAOpb;
    }

    // A chain that never grew has no entry in SUnit2ClusterInfo, so there is
    // nothing to propagate to.
    if (ClusterSize > 1)
      propagateDeps(SUnit2ClusterInfo, ClusterPreds, ClusterSuccs, ClusterNum,
                    DAG);
  }
}
/// Cluster the MFMA instructions found in \p DAGInstrs.
///
/// Returns without touching the DAG when the subtarget has no MAI
/// instructions, when the DAG reports no sched model or has no SUnits, or
/// when fewer than two MFMA units are collected.
void MFMAClusterDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  if (!ST.hasMAIInsts())
    return;

  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAG->SUnits.empty())
    return;

  SmallVector<SUnit *, 32> MFMASUnits;
  collectMFMASUnits(MFMASUnits, TII, DAG);

  // At least two MFMAs are needed to form a cluster.
  if (MFMASUnits.size() < 2)
    return;

  clusterNeighboringMFMAs(MFMASUnits, DAG);
}
} // namespace
namespace llvm {
std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation() {
return EnableMFMACluster ? std::make_unique<MFMAClusterDAGMutation>()
: nullptr;
}
} // end namespace llvm

View file

@ -0,0 +1,21 @@
//===- AMDGPUMFMAClustering.h - AMDGPU MFMA Clustering ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <memory>
namespace llvm {
/// Create the DAG mutation that clusters MFMA instructions. The result is
/// null when the amdgpu-mfma-cluster option is not enabled, so callers must
/// tolerate a null mutation.
std::unique_ptr<ScheduleDAGMutation> createMFMAClusterDAGMutation();
} // namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMACLUSTERING_H

View file

@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUMFMAClustering.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
@ -398,6 +399,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createMFMAClusterDAGMutation());
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@ -879,6 +881,7 @@ public:
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
DAG->addMutation(createMFMAClusterDAGMutation());
return DAG;
}

View file

@ -75,6 +75,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMachineModuleInfo.cpp
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUMFMAClustering.cpp
AMDGPUMIRFormatter.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUPerfHintAnalysis.cpp

View file

@ -0,0 +1,71 @@
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering 2>&1 | FileCheck -check-prefix=PRERA %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 --debug-only=amdgpu-mfma-clustering 2>&1 | FileCheck -check-prefix=TWOLIMIT %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 --debug-only=amdgpu-mfma-clustering 2>&1| FileCheck -check-prefix=POSTRA %s
# REQUIRES: asserts
# PRERA: Cluster MFMA SU(2) - SU(6)
# PRERA-NEXT: Cluster MFMA SU(6) - SU(10)
# PRERA-NEXT: Cluster MFMA SU(10) - SU(12)
# TWOLIMIT: Cluster MFMA SU(2) - SU(6)
# TWOLIMIT: Cluster MFMA SU(10) - SU(11)
# POSTRA: Cluster MFMA SU(2) - SU(6)
# POSTRA-NEXT: Cluster MFMA SU(6) - SU(10)
# POSTRA-NEXT: Cluster MFMA SU(10) - SU(12)
---
name: basic_cluster
tracksRegLiveness: true
body: |
bb.0:
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
...
# PRERA: Cluster MFMA SU(12) - SU(16)
# PRERA-NEXT: Cluster MFMA SU(16) - SU(20)
# POSTRA: Cluster MFMA SU(12) - SU(16)
# POSTRA-NEXT: Cluster MFMA SU(16) - SU(20)
---
name: complex_cluster
tracksRegLiveness: true
body: |
bb.0:
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
$vgpr8 = V_MOV_B32_e32 0, implicit $exec
$vgpr9 = V_MOV_B32_e32 9, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
$vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
$vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
...

View file

@ -0,0 +1,354 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=PRERA %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - 2>&1 | FileCheck -check-prefix=DEFAULT %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=machine-scheduler -stop-after=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1 | FileCheck -check-prefix=BOTHSCHEDPASS %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler %s -o - -amdgpu-mfma-cluster=1 -amdgpu-mfma-cluster-size=2 2>&1 | FileCheck -check-prefix=TWOLIMIT %s
# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=postmisched %s -o - -amdgpu-mfma-cluster=1 2>&1| FileCheck -check-prefix=POSTRA %s
---
name: no_cluster
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0, $vgpr10_vgpr11
; PRERA-LABEL: name: no_cluster
; PRERA: liveins: $sgpr0, $vgpr10_vgpr11
; PRERA-NEXT: {{ $}}
; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; PRERA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; DEFAULT-LABEL: name: no_cluster
; DEFAULT: liveins: $sgpr0, $vgpr10_vgpr11
; DEFAULT-NEXT: {{ $}}
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; DEFAULT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; BOTHSCHEDPASS-LABEL: name: no_cluster
; BOTHSCHEDPASS: liveins: $sgpr0, $vgpr10_vgpr11
; BOTHSCHEDPASS-NEXT: {{ $}}
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
; TWOLIMIT-LABEL: name: no_cluster
; TWOLIMIT: liveins: $sgpr0, $vgpr10_vgpr11
; TWOLIMIT-NEXT: {{ $}}
; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; TWOLIMIT-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; POSTRA-LABEL: name: no_cluster
; POSTRA: liveins: $sgpr0, $vgpr10_vgpr11
; POSTRA-NEXT: {{ $}}
; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr7 = GLOBAL_LOAD_USHORT killed $vgpr8_vgpr9, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
$vgpr8 = V_MOV_B32_e32 0, implicit $exec
$vgpr9 = V_MOV_B32_e32 9, implicit $exec
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
$vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
$vgpr7 = GLOBAL_LOAD_USHORT $vgpr8_vgpr9, 0, 0, implicit $exec
...
---
name: basic_cluster
tracksRegLiveness: true
body: |
bb.0:
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; PRERA-LABEL: name: basic_cluster
; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; PRERA-NEXT: {{ $}}
; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-LABEL: name: basic_cluster
; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; DEFAULT-NEXT: {{ $}}
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-LABEL: name: basic_cluster
; BOTHSCHEDPASS: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; BOTHSCHEDPASS-NEXT: {{ $}}
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-LABEL: name: basic_cluster
; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; TWOLIMIT-NEXT: {{ $}}
; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-LABEL: name: basic_cluster
; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15
; POSTRA-NEXT: {{ $}}
; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
...
---
name: complex_cluster
tracksRegLiveness: true
body: |
bb.0:
liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
; PRERA-LABEL: name: complex_cluster
; PRERA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
; PRERA-NEXT: {{ $}}
; PRERA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; PRERA-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; PRERA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; PRERA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; PRERA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; PRERA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; PRERA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; PRERA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; PRERA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; DEFAULT-LABEL: name: complex_cluster
; DEFAULT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
; DEFAULT-NEXT: {{ $}}
; DEFAULT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; DEFAULT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; DEFAULT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; DEFAULT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; DEFAULT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; DEFAULT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; BOTHSCHEDPASS-LABEL: name: complex_cluster
; BOTHSCHEDPASS: liveins: $sgpr0, $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $vgpr10_vgpr11
; BOTHSCHEDPASS-NEXT: {{ $}}
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
; BOTHSCHEDPASS-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
; TWOLIMIT-LABEL: name: complex_cluster
; TWOLIMIT: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
; TWOLIMIT-NEXT: {{ $}}
; TWOLIMIT-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; TWOLIMIT-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; TWOLIMIT-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; TWOLIMIT-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; TWOLIMIT-NEXT: $vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
; TWOLIMIT-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; TWOLIMIT-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; POSTRA-LABEL: name: complex_cluster
; POSTRA: liveins: $agpr0_agpr1_agpr2_agpr3, $agpr4_agpr5_agpr6_agpr7, $agpr8_agpr9_agpr10_agpr11, $agpr12_agpr13_agpr14_agpr15, $sgpr0, $vgpr10_vgpr11
; POSTRA-NEXT: {{ $}}
; POSTRA-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $vgpr1 = V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec
; POSTRA-NEXT: GLOBAL_STORE_DWORD killed $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr3 = DS_READ_U16_gfx9 killed $vgpr2, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
; POSTRA-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; POSTRA-NEXT: $vgpr9 = V_MOV_B32_e32 9, implicit $exec
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr3, killed $vgpr4, killed $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, killed $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr5, killed $vgpr6, killed $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
; POSTRA-NEXT: $vgpr5 = V_XOR_B32_e32 $vgpr1, killed $vgpr0, implicit $exec
; POSTRA-NEXT: $vgpr6 = V_MUL_LO_U32_e64 killed $vgpr1, killed $sgpr0, implicit $exec
$vgpr1 = V_MOV_B32_e32 1, implicit $exec
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
$vgpr8 = V_MOV_B32_e32 0, implicit $exec
$vgpr9 = V_MOV_B32_e32 9, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
$vgpr4 = V_MOV_B32_e32 1, implicit $exec
$vgpr1 = V_ADD_F16_e32 $vgpr1, $vgpr0, implicit $mode, implicit $exec
GLOBAL_STORE_DWORD $vgpr10_vgpr11, $vgpr1, 0, 0, implicit $exec
$vgpr3 = DS_READ_U16_gfx9 $vgpr2, 0, 0, implicit $exec
$vgpr4 = GLOBAL_LOAD_USHORT $vgpr0_vgpr1, 0, 0, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr5 = V_MOV_B32_e32 1, implicit $exec
$vgpr6 = V_MOV_B32_e32 1, implicit $exec
$agpr8_agpr9_agpr10_agpr11 = V_MFMA_F32_4X4X1F32_e64 $vgpr3, $vgpr4, $agpr8_agpr9_agpr10_agpr11, 0, 0, 0, implicit $mode, implicit $exec
$agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 $vgpr5, $vgpr6, $agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec
$vgpr5 = V_XOR_B32_e32 $vgpr1, $vgpr0, implicit $exec
$vgpr6 = V_MUL_LO_U32_e64 $vgpr1, $sgpr0, implicit $exec
$agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
...