Implement two cluster layout heuristics

Summary:
Pettis' paper on block layout (PLDI'90) suggests we should order
clusters (or chains, in the paper's terminology) using a specific criterion.
This patch implements two distinct ideas for cluster layout that can be
selected through different values of the reorder-blocks command-line option.
The first one (branch-predictor) reflects Pettis' ideas on minimizing branch
mispredictions and the second one (cache) is targeted at reducing I-cache
misses, as described in the Ispike paper (CGO'04).

(cherry picked from FBD2588693)
This commit is contained in:
Rafael Auler 2015-10-23 09:38:26 -07:00 committed by Maksim Panchenko
parent 2539539bde
commit 13a520ab30
4 changed files with 193 additions and 91 deletions

View file

@ -94,6 +94,7 @@ public:
typedef std::reverse_iterator<iterator> reverse_iterator;
bool empty() const { return Instructions.empty(); }
unsigned size() const { return (unsigned)Instructions.size(); }
MCInst &front() { return Instructions.front(); }
MCInst &back() { return Instructions.back(); }
const MCInst &front() const { return Instructions.front(); }

View file

@ -568,7 +568,7 @@ void BinaryFunction::inferFallThroughCounts() {
return;
}
void BinaryFunction::optimizeLayout() {
void BinaryFunction::optimizeLayout(HeuristicPriority Priority) {
// Bail if no profiling information or if empty
if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE ||
BasicBlocksLayout.empty()) {
@ -598,9 +598,18 @@ void BinaryFunction::optimizeLayout() {
std::vector<ClusterTy> Clusters;
BBToClusterMapTy BBToClusterMap;
// Populating priority queue with all edges
// Encode relative weights between two clusters
std::vector<std::map<uint32_t, uint64_t>> ClusterEdges;
ClusterEdges.resize(BasicBlocksLayout.size());
for (auto BB : BasicBlocksLayout) {
BBToClusterMap[BB] = -1; // Mark as unmapped
// Create a cluster for this BB
uint32_t I = Clusters.size();
Clusters.emplace_back();
auto &Cluster = Clusters.back();
Cluster.push_back(BB);
BBToClusterMap[BB] = I;
// Populate priority queue with edges
auto BI = BB->BranchInfo.begin();
for (auto &I : BB->successors()) {
if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
@ -610,13 +619,6 @@ void BinaryFunction::optimizeLayout() {
}
}
// Start a cluster with the entry point
BinaryBasicBlock *Entry = *BasicBlocksLayout.begin();
Clusters.emplace_back();
auto &EntryCluster = Clusters.back();
EntryCluster.push_back(Entry);
BBToClusterMap[Entry] = 0;
// Grow clusters in a greedy fashion
while (!Queue.empty()) {
auto elmt = Queue.top();
@ -624,95 +626,166 @@ void BinaryFunction::optimizeLayout() {
BinaryBasicBlock *BBSrc = elmt.first;
BinaryBasicBlock *BBDst = elmt.second;
int I = 0, J = 0;
// Case 1: BBSrc and BBDst are the same. Ignore this edge
if (BBSrc == BBDst || BBDst == Entry)
if (BBSrc == BBDst || BBDst == *BasicBlocksLayout.begin())
continue;
// Case 2: Both BBSrc and BBDst are already allocated
if ((I = BBToClusterMap[BBSrc]) != -1 &&
(J = BBToClusterMap[BBDst]) != -1) {
// Case 2a: If they are already allocated at the same cluster, ignore
if (I == J)
int I = BBToClusterMap[BBSrc];
int J = BBToClusterMap[BBDst];
// Case 2: If they are already allocated at the same cluster, just increase
// the weight of this cluster
if (I == J) {
ClusterEdges[I][I] += Weight[elmt];
continue;
}
auto &ClusterA = Clusters[I];
auto &ClusterB = Clusters[J];
if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) {
// Case 3: BBSrc is at the end of a cluster and BBDst is at the start,
// allowing us to merge two clusters
for (auto BB : ClusterB)
BBToClusterMap[BB] = I;
ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
ClusterB.clear();
// Iterate through all inter-cluster edges and transfer edges targeting
// cluster B to cluster A.
// It is bad to have to iterate through all edges when we could have a list
// of predecessors for cluster B. However, it's not clear if it is worth
// the added code complexity to create a data structure for clusters that
// maintains a list of predecessors. Maybe change this if it becomes a
// deal breaker.
for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
ClusterEdges[K][I] += ClusterEdges[K][J];
} else {
// Case 4: Both BBSrc and BBDst are allocated in positions we cannot
// merge them. Annotate the weight of this edge in the weight between
// clusters to help us decide ordering between these clusters.
ClusterEdges[I][J] += Weight[elmt];
}
}
std::vector<uint32_t> Order; // Cluster layout order
// Here we have 3 conflicting goals as to how to lay out clusters. If we want
// to minimize jump offsets, we should put clusters with heavy inter-cluster
// dependence as close as possible. If we want to maximize the probability
// that all inter-cluster edges are predicted as not-taken, we should enforce
// a topological order to make targets appear after sources, creating forward
// branches. If we want to separate hot from cold blocks to maximize the
// probability that infrequently executed code doesn't pollute the cache, we
// should put clusters in descending order of hotness.
std::vector<double> AvgFreq;
AvgFreq.resize(Clusters.size(), 0.0);
for (uint32_t I = 1, E = Clusters.size(); I < E; ++I) {
double Freq = 0.0;
for (auto BB : Clusters[I]) {
if (!BB->empty())
Freq += BB->getExecutionCount() / BB->size();
}
AvgFreq[I] = Freq;
}
switch(Priority) {
case HP_NONE: {
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
if (!Clusters[I].empty())
Order.push_back(I);
break;
}
case HP_BRANCH_PREDICTOR: {
// Do a topological sort for clusters, prioritizing frequently-executed BBs
// during the traversal.
std::stack<uint32_t> Stack;
std::vector<uint32_t> Status;
std::vector<uint32_t> Parent;
Status.resize(Clusters.size(), 0);
Parent.resize(Clusters.size(), 0);
constexpr uint32_t STACKED = 1;
constexpr uint32_t VISITED = 2;
Status[0] = STACKED;
Stack.push(0);
while (!Stack.empty()) {
uint32_t I = Stack.top();
if (!(Status[I] & VISITED)) {
Status[I] |= VISITED;
// Order successors by weight
auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
return ClusterEdges[I][A] > ClusterEdges[I][B];
};
std::priority_queue<uint32_t, std::vector<uint32_t>,
decltype(ClusterComp)> SuccQueue(ClusterComp);
for (auto &Target: ClusterEdges[I]) {
if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
!Clusters[Target.first].empty()) {
Parent[Target.first] = I;
Status[Target.first] = STACKED;
SuccQueue.push(Target.first);
}
}
while (!SuccQueue.empty()) {
Stack.push(SuccQueue.top());
SuccQueue.pop();
}
continue;
auto &ClusterA = Clusters[I];
auto &ClusterB = Clusters[J];
if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) {
// Case 2b: BBSrc is at the end of a cluster and BBDst is at the start,
// allowing us to merge two clusters
for (auto BB : ClusterB)
BBToClusterMap[BB] = I;
ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
ClusterB.clear();
} else {
// Case 2c: Both BBSrc and BBDst are allocated in positions we cannot
// merge them, so we ignore this edge.
}
continue;
// Already visited this node
Stack.pop();
Order.push_back(I);
}
std::reverse(Order.begin(), Order.end());
// Put unreachable clusters at the end
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
if (!(Status[I] & VISITED) && !Clusters[I].empty())
Order.push_back(I);
// Case 3: BBSrc is already allocated in a cluster
if ((I = BBToClusterMap[BBSrc]) != -1) {
auto &Cluster = Clusters[I];
if (Cluster.back() == BBSrc) {
// Case 3a: BBSrc is allocated at the end of this cluster. We put
// BBSrc and BBDst together.
Cluster.push_back(BBDst);
BBToClusterMap[BBDst] = I;
} else {
// Case 3b: We cannot put BBSrc and BBDst in consecutive positions,
// so we ignore this edge.
}
continue;
}
// Sort nodes with equal precedence
auto Beg = Order.begin();
// Don't reorder the first cluster, which contains the function entry point
++Beg;
std::stable_sort(Beg, Order.end(),
[&AvgFreq, &Parent](uint32_t A, uint32_t B) {
uint32_t P = Parent[A];
while (Parent[P] != 0) {
if (Parent[P] == B)
return false;
P = Parent[P];
}
P = Parent[B];
while (Parent[P] != 0) {
if (Parent[P] == A)
return true;
P = Parent[P];
}
return AvgFreq[A] > AvgFreq[B];
});
break;
}
case HP_CACHE_UTILIZATION: {
// Order clusters based on average instruction execution frequency
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
if (!Clusters[I].empty())
Order.push_back(I);
auto Beg = Order.begin();
// Don't reorder the first cluster, which contains the function entry point
++Beg;
std::stable_sort(Beg, Order.end(), [&AvgFreq](uint32_t A, uint32_t B) {
return AvgFreq[A] > AvgFreq[B];
});
// Case 4: BBSrc is not in a cluster, but BBDst is
if ((I = BBToClusterMap[BBDst]) != -1) {
auto &Cluster = Clusters[I];
if (Cluster.front() == BBDst) {
// Case 4a: BBDst is allocated at the start of this cluster. We put
// BBSrc and BBDst together.
Cluster.insert(Cluster.begin(), BBSrc);
BBToClusterMap[BBSrc] = I;
} else {
// Case 4b: We cannot put BBSrc and BBDst in consecutive positions,
// so we ignore this edge.
}
continue;
}
// Case 5: Both BBSrc and BBDst are unallocated, so we create a new cluster
// with them
I = Clusters.size();
Clusters.emplace_back();
auto &Cluster = Clusters.back();
Cluster.push_back(BBSrc);
Cluster.push_back(BBDst);
BBToClusterMap[BBSrc] = I;
BBToClusterMap[BBDst] = I;
break;
}
}
// Create an extra cluster for unvisited basic blocks
std::vector<BinaryBasicBlock *> Unvisited;
for (auto BB : BasicBlocksLayout) {
if (BBToClusterMap[BB] == -1) {
Unvisited.push_back(BB);
}
}
// Define final function layout based on clusters
BasicBlocksLayout.clear();
for (auto &Cluster : Clusters) {
for (auto I : Order) {
auto &Cluster = Clusters[I];
BasicBlocksLayout.insert(BasicBlocksLayout.end(), Cluster.begin(),
Cluster.end());
}
// Finalize layout with BBs that weren't assigned to any cluster, preserving
// their relative order
BasicBlocksLayout.insert(BasicBlocksLayout.end(), Unvisited.begin(),
Unvisited.end());
fixBranches();
}

View file

@ -51,6 +51,19 @@ public:
Assembled, /// Function has been assembled in memory
};
// Choose which strategy the block layout heuristic should prioritize when
// facing conflicting goals. The value is passed to optimizeLayout() and
// selects how the clusters it builds are ordered in the final layout.
enum HeuristicPriority : char {
// HP_NONE applies no ordering criterion: non-empty clusters are emitted in
// index order, with no additional sorting.
HP_NONE = 0,
// HP_BRANCH_PREDICTOR is an implementation of what is suggested in Pettis'
// paper (PLDI '90) about block reordering, trying to minimize branch
// mispredictions. Clusters are ordered with a topological sort.
HP_BRANCH_PREDICTOR,
// HP_CACHE_UTILIZATION piggybacks on the idea from the Ispike paper (CGO '04)
// that suggests putting frequently executed chains first in the layout.
// Clusters after the first one are sorted by descending average execution
// frequency.
HP_CACHE_UTILIZATION,
};
static constexpr uint64_t COUNT_NO_PROFILE =
std::numeric_limits<uint64_t>::max();
// Function size, in number of BBs, above which we fallback to a heuristic
@ -202,7 +215,7 @@ public:
/// Perform optimal code layout based on edge frequencies making necessary
/// adjustments to instructions at the end of basic blocks.
void optimizeLayout();
void optimizeLayout(HeuristicPriority Priority);
/// Dynamic programming implementation for the TSP, applied to BB layout. Find
/// the optimal way to maximize weight during a path traversing all BBs. In

View file

@ -94,10 +94,11 @@ EliminateUnreachable("eliminate-unreachable",
cl::desc("eliminate unreachable code"),
cl::Optional);
static cl::opt<bool>
ReorderBlocks("reorder-blocks",
cl::desc("redo basic block layout based on profiling data"),
cl::Optional);
static cl::opt<std::string> ReorderBlocks(
"reorder-blocks",
cl::desc("redo basic block layout based on profiling data with a specific "
"priority (none, branch-predictor or cache)"),
cl::value_desc("priority"), cl::init("disable"));
static cl::opt<bool>
DumpData("dump-data", cl::desc("dump parsed flo data and exit (debugging)"),
@ -518,6 +519,15 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) {
//
// FIXME: use real optimization passes.
bool NagUser = true;
if (opts::ReorderBlocks != "" &&
opts::ReorderBlocks != "disable" &&
opts::ReorderBlocks != "none" &&
opts::ReorderBlocks != "branch-predictor" &&
opts::ReorderBlocks != "cache") {
errs() << ToolName << ": Unrecognized block reordering priority \""
<< opts::ReorderBlocks << "\".\n";
exit(1);
}
for (auto &BFI : BinaryFunctions) {
auto &Function = BFI.second;
@ -566,9 +576,14 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) {
Function.print(errs(), "after unreachable code elimination");
}
if (opts::ReorderBlocks) {
Function.optimizeLayout();
if (opts::ReorderBlocks != "disable") {
if (opts::ReorderBlocks == "branch-predictor") {
BFI.second.optimizeLayout(BinaryFunction::HP_BRANCH_PREDICTOR);
} else if (opts::ReorderBlocks == "cache") {
BFI.second.optimizeLayout(BinaryFunction::HP_CACHE_UTILIZATION);
} else {
BFI.second.optimizeLayout(BinaryFunction::HP_NONE);
}
if (opts::PrintAll || opts::PrintReordered)
Function.print(errs(), "after reordering blocks");
}