a10fb73ab3
Summary: We only need ClusterEdges in reordering algorithm optimized for branches and the computation is quite resource-hungry, thus it makes sense to only do it when needed. Some refactoring too. (cherry picked from FBD3721107)
261 lines
9 KiB
C++
261 lines
9 KiB
C++
//===- ReorderAlgorithm.h - Interface for basic block reordering algorithms ===//
|
|
//
|
|
// The LLVM Compiler Infrastructure
|
|
//
|
|
// This file is distributed under the University of Illinois Open Source
|
|
// License. See LICENSE.TXT for details.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// Interface to different basic block reordering algorithms.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H
|
|
#define LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H
|
|
|
|
#include "BinaryFunction.h"
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
#include <unordered_map>
|
|
#include <memory>
|
|
#include <vector>
|
|
|
|
|
|
namespace llvm {
|
|
|
|
class raw_ostream;
|
|
|
|
|
|
namespace bolt {
|
|
|
|
class BinaryBasicBlock;
|
|
class BinaryFunction;
|
|
|
|
/// Objects of this class implement various basic block clustering algorithms.
|
|
/// Basic block clusters are chains of basic blocks that should be laid out
|
|
/// in this order to maximize performace. These algorithms group basic blocks
|
|
/// into clusters using execution profile data and various heuristics.
|
|
class ClusterAlgorithm {
|
|
public:
|
|
using ClusterTy = std::vector<BinaryBasicBlock *>;
|
|
std::vector<ClusterTy> Clusters;
|
|
std::vector<std::unordered_map<uint32_t, uint64_t>> ClusterEdges;
|
|
std::vector<double> AvgFreq;
|
|
|
|
/// Group the basic blocks in the given function into clusters stored in the
|
|
/// Clusters vector. Also encode relative weights between two clusters in
|
|
/// the ClusterEdges vector if requested. This vector is indexed by
|
|
/// the clusters indices in the Clusters vector.
|
|
virtual void clusterBasicBlocks(const BinaryFunction &BF,
|
|
bool ComputeEdges = false) = 0;
|
|
|
|
/// Compute for each cluster its averagae execution frequency, that is
|
|
/// the sum of average frequencies of its blocks (execution count / # instrs).
|
|
/// The average frequencies are stored in the AvgFreq vector, index by the
|
|
/// cluster indices in the Clusters vector.
|
|
void computeClusterAverageFrequency();
|
|
|
|
/// Clear clusters and related info.
|
|
virtual void reset();
|
|
|
|
void printClusters() const;
|
|
|
|
virtual ~ClusterAlgorithm() {}
|
|
};
|
|
|
|
|
|
/// Base class for a greedy clustering algorithm that selects edges in order
|
|
/// based on some heuristic and uses them to join basic blocks into clusters.
|
|
class GreedyClusterAlgorithm : public ClusterAlgorithm {
|
|
protected:
|
|
// Represents an edge between two basic blocks, with source, destination, and
|
|
// profile count.
|
|
struct EdgeTy {
|
|
const BinaryBasicBlock *Src;
|
|
const BinaryBasicBlock *Dst;
|
|
uint64_t Count;
|
|
|
|
EdgeTy(const BinaryBasicBlock *Src, const BinaryBasicBlock *Dst,
|
|
uint64_t Count) :
|
|
Src(Src), Dst(Dst), Count(Count) {}
|
|
|
|
void print(raw_ostream &OS) const;
|
|
};
|
|
|
|
struct EdgeHash {
|
|
size_t operator() (const EdgeTy &E) const;
|
|
};
|
|
|
|
struct EdgeEqual {
|
|
bool operator() (const EdgeTy &A, const EdgeTy &B) const;
|
|
};
|
|
|
|
// Virtual methods that allow custom specialization of the heuristic used by
|
|
// the algorithm to select edges.
|
|
virtual void initQueue(
|
|
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) = 0;
|
|
virtual void adjustQueue(
|
|
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) = 0;
|
|
virtual bool areClustersCompatible(
|
|
const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const = 0;
|
|
|
|
// Map from basic block to owning cluster index.
|
|
using BBToClusterMapTy = std::unordered_map<const BinaryBasicBlock *,
|
|
unsigned>;
|
|
BBToClusterMapTy BBToClusterMap;
|
|
|
|
public:
|
|
void clusterBasicBlocks(const BinaryFunction &BF,
|
|
bool ComputeEdges = false) override;
|
|
void reset() override;
|
|
};
|
|
|
|
|
|
/// This clustering algorithm is based on a greedy heuristic suggested by
|
|
/// Pettis and Hansen (PLDI '90).
|
|
class PHGreedyClusterAlgorithm : public GreedyClusterAlgorithm {
|
|
protected:
|
|
void initQueue(
|
|
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
|
|
void adjustQueue(
|
|
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
|
|
bool areClustersCompatible(
|
|
const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const
|
|
override;
|
|
};
|
|
|
|
|
|
/// This clustering algorithm is based on a greedy heuristic that is a
|
|
/// modification of the heuristic suggested by Pettis (PLDI '90). It is
|
|
/// geared towards minimizing branches.
|
|
class MinBranchGreedyClusterAlgorithm : public GreedyClusterAlgorithm {
|
|
private:
|
|
// Map from an edge to its weight which is used by the algorithm to sort the
|
|
// edges.
|
|
std::unordered_map<EdgeTy, int64_t, EdgeHash, EdgeEqual> Weight;
|
|
|
|
// The weight of an edge is calculated as the win in branches if we choose
|
|
// to layout this edge as a fall-through. For example, consider the edges
|
|
// A -> B with execution count 500,
|
|
// A -> C with execution count 100, and
|
|
// D -> B with execution count 150
|
|
// wher B, C are the only successors of A and A, D are thr only predessecors
|
|
// of B. Then if we choose to layout edge A -> B as a fallthrough, the win in
|
|
// branches would be 500 - 100 - 150 = 250. That is the weight of edge A->B.
|
|
int64_t calculateWeight(const EdgeTy &E, const BinaryFunction &BF) const;
|
|
|
|
protected:
|
|
void initQueue(
|
|
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
|
|
void adjustQueue(
|
|
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) override;
|
|
bool areClustersCompatible(
|
|
const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const
|
|
override;
|
|
|
|
public:
|
|
void reset() override;
|
|
};
|
|
|
|
|
|
/// Objects of this class implement various basic block reordering alogrithms.
|
|
/// Most of these algorithms depend on a clustering alogrithm.
|
|
/// Here we have 3 conflicting goals as to how to layout clusters. If we want
|
|
/// to minimize jump offsets, we should put clusters with heavy inter-cluster
|
|
/// dependence as close as possible. If we want to maximize the probability
|
|
/// that all inter-cluster edges are predicted as not-taken, we should enforce
|
|
/// a topological order to make targets appear after sources, creating forward
|
|
/// branches. If we want to separate hot from cold blocks to maximize the
|
|
/// probability that unfrequently executed code doesn't pollute the cache, we
|
|
/// should put clusters in descending order of hotness.
|
|
class ReorderAlgorithm {
|
|
protected:
|
|
std::unique_ptr<ClusterAlgorithm> CAlgo;
|
|
|
|
public:
|
|
ReorderAlgorithm() { }
|
|
explicit ReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
|
|
CAlgo(std::move(CAlgo)) { }
|
|
|
|
using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
|
|
|
|
/// Reorder the basic blocks of the given function and store the new order in
|
|
/// the new Clusters vector.
|
|
virtual void reorderBasicBlocks(
|
|
const BinaryFunction &BF, BasicBlockOrder &Order) const = 0;
|
|
|
|
void setClusterAlgorithm(ClusterAlgorithm *CAlgo) {
|
|
this->CAlgo.reset(CAlgo);
|
|
}
|
|
|
|
virtual ~ReorderAlgorithm() { }
|
|
};
|
|
|
|
|
|
/// Dynamic programming implementation for the TSP, applied to BB layout. Find
|
|
/// the optimal way to maximize weight during a path traversing all BBs. In
|
|
/// this way, we will convert the hottest branches into fall-throughs.
|
|
///
|
|
/// Uses exponential amount of memory on the number of basic blocks and should
|
|
/// only be used for small functions.
|
|
class OptimalReorderAlgorithm : public ReorderAlgorithm {
|
|
public:
|
|
void reorderBasicBlocks(
|
|
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
|
};
|
|
|
|
|
|
/// Simple algorithm that groups basic blocks into clusters and then
|
|
/// lays them out cluster after cluster.
|
|
class OptimizeReorderAlgorithm : public ReorderAlgorithm {
|
|
public:
|
|
explicit OptimizeReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
|
|
ReorderAlgorithm(std::move(CAlgo)) { }
|
|
|
|
void reorderBasicBlocks(
|
|
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
|
};
|
|
|
|
|
|
/// This reorder algorithm tries to ensure that all inter-cluster edges are
|
|
/// predicted as not-taken, by enforcing a topological order to make
|
|
/// targets appear after sources, creating forward branches.
|
|
class OptimizeBranchReorderAlgorithm : public ReorderAlgorithm {
|
|
public:
|
|
explicit OptimizeBranchReorderAlgorithm(
|
|
std::unique_ptr<ClusterAlgorithm> CAlgo) :
|
|
ReorderAlgorithm(std::move(CAlgo)) { }
|
|
|
|
void reorderBasicBlocks(
|
|
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
|
};
|
|
|
|
|
|
/// This reorder tries to separate hot from cold blocks to maximize the
|
|
/// probability that unfrequently executed code doesn't pollute the cache, by
|
|
/// putting clusters in descending order of hotness.
|
|
class OptimizeCacheReorderAlgorithm : public ReorderAlgorithm {
|
|
public:
|
|
explicit OptimizeCacheReorderAlgorithm(
|
|
std::unique_ptr<ClusterAlgorithm> CAlgo) :
|
|
ReorderAlgorithm(std::move(CAlgo)) { }
|
|
|
|
void reorderBasicBlocks(
|
|
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
|
};
|
|
|
|
|
|
/// Toy example that simply reverses the original basic block order.
|
|
class ReverseReorderAlgorithm : public ReorderAlgorithm {
|
|
public:
|
|
void reorderBasicBlocks(
|
|
const BinaryFunction &BF, BasicBlockOrder &Order) const override;
|
|
};
|
|
|
|
|
|
} // namespace bolt
|
|
} // namespace llvm
|
|
|
|
#endif
|
|
|