Implement two cluster layout heuristics

Summary: Pettis' paper on block layout (PLDI'90) suggests we should order clusters (or chains, in the paper's terminology) using a specific criterion. This patch implements two distinct ideas for cluster layout that can be selected through different values of the reorder-blocks command-line option. The first one reflects Pettis' ideas on minimizing branch mispredictions and the second one, described in the Ispike paper (CGO'04), is targeted at reducing I-cache misses. (cherry picked from FBD2588693)
2015-10-23 09:38:26 -07:00 · 2015-10-23 09:38:26 -07:00 · 13a520ab30
parent 2539539bde
commit 13a520ab30
4 changed files with 193 additions and 91 deletions
--- a/bolt/BinaryBasicBlock.h
+++ b/bolt/BinaryBasicBlock.h
@ -94,6 +94,7 @@ public:
  typedef std::reverse_iterator<iterator>                      reverse_iterator;

  bool         empty()            const { return Instructions.empty(); }
+  unsigned     size()     const { return (unsigned)Instructions.size(); }
  MCInst       &front()                 { return Instructions.front();  }
  MCInst       &back()                  { return Instructions.back();   }
  const MCInst &front()           const { return Instructions.front();  }
--- a/bolt/BinaryFunction.cpp
+++ b/bolt/BinaryFunction.cpp
@ -568,7 +568,7 @@ void BinaryFunction::inferFallThroughCounts() {
  return;
 }

-void BinaryFunction::optimizeLayout() {
+void BinaryFunction::optimizeLayout(HeuristicPriority Priority) {
  // Bail if no profiling information or if empty
  if (getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE ||
      BasicBlocksLayout.empty()) {
@ -598,9 +598,18 @@ void BinaryFunction::optimizeLayout() {
  std::vector<ClusterTy> Clusters;
  BBToClusterMapTy BBToClusterMap;

-  // Populating priority queue with all edges
+  // Encode relative weights between two clusters
+  std::vector<std::map<uint32_t, uint64_t>> ClusterEdges;
+  ClusterEdges.resize(BasicBlocksLayout.size());
+
  for (auto BB : BasicBlocksLayout) {
-    BBToClusterMap[BB] = -1; // Mark as unmapped
+    // Create a cluster for this BB
+    uint32_t I = Clusters.size();
+    Clusters.emplace_back();
+    auto &Cluster = Clusters.back();
+    Cluster.push_back(BB);
+    BBToClusterMap[BB] = I;
+    // Populate priority queue with edges
    auto BI = BB->BranchInfo.begin();
    for (auto &I : BB->successors()) {
      if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
@ -610,13 +619,6 @@ void BinaryFunction::optimizeLayout() {
    }
  }

-  // Start a cluster with the entry point
-  BinaryBasicBlock *Entry = *BasicBlocksLayout.begin();
-  Clusters.emplace_back();
-  auto &EntryCluster = Clusters.back();
-  EntryCluster.push_back(Entry);
-  BBToClusterMap[Entry] = 0;
-
  // Grow clusters in a greedy fashion
  while (!Queue.empty()) {
    auto elmt = Queue.top();
@ -624,95 +626,166 @@ void BinaryFunction::optimizeLayout() {

    BinaryBasicBlock *BBSrc = elmt.first;
    BinaryBasicBlock *BBDst = elmt.second;
-    int I = 0, J = 0;

    // Case 1: BBSrc and BBDst are the same. Ignore this edge
-    if (BBSrc == BBDst || BBDst == Entry)
+    if (BBSrc == BBDst || BBDst == *BasicBlocksLayout.begin())
      continue;

-    // Case 2: Both BBSrc and BBDst are already allocated
-    if ((I = BBToClusterMap[BBSrc]) != -1 &&
-        (J = BBToClusterMap[BBDst]) != -1) {
-      // Case 2a: If they are already allocated at the same cluster, ignore
-      if (I == J)
+    int I = BBToClusterMap[BBSrc];
+    int J = BBToClusterMap[BBDst];
+
+    // Case 2: If they are already allocated at the same cluster, just increase
+    // the weight of this cluster
+    if (I == J) {
+      ClusterEdges[I][I] += Weight[elmt];
+      continue;
+    }
+
+    auto &ClusterA = Clusters[I];
+    auto &ClusterB = Clusters[J];
+    if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) {
+      // Case 3: BBSrc is at the end of a cluster and BBDst is at the start,
+      // allowing us to merge two clusters
+      for (auto BB : ClusterB)
+        BBToClusterMap[BB] = I;
+      ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
+      ClusterB.clear();
+      // Iterate through all inter-cluster edges and transfer edges targeting
+      // cluster B to cluster A.
+      // It is bad to have to iterate through all edges when we could have a list
+      // of predecessors for cluster B. However, it's not clear if it is worth
+      // the added code complexity to create a data structure for clusters that
+      // maintains a list of predecessors. Maybe change this if it becomes a
+      // deal breaker.
+      for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
+        ClusterEdges[K][I] += ClusterEdges[K][J];
+    } else {
+      // Case 4: Both BBSrc and BBDst are allocated in positions we cannot
+      // merge them. Annotate the weight of this edge in the weight between
+      // clusters to help us decide ordering between these clusters.
+      ClusterEdges[I][J] += Weight[elmt];
+    }
+  }
+
+  std::vector<uint32_t> Order;  // Cluster layout order
+
+  // Here we have 3 conflicting goals as to how to lay out clusters. If we want
+  // to minimize jump offsets, we should put clusters with heavy inter-cluster
+  // dependence as close as possible. If we want to maximize the probability
+  // that all inter-cluster edges are predicted as not-taken, we should enforce
+  // a topological order to make targets appear after sources, creating forward
+  // branches. If we want to separate hot from cold blocks to maximize the
+  // probability that infrequently executed code doesn't pollute the cache, we
+  // should put clusters in descending order of hotness.
+  std::vector<double> AvgFreq;
+  AvgFreq.resize(Clusters.size(), 0.0);
+  for (uint32_t I = 1, E = Clusters.size(); I < E; ++I) {
+    double Freq = 0.0;
+    for (auto BB : Clusters[I]) {
+      if (!BB->empty())
+        Freq += BB->getExecutionCount() / BB->size();
+    }
+    AvgFreq[I] = Freq;
+  }
+
+  switch(Priority) {
+  case HP_NONE: {
+    for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
+      if (!Clusters[I].empty())
+        Order.push_back(I);
+    break;
+  }
+  case HP_BRANCH_PREDICTOR: {
+    // Do a topological sort for clusters, prioritizing frequently-executed BBs
+    // during the traversal.
+    std::stack<uint32_t> Stack;
+    std::vector<uint32_t> Status;
+    std::vector<uint32_t> Parent;
+    Status.resize(Clusters.size(), 0);
+    Parent.resize(Clusters.size(), 0);
+    constexpr uint32_t STACKED = 1;
+    constexpr uint32_t VISITED = 2;
+    Status[0] = STACKED;
+    Stack.push(0);
+    while (!Stack.empty()) {
+      uint32_t I = Stack.top();
+      if (!(Status[I] & VISITED)) {
+        Status[I] |= VISITED;
+        // Order successors by weight
+        auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
+          return ClusterEdges[I][A] > ClusterEdges[I][B];
+        };
+        std::priority_queue<uint32_t, std::vector<uint32_t>,
+                            decltype(ClusterComp)> SuccQueue(ClusterComp);
+        for (auto &Target: ClusterEdges[I]) {
+          if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
+              !Clusters[Target.first].empty()) {
+            Parent[Target.first] = I;
+            Status[Target.first] = STACKED;
+            SuccQueue.push(Target.first);
+          }
+        }
+        while (!SuccQueue.empty()) {
+          Stack.push(SuccQueue.top());
+          SuccQueue.pop();
+        }
        continue;
-      auto &ClusterA = Clusters[I];
-      auto &ClusterB = Clusters[J];
-      if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) {
-        // Case 2b: BBSrc is at the end of a cluster and BBDst is at the start,
-        // allowing us to merge two clusters
-        for (auto BB : ClusterB)
-          BBToClusterMap[BB] = I;
-        ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
-        ClusterB.clear();
-      } else {
-        // Case 2c: Both BBSrc and BBDst are allocated in positions we cannot
-        // merge them, so we ignore this edge.
      }
-      continue;
+      // Already visited this node
+      Stack.pop();
+      Order.push_back(I);
    }
+    std::reverse(Order.begin(), Order.end());
+    // Put unreachable clusters at the end
+    for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
+      if (!(Status[I] & VISITED) && !Clusters[I].empty())
+        Order.push_back(I);

-    // Case 3: BBSrc is already allocated in a cluster
-    if ((I = BBToClusterMap[BBSrc]) != -1) {
-      auto &Cluster = Clusters[I];
-      if (Cluster.back() == BBSrc) {
-        // Case 3a: BBSrc is allocated at the end of this cluster. We put
-        // BBSrc and BBDst together.
-        Cluster.push_back(BBDst);
-        BBToClusterMap[BBDst] = I;
-      } else {
-        // Case 3b: We cannot put BBSrc and BBDst in consecutive positions,
-        // so we ignore this edge.
-      }
-      continue;
-    }
+    // Sort nodes with equal precedence
+    auto Beg = Order.begin();
+    // Don't reorder the first cluster, which contains the function entry point
+    ++Beg;
+    std::stable_sort(Beg, Order.end(),
+                     [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
+                       uint32_t P = Parent[A];
+                       while (Parent[P] != 0) {
+                         if (Parent[P] == B)
+                           return false;
+                         P = Parent[P];
+                       }
+                       P = Parent[B];
+                       while (Parent[P] != 0) {
+                         if (Parent[P] == A)
+                           return true;
+                         P = Parent[P];
+                       }
+                       return AvgFreq[A] > AvgFreq[B];
+                     });
+    break;
+  }
+  case HP_CACHE_UTILIZATION: {
+    // Order clusters based on average instruction execution frequency
+    for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
+      if (!Clusters[I].empty())
+        Order.push_back(I);
+    auto Beg = Order.begin();
+    // Don't reorder the first cluster, which contains the function entry point
+    ++Beg;
+    std::stable_sort(Beg, Order.end(), [&AvgFreq](uint32_t A, uint32_t B) {
+      return AvgFreq[A] > AvgFreq[B];
+    });

-    // Case 4: BBSrc is not in a cluster, but BBDst is
-    if ((I = BBToClusterMap[BBDst]) != -1) {
-      auto &Cluster = Clusters[I];
-      if (Cluster.front() == BBDst) {
-        // Case 4a: BBDst is allocated at the start of this cluster. We put
-        // BBSrc and BBDst together.
-        Cluster.insert(Cluster.begin(), BBSrc);
-        BBToClusterMap[BBSrc] = I;
-      } else {
-        // Case 4b: We cannot put BBSrc and BBDst in consecutive positions,
-        // so we ignore this edge.
-      }
-      continue;
-    }
-
-    // Case 5: Both BBSrc and BBDst are unallocated, so we create a new cluster
-    // with them
-    I = Clusters.size();
-    Clusters.emplace_back();
-    auto &Cluster = Clusters.back();
-    Cluster.push_back(BBSrc);
-    Cluster.push_back(BBDst);
-    BBToClusterMap[BBSrc] = I;
-    BBToClusterMap[BBDst] = I;
+    break;
+  }
  }

-  // Create an extra cluster for unvisited basic blocks
-  std::vector<BinaryBasicBlock *> Unvisited;
-  for (auto BB : BasicBlocksLayout) {
-    if (BBToClusterMap[BB] == -1) {
-      Unvisited.push_back(BB);
-    }
-  }
-
-  // Define final function layout based on clusters
  BasicBlocksLayout.clear();
-  for (auto &Cluster : Clusters) {
+  for (auto I : Order) {
+    auto &Cluster = Clusters[I];
    BasicBlocksLayout.insert(BasicBlocksLayout.end(), Cluster.begin(),
                             Cluster.end());
  }

-  // Finalize layout with BBs that weren't assigned to any cluster, preserving
-  // their relative order
-  BasicBlocksLayout.insert(BasicBlocksLayout.end(), Unvisited.begin(),
-                           Unvisited.end());
-
  fixBranches();
 }

--- a/bolt/BinaryFunction.h
+++ b/bolt/BinaryFunction.h
@ -51,6 +51,19 @@ public:
    Assembled,        /// Function has been assembled in memory
  };

+  // Choose which strategy the block layout heuristic should prioritize when
+  // facing conflicting goals.
+  enum HeuristicPriority : char {
+    HP_NONE = 0,
+    // HP_BRANCH_PREDICTOR is an implementation of what is suggested in Pettis'
+    // paper (PLDI '90) about block reordering, trying to minimize branch
+    // mispredictions.
+    HP_BRANCH_PREDICTOR,
+    // HP_CACHE_UTILIZATION piggybacks on the idea from the Ispike paper (CGO '04)
+    // that suggests putting frequently executed chains first in the layout.
+    HP_CACHE_UTILIZATION,
+  };
+
  static constexpr uint64_t COUNT_NO_PROFILE =
    std::numeric_limits<uint64_t>::max();
  // Function size, in number of BBs, above which we fallback to a heuristic
@ -202,7 +215,7 @@ public:

  /// Perform optimal code layout based on edge frequencies making necessary
  /// adjustments to instructions at the end of basic blocks.
-  void optimizeLayout();
+  void optimizeLayout(HeuristicPriority Priority);

  /// Dynamic programming implementation for the TSP, applied to BB layout. Find
  /// the optimal way to maximize weight during a path traversing all BBs. In
--- a/bolt/llvm-flo.cpp
+++ b/bolt/llvm-flo.cpp
@ -94,10 +94,11 @@ EliminateUnreachable("eliminate-unreachable",
                     cl::desc("eliminate unreachable code"),
                     cl::Optional);

-static cl::opt<bool>
-ReorderBlocks("reorder-blocks",
-              cl::desc("redo basic block layout based on profiling data"),
-              cl::Optional);
+static cl::opt<std::string> ReorderBlocks(
+    "reorder-blocks",
+    cl::desc("redo basic block layout based on profiling data with a specific "
+             "priority (none, branch-predictor or cache)"),
+    cl::value_desc("priority"), cl::init("disable"));

 static cl::opt<bool>
 DumpData("dump-data", cl::desc("dump parsed flo data and exit (debugging)"),
@ -518,6 +519,15 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) {
  //
  // FIXME: use real optimization passes.
  bool NagUser = true;
+  if (opts::ReorderBlocks != "" &&
+      opts::ReorderBlocks != "disable" &&
+      opts::ReorderBlocks != "none" &&
+      opts::ReorderBlocks != "branch-predictor" &&
+      opts::ReorderBlocks != "cache") {
+    errs() << ToolName << ": Unrecognized block reordering priority \""
+           << opts::ReorderBlocks << "\".\n";
+    exit(1);
+  }
  for (auto &BFI : BinaryFunctions) {
    auto &Function = BFI.second;

@ -566,9 +576,14 @@ static void OptimizeFile(ELFObjectFileBase *File, const DataReader &DR) {
        Function.print(errs(), "after unreachable code elimination");
    }

-    if (opts::ReorderBlocks) {
-      Function.optimizeLayout();
-
+    if (opts::ReorderBlocks != "disable") {
+      if (opts::ReorderBlocks == "branch-predictor") {
+        BFI.second.optimizeLayout(BinaryFunction::HP_BRANCH_PREDICTOR);
+      } else if (opts::ReorderBlocks == "cache") {
+        BFI.second.optimizeLayout(BinaryFunction::HP_CACHE_UTILIZATION);
+      } else {
+        BFI.second.optimizeLayout(BinaryFunction::HP_NONE);
+      }
      if (opts::PrintAll || opts::PrintReordered)
        Function.print(errs(), "after reordering blocks");
    }