[lld-macho] Port CallGraphSort from COFF/ELF

Depends on D112160

This adds the new options `--call-graph-profile-sort` (default),
`--no-call-graph-profile-sort` and `--print-symbol-order=`. If call graph
profile sorting is enabled, reads `__LLVM,__cg_profile` sections from object
files and uses the resulting graph to put callees and callers close to each
other in the final binary via the C3 clustering heuristic.

Differential Revision: https://reviews.llvm.org/D112164
This commit is contained in:
Leonard Grey 2022-01-12 10:47:04 -05:00 committed by Nico Weber
parent 497a4b26c4
commit 6db04b97e6
14 changed files with 481 additions and 1 deletions

View file

@ -10,6 +10,7 @@ add_lld_library(lldMachO
Arch/ARM64Common.cpp
Arch/ARM64_32.cpp
Arch/X86_64.cpp
CallGraphSort.cpp
ConcatOutputSection.cpp
Driver.cpp
DriverUtils.cpp

252
lld/MachO/CallGraphSort.cpp Normal file
View file

@ -0,0 +1,252 @@
//===- CallGraphSort.cpp --------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// This is based on the ELF port, see ELF/CallGraphSort.cpp for the details
/// about the algorithm.
///
//===----------------------------------------------------------------------===//
#include "CallGraphSort.h"
#include "Config.h"
#include "InputFiles.h"
#include "Symbols.h"
#include "Target.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/raw_ostream.h"
#include <numeric>
using namespace llvm;
using namespace lld;
using namespace lld::macho;
namespace {
struct Edge {
int from;
uint64_t weight;
};
struct Cluster {
Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {}
double getDensity() const {
if (size == 0)
return 0;
return double(weight) / double(size);
}
int next;
int prev;
uint64_t size;
uint64_t weight = 0;
uint64_t initialWeight = 0;
Edge bestPred = {-1, 0};
};
class CallGraphSort {
public:
CallGraphSort();
DenseMap<const InputSection *, size_t> run();
private:
std::vector<Cluster> clusters;
std::vector<const InputSection *> sections;
};
// Maximum amount the combined cluster density can be worse than the original
// cluster to consider merging.
constexpr int MAX_DENSITY_DEGRADATION = 8;
} // end anonymous namespace
using SectionPair = std::pair<const InputSection *, const InputSection *>;
// Take the edge list in config->callGraphProfile, resolve symbol names to
// Symbols, and generate a graph between InputSections with the provided
// weights.
CallGraphSort::CallGraphSort() {
MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile;
DenseMap<const InputSection *, int> secToCluster;
auto getOrCreateCluster = [&](const InputSection *isec) -> int {
auto res = secToCluster.try_emplace(isec, clusters.size());
if (res.second) {
sections.push_back(isec);
clusters.emplace_back(clusters.size(), isec->getSize());
}
return res.first->second;
};
// Create the graph
for (std::pair<SectionPair, uint64_t> &c : profile) {
const auto fromSec = c.first.first->canonical();
const auto toSec = c.first.second->canonical();
uint64_t weight = c.second;
// Ignore edges between input sections belonging to different output
// sections. This is done because otherwise we would end up with clusters
// containing input sections that can't actually be placed adjacently in the
// output. This messes with the cluster size and density calculations. We
// would also end up moving input sections in other output sections without
// moving them closer to what calls them.
if (fromSec->parent != toSec->parent)
continue;
int from = getOrCreateCluster(fromSec);
int to = getOrCreateCluster(toSec);
clusters[to].weight += weight;
if (from == to)
continue;
// Remember the best edge.
Cluster &toC = clusters[to];
if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) {
toC.bestPred.from = from;
toC.bestPred.weight = weight;
}
}
for (Cluster &c : clusters)
c.initialWeight = c.weight;
}
// It's bad to merge clusters which would degrade the density too much.
static bool isNewDensityBad(Cluster &a, Cluster &b) {
double newDensity = double(a.weight + b.weight) / double(a.size + b.size);
return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION;
}
// Find the leader of V's belonged cluster (represented as an equivalence
// class). We apply union-find path-halving technique (simple to implement) in
// the meantime as it decreases depths and the time complexity.
static int getLeader(std::vector<int> &leaders, int v) {
while (leaders[v] != v) {
leaders[v] = leaders[leaders[v]];
v = leaders[v];
}
return v;
}
static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx,
Cluster &from, int fromIdx) {
int tail1 = into.prev, tail2 = from.prev;
into.prev = tail2;
cs[tail2].next = intoIdx;
from.prev = tail1;
cs[tail1].next = fromIdx;
into.size += from.size;
into.weight += from.weight;
from.size = 0;
from.weight = 0;
}
// Group InputSections into clusters using the Call-Chain Clustering heuristic
// then sort the clusters by density.
DenseMap<const InputSection *, size_t> CallGraphSort::run() {
const uint64_t maxClusterSize = target->getPageSize();
// Cluster indices sorted by density.
std::vector<int> sorted(clusters.size());
// For union-find.
std::vector<int> leaders(clusters.size());
std::iota(leaders.begin(), leaders.end(), 0);
std::iota(sorted.begin(), sorted.end(), 0);
llvm::stable_sort(sorted, [&](int a, int b) {
return clusters[a].getDensity() > clusters[b].getDensity();
});
for (int l : sorted) {
// The cluster index is the same as the index of its leader here because
// clusters[L] has not been merged into another cluster yet.
Cluster &c = clusters[l];
// Don't consider merging if the edge is unlikely.
if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight)
continue;
int predL = getLeader(leaders, c.bestPred.from);
// Already in the same cluster.
if (l == predL)
continue;
Cluster *predC = &clusters[predL];
if (c.size + predC->size > maxClusterSize)
continue;
if (isNewDensityBad(*predC, c))
continue;
leaders[l] = predL;
mergeClusters(clusters, *predC, predL, c, l);
}
// Sort remaining non-empty clusters by density.
sorted.clear();
for (int i = 0, e = (int)clusters.size(); i != e; ++i)
if (clusters[i].size > 0)
sorted.push_back(i);
llvm::stable_sort(sorted, [&](int a, int b) {
return clusters[a].getDensity() > clusters[b].getDensity();
});
DenseMap<const InputSection *, size_t> orderMap;
// Sections will be sorted by decreasing order. Absent sections will have
// priority 0 and be placed at the end of sections.
// NB: This is opposite from COFF/ELF to be compatible with the existing
// order-file code.
int curOrder = clusters.size();
for (int leader : sorted) {
for (int i = leader;;) {
orderMap[sections[i]] = curOrder--;
i = clusters[i].next;
if (i == leader)
break;
}
}
if (!config->printSymbolOrder.empty()) {
std::error_code ec;
raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None);
if (ec) {
error("cannot open " + config->printSymbolOrder + ": " + ec.message());
return orderMap;
}
// Print the symbols ordered by C3, in the order of decreasing curOrder
// Instead of sorting all the orderMap, just repeat the loops above.
for (int leader : sorted)
for (int i = leader;;) {
const InputSection *isec = sections[i];
// Search all the symbols in the file of the section
// and find out a Defined symbol with name that is within the
// section.
for (Symbol *sym : isec->getFile()->symbols) {
if (auto *d = dyn_cast_or_null<Defined>(sym)) {
if (d->isec == isec)
os << sym->getName() << "\n";
}
}
i = clusters[i].next;
if (i == leader)
break;
}
}
return orderMap;
}
// Sort sections by the profile data provided by __LLVM,__cg_profile sections.
//
// This first builds a call graph based on the profile data then merges sections
// according to the C³ heuristic. All clusters are then sorted by a density
// metric to further improve locality.
DenseMap<const InputSection *, size_t> macho::computeCallGraphProfileOrder() {
TimeTraceScope timeScope("Call graph profile sort");
return CallGraphSort().run();
}

22
lld/MachO/CallGraphSort.h Normal file
View file

@ -0,0 +1,22 @@
//===- CallGraphSort.h ------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLD_MACHO_CALL_GRAPH_SORT_H
#define LLD_MACHO_CALL_GRAPH_SORT_H
#include "InputSection.h"
#include "llvm/ADT/DenseMap.h"
namespace lld {
namespace macho {
llvm::DenseMap<const InputSection *, size_t> computeCallGraphProfileOrder();
} // namespace macho
} // namespace lld
#endif

View file

@ -12,6 +12,7 @@
#include "llvm/ADT/CachedHashString.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/BinaryFormat/MachO.h"
@ -27,6 +28,7 @@
namespace lld {
namespace macho {
class InputSection;
class Symbol;
struct SymbolPriorityEntry;
@ -169,6 +171,12 @@ struct Configuration {
std::vector<SegmentProtection> segmentProtections;
llvm::DenseMap<llvm::StringRef, SymbolPriorityEntry> priorities;
llvm::MapVector<std::pair<const InputSection *, const InputSection *>,
uint64_t>
callGraphProfile;
bool callGraphProfileSort = false;
llvm::StringRef printSymbolOrder;
SectionRenameMap sectionRenameMap;
SegmentRenameMap segmentRenameMap;

View file

@ -1057,6 +1057,25 @@ static void gatherInputSections() {
assert(inputOrder <= UnspecifiedInputOrder);
}
static void extractCallGraphProfile() {
TimeTraceScope timeScope("Extract call graph profile");
for (const InputFile *file : inputFiles) {
auto *obj = dyn_cast_or_null<ObjFile>(file);
if (!obj)
continue;
for (const CallGraphEntry &entry : obj->callGraph) {
assert(entry.fromIndex < obj->symbols.size() &&
entry.toIndex < obj->symbols.size());
auto *fromSym = dyn_cast_or_null<Defined>(obj->symbols[entry.fromIndex]);
auto *toSym = dyn_cast_or_null<Defined>(obj->symbols[entry.toIndex]);
if (!fromSym || !toSym)
continue;
config->callGraphProfile[{fromSym->isec, toSym->isec}] += entry.count;
}
}
}
static void foldIdenticalLiterals() {
// We always create a cStringSection, regardless of whether dedupLiterals is
// true. If it isn't, we simply create a non-deduplicating CStringSection.
@ -1266,6 +1285,9 @@ bool macho::link(ArrayRef<const char *> argsArr, bool canExitEarly,
config->icfLevel != ICFLevel::none;
config->warnDylibInstallName = args.hasFlag(
OPT_warn_dylib_install_name, OPT_no_warn_dylib_install_name, false);
config->callGraphProfileSort = args.hasFlag(
OPT_call_graph_profile_sort, OPT_no_call_graph_profile_sort, true);
config->printSymbolOrder = args.getLastArgValue(OPT_print_symbol_order);
// FIXME: Add a commandline flag for this too.
config->zeroModTime = getenv("ZERO_AR_DATE");
@ -1458,8 +1480,10 @@ bool macho::link(ArrayRef<const char *> argsArr, bool canExitEarly,
replaceCommonSymbols();
StringRef orderFile = args.getLastArgValue(OPT_order_file);
if (!orderFile.empty())
if (!orderFile.empty()) {
parseOrderFile(orderFile);
config->callGraphProfileSort = false;
}
referenceStubBinder();
@ -1509,6 +1533,8 @@ bool macho::link(ArrayRef<const char *> argsArr, bool canExitEarly,
}
gatherInputSections();
if (config->callGraphProfileSort)
extractCallGraphProfile();
if (config->deadStrip)
markLive();

View file

@ -63,10 +63,12 @@
#include "llvm/ADT/iterator.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/LTO/LTO.h"
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/TarWriter.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/TextAPI/Architecture.h"
#include "llvm/TextAPI/InterfaceFile.h"
@ -325,6 +327,25 @@ void ObjFile::parseSections(ArrayRef<SectionHeader> sectionHeaders) {
if (name == section_names::compactUnwind)
compactUnwindSection = &sections.back();
} else if (segname == segment_names::llvm) {
if (name == "__cg_profile" && config->callGraphProfileSort) {
TimeTraceScope timeScope("Parsing call graph section");
BinaryStreamReader reader(data, support::little);
while (!reader.empty()) {
uint32_t fromIndex, toIndex;
uint64_t count;
if (Error err = reader.readInteger(fromIndex))
fatal(toString(this) + ": Expected 32-bit integer");
if (Error err = reader.readInteger(toIndex))
fatal(toString(this) + ": Expected 32-bit integer");
if (Error err = reader.readInteger(count))
fatal(toString(this) + ": Expected 64-bit integer");
callGraph.emplace_back();
CallGraphEntry &entry = callGraph.back();
entry.fromIndex = fromIndex;
entry.toIndex = toIndex;
entry.count = count;
}
}
// ld64 does not appear to emit contents from sections within the __LLVM
// segment. Symbols within those sections point to bitcode metadata
// instead of actual symbols. Global symbols within those sections could

View file

@ -65,6 +65,16 @@ struct Section {
Section(uint64_t addr) : address(addr){};
};
// Represents a call graph profile edge.
struct CallGraphEntry {
// The index of the caller in the symbol table.
uint32_t fromIndex;
// The index of the callee in the symbol table.
uint32_t toIndex;
// Number of calls from callee to caller in the profile.
uint64_t count;
};
class InputFile {
public:
enum Kind {
@ -115,6 +125,7 @@ public:
llvm::DWARFUnit *compileUnit = nullptr;
const uint32_t modTime;
std::vector<ConcatInputSection *> debugSections;
std::vector<CallGraphEntry> callGraph;
private:
Section *compactUnwindSection = nullptr;

View file

@ -53,6 +53,7 @@ public:
virtual bool isLive(uint64_t off) const = 0;
virtual void markLive(uint64_t off) = 0;
virtual InputSection *canonical() { return this; }
virtual const InputSection *canonical() const { return this; }
OutputSection *parent = nullptr;
@ -124,6 +125,9 @@ public:
ConcatInputSection *canonical() override {
return replacement ? replacement : this;
}
const InputSection *canonical() const override {
return replacement ? replacement : this;
}
static bool classof(const InputSection *isec) {
return isec->kind() == ConcatKind;

View file

@ -79,6 +79,15 @@ def no_warn_dylib_install_name: Joined<["--"], "no-warn-dylib-install-name">,
def warn_dylib_install_name: Joined<["--"], "warn-dylib-install-name">,
HelpText<"Warn on -install-name if -dylib is not passed">,
Group<grp_lld>;
def call_graph_profile_sort: Flag<["--"], "call-graph-profile-sort">,
HelpText<"Reorder sections with call graph profile (default)">,
Group<grp_lld>;
def no_call_graph_profile_sort : Flag<["--"], "no-call-graph-profile-sort">,
HelpText<"Do not reorder sections with call graph profile">,
Group<grp_lld>;
def print_symbol_order: Joined<["--"], "print-symbol-order=">,
HelpText<"Print a symbol order specified by --call-graph-profile-sort into the specified file">,
Group<grp_lld>;
// This is a complete Options.td compiled from Apple's ld(1) manpage
// dated 2018-03-07 and cross checked with ld64 source code in repo

View file

@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "Writer.h"
#include "CallGraphSort.h"
#include "ConcatOutputSection.h"
#include "Config.h"
#include "InputFiles.h"
@ -865,6 +866,8 @@ static size_t getSymbolPriority(const SymbolPriorityEntry &entry,
// Each section gets assigned the priority of the highest-priority symbol it
// contains.
static DenseMap<const InputSection *, size_t> buildInputSectionPriorities() {
if (config->callGraphProfileSort)
return computeCallGraphProfileOrder();
DenseMap<const InputSection *, size_t> sectionPriorities;
if (config->priorities.empty())

View file

@ -0,0 +1,46 @@
# REQUIRES: x86
# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t
# RUN: %lld -e A %t -o %t.out --icf=all
# RUN: llvm-nm --numeric-sort %t.out | FileCheck %s
# RUN: %lld -e A %t -o %t2.out
# RUN: llvm-nm --numeric-sort %t2.out | FileCheck %s --check-prefix=NOICF
.text
.globl D
D:
mov $60, %rax
retq
.globl C
C:
mov $60, %rax
retq
.globl B
B:
mov $2, %rax
retq
.globl A
A:
mov $42, %rax
retq
.cg_profile A, B, 100
.cg_profile A, C, 40
.cg_profile C, D, 61
.subsections_via_symbols
# CHECK: 0000000100000290 T A
# CHECK-NEXT: 0000000100000298 T C
# CHECK-NEXT: 0000000100000298 T D
# CHECK-NEXT: 00000001000002a0 T B
# NOICF: 0000000100000290 T A
# NOICF-NEXT: 0000000100000298 T B
# NOICF-NEXT: 00000001000002a0 T C
# NOICF-NEXT: 00000001000002a8 T D

View file

@ -0,0 +1,42 @@
# REQUIRES: x86
# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o
# RUN: %lld -lSystem -e A -o %t.out %t.o
# RUN: llvm-nm --numeric-sort %t.out | FileCheck %s
# RUN: %lld --no-call-graph-profile-sort -lSystem -e A -o %t.out %t.o
# RUN: llvm-nm --numeric-sort %t.out | FileCheck %s --check-prefix=NO-CG
.text
D:
retq
.globl C
C:
retq
.globl B
B:
retq
.globl A
A:
Aa:
retq
.subsections_via_symbols
.cg_profile A, B, 10
.cg_profile A, B, 10
.cg_profile Aa, B, 80
.cg_profile A, C, 40
.cg_profile B, C, 30
.cg_profile C, D, 90
# CHECK: 00000001000002c8 T A
# CHECK: 00000001000002c9 T B
# CHECK: 00000001000002ca T C
# CHECK: 00000001000002cb t D
# NO-CG: 00000001000002c8 t D
# NO-CG: 00000001000002c9 T C
# NO-CG: 00000001000002ca T B
# NO-CG: 00000001000002cb T A

View file

@ -0,0 +1,34 @@
# REQUIRES: x86
# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t
# RUN: %lld -e A %t -o %t2 --print-symbol-order=%t3
# RUN: FileCheck %s --input-file %t3
# CHECK: B
# CHECK-NEXT: C
# CHECK-NEXT: D
# CHECK-NEXT: A
.text
.globl A
A:
nop
.globl B
B:
nop
.globl C
C:
nop
.globl D
D:
nop
.subsections_via_symbols
.cg_profile A, B, 5
.cg_profile B, C, 50
.cg_profile C, D, 40
.cg_profile D, B, 10

View file

@ -27,6 +27,7 @@ static_library("MachO") {
"Arch/ARM64Common.cpp",
"Arch/ARM64_32.cpp",
"Arch/X86_64.cpp",
"CallGraphSort.cpp",
"ConcatOutputSection.cpp",
"Driver.cpp",
"DriverUtils.cpp",