[BOLT] Increase coverage of shrink wrapping [5/5]

Add -experimental-shrink-wrapping flag to control when we
want to move callee-saved registers even when addresses of the stack
frame are captured and used in pointer arithmetic, making it more
challenging to do alias analysis to prove that we do not access
optimized stack positions. This alias analysis is not yet implemented,
hence, it is experimental. In practice, though, no compiler would emit
code to do pointer arithmetic to access a saved callee-saved register
unless there is a memory bug or we are failing to identify a
callee-saved reg, so I'm not sure how useful it would be to formally
prove that.

Reviewed By: Amir

Differential Revision: https://reviews.llvm.org/D126115
This commit is contained in:
Rafael Auler 2022-05-20 20:10:22 -07:00
parent 3e5f67f356
commit a3cfdd746e
10 changed files with 301 additions and 12 deletions

View file

@ -121,6 +121,10 @@ class FrameAnalysis {
/// Set of functions that require the stack to be 16B aligned
DenseSet<const BinaryFunction *> FunctionsRequireAlignment;
/// Set of functions that performs computations with stack addresses and
/// complicates our understanding of aliasing of stack spaces.
DenseSet<const BinaryFunction *> FunctionsWithStackArithmetic;
/// Owns ArgAccesses for all instructions. References to elements are
/// attached to instructions as indexes to this vector, in MCAnnotations.
std::vector<ArgAccesses> ArgAccessesVector;
@ -183,6 +187,12 @@ public:
return FunctionsRequireAlignment.count(&Func);
}
/// Return true if \p Func does computation with the address of any stack
/// position, meaning we have limited alias analysis on this function.
bool hasStackArithmetic(const BinaryFunction &Func) const {
return FunctionsWithStackArithmetic.count(&Func);
}
/// Functions for retrieving our specific MCAnnotation data from instructions
ErrorOr<ArgAccesses &> getArgAccessesFor(const MCInst &Inst);

View file

@ -38,6 +38,11 @@ static cl::opt<std::string> FrameOptFunctionNamesFile(
static cl::opt<bool> TimeFA("time-fa", cl::desc("time frame analysis steps"),
cl::ReallyHidden, cl::cat(BoltOptCategory));
static cl::opt<bool>
ExperimentalSW("experimental-shrink-wrapping",
cl::desc("process functions with stack pointer arithmetic"),
cl::ReallyHidden, cl::ZeroOrMore, cl::cat(BoltOptCategory));
bool shouldFrameOptimize(const llvm::bolt::BinaryFunction &Function) {
if (Function.hasUnknownControlFlow())
return false;
@ -111,6 +116,7 @@ class FrameAccessAnalysis {
const MCInst *Prev{nullptr};
/// Info about the last frame access
bool IsValidAccess{false};
bool EscapesStackAddress{false};
FrameIndexEntry FIE;
bool decodeFrameAccess(const MCInst &Inst) {
@ -124,14 +130,15 @@ class FrameAccessAnalysis {
return true;
}
if (IsIndexed || FIE.Size == 0) {
if (IsIndexed || (!FIE.Size && (FIE.IsLoad || FIE.IsStore))) {
LLVM_DEBUG(dbgs() << "Giving up on indexed memory access/unknown size\n");
LLVM_DEBUG(dbgs() << "Blame insn: ");
LLVM_DEBUG(BC.printInstruction(outs(), Inst, 0, &BF, true, false, false));
LLVM_DEBUG(Inst.dump());
return false;
}
assert(FIE.Size != 0);
assert(FIE.Size != 0 || (!FIE.IsLoad && !FIE.IsStore));
FIE.RegOrImm = SrcImm;
if (FIE.IsLoad || FIE.IsStoreFromReg)
@ -171,9 +178,11 @@ public:
const FrameIndexEntry &getFIE() const { return FIE; }
int getSPOffset() const { return SPOffset; }
bool isValidAccess() const { return IsValidAccess; }
bool doesEscapeStackAddress() const { return EscapesStackAddress; }
bool doNext(const BinaryBasicBlock &BB, const MCInst &Inst) {
IsValidAccess = false;
EscapesStackAddress = false;
std::tie(SPOffset, FPOffset) =
Prev ? *SPT.getStateAt(*Prev) : *SPT.getStateAt(BB);
Prev = &Inst;
@ -214,11 +223,14 @@ public:
}
if (BC.MIB->escapesVariable(Inst, SPT.HasFramePointer)) {
LLVM_DEBUG(
dbgs() << "Leaked stack address, giving up on this function.\n");
LLVM_DEBUG(dbgs() << "Blame insn: ");
LLVM_DEBUG(Inst.dump());
return false;
EscapesStackAddress = true;
if (!opts::ExperimentalSW) {
LLVM_DEBUG(
dbgs() << "Leaked stack address, giving up on this function.\n");
LLVM_DEBUG(dbgs() << "Blame insn: ");
LLVM_DEBUG(Inst.dump());
return false;
}
}
return decodeFrameAccess(Inst);
@ -405,7 +417,7 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
FAA.enterNewBB();
for (MCInst &Inst : *BB) {
if (!FAA.doNext(*BB, Inst)) {
if (!FAA.doNext(*BB, Inst) || FAA.doesEscapeStackAddress()) {
ArgsTouchedMap[&BF].emplace(std::make_pair(-1, 0));
NoInfo = true;
break;
@ -475,6 +487,12 @@ bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) {
dbgs() << "\t\t\tSP offset is " << FAA.getSPOffset() << "\n";
});
if (FAA.doesEscapeStackAddress()) {
if (!FunctionsWithStackArithmetic.count(&BF))
FunctionsWithStackArithmetic.insert(&BF);
continue;
}
if (!FAA.isValidAccess())
continue;

View file

@ -268,13 +268,15 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
{
NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
opts::TimeOpts);
removeUnnecessaryLoads(*RA, *FA, I.second);
if (!FA->hasStackArithmetic(I.second))
removeUnnecessaryLoads(*RA, *FA, I.second);
}
if (opts::RemoveStores) {
NamedRegionTimer T1("removestores", "remove stores", "FOP",
"FOP breakdown", opts::TimeOpts);
removeUnusedStores(*FA, I.second);
if (!FA->hasStackArithmetic(I.second))
removeUnusedStores(*FA, I.second);
}
// Don't even start shrink wrapping if no profiling info is available
if (I.second.getKnownExecutionCount() == 0)

View file

@ -1066,6 +1066,14 @@ bool ShrinkWrapping::validatePushPopsMode(unsigned CSR, MCInst *BestPosSave,
});
return false;
}
if (FA.hasStackArithmetic(BF)) {
LLVM_DEBUG({
dbgs() << "Reg " << CSR
<< " is not using push/pops due to function "
"taking the address of a stack position.\n";
});
return false;
}
for (MCInst *Save : CSA.getSavesByReg(CSR)) {
if (!SLM.canCollapseRegion(Save)) {
LLVM_DEBUG(dbgs() << "Reg " << CSR << " cannot collapse region.\n");

View file

@ -1091,8 +1091,10 @@ public:
bool IsLoad = MCII.mayLoad();
bool IsStore = MCII.mayStore();
// Is it LEA? (deals with memory but is not loading nor storing)
if (!IsLoad && !IsStore)
return false;
if (!IsLoad && !IsStore) {
I = {0, IsLoad, IsStore, false, false};
break;
}
uint8_t Sz = getMemDataSize(Inst, MemOpNo);
I = {Sz, IsLoad, IsStore, false, false};
break;

View file

@ -0,0 +1,60 @@
# This checks that frame optimizer does not try to optimize away caller-saved
# regs when we do not have complete aliasing info (when there is an LEA
# instruction and the function does arithmetic with stack addresses).
# REQUIRES: system-linux
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
# RUN: %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
# RUN: llvm-strip --strip-unneeded %t.o
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \
# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
# RUN: -lite=0 -eliminate-unreachable=false | FileCheck %s
# RUN: llvm-objdump -d %t.out --print-imm-hex | \
# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s
.globl foo
.type foo, %function
foo:
.cfi_startproc
movq $0, (%rsi)
ret
.cfi_endproc
.size foo, .-foo
.globl _start
.type _start, %function
_start:
.cfi_startproc
# FDATA: 0 [unknown] 0 1 _start 0 0 1
push %rbp
mov %rsp, %rbp
subq $0x20, %rsp
je b
c:
addq $0x20, %rsp
pop %rbp
ret
b:
movq %rdi, %r13
movq %r13, -0x08(%rbp)
leaq -0x08(%rbp), %rsi
callq foo
movq -0x08(%rbp), %r13
jmp c
.cfi_endproc
.size _start, .-_start
# CHECK: BOLT-INFO: FOP deleted 0 load(s) (dyn count: 0) and 0 store(s)
# CHECK-OBJDUMP: <_start>:
# CHECK-OBJDUMP: movq %rdi, %r13
# CHECK-OBJDUMP-NEXT: movq %r13, -0x8(%rbp)
# CHECK-OBJDUMP-NEXT: leaq
# CHECK-OBJDUMP-NEXT: callq
# CHECK-OBJDUMP-NEXT: movq -0x8(%rbp), %r13

View file

@ -0,0 +1,65 @@
# This checks that shrink wrapping correctly drops moving push/pops when
# there is an LEA instruction.
# REQUIRES: system-linux
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
# RUN: %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
# RUN: llvm-strip --strip-unneeded %t.o
# RUN: ld.lld %t.o -o %t.exe -q
# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \
# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
# RUN: -experimental-shrink-wrapping \
# RUN: -eliminate-unreachable=false | FileCheck %s
# RUN: llvm-objdump -d %t.out --print-imm-hex | \
# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s
.globl _start
.type _start, %function
_start:
.cfi_startproc
# FDATA: 0 [unknown] 0 1 _start 0 0 1
push %rbp
mov %rsp, %rbp
push %rbx
push %r14
subq $0x20, %rsp
je b
c:
addq $0x20, %rsp
pop %r14
pop %rbx
pop %rbp
ret
b:
je f
jmp *JT(,%rdi,8)
d:
mov %r14, %rdi
mov %rbx, %rdi
leaq -0x20(%rbp), %r14
movq -0x20(%rbp), %rdi
f:
addq $0x20, %rsp
pop %r14
pop %rbx
pop %rbp
ret
.cfi_endproc
.size _start, .-_start
.data
JT:
.quad c
.quad d
.quad f
# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops
# Checks that offsets of instructions accessing the stack were not changed
# CHECK-OBJDUMP: <_start>:
# CHECK-OBJDUMP: movq %rbx, %rdi
# CHECK-OBJDUMP-NEXT: leaq -0x20(%rbp), %r14
# CHECK-OBJDUMP: movq -0x20(%rbp), %rdi

View file

@ -0,0 +1,58 @@
# This checks that shrink wrapping correctly drops moving push/pops when
# there is a MOV instruction loading the value of the stack pointer in
# order to do pointer arithmetic with a stack address.
# REQUIRES: system-linux
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
# RUN: %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
# RUN: llvm-strip --strip-unneeded %t.o
# RUN: ld.lld %t.o -o %t.exe -q
# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \
# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
# RUN: -experimental-shrink-wrapping \
# RUN: -eliminate-unreachable=false | FileCheck %s
.globl _start
.type _start, %function
_start:
.cfi_startproc
# FDATA: 0 [unknown] 0 1 _start 0 0 1
push %rbp
mov %rsp, %rbp
push %rbx
push %r14
subq $0x20, %rsp
je b
c:
addq $0x20, %rsp
pop %r14
pop %rbx
pop %rbp
ret
b:
je f
jmp *JT(,%rdi,8)
d:
mov %r14, %rdi
mov %rbx, %rdi
mov %rbp, %rdi
sub 0x20, %rdi
f:
addq $0x20, %rsp
pop %r14
pop %rbx
pop %rbp
ret
.cfi_endproc
.size _start, .-_start
.data
JT:
.quad c
.quad d
.quad f
# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops

View file

@ -21,6 +21,7 @@ c:
pushf
push %rbx
push %rbp
addq $0x50, -0x30(%rbp)
pop %rbp
pop %rbx
popf

View file

@ -0,0 +1,65 @@
# This checks that shrink wrapping uses the red zone defined in the X86 ABI by
# placing restores that access elements already deallocated by the stack.
# REQUIRES: system-linux
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
# RUN: %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
# RUN: llvm-strip --strip-unneeded %t.o
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
# RUN: llvm-bolt -relocs %t.exe -o %t.out -data %t.fdata \
# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \
# RUN: -experimental-shrink-wrapping \
# RUN: -eliminate-unreachable=false | FileCheck %s
# RUN: llvm-objdump -d %t.out --print-imm-hex | \
# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s
# Here we create a CFG where the restore position matches the previous (deleted)
# restore position. Shrink wrapping then will put a stack access to an element
# that was deallocated at the previously deleted POP, which falls in the red
# zone and should be safe for X86 Linux ABI.
.globl _start
.type _start, %function
_start:
.cfi_startproc
# FDATA: 0 [unknown] 0 1 _start 0 0 1
push %rbp
mov %rsp, %rbp
push %rbx
push %r14
subq $0x20, %rsp
b: je hot_path
# FDATA: 1 _start #b# 1 _start #hot_path# 0 1
cold_path:
mov %r14, %rdi
mov %rbx, %rdi
movq rel(%rip), %rdi # Add this to create a relocation and run bolt w/ relocs
leaq -0x20(%rbp), %r14
movq -0x20(%rbp), %rdi
leaq -0x10(%rbp), %rsp
pop %r14
pop %rbx
pop %rbp
ret
hot_path:
addq $0x20, %rsp
pop %r14
pop %rbx
pop %rbp
ret
.cfi_endproc
.size _start, .-_start
.data
rel: .quad cold_path
# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops
# CHECK-OBJDUMP: <_start>:
# CHECK-OBJDUMP: leaq (%rbp), %rsp
# CHECK-OBJDUMP-NEXT: popq %rbp
# CHECK-OBJDUMP-NEXT: movq -0x10(%rsp), %rbx
# CHECK-OBJDUMP-NEXT: movq -0x18(%rsp), %r14
# CHECK-OBJDUMP-NEXT: retq