skip preludes for some leaf fns

When rbp is not necessary to compile
a leaf function, we skip saving and
restoring it.
This commit is contained in:
Quentin Carbonneaux 2024-08-20 15:20:42 +02:00
parent bb8de8c633
commit 626f0b2781
4 changed files with 145 additions and 98 deletions

2
all.h
View file

@ -393,8 +393,10 @@ struct Fn {
Blk **rpo;
bits reg;
int slot;
int salign;
char vararg;
char dynalloc;
char leaf;
char name[NString];
Lnk lnk;
};

View file

@ -1,6 +1,16 @@
#include "all.h"
typedef struct E E;
struct E {
FILE *f;
Fn *fn;
int fp;
uint64_t fsz;
int nclob;
};
#define CMP(X) \
X(Ciule, "be") \
X(Ciult, "b") \
@ -142,23 +152,29 @@ static char *rname[][4] = {
static int
slot(Ref r, Fn *fn)
slot(Ref r, E *e)
{
int s;
s = rsval(r);
assert(s <= fn->slot);
assert(s <= e->fn->slot);
/* specific to NAlign == 3 */
if (s < 0)
return -4 * s;
else if (fn->vararg)
return -176 + -4 * (fn->slot - s);
if (s < 0) {
if (e->fp == RSP)
return 4*-s - 8 + e->fsz + e->nclob*8;
else
return -4 * (fn->slot - s);
return 4*-s;
}
else if (e->fp == RSP)
return 4*s + e->nclob*8;
else if (e->fn->vararg)
return -176 + -4 * (e->fn->slot - s);
else
return -4 * (e->fn->slot - s);
}
static void
emitcon(Con *con, FILE *f)
emitcon(Con *con, E *e)
{
char *p, *l;
@ -168,16 +184,16 @@ emitcon(Con *con, FILE *f)
p = l[0] == '"' ? "" : T.assym;
if (con->sym.type == SThr) {
if (T.apple)
fprintf(f, "%s%s@TLVP", p, l);
fprintf(e->f, "%s%s@TLVP", p, l);
else
fprintf(f, "%%fs:%s%s@tpoff", p, l);
fprintf(e->f, "%%fs:%s%s@tpoff", p, l);
} else
fprintf(f, "%s%s", p, l);
fprintf(e->f, "%s%s", p, l);
if (con->bits.i)
fprintf(f, "%+"PRId64, con->bits.i);
fprintf(e->f, "%+"PRId64, con->bits.i);
break;
case CBits:
fprintf(f, "%"PRId64, con->bits.i);
fprintf(e->f, "%"PRId64, con->bits.i);
break;
default:
die("unreachable");
@ -212,10 +228,10 @@ getarg(char c, Ins *i)
}
}
static void emitins(Ins, Fn *, FILE *);
static void emitins(Ins, E *);
static void
emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f)
emitcopy(Ref r1, Ref r2, int k, E *e)
{
Ins icp;
@ -223,11 +239,11 @@ emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f)
icp.arg[0] = r2;
icp.to = r1;
icp.cls = k;
emitins(icp, fn, f);
emitins(icp, e);
}
static void
emitf(char *s, Ins *i, Fn *fn, FILE *f)
emitf(char *s, Ins *i, E *e)
{
static char clstoa[][3] = {"l", "q", "ss", "sd"};
char c;
@ -247,25 +263,25 @@ emitf(char *s, Ins *i, Fn *fn, FILE *f)
case '-':
assert((!req(i->arg[1], i->to) || req(i->arg[0], i->to)) &&
"cannot convert to 2-address");
emitcopy(i->to, i->arg[0], i->cls, fn, f);
emitcopy(i->to, i->arg[0], i->cls, e);
s++;
break;
}
fputc('\t', f);
fputc('\t', e->f);
Next:
while ((c = *s++) != '%')
if (!c) {
fputc('\n', f);
fputc('\n', e->f);
return;
} else
fputc(c, f);
fputc(c, e->f);
switch ((c = *s++)) {
case '%':
fputc('%', f);
fputc('%', e->f);
break;
case 'k':
fputs(clstoa[i->cls], f);
fputs(clstoa[i->cls], e->f);
break;
case '0':
case '1':
@ -282,37 +298,42 @@ Next:
switch (rtype(ref)) {
case RTmp:
assert(isreg(ref));
fprintf(f, "%%%s", regtoa(ref.val, sz));
fprintf(e->f, "%%%s", regtoa(ref.val, sz));
break;
case RSlot:
fprintf(f, "%d(%%rbp)", slot(ref, fn));
fprintf(e->f, "%d(%%%s)",
slot(ref, e),
regtoa(e->fp, SLong)
);
break;
case RMem:
Mem:
m = &fn->mem[ref.val];
m = &e->fn->mem[ref.val];
if (rtype(m->base) == RSlot) {
off.type = CBits;
off.bits.i = slot(m->base, fn);
off.bits.i = slot(m->base, e);
addcon(&m->offset, &off, 1);
m->base = TMP(RBP);
m->base = TMP(e->fp);
}
if (m->offset.type != CUndef)
emitcon(&m->offset, f);
fputc('(', f);
emitcon(&m->offset, e);
fputc('(', e->f);
if (!req(m->base, R))
fprintf(f, "%%%s", regtoa(m->base.val, SLong));
fprintf(e->f, "%%%s",
regtoa(m->base.val, SLong)
);
else if (m->offset.type == CAddr)
fprintf(f, "%%rip");
fprintf(e->f, "%%rip");
if (!req(m->index, R))
fprintf(f, ", %%%s, %d",
fprintf(e->f, ", %%%s, %d",
regtoa(m->index.val, SLong),
m->scale
);
fputc(')', f);
fputc(')', e->f);
break;
case RCon:
fputc('$', f);
emitcon(&fn->con[ref.val], f);
fputc('$', e->f);
emitcon(&e->fn->con[ref.val], e);
break;
default:
die("unreachable");
@ -337,18 +358,21 @@ Next:
case RMem:
goto Mem;
case RSlot:
fprintf(f, "%d(%%rbp)", slot(ref, fn));
fprintf(e->f, "%d(%%%s)",
slot(ref, e),
regtoa(e->fp, SLong)
);
break;
case RCon:
off = fn->con[ref.val];
emitcon(&off, f);
off = e->fn->con[ref.val];
emitcon(&off, e);
if (off.type == CAddr)
if (off.sym.type != SThr || T.apple)
fprintf(f, "(%%rip)");
fprintf(e->f, "(%%rip)");
break;
case RTmp:
assert(isreg(ref));
fprintf(f, "(%%%s)", regtoa(ref.val, SLong));
fprintf(e->f, "(%%%s)", regtoa(ref.val, SLong));
break;
default:
die("unreachable");
@ -366,7 +390,7 @@ static void *negmask[4] = {
};
static void
emitins(Ins i, Fn *fn, FILE *f)
emitins(Ins i, E *e)
{
Ref r;
int64_t val;
@ -393,7 +417,7 @@ emitins(Ins i, Fn *fn, FILE *f)
|| (omap[o].cls == Ka))
break;
}
emitf(omap[o].fmt, &i, fn, f);
emitf(omap[o].fmt, &i, e);
break;
case Onop:
/* just do nothing for nops, they are inserted
@ -410,7 +434,7 @@ emitins(Ins i, Fn *fn, FILE *f)
if (KBASE(i.cls) == 0 /* only available for ints */
&& rtype(i.arg[0]) == RCon
&& rtype(i.arg[1]) == RTmp) {
emitf("imul%k %0, %1, %=", &i, fn, f);
emitf("imul%k %0, %1, %=", &i, e);
break;
}
goto Table;
@ -419,18 +443,18 @@ emitins(Ins i, Fn *fn, FILE *f)
* some 3-address subtractions */
if (req(i.to, i.arg[1]) && !req(i.arg[0], i.to)) {
ineg = (Ins){Oneg, i.cls, i.to, {i.to}};
emitins(ineg, fn, f);
emitf("add%k %0, %=", &i, fn, f);
emitins(ineg, e);
emitf("add%k %0, %=", &i, e);
break;
}
goto Table;
case Oneg:
if (!req(i.to, i.arg[0]))
emitf("mov%k %0, %=", &i, fn, f);
emitf("mov%k %0, %=", &i, e);
if (KBASE(i.cls) == 0)
emitf("neg%k %=", &i, fn, f);
emitf("neg%k %=", &i, e);
else
fprintf(f,
fprintf(e->f,
"\txorp%c %sfp%d(%%rip), %%%s\n",
"xxsd"[i.cls],
T.asloc,
@ -443,8 +467,8 @@ emitins(Ins i, Fn *fn, FILE *f)
* conversion to 2-address in emitf() would fail */
if (req(i.to, i.arg[1])) {
i.arg[1] = TMP(XMM0+15);
emitf("mov%k %=, %1", &i, fn, f);
emitf("mov%k %0, %=", &i, fn, f);
emitf("mov%k %=, %1", &i, e);
emitf("mov%k %0, %=", &i, e);
i.arg[0] = i.to;
}
goto Table;
@ -460,53 +484,54 @@ emitins(Ins i, Fn *fn, FILE *f)
t0 = rtype(i.arg[0]);
if (i.cls == Kl
&& t0 == RCon
&& fn->con[i.arg[0].val].type == CBits) {
val = fn->con[i.arg[0].val].bits.i;
&& e->fn->con[i.arg[0].val].type == CBits) {
val = e->fn->con[i.arg[0].val].bits.i;
if (isreg(i.to))
if (val >= 0 && val <= UINT32_MAX) {
emitf("movl %W0, %W=", &i, fn, f);
emitf("movl %W0, %W=", &i, e);
break;
}
if (rtype(i.to) == RSlot)
if (val < INT32_MIN || val > INT32_MAX) {
emitf("movl %0, %=", &i, fn, f);
emitf("movl %0>>32, 4+%=", &i, fn, f);
emitf("movl %0, %=", &i, e);
emitf("movl %0>>32, 4+%=", &i, e);
break;
}
}
if (isreg(i.to)
&& t0 == RCon
&& fn->con[i.arg[0].val].type == CAddr) {
emitf("lea%k %M0, %=", &i, fn, f);
&& e->fn->con[i.arg[0].val].type == CAddr) {
emitf("lea%k %M0, %=", &i, e);
break;
}
if (rtype(i.to) == RSlot
&& (t0 == RSlot || t0 == RMem)) {
i.cls = KWIDE(i.cls) ? Kd : Ks;
i.arg[1] = TMP(XMM0+15);
emitf("mov%k %0, %1", &i, fn, f);
emitf("mov%k %1, %=", &i, fn, f);
emitf("mov%k %0, %1", &i, e);
emitf("mov%k %1, %=", &i, e);
break;
}
/* conveniently, the assembler knows if it
* should use movabsq when reading movq */
emitf("mov%k %0, %=", &i, fn, f);
emitf("mov%k %0, %=", &i, e);
break;
case Oaddr:
if (!T.apple
&& rtype(i.arg[0]) == RCon
&& fn->con[i.arg[0].val].sym.type == SThr) {
&& e->fn->con[i.arg[0].val].sym.type == SThr) {
/* derive the symbol address from the TCB
* address at offset 0 of %fs */
assert(isreg(i.to));
con = &fn->con[i.arg[0].val];
con = &e->fn->con[i.arg[0].val];
sym = str(con->sym.id);
emitf("movq %%fs:0, %L=", &i, fn, f);
fprintf(f, "\tleaq %s%s@tpoff",
emitf("movq %%fs:0, %L=", &i, e);
fprintf(e->f, "\tleaq %s%s@tpoff",
sym[0] == '"' ? "" : T.assym, sym);
if (con->bits.i)
fprintf(f, "%+"PRId64, con->bits.i);
fprintf(f, "(%%%s), %%%s\n",
fprintf(e->f, "%+"PRId64,
con->bits.i);
fprintf(e->f, "(%%%s), %%%s\n",
regtoa(i.to.val, SLong),
regtoa(i.to.val, SLong));
break;
@ -517,12 +542,12 @@ emitins(Ins i, Fn *fn, FILE *f)
* assembly... */
switch (rtype(i.arg[0])) {
case RCon:
fprintf(f, "\tcallq ");
emitcon(&fn->con[i.arg[0].val], f);
fprintf(f, "\n");
fprintf(e->f, "\tcallq ");
emitcon(&e->fn->con[i.arg[0].val], e);
fprintf(e->f, "\n");
break;
case RTmp:
emitf("callq *%L0", &i, fn, f);
emitf("callq *%L0", &i, e);
break;
default:
die("invalid call argument");
@ -533,9 +558,10 @@ emitins(Ins i, Fn *fn, FILE *f)
* maybe we should split Osalloc in 2 different
* instructions depending on the result
*/
emitf("subq %L0, %%rsp", &i, fn, f);
assert(e->fp == RBP);
emitf("subq %L0, %%rsp", &i, e);
if (!req(i.to, R))
emitcopy(i.to, TMP(RSP), Kl, fn, f);
emitcopy(i.to, TMP(RSP), Kl, e);
break;
case Oswap:
if (KBASE(i.cls) == 0)
@ -543,27 +569,35 @@ emitins(Ins i, Fn *fn, FILE *f)
/* for floats, there is no swap instruction
* so we use xmm15 as a temporary
*/
emitcopy(TMP(XMM0+15), i.arg[0], i.cls, fn, f);
emitcopy(i.arg[0], i.arg[1], i.cls, fn, f);
emitcopy(i.arg[1], TMP(XMM0+15), i.cls, fn, f);
emitcopy(TMP(XMM0+15), i.arg[0], i.cls, e);
emitcopy(i.arg[0], i.arg[1], i.cls, e);
emitcopy(i.arg[1], TMP(XMM0+15), i.cls, e);
break;
case Odbgloc:
emitdbgloc(i.arg[0].val, i.arg[1].val, f);
emitdbgloc(i.arg[0].val, i.arg[1].val, e->f);
break;
}
}
static uint64_t
framesz(Fn *fn)
static void
framesz(E *e)
{
uint64_t i, o, f;
/* specific to NAlign == 3 */
o = 0;
if (!e->fn->leaf) {
for (i=0, o=0; i<NCLR; i++)
o ^= 1 & (fn->reg >> amd64_sysv_rclob[i]);
f = fn->slot;
o ^= e->fn->reg >> amd64_sysv_rclob[i];
o &= 1;
}
f = e->fn->slot;
f = (f + 3) & -4;
return 4*f + 8*o + 176*fn->vararg;
if (f > 0
&& e->fp == RSP
&& e->fn->salign == 4)
f += 2;
e->fsz = 4*f + 8*o + 176*e->fn->vararg;
}
void
@ -578,13 +612,19 @@ amd64_emitfn(Fn *fn, FILE *f)
Blk *b, *s;
Ins *i, itmp;
int *r, c, o, n, lbl;
uint64_t fs;
E *e;
e = &(E){.f = f, .fn = fn};
emitfnlnk(fn->name, &fn->lnk, f);
fputs("\tendbr64\n\tpushq %rbp\n\tmovq %rsp, %rbp\n", f);
fs = framesz(fn);
if (fs)
fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", fs);
fputs("\tendbr64\n", f);
if (!fn->leaf || fn->vararg || fn->dynalloc) {
e->fp = RBP;
fputs("\tpushq %rbp\n\tmovq %rsp, %rbp\n", f);
} else
e->fp = RSP;
framesz(e);
if (e->fsz)
fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", e->fsz);
if (fn->vararg) {
o = -176;
for (r=amd64_sysv_rsave; r<&amd64_sysv_rsave[6]; r++, o+=8)
@ -595,15 +635,15 @@ amd64_emitfn(Fn *fn, FILE *f)
for (r=amd64_sysv_rclob; r<&amd64_sysv_rclob[NCLR]; r++)
if (fn->reg & BIT(*r)) {
itmp.arg[0] = TMP(*r);
emitf("pushq %L0", &itmp, fn, f);
fs += 8;
emitf("pushq %L0", &itmp, e);
e->nclob++;
}
for (lbl=0, b=fn->start; b; b=b->link) {
if (lbl || b->npred > 1)
fprintf(f, "%sbb%d:\n", T.asloc, id0+b->id);
for (i=b->ins; i!=&b->ins[b->nins]; i++)
emitins(*i, fn, f);
emitins(*i, e);
lbl = 1;
switch (b->jmp.type) {
case Jhlt:
@ -614,17 +654,19 @@ amd64_emitfn(Fn *fn, FILE *f)
fprintf(f,
"\tmovq %%rbp, %%rsp\n"
"\tsubq $%"PRIu64", %%rsp\n",
fs
);
e->fsz + e->nclob * 8);
for (r=&amd64_sysv_rclob[NCLR]; r>amd64_sysv_rclob;)
if (fn->reg & BIT(*--r)) {
itmp.arg[0] = TMP(*r);
emitf("popq %L0", &itmp, fn, f);
emitf("popq %L0", &itmp, e);
}
if (e->fp == RBP)
fputs("\tleave\n", f);
else if (e->fsz)
fprintf(f,
"\tleave\n"
"\tret\n"
);
"\taddq $%"PRIu64", %%rsp\n",
e->fsz);
fputs("\tret\n", f);
break;
case Jjmp:
Jmp:

View file

@ -808,6 +808,7 @@ amd64_isel(Fn *fn)
die("alloc too large");
fn->tmp[i->to.val].slot = fn->slot;
fn->slot += sz;
fn->salign = 2 + al - Oalloc;
*i = (Ins){.op = Onop};
}

View file

@ -694,6 +694,7 @@ parseline(PState ps)
goto Ins;
}
if (op == Tcall) {
curf->leaf = 0;
arg[0] = parseref();
parserefl(1);
op = Ocall;
@ -910,6 +911,7 @@ parsefn(Lnk *lnk)
curf->con[0].bits.i = 0xdeaddead; /* UNDEF */
curf->con[1].type = CBits;
curf->lnk = *lnk;
curf->leaf = 1;
blink = &curf->start;
curf->retty = Kx;
if (peek() != Tglo)