From efcb5465e597571820391bee450dd1e8def300b4 Mon Sep 17 00:00:00 2001 From: Quentin Carbonneaux Date: Thu, 15 Aug 2024 23:11:20 +0200 Subject: [PATCH 1/4] align emitted code Functions are now aligned on 16-byte boundaries. This mimics gcc and should help reduce the maximum perf impact of cosmetic code changes. Previously, any change in the output of qbe could have far reaching implications on alignment. Thanks to Roland Paterson-Jones for pointing out the variability issue. --- parse.c | 1 + 1 file changed, 1 insertion(+) diff --git a/parse.c b/parse.c index a745779..caab452 100644 --- a/parse.c +++ b/parse.c @@ -1219,6 +1219,7 @@ parse(FILE *f, char *path, void dbgfile(char *), void data(Dat *), void func(Fn dbgfile(tokval.str); break; case Tfunc: + lnk.align = 16; func(parsefn(&lnk)); break; case Tdata: From bb8de8c63362b7234db02482240d5600203225d9 Mon Sep 17 00:00:00 2001 From: Alexey Yerin Date: Fri, 2 Aug 2024 14:39:07 +0300 Subject: [PATCH 2/4] arm64/isel: Avoid signed overflow when handling immediates Clang incorrectly optimizes this negation with -O2 and causes QBE to emit 0 in place of INT64_MIN. --- arm64/isel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arm64/isel.c b/arm64/isel.c index 062beb3..9ce6adc 100644 --- a/arm64/isel.c +++ b/arm64/isel.c @@ -24,7 +24,7 @@ imm(Con *c, int k, int64_t *pn) i = Iplo12; if (n < 0) { i = Inlo12; - n = -n; + n = -(uint64_t)n; } *pn = n; if ((n & 0x000fff) == n) From 626f0b278137ff6f8b7d910d9b3fc3cbdfbb39fc Mon Sep 17 00:00:00 2001 From: Quentin Carbonneaux Date: Tue, 20 Aug 2024 15:20:42 +0200 Subject: [PATCH 3/4] skip preludes for some leaf fns When rbp is not necessary to compile a leaf function, we skip saving and restoring it. --- all.h | 2 + amd64/emit.c | 238 ++++++++++++++++++++++++++++++--------------------- amd64/isel.c | 1 + parse.c | 2 + 4 files changed, 145 insertions(+), 98 deletions(-) diff --git a/all.h b/all.h index 3479d27..97cc41c 100644 --- a/all.h +++ b/all.h @@ -393,8 +393,10 @@ struct Fn { Blk **rpo; bits reg; int slot; + int salign; char vararg; char dynalloc; + char leaf; char name[NString]; Lnk lnk; }; diff --git a/amd64/emit.c b/amd64/emit.c index 00dd80f..8f36188 100644 --- a/amd64/emit.c +++ b/amd64/emit.c @@ -1,6 +1,16 @@ #include "all.h" +typedef struct E E; + +struct E { + FILE *f; + Fn *fn; + int fp; + uint64_t fsz; + int nclob; +}; + #define CMP(X) \ X(Ciule, "be") \ X(Ciult, "b") \ @@ -142,23 +152,29 @@ static char *rname[][4] = { static int -slot(Ref r, Fn *fn) +slot(Ref r, E *e) { int s; s = rsval(r); - assert(s <= fn->slot); + assert(s <= e->fn->slot); /* specific to NAlign == 3 */ - if (s < 0) - return -4 * s; - else if (fn->vararg) - return -176 + -4 * (fn->slot - s); + if (s < 0) { + if (e->fp == RSP) + return 4*-s - 8 + e->fsz + e->nclob*8; + else + return 4*-s; + } + else if (e->fp == RSP) + return 4*s + e->nclob*8; + else if (e->fn->vararg) + return -176 + -4 * (e->fn->slot - s); else - return -4 * (fn->slot - s); + return -4 * (e->fn->slot - s); } static void -emitcon(Con *con, FILE *f) +emitcon(Con *con, E *e) { char *p, *l; @@ -168,16 +184,16 @@ emitcon(Con *con, FILE *f) p = l[0] == '"' ? "" : T.assym; if (con->sym.type == SThr) { if (T.apple) - fprintf(f, "%s%s@TLVP", p, l); + fprintf(e->f, "%s%s@TLVP", p, l); else - fprintf(f, "%%fs:%s%s@tpoff", p, l); + fprintf(e->f, "%%fs:%s%s@tpoff", p, l); } else - fprintf(f, "%s%s", p, l); + fprintf(e->f, "%s%s", p, l); if (con->bits.i) - fprintf(f, "%+"PRId64, con->bits.i); + fprintf(e->f, "%+"PRId64, con->bits.i); break; case CBits: - fprintf(f, "%"PRId64, con->bits.i); + fprintf(e->f, "%"PRId64, con->bits.i); break; default: die("unreachable"); @@ -212,10 +228,10 @@ getarg(char c, Ins *i) } } -static void emitins(Ins, Fn *, FILE *); +static void emitins(Ins, E *); static void -emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f) +emitcopy(Ref r1, Ref r2, int k, E *e) { Ins icp; @@ -223,11 +239,11 @@ emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f) icp.arg[0] = r2; icp.to = r1; icp.cls = k; - emitins(icp, fn, f); + emitins(icp, e); } static void -emitf(char *s, Ins *i, Fn *fn, FILE *f) +emitf(char *s, Ins *i, E *e) { static char clstoa[][3] = {"l", "q", "ss", "sd"}; char c; @@ -247,25 +263,25 @@ emitf(char *s, Ins *i, Fn *fn, FILE *f) case '-': assert((!req(i->arg[1], i->to) || req(i->arg[0], i->to)) && "cannot convert to 2-address"); - emitcopy(i->to, i->arg[0], i->cls, fn, f); + emitcopy(i->to, i->arg[0], i->cls, e); s++; break; } - fputc('\t', f); + fputc('\t', e->f); Next: while ((c = *s++) != '%') if (!c) { - fputc('\n', f); + fputc('\n', e->f); return; } else - fputc(c, f); + fputc(c, e->f); switch ((c = *s++)) { case '%': - fputc('%', f); + fputc('%', e->f); break; case 'k': - fputs(clstoa[i->cls], f); + fputs(clstoa[i->cls], e->f); break; case '0': case '1': @@ -282,37 +298,42 @@ Next: switch (rtype(ref)) { case RTmp: assert(isreg(ref)); - fprintf(f, "%%%s", regtoa(ref.val, sz)); + fprintf(e->f, "%%%s", regtoa(ref.val, sz)); break; case RSlot: - fprintf(f, "%d(%%rbp)", slot(ref, fn)); + fprintf(e->f, "%d(%%%s)", + slot(ref, e), + regtoa(e->fp, SLong) + ); break; case RMem: Mem: - m = &fn->mem[ref.val]; + m = &e->fn->mem[ref.val]; if (rtype(m->base) == RSlot) { off.type = CBits; - off.bits.i = slot(m->base, fn); + off.bits.i = slot(m->base, e); addcon(&m->offset, &off, 1); - m->base = TMP(RBP); + m->base = TMP(e->fp); } if (m->offset.type != CUndef) - emitcon(&m->offset, f); - fputc('(', f); + emitcon(&m->offset, e); + fputc('(', e->f); if (!req(m->base, R)) - fprintf(f, "%%%s", regtoa(m->base.val, SLong)); + fprintf(e->f, "%%%s", + regtoa(m->base.val, SLong) + ); else if (m->offset.type == CAddr) - fprintf(f, "%%rip"); + fprintf(e->f, "%%rip"); if (!req(m->index, R)) - fprintf(f, ", %%%s, %d", + fprintf(e->f, ", %%%s, %d", regtoa(m->index.val, SLong), m->scale ); - fputc(')', f); + fputc(')', e->f); break; case RCon: - fputc('$', f); - emitcon(&fn->con[ref.val], f); + fputc('$', e->f); + emitcon(&e->fn->con[ref.val], e); break; default: die("unreachable"); @@ -337,18 +358,21 @@ Next: case RMem: goto Mem; case RSlot: - fprintf(f, "%d(%%rbp)", slot(ref, fn)); + fprintf(e->f, "%d(%%%s)", + slot(ref, e), + regtoa(e->fp, SLong) + ); break; case RCon: - off = fn->con[ref.val]; - emitcon(&off, f); + off = e->fn->con[ref.val]; + emitcon(&off, e); if (off.type == CAddr) if (off.sym.type != SThr || T.apple) - fprintf(f, "(%%rip)"); + fprintf(e->f, "(%%rip)"); break; case RTmp: assert(isreg(ref)); - fprintf(f, "(%%%s)", regtoa(ref.val, SLong)); + fprintf(e->f, "(%%%s)", regtoa(ref.val, SLong)); break; default: die("unreachable"); @@ -366,7 +390,7 @@ static void *negmask[4] = { }; static void -emitins(Ins i, Fn *fn, FILE *f) +emitins(Ins i, E *e) { Ref r; int64_t val; @@ -393,7 +417,7 @@ emitins(Ins i, Fn *fn, FILE *f) || (omap[o].cls == Ka)) break; } - emitf(omap[o].fmt, &i, fn, f); + emitf(omap[o].fmt, &i, e); break; case Onop: /* just do nothing for nops, they are inserted @@ -410,7 +434,7 @@ emitins(Ins i, Fn *fn, FILE *f) if (KBASE(i.cls) == 0 /* only available for ints */ && rtype(i.arg[0]) == RCon && rtype(i.arg[1]) == RTmp) { - emitf("imul%k %0, %1, %=", &i, fn, f); + emitf("imul%k %0, %1, %=", &i, e); break; } goto Table; @@ -419,18 +443,18 @@ emitins(Ins i, Fn *fn, FILE *f) * some 3-address subtractions */ if (req(i.to, i.arg[1]) && !req(i.arg[0], i.to)) { ineg = (Ins){Oneg, i.cls, i.to, {i.to}}; - emitins(ineg, fn, f); - emitf("add%k %0, %=", &i, fn, f); + emitins(ineg, e); + emitf("add%k %0, %=", &i, e); break; } goto Table; case Oneg: if (!req(i.to, i.arg[0])) - emitf("mov%k %0, %=", &i, fn, f); + emitf("mov%k %0, %=", &i, e); if (KBASE(i.cls) == 0) - emitf("neg%k %=", &i, fn, f); + emitf("neg%k %=", &i, e); else - fprintf(f, + fprintf(e->f, "\txorp%c %sfp%d(%%rip), %%%s\n", "xxsd"[i.cls], T.asloc, @@ -443,8 +467,8 @@ emitins(Ins i, Fn *fn, FILE *f) * conversion to 2-address in emitf() would fail */ if (req(i.to, i.arg[1])) { i.arg[1] = TMP(XMM0+15); - emitf("mov%k %=, %1", &i, fn, f); - emitf("mov%k %0, %=", &i, fn, f); + emitf("mov%k %=, %1", &i, e); + emitf("mov%k %0, %=", &i, e); i.arg[0] = i.to; } goto Table; @@ -460,53 +484,54 @@ emitins(Ins i, Fn *fn, FILE *f) t0 = rtype(i.arg[0]); if (i.cls == Kl && t0 == RCon - && fn->con[i.arg[0].val].type == CBits) { - val = fn->con[i.arg[0].val].bits.i; + && e->fn->con[i.arg[0].val].type == CBits) { + val = e->fn->con[i.arg[0].val].bits.i; if (isreg(i.to)) if (val >= 0 && val <= UINT32_MAX) { - emitf("movl %W0, %W=", &i, fn, f); + emitf("movl %W0, %W=", &i, e); break; } if (rtype(i.to) == RSlot) if (val < INT32_MIN || val > INT32_MAX) { - emitf("movl %0, %=", &i, fn, f); - emitf("movl %0>>32, 4+%=", &i, fn, f); + emitf("movl %0, %=", &i, e); + emitf("movl %0>>32, 4+%=", &i, e); break; } } if (isreg(i.to) && t0 == RCon - && fn->con[i.arg[0].val].type == CAddr) { - emitf("lea%k %M0, %=", &i, fn, f); + && e->fn->con[i.arg[0].val].type == CAddr) { + emitf("lea%k %M0, %=", &i, e); break; } if (rtype(i.to) == RSlot && (t0 == RSlot || t0 == RMem)) { i.cls = KWIDE(i.cls) ? Kd : Ks; i.arg[1] = TMP(XMM0+15); - emitf("mov%k %0, %1", &i, fn, f); - emitf("mov%k %1, %=", &i, fn, f); + emitf("mov%k %0, %1", &i, e); + emitf("mov%k %1, %=", &i, e); break; } /* conveniently, the assembler knows if it * should use movabsq when reading movq */ - emitf("mov%k %0, %=", &i, fn, f); + emitf("mov%k %0, %=", &i, e); break; case Oaddr: if (!T.apple && rtype(i.arg[0]) == RCon - && fn->con[i.arg[0].val].sym.type == SThr) { + && e->fn->con[i.arg[0].val].sym.type == SThr) { /* derive the symbol address from the TCB * address at offset 0 of %fs */ assert(isreg(i.to)); - con = &fn->con[i.arg[0].val]; + con = &e->fn->con[i.arg[0].val]; sym = str(con->sym.id); - emitf("movq %%fs:0, %L=", &i, fn, f); - fprintf(f, "\tleaq %s%s@tpoff", + emitf("movq %%fs:0, %L=", &i, e); + fprintf(e->f, "\tleaq %s%s@tpoff", sym[0] == '"' ? "" : T.assym, sym); if (con->bits.i) - fprintf(f, "%+"PRId64, con->bits.i); - fprintf(f, "(%%%s), %%%s\n", + fprintf(e->f, "%+"PRId64, + con->bits.i); + fprintf(e->f, "(%%%s), %%%s\n", regtoa(i.to.val, SLong), regtoa(i.to.val, SLong)); break; @@ -517,12 +542,12 @@ emitins(Ins i, Fn *fn, FILE *f) * assembly... */ switch (rtype(i.arg[0])) { case RCon: - fprintf(f, "\tcallq "); - emitcon(&fn->con[i.arg[0].val], f); - fprintf(f, "\n"); + fprintf(e->f, "\tcallq "); + emitcon(&e->fn->con[i.arg[0].val], e); + fprintf(e->f, "\n"); break; case RTmp: - emitf("callq *%L0", &i, fn, f); + emitf("callq *%L0", &i, e); break; default: die("invalid call argument"); @@ -533,9 +558,10 @@ emitins(Ins i, Fn *fn, FILE *f) * maybe we should split Osalloc in 2 different * instructions depending on the result */ - emitf("subq %L0, %%rsp", &i, fn, f); + assert(e->fp == RBP); + emitf("subq %L0, %%rsp", &i, e); if (!req(i.to, R)) - emitcopy(i.to, TMP(RSP), Kl, fn, f); + emitcopy(i.to, TMP(RSP), Kl, e); break; case Oswap: if (KBASE(i.cls) == 0) @@ -543,27 +569,35 @@ emitins(Ins i, Fn *fn, FILE *f) /* for floats, there is no swap instruction * so we use xmm15 as a temporary */ - emitcopy(TMP(XMM0+15), i.arg[0], i.cls, fn, f); - emitcopy(i.arg[0], i.arg[1], i.cls, fn, f); - emitcopy(i.arg[1], TMP(XMM0+15), i.cls, fn, f); + emitcopy(TMP(XMM0+15), i.arg[0], i.cls, e); + emitcopy(i.arg[0], i.arg[1], i.cls, e); + emitcopy(i.arg[1], TMP(XMM0+15), i.cls, e); break; case Odbgloc: - emitdbgloc(i.arg[0].val, i.arg[1].val, f); + emitdbgloc(i.arg[0].val, i.arg[1].val, e->f); break; } } -static uint64_t -framesz(Fn *fn) +static void +framesz(E *e) { uint64_t i, o, f; /* specific to NAlign == 3 */ - for (i=0, o=0; ireg >> amd64_sysv_rclob[i]); - f = fn->slot; + o = 0; + if (!e->fn->leaf) { + for (i=0, o=0; ifn->reg >> amd64_sysv_rclob[i]; + o &= 1; + } + f = e->fn->slot; f = (f + 3) & -4; - return 4*f + 8*o + 176*fn->vararg; + if (f > 0 + && e->fp == RSP + && e->fn->salign == 4) + f += 2; + e->fsz = 4*f + 8*o + 176*e->fn->vararg; } void @@ -578,13 +612,19 @@ amd64_emitfn(Fn *fn, FILE *f) Blk *b, *s; Ins *i, itmp; int *r, c, o, n, lbl; - uint64_t fs; + E *e; + e = &(E){.f = f, .fn = fn}; emitfnlnk(fn->name, &fn->lnk, f); - fputs("\tendbr64\n\tpushq %rbp\n\tmovq %rsp, %rbp\n", f); - fs = framesz(fn); - if (fs) - fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", fs); + fputs("\tendbr64\n", f); + if (!fn->leaf || fn->vararg || fn->dynalloc) { + e->fp = RBP; + fputs("\tpushq %rbp\n\tmovq %rsp, %rbp\n", f); + } else + e->fp = RSP; + framesz(e); + if (e->fsz) + fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", e->fsz); if (fn->vararg) { o = -176; for (r=amd64_sysv_rsave; r<&amd64_sysv_rsave[6]; r++, o+=8) @@ -595,15 +635,15 @@ amd64_emitfn(Fn *fn, FILE *f) for (r=amd64_sysv_rclob; r<&amd64_sysv_rclob[NCLR]; r++) if (fn->reg & BIT(*r)) { itmp.arg[0] = TMP(*r); - emitf("pushq %L0", &itmp, fn, f); - fs += 8; + emitf("pushq %L0", &itmp, e); + e->nclob++; } for (lbl=0, b=fn->start; b; b=b->link) { if (lbl || b->npred > 1) fprintf(f, "%sbb%d:\n", T.asloc, id0+b->id); for (i=b->ins; i!=&b->ins[b->nins]; i++) - emitins(*i, fn, f); + emitins(*i, e); lbl = 1; switch (b->jmp.type) { case Jhlt: @@ -614,17 +654,19 @@ amd64_emitfn(Fn *fn, FILE *f) fprintf(f, "\tmovq %%rbp, %%rsp\n" "\tsubq $%"PRIu64", %%rsp\n", - fs - ); + e->fsz + e->nclob * 8); for (r=&amd64_sysv_rclob[NCLR]; r>amd64_sysv_rclob;) if (fn->reg & BIT(*--r)) { itmp.arg[0] = TMP(*r); - emitf("popq %L0", &itmp, fn, f); + emitf("popq %L0", &itmp, e); } - fprintf(f, - "\tleave\n" - "\tret\n" - ); + if (e->fp == RBP) + fputs("\tleave\n", f); + else if (e->fsz) + fprintf(f, + "\taddq $%"PRIu64", %%rsp\n", + e->fsz); + fputs("\tret\n", f); break; case Jjmp: Jmp: diff --git a/amd64/isel.c b/amd64/isel.c index 2b92878..bd645ce 100644 --- a/amd64/isel.c +++ b/amd64/isel.c @@ -808,6 +808,7 @@ amd64_isel(Fn *fn) die("alloc too large"); fn->tmp[i->to.val].slot = fn->slot; fn->slot += sz; + fn->salign = 2 + al - Oalloc; *i = (Ins){.op = Onop}; } diff --git a/parse.c b/parse.c index caab452..e896679 100644 --- a/parse.c +++ b/parse.c @@ -694,6 +694,7 @@ parseline(PState ps) goto Ins; } if (op == Tcall) { + curf->leaf = 0; arg[0] = parseref(); parserefl(1); op = Ocall; @@ -910,6 +911,7 @@ parsefn(Lnk *lnk) curf->con[0].bits.i = 0xdeaddead; /* UNDEF */ curf->con[1].type = CBits; curf->lnk = *lnk; + curf->leaf = 1; blink = &curf->start; curf->retty = Kx; if (peek() != Tglo) From 90050202f57b22243f5d3dd434a81df2f89de9ed Mon Sep 17 00:00:00 2001 From: Quentin Carbonneaux Date: Tue, 1 Oct 2024 19:38:15 +0200 Subject: [PATCH 4/4] fix various codegen bugs on arm64 - dynamic allocations could generate bad 'and' instructions (for the and with -16 in salloc()). - symbols used in w context would generate adrp and add instructions on wN registers while they seem to only work on xN registers. Thanks to Rosie for reporting them. --- arm64/emit.c | 19 ++++++++++++++----- test/isel5.ssa | 16 ++++++++++++++++ 2 files changed, 30 insertions(+), 5 deletions(-) create mode 100644 test/isel5.ssa diff --git a/arm64/emit.c b/arm64/emit.c index ffdc178..28cd6a5 100644 --- a/arm64/emit.c +++ b/arm64/emit.c @@ -160,7 +160,8 @@ emitf(char *s, Ins *i, E *e) Ref r; int k, c; Con *pc; - uint n, sp; + uint64_t n; + uint sp; fputc('\t', e->f); @@ -217,10 +218,17 @@ emitf(char *s, Ins *i, E *e) pc = &e->fn->con[r.val]; n = pc->bits.i; assert(pc->type == CBits); - if (n & 0xfff000) - fprintf(e->f, "#%u, lsl #12", n>>12); - else - fprintf(e->f, "#%u", n); + if (n >> 24) { + assert(arm64_logimm(n, k)); + fprintf(e->f, "#%"PRIu64, n); + } else if (n & 0xfff000) { + assert(!(n & ~0xfff000ull)); + fprintf(e->f, "#%"PRIu64", lsl #12", + n>>12); + } else { + assert(!(n & ~0xfffull)); + fprintf(e->f, "#%"PRIu64, n); + } break; } break; @@ -304,6 +312,7 @@ loadcon(Con *c, int r, int k, E *e) rn = rname(r, k); n = c->bits.i; if (c->type == CAddr) { + rn = rname(r, Kl); loadaddr(c, rn, e); return; } diff --git a/test/isel5.ssa b/test/isel5.ssa new file mode 100644 index 0000000..9c546d7 --- /dev/null +++ b/test/isel5.ssa @@ -0,0 +1,16 @@ +# make sure the local symbols used for +# fp constants do not get a _ prefix +# on apple arm hardware + +export function w $main() { +@start + %r =d copy d_1.2 + %x =w call $printf(l $fmt, ..., d %r) + ret 0 +} + +data $fmt = { b "%.06f\n", b 0 } + +# >>> output +# 1.200000 +# <<<