libqbe/mem.c
Quentin Carbonneaux 15e25a61b3 fix coalesce() to produce valid ssa
When multiple stack slots are coalesced
one 'alloc' instruction is kept in the il
and the other ones are removed and have
their uses replaced by the result of the
selected one. To produce valid ssa, it
must be ensured that the uses that get
replaced are dominated by the selected
'alloc' instruction. This patch ensures
dominance by moving the selected alloc up
in the start block as necessary.
2022-12-14 23:05:35 +01:00

413 lines
7.6 KiB
C

#include "all.h"
typedef struct Range Range;
typedef struct Slot Slot;
/* require use, maintains use counts */
void
promote(Fn *fn)
{
Blk *b;
Ins *i, *l;
Tmp *t;
Use *u, *ue;
int s, k;
/* promote uniform stack slots to temporaries */
b = fn->start;
for (i=b->ins; i<&b->ins[b->nins]; i++) {
if (Oalloc > i->op || i->op > Oalloc1)
continue;
/* specific to NAlign == 3 */
assert(rtype(i->to) == RTmp);
t = &fn->tmp[i->to.val];
if (t->ndef != 1)
goto Skip;
k = -1;
s = -1;
for (u=t->use; u<&t->use[t->nuse]; u++) {
if (u->type != UIns)
goto Skip;
l = u->u.ins;
if (isload(l->op))
if (s == -1 || s == loadsz(l)) {
s = loadsz(l);
continue;
}
if (isstore(l->op))
if (req(i->to, l->arg[1]) && !req(i->to, l->arg[0]))
if (s == -1 || s == storesz(l))
if (k == -1 || k == optab[l->op].argcls[0][0]) {
s = storesz(l);
k = optab[l->op].argcls[0][0];
continue;
}
goto Skip;
}
/* get rid of the alloc and replace uses */
*i = (Ins){.op = Onop};
t->ndef--;
ue = &t->use[t->nuse];
for (u=t->use; u!=ue; u++) {
l = u->u.ins;
if (isstore(l->op)) {
l->cls = k;
l->op = Ocopy;
l->to = l->arg[1];
l->arg[1] = R;
t->nuse--;
t->ndef++;
} else {
if (k == -1)
err("slot %%%s is read but never stored to",
fn->tmp[l->arg[0].val].name);
/* try to turn loads into copies so we
* can eliminate them later */
switch(l->op) {
case Oloadsw:
case Oloaduw:
if (k == Kl)
goto Extend;
/* fall through */
case Oload:
if (KBASE(k) != KBASE(l->cls))
l->op = Ocast;
else
l->op = Ocopy;
break;
default:
Extend:
l->op = Oextsb + (l->op - Oloadsb);
break;
}
}
}
Skip:;
}
if (debug['M']) {
fprintf(stderr, "\n> After slot promotion:\n");
printfn(fn, stderr);
}
}
/* [a, b) with 0 <= a */
struct Range {
int a, b;
};
struct Slot {
int t;
int sz;
bits m;
bits l;
Range r;
Slot *s;
};
static inline int
rin(Range r, int n)
{
return r.a <= n && n < r.b;
}
static inline int
rovlap(Range r0, Range r1)
{
return r0.b && r1.b && r0.a < r1.b && r1.a < r0.b;
}
static void
radd(Range *r, int n)
{
if (!r->b)
*r = (Range){n, n+1};
else if (n < r->a)
r->a = n;
else if (n >= r->b)
r->b = n+1;
}
static int
slot(Slot **ps, int64_t *off, Ref r, Fn *fn, Slot *sl)
{
Alias a;
Tmp *t;
getalias(&a, r, fn);
if (a.type != ALoc)
return 0;
t = &fn->tmp[a.base];
if (t->visit < 0)
return 0;
*off = a.offset;
*ps = &sl[t->visit];
return 1;
}
static void
load(Ref r, bits x, int ip, Fn *fn, Slot *sl)
{
int64_t off;
Slot *s;
if (slot(&s, &off, r, fn, sl)) {
s->l |= x << off;
s->l &= s->m;
if (s->l)
radd(&s->r, ip);
}
}
static void
store(Ref r, bits x, int ip, Fn *fn, Slot *sl)
{
int64_t off;
Slot *s;
if (slot(&s, &off, r, fn, sl)) {
if (s->l)
radd(&s->r, ip);
s->l &= ~(x << off);
}
}
static int
scmp(const void *pa, const void *pb)
{
Slot *a, *b;
a = (Slot *)pa, b = (Slot *)pb;
if (a->sz != b->sz)
return b->sz - a->sz;
return a->r.a - b->r.a;
}
static void
maxrpo(Blk *hd, Blk *b)
{
if (hd->loop < (int)b->id)
hd->loop = b->id;
}
void
coalesce(Fn *fn)
{
Range r, *br;
Slot *s, *s0, *sl;
Blk *b, **ps, *succ[3];
Ins *i;
Use *u;
Tmp *t, *ts;
Ref *arg;
bits x;
int n, m, nsl, ip, *stk;
uint total, freed, fused;
/* minimize the stack usage
* by coalescing slots
*/
nsl = 0;
sl = vnew(0, sizeof sl[0], PHeap);
for (n=Tmp0; n<fn->ntmp; n++) {
t = &fn->tmp[n];
t->visit = -1;
if (t->alias.type == ALoc)
if (t->alias.slot == &t->alias)
if (t->bid == fn->start->id)
if (t->alias.u.loc.sz != -1) {
t->visit = nsl;
vgrow(&sl, ++nsl);
s = &sl[nsl-1];
s->t = n;
s->sz = t->alias.u.loc.sz;
s->m = t->alias.u.loc.m;
s->s = 0;
}
}
/* one-pass liveness analysis */
for (b=fn->start; b; b=b->link)
b->loop = -1;
loopiter(fn, maxrpo);
br = emalloc(fn->nblk * sizeof br[0]);
ip = INT_MAX - 1;
for (n=fn->nblk-1; n>=0; n--) {
b = fn->rpo[n];
succ[0] = b->s1;
succ[1] = b->s2;
succ[2] = 0;
br[n].b = ip--;
for (s=sl; s<&sl[nsl]; s++) {
s->l = 0;
for (ps=succ; *ps; ps++) {
m = (*ps)->id;
if (m > n && rin(s->r, br[m].a)) {
s->l = s->m;
radd(&s->r, ip);
}
}
}
for (i=&b->ins[b->nins]; i!=b->ins;) {
arg = (--i)->arg;
if (i->op == Oargc) {
load(arg[1], -1, --ip, fn, sl);
}
if (isload(i->op)) {
x = BIT(loadsz(i)) - 1;
load(arg[0], x, --ip, fn, sl);
}
if (isstore(i->op)) {
x = BIT(storesz(i)) - 1;
store(arg[1], x, ip--, fn, sl);
}
}
for (s=sl; s<&sl[nsl]; s++)
if (s->l) {
radd(&s->r, ip);
if (b->loop != -1) {
assert(b->loop > n);
radd(&s->r, br[b->loop].b - 1);
}
}
br[n].a = ip;
}
free(br);
/* kill slots with an empty live range */
total = 0;
freed = 0;
stk = vnew(0, sizeof stk[0], PHeap);
n = 0;
for (s=s0=sl; s<&sl[nsl]; s++) {
total += s->sz;
if (!s->r.b) {
vgrow(&stk, ++n);
stk[n-1] = s->t;
freed += s->sz;
} else
*s0++ = *s;
}
nsl = s0-sl;
if (debug['M']) {
fputs("\n> Slot coalescing:\n", stderr);
if (n) {
fputs("\tkill [", stderr);
for (m=0; m<n; m++)
fprintf(stderr, " %%%s",
fn->tmp[stk[m]].name);
fputs(" ]\n", stderr);
}
}
while (n--) {
t = &fn->tmp[stk[n]];
assert(t->ndef == 1 && t->def);
*t->def = (Ins){.op = Onop};
for (u=t->use; u<&t->use[t->nuse]; u++) {
if (u->type == UJmp) {
b = fn->rpo[u->bid];
b->jmp.arg = CON_Z;
continue;
}
assert(u->type == UIns);
i = u->u.ins;
/* make loads crash */
if (isload(i->op))
i->arg[0] = CON_Z;
else if (i->op == Oargc)
i->arg[1] = CON_Z;
else if (!req(i->to, R)) {
assert(rtype(i->to) == RTmp);
vgrow(&stk, ++n);
stk[n-1] = i->to.val;
} else {
assert(!isarg(i->op));
*i = (Ins){.op = Onop};
}
}
}
vfree(stk);
/* fuse slots by decreasing size */
qsort(sl, nsl, sizeof *sl, scmp);
fused = 0;
for (n=0; n<nsl; n++) {
s0 = &sl[n];
if (s0->s)
continue;
s0->s = s0;
r = s0->r;
for (s=s0+1; s<&sl[nsl]; s++) {
if (s->s || !s->r.b)
goto Skip;
if (rovlap(r, s->r))
/* O(n) can be approximated
* by 'goto Skip;' if need be
*/
for (m=n; &sl[m]<s; m++)
if (sl[m].s == s0)
if (rovlap(sl[m].r, s->r))
goto Skip;
radd(&r, s->r.a);
radd(&r, s->r.b - 1);
s->s = s0;
fused += s->sz;
Skip:;
}
}
/* substitute fused slots */
for (s=sl; s<&sl[nsl]; s++) {
t = &fn->tmp[s->t];
assert(t->ndef == 1 && t->def);
if (s->s == s)
continue;
*t->def = (Ins){.op = Onop};
ts = &fn->tmp[s->s->t];
assert(t->bid == ts->bid);
if (t->def < ts->def) {
/* make sure the slot we
* selected has a def that
* dominates its new uses
*/
*t->def = *ts->def;
*ts->def = (Ins){.op = Onop};
ts->def = t->def;
}
for (u=t->use; u<&t->use[t->nuse]; u++) {
if (u->type == UJmp) {
b = fn->rpo[u->bid];
b->jmp.arg = TMP(s->s->t);
continue;
}
assert(u->type == UIns);
arg = u->u.ins->arg;
for (n=0; n<2; n++)
if (req(arg[n], TMP(s->t)))
arg[n] = TMP(s->s->t);
}
}
if (debug['M']) {
for (s0=sl; s0<&sl[nsl]; s0++) {
if (s0->s != s0)
continue;
fprintf(stderr, "\tfuse (% 3db) [", s0->sz);
for (s=s0; s<&sl[nsl]; s++) {
if (s->s != s0)
continue;
fprintf(stderr, " %%%s", fn->tmp[s->t].name);
if (s->r.b)
fprintf(stderr, "[%d,%d)",
s->r.a-ip, s->r.b-ip);
else
fputs("{}", stderr);
}
fputs(" ]\n", stderr);
}
fprintf(stderr, "\tsums %u/%u/%u (killed/fused/total)\n\n",
freed, fused, total);
printfn(fn, stderr);
}
vfree(sl);
}