Simplify int mul/udiv/urem of 2^N into shl/shr/and.

Passes the "standard" test suite.

(cproc bootstrap, hare[c] make test, roland units, linpack/coremark run)

However linpack benchmark is now notably slower. Coremark is ~2% faster.

As noticed before, linmark timing is dubious, and maybe my cheap (AMD) laptop
prefers mul to shl.
This commit is contained in:
Roland Paterson-Jones 2024-06-13 20:58:52 +02:00 committed by Quentin Carbonneaux
parent cf9f95f521
commit 7a19de5cf4

54
simpl.c
View file

@ -30,16 +30,43 @@ blit(Ref sd[2], int sz, Fn *fn)
}
}
static int
ulog2_tab64[64] = {
63, 0, 58, 1, 59, 47, 53, 2,
60, 39, 48, 27, 54, 33, 42, 3,
61, 51, 37, 40, 49, 18, 28, 20,
55, 30, 34, 11, 43, 14, 22, 4,
62, 57, 46, 52, 38, 26, 32, 41,
50, 36, 17, 19, 29, 10, 13, 21,
56, 45, 25, 31, 35, 16, 9, 12,
44, 24, 15, 8, 23, 7, 6, 5
};
static int
ulog2(uint64_t pow2)
{
return ulog2_tab64[(pow2 * 0x07EDD5E59A4E28C2) >> 58];
}
static int
ispow2(uint64_t v)
{
return (v & (v - 1)) == 0;
}
static void
ins(Ins **pi, int *new, Blk *b, Fn *fn)
{
ulong ni;
Con *c;
Ins *i;
Ref r;
int n;
i = *pi;
/* simplify more instructions here;
* copy 0 into xor, mul 2^n into shift,
* bit rotations, ... */
* copy 0 into xor bit rotations,
* etc. */
switch (i->op) {
case Oblit1:
assert(i > b->ins);
@ -54,6 +81,29 @@ ins(Ins **pi, int *new, Blk *b, Fn *fn)
blit((i-1)->arg, rsval(i->arg[0]), fn);
*pi = i-1;
break;
case Omul:
case Oudiv:
case Ourem:
r = i->arg[1];
if (KBASE(i->cls) == 0)
if (rtype(r) == RCon) {
c = &fn->con[r.val];
if (c->type == CBits)
if (ispow2(c->bits.i)) {
n = ulog2(c->bits.i);
if (i->op == Ourem) {
i->op = Oand;
i->arg[1] = getcon((1ull<<n) - 1, fn);
} else if (i->op == Oudiv) {
i->op = Oshr;
i->arg[1] = getcon(n, fn);
} else if (i->op == Omul) {
i->op = Oshl;
i->arg[1] = getcon(n, fn);
}
}
}
/* fall through */
default:
if (*new)
emiti(*i);