From efcb5465e597571820391bee450dd1e8def300b4 Mon Sep 17 00:00:00 2001
From: Quentin Carbonneaux <quentin@c9x.me>
Date: Thu, 15 Aug 2024 23:11:20 +0200
Subject: [PATCH 1/4] align emitted code

Functions are now aligned on 16-byte
boundaries. This mimics gcc and should
help reduce the maximum perf impact of
cosmetic code changes. Previously, any
change in the output of qbe could have
far-reaching implications on alignment.

Thanks to Roland Paterson-Jones for
pointing out the variability issue.
---
 parse.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/parse.c b/parse.c
index a745779..caab452 100644
--- a/parse.c
+++ b/parse.c
@@ -1219,6 +1219,7 @@ parse(FILE *f, char *path, void dbgfile(char *), void data(Dat *), void func(Fn
 			dbgfile(tokval.str);
 			break;
 		case Tfunc:
+			lnk.align = 16;
 			func(parsefn(&lnk));
 			break;
 		case Tdata:

From bb8de8c63362b7234db02482240d5600203225d9 Mon Sep 17 00:00:00 2001
From: Alexey Yerin <yyp@disroot.org>
Date: Fri, 2 Aug 2024 14:39:07 +0300
Subject: [PATCH 2/4] arm64/isel: Avoid signed overflow when handling
 immediates

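Negating INT64_MIN is a signed overflow, which is undefined behavior;
Clang exploits this when optimizing with -O2 and causes QBE to emit 0
in place of INT64_MIN. Negate in unsigned arithmetic instead.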
---
 arm64/isel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arm64/isel.c b/arm64/isel.c
index 062beb3..9ce6adc 100644
--- a/arm64/isel.c
+++ b/arm64/isel.c
@@ -24,7 +24,7 @@ imm(Con *c, int k, int64_t *pn)
 	i = Iplo12;
 	if (n < 0) {
 		i = Inlo12;
-		n = -n;
+		n = -(uint64_t)n;
 	}
 	*pn = n;
 	if ((n & 0x000fff) == n)

From 626f0b278137ff6f8b7d910d9b3fc3cbdfbb39fc Mon Sep 17 00:00:00 2001
From: Quentin Carbonneaux <quentin@c9x.me>
Date: Tue, 20 Aug 2024 15:20:42 +0200
Subject: [PATCH 3/4] skip preludes for some leaf fns

When rbp is not necessary to compile
a leaf function, we skip saving and
restoring it.
---
 all.h        |   2 +
 amd64/emit.c | 238 ++++++++++++++++++++++++++++++---------------------
 amd64/isel.c |   1 +
 parse.c      |   2 +
 4 files changed, 145 insertions(+), 98 deletions(-)

diff --git a/all.h b/all.h
index 3479d27..97cc41c 100644
--- a/all.h
+++ b/all.h
@@ -393,8 +393,10 @@ struct Fn {
 	Blk **rpo;
 	bits reg;
 	int slot;
+	int salign;
 	char vararg;
 	char dynalloc;
+	char leaf;
 	char name[NString];
 	Lnk lnk;
 };
diff --git a/amd64/emit.c b/amd64/emit.c
index 00dd80f..8f36188 100644
--- a/amd64/emit.c
+++ b/amd64/emit.c
@@ -1,6 +1,16 @@
 #include "all.h"
 
 
+typedef struct E E;
+
+struct E {
+	FILE *f;
+	Fn *fn;
+	int fp;
+	uint64_t fsz;
+	int nclob;
+};
+
 #define CMP(X) \
 	X(Ciule,      "be") \
 	X(Ciult,      "b")  \
@@ -142,23 +152,29 @@ static char *rname[][4] = {
 
 
 static int
-slot(Ref r, Fn *fn)
+slot(Ref r, E *e)
 {
 	int s;
 
 	s = rsval(r);
-	assert(s <= fn->slot);
+	assert(s <= e->fn->slot);
 	/* specific to NAlign == 3 */
-	if (s < 0)
-		return -4 * s;
-	else if (fn->vararg)
-		return -176 + -4 * (fn->slot - s);
+	if (s < 0) {
+		if (e->fp == RSP)
+			return 4*-s - 8 + e->fsz + e->nclob*8;
+		else
+			return 4*-s;
+	}
+	else if (e->fp == RSP)
+		return 4*s + e->nclob*8;
+	else if (e->fn->vararg)
+		return -176 + -4 * (e->fn->slot - s);
 	else
-		return -4 * (fn->slot - s);
+		return -4 * (e->fn->slot - s);
 }
 
 static void
-emitcon(Con *con, FILE *f)
+emitcon(Con *con, E *e)
 {
 	char *p, *l;
 
@@ -168,16 +184,16 @@ emitcon(Con *con, FILE *f)
 		p = l[0] == '"' ? "" : T.assym;
 		if (con->sym.type == SThr) {
 			if (T.apple)
-				fprintf(f, "%s%s@TLVP", p, l);
+				fprintf(e->f, "%s%s@TLVP", p, l);
 			else
-				fprintf(f, "%%fs:%s%s@tpoff", p, l);
+				fprintf(e->f, "%%fs:%s%s@tpoff", p, l);
 		} else
-			fprintf(f, "%s%s", p, l);
+			fprintf(e->f, "%s%s", p, l);
 		if (con->bits.i)
-			fprintf(f, "%+"PRId64, con->bits.i);
+			fprintf(e->f, "%+"PRId64, con->bits.i);
 		break;
 	case CBits:
-		fprintf(f, "%"PRId64, con->bits.i);
+		fprintf(e->f, "%"PRId64, con->bits.i);
 		break;
 	default:
 		die("unreachable");
@@ -212,10 +228,10 @@ getarg(char c, Ins *i)
 	}
 }
 
-static void emitins(Ins, Fn *, FILE *);
+static void emitins(Ins, E *);
 
 static void
-emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f)
+emitcopy(Ref r1, Ref r2, int k, E *e)
 {
 	Ins icp;
 
@@ -223,11 +239,11 @@ emitcopy(Ref r1, Ref r2, int k, Fn *fn, FILE *f)
 	icp.arg[0] = r2;
 	icp.to = r1;
 	icp.cls = k;
-	emitins(icp, fn, f);
+	emitins(icp, e);
 }
 
 static void
-emitf(char *s, Ins *i, Fn *fn, FILE *f)
+emitf(char *s, Ins *i, E *e)
 {
 	static char clstoa[][3] = {"l", "q", "ss", "sd"};
 	char c;
@@ -247,25 +263,25 @@ emitf(char *s, Ins *i, Fn *fn, FILE *f)
 	case '-':
 		assert((!req(i->arg[1], i->to) || req(i->arg[0], i->to)) &&
 			"cannot convert to 2-address");
-		emitcopy(i->to, i->arg[0], i->cls, fn, f);
+		emitcopy(i->to, i->arg[0], i->cls, e);
 		s++;
 		break;
 	}
 
-	fputc('\t', f);
+	fputc('\t', e->f);
 Next:
 	while ((c = *s++) != '%')
 		if (!c) {
-			fputc('\n', f);
+			fputc('\n', e->f);
 			return;
 		} else
-			fputc(c, f);
+			fputc(c, e->f);
 	switch ((c = *s++)) {
 	case '%':
-		fputc('%', f);
+		fputc('%', e->f);
 		break;
 	case 'k':
-		fputs(clstoa[i->cls], f);
+		fputs(clstoa[i->cls], e->f);
 		break;
 	case '0':
 	case '1':
@@ -282,37 +298,42 @@ Next:
 		switch (rtype(ref)) {
 		case RTmp:
 			assert(isreg(ref));
-			fprintf(f, "%%%s", regtoa(ref.val, sz));
+			fprintf(e->f, "%%%s", regtoa(ref.val, sz));
 			break;
 		case RSlot:
-			fprintf(f, "%d(%%rbp)", slot(ref, fn));
+			fprintf(e->f, "%d(%%%s)",
+				slot(ref, e),
+				regtoa(e->fp, SLong)
+			);
 			break;
 		case RMem:
 		Mem:
-			m = &fn->mem[ref.val];
+			m = &e->fn->mem[ref.val];
 			if (rtype(m->base) == RSlot) {
 				off.type = CBits;
-				off.bits.i = slot(m->base, fn);
+				off.bits.i = slot(m->base, e);
 				addcon(&m->offset, &off, 1);
-				m->base = TMP(RBP);
+				m->base = TMP(e->fp);
 			}
 			if (m->offset.type != CUndef)
-				emitcon(&m->offset, f);
-			fputc('(', f);
+				emitcon(&m->offset, e);
+			fputc('(', e->f);
 			if (!req(m->base, R))
-				fprintf(f, "%%%s", regtoa(m->base.val, SLong));
+				fprintf(e->f, "%%%s",
+					regtoa(m->base.val, SLong)
+				);
 			else if (m->offset.type == CAddr)
-				fprintf(f, "%%rip");
+				fprintf(e->f, "%%rip");
 			if (!req(m->index, R))
-				fprintf(f, ", %%%s, %d",
+				fprintf(e->f, ", %%%s, %d",
 					regtoa(m->index.val, SLong),
 					m->scale
 				);
-			fputc(')', f);
+			fputc(')', e->f);
 			break;
 		case RCon:
-			fputc('$', f);
-			emitcon(&fn->con[ref.val], f);
+			fputc('$', e->f);
+			emitcon(&e->fn->con[ref.val], e);
 			break;
 		default:
 			die("unreachable");
@@ -337,18 +358,21 @@ Next:
 		case RMem:
 			goto Mem;
 		case RSlot:
-			fprintf(f, "%d(%%rbp)", slot(ref, fn));
+			fprintf(e->f, "%d(%%%s)",
+				slot(ref, e),
+				regtoa(e->fp, SLong)
+			);
 			break;
 		case RCon:
-			off = fn->con[ref.val];
-			emitcon(&off, f);
+			off = e->fn->con[ref.val];
+			emitcon(&off, e);
 			if (off.type == CAddr)
 			if (off.sym.type != SThr || T.apple)
-				fprintf(f, "(%%rip)");
+				fprintf(e->f, "(%%rip)");
 			break;
 		case RTmp:
 			assert(isreg(ref));
-			fprintf(f, "(%%%s)", regtoa(ref.val, SLong));
+			fprintf(e->f, "(%%%s)", regtoa(ref.val, SLong));
 			break;
 		default:
 			die("unreachable");
@@ -366,7 +390,7 @@ static void *negmask[4] = {
 };
 
 static void
-emitins(Ins i, Fn *fn, FILE *f)
+emitins(Ins i, E *e)
 {
 	Ref r;
 	int64_t val;
@@ -393,7 +417,7 @@ emitins(Ins i, Fn *fn, FILE *f)
 			|| (omap[o].cls == Ka))
 				break;
 		}
-		emitf(omap[o].fmt, &i, fn, f);
+		emitf(omap[o].fmt, &i, e);
 		break;
 	case Onop:
 		/* just do nothing for nops, they are inserted
@@ -410,7 +434,7 @@ emitins(Ins i, Fn *fn, FILE *f)
 		if (KBASE(i.cls) == 0 /* only available for ints */
 		&& rtype(i.arg[0]) == RCon
 		&& rtype(i.arg[1]) == RTmp) {
-			emitf("imul%k %0, %1, %=", &i, fn, f);
+			emitf("imul%k %0, %1, %=", &i, e);
 			break;
 		}
 		goto Table;
@@ -419,18 +443,18 @@ emitins(Ins i, Fn *fn, FILE *f)
 		 * some 3-address subtractions */
 		if (req(i.to, i.arg[1]) && !req(i.arg[0], i.to)) {
 			ineg = (Ins){Oneg, i.cls, i.to, {i.to}};
-			emitins(ineg, fn, f);
-			emitf("add%k %0, %=", &i, fn, f);
+			emitins(ineg, e);
+			emitf("add%k %0, %=", &i, e);
 			break;
 		}
 		goto Table;
 	case Oneg:
 		if (!req(i.to, i.arg[0]))
-			emitf("mov%k %0, %=", &i, fn, f);
+			emitf("mov%k %0, %=", &i, e);
 		if (KBASE(i.cls) == 0)
-			emitf("neg%k %=", &i, fn, f);
+			emitf("neg%k %=", &i, e);
 		else
-			fprintf(f,
+			fprintf(e->f,
 				"\txorp%c %sfp%d(%%rip), %%%s\n",
 				"xxsd"[i.cls],
 				T.asloc,
@@ -443,8 +467,8 @@ emitins(Ins i, Fn *fn, FILE *f)
 		 * conversion to 2-address in emitf() would fail */
 		if (req(i.to, i.arg[1])) {
 			i.arg[1] = TMP(XMM0+15);
-			emitf("mov%k %=, %1", &i, fn, f);
-			emitf("mov%k %0, %=", &i, fn, f);
+			emitf("mov%k %=, %1", &i, e);
+			emitf("mov%k %0, %=", &i, e);
 			i.arg[0] = i.to;
 		}
 		goto Table;
@@ -460,53 +484,54 @@ emitins(Ins i, Fn *fn, FILE *f)
 		t0 = rtype(i.arg[0]);
 		if (i.cls == Kl
 		&& t0 == RCon
-		&& fn->con[i.arg[0].val].type == CBits) {
-			val = fn->con[i.arg[0].val].bits.i;
+		&& e->fn->con[i.arg[0].val].type == CBits) {
+			val = e->fn->con[i.arg[0].val].bits.i;
 			if (isreg(i.to))
 			if (val >= 0 && val <= UINT32_MAX) {
-				emitf("movl %W0, %W=", &i, fn, f);
+				emitf("movl %W0, %W=", &i, e);
 				break;
 			}
 			if (rtype(i.to) == RSlot)
 			if (val < INT32_MIN || val > INT32_MAX) {
-				emitf("movl %0, %=", &i, fn, f);
-				emitf("movl %0>>32, 4+%=", &i, fn, f);
+				emitf("movl %0, %=", &i, e);
+				emitf("movl %0>>32, 4+%=", &i, e);
 				break;
 			}
 		}
 		if (isreg(i.to)
 		&& t0 == RCon
-		&& fn->con[i.arg[0].val].type == CAddr) {
-			emitf("lea%k %M0, %=", &i, fn, f);
+		&& e->fn->con[i.arg[0].val].type == CAddr) {
+			emitf("lea%k %M0, %=", &i, e);
 			break;
 		}
 		if (rtype(i.to) == RSlot
 		&& (t0 == RSlot || t0 == RMem)) {
 			i.cls = KWIDE(i.cls) ? Kd : Ks;
 			i.arg[1] = TMP(XMM0+15);
-			emitf("mov%k %0, %1", &i, fn, f);
-			emitf("mov%k %1, %=", &i, fn, f);
+			emitf("mov%k %0, %1", &i, e);
+			emitf("mov%k %1, %=", &i, e);
 			break;
 		}
 		/* conveniently, the assembler knows if it
 		 * should use movabsq when reading movq */
-		emitf("mov%k %0, %=", &i, fn, f);
+		emitf("mov%k %0, %=", &i, e);
 		break;
 	case Oaddr:
 		if (!T.apple
 		&& rtype(i.arg[0]) == RCon
-		&& fn->con[i.arg[0].val].sym.type == SThr) {
+		&& e->fn->con[i.arg[0].val].sym.type == SThr) {
 			/* derive the symbol address from the TCB
 			 * address at offset 0 of %fs */
 			assert(isreg(i.to));
-			con = &fn->con[i.arg[0].val];
+			con = &e->fn->con[i.arg[0].val];
 			sym = str(con->sym.id);
-			emitf("movq %%fs:0, %L=", &i, fn, f);
-			fprintf(f, "\tleaq %s%s@tpoff",
+			emitf("movq %%fs:0, %L=", &i, e);
+			fprintf(e->f, "\tleaq %s%s@tpoff",
 				sym[0] == '"' ? "" : T.assym, sym);
 			if (con->bits.i)
-				fprintf(f, "%+"PRId64, con->bits.i);
-			fprintf(f, "(%%%s), %%%s\n",
+				fprintf(e->f, "%+"PRId64,
+					con->bits.i);
+			fprintf(e->f, "(%%%s), %%%s\n",
 				regtoa(i.to.val, SLong),
 				regtoa(i.to.val, SLong));
 			break;
@@ -517,12 +542,12 @@ emitins(Ins i, Fn *fn, FILE *f)
 		 * assembly... */
 		switch (rtype(i.arg[0])) {
 		case RCon:
-			fprintf(f, "\tcallq ");
-			emitcon(&fn->con[i.arg[0].val], f);
-			fprintf(f, "\n");
+			fprintf(e->f, "\tcallq ");
+			emitcon(&e->fn->con[i.arg[0].val], e);
+			fprintf(e->f, "\n");
 			break;
 		case RTmp:
-			emitf("callq *%L0", &i, fn, f);
+			emitf("callq *%L0", &i, e);
 			break;
 		default:
 			die("invalid call argument");
@@ -533,9 +558,10 @@ emitins(Ins i, Fn *fn, FILE *f)
 		 * maybe we should split Osalloc in 2 different
 		 * instructions depending on the result
 		 */
-		emitf("subq %L0, %%rsp", &i, fn, f);
+		assert(e->fp == RBP);
+		emitf("subq %L0, %%rsp", &i, e);
 		if (!req(i.to, R))
-			emitcopy(i.to, TMP(RSP), Kl, fn, f);
+			emitcopy(i.to, TMP(RSP), Kl, e);
 		break;
 	case Oswap:
 		if (KBASE(i.cls) == 0)
@@ -543,27 +569,35 @@ emitins(Ins i, Fn *fn, FILE *f)
 		/* for floats, there is no swap instruction
 		 * so we use xmm15 as a temporary
 		 */
-		emitcopy(TMP(XMM0+15), i.arg[0], i.cls, fn, f);
-		emitcopy(i.arg[0], i.arg[1], i.cls, fn, f);
-		emitcopy(i.arg[1], TMP(XMM0+15), i.cls, fn, f);
+		emitcopy(TMP(XMM0+15), i.arg[0], i.cls, e);
+		emitcopy(i.arg[0], i.arg[1], i.cls, e);
+		emitcopy(i.arg[1], TMP(XMM0+15), i.cls, e);
 		break;
 	case Odbgloc:
-		emitdbgloc(i.arg[0].val, i.arg[1].val, f);
+		emitdbgloc(i.arg[0].val, i.arg[1].val, e->f);
 		break;
 	}
 }
 
-static uint64_t
-framesz(Fn *fn)
+static void
+framesz(E *e)
 {
 	uint64_t i, o, f;
 
 	/* specific to NAlign == 3 */
-	for (i=0, o=0; i<NCLR; i++)
-		o ^= 1 & (fn->reg >> amd64_sysv_rclob[i]);
-	f = fn->slot;
+	o = 0;
+	if (!e->fn->leaf) {
+		for (i=0, o=0; i<NCLR; i++)
+			o ^= e->fn->reg >> amd64_sysv_rclob[i];
+		o &= 1;
+	}
+	f = e->fn->slot;
 	f = (f + 3) & -4;
-	return 4*f + 8*o + 176*fn->vararg;
+	if (f > 0
+	&& e->fp == RSP
+	&& e->fn->salign == 4)
+		f += 2;
+	e->fsz = 4*f + 8*o + 176*e->fn->vararg;
 }
 
 void
@@ -578,13 +612,19 @@ amd64_emitfn(Fn *fn, FILE *f)
 	Blk *b, *s;
 	Ins *i, itmp;
 	int *r, c, o, n, lbl;
-	uint64_t fs;
+	E *e;
 
+	e = &(E){.f = f, .fn = fn};
 	emitfnlnk(fn->name, &fn->lnk, f);
-	fputs("\tendbr64\n\tpushq %rbp\n\tmovq %rsp, %rbp\n", f);
-	fs = framesz(fn);
-	if (fs)
-		fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", fs);
+	fputs("\tendbr64\n", f);
+	if (!fn->leaf || fn->vararg || fn->dynalloc) {
+		e->fp = RBP;
+		fputs("\tpushq %rbp\n\tmovq %rsp, %rbp\n", f);
+	} else
+		e->fp = RSP;
+	framesz(e);
+	if (e->fsz)
+		fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", e->fsz);
 	if (fn->vararg) {
 		o = -176;
 		for (r=amd64_sysv_rsave; r<&amd64_sysv_rsave[6]; r++, o+=8)
@@ -595,15 +635,15 @@ amd64_emitfn(Fn *fn, FILE *f)
 	for (r=amd64_sysv_rclob; r<&amd64_sysv_rclob[NCLR]; r++)
 		if (fn->reg & BIT(*r)) {
 			itmp.arg[0] = TMP(*r);
-			emitf("pushq %L0", &itmp, fn, f);
-			fs += 8;
+			emitf("pushq %L0", &itmp, e);
+			e->nclob++;
 		}
 
 	for (lbl=0, b=fn->start; b; b=b->link) {
 		if (lbl || b->npred > 1)
 			fprintf(f, "%sbb%d:\n", T.asloc, id0+b->id);
 		for (i=b->ins; i!=&b->ins[b->nins]; i++)
-			emitins(*i, fn, f);
+			emitins(*i, e);
 		lbl = 1;
 		switch (b->jmp.type) {
 		case Jhlt:
@@ -614,17 +654,19 @@ amd64_emitfn(Fn *fn, FILE *f)
 				fprintf(f,
 					"\tmovq %%rbp, %%rsp\n"
 					"\tsubq $%"PRIu64", %%rsp\n",
-					fs
-				);
+					e->fsz + e->nclob * 8);
 			for (r=&amd64_sysv_rclob[NCLR]; r>amd64_sysv_rclob;)
 				if (fn->reg & BIT(*--r)) {
 					itmp.arg[0] = TMP(*r);
-					emitf("popq %L0", &itmp, fn, f);
+					emitf("popq %L0", &itmp, e);
 				}
-			fprintf(f,
-				"\tleave\n"
-				"\tret\n"
-			);
+			if (e->fp == RBP)
+				fputs("\tleave\n", f);
+			else if (e->fsz)
+				fprintf(f,
+					"\taddq $%"PRIu64", %%rsp\n",
+					e->fsz);
+			fputs("\tret\n", f);
 			break;
 		case Jjmp:
 		Jmp:
diff --git a/amd64/isel.c b/amd64/isel.c
index 2b92878..bd645ce 100644
--- a/amd64/isel.c
+++ b/amd64/isel.c
@@ -808,6 +808,7 @@ amd64_isel(Fn *fn)
 					die("alloc too large");
 				fn->tmp[i->to.val].slot = fn->slot;
 				fn->slot += sz;
+				fn->salign = 2 + al - Oalloc;
 				*i = (Ins){.op = Onop};
 			}
 
diff --git a/parse.c b/parse.c
index caab452..e896679 100644
--- a/parse.c
+++ b/parse.c
@@ -694,6 +694,7 @@ parseline(PState ps)
 		goto Ins;
 	}
 	if (op == Tcall) {
+		curf->leaf = 0;
 		arg[0] = parseref();
 		parserefl(1);
 		op = Ocall;
@@ -910,6 +911,7 @@ parsefn(Lnk *lnk)
 	curf->con[0].bits.i = 0xdeaddead;  /* UNDEF */
 	curf->con[1].type = CBits;
 	curf->lnk = *lnk;
+	curf->leaf = 1;
 	blink = &curf->start;
 	curf->retty = Kx;
 	if (peek() != Tglo)

From 90050202f57b22243f5d3dd434a81df2f89de9ed Mon Sep 17 00:00:00 2001
From: Quentin Carbonneaux <quentin@c9x.me>
Date: Tue, 1 Oct 2024 19:38:15 +0200
Subject: [PATCH 4/4] fix various codegen bugs on arm64

- dynamic allocations could generate
  bad 'and' instructions (for the
  and with -16 in salloc()).
- symbols used in w context would
  generate adrp and add instructions
  on wN registers while they seem to
  only work on xN registers.

Thanks to Rosie for reporting them.
---
 arm64/emit.c   | 19 ++++++++++++++-----
 test/isel5.ssa | 16 ++++++++++++++++
 2 files changed, 30 insertions(+), 5 deletions(-)
 create mode 100644 test/isel5.ssa

diff --git a/arm64/emit.c b/arm64/emit.c
index ffdc178..28cd6a5 100644
--- a/arm64/emit.c
+++ b/arm64/emit.c
@@ -160,7 +160,8 @@ emitf(char *s, Ins *i, E *e)
 	Ref r;
 	int k, c;
 	Con *pc;
-	uint n, sp;
+	uint64_t n;
+	uint sp;
 
 	fputc('\t', e->f);
 
@@ -217,10 +218,17 @@ emitf(char *s, Ins *i, E *e)
 				pc = &e->fn->con[r.val];
 				n = pc->bits.i;
 				assert(pc->type == CBits);
-				if (n & 0xfff000)
-					fprintf(e->f, "#%u, lsl #12", n>>12);
-				else
-					fprintf(e->f, "#%u", n);
+				if (n >> 24) {
+					assert(arm64_logimm(n, k));
+					fprintf(e->f, "#%"PRIu64, n);
+				} else if (n & 0xfff000) {
+					assert(!(n & ~0xfff000ull));
+					fprintf(e->f, "#%"PRIu64", lsl #12",
+						n>>12);
+				} else {
+					assert(!(n & ~0xfffull));
+					fprintf(e->f, "#%"PRIu64, n);
+				}
 				break;
 			}
 			break;
@@ -304,6 +312,7 @@ loadcon(Con *c, int r, int k, E *e)
 	rn = rname(r, k);
 	n = c->bits.i;
 	if (c->type == CAddr) {
+		rn = rname(r, Kl);
 		loadaddr(c, rn, e);
 		return;
 	}
diff --git a/test/isel5.ssa b/test/isel5.ssa
new file mode 100644
index 0000000..9c546d7
--- /dev/null
+++ b/test/isel5.ssa
@@ -0,0 +1,16 @@
+# make sure the local symbols used for
+# fp constants do not get a _ prefix
+# on apple arm hardware
+
+export function w $main() {
+@start
+	%r =d copy d_1.2
+	%x =w call $printf(l $fmt, ..., d %r)
+	ret 0
+}
+
+data $fmt = { b "%.06f\n", b 0 }
+
+# >>> output
+# 1.200000
+# <<<