opkg

statically linked package installer
git clone anongit@rnpnr.xyz:opkg.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

0001-amd64-optimize-loading-0-into-registers.patch (2701B)


      1 From 55b93f727cbad62a13dce0136077b0ffb47b90d7 Mon Sep 17 00:00:00 2001
      2 From: =?UTF-8?q?=C3=89rico=20Nogueira?= <erico.erc@gmail.com>
      3 Date: Sun, 11 Jul 2021 19:19:12 -0300
      4 Subject: [PATCH] amd64: optimize loading 0 into registers
      5 
      6 Loading +0 into a floating point register can be done using pxor or
      7 xorps instructions. Per [1], we went with pxor because it can run on all
      8 vector ALU ports, even though its encoding is one byte longer than xorps.
      9 
     10 Similarly, an integer register can be zeroed with xor, which has a
     11 smaller encoding than mov with a 0 immediate.
     12 
     13 To implement this, we special-case fixarg to allow Ocopy when the
     14 value is +0 for floating point, and change emitins to emit pxor/xor
     15 when it encounters a copy from 0.
     16 
     17 Co-authored-by: Michael Forney <mforney@mforney.org>
     18 
     19 [1] https://stackoverflow.com/questions/39811577/does-using-mix-of-pxor-and-xorps-affect-performance/39828976
     20 ---
     21  amd64/emit.c | 12 ++++++++++++
     22  amd64/isel.c | 12 +++++++-----
     23  2 files changed, 19 insertions(+), 5 deletions(-)
     24 
     25 diff --git a/amd64/emit.c b/amd64/emit.c
     26 index 51d1a5c..a3e72e6 100644
     27 --- a/amd64/emit.c
     28 +++ b/amd64/emit.c
     29 @@ -458,6 +458,18 @@ emitins(Ins i, Fn *fn, FILE *f)
     30  		if (req(i.to, i.arg[0]))
     31  			break;
     32  		t0 = rtype(i.arg[0]);
     33 +		if (t0 == RCon
     34 +		&& fn->con[i.arg[0].val].type == CBits
     35 +		&& fn->con[i.arg[0].val].bits.i == 0) {
     36 +			if (isreg(i.to)) {
     37 +				if (KBASE(i.cls) == 0)
     38 +					emitf("xor%k %=, %=", &i, fn, f);
     39 +				else
     40 +					emitf("pxor %D=, %D=", &i, fn, f);
     41 +				break;
     42 +			}
     43 +			i.cls = KWIDE(i.cls) ? Kl : Kw;
     44 +		}
     45  		if (i.cls == Kl
     46  		&& t0 == RCon
     47  		&& fn->con[i.arg[0].val].type == CBits) {
     48 diff --git a/amd64/isel.c b/amd64/isel.c
     49 index e29c8bf..4bec2e1 100644
     50 --- a/amd64/isel.c
     51 +++ b/amd64/isel.c
     52 @@ -85,7 +85,7 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
     53  	r1 = r0 = *r;
     54  	s = rslot(r0, fn);
     55  	op = i ? i->op : Ocopy;
     56 -	if (KBASE(k) == 1 && rtype(r0) == RCon) {
     57 +	if (KBASE(k) == 1 && rtype(r0) == RCon && fn->con[r0.val].bits.i != 0) {
     58  		/* load floating points from memory
     59  		 * slots, they can't be used as
     60  		 * immediates
     61 @@ -99,13 +99,15 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
     62  		a.offset.sym.id = intern(buf);
     63  		fn->mem[fn->nmem-1] = a;
     64  	}
     65 -	else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
     66 +	else if (op != Ocopy && ((k == Kl && noimm(r0, fn)) || (KBASE(k) == 1 && rtype(r0) == RCon))) {
     67  		/* load constants that do not fit in
     68  		 * a 32bit signed integer into a
     69 -		 * long temporary
     70 +		 * long temporary OR
     71 +		 * load positive zero into a floating
     72 +		 * point register
     73  		 */
     74 -		r1 = newtmp("isel", Kl, fn);
     75 -		emit(Ocopy, Kl, r1, r0, R);
     76 +		r1 = newtmp("isel", k, fn);
     77 +		emit(Ocopy, k, r1, r0, R);
     78  	}
     79  	else if (s != -1) {
     80  		/* load fast locals' addresses into
     81 -- 
     82 2.42.0
     83