0001-amd64-optimize-loading-0-into-registers.patch (2701B)
1 From 55b93f727cbad62a13dce0136077b0ffb47b90d7 Mon Sep 17 00:00:00 2001 2 From: =?UTF-8?q?=C3=89rico=20Nogueira?= <erico.erc@gmail.com> 3 Date: Sun, 11 Jul 2021 19:19:12 -0300 4 Subject: [PATCH] amd64: optimize loading 0 into registers 5 6 Loading +0 into a floating point register can be done using pxor or 7 xorps instructions. Per [1], we went with pxor because it can run on all 8 vector ALU ports, even if it's one byte longer. 9 10 Similarly, an integer register can be zeroed with xor, which has a 11 smaller encoding than mov with 0 immediate. 12 13 To implement this, we special case fixarg to allow Ocopy when the 14 value is +0 for floating point, and change emitins to emit pxor/xor 15 when it encounters a copy from 0. 16 17 Co-authored-by: Michael Forney <mforney@mforney.org> 18 19 [1] https://stackoverflow.com/questions/39811577/does-using-mix-of-pxor-and-xorps-affect-performance/39828976 20 --- 21 amd64/emit.c | 12 ++++++++++++ 22 amd64/isel.c | 12 +++++++----- 23 2 files changed, 19 insertions(+), 5 deletions(-) 24 25 diff --git a/amd64/emit.c b/amd64/emit.c 26 index 51d1a5c..a3e72e6 100644 27 --- a/amd64/emit.c 28 +++ b/amd64/emit.c 29 @@ -458,6 +458,18 @@ emitins(Ins i, Fn *fn, FILE *f) 30 if (req(i.to, i.arg[0])) 31 break; 32 t0 = rtype(i.arg[0]); 33 + if (t0 == RCon 34 + && fn->con[i.arg[0].val].type == CBits 35 + && fn->con[i.arg[0].val].bits.i == 0) { 36 + if (isreg(i.to)) { 37 + if (KBASE(i.cls) == 0) 38 + emitf("xor%k %=, %=", &i, fn, f); 39 + else 40 + emitf("pxor %D=, %D=", &i, fn, f); 41 + break; 42 + } 43 + i.cls = KWIDE(i.cls) ? Kl : Kw; 44 + } 45 if (i.cls == Kl 46 && t0 == RCon 47 && fn->con[i.arg[0].val].type == CBits) { 48 diff --git a/amd64/isel.c b/amd64/isel.c 49 index e29c8bf..4bec2e1 100644 50 --- a/amd64/isel.c 51 +++ b/amd64/isel.c 52 @@ -85,7 +85,7 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn) 53 r1 = r0 = *r; 54 s = rslot(r0, fn); 55 op = i ? i->op : Ocopy; 56 - if (KBASE(k) == 1 && rtype(r0) == RCon) { 57 + if (KBASE(k) == 1 && rtype(r0) == RCon && fn->con[r0.val].bits.i != 0) { 58 /* load floating points from memory 59 * slots, they can't be used as 60 * immediates 61 @@ -99,13 +99,15 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn) 62 a.offset.sym.id = intern(buf); 63 fn->mem[fn->nmem-1] = a; 64 } 65 - else if (op != Ocopy && k == Kl && noimm(r0, fn)) { 66 + else if (op != Ocopy && ((k == Kl && noimm(r0, fn)) || (KBASE(k) == 1 && rtype(r0) == RCon))) { 67 /* load constants that do not fit in 68 * a 32bit signed integer into a 69 - * long temporary 70 + * long temporary OR 71 + * load positive zero into a floating 72 + * point register 73 */ 74 - r1 = newtmp("isel", Kl, fn); 75 - emit(Ocopy, Kl, r1, r0, R); 76 + r1 = newtmp("isel", k, fn); 77 + emit(Ocopy, k, r1, r0, R); 78 } 79 else if (s != -1) { 80 /* load fast locals' addresses into 81 -- 82 2.42.0 83