libzahl

big integer library
git clone git://git.suckless.org/libzahl
Log | Files | Refs | README | LICENSE

commit 93bf9e5b4bf63708c732f5bf07619d2e59c81ec4
parent b0732b78f06a4b790f71517227d9e957751ad4fa
Author: Mattias Andrée <maandree@kth.se>
Date:   Thu,  5 May 2016 02:41:50 +0200

Optimise zadd on x86-64

Signed-off-by: Mattias Andrée <maandree@kth.se>

Diffstat:
M src/zadd.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 96 insertions(+), 4 deletions(-)

diff --git a/src/zadd.c b/src/zadd.c @@ -2,20 +2,79 @@ #include "internals.h" +#if defined(__x86_64__) +# define ASM3(code) \ + __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i), "c"(cc + i)) + +# define ASM2(code) \ + __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i)) + +# define ADD2(off) \ + "\n movq "#off"(%%rbx), %%rdx" \ + "\n adcq %%rdx, "#off"(%%rax)" + +# define ADD3(off) \ + "\n movq "#off"(%%rbx), %%rdx" \ + "\n adcq "#off"(%%rcx), %%rdx" \ + "\n movq %%rdx, "#off"(%%rax)" + +# define WRAP_CARRY(interior) \ + "\n clc" \ + "\n cmpq $0, %%rdx" \ + "\n je 1f" \ + "\n stc" \ + "\n 1:" \ + interior \ + "\n movq $1, %%rdx" \ + "\n jc 1f" \ + "\n movq $0, %%rdx" \ + "\n 1:" +#endif + + static inline void zadd_impl_4(z_t a, z_t b, z_t c, size_t n) { - zahl_char_t carry = 0, tcarry; + zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars, *cc = c->chars; size_t i; +#if defined(__x86_64__) + for (i = 0; (i += 4) <= n;) + ASM3(WRAP_CARRY(ADD3(-32) ADD3(-24) ADD3(-16) ADD3(-8))); + if (i > n) { + i -= 4; + switch (n & 3) { + case 3: + ASM3(WRAP_CARRY(ADD3(0) ADD3(8) ADD3(16))); + break; + case 2: + ASM3(WRAP_CARRY(ADD3(0) ADD3(8))); + break; + case 1: + ASM3(WRAP_CARRY(ADD3(0))); + break; + default: + break; + } + } + i = n; + + while (carry) { + carry = libzahl_add_overflow(ac + i, ac[i], 1); + i++; + } +#else + zahl_char_t tcarry; + for (i = 0; i < n; i++) { - tcarry = libzahl_add_overflow(a->chars + i, b->chars[i], c->chars[i]); - carry = tcarry | (zahl_char_t)libzahl_add_overflow(a->chars + i, a->chars[i], carry); + tcarry = libzahl_add_overflow(ac + i, bc[i], cc[i]); + carry = tcarry | (zahl_char_t)libzahl_add_overflow(ac + i, ac[i], carry); } while (carry) { - carry = libzahl_add_overflow(a->chars + i, a->chars[i], 1); + carry = libzahl_add_overflow(ac + i, ac[i], 1); i++; } +#endif if (a->used < i) a->used = i; @@ -24,7 +83,40 @@ zadd_impl_4(z_t a, z_t b, z_t c, size_t n) static inline void zadd_impl_3(z_t a, z_t b, size_t 
n) { +#if defined(__x86_64__) + zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars; + size_t i; + + for (i = 0; (i += 4) <= n;) + ASM2(WRAP_CARRY(ADD2(-32) ADD2(-24) ADD2(-16) ADD2(-8))); + if (i > n) { + i -= 4; + switch (n & 3) { + case 3: + ASM2(WRAP_CARRY(ADD2(0) ADD2(8) ADD2(16))); + break; + case 2: + ASM2(WRAP_CARRY(ADD2(0) ADD2(8))); + break; + case 1: + ASM2(WRAP_CARRY(ADD2(0))); + break; + default: + break; + } + } + i = n; + + while (carry) { + carry = libzahl_add_overflow(ac + i, ac[i], 1); + i++; + } + + if (a->used < i) + a->used = i; +#else zadd_impl_4(a, a, b, n); +#endif } static inline void