libzahl

big integer library
git clone git://git.suckless.org/libzahl
Log | Files | Refs | README | LICENSE

commit 9437becf6d8aa4d9a3872b2cd6b353dc4c90a1cb
parent 683babd645e3a41b26326beea30879ee30920392
Author: Mattias Andrée <maandree@kth.se>
Date:   Thu,  5 May 2016 21:11:43 +0200

Optimisations

Signed-off-by: Mattias Andrée <maandree@kth.se>

Diffstat:
MSTATUS | 33+++++++++++++++++++++------------
Msrc/zadd.c | 132+++++++++++++++++++++++++++++++++----------------------------------------------
Mzahl/internals.h | 6+-----
3 files changed, 77 insertions(+), 94 deletions(-)

diff --git a/STATUS b/STATUS @@ -1,3 +1,21 @@ +The following functions are probably implemented optimally: + +zswap ................... always fastest +zzero ................... always fastest (shared with gmp) +zsignum ................. always fastest (shared with gmp) +zeven ................... always fastest +zodd .................... always fastest +zeven_nonzero ........... always fastest +zodd_nonzero ............ always fastest (shared with gmp) +zbtest .................. always fastest + + +The following functions are probably implemented close to +optimally, further optimisation should not be a priority: + +zadd_unsigned ........... fastest after ~70 compared against zadd too (x86-64) + + Optimisation progress for libzahl, compared to other big integer libraries. These comparisons are for 152-bit integers. Functions in parenthesis the right column are functions that needs @@ -10,26 +28,18 @@ zset .................... fastest [always] zseti ................... tomsfastmath is faster [always] zsetu ................... tomsfastmath is faster [always] zneg(a, b) .............. fastest [always] -zneg(a, a) .............. fastest [always] (shared with gmp) +zneg(a, a) .............. fastest [always] (shared with gmp; faster with clang) zabs(a, b) .............. fastest [always] zabs(a, a) .............. tomsfastmath is faster [always] -zadd_unsigned ........... fastest [always] -zsub_unsigned ........... fastest [always] -zadd .................... fastest [after ~100, tomsfastmath before] (shared with gmp) +zsub_unsigned ........... fastest [always] (compared against zsub too) +zadd .................... fastest [after ~110, tomsfastmath before] (x86-64) zsub .................... fastest [always] zand .................... 77 % of tomsfastmath [until ~900, alternating with gmp] zor ..................... 65 % of tomsfastmath [until ~1750, alternating with gmp (gcc) and tomsfastmath (clang)] zxor .................... 87 % of tomsfastmath [until ~700, alternating with gmp (gcc+clangs),] znot .................... fastest [always] -zeven ................... fastest [always] -zodd .................... fastest [always] -zeven_nonzero ........... fastest [always] -zodd_nonzero ............ fastest [always] -zzero ................... fastest [always] (shared with gmp) -zsignum ................. fastest [always] (shared with gmp) zbits ................... fastest [always] zlsb .................... fastest [always] -zswap ................... fastest [always] zlsh .................... fastest [until ~1000, then gmp] zrsh .................... fastest [almost never] ztrunc(a, b, c) ......... fastest [always; alternating with gmp between 1400~3000 (clang)] @@ -46,7 +56,6 @@ zbset(a, b, 0) .......... fastest [always] zbset(a, a, 0) .......... fastest [always] zbset(a, b, -1) ......... fastest [always] zbset(a, a, -1) ......... fastest [always] -zbtest .................. fastest [always] zgcd .................... 21 % of gmp (zcmpmag) zmul .................... slowest zsqr .................... slowest (zmul) diff --git a/src/zadd.c b/src/zadd.c @@ -4,116 +4,94 @@ #if defined(__x86_64__) # define ASM3(code) \ - __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i), "c"(cc + i)) + __asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc), [c]"+r"(cc)) # define ASM2(code) \ - __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i)) + __asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc)) -# define ADD2(off) \ - "\n movq "#off"(%%rbx), %%rdx" \ - "\n adcq %%rdx, "#off"(%%rax)" +# define ADD2(off) \ + "\n movq "#off"(%[b]), %[x]" \ + "\n adcq %[x], "#off"(%[a])" -# define ADD3(off) \ - "\n movq "#off"(%%rbx), %%rdx" \ - "\n adcq "#off"(%%rcx), %%rdx" \ - "\n movq %%rdx, "#off"(%%rax)" +# define ADD3(off) \ + "\n movq "#off"(%[b]), %[x]" \ + "\n adcq "#off"(%[c]), %[x]" \ + "\n movq %[x], "#off"(%[a])" # define WRAP_CARRY(interior) \ - "\n clc" \ - "\n cmpq $0, %%rdx" \ - "\n je 1f" \ - "\n stc" \ - "\n 1:" \ + "\n addq $-1, %[x]" \ interior \ - "\n movq $1, %%rdx" \ + "\n movq $1, %[x]" \ "\n jc 1f" \ - "\n movq $0, %%rdx" \ + "\n movq $0, %[x]" \ "\n 1:" + +# define ASM_ADD(N) \ + do { \ + register zahl_char_t carry = 0; \ + size_t i; \ + for (i = 0; (INC(4)), (i += 4) <= n;) \ + ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) ADD##N(-16) ADD##N(-8))); \ + switch (n & 3) { \ + case 3: \ + ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) ADD##N(-16))); \ + break; \ + case 2: \ + ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24))); \ + break; \ + case 1: \ + ASM##N(WRAP_CARRY(ADD##N(-32))); \ + break; \ + default: \ + break; \ + } \ + i = n; \ + while (carry) { \ + carry = libzahl_add_overflow(a->chars + i, a->chars[i], 1); \ + i++; \ + } \ + if (a->used < i) \ + a->used = i; \ + } while (0) #endif static inline void zadd_impl_4(z_t a, z_t b, z_t c, size_t n) { - zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars, *cc = c->chars; - size_t i; - -#if defined(__x86_64__) - for (i = 0; (i += 4) <= n;) - ASM3(WRAP_CARRY(ADD3(-32) ADD3(-24) ADD3(-16) ADD3(-8))); - if (i > n) { - i -= 4; - switch (n & 3) { - case 3: - ASM3(WRAP_CARRY(ADD3(0) ADD3(8) ADD3(16))); - break; - case 2: - ASM3(WRAP_CARRY(ADD3(0) ADD3(8))); - break; - case 1: - ASM3(WRAP_CARRY(ADD3(0))); - break; - default: - break; - } - } - i = n; - - while (carry) { - carry = libzahl_add_overflow(ac + i, ac[i], 1); - i++; - } +#ifdef ASM_ADD + register zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars; +# define INC(P) (ac += (P), bc += (P), cc += (P)) + ASM_ADD(3); +# undef INC #else - zahl_char_t tcarry; + zahl_char_t carry = 0, tcarry; + zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars; + size_t i; for (i = 0; i < n; i++) { tcarry = libzahl_add_overflow(ac + i, bc[i], cc[i]); carry = tcarry | (zahl_char_t)libzahl_add_overflow(ac + i, ac[i], carry); } + while (carry) { carry = libzahl_add_overflow(ac + i, ac[i], 1); i++; } -#endif if (a->used < i) a->used = i; +#endif } static inline void zadd_impl_3(z_t a, z_t b, size_t n) { -#if defined(__x86_64__) - zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars; - size_t i; - - for (i = 0; (i += 4) <= n;) - ASM2(WRAP_CARRY(ADD2(-32) ADD2(-24) ADD2(-16) ADD2(-8))); - if (i > n) { - i -= 4; - switch (n & 3) { - case 3: - ASM2(WRAP_CARRY(ADD2(0) ADD2(8) ADD2(16))); - break; - case 2: - ASM2(WRAP_CARRY(ADD2(0) ADD2(8))); - break; - case 1: - ASM2(WRAP_CARRY(ADD2(0))); - break; - default: - break; - } - } - i = n; - - while (carry) { - carry = libzahl_add_overflow(ac + i, ac[i], 1); - i++; - } - - if (a->used < i) - a->used = i; +#ifdef ASM_ADD + register zahl_char_t *ac = a->chars, *bc = b->chars; +# define INC(P) (ac += (P), bc += (P)) + ASM_ADD(2); +# undef INC #else zadd_impl_4(a, a, b, n); #endif diff --git a/zahl/internals.h b/zahl/internals.h @@ -2,11 +2,7 @@ #ifndef ZAHL_INLINE # if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L -# if defined(__GNUC__) || defined(__clang__) -# define ZAHL_INLINE __attribute__((__always_inline__, __gnu_inline__)) static inline -# else -# define ZAHL_INLINE static inline -# endif +# define ZAHL_INLINE static inline # else # define ZAHL_INLINE static # endif