commit 9437becf6d8aa4d9a3872b2cd6b353dc4c90a1cb
parent 683babd645e3a41b26326beea30879ee30920392
Author: Mattias Andrée <maandree@kth.se>
Date: Thu, 5 May 2016 21:11:43 +0200
Optimisations
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat:
M | STATUS | | | 33 | +++++++++++++++++++++------------ |
M | src/zadd.c | | | 132 | +++++++++++++++++++++++++++++++++---------------------------------------------- |
M | zahl/internals.h | | | 6 | +----- |
3 files changed, 77 insertions(+), 94 deletions(-)
diff --git a/STATUS b/STATUS
@@ -1,3 +1,21 @@
+The following functions are probably implemented optimally:
+
+zswap ................... always fastest
+zzero ................... always fastest (shared with gmp)
+zsignum ................. always fastest (shared with gmp)
+zeven ................... always fastest
+zodd .................... always fastest
+zeven_nonzero ........... always fastest
+zodd_nonzero ............ always fastest (shared with gmp)
+zbtest .................. always fastest
+
+
+The following functions are probably implemented close to
+optimally, further optimisation should not be a priority:
+
+zadd_unsigned ........... fastest after ~70 compared against zadd too (x86-64)
+
+
Optimisation progress for libzahl, compared to other big integer
libraries. These comparisons are for 152-bit integers. Functions
in parenthesis the right column are functions that needs
@@ -10,26 +28,18 @@ zset .................... fastest [always]
zseti ................... tomsfastmath is faster [always]
zsetu ................... tomsfastmath is faster [always]
zneg(a, b) .............. fastest [always]
-zneg(a, a) .............. fastest [always] (shared with gmp)
+zneg(a, a) .............. fastest [always] (shared with gmp; faster with clang)
zabs(a, b) .............. fastest [always]
zabs(a, a) .............. tomsfastmath is faster [always]
-zadd_unsigned ........... fastest [always]
-zsub_unsigned ........... fastest [always]
-zadd .................... fastest [after ~100, tomsfastmath before] (shared with gmp)
+zsub_unsigned ........... fastest [always] (compared against zsub too)
+zadd .................... fastest [after ~110, tomsfastmath before] (x86-64)
zsub .................... fastest [always]
zand .................... 77 % of tomsfastmath [until ~900, alternating with gmp]
zor ..................... 65 % of tomsfastmath [until ~1750, alternating with gmp (gcc) and tomsfastmath (clang)]
zxor .................... 87 % of tomsfastmath [until ~700, alternating with gmp (gcc+clangs),]
znot .................... fastest [always]
-zeven ................... fastest [always]
-zodd .................... fastest [always]
-zeven_nonzero ........... fastest [always]
-zodd_nonzero ............ fastest [always]
-zzero ................... fastest [always] (shared with gmp)
-zsignum ................. fastest [always] (shared with gmp)
zbits ................... fastest [always]
zlsb .................... fastest [always]
-zswap ................... fastest [always]
zlsh .................... fastest [until ~1000, then gmp]
zrsh .................... fastest [almost never]
ztrunc(a, b, c) ......... fastest [always; alternating with gmp between 1400~3000 (clang)]
@@ -46,7 +56,6 @@ zbset(a, b, 0) .......... fastest [always]
zbset(a, a, 0) .......... fastest [always]
zbset(a, b, -1) ......... fastest [always]
zbset(a, a, -1) ......... fastest [always]
-zbtest .................. fastest [always]
zgcd .................... 21 % of gmp (zcmpmag)
zmul .................... slowest
zsqr .................... slowest (zmul)
diff --git a/src/zadd.c b/src/zadd.c
@@ -4,116 +4,94 @@
#if defined(__x86_64__)
# define ASM3(code) \
- __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i), "c"(cc + i))
+ __asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc), [c]"+r"(cc))
# define ASM2(code) \
- __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i))
+ __asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc))
-# define ADD2(off) \
- "\n movq "#off"(%%rbx), %%rdx" \
- "\n adcq %%rdx, "#off"(%%rax)"
+# define ADD2(off) \
+ "\n movq "#off"(%[b]), %[x]" \
+ "\n adcq %[x], "#off"(%[a])"
-# define ADD3(off) \
- "\n movq "#off"(%%rbx), %%rdx" \
- "\n adcq "#off"(%%rcx), %%rdx" \
- "\n movq %%rdx, "#off"(%%rax)"
+# define ADD3(off) \
+ "\n movq "#off"(%[b]), %[x]" \
+ "\n adcq "#off"(%[c]), %[x]" \
+ "\n movq %[x], "#off"(%[a])"
# define WRAP_CARRY(interior) \
- "\n clc" \
- "\n cmpq $0, %%rdx" \
- "\n je 1f" \
- "\n stc" \
- "\n 1:" \
+ "\n addq $-1, %[x]" \
interior \
- "\n movq $1, %%rdx" \
+ "\n movq $1, %[x]" \
"\n jc 1f" \
- "\n movq $0, %%rdx" \
+ "\n movq $0, %[x]" \
"\n 1:"
+
+# define ASM_ADD(N) \
+ do { \
+ register zahl_char_t carry = 0; \
+ size_t i; \
+ for (i = 0; (INC(4)), (i += 4) <= n;) \
+ ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) ADD##N(-16) ADD##N(-8))); \
+ switch (n & 3) { \
+ case 3: \
+ ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) ADD##N(-16))); \
+ break; \
+ case 2: \
+ ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24))); \
+ break; \
+ case 1: \
+ ASM##N(WRAP_CARRY(ADD##N(-32))); \
+ break; \
+ default: \
+ break; \
+ } \
+ i = n; \
+ while (carry) { \
+ carry = libzahl_add_overflow(a->chars + i, a->chars[i], 1); \
+ i++; \
+ } \
+ if (a->used < i) \
+ a->used = i; \
+ } while (0)
#endif
static inline void
zadd_impl_4(z_t a, z_t b, z_t c, size_t n)
{
- zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars, *cc = c->chars;
- size_t i;
-
-#if defined(__x86_64__)
- for (i = 0; (i += 4) <= n;)
- ASM3(WRAP_CARRY(ADD3(-32) ADD3(-24) ADD3(-16) ADD3(-8)));
- if (i > n) {
- i -= 4;
- switch (n & 3) {
- case 3:
- ASM3(WRAP_CARRY(ADD3(0) ADD3(8) ADD3(16)));
- break;
- case 2:
- ASM3(WRAP_CARRY(ADD3(0) ADD3(8)));
- break;
- case 1:
- ASM3(WRAP_CARRY(ADD3(0)));
- break;
- default:
- break;
- }
- }
- i = n;
-
- while (carry) {
- carry = libzahl_add_overflow(ac + i, ac[i], 1);
- i++;
- }
+#ifdef ASM_ADD
+ register zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars;
+# define INC(P) (ac += (P), bc += (P), cc += (P))
+ ASM_ADD(3);
+# undef INC
#else
- zahl_char_t tcarry;
+ zahl_char_t carry = 0, tcarry;
+ zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars;
+ size_t i;
for (i = 0; i < n; i++) {
tcarry = libzahl_add_overflow(ac + i, bc[i], cc[i]);
carry = tcarry | (zahl_char_t)libzahl_add_overflow(ac + i, ac[i], carry);
}
+
while (carry) {
carry = libzahl_add_overflow(ac + i, ac[i], 1);
i++;
}
-#endif
if (a->used < i)
a->used = i;
+#endif
}
static inline void
zadd_impl_3(z_t a, z_t b, size_t n)
{
-#if defined(__x86_64__)
- zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars;
- size_t i;
-
- for (i = 0; (i += 4) <= n;)
- ASM2(WRAP_CARRY(ADD2(-32) ADD2(-24) ADD2(-16) ADD2(-8)));
- if (i > n) {
- i -= 4;
- switch (n & 3) {
- case 3:
- ASM2(WRAP_CARRY(ADD2(0) ADD2(8) ADD2(16)));
- break;
- case 2:
- ASM2(WRAP_CARRY(ADD2(0) ADD2(8)));
- break;
- case 1:
- ASM2(WRAP_CARRY(ADD2(0)));
- break;
- default:
- break;
- }
- }
- i = n;
-
- while (carry) {
- carry = libzahl_add_overflow(ac + i, ac[i], 1);
- i++;
- }
-
- if (a->used < i)
- a->used = i;
+#ifdef ASM_ADD
+ register zahl_char_t *ac = a->chars, *bc = b->chars;
+# define INC(P) (ac += (P), bc += (P))
+ ASM_ADD(2);
+# undef INC
#else
zadd_impl_4(a, a, b, n);
#endif
diff --git a/zahl/internals.h b/zahl/internals.h
@@ -2,11 +2,7 @@
#ifndef ZAHL_INLINE
# if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
-# if defined(__GNUC__) || defined(__clang__)
-# define ZAHL_INLINE __attribute__((__always_inline__, __gnu_inline__)) static inline
-# else
-# define ZAHL_INLINE static inline
-# endif
+# define ZAHL_INLINE static inline
# else
# define ZAHL_INLINE static
# endif