commit 93bf9e5b4bf63708c732f5bf07619d2e59c81ec4
parent b0732b78f06a4b790f71517227d9e957751ad4fa
Author: Mattias Andrée <maandree@kth.se>
Date: Thu, 5 May 2016 02:41:50 +0200
Optimise zadd on x86-64
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat:
M | src/zadd.c | | | 100 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- |
1 file changed, 96 insertions(+), 4 deletions(-)
diff --git a/src/zadd.c b/src/zadd.c
@@ -2,20 +2,79 @@
#include "internals.h"
+#if defined(__x86_64__)
+/*
+ * Inline-assembly helpers for the x86-64 fast path.
+ *
+ * ASM3 and ASM2 expand to one asm statement and rely on locals of the
+ * expanding function: `carry` (bound read-write to %rdx), `ac`, `bc`,
+ * (`cc` for ASM3) and `i`.  %rax, %rbx and %rcx receive ac + i, bc + i
+ * and cc + i respectively.
+ *
+ * The code reads and writes memory through those pointers without naming
+ * it as an operand, so "memory" is clobbered to stop the compiler from
+ * caching or reordering accesses to the arrays around the statement.
+ * "cc" documents that the flags are modified.
+ */
+# define ASM3(code) \
+ __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i), "c"(cc + i) : "memory", "cc")
+
+# define ASM2(code) \
+ __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i) : "memory", "cc")
+
+/* off(%rax) += off(%rbx) + CF, using %rdx as scratch (movq leaves CF intact). */
+# define ADD2(off) \
+ "\n movq "#off"(%%rbx), %%rdx" \
+ "\n adcq %%rdx, "#off"(%%rax)"
+
+/* off(%rax) = off(%rbx) + off(%rcx) + CF, using %rdx as scratch. */
+# define ADD3(off) \
+ "\n movq "#off"(%%rbx), %%rdx" \
+ "\n adcq "#off"(%%rcx), %%rdx" \
+ "\n movq %%rdx, "#off"(%%rax)"
+
+/*
+ * Set CF from %rdx (non-zero means carry) before `interior`, and afterwards
+ * store the resulting CF back into %rdx as 1 or 0.
+ */
+# define WRAP_CARRY(interior) \
+ "\n clc" \
+ "\n cmpq $0, %%rdx" \
+ "\n je 1f" \
+ "\n stc" \
+ "\n 1:" \
+ interior \
+ "\n movq $1, %%rdx" \
+ "\n jc 1f" \
+ "\n movq $0, %%rdx" \
+ "\n 1:"
+#endif
+
+
static inline void
zadd_impl_4(z_t a, z_t b, z_t c, size_t n)
{
- zahl_char_t carry = 0, tcarry;
+ zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars, *cc = c->chars;
size_t i;
+#if defined(__x86_64__)
+ for (i = 0; (i += 4) <= n;)
+ ASM3(WRAP_CARRY(ADD3(-32) ADD3(-24) ADD3(-16) ADD3(-8)));
+ if (i > n) {
+ i -= 4;
+ switch (n & 3) {
+ case 3:
+ ASM3(WRAP_CARRY(ADD3(0) ADD3(8) ADD3(16)));
+ break;
+ case 2:
+ ASM3(WRAP_CARRY(ADD3(0) ADD3(8)));
+ break;
+ case 1:
+ ASM3(WRAP_CARRY(ADD3(0)));
+ break;
+ default:
+ break;
+ }
+ }
+ i = n;
+
+ while (carry) {
+ carry = libzahl_add_overflow(ac + i, ac[i], 1);
+ i++;
+ }
+#else
+ zahl_char_t tcarry;
+
for (i = 0; i < n; i++) {
- tcarry = libzahl_add_overflow(a->chars + i, b->chars[i], c->chars[i]);
- carry = tcarry | (zahl_char_t)libzahl_add_overflow(a->chars + i, a->chars[i], carry);
+ tcarry = libzahl_add_overflow(ac + i, bc[i], cc[i]);
+ carry = tcarry | (zahl_char_t)libzahl_add_overflow(ac + i, ac[i], carry);
}
while (carry) {
- carry = libzahl_add_overflow(a->chars + i, a->chars[i], 1);
+ carry = libzahl_add_overflow(ac + i, ac[i], 1);
i++;
}
+#endif
if (a->used < i)
a->used = i;
@@ -24,7 +83,40 @@ zadd_impl_4(z_t a, z_t b, z_t c, size_t n)
static inline void
zadd_impl_3(z_t a, z_t b, size_t n) /* In-place add: the first n elements of b->chars are added into a->chars, then any carry is rippled upward. */
{
+#if defined(__x86_64__)
+ zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars; /* carry, ac, bc and i are referenced by name inside ASM2. */
+ size_t i;
+
+ for (i = 0; (i += 4) <= n;) /* Four elements per pass; i is already past the group, hence the negative offsets. */
+ ASM2(WRAP_CARRY(ADD2(-32) ADD2(-24) ADD2(-16) ADD2(-8)));
+ if (i > n) { /* Always true here: the loop above only exits once i exceeds n. */
+ i -= 4; /* Step back to the first element not handled by the loop. */
+ switch (n & 3) { /* Number of leftover elements, 0 to 3. */
+ case 3:
+ ASM2(WRAP_CARRY(ADD2(0) ADD2(8) ADD2(16)));
+ break;
+ case 2:
+ ASM2(WRAP_CARRY(ADD2(0) ADD2(8)));
+ break;
+ case 1:
+ ASM2(WRAP_CARRY(ADD2(0)));
+ break;
+ default: /* n is a multiple of four: nothing left to add. */
+ break;
+ }
+ }
+ i = n; /* Continue at the first element above the ones just added. */
+
+ while (carry) { /* Propagate the outstanding carry; NOTE(review): assumes a->chars has room above index n - confirm against callers. */
+ carry = libzahl_add_overflow(ac + i, ac[i], 1);
+ i++;
+ }
+
+ if (a->used < i) /* Raise a->used to i when the addition reached past it. */
+ a->used = i;
+#else
 zadd_impl_4(a, a, b, n); /* Generic fallback: three-operand add with a as both destination and first source. */
+#endif
}
static inline void