Optimisations - libzahl - big integer library

commit 9437becf6d8aa4d9a3872b2cd6b353dc4c90a1cb
parent 683babd645e3a41b26326beea30879ee30920392
Author: Mattias Andrée <maandree@kth.se>
Date:   Thu,  5 May 2016 21:11:43 +0200

Optimisations

Signed-off-by: Mattias Andrée <maandree@kth.se>

Diffstat:
M STATUS  | 33 +++++++++++++++++++++------------
M src/zadd.c  | 132 +++++++++++++++++++++++++++++++++----------------------------------------------
M zahl/internals.h  | 6 +-----

3 files changed, 77 insertions(+), 94 deletions(-)
diff --git a/STATUS b/STATUS
@@ -1,3 +1,21 @@
+The following functions are probably implemented optimally:
+
+zswap ................... always fastest
+zzero ................... always fastest (shared with gmp)
+zsignum ................. always fastest (shared with gmp)
+zeven ................... always fastest
+zodd .................... always fastest
+zeven_nonzero ........... always fastest
+zodd_nonzero ............ always fastest (shared with gmp)
+zbtest .................. always fastest
+
+
+The following functions are probably implemented close to
+optimally, further optimisation should not be a priority:
+
+zadd_unsigned ........... fastest after ~70 compared against zadd too (x86-64)
+
+
 Optimisation progress for libzahl, compared to other big integer
 libraries. These comparisons are for 152-bit integers. Functions
 in parenthesis the right column are functions that needs
@@ -10,26 +28,18 @@ zset .................... fastest [always]
 zseti ................... tomsfastmath is faster [always]
 zsetu ................... tomsfastmath is faster [always]
 zneg(a, b) .............. fastest [always]
-zneg(a, a) .............. fastest [always] (shared with gmp)
+zneg(a, a) .............. fastest [always] (shared with gmp; faster with clang)
 zabs(a, b) .............. fastest [always]
 zabs(a, a) .............. tomsfastmath is faster [always]
-zadd_unsigned ........... fastest [always]
-zsub_unsigned ........... fastest [always]
-zadd .................... fastest [after ~100, tomsfastmath before] (shared with gmp)
+zsub_unsigned ........... fastest [always] (compared against zsub too)
+zadd .................... fastest [after ~110, tomsfastmath before] (x86-64)
 zsub .................... fastest [always]
 zand .................... 77 % of tomsfastmath [until ~900, alternating with gmp]
 zor ..................... 65 % of tomsfastmath [until ~1750, alternating with gmp (gcc) and tomsfastmath (clang)]
 zxor .................... 87 % of tomsfastmath [until ~700, alternating with gmp (gcc+clangs),]
 znot .................... fastest [always]
-zeven ................... fastest [always]
-zodd .................... fastest [always]
-zeven_nonzero ........... fastest [always]
-zodd_nonzero ............ fastest [always]
-zzero ................... fastest [always] (shared with gmp)
-zsignum ................. fastest [always] (shared with gmp)
 zbits ................... fastest [always]
 zlsb .................... fastest [always]
-zswap ................... fastest [always]
 zlsh .................... fastest [until ~1000, then gmp]
 zrsh .................... fastest [almost never]
 ztrunc(a, b, c) ......... fastest [always; alternating with gmp between 1400~3000 (clang)]
@@ -46,7 +56,6 @@ zbset(a, b, 0) .......... fastest [always]
 zbset(a, a, 0) .......... fastest [always]
 zbset(a, b, -1) ......... fastest [always]
 zbset(a, a, -1) ......... fastest [always]
-zbtest .................. fastest [always]
 zgcd .................... 21 % of gmp (zcmpmag)
 zmul .................... slowest
 zsqr .................... slowest (zmul)
diff --git a/src/zadd.c b/src/zadd.c
@@ -4,116 +4,94 @@
 
 #if defined(__x86_64__)
 # define ASM3(code)  \
-	__asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i), "c"(cc + i))
+	__asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc), [c]"+r"(cc))
 
 # define ASM2(code)  \
-	__asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i))
+	__asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc))
 
-# define ADD2(off)                         \
-	"\n    movq "#off"(%%rbx), %%rdx"  \
-	"\n    adcq %%rdx, "#off"(%%rax)"
+# define ADD2(off)                       \
+	"\n    movq "#off"(%[b]), %[x]"  \
+	"\n    adcq %[x], "#off"(%[a])"
 
-# define ADD3(off)                         \
-	"\n    movq "#off"(%%rbx), %%rdx"  \
-	"\n    adcq "#off"(%%rcx), %%rdx"  \
-	"\n    movq %%rdx, "#off"(%%rax)"
+# define ADD3(off)                       \
+	"\n    movq "#off"(%[b]), %[x]"  \
+	"\n    adcq "#off"(%[c]), %[x]"  \
+	"\n    movq %[x], "#off"(%[a])"
 
 # define WRAP_CARRY(interior)   \
-	"\n    clc"             \
-	"\n    cmpq $0, %%rdx"  \
-	"\n    je 1f"           \
-	"\n    stc"             \
-	"\n 1:"                 \
+	"\n    addq $-1, %[x]"  \
 	interior                \
-	"\n    movq $1, %%rdx"  \
+	"\n    movq $1, %[x]"   \
 	"\n    jc 1f"           \
-	"\n    movq $0, %%rdx"  \
+	"\n    movq $0, %[x]"   \
 	"\n 1:"
+
+# define ASM_ADD(N)                                                                          \
+	do {                                                                                 \
+		register zahl_char_t carry = 0;                                              \
+		size_t i;                                                                    \
+		for (i = 0; (INC(4)), (i += 4) <= n;)                                        \
+			ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) ADD##N(-16) ADD##N(-8)));  \
+		switch (n & 3) {                                                             \
+		case 3:                                                                      \
+			ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) ADD##N(-16)));             \
+			break;                                                               \
+		case 2:                                                                      \
+			ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24)));                         \
+			break;                                                               \
+		case 1:                                                                      \
+			ASM##N(WRAP_CARRY(ADD##N(-32)));                                     \
+			break;                                                               \
+		default:                                                                     \
+			break;                                                               \
+		}                                                                            \
+		i = n;                                                                       \
+		while (carry) {                                                              \
+			carry = libzahl_add_overflow(a->chars + i, a->chars[i], 1);          \
+			i++;                                                                 \
+		}                                                                            \
+		if (a->used < i)                                                             \
+			a->used = i;                                                         \
+	} while (0)
 #endif
 
 
 static inline void
 zadd_impl_4(z_t a, z_t b, z_t c, size_t n)
 {
-	zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars, *cc = c->chars;
-	size_t i;
-
-#if defined(__x86_64__)
-	for (i = 0; (i += 4) <= n;)
-		ASM3(WRAP_CARRY(ADD3(-32) ADD3(-24) ADD3(-16) ADD3(-8)));
-	if (i > n) {
-		i -= 4;
-		switch (n & 3) {
-		case 3:
-			ASM3(WRAP_CARRY(ADD3(0) ADD3(8) ADD3(16)));
-			break;
-		case 2:
-			ASM3(WRAP_CARRY(ADD3(0) ADD3(8)));
-			break;
-		case 1:
-			ASM3(WRAP_CARRY(ADD3(0)));
-			break;
-		default:
-			break;
-		}
-	}
-	i = n;
-
-	while (carry) {
-		carry = libzahl_add_overflow(ac + i, ac[i], 1);
-		i++;
-	}
+#ifdef ASM_ADD
+	register zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars;
+# define INC(P)  (ac += (P), bc += (P), cc += (P))
+	ASM_ADD(3);
+# undef INC
 #else
-	zahl_char_t tcarry;
+	zahl_char_t carry = 0, tcarry;
+	zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars;
+	size_t i;
 
 	for (i = 0; i < n; i++) {
 		tcarry = libzahl_add_overflow(ac + i, bc[i], cc[i]);
 		carry = tcarry | (zahl_char_t)libzahl_add_overflow(ac + i, ac[i], carry);
 	}
+
 	while (carry) {
 		carry = libzahl_add_overflow(ac + i, ac[i], 1);
 		i++;
 	}
-#endif
 
 	if (a->used < i)
 		a->used = i;
+#endif
 }
 
 static inline void
 zadd_impl_3(z_t a, z_t b, size_t n)
 {
-#if defined(__x86_64__)
-	zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars;
-	size_t i;
-
-	for (i = 0; (i += 4) <= n;)
-		ASM2(WRAP_CARRY(ADD2(-32) ADD2(-24) ADD2(-16) ADD2(-8)));
-	if (i > n) {
-		i -= 4;
-		switch (n & 3) {
-		case 3:
-			ASM2(WRAP_CARRY(ADD2(0) ADD2(8) ADD2(16)));
-			break;
-		case 2:
-			ASM2(WRAP_CARRY(ADD2(0) ADD2(8)));
-			break;
-		case 1:
-			ASM2(WRAP_CARRY(ADD2(0)));
-			break;
-		default:
-			break;
-		}
-	}
-	i = n;
-
-	while (carry) {
-		carry = libzahl_add_overflow(ac + i, ac[i], 1);
-		i++;
-	}
-
-	if (a->used < i)
-		a->used = i;
+#ifdef ASM_ADD
+	register zahl_char_t *ac = a->chars, *bc = b->chars;
+# define INC(P)  (ac += (P), bc += (P))
+	ASM_ADD(2);
+# undef INC
 #else
 	zadd_impl_4(a, a, b, n);
 #endif
diff --git a/zahl/internals.h b/zahl/internals.h
@@ -2,11 +2,7 @@
 
 #ifndef ZAHL_INLINE
 # if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
-#  if defined(__GNUC__) || defined(__clang__)
-#   define ZAHL_INLINE  __attribute__((__always_inline__, __gnu_inline__)) static inline
-#  else
-#   define ZAHL_INLINE  static inline
-#  endif
+#  define ZAHL_INLINE  static inline
 # else
 #  define ZAHL_INLINE  static
 # endif

	libzahl big integer library
	git clone git://git.suckless.org/libzahl
	Log \| Files \| Refs \| README \| LICENSE

M	STATUS	\|	33	+++++++++++++++++++++------------
M	src/zadd.c	\|	132	+++++++++++++++++++++++++++++++++----------------------------------------------
M	zahl/internals.h	\|	6	+-----