libzahl

big integer library
git clone git://git.suckless.org/libzahl

commit d6f4393542998276250bd3f3519bb824ca4b3d91
parent 247dffb6c9a24894f094392ce62f2aa2e81a8acc
Author: Mattias Andrée <maandree@kth.se>
Date:   Sat,  7 May 2016 17:22:42 +0200

Some small improvements

Signed-off-by: Mattias Andrée <maandree@kth.se>

Diffstat:
A INSTALL         |  36 ++++++++++++++++++++++++++++++++++++
M Makefile        |   5 +++--
M STATUS          |  69 +++++++++++++++++++++++++++++++++++++--------------------------------
M config.mk       |  12 +++---------
M src/internals.h |  16 +++++++++++-----
M src/zload.c     |   7 ++++---
M zahl.h          |   3 +++
M zahl/inlines.h  |   3 ++-
M zahl/memory.h   |   2 +-
9 files changed, 100 insertions(+), 53 deletions(-)

diff --git a/INSTALL b/INSTALL
@@ -0,0 +1,36 @@
+Configure libzahl
+=================
+
+libzahl is configured by editing config.mk. You may choose
+to make a copy of config.mk, and reference the copy with
+CONFIG when running make. For example:
+	cp config.mk my-config.mk
+	# edit my-config.mk
+	make CONFIG=my-config.mk
+
+Unless you are compiling for Linux you may have to add
+	-D'FAST_RANDOM_PATHNAME="<path to a non-blocking random number generator>"'
+(/dev/urandom on Linux) and
+	-D'SECURE_RANDOM_PATHNAME="<path to a blocking random number generator>"'
+(/dev/random on Linux) to CPPFLAGS.
+
+If you are using a C standard library where the higher bits have higher
+entropy than the lower bits in rand(3) (as historically was the case),
+remove -DGOOD_RAND from CPPFLAGS.
+
+If you don't care if your program crashes on failure, you can add
+-DZAHL_UNSAFE to CPPFLAGS. This will give you a marginal performance
+boost. If you are doing this, you should preferably also add
+	#define ZAHL_UNSAFE
+before including <zahl.h> in your program.
+
+If your CPU does not support indirect jumps (computed jumps) you should
+add -DZAHL_ISA_MISSING_INDIRECT_JUMP to CPPFLAGS, and preferably add
+	#define ZAHL_ISA_MISSING_INDIRECT_JUMP
+before including <zahl.h> in your program.
+
+libzahl contains some (very little) assembly code. In the event
+that the used instructions are not supported on your machine, please
+report it, and in the meantime add -DZAHL_NO_ASM to CPPFLAGS. You
+may also have to do this if you are compiling with a compiler that
+does not support extended inline assembly.
diff --git a/Makefile b/Makefile
@@ -1,4 +1,5 @@
-include config.mk
+CONFIG = config.mk
+include $(CONFIG)
 
 HDR_SEMIPUBLIC =\
 	zahl/inlines.h\
@@ -121,7 +122,7 @@ CFLAGS_WITHOUT_O = $$(printf '%s\n' $(CFLAGS) | sed '/^-O.*$$/d')
 
 all: libzahl.a $(DOC)
 
-.o: .c $(HDR) config.mk
+.o: .c $(HDR) $(CONFIG)
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $<
 
 libzahl.a: $(OBJ)
diff --git a/STATUS b/STATUS
@@ -18,55 +18,55 @@ processes are fixed to one CPU.
 
 The following functions are probably implemented optimally:
 
+zset .................... always fastest (gcc); until ~1200 (clang [can be fixed with assembly])
 zseti(a, +) ............. tomsfastmath is faster
 zseti(a, -) ............. tomsfastmath is faster
 zsetu ................... tomsfastmath is faster
 zswap ................... always fastest
 zzero ................... always fastest (shared with gmp)
 zsignum ................. always fastest (shared with gmp)
-zeven ................... always fastest
-zodd .................... always fastest
-zeven_nonzero ........... always fastest
+zeven ................... always fastest (shared with gmp)
+zodd .................... always fastest (shared with gmp)
+zeven_nonzero ........... always fastest (shared with gmp)
 zodd_nonzero ............ always fastest (shared with gmp)
 zbtest .................. always fastest
+zsave ................... always fastest [clang needs zset fix]
+zload ................... always fastest [clang needs zset fix]
 
 The following functions are probably implemented close to
 optimally; further optimisation should not be a priority:
 
-zadd_unsigned ........... fastest after ~70 compared against zadd too (x86-64)
+zadd_unsigned ........... fastest after ~140 (depends on cc and libc) compared against zadd too
 ztrunc(a, a, b) ......... fastest until ~100, then 77 % (gcc) or 68 % (clang) of tomsfastmath
-zbset(a, a, 1) .......... always fastest (93 % of gmp (clang))
-zbset(a, a, 0) .......... always fastest
-zbset(a, a, -1) ......... always fastest
+zbset(a, a, 1) .......... always fastest
+zbset(a, a, 0) .......... always fastest (faster with clang than gcc)
+zbset(a, a, -1) ......... always fastest (only marginally faster than gmp with clang)
 zlsb .................... always fastest <<suspicious>>
-zlsh .................... fastest until ~3400, then tomsfastmath, clang and musl are a bit slow
+zlsh .................... not too fast anymore
+zand .................... fastest after ~400, tomsfastmath before (gcc+glibc is slow)
+zor ..................... fastest after ~1150, tomsfastmath before (gcc+glibc is slow)
+zxor .................... fastest after ~400, tomsfastmath before (clang), gcc is slow
+znot .................... always fastest (faster with musl than glibc)
 
 The following functions are probably implemented optimally, but
 depend on other functions or call-cases for better performance:
 
-zneg(a, b) .............. always fastest
-zabs(a, b) .............. always fastest
-ztrunc(a, b, c) ......... always fastest (alternating with gmp between 1400~3000 (clang+glibc))
-zbset(a, b, 1) .......... always fastest
-zbset(a, b, 0) .......... always fastest
-zbset(a, b, -1) ......... always fastest
-zsplit .................. alternating with gmp for fastest
+zneg(a, b) .............. always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zabs(a, b) .............. always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+ztrunc(a, b, c) ......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zbset(a, b, 1) .......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zbset(a, b, 0) .......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zbset(a, b, -1) ......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zsplit .................. alternating with gmp for fastest (clang and glibc is slower)
 
 The following functions require structural changes for further
 optimisations:
 
-zset .................... always fastest
-zneg(a, a) .............. always fastest (shared with gmp; faster with clang)
-zabs(a, a) .............. tomsfastmath is faster (46 % of tomsfastmath with clang)
-zand .................... fastest until ~900, alternating with gmp
-zor ..................... fastest until ~1750, alternating with gmp (gcc) and tomsfastmath (clang)
-zxor .................... fastest until ~700, alternating with gmp (gcc+glibc)
-znot .................... always fastest
-zsave ................... always fastest
-zload ................... always fastest
+zneg(a, a) .............. always fastest (shared with gmp (gcc))
+zabs(a, a) .............. 34 % (clang) or 8 % (gcc) of tomsfastmath
 
 The following functions are probably implemented optimally
@@ -82,26 +82,31 @@ zcmpu ................... always fastest
 
 It may be possible to optimise the following functions further:
 
-zadd .................... fastest after ~110 (x86-64)
-zcmp .................... acceptable (glibc); almost always fastest (musl)
+zadd .................... fastest after ~90 (clang), ~260 (gcc+musl), or ~840 (gcc+glibc) (gcc+glibc is slow)
+zcmp .................... almost always fastest (musl), almost always slowest (glibc) <<suspicious (clang)>>
 zcmpmag ................. always fastest <<suspicious, see zcmp>>
 
 The following functions could be optimised further:
 
-zrsh .................... gmp is almost always faster
+zrsh .................... gmp is almost always faster; also tomsfastmath after ~3000 (gcc+glibc) or ~2800 (clang)
 zsub_unsigned ........... always fastest (compared against zsub too)
-zsub .................... always fastest
+zsub .................... always fastest (slower with gcc+glibc than gcc+musl or clang)
 
 The following functions could probably be optimised further, but
 their performance can be significantly improved by optimising
 their dependencies:
 
-zmul .................... fastest after ~4096
-zsqr .................... slowest, but asymptotically fastest
-zstr_length(a, 10) ...... gmp is faster
-zstr(a, b, n) ........... fastest after ~700
+zmul .................... slowest
+zsqr .................... slowest
+zstr_length(a, 10) ...... gmp is faster (clang is faster than gcc, musl is faster than glibc)
+zstr(a, b, n) ........... slowest
+
+
+musl has more stable performance than glibc. clang is better at
+inlining than gcc. (Which is better at optimising must be judged
+on a per-function basis.)
diff --git a/config.mk b/config.mk
@@ -1,3 +1,5 @@
+# Please read INSTALL, section 'Configure libzahl'.
+
 VERSION = 1.1
 
 PREFIX = /usr/local
@@ -9,14 +11,6 @@ CC = cc
 AR = ar
 RANLIB = ranlib
 
-# Unless /dev/urandom exists and is a non-blocking random number generator,
-# you have to add -DFAST_RANDOM_PATHNAME=... to CPPFLAGS, and
-# unless /dev/random exists and is a blocking secure random number generator
-# you have to add -DSECURE_RANDOM_PATHNAME=... to CPPFLAGS.
-# Remove -DGOOD_RAND if the higher bits have higher entropy in the lower
-# bits in rand(3), this was historically the case.
-# Add -DZAHL_UNSAFE if you rather libzahl did not check for errors.
-
 CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -DGOOD_RAND
-CFLAGS = -std=c99 -flto -O3 -Wall -pedantic
+CFLAGS = -std=c99 -O3 -flto -Wall -pedantic
 LDFLAGS = -s
diff --git a/src/internals.h b/src/internals.h
@@ -26,7 +26,7 @@
 #define Os ZAHL_Os
 #define Oz ZAHL_Oz
 
-#define LIST_TEMPS\
+#define LIST_TEMPS_HERE\
 	X(libzahl_tmp_str_num, 0)\
 	X(libzahl_tmp_str_mag, 0)\
 	X(libzahl_tmp_str_div, 0)\
@@ -35,8 +35,6 @@
 	X(libzahl_tmp_gcd_v, 0)\
 	X(libzahl_tmp_sub, 0)\
 	X(libzahl_tmp_modmul, 0)\
-	X(libzahl_tmp_div, 0)\
-	X(libzahl_tmp_mod, 0)\
 	X(libzahl_tmp_pow_b, 0)\
 	X(libzahl_tmp_pow_c, 0)\
 	X(libzahl_tmp_pow_d, 0)\
@@ -50,6 +48,11 @@
 	X(libzahl_tmp_ptest_n1, 0)\
 	X(libzahl_tmp_ptest_n4, 0)
 
+#define LIST_TEMPS\
+	X(libzahl_tmp_div, 0)\
+	X(libzahl_tmp_mod, 0)\
+	LIST_TEMPS_HERE
+
 #define LIST_CONSTS\
 	X(0, libzahl_const_1e19, zsetu, 10000000000000000000ULL) /* The largest power of 10 < 2⁶⁴. */\
 	X(1, libzahl_const_1, zsetu, 1)\
@@ -57,7 +60,7 @@
 	X(3, libzahl_const_4, zsetu, 4)
 
 #define X(x, s) extern z_t x;
-LIST_TEMPS
+LIST_TEMPS_HERE
 #undef X
 
 #define X(i, x, f, v) extern z_t x;
 LIST_CONSTS
@@ -119,7 +122,10 @@ zzero1(z_t a, z_t b)
 static inline void
 zmemcpy_range(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t i, size_t n)
 {
-	zmemcpy(d + i, s + i, n - i);
+	d += i;
+	s += i;
+	n -= i;
+	zmemcpy(d, s, n);
 }
 
 static void
diff --git a/src/zload.c b/src/zload.c
@@ -6,11 +6,12 @@ size_t
 zload(z_t a, const void *buffer)
 {
 	const char *buf = buffer;
-	a->sign = *((const long *)buf), buf += sizeof(long);
-	a->used = *((const size_t *)buf), buf += sizeof(size_t);
+	a->sign = (int)*((const long *)buf), buf += sizeof(long);
+	a->used = *((const size_t *)buf),    buf += sizeof(size_t);
 	if (likely(a->sign)) {
 		ENSURE_SIZE(a, a->used);
 		zmemcpy(a->chars, (const zahl_char_t *)buf, a->used);
 	}
-	return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t));
+	return sizeof(long) + sizeof(size_t) +
+	       (zzero(a) ? 0 : ((a->used + 3) & (size_t)~3) * sizeof(zahl_char_t));
 }
diff --git a/zahl.h b/zahl.h
@@ -13,6 +13,8 @@
 #include <stdint.h>
 #include <unistd.h>
 
+
+
 #include "zahl/internals.h"
@@ -180,4 +182,5 @@
 void zsqr_ll(z_t, z_t); /* zsqr for non-negative operand */
 
 #include "zahl/inlines.h"
+
 #endif
diff --git a/zahl/inlines.h b/zahl/inlines.h
@@ -268,7 +268,8 @@ zsave(z_t a, void *buffer)
 		libzahl_memcpy((zahl_char_t *)buf, a->chars, a->used);
 		}
 	}
-	return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t));
+	return sizeof(long) + sizeof(size_t) +
+	       (zzero(a) ? 0 : ((a->used + 3) & (size_t)~3) * sizeof(zahl_char_t));
 }
diff --git a/zahl/memory.h b/zahl/memory.h
@@ -24,7 +24,7 @@
 	case 0:
 		break;
 
-#if defined(LIBZAHL_ISA_MISSING_INDIRECT_JUMP)
+#if defined(ZAHL_ISA_MISSING_INDIRECT_JUMP)
 # define LIBZAHL_SMALL_INPUT_BEGIN(n)
 # define LIBZAHL_SMALL_INPUT_END
 #else
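
Editor's note (not part of the patch): the INSTALL text added above asks programs to
mirror the build-time macros before including the header, so that the inline functions
in zahl/inlines.h agree with how the library itself was compiled. A minimal sketch,
assuming libzahl was built with -DZAHL_UNSAFE:

	/* Must appear before <zahl.h>, and must match the CPPFLAGS
	 * used when libzahl itself was built. */
	#define ZAHL_UNSAFE   /* skip error checks, per the INSTALL advice */
	#include <zahl.h>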
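The return expressions changed in src/zload.c and zahl/inlines.h spell out the
serialisation format: a long (the sign), a size_t (the limb count), and, for non-zero
values, the limbs rounded up to a multiple of four. A hedged round-trip sketch follows;
it assumes the documented zsetup/zsave/zload/zcmp API, and reading a->used to size the
buffer reuses the formula from the diff and relies on z_t's fields being visible
through zahl/internals.h:

	#include <stdio.h>
	#include <stdlib.h>
	#include <setjmp.h>
	#include <zahl.h>

	int main(void)
	{
		jmp_buf env;
		z_t a, b;
		char *buf;
		size_t n;

		if (setjmp(env)) {    /* libzahl reports errors via longjmp(3) */
			zperror("roundtrip");
			return 1;
		}
		zsetup(env);
		zinit(a), zinit(b);
		zsetu(a, 18446744073709551557ULL);

		/* header (sign + used) plus limbs rounded up to a multiple of 4 */
		n = sizeof(long) + sizeof(size_t)
		  + ((a->used + 3) & (size_t)~3) * sizeof(zahl_char_t);
		buf = malloc(n);
		zsave(a, buf);        /* writes and returns the same byte count */
		zload(b, buf);        /* restores the value into b */

		printf("%s\n", !zcmp(a, b) ? "equal" : "differ");

		free(buf);
		zfree(a), zfree(b);
		zunsetup();
		return 0;
	}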