commit d6f4393542998276250bd3f3519bb824ca4b3d91
parent 247dffb6c9a24894f094392ce62f2aa2e81a8acc
Author: Mattias Andrée <maandree@kth.se>
Date: Sat, 7 May 2016 17:22:42 +0200
Some small improvements
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat:
9 files changed, 100 insertions(+), 53 deletions(-)
diff --git a/INSTALL b/INSTALL
@@ -0,0 +1,36 @@
+Configure libzahl
+=================
+
+libzahl is configured by editing config.mk. You may choose
+to make a copy of config.mk, and reference the copy with
+CONFIG when running make. For example:
+ cp config.mk my-config.mk
+ # edit my-config.mk
+ make CONFIG=my-config.mk
+
+Unless you are compiling for Linux, you may have to add
+ -D'FAST_RANDOM_PATHNAME="<path to a non-blocking random number generator>"'
+(/dev/urandom on Linux) and
+ -D'SECURE_RANDOM_PATHNAME="<path to a blocking random number generator>"'
+(/dev/random on Linux) to CPPFLAGS.
+
+If you are using a C standard library whose rand(3) has higher
+entropy in the higher bits than in the lower bits (as historically
+was the case), remove -DGOOD_RAND from CPPFLAGS.
+
+If you don't mind your program crashing on failure, you can add
+-DZAHL_UNSAFE to CPPFLAGS. This will give you a marginal performance
+boost. If you do so, you should preferably also add
+	#define ZAHL_UNSAFE
+before including <zahl.h> in your program.
+
+If your CPU does not support indirect jumps (computed jumps) you should
+add -DZAHL_ISA_MISSING_INDIRECT_JUMP to CPPFLAGS, and preferably add
+ #define ZAHL_ISA_MISSING_INDIRECT_JUMP
+before including <zahl.h> in your program.
+
+libzahl contains a small amount of assembly code. If any of the
+instructions it uses are not supported on your machine, please
+report it, and in the meantime add -DZAHL_NO_ASM to CPPFLAGS. You
+may also have to do this if you are compiling with a compiler that
+does not support extended inline assembly.
diff --git a/Makefile b/Makefile
@@ -1,4 +1,5 @@
-include config.mk
+CONFIG = config.mk
+include $(CONFIG)
HDR_SEMIPUBLIC =\
zahl/inlines.h\
@@ -121,7 +122,7 @@ CFLAGS_WITHOUT_O = $$(printf '%s\n' $(CFLAGS) | sed '/^-O.*$$/d')
all: libzahl.a $(DOC)
-.o: .c $(HDR) config.mk
+.o: .c $(HDR) $(CONFIG)
$(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $<
libzahl.a: $(OBJ)
diff --git a/STATUS b/STATUS
@@ -18,55 +18,55 @@ processes are fixed to one CPU.
The following functions are probably implemented optimally:
+zset .................... always fastest (gcc); until ~1200 (clang [can be fixed with assembly])
zseti(a, +) ............. tomsfastmath is faster
zseti(a, -) ............. tomsfastmath is faster
zsetu ................... tomsfastmath is faster
zswap ................... always fastest
zzero ................... always fastest (shared with gmp)
zsignum ................. always fastest (shared with gmp)
-zeven ................... always fastest
-zodd .................... always fastest
-zeven_nonzero ........... always fastest
+zeven ................... always fastest (shared with gmp)
+zodd .................... always fastest (shared with gmp)
+zeven_nonzero ........... always fastest (shared with gmp)
zodd_nonzero ............ always fastest (shared with gmp)
zbtest .................. always fastest
+zsave ................... always fastest [clang needs zset fix]
+zload ................... always fastest [clang needs zset fix]
The following functions are probably implemented close to
optimally, further optimisation should not be a priority:
-zadd_unsigned ........... fastest after ~70 compared against zadd too (x86-64)
+zadd_unsigned ........... fastest after ~140 (depends on cc and libc) compared against zadd too
ztrunc(a, a, b) ......... fastest until ~100, then 77 % (gcc) or 68 % (clang) of tomsfastmath
-zbset(a, a, 1) .......... always fastest (93 % of gmp (clang))
-zbset(a, a, 0) .......... always fastest
-zbset(a, a, -1) ......... always fastest
+zbset(a, a, 1) .......... always fastest
+zbset(a, a, 0) .......... always fastest (faster with clang than gcc)
+zbset(a, a, -1) ......... always fastest (only marginally faster than gmp with clang)
zlsb .................... always fastest <<suspicious>>
-zlsh .................... fastest until ~3400, then tomsfastmath, clang and musl are a bit slow
+zlsh .................... no longer particularly fast
+zand .................... fastest after ~400, tomsfastmath before (gcc+glibc is slow)
+zor ..................... fastest after ~1150, tomsfastmath before (gcc+glibc is slow)
+zxor .................... fastest after ~400, tomsfastmath before (clang), gcc is slow
+znot .................... always fastest (faster with musl than glibc)
The following functions are probably implemented optimally, but
depends on other functions or call-cases for better performance:
-zneg(a, b) .............. always fastest
-zabs(a, b) .............. always fastest
-ztrunc(a, b, c) ......... always fastest (alternating with gmp between 1400~3000 (clang+glibc))
-zbset(a, b, 1) .......... always fastest
-zbset(a, b, 0) .......... always fastest
-zbset(a, b, -1) ......... always fastest
-zsplit .................. alternating with gmp for fastest
+zneg(a, b) .............. always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zabs(a, b) .............. always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+ztrunc(a, b, c) ......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zbset(a, b, 1) .......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zbset(a, b, 0) .......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zbset(a, b, -1) ......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zsplit .................. alternating with gmp for fastest (clang+glibc is slower)
The following functions require structural changes for
further optimisations:
-zset .................... always fastest
-zneg(a, a) .............. always fastest (shared with gmp; faster with clang)
-zabs(a, a) .............. tomsfastmath is faster (46 % of tomsfastmath with clang)
-zand .................... fastest until ~900, alternating with gmp
-zor ..................... fastest until ~1750, alternating with gmp (gcc) and tomsfastmath (clang)
-zxor .................... fastest until ~700, alternating with gmp (gcc+glibc)
-znot .................... always fastest
-zsave ................... always fastest
-zload ................... always fastest
+zneg(a, a) .............. always fastest (shared with gmp (gcc))
+zabs(a, a) .............. 34 % (clang) or 8 % (gcc) of tomsfastmath
The following functions are probably implemented optimally
@@ -82,26 +82,31 @@ zcmpu ................... always fastest
It may be possible to optimise the following functions
further:
-zadd .................... fastest after ~110 (x86-64)
-zcmp .................... acceptable (glibc); almost always fastest (musl)
+zadd .................... fastest after ~90 (clang), ~260 (gcc+musl), or ~840 (gcc+glibc) (gcc+glibc is slow)
+zcmp .................... almost always fastest (musl), almost always slowest (glibc) <<suspicious (clang)>>
zcmpmag ................. always fastest <<suspicious, see zcmp>>
The following functions could be optimised further:
-zrsh .................... gmp is almost always faster
+zrsh .................... gmp is almost always faster; also tomsfastmath after ~3000 (gcc+glibc) or ~2800 (clang)
zsub_unsigned ........... always fastest (compared against zsub too)
-zsub .................... always fastest
+zsub .................... always fastest (slower with gcc+glibc than gcc+musl or clang)
The following functions could probably be optimised further,
but their performance can be significantly improved by
optimising their dependencies:
-zmul .................... fastest after ~4096
-zsqr .................... slowest, but asymptotically fastest
-zstr_length(a, 10) ...... gmp is faster
-zstr(a, b, n) ........... fastest after ~700
+zmul .................... slowest
+zsqr .................... slowest
+zstr_length(a, 10) ...... gmp is faster (clang is faster than gcc, musl is faster than glibc)
+zstr(a, b, n) ........... slowest
+
+
+musl has more stable performance than glibc. clang is better at
+inlining than gcc. (Which is better at optimising must be judged
+on a per-function basis.)
diff --git a/config.mk b/config.mk
@@ -1,3 +1,5 @@
+# Please read INSTALL, section 'Configure libzahl'.
+
VERSION = 1.1
PREFIX = /usr/local
@@ -9,14 +11,6 @@ CC = cc
AR = ar
RANLIB = ranlib
-# Unless /dev/urandom exists and is a non-blocking random number generator,
-# you have to add -DFAST_RANDOM_PATHNAME=... to CPPFLAGS, and
-# unless /dev/random exists and is a blocking secure random number generator
-# you have to add -DSECURE_RANDOM_PATHNAME=... to CPPFLAGS.
-# Remove -DGOOD_RAND if the higher bits have higher entropy in the lower
-# bits in rand(3), this was historically the case.
-# Add -DZAHL_UNSAFE if you rather libzahl did not check for errors.
-
CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -DGOOD_RAND
-CFLAGS = -std=c99 -flto -O3 -Wall -pedantic
+CFLAGS = -std=c99 -O3 -flto -Wall -pedantic
LDFLAGS = -s
diff --git a/src/internals.h b/src/internals.h
@@ -26,7 +26,7 @@
#define Os ZAHL_Os
#define Oz ZAHL_Oz
-#define LIST_TEMPS\
+#define LIST_TEMPS_HERE\
X(libzahl_tmp_str_num, 0)\
X(libzahl_tmp_str_mag, 0)\
X(libzahl_tmp_str_div, 0)\
@@ -35,8 +35,6 @@
X(libzahl_tmp_gcd_v, 0)\
X(libzahl_tmp_sub, 0)\
X(libzahl_tmp_modmul, 0)\
- X(libzahl_tmp_div, 0)\
- X(libzahl_tmp_mod, 0)\
X(libzahl_tmp_pow_b, 0)\
X(libzahl_tmp_pow_c, 0)\
X(libzahl_tmp_pow_d, 0)\
@@ -50,6 +48,11 @@
X(libzahl_tmp_ptest_n1, 0)\
X(libzahl_tmp_ptest_n4, 0)
+#define LIST_TEMPS\
+ X(libzahl_tmp_div, 0)\
+ X(libzahl_tmp_mod, 0)\
+ LIST_TEMPS_HERE
+
#define LIST_CONSTS\
X(0, libzahl_const_1e19, zsetu, 10000000000000000000ULL) /* The largest power of 10 < 2⁶⁴. */\
X(1, libzahl_const_1, zsetu, 1)\
@@ -57,7 +60,7 @@
X(3, libzahl_const_4, zsetu, 4)
#define X(x, s) extern z_t x;
-LIST_TEMPS
+LIST_TEMPS_HERE
#undef X
#define X(i, x, f, v) extern z_t x;
LIST_CONSTS
@@ -119,7 +122,10 @@ zzero1(z_t a, z_t b)
static inline void
zmemcpy_range(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t i, size_t n)
{
- zmemcpy(d + i, s + i, n - i);
+ d += i;
+ s += i;
+ n -= i;
+ zmemcpy(d, s, n);
}
static void
diff --git a/src/zload.c b/src/zload.c
@@ -6,11 +6,12 @@ size_t
zload(z_t a, const void *buffer)
{
const char *buf = buffer;
- a->sign = *((const long *)buf), buf += sizeof(long);
- a->used = *((const size_t *)buf), buf += sizeof(size_t);
+ a->sign = (int)*((const long *)buf), buf += sizeof(long);
+ a->used = *((const size_t *)buf), buf += sizeof(size_t);
if (likely(a->sign)) {
ENSURE_SIZE(a, a->used);
zmemcpy(a->chars, (const zahl_char_t *)buf, a->used);
}
- return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t));
+ return sizeof(long) + sizeof(size_t) +
+ (zzero(a) ? 0 : ((a->used + 3) & (size_t)~3) * sizeof(zahl_char_t));
}
diff --git a/zahl.h b/zahl.h
@@ -13,6 +13,8 @@
#include <stdint.h>
#include <unistd.h>
+
+
#include "zahl/internals.h"
@@ -180,4 +182,5 @@ void zsqr_ll(z_t, z_t); /* zsqr for non-negative operand */
#include "zahl/inlines.h"
+
#endif
diff --git a/zahl/inlines.h b/zahl/inlines.h
@@ -268,7 +268,8 @@ zsave(z_t a, void *buffer)
libzahl_memcpy((zahl_char_t *)buf, a->chars, a->used);
}
}
- return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t));
+ return sizeof(long) + sizeof(size_t) +
+ (zzero(a) ? 0 : ((a->used + 3) & (size_t)~3) * sizeof(zahl_char_t));
}
diff --git a/zahl/memory.h b/zahl/memory.h
@@ -24,7 +24,7 @@
case 0: break;
-#if defined(LIBZAHL_ISA_MISSING_INDIRECT_JUMP)
+#if defined(ZAHL_ISA_MISSING_INDIRECT_JUMP)
# define LIBZAHL_SMALL_INPUT_BEGIN(n)
# define LIBZAHL_SMALL_INPUT_END
#else