libzahl

big integer library
git clone git://git.suckless.org/libzahl

commit d6f4393542998276250bd3f3519bb824ca4b3d91
parent 247dffb6c9a24894f094392ce62f2aa2e81a8acc
Author: Mattias Andrée <maandree@kth.se>
Date:   Sat,  7 May 2016 17:22:42 +0200

Some small improvements

Signed-off-by: Mattias Andrée <maandree@kth.se>

Diffstat:
A INSTALL         |  36 ++++++++++++++++++++++++++++++++++++
M Makefile        |   5 +++--
M STATUS          |  69 +++++++++++++++++++++++++++++++++++++--------------------------------
M config.mk       |  12 +++---------
M src/internals.h |  16 +++++++++++-----
M src/zload.c     |   7 ++++---
M zahl.h          |   3 +++
M zahl/inlines.h  |   3 ++-
M zahl/memory.h   |   2 +-
9 files changed, 100 insertions(+), 53 deletions(-)

diff --git a/INSTALL b/INSTALL
@@ -0,0 +1,36 @@
+Configure libzahl
+=================
+
+libzahl is configured by editing config.mk. You may choose
+to make a copy of config.mk, and reference the copy with
+CONFIG when running make. For example:
+	cp config.mk my-config.mk
+	# edit my-config.mk
+	make CONFIG=my-config.mk
+
+Unless you are compiling for Linux you may have to add
+	-D'FAST_RANDOM_PATHNAME="<path to a non-blocking random number generator>"'
+(/dev/urandom on Linux) and
+	-D'SECURE_RANDOM_PATHNAME="<path to a blocking random number generator>"'
+(/dev/random on Linux) to CPPFLAGS.
+
+If you are using a C standard library where the higher bits have higher
+entropy than the lower bits in rand(3) (as historically was the case),
+remove -DGOOD_RAND from CPPFLAGS.
+
+If you don't care if your program crashes on failure, you can add
+-DZAHL_UNSAFE to CPPFLAGS. This will give you a marginal performance
+boost. If you are doing this, you should preferably also add
+	#define ZAHL_UNSAFE
+before including <zahl.h> in your program.
+
+If your CPU does not support indirect jumps (computed jumps) you should
+add -DZAHL_ISA_MISSING_INDIRECT_JUMP to CPPFLAGS, and preferably add
+	#define ZAHL_ISA_MISSING_INDIRECT_JUMP
+before including <zahl.h> in your program.
+
+libzahl contains some (very little) assembly code. In the event
+that the used instructions are not supported on your machine, please
+report it, and in the meantime add -DZAHL_NO_ASM to CPPFLAGS. You
+may also have to do this if you are compiling with a compiler that
+does not support extended inline assembly.
diff --git a/Makefile b/Makefile
@@ -1,4 +1,5 @@
-include config.mk
+CONFIG = config.mk
+include $(CONFIG)
 
 HDR_SEMIPUBLIC =\
 	zahl/inlines.h\
@@ -121,7 +122,7 @@ CFLAGS_WITHOUT_O = $$(printf '%s\n' $(CFLAGS) | sed '/^-O.*$$/d')
 
 all: libzahl.a $(DOC)
 
-.o: .c $(HDR) config.mk
+.o: .c $(HDR) $(CONFIG)
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $<
 
 libzahl.a: $(OBJ)
diff --git a/STATUS b/STATUS
@@ -18,55 +18,55 @@ processes are fixed to one CPU.
 
 The following functions are probably implemented optimally:
 
+zset .................... always fastest (gcc); until ~1200 (clang [can be fixed with assembly])
 zseti(a, +) ............. tomsfastmath is faster
 zseti(a, -) ............. tomsfastmath is faster
 zsetu ................... tomsfastmath is faster
 zswap ................... always fastest
 zzero ................... always fastest (shared with gmp)
 zsignum ................. always fastest (shared with gmp)
-zeven ................... always fastest
-zodd .................... always fastest
-zeven_nonzero ........... always fastest
+zeven ................... always fastest (shared with gmp)
+zodd .................... always fastest (shared with gmp)
+zeven_nonzero ........... always fastest (shared with gmp)
 zodd_nonzero ............ always fastest (shared with gmp)
 zbtest .................. always fastest
+zsave ................... always fastest [clang needs zset fix]
+zload ................... always fastest [clang needs zset fix]
 
 The following functions are probably implemented close to
 optimally; further optimisation should not be a priority:
 
-zadd_unsigned ........... fastest after ~70 compared against zadd too (x86-64)
+zadd_unsigned ........... fastest after ~140 (depends on cc and libc) compared against zadd too
 ztrunc(a, a, b) ......... fastest until ~100, then 77 % (gcc) or 68 % (clang) of tomsfastmath
-zbset(a, a, 1) .......... always fastest (93 % of gmp (clang))
-zbset(a, a, 0) .......... always fastest
-zbset(a, a, -1) ......... always fastest
+zbset(a, a, 1) .......... always fastest
+zbset(a, a, 0) .......... always fastest (faster with clang than gcc)
+zbset(a, a, -1) ......... always fastest (only marginally faster than gmp with clang)
 zlsb .................... always fastest <<suspicious>>
-zlsh .................... fastest until ~3400, then tomsfastmath, clang and musl are a bit slow
+zlsh .................... not too fast anymore
+zand .................... fastest after ~400, tomsfastmath before (gcc+glibc is slow)
+zor ..................... fastest after ~1150, tomsfastmath before (gcc+glibc is slow)
+zxor .................... fastest after ~400, tomsfastmath before (clang), gcc is slow
+znot .................... always fastest (faster with musl than glibc)
 
 The following functions are probably implemented optimally, but
 depend on other functions or call-cases for better performance:
 
-zneg(a, b) .............. always fastest
-zabs(a, b) .............. always fastest
-ztrunc(a, b, c) ......... always fastest (alternating with gmp between 1400~3000 (clang+glibc))
-zbset(a, b, 1) .......... always fastest
-zbset(a, b, 0) .......... always fastest
-zbset(a, b, -1) ......... always fastest
-zsplit .................. alternating with gmp for fastest
+zneg(a, b) .............. always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zabs(a, b) .............. always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+ztrunc(a, b, c) ......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zbset(a, b, 1) .......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zbset(a, b, 0) .......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zbset(a, b, -1) ......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix]
+zsplit .................. alternating with gmp for fastest (clang and glibc is slower)
 
 The following functions require structural changes for further
 optimisations:
 
-zset .................... always fastest
-zneg(a, a) .............. always fastest (shared with gmp; faster with clang)
-zabs(a, a) .............. tomsfastmath is faster (46 % of tomsfastmath with clang)
-zand .................... fastest until ~900, alternating with gmp
-zor ..................... fastest until ~1750, alternating with gmp (gcc) and tomsfastmath (clang)
-zxor .................... fastest until ~700, alternating with gmp (gcc+glibc)
-znot .................... always fastest
-zsave ................... always fastest
-zload ................... always fastest
+zneg(a, a) .............. always fastest (shared with gmp (gcc))
+zabs(a, a) .............. 34 % (clang) or 8 % (gcc) of tomsfastmath
 
 The following functions are probably implemented optimally
@@ -82,26 +82,31 @@ zcmpu ................... always fastest
 
 It may be possible to optimise the following functions further:
 
-zadd .................... fastest after ~110 (x86-64)
-zcmp .................... acceptable (glibc); almost always fastest (musl)
+zadd .................... fastest after ~90 (clang), ~260 (gcc+musl), or ~840 (gcc+glibc) (gcc+glibc is slow)
+zcmp .................... almost always fastest (musl), almost always slowest (glibc) <<suspicious (clang)>>
 zcmpmag ................. always fastest <<suspicious, see zcmp>>
 
 The following functions could be optimised further:
 
-zrsh .................... gmp is almost always faster
+zrsh .................... gmp is almost always faster; also tomsfastmath after ~3000 (gcc+glibc) or ~2800 (clang)
 zsub_unsigned ........... always fastest (compared against zsub too)
-zsub .................... always fastest
+zsub .................... always fastest (slower with gcc+glibc than gcc+musl or clang)
 
 The following functions could probably be optimised further, but
 their performance can be significantly improved by optimising
 their dependencies:
 
-zmul .................... fastest after ~4096
-zsqr .................... slowest, but asymptotically fastest
-zstr_length(a, 10) ...... gmp is faster
-zstr(a, b, n) ........... fastest after ~700
+zmul .................... slowest
+zsqr .................... slowest
+zstr_length(a, 10) ...... gmp is faster (clang is faster than gcc, musl is faster than glibc)
+zstr(a, b, n) ........... slowest
+
+
+musl has more stable performance than glibc. clang is better at
+inlining than gcc. (Which is better at optimising must be judged
+on a per-function basis.)
diff --git a/config.mk b/config.mk
@@ -1,3 +1,5 @@
+# Please read INSTALL, section 'Configure libzahl'.
+
 VERSION = 1.1
 
 PREFIX = /usr/local
@@ -9,14 +11,6 @@ CC = cc
 AR = ar
 RANLIB = ranlib
 
-# Unless /dev/urandom exists and is a non-blocking random number generator,
-# you have to add -DFAST_RANDOM_PATHNAME=... to CPPFLAGS, and
-# unless /dev/random exists and is a blocking secure random number generator
-# you have to add -DSECURE_RANDOM_PATHNAME=... to CPPFLAGS.
-# Remove -DGOOD_RAND if the higher bits have higher entropy in the lower
-# bits in rand(3), this was historically the case.
-# Add -DZAHL_UNSAFE if you rather libzahl did not check for errors.
-
 CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -DGOOD_RAND
-CFLAGS = -std=c99 -flto -O3 -Wall -pedantic
+CFLAGS = -std=c99 -O3 -flto -Wall -pedantic
 LDFLAGS = -s
diff --git a/src/internals.h b/src/internals.h
@@ -26,7 +26,7 @@
 #define Os ZAHL_Os
 #define Oz ZAHL_Oz
 
-#define LIST_TEMPS\
+#define LIST_TEMPS_HERE\
 	X(libzahl_tmp_str_num, 0)\
 	X(libzahl_tmp_str_mag, 0)\
 	X(libzahl_tmp_str_div, 0)\
@@ -35,8 +35,6 @@
 	X(libzahl_tmp_gcd_v, 0)\
 	X(libzahl_tmp_sub, 0)\
 	X(libzahl_tmp_modmul, 0)\
-	X(libzahl_tmp_div, 0)\
-	X(libzahl_tmp_mod, 0)\
 	X(libzahl_tmp_pow_b, 0)\
 	X(libzahl_tmp_pow_c, 0)\
 	X(libzahl_tmp_pow_d, 0)\
@@ -50,6 +48,11 @@
 	X(libzahl_tmp_ptest_n1, 0)\
 	X(libzahl_tmp_ptest_n4, 0)
 
+#define LIST_TEMPS\
+	X(libzahl_tmp_div, 0)\
+	X(libzahl_tmp_mod, 0)\
+	LIST_TEMPS_HERE
+
 #define LIST_CONSTS\
 	X(0, libzahl_const_1e19, zsetu, 10000000000000000000ULL) /* The largest power of 10 < 2⁶⁴. */\
 	X(1, libzahl_const_1, zsetu, 1)\
@@ -57,7 +60,7 @@
 	X(3, libzahl_const_4, zsetu, 4)
 
 #define X(x, s) extern z_t x;
-LIST_TEMPS
+LIST_TEMPS_HERE
 #undef X
 
 #define X(i, x, f, v) extern z_t x;
 LIST_CONSTS
@@ -119,7 +122,10 @@ zzero1(z_t a, z_t b)
 static inline void
 zmemcpy_range(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t i, size_t n)
 {
-	zmemcpy(d + i, s + i, n - i);
+	d += i;
+	s += i;
+	n -= i;
+	zmemcpy(d, s, n);
 }
 
 static void
diff --git a/src/zload.c b/src/zload.c
@@ -6,11 +6,12 @@ size_t
 zload(z_t a, const void *buffer)
 {
 	const char *buf = buffer;
-	a->sign = *((const long *)buf), buf += sizeof(long);
-	a->used = *((const size_t *)buf), buf += sizeof(size_t);
+	a->sign = (int)*((const long *)buf), buf += sizeof(long);
+	a->used = *((const size_t *)buf),    buf += sizeof(size_t);
 	if (likely(a->sign)) {
 		ENSURE_SIZE(a, a->used);
 		zmemcpy(a->chars, (const zahl_char_t *)buf, a->used);
 	}
-	return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t));
+	return sizeof(long) + sizeof(size_t) +
+	       (zzero(a) ? 0 : ((a->used + 3) & (size_t)~3) * sizeof(zahl_char_t));
 }
diff --git a/zahl.h b/zahl.h
@@ -13,6 +13,8 @@
 #include <stdint.h>
 #include <unistd.h>
 
+
+
 #include "zahl/internals.h"
@@ -180,4 +182,5 @@
 void zsqr_ll(z_t, z_t); /* zsqr for non-negative operand */
 
 #include "zahl/inlines.h"
+
 #endif
diff --git a/zahl/inlines.h b/zahl/inlines.h
@@ -268,7 +268,8 @@ zsave(z_t a, void *buffer)
 		libzahl_memcpy((zahl_char_t *)buf, a->chars, a->used);
 		}
 	}
-	return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t));
+	return sizeof(long) + sizeof(size_t) +
+	       (zzero(a) ? 0 : ((a->used + 3) & (size_t)~3) * sizeof(zahl_char_t));
 }
diff --git a/zahl/memory.h b/zahl/memory.h
@@ -24,7 +24,7 @@
 	case 0:
 		break;
 
-#if defined(LIBZAHL_ISA_MISSING_INDIRECT_JUMP)
+#if defined(ZAHL_ISA_MISSING_INDIRECT_JUMP)
 # define LIBZAHL_SMALL_INPUT_BEGIN(n)
 # define LIBZAHL_SMALL_INPUT_END
 #else
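
Editor's note (not part of the patch): the INSTALL text added above asks programs to
mirror the build-time macros before including the header, so that the inline functions
in zahl/inlines.h agree with how the library itself was compiled. A minimal sketch,
assuming libzahl was built with -DZAHL_UNSAFE:

	/* Must appear before <zahl.h>, and must match the CPPFLAGS
	 * used when libzahl itself was built. */
	#define ZAHL_UNSAFE   /* skip error checks, per the INSTALL advice */
	#include <zahl.h>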
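The return expressions changed in src/zload.c and zahl/inlines.h spell out the
serialisation format: a long (the sign), a size_t (the limb count), and, for non-zero
values, the limbs rounded up to a multiple of four. A hedged round-trip sketch follows;
it assumes the documented zsetup/zsave/zload/zcmp API, and reading a->used to size the
buffer reuses the formula from the diff and relies on z_t's fields being visible
through zahl/internals.h:

	#include <stdio.h>
	#include <stdlib.h>
	#include <setjmp.h>
	#include <zahl.h>

	int main(void)
	{
		jmp_buf env;
		z_t a, b;
		char *buf;
		size_t n;

		if (setjmp(env)) {    /* libzahl reports errors via longjmp(3) */
			zperror("roundtrip");
			return 1;
		}
		zsetup(env);
		zinit(a), zinit(b);
		zsetu(a, 18446744073709551557ULL);

		/* header (sign + used) plus limbs rounded up to a multiple of 4 */
		n = sizeof(long) + sizeof(size_t)
		  + ((a->used + 3) & (size_t)~3) * sizeof(zahl_char_t);
		buf = malloc(n);
		zsave(a, buf);        /* writes and returns the same byte count */
		zload(b, buf);        /* restores the value into b */

		printf("%s\n", !zcmp(a, b) ? "equal" : "differ");

		free(buf);
		zfree(a), zfree(b);
		zunsetup();
		return 0;
	}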