libzahl

big integer library
git clone git://git.suckless.org/libzahl
Log | Files | Refs | README | LICENSE

commit 84ad8821d456e8f7f40df43b7eb7245703004ce7
parent d6f4393542998276250bd3f3519bb824ca4b3d91
Author: Mattias Andrée <maandree@kth.se>
Date:   Sat,  7 May 2016 18:15:59 +0200

Optimise libzahl_memcpy for clang

Signed-off-by: Mattias Andrée <maandree@kth.se>

Diffstat:
MINSTALL | 4+++-
MSTATUS | 40++++++++++++++++++++--------------------
Mzahl/memory.h | 45++++++++++++++++++++++++++++++++++++++-------
3 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/INSTALL b/INSTALL @@ -33,4 +33,6 @@ libzahl contains some (very little) assembly code. In the event that the used instructions are not supported on your machine, please report it, and in the meanwhile add -DZAHL_NO_ASM to CPPFLAGS. You may also have to do this if you are compiling with a compiler that -does not support extended inline assembly. +does not support extended inline assembly. You may also have to add + #define ZAHL_NO_ASM +to your program before including <zahl.h> diff --git a/STATUS b/STATUS @@ -18,7 +18,7 @@ processes are fixed to one CPU. The following functions are probably implemented optimally: -zset .................... always fastest (gcc); until ~1200 (clang [can be fixed with assembly]) +zset .................... always fastest zseti(a, +) ............. tomsfastmath is faster zseti(a, -) ............. tomsfastmath is faster zsetu ................... tomsfastmath is faster @@ -30,8 +30,20 @@ zodd .................... always fastest (shared with gmp) zeven_nonzero ........... always fastest (shared with gmp) zodd_nonzero ............ always fastest (shared with gmp) zbtest .................. always fastest -zsave ................... always fastest [clang needs zset fix] -zload ................... always fastest [clang needs zset fix] +zsave ................... always fastest +zload ................... always fastest + + + The following functions are probably implemented optimally, but + depends on other functions or call-cases for better performance: + +zneg(a, b) .............. always fastest +zabs(a, b) .............. always fastest +ztrunc(a, b, c) ......... always fastest +zbset(a, b, 1) .......... always fastest +zbset(a, b, 0) .......... always fastest +zbset(a, b, -1) ......... always fastest +zsplit .................. alternating with gmp for fastest, but gmp is a bit faster on average The following functions are probably implemented close to @@ -40,26 +52,14 @@ zload ................... 
always fastest [clang needs zset fix] zadd_unsigned ........... fastest after ~140 (depends on cc and libc) compared against zadd too ztrunc(a, a, b) ......... fastest until ~100, then 77 % (gcc) or 68 % (clang) of tomsfastmath zbset(a, a, 1) .......... always fastest -zbset(a, a, 0) .......... always fastest (faster with clang than gcc) +zbset(a, a, 0) .......... always fastest zbset(a, a, -1) ......... always fastest (only marginally faster than gmp with clang) zlsb .................... always fastest <<suspicious>> zlsh .................... not too fast anymore -zand .................... fastest after ~400, tomsfastmath before (gcc+glibc is slow) -zor ..................... fastest after ~1150, tomsfastmath before (gcc+glibc is slow) -zxor .................... fastest after ~400, tomsfastmath before (clang), gcc is slow -znot .................... always fastest (faster with musl than glibc) - - - The following functions are probably implemented optimally, but - depends on other functions or call-cases for better performance: - -zneg(a, b) .............. always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix] -zabs(a, b) .............. always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix] -ztrunc(a, b, c) ......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix] -zbset(a, b, 1) .......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix] -zbset(a, b, 0) .......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix] -zbset(a, b, -1) ......... always fastest (gcc+musl); gcc is a bit slow [clang needs zset fix] -zsplit .................. alternating with gmp for fastest (clang and glibc is slower) +zand .................... fastest after ~400, tomsfastmath before +zor ..................... fastest after ~1150, tomsfastmath before +zxor .................... alternative with gmp after ~700, tomsfastmath before (musl), a bit slow with glibc +znot .................... 
always fastest The following functions require structural changes for diff --git a/zahl/memory.h b/zahl/memory.h @@ -34,16 +34,47 @@ ZAHL_INLINE void -libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n) +libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, register size_t n) { - size_t i; #define LIBZAHL_X(I) case I: d[I - 1] = s[I - 1]; LIBZAHL_SMALL_INPUT_BEGIN(n); - for (i = 0; i < n; i += 4) { - d[i + 0] = s[i + 0]; - d[i + 1] = s[i + 1]; - d[i + 2] = s[i + 2]; - d[i + 3] = s[i + 3]; + { +#if defined(__x86_64__) && !defined(ZAHL_NO_ASM) + /* This crap is needed for clang. */ + register zahl_char_t t; + __asm__ __volatile__ ( +# if defined(ZAHL_ISA_MISSING_INDIRECT_JUMP) + "\n testq %[e], %[e]" + "\n jz 2f" +# endif + "\n shlq $3, %[e]" + "\n addq %[d], %[e]" + "\n 1:" + "\n movq 0(%[s]), %[t]" + "\n movq %[t], 0(%[d])" + "\n movq 8(%[s]), %[t]" + "\n movq %[t], 8(%[d])" + "\n movq 16(%[s]), %[t]" + "\n movq %[t], 16(%[d])" + "\n movq 24(%[s]), %[t]" + "\n movq %[t], 24(%[d])" + "\n addq $32, %[s]" + "\n addq $32, %[d]" + "\n cmpq %[e], %[d]" + "\n jl 1b" +# if defined(ZAHL_ISA_MISSING_INDIRECT_JUMP) + "\n 2:" +# endif + : [t]"=r"(t), [d]"+r"(d), [s]"+r"(s), [e]"+r"(n)); +#else + size_t i; + for (i = 0; i < n; i += 4) { + d[i + 0] = s[i + 0]; + d[i + 1] = s[i + 1]; + d[i + 2] = s[i + 2]; + d[i + 3] = s[i + 3]; + } +#endif } LIBZAHL_SMALL_INPUT_END; #undef LIBZAHL_X