libzahl

big integer library
git clone git://git.suckless.org/libzahl
Log | Files | Refs | README | LICENSE

commit fdc75a358e4c20116640c08f4e8ce7a09dc3cebd
parent 4e659044778dfd05a5f5d2b41611e7737a0eb825
Author: Mattias Andrée <maandree@kth.se>
Date:   Sat,  7 May 2016 03:02:56 +0200

Optimisations

Signed-off-by: Mattias Andrée <maandree@kth.se>

Diffstat:
MMakefile | 3++-
Msrc/allocator.c | 2+-
Msrc/internals.h | 97+++++++++++++++++++++++++------------------------------------------------------
Msrc/zadd.c | 2+-
Msrc/zload.c | 4++--
Msrc/zlsh.c | 4++--
Msrc/zor.c | 4++--
Msrc/zptest.c | 3+--
Msrc/zsetup.c | 2+-
Msrc/zsqr.c | 4++--
Msrc/zxor.c | 4++--
Mzahl.h | 1+
Mzahl/inlines.h | 10+++++++---
Mzahl/internals.h | 61++-----------------------------------------------------------
Azahl/memory.h | 133+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
15 files changed, 190 insertions(+), 144 deletions(-)

diff --git a/Makefile b/Makefile @@ -2,7 +2,8 @@ include config.mk HDR_SEMIPUBLIC =\ zahl/inlines.h\ - zahl/internals.h + zahl/internals.h\ + zahl/memory.h HDR_PRIVATE =\ src/internals.h diff --git a/src/allocator.c b/src/allocator.c @@ -21,7 +21,7 @@ libzahl_realloc(z_t a, size_t need) zfree(a); a->chars = new; } else { - a->chars = realloc(a->chars, new_size * sizeof(zahl_char_t)); + a->chars = realloc(a->chars, (new_size + ZAHL_FLUFF) * sizeof(zahl_char_t)); if (check(!a->chars)) libzahl_memfailure(); } diff --git a/src/internals.h b/src/internals.h @@ -4,7 +4,6 @@ #include <errno.h> #include <stdlib.h> #include <string.h> -#include <unistd.h> /* clang pretends to be GCC... */ #if defined(__GNUC__) && defined(__clang__) @@ -105,7 +104,11 @@ extern void *libzahl_temp_allocation; #define zpositive2(a, b) (zsignum(a) + zsignum(b) == 2) #define zzero2(a, b) (!(zsignum(a) | zsignum(b))) #define zmemcpy(d, s, n) libzahl_memcpy(d, s, n) +#define zmemmove(d, s, n) libzahl_memmove(d, s, n) +#define zmemmovef(d, s, n) libzahl_memmovef(d, s, n) +#define zmemmoveb(d, s, n) libzahl_memmoveb(d, s, n) #define zmemset(a, v, n) libzahl_memset(a, v, n) +#define zmemset_precise(a, v, n) libzahl_memset_precise(a, v, n) static inline int zzero1(z_t a, z_t b) @@ -113,11 +116,10 @@ zzero1(z_t a, z_t b) return zzero(a) || zzero(b); } -O2 static inline void +static inline void zmemcpy_range(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t i, size_t n) { - for (; i < n; i++) - d[i] = s[i]; + zmemcpy(d + i, s + i, n - i); } static void @@ -337,7 +339,7 @@ zfree_temp(z_t a) /* } */ -#define ZMEM_2OP(a, b, c, n, OP) \ +#define ZMEM_2OP_PRECISE(a, b, c, n, OP) \ do { \ zahl_char_t *a__ = (a); \ const zahl_char_t *b__ = (b); \ @@ -365,66 +367,29 @@ zfree_temp(z_t a) } \ } while (0) -#define ZMEM_1OP(a, b, n, OP) \ - do { \ - zahl_char_t *a__ = (a); \ - const zahl_char_t *b__ = (b); \ - size_t i__, n__ = (n); \ - if (n__ <= 4) { \ - if (n__ >= 1) \ - a__[0] = OP(b__[0]); \ - if (n__ >= 2) \ - a__[1] = OP(b__[1]); \ - if (n__ >= 3) \ - a__[2] = OP(b__[2]); \ - if (n__ >= 4) \ - a__[3] = OP(b__[3]); \ - } else { \ - for (i__ = 0; (i__ += 4) < n__;) { \ - a__[i__ - 1] = OP(b__[i__ - 1]); \ - a__[i__ - 2] = OP(b__[i__ - 2]); \ - a__[i__ - 3] = OP(b__[i__ - 3]); \ - a__[i__ - 4] = OP(b__[i__ - 4]); \ - } \ - if (i__ > n__) \ - for (i__ -= 4; i__ < n__; i__++) \ - a__[i__] = OP(b__[i__]); \ - } \ +#define ZMEM_2OP(a, b, c, n, OP) \ + do { \ + zahl_char_t *a__ = (a); \ + const zahl_char_t *b__ = (b); \ + const zahl_char_t *c__ = (c); \ + size_t i__, n__ = (n); \ + for (i__ = 0; i__ < n__; i__ += 4) { \ + a__[i__ + 0] = b__[i__ + 0] OP c__[i__ + 0]; \ + a__[i__ + 1] = b__[i__ + 1] OP c__[i__ + 1]; \ + a__[i__ + 2] = b__[i__ + 2] OP c__[i__ + 2]; \ + a__[i__ + 3] = b__[i__ + 3] OP c__[i__ + 3]; \ + } \ } while (0) -static inline void -zmemcpyb(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n_) -{ - ssize_t i, n = (ssize_t)n_; - switch (n & 3) { - case 3: - d[n - 1] = s[n - 1]; - d[n - 2] = s[n - 2]; - d[n - 3] = s[n - 3]; - break; - case 2: - d[n - 1] = s[n - 1]; - d[n - 2] = s[n - 2]; - break; - case 1: - d[n - 1] = s[n - 1]; - break; - default: - break; - } - for (i = n & ~3; (i -= 4) >= 0;) { - d[i + 3] = s[i + 3]; - d[i + 2] = s[i + 2]; - d[i + 1] = s[i + 1]; - d[i + 0] = s[i + 0]; - } -} - -static inline void -zmemmove(register zahl_char_t *d, register const zahl_char_t *s, size_t n) -{ - if (d < s) - zmemcpy(d, s, n); - else - zmemcpyb(d, s, n); -} +#define ZMEM_1OP(a, b, n, OP) \ + do { \ + zahl_char_t *a__ = (a); \ + const zahl_char_t *b__ = (b); \ + size_t i__, n__ = (n); \ + for (i__ = 0; i__ < n__; i__ += 4) { \ + a__[i__ + 0] = OP(b__[i__ + 0]); \ + a__[i__ + 1] = OP(b__[i__ + 1]); \ + a__[i__ + 2] = OP(b__[i__ + 2]); \ + a__[i__ + 3] = OP(b__[i__ + 3]); \ + } \ + } while (0) diff --git a/src/zadd.c b/src/zadd.c @@ -2,7 +2,7 @@ #include "internals.h" -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(ZAHL_NO_ASM) # define ASM3(code) \ __asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc), [c]"+r"(cc)) diff --git a/src/zload.c b/src/zload.c @@ -6,11 +6,11 @@ size_t zload(z_t a, const void *buffer) { const char *buf = buffer; - a->sign = *((const int *)buf), buf += sizeof(int); + a->sign = *((const long *)buf), buf += sizeof(long); a->used = *((const size_t *)buf), buf += sizeof(size_t); if (likely(a->sign)) { ENSURE_SIZE(a, a->used); zmemcpy(a->chars, (const zahl_char_t *)buf, a->used); } - return sizeof(int) + sizeof(size_t) + (zzero(a) ? 0 : a->used * sizeof(zahl_char_t)); + return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t)); } diff --git a/src/zlsh.c b/src/zlsh.c @@ -19,11 +19,11 @@ zlsh(z_t a, z_t b, size_t bits) ENSURE_SIZE(a, b->used + chars + 1); if (likely(a == b)) { - zmemcpyb(a->chars + chars, b->chars, b->used); + zmemmoveb(a->chars + chars, b->chars, b->used); } else { zmemcpy(a->chars + chars, b->chars, b->used); } - zmemset(a->chars, 0, chars); + zmemset_precise(a->chars, 0, chars); a->used = b->used + chars; if (likely(bits)) { /* This if statement is very important in C. */ diff --git a/src/zor.c b/src/zor.c @@ -19,11 +19,11 @@ zor(z_t a, z_t b, z_t c) ENSURE_SIZE(a, m); if (a == b) { - ZMEM_2OP(a->chars, a->chars, c->chars, n, |); + ZMEM_2OP_PRECISE(a->chars, a->chars, c->chars, n, |); if (a->used < c->used) zmemcpy_range(a->chars, c->chars, n, m); } else if (unlikely(a == c)) { - ZMEM_2OP(a->chars, a->chars, b->chars, n, |); + ZMEM_2OP_PRECISE(a->chars, a->chars, b->chars, n, |); if (a->used < b->used) zmemcpy_range(a->chars, b->chars, n, m); } else if (m == b->used) { diff --git a/src/zptest.c b/src/zptest.c @@ -47,8 +47,7 @@ zptest(z_t witness, z_t n, int t) continue; for (i = 1; i < r; i++) { - zsqr(x, x); - zmod(x, x, n); + zmodsqr(x, x, n); if (!zcmp(x, libzahl_const_1)) { if (witness) zswap(witness, a); diff --git a/src/zsetup.c b/src/zsetup.c @@ -21,7 +21,7 @@ struct zahl **libzahl_temp_stack_end; void *libzahl_temp_allocation = 0; #define X(i, x, f, v) 1 + -static zahl_char_t constant_chars[LIST_CONSTS 0]; +static zahl_char_t constant_chars[LIST_CONSTS ZAHL_FLUFF]; #undef X diff --git a/src/zsqr.c b/src/zsqr.c @@ -20,7 +20,7 @@ zsqr_ll(z_t a, z_t b) #define z2 a z_t z0, z1, high, low; - zahl_char_t auxchars[3]; + zahl_char_t auxchars[3 * ZAHL_FLUFF]; size_t bits; bits = zbits(b); @@ -38,7 +38,7 @@ zsqr_ll(z_t a, z_t b) * which require constant auxiliary memory. */ if (bits < BITS_PER_CHAR) { low->chars = auxchars; - high->chars = auxchars + 1; + high->chars = auxchars + ZAHL_FLUFF; zsplit_unsigned_fast_small_auto(high, low, b, bits); } else { bits = TRUNCATE_TO_CHAR(bits); diff --git a/src/zxor.c b/src/zxor.c @@ -26,11 +26,11 @@ zxor(z_t a, z_t b, z_t c) ENSURE_SIZE(a, m); if (a == b) { - ZMEM_2OP(a->chars, a->chars, cc, n, ^); + ZMEM_2OP_PRECISE(a->chars, a->chars, cc, n, ^); if (a->used < cn) zmemcpy_range(a->chars, cc, n, m); } else if (unlikely(a == c)) { - ZMEM_2OP(a->chars, b->chars, cc, n, ^); + ZMEM_2OP_PRECISE(a->chars, a->chars, bc, n, ^); if (a->used < bn) zmemcpy_range(a->chars, bc, n, m); } else if (m == bn) { diff --git a/zahl.h b/zahl.h @@ -11,6 +11,7 @@ #include <setjmp.h> #include <stddef.h> #include <stdint.h> +#include <unistd.h> #include "zahl/internals.h" diff --git a/zahl/inlines.h b/zahl/inlines.h @@ -259,12 +259,16 @@ zsave(z_t a, void *buffer) { if (ZAHL_LIKELY(buffer)) { char *buf = buffer; - *((int *)buf) = a->sign, buf += sizeof(int); + *((long *)buf) = a->sign, buf += sizeof(long); /* Use `long` for alignment. */ *((size_t *)buf) = a->used, buf += sizeof(size_t); - if (ZAHL_LIKELY(!zzero(a))) + if (ZAHL_LIKELY(!zzero(a))) { + a->chars[a->used + 2] = 0; + a->chars[a->used + 1] = 0; + a->chars[a->used + 0] = 0; libzahl_memcpy((zahl_char_t *)buf, a->chars, a->used); + } } - return sizeof(int) + sizeof(size_t) + (zzero(a) ? 0 : a->used * sizeof(zahl_char_t)); + return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t)); } diff --git a/zahl/internals.h b/zahl/internals.h @@ -51,6 +51,7 @@ #define ZAHL_BITS_PER_CHAR 64 #define ZAHL_LB_BITS_PER_CHAR 6 #define ZAHL_CHAR_MAX UINT64_MAX +#define ZAHL_FLUFF 4 /* Note: These cannot be changed willy-nilly, some code depends * on them, be cause being flexible would just be too painful. */ @@ -113,62 +114,4 @@ extern struct zahl libzahl_tmp_mod[1]; void libzahl_realloc(struct zahl *, size_t); -ZAHL_INLINE void -libzahl_memcpy(register zahl_char_t *d, register const zahl_char_t *s, size_t n) -{ - size_t i; - if (n <= 4) { - if (n >= 1) - d[0] = s[0]; - if (n >= 2) - d[1] = s[1]; - if (n >= 3) - d[2] = s[2]; - if (n >= 4) - d[3] = s[3]; - } else { - for (i = 0; (i += 4) <= n;) { - d[i - 4] = s[i - 4]; - d[i - 3] = s[i - 3]; - d[i - 2] = s[i - 2]; - d[i - 1] = s[i - 1]; - } - if (i > n) { - i -= 4; - if (i < n) - d[i] = s[i], i++; - if (i < n) - d[i] = s[i], i++; - if (i < n) - d[i] = s[i], i++; - if (i < n) - d[i] = s[i]; - } - } -} - -ZAHL_INLINE void -libzahl_memset(register zahl_char_t *a, register zahl_char_t v, size_t n) -{ - size_t i; - if (n <= 4) { - if (n >= 1) - a[0] = v; - if (n >= 2) - a[1] = v; - if (n >= 3) - a[2] = v; - if (n >= 4) - a[3] = v; - } else { - for (i = 0; (i += 4) <= n;) { - a[i - 1] = v; - a[i - 2] = v; - a[i - 3] = v; - a[i - 4] = v; - } - if (i > n) - for (i -= 4; i < n; i++) - a[i] = v; - } -} +#include "memory.h" diff --git a/zahl/memory.h b/zahl/memory.h @@ -0,0 +1,133 @@ +/* See LICENSE file for copyright and license details. */ + +#define LIBZAHL_MEM_CASES \ + LIBZAHL_X(20); \ + LIBZAHL_X(19); \ + LIBZAHL_X(18); \ + LIBZAHL_X(17); \ + LIBZAHL_X(16); \ + LIBZAHL_X(15); \ + LIBZAHL_X(14); \ + LIBZAHL_X(13); \ + LIBZAHL_X(12); \ + LIBZAHL_X(11); \ + LIBZAHL_X(10); \ + LIBZAHL_X( 9); \ + LIBZAHL_X( 8); \ + LIBZAHL_X( 7); \ + LIBZAHL_X( 6); \ + LIBZAHL_X( 5); \ + LIBZAHL_X( 4); \ + LIBZAHL_X( 3); \ + LIBZAHL_X( 2); \ + LIBZAHL_X( 1); \ + case 0: break; + + +#if defined(LIBZAHL_ISA_MISSING_INDIRECT_JUMP) +# define LIBZAHL_SMALL_INPUT_BEGIN(n) +# define LIBZAHL_SMALL_INPUT_END +#else +# define LIBZAHL_SMALL_INPUT_BEGIN(n) switch (n) { LIBZAHL_MEM_CASES default: +# define LIBZAHL_SMALL_INPUT_END break; } +#endif + + +ZAHL_INLINE void +libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n) +{ + size_t i; +#define LIBZAHL_X(I) case I: d[I - 1] = s[I - 1]; + LIBZAHL_SMALL_INPUT_BEGIN(n); + for (i = 0; i < n; i += 4) { + d[i + 0] = s[i + 0]; + d[i + 1] = s[i + 1]; + d[i + 2] = s[i + 2]; + d[i + 3] = s[i + 3]; + } + LIBZAHL_SMALL_INPUT_END; +#undef LIBZAHL_X +} + + +ZAHL_INLINE void +libzahl_memset(register zahl_char_t *a, register zahl_char_t v, size_t n) +{ + size_t i; + for (i = 0; i < n; i += 4) { + a[i + 0] = v; + a[i + 1] = v; + a[i + 2] = v; + a[i + 3] = v; + } +} + +ZAHL_INLINE void +libzahl_memset_precise(register zahl_char_t *a, register zahl_char_t v, size_t n) +{ + size_t i; + if (n <= 4) { + if (n >= 1) + a[0] = v; + if (n >= 2) + a[1] = v; + if (n >= 3) + a[2] = v; + if (n >= 4) + a[3] = v; + } else { + for (i = 0; (i += 4) <= n;) { + a[i - 1] = v; + a[i - 2] = v; + a[i - 3] = v; + a[i - 4] = v; + } + if (i > n) + for (i -= 4; i < n; i++) + a[i] = v; + } +} + + +ZAHL_INLINE void +libzahl_memmovef(register zahl_char_t *d, register const zahl_char_t *s, size_t n) +{ + if (n && n < 4) { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + } else { + size_t i; + for (i = 0; i < n; i += 4) { + d[i + 0] = s[i + 0]; + d[i + 1] = s[i + 1]; + d[i + 2] = s[i + 2]; + d[i + 3] = s[i + 3]; + } + } +} + +ZAHL_INLINE void +libzahl_memmoveb(register zahl_char_t *d, register const zahl_char_t *s, size_t n) +{ + ssize_t i; +#define LIBZAHL_X(I) case I: d[I - 1] = s[I - 1]; + LIBZAHL_SMALL_INPUT_BEGIN(n); + for (i = ((ssize_t)n + 3) & ~3; (i -= 4) >= 0;) { + d[i + 3] = s[i + 3]; + d[i + 2] = s[i + 2]; + d[i + 1] = s[i + 1]; + d[i + 0] = s[i + 0]; + } + LIBZAHL_SMALL_INPUT_END; +#undef LIBZAHL_X +} + +ZAHL_INLINE void +libzahl_memmove(register zahl_char_t *d, register const zahl_char_t *s, size_t n) +{ + if (d < s) + libzahl_memmovef(d, s, n); + else + libzahl_memmoveb(d, s, n); +}