commit fdc75a358e4c20116640c08f4e8ce7a09dc3cebd
parent 4e659044778dfd05a5f5d2b41611e7737a0eb825
Author: Mattias Andrée <maandree@kth.se>
Date: Sat, 7 May 2016 03:02:56 +0200
Optimisations
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat:
15 files changed, 190 insertions(+), 144 deletions(-)
diff --git a/Makefile b/Makefile
@@ -2,7 +2,8 @@ include config.mk
HDR_SEMIPUBLIC =\
zahl/inlines.h\
- zahl/internals.h
+ zahl/internals.h\
+ zahl/memory.h
HDR_PRIVATE =\
src/internals.h
diff --git a/src/allocator.c b/src/allocator.c
@@ -21,7 +21,7 @@ libzahl_realloc(z_t a, size_t need)
zfree(a);
a->chars = new;
} else {
- a->chars = realloc(a->chars, new_size * sizeof(zahl_char_t));
+ a->chars = realloc(a->chars, (new_size + ZAHL_FLUFF) * sizeof(zahl_char_t));
if (check(!a->chars))
libzahl_memfailure();
}
diff --git a/src/internals.h b/src/internals.h
@@ -4,7 +4,6 @@
#include <errno.h>
#include <stdlib.h>
#include <string.h>
-#include <unistd.h>
/* clang pretends to be GCC... */
#if defined(__GNUC__) && defined(__clang__)
@@ -105,7 +104,11 @@ extern void *libzahl_temp_allocation;
#define zpositive2(a, b) (zsignum(a) + zsignum(b) == 2)
#define zzero2(a, b) (!(zsignum(a) | zsignum(b)))
#define zmemcpy(d, s, n) libzahl_memcpy(d, s, n)
+#define zmemmove(d, s, n) libzahl_memmove(d, s, n)
+#define zmemmovef(d, s, n) libzahl_memmovef(d, s, n)
+#define zmemmoveb(d, s, n) libzahl_memmoveb(d, s, n)
#define zmemset(a, v, n) libzahl_memset(a, v, n)
+#define zmemset_precise(a, v, n) libzahl_memset_precise(a, v, n)
static inline int
zzero1(z_t a, z_t b)
@@ -113,11 +116,10 @@ zzero1(z_t a, z_t b)
return zzero(a) || zzero(b);
}
-O2 static inline void
+static inline void
zmemcpy_range(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t i, size_t n)
{
- for (; i < n; i++)
- d[i] = s[i];
+ zmemcpy(d + i, s + i, n - i);
}
static void
@@ -337,7 +339,7 @@ zfree_temp(z_t a)
/* } */
-#define ZMEM_2OP(a, b, c, n, OP) \
+#define ZMEM_2OP_PRECISE(a, b, c, n, OP) \
do { \
zahl_char_t *a__ = (a); \
const zahl_char_t *b__ = (b); \
@@ -365,66 +367,29 @@ zfree_temp(z_t a)
} \
} while (0)
-#define ZMEM_1OP(a, b, n, OP) \
- do { \
- zahl_char_t *a__ = (a); \
- const zahl_char_t *b__ = (b); \
- size_t i__, n__ = (n); \
- if (n__ <= 4) { \
- if (n__ >= 1) \
- a__[0] = OP(b__[0]); \
- if (n__ >= 2) \
- a__[1] = OP(b__[1]); \
- if (n__ >= 3) \
- a__[2] = OP(b__[2]); \
- if (n__ >= 4) \
- a__[3] = OP(b__[3]); \
- } else { \
- for (i__ = 0; (i__ += 4) < n__;) { \
- a__[i__ - 1] = OP(b__[i__ - 1]); \
- a__[i__ - 2] = OP(b__[i__ - 2]); \
- a__[i__ - 3] = OP(b__[i__ - 3]); \
- a__[i__ - 4] = OP(b__[i__ - 4]); \
- } \
- if (i__ > n__) \
- for (i__ -= 4; i__ < n__; i__++) \
- a__[i__] = OP(b__[i__]); \
- } \
+#define ZMEM_2OP(a, b, c, n, OP) \
+ do { \
+ zahl_char_t *a__ = (a); \
+ const zahl_char_t *b__ = (b); \
+ const zahl_char_t *c__ = (c); \
+ size_t i__, n__ = (n); \
+ for (i__ = 0; i__ < n__; i__ += 4) { \
+ a__[i__ + 0] = b__[i__ + 0] OP c__[i__ + 0]; \
+ a__[i__ + 1] = b__[i__ + 1] OP c__[i__ + 1]; \
+ a__[i__ + 2] = b__[i__ + 2] OP c__[i__ + 2]; \
+ a__[i__ + 3] = b__[i__ + 3] OP c__[i__ + 3]; \
+ } \
} while (0)
-static inline void
-zmemcpyb(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n_)
-{
- ssize_t i, n = (ssize_t)n_;
- switch (n & 3) {
- case 3:
- d[n - 1] = s[n - 1];
- d[n - 2] = s[n - 2];
- d[n - 3] = s[n - 3];
- break;
- case 2:
- d[n - 1] = s[n - 1];
- d[n - 2] = s[n - 2];
- break;
- case 1:
- d[n - 1] = s[n - 1];
- break;
- default:
- break;
- }
- for (i = n & ~3; (i -= 4) >= 0;) {
- d[i + 3] = s[i + 3];
- d[i + 2] = s[i + 2];
- d[i + 1] = s[i + 1];
- d[i + 0] = s[i + 0];
- }
-}
-
-static inline void
-zmemmove(register zahl_char_t *d, register const zahl_char_t *s, size_t n)
-{
- if (d < s)
- zmemcpy(d, s, n);
- else
- zmemcpyb(d, s, n);
-}
+#define ZMEM_1OP(a, b, n, OP) \
+ do { \
+ zahl_char_t *a__ = (a); \
+ const zahl_char_t *b__ = (b); \
+ size_t i__, n__ = (n); \
+ for (i__ = 0; i__ < n__; i__ += 4) { \
+ a__[i__ + 0] = OP(b__[i__ + 0]); \
+ a__[i__ + 1] = OP(b__[i__ + 1]); \
+ a__[i__ + 2] = OP(b__[i__ + 2]); \
+ a__[i__ + 3] = OP(b__[i__ + 3]); \
+ } \
+ } while (0)
diff --git a/src/zadd.c b/src/zadd.c
@@ -2,7 +2,7 @@
#include "internals.h"
-#if defined(__x86_64__)
+#if defined(__x86_64__) && !defined(ZAHL_NO_ASM)
# define ASM3(code) \
__asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc), [c]"+r"(cc))
diff --git a/src/zload.c b/src/zload.c
@@ -6,11 +6,11 @@ size_t
zload(z_t a, const void *buffer)
{
const char *buf = buffer;
- a->sign = *((const int *)buf), buf += sizeof(int);
+ a->sign = *((const long *)buf), buf += sizeof(long);
a->used = *((const size_t *)buf), buf += sizeof(size_t);
if (likely(a->sign)) {
ENSURE_SIZE(a, a->used);
zmemcpy(a->chars, (const zahl_char_t *)buf, a->used);
}
- return sizeof(int) + sizeof(size_t) + (zzero(a) ? 0 : a->used * sizeof(zahl_char_t));
+ return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t));
}
diff --git a/src/zlsh.c b/src/zlsh.c
@@ -19,11 +19,11 @@ zlsh(z_t a, z_t b, size_t bits)
ENSURE_SIZE(a, b->used + chars + 1);
if (likely(a == b)) {
- zmemcpyb(a->chars + chars, b->chars, b->used);
+ zmemmoveb(a->chars + chars, b->chars, b->used);
} else {
zmemcpy(a->chars + chars, b->chars, b->used);
}
- zmemset(a->chars, 0, chars);
+ zmemset_precise(a->chars, 0, chars);
a->used = b->used + chars;
if (likely(bits)) { /* This if statement is very important in C. */
diff --git a/src/zor.c b/src/zor.c
@@ -19,11 +19,11 @@ zor(z_t a, z_t b, z_t c)
ENSURE_SIZE(a, m);
if (a == b) {
- ZMEM_2OP(a->chars, a->chars, c->chars, n, |);
+ ZMEM_2OP_PRECISE(a->chars, a->chars, c->chars, n, |);
if (a->used < c->used)
zmemcpy_range(a->chars, c->chars, n, m);
} else if (unlikely(a == c)) {
- ZMEM_2OP(a->chars, a->chars, b->chars, n, |);
+ ZMEM_2OP_PRECISE(a->chars, a->chars, b->chars, n, |);
if (a->used < b->used)
zmemcpy_range(a->chars, b->chars, n, m);
} else if (m == b->used) {
diff --git a/src/zptest.c b/src/zptest.c
@@ -47,8 +47,7 @@ zptest(z_t witness, z_t n, int t)
continue;
for (i = 1; i < r; i++) {
- zsqr(x, x);
- zmod(x, x, n);
+ zmodsqr(x, x, n);
if (!zcmp(x, libzahl_const_1)) {
if (witness)
zswap(witness, a);
diff --git a/src/zsetup.c b/src/zsetup.c
@@ -21,7 +21,7 @@ struct zahl **libzahl_temp_stack_end;
void *libzahl_temp_allocation = 0;
#define X(i, x, f, v) 1 +
-static zahl_char_t constant_chars[LIST_CONSTS 0];
+static zahl_char_t constant_chars[LIST_CONSTS ZAHL_FLUFF];
#undef X
diff --git a/src/zsqr.c b/src/zsqr.c
@@ -20,7 +20,7 @@ zsqr_ll(z_t a, z_t b)
#define z2 a
z_t z0, z1, high, low;
- zahl_char_t auxchars[3];
+ zahl_char_t auxchars[3 * ZAHL_FLUFF];
size_t bits;
bits = zbits(b);
@@ -38,7 +38,7 @@ zsqr_ll(z_t a, z_t b)
* which require constant auxiliary memory. */
if (bits < BITS_PER_CHAR) {
low->chars = auxchars;
- high->chars = auxchars + 1;
+ high->chars = auxchars + ZAHL_FLUFF;
zsplit_unsigned_fast_small_auto(high, low, b, bits);
} else {
bits = TRUNCATE_TO_CHAR(bits);
diff --git a/src/zxor.c b/src/zxor.c
@@ -26,11 +26,11 @@ zxor(z_t a, z_t b, z_t c)
ENSURE_SIZE(a, m);
if (a == b) {
- ZMEM_2OP(a->chars, a->chars, cc, n, ^);
+ ZMEM_2OP_PRECISE(a->chars, a->chars, cc, n, ^);
if (a->used < cn)
zmemcpy_range(a->chars, cc, n, m);
} else if (unlikely(a == c)) {
- ZMEM_2OP(a->chars, b->chars, cc, n, ^);
+ ZMEM_2OP_PRECISE(a->chars, a->chars, bc, n, ^);
if (a->used < bn)
zmemcpy_range(a->chars, bc, n, m);
} else if (m == bn) {
diff --git a/zahl.h b/zahl.h
@@ -11,6 +11,7 @@
#include <setjmp.h>
#include <stddef.h>
#include <stdint.h>
+#include <unistd.h>
#include "zahl/internals.h"
diff --git a/zahl/inlines.h b/zahl/inlines.h
@@ -259,12 +259,16 @@ zsave(z_t a, void *buffer)
{
if (ZAHL_LIKELY(buffer)) {
char *buf = buffer;
- *((int *)buf) = a->sign, buf += sizeof(int);
+ *((long *)buf) = a->sign, buf += sizeof(long); /* Use `long` for alignment. */
*((size_t *)buf) = a->used, buf += sizeof(size_t);
- if (ZAHL_LIKELY(!zzero(a)))
+ if (ZAHL_LIKELY(!zzero(a))) {
+ a->chars[a->used + 2] = 0;
+ a->chars[a->used + 1] = 0;
+ a->chars[a->used + 0] = 0;
libzahl_memcpy((zahl_char_t *)buf, a->chars, a->used);
+ }
}
- return sizeof(int) + sizeof(size_t) + (zzero(a) ? 0 : a->used * sizeof(zahl_char_t));
+ return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t));
}
diff --git a/zahl/internals.h b/zahl/internals.h
@@ -51,6 +51,7 @@
#define ZAHL_BITS_PER_CHAR 64
#define ZAHL_LB_BITS_PER_CHAR 6
#define ZAHL_CHAR_MAX UINT64_MAX
+#define ZAHL_FLUFF 4
/* Note: These cannot be changed willy-nilly, some code depends
* on them, be cause being flexible would just be too painful. */
@@ -113,62 +114,4 @@ extern struct zahl libzahl_tmp_mod[1];
void libzahl_realloc(struct zahl *, size_t);
-ZAHL_INLINE void
-libzahl_memcpy(register zahl_char_t *d, register const zahl_char_t *s, size_t n)
-{
- size_t i;
- if (n <= 4) {
- if (n >= 1)
- d[0] = s[0];
- if (n >= 2)
- d[1] = s[1];
- if (n >= 3)
- d[2] = s[2];
- if (n >= 4)
- d[3] = s[3];
- } else {
- for (i = 0; (i += 4) <= n;) {
- d[i - 4] = s[i - 4];
- d[i - 3] = s[i - 3];
- d[i - 2] = s[i - 2];
- d[i - 1] = s[i - 1];
- }
- if (i > n) {
- i -= 4;
- if (i < n)
- d[i] = s[i], i++;
- if (i < n)
- d[i] = s[i], i++;
- if (i < n)
- d[i] = s[i], i++;
- if (i < n)
- d[i] = s[i];
- }
- }
-}
-
-ZAHL_INLINE void
-libzahl_memset(register zahl_char_t *a, register zahl_char_t v, size_t n)
-{
- size_t i;
- if (n <= 4) {
- if (n >= 1)
- a[0] = v;
- if (n >= 2)
- a[1] = v;
- if (n >= 3)
- a[2] = v;
- if (n >= 4)
- a[3] = v;
- } else {
- for (i = 0; (i += 4) <= n;) {
- a[i - 1] = v;
- a[i - 2] = v;
- a[i - 3] = v;
- a[i - 4] = v;
- }
- if (i > n)
- for (i -= 4; i < n; i++)
- a[i] = v;
- }
-}
+#include "memory.h"
diff --git a/zahl/memory.h b/zahl/memory.h
@@ -0,0 +1,133 @@
+/* See LICENSE file for copyright and license details. */
+
+#define LIBZAHL_MEM_CASES \
+ LIBZAHL_X(20); \
+ LIBZAHL_X(19); \
+ LIBZAHL_X(18); \
+ LIBZAHL_X(17); \
+ LIBZAHL_X(16); \
+ LIBZAHL_X(15); \
+ LIBZAHL_X(14); \
+ LIBZAHL_X(13); \
+ LIBZAHL_X(12); \
+ LIBZAHL_X(11); \
+ LIBZAHL_X(10); \
+ LIBZAHL_X( 9); \
+ LIBZAHL_X( 8); \
+ LIBZAHL_X( 7); \
+ LIBZAHL_X( 6); \
+ LIBZAHL_X( 5); \
+ LIBZAHL_X( 4); \
+ LIBZAHL_X( 3); \
+ LIBZAHL_X( 2); \
+ LIBZAHL_X( 1); \
+ case 0: break;
+
+
+#if defined(LIBZAHL_ISA_MISSING_INDIRECT_JUMP)
+# define LIBZAHL_SMALL_INPUT_BEGIN(n)
+# define LIBZAHL_SMALL_INPUT_END
+#else
+# define LIBZAHL_SMALL_INPUT_BEGIN(n) switch (n) { LIBZAHL_MEM_CASES default:
+# define LIBZAHL_SMALL_INPUT_END break; }
+#endif
+
+
+ZAHL_INLINE void
+libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n)
+{
+ size_t i;
+#define LIBZAHL_X(I) case I: d[I - 1] = s[I - 1];
+ LIBZAHL_SMALL_INPUT_BEGIN(n);
+ for (i = 0; i < n; i += 4) {
+ d[i + 0] = s[i + 0];
+ d[i + 1] = s[i + 1];
+ d[i + 2] = s[i + 2];
+ d[i + 3] = s[i + 3];
+ }
+ LIBZAHL_SMALL_INPUT_END;
+#undef LIBZAHL_X
+}
+
+
+ZAHL_INLINE void
+libzahl_memset(register zahl_char_t *a, register zahl_char_t v, size_t n)
+{
+ size_t i;
+ for (i = 0; i < n; i += 4) {
+ a[i + 0] = v;
+ a[i + 1] = v;
+ a[i + 2] = v;
+ a[i + 3] = v;
+ }
+}
+
+ZAHL_INLINE void
+libzahl_memset_precise(register zahl_char_t *a, register zahl_char_t v, size_t n)
+{
+ size_t i;
+ if (n <= 4) {
+ if (n >= 1)
+ a[0] = v;
+ if (n >= 2)
+ a[1] = v;
+ if (n >= 3)
+ a[2] = v;
+ if (n >= 4)
+ a[3] = v;
+ } else {
+ for (i = 0; (i += 4) <= n;) {
+ a[i - 1] = v;
+ a[i - 2] = v;
+ a[i - 3] = v;
+ a[i - 4] = v;
+ }
+ if (i > n)
+ for (i -= 4; i < n; i++)
+ a[i] = v;
+ }
+}
+
+
+ZAHL_INLINE void
+libzahl_memmovef(register zahl_char_t *d, register const zahl_char_t *s, size_t n)
+{
+ if (n && n < 4) {
+ d[0] = s[0];
+ d[1] = s[1];
+ d[2] = s[2];
+ } else {
+ size_t i;
+ for (i = 0; i < n; i += 4) {
+ d[i + 0] = s[i + 0];
+ d[i + 1] = s[i + 1];
+ d[i + 2] = s[i + 2];
+ d[i + 3] = s[i + 3];
+ }
+ }
+}
+
+ZAHL_INLINE void
+libzahl_memmoveb(register zahl_char_t *d, register const zahl_char_t *s, size_t n)
+{
+ ssize_t i;
+#define LIBZAHL_X(I) case I: d[I - 1] = s[I - 1];
+ LIBZAHL_SMALL_INPUT_BEGIN(n);
+ for (i = ((ssize_t)n + 3) & ~3; (i -= 4) >= 0;) {
+ d[i + 3] = s[i + 3];
+ d[i + 2] = s[i + 2];
+ d[i + 1] = s[i + 1];
+ d[i + 0] = s[i + 0];
+ }
+ LIBZAHL_SMALL_INPUT_END;
+#undef LIBZAHL_X
+}
+
+ZAHL_INLINE void
+libzahl_memmove(register zahl_char_t *d, register const zahl_char_t *s, size_t n)
+{
+ if (d < s)
+ libzahl_memmovef(d, s, n);
+ else
+ libzahl_memmoveb(d, s, n);
+}