commit 40b860777616071997ec035783eeea402ffb1ae2
parent d0565fe373f559312be54b6bc8d74aa7fd34fe2c
Author: Mattias Andrée <maandree@kth.se>
Date: Tue, 3 May 2016 14:03:33 +0200
Optimise libzahl_memcpy and libzahl_memset
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat:
3 files changed, 53 insertions(+), 8 deletions(-)
diff --git a/STATUS b/STATUS
@@ -6,7 +6,7 @@ left column. Double-parenthesis means there may be a better way
to do it. Inside square-brackets, there are some comments on
multi-bit comparisons.
-zset .................... fastest [until ~750, then gmp, also tomsfastmath after ~2750]
+zset .................... fastest [always with gcc, unless ~250 with clang]
zseti ................... tomsfastmath is faster [always]
zsetu ................... tomsfastmath is faster [always]
zneg(a, b) .............. fastest [until ~300, then gmp]
diff --git a/TODO b/TODO
@@ -5,9 +5,10 @@ Add zsets_radix
Add zstr_radix
Test big endian
-Test always having used > 0 for zero
+Test always having .used > 0 for zero
Test negative/non-negative instead of sign
Test long .sign
+Test always having .chars % 4 == 0
Test optimisation of zmul:
bc = [(Hb * Hc) << (m2 << 1)]
diff --git a/zahl-internals.h b/zahl-internals.h
@@ -109,18 +109,62 @@ struct zahl {
void libzahl_realloc(struct zahl *, size_t);
-ZAHL_O2 ZAHL_INLINE void
+ZAHL_INLINE void
libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n)
{
size_t i;
- for (i = 0; i < n; i++)
- d[i] = s[i];
+ if (n <= 4) {
+ if (n >= 1)
+ d[0] = s[0];
+ if (n >= 2)
+ d[1] = s[1];
+ if (n >= 3)
+ d[2] = s[2];
+ if (n >= 4)
+ d[3] = s[3];
+ } else {
+ for (i = 0; (i += 4) <= n;) {
+ d[i - 1] = s[i - 1];
+ d[i - 2] = s[i - 2];
+ d[i - 3] = s[i - 3];
+ d[i - 4] = s[i - 4];
+ }
+ if (i > n) {
+ i -= 4;
+ if (i < n)
+ d[i] = s[i], i++;
+ if (i < n)
+ d[i] = s[i], i++;
+ if (i < n)
+ d[i] = s[i], i++;
+ if (i < n)
+ d[i] = s[i], i++;
+ }
+ }
}
-ZAHL_O2 ZAHL_INLINE void
+ZAHL_INLINE void
libzahl_memset(register zahl_char_t *a, register zahl_char_t v, size_t n)
{
size_t i;
- for (i = 0; i < n; i++)
- a[i] = v;
+ if (n <= 4) {
+ if (n >= 1)
+ a[0] = v;
+ if (n >= 2)
+ a[1] = v;
+ if (n >= 3)
+ a[2] = v;
+ if (n >= 4)
+ a[3] = v;
+ } else {
+ for (i = 0; (i += 4) <= n;) {
+ a[i - 1] = v;
+ a[i - 2] = v;
+ a[i - 3] = v;
+ a[i - 4] = v;
+ }
+ if (i > n)
+ for (i -= 4; i < n; i++)
+ a[i] = v;
+ }
}