commit f3b969b6991f154a1fde1ea6b4488320ed0b486f
Author: Mattias Andrée <maandree_AT_kth.se>
AuthorDate: Tue Mar 15 11:40:46 2016 +0100
Commit: Mattias Andrée <maandree_AT_kth.se>
CommitDate: Tue Mar 15 11:40:46 2016 +0100
Optimise zsetup, zgcd, zmul, and zsqr and add -flto
Signed-off-by: Mattias Andrée <maandree_AT_kth.se>
diff --git a/TODO b/TODO
index 23edfbf..603fabb 100644
--- a/TODO
+++ b/TODO
_AT_@ -8,3 +8,10 @@ Add zranddist value based on % for fitness to bound
Test big endian
Test always having used > 0 for zero
Test negative/non-negative instead of sign
+
+Test optimisation of zmul:
+ bc = [(Hb * Hc) << (m2 << 1)]
+ + [(Hb * Hc) << m2]
+ - [(Hb - Lb)(Hc - Lc) << m2]
+ + [(Lb * Lc) << m2]
+ + (Lb * Lc)
diff --git a/bench/benchmark.c b/bench/benchmark.c
index 7b987e2..d756db8 100644
--- a/bench/benchmark.c
+++ b/bench/benchmark.c
_AT_@ -85,14 +85,18 @@ main(int argc, char *argv[])
BENCHMARK(zlsh(c, a, 76), 1);
BENCHMARK(zrsh(c, a, 76), 1);
BENCHMARK(ztrunc(c, a, 76), 1);
+ BENCHMARK(ztrunc(c, c, 76), 1);
BENCHMARK(zsplit(c, d, a, 76), 1);
BENCHMARK(zcmpmag(a, b), 1);
BENCHMARK(zcmp(a, b), 1);
BENCHMARK(zcmpi(a, 1000000000LL), 1);
BENCHMARK(zcmpu(a, 1000000000ULL), 1);
BENCHMARK(zbset(c, a, 76, 1), 1);
+ BENCHMARK(zbset(a, a, 76, 1), 1);
BENCHMARK(zbset(c, a, 76, 0), 1);
+ BENCHMARK(zbset(c, c, 76, 0), 1);
BENCHMARK(zbset(c, a, 76, -1), 1);
+ BENCHMARK(zbset(a, a, 76, -1), 1);
BENCHMARK(zbtest(a, 76), 1);
BENCHMARK(zgcd(c, a, b), 0);
BENCHMARK(zmul(c, a, b), 0);
diff --git a/config.mk b/config.mk
index cad12d8..1e43a43 100644
--- a/config.mk
+++ b/config.mk
_AT_@ -14,5 +14,5 @@ RANLIB = ranlib
# you have to add -DSECURE_RANDOM_PATHNAME=... to CPPFLAGS.
CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700
-CFLAGS = -std=c99 -O3 -Wall -pedantic
+CFLAGS = -std=c99 -flto -O3 -Wall -pedantic
LDFLAGS = -s
diff --git a/src/internals.h b/src/internals.h
index c859ce1..456a2df 100644
--- a/src/internals.h
+++ b/src/internals.h
_AT_@ -45,29 +45,29 @@
#endif
#define LIST_TEMPS\
- X(libzahl_tmp_cmp)\
- X(libzahl_tmp_str_num)\
- X(libzahl_tmp_str_mag)\
- X(libzahl_tmp_str_div)\
- X(libzahl_tmp_str_rem)\
- X(libzahl_tmp_gcd_u)\
- X(libzahl_tmp_gcd_v)\
- X(libzahl_tmp_sub)\
- X(libzahl_tmp_modmul)\
- X(libzahl_tmp_div)\
- X(libzahl_tmp_mod)\
- X(libzahl_tmp_pow_b)\
- X(libzahl_tmp_pow_c)\
- X(libzahl_tmp_pow_d)\
- X(libzahl_tmp_modsqr)\
- X(libzahl_tmp_divmod_a)\
- X(libzahl_tmp_divmod_b)\
- X(libzahl_tmp_divmod_d)\
- X(libzahl_tmp_ptest_x)\
- X(libzahl_tmp_ptest_a)\
- X(libzahl_tmp_ptest_d)\
- X(libzahl_tmp_ptest_n1)\
- X(libzahl_tmp_ptest_n4)
+ X(libzahl_tmp_cmp, 1)\
+ X(libzahl_tmp_str_num, 0)\
+ X(libzahl_tmp_str_mag, 0)\
+ X(libzahl_tmp_str_div, 0)\
+ X(libzahl_tmp_str_rem, 0)\
+ X(libzahl_tmp_gcd_u, 0)\
+ X(libzahl_tmp_gcd_v, 0)\
+ X(libzahl_tmp_sub, 0)\
+ X(libzahl_tmp_modmul, 0)\
+ X(libzahl_tmp_div, 0)\
+ X(libzahl_tmp_mod, 0)\
+ X(libzahl_tmp_pow_b, 0)\
+ X(libzahl_tmp_pow_c, 0)\
+ X(libzahl_tmp_pow_d, 0)\
+ X(libzahl_tmp_modsqr, 0)\
+ X(libzahl_tmp_divmod_a, 0)\
+ X(libzahl_tmp_divmod_b, 0)\
+ X(libzahl_tmp_divmod_d, 0)\
+ X(libzahl_tmp_ptest_x, 0)\
+ X(libzahl_tmp_ptest_a, 0)\
+ X(libzahl_tmp_ptest_d, 0)\
+ X(libzahl_tmp_ptest_n1, 0)\
+ X(libzahl_tmp_ptest_n4, 0)
#define LIST_CONSTS\
X(libzahl_const_1e19, zsetu, 10000000000000000000ULL) /* The largest power of 10 < 2⁶⁴. */\
_AT_@ -75,7 +75,7 @@
X(libzahl_const_2, zsetu, 2)\
X(libzahl_const_4, zsetu, 4)
-#define X(x) extern z_t x;
+#define X(x, s) extern z_t x;
LIST_TEMPS
#undef X
#define X(x, f, v) extern z_t x;
_AT_@ -185,3 +185,50 @@ libzahl_add_overflow(zahl_char_t *rp, zahl_char_t a, zahl_char_t b)
return carry;
}
#endif
+
+static inline void
+zrsh_taint(z_t a, size_t bits)
+{
+ size_t i, chars, cbits;
+
+ if (unlikely(!bits))
+ return;
+ if (unlikely(zzero(a)))
+ return;
+
+ chars = FLOOR_BITS_TO_CHARS(bits);
+
+ if (unlikely(chars >= a->used || zbits(a) <= bits)) {
+ SET_SIGNUM(a, 0);
+ return;
+ }
+
+ bits = BITS_IN_LAST_CHAR(bits);
+ cbits = BITS_PER_CHAR - bits;
+
+ if (likely(chars)) {
+ a->used -= chars;
+ a->chars += chars;
+ }
+
+ if (unlikely(bits)) { /* This if statement is very important in C. */
+ a->chars[0] >>= bits;
+ for (i = 1; i < a->used; i++) {
+ a->chars[i - 1] |= a->chars[i] << cbits;
+ a->chars[i] >>= bits;
+ }
+ TRIM_NONZERO(a);
+ }
+}
+
+static inline void
+zswap_tainted_unsigned(z_t a, z_t b)
+{
+ z_t t;
+ t->used = b->used;
+ b->used = a->used;
+ a->used = t->used;
+ t->chars = b->chars;
+ b->chars = a->chars;
+ a->chars = t->chars;
+}
diff --git a/src/zadd.c b/src/zadd.c
index 557ec6f..b730b81 100644
--- a/src/zadd.c
+++ b/src/zadd.c
_AT_@ -72,6 +72,33 @@ zadd_unsigned(z_t a, z_t b, z_t c)
}
void
+zadd_unsigned_assign(z_t a, z_t b)
+{
+ size_t size, n;
+
+ if (unlikely(zzero(a))) {
+ zabs(a, b);
+ return;
+ } else if (unlikely(zzero(b))) {
+ return;
+ }
+
+ size = MAX(a->used, b->used);
+ n = a->used + b->used - size;
+
+ ENSURE_SIZE(a, size + 1);
+ a->chars[size] = 0;
+
+ if (a->used < b->used) {
+ n = b->used;
+ zmemset(a->chars + a->used, 0, n - a->used);
+ }
+ zadd_impl(a, b, n);
+
+ SET_SIGNUM(a, 1);
+}
+
+void
zadd(z_t a, z_t b, z_t c)
{
if (unlikely(zzero(b))) {
diff --git a/src/zgcd.c b/src/zgcd.c
index 7c6f2bc..502a779 100644
--- a/src/zgcd.c
+++ b/src/zgcd.c
_AT_@ -12,14 +12,11 @@ zgcd(z_t a, z_t b, z_t c)
* Binary GCD algorithm.
*/
- size_t shifts = 0, i = 0, min;
- zahl_char_t uv, bit;
- int neg;
+ size_t shifts;
+ zahl_char_t *u_orig, *v_orig;
+ size_t u_lsb, v_lsb;
+ int neg, cmpmag;
- if (unlikely(!zcmp(b, c))) {
- SET(a, b);
- return;
- }
if (unlikely(zzero(b))) {
SET(a, c);
return;
_AT_@ -29,37 +26,30 @@ zgcd(z_t a, z_t b, z_t c)
return;
}
- zabs(u, b);
- zabs(v, c);
neg = znegative2(b, c);
- min = MIN(u->used, v->used);
- for (; i < min; i++) {
- uv = u->chars[i] | v->chars[i];
- for (bit = 1; bit; bit <<= 1, shifts++)
- if (uv & bit)
- goto loop_done;
+ u_lsb = zlsb(b);
+ v_lsb = zlsb(c);
+ shifts = MIN(u_lsb, v_lsb);
+ zrsh(u, b, u_lsb);
+ zrsh(v, c, v_lsb);
+
+ u_orig = u->chars;
+ v_orig = v->chars;
+
+ for (;;) {
+ if (likely((cmpmag = zcmpmag(u, v)) >= 0)) {
+ if (unlikely(cmpmag == 0))
+ break;
+ zswap_tainted_unsigned(u, v);
+ }
+ zsub_positive_assign(v, u);
+ zrsh_taint(v, zlsb(v));
}
- for (; i < u->used; i++)
- for (bit = 1; bit; bit <<= 1, shifts++)
- if (u->chars[i] & bit)
- goto loop_done;
- for (; i < v->used; i++)
- for (bit = 1; bit; bit <<= 1, shifts++)
- if (v->chars[i] & bit)
- goto loop_done;
-loop_done:
- zrsh(u, u, shifts);
- zrsh(v, v, shifts);
-
- zrsh(u, u, zlsb(u));
- do {
- zrsh(v, v, zlsb(v));
- if (zcmpmag(u, v) > 0) /* Both are non-negative. */
- zswap(u, v);
- zsub_unsigned(v, v, u);
- } while (!zzero(v));
zlsh(a, u, shifts);
SET_SIGNUM(a, neg ? -1 : 1);
+
+ u->chars = u_orig;
+ v->chars = v_orig;
}
diff --git a/src/zmul.c b/src/zmul.c
index ae02844..ab41213 100644
--- a/src/zmul.c
+++ b/src/zmul.c
_AT_@ -57,39 +57,22 @@ zmul(z_t a, z_t b, z_t c)
zsplit(b_high, b_low, b, m2);
zsplit(c_high, c_low, c, m2);
-#if 1
+
zmul(z0, b_low, c_low);
zmul(z2, b_high, c_high);
- zadd(b_low, b_low, b_high);
- zadd(c_low, c_low, c_high);
+ zadd_unsigned_assign(b_low, b_high);
+ zadd_unsigned_assign(c_low, c_high);
zmul(z1, b_low, c_low);
- zsub(z1, z1, z0);
- zsub(z1, z1, z2);
+ zsub_nonnegative_assign(z1, z0);
+ zsub_nonnegative_assign(z1, z2);
zlsh(z1, z1, m2);
m2 <<= 1;
- zlsh(z2, z2, m2);
-
- zadd(a, z2, z1);
- zadd(a, a, z0);
-#else
- zmul(z0, b_low, c_low);
- zmul(z2, b_high, c_high);
- zsub(b_low, b_high, b_low);
- zsub(c_low, c_high, c_low);
- zmul(z1, b_low, c_low);
-
- zlsh(z0, z0, m2 + 1);
- zlsh(z1, z1, m2);
zlsh(a, z2, m2);
- m2 <<= 1;
- zlsh(z2, z2, m2);
- zadd(z2, z2, a);
+ zadd_unsigned_assign(a, z1);
+ zadd_unsigned_assign(a, z0);
- zsub(a, z2, z1);
- zadd(a, a, z0);
-#endif
zfree(z0);
zfree(z1);
diff --git a/src/zsetup.c b/src/zsetup.c
index 921e509..10fc5f5 100644
--- a/src/zsetup.c
+++ b/src/zsetup.c
_AT_@ -1,7 +1,7 @@
/* See LICENSE file for copyright and license details. */
#include "internals.h"
-#define X(x) z_t x;
+#define X(x, s) z_t x;
LIST_TEMPS
#undef X
#define X(x, f, v) z_t x;
_AT_@ -30,8 +30,8 @@ zsetup(jmp_buf env)
memset(libzahl_pool_n, 0, sizeof(libzahl_pool_n));
memset(libzahl_pool_alloc, 0, sizeof(libzahl_pool_alloc));
-#define X(x)\
- zinit(x), zsetu(x, 1);
+#define X(x, s)\
+ zinit(x); if (s) zsetu(x, 1);
LIST_TEMPS;
#undef X
#define X(x, f, v)\
diff --git a/src/zsqr.c b/src/zsqr.c
index bd03128..68480ba 100644
--- a/src/zsqr.c
+++ b/src/zsqr.c
_AT_@ -42,32 +42,17 @@ zsqr(z_t a, z_t b)
zsplit(high, low, b, m2);
-#if 1
+
zsqr(z0, low);
zsqr(z2, high);
zmul(z1, low, high);
zlsh(z1, z1, m2 + 1);
m2 <<= 1;
- zlsh(z2, z2, m2);
-
- zadd(a, z2, z1);
- zadd(a, a, z0);
-#else
- zsqr(z0, low);
- zsqr(z2, high);
- zmul(z1, low, low);
-
- zlsh(z0, z0, m2 + 1);
- zlsh(z1, z1, m2 + 1);
zlsh(a, z2, m2);
- m2 <<= 1;
- zlsh(z2, z2, m2);
- zadd(z2, z2, a);
+ zadd_unsigned_assign(a, z1);
+ zadd_unsigned_assign(a, z0);
- zsub(a, z2, z1);
- zadd(a, a, z0);
-#endif
zfree(z0);
zfree(z1);
diff --git a/src/zsub.c b/src/zsub.c
index b3f12f2..259526f 100644
--- a/src/zsub.c
+++ b/src/zsub.c
_AT_@ -46,7 +46,7 @@ libzahl_zsub_unsigned(z_t a, z_t b, z_t c)
SET_SIGNUM(a, 0);
return;
}
- n = MIN(b->used, c->used);
+ n = b->used;
if (a == b) {
zset(libzahl_tmp_sub, b);
SET(a, c);
_AT_@ -56,7 +56,7 @@ libzahl_zsub_unsigned(z_t a, z_t b, z_t c)
zsub_impl(a, b, n);
}
} else {
- n = MIN(b->used, c->used);
+ n = c->used;
if (unlikely(a == c)) {
zset(libzahl_tmp_sub, c);
SET(a, b);
_AT_@ -77,6 +77,24 @@ zsub_unsigned(z_t a, z_t b, z_t c)
}
void
+zsub_nonnegative_assign(z_t a, z_t b)
+{
+ if (unlikely(zzero(b))) {
+ zabs(a, a);
+ } else if (unlikely(!zcmpmag(a, b))) {
+ SET_SIGNUM(a, 0);
+ } else {
+ zsub_impl(a, b, b->used);
+ }
+}
+
+void
+zsub_positive_assign(z_t a, z_t b)
+{
+ zsub_impl(a, b, b->used);
+}
+
+void
zsub(z_t a, z_t b, z_t c)
{
if (unlikely(zzero(b))) {
diff --git a/src/zunsetup.c b/src/zunsetup.c
index 9926354..ba84dd7 100644
--- a/src/zunsetup.c
+++ b/src/zunsetup.c
_AT_@ -8,7 +8,7 @@ zunsetup(void)
size_t i;
if (libzahl_set_up) {
libzahl_set_up = 0;
-#define X(x)\
+#define X(x, s)\
free(x->chars);
LIST_TEMPS;
#undef X
diff --git a/zahl.h b/zahl.h
index e700dc7..cd41b7b 100644
--- a/zahl.h
+++ b/zahl.h
_AT_@ -119,6 +119,9 @@ void zmodpowu(z_t, z_t, unsigned long long int, z_t);
/* These are used internally and may be removed in a future version. */
void zadd_unsigned(z_t, z_t, z_t); /* a := |b| + |c| */
void zsub_unsigned(z_t, z_t, z_t); /* a := |b| - |c| */
+void zadd_unsigned_assign(z_t, z_t); /* a := |a| + |b| */
+void zsub_nonnegative_assign(z_t, z_t); /* a := a - b, assuming a ≥ b ≥ 0 */
+void zsub_positive_assign(z_t, z_t); /* a := a - b, assuming a > b > 0 */
/* Bitwise operations. */
Received on Tue Mar 15 2016 - 22:38:22 CET
This archive was generated by hypermail 2.3.0
: Tue Mar 15 2016 - 22:48:14 CET