(wrong string) ée

From: <git_AT_suckless.org>
Date: Wed, 16 Mar 2016 14:30:52 +0100 (CET)

commit 3e3b44d087ab616089402129b2bc4c4831c6b33a
Author: Mattias Andrée <maandree_AT_kth.se>
AuthorDate: Wed Mar 16 14:30:29 2016 +0100
Commit: Mattias Andrée <maandree_AT_kth.se>
CommitDate: Wed Mar 16 14:30:29 2016 +0100

    Optimise zsqr, zmul, zstr, zdivmod, zpow, and zpowu
    
    Signed-off-by: Mattias Andrée <maandree_AT_kth.se>

diff --git a/TODO b/TODO
index 603fabb..5c69340 100644
--- a/TODO
+++ b/TODO
_AT_@ -15,3 +15,7 @@ Test optimisation of zmul:
      - [(Hb - Lb)(Hc - Lc) << m2]
      + [(Lb * Lc) << m2]
      + (Lb * Lc)
+
+Would zmul be faster if we split only one of the
+factors until they are both approximately the same
+size?
diff --git a/src/internals.h b/src/internals.h
index a9d9af5..ed5d7e9 100644
--- a/src/internals.h
+++ b/src/internals.h
_AT_@ -234,25 +234,28 @@ zswap_tainted_unsigned(z_t a, z_t b)
 }
 
 static inline void
-zsplit_fast_large_taint(z_t high, z_t low, z_t a, size_t n)
+zsplit_unsigned_fast_large_taint(z_t high, z_t low, z_t a, size_t n)
 {
         n >>= LB_BITS_PER_CHAR;
- high->sign = a->sign;
+ high->sign = 1;
         high->used = a->used - n;
         high->chars = a->chars + n;
- low->sign = a->sign;
+#if 0
+ TRIM_AND_ZERO(high);
+#endif
+ low->sign = 1;
         low->used = n;
         low->chars = a->chars;
         TRIM_AND_ZERO(low);
 }
 
 static inline void
-zsplit_fast_small_tainted(z_t high, z_t low, z_t a, size_t n)
+zsplit_unsigned_fast_small_tainted(z_t high, z_t low, z_t a, size_t n)
 {
         zahl_char_t mask = 1;
         mask = (mask << n) - 1;
 
- high->sign = a->sign;
+ high->sign = 1;
         high->used = 1;
         high->chars[0] = a->chars[0] >> n;
         if (a->used == 2) {
_AT_@ -261,10 +264,12 @@ zsplit_fast_small_tainted(z_t high, z_t low, z_t a, size_t n)
                 n = BITS_PER_CHAR - n;
                 high->chars[0] |= (a->chars[1] & mask) << n;
         }
+#if 0
         if (unlikely(!high->chars[high->used - 1]))
                 high->sign = 0;
+#endif
 
- low->sign = a->sign;
+ low->sign = 1;
         low->used = 1;
         low->chars[0] = a->chars[0] & mask;
         if (unlikely(!low->chars[0]))
diff --git a/src/zdivmod.c b/src/zdivmod.c
index 55d0e06..d907450 100644
--- a/src/zdivmod.c
+++ b/src/zdivmod.c
_AT_@ -1,44 +1,17 @@
 /* See LICENSE file for copyright and license details. */
 #include "internals.h"
 
-#define ta libzahl_tmp_divmod_a
-#define tb libzahl_tmp_divmod_b
-#define td libzahl_tmp_divmod_d
-#define tds libzahl_tmp_divmod_ds
+#define ta libzahl_tmp_divmod_a
+#define tb libzahl_tmp_divmod_b
+#define td libzahl_tmp_divmod_d
+#define tds_proper libzahl_tmp_divmod_ds
 
 
-void
-zdivmod(z_t a, z_t b, z_t c, z_t d)
+static inline void
+zdivmod_impl(z_t a, z_t b, z_t c, z_t d)
 {
- size_t c_bits, d_bits, bit;
- int sign, cmpmag;
-
- sign = zsignum(c) * zsignum(d);
-
- if (unlikely(!sign)) {
- if (zzero(c)) {
- if (zzero(d)) {
- libzahl_failure(-ZERROR_0_DIV_0);
- } else {
- SET_SIGNUM(a, 0);
- SET_SIGNUM(b, 0);
- }
- } else {
- libzahl_failure(-ZERROR_DIV_0);
- }
- return;
- } else if (unlikely((cmpmag = zcmpmag(c, d)) <= 0)) {
- if (cmpmag == 0) {
- zseti(a, sign);
- SET_SIGNUM(b, 0);
- return;
- } else {
- SET(b, c);
- }
- SET_SIGNUM(b, 1);
- SET_SIGNUM(a, 0);
- return;
- }
+ size_t c_bits, d_bits, bit, i;
+ static z_t tds[BITS_PER_CHAR];
 
         c_bits = zbits(c);
         d_bits = zbits(d);
_AT_@ -54,10 +27,10 @@ zdivmod(z_t a, z_t b, z_t c, z_t d)
         SET_SIGNUM(ta, 0);
         zabs(tb, c);
 
- if (bit < BITS_PER_CHAR) {
+ if (unlikely(bit <= BITS_PER_CHAR)) {
                 for (;;) {
                         if (zcmpmag(td, tb) <= 0) {
- zsub(tb, tb, td);
+ zsub_unsigned(tb, tb, td);
                                 zbset(ta, ta, bit, 1);
                         }
                         if (!bit-- || zzero(tb))
_AT_@ -65,25 +38,61 @@ zdivmod(z_t a, z_t b, z_t c, z_t d)
                         zrsh(td, td, 1);
                 }
         } else {
- size_t i;
- for (i = 0; i < BITS_PER_CHAR; i++)
- zrsh(tds[i], td, i);
+ for (i = 0; i < BITS_PER_CHAR; i++) {
+ zrsh(tds_proper[i], td, i);
+ tds[i]->used = tds_proper[i]->used;
+ tds[i]->sign = tds_proper[i]->sign;
+ tds[i]->chars = tds_proper[i]->chars;
+ }
                 for (;;) {
                         for (i = 0; i < BITS_PER_CHAR; i++) {
                                 if (zcmpmag(tds[i], tb) <= 0) {
- zsub(tb, tb, tds[i]);
+ zsub_unsigned(tb, tb, tds[i]);
                                         zbset(ta, ta, bit, 1);
                                 }
                                 if (!bit-- || zzero(tb))
                                         goto done;
                         }
                         for (i = MIN(bit, BITS_PER_CHAR - 1) + 1; i--;)
- zrsh(tds[i], tds[i], BITS_PER_CHAR);
+ zrsh_taint(tds[i], BITS_PER_CHAR);
                 }
         }
 done:
 
         zswap(a, ta);
         zswap(b, tb);
+}
+
+
+void
+zdivmod(z_t a, z_t b, z_t c, z_t d)
+{
+ int sign, cmpmag;
+
+ sign = zsignum(c) * zsignum(d);
+
+ if (unlikely(!sign)) {
+ if (unlikely(!zzero(c))) {
+ libzahl_failure(-ZERROR_DIV_0);
+ } else if (unlikely(zzero(d))) {
+ libzahl_failure(-ZERROR_0_DIV_0);
+ } else {
+ SET_SIGNUM(a, 0);
+ SET_SIGNUM(b, 0);
+ }
+ return;
+ } else if (cmpmag = zcmpmag(c, d), unlikely(cmpmag <= 0)) {
+ if (unlikely(cmpmag == 0)) {
+ zseti(a, sign);
+ SET_SIGNUM(b, 0);
+ } else {
+ SET(b, c);
+ SET_SIGNUM(b, 1);
+ SET_SIGNUM(a, 0);
+ }
+ return;
+ }
+
+ zdivmod_impl(a, b, c, d);
         SET_SIGNUM(a, sign);
 }
diff --git a/src/zmul.c b/src/zmul.c
index 71460d8..6633edd 100644
--- a/src/zmul.c
+++ b/src/zmul.c
_AT_@ -11,7 +11,7 @@ zmul_impl_single_char(z_t a, z_t b, z_t c)
         SET_SIGNUM(a, 1);
 }
 
-static void
+void
 zmul_impl(z_t a, z_t b, z_t c)
 {
         /*
_AT_@ -19,13 +19,18 @@ zmul_impl(z_t a, z_t b, z_t c)
          *
          * Basically, this is how you were taught to multiply large numbers
          * by hand in school: 4010⋅3020 = (4000 + 10)(3000 + 20) =
- = 40⋅30⋅10⁴ + (40⋅20 + 30⋅10)⋅10² + 10⋅20, but the middle is
+ * = 40⋅30⋅10⁴ + (40⋅20 + 30⋅10)⋅10² + 10⋅20, but the middle is
          * optimised to only one multiplication:
          * 40⋅20 + 30⋅10 = (40 + 10)(30 + 20) − 40⋅30 − 10⋅20.
+ * This optimisation is crucial. Without it, the algorithm with
+ * run in O(n²).
          */
 
+#define z2 c_low
+#define z1 b_low
+#define z0 a
         size_t m, m2;
- z_t z0, z1, z2, b_high, b_low, c_high, c_low;
+ z_t b_high, b_low, c_high, c_low;
 
         if (unlikely(zzero1(b, c))) {
                 SET_SIGNUM(a, 0);
_AT_@ -43,9 +48,6 @@ zmul_impl(z_t a, z_t b, z_t c)
         m = MAX(m, m2);
         m2 = m >> 1;
 
- zinit(z0);
- zinit(z1);
- zinit(z2);
         zinit(b_high);
         zinit(b_low);
         zinit(c_high);
_AT_@ -66,14 +68,11 @@ zmul_impl(z_t a, z_t b, z_t c)
 
         zlsh(z1, z1, m2);
         m2 <<= 1;
- zlsh(a, z2, m2);
+ zlsh(z2, z2, m2);
         zadd_unsigned_assign(a, z1);
- zadd_unsigned_assign(a, z0);
+ zadd_unsigned_assign(a, z2);
 
 
- zfree(z0);
- zfree(z1);
- zfree(z2);
         zfree(b_high);
         zfree(b_low);
         zfree(c_high);
diff --git a/src/zpow.c b/src/zpow.c
index 81f8098..84f4927 100644
--- a/src/zpow.c
+++ b/src/zpow.c
_AT_@ -5,6 +5,9 @@
 #define tc libzahl_tmp_pow_c
 
 
+extern void zmul_impl(z_t a, z_t b, z_t c);
+extern void zsqr_impl(z_t a, z_t b);
+
 void
 zpow(z_t a, z_t b, z_t c)
 {
_AT_@ -16,6 +19,7 @@ zpow(z_t a, z_t b, z_t c)
 
         size_t i, j, n, bits;
         zahl_char_t x;
+ int neg;
 
         if (unlikely(zsignum(c) <= 0)) {
                 if (zzero(c)) {
_AT_@ -36,7 +40,8 @@ zpow(z_t a, z_t b, z_t c)
         bits = zbits(c);
         n = FLOOR_BITS_TO_CHARS(bits);
 
- zset(tb, b);
+ neg = znegative(b) && zodd(c);
+ zabs(tb, b);
         zset(tc, c);
         zsetu(a, 1);
 
_AT_@ -44,14 +49,17 @@ zpow(z_t a, z_t b, z_t c)
                 x = tc->chars[i];
                 for (j = BITS_PER_CHAR; j--; x >>= 1) {
                         if (x & 1)
- zmul(a, a, tb);
- zsqr(tb, tb);
+ zmul_impl(a, a, tb);
+ zsqr_impl(tb, tb);
                 }
         }
         x = tc->chars[i];
         for (; x; x >>= 1) {
                 if (x & 1)
- zmul(a, a, tb);
- zsqr(tb, tb);
+ zmul_impl(a, a, tb);
+ zsqr_impl(tb, tb);
         }
+
+ if (neg)
+ zneg(a, a);
 }
diff --git a/src/zpowu.c b/src/zpowu.c
index c4a2a64..cf879e0 100644
--- a/src/zpowu.c
+++ b/src/zpowu.c
_AT_@ -4,9 +4,14 @@
 #define tb libzahl_tmp_pow_b
 
 
+extern void zmul_impl(z_t a, z_t b, z_t c);
+extern void zsqr_impl(z_t a, z_t b);
+
 void
 zpowu(z_t a, z_t b, unsigned long long int c)
 {
+ int neg;
+
         if (unlikely(!c)) {
                 if (zzero(b))
                         libzahl_failure(-ZERROR_0_POW_0);
_AT_@ -17,12 +22,16 @@ zpowu(z_t a, z_t b, unsigned long long int c)
                 return;
         }
 
- zset(tb, b);
+ neg = znegative(b) && (c & 1);
+ zabs(tb, b);
         zsetu(a, 1);
 
         for (; c; c >>= 1) {
                 if (c & 1)
- zmul(a, a, tb);
- zsqr(tb, tb);
+ zmul_impl(a, a, tb);
+ zsqr_impl(tb, tb);
         }
+
+ if (neg)
+ zneg(a, a);
 }
diff --git a/src/zsqr.c b/src/zsqr.c
index e9418bf..8a616f0 100644
--- a/src/zsqr.c
+++ b/src/zsqr.c
_AT_@ -11,16 +11,19 @@ zsqr_impl_single_char(z_t a, z_t b)
         SET_SIGNUM(a, 1);
 }
 
-static void
+extern void zmul_impl(z_t a, z_t b, z_t c);
+
+void
 zsqr_impl(z_t a, z_t b)
 {
         /*
          * Karatsuba algorithm, optimised for equal factors.
          */
 
- z_t z0, z1, z2, high, low;
- size_t bits;
+#define z2 a
+ z_t z0, z1, high, low;
         zahl_char_t auxchars[3];
+ size_t bits;
 
         bits = zbits(b);
 
_AT_@ -31,39 +34,41 @@ zsqr_impl(z_t a, z_t b)
 
         bits >>= 1;
 
- zinit(z0);
- zinit(z1);
- zinit(z2);
-
+ /* Try to split only at a character level rather than a bit level.
+ * Such splits are faster, even if bit-level is required, and do
+ * not require auxiliary memory except for the bit-level split
+ * which require constant auxiliary memory. */
         if (bits < BITS_PER_CHAR) {
                 low->chars = auxchars;
                 high->chars = auxchars + 1;
- zsplit_fast_small_tainted(high, low, b, bits);
+ zsplit_unsigned_fast_small_tainted(high, low, b, bits);
         } else {
                 bits &= ~(BITS_PER_CHAR - 1);
- zsplit_fast_large_taint(high, low, b, bits);
+ zsplit_unsigned_fast_large_taint(high, low, b, bits);
         }
 
 
- zsqr_impl(z2, high);
         if (unlikely(zzero(low))) {
- SET_SIGNUM(z0, 0);
- SET_SIGNUM(z1, 0);
+ zsqr_impl(z2, high);
+ zlsh(a, z2, bits << 1);
         } else {
+ zinit(z0);
+ zinit(z1);
+
                 zsqr_impl(z0, low);
- zmul(z1, low, high);
- }
 
- zlsh(z1, z1, bits + 1);
- bits <<= 1;
- zlsh(a, z2, bits);
- zadd_unsigned_assign(a, z1);
- zadd_unsigned_assign(a, z0);
+ zmul_impl(z1, low, high);
+ zlsh(z1, z1, bits + 1);
 
+ zsqr_impl(z2, high);
+ zlsh(a, z2, bits << 1);
 
- zfree(z0);
- zfree(z1);
- zfree(z2);
+ zadd_unsigned_assign(a, z1);
+ zadd_unsigned_assign(a, z0);
+
+ zfree(z0);
+ zfree(z1);
+ }
 }
 
 void
diff --git a/src/zstr.c b/src/zstr.c
index c82ec89..f919dc8 100644
--- a/src/zstr.c
+++ b/src/zstr.c
_AT_@ -12,6 +12,48 @@
  * the cast to unsigned long long must be changed accordingly. */
 
 
+#define S1(P) P"0" P"1" P"2" P"3" P"4" P"5" P"6" P"7" P"8" P"9"
+#define S2(P) S1(P"0")S1(P"1")S1(P"2")S1(P"3")S1(P"4")S1(P"5")S1(P"6")S1(P"7")S1(P"8")S1(P"9")
+
+
+static inline O2 void
+sprintint_fix(char *buf, zahl_char_t v)
+{
+ const char *partials = S2("");
+ uint16_t *buffer = (uint16_t *)(buf + 1);
+
+ buffer[8] = *(uint16_t *)(partials + 2 * (v % 100)), v /= 100;
+ buffer[7] = *(uint16_t *)(partials + 2 * (v % 100)), v /= 100;
+ buffer[6] = *(uint16_t *)(partials + 2 * (v % 100)), v /= 100;
+ buffer[5] = *(uint16_t *)(partials + 2 * (v % 100)), v /= 100;
+ buffer[4] = *(uint16_t *)(partials + 2 * (v % 100)), v /= 100;
+ buffer[3] = *(uint16_t *)(partials + 2 * (v % 100)), v /= 100;
+ buffer[2] = *(uint16_t *)(partials + 2 * (v % 100)), v /= 100;
+ buffer[1] = *(uint16_t *)(partials + 2 * (v % 100)), v /= 100;
+ buffer[0] = *(uint16_t *)(partials + 2 * (v % 100)), v /= 100;
+ *buf = '0' + v;
+ buf[19] = 0;
+}
+
+static inline void
+cmemmove(char *d, const char *s, long n)
+{
+ while (n--)
+ *d++ = *s++;
+}
+
+static inline size_t
+sprintint_min(char *buf, zahl_char_t v)
+{
+ long i = 0, j;
+ sprintint_fix(buf, v);
+ for (; buf[i] == '0'; i++);
+ cmemmove(buf, buf + i, j = 19 - i);
+ buf[j] = 0;
+ return j;
+}
+
+
 char *
 zstr(z_t a, char *b)
 {
_AT_@ -42,12 +84,12 @@ zstr(z_t a, char *b)
         for (;;) {
                 zdivmod(num, rem, num, libzahl_const_1e19);
                 if (likely(!zzero(num))) {
- sprintf(b + n, "%019llu", zzero(rem) ? 0ULL : (unsigned long long)(rem->chars[0]));
+ sprintint_fix(b + n, zzero(rem) ? 0 : rem->chars[0]);
                         b[n + 19] = overridden;
                         overridden = b[n];
                         n = n > 19 ? (n - 19) : 0;
                 } else {
- len = (size_t)sprintf(buf, "%llu", (unsigned long long)(rem->chars[0]));
+ len = sprintint_min(buf, rem->chars[0]);
                         if (overridden)
                                 buf[len] = b[n + len];
                         memcpy(b + n, buf, len + 1);
Received on Wed Mar 16 2016 - 14:30:52 CET

This archive was generated by hypermail 2.3.0 : Wed Mar 16 2016 - 14:36:12 CET