[hackers] [libzahl] Optimisations || Mattias Andrée

From: <git_AT_suckless.org>
Date: Fri, 6 May 2016 02:21:07 +0200 (CEST)

commit 9437becf6d8aa4d9a3872b2cd6b353dc4c90a1cb
Author: Mattias Andrée <maandree_AT_kth.se>
AuthorDate: Thu May 5 21:11:43 2016 +0200
Commit: Mattias Andrée <maandree_AT_kth.se>
CommitDate: Thu May 5 21:13:16 2016 +0200

    Optimisations
    
    Signed-off-by: Mattias Andrée <maandree_AT_kth.se>

diff --git a/STATUS b/STATUS
index a5f664e..a0fb7d2 100644
--- a/STATUS
+++ b/STATUS
@@ -1,3 +1,21 @@
+The following functions are probably implemented optimally:
+
+zswap ................... always fastest
+zzero ................... always fastest (shared with gmp)
+zsignum ................. always fastest (shared with gmp)
+zeven ................... always fastest
+zodd .................... always fastest
+zeven_nonzero ........... always fastest
+zodd_nonzero ............ always fastest (shared with gmp)
+zbtest .................. always fastest
+
+
+The following functions are probably implemented close to
+optimally, further optimisation should not be a priority:
+
+zadd_unsigned ........... fastest after ~70 compared against zadd too (x86-64)
+
+
 Optimisation progress for libzahl, compared to other big integer
 libraries. These comparisons are for 152-bit integers. Functions
 in parenthesis the right column are functions that needs
@@ -10,26 +28,18 @@ zset .................... fastest [always]
 zseti ................... tomsfastmath is faster [always]
 zsetu ................... tomsfastmath is faster [always]
 zneg(a, b) .............. fastest [always]
-zneg(a, a) .............. fastest [always] (shared with gmp)
+zneg(a, a) .............. fastest [always] (shared with gmp; faster with clang)
 zabs(a, b) .............. fastest [always]
 zabs(a, a) .............. tomsfastmath is faster [always]
-zadd_unsigned ........... fastest [always]
-zsub_unsigned ........... fastest [always]
-zadd .................... fastest [after ~100, tomsfastmath before] (shared with gmp)
+zsub_unsigned ........... fastest [always] (compared against zsub too)
+zadd .................... fastest [after ~110, tomsfastmath before] (x86-64)
 zsub .................... fastest [always]
 zand .................... 77 % of tomsfastmath [until ~900, alternating with gmp]
 zor ..................... 65 % of tomsfastmath [until ~1750, alternating with gmp (gcc) and tomsfastmath (clang)]
 zxor .................... 87 % of tomsfastmath [until ~700, alternating with gmp (gcc+clangs),]
 znot .................... fastest [always]
-zeven ................... fastest [always]
-zodd .................... fastest [always]
-zeven_nonzero ........... fastest [always]
-zodd_nonzero ............ fastest [always]
-zzero ................... fastest [always] (shared with gmp)
-zsignum ................. fastest [always] (shared with gmp)
 zbits ................... fastest [always]
 zlsb .................... fastest [always]
-zswap ................... fastest [always]
 zlsh .................... fastest [until ~1000, then gmp]
 zrsh .................... fastest [almost never]
 ztrunc(a, b, c) ......... fastest [always; alternating with gmp between 1400~3000 (clang)]
@@ -46,7 +56,6 @@ zbset(a, b, 0) .......... fastest [always]
 zbset(a, a, 0) .......... fastest [always]
 zbset(a, b, -1) ......... fastest [always]
 zbset(a, a, -1) ......... fastest [always]
-zbtest .................. fastest [always]
 zgcd .................... 21 % of gmp (zcmpmag)
 zmul .................... slowest
 zsqr .................... slowest (zmul)
diff --git a/src/zadd.c b/src/zadd.c
index 8efdf19..5eb7050 100644
--- a/src/zadd.c
+++ b/src/zadd.c
@@ -4,116 +4,94 @@
 
 #if defined(__x86_64__)
 # define ASM3(code) \
- __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i), "c"(cc + i))
+ __asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc), [c]"+r"(cc))
 
 # define ASM2(code) \
- __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i))
+ __asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc))
 
-# define ADD2(off) \
- "\n movq "#off"(%%rbx), %%rdx" \
- "\n adcq %%rdx, "#off"(%%rax)"
+# define ADD2(off) \
+ "\n movq "#off"(%[b]), %[x]" \
+ "\n adcq %[x], "#off"(%[a])"
 
-# define ADD3(off) \
- "\n movq "#off"(%%rbx), %%rdx" \
- "\n adcq "#off"(%%rcx), %%rdx" \
- "\n movq %%rdx, "#off"(%%rax)"
+# define ADD3(off) \
+ "\n movq "#off"(%[b]), %[x]" \
+ "\n adcq "#off"(%[c]), %[x]" \
+ "\n movq %[x], "#off"(%[a])"
 
 # define WRAP_CARRY(interior) \
- "\n clc" \
- "\n cmpq $0, %%rdx" \
- "\n je 1f" \
- "\n stc" \
- "\n 1:" \
+ "\n addq $-1, %[x]" \
         interior \
- "\n movq $1, %%rdx" \
+ "\n movq $1, %[x]" \
         "\n jc 1f" \
- "\n movq $0, %%rdx" \
+ "\n movq $0, %[x]" \
         "\n 1:"
+
+# define ASM_ADD(N) \
+ do { \
+ register zahl_char_t carry = 0; \
+ size_t i; \
+ for (i = 0; (INC(4)), (i += 4) <= n;) \
+ ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) ADD##N(-16) ADD##N(-8))); \
+ switch (n & 3) { \
+ case 3: \
+ ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) ADD##N(-16))); \
+ break; \
+ case 2: \
+ ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24))); \
+ break; \
+ case 1: \
+ ASM##N(WRAP_CARRY(ADD##N(-32))); \
+ break; \
+ default: \
+ break; \
+ } \
+ i = n; \
+ while (carry) { \
+ carry = libzahl_add_overflow(a->chars + i, a->chars[i], 1); \
+ i++; \
+ } \
+ if (a->used < i) \
+ a->used = i; \
+ } while (0)
 #endif
 
 
 static inline void
 zadd_impl_4(z_t a, z_t b, z_t c, size_t n)
 {
- zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars, *cc = c->chars;
- size_t i;
-
-#if defined(__x86_64__)
- for (i = 0; (i += 4) <= n;)
- ASM3(WRAP_CARRY(ADD3(-32) ADD3(-24) ADD3(-16) ADD3(-8)));
- if (i > n) {
- i -= 4;
- switch (n & 3) {
- case 3:
- ASM3(WRAP_CARRY(ADD3(0) ADD3(8) ADD3(16)));
- break;
- case 2:
- ASM3(WRAP_CARRY(ADD3(0) ADD3(8)));
- break;
- case 1:
- ASM3(WRAP_CARRY(ADD3(0)));
- break;
- default:
- break;
- }
- }
- i = n;
-
- while (carry) {
- carry = libzahl_add_overflow(ac + i, ac[i], 1);
- i++;
- }
+#ifdef ASM_ADD
+ register zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars;
+# define INC(P) (ac += (P), bc += (P), cc += (P))
+ ASM_ADD(3);
+# undef INC
 #else
- zahl_char_t tcarry;
+ zahl_char_t carry = 0, tcarry;
+ zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars;
+ size_t i;
 
         for (i = 0; i < n; i++) {
                 tcarry = libzahl_add_overflow(ac + i, bc[i], cc[i]);
                 carry = tcarry | (zahl_char_t)libzahl_add_overflow(ac + i, ac[i], carry);
         }
+
         while (carry) {
                 carry = libzahl_add_overflow(ac + i, ac[i], 1);
                 i++;
         }
-#endif
 
         if (a->used < i)
                 a->used = i;
+#endif
 }
 
 static inline void
 zadd_impl_3(z_t a, z_t b, size_t n)
 {
-#if defined(__x86_64__)
- zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars;
- size_t i;
-
- for (i = 0; (i += 4) <= n;)
- ASM2(WRAP_CARRY(ADD2(-32) ADD2(-24) ADD2(-16) ADD2(-8)));
- if (i > n) {
- i -= 4;
- switch (n & 3) {
- case 3:
- ASM2(WRAP_CARRY(ADD2(0) ADD2(8) ADD2(16)));
- break;
- case 2:
- ASM2(WRAP_CARRY(ADD2(0) ADD2(8)));
- break;
- case 1:
- ASM2(WRAP_CARRY(ADD2(0)));
- break;
- default:
- break;
- }
- }
- i = n;
-
- while (carry) {
- carry = libzahl_add_overflow(ac + i, ac[i], 1);
- i++;
- }
-
- if (a->used < i)
- a->used = i;
+#ifdef ASM_ADD
+ register zahl_char_t *ac = a->chars, *bc = b->chars;
+# define INC(P) (ac += (P), bc += (P))
+ ASM_ADD(2);
+# undef INC
 #else
         zadd_impl_4(a, a, b, n);
 #endif
diff --git a/zahl/internals.h b/zahl/internals.h
index b7cd230..e0aa248 100644
--- a/zahl/internals.h
+++ b/zahl/internals.h
@@ -2,11 +2,7 @@
 
 #ifndef ZAHL_INLINE
 # if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
-# if defined(__GNUC__) || defined(__clang__)
-# define ZAHL_INLINE __attribute__((__always_inline__, __gnu_inline__)) static inline
-# else
-# define ZAHL_INLINE static inline
-# endif
+# define ZAHL_INLINE static inline
 # else
 # define ZAHL_INLINE static
 # endif
Received on Fri May 06 2016 - 02:21:07 CEST

This archive was generated by hypermail 2.3.0 : Fri May 06 2016 - 02:24:17 CEST