[hackers] [libgrapheme] Add UTF-8 decoder benchmark || Laslo Hunhold

From: <git_AT_suckless.org>
Date: Tue, 4 Jan 2022 18:39:16 +0100 (CET)

commit 06013743a38729c531a67a63bbfd55b50badddfe
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Tue Jan 4 18:11:02 2022 +0100
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Tue Jan 4 18:37:33 2022 +0100

    Add UTF-8 decoder benchmark
    
    Here we can also see the trouble with the custom types in libutf8proc.
    
    Signed-off-by: Laslo Hunhold <dev_AT_frign.de>

diff --git a/Makefile b/Makefile
index 2a016ad..f30e0af 100644
--- a/Makefile
+++ b/Makefile
@@ -6,6 +6,7 @@ include config.mk
 
 BENCHMARK =\
         benchmark/character\
+ benchmark/utf8-decode\
 
 DATA =\
         data/emoji-data.txt\
@@ -37,6 +38,7 @@ MAN7 = man/libgrapheme.7
 all: libgrapheme.a libgrapheme.so
 
 benchmark/character.o: benchmark/character.c config.mk gen/character-test.h grapheme.h benchmark/util.h
+benchmark/utf8-decode.o: benchmark/utf8-decode.c config.mk gen/character-test.h grapheme.h benchmark/util.h
 benchmark/util.o: benchmark/util.c config.mk benchmark/util.h
 gen/character-prop.o: gen/character-prop.c config.mk gen/util.h
 gen/character-test.o: gen/character-test.c config.mk gen/util.h
@@ -51,6 +53,7 @@ test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h test/util.h
 test/util.o: test/util.c config.mk test/util.h
 
 benchmark/character: benchmark/character.o benchmark/util.o libgrapheme.a
+benchmark/utf8-decode: benchmark/utf8-decode.o benchmark/util.o libgrapheme.a
 gen/character-test: gen/character-test.o gen/util.o
 gen/properties: gen/properties.o gen/util.o
 test/character: test/character.o test/util.o libgrapheme.a
@@ -139,4 +142,4 @@ dist:
         tar -cf - "libgrapheme-$(VERSION)" | gzip -c > "libgrapheme-$(VERSION).tar.gz"
         rm -rf "libgrapheme-$(VERSION)"
 
-.PHONY: all test install uninstall clean clean-data dist
+.PHONY: all benchmark test install uninstall clean clean-data dist
diff --git a/benchmark/utf8-decode.c b/benchmark/utf8-decode.c
new file mode 100644
index 0000000..16d117e
--- /dev/null
+++ b/benchmark/utf8-decode.c
@@ -0,0 +1,120 @@
+/* See LICENSE file for copyright and license details. */
+#include <errno.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../grapheme.h"
+#include "../gen/character-test.h"
+#include "util.h"
+
+#include <utf8proc.h>
+/* value passed as the last argument of every run_benchmark() call in main() */
+#define NUM_ITERATIONS 100000
+/* where the compiler reports support for it, declare both benchmark functions with the optnone attribute */
+#if defined __has_attribute
+ #if __has_attribute(optnone)
+ void libgrapheme(const void *) __attribute__((optnone)); /* NOTE(review): presumably keeps the loops, whose results are discarded, from being optimized away */
+ void libutf8proc(const void *) __attribute__((optnone));
+ #endif
+#endif
+/* input handed to both benchmark functions; main() allocates both buffers with bufsiz and copies buf_char into buf_uint8 */
+struct payload {
+ char *buf_char; /* data read by libgrapheme() */
+ utf8proc_uint8_t *buf_uint8; /* element-wise copy of buf_char, read by libutf8proc() */
+ size_t bufsiz; /* size used for both allocations and as the loop bound in both benchmarks */
+};
+/* benchmark body: walks p->buf_char from offset 0 up to p->bufsiz with repeated grapheme_decode_utf8() calls; payload must point to a struct payload */
+void
+libgrapheme(const void *payload)
+{
+ const struct payload *p = payload;
+ uint_least32_t cp;
+ size_t ret, off;
+
+ for (off = 0; off < p->bufsiz; off += ret) { /* advance by whatever the decoder returned */
+ if ((ret = grapheme_decode_utf8(p->buf_char + off,
+ p->bufsiz - off, &cp)) >
+ (p->bufsiz - off)) { /* return value exceeds the remaining length: stop */
+ break;
+ }
+ (void)cp; /* the decoded value is not used any further */
+ }
+}
+/* benchmark body: walks p->buf_uint8 from offset 0 up to p->bufsiz with repeated utf8proc_iterate() calls; payload must point to a struct payload */
+void
+libutf8proc(const void *payload)
+{
+ const struct payload *p = payload;
+ utf8proc_int32_t cp;
+ utf8proc_ssize_t ret;
+ size_t off;
+
+ for (off = 0; off < p->bufsiz; off += (size_t)ret) { /* ret is non-negative here, see the break below */
+ if ((ret = utf8proc_iterate(p->buf_uint8 + off,
+ (utf8proc_ssize_t)(p->bufsiz - off), /* cast from size_t to the library's own signed size type */
+ &cp)) < 0) { /* negative return value: stop */
+ break;
+ }
+ (void)cp; /* the decoded value is not used any further */
+ }
+}
+/* builds the test data once (as char and as utf8proc_uint8_t), then passes both benchmark functions to run_benchmark(); returns 1 or exits with 1 on failure, 0 otherwise */
+int
+main(int argc, char *argv[])
+{
+ struct payload p;
+ size_t cpbufsiz, i, off, ret;
+ uint32_t *cpbuf;
+ double baseline = (double)NAN; /* starts as NaN; its address is passed to both run_benchmark() calls */
+
+ (void)argc; /* argc is unused */
+
+ if ((cpbuf = generate_test_buffer(character_test, LEN(character_test),
+ &cpbufsiz)) == NULL) { /* a NULL buffer is treated as failure */
+ return 1;
+ }
+
+ /* convert cp-buffer to utf8-data (both as char and custom uint8-type) */
+ for (i = 0, p.bufsiz = 0; i < cpbufsiz; i++) {
+ p.bufsiz += grapheme_encode_utf8(cpbuf[i], NULL, 0); /* no output buffer given; the return values are summed into the allocation size */
+ }
+ if ((p.buf_char = malloc(p.bufsiz)) == NULL) {
+ fprintf(stderr, "malloc: %s\n", strerror(errno));
+ exit(1);
+ }
+ for (i = 0, off = 0; i < cpbufsiz; i++, off += ret) { /* second pass: write each element's encoding at the running offset */
+ if ((ret = grapheme_encode_utf8(cpbuf[i], p.buf_char + off,
+ p.bufsiz - off)) >
+ (p.bufsiz - off)) {
+ /* shouldn't happen */
+ fprintf(stderr, "Error while converting buffer.\n");
+ exit(1);
+ }
+ }
+ if ((p.buf_uint8 = malloc(p.bufsiz)) == NULL) {
+ fprintf(stderr, "malloc: %s\n", strerror(errno));
+ exit(1);
+ }
+ for (i = 0; i < p.bufsiz; i++) {
+ /*
+ * even if char is larger than 8 bit, it will only have
+ * any of the first 8 bits set (by construction).
+ */
+ p.buf_uint8[i] = (utf8proc_uint8_t)p.buf_char[i];
+ }
+
+ printf("%s\n", argv[0]); /* program name as heading for the results */
+ run_benchmark(libgrapheme, &p, "libgrapheme ", &baseline,
+ NUM_ITERATIONS);
+ run_benchmark(libutf8proc, &p, "libutf8proc ", &baseline,
+ NUM_ITERATIONS);
+
+ free(cpbuf);
+ free(p.buf_char);
+ free(p.buf_uint8);
+
+ return 0;
+}
Received on Tue Jan 04 2022 - 18:39:16 CET

This archive was generated by hypermail 2.3.0 : Tue Jan 04 2022 - 18:48:35 CET