From 410099df7091a37354e66de1dfb4dba8a822915d Mon Sep 17 00:00:00 2001 From: Johannes Schlatow Date: Fri, 25 Mar 2022 11:52:20 +0100 Subject: [PATCH] base/memset: speedup implementation Compared to the bytewise memset, a wordwise (or even multi-word) memset achieves a speedup of ~6x. On Zynq-7000/Cortex-A9: 317 MiB/s -> 2040 MiB/s On base-linux x86_64: 3580 MiB/s -> 23700 MiB/s genodelabs/genode#4456 --- repos/base/include/util/string.h | 42 +++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/repos/base/include/util/string.h b/repos/base/include/util/string.h index 4ab462fc48..683070132e 100644 --- a/repos/base/include/util/string.h +++ b/repos/base/include/util/string.h @@ -246,7 +246,47 @@ namespace Genode { __attribute((optimize("no-tree-loop-distribute-patterns"))) inline void *memset(void *dst, uint8_t i, size_t size) { - while (size--) ((uint8_t *)dst)[size] = i; + typedef unsigned long word_t; + + enum { + LEN = sizeof(word_t), + MASK = LEN-1 + }; + + size_t d_align = (size_t)dst & MASK; + uint8_t* d = (uint8_t*)dst; + + /* write until word aligned */ + for (; d_align && d_align < LEN && size; + d_align++, size--, d++) + *d = i; + + word_t word = i; + word |= word << 8; + word |= word << 16; + if (LEN == 8) + word |= (word << 16) << 16; + + /* write 8-word chunks (likely matches cache line size) */ + for (; size >= 8*LEN; size -= 8*LEN, d += 8*LEN) { + ((word_t *)d)[0] = word; + ((word_t *)d)[1] = word; + ((word_t *)d)[2] = word; + ((word_t *)d)[3] = word; + ((word_t *)d)[4] = word; + ((word_t *)d)[5] = word; + ((word_t *)d)[6] = word; + ((word_t *)d)[7] = word; + } + + /* write remaining words */ + for (; size >= LEN; size -= LEN, d += LEN) + ((word_t *)d)[0] = word; + + /* write remaining bytes */ + for (; size; size--, d++) + *d = i; + return dst; }