From 410099df7091a37354e66de1dfb4dba8a822915d Mon Sep 17 00:00:00 2001
From: Johannes Schlatow <johannes.schlatow@genode-labs.com>
Date: Fri, 25 Mar 2022 11:52:20 +0100
Subject: [PATCH] base/memset: speed up implementation

Compared to the bytewise memset, a wordwise (or even multi-word) memset
achieves a speedup of roughly 6x.

On Zynq-7000/Cortex-A9:
317 MiB/s -> 2040 MiB/s

On base-linux x86_64:
3580 MiB/s -> 23700 MiB/s

genodelabs/genode#4456
---
 repos/base/include/util/string.h | 42 +++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/repos/base/include/util/string.h b/repos/base/include/util/string.h
index 4ab462fc48..683070132e 100644
--- a/repos/base/include/util/string.h
+++ b/repos/base/include/util/string.h
@@ -246,7 +246,47 @@ namespace Genode {
 	 __attribute((optimize("no-tree-loop-distribute-patterns")))
 	inline void *memset(void *dst, uint8_t i, size_t size)
 	{
-		while (size--) ((uint8_t *)dst)[size] = i;
+		typedef unsigned long word_t; /* unit of the bulk stores below */
+
+		enum {
+			LEN  = sizeof(word_t), /* bytes per word */
+			MASK = LEN-1           /* byte offset within a word (assumes LEN is a power of two) */
+		};
+
+		size_t d_align = (size_t)dst & MASK; /* 0 if 'dst' is already word aligned */
+		uint8_t* d = (uint8_t*)dst; /* write cursor, 'dst' itself is returned unchanged */
+
+		/* write single bytes until 'd' is word aligned or 'size' is used up */
+		for (; d_align && d_align < LEN && size;
+		       d_align++, size--, d++)
+			*d = i;
+
+		word_t word = i;    /* fill pattern: byte 'i' replicated over the whole word */
+		word |= word << 8;  /* lower 2 bytes filled */
+		word |= word << 16; /* lower 4 bytes filled */
+		if (LEN == 8) /* 8-byte words: presumably two shifts of 16 to stay below the width of a 4-byte word_t */
+			word |= (word << 16) << 16;
+
+		/* write 8-word chunks, manually unrolled (likely matches cache line size) */
+		for (; size >= 8*LEN; size -= 8*LEN, d += 8*LEN) {
+			((word_t *)d)[0] = word;
+			((word_t *)d)[1] = word;
+			((word_t *)d)[2] = word;
+			((word_t *)d)[3] = word;
+			((word_t *)d)[4] = word;
+			((word_t *)d)[5] = word;
+			((word_t *)d)[6] = word;
+			((word_t *)d)[7] = word;
+		}
+
+		/* write remaining whole words (fewer than 8 at this point) */
+		for (; size >= LEN; size -= LEN, d += LEN)
+			((word_t *)d)[0] = word;
+
+		/* write remaining bytes (fewer than LEN at this point) */
+		for (; size; size--, d++)
+			*d = i;
+
 		return dst;
 	}