From 0104a740285bf42922d9479cf835097d986923dc Mon Sep 17 00:00:00 2001
From: Johannes Schlatow
Date: Fri, 25 Mar 2022 10:12:46 +0100
Subject: [PATCH] memcpy (arm): cache align and use pld for speedup

Preloading a few cache lines ahead brings a significant speedup in
memcpy throughput.

Note, the particular (optimal) value was empirically determined on a
Cortex-A9 (Zynq-7000) SoC @ 666Mhz. It is best combined with L2
prefetching enabled (including double linefills and prefetch offset 7).
Yet, even without L2 prefetching this seems to be the sweet spot.

genodelabs/genode#4456
---
 repos/base/include/spec/arm/cpu/string.h | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/repos/base/include/spec/arm/cpu/string.h b/repos/base/include/spec/arm/cpu/string.h
index 2e65f39021..1cb8d3a15d 100644
--- a/repos/base/include/spec/arm/cpu/string.h
+++ b/repos/base/include/spec/arm/cpu/string.h
@@ -30,21 +30,25 @@ namespace Genode {
 	{
 		unsigned char *d = (unsigned char *)dst, *s = (unsigned char *)src;
 
-		/* check 4 byte; alignment */
-		size_t d_align = (size_t)d & 0x3;
-		size_t s_align = (size_t)s & 0x3;
+		/* fetch the first cache line */
+		asm volatile ("pld [%0, #0]\n\t" : "+r" (s));
 
-		/* only same alignments work for the following LDM/STM loop */
-		if (d_align != s_align)
+		/* check 32-byte (cache line) alignment */
+		size_t d_align = (size_t)d & 0x1f;
+		size_t s_align = (size_t)s & 0x1f;
+
+		/* only same word-alignments work for the following LDM/STM loop */
+		if ((d_align & 0x3) != (s_align & 0x3))
 			return size;
 
-		/* copy to 4 byte alignment */
-		for (; (size > 0) && (s_align > 0) && (s_align < 4);
+		/* copy to 32-byte alignment */
+		for (; (size > 0) && (s_align > 0) && (s_align < 32);
 		     s_align++, *d++ = *s++, size--);
 
 		/* copy 32 byte chunks */
 		for (; size >= 32; size -= 32) {
 			asm volatile ("ldmia %0!, {r3 - r10} \n\t"
+			              "pld [%0, #160]\n\t"
 			              "stmia %1!, {r3 - r10} \n\t"
 			              : "+r" (s), "+r" (d)
 			              :: "r3","r4","r5","r6","r7","r8","r9","r10");
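
The hot loop above copies 32-byte chunks via LDM/STM and issues a pld 160 bytes
(five 32-byte cache lines) ahead of the current source pointer. For illustration
only, the same prefetch-ahead pattern can be sketched in portable C using
GCC/Clang's __builtin_prefetch in place of the raw pld instruction; the function
name and the fixed five-lines-ahead distance below are illustrative, not part of
the patch:

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative sketch, not the Genode implementation: copy 32-byte
 * chunks while prefetching the source 160 bytes (five 32-byte cache
 * lines) ahead, mirroring the "pld [%0, #160]" added by the patch.
 * Alignment handling and the byte-wise tail are left to the caller,
 * as in memcpy_cpu().
 */
static inline size_t copy_chunks_prefetched(void *dst, const void *src, size_t size)
{
	uint32_t       *d = (uint32_t *)dst;
	const uint32_t *s = (const uint32_t *)src;

	for (; size >= 32; size -= 32) {
		/* hint the CPU to load the cache line needed a few iterations from now */
		__builtin_prefetch((const char *)s + 160, 0 /* read */, 0 /* low temporal locality */);

		/* eight 32-bit words = one 32-byte chunk (the LDM/STM pair in the asm version) */
		for (int i = 0; i < 8; i++)
			*d++ = *s++;
	}
	return size; /* bytes that remain for the byte-wise tail copy */
}

As the commit message notes, the prefetch distance was determined empirically on
a Cortex-A9; other cores, clocks, or L2-prefetch settings may favour a different
offset.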