From 95a01b0636682378afd01da0898046baf2344981 Mon Sep 17 00:00:00 2001
From: mashu555
Date: Fri, 7 Nov 2025 14:37:52 +0800
Subject: [PATCH] revert using 32-byte alignment in memcpy_sve

(cherry picked from commit ba9db451a241eb3dd53c43f3f75061bd6945e3f8)
---
 ...mentation_for_32-byte_aligned_access.patch | 119 ------------------
 glibc.spec                                    |  10 +-
 2 files changed, 6 insertions(+), 123 deletions(-)
 delete mode 100755 AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch

diff --git a/AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch b/AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch
deleted file mode 100755
index 3ec266a..0000000
--- a/AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch
+++ /dev/null
@@ -1,119 +0,0 @@
-From 47d70aaf3adb11af5c488b434039bb5844c97bb6 Mon Sep 17 00:00:00 2001
-From: Long Wei
-Date: Wed, 24 Sep 2025 10:38:47 +0800
-Subject: [PATCH] aarch64:Modify the copy_long function in the SVE memcpy
- implementation for 32-byte aligned access
-
-aarch64: Optimize memcpy_sve by using 32-byte alignment
-The current memcpy_sve implementation shifts the destination pointer
-forward to achieve only 16-byte alignment. This can lead to two
-performance issues:
-1. Cross-cache-line accesses: With 16-byte alignment, a 32-byte store
-   operation can still straddle two cache lines. This forces the CPU
-   to perform two separate cache line accesses, effectively doubling
-   the time for the store.
-2. Cache bank conflicts: On some ARM microarchitectures, L1 cache is
-   organized into banks. 16-byte alignment can cause stores to frequently
-   hit the same bank, creating contention and reducing effective memory
-   bandwidth.
-Change the implementation of memcpy_sve from shifting forward to 16-byte
-alignment to shifting forward to 32-byte alignment, which is more
-cache-friendly.
-- All 32-byte SVE vector stores are fully contained within a
-  single 64-byte cache line, minimizing access latency.
-- Stores are distributed across different cache banks more
-  evenly, preventing conflicts and maximizing throughput.
-
-We tested the performance of `memcpy` on Kunpeng servers using the
-libmicro test suite.
-The results showed that using 32-byte alignment can reduce the latency
-of `memcpy`.
-The test results are in microseconds.
-
-16-byte alignment:
-memcpy_10   memcpy_32   memcpy_64   memcpy_128  memcpy_256
-0.0028      0.0028      0.0028      0.0035      0.0063
-memcpy_512  memcpy_1k   memcpy_2k   memcpy_4k   memcpy_8k
-0.0122      0.0165      0.0315      0.0605      0.1251
-memcpy_10k  memcpy_16k  memcpy_32k  memcpy_64k  memcpy_128k
-0.1597      0.2458      0.512       1.024       2.048
-memcpy_256k memcpy_512k memcpy_1m   memcpy_2m   memcpy_4m
-4.096       7.936       16.8        33.152      66.72
-memcpy_8m   memcpy_10m
-132.096     165.12
-
-32-byte alignment:
-memcpy_10   memcpy_32   memcpy_64   memcpy_128  memcpy_256
-0.0028      0.0028      0.0028      0.0035      0.0058
-memcpy_512  memcpy_1k   memcpy_2k   memcpy_4k   memcpy_8k
-0.0096      0.0165      0.0315      0.0614      0.121
-memcpy_10k  memcpy_16k  memcpy_32k  memcpy_64k  memcpy_128k
-0.1515      0.2355      0.512       1.024       2.048
-memcpy_256k memcpy_512k memcpy_1m   memcpy_2m   memcpy_4m
-3.84        7.168       15.072      29.952      60.032
-memcpy_8m   memcpy_10m
-119.04      147.968
-
-No functional change.
-
-sysdeps/aarch64/multiarch/memcpy_sve.S: Change alignment shifting from 16
-bytes to 32 bytes.
----
- sysdeps/aarch64/multiarch/memcpy_sve.S | 30 +++++++++++++-------------
- 1 file changed, 15 insertions(+), 15 deletions(-)
-
-diff --git a/sysdeps/aarch64/multiarch/memcpy_sve.S b/sysdeps/aarch64/multiarch/memcpy_sve.S
-index 2f14f913..17c8859a 100644
---- a/sysdeps/aarch64/multiarch/memcpy_sve.S
-+++ b/sysdeps/aarch64/multiarch/memcpy_sve.S
-@@ -109,22 +109,22 @@ L(copy_long):
- 	add	srcend, src, count
- 	add	dstend, dstin, count
- 
--	/* Copy 16 bytes and then align src to 16-byte alignment. */
--	ldr	D_q, [src]
--	and	tmp1, src, 15
--	bic	src, src, 15
-+	/* Copy 32 bytes and then align src to 32-byte alignment. */
-+	ldp	A_q, B_q, [src]
-+	and	tmp1, src, 31
-+	bic	src, src, 31
- 	sub	dst, dstin, tmp1
--	add	count, count, tmp1	/* Count is now 16 too large. */
--	ldp	A_q, B_q, [src, 16]
--	str	D_q, [dstin]
--	ldp	C_q, D_q, [src, 48]
--	subs	count, count, 128 + 16	/* Test and readjust count. */
-+	add	count, count, tmp1	/* Count is now 32 too large. */
-+	stp	A_q, B_q, [dstin]
-+	ldp	A_q, B_q, [src, 32]
-+	ldp	C_q, D_q, [src, 64]
-+	subs	count, count, 128 + 32	/* Test and readjust count. */
- 	b.ls	L(copy64_from_end)
- L(loop64):
--	stp	A_q, B_q, [dst, 16]
--	ldp	A_q, B_q, [src, 80]
--	stp	C_q, D_q, [dst, 48]
--	ldp	C_q, D_q, [src, 112]
-+	stp	A_q, B_q, [dst, 32]
-+	ldp	A_q, B_q, [src, 96]
-+	stp	C_q, D_q, [dst, 64]
-+	ldp	C_q, D_q, [src, 128]
- 	add	src, src, 64
- 	add	dst, dst, 64
- 	subs	count, count, 64
-@@ -133,9 +133,9 @@ L(loop64):
- 	/* Write the last iteration and copy 64 bytes from the end. */
- L(copy64_from_end):
- 	ldp	E_q, F_q, [srcend, -64]
--	stp	A_q, B_q, [dst, 16]
-+	stp	A_q, B_q, [dst, 32]
- 	ldp	A_q, B_q, [srcend, -32]
--	stp	C_q, D_q, [dst, 48]
-+	stp	C_q, D_q, [dst, 64]
- 	stp	E_q, F_q, [dstend, -64]
- 	stp	A_q, B_q, [dstend, -32]
- 	ret
---
-2.33.0
-
diff --git a/glibc.spec b/glibc.spec
index d504bb5..057e7a5 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -67,7 +67,7 @@
 ##############################################################################
 Name: glibc
 Version: 2.38
-Release: 76
+Release: 77
 Summary: The GNU libc libraries
 License: %{all_license}
 URL: http://www.gnu.org/software/glibc/
@@ -415,9 +415,8 @@ Patch9038: Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch
 Patch9039: revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch
 Patch9040: backport-Fix-UB-on__dl_map_object_from_fd.patch
 Patch9041: backport-Fix-handling-of-symbol-versions-which-hash-to-zero.patch
-Patch9042: AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch
-Patch9043: 0024-Sw64-Change-libdir-from-lib-to-lib64.patch
-Patch9044: backport-x86-Disable-AVX-Fast-Unaligned-Load-on-Hygon-1-2-3.patch
+Patch9042: 0024-Sw64-Change-libdir-from-lib-to-lib64.patch
+Patch9043: backport-x86-Disable-AVX-Fast-Unaligned-Load-on-Hygon-1-2-3.patch
 
 Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
 
@@ -1603,6 +1602,9 @@ fi
 %endif
 
 %changelog
+* Fri Nov 07 2025 Long Wei - 2.38-77
+- aarch64: revert using 32-byte alignment in memcpy_sve
+
 * Wed Nov 05 2025 Qingqing Li - 2.38-76
 - x86: fix wmemset ifunc stray '!' (bug 33542)
 
--
Gitee
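
Note for reviewers: the C sketch below is illustrative only; it is not part of
either patch above. Assuming the usual memcpy contract (non-overlapping
buffers) and that copy_long is only entered for counts above 128, it mirrors
the pointer arithmetic of the copy_long prologue whose alignment the reverted
change moved from 16 to 32 bytes: copy one head block from the unaligned
source, round src down to the alignment boundary, shift dst back by the same
offset, and enlarge count so the bulk copy still ends at srcend. The names
copy_long_sketch and ALIGN are hypothetical.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define ALIGN 32   /* 32 in the reverted patch; 16 in the code this commit restores.  */

static void copy_long_sketch(char *dstin, const char *src, size_t count)
{
    /* Assumed precondition: count > 128, src/dst do not overlap.  */
    const char *srcend = src + count;
    char *dstend = dstin + count;

    /* Head: one ALIGN-byte block from the (possibly unaligned) start.  */
    memcpy(dstin, src, ALIGN);

    /* Align src down to ALIGN bytes; dst is shifted back by the same
       offset so src and dst keep their relative distance, and count is
       enlarged to stay relative to the aligned base.  */
    size_t tmp1 = (uintptr_t)src & (ALIGN - 1);
    src -= tmp1;
    char *dst = dstin - tmp1;
    count += tmp1;

    /* Bulk: 64-byte blocks from the aligned source, starting past the
       head block; stop while at least 64 bytes remain for the tail.  */
    size_t offset = ALIGN;
    while (count - offset > 64) {
        memcpy(dst + offset, src + offset, 64);
        offset += 64;
    }

    /* Tail: the last 64 bytes, addressed from the true end, cover
       whatever the bulk loop left over (some bytes may be re-copied).  */
    memcpy(dstend - 64, srcend - 64, 64);
}

int main(void)
{
    char in[512], out[512];
    for (int i = 0; i < 512; i++)
        in[i] = (char)i;
    /* A misaligned start exercises the head/align/bulk/tail paths.  */
    copy_long_sketch(out + 1, in + 3, 300);
    return memcmp(out + 1, in + 3, 300) != 0;
}

Setting ALIGN to 16 models the behaviour this commit restores; the only
differences are the size of the head block and how far count is over-adjusted
before the bulk copy.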