From 95a01b0636682378afd01da0898046baf2344981 Mon Sep 17 00:00:00 2001
From: mashu555
Date: Fri, 7 Nov 2025 14:37:52 +0800
Subject: [PATCH] revert using 32-byte alignment in memcpy_sve

(cherry picked from commit ba9db451a241eb3dd53c43f3f75061bd6945e3f8)
---
 ...mentation_for_32-byte_aligned_access.patch | 119 ------------------
 glibc.spec                                    |  10 +-
 2 files changed, 6 insertions(+), 123 deletions(-)
 delete mode 100755 AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch

diff --git a/AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch b/AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch
deleted file mode 100755
index 3ec266a..0000000
--- a/AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch
+++ /dev/null
@@ -1,119 +0,0 @@
-From 47d70aaf3adb11af5c488b434039bb5844c97bb6 Mon Sep 17 00:00:00 2001
-From: Long Wei
-Date: Wed, 24 Sep 2025 10:38:47 +0800
-Subject: [PATCH] aarch64:Modify the copy_long function in the SVE memcpy
- implementation for 32-byte aligned access
-
-aarch64: Optimize memcpy_sve by using 32-byte alignment
-The current memcpy_sve implementation shifts the destination pointer
-forward to achieve only 16-byte alignment. This can lead to two
-performance issues:
-1. Cross-cache-line accesses: With 16-byte alignment, a 32-byte store
-   operation can still straddle two cache lines. This forces the CPU
-   to perform two separate cache line accesses, effectively doubling
-   the time for the store.
-2. Cache bank conflicts: On some ARM microarchitectures, L1 cache is
-   organized into banks. 16-byte alignment can cause stores to frequently
-   hit the same bank, creating contention and reducing effective memory
-   bandwidth.
-Change the implementation of memcpy_sve from shifting forward to 16-byte
-alignment to shifting forward to 32-byte alignment, which is more
-cache-friendly.
-- All 32-byte SVE vector stores are fully contained within a
-  single 64-byte cache line, minimizing access latency.
-- Stores are distributed across different cache banks more
-  evenly, preventing conflicts and maximizing throughput.
-
-We tested the performance of `memcpy` on Kunpeng servers using the
-libmicro test suite.
-The results showed that using 32-byte alignment can reduce the latency
-of `memcpy`.
-The test results are in microseconds.
-
-16-byte alignment:
-memcpy_10   memcpy_32   memcpy_64   memcpy_128  memcpy_256
-0.0028      0.0028      0.0028      0.0035      0.0063
-memcpy_512  memcpy_1k   memcpy_2k   memcpy_4k   memcpy_8k
-0.0122      0.0165      0.0315      0.0605      0.1251
-memcpy_10k  memcpy_16k  memcpy_32k  memcpy_64k  memcpy_128k
-0.1597      0.2458      0.512       1.024       2.048
-memcpy_256k memcpy_512k memcpy_1m   memcpy_2m   memcpy_4m
-4.096       7.936       16.8        33.152      66.72
-memcpy_8m   memcpy_10m
-132.096     165.12
-
-32-byte alignment:
-memcpy_10   memcpy_32   memcpy_64   memcpy_128  memcpy_256
-0.0028      0.0028      0.0028      0.0035      0.0058
-memcpy_512  memcpy_1k   memcpy_2k   memcpy_4k   memcpy_8k
-0.0096      0.0165      0.0315      0.0614      0.121
-memcpy_10k  memcpy_16k  memcpy_32k  memcpy_64k  memcpy_128k
-0.1515      0.2355      0.512       1.024       2.048
-memcpy_256k memcpy_512k memcpy_1m   memcpy_2m   memcpy_4m
-3.84        7.168       15.072      29.952      60.032
-memcpy_8m   memcpy_10m
-119.04      147.968
-
-No functional change.
-
-sysdeps/aarch64/multiarch/memcpy_sve.S: Change alignment shifting from 16
-bytes to 32 bytes.
----
- sysdeps/aarch64/multiarch/memcpy_sve.S | 30 +++++++++++++-------------
- 1 file changed, 15 insertions(+), 15 deletions(-)
-
-diff --git a/sysdeps/aarch64/multiarch/memcpy_sve.S b/sysdeps/aarch64/multiarch/memcpy_sve.S
-index 2f14f913..17c8859a 100644
---- a/sysdeps/aarch64/multiarch/memcpy_sve.S
-+++ b/sysdeps/aarch64/multiarch/memcpy_sve.S
-@@ -109,22 +109,22 @@ L(copy_long):
- 	add	srcend, src, count
- 	add	dstend, dstin, count
- 
--	/* Copy 16 bytes and then align src to 16-byte alignment. */
--	ldr	D_q, [src]
--	and	tmp1, src, 15
--	bic	src, src, 15
-+	/* Copy 32 bytes and then align src to 32-byte alignment. */
-+	ldp	A_q, B_q, [src]
-+	and	tmp1, src, 31
-+	bic	src, src, 31
- 	sub	dst, dstin, tmp1
--	add	count, count, tmp1	/* Count is now 16 too large. */
--	ldp	A_q, B_q, [src, 16]
--	str	D_q, [dstin]
--	ldp	C_q, D_q, [src, 48]
--	subs	count, count, 128 + 16	/* Test and readjust count. */
-+	add	count, count, tmp1	/* Count is now 32 too large. */
-+	stp	A_q, B_q, [dstin]
-+	ldp	A_q, B_q, [src, 32]
-+	ldp	C_q, D_q, [src, 64]
-+	subs	count, count, 128 + 32	/* Test and readjust count. */
- 	b.ls	L(copy64_from_end)
- L(loop64):
--	stp	A_q, B_q, [dst, 16]
--	ldp	A_q, B_q, [src, 80]
--	stp	C_q, D_q, [dst, 48]
--	ldp	C_q, D_q, [src, 112]
-+	stp	A_q, B_q, [dst, 32]
-+	ldp	A_q, B_q, [src, 96]
-+	stp	C_q, D_q, [dst, 64]
-+	ldp	C_q, D_q, [src, 128]
- 	add	src, src, 64
- 	add	dst, dst, 64
- 	subs	count, count, 64
-@@ -133,9 +133,9 @@ L(loop64):
- 	/* Write the last iteration and copy 64 bytes from the end. */
- L(copy64_from_end):
- 	ldp	E_q, F_q, [srcend, -64]
--	stp	A_q, B_q, [dst, 16]
-+	stp	A_q, B_q, [dst, 32]
- 	ldp	A_q, B_q, [srcend, -32]
--	stp	C_q, D_q, [dst, 48]
-+	stp	C_q, D_q, [dst, 64]
- 	stp	E_q, F_q, [dstend, -64]
- 	stp	A_q, B_q, [dstend, -32]
- 	ret
---
-2.33.0
-
diff --git a/glibc.spec b/glibc.spec
index d504bb5..057e7a5 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -67,7 +67,7 @@
 ##############################################################################
 Name: glibc
 Version: 2.38
-Release: 76
+Release: 77
 Summary: The GNU libc libraries
 License: %{all_license}
 URL: http://www.gnu.org/software/glibc/
@@ -415,9 +415,8 @@ Patch9038: Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch
 Patch9039: revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch
 Patch9040: backport-Fix-UB-on__dl_map_object_from_fd.patch
 Patch9041: backport-Fix-handling-of-symbol-versions-which-hash-to-zero.patch
-Patch9042: AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch
-Patch9043: 0024-Sw64-Change-libdir-from-lib-to-lib64.patch
-Patch9044: backport-x86-Disable-AVX-Fast-Unaligned-Load-on-Hygon-1-2-3.patch
+Patch9042: 0024-Sw64-Change-libdir-from-lib-to-lib64.patch
+Patch9043: backport-x86-Disable-AVX-Fast-Unaligned-Load-on-Hygon-1-2-3.patch
 
 Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
 
@@ -1603,6 +1602,9 @@ fi
 %endif
 
 %changelog
+* Fri Nov 07 2025 Long Wei - 2.38-77
+- aarch64: revert using 32-byte alignment in memcpy_sve
+
 * Wed Nov 05 2025 Qingqing Li - 2.38-76
 - x86: fix wmemset ifunc stray '!' (bug 33542)
 
--
Gitee
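
Note for reviewers: the C sketch below is illustrative only; it is not part of
either patch above. Assuming the usual memcpy contract (non-overlapping
buffers) and that copy_long is only entered for counts above 128, it mirrors
the pointer arithmetic of the copy_long prologue whose alignment the reverted
change moved from 16 to 32 bytes: copy one head block from the unaligned
source, round src down to the alignment boundary, shift dst back by the same
offset, and enlarge count so the bulk copy still ends at srcend. The names
copy_long_sketch and ALIGN are hypothetical.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define ALIGN 32   /* 32 in the reverted patch; 16 in the code this commit restores.  */

static void copy_long_sketch(char *dstin, const char *src, size_t count)
{
    /* Assumed precondition: count > 128, src/dst do not overlap.  */
    const char *srcend = src + count;
    char *dstend = dstin + count;

    /* Head: one ALIGN-byte block from the (possibly unaligned) start.  */
    memcpy(dstin, src, ALIGN);

    /* Align src down to ALIGN bytes; dst is shifted back by the same
       offset so src and dst keep their relative distance, and count is
       enlarged to stay relative to the aligned base.  */
    size_t tmp1 = (uintptr_t)src & (ALIGN - 1);
    src -= tmp1;
    char *dst = dstin - tmp1;
    count += tmp1;

    /* Bulk: 64-byte blocks from the aligned source, starting past the
       head block; stop while at least 64 bytes remain for the tail.  */
    size_t offset = ALIGN;
    while (count - offset > 64) {
        memcpy(dst + offset, src + offset, 64);
        offset += 64;
    }

    /* Tail: the last 64 bytes, addressed from the true end, cover
       whatever the bulk loop left over (some bytes may be re-copied).  */
    memcpy(dstend - 64, srcend - 64, 64);
}

int main(void)
{
    char in[512], out[512];
    for (int i = 0; i < 512; i++)
        in[i] = (char)i;
    /* A misaligned start exercises the head/align/bulk/tail paths.  */
    copy_long_sketch(out + 1, in + 3, 300);
    return memcmp(out + 1, in + 3, 300) != 0;
}

Setting ALIGN to 16 models the behaviour this commit restores; the only
differences are the size of the head block and how far count is over-adjusted
before the bulk copy.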