From cdab0e40b9c36e77db7f9c62440e55fa0e3527ae Mon Sep 17 00:00:00 2001 From: Piotr Caban Date: Tue, 26 Jan 2021 19:42:03 +0100 Subject: [PATCH] msvcrt: Improve memmove performance on i386 and x86_64 architectures. Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=49663 Signed-off-by: Piotr Caban Signed-off-by: Alexandre Julliard (cherry picked from commit 38c490496000c5852f14e9c022868c5107d9ff03) Signed-off-by: Michael Stefaniuc --- dlls/msvcrt/math.c | 2 +- dlls/msvcrt/msvcrt.h | 2 + dlls/msvcrt/string.c | 263 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 266 insertions(+), 1 deletion(-) diff --git a/dlls/msvcrt/math.c b/dlls/msvcrt/math.c index 4e1f7b24a9c..0b0b7adf797 100644 --- a/dlls/msvcrt/math.c +++ b/dlls/msvcrt/math.c @@ -64,7 +64,7 @@ typedef int (CDECL *MSVCRT_matherr_func)(struct _exception *); static MSVCRT_matherr_func MSVCRT_default_matherr_func = NULL; -static BOOL sse2_supported; +BOOL sse2_supported; static BOOL sse2_enabled; static const struct unix_funcs *unix_funcs; diff --git a/dlls/msvcrt/msvcrt.h b/dlls/msvcrt/msvcrt.h index d86056f4c05..c360943b1f0 100644 --- a/dlls/msvcrt/msvcrt.h +++ b/dlls/msvcrt/msvcrt.h @@ -32,6 +32,8 @@ #include "winbase.h" #undef strncpy +extern BOOL sse2_supported DECLSPEC_HIDDEN; + #define DBL80_MAX_10_EXP 4932 #define DBL80_MIN_10_EXP -4951 diff --git a/dlls/msvcrt/string.c b/dlls/msvcrt/string.c index 348c36f3ed5..d335fce4992 100644 --- a/dlls/msvcrt/string.c +++ b/dlls/msvcrt/string.c @@ -31,6 +31,7 @@ #include "msvcrt.h" #include "bnum.h" #include "winnls.h" +#include "wine/asm.h" #include "wine/debug.h" WINE_DEFAULT_DEBUG_CHANNEL(msvcrt); @@ -2470,6 +2471,259 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n) return 0; } +#if defined(__i386__) || defined(__x86_64__) + +#ifdef __i386__ + +#define DEST_REG "%edi" +#define SRC_REG "%esi" +#define LEN_REG "%ecx" +#define TMP_REG "%edx" + +#define MEMMOVE_INIT \ + "pushl " SRC_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + "pushl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \ + "movl 12(%esp), " DEST_REG "\n\t" \ + "movl 16(%esp), " SRC_REG "\n\t" \ + "movl 20(%esp), " LEN_REG "\n\t" + +#define MEMMOVE_CLEANUP \ + "movl 12(%esp), %eax\n\t" \ + "popl " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \ + "popl " SRC_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") + +#else + +#define DEST_REG "%rdi" +#define SRC_REG "%rsi" +#define LEN_REG "%r8" +#define TMP_REG "%r9" + +#define MEMMOVE_INIT \ + "pushq " SRC_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + "pushq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \ + "movq %rcx, " DEST_REG "\n\t" \ + "movq %rdx, " SRC_REG "\n\t" + +#define MEMMOVE_CLEANUP \ + "movq %rcx, %rax\n\t" \ + "popq " DEST_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \ + "popq " SRC_REG "\n\t" \ + __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") +#endif + +void * __cdecl sse2_memmove(void *dst, const void *src, size_t n); +__ASM_GLOBAL_FUNC( sse2_memmove, + MEMMOVE_INIT + "mov " DEST_REG ", " TMP_REG "\n\t" /* check copying direction */ + "sub " SRC_REG ", " TMP_REG "\n\t" + "cmp " LEN_REG ", " TMP_REG "\n\t" + "jb copy_bwd\n\t" + /* copy forwards */ + "cmp $4, " LEN_REG "\n\t" /* 4-bytes align */ + "jb copy_fwd3\n\t" + "mov " DEST_REG ", " TMP_REG "\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc 1f\n\t" + "movsb\n\t" + "dec " LEN_REG "\n\t" + "inc " TMP_REG "\n\t" + "1:\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc 1f\n\t" + "movsw\n\t" + "sub $2, " LEN_REG "\n\t" + "inc " TMP_REG "\n\t" + "1:\n\t" /* 16-bytes align */ + "cmp $16, " LEN_REG "\n\t" + "jb copy_fwd15\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc 1f\n\t" + "movsl\n\t" + "sub $4, " LEN_REG "\n\t" + "inc " TMP_REG "\n\t" + "1:\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc 1f\n\t" + "movsl\n\t" + "movsl\n\t" + "sub $8, " LEN_REG "\n\t" + "1:\n\t" + "cmp $64, " LEN_REG "\n\t" + "jb copy_fwd63\n\t" + "1:\n\t" /* copy 64-bytes blocks in loop, dest 16-bytes aligned */ + "movdqu 0x00(" SRC_REG "), %xmm0\n\t" + "movdqu 0x10(" SRC_REG "), %xmm1\n\t" + "movdqu 0x20(" SRC_REG "), %xmm2\n\t" + "movdqu 0x30(" SRC_REG "), %xmm3\n\t" + "movdqa %xmm0, 0x00(" DEST_REG ")\n\t" + "movdqa %xmm1, 0x10(" DEST_REG ")\n\t" + "movdqa %xmm2, 0x20(" DEST_REG ")\n\t" + "movdqa %xmm3, 0x30(" DEST_REG ")\n\t" + "add $64, " SRC_REG "\n\t" + "add $64, " DEST_REG "\n\t" + "sub $64, " LEN_REG "\n\t" + "cmp $64, " LEN_REG "\n\t" + "jae 1b\n\t" + "copy_fwd63:\n\t" /* copy last 63 bytes, dest 16-bytes aligned */ + "mov " LEN_REG ", " TMP_REG "\n\t" + "and $15, " LEN_REG "\n\t" + "shr $5, " TMP_REG "\n\t" + "jnc 1f\n\t" + "movdqu 0(" SRC_REG "), %xmm0\n\t" + "movdqa %xmm0, 0(" DEST_REG ")\n\t" + "add $16, " SRC_REG "\n\t" + "add $16, " DEST_REG "\n\t" + "1:\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc copy_fwd15\n\t" + "movdqu 0x00(" SRC_REG "), %xmm0\n\t" + "movdqu 0x10(" SRC_REG "), %xmm1\n\t" + "movdqa %xmm0, 0x00(" DEST_REG ")\n\t" + "movdqa %xmm1, 0x10(" DEST_REG ")\n\t" + "add $32, " SRC_REG "\n\t" + "add $32, " DEST_REG "\n\t" + "copy_fwd15:\n\t" /* copy last 15 bytes, dest 4-bytes aligned */ + "mov " LEN_REG ", " TMP_REG "\n\t" + "and $3, " LEN_REG "\n\t" + "shr $3, " TMP_REG "\n\t" + "jnc 1f\n\t" + "movsl\n\t" + "1:\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc copy_fwd3\n\t" + "movsl\n\t" + "movsl\n\t" + "copy_fwd3:\n\t" /* copy last 3 bytes */ + "shr $1, " LEN_REG "\n\t" + "jnc 1f\n\t" + "movsb\n\t" + "1:\n\t" + "shr $1, " LEN_REG "\n\t" + "jnc 1f\n\t" + "movsw\n\t" + "1:\n\t" + MEMMOVE_CLEANUP + "ret\n\t" + "copy_bwd:\n\t" + "lea (" DEST_REG ", " LEN_REG "), " DEST_REG "\n\t" + "lea (" SRC_REG ", " LEN_REG "), " SRC_REG "\n\t" + "cmp $4, " LEN_REG "\n\t" /* 4-bytes align */ + "jb copy_bwd3\n\t" + "mov " DEST_REG ", " TMP_REG "\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc 1f\n\t" + "dec " SRC_REG "\n\t" + "dec " DEST_REG "\n\t" + "movb (" SRC_REG "), %al\n\t" + "movb %al, (" DEST_REG ")\n\t" + "dec " LEN_REG "\n\t" + "1:\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc 1f\n\t" + "sub $2, " SRC_REG "\n\t" + "sub $2, " DEST_REG "\n\t" + "movw (" SRC_REG "), %ax\n\t" + "movw %ax, (" DEST_REG ")\n\t" + "sub $2, " LEN_REG "\n\t" + "1:\n\t" /* 16-bytes align */ + "cmp $16, " LEN_REG "\n\t" + "jb copy_bwd15\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc 1f\n\t" + "sub $4, " SRC_REG "\n\t" + "sub $4, " DEST_REG "\n\t" + "movl (" SRC_REG "), %eax\n\t" + "movl %eax, (" DEST_REG ")\n\t" + "sub $4, " LEN_REG "\n\t" + "1:\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc 1f\n\t" + "sub $8, " SRC_REG "\n\t" + "sub $8, " DEST_REG "\n\t" + "movl 4(" SRC_REG "), %eax\n\t" + "movl %eax, 4(" DEST_REG ")\n\t" + "movl (" SRC_REG "), %eax\n\t" + "movl %eax, (" DEST_REG ")\n\t" + "sub $8, " LEN_REG "\n\t" + "1:\n\t" + "cmp $64, " LEN_REG "\n\t" + "jb copy_bwd63\n\t" + "1:\n\t" /* copy 64-bytes blocks in loop, dest 16-bytes aligned */ + "sub $64, " SRC_REG "\n\t" + "sub $64, " DEST_REG "\n\t" + "movdqu 0x00(" SRC_REG "), %xmm0\n\t" + "movdqu 0x10(" SRC_REG "), %xmm1\n\t" + "movdqu 0x20(" SRC_REG "), %xmm2\n\t" + "movdqu 0x30(" SRC_REG "), %xmm3\n\t" + "movdqa %xmm0, 0x00(" DEST_REG ")\n\t" + "movdqa %xmm1, 0x10(" DEST_REG ")\n\t" + "movdqa %xmm2, 0x20(" DEST_REG ")\n\t" + "movdqa %xmm3, 0x30(" DEST_REG ")\n\t" + "sub $64, " LEN_REG "\n\t" + "cmp $64, " LEN_REG "\n\t" + "jae 1b\n\t" + "copy_bwd63:\n\t" /* copy last 63 bytes, dest 16-bytes aligned */ + "mov " LEN_REG ", " TMP_REG "\n\t" + "and $15, " LEN_REG "\n\t" + "shr $5, " TMP_REG "\n\t" + "jnc 1f\n\t" + "sub $16, " SRC_REG "\n\t" + "sub $16, " DEST_REG "\n\t" + "movdqu (" SRC_REG "), %xmm0\n\t" + "movdqa %xmm0, (" DEST_REG ")\n\t" + "1:\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc copy_bwd15\n\t" + "sub $32, " SRC_REG "\n\t" + "sub $32, " DEST_REG "\n\t" + "movdqu 0x00(" SRC_REG "), %xmm0\n\t" + "movdqu 0x10(" SRC_REG "), %xmm1\n\t" + "movdqa %xmm0, 0x00(" DEST_REG ")\n\t" + "movdqa %xmm1, 0x10(" DEST_REG ")\n\t" + "copy_bwd15:\n\t" /* copy last 15 bytes, dest 4-bytes aligned */ + "mov " LEN_REG ", " TMP_REG "\n\t" + "and $3, " LEN_REG "\n\t" + "shr $3, " TMP_REG "\n\t" + "jnc 1f\n\t" + "sub $4, " SRC_REG "\n\t" + "sub $4, " DEST_REG "\n\t" + "movl (" SRC_REG "), %eax\n\t" + "movl %eax, (" DEST_REG ")\n\t" + "1:\n\t" + "shr $1, " TMP_REG "\n\t" + "jnc copy_bwd3\n\t" + "sub $8, " SRC_REG "\n\t" + "sub $8, " DEST_REG "\n\t" + "movl 4(" SRC_REG "), %eax\n\t" + "movl %eax, 4(" DEST_REG ")\n\t" + "movl (" SRC_REG "), %eax\n\t" + "movl %eax, (" DEST_REG ")\n\t" + "copy_bwd3:\n\t" /* copy last 3 bytes */ + "shr $1, " LEN_REG "\n\t" + "jnc 1f\n\t" + "dec " SRC_REG "\n\t" + "dec " DEST_REG "\n\t" + "movb (" SRC_REG "), %al\n\t" + "movb %al, (" DEST_REG ")\n\t" + "1:\n\t" + "shr $1, " LEN_REG "\n\t" + "jnc 1f\n\t" + "movw -2(" SRC_REG "), %ax\n\t" + "movw %ax, -2(" DEST_REG ")\n\t" + "1:\n\t" + MEMMOVE_CLEANUP + "ret" ) + +#endif + /********************************************************************* * memmove (MSVCRT.@) */ @@ -2480,10 +2734,18 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n) #endif void * __cdecl memmove(void *dst, const void *src, size_t n) { +#ifdef __x86_64__ + return sse2_memmove(dst, src, n); +#else unsigned char *d = dst; const unsigned char *s = src; int sh1; +#ifdef __i386__ + if (sse2_supported) + return sse2_memmove(dst, src, n); +#endif + if (!n) return dst; if ((size_t)dst - (size_t)src >= n) @@ -2571,6 +2833,7 @@ void * __cdecl memmove(void *dst, const void *src, size_t n) while (n--) *--d = *--s; } return dst; +#endif } #undef MERGE