msvcrt: Improve memmove performance on i386 and x86_64 architectures.

Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=49663
Signed-off-by: Piotr Caban <piotr@codeweavers.com>
Signed-off-by: Alexandre Julliard <julliard@winehq.org>
Author: Piotr Caban, 2021-01-26 19:42:03 +01:00; committed by Alexandre Julliard
parent ad6a3e7534
commit 38c4904960
3 changed files with 266 additions and 1 deletion


@@ -64,7 +64,7 @@ typedef int (CDECL *MSVCRT_matherr_func)(struct _exception *);
static MSVCRT_matherr_func MSVCRT_default_matherr_func = NULL;
-static BOOL sse2_supported;
+BOOL sse2_supported;
static BOOL sse2_enabled;
static const struct unix_funcs *unix_funcs;


@@ -32,6 +32,8 @@
#include "winbase.h"
#undef strncpy
extern BOOL sse2_supported DECLSPEC_HIDDEN;
#define DBL80_MAX_10_EXP 4932
#define DBL80_MIN_10_EXP -4951


@@ -31,6 +31,7 @@
#include "msvcrt.h"
#include "bnum.h"
#include "winnls.h"
#include "wine/asm.h"
#include "wine/debug.h"
WINE_DEFAULT_DEBUG_CHANNEL(msvcrt);
@@ -2470,6 +2471,259 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
return 0;
}
#if defined(__i386__) || defined(__x86_64__)
#ifdef __i386__
#define DEST_REG "%edi"
#define SRC_REG "%esi"
#define LEN_REG "%ecx"
#define TMP_REG "%edx"
#define MEMMOVE_INIT \
"pushl " SRC_REG "\n\t" \
__ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
"pushl " DEST_REG "\n\t" \
__ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
"movl 12(%esp), " DEST_REG "\n\t" \
"movl 16(%esp), " SRC_REG "\n\t" \
"movl 20(%esp), " LEN_REG "\n\t"
#define MEMMOVE_CLEANUP \
"movl 12(%esp), %eax\n\t" \
"popl " DEST_REG "\n\t" \
__ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
"popl " SRC_REG "\n\t" \
__ASM_CFI(".cfi_adjust_cfa_offset -4\n\t")
#else
#define DEST_REG "%rdi"
#define SRC_REG "%rsi"
#define LEN_REG "%r8"
#define TMP_REG "%r9"
#define MEMMOVE_INIT \
"pushq " SRC_REG "\n\t" \
__ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
"pushq " DEST_REG "\n\t" \
__ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
"movq %rcx, " DEST_REG "\n\t" \
"movq %rdx, " SRC_REG "\n\t"
#define MEMMOVE_CLEANUP \
"movq %rcx, %rax\n\t" \
"popq " DEST_REG "\n\t" \
__ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
"popq " SRC_REG "\n\t" \
__ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
#endif
void * __cdecl sse2_memmove(void *dst, const void *src, size_t n);
__ASM_GLOBAL_FUNC( sse2_memmove,
MEMMOVE_INIT
"mov " DEST_REG ", " TMP_REG "\n\t" /* check copying direction */
"sub " SRC_REG ", " TMP_REG "\n\t"
"cmp " LEN_REG ", " TMP_REG "\n\t"
"jb copy_bwd\n\t"
/* copy forwards */
"cmp $4, " LEN_REG "\n\t" /* 4-bytes align */
"jb copy_fwd3\n\t"
"mov " DEST_REG ", " TMP_REG "\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc 1f\n\t"
"movsb\n\t"
"dec " LEN_REG "\n\t"
"inc " TMP_REG "\n\t"
"1:\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc 1f\n\t"
"movsw\n\t"
"sub $2, " LEN_REG "\n\t"
"inc " TMP_REG "\n\t"
"1:\n\t" /* 16-bytes align */
"cmp $16, " LEN_REG "\n\t"
"jb copy_fwd15\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc 1f\n\t"
"movsl\n\t"
"sub $4, " LEN_REG "\n\t"
"inc " TMP_REG "\n\t"
"1:\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc 1f\n\t"
"movsl\n\t"
"movsl\n\t"
"sub $8, " LEN_REG "\n\t"
"1:\n\t"
"cmp $64, " LEN_REG "\n\t"
"jb copy_fwd63\n\t"
"1:\n\t" /* copy 64-bytes blocks in loop, dest 16-bytes aligned */
"movdqu 0x00(" SRC_REG "), %xmm0\n\t"
"movdqu 0x10(" SRC_REG "), %xmm1\n\t"
"movdqu 0x20(" SRC_REG "), %xmm2\n\t"
"movdqu 0x30(" SRC_REG "), %xmm3\n\t"
"movdqa %xmm0, 0x00(" DEST_REG ")\n\t"
"movdqa %xmm1, 0x10(" DEST_REG ")\n\t"
"movdqa %xmm2, 0x20(" DEST_REG ")\n\t"
"movdqa %xmm3, 0x30(" DEST_REG ")\n\t"
"add $64, " SRC_REG "\n\t"
"add $64, " DEST_REG "\n\t"
"sub $64, " LEN_REG "\n\t"
"cmp $64, " LEN_REG "\n\t"
"jae 1b\n\t"
"copy_fwd63:\n\t" /* copy last 63 bytes, dest 16-bytes aligned */
"mov " LEN_REG ", " TMP_REG "\n\t"
"and $15, " LEN_REG "\n\t"
"shr $5, " TMP_REG "\n\t"
"jnc 1f\n\t"
"movdqu 0(" SRC_REG "), %xmm0\n\t"
"movdqa %xmm0, 0(" DEST_REG ")\n\t"
"add $16, " SRC_REG "\n\t"
"add $16, " DEST_REG "\n\t"
"1:\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc copy_fwd15\n\t"
"movdqu 0x00(" SRC_REG "), %xmm0\n\t"
"movdqu 0x10(" SRC_REG "), %xmm1\n\t"
"movdqa %xmm0, 0x00(" DEST_REG ")\n\t"
"movdqa %xmm1, 0x10(" DEST_REG ")\n\t"
"add $32, " SRC_REG "\n\t"
"add $32, " DEST_REG "\n\t"
"copy_fwd15:\n\t" /* copy last 15 bytes, dest 4-bytes aligned */
"mov " LEN_REG ", " TMP_REG "\n\t"
"and $3, " LEN_REG "\n\t"
"shr $3, " TMP_REG "\n\t"
"jnc 1f\n\t"
"movsl\n\t"
"1:\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc copy_fwd3\n\t"
"movsl\n\t"
"movsl\n\t"
"copy_fwd3:\n\t" /* copy last 3 bytes */
"shr $1, " LEN_REG "\n\t"
"jnc 1f\n\t"
"movsb\n\t"
"1:\n\t"
"shr $1, " LEN_REG "\n\t"
"jnc 1f\n\t"
"movsw\n\t"
"1:\n\t"
MEMMOVE_CLEANUP
"ret\n\t"
"copy_bwd:\n\t"
"lea (" DEST_REG ", " LEN_REG "), " DEST_REG "\n\t"
"lea (" SRC_REG ", " LEN_REG "), " SRC_REG "\n\t"
"cmp $4, " LEN_REG "\n\t" /* 4-bytes align */
"jb copy_bwd3\n\t"
"mov " DEST_REG ", " TMP_REG "\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc 1f\n\t"
"dec " SRC_REG "\n\t"
"dec " DEST_REG "\n\t"
"movb (" SRC_REG "), %al\n\t"
"movb %al, (" DEST_REG ")\n\t"
"dec " LEN_REG "\n\t"
"1:\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc 1f\n\t"
"sub $2, " SRC_REG "\n\t"
"sub $2, " DEST_REG "\n\t"
"movw (" SRC_REG "), %ax\n\t"
"movw %ax, (" DEST_REG ")\n\t"
"sub $2, " LEN_REG "\n\t"
"1:\n\t" /* 16-bytes align */
"cmp $16, " LEN_REG "\n\t"
"jb copy_bwd15\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc 1f\n\t"
"sub $4, " SRC_REG "\n\t"
"sub $4, " DEST_REG "\n\t"
"movl (" SRC_REG "), %eax\n\t"
"movl %eax, (" DEST_REG ")\n\t"
"sub $4, " LEN_REG "\n\t"
"1:\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc 1f\n\t"
"sub $8, " SRC_REG "\n\t"
"sub $8, " DEST_REG "\n\t"
"movl 4(" SRC_REG "), %eax\n\t"
"movl %eax, 4(" DEST_REG ")\n\t"
"movl (" SRC_REG "), %eax\n\t"
"movl %eax, (" DEST_REG ")\n\t"
"sub $8, " LEN_REG "\n\t"
"1:\n\t"
"cmp $64, " LEN_REG "\n\t"
"jb copy_bwd63\n\t"
"1:\n\t" /* copy 64-bytes blocks in loop, dest 16-bytes aligned */
"sub $64, " SRC_REG "\n\t"
"sub $64, " DEST_REG "\n\t"
"movdqu 0x00(" SRC_REG "), %xmm0\n\t"
"movdqu 0x10(" SRC_REG "), %xmm1\n\t"
"movdqu 0x20(" SRC_REG "), %xmm2\n\t"
"movdqu 0x30(" SRC_REG "), %xmm3\n\t"
"movdqa %xmm0, 0x00(" DEST_REG ")\n\t"
"movdqa %xmm1, 0x10(" DEST_REG ")\n\t"
"movdqa %xmm2, 0x20(" DEST_REG ")\n\t"
"movdqa %xmm3, 0x30(" DEST_REG ")\n\t"
"sub $64, " LEN_REG "\n\t"
"cmp $64, " LEN_REG "\n\t"
"jae 1b\n\t"
"copy_bwd63:\n\t" /* copy last 63 bytes, dest 16-bytes aligned */
"mov " LEN_REG ", " TMP_REG "\n\t"
"and $15, " LEN_REG "\n\t"
"shr $5, " TMP_REG "\n\t"
"jnc 1f\n\t"
"sub $16, " SRC_REG "\n\t"
"sub $16, " DEST_REG "\n\t"
"movdqu (" SRC_REG "), %xmm0\n\t"
"movdqa %xmm0, (" DEST_REG ")\n\t"
"1:\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc copy_bwd15\n\t"
"sub $32, " SRC_REG "\n\t"
"sub $32, " DEST_REG "\n\t"
"movdqu 0x00(" SRC_REG "), %xmm0\n\t"
"movdqu 0x10(" SRC_REG "), %xmm1\n\t"
"movdqa %xmm0, 0x00(" DEST_REG ")\n\t"
"movdqa %xmm1, 0x10(" DEST_REG ")\n\t"
"copy_bwd15:\n\t" /* copy last 15 bytes, dest 4-bytes aligned */
"mov " LEN_REG ", " TMP_REG "\n\t"
"and $3, " LEN_REG "\n\t"
"shr $3, " TMP_REG "\n\t"
"jnc 1f\n\t"
"sub $4, " SRC_REG "\n\t"
"sub $4, " DEST_REG "\n\t"
"movl (" SRC_REG "), %eax\n\t"
"movl %eax, (" DEST_REG ")\n\t"
"1:\n\t"
"shr $1, " TMP_REG "\n\t"
"jnc copy_bwd3\n\t"
"sub $8, " SRC_REG "\n\t"
"sub $8, " DEST_REG "\n\t"
"movl 4(" SRC_REG "), %eax\n\t"
"movl %eax, 4(" DEST_REG ")\n\t"
"movl (" SRC_REG "), %eax\n\t"
"movl %eax, (" DEST_REG ")\n\t"
"copy_bwd3:\n\t" /* copy last 3 bytes */
"shr $1, " LEN_REG "\n\t"
"jnc 1f\n\t"
"dec " SRC_REG "\n\t"
"dec " DEST_REG "\n\t"
"movb (" SRC_REG "), %al\n\t"
"movb %al, (" DEST_REG ")\n\t"
"1:\n\t"
"shr $1, " LEN_REG "\n\t"
"jnc 1f\n\t"
"movw -2(" SRC_REG "), %ax\n\t"
"movw %ax, -2(" DEST_REG ")\n\t"
"1:\n\t"
MEMMOVE_CLEANUP
"ret" )
#endif
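
For readers less familiar with the inline assembly above, a rough C-with-SSE2-intrinsics sketch of its forward-copy path follows. This is illustrative only and not part of the commit; the function name is invented, and the byte-at-a-time alignment and tail loops simplify the 1/2/4/8/16-byte stepping the assembly actually performs.

    #include <emmintrin.h>
    #include <stddef.h>

    /* Sketch of sse2_memmove's forward path: align the destination to 16 bytes,
     * copy 64-byte blocks with unaligned loads and aligned stores, then finish
     * the tail.  Only the overall structure of the assembly is kept. */
    static void *sketch_memmove_fwd(void *dst, const void *src, size_t n)
    {
        unsigned char *d = dst;
        const unsigned char *s = src;

        /* Align the destination to a 16-byte boundary. */
        while (n >= 16 && ((size_t)d & 15)) { *d++ = *s++; n--; }

        /* Main loop: 64 bytes per iteration, destination 16-byte aligned. */
        while (n >= 64)
        {
            __m128i x0 = _mm_loadu_si128((const __m128i *)(s + 0x00));
            __m128i x1 = _mm_loadu_si128((const __m128i *)(s + 0x10));
            __m128i x2 = _mm_loadu_si128((const __m128i *)(s + 0x20));
            __m128i x3 = _mm_loadu_si128((const __m128i *)(s + 0x30));
            _mm_store_si128((__m128i *)(d + 0x00), x0);
            _mm_store_si128((__m128i *)(d + 0x10), x1);
            _mm_store_si128((__m128i *)(d + 0x20), x2);
            _mm_store_si128((__m128i *)(d + 0x30), x3);
            d += 64; s += 64; n -= 64;
        }

        /* Remaining bytes (< 64); a plain forward byte copy is safe here. */
        while (n--) *d++ = *s++;
        return dst;
    }
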
/*********************************************************************
* memmove (MSVCRT.@)
*/
@@ -2480,10 +2734,18 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
#endif
void * __cdecl memmove(void *dst, const void *src, size_t n)
{
#ifdef __x86_64__
return sse2_memmove(dst, src, n);
#else
unsigned char *d = dst;
const unsigned char *s = src;
int sh1;
#ifdef __i386__
if (sse2_supported)
return sse2_memmove(dst, src, n);
#endif
if (!n) return dst;
if ((size_t)dst - (size_t)src >= n)
@@ -2571,6 +2833,7 @@ void * __cdecl memmove(void *dst, const void *src, size_t n)
while (n--) *--d = *--s;
}
return dst;
#endif
}
#undef MERGE
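
A note on the direction check shared by the assembly and the C fallback: the "mov/sub/cmp/jb copy_bwd" sequence at the top of sse2_memmove and the C test (size_t)dst - (size_t)src >= n are the same unsigned trick. The difference dst - src, computed modulo the address space, is smaller than n exactly when dst lies inside [src, src + n), i.e. when a forward copy would overwrite source bytes before they are read; in every other case the forward path is safe. A minimal, self-contained illustration (buffer and offsets are made up):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* True when a forward (low-to-high) copy cannot clobber unread source
     * bytes; mirrors the test used by sse2_memmove and the C fallback. */
    static bool forward_copy_safe(const void *dst, const void *src, size_t n)
    {
        return (size_t)dst - (size_t)src >= n;
    }

    int main(void)
    {
        char buf[32];

        printf("%d\n", forward_copy_safe(buf, buf + 8, 8));   /* 1: dst below src */
        printf("%d\n", forward_copy_safe(buf + 16, buf, 8));  /* 1: no overlap */
        printf("%d\n", forward_copy_safe(buf + 4, buf, 8));   /* 0: dst inside src..src+8 */
        return 0;
    }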