msvcrt: Improve memmove performance on i386 and x86_64 architectures.
Wine-Bug: https://bugs.winehq.org/show_bug.cgi?id=49663
Signed-off-by: Piotr Caban <piotr@codeweavers.com>
Signed-off-by: Alexandre Julliard <julliard@winehq.org>
This commit is contained in:
parent
ad6a3e7534
commit
38c4904960
|
@@ -64,7 +64,7 @@ typedef int (CDECL *MSVCRT_matherr_func)(struct _exception *);
|
|||
|
||||
static MSVCRT_matherr_func MSVCRT_default_matherr_func = NULL;
|
||||
|
||||
static BOOL sse2_supported;
|
||||
BOOL sse2_supported;
|
||||
static BOOL sse2_enabled;
|
||||
|
||||
static const struct unix_funcs *unix_funcs;
|
||||
|
|
|
@@ -32,6 +32,8 @@
|
|||
#include "winbase.h"
|
||||
#undef strncpy
|
||||
|
||||
/* SSE2 availability flag, defined without `static` in another msvcrt source
 * file so it can be shared; memmove() tests it on i386 before dispatching
 * to sse2_memmove(). */
extern BOOL sse2_supported DECLSPEC_HIDDEN;
|
||||
|
||||
#define DBL80_MAX_10_EXP 4932
|
||||
#define DBL80_MIN_10_EXP -4951
|
||||
|
||||
|
|
|
@@ -31,6 +31,7 @@
|
|||
#include "msvcrt.h"
|
||||
#include "bnum.h"
|
||||
#include "winnls.h"
|
||||
#include "wine/asm.h"
|
||||
#include "wine/debug.h"
|
||||
|
||||
WINE_DEFAULT_DEBUG_CHANNEL(msvcrt);
|
||||
|
@@ -2470,6 +2471,259 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
|
|||
return 0;
|
||||
}
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
|
||||
#ifdef __i386__
|
||||
|
||||
/* i386 register roles used by sse2_memmove: %edi = destination pointer,
 * %esi = source pointer (the pair implicitly used by movsb/movsw/movsl),
 * %ecx = remaining byte count, %edx = scratch. */
#define DEST_REG "%edi"
|
||||
#define SRC_REG "%esi"
|
||||
#define LEN_REG "%ecx"
|
||||
#define TMP_REG "%edx"
|
||||
|
||||
/* i386 prologue: save %esi and %edi on the stack (adjusting the CFA offset
 * by 4 for each push), then load the three stack arguments into the working
 * registers.  After the two pushes the arguments are read from 12(%esp)
 * (dst), 16(%esp) (src) and 20(%esp) (n). */
#define MEMMOVE_INIT \
|
||||
"pushl " SRC_REG "\n\t" \
|
||||
__ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
|
||||
"pushl " DEST_REG "\n\t" \
|
||||
__ASM_CFI(".cfi_adjust_cfa_offset 4\n\t") \
|
||||
"movl 12(%esp), " DEST_REG "\n\t" \
|
||||
"movl 16(%esp), " SRC_REG "\n\t" \
|
||||
"movl 20(%esp), " LEN_REG "\n\t"
|
||||
|
||||
/* i386 epilogue: reload the original dst argument from 12(%esp) into %eax
 * as the return value (the working copy in %edi has been advanced by the
 * copy), then pop %edi and %esi in reverse push order, undoing the CFA
 * adjustments made by MEMMOVE_INIT.  Does not emit the `ret` itself. */
#define MEMMOVE_CLEANUP \
|
||||
"movl 12(%esp), %eax\n\t" \
|
||||
"popl " DEST_REG "\n\t" \
|
||||
__ASM_CFI(".cfi_adjust_cfa_offset -4\n\t") \
|
||||
"popl " SRC_REG "\n\t" \
|
||||
__ASM_CFI(".cfi_adjust_cfa_offset -4\n\t")
|
||||
|
||||
#else
|
||||
|
||||
/* x86_64 register roles used by sse2_memmove: %rdi = destination pointer,
 * %rsi = source pointer (the pair implicitly used by movsb/movsw/movsl),
 * %r8 = remaining byte count (used in place, never reloaded), %r9 = scratch. */
#define DEST_REG "%rdi"
|
||||
#define SRC_REG "%rsi"
|
||||
#define LEN_REG "%r8"
|
||||
#define TMP_REG "%r9"
|
||||
|
||||
/* x86_64 prologue: save %rsi and %rdi on the stack (adjusting the CFA offset
 * by 8 for each push), then copy dst from %rcx into %rdi and src from %rdx
 * into %rsi.  The length is used directly from %r8 (LEN_REG), so it needs no
 * move.  %rcx itself is left untouched and still holds dst at cleanup time. */
#define MEMMOVE_INIT \
|
||||
"pushq " SRC_REG "\n\t" \
|
||||
__ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
|
||||
"pushq " DEST_REG "\n\t" \
|
||||
__ASM_CFI(".cfi_adjust_cfa_offset 8\n\t") \
|
||||
"movq %rcx, " DEST_REG "\n\t" \
|
||||
"movq %rdx, " SRC_REG "\n\t"
|
||||
|
||||
/* x86_64 epilogue: return the original dst, which is still in %rcx because
 * the copy code only modifies the working registers, then pop %rdi and %rsi
 * in reverse push order, undoing the CFA adjustments made by MEMMOVE_INIT.
 * Does not emit the `ret` itself. */
#define MEMMOVE_CLEANUP \
|
||||
"movq %rcx, %rax\n\t" \
|
||||
"popq " DEST_REG "\n\t" \
|
||||
__ASM_CFI(".cfi_adjust_cfa_offset -8\n\t") \
|
||||
"popq " SRC_REG "\n\t" \
|
||||
__ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
|
||||
#endif
|
||||
|
||||
void * __cdecl sse2_memmove(void *dst, const void *src, size_t n);
|
||||
__ASM_GLOBAL_FUNC( sse2_memmove,
|
||||
MEMMOVE_INIT
|
||||
"mov " DEST_REG ", " TMP_REG "\n\t" /* check copying direction */
|
||||
"sub " SRC_REG ", " TMP_REG "\n\t"
|
||||
"cmp " LEN_REG ", " TMP_REG "\n\t"
|
||||
"jb copy_bwd\n\t"
|
||||
/* copy forwards */
|
||||
"cmp $4, " LEN_REG "\n\t" /* 4-bytes align */
|
||||
"jb copy_fwd3\n\t"
|
||||
"mov " DEST_REG ", " TMP_REG "\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"movsb\n\t"
|
||||
"dec " LEN_REG "\n\t"
|
||||
"inc " TMP_REG "\n\t"
|
||||
"1:\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"movsw\n\t"
|
||||
"sub $2, " LEN_REG "\n\t"
|
||||
"inc " TMP_REG "\n\t"
|
||||
"1:\n\t" /* 16-bytes align */
|
||||
"cmp $16, " LEN_REG "\n\t"
|
||||
"jb copy_fwd15\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"movsl\n\t"
|
||||
"sub $4, " LEN_REG "\n\t"
|
||||
"inc " TMP_REG "\n\t"
|
||||
"1:\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"movsl\n\t"
|
||||
"movsl\n\t"
|
||||
"sub $8, " LEN_REG "\n\t"
|
||||
"1:\n\t"
|
||||
"cmp $64, " LEN_REG "\n\t"
|
||||
"jb copy_fwd63\n\t"
|
||||
"1:\n\t" /* copy 64-bytes blocks in loop, dest 16-bytes aligned */
|
||||
"movdqu 0x00(" SRC_REG "), %xmm0\n\t"
|
||||
"movdqu 0x10(" SRC_REG "), %xmm1\n\t"
|
||||
"movdqu 0x20(" SRC_REG "), %xmm2\n\t"
|
||||
"movdqu 0x30(" SRC_REG "), %xmm3\n\t"
|
||||
"movdqa %xmm0, 0x00(" DEST_REG ")\n\t"
|
||||
"movdqa %xmm1, 0x10(" DEST_REG ")\n\t"
|
||||
"movdqa %xmm2, 0x20(" DEST_REG ")\n\t"
|
||||
"movdqa %xmm3, 0x30(" DEST_REG ")\n\t"
|
||||
"add $64, " SRC_REG "\n\t"
|
||||
"add $64, " DEST_REG "\n\t"
|
||||
"sub $64, " LEN_REG "\n\t"
|
||||
"cmp $64, " LEN_REG "\n\t"
|
||||
"jae 1b\n\t"
|
||||
"copy_fwd63:\n\t" /* copy last 63 bytes, dest 16-bytes aligned */
|
||||
"mov " LEN_REG ", " TMP_REG "\n\t"
|
||||
"and $15, " LEN_REG "\n\t"
|
||||
"shr $5, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"movdqu 0(" SRC_REG "), %xmm0\n\t"
|
||||
"movdqa %xmm0, 0(" DEST_REG ")\n\t"
|
||||
"add $16, " SRC_REG "\n\t"
|
||||
"add $16, " DEST_REG "\n\t"
|
||||
"1:\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc copy_fwd15\n\t"
|
||||
"movdqu 0x00(" SRC_REG "), %xmm0\n\t"
|
||||
"movdqu 0x10(" SRC_REG "), %xmm1\n\t"
|
||||
"movdqa %xmm0, 0x00(" DEST_REG ")\n\t"
|
||||
"movdqa %xmm1, 0x10(" DEST_REG ")\n\t"
|
||||
"add $32, " SRC_REG "\n\t"
|
||||
"add $32, " DEST_REG "\n\t"
|
||||
"copy_fwd15:\n\t" /* copy last 15 bytes, dest 4-bytes aligned */
|
||||
"mov " LEN_REG ", " TMP_REG "\n\t"
|
||||
"and $3, " LEN_REG "\n\t"
|
||||
"shr $3, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"movsl\n\t"
|
||||
"1:\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc copy_fwd3\n\t"
|
||||
"movsl\n\t"
|
||||
"movsl\n\t"
|
||||
"copy_fwd3:\n\t" /* copy last 3 bytes */
|
||||
"shr $1, " LEN_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"movsb\n\t"
|
||||
"1:\n\t"
|
||||
"shr $1, " LEN_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"movsw\n\t"
|
||||
"1:\n\t"
|
||||
MEMMOVE_CLEANUP
|
||||
"ret\n\t"
|
||||
"copy_bwd:\n\t"
|
||||
"lea (" DEST_REG ", " LEN_REG "), " DEST_REG "\n\t"
|
||||
"lea (" SRC_REG ", " LEN_REG "), " SRC_REG "\n\t"
|
||||
"cmp $4, " LEN_REG "\n\t" /* 4-bytes align */
|
||||
"jb copy_bwd3\n\t"
|
||||
"mov " DEST_REG ", " TMP_REG "\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"dec " SRC_REG "\n\t"
|
||||
"dec " DEST_REG "\n\t"
|
||||
"movb (" SRC_REG "), %al\n\t"
|
||||
"movb %al, (" DEST_REG ")\n\t"
|
||||
"dec " LEN_REG "\n\t"
|
||||
"1:\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"sub $2, " SRC_REG "\n\t"
|
||||
"sub $2, " DEST_REG "\n\t"
|
||||
"movw (" SRC_REG "), %ax\n\t"
|
||||
"movw %ax, (" DEST_REG ")\n\t"
|
||||
"sub $2, " LEN_REG "\n\t"
|
||||
"1:\n\t" /* 16-bytes align */
|
||||
"cmp $16, " LEN_REG "\n\t"
|
||||
"jb copy_bwd15\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"sub $4, " SRC_REG "\n\t"
|
||||
"sub $4, " DEST_REG "\n\t"
|
||||
"movl (" SRC_REG "), %eax\n\t"
|
||||
"movl %eax, (" DEST_REG ")\n\t"
|
||||
"sub $4, " LEN_REG "\n\t"
|
||||
"1:\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"sub $8, " SRC_REG "\n\t"
|
||||
"sub $8, " DEST_REG "\n\t"
|
||||
"movl 4(" SRC_REG "), %eax\n\t"
|
||||
"movl %eax, 4(" DEST_REG ")\n\t"
|
||||
"movl (" SRC_REG "), %eax\n\t"
|
||||
"movl %eax, (" DEST_REG ")\n\t"
|
||||
"sub $8, " LEN_REG "\n\t"
|
||||
"1:\n\t"
|
||||
"cmp $64, " LEN_REG "\n\t"
|
||||
"jb copy_bwd63\n\t"
|
||||
"1:\n\t" /* copy 64-bytes blocks in loop, dest 16-bytes aligned */
|
||||
"sub $64, " SRC_REG "\n\t"
|
||||
"sub $64, " DEST_REG "\n\t"
|
||||
"movdqu 0x00(" SRC_REG "), %xmm0\n\t"
|
||||
"movdqu 0x10(" SRC_REG "), %xmm1\n\t"
|
||||
"movdqu 0x20(" SRC_REG "), %xmm2\n\t"
|
||||
"movdqu 0x30(" SRC_REG "), %xmm3\n\t"
|
||||
"movdqa %xmm0, 0x00(" DEST_REG ")\n\t"
|
||||
"movdqa %xmm1, 0x10(" DEST_REG ")\n\t"
|
||||
"movdqa %xmm2, 0x20(" DEST_REG ")\n\t"
|
||||
"movdqa %xmm3, 0x30(" DEST_REG ")\n\t"
|
||||
"sub $64, " LEN_REG "\n\t"
|
||||
"cmp $64, " LEN_REG "\n\t"
|
||||
"jae 1b\n\t"
|
||||
"copy_bwd63:\n\t" /* copy last 63 bytes, dest 16-bytes aligned */
|
||||
"mov " LEN_REG ", " TMP_REG "\n\t"
|
||||
"and $15, " LEN_REG "\n\t"
|
||||
"shr $5, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"sub $16, " SRC_REG "\n\t"
|
||||
"sub $16, " DEST_REG "\n\t"
|
||||
"movdqu (" SRC_REG "), %xmm0\n\t"
|
||||
"movdqa %xmm0, (" DEST_REG ")\n\t"
|
||||
"1:\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc copy_bwd15\n\t"
|
||||
"sub $32, " SRC_REG "\n\t"
|
||||
"sub $32, " DEST_REG "\n\t"
|
||||
"movdqu 0x00(" SRC_REG "), %xmm0\n\t"
|
||||
"movdqu 0x10(" SRC_REG "), %xmm1\n\t"
|
||||
"movdqa %xmm0, 0x00(" DEST_REG ")\n\t"
|
||||
"movdqa %xmm1, 0x10(" DEST_REG ")\n\t"
|
||||
"copy_bwd15:\n\t" /* copy last 15 bytes, dest 4-bytes aligned */
|
||||
"mov " LEN_REG ", " TMP_REG "\n\t"
|
||||
"and $3, " LEN_REG "\n\t"
|
||||
"shr $3, " TMP_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"sub $4, " SRC_REG "\n\t"
|
||||
"sub $4, " DEST_REG "\n\t"
|
||||
"movl (" SRC_REG "), %eax\n\t"
|
||||
"movl %eax, (" DEST_REG ")\n\t"
|
||||
"1:\n\t"
|
||||
"shr $1, " TMP_REG "\n\t"
|
||||
"jnc copy_bwd3\n\t"
|
||||
"sub $8, " SRC_REG "\n\t"
|
||||
"sub $8, " DEST_REG "\n\t"
|
||||
"movl 4(" SRC_REG "), %eax\n\t"
|
||||
"movl %eax, 4(" DEST_REG ")\n\t"
|
||||
"movl (" SRC_REG "), %eax\n\t"
|
||||
"movl %eax, (" DEST_REG ")\n\t"
|
||||
"copy_bwd3:\n\t" /* copy last 3 bytes */
|
||||
"shr $1, " LEN_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"dec " SRC_REG "\n\t"
|
||||
"dec " DEST_REG "\n\t"
|
||||
"movb (" SRC_REG "), %al\n\t"
|
||||
"movb %al, (" DEST_REG ")\n\t"
|
||||
"1:\n\t"
|
||||
"shr $1, " LEN_REG "\n\t"
|
||||
"jnc 1f\n\t"
|
||||
"movw -2(" SRC_REG "), %ax\n\t"
|
||||
"movw %ax, -2(" DEST_REG ")\n\t"
|
||||
"1:\n\t"
|
||||
MEMMOVE_CLEANUP
|
||||
"ret" )
|
||||
|
||||
#endif
|
||||
|
||||
/*********************************************************************
|
||||
* memmove (MSVCRT.@)
|
||||
*/
|
||||
|
@@ -2480,10 +2734,18 @@ int __cdecl memcmp(const void *ptr1, const void *ptr2, size_t n)
|
|||
#endif
|
||||
void * __cdecl memmove(void *dst, const void *src, size_t n)
|
||||
{
|
||||
#ifdef __x86_64__
|
||||
return sse2_memmove(dst, src, n);
|
||||
#else
|
||||
unsigned char *d = dst;
|
||||
const unsigned char *s = src;
|
||||
int sh1;
|
||||
|
||||
#ifdef __i386__
|
||||
if (sse2_supported)
|
||||
return sse2_memmove(dst, src, n);
|
||||
#endif
|
||||
|
||||
if (!n) return dst;
|
||||
|
||||
if ((size_t)dst - (size_t)src >= n)
|
||||
|
@ -2571,6 +2833,7 @@ void * __cdecl memmove(void *dst, const void *src, size_t n)
|
|||
while (n--) *--d = *--s;
|
||||
}
|
||||
return dst;
|
||||
#endif
|
||||
}
|
||||
#undef MERGE
|
||||
|
||||
|
|
Loading…
Reference in New Issue