diff options
| author | Ulrich Drepper <drepper@redhat.com> | 2008-03-07 17:55:11 +0000 |
|---|---|---|
| committer | Ulrich Drepper <drepper@redhat.com> | 2008-03-07 17:55:11 +0000 |
| commit | 78c2bf0eb433515af766d5bbb77901b7c8f9a8cc (patch) | |
| tree | 87b78a8f96faff215404f0498676ccee2159ecf0 /sysdeps/x86_64/memset.S | |
| parent | dff375150393cf31c06010153082959438da9886 (diff) | |
| download | glibc-78c2bf0eb433515af766d5bbb77901b7c8f9a8cc.tar.xz glibc-78c2bf0eb433515af766d5bbb77901b7c8f9a8cc.zip | |
* sysdeps/x86_64/rtld-memset.c: New file.
2008-02-26 Harsha Jagasia <harsha.jagasia@amd.com>
* sysdeps/x86_64/cacheinfo.c (NOT_USED_RIGHT_NOW): Remove ifdef guards.
* sysdeps/x86_64/memset.S: Rewrite the non-SSE code path, tuned for AMD
Barcelona machines. Make the integer code path the default fall-through
branch of the __x86_64_preferred_memory_instruction check.
2007-10-15 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/x86_64/cacheinfo.c
(__x86_64_preferred_memory_instruction): New variable.
(init_cacheinfo): Initialize __x86_64_preferred_memory_instruction.
* sysdeps/x86_64/memset.S: Rewrite.
2008-01-08 Jakub Jelinek <jakub@redhat.com>
* malloc/malloc.c (public_cALLOc): For arenas other than
Diffstat (limited to 'sysdeps/x86_64/memset.S')
| -rw-r--r-- | sysdeps/x86_64/memset.S | 1365 |
1 files changed, 1281 insertions, 84 deletions
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S index 939240600d..c7bf2318de 100644 --- a/sysdeps/x86_64/memset.S +++ b/sysdeps/x86_64/memset.S @@ -1,8 +1,7 @@ /* memset/bzero -- set memory area to CH/0 Optimized version for x86-64. - Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc. + Copyright (C) 2002-2005, 2007, 2008 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Andreas Jaeger <aj@suse.de>. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -20,13 +19,9 @@ 02111-1307 USA. */ #include <sysdep.h> -#include "asm-syntax.h" -#include "bp-sym.h" -#include "bp-asm.h" -/* This is somehow experimental and could made dependend on the cache - size. */ -#define LARGE $120000 +#define __STOS_LOWER_BOUNDARY $8192 +#define __STOS_UPPER_BOUNDARY $65536 .text #ifndef NOT_IN_libc @@ -46,89 +41,1291 @@ END (__memset_chk) #endif ENTRY (memset) L(memset_entry): - cmp $0x7,%rdx /* Check for small length. */ - mov %rdi,%rcx /* Save ptr as return value. */ - jbe 7f + cmp $0x1,%rdx + mov %rdi,%rax /* memset returns the dest address. */ + jne L(ck2) + mov %sil,(%rdi) + retq +L(ck2): + mov $0x101010101010101,%r9 + mov %rdx,%r8 + movzbq %sil,%rdx + imul %r9,%rdx +L(now_dw_aligned): + cmp $0x90,%r8 + jg L(ck_mem_ops_method) +L(now_dw_aligned_small): + lea L(setPxQx)(%rip),%r11 + add %r8,%rdi +#ifndef PIC + jmpq *(%r11,%r8,8) +#else + movslq (%r11,%r8,4),%rcx + lea (%rcx,%r11,1),%r11 + jmpq *%r11 +#endif - /* Populate 8 bit data to full 64-bit. */ - movabs $0x0101010101010101,%r8 - movzbl %sil,%eax - imul %rax,%r8 - test $0x7,%edi /* Check for alignment. */ - je 2f +L(Got0): + retq - .p2align 4 -1: /* Align ptr to 8 byte. */ - mov %sil,(%rcx) - dec %rdx - inc %rcx - test $0x7,%ecx - jne 1b - -2: /* Check for really large regions. 
*/ - mov %rdx,%rax - shr $0x6,%rax - je 4f - cmp LARGE, %rdx - jae 11f + .pushsection .rodata + .balign 16 +#ifndef PIC +L(setPxQx): + .quad L(Got0), L(P1Q0), L(P2Q0), L(P3Q0) + .quad L(P4Q0), L(P5Q0), L(P6Q0), L(P7Q0) + .quad L(P0Q1), L(P1Q1), L(P2Q1), L(P3Q1) + .quad L(P4Q1), L(P5Q1), L(P6Q1), L(P7Q1) + .quad L(P0Q2), L(P1Q2), L(P2Q2), L(P3Q2) + .quad L(P4Q2), L(P5Q2), L(P6Q2), L(P7Q2) + .quad L(P0Q3), L(P1Q3), L(P2Q3), L(P3Q3) + .quad L(P4Q3), L(P5Q3), L(P6Q3), L(P7Q3) + .quad L(P0Q4), L(P1Q4), L(P2Q4), L(P3Q4) + .quad L(P4Q4), L(P5Q4), L(P6Q4), L(P7Q4) + .quad L(P0Q5), L(P1Q5), L(P2Q5), L(P3Q5) + .quad L(P4Q5), L(P5Q5), L(P6Q5), L(P7Q5) + .quad L(P0Q6), L(P1Q6), L(P2Q6), L(P3Q6) + .quad L(P4Q6), L(P5Q6), L(P6Q6), L(P7Q6) + .quad L(P0Q7), L(P1Q7), L(P2Q7), L(P3Q7) + .quad L(P4Q7), L(P5Q7), L(P6Q7), L(P7Q7) + .quad L(P0Q8), L(P1Q8), L(P2Q8), L(P3Q8) + .quad L(P4Q8), L(P5Q8), L(P6Q8), L(P7Q8) + .quad L(P0Q9), L(P1Q9), L(P2Q9), L(P3Q9) + .quad L(P4Q9), L(P5Q9), L(P6Q9), L(P7Q9) + .quad L(P0QA), L(P1QA), L(P2QA), L(P3QA) + .quad L(P4QA), L(P5QA), L(P6QA), L(P7QA) + .quad L(P0QB), L(P1QB), L(P2QB), L(P3QB) + .quad L(P4QB), L(P5QB), L(P6QB), L(P7QB) + .quad L(P0QC), L(P1QC), L(P2QC), L(P3QC) + .quad L(P4QC), L(P5QC), L(P6QC), L(P7QC) + .quad L(P0QD), L(P1QD), L(P2QD), L(P3QD) + .quad L(P4QD), L(P5QD), L(P6QD), L(P7QD) + .quad L(P0QE), L(P1QE), L(P2QE), L(P3QE) + .quad L(P4QE), L(P5QE), L(P6QE), L(P7QE) + .quad L(P0QF), L(P1QF), L(P2QF), L(P3QF) + .quad L(P4QF), L(P5QF), L(P6QF), L(P7QF) + .quad L(P0QG), L(P1QG), L(P2QG), L(P3QG) + .quad L(P4QG), L(P5QG), L(P6QG), L(P7QG) + .quad L(P0QH), L(P1QH), L(P2QH), L(P3QH) + .quad L(P4QH), L(P5QH), L(P6QH), L(P7QH) + .quad L(P0QI) +# ifdef USE_EXTRA_TABLE + .quad L(P1QI), L(P2QI), L(P3QI), L(P4QI) + .quad L(P5QI), L(P6QI), L(P7QI) +# endif +#else +L(setPxQx): + .int L(Got0)-L(setPxQx) + .int L(P1Q0)-L(setPxQx) + .int L(P2Q0)-L(setPxQx) + .int L(P3Q0)-L(setPxQx) + .int L(P4Q0)-L(setPxQx) + .int L(P5Q0)-L(setPxQx) + .int 
L(P6Q0)-L(setPxQx) + .int L(P7Q0)-L(setPxQx) + + .int L(P0Q1)-L(setPxQx) + .int L(P1Q1)-L(setPxQx) + .int L(P2Q1)-L(setPxQx) + .int L(P3Q1)-L(setPxQx) + .int L(P4Q1)-L(setPxQx) + .int L(P5Q1)-L(setPxQx) + .int L(P6Q1)-L(setPxQx) + .int L(P7Q1)-L(setPxQx) + + .int L(P0Q2)-L(setPxQx) + .int L(P1Q2)-L(setPxQx) + .int L(P2Q2)-L(setPxQx) + .int L(P3Q2)-L(setPxQx) + .int L(P4Q2)-L(setPxQx) + .int L(P5Q2)-L(setPxQx) + .int L(P6Q2)-L(setPxQx) + .int L(P7Q2)-L(setPxQx) + + .int L(P0Q3)-L(setPxQx) + .int L(P1Q3)-L(setPxQx) + .int L(P2Q3)-L(setPxQx) + .int L(P3Q3)-L(setPxQx) + .int L(P4Q3)-L(setPxQx) + .int L(P5Q3)-L(setPxQx) + .int L(P6Q3)-L(setPxQx) + .int L(P7Q3)-L(setPxQx) + + .int L(P0Q4)-L(setPxQx) + .int L(P1Q4)-L(setPxQx) + .int L(P2Q4)-L(setPxQx) + .int L(P3Q4)-L(setPxQx) + .int L(P4Q4)-L(setPxQx) + .int L(P5Q4)-L(setPxQx) + .int L(P6Q4)-L(setPxQx) + .int L(P7Q4)-L(setPxQx) + + .int L(P0Q5)-L(setPxQx) + .int L(P1Q5)-L(setPxQx) + .int L(P2Q5)-L(setPxQx) + .int L(P3Q5)-L(setPxQx) + .int L(P4Q5)-L(setPxQx) + .int L(P5Q5)-L(setPxQx) + .int L(P6Q5)-L(setPxQx) + .int L(P7Q5)-L(setPxQx) + + .int L(P0Q6)-L(setPxQx) + .int L(P1Q6)-L(setPxQx) + .int L(P2Q6)-L(setPxQx) + .int L(P3Q6)-L(setPxQx) + .int L(P4Q6)-L(setPxQx) + .int L(P5Q6)-L(setPxQx) + .int L(P6Q6)-L(setPxQx) + .int L(P7Q6)-L(setPxQx) + + .int L(P0Q7)-L(setPxQx) + .int L(P1Q7)-L(setPxQx) + .int L(P2Q7)-L(setPxQx) + .int L(P3Q7)-L(setPxQx) + .int L(P4Q7)-L(setPxQx) + .int L(P5Q7)-L(setPxQx) + .int L(P6Q7)-L(setPxQx) + .int L(P7Q7)-L(setPxQx) + + .int L(P0Q8)-L(setPxQx) + .int L(P1Q8)-L(setPxQx) + .int L(P2Q8)-L(setPxQx) + .int L(P3Q8)-L(setPxQx) + .int L(P4Q8)-L(setPxQx) + .int L(P5Q8)-L(setPxQx) + .int L(P6Q8)-L(setPxQx) + .int L(P7Q8)-L(setPxQx) + + .int L(P0Q9)-L(setPxQx) + .int L(P1Q9)-L(setPxQx) + .int L(P2Q9)-L(setPxQx) + .int L(P3Q9)-L(setPxQx) + .int L(P4Q9)-L(setPxQx) + .int L(P5Q9)-L(setPxQx) + .int L(P6Q9)-L(setPxQx) + .int L(P7Q9)-L(setPxQx) + + .int L(P0QA)-L(setPxQx) + .int L(P1QA)-L(setPxQx) + .int 
L(P2QA)-L(setPxQx) + .int L(P3QA)-L(setPxQx) + .int L(P4QA)-L(setPxQx) + .int L(P5QA)-L(setPxQx) + .int L(P6QA)-L(setPxQx) + .int L(P7QA)-L(setPxQx) + + .int L(P0QB)-L(setPxQx) + .int L(P1QB)-L(setPxQx) + .int L(P2QB)-L(setPxQx) + .int L(P3QB)-L(setPxQx) + .int L(P4QB)-L(setPxQx) + .int L(P5QB)-L(setPxQx) + .int L(P6QB)-L(setPxQx) + .int L(P7QB)-L(setPxQx) + + .int L(P0QC)-L(setPxQx) + .int L(P1QC)-L(setPxQx) + .int L(P2QC)-L(setPxQx) + .int L(P3QC)-L(setPxQx) + .int L(P4QC)-L(setPxQx) + .int L(P5QC)-L(setPxQx) + .int L(P6QC)-L(setPxQx) + .int L(P7QC)-L(setPxQx) + + .int L(P0QD)-L(setPxQx) + .int L(P1QD)-L(setPxQx) + .int L(P2QD)-L(setPxQx) + .int L(P3QD)-L(setPxQx) + .int L(P4QD)-L(setPxQx) + .int L(P5QD)-L(setPxQx) + .int L(P6QD)-L(setPxQx) + .int L(P7QD)-L(setPxQx) + + .int L(P0QE)-L(setPxQx) + .int L(P1QE)-L(setPxQx) + .int L(P2QE)-L(setPxQx) + .int L(P3QE)-L(setPxQx) + .int L(P4QE)-L(setPxQx) + .int L(P5QE)-L(setPxQx) + .int L(P6QE)-L(setPxQx) + .int L(P7QE)-L(setPxQx) + + .int L(P0QF)-L(setPxQx) + .int L(P1QF)-L(setPxQx) + .int L(P2QF)-L(setPxQx) + .int L(P3QF)-L(setPxQx) + .int L(P4QF)-L(setPxQx) + .int L(P5QF)-L(setPxQx) + .int L(P6QF)-L(setPxQx) + .int L(P7QF)-L(setPxQx) + + .int L(P0QG)-L(setPxQx) + .int L(P1QG)-L(setPxQx) + .int L(P2QG)-L(setPxQx) + .int L(P3QG)-L(setPxQx) + .int L(P4QG)-L(setPxQx) + .int L(P5QG)-L(setPxQx) + .int L(P6QG)-L(setPxQx) + .int L(P7QG)-L(setPxQx) + + .int L(P0QH)-L(setPxQx) + .int L(P1QH)-L(setPxQx) + .int L(P2QH)-L(setPxQx) + .int L(P3QH)-L(setPxQx) + .int L(P4QH)-L(setPxQx) + .int L(P5QH)-L(setPxQx) + .int L(P6QH)-L(setPxQx) + .int L(P7QH)-L(setPxQx) + + .int L(P0QI)-L(setPxQx) +# ifdef USE_EXTRA_TABLE + .int L(P1QI)-L(setPxQx) + .int L(P2QI)-L(setPxQx) + .int L(P3QI)-L(setPxQx) + .int L(P4QI)-L(setPxQx) + .int L(P5QI)-L(setPxQx) + .int L(P6QI)-L(setPxQx) + .int L(P7QI)-L(setPxQx) +# endif +#endif + .popsection + + .balign 16 +#ifdef USE_EXTRA_TABLE +L(P1QI): mov %rdx,-0x91(%rdi) +#endif +L(P1QH): mov %rdx,-0x89(%rdi) 
+L(P1QG): mov %rdx,-0x81(%rdi) +# .balign 16 +L(P1QF): mov %rdx,-0x79(%rdi) +L(P1QE): mov %rdx,-0x71(%rdi) +L(P1QD): mov %rdx,-0x69(%rdi) +L(P1QC): mov %rdx,-0x61(%rdi) +L(P1QB): mov %rdx,-0x59(%rdi) +L(P1QA): mov %rdx,-0x51(%rdi) +L(P1Q9): mov %rdx,-0x49(%rdi) +L(P1Q8): mov %rdx,-0x41(%rdi) +L(P1Q7): mov %rdx,-0x39(%rdi) +L(P1Q6): mov %rdx,-0x31(%rdi) +L(P1Q5): mov %rdx,-0x29(%rdi) +L(P1Q4): mov %rdx,-0x21(%rdi) +L(P1Q3): mov %rdx,-0x19(%rdi) +L(P1Q2): mov %rdx,-0x11(%rdi) +L(P1Q1): mov %rdx,-0x9(%rdi) +L(P1Q0): mov %dl,-0x1(%rdi) + retq + + .balign 16 +L(P0QI): mov %rdx,-0x90(%rdi) +L(P0QH): mov %rdx,-0x88(%rdi) +# .balign 16 +L(P0QG): mov %rdx,-0x80(%rdi) +L(P0QF): mov %rdx,-0x78(%rdi) +L(P0QE): mov %rdx,-0x70(%rdi) +L(P0QD): mov %rdx,-0x68(%rdi) +L(P0QC): mov %rdx,-0x60(%rdi) +L(P0QB): mov %rdx,-0x58(%rdi) +L(P0QA): mov %rdx,-0x50(%rdi) +L(P0Q9): mov %rdx,-0x48(%rdi) +L(P0Q8): mov %rdx,-0x40(%rdi) +L(P0Q7): mov %rdx,-0x38(%rdi) +L(P0Q6): mov %rdx,-0x30(%rdi) +L(P0Q5): mov %rdx,-0x28(%rdi) +L(P0Q4): mov %rdx,-0x20(%rdi) +L(P0Q3): mov %rdx,-0x18(%rdi) +L(P0Q2): mov %rdx,-0x10(%rdi) +L(P0Q1): mov %rdx,-0x8(%rdi) +L(P0Q0): retq + + + .balign 16 +#ifdef USE_EXTRA_TABLE +L(P2QI): mov %rdx,-0x92(%rdi) +#endif +L(P2QH): mov %rdx,-0x8a(%rdi) +L(P2QG): mov %rdx,-0x82(%rdi) +# .balign 16 +L(P2QF): mov %rdx,-0x7a(%rdi) +L(P2QE): mov %rdx,-0x72(%rdi) +L(P2QD): mov %rdx,-0x6a(%rdi) +L(P2QC): mov %rdx,-0x62(%rdi) +L(P2QB): mov %rdx,-0x5a(%rdi) +L(P2QA): mov %rdx,-0x52(%rdi) +L(P2Q9): mov %rdx,-0x4a(%rdi) +L(P2Q8): mov %rdx,-0x42(%rdi) +L(P2Q7): mov %rdx,-0x3a(%rdi) +L(P2Q6): mov %rdx,-0x32(%rdi) +L(P2Q5): mov %rdx,-0x2a(%rdi) +L(P2Q4): mov %rdx,-0x22(%rdi) +L(P2Q3): mov %rdx,-0x1a(%rdi) +L(P2Q2): mov %rdx,-0x12(%rdi) +L(P2Q1): mov %rdx,-0xa(%rdi) +L(P2Q0): mov %dx,-0x2(%rdi) + retq + + .balign 16 +#ifdef USE_EXTRA_TABLE +L(P3QI): mov %rdx,-0x93(%rdi) +#endif +L(P3QH): mov %rdx,-0x8b(%rdi) +L(P3QG): mov %rdx,-0x83(%rdi) +# .balign 16 +L(P3QF): mov %rdx,-0x7b(%rdi) +L(P3QE): 
mov %rdx,-0x73(%rdi) +L(P3QD): mov %rdx,-0x6b(%rdi) +L(P3QC): mov %rdx,-0x63(%rdi) +L(P3QB): mov %rdx,-0x5b(%rdi) +L(P3QA): mov %rdx,-0x53(%rdi) +L(P3Q9): mov %rdx,-0x4b(%rdi) +L(P3Q8): mov %rdx,-0x43(%rdi) +L(P3Q7): mov %rdx,-0x3b(%rdi) +L(P3Q6): mov %rdx,-0x33(%rdi) +L(P3Q5): mov %rdx,-0x2b(%rdi) +L(P3Q4): mov %rdx,-0x23(%rdi) +L(P3Q3): mov %rdx,-0x1b(%rdi) +L(P3Q2): mov %rdx,-0x13(%rdi) +L(P3Q1): mov %rdx,-0xb(%rdi) +L(P3Q0): mov %dx,-0x3(%rdi) + mov %dl,-0x1(%rdi) + retq + + .balign 16 +#ifdef USE_EXTRA_TABLE +L(P4QI): mov %rdx,-0x94(%rdi) +#endif +L(P4QH): mov %rdx,-0x8c(%rdi) +L(P4QG): mov %rdx,-0x84(%rdi) +# .balign 16 +L(P4QF): mov %rdx,-0x7c(%rdi) +L(P4QE): mov %rdx,-0x74(%rdi) +L(P4QD): mov %rdx,-0x6c(%rdi) +L(P4QC): mov %rdx,-0x64(%rdi) +L(P4QB): mov %rdx,-0x5c(%rdi) +L(P4QA): mov %rdx,-0x54(%rdi) +L(P4Q9): mov %rdx,-0x4c(%rdi) +L(P4Q8): mov %rdx,-0x44(%rdi) +L(P4Q7): mov %rdx,-0x3c(%rdi) +L(P4Q6): mov %rdx,-0x34(%rdi) +L(P4Q5): mov %rdx,-0x2c(%rdi) +L(P4Q4): mov %rdx,-0x24(%rdi) +L(P4Q3): mov %rdx,-0x1c(%rdi) +L(P4Q2): mov %rdx,-0x14(%rdi) +L(P4Q1): mov %rdx,-0xc(%rdi) +L(P4Q0): mov %edx,-0x4(%rdi) + retq + + .balign 16 +#if defined(USE_EXTRA_TABLE) +L(P5QI): mov %rdx,-0x95(%rdi) +#endif +L(P5QH): mov %rdx,-0x8d(%rdi) +L(P5QG): mov %rdx,-0x85(%rdi) +# .balign 16 +L(P5QF): mov %rdx,-0x7d(%rdi) +L(P5QE): mov %rdx,-0x75(%rdi) +L(P5QD): mov %rdx,-0x6d(%rdi) +L(P5QC): mov %rdx,-0x65(%rdi) +L(P5QB): mov %rdx,-0x5d(%rdi) +L(P5QA): mov %rdx,-0x55(%rdi) +L(P5Q9): mov %rdx,-0x4d(%rdi) +L(P5Q8): mov %rdx,-0x45(%rdi) +L(P5Q7): mov %rdx,-0x3d(%rdi) +L(P5Q6): mov %rdx,-0x35(%rdi) +L(P5Q5): mov %rdx,-0x2d(%rdi) +L(P5Q4): mov %rdx,-0x25(%rdi) +L(P5Q3): mov %rdx,-0x1d(%rdi) +L(P5Q2): mov %rdx,-0x15(%rdi) +L(P5Q1): mov %rdx,-0xd(%rdi) +L(P5Q0): mov %edx,-0x5(%rdi) + mov %dl,-0x1(%rdi) + retq + + .balign 16 +#ifdef USE_EXTRA_TABLE +L(P6QI): mov %rdx,-0x96(%rdi) +#endif +L(P6QH): mov %rdx,-0x8e(%rdi) +L(P6QG): mov %rdx,-0x86(%rdi) +# .balign 16 +L(P6QF): mov 
%rdx,-0x7e(%rdi) +L(P6QE): mov %rdx,-0x76(%rdi) +L(P6QD): mov %rdx,-0x6e(%rdi) +L(P6QC): mov %rdx,-0x66(%rdi) +L(P6QB): mov %rdx,-0x5e(%rdi) +L(P6QA): mov %rdx,-0x56(%rdi) +L(P6Q9): mov %rdx,-0x4e(%rdi) +L(P6Q8): mov %rdx,-0x46(%rdi) +L(P6Q7): mov %rdx,-0x3e(%rdi) +L(P6Q6): mov %rdx,-0x36(%rdi) +L(P6Q5): mov %rdx,-0x2e(%rdi) +L(P6Q4): mov %rdx,-0x26(%rdi) +L(P6Q3): mov %rdx,-0x1e(%rdi) +L(P6Q2): mov %rdx,-0x16(%rdi) +L(P6Q1): mov %rdx,-0xe(%rdi) +L(P6Q0): mov %edx,-0x6(%rdi) + mov %dx,-0x2(%rdi) + retq + + .balign 16 +#ifdef USE_EXTRA_TABLE +L(P7QI): mov %rdx,-0x97(%rdi) +#endif +L(P7QH): mov %rdx,-0x8f(%rdi) +L(P7QG): mov %rdx,-0x87(%rdi) +# .balign 16 +L(P7QF): mov %rdx,-0x7f(%rdi) +L(P7QE): mov %rdx,-0x77(%rdi) +L(P7QD): mov %rdx,-0x6f(%rdi) +L(P7QC): mov %rdx,-0x67(%rdi) +L(P7QB): mov %rdx,-0x5f(%rdi) +L(P7QA): mov %rdx,-0x57(%rdi) +L(P7Q9): mov %rdx,-0x4f(%rdi) +L(P7Q8): mov %rdx,-0x47(%rdi) +L(P7Q7): mov %rdx,-0x3f(%rdi) +L(P7Q6): mov %rdx,-0x37(%rdi) +L(P7Q5): mov %rdx,-0x2f(%rdi) +L(P7Q4): mov %rdx,-0x27(%rdi) +L(P7Q3): mov %rdx,-0x1f(%rdi) +L(P7Q2): mov %rdx,-0x17(%rdi) +L(P7Q1): mov %rdx,-0xf(%rdi) +L(P7Q0): mov %edx,-0x7(%rdi) + mov %dx,-0x3(%rdi) + mov %dl,-0x1(%rdi) + retq + + .balign 16 +L(ck_mem_ops_method): + +# align to 16 byte boundary first + #test $0xf,%rdi + #jz L(aligned_now) + lea L(AliPxQx)(%rip),%r11 + mov $0x10,%r10 + mov %rdi,%r9 + and $0xf,%r9 + sub %r9,%r10 + and $0xf,%r10 + add %r10,%rdi + sub %r10,%r8 +#ifndef PIC + jmpq *(%r11,%r10,8) +#else + movslq (%r11,%r10,4),%rcx + lea (%rcx,%r11,1),%r11 + jmpq *%r11 +#endif + + .pushsection .rodata + .balign 16 +#ifndef PIC +L(AliPxQx): + .quad L(aligned_now), L(A1Q0), L(A2Q0), L(A3Q0) + .quad L(A4Q0), L(A5Q0), L(A6Q0), L(A7Q0) + .quad L(A0Q1), L(A1Q1), L(A2Q1), L(A3Q1) + .quad L(A4Q1), L(A5Q1), L(A6Q1), L(A7Q1) +#else +L(AliPxQx): + .int L(aligned_now)-L(AliPxQx) + .int L(A1Q0)-L(AliPxQx) + .int L(A2Q0)-L(AliPxQx) + .int L(A3Q0)-L(AliPxQx) + .int L(A4Q0)-L(AliPxQx) + .int L(A5Q0)-L(AliPxQx) + 
.int L(A6Q0)-L(AliPxQx) + .int L(A7Q0)-L(AliPxQx) + + .int L(A0Q1)-L(AliPxQx) + .int L(A1Q1)-L(AliPxQx) + .int L(A2Q1)-L(AliPxQx) + .int L(A3Q1)-L(AliPxQx) + .int L(A4Q1)-L(AliPxQx) + .int L(A5Q1)-L(AliPxQx) + .int L(A6Q1)-L(AliPxQx) + .int L(A7Q1)-L(AliPxQx) +#endif + .popsection + + .balign 16 +L(A5Q1): mov %dl,-0xd(%rdi) +L(A4Q1): mov %edx,-0xc(%rdi) +L(A0Q1): mov %rdx,-0x8(%rdi) +L(A0Q0): jmp L(aligned_now) + + .balign 16 +L(A1Q1): mov %dl,-0x9(%rdi) + mov %rdx,-0x8(%rdi) + jmp L(aligned_now) + + .balign 16 +L(A1Q0): mov %dl,-0x1(%rdi) + jmp L(aligned_now) + + .balign 16 +L(A3Q1): mov %dl,-0xb(%rdi) +L(A2Q1): mov %dx,-0xa(%rdi) + mov %rdx,-0x8(%rdi) + jmp L(aligned_now) + + .balign 16 +L(A3Q0): mov %dl,-0x3(%rdi) +L(A2Q0): mov %dx,-0x2(%rdi) + jmp L(aligned_now) + + .balign 16 +L(A5Q0): mov %dl,-0x5(%rdi) +L(A4Q0): mov %edx,-0x4(%rdi) + jmp L(aligned_now) + + .balign 16 +L(A7Q1): mov %dl,-0xf(%rdi) +L(A6Q1): mov %dx,-0xe(%rdi) + mov %edx,-0xc(%rdi) + mov %rdx,-0x8(%rdi) + jmp L(aligned_now) + + .balign 16 +L(A7Q0): mov %dl,-0x7(%rdi) +L(A6Q0): mov %dx,-0x6(%rdi) + mov %edx,-0x4(%rdi) + jmp L(aligned_now) + + .balign 16 +L(aligned_now): + + cmpl $0x1,__x86_64_preferred_memory_instruction(%rip) + jg L(SSE_pre) + +L(8byte_move_try): + cmpq __STOS_LOWER_BOUNDARY,%r8 + jae L(8byte_stos_try) + + .balign 16 +L(8byte_move): + movq %r8,%rcx + shrq $7,%rcx + jz L(8byte_move_skip) .p2align 4 -3: /* Copy 64 bytes. */ - mov %r8,(%rcx) - mov %r8,0x8(%rcx) - mov %r8,0x10(%rcx) - mov %r8,0x18(%rcx) - mov %r8,0x20(%rcx) - mov %r8,0x28(%rcx) - mov %r8,0x30(%rcx) - mov %r8,0x38(%rcx) - add $0x40,%rcx - dec %rax - jne 3b - -4: /* Copy final bytes. */ - and $0x3f,%edx - mov %rdx,%rax - shr $0x3,%rax - je 6f - -5: /* First in chunks of 8 bytes. */ - mov %r8,(%rcx) - add $0x8,%rcx - dec %rax - jne 5b -6: - and $0x7,%edx -7: - test %rdx,%rdx - je 9f -8: /* And finally as bytes (up to 7). */ - mov %sil,(%rcx) - inc %rcx - dec %rdx - jne 8b -9: - /* Load result (only if used as memset). 
*/ - mov %rdi,%rax /* start address of destination is result */ + +L(8byte_move_loop): + decq %rcx + + movq %rdx, (%rdi) + movq %rdx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %rdx, 24 (%rdi) + movq %rdx, 32 (%rdi) + movq %rdx, 40 (%rdi) + movq %rdx, 48 (%rdi) + movq %rdx, 56 (%rdi) + movq %rdx, 64 (%rdi) + movq %rdx, 72 (%rdi) + movq %rdx, 80 (%rdi) + movq %rdx, 88 (%rdi) + movq %rdx, 96 (%rdi) + movq %rdx, 104 (%rdi) + movq %rdx, 112 (%rdi) + movq %rdx, 120 (%rdi) + + leaq 128 (%rdi),%rdi + + jnz L(8byte_move_loop) + +L(8byte_move_skip): + andl $127,%r8d + lea (%rdi,%r8,1),%rdi + lea L(setPxQx)(%rip),%r11 + +#ifndef PIC + jmpq *(%r11,%r8,8) # old scheme remained for nonPIC +#else + movslq (%r11,%r8,4),%rcx + lea (%rcx,%r11,1),%r11 + jmpq *%r11 +#endif + + .balign 16 +L(8byte_stos_try): + mov __x86_64_shared_cache_size(%rip),%r9d // ck largest cache size + cmpq %r8,%r9 // calculate the lesser of remaining + cmovaq %r8,%r9 // bytes and largest cache size + jbe L(8byte_stos) + +L(8byte_move_reuse_try): + cmp __STOS_UPPER_BOUNDARY,%r8 + jae L(8byte_move) + + .balign 16 +L(8byte_stos): + movq %r9,%rcx + andq $-8,%r9 + + shrq $3,%rcx + jz L(8byte_stos_skip) + + xchgq %rax,%rdx + + rep + stosq + + xchgq %rax,%rdx + +L(8byte_stos_skip): + subq %r9,%r8 + ja L(8byte_nt_move) + + andl $7,%r8d + lea (%rdi,%r8,1),%rdi + lea L(setPxQx)(%rip),%r11 +#ifndef PIC + jmpq *(%r11,%r8,8) # old scheme remained for nonPIC +#else + movslq (%r11,%r8,4),%rcx + lea (%rcx,%r11,1),%r11 + jmpq *%r11 +#endif + + .balign 16 +L(8byte_nt_move): + movq %r8,%rcx + shrq $7,%rcx + jz L(8byte_nt_move_skip) + + .balign 16 +L(8byte_nt_move_loop): + decq %rcx + + movntiq %rdx, (%rdi) + movntiq %rdx, 8 (%rdi) + movntiq %rdx, 16 (%rdi) + movntiq %rdx, 24 (%rdi) + movntiq %rdx, 32 (%rdi) + movntiq %rdx, 40 (%rdi) + movntiq %rdx, 48 (%rdi) + movntiq %rdx, 56 (%rdi) + movntiq %rdx, 64 (%rdi) + movntiq %rdx, 72 (%rdi) + movntiq %rdx, 80 (%rdi) + movntiq %rdx, 88 (%rdi) + movntiq %rdx, 96 (%rdi) + movntiq %rdx, 104 
(%rdi) + movntiq %rdx, 112 (%rdi) + movntiq %rdx, 120 (%rdi) + + leaq 128 (%rdi),%rdi + + jnz L(8byte_nt_move_loop) + + sfence + +L(8byte_nt_move_skip): + andl $127,%r8d + + lea (%rdi,%r8,1),%rdi + lea L(setPxQx)(%rip),%r11 +#ifndef PIC + jmpq *(%r11,%r8,8) # old scheme remained for nonPIC +#else + movslq (%r11,%r8,4),%rcx + lea (%rcx,%r11,1),%r11 + jmpq *%r11 +#endif + +L(SSE_pre): + # fill RegXMM0 with the pattern + movd %rdx,%xmm0 + pu |
