-rw-r--r--   fedora/glibc-rh234946.patch         32
-rw-r--r--   fedora/glibc-x86_64-memcpy.patch  1439
-rw-r--r--   fedora/glibc.spec.in                 7
3 files changed, 1477 insertions, 1 deletions
diff --git a/fedora/glibc-rh234946.patch b/fedora/glibc-rh234946.patch
new file mode 100644
index 0000000000..add3f8023f
--- /dev/null
+++ b/fedora/glibc-rh234946.patch
@@ -0,0 +1,32 @@
+2006-12-25  Sripathi Kodi  <sripathik@in.ibm.com>
+
+	* include/link.h: Declare new flag l_fini_called in struct link_map.
+	* elf/dl-fini.c: In _dl_fini, set l_fini_called to 1 instead of
+	l_init_called to 0.
+
+--- libc/elf/dl-fini.c	2006-12-22 01:54:22.000000000 -0600
++++ libc/elf/dl-fini.c	2006-12-24 22:51:52.000000000 -0600
+@@ -215,10 +215,10 @@ _dl_fini (void)
+ 	{
+ 	  l = maps[i];
+ 
+-	  if (l->l_init_called)
++	  if (l->l_init_called && !l->l_fini_called)
+ 	    {
+ 	      /* Make sure nothing happens if we are called twice.  */
+-	      l->l_init_called = 0;
++	      l->l_fini_called = 1;
+ 
+ 	      /* Is there a destructor function?  */
+ 	      if (l->l_info[DT_FINI_ARRAY] != NULL
+--- libc/include/link.h	2006-12-22 01:54:22.000000000 -0600
++++ libc/include/link.h	2006-12-24 22:53:29.000000000 -0600
+@@ -185,6 +185,8 @@ struct link_map
+     unsigned int l_contiguous:1;	/* Nonzero if inter-segment holes are
+ 					   mprotected or if no holes are present at
+ 					   all.  */
++    unsigned int l_fini_called:1;	/* Nonzero if _dl_fini has processed
++					   this object */
+ 
+     /* Array with version names.  */
+     unsigned int l_nversions;
diff --git a/fedora/glibc-x86_64-memcpy.patch b/fedora/glibc-x86_64-memcpy.patch
new file mode 100644
index 0000000000..3888134df8
--- /dev/null
+++ b/fedora/glibc-x86_64-memcpy.patch
@@ -0,0 +1,1439 @@
+2007-05-21  Ulrich Drepper  <drepper@redhat.com>
+
+	* sysdeps/x86_64/cacheinfo.c (init_cacheinfo): Pass correct value
+	as second parameter to handle_intel.
+
+	* sysdeps/unix/sysv/linux/x86_64/sysconf.c: Move cache information
+	handling to ...
+	* sysdeps/x86_64/cacheinfo.c: ... here.  New file.
+	* sysdeps/x86_64/Makefile [subdir=string] (sysdep_routines): Add
+	cacheinfo.
+	* sysdeps/x86_64/memcpy.S: Complete rewrite.
+	* sysdeps/x86_64/mempcpy.S: Adjust appropriately.
+	Patch by Evandro Menezes <evandro.menezes@amd.com>.
+
+--- libc/sysdeps/x86_64/Makefile	16 Aug 2004 06:46:14 -0000	1.4
++++ libc/sysdeps/x86_64/Makefile	21 May 2007 19:20:45 -0000	1.5
+@@ -9,3 +9,7 @@ endif
+ ifeq ($(subdir),gmon)
+ sysdep_routines += _mcount
+ endif
++
++ifeq ($(subdir),string)
++sysdep_routines += cacheinfo
++endif
+--- libc/sysdeps/x86_64/cacheinfo.c	1 Jan 1970 00:00:00 -0000
++++ libc/sysdeps/x86_64/cacheinfo.c	21 May 2007 22:37:45 -0000	1.2
+@@ -0,0 +1,451 @@
++/* x86_64 cache info.
++   Copyright (C) 2003, 2004, 2006, 2007 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, write to the Free
++   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
++   02111-1307 USA.
++*/
++
++#include <assert.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <unistd.h>
++
++static const struct intel_02_cache_info
++{
++  unsigned int idx;
++  int name;
++  long int size;
++  long int assoc;
++  long int linesize;
++} intel_02_known [] =
++  {
++    { 0x06, _SC_LEVEL1_ICACHE_SIZE,     8192,  4, 32 },
++    { 0x08, _SC_LEVEL1_ICACHE_SIZE,    16384,  4, 32 },
++    { 0x0a, _SC_LEVEL1_DCACHE_SIZE,     8192,  2, 32 },
++    { 0x0c, _SC_LEVEL1_DCACHE_SIZE,    16384,  4, 32 },
++    { 0x22, _SC_LEVEL3_CACHE_SIZE,    524288,  4, 64 },
++    { 0x23, _SC_LEVEL3_CACHE_SIZE,   1048576,  8, 64 },
++    { 0x25, _SC_LEVEL3_CACHE_SIZE,   2097152,  8, 64 },
++    { 0x29, _SC_LEVEL3_CACHE_SIZE,   4194304,  8, 64 },
++    { 0x2c, _SC_LEVEL1_DCACHE_SIZE,    32768,  8, 64 },
++    { 0x30, _SC_LEVEL1_ICACHE_SIZE,    32768,  8, 64 },
++    { 0x39, _SC_LEVEL2_CACHE_SIZE,    131072,  4, 64 },
++    { 0x3a, _SC_LEVEL2_CACHE_SIZE,    196608,  6, 64 },
++    { 0x3b, _SC_LEVEL2_CACHE_SIZE,    131072,  2, 64 },
++    { 0x3c, _SC_LEVEL2_CACHE_SIZE,    262144,  4, 64 },
++    { 0x3d, _SC_LEVEL2_CACHE_SIZE,    393216,  6, 64 },
++    { 0x3e, _SC_LEVEL2_CACHE_SIZE,    524288,  4, 64 },
++    { 0x41, _SC_LEVEL2_CACHE_SIZE,    131072,  4, 32 },
++    { 0x42, _SC_LEVEL2_CACHE_SIZE,    262144,  4, 32 },
++    { 0x43, _SC_LEVEL2_CACHE_SIZE,    524288,  4, 32 },
++    { 0x44, _SC_LEVEL2_CACHE_SIZE,   1048576,  4, 32 },
++    { 0x45, _SC_LEVEL2_CACHE_SIZE,   2097152,  4, 32 },
++    { 0x46, _SC_LEVEL3_CACHE_SIZE,   4194304,  4, 64 },
++    { 0x47, _SC_LEVEL3_CACHE_SIZE,   8388608,  8, 64 },
++    { 0x49, _SC_LEVEL2_CACHE_SIZE,   4194304, 16, 64 },
++    { 0x4a, _SC_LEVEL3_CACHE_SIZE,   6291456, 12, 64 },
++    { 0x4b, _SC_LEVEL3_CACHE_SIZE,   8388608, 16, 64 },
++    { 0x4c, _SC_LEVEL3_CACHE_SIZE,  12582912, 12, 64 },
++    { 0x4d, _SC_LEVEL3_CACHE_SIZE,  16777216, 16, 64 },
++    { 0x60, _SC_LEVEL1_DCACHE_SIZE,    16384,  8, 64 },
++    { 0x66, _SC_LEVEL1_DCACHE_SIZE,     8192,  4, 64 },
++    { 0x67, _SC_LEVEL1_DCACHE_SIZE,    16384,  4, 64 },
++    { 0x68, _SC_LEVEL1_DCACHE_SIZE,    32768,  4, 64 },
++    { 0x78, _SC_LEVEL2_CACHE_SIZE,   1048576,  8, 64 },
++    { 0x79, _SC_LEVEL2_CACHE_SIZE,    131072,  8, 64 },
++    { 0x7a, _SC_LEVEL2_CACHE_SIZE,    262144,  8, 64 },
++    { 0x7b, _SC_LEVEL2_CACHE_SIZE,    524288,  8, 64 },
++    { 0x7c, _SC_LEVEL2_CACHE_SIZE,   1048576,  8, 64 },
++    { 0x7d, _SC_LEVEL2_CACHE_SIZE,   2097152,  8, 64 },
++    { 0x7f, _SC_LEVEL2_CACHE_SIZE,    524288,  2, 64 },
++    { 0x82, _SC_LEVEL2_CACHE_SIZE,    262144,  8, 32 },
++    { 0x83, _SC_LEVEL2_CACHE_SIZE,    524288,  8, 32 },
++    { 0x84, _SC_LEVEL2_CACHE_SIZE,   1048576,  8, 32 },
++    { 0x85, _SC_LEVEL2_CACHE_SIZE,   2097152,  8, 32 },
++    { 0x86, _SC_LEVEL2_CACHE_SIZE,    524288,  4, 64 },
++    { 0x87, _SC_LEVEL2_CACHE_SIZE,   1048576,  8, 64 },
++  };
++
++#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
++
++static int
++intel_02_known_compare (const void *p1, const void *p2)
++{
++  const struct intel_02_cache_info *i1;
++  const struct intel_02_cache_info *i2;
++
++  i1 = (const struct intel_02_cache_info *) p1;
++  i2 = (const struct intel_02_cache_info *) p2;
++
++  if (i1->idx == i2->idx)
++    return 0;
++
++  return i1->idx < i2->idx ? -1 : 1;
++}
++
++
++static long int
++__attribute__ ((noinline))
++intel_check_word (int name, unsigned int value, bool *has_level_2,
++		  bool *no_level_2_or_3)
++{
++  if ((value & 0x80000000) != 0)
++    /* The register value is reserved.  */
++    return 0;
++
++  /* Fold the name.  The _SC_ constants are always in the order SIZE,
++     ASSOC, LINESIZE.  */
++  int folded_name = (_SC_LEVEL1_ICACHE_SIZE
++		     + ((name - _SC_LEVEL1_ICACHE_SIZE) / 3) * 3);
++
++  while (value != 0)
++    {
++      unsigned int byte = value & 0xff;
++
++      if (byte == 0x40)
++	{
++	  *no_level_2_or_3 = true;
++
++	  if (folded_name == _SC_LEVEL3_CACHE_SIZE)
++	    /* No need to look further.  */
++	    break;
++	}
++      else
++	{
++	  if (byte == 0x49 && folded_name == _SC_LEVEL3_CACHE_SIZE)
++	    {
++	      /* Intel reused this value.  For family 15, model 6 it
++		 specifies the 3rd level cache.  Otherwise the 2nd
++		 level cache.  */
++	      unsigned int eax;
++	      unsigned int ebx;
++	      unsigned int ecx;
++	      unsigned int edx;
++	      asm volatile ("cpuid"
++			    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++			    : "0" (1));
++
++	      unsigned int family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf);
++	      unsigned int model = ((((eax >>16) & 0xf) << 4)
++				    + ((eax >> 4) & 0xf));
++	      if (family == 15 && model == 6)
++		{
++		  /* The level 3 cache is encoded for this model like
++		     the level 2 cache is for other models.  Pretend
++		     the caller asked for the level 2 cache.  */
++		  name = (_SC_LEVEL2_CACHE_SIZE
++			  + (name - _SC_LEVEL3_CACHE_SIZE));
++		  folded_name = _SC_LEVEL3_CACHE_SIZE;
++		}
++	    }
++
++	  struct intel_02_cache_info *found;
++	  struct intel_02_cache_info search;
++
++	  search.idx = byte;
++	  found = bsearch (&search, intel_02_known, nintel_02_known,
++			   sizeof (intel_02_known[0]), intel_02_known_compare);
++	  if (found != NULL)
++	    {
++	      if (found->name == folded_name)
++		{
++		  unsigned int offset = name - folded_name;
++
++		  if (offset == 0)
++		    /* Cache size.  */
++		    return found->size;
++		  if (offset == 1)
++		    return found->assoc;
++
++		  assert (offset == 2);
++		  return found->linesize;
++		}
++
++	      if (found->name == _SC_LEVEL2_CACHE_SIZE)
++		*has_level_2 = true;
++	    }
++	}
++
++      /* Next byte for the next round.  */
++      value >>= 8;
++    }
++
++  /* Nothing found.  */
++  return 0;
++}
++
++
++static long int __attribute__ ((noinline))
++handle_intel (int name, unsigned int maxidx)
++{
++  assert (maxidx >= 2);
++
++  /* OK, we can use the CPUID instruction to get all info about the
++     caches.  */
++  unsigned int cnt = 0;
++  unsigned int max = 1;
++  long int result = 0;
++  bool no_level_2_or_3 = false;
++  bool has_level_2 = false;
++
++  while (cnt++ < max)
++    {
++      unsigned int eax;
++      unsigned int ebx;
++      unsigned int ecx;
++      unsigned int edx;
++      asm volatile ("cpuid"
++		    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++		    : "0" (2));
++
++      /* The low byte of EAX in the first round contain the number of
++	 rounds we have to make.  At least one, the one we are already
++	 doing.  */
++      if (cnt == 1)
++	{
++	  max = eax & 0xff;
++	  eax &= 0xffffff00;
++	}
++
++      /* Process the individual registers' value.  */
++      result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3);
++      if (result != 0)
++	return result;
++
++      result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3);
++      if (result != 0)
++	return result;
++
++      result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3);
++      if (result != 0)
++	return result;
++
++      result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3);
++      if (result != 0)
++	return result;
++    }
++
++  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
++      && no_level_2_or_3)
++    return -1;
++
++  return 0;
++}
++
++
++static long int __attribute__ ((noinline))
++handle_amd (int name)
++{
++  unsigned int eax;
++  unsigned int ebx;
++  unsigned int ecx;
++  unsigned int edx;
++  asm volatile ("cpuid"
++		: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++		: "0" (0x80000000));
++
++  if (name >= _SC_LEVEL3_CACHE_SIZE)
++    return 0;
++
++  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
++  if (eax < fn)
++    return 0;
++
++  asm volatile ("cpuid"
++		: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++		: "0" (fn));
++
++  if (name < _SC_LEVEL1_DCACHE_SIZE)
++    {
++      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
++      ecx = edx;
++    }
++
++  switch (name)
++    {
++    case _SC_LEVEL1_DCACHE_SIZE:
++      return (ecx >> 14) & 0x3fc00;
++    case _SC_LEVEL1_DCACHE_ASSOC:
++      ecx >>= 16;
++      if ((ecx & 0xff) == 0xff)
++	/* Fully associative.  */
++	return (ecx << 2) & 0x3fc00;
++      return ecx & 0xff;
++    case _SC_LEVEL1_DCACHE_LINESIZE:
++      return ecx & 0xff;
++    case _SC_LEVEL2_CACHE_SIZE:
++      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;
++    case _SC_LEVEL2_CACHE_ASSOC:
++      ecx >>= 12;
++      switch (ecx & 0xf)
++	{
++	case 0:
++	case 1:
++	case 2:
++	case 4:
++	  return ecx & 0xf;
++	case 6:
++	  return 8;
++	case 8:
++	  return 16;
++	case 0xf:
++	  return (ecx << 6) & 0x3fffc00;
++	default:
++	  return 0;
++	}
++    case _SC_LEVEL2_CACHE_LINESIZE:
++      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;
++    default:
++      assert (! "cannot happen");
++    }
++  return -1;
++}
++
++
++/* Get the value of the system variable NAME.  */
++long int
++attribute_hidden
++__cache_sysconf (int name)
++{
++  /* Find out what brand of processor.  */
++  unsigned int eax;
++  unsigned int ebx;
++  unsigned int ecx;
++  unsigned int edx;
++  asm volatile ("cpuid"
++		: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++		: "0" (0));
++
++  /* This spells out "GenuineIntel".  */
++  if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
++    return handle_intel (name, eax);
++
++  /* This spells out "AuthenticAMD".  */
++  if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
++    return handle_amd (name);
++
++  // XXX Fill in more vendors.
++
++  /* CPU not known, we have no information.  */
++  return 0;
++}
++
++
++/* Half the core cache size for use in memory and string routines, typically
++   L1 size.  */
++long int __x86_64_core_cache_size_half attribute_hidden = 32 * 1024 / 2;
++/* Shared cache size for use in memory and string routines, typically
++   L2 or L3 size.  */
++long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
++/* PREFETCHW support flag for use in memory and string routines.  */
++int __x86_64_prefetchw attribute_hidden;
++
++
++static void
++__attribute__((constructor))
++init_cacheinfo (void)
++{
++  /* Find out what brand of processor.  */
++  unsigned int eax;
++  unsigned int ebx;
++  unsigned int ecx;
++  unsigned int edx;
++  int max_cpuid;
++  int max_cpuid_ex;
++  long int core = -1;
++  long int shared = -1;
++  unsigned int level;
++  unsigned int threads = 0;
++
++  asm volatile ("cpuid"
++		: "=a" (max_cpuid), "=b" (ebx), "=c" (ecx), "=d" (edx)
++		: "0" (0));
++
++  /* This spells out "GenuineIntel".  */
++  if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
++    {
++      core = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
++
++      /* Try L3 first.  */
++      level = 3;
++      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);
++
++      if (shared <= 0)
++	{
++	  /* Try L2 otherwise.  */
++	  level = 2;
++	  shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
++	}
++
++      /* Figure out the number of logical threads that share the
++	 highest cache level.  */
++      if (max_cpuid >= 4)
++	{
++	  int i = 0;
++
++	  /* Query until desired cache level is enumerated.  */
++	  do
++	    {
++	      asm volatile ("cpuid"
++			    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++			    : "0" (4), "2" (i++));
++	    }
++	  while (((eax >> 5) & 0x7) != level);
++
++	  threads = ((eax >> 14) & 0x3ff) + 1;
++	}
++      else
++	{
++	  /* Assume that all logical threads share the highest cache level.  */
++	  asm volatile ("cpuid"
++			: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++			: "0" (1));
++
++	  threads = (ebx >> 16) & 0xff;
++	}
++
++      /* Cap usage of highest cache level to the number of supported
++	 threads.  */
++      if (shared > 0 && threads > 0)
++	shared /= threads;
++    }
++  /* This spells out "AuthenticAMD".  */
++  else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
++    {
++      core = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
++      shared = handle_amd (_SC_LEVEL2_CACHE_SIZE);
++
++      asm volatile ("cpuid"
++		    : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx), "=d" (edx)
++		    : "0" (0x80000000));
++
++      if (max_cpuid_ex >= 0x80000001)
++	{
++	  asm volatile ("cpuid"
++			: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++			: "0" (0x80000001));
++	  /* PREFETCHW || 3DNow!  */
++	  if ((ecx & 0x100) || (edx & 0x80000000))
++	    __x86_64_prefetchw = -1;
++	}
++    }
++
++  if (core > 0)
++    __x86_64_core_cache_size_half = core / 2;
++
++  if (shared > 0)
++    __x86_64_shared_cache_size_half = shared / 2;
++}
+--- libc/sysdeps/x86_64/memcpy.S	18 Oct 2004 04:17:08 -0000	1.5
++++ libc/sysdeps/x86_64/memcpy.S	21 May 2007 19:21:01 -0000	1.6
+@@ -1,7 +1,10 @@
+-/* Highly optimized version for x86-64.
+-   Copyright (C) 1997, 2000, 2002, 2003, 2004 Free Software Foundation, Inc.
++/*
++	Optimized memcpy for x86-64.
++
++	Copyright (C) 2007 Free Software Foundation, Inc.
++	Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.
++
+    This file is part of the GNU C Library.
+-   Based on i586 version contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+ 
+    The GNU C Library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+@@ -16,86 +19,556 @@
+    You should have received a copy of the GNU Lesser General Public
+    License along with the GNU C Library; if not, write to the Free
+    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+-   02111-1307 USA.  */
++   02111-1307 USA.
++*/
+ 
+ #include <sysdep.h>
+ #include "asm-syntax.h"
+-#include "bp-sym.h"
+-#include "bp-asm.h"
+ 
+-/* BEWARE: `#ifdef memcpy' means that memcpy is redefined as `mempcpy',
+-   and the return value is the byte after the last one copied in
+-   the destination. */
+-#define MEMPCPY_P (defined memcpy)
++/* Stack slots in the red-zone.  */
++
++#ifdef USE_AS_MEMPCPY
++# define RETVAL	(0)
++#else
++# define RETVAL	(-8)
++#endif
++#define SAVE0	(RETVAL - 8)
++#define SAVE1	(SAVE0 - 8)
++#define SAVE2	(SAVE1 - 8)
++#define SAVE3	(SAVE2 - 8)
+ 
+ 	.text
++
+ #if defined PIC && !defined NOT_IN_libc
+ ENTRY (__memcpy_chk)
++
+ 	cmpq	%rdx, %rcx
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
++
+ END (__memcpy_chk)
+ #endif
+-ENTRY (BP_SYM (memcpy))
+-	/* Cutoff for the big loop is a size of 32 bytes since otherwise
+-	   the loop will never be entered.  */
++
++ENTRY(memcpy)				/* (void *, const void*, size_t) */
++
++/* Handle tiny blocks. */
++
++L(1try):				/* up to 32B */
+ 	cmpq	$32, %rdx
+-	movq	%rdx, %rcx
+-#if !MEMPCPY_P
+-	movq	%rdi, %r10	/* Save value. */
++#ifndef USE_AS_MEMPCPY
++	movq	%rdi, %rax		/* save return value */
+ #endif
++	jae	L(1after)
+ 
+-	/* We need this in any case. */
+-	cld
++L(1):					/* 1-byte once */
++	testb	$1, %dl
++	jz	L(1a)
+ 
+-	jbe	1f
++	movzbl	(%rsi), %ecx
++	movb	%cl, (%rdi)
+ 
+-	/* Align destination. */
+-	movq	%rdi, %rax
+-	negq	%rax
+-	andq	$7, %rax
+-	subq	%rax, %rcx
+-	xchgq	%rax, %rcx
++	incq	%rsi
++	incq	%rdi
++
++	.p2align 4,, 4
++
++L(1a):					/* 2-byte once */
++	testb	$2, %dl
++	jz	L(1b)
++
++	movzwl	(%rsi), %ecx
++	movw	%cx, (%rdi)
+ 
+-	rep; movsb
++	addq	$2, %rsi
++	addq	$2, %rdi
+ 
+-	movq	%rax, %rcx
+-	subq	$32, %rcx
+-	js	2f
++	.p2align 4,, 4
++
++L(1b):					/* 4-byte once */
++	testb	$4, %dl
++	jz	L(1c)
++
++	movl	(%rsi), %ecx
++	movl	%ecx, (%rdi)
++
++	addq	$4, %rsi
++	addq	$4, %rdi
++
++	.p2align 4,, 4
++
++L(1c):					/* 8-byte once */
++	testb	$8, %dl
++	jz	L(1d)
++
++	movq	(%rsi), %rcx
++	movq	%rcx, (%rdi)
++
++	addq	$8, %rsi
++	addq	$8, %rdi
++
++	.p2align 4,, 4
++
++L(1d):					/* 16-byte loop */
++	andl	$0xf0, %edx
++	jz	L(exit)
+ 
+ 	.p2align 4
+-3:
+ 
+-	/* Now correct the loop counter.  Please note that in the following
+-	   code the flags are not changed anymore.  */
+-	subq	$32, %rcx
++L(1loop):
++	movq	(%rsi), %rcx
++	movq	8 (%rsi), %r8
++	movq	%rcx, (%rdi)
++	movq	%r8, 8 (%rdi)
++
++	subl	$16, %edx
++
++	leaq	16 (%rsi), %rsi
++	leaq	16 (%rdi), %rdi
++
++	jnz	L(1loop)
++
++	.p2align 4,, 4
++
++L(exit):				/* exit */
++#ifdef USE_AS_MEMPCPY
++	movq	%rdi, %rax		/* return value */
++#else
++	rep
++#endif
++	retq
++
++	.p2align 4
++
++L(1after):
++#ifndef USE_AS_MEMPCPY
++	movq	%rax, RETVAL (%rsp)	/* save return value */
++#endif
++
++/* Align to the natural word size. */
++
++L(aligntry):
++	movl	%esi, %ecx		/* align by destination */
++
++	andl	$7, %ecx
++	jz	L(alignafter)		/* already aligned */
++
++L(align):				/* align */
++	leaq	-8 (%rcx, %rdx), %rdx	/* calculate remaining bytes */
++	subl	$8, %ecx
++
++	.p2align 4
++
++L(alignloop):				/* 1-byte alignment loop */
++	movzbl	(%rsi), %eax
++	movb	%al, (%rdi)
++
++	incl	%ecx
++
++	leaq	1 (%rsi), %rsi
++	leaq	1 (%rdi), %rdi
++
++	jnz	L(alignloop)
++
++	.p2align 4
++
++L(alignafter):
++
++/* Loop to handle mid-sized blocks.  */
++
++L(32try):				/* up to 1KB */
++	cmpq	$1024, %rdx
++	ja	L(32after)
++
++L(32):					/* 32-byte loop */
++	movl	%edx, %ecx
++	shrl	$5, %ecx
++	jz	L(32skip)
++
++	.p2align 4
++
++L(32loop):
++	decl	%ecx
+ 
+ 	movq	(%rsi), %rax
+-	movq	8(%rsi), %rdx
+-	movq	16(%rsi), %r8
+-	movq	24(%rsi), %r9
++	movq	8 (%rsi), %r8
++	movq	16 (%rsi), %r9
++	movq	24 (%rsi), %r10
++
+ 	movq	%rax, (%rdi)
+-	movq	%rdx, 8(%rdi)
+-	movq	%r8, 16(%rdi)
+-	movq	%r9, 24(%rdi)
++	movq	%r8, 8 (%rdi)
++	movq	%r9, 16 (%rdi)
++	movq	%r10, 24 (%rdi)
+ 
+ 	leaq	32(%rsi), %rsi
+ 	leaq	32(%rdi), %rdi
+ 
+-	jns	3b
++	jz	L(32skip)		/* help out smaller blocks */
++
++	decl	%ecx
++
++	movq	(%rsi), %rax
++	movq	8 (%rsi), %r8
++	movq	16 (%rsi), %r9
++	movq	24 (%rsi), %r10
++
++	movq	%rax, (%rdi)
++	movq	%r8, 8 (%rdi)
++	movq	%r9, 16 (%rdi)
++	movq	%r10, 24 (%rdi)
++
++	leaq	32 (%rsi), %rsi
++	leaq	32 (%rdi), %rdi
++
++	jnz	L(32loop)
++
++	.p2align 4
+ 
+-	/* Correct extra loop counter modification.  */
+-2:	addq	$32, %rcx
+-1:	rep; movsb
++L(32skip):
++	andl	$31, %edx		/* check for left overs */
++#ifdef USE_AS_MEMPCPY
++	jnz	L(1)
+ 
+-#if MEMPCPY_P
+-	movq	%rdi, %rax		/* Set return value.  */
++	movq	%rdi, %rax
+ #else
+-	movq	%r10, %rax		/* Set return value.  */
++	movq	RETVAL (%rsp), %rax
++	jnz	L(1)
+ 
++	rep
++#endif
++	retq				/* exit */
++
++	.p2align 4
++
++L(32after):
++
++/*
++	In order to minimize code-size in RTLD, algorithms specific for
++	larger blocks are excluded when building for RTLD.
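
As a reading aid for the new cacheinfo.c, the probing it performs boils down to a pair of CPUID queries, sketched below as a standalone program. This sketch is illustrative only and not part of the patch; the file name cpuid-sketch.c and the output format are invented. CPUID leaf 0 returns the vendor string in EBX/EDX/ECX, which __cache_sysconf and init_cacheinfo key off, and on AMD processors leaf 0x80000005 reports the L1 data-cache size in KB in bits 31:24 of ECX; handle_amd's expression (ecx >> 14) & 0x3fc00 is that same field scaled to bytes.

/* cpuid-sketch.c -- illustrative only; build with: gcc -O2 cpuid-sketch.c
   (x86-64).  */
#include <stdio.h>
#include <string.h>

static void
cpuid (unsigned int leaf, unsigned int *eax, unsigned int *ebx,
       unsigned int *ecx, unsigned int *edx)
{
  /* Same inline-asm pattern as cacheinfo.c: leaf number in EAX,
     results in EAX/EBX/ECX/EDX.  */
  __asm__ volatile ("cpuid"
                    : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                    : "0" (leaf));
}

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;
  char vendor[13];

  /* Leaf 0: highest basic leaf in EAX, vendor string in EBX, EDX, ECX.  */
  cpuid (0, &eax, &ebx, &ecx, &edx);
  memcpy (vendor, &ebx, 4);
  memcpy (vendor + 4, &edx, 4);
  memcpy (vendor + 8, &ecx, 4);
  vendor[12] = '\0';
  printf ("vendor: %s\n", vendor);

  if (strcmp (vendor, "AuthenticAMD") == 0)
    {
      /* Leaf 0x80000000: highest supported extended leaf.  */
      cpuid (0x80000000, &eax, &ebx, &ecx, &edx);
      if (eax >= 0x80000005)
        {
          /* Leaf 0x80000005: L1 data-cache size in KB in ECX[31:24].  */
          cpuid (0x80000005, &eax, &ebx, &ecx, &edx);
          printf ("L1d cache: %u KB\n", ecx >> 24);
        }
    }
  return 0;
}

On a GenuineIntel machine the sizes come instead from the leaf-2 descriptor bytes that the intel_02_known table decodes, which is why the patch keeps that table-driven path Intel-only and uses the extended leaves for AMD.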