-rw-r--r--  fedora/glibc-rh234946.patch        32
-rw-r--r--  fedora/glibc-x86_64-memcpy.patch  1439
-rw-r--r--  fedora/glibc.spec.in                 7
3 files changed, 1477 insertions(+), 1 deletion(-)
diff --git a/fedora/glibc-rh234946.patch b/fedora/glibc-rh234946.patch
new file mode 100644
index 0000000000..add3f8023f
--- /dev/null
+++ b/fedora/glibc-rh234946.patch
@@ -0,0 +1,32 @@
+2006-12-25 Sripathi Kodi <sripathik@in.ibm.com>
+
+ * include/link.h: Declare new flag l_fini_called in struct link_map.
+ * elf/dl-fini.c: In _dl_fini, set l_fini_called to 1 instead of
+ l_init_called to 0.
+
+--- libc/elf/dl-fini.c 2006-12-22 01:54:22.000000000 -0600
++++ libc/elf/dl-fini.c 2006-12-24 22:51:52.000000000 -0600
+@@ -215,10 +215,10 @@ _dl_fini (void)
+ {
+ l = maps[i];
+
+- if (l->l_init_called)
++ if (l->l_init_called && !l->l_fini_called)
+ {
+ /* Make sure nothing happens if we are called twice. */
+- l->l_init_called = 0;
++ l->l_fini_called = 1;
+
+ /* Is there a destructor function? */
+ if (l->l_info[DT_FINI_ARRAY] != NULL
+--- libc/include/link.h 2006-12-22 01:54:22.000000000 -0600
++++ libc/include/link.h 2006-12-24 22:53:29.000000000 -0600
+@@ -185,6 +185,8 @@ struct link_map
+ unsigned int l_contiguous:1; /* Nonzero if inter-segment holes are
+ mprotected or if no holes are present at
+ all. */
++ unsigned int l_fini_called:1; /* Nonzero if _dl_fini has processed
++ this object. */
+
+ /* Array with version names. */
+ unsigned int l_nversions;
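
[Editor's aside, not part of the patch: a minimal standalone C sketch of the guard pattern glibc-rh234946.patch introduces. The names fake_link_map and run_fini_once are hypothetical, not glibc's; only the two bitfields mirror the patch.]

    /* Sketch: keep "constructors ran" and "destructors ran" as separate
       flags.  The pre-patch code cleared l_init_called to guard against
       running destructors twice, which destroyed the record that the
       object had ever been initialized.  */
    #include <stdio.h>

    struct fake_link_map
    {
      unsigned int l_init_called:1;  /* nonzero if constructors ran */
      unsigned int l_fini_called:1;  /* nonzero if destructors ran (new flag) */
    };

    static void
    run_fini_once (struct fake_link_map *l)
    {
      if (l->l_init_called && !l->l_fini_called)
        {
          /* Record completion without touching l_init_called.  */
          l->l_fini_called = 1;
          puts ("running destructors");
        }
    }

    int
    main (void)
    {
      struct fake_link_map l = { .l_init_called = 1 };
      run_fini_once (&l);  /* prints once */
      run_fini_once (&l);  /* no-op on the second call */
      return 0;
    }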
diff --git a/fedora/glibc-x86_64-memcpy.patch b/fedora/glibc-x86_64-memcpy.patch
new file mode 100644
index 0000000000..3888134df8
--- /dev/null
+++ b/fedora/glibc-x86_64-memcpy.patch
@@ -0,0 +1,1439 @@
+2007-05-21 Ulrich Drepper <drepper@redhat.com>
+
+ * sysdeps/x86_64/cacheinfo.c (init_cacheinfo): Pass correct value
+ as second parameter to handle_intel.
+
+ * sysdeps/unix/sysv/linux/x86_64/sysconf.c: Move cache information
+ handling to ...
+ * sysdeps/x86_64/cacheinfo.c: ... here. New file.
+ * sysdeps/x86_64/Makefile [subdir=string] (sysdep_routines): Add
+ cacheinfo.
+ * sysdeps/x86_64/memcpy.S: Complete rewrite.
+ * sysdeps/x86_64/mempcpy.S: Adjust appropriately.
+ Patch by Evandro Menezes <evandro.menezes@amd.com>.
+
+--- libc/sysdeps/x86_64/Makefile 16 Aug 2004 06:46:14 -0000 1.4
++++ libc/sysdeps/x86_64/Makefile 21 May 2007 19:20:45 -0000 1.5
+@@ -9,3 +9,7 @@ endif
+ ifeq ($(subdir),gmon)
+ sysdep_routines += _mcount
+ endif
++
++ifeq ($(subdir),string)
++sysdep_routines += cacheinfo
++endif
+--- libc/sysdeps/x86_64/cacheinfo.c 1 Jan 1970 00:00:00 -0000
++++ libc/sysdeps/x86_64/cacheinfo.c 21 May 2007 22:37:45 -0000 1.2
+@@ -0,0 +1,451 @@
++/* x86_64 cache info.
++ Copyright (C) 2003, 2004, 2006, 2007 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library; if not, write to the Free
++ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
++ 02111-1307 USA.
++*/
++
++#include <assert.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <unistd.h>
++
++static const struct intel_02_cache_info
++{
++ unsigned int idx;
++ int name;
++ long int size;
++ long int assoc;
++ long int linesize;
++} intel_02_known [] =
++ {
++ { 0x06, _SC_LEVEL1_ICACHE_SIZE, 8192, 4, 32 },
++ { 0x08, _SC_LEVEL1_ICACHE_SIZE, 16384, 4, 32 },
++ { 0x0a, _SC_LEVEL1_DCACHE_SIZE, 8192, 2, 32 },
++ { 0x0c, _SC_LEVEL1_DCACHE_SIZE, 16384, 4, 32 },
++ { 0x22, _SC_LEVEL3_CACHE_SIZE, 524288, 4, 64 },
++ { 0x23, _SC_LEVEL3_CACHE_SIZE, 1048576, 8, 64 },
++ { 0x25, _SC_LEVEL3_CACHE_SIZE, 2097152, 8, 64 },
++ { 0x29, _SC_LEVEL3_CACHE_SIZE, 4194304, 8, 64 },
++ { 0x2c, _SC_LEVEL1_DCACHE_SIZE, 32768, 8, 64 },
++ { 0x30, _SC_LEVEL1_ICACHE_SIZE, 32768, 8, 64 },
++ { 0x39, _SC_LEVEL2_CACHE_SIZE, 131072, 4, 64 },
++ { 0x3a, _SC_LEVEL2_CACHE_SIZE, 196608, 6, 64 },
++ { 0x3b, _SC_LEVEL2_CACHE_SIZE, 131072, 2, 64 },
++ { 0x3c, _SC_LEVEL2_CACHE_SIZE, 262144, 4, 64 },
++ { 0x3d, _SC_LEVEL2_CACHE_SIZE, 393216, 6, 64 },
++ { 0x3e, _SC_LEVEL2_CACHE_SIZE, 524288, 4, 64 },
++ { 0x41, _SC_LEVEL2_CACHE_SIZE, 131072, 4, 32 },
++ { 0x42, _SC_LEVEL2_CACHE_SIZE, 262144, 4, 32 },
++ { 0x43, _SC_LEVEL2_CACHE_SIZE, 524288, 4, 32 },
++ { 0x44, _SC_LEVEL2_CACHE_SIZE, 1048576, 4, 32 },
++ { 0x45, _SC_LEVEL2_CACHE_SIZE, 2097152, 4, 32 },
++ { 0x46, _SC_LEVEL3_CACHE_SIZE, 4194304, 4, 64 },
++ { 0x47, _SC_LEVEL3_CACHE_SIZE, 8388608, 8, 64 },
++ { 0x49, _SC_LEVEL2_CACHE_SIZE, 4194304, 16, 64 },
++ { 0x4a, _SC_LEVEL3_CACHE_SIZE, 6291456, 12, 64 },
++ { 0x4b, _SC_LEVEL3_CACHE_SIZE, 8388608, 16, 64 },
++ { 0x4c, _SC_LEVEL3_CACHE_SIZE, 12582912, 12, 64 },
++ { 0x4d, _SC_LEVEL3_CACHE_SIZE, 16777216, 16, 64 },
++ { 0x60, _SC_LEVEL1_DCACHE_SIZE, 16384, 8, 64 },
++ { 0x66, _SC_LEVEL1_DCACHE_SIZE, 8192, 4, 64 },
++ { 0x67, _SC_LEVEL1_DCACHE_SIZE, 16384, 4, 64 },
++ { 0x68, _SC_LEVEL1_DCACHE_SIZE, 32768, 4, 64 },
++ { 0x78, _SC_LEVEL2_CACHE_SIZE, 1048576, 8, 64 },
++ { 0x79, _SC_LEVEL2_CACHE_SIZE, 131072, 8, 64 },
++ { 0x7a, _SC_LEVEL2_CACHE_SIZE, 262144, 8, 64 },
++ { 0x7b, _SC_LEVEL2_CACHE_SIZE, 524288, 8, 64 },
++ { 0x7c, _SC_LEVEL2_CACHE_SIZE, 1048576, 8, 64 },
++ { 0x7d, _SC_LEVEL2_CACHE_SIZE, 2097152, 8, 64 },
++ { 0x7f, _SC_LEVEL2_CACHE_SIZE, 524288, 2, 64 },
++ { 0x82, _SC_LEVEL2_CACHE_SIZE, 262144, 8, 32 },
++ { 0x83, _SC_LEVEL2_CACHE_SIZE, 524288, 8, 32 },
++ { 0x84, _SC_LEVEL2_CACHE_SIZE, 1048576, 8, 32 },
++ { 0x85, _SC_LEVEL2_CACHE_SIZE, 2097152, 8, 32 },
++ { 0x86, _SC_LEVEL2_CACHE_SIZE, 524288, 4, 64 },
++ { 0x87, _SC_LEVEL2_CACHE_SIZE, 1048576, 8, 64 },
++ };
++
++#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
++
++static int
++intel_02_known_compare (const void *p1, const void *p2)
++{
++ const struct intel_02_cache_info *i1;
++ const struct intel_02_cache_info *i2;
++
++ i1 = (const struct intel_02_cache_info *) p1;
++ i2 = (const struct intel_02_cache_info *) p2;
++
++ if (i1->idx == i2->idx)
++ return 0;
++
++ return i1->idx < i2->idx ? -1 : 1;
++}
++
++
++static long int
++__attribute__ ((noinline))
++intel_check_word (int name, unsigned int value, bool *has_level_2,
++ bool *no_level_2_or_3)
++{
++ if ((value & 0x80000000) != 0)
++ /* The register value is reserved. */
++ return 0;
++
++ /* Fold the name. The _SC_ constants are always in the order SIZE,
++ ASSOC, LINESIZE. */
++ int folded_name = (_SC_LEVEL1_ICACHE_SIZE
++ + ((name - _SC_LEVEL1_ICACHE_SIZE) / 3) * 3);
++
++ while (value != 0)
++ {
++ unsigned int byte = value & 0xff;
++
++ if (byte == 0x40)
++ {
++ *no_level_2_or_3 = true;
++
++ if (folded_name == _SC_LEVEL3_CACHE_SIZE)
++ /* No need to look further. */
++ break;
++ }
++ else
++ {
++ if (byte == 0x49 && folded_name == _SC_LEVEL3_CACHE_SIZE)
++ {
++ /* Intel reused this value. For family 15, model 6 it
++ specifies the 3rd level cache. Otherwise the 2nd
++ level cache. */
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (1));
++
++ unsigned int family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf);
++ unsigned int model = ((((eax >>16) & 0xf) << 4)
++ + ((eax >> 4) & 0xf));
++ if (family == 15 && model == 6)
++ {
++ /* The level 3 cache is encoded for this model like
++ the level 2 cache is for other models. Pretend
++ the caller asked for the level 2 cache. */
++ name = (_SC_LEVEL2_CACHE_SIZE
++ + (name - _SC_LEVEL3_CACHE_SIZE));
++ folded_name = _SC_LEVEL2_CACHE_SIZE;
++ }
++ }
++
++ struct intel_02_cache_info *found;
++ struct intel_02_cache_info search;
++
++ search.idx = byte;
++ found = bsearch (&search, intel_02_known, nintel_02_known,
++ sizeof (intel_02_known[0]), intel_02_known_compare);
++ if (found != NULL)
++ {
++ if (found->name == folded_name)
++ {
++ unsigned int offset = name - folded_name;
++
++ if (offset == 0)
++ /* Cache size. */
++ return found->size;
++ if (offset == 1)
++ return found->assoc;
++
++ assert (offset == 2);
++ return found->linesize;
++ }
++
++ if (found->name == _SC_LEVEL2_CACHE_SIZE)
++ *has_level_2 = true;
++ }
++ }
++
++ /* Next byte for the next round. */
++ value >>= 8;
++ }
++
++ /* Nothing found. */
++ return 0;
++}
++
++
++static long int __attribute__ ((noinline))
++handle_intel (int name, unsigned int maxidx)
++{
++ assert (maxidx >= 2);
++
++ /* OK, we can use the CPUID instruction to get all info about the
++ caches. */
++ unsigned int cnt = 0;
++ unsigned int max = 1;
++ long int result = 0;
++ bool no_level_2_or_3 = false;
++ bool has_level_2 = false;
++
++ while (cnt++ < max)
++ {
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (2));
++
++ /* The low byte of EAX in the first round contains the number of
++ rounds we have to make. At least one, the one we are already
++ doing. */
++ if (cnt == 1)
++ {
++ max = eax & 0xff;
++ eax &= 0xffffff00;
++ }
++
++ /* Process the individual registers' value. */
++ result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3);
++ if (result != 0)
++ return result;
++
++ result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3);
++ if (result != 0)
++ return result;
++
++ result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3);
++ if (result != 0)
++ return result;
++
++ result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3);
++ if (result != 0)
++ return result;
++ }
++
++ if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
++ && no_level_2_or_3)
++ return -1;
++
++ return 0;
++}
++
++
++static long int __attribute__ ((noinline))
++handle_amd (int name)
++{
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (0x80000000));
++
++ if (name >= _SC_LEVEL3_CACHE_SIZE)
++ return 0;
++
++ unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
++ if (eax < fn)
++ return 0;
++
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (fn));
++
++ if (name < _SC_LEVEL1_DCACHE_SIZE)
++ {
++ name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
++ ecx = edx;
++ }
++
++ switch (name)
++ {
++ case _SC_LEVEL1_DCACHE_SIZE:
++ return (ecx >> 14) & 0x3fc00;
++ case _SC_LEVEL1_DCACHE_ASSOC:
++ ecx >>= 16;
++ if ((ecx & 0xff) == 0xff)
++ /* Fully associative. */
++ return (ecx << 2) & 0x3fc00;
++ return ecx & 0xff;
++ case _SC_LEVEL1_DCACHE_LINESIZE:
++ return ecx & 0xff;
++ case _SC_LEVEL2_CACHE_SIZE:
++ return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;
++ case _SC_LEVEL2_CACHE_ASSOC:
++ ecx >>= 12;
++ switch (ecx & 0xf)
++ {
++ case 0:
++ case 1:
++ case 2:
++ case 4:
++ return ecx & 0xf;
++ case 6:
++ return 8;
++ case 8:
++ return 16;
++ case 0xf:
++ return (ecx << 6) & 0x3fffc00;
++ default:
++ return 0;
++ }
++ case _SC_LEVEL2_CACHE_LINESIZE:
++ return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;
++ default:
++ assert (! "cannot happen");
++ }
++ return -1;
++}
++
++
++/* Get the value of the system variable NAME. */
++long int
++attribute_hidden
++__cache_sysconf (int name)
++{
++ /* Find out what brand of processor. */
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (0));
++
++ /* This spells out "GenuineIntel". */
++ if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
++ return handle_intel (name, eax);
++
++ /* This spells out "AuthenticAMD". */
++ if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
++ return handle_amd (name);
++
++ // XXX Fill in more vendors.
++
++ /* CPU not known, we have no information. */
++ return 0;
++}
++
++
++/* Half the core cache size for use in memory and string routines, typically
++ L1 size. */
++long int __x86_64_core_cache_size_half attribute_hidden = 32 * 1024 / 2;
++/* Shared cache size for use in memory and string routines, typically
++ L2 or L3 size. */
++long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
++/* PREFETCHW support flag for use in memory and string routines. */
++int __x86_64_prefetchw attribute_hidden;
++
++
++static void
++__attribute__((constructor))
++init_cacheinfo (void)
++{
++ /* Find out what brand of processor. */
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ int max_cpuid;
++ int max_cpuid_ex;
++ long int core = -1;
++ long int shared = -1;
++ unsigned int level;
++ unsigned int threads = 0;
++
++ asm volatile ("cpuid"
++ : "=a" (max_cpuid), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (0));
++
++ /* This spells out "GenuineIntel". */
++ if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
++ {
++ core = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
++
++ /* Try L3 first. */
++ level = 3;
++ shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);
++
++ if (shared <= 0)
++ {
++ /* Try L2 otherwise. */
++ level = 2;
++ shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
++ }
++
++ /* Figure out the number of logical threads that share the
++ highest cache level. */
++ if (max_cpuid >= 4)
++ {
++ int i = 0;
++
++ /* Query until desired cache level is enumerated. */
++ do
++ {
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (4), "2" (i++));
++ }
++ while (((eax >> 5) & 0x7) != level);
++
++ threads = ((eax >> 14) & 0x3ff) + 1;
++ }
++ else
++ {
++ /* Assume that all logical threads share the highest cache level. */
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (1));
++
++ threads = (ebx >> 16) & 0xff;
++ }
++
++ /* Cap usage of highest cache level to the number of supported
++ threads. */
++ if (shared > 0 && threads > 0)
++ shared /= threads;
++ }
++ /* This spells out "AuthenticAMD". */
++ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
++ {
++ core = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
++ shared = handle_amd (_SC_LEVEL2_CACHE_SIZE);
++
++ asm volatile ("cpuid"
++ : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (0x80000000));
++
++ if (max_cpuid_ex >= 0x80000001)
++ {
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (0x80000001));
++ /* PREFETCHW || 3DNow! */
++ if ((ecx & 0x100) || (edx & 0x80000000))
++ __x86_64_prefetchw = -1;
++ }
++ }
++
++ if (core > 0)
++ __x86_64_core_cache_size_half = core / 2;
++
++ if (shared > 0)
++ __x86_64_shared_cache_size_half = shared / 2;
++}
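
[Editor's aside, not part of the patch: the vendor checks above compare whole registers against magic constants because CPUID leaf 0 returns the 12-byte vendor string split across EBX, EDX, ECX, in that order. A minimal sketch, assuming GCC/Clang on x86-64 with GNU inline asm.]

    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      unsigned int eax, ebx, ecx, edx;
      __asm__ volatile ("cpuid"
                        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                        : "0" (0));              /* leaf 0 */

      char vendor[13];
      memcpy (vendor + 0, &ebx, 4);  /* "Genu" / "Auth" */
      memcpy (vendor + 4, &edx, 4);  /* "ineI" / "enti" */
      memcpy (vendor + 8, &ecx, 4);  /* "ntel" / "cAMD" */
      vendor[12] = '\0';

      printf ("vendor: %s, max basic leaf: %u\n", vendor, eax);
      return 0;
    }

For example, EBX == 0x756e6547 is the little-endian encoding of "Genu", which is why __cache_sysconf can test three registers instead of comparing strings.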
+--- libc/sysdeps/x86_64/memcpy.S 18 Oct 2004 04:17:08 -0000 1.5
++++ libc/sysdeps/x86_64/memcpy.S 21 May 2007 19:21:01 -0000 1.6
+@@ -1,7 +1,10 @@
+-/* Highly optimized version for x86-64.
+- Copyright (C) 1997, 2000, 2002, 2003, 2004 Free Software Foundation, Inc.
++/*
++ Optimized memcpy for x86-64.
++
++ Copyright (C) 2007 Free Software Foundation, Inc.
++ Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.
++
+ This file is part of the GNU C Library.
+- Based on i586 version contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+@@ -16,86 +19,556 @@
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+- 02111-1307 USA. */
++ 02111-1307 USA.
++*/
+
+ #include <sysdep.h>
+ #include "asm-syntax.h"
+-#include "bp-sym.h"
+-#include "bp-asm.h"
+
+-/* BEWARE: `#ifdef memcpy' means that memcpy is redefined as `mempcpy',
+- and the return value is the byte after the last one copied in
+- the destination. */
+-#define MEMPCPY_P (defined memcpy)
++/* Stack slots in the red-zone. */
++
++#ifdef USE_AS_MEMPCPY
++# define RETVAL (0)
++#else
++# define RETVAL (-8)
++#endif
++#define SAVE0 (RETVAL - 8)
++#define SAVE1 (SAVE0 - 8)
++#define SAVE2 (SAVE1 - 8)
++#define SAVE3 (SAVE2 - 8)
+
+ .text
++
+ #if defined PIC && !defined NOT_IN_libc
+ ENTRY (__memcpy_chk)
++
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
++
+ END (__memcpy_chk)
+ #endif
+-ENTRY (BP_SYM (memcpy))
+- /* Cutoff for the big loop is a size of 32 bytes since otherwise
+- the loop will never be entered. */
++
++ENTRY(memcpy) /* (void *, const void*, size_t) */
++
++/* Handle tiny blocks. */
++
++L(1try): /* up to 32B */
+ cmpq $32, %rdx
+- movq %rdx, %rcx
+-#if !MEMPCPY_P
+- movq %rdi, %r10 /* Save value. */
++#ifndef USE_AS_MEMPCPY
++ movq %rdi, %rax /* save return value */
+ #endif
++ jae L(1after)
+
+- /* We need this in any case. */
+- cld
++L(1): /* 1-byte once */
++ testb $1, %dl
++ jz L(1a)
+
+- jbe 1f
++ movzbl (%rsi), %ecx
++ movb %cl, (%rdi)
+
+- /* Align destination. */
+- movq %rdi, %rax
+- negq %rax
+- andq $7, %rax
+- subq %rax, %rcx
+- xchgq %rax, %rcx
++ incq %rsi
++ incq %rdi
++
++ .p2align 4,, 4
++
++L(1a): /* 2-byte once */
++ testb $2, %dl
++ jz L(1b)
++
++ movzwl (%rsi), %ecx
++ movw %cx, (%rdi)
+
+- rep; movsb
++ addq $2, %rsi
++ addq $2, %rdi
+
+- movq %rax, %rcx
+- subq $32, %rcx
+- js 2f
++ .p2align 4,, 4
++
++L(1b): /* 4-byte once */
++ testb $4, %dl
++ jz L(1c)
++
++ movl (%rsi), %ecx
++ movl %ecx, (%rdi)
++
++ addq $4, %rsi
++ addq $4, %rdi
++
++ .p2align 4,, 4
++
++L(1c): /* 8-byte once */
++ testb $8, %dl
++ jz L(1d)
++
++ movq (%rsi), %rcx
++ movq %rcx, (%rdi)
++
++ addq $8, %rsi
++ addq $8, %rdi
++
++ .p2align 4,, 4
++
++L(1d): /* 16-byte loop */
++ andl $0xf0, %edx
++ jz L(exit)
+
+ .p2align 4
+-3:
+
+- /* Now correct the loop counter. Please note that in the following
+- code the flags are not changed anymore. */
+- subq $32, %rcx
++L(1loop):
++ movq (%rsi), %rcx
++ movq 8 (%rsi), %r8
++ movq %rcx, (%rdi)
++ movq %r8, 8 (%rdi)
++
++ subl $16, %edx
++
++ leaq 16 (%rsi), %rsi
++ leaq 16 (%rdi), %rdi
++
++ jnz L(1loop)
++
++ .p2align 4,, 4
++
++L(exit): /* exit */
++#ifdef USE_AS_MEMPCPY
++ movq %rdi, %rax /* return value */
++#else
++ rep
++#endif
++ retq
++
++ .p2align 4
++
++L(1after):
++#ifndef USE_AS_MEMPCPY
++ movq %rax, RETVAL (%rsp) /* save return value */
++#endif
++
++/* Align to the natural word size. */
++
++L(aligntry):
++ movl %esi, %ecx /* align by destination */
++
++ andl $7, %ecx
++ jz L(alignafter) /* already aligned */
++
++L(align): /* align */
++ leaq -8 (%rcx, %rdx), %rdx /* calculate remaining bytes */
++ subl $8, %ecx
++
++ .p2align 4
++
++L(alignloop): /* 1-byte alignment loop */
++ movzbl (%rsi), %eax
++ movb %al, (%rdi)
++
++ incl %ecx
++
++ leaq 1 (%rsi), %rsi
++ leaq 1 (%rdi), %rdi
++
++ jnz L(alignloop)
++
++ .p2align 4
++
++L(alignafter):
++
++/* Loop to handle mid-sized blocks. */
++
++L(32try): /* up to 1KB */
++ cmpq $1024, %rdx
++ ja L(32after)
++
++L(32): /* 32-byte loop */
++ movl %edx, %ecx
++ shrl $5, %ecx
++ jz L(32skip)
++
++ .p2align 4
++
++L(32loop):
++ decl %ecx
+
+ movq (%rsi), %rax
+- movq 8(%rsi), %rdx
+- movq 16(%rsi), %r8
+- movq 24(%rsi), %r9
++ movq 8 (%rsi), %r8
++ movq 16 (%rsi), %r9
++ movq 24 (%rsi), %r10
++
+ movq %rax, (%rdi)
+- movq %rdx, 8(%rdi)
+- movq %r8, 16(%rdi)
+- movq %r9, 24(%rdi)
++ movq %r8, 8 (%rdi)
++ movq %r9, 16 (%rdi)
++ movq %r10, 24 (%rdi)
+
+ leaq 32(%rsi), %rsi
+ leaq 32(%rdi), %rdi
+
+- jns 3b
++ jz L(32skip) /* help out smaller blocks */
++
++ decl %ecx
++
++ movq (%rsi), %rax
++ movq 8 (%rsi), %r8
++ movq 16 (%rsi), %r9
++ movq 24 (%rsi), %r10
++
++ movq %rax, (%rdi)
++ movq %r8, 8 (%rdi)
++ movq %r9, 16 (%rdi)
++ movq %r10, 24 (%rdi)
++
++ leaq 32 (%rsi), %rsi
++ leaq 32 (%rdi), %rdi
++
++ jnz L(32loop)
++
++ .p2align 4
+
+- /* Correct extra loop counter modification. */
+-2: addq $32, %rcx
+-1: rep; movsb
++L(32skip):
++ andl $31, %edx /* check for leftovers */
++#ifdef USE_AS_MEMPCPY
++ jnz L(1)
+
+-#if MEMPCPY_P
+- movq %rdi, %rax /* Set return value. */
++ movq %rdi, %rax
+ #else
+- movq %r10, %rax /* Set return value. */
++ movq RETVAL (%rsp), %rax
++ jnz L(1)
+
++ rep
++#endif
++ retq /* exit */
++
++ .p2align 4
++
++L(32after):
++
++/*
++ In order to minimize code-size in RTLD, algorithms specific for
++ larger blocks are excluded when building for RTLD.
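
[Editor's aside, not part of the patch: the L(1)..L(1d) path above copies blocks shorter than 32 bytes by peeling off one 1-, 2-, 4- and 8-byte move according to the low bits of the size, then finishing with a 16-byte chunk. A C rendering of that dispatch, purely illustrative; tiny_copy is a hypothetical name, not a glibc symbol.]

    #include <stddef.h>
    #include <string.h>

    static void *
    tiny_copy (void *dst, const void *src, size_t n)  /* requires n < 32 */
    {
      unsigned char *d = dst;
      const unsigned char *s = src;

      /* memcpy with a constant size compiles to a single unaligned
         load/store pair; it stands in for the movb/movw/movl/movq above.  */
      if (n & 1) { memcpy (d, s, 1); d += 1; s += 1; }  /* L(1)  */
      if (n & 2) { memcpy (d, s, 2); d += 2; s += 2; }  /* L(1a) */
      if (n & 4) { memcpy (d, s, 4); d += 4; s += 4; }  /* L(1b) */
      if (n & 8) { memcpy (d, s, 8); d += 8; s += 8; }  /* L(1c) */
      if (n & 16)                                       /* L(1d) */
        memcpy (d, s, 16);

      return dst;  /* memcpy returns dst; mempcpy would return d + (n & 16) */
    }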