-rw-r--r--  fedora/glibc-rh234946.patch        32
-rw-r--r--  fedora/glibc-x86_64-memcpy.patch  1439
-rw-r--r--  fedora/glibc.spec.in                 7
3 files changed, 1477 insertions(+), 1 deletion(-)
diff --git a/fedora/glibc-rh234946.patch b/fedora/glibc-rh234946.patch
new file mode 100644
index 0000000000..add3f8023f
--- /dev/null
+++ b/fedora/glibc-rh234946.patch
@@ -0,0 +1,32 @@
+2006-12-25 Sripathi Kodi <sripathik@in.ibm.com>
+
+ * include/link.h: Declare new flag l_fini_called in struct link_map.
+ * elf/dl-fini.c: In _dl_fini, set l_fini_called to 1 instead of
+ l_init_called to 0.
+
+--- libc/elf/dl-fini.c 2006-12-22 01:54:22.000000000 -0600
++++ libc/elf/dl-fini.c 2006-12-24 22:51:52.000000000 -0600
+@@ -215,10 +215,10 @@ _dl_fini (void)
+ {
+ l = maps[i];
+
+- if (l->l_init_called)
++ if (l->l_init_called && !l->l_fini_called)
+ {
+ /* Make sure nothing happens if we are called twice. */
+- l->l_init_called = 0;
++ l->l_fini_called = 1;
+
+ /* Is there a destructor function? */
+ if (l->l_info[DT_FINI_ARRAY] != NULL
+--- libc/include/link.h 2006-12-22 01:54:22.000000000 -0600
++++ libc/include/link.h 2006-12-24 22:53:29.000000000 -0600
+@@ -185,6 +185,8 @@ struct link_map
+ unsigned int l_contiguous:1; /* Nonzero if inter-segment holes are
+ mprotected or if no holes are present at
+ all. */
++ unsigned int l_fini_called:1; /* Nonzero if _dl_fini has processed
++ this object. */
+
+ /* Array with version names. */
+ unsigned int l_nversions;
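
[Editor's aside, not part of the patch: a minimal standalone C sketch of the guard pattern glibc-rh234946.patch introduces. The names fake_link_map and run_fini_once are hypothetical, not glibc's; only the two bitfields mirror the patch.]

    /* Sketch: keep "constructors ran" and "destructors ran" as separate
       flags.  The pre-patch code cleared l_init_called to guard against
       running destructors twice, which destroyed the record that the
       object had ever been initialized.  */
    #include <stdio.h>

    struct fake_link_map
    {
      unsigned int l_init_called:1;  /* nonzero if constructors ran */
      unsigned int l_fini_called:1;  /* nonzero if destructors ran (new flag) */
    };

    static void
    run_fini_once (struct fake_link_map *l)
    {
      if (l->l_init_called && !l->l_fini_called)
        {
          /* Record completion without touching l_init_called.  */
          l->l_fini_called = 1;
          puts ("running destructors");
        }
    }

    int
    main (void)
    {
      struct fake_link_map l = { .l_init_called = 1 };
      run_fini_once (&l);  /* prints once */
      run_fini_once (&l);  /* no-op on the second call */
      return 0;
    }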
diff --git a/fedora/glibc-x86_64-memcpy.patch b/fedora/glibc-x86_64-memcpy.patch
new file mode 100644
index 0000000000..3888134df8
--- /dev/null
+++ b/fedora/glibc-x86_64-memcpy.patch
@@ -0,0 +1,1439 @@
+2007-05-21 Ulrich Drepper <drepper@redhat.com>
+
+ * sysdeps/x86_64/cacheinfo.c (init_cacheinfo): Pass correct value
+ as second parameter to handle_intel.
+
+ * sysdeps/unix/sysv/linux/x86_64/sysconf.c: Move cache information
+ handling to ...
+ * sysdeps/x86_64/cacheinfo.c: ... here. New file.
+ * sysdeps/x86_64/Makefile [subdir=string] (sysdep_routines): Add
+ cacheinfo.
+ * sysdeps/x86_64/memcpy.S: Complete rewrite.
+ * sysdeps/x86_64/mempcpy.S: Adjust appropriately.
+ Patch by Evandro Menezes <evandro.menezes@amd.com>.
+
+--- libc/sysdeps/x86_64/Makefile 16 Aug 2004 06:46:14 -0000 1.4
++++ libc/sysdeps/x86_64/Makefile 21 May 2007 19:20:45 -0000 1.5
+@@ -9,3 +9,7 @@ endif
+ ifeq ($(subdir),gmon)
+ sysdep_routines += _mcount
+ endif
++
++ifeq ($(subdir),string)
++sysdep_routines += cacheinfo
++endif
+--- libc/sysdeps/x86_64/cacheinfo.c 1 Jan 1970 00:00:00 -0000
++++ libc/sysdeps/x86_64/cacheinfo.c 21 May 2007 22:37:45 -0000 1.2
+@@ -0,0 +1,451 @@
++/* x86_64 cache info.
++ Copyright (C) 2003, 2004, 2006, 2007 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library; if not, write to the Free
++ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
++ 02111-1307 USA.
++*/
++
++#include <assert.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <unistd.h>
++
++static const struct intel_02_cache_info
++{
++ unsigned int idx;
++ int name;
++ long int size;
++ long int assoc;
++ long int linesize;
++} intel_02_known [] =
++ {
++ { 0x06, _SC_LEVEL1_ICACHE_SIZE, 8192, 4, 32 },
++ { 0x08, _SC_LEVEL1_ICACHE_SIZE, 16384, 4, 32 },
++ { 0x0a, _SC_LEVEL1_DCACHE_SIZE, 8192, 2, 32 },
++ { 0x0c, _SC_LEVEL1_DCACHE_SIZE, 16384, 4, 32 },
++ { 0x22, _SC_LEVEL3_CACHE_SIZE, 524288, 4, 64 },
++ { 0x23, _SC_LEVEL3_CACHE_SIZE, 1048576, 8, 64 },
++ { 0x25, _SC_LEVEL3_CACHE_SIZE, 2097152, 8, 64 },
++ { 0x29, _SC_LEVEL3_CACHE_SIZE, 4194304, 8, 64 },
++ { 0x2c, _SC_LEVEL1_DCACHE_SIZE, 32768, 8, 64 },
++ { 0x30, _SC_LEVEL1_ICACHE_SIZE, 32768, 8, 64 },
++ { 0x39, _SC_LEVEL2_CACHE_SIZE, 131072, 4, 64 },
++ { 0x3a, _SC_LEVEL2_CACHE_SIZE, 196608, 6, 64 },
++ { 0x3b, _SC_LEVEL2_CACHE_SIZE, 131072, 2, 64 },
++ { 0x3c, _SC_LEVEL2_CACHE_SIZE, 262144, 4, 64 },
++ { 0x3d, _SC_LEVEL2_CACHE_SIZE, 393216, 6, 64 },
++ { 0x3e, _SC_LEVEL2_CACHE_SIZE, 524288, 4, 64 },
++ { 0x41, _SC_LEVEL2_CACHE_SIZE, 131072, 4, 32 },
++ { 0x42, _SC_LEVEL2_CACHE_SIZE, 262144, 4, 32 },
++ { 0x43, _SC_LEVEL2_CACHE_SIZE, 524288, 4, 32 },
++ { 0x44, _SC_LEVEL2_CACHE_SIZE, 1048576, 4, 32 },
++ { 0x45, _SC_LEVEL2_CACHE_SIZE, 2097152, 4, 32 },
++ { 0x46, _SC_LEVEL3_CACHE_SIZE, 4194304, 4, 64 },
++ { 0x47, _SC_LEVEL3_CACHE_SIZE, 8388608, 8, 64 },
++ { 0x49, _SC_LEVEL2_CACHE_SIZE, 4194304, 16, 64 },
++ { 0x4a, _SC_LEVEL3_CACHE_SIZE, 6291456, 12, 64 },
++ { 0x4b, _SC_LEVEL3_CACHE_SIZE, 8388608, 16, 64 },
++ { 0x4c, _SC_LEVEL3_CACHE_SIZE, 12582912, 12, 64 },
++ { 0x4d, _SC_LEVEL3_CACHE_SIZE, 16777216, 16, 64 },
++ { 0x60, _SC_LEVEL1_DCACHE_SIZE, 16384, 8, 64 },
++ { 0x66, _SC_LEVEL1_DCACHE_SIZE, 8192, 4, 64 },
++ { 0x67, _SC_LEVEL1_DCACHE_SIZE, 16384, 4, 64 },
++ { 0x68, _SC_LEVEL1_DCACHE_SIZE, 32768, 4, 64 },
++ { 0x78, _SC_LEVEL2_CACHE_SIZE, 1048576, 8, 64 },
++ { 0x79, _SC_LEVEL2_CACHE_SIZE, 131072, 8, 64 },
++ { 0x7a, _SC_LEVEL2_CACHE_SIZE, 262144, 8, 64 },
++ { 0x7b, _SC_LEVEL2_CACHE_SIZE, 524288, 8, 64 },
++ { 0x7c, _SC_LEVEL2_CACHE_SIZE, 1048576, 8, 64 },
++ { 0x7d, _SC_LEVEL2_CACHE_SIZE, 2097152, 8, 64 },
++ { 0x7f, _SC_LEVEL2_CACHE_SIZE, 524288, 2, 64 },
++ { 0x82, _SC_LEVEL2_CACHE_SIZE, 262144, 8, 32 },
++ { 0x83, _SC_LEVEL2_CACHE_SIZE, 524288, 8, 32 },
++ { 0x84, _SC_LEVEL2_CACHE_SIZE, 1048576, 8, 32 },
++ { 0x85, _SC_LEVEL2_CACHE_SIZE, 2097152, 8, 32 },
++ { 0x86, _SC_LEVEL2_CACHE_SIZE, 524288, 4, 64 },
++ { 0x87, _SC_LEVEL2_CACHE_SIZE, 1048576, 8, 64 },
++ };
++
++#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
++
++static int
++intel_02_known_compare (const void *p1, const void *p2)
++{
++ const struct intel_02_cache_info *i1;
++ const struct intel_02_cache_info *i2;
++
++ i1 = (const struct intel_02_cache_info *) p1;
++ i2 = (const struct intel_02_cache_info *) p2;
++
++ if (i1->idx == i2->idx)
++ return 0;
++
++ return i1->idx < i2->idx ? -1 : 1;
++}
++
++
++static long int
++__attribute__ ((noinline))
++intel_check_word (int name, unsigned int value, bool *has_level_2,
++ bool *no_level_2_or_3)
++{
++ if ((value & 0x80000000) != 0)
++ /* The register value is reserved. */
++ return 0;
++
++ /* Fold the name. The _SC_ constants are always in the order SIZE,
++ ASSOC, LINESIZE. */
++ int folded_name = (_SC_LEVEL1_ICACHE_SIZE
++ + ((name - _SC_LEVEL1_ICACHE_SIZE) / 3) * 3);
++
++ while (value != 0)
++ {
++ unsigned int byte = value & 0xff;
++
++ if (byte == 0x40)
++ {
++ *no_level_2_or_3 = true;
++
++ if (folded_name == _SC_LEVEL3_CACHE_SIZE)
++ /* No need to look further. */
++ break;
++ }
++ else
++ {
++ if (byte == 0x49 && folded_name == _SC_LEVEL3_CACHE_SIZE)
++ {
++ /* Intel reused this value. For family 15, model 6 it
++ specifies the 3rd level cache. Otherwise the 2nd
++ level cache. */
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (1));
++
++ unsigned int family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf);
++ unsigned int model = ((((eax >>16) & 0xf) << 4)
++ + ((eax >> 4) & 0xf));
++ if (family == 15 && model == 6)
++ {
++ /* The level 3 cache is encoded for this model like
++ the level 2 cache is for other models. Pretend
++ the caller asked for the level 2 cache. */
++ name = (_SC_LEVEL2_CACHE_SIZE
++ + (name - _SC_LEVEL3_CACHE_SIZE));
++ folded_name = _SC_LEVEL2_CACHE_SIZE;
++ }
++ }
++
++ struct intel_02_cache_info *found;
++ struct intel_02_cache_info search;
++
++ search.idx = byte;
++ found = bsearch (&search, intel_02_known, nintel_02_known,
++ sizeof (intel_02_known[0]), intel_02_known_compare);
++ if (found != NULL)
++ {
++ if (found->name == folded_name)
++ {
++ unsigned int offset = name - folded_name;
++
++ if (offset == 0)
++ /* Cache size. */
++ return found->size;
++ if (offset == 1)
++ return found->assoc;
++
++ assert (offset == 2);
++ return found->linesize;
++ }
++
++ if (found->name == _SC_LEVEL2_CACHE_SIZE)
++ *has_level_2 = true;
++ }
++ }
++
++ /* Next byte for the next round. */
++ value >>= 8;
++ }
++
++ /* Nothing found. */
++ return 0;
++}
++
++
++static long int __attribute__ ((noinline))
++handle_intel (int name, unsigned int maxidx)
++{
++ assert (maxidx >= 2);
++
++ /* OK, we can use the CPUID instruction to get all info about the
++ caches. */
++ unsigned int cnt = 0;
++ unsigned int max = 1;
++ long int result = 0;
++ bool no_level_2_or_3 = false;
++ bool has_level_2 = false;
++
++ while (cnt++ < max)
++ {
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (2));
++
++ /* The low byte of EAX in the first round contains the number of
++ rounds we have to make. At least one, the one we are already
++ doing. */
++ if (cnt == 1)
++ {
++ max = eax & 0xff;
++ eax &= 0xffffff00;
++ }
++
++ /* Process the individual registers' value. */
++ result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3);
++ if (result != 0)
++ return result;
++
++ result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3);
++ if (result != 0)
++ return result;
++
++ result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3);
++ if (result != 0)
++ return result;
++
++ result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3);
++ if (result != 0)
++ return result;
++ }
++
++ if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
++ && no_level_2_or_3)
++ return -1;
++
++ return 0;
++}
++
++
++static long int __attribute__ ((noinline))
++handle_amd (int name)
++{
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (0x80000000));
++
++ if (name >= _SC_LEVEL3_CACHE_SIZE)
++ return 0;
++
++ unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
++ if (eax < fn)
++ return 0;
++
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (fn));
++
++ if (name < _SC_LEVEL1_DCACHE_SIZE)
++ {
++ name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
++ ecx = edx;
++ }
++
++ switch (name)
++ {
++ case _SC_LEVEL1_DCACHE_SIZE:
++ return (ecx >> 14) & 0x3fc00;
++ case _SC_LEVEL1_DCACHE_ASSOC:
++ ecx >>= 16;
++ if ((ecx & 0xff) == 0xff)
++ /* Fully associative. */
++ return (ecx << 2) & 0x3fc00;
++ return ecx & 0xff;
++ case _SC_LEVEL1_DCACHE_LINESIZE:
++ return ecx & 0xff;
++ case _SC_LEVEL2_CACHE_SIZE:
++ return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;
++ case _SC_LEVEL2_CACHE_ASSOC:
++ ecx >>= 12;
++ switch (ecx & 0xf)
++ {
++ case 0:
++ case 1:
++ case 2:
++ case 4:
++ return ecx & 0xf;
++ case 6:
++ return 8;
++ case 8:
++ return 16;
++ case 0xf:
++ return (ecx << 6) & 0x3fffc00;
++ default:
++ return 0;
++ }
++ case _SC_LEVEL2_CACHE_LINESIZE:
++ return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;
++ default:
++ assert (! "cannot happen");
++ }
++ return -1;
++}
++
++
++/* Get the value of the system variable NAME. */
++long int
++attribute_hidden
++__cache_sysconf (int name)
++{
++ /* Find out what brand of processor. */
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (0));
++
++ /* This spells out "GenuineIntel". */
++ if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
++ return handle_intel (name, eax);
++
++ /* This spells out "AuthenticAMD". */
++ if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
++ return handle_amd (name);
++
++ // XXX Fill in more vendors.
++
++ /* CPU not known, we have no information. */
++ return 0;
++}
++
++
++/* Half the core cache size for use in memory and string routines, typically
++ L1 size. */
++long int __x86_64_core_cache_size_half attribute_hidden = 32 * 1024 / 2;
++/* Shared cache size for use in memory and string routines, typically
++ L2 or L3 size. */
++long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
++/* PREFETCHW support flag for use in memory and string routines. */
++int __x86_64_prefetchw attribute_hidden;
++
++
++static void
++__attribute__((constructor))
++init_cacheinfo (void)
++{
++ /* Find out what brand of processor. */
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ int max_cpuid;
++ int max_cpuid_ex;
++ long int core = -1;
++ long int shared = -1;
++ unsigned int level;
++ unsigned int threads = 0;
++
++ asm volatile ("cpuid"
++ : "=a" (max_cpuid), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (0));
++
++ /* This spells out "GenuineIntel". */
++ if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
++ {
++ core = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
++
++ /* Try L3 first. */
++ level = 3;
++ shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);
++
++ if (shared <= 0)
++ {
++ /* Try L2 otherwise. */
++ level = 2;
++ shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
++ }
++
++ /* Figure out the number of logical threads that share the
++ highest cache level. */
++ if (max_cpuid >= 4)
++ {
++ int i = 0;
++
++ /* Query until desired cache level is enumerated. */
++ do
++ {
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (4), "2" (i++));
++ }
++ while (((eax >> 5) & 0x7) != level);
++
++ threads = ((eax >> 14) & 0x3ff) + 1;
++ }
++ else
++ {
++ /* Assume that all logical threads share the highest cache level. */
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (1));
++
++ threads = (ebx >> 16) & 0xff;
++ }
++
++ /* Cap usage of highest cache level to the number of supported
++ threads. */
++ if (shared > 0 && threads > 0)
++ shared /= threads;
++ }
++ /* This spells out "AuthenticAMD". */
++ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
++ {
++ core = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
++ shared = handle_amd (_SC_LEVEL2_CACHE_SIZE);
++
++ asm volatile ("cpuid"
++ : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (0x80000000));
++
++ if (max_cpuid_ex >= 0x80000001)
++ {
++ asm volatile ("cpuid"
++ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
++ : "0" (0x80000001));
++ /* PREFETCHW || 3DNow! */
++ if ((ecx & 0x100) || (edx & 0x80000000))
++ __x86_64_prefetchw = -1;
++ }
++ }
++
++ if (core > 0)
++ __x86_64_core_cache_size_half = core / 2;
++
++ if (shared > 0)
++ __x86_64_shared_cache_size_half = shared / 2;
++}
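
[Editor's aside, not part of the patch: the vendor checks above compare whole registers against magic constants because CPUID leaf 0 returns the 12-byte vendor string split across EBX, EDX, ECX, in that order. A minimal sketch, assuming GCC/Clang on x86-64 with GNU inline asm.]

    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      unsigned int eax, ebx, ecx, edx;
      __asm__ volatile ("cpuid"
                        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                        : "0" (0));              /* leaf 0 */

      char vendor[13];
      memcpy (vendor + 0, &ebx, 4);  /* "Genu" / "Auth" */
      memcpy (vendor + 4, &edx, 4);  /* "ineI" / "enti" */
      memcpy (vendor + 8, &ecx, 4);  /* "ntel" / "cAMD" */
      vendor[12] = '\0';

      printf ("vendor: %s, max basic leaf: %u\n", vendor, eax);
      return 0;
    }

For example, EBX == 0x756e6547 is the little-endian encoding of "Genu", which is why __cache_sysconf can test three registers instead of comparing strings.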
+--- libc/sysdeps/x86_64/memcpy.S 18 Oct 2004 04:17:08 -0000 1.5
++++ libc/sysdeps/x86_64/memcpy.S 21 May 2007 19:21:01 -0000 1.6
+@@ -1,7 +1,10 @@
+-/* Highly optimized version for x86-64.
+- Copyright (C) 1997, 2000, 2002, 2003, 2004 Free Software Foundation, Inc.
++/*
++ Optimized memcpy for x86-64.
++
++ Copyright (C) 2007 Free Software Foundation, Inc.
++ Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.
++
+ This file is part of the GNU C Library.
+- Based on i586 version contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+@@ -16,86 +19,556 @@
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+- 02111-1307 USA. */
++ 02111-1307 USA.
++*/
+
+ #include <sysdep.h>
+ #include "asm-syntax.h"
+-#include "bp-sym.h"
+-#include "bp-asm.h"
+
+-/* BEWARE: `#ifdef memcpy' means that memcpy is redefined as `mempcpy',
+- and the return value is the byte after the last one copied in
+- the destination. */
+-#define MEMPCPY_P (defined memcpy)
++/* Stack slots in the red-zone. */
++
++#ifdef USE_AS_MEMPCPY
++# define RETVAL (0)
++#else
++# define RETVAL (-8)
++#endif
++#define SAVE0 (RETVAL - 8)
++#define SAVE1 (SAVE0 - 8)
++#define SAVE2 (SAVE1 - 8)
++#define SAVE3 (SAVE2 - 8)
+
+ .text
++
+ #if defined PIC && !defined NOT_IN_libc
+ ENTRY (__memcpy_chk)
++
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
++
+ END (__memcpy_chk)
+ #endif
+-ENTRY (BP_SYM (memcpy))
+- /* Cutoff for the big loop is a size of 32 bytes since otherwise
+- the loop will never be entered. */
++
++ENTRY(memcpy) /* (void *, const void*, size_t) */
++
++/* Handle tiny blocks. */
++
++L(1try): /* up to 32B */
+ cmpq $32, %rdx
+- movq %rdx, %rcx
+-#if !MEMPCPY_P
+- movq %rdi, %r10 /* Save value. */
++#ifndef USE_AS_MEMPCPY
++ movq %rdi, %rax /* save return value */
+ #endif
++ jae L(1after)
+
+- /* We need this in any case. */
+- cld
++L(1): /* 1-byte once */
++ testb $1, %dl
++ jz L(1a)
+
+- jbe 1f
++ movzbl (%rsi), %ecx
++ movb %cl, (%rdi)
+
+- /* Align destination. */
+- movq %rdi, %rax
+- negq %rax
+- andq $7, %rax
+- subq %rax, %rcx
+- xchgq %rax, %rcx
++ incq %rsi
++ incq %rdi
++
++ .p2align 4,, 4
++
++L(1a): /* 2-byte once */
++ testb $2, %dl
++ jz L(1b)
++
++ movzwl (%rsi), %ecx
++ movw %cx, (%rdi)
+
+- rep; movsb
++ addq $2, %rsi
++ addq $2, %rdi
+
+- movq %rax, %rcx
+- subq $32, %rcx
+- js 2f
++ .p2align 4,, 4
++
++L(1b): /* 4-byte once */
++ testb $4, %dl
++ jz L(1c)
++
++ movl (%rsi), %ecx
++ movl %ecx, (%rdi)
++
++ addq $4, %rsi
++ addq $4, %rdi
++
++ .p2align 4,, 4
++
++L(1c): /* 8-byte once */
++ testb $8, %dl
++ jz L(1d)
++
++ movq (%rsi), %rcx
++ movq %rcx, (%rdi)
++
++ addq $8, %rsi
++ addq $8, %rdi
++
++ .p2align 4,, 4
++
++L(1d): /* 16-byte loop */
++ andl $0xf0, %edx
++ jz L(exit)
+
+ .p2align 4
+-3:
+
+- /* Now correct the loop counter. Please note that in the following
+- code the flags are not changed anymore. */
+- subq $32, %rcx
++L(1loop):
++ movq (%rsi), %rcx
++ movq 8 (%rsi), %r8
++ movq %rcx, (%rdi)
++ movq %r8, 8 (%rdi)
++
++ subl $16, %edx
++
++ leaq 16 (%rsi), %rsi
++ leaq 16 (%rdi), %rdi
++
++ jnz L(1loop)
++
++ .p2align 4,, 4
++
++L(exit): /* exit */
++#ifdef USE_AS_MEMPCPY
++ movq %rdi, %rax /* return value */
++#else
++ rep
++#endif
++ retq
++
++ .p2align 4
++
++L(1after):
++#ifndef USE_AS_MEMPCPY
++ movq %rax, RETVAL (%rsp) /* save return value */
++#endif
++
++/* Align to the natural word size. */
++
++L(aligntry):
++ movl %esi, %ecx /* align by destination */
++
++ andl $7, %ecx
++ jz L(alignafter) /* already aligned */
++
++L(align): /* align */
++ leaq -8 (%rcx, %rdx), %rdx /* calculate remaining bytes */
++ subl $8, %ecx
++
++ .p2align 4
++
++L(alignloop): /* 1-byte alignment loop */
++ movzbl (%rsi), %eax
++ movb %al, (%rdi)
++
++ incl %ecx
++
++ leaq 1 (%rsi), %rsi
++ leaq 1 (%rdi), %rdi
++
++ jnz L(alignloop)
++
++ .p2align 4
++
++L(alignafter):
++
++/* Loop to handle mid-sized blocks. */
++
++L(32try): /* up to 1KB */
++ cmpq $1024, %rdx
++ ja L(32after)
++
++L(32): /* 32-byte loop */
++ movl %edx, %ecx
++ shrl $5, %ecx
++ jz L(32skip)
++
++ .p2align 4
++
++L(32loop):
++ decl %ecx
+
+ movq (%rsi), %rax
+- movq 8(%rsi), %rdx
+- movq 16(%rsi), %r8
+- movq 24(%rsi), %r9
++ movq 8 (%rsi), %r8
++ movq 16 (%rsi), %r9
++ movq 24 (%rsi), %r10
++
+ movq %rax, (%rdi)
+- movq %rdx, 8(%rdi)
+- movq %r8, 16(%rdi)
+- movq %r9, 24(%rdi)
++ movq %r8, 8 (%rdi)
++ movq %r9, 16 (%rdi)
++ movq %r10, 24 (%rdi)
+
+ leaq 32(%rsi), %rsi
+ leaq 32(%rdi), %rdi
+
+- jns 3b
++ jz L(32skip) /* help out smaller blocks */
++
++ decl %ecx
++
++ movq (%rsi), %rax
++ movq 8 (%rsi), %r8
++ movq 16 (%rsi), %r9
++ movq 24 (%rsi), %r10
++
++ movq %rax, (%rdi)
++ movq %r8, 8 (%rdi)
++ movq %r9, 16 (%rdi)
++ movq %r10, 24 (%rdi)
++
++ leaq 32 (%rsi), %rsi
++ leaq 32 (%rdi), %rdi
++
++ jnz L(32loop)
++
++ .p2align 4
+
+- /* Correct extra loop counter modification. */
+-2: addq $32, %rcx
+-1: rep; movsb
++L(32skip):
++ andl $31, %edx /* check for leftovers */
++#ifdef USE_AS_MEMPCPY
++ jnz L(1)
+
+-#if MEMPCPY_P
+- movq %rdi, %rax /* Set return value. */
++ movq %rdi, %rax
+ #else
+- movq %r10, %rax /* Set return value. */
++ movq RETVAL (%rsp), %rax
++ jnz L(1)
+
++ rep
++#endif
++ retq /* exit */
++
++ .p2align 4
++
++L(32after):
++
++/*
++ In order to minimize code-size in RTLD, algorithms specific for
++ larger blocks are excluded when building for RTLD.
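
[Editor's aside, not part of the patch: the L(1)..L(1d) path above copies blocks shorter than 32 bytes by peeling off one 1-, 2-, 4- and 8-byte move according to the low bits of the size, then finishing with a 16-byte chunk. A C rendering of that dispatch, purely illustrative; tiny_copy is a hypothetical name, not a glibc symbol.]

    #include <stddef.h>
    #include <string.h>

    static void *
    tiny_copy (void *dst, const void *src, size_t n)  /* requires n < 32 */
    {
      unsigned char *d = dst;
      const unsigned char *s = src;

      /* memcpy with a constant size compiles to a single unaligned
         load/store pair; it stands in for the movb/movw/movl/movq above.  */
      if (n & 1) { memcpy (d, s, 1); d += 1; s += 1; }  /* L(1)  */
      if (n & 2) { memcpy (d, s, 2); d += 2; s += 2; }  /* L(1a) */
      if (n & 4) { memcpy (d, s, 4); d += 4; s += 4; }  /* L(1b) */
      if (n & 8) { memcpy (d, s, 8); d += 8; s += 8; }  /* L(1c) */
      if (n & 16)                                       /* L(1d) */
        memcpy (d, s, 16);

      return dst;  /* memcpy returns dst; mempcpy would return d + (n & 16) */
    }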