aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Senkevich <andrew.senkevich@intel.com>2015-06-09 14:25:49 +0300
committerAndrew Senkevich <andrew.senkevich@intel.com>2015-06-09 14:25:49 +0300
commit2193311288b97cf11dfabf1be22eac89b4ff7366 (patch)
treeb01d1141b18f16fe4b55855b6982b7601e208b0e
parentfc5771e43ccf905d3e134b1e3349a3657acaa2a9 (diff)
downloadglibc-2193311288b97cf11dfabf1be22eac89b4ff7366.tar.xz
glibc-2193311288b97cf11dfabf1be22eac89b4ff7366.zip
Start of a series of patches with x86_64 vector math functions.
Here is an implementation of cos containing SSE, AVX, AVX2 and AVX512 versions according to the Vector ABI, which was discussed in <https://groups.google.com/forum/#!topic/x86-64-abi/LmppCfN1rZ4>. The vector math library build and ABI testing are enabled by default for x86_64. * sysdeps/x86_64/fpu/Makefile: New file. * sysdeps/x86_64/fpu/Versions: New file. * sysdeps/x86_64/fpu/svml_d_cos_data.S: New file. * sysdeps/x86_64/fpu/svml_d_cos_data.h: New file. * sysdeps/x86_64/fpu/svml_d_cos2_core.S: New file. * sysdeps/x86_64/fpu/svml_d_cos4_core.S: New file. * sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S: New file. * sysdeps/x86_64/fpu/svml_d_cos8_core.S: New file. * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S: New file. * sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added build of SSE, AVX2 and AVX512 IFUNC versions. * sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration for cos. * math/bits/mathcalls.h: Added cos declaration with __MATHCALL_VEC. * sysdeps/x86_64/configure.ac: Options for libmvec build. * sysdeps/x86_64/configure: Regenerated. * sysdeps/x86_64/sysdep.h (cfi_offset_rel_rsp): New macro. * sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New file. * manual/install.texi (Configuring and compiling): Document --disable-mathvec. * INSTALL: Regenerated. * NEWS: Mention addition of libmvec and x86_64 vector cos.
-rw-r--r--ChangeLog30
-rw-r--r--INSTALL4
-rw-r--r--NEWS8
-rw-r--r--manual/install.texi4
-rw-r--r--math/bits/mathcalls.h2
-rw-r--r--sysdeps/unix/sysv/linux/x86_64/libmvec.abilist6
-rw-r--r--sysdeps/x86/fpu/bits/math-vector.h34
-rw-r--r--sysdeps/x86_64/configure4
-rw-r--r--sysdeps/x86_64/configure.ac4
-rw-r--r--sysdeps/x86_64/fpu/Makefile5
-rw-r--r--sysdeps/x86_64/fpu/Versions5
-rw-r--r--sysdeps/x86_64/fpu/multiarch/Makefile5
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S38
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S223
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S38
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S207
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S39
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S463
-rw-r--r--sysdeps/x86_64/fpu/svml_d_cos2_core.S30
-rw-r--r--sysdeps/x86_64/fpu/svml_d_cos4_core.S30
-rw-r--r--sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S25
-rw-r--r--sysdeps/x86_64/fpu/svml_d_cos8_core.S26
-rw-r--r--sysdeps/x86_64/fpu/svml_d_cos_data.S114
-rw-r--r--sysdeps/x86_64/fpu/svml_d_cos_data.h48
-rw-r--r--sysdeps/x86_64/fpu/svml_d_wrapper_impl.h101
-rw-r--r--sysdeps/x86_64/sysdep.h7
26 files changed, 1499 insertions, 1 deletions
diff --git a/ChangeLog b/ChangeLog
index 63dc9982b7..0877e05289 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,33 @@
+2015-06-09 Andrew Senkevich <andrew.senkevich@intel.com>
+
+ * sysdeps/x86_64/fpu/Makefile: New file.
+ * sysdeps/x86_64/fpu/Versions: New file.
+ * sysdeps/x86_64/fpu/svml_d_cos_data.S: New file.
+ * sysdeps/x86_64/fpu/svml_d_cos_data.h: New file.
+ * sysdeps/x86_64/fpu/svml_d_cos2_core.S: New file.
+ * sysdeps/x86_64/fpu/svml_d_cos4_core.S: New file.
+ * sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S: New file.
+ * sysdeps/x86_64/fpu/svml_d_cos8_core.S: New file.
+ * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added
+ build of SSE, AVX2 and AVX512 IFUNC versions.
+ * sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration for cos.
+ * math/bits/mathcalls.h: Added cos declaration with __MATHCALL_VEC.
+ * sysdeps/x86_64/configure.ac: Options for libmvec build.
+ * sysdeps/x86_64/configure: Regenerated.
+ * sysdeps/x86_64/sysdep.h (cfi_offset_rel_rsp): New macro.
+ * sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New file.
+ * manual/install.texi (Configuring and compiling): Document
+ --disable-mathvec.
+ * INSTALL: Regenerated.
+ * NEWS: Mention addition of libmvec and x86_64 vector cos.
+
2015-06-09 Marko Myllynen <myllynen@redhat.com>
* locale/C-ctype.c (PREDEFINED_CLASSES): Remove.
diff --git a/INSTALL b/INSTALL
index 18a47f618e..8e13f2cef7 100644
--- a/INSTALL
+++ b/INSTALL
@@ -149,6 +149,10 @@ will be used, and CFLAGS sets optimization options for the compiler.
with, so new warnings cause the build with '-Werror' to fail), you
can configure with '--disable-werror'.
+'--disable-mathvec'
+ By default for x86_64, the GNU C Library is built with the vector
+ math library. Use this option to disable the vector math library.
+
'--build=BUILD-SYSTEM'
'--host=HOST-SYSTEM'
These options are for cross-compiling. If you specify both options
diff --git a/NEWS b/NEWS
index 881e61c4f2..5e223a1e39 100644
--- a/NEWS
+++ b/NEWS
@@ -50,6 +50,14 @@ Version 2.22
* CVE-2014-8121 The NSS backends shared internal state between the getXXent
and getXXbyYY NSS calls for the same database, causing a denial-of-service
condition in some applications.
+
+* Added vector math library named libmvec with the following vectorized x86_64
+ implementations: cos.
+ The library can be disabled with --disable-mathvec. Use of the functions is
+ enabled with -fopenmp -ffast-math starting from -O1 for GCC version >= 4.9.0.
+ The library is linked in as needed when using -lm (no need to specify -lmvec
+ explicitly).
+ Visit <https://sourceware.org/glibc/wiki/libmvec> for detailed information.
Version 2.21
diff --git a/manual/install.texi b/manual/install.texi
index bb09199fa6..42ee467463 100644
--- a/manual/install.texi
+++ b/manual/install.texi
@@ -181,6 +181,10 @@ version of GCC than this version of @theglibc{} was tested with, so
new warnings cause the build with @option{-Werror} to fail), you can
configure with @option{--disable-werror}.
+@item --disable-mathvec
+By default for x86_64, @theglibc{} is built with the vector math library.
+Use this option to disable the vector math library.
+
@item --build=@var{build-system}
@itemx --host=@var{host-system}
These options are for cross-compiling. If you specify both options and
diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
index e8e5577038..85a6a95a7c 100644
--- a/math/bits/mathcalls.h
+++ b/math/bits/mathcalls.h
@@ -60,7 +60,7 @@ __MATHCALL (atan,, (_Mdouble_ __x));
__MATHCALL (atan2,, (_Mdouble_ __y, _Mdouble_ __x));
/* Cosine of X. */
-__MATHCALL (cos,, (_Mdouble_ __x));
+__MATHCALL_VEC (cos,, (_Mdouble_ __x));
/* Sine of X. */
__MATHCALL (sin,, (_Mdouble_ __x));
/* Tangent of X. */
diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
new file mode 100644
index 0000000000..be6eaedafd
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
@@ -0,0 +1,6 @@
+GLIBC_2.22
+ GLIBC_2.22 A
+ _ZGVbN2v_cos F
+ _ZGVcN4v_cos F
+ _ZGVdN4v_cos F
+ _ZGVeN8v_cos F
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h
new file mode 100644
index 0000000000..27294ce9fa
--- /dev/null
+++ b/sysdeps/x86/fpu/bits/math-vector.h
@@ -0,0 +1,34 @@
+/* Platform-specific SIMD declarations of math functions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _MATH_H
+# error "Never include <bits/math-vector.h> directly;\
+ include <math.h> instead."
+#endif
+
+/* Get default empty definitions for simd declarations. */
+#include <bits/libm-simd-decl-stubs.h>
+
+#if defined __x86_64__ && defined __FAST_MATH__
+# if defined _OPENMP && _OPENMP >= 201307
+/* OpenMP case: reached only for x86_64 with __FAST_MATH__ and _OPENMP >= 201307.  */
+# define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch")
+# undef __DECL_SIMD_cos /* Drop any earlier definition (presumably the stub default).  */
+# define __DECL_SIMD_cos __DECL_SIMD_x86_64 /* cos gets the notinbranch simd pragma.  */
+# endif /* _OPENMP >= 201307 */
+#endif /* __x86_64__ && __FAST_MATH__ */
diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
index 7d4dadd4fd..1493523e1c 100644
--- a/sysdeps/x86_64/configure
+++ b/sysdeps/x86_64/configure
@@ -275,6 +275,10 @@ fi
config_vars="$config_vars
config-cflags-avx2 = $libc_cv_cc_avx2"
+if test x"$build_mathvec" = xnotset; then
+ build_mathvec=yes
+fi
+
$as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h
# work around problem with autoconf and empty lines at the end of files
diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
index c9f9a51f72..1c2b35fe92 100644
--- a/sysdeps/x86_64/configure.ac
+++ b/sysdeps/x86_64/configure.ac
@@ -99,6 +99,10 @@ if test $libc_cv_cc_avx2 = yes; then
fi
LIBC_CONFIG_VAR([config-cflags-avx2], [$libc_cv_cc_avx2])
+if test x"$build_mathvec" = xnotset; then
+ build_mathvec=yes
+fi
+
dnl It is always possible to access static and hidden symbols in an
dnl position independent way.
AC_DEFINE(PI_STATIC_AND_HIDDEN)
diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
new file mode 100644
index 0000000000..2f16323f78
--- /dev/null
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -0,0 +1,5 @@
+ifeq ($(subdir),mathvec)
+libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \
+ svml_d_cos4_core svml_d_cos8_core \
+ svml_d_cos_data init-arch
+endif
diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions
new file mode 100644
index 0000000000..4290e73589
--- /dev/null
+++ b/sysdeps/x86_64/fpu/Versions
@@ -0,0 +1,5 @@
+libmvec {
+ GLIBC_2.22 {
+ _ZGVbN2v_cos; _ZGVcN4v_cos; _ZGVdN4v_cos; _ZGVeN8v_cos;
+ }
+}
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 12b0526e50..b2f3266490 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -51,3 +51,8 @@ CFLAGS-slowexp-avx.c = -msse2avx -DSSE2AVX
CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX
endif
endif
+
+ifeq ($(subdir),mathvec)
+libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \
+ svml_d_cos8_core_avx512
+endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
new file mode 100644
index 0000000000..5f67d83bd4
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized cos, vector length is 2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN2v_cos) /* IFUNC resolver: returns the address of the chosen variant in %rax.  */
+ .type _ZGVbN2v_cos, @gnu_indirect_function /* Mark the symbol as a GNU indirect function.  */
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip) /* Is the KIND_OFFSET field of __cpu_features zero?  */
+ jne 1f /* Nonzero: skip the init call.  */
+ call __init_cpu_features /* Zero: call the init routine first (presumably fills __cpu_features).  */
+1: leaq _ZGVbN2v_cos_sse4(%rip), %rax /* Tentatively select the SSE4 variant.  */
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) /* Test the SSE4_1 bit in the stored CPUID data.  */
+ jz 2f /* Bit clear: fall back to the SSE2 variant.  */
+ ret /* Bit set: return the SSE4 variant.  */
+2: leaq _ZGVbN2v_cos_sse2(%rip), %rax /* Select the SSE2 variant instead.  */
+ ret
+END (_ZGVbN2v_cos)
+libmvec_hidden_def (_ZGVbN2v_cos)
+
+#define _ZGVbN2v_cos _ZGVbN2v_cos_sse2 /* Rename so the file included below emits the _sse2 symbol (presumably it defines _ZGVbN2v_cos).  */
+#include "../svml_d_cos2_core.S" /* Non-multiarch implementation, used as the SSE2 fallback above.  */
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S
new file mode 100644
index 0000000000..11348a37c5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S
@@ -0,0 +1,223 @@
+/* Function cos vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_cos_data.h"
+
+ .text
+ENTRY (_ZGVbN2v_cos_sse4)
+/* ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg + Pi/2 = (N*Pi + R)
+
+ Result calculation:
+ cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
+ sin(R) is approximated by corresponding polynomial
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $320, %rsp
+ movaps %xmm0, %xmm3
+ movq __svml_dcos_data@GOTPCREL(%rip), %rax
+ movups __dHalfPI(%rax), %xmm2
+
+/* ARGUMENT RANGE REDUCTION:
+ Add Pi/2 to argument: X' = X+Pi/2
+ */
+ addpd %xmm3, %xmm2
+ movups __dInvPI(%rax), %xmm5
+ movups __dAbsMask(%rax), %xmm4
+
+/* Get absolute argument value: X' = |X'| */
+ andps %xmm2, %xmm4
+
+/* Y = X'*InvPi + RS : right shifter add */
+ mulpd %xmm5, %xmm2
+
+/* Check for large arguments path */
+ cmpnlepd __dRangeVal(%rax), %xmm4
+ movups __dRShifter(%rax), %xmm6
+ addpd %xmm6, %xmm2
+ movmskpd %xmm4, %ecx
+
+/* N = Y - RS : right shifter sub */
+ movaps %xmm2, %xmm1
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+ psllq $63, %xmm2
+ subpd %xmm6, %xmm1
+
+/* N = N - 0.5 */
+ subpd __dOneHalf(%rax), %xmm1
+ movups __dPI1(%rax), %xmm7
+
+/* R = X - N*Pi1 */
+ mulpd %xmm1, %xmm7
+ movups __dPI2(%rax), %xmm4
+
+/* R = R - N*Pi2 */
+ mulpd %xmm1, %xmm4
+ subpd %xmm7, %xmm0
+ movups __dPI3(%rax), %xmm5
+
+/* R = R - N*Pi3 */
+ mulpd %xmm1, %xmm5
+ subpd %xmm4, %xmm0
+
+/* R = R - N*Pi4 */
+ movups __dPI4(%rax), %xmm6
+ mulpd %xmm6, %xmm1
+ subpd %xmm5, %xmm0
+ subpd %xmm1, %xmm0
+
+/* POLYNOMIAL APPROXIMATION: R2 = R*R */
+ movaps %xmm0, %xmm4
+ mulpd %xmm0, %xmm4
+ movups __dC7(%rax), %xmm1
+ mulpd %xmm4, %xmm1
+ addpd __dC6(%rax), %xmm1
+ mulpd %xmm4, %xmm1
+ addpd __dC5(%rax), %xmm1
+ mulpd %xmm4, %xmm1
+ addpd __dC4(%rax), %xmm1
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+ mulpd %xmm4, %xmm1
+ addpd __dC3(%rax), %xmm1
+
+/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */
+ mulpd %xmm4, %xmm1
+ addpd __dC2(%rax), %xmm1
+ mulpd %xmm4, %xmm1
+ addpd __dC1(%rax), %xmm1
+ mulpd %xmm1, %xmm4
+ mulpd %xmm0, %xmm4
+ addpd %xmm4, %xmm0
+
+/* RECONSTRUCTION:
+ Final sign setting: Res = Poly^SignRes */
+ xorps %xmm2, %xmm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm3, 192(%rsp)
+ movups %xmm0, 256(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ movups %xmm8, 112(%rsp)
+ movups %xmm9, 96(%rsp)
+ movups %xmm10, 80(%rsp)
+ movups %xmm11, 64(%rsp)
+ movups %xmm12, 48(%rsp)
+ movups %xmm13, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rdi, 128(%rsp)
+ movq %r12, 168(%rsp)
+ cfi_offset_rel_rsp (12, 168)
+ movb %dl, %r12b
+ movq %r13, 160(%rsp)
+ cfi_offset_rel_rsp (13, 160)
+ movl %ecx, %r13d
+ movq %r14, 152(%rsp)
+ cfi_offset_rel_rsp (14, 152)
+ movl %eax, %r14d
+ movq %r15, 144(%rsp)
+ cfi_offset_rel_rsp (15, 144)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 112(%rsp), %xmm8
+ movups 96(%rsp), %xmm9
+