slackbuilds_ponce/libraries/atlas/atlas.patch
2010-05-15 10:25:39 +02:00

5072 lines
137 KiB
Diff

diff -rupN ATLAS/CONFIG/src/backend/archinfo_x86.c atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c
--- ATLAS/CONFIG/src/backend/archinfo_x86.c 2009-02-18 19:47:37.000000000 +0100
+++ atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c 2009-11-12 13:47:23.777451677 +0100
@@ -320,7 +320,7 @@ enum MACHTYPE Chip2Mach(enum CHIP chip,
iret = IntP4;
break;
case 3:
- case 4:
+ case 4: ; case 6:
iret = IntP4E;
break;
default:
diff -rupN ATLAS/include/atlas_lvl3.h atlas-3.8.3/include/atlas_lvl3.h
--- ATLAS/include/atlas_lvl3.h 2009-02-18 19:47:35.000000000 +0100
+++ atlas-3.8.3/include/atlas_lvl3.h 2009-11-12 13:52:49.308496090 +0100
@@ -126,7 +126,7 @@
#define CPAT Mjoin(C_ATL_, PRE);
#ifndef ATL_MaxMalloc
- #define ATL_MaxMalloc 67108864
+ #define ATL_MaxMalloc XXX_MaxMalloc_XXX
#endif
typedef void (*MAT2BLK)(int, int, const TYPE*, int, TYPE*, const SCALAR);
diff -rupN ATLAS/src/blas/gemm/ATL_cmmJITcp.c atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c
--- ATLAS/src/blas/gemm/ATL_cmmJITcp.c 2009-02-18 19:47:44.000000000 +0100
+++ atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c 2009-11-12 12:44:34.816529051 +0100
@@ -268,7 +268,8 @@ static void Mjoin(PATL,mmK)
{
NBmm0 = NBmm1 = NBmmX = Mjoin(PATLU,pKBmm);
if (SCALAR_IS_ZERO(beta))
- Mjoin(PATL,gezero)(M, N, C, ldc);
+ /* beta == 0: zero pC and pC+ipc (next line) instead of C; replaced call: Mjoin(PATL,gezero)(M, N, C, ldc); */
+ { Mjoin(PATLU,gezero)(M, N, pC, ldpc); Mjoin(PATLU,gezero)(M, N, pC+ipc, ldpc); }
}
if (nblk)
{
diff -rupN ATLAS/src/blas/gemm/ATL_gereal2cplx.c atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c
--- ATLAS/src/blas/gemm/ATL_gereal2cplx.c 2009-02-18 19:47:44.000000000 +0100
+++ atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c 2009-11-12 12:49:49.331651677 +0100
@@ -43,7 +43,53 @@ void Mjoin(PATL,gereal2cplx)
const int ldc2 = (ldc-M)<<1;
int i, j;
- if (ialp == ATL_rzero && ibet == ATL_rzero)
+/*
+ * If beta is 0, C must only be written, never read: store alpha * (R + i*I)
+ */
+ if (rbet == ATL_rzero && ibet == ATL_rzero)
+ {
+ if (ialp == ATL_rzero) /* alpha is a real number */
+ {
+ if (ralp == ATL_rone) /* alpha = 1.0 */
+ {
+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
+ {
+ for (i=0; i < M; i++, C += 2)
+ {
+ *C = R[i];
+ C[1] = I[i];
+ }
+ }
+ }
+ else
+ {
+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
+ {
+ for (i=0; i < M; i++, C += 2)
+ {
+ *C = ralp * R[i];
+ C[1] = ralp * I[i];
+ }
+ }
+ }
+ }
+ else /* alpha is a complex number */
+ {
+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
+ {
+ for (i=0; i < M; i++, C += 2)
+ {
+ ra = R[i]; ia = I[i];
+ C[0] = ralp * ra - ialp * ia;
+ C[1] = ralp * ia + ialp * ra;
+ }
+ }
+ }
+ }
+/*
+ * If alpha and beta are both real numbers
+ */
+ else if (ialp == ATL_rzero && ibet == ATL_rzero)
{
if (ralp == ATL_rone && rbet == ATL_rone)
{
diff -rupN ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c
--- ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-02-18 19:48:26.000000000 +0100
+++ atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-11-12 12:35:50.453038827 +0100
@@ -27,6 +27,13 @@
* POSSIBILITY OF SUCH DAMAGE.
*
*/
+#if KB > 84
+ #error "KB cannot exceed 84!"
+#endif
+#if (KB/4)*4 != KB
+ #error "KB must be a multiple of 4!"
+#endif
+
#ifndef ATL_GAS_x8664
#error "This kernel requires x86-64 assembly!"
#endif
@@ -58,25 +65,25 @@
* Integer register usage shown be these defines
*/
#define pA %rcx
-#define pA10 %rbx
-#define ldab %rbp
-#define mldab %rdx
+#define pA10 %rbx
+#define ldab %rbp
+#define mldab %rdx
#define mldab5 %rax
#define pB %rdi
#define pC %rsi
#define incCn %r10
#define stM %r9
#define stN %r11
-#define pfA %r8
-#define pA5 pA
-#define pB0 pB
+#define pfA %r8
+#define pA5 pA
+#define pB0 pB
#if MB == 0
- #define stM0 %r12
- #define incAm %r13
+ #define stM0 %r12
+ #define incAm %r13
#endif
/* rax used in 32/64 conversion */
-#define NBso (KB*4)
+#define NBso (KB*4)
#define MBKBso (MB*KB*4)
#define NB2so (NBso+NBso)
#define NB3so (NBso+NBso+NBso)
@@ -95,22 +102,22 @@
/*
* SSE2 register usage shown be these defines
*/
-#define rA0 %xmm0
-#define rB0 %xmm1
-#define rC0 %xmm2
-#define rC1 %xmm3
-#define rC2 %xmm4
-#define rC3 %xmm5
-#define rC4 %xmm6
-#define rC5 %xmm7
-#define rC6 %xmm8
-#define rC7 %xmm9
-#define rC8 %xmm10
-#define rC9 %xmm11
-#define rC10 %xmm12
-#define rC11 %xmm13
-#define rC12 %xmm14
-#define rC13 %xmm15
+#define rA0 %xmm0
+#define rB0 %xmm1
+#define rC0 %xmm2
+#define rC1 %xmm3
+#define rC2 %xmm4
+#define rC3 %xmm5
+#define rC4 %xmm6
+#define rC5 %xmm7
+#define rC6 %xmm8
+#define rC7 %xmm9
+#define rC8 %xmm10
+#define rC9 %xmm11
+#define rC10 %xmm12
+#define rC11 %xmm13
+#define rC12 %xmm14
+#define rC13 %xmm15
/*
* Prefetch defines
*/
@@ -127,99 +134,99 @@
#if MB != 0
#define incAm $MBKBso-NB14so+176
#endif
- .text
+ .text
.global ATL_asmdecor(ATL_USERMM)
ATL_asmdecor(ATL_USERMM):
/*
* Save callee-saved iregs
*/
- movq %rbp, -8(%rsp)
- movq %rbx, -16(%rsp)
+ movq %rbp, -8(%rsp)
+ movq %rbx, -16(%rsp)
#if MB == 0
- movq %r12, -32(%rsp)
- movq %r13, -40(%rsp)
+ movq %r12, -32(%rsp)
+ movq %r13, -40(%rsp)
#endif
#ifdef BETAX
#define BOF -56
- movss %xmm1, BOF(%rsp)
- movss %xmm1, BOF+4(%rsp)
- movss %xmm1, BOF+8(%rsp)
- movss %xmm1, BOF+12(%rsp)
+ movss %xmm1, BOF(%rsp)
+ movss %xmm1, BOF+4(%rsp)
+ movss %xmm1, BOF+8(%rsp)
+ movss %xmm1, BOF+12(%rsp)
#endif
/*
* pA already comes in right reg
* Initialize pB = B; pC = C; NBso = NB * sizeof;
*/
- movq %rsi, stN
- movq %rdi, %rax
- movq 16(%rsp), pC
- prefC((pC))
- prefC(64(pC))
- movq %r9, pB
- prefB((pB))
- prefB(64(pB))
- movq %rax, stM
+ movq %rsi, stN
+ movq %rdi, %rax
+ movq 16(%rsp), pC
+ prefC((pC))
+ prefC(64(pC))
+ movq %r9, pB
+ prefB((pB))
+ prefB(64(pB))
+ movq %rax, stM
/*
* stM = pA + NBNBso; stN = pB + NBNBso;
*/
#if MB == 0
- movq stM, pfA
- imulq $NBso, pfA
- prefB(128(pB))
- movq pfA, incAm
- addq pA5, pfA
- addq $176-NB14so, incAm
+ movq stM, pfA
+ imulq $NBso, pfA
+ prefB(128(pB))
+ movq pfA, incAm
+ addq pA5, pfA
+ addq $176-NB14so, incAm
#else
- movq $MBKBso, pfA
- addq pA5, pfA
- prefB(128(pB))
+ movq $MBKBso, pfA
+ addq pA5, pfA
+ prefB(128(pB))
#endif
/*
* convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof
*/
- movl 24(%rsp), %eax
- cltq
- movq %rax, incCn
- subq stM, incCn
- addq $14, incCn
+ movl 24(%rsp), %eax
+ cltq
+ movq %rax, incCn
+ subq stM, incCn
+ addq $14, incCn
#ifdef SREAL
- shl $2, incCn
+ shl $2, incCn
#else
- shl $3, incCn
- prefC(128(pC))
- prefC(192(pC))
+ shl $3, incCn
+ prefC(128(pC))
+ prefC(192(pC))
#endif
/*
* Find M/14 if MB is not set
*/
#if MB == 0
- cmp $84, stM
- jne MB_LT84
-/* movq $84/14, stM */
- movq $6, stM
+ cmp $84, stM
+ jne MB_LT84
+/* movq $84/14, stM */
+ movq $6, stM
MBFOUND:
- subq $1, stM
- movq stM, stM0
+ subq $1, stM
+ movq stM, stM0
#endif
- addq $120, pA5
- addq $120, pB0
- movq $KB*4, ldab
- movq $-KB*5*4, mldab5
- movq $-KB*4, mldab
- subq mldab5, pA5
- lea KB*4(pA5, ldab,4), pA10
-/* movq $NB, stN */
+ addq $120, pA5
+ addq $120, pB0
+ movq $KB*4, ldab
+ movq $-KB*5*4, mldab5
+ movq $-KB*4, mldab
+ subq mldab5, pA5
+ lea KB*4(pA5, ldab,4), pA10
+/* movq $NB, stN */
UNLOOP:
#if MB == 0
- movq stM0, stM
- cmp $0, stM
- je MLAST
+ movq stM0, stM
+ cmp $0, stM
+ je MLAST
#else
#ifdef ATL_DivAns
- movq $ATL_DivAns-1, stM
+ movq $ATL_DivAns-1, stM
#else
- movq $MB/14-1, stM
+ movq $MB/14-1, stM
#endif
#endif
#if MB == 0 || MB > 14
@@ -227,992 +234,992 @@ UMLOOP:
/*
* rC[0-13] = pC[0-13] * beta
*/
- ALIGN16
+ ALIGN16
/*UKLOOP: */
#ifdef BETA1
- movaps 0-120(pA10,mldab5,2), rC0
- movaps 0-120(pB0), rB0
- mulps rB0, rC0
- addss (pC), rC0
- movaps 0-120(pA5, mldab,4), rC1
- mulps rB0, rC1
- addss CMUL(4)(pC), rC1
- movaps 0-120(pA10, mldab,8), rC2
- mulps rB0, rC2
- addss CMUL(8)(pC), rC2
- movaps 0-120(pA5, mldab,2), rC3
- mulps rB0, rC3
- addss CMUL(12)(pC), rC3
- movaps 0-120(pA5, mldab), rC4
- mulps rB0, rC4
- addss CMUL(16)(pC), rC4
- movaps 0-120(pA5), rC5
- mulps rB0, rC5
- addss CMUL(20)(pC), rC5
- movaps 0-120(pA5, ldab), rC6
- mulps rB0, rC6
- addss CMUL(24)(pC), rC6
- movaps 0-120(pA5, ldab,2), rC7
- mulps rB0, rC7
- addss CMUL(28)(pC), rC7
- movaps 0-120(pA10, mldab,2), rC8
- mulps rB0, rC8
- addss CMUL(32)(pC), rC8
- movaps 0-120(pA5,ldab,4), rC9
- mulps rB0, rC9
- addss CMUL(36)(pC), rC9
- movaps 0-120(pA10), rC10
- mulps rB0, rC10
- addss CMUL(40)(pC), rC10
- movaps 0-120(pA10,ldab), rC11
- mulps rB0, rC11
- addss CMUL(44)(pC), rC11
- movaps 0-120(pA10,ldab,2), rC12
- mulps rB0, rC12
- addss CMUL(48)(pC), rC12
- movaps 0-120(pA5,ldab,8), rC13
- mulps rB0, rC13
- addss CMUL(52)(pC), rC13
+ movaps 0-120(pA10,mldab5,2), rC0
+ movaps 0-120(pB0), rB0
+ mulps rB0, rC0
+ addss (pC), rC0
+ movaps 0-120(pA5, mldab,4), rC1
+ mulps rB0, rC1
+ addss CMUL(4)(pC), rC1
+ movaps 0-120(pA10, mldab,8), rC2
+ mulps rB0, rC2
+ addss CMUL(8)(pC), rC2
+ movaps 0-120(pA5, mldab,2), rC3
+ mulps rB0, rC3
+ addss CMUL(12)(pC), rC3
+ movaps 0-120(pA5, mldab), rC4
+ mulps rB0, rC4
+ addss CMUL(16)(pC), rC4
+ movaps 0-120(pA5), rC5
+ mulps rB0, rC5
+ addss CMUL(20)(pC), rC5
+ movaps 0-120(pA5, ldab), rC6
+ mulps rB0, rC6
+ addss CMUL(24)(pC), rC6
+ movaps 0-120(pA5, ldab,2), rC7
+ mulps rB0, rC7
+ addss CMUL(28)(pC), rC7
+ movaps 0-120(pA10, mldab,2), rC8
+ mulps rB0, rC8
+ addss CMUL(32)(pC), rC8
+ movaps 0-120(pA5,ldab,4), rC9
+ mulps rB0, rC9
+ addss CMUL(36)(pC), rC9
+ movaps 0-120(pA10), rC10
+ mulps rB0, rC10
+ addss CMUL(40)(pC), rC10
+ movaps 0-120(pA10,ldab), rC11
+ mulps rB0, rC11
+ addss CMUL(44)(pC), rC11
+ movaps 0-120(pA10,ldab,2), rC12
+ mulps rB0, rC12
+ addss CMUL(48)(pC), rC12
+ movaps 0-120(pA5,ldab,8), rC13
+ mulps rB0, rC13
+ addss CMUL(52)(pC), rC13
#else
- movaps 0-120(pA10,mldab5,2), rC0
- movaps 0-120(pB0), rC13
- mulps rC13, rC0
- movaps 0-120(pA5, mldab,4), rC1
- mulps rC13, rC1
- movaps 0-120(pA10, mldab,8), rC2
- mulps rC13, rC2
- movaps 0-120(pA5, mldab,2), rC3
- mulps rC13, rC3
- movaps 0-120(pA5, mldab), rC4
- mulps rC13, rC4
- movaps 0-120(pA5), rC5
- mulps rC13, rC5
- movaps 0-120(pA5, ldab), rC6
- mulps rC13, rC6
- movaps 0-120(pA5, ldab,2), rC7
- mulps rC13, rC7
- movaps 0-120(pA10, mldab,2), rC8
- mulps rC13, rC8
- movaps 0-120(pA5,ldab,4), rC9
- mulps rC13, rC9
- movaps 0-120(pA10), rC10
- mulps rC13, rC10
- movaps 0-120(pA10,ldab), rC11
- mulps rC13, rC11
- movaps 0-120(pA10,ldab,2), rC12
- mulps rC13, rC12
- mulps 0-120(pA5,ldab,8), rC13
+ movaps 0-120(pA10,mldab5,2), rC0
+ movaps 0-120(pB0), rC13
+ mulps rC13, rC0
+ movaps 0-120(pA5, mldab,4), rC1
+ mulps rC13, rC1
+ movaps 0-120(pA10, mldab,8), rC2
+ mulps rC13, rC2
+ movaps 0-120(pA5, mldab,2), rC3
+ mulps rC13, rC3
+ movaps 0-120(pA5, mldab), rC4
+ mulps rC13, rC4
+ movaps 0-120(pA5), rC5
+ mulps rC13, rC5
+ movaps 0-120(pA5, ldab), rC6
+ mulps rC13, rC6
+ movaps 0-120(pA5, ldab,2), rC7
+ mulps rC13, rC7
+ movaps 0-120(pA10, mldab,2), rC8
+ mulps rC13, rC8
+ movaps 0-120(pA5,ldab,4), rC9
+ mulps rC13, rC9
+ movaps 0-120(pA10), rC10
+ mulps rC13, rC10
+ movaps 0-120(pA10,ldab), rC11
+ mulps rC13, rC11
+ movaps 0-120(pA10,ldab,2), rC12
+ mulps rC13, rC12
+ mulps 0-120(pA5,ldab,8), rC13
#endif
#if KB > 4
- movaps 16-120(pA10,mldab5,2), rA0
- movaps 16-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 16-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 16-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 16-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 16-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 16-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 16-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 16-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 16-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 16-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 16-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 16-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 16-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 16-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 16-120(pA10,mldab5,2), rA0
+ movaps 16-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 16-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 16-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 16-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 16-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 16-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 16-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 16-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 16-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 16-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 16-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 16-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 16-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 16-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 8
- movaps 32-120(pA10,mldab5,2), rA0
- movaps 32-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 32-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 32-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 32-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 32-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 32-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 32-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 32-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 32-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 32-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 32-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 32-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 32-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 32-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 32-120(pA10,mldab5,2), rA0
+ movaps 32-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 32-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 32-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 32-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 32-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 32-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 32-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 32-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 32-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 32-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 32-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 32-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 32-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 32-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 12
- movaps 48-120(pA10,mldab5,2), rA0
- movaps 48-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 48-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 48-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 48-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 48-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 48-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 48-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 48-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 48-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 48-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 48-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 48-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 48-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 48-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 48-120(pA10,mldab5,2), rA0
+ movaps 48-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 48-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 48-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 48-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 48-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 48-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 48-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 48-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 48-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 48-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 48-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 48-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 48-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 48-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 16
- movaps 64-120(pA10,mldab5,2), rA0
- movaps 64-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 64-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 64-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 64-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 64-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 64-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 64-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 64-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 64-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 64-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 64-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 64-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 64-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 64-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 64-120(pA10,mldab5,2), rA0
+ movaps 64-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 64-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 64-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 64-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 64-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 64-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 64-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 64-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 64-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 64-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 64-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 64-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 64-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 64-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 20
- movaps 80-120(pA10,mldab5,2), rA0
- movaps 80-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 80-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 80-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 80-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 80-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 80-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 80-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 80-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 80-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 80-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 80-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 80-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 80-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 80-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 80-120(pA10,mldab5,2), rA0
+ movaps 80-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 80-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 80-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 80-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 80-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 80-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 80-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 80-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 80-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 80-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 80-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 80-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 80-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 80-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 24
- movaps 96-120(pA10,mldab5,2), rA0
- movaps 96-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 96-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 96-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 96-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 96-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 96-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 96-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 96-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 96-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 96-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 96-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 96-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 96-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 96-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 96-120(pA10,mldab5,2), rA0
+ movaps 96-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 96-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 96-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 96-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 96-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 96-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 96-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 96-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 96-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 96-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 96-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 96-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 96-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 96-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 28
- movaps 112-120(pA10,mldab5,2), rA0
- movaps 112-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 112-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 112-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 112-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 112-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 112-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 112-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 112-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 112-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 112-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 112-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 112-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 112-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 112-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 112-120(pA10,mldab5,2), rA0
+ movaps 112-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 112-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 112-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 112-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 112-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 112-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 112-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 112-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 112-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 112-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 112-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 112-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 112-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 112-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#ifndef SREAL
- pref2((pfA))
- pref2(64(pfA))
+ pref2((pfA))
+ pref2(64(pfA))
#endif
#if KB > 32
- movaps 128-120(pA10,mldab5,2), rA0
- movaps 128-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 128-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 128-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 128-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 128-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 128-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 128-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 128-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 128-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 128-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 128-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 128-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 128-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 128-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 128-120(pA10,mldab5,2), rA0
+ movaps 128-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 128-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 128-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 128-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 128-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 128-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 128-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 128-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 128-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 128-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 128-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 128-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 128-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 128-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 36
- movaps 144-120(pA10,mldab5,2), rA0
- movaps 144-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 144-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 144-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 144-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 144-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 144-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 144-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 144-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 144-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 144-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 144-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 144-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 144-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 144-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 144-120(pA10,mldab5,2), rA0
+ movaps 144-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 144-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 144-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 144-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 144-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 144-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 144-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 144-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 144-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 144-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 144-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 144-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 144-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 144-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 40
- movaps 160-120(pA10,mldab5,2), rA0
- movaps 160-120(pB0), rB0
- mulps rB0, rA0
- addq $176, pB0
- addps rA0, rC0
- movaps 160-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 160-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 160-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 160-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 160-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 160-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 160-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 160-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 160-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 160-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 160-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 160-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addq $176, pA10
- addps rA0, rC12
- mulps 160-120(pA5,ldab,8), rB0
- addps rB0, rC13
- addq $176, pA5
+ movaps 160-120(pA10,mldab5,2), rA0
+ movaps 160-120(pB0), rB0
+ mulps rB0, rA0
+ addq $176, pB0
+ addps rA0, rC0
+ movaps 160-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 160-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 160-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 160-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 160-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 160-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 160-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 160-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 160-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 160-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 160-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 160-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addq $176, pA10
+ addps rA0, rC12
+ mulps 160-120(pA5,ldab,8), rB0
+ addps rB0, rC13
+ addq $176, pA5
#else
- addq $176, pB0
- addq $176, pA10
- addq $176, pA5
+ addq $176, pB0
+ addq $176, pA10
+ addq $176, pA5
#endif
#if KB > 44
- movaps 0-120(pA10,mldab5,2), rA0
- movaps 0-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 0-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 0-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 0-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 0-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 0-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 0-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 0-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 0-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 0-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 0-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 0-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 0-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 0-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 0-120(pA10,mldab5,2), rA0
+ movaps 0-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 0-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 0-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 0-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 0-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 0-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 0-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 0-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 0-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 0-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 0-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 0-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 0-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 0-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 48
- movaps 16-120(pA10,mldab5,2), rA0
- movaps 16-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 16-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 16-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 16-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 16-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 16-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 16-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 16-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 16-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 16-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 16-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 16-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 16-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 16-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 16-120(pA10,mldab5,2), rA0
+ movaps 16-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 16-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 16-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 16-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 16-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 16-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 16-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 16-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 16-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 16-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 16-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 16-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 16-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 16-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 52
- movaps 32-120(pA10,mldab5,2), rA0
- movaps 32-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 32-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 32-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 32-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 32-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 32-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 32-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 32-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 32-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 32-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 32-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 32-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 32-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 32-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 32-120(pA10,mldab5,2), rA0
+ movaps 32-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 32-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 32-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 32-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 32-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 32-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 32-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 32-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 32-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 32-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 32-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 32-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 32-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 32-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 56
- movaps 48-120(pA10,mldab5,2), rA0
- movaps 48-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 48-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 48-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 48-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 48-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 48-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 48-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 48-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 48-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 48-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 48-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 48-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 48-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 48-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 48-120(pA10,mldab5,2), rA0
+ movaps 48-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 48-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 48-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 48-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 48-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 48-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 48-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 48-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 48-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 48-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 48-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 48-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 48-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 48-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 60
- movaps 64-120(pA10,mldab5,2), rA0
- movaps 64-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 64-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 64-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 64-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 64-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 64-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 64-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 64-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 64-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 64-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 64-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 64-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 64-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 64-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 64-120(pA10,mldab5,2), rA0
+ movaps 64-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 64-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 64-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 64-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 64-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 64-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 64-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 64-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 64-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 64-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 64-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 64-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 64-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 64-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 64
- movaps 80-120(pA10,mldab5,2), rA0
- movaps 80-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 80-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 80-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 80-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 80-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 80-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 80-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 80-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 80-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 80-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 80-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 80-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 80-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 80-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 80-120(pA10,mldab5,2), rA0
+ movaps 80-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 80-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 80-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 80-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 80-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 80-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 80-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 80-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 80-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 80-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 80-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 80-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 80-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 80-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 68
- movaps 96-120(pA10,mldab5,2), rA0
- movaps 96-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 96-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 96-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 96-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 96-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 96-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 96-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 96-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 96-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 96-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 96-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 96-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 96-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 96-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 96-120(pA10,mldab5,2), rA0
+ movaps 96-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 96-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 96-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 96-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 96-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 96-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 96-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 96-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 96-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 96-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 96-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 96-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 96-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 96-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 72
- movaps 112-120(pA10,mldab5,2), rA0
- movaps 112-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 112-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 112-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 112-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 112-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 112-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 112-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 112-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 112-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 112-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 112-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 112-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 112-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 112-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 112-120(pA10,mldab5,2), rA0
+ movaps 112-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 112-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 112-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 112-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 112-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 112-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 112-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 112-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 112-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 112-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 112-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 112-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 112-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 112-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 76
- movaps 128-120(pA10,mldab5,2), rA0
- movaps 128-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 128-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 128-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 128-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 128-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 128-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 128-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 128-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 128-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 128-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 128-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 128-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 128-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 128-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 128-120(pA10,mldab5,2), rA0
+ movaps 128-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 128-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 128-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 128-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 128-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 128-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 128-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 128-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 128-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 128-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 128-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 128-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 128-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 128-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 80
- movaps 144-120(pA10,mldab5,2), rA0
- movaps 144-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 144-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 144-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 144-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 144-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 144-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 144-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 144-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 144-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 144-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 144-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 144-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 144-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 144-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 144-120(pA10,mldab5,2), rA0
+ movaps 144-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 144-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 144-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 144-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 144-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 144-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 144-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 144-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 144-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 144-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 144-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 144-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 144-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 144-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
/*UKLOOP */
@@ -1220,234 +1227,234 @@ UMLOOP:
* Get these bastard things summed up correctly
*/
- /* rC0 = c0a c0b c0c c0d */
- /* rC1 = c1a c1b c1c c1d */
- /* rC2 = c2a c2b c2c c2d */
- /* rC3 = c3a c3b c3c c3d */
+ /* rC0 = c0a c0b c0c c0d */
+ /* rC1 = c1a c1b c1c c1d */
+ /* rC2 = c2a c2b c2c c2d */
+ /* rC3 = c3a c3b c3c c3d */
/* */
- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
- prefC((pC))
- prefC(64(pC))
- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
-
-
- /* rC4 = c4a c4b c4c c4d */
- /* rC5 = c5a c5b c5c c5d */
- /* rC6 = c6a c6b c6c c6d */
- /* rC7 = c7a c7b c7c c7d */
- /* rC8 = c08a c08b c08c c08d */
- /* rC9 = c09a c09b c09c c09d */
- /* rC10 = c10a c10b c10c c10d */
- /* rC11 = c11a c11b c11c c11d */
- /* rC12 = c12a c12b c12c c12d */
- /* rC13 = c13a c13b c13c c13d */
+ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
+ prefC((pC))
+ prefC(64(pC))
+ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
+ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
+ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
+ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
+ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
+ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
+ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
+ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
+ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
+ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
+ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
+ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
+ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
+ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
+ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
+ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
+
+
+ /* rC4 = c4a c4b c4c c4d */
+ /* rC5 = c5a c5b c5c c5d */
+ /* rC6 = c6a c6b c6c c6d */
+ /* rC7 = c7a c7b c7c c7d */
+ /* rC8 = c08a c08b c08c c08d */
+ /* rC9 = c09a c09b c09c c09d */
+ /* rC10 = c10a c10b c10c c10d */
+ /* rC11 = c11a c11b c11c c11d */
+ /* rC12 = c12a c12b c12c c12d */
+ /* rC13 = c13a c13b c13c c13d */
/* */
- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
- prefC(128(pC))
+ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
+ prefC(128(pC))
#ifdef SREAL
- pref2((pfA))
+ pref2((pfA))
#else
- prefC(192(pC))
+ prefC(192(pC))
#endif
- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
+ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
+ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
+ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
+ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
+ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
+ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
+ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
+ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
+ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
+ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
+ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
+ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
+ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
+ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
+ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
#ifdef BETAX
#ifdef SREAL
- movups (pC), rA0
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
- movups 16(pC), rC4
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
- movups 32(pC), rC5
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
- movlps 48(pC), rC1
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
- pref2(64(pfA))
- mulps BOF(%rsp), rA0
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
- mulps BOF(%rsp), rC4
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
- mulps BOF(%rsp), rC5
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
- mulps BOF(%rsp), rC1
+ movups (pC), rA0
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+ movups 16(pC), rC4
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+ movups 32(pC), rC5
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+ movlps 48(pC), rC1
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+ pref2(64(pfA))
+ mulps BOF(%rsp), rA0
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+ mulps BOF(%rsp), rC4
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+ mulps BOF(%rsp), rC5
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+ mulps BOF(%rsp), rC1
/* */
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
- addps rA0, rC3
- addq $68, pfA
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
- addps rC4, rC7
- addps rC5, rC11
- addps rC1, rC12
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+ addps rA0, rC3
+ addq $68, pfA
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+ addps rC4, rC7
+ addps rC5, rC11
+ addps rC1, rC12
#else /* BETA = X, complex type */
- movups (pC), rA0
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
- movups 16(pC), rC4
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
- movups 32(pC), rC4 /* rC4 = c4 X c5 X */
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
- movups 48(pC), rC5 /* rC5 = c6 X c7 X */
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
- movups 64(pC), rC5 /* rC5 = c8 X c9 X */
- movups 80(pC), rC1 /* rC1 = c10 X c11 X */
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
- movss 96(pC), rC1
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
- movss 104(pC), rB0
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
- unpcklps rB0, rC1
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
- prefC(256(pC))
- mulps BOF(%rsp), rA0
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
- mulps BOF(%rsp), rC4
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
- mulps BOF(%rsp), rC5
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
- mulps BOF(%rsp), rC1
+ movups (pC), rA0
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+ movups 16(pC), rC4
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
+ movups 32(pC), rC4 /* rC4 = c4 X c5 X */
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+ movups 48(pC), rC5 /* rC5 = c6 X c7 X */
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
+ movups 64(pC), rC5 /* rC5 = c8 X c9 X */
+ movups 80(pC), rC1 /* rC1 = c10 X c11 X */
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+ movss 96(pC), rC1
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+ movss 104(pC), rB0
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+ unpcklps rB0, rC1
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+ prefC(256(pC))
+ mulps BOF(%rsp), rA0
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+ mulps BOF(%rsp), rC4
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+ mulps BOF(%rsp), rC5
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+ mulps BOF(%rsp), rC1
/* */
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
- addps rA0, rC3
- prefC(192(pC))
- addq $68, pfA
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
- addps rC4, rC7
- addps rC5, rC11
- addps rC1, rC12
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+ addps rA0, rC3
+ prefC(192(pC))
+ addq $68, pfA
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+ addps rC4, rC7
+ addps rC5, rC11
+ addps rC1, rC12
#endif
#else
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
#ifdef SREAL
- pref2(64(pfA))
+ pref2(64(pfA))
#else
- prefC(256(pC))
+ prefC(256(pC))
#endif
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
/* */
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
#ifndef SREAL
- prefC(192(pC))
+ prefC(192(pC))
#endif
- addq $68, pfA
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+ addq $68, pfA
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
#endif
/*
* Write results back to C; pC += 14;
*/
#ifdef SREAL
- movups rC3, (pC)
- movups rC7, 16(pC)
- movups rC11, 32(pC)
- movlps rC12, 48(pC)
- addq $56, pC
+ movups rC3, (pC)
+ movups rC7, 16(pC)
+ movups rC11, 32(pC)
+ movlps rC12, 48(pC)
+ addq $56, pC
#else
- movss rC3, (pC)
- movss rC7, 32(pC)
- movhlps rC3, rC0
- movhlps rC7, rC6
- movss rC0, 16(pC)
- movss rC6, 48(pC)
- shufps $0x55, rC3, rC3
- shufps $0x55, rC7, rC7
- movss rC3, 8(pC)
- movss rC7, 40(pC)
- shufps $0x55, rC0, rC0
- shufps $0x55, rC6, rC6
- movss rC0, 24(pC)
- movss rC6, 56(pC)
-
- movss rC11, 64(pC)
- movhlps rC11, rC2
- movss rC12, 96(pC)
- movss rC2, 80(pC)
- shufps $0x55, rC11, rC11
- shufps $0x55, rC12, rC12
- movss rC11, 72(pC)
- shufps $0x55, rC2, rC2
- movss rC12, 104(pC)
- movss rC2, 88(pC)
+ movss rC3, (pC)
+ movss rC7, 32(pC)
+ movhlps rC3, rC0
+ movhlps rC7, rC6
+ movss rC0, 16(pC)
+ movss rC6, 48(pC)
+ shufps $0x55, rC3, rC3
+ shufps $0x55, rC7, rC7
+ movss rC3, 8(pC)
+ movss rC7, 40(pC)
+ shufps $0x55, rC0, rC0
+ shufps $0x55, rC6, rC6
+ movss rC0, 24(pC)
+ movss rC6, 56(pC)
+
+ movss rC11, 64(pC)
+ movhlps rC11, rC2
+ movss rC12, 96(pC)
+ movss rC2, 80(pC)
+ shufps $0x55, rC11, rC11
+ shufps $0x55, rC12, rC12
+ movss rC11, 72(pC)
+ shufps $0x55, rC2, rC2
+ movss rC12, 104(pC)
+ movss rC2, 88(pC)
- addq $112, pC
+ addq $112, pC
#endif
/*
* Write results back to C
*/
- addq $NB14so-176, pA5
- addq $NB14so-176, pA10
- subq $176, pB0
+ addq $NB14so-176, pA5
+ addq $NB14so-176, pA10
+ subq $176, pB0
/*
* pC += 14; pA += 14*NB; pB -= NB;
*/
/*
* while (pA != stM);
*/
- subq $1, stM
- jne UMLOOP
+ subq $1, stM
+ jne UMLOOP
#endif
/*
@@ -1459,994 +1466,994 @@ MLAST:
#endif
/*UKLOOP: */
#ifdef BETA1
- movaps 0-120(pA10,mldab5,2), rC0
- movaps 0-120(pB0), rB0
- mulps rB0, rC0
- addss (pC), rC0
- movaps 0-120(pA5, mldab,4), rC1
- mulps rB0, rC1
- addss CMUL(4)(pC), rC1
- movaps 0-120(pA10, mldab,8), rC2
- mulps rB0, rC2
- addss CMUL(8)(pC), rC2
- movaps 0-120(pA5, mldab,2), rC3
- mulps rB0, rC3
- addss CMUL(12)(pC), rC3
- movaps 0-120(pA5, mldab), rC4
- mulps rB0, rC4
- addss CMUL(16)(pC), rC4
- movaps 0-120(pA5), rC5
- mulps rB0, rC5
- addss CMUL(20)(pC), rC5
- movaps 0-120(pA5, ldab), rC6
- mulps rB0, rC6
- addss CMUL(24)(pC), rC6
- movaps 0-120(pA5, ldab,2), rC7
- mulps rB0, rC7
- addss CMUL(28)(pC), rC7
- movaps 0-120(pA10, mldab,2), rC8
- mulps rB0, rC8
- addss CMUL(32)(pC), rC8
- movaps 0-120(pA5,ldab,4), rC9
- mulps rB0, rC9
- addss CMUL(36)(pC), rC9
- movaps 0-120(pA10), rC10
- mulps rB0, rC10
- addss CMUL(40)(pC), rC10
- movaps 0-120(pA10,ldab), rC11
- mulps rB0, rC11
- addss CMUL(44)(pC), rC11
- movaps 0-120(pA10,ldab,2), rC12
- mulps rB0, rC12
- addss CMUL(48)(pC), rC12
- movaps 0-120(pA5,ldab,8), rC13
- mulps rB0, rC13
- addss CMUL(52)(pC), rC13
+ movaps 0-120(pA10,mldab5,2), rC0
+ movaps 0-120(pB0), rB0
+ mulps rB0, rC0
+ addss (pC), rC0
+ movaps 0-120(pA5, mldab,4), rC1
+ mulps rB0, rC1
+ addss CMUL(4)(pC), rC1
+ movaps 0-120(pA10, mldab,8), rC2
+ mulps rB0, rC2
+ addss CMUL(8)(pC), rC2
+ movaps 0-120(pA5, mldab,2), rC3
+ mulps rB0, rC3
+ addss CMUL(12)(pC), rC3
+ movaps 0-120(pA5, mldab), rC4
+ mulps rB0, rC4
+ addss CMUL(16)(pC), rC4
+ movaps 0-120(pA5), rC5
+ mulps rB0, rC5
+ addss CMUL(20)(pC), rC5
+ movaps 0-120(pA5, ldab), rC6
+ mulps rB0, rC6
+ addss CMUL(24)(pC), rC6
+ movaps 0-120(pA5, ldab,2), rC7
+ mulps rB0, rC7
+ addss CMUL(28)(pC), rC7
+ movaps 0-120(pA10, mldab,2), rC8
+ mulps rB0, rC8
+ addss CMUL(32)(pC), rC8
+ movaps 0-120(pA5,ldab,4), rC9
+ mulps rB0, rC9
+ addss CMUL(36)(pC), rC9
+ movaps 0-120(pA10), rC10
+ mulps rB0, rC10
+ addss CMUL(40)(pC), rC10
+ movaps 0-120(pA10,ldab), rC11
+ mulps rB0, rC11
+ addss CMUL(44)(pC), rC11
+ movaps 0-120(pA10,ldab,2), rC12
+ mulps rB0, rC12
+ addss CMUL(48)(pC), rC12
+ movaps 0-120(pA5,ldab,8), rC13
+ mulps rB0, rC13
+ addss CMUL(52)(pC), rC13
#else
- movaps 0-120(pA10,mldab5,2), rC0
- movaps 0-120(pB0), rC13
- mulps rC13, rC0
- movaps 0-120(pA5, mldab,4), rC1
- mulps rC13, rC1
- movaps 0-120(pA10, mldab,8), rC2
- mulps rC13, rC2
- movaps 0-120(pA5, mldab,2), rC3
- mulps rC13, rC3
- movaps 0-120(pA5, mldab), rC4
- mulps rC13, rC4
- movaps 0-120(pA5), rC5
- mulps rC13, rC5
- movaps 0-120(pA5, ldab), rC6
- mulps rC13, rC6
- movaps 0-120(pA5, ldab,2), rC7
- mulps rC13, rC7
- movaps 0-120(pA10, mldab,2), rC8
- mulps rC13, rC8
- movaps 0-120(pA5,ldab,4), rC9
- mulps rC13, rC9
- movaps 0-120(pA10), rC10
- mulps rC13, rC10
- movaps 0-120(pA10,ldab), rC11
- mulps rC13, rC11
- movaps 0-120(pA10,ldab,2), rC12
- mulps rC13, rC12
- mulps 0-120(pA5,ldab,8), rC13
+ movaps 0-120(pA10,mldab5,2), rC0
+ movaps 0-120(pB0), rC13
+ mulps rC13, rC0
+ movaps 0-120(pA5, mldab,4), rC1
+ mulps rC13, rC1
+ movaps 0-120(pA10, mldab,8), rC2
+ mulps rC13, rC2
+ movaps 0-120(pA5, mldab,2), rC3
+ mulps rC13, rC3
+ movaps 0-120(pA5, mldab), rC4
+ mulps rC13, rC4
+ movaps 0-120(pA5), rC5
+ mulps rC13, rC5
+ movaps 0-120(pA5, ldab), rC6
+ mulps rC13, rC6
+ movaps 0-120(pA5, ldab,2), rC7
+ mulps rC13, rC7
+ movaps 0-120(pA10, mldab,2), rC8
+ mulps rC13, rC8
+ movaps 0-120(pA5,ldab,4), rC9
+ mulps rC13, rC9
+ movaps 0-120(pA10), rC10
+ mulps rC13, rC10
+ movaps 0-120(pA10,ldab), rC11
+ mulps rC13, rC11
+ movaps 0-120(pA10,ldab,2), rC12
+ mulps rC13, rC12
+ mulps 0-120(pA5,ldab,8), rC13
#endif
#if KB > 4
- movaps 16-120(pA10,mldab5,2), rA0
- movaps 16-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 16-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 16-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 16-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 16-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 16-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 16-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 16-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 16-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 16-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 16-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 16-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 16-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 16-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 16-120(pA10,mldab5,2), rA0
+ movaps 16-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 16-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 16-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 16-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 16-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 16-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 16-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 16-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 16-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 16-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 16-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 16-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 16-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 16-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 8
- movaps 32-120(pA10,mldab5,2), rA0
- movaps 32-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 32-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 32-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 32-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 32-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 32-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 32-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 32-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 32-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 32-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 32-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 32-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 32-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 32-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 32-120(pA10,mldab5,2), rA0
+ movaps 32-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 32-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 32-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 32-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 32-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 32-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 32-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 32-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 32-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 32-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 32-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 32-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 32-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 32-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 12
- movaps 48-120(pA10,mldab5,2), rA0
- movaps 48-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 48-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 48-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 48-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 48-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 48-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 48-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 48-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 48-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 48-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 48-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 48-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 48-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 48-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 48-120(pA10,mldab5,2), rA0
+ movaps 48-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 48-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 48-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 48-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 48-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 48-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 48-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 48-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 48-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 48-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 48-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 48-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 48-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 48-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 16
- movaps 64-120(pA10,mldab5,2), rA0
- movaps 64-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 64-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 64-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 64-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 64-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 64-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 64-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 64-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 64-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 64-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 64-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 64-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 64-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 64-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 64-120(pA10,mldab5,2), rA0
+ movaps 64-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 64-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 64-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 64-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 64-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 64-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 64-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 64-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 64-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 64-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 64-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 64-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 64-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 64-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 20
- movaps 80-120(pA10,mldab5,2), rA0
- movaps 80-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 80-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 80-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 80-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 80-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 80-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 80-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 80-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 80-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 80-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 80-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 80-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 80-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 80-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 80-120(pA10,mldab5,2), rA0
+ movaps 80-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 80-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 80-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 80-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 80-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 80-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 80-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 80-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 80-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 80-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 80-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 80-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 80-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 80-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 24
- movaps 96-120(pA10,mldab5,2), rA0
- movaps 96-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 96-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 96-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 96-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 96-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 96-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 96-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 96-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 96-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 96-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 96-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 96-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 96-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 96-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 96-120(pA10,mldab5,2), rA0
+ movaps 96-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 96-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 96-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 96-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 96-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 96-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 96-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 96-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 96-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 96-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 96-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 96-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 96-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 96-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 28
- movaps 112-120(pA10,mldab5,2), rA0
- movaps 112-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 112-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 112-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 112-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 112-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 112-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 112-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 112-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 112-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 112-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 112-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 112-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 112-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 112-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 112-120(pA10,mldab5,2), rA0
+ movaps 112-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 112-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 112-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 112-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 112-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 112-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 112-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 112-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 112-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 112-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 112-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 112-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 112-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 112-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 32
- movaps 128-120(pA10,mldab5,2), rA0
- movaps 128-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 128-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 128-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 128-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 128-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 128-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 128-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 128-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 128-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 128-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 128-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 128-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 128-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 128-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 128-120(pA10,mldab5,2), rA0
+ movaps 128-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 128-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 128-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 128-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 128-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 128-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 128-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 128-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 128-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 128-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 128-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 128-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 128-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 128-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 36
- movaps 144-120(pA10,mldab5,2), rA0
- movaps 144-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 144-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 144-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 144-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 144-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 144-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 144-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 144-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 144-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 144-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 144-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 144-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 144-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 144-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 144-120(pA10,mldab5,2), rA0
+ movaps 144-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 144-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 144-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 144-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 144-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 144-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 144-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 144-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 144-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 144-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 144-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 144-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 144-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 144-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
- prefB((pB,ldab))
- prefB(64(pB,ldab))
+ prefB((pB,ldab))
+ prefB(64(pB,ldab))
#if KB > 40
- movaps 160-120(pA10,mldab5,2), rA0
- movaps 160-120(pB0), rB0
- mulps rB0, rA0
- addq $176, pB0
- addps rA0, rC0
- movaps 160-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 160-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 160-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 160-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 160-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 160-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 160-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 160-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 160-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 160-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 160-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 160-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addq $176, pA10
- addps rA0, rC12
- mulps 160-120(pA5,ldab,8), rB0
- addps rB0, rC13
- addq $176, pA5
+ movaps 160-120(pA10,mldab5,2), rA0
+ movaps 160-120(pB0), rB0
+ mulps rB0, rA0
+ addq $176, pB0
+ addps rA0, rC0
+ movaps 160-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 160-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 160-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 160-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 160-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 160-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 160-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 160-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 160-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 160-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 160-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 160-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addq $176, pA10
+ addps rA0, rC12
+ mulps 160-120(pA5,ldab,8), rB0
+ addps rB0, rC13
+ addq $176, pA5
#else
- addq $176, pB0
- addq $176, pA10
- addq $176, pA5
+ addq $176, pB0
+ addq $176, pA10
+ addq $176, pA5
#endif
#if KB > 44
- movaps 0-120(pA10,mldab5,2), rA0
- movaps 0-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 0-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 0-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 0-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 0-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 0-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 0-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 0-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 0-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 0-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 0-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 0-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 0-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 0-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 0-120(pA10,mldab5,2), rA0
+ movaps 0-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 0-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 0-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 0-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 0-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 0-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 0-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 0-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 0-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 0-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 0-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 0-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 0-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 0-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 48
- movaps 16-120(pA10,mldab5,2), rA0
- movaps 16-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 16-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 16-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 16-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 16-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 16-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 16-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 16-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 16-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 16-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 16-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 16-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 16-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 16-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 16-120(pA10,mldab5,2), rA0
+ movaps 16-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 16-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 16-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 16-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 16-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 16-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 16-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 16-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 16-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 16-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 16-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 16-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 16-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 16-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 52
- movaps 32-120(pA10,mldab5,2), rA0
- movaps 32-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 32-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 32-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 32-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 32-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 32-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 32-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 32-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 32-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 32-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 32-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 32-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 32-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 32-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 32-120(pA10,mldab5,2), rA0
+ movaps 32-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 32-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 32-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 32-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 32-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 32-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 32-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 32-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 32-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 32-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 32-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 32-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 32-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 32-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 56
- movaps 48-120(pA10,mldab5,2), rA0
- movaps 48-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 48-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 48-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 48-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 48-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 48-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 48-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 48-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 48-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 48-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 48-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 48-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 48-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 48-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 48-120(pA10,mldab5,2), rA0
+ movaps 48-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 48-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 48-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 48-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 48-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 48-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 48-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 48-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 48-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 48-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 48-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 48-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 48-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 48-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 60
- movaps 64-120(pA10,mldab5,2), rA0
- movaps 64-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 64-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 64-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 64-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 64-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 64-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 64-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 64-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 64-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 64-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 64-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 64-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 64-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 64-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 64-120(pA10,mldab5,2), rA0
+ movaps 64-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 64-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 64-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 64-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 64-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 64-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 64-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 64-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 64-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 64-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 64-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 64-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 64-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 64-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
- prefB(128-176(pB,ldab))
- prefB(192-176(pB,ldab))
+ prefB(128-176(pB,ldab))
+ prefB(192-176(pB,ldab))
#if KB > 64
- movaps 80-120(pA10,mldab5,2), rA0
- movaps 80-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 80-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 80-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 80-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 80-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 80-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 80-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 80-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 80-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 80-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 80-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 80-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 80-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 80-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 80-120(pA10,mldab5,2), rA0
+ movaps 80-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 80-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 80-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 80-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 80-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 80-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 80-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 80-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 80-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 80-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 80-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 80-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 80-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 80-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 68
- movaps 96-120(pA10,mldab5,2), rA0
- movaps 96-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 96-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 96-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 96-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 96-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 96-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 96-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 96-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 96-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 96-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 96-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 96-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 96-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 96-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 96-120(pA10,mldab5,2), rA0
+ movaps 96-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 96-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 96-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 96-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 96-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 96-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 96-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 96-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 96-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 96-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 96-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 96-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 96-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 96-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 72
- movaps 112-120(pA10,mldab5,2), rA0
- movaps 112-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 112-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 112-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 112-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 112-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 112-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 112-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 112-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 112-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 112-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 112-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 112-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 112-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 112-120(pA5,ldab,8), rB0
- prefC((pC))
- prefC((pC,incCn))
- addps rB0, rC13
+ movaps 112-120(pA10,mldab5,2), rA0
+ movaps 112-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 112-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 112-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 112-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 112-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 112-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 112-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 112-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 112-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 112-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 112-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 112-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 112-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 112-120(pA5,ldab,8), rB0
+ prefC((pC))
+ prefC((pC,incCn))
+ addps rB0, rC13
#else
- prefC((pC))
- prefC((pC,incCn))
+ prefC((pC))
+ prefC((pC,incCn))
#endif
#if KB > 76
- movaps 128-120(pA10,mldab5,2), rA0
- movaps 128-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 128-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 128-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 128-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 128-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 128-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 128-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 128-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 128-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 128-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 128-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 128-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 128-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 128-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 128-120(pA10,mldab5,2), rA0
+ movaps 128-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 128-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 128-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 128-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 128-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 128-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 128-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 128-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 128-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 128-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 128-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 128-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 128-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 128-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
#if KB > 80
- movaps 144-120(pA10,mldab5,2), rA0
- movaps 144-120(pB0), rB0
- mulps rB0, rA0
- addps rA0, rC0
- movaps 144-120(pA5, mldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC1
- movaps 144-120(pA10, mldab,8), rA0
- mulps rB0, rA0
- addps rA0, rC2
- movaps 144-120(pA5, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC3
- movaps 144-120(pA5, mldab), rA0
- mulps rB0, rA0
- addps rA0, rC4
- movaps 144-120(pA5), rA0
- mulps rB0, rA0
- addps rA0, rC5
- movaps 144-120(pA5, ldab), rA0
- mulps rB0, rA0
- addps rA0, rC6
- movaps 144-120(pA5, ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC7
- movaps 144-120(pA10, mldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC8
- movaps 144-120(pA5,ldab,4), rA0
- mulps rB0, rA0
- addps rA0, rC9
- movaps 144-120(pA10), rA0
- mulps rB0, rA0
- addps rA0, rC10
- movaps 144-120(pA10,ldab), rA0
- mulps rB0, rA0
- addps rA0, rC11
- movaps 144-120(pA10,ldab,2), rA0
- mulps rB0, rA0
- addps rA0, rC12
- mulps 144-120(pA5,ldab,8), rB0
- addps rB0, rC13
+ movaps 144-120(pA10,mldab5,2), rA0
+ movaps 144-120(pB0), rB0
+ mulps rB0, rA0
+ addps rA0, rC0
+ movaps 144-120(pA5, mldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC1
+ movaps 144-120(pA10, mldab,8), rA0
+ mulps rB0, rA0
+ addps rA0, rC2
+ movaps 144-120(pA5, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC3
+ movaps 144-120(pA5, mldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC4
+ movaps 144-120(pA5), rA0
+ mulps rB0, rA0
+ addps rA0, rC5
+ movaps 144-120(pA5, ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC6
+ movaps 144-120(pA5, ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC7
+ movaps 144-120(pA10, mldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC8
+ movaps 144-120(pA5,ldab,4), rA0
+ mulps rB0, rA0
+ addps rA0, rC9
+ movaps 144-120(pA10), rA0
+ mulps rB0, rA0
+ addps rA0, rC10
+ movaps 144-120(pA10,ldab), rA0
+ mulps rB0, rA0
+ addps rA0, rC11
+ movaps 144-120(pA10,ldab,2), rA0
+ mulps rB0, rA0
+ addps rA0, rC12
+ mulps 144-120(pA5,ldab,8), rB0
+ addps rB0, rC13
#endif
/*UKLOOP */
@@ -2454,202 +2461,202 @@ MLAST:
* Get these bastard things summed up correctly
*/
- /* rC0 = c0a c0b c0c c0d */
- /* rC1 = c1a c1b c1c c1d */
- /* rC2 = c2a c2b c2c c2d */
- /* rC3 = c3a c3b c3c c3d */
+ /* rC0 = c0a c0b c0c c0d */
+ /* rC1 = c1a c1b c1c c1d */
+ /* rC2 = c2a c2b c2c c2d */
+ /* rC3 = c3a c3b c3c c3d */
/* */
- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
- prefC(64(pC,incCn))
- prefB(256-176(pB,ldab))
- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
-
-
- /* rC4 = c4a c4b c4c c4d */
- /* rC5 = c5a c5b c5c c5d */
- /* rC6 = c6a c6b c6c c6d */
- /* rC7 = c7a c7b c7c c7d */
- /* rC8 = c08a c08b c08c c08d */
- /* rC9 = c09a c09b c09c c09d */
- /* rC10 = c10a c10b c10c c10d */
- /* rC11 = c11a c11b c11c c11d */
- /* rC12 = c12a c12b c12c c12d */
- /* rC13 = c13a c13b c13c c13d */
+ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
+ prefC(64(pC,incCn))
+ prefB(256-176(pB,ldab))
+ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
+ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
+ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
+ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
+ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
+ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
+ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
+ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
+ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
+ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
+ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
+ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
+ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
+ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
+ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
+ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
+
+
+ /* rC4 = c4a c4b c4c c4d */
+ /* rC5 = c5a c5b c5c c5d */
+ /* rC6 = c6a c6b c6c c6d */
+ /* rC7 = c7a c7b c7c c7d */
+ /* rC8 = c08a c08b c08c c08d */
+ /* rC9 = c09a c09b c09c c09d */
+ /* rC10 = c10a c10b c10c c10d */
+ /* rC11 = c11a c11b c11c c11d */
+ /* rC12 = c12a c12b c12c c12d */
+ /* rC13 = c13a c13b c13c c13d */
/* */
- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
+ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
+ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
+ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
+ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
+ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
+ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
+ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
+ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
+ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
+ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
+ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
+ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
+ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
+ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
+ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
+ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
#ifdef BETAX
#ifdef SREAL
- movups (pC), rA0
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
- movups 16(pC), rC4
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
- movups 32(pC), rC5
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
- movlps 48(pC), rC1
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
- mulps BOF(%rsp), rA0
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
- mulps BOF(%rsp), rC4
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
- mulps BOF(%rsp), rC5
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
- mulps BOF(%rsp), rC1
+ movups (pC), rA0
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+ movups 16(pC), rC4
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+ movups 32(pC), rC5
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+ movlps 48(pC), rC1
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+ mulps BOF(%rsp), rA0
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+ mulps BOF(%rsp), rC4
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+ mulps BOF(%rsp), rC5
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+ mulps BOF(%rsp), rC1
/* */
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
- addps rA0, rC3
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
- addps rC4, rC7
- addps rC5, rC11
- prefB(320-176(pB,ldab))
- addps rC1, rC12
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+ addps rA0, rC3
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+ addps rC4, rC7
+ addps rC5, rC11
+ prefB(320-176(pB,ldab))
+ addps rC1, rC12
#else /* BETA = X, complex type */
- movups (pC), rA0
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
- movups 16(pC), rC4
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
- movups 32(pC), rC4 /* rC4 = c4 X c5 X */
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
- movups 48(pC), rC5 /* rC5 = c6 X c7 X */
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
- movups 64(pC), rC5 /* rC5 = c8 X c9 X */
- movups 80(pC), rC1 /* rC1 = c10 X c11 X */
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
- movss 96(pC), rC1
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
- movss 104(pC), rB0
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
- unpcklps rB0, rC1
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
- mulps BOF(%rsp), rA0
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
- mulps BOF(%rsp), rC4
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
- mulps BOF(%rsp), rC5
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
- mulps BOF(%rsp), rC1
+ movups (pC), rA0
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+ movups 16(pC), rC4
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
+ movups 32(pC), rC4 /* rC4 = c4 X c5 X */
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+ movups 48(pC), rC5 /* rC5 = c6 X c7 X */
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
+ movups 64(pC), rC5 /* rC5 = c8 X c9 X */
+ movups 80(pC), rC1 /* rC1 = c10 X c11 X */
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+ movss 96(pC), rC1
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+ movss 104(pC), rB0
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+ unpcklps rB0, rC1
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+ mulps BOF(%rsp), rA0
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+ mulps BOF(%rsp), rC4
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+ mulps BOF(%rsp), rC5
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+ mulps BOF(%rsp), rC1
/* */
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
- addps rA0, rC3
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
- addps rC4, rC7
- addps rC5, rC11
- prefB(320-176(pB,ldab))
- addps rC1, rC12
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+ addps rA0, rC3
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+ addps rC4, rC7
+ addps rC5, rC11
+ prefB(320-176(pB,ldab))
+ addps rC1, rC12
#endif
#else
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
/* */
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
- prefB(320-176(pB,ldab))
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
+ prefB(320-176(pB,ldab))
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
#endif
/*
* Write results back to C; pC += 14;
*/
#ifdef SREAL
- movups rC3, (pC)
- movups rC7, 16(pC)
- movups rC11, 32(pC)
- movlps rC12, 48(pC)
-/* addq $56, pC */
+ movups rC3, (pC)
+ movups rC7, 16(pC)
+ movups rC11, 32(pC)
+ movlps rC12, 48(pC)
+/* addq $56, pC */
#else
- movss rC3, (pC)
- movss rC7, 32(pC)
- movhlps rC3, rC0
- movhlps rC7, rC6
- movss rC0, 16(pC)
- movss rC6, 48(pC)
- shufps $0x55, rC3, rC3
- shufps $0x55, rC7, rC7
- movss rC3, 8(pC)
- movss rC7, 40(pC)
- shufps $0x55, rC0, rC0
- shufps $0x55, rC6, rC6
- movss rC0, 24(pC)
- movss rC6, 56(pC)
-
- movss rC11, 64(pC)
- movhlps rC11, rC2
- movss rC12, 96(pC)
- movss rC2, 80(pC)
- shufps $0x55, rC11, rC11
- shufps $0x55, rC12, rC12
- movss rC11, 72(pC)
- shufps $0x55, rC2, rC2
- movss rC12, 104(pC)
- movss rC2, 88(pC)
+ movss rC3, (pC)
+ movss rC7, 32(pC)
+ movhlps rC3, rC0
+ movhlps rC7, rC6
+ movss rC0, 16(pC)
+ movss rC6, 48(pC)
+ shufps $0x55, rC3, rC3
+ shufps $0x55, rC7, rC7
+ movss rC3, 8(pC)
+ movss rC7, 40(pC)
+ shufps $0x55, rC0, rC0
+ shufps $0x55, rC6, rC6
+ movss rC0, 24(pC)
+ movss rC6, 56(pC)
+
+ movss rC11, 64(pC)
+ movhlps rC11, rC2
+ movss rC12, 96(pC)
+ movss rC2, 80(pC)
+ shufps $0x55, rC11, rC11
+ shufps $0x55, rC12, rC12
+ movss rC11, 72(pC)
+ shufps $0x55, rC2, rC2
+ movss rC12, 104(pC)
+ movss rC2, 88(pC)
-/* addq $112, pC */
+/* addq $112, pC */
#endif
/*
* Write results back to C
@@ -2660,55 +2667,55 @@ MLAST:
/*
* while (pA != stM);
*/
-/* subq $1, stM */
-/* jne UMLOOP */
+/* subq $1, stM */
+/* jne UMLOOP */
/*
* pC += 14; pA += 14*NB; pB -= NB;
*/
-/* subq $MBKBso-NB14so+176, pA5 */
-/* subq $MBKBso-NB14so+176, pA10 */
- subq incAm, pA5
- subq incAm, pA10
- addq $NBso-176, pB0
+/* subq $MBKBso-NB14so+176, pA5 */
+/* subq $MBKBso-NB14so+176, pA10 */
+ subq incAm, pA5
+ subq incAm, pA10
+ addq $NBso-176, pB0
/*
* while (pA != stM);
*/
-/* subq $1, stM */
-/* jne UMLOOP */
+/* subq $1, stM */
+/* jne UMLOOP */
/*
* pC += incCn; pA -= NBNB; pB += NB;
*/
- addq incCn, pC
+ addq incCn, pC
/*
* while (pB != stN);
*/
- sub $1, stN
- jne UNLOOP
+ sub $1, stN
+ jne UNLOOP
/*
* Restore callee-saved iregs
*/
DONE:
- movq -8(%rsp), %rbp
- movq -16(%rsp), %rbx
+ movq -8(%rsp), %rbp
+ movq -16(%rsp), %rbx
#if MB == 0
- movq -32(%rsp), %r12
- movq -40(%rsp), %r13
+ movq -32(%rsp), %r12
+ movq -40(%rsp), %r13
#endif
- ret
+ ret
#if MB == 0
MB_LT84:
- cmp $70, stM
- jne MB_LT70
-/* movq $70/14, stM */
- movq $5, stM
- jmp MBFOUND
+ cmp $70, stM
+ jne MB_LT70
+/* movq $70/14, stM */
+ movq $5, stM
+ jmp MBFOUND
MB_LT70:
- cmp $56, stM
- jne MB_LT56
-/* movq $56/14, stM */
- movq $4, stM
- jmp MBFOUND
+ cmp $56, stM
+ jne MB_LT56
+/* movq $56/14, stM */
+ movq $4, stM
+ jmp MBFOUND
MB_LT56:
cmp $42, stM
jne MB_LT42
# tune/blas/level1/scalsrch.c (GenMainRout): disable the generated alpha == 0
# special case.
# The two hunks below wrap the fprintf() calls that emit the
#   if ( SCALAR_IS_ZERO(alpha) ) { ... return; }
# block in ONE C comment: the first hunk opens the comment in front of the
# first fprintf, the second hunk closes it after the fprintf that emits the
# closing brace.  The generator code is kept in place, it is just never run.
# NOTE(review): the comment also swallows the unchanged source lines between
# the hunks (old lines 754-755, not shown in this patch); this only works as
# long as none of the swallowed lines contains a comment terminator. Confirm
# against the 3.8.3 source before re-using the patch on another version.
# NOTE(review): presumably the intent is that alpha == 0 goes through the
# regular scaling code instead of Mjoin(PATL,set)(N, ATL_rzero, X, incx)
# (e.g. to keep NaN/Inf propagation); the patch does not state the reason,
# confirm with the patch author.
diff -rupN ATLAS/tune/blas/level1/scalsrch.c atlas-3.8.3/tune/blas/level1/scalsrch.c
--- ATLAS/tune/blas/level1/scalsrch.c 2009-02-18 19:48:25.000000000 +0100
+++ atlas-3.8.3/tune/blas/level1/scalsrch.c 2009-11-12 13:45:48.141174024 +0100
@@ -747,7 +747,7 @@ void GenMainRout(char pre, int n, int *i
/*
* Handle all special alpha cases
*/
- fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc);
+ /* fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc);
fprintf(fpout, "%s{\n", spc);
if (pre == 'c' || pre == 'z')
{
@@ -756,7 +756,7 @@ void GenMainRout(char pre, int n, int *i
}
else fprintf(fpout, "%s Mjoin(PATL,set)(N, ATL_rzero, X, incx);\n", spc);
fprintf(fpout, "%s return;\n", spc);
- fprintf(fpout, "%s}\n", spc);
+ fprintf(fpout, "%s}\n", spc); */
GenAlphCase(pre, spc, fpout, 1, n, ix, iy, ia, ib);
GenAlphCase(pre, spc, fpout, -1, n, ix, iy, ia, ib);
if (pre == 'c' || pre == 'z')