diff --git a/ARMeilleure/ARMeilleure.csproj b/ARMeilleure/ARMeilleure.csproj index 1c2135ed54..fa55511541 100644 --- a/ARMeilleure/ARMeilleure.csproj +++ b/ARMeilleure/ARMeilleure.csproj @@ -7,6 +7,7 @@ + diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs index 2ea4208b33..67736a31f2 100644 --- a/ARMeilleure/CodeGen/X86/Assembler.cs +++ b/ARMeilleure/CodeGen/X86/Assembler.cs @@ -1034,7 +1034,13 @@ namespace ARMeilleure.CodeGen.X86 Debug.Assert(opCode != BadOp, "Invalid opcode value."); - if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding) + if ((flags & InstructionFlags.Evex) != 0 && HardwareCapabilities.SupportsEvexEncoding) + { + WriteEvexInst(dest, src1, src2, type, flags, opCode); + + opCode &= 0xff; + } + else if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding) { // In a vex encoding, only one prefix can be active at a time. The active prefix is encoded in the second byte using two bits. @@ -1153,6 +1159,103 @@ namespace ARMeilleure.CodeGen.X86 } } + private void WriteEvexInst( + Operand dest, + Operand src1, + Operand src2, + OperandType type, + InstructionFlags flags, + int opCode, + bool broadcast = false, + int registerWidth = 128, + int maskRegisterIdx = 0, + bool zeroElements = false) + { + int op1Idx = dest.GetRegister().Index; + int op2Idx = src1.GetRegister().Index; + int op3Idx = src2.GetRegister().Index; + + WriteByte(0x62); + + // P0 + // Extend operand 1 register + bool r = (op1Idx & 8) == 0; + // Extend operand 3 register + bool x = (op3Idx & 16) == 0; + // Extend operand 3 register + bool b = (op3Idx & 8) == 0; + // Extend operand 1 register + bool rp = (op1Idx & 16) == 0; + // Escape code index + byte mm = 0b00; + + switch ((ushort)(opCode >> 8)) + { + case 0xf00: mm = 0b01; break; + case 0xf38: mm = 0b10; break; + case 0xf3a: mm = 0b11; break; + + default: Debug.Fail($"Failed to EVEX encode opcode 0x{opCode:X}."); break; + } + + WriteByte( + (byte)( + (r ? 0x80 : 0) | + (x ? 0x40 : 0) | + (b ? 0x20 : 0) | + (rp ? 0x10 : 0) | + mm)); + + // P1 + // Specify 64-bit lane mode + bool w = Is64Bits(type); + // Operand 2 register index + byte vvvv = (byte)(~op2Idx & 0b1111); + // Opcode prefix + byte pp = (flags & InstructionFlags.PrefixMask) switch + { + InstructionFlags.Prefix66 => 0b01, + InstructionFlags.PrefixF3 => 0b10, + InstructionFlags.PrefixF2 => 0b11, + _ => 0 + }; + WriteByte( + (byte)( + (w ? 0x80 : 0) | + (vvvv << 3) | + 0b100 | + pp)); + + // P2 + // Mask register determines what elements to zero, rather than what elements to merge + bool z = zeroElements; + // Specifies register-width + byte ll = 0b00; + switch (registerWidth) + { + case 128: ll = 0b00; break; + case 256: ll = 0b01; break; + case 512: ll = 0b10; break; + + default: Debug.Fail($"Invalid EVEX vector register width {registerWidth}."); break; + } + // Embedded broadcast in the case of a memory operand + bool bcast = broadcast; + // Extend operand 2 register + bool vp = (op2Idx & 16) == 0; + // Mask register index + Debug.Assert(maskRegisterIdx < 8, $"Invalid mask register index {maskRegisterIdx}."); + byte aaa = (byte)(maskRegisterIdx & 0b111); + + WriteByte( + (byte)( + (z ? 0x80 : 0) | + (ll << 5) | + (bcast ? 0x10 : 0) | + (vp ? 8 : 0) | + aaa)); + } + private void WriteCompactInst(Operand operand, int opCode) { int regIndex = operand.GetRegister().Index; diff --git a/ARMeilleure/CodeGen/X86/AssemblerTable.cs b/ARMeilleure/CodeGen/X86/AssemblerTable.cs index ecdc029f96..b47b3ecd1a 100644 --- a/ARMeilleure/CodeGen/X86/AssemblerTable.cs +++ b/ARMeilleure/CodeGen/X86/AssemblerTable.cs @@ -20,6 +20,7 @@ namespace ARMeilleure.CodeGen.X86 Reg8Dest = 1 << 2, RexW = 1 << 3, Vex = 1 << 4, + Evex = 1 << 5, PrefixBit = 16, PrefixMask = 7 << PrefixBit, @@ -278,6 +279,7 @@ namespace ARMeilleure.CodeGen.X86 Add(X86Instruction.Vfnmsub231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); Add(X86Instruction.Vfnmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Vpternlogd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a25, InstructionFlags.Evex | InstructionFlags.Prefix66)); Add(X86Instruction.Xor, new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None)); Add(X86Instruction.Xorpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Xorps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex)); diff --git a/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs b/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs index c12a4e28b7..63a9e46a24 100644 --- a/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs +++ b/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs @@ -1,10 +1,14 @@ +using Ryujinx.Memory; using System; +using System.Runtime.InteropServices; using System.Runtime.Intrinsics.X86; namespace ARMeilleure.CodeGen.X86 { static class HardwareCapabilities { + private delegate uint GetXcr0(); + static HardwareCapabilities() { if (!X86Base.IsSupported) @@ -24,6 +28,28 @@ namespace ARMeilleure.CodeGen.X86 FeatureInfo7Ebx = (FeatureFlags7Ebx)ebx7; FeatureInfo7Ecx = (FeatureFlags7Ecx)ecx7; } + + Xcr0InfoEax = (Xcr0FlagsEax)GetXcr0Eax(); + } + + private static uint GetXcr0Eax() + { + ReadOnlySpan asmGetXcr0 = new byte[] + { + 0x31, 0xc9, // xor ecx, ecx + 0xf, 0x01, 0xd0, // xgetbv + 0xc3, // ret + }; + + using MemoryBlock memGetXcr0 = new MemoryBlock((ulong)asmGetXcr0.Length); + + memGetXcr0.Write(0, asmGetXcr0); + + memGetXcr0.Reprotect(0, (ulong)asmGetXcr0.Length, MemoryPermission.ReadAndExecute); + + var fGetXcr0 = Marshal.GetDelegateForFunctionPointer(memGetXcr0.Pointer); + + return fGetXcr0(); } [Flags] @@ -44,6 +70,7 @@ namespace ARMeilleure.CodeGen.X86 Sse42 = 1 << 20, Popcnt = 1 << 23, Aes = 1 << 25, + Osxsave = 1 << 27, Avx = 1 << 28, F16c = 1 << 29 } @@ -52,7 +79,11 @@ namespace ARMeilleure.CodeGen.X86 public enum FeatureFlags7Ebx { Avx2 = 1 << 5, - Sha = 1 << 29 + Avx512f = 1 << 16, + Avx512dq = 1 << 17, + Sha = 1 << 29, + Avx512bw = 1 << 30, + Avx512vl = 1 << 31 } [Flags] @@ -61,10 +92,21 @@ namespace ARMeilleure.CodeGen.X86 Gfni = 1 << 8, } + [Flags] + public enum Xcr0FlagsEax + { + Sse = 1 << 1, + YmmHi128 = 1 << 2, + Opmask = 1 << 5, + ZmmHi256 = 1 << 6, + Hi16Zmm = 1 << 7 + } + public static FeatureFlags1Edx FeatureInfo1Edx { get; } public static FeatureFlags1Ecx FeatureInfo1Ecx { get; } public static FeatureFlags7Ebx FeatureInfo7Ebx { get; } = 0; public static FeatureFlags7Ecx FeatureInfo7Ecx { get; } = 0; + public static Xcr0FlagsEax Xcr0InfoEax { get; } = 0; public static bool SupportsSse => FeatureInfo1Edx.HasFlag(FeatureFlags1Edx.Sse); public static bool SupportsSse2 => FeatureInfo1Edx.HasFlag(FeatureFlags1Edx.Sse2); @@ -76,8 +118,13 @@ namespace ARMeilleure.CodeGen.X86 public static bool SupportsSse42 => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Sse42); public static bool SupportsPopcnt => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Popcnt); public static bool SupportsAesni => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Aes); - public static bool SupportsAvx => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Avx); + public static bool SupportsAvx => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Avx | FeatureFlags1Ecx.Osxsave) && Xcr0InfoEax.HasFlag(Xcr0FlagsEax.Sse | Xcr0FlagsEax.YmmHi128); public static bool SupportsAvx2 => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx2) && SupportsAvx; + public static bool SupportsAvx512F => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512f) && FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Osxsave) + && Xcr0InfoEax.HasFlag(Xcr0FlagsEax.Sse | Xcr0FlagsEax.YmmHi128 | Xcr0FlagsEax.Opmask | Xcr0FlagsEax.ZmmHi256 | Xcr0FlagsEax.Hi16Zmm); + public static bool SupportsAvx512Vl => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512vl) && SupportsAvx512F; + public static bool SupportsAvx512Bw => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512bw) && SupportsAvx512F; + public static bool SupportsAvx512Dq => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512dq) && SupportsAvx512F; public static bool SupportsF16c => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.F16c); public static bool SupportsSha => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Sha); public static bool SupportsGfni => FeatureInfo7Ecx.HasFlag(FeatureFlags7Ecx.Gfni); @@ -85,5 +132,6 @@ namespace ARMeilleure.CodeGen.X86 public static bool ForceLegacySse { get; set; } public static bool SupportsVexEncoding => SupportsAvx && !ForceLegacySse; + public static bool SupportsEvexEncoding => SupportsAvx512F && !ForceLegacySse; } } \ No newline at end of file diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs index 8c909ac13d..c788fa4424 100644 --- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs +++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs @@ -180,6 +180,7 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Vfnmadd231ss, new IntrinsicInfo(X86Instruction.Vfnmadd231ss, IntrinsicType.Fma)); Add(Intrinsic.X86Vfnmsub231sd, new IntrinsicInfo(X86Instruction.Vfnmsub231sd, IntrinsicType.Fma)); Add(Intrinsic.X86Vfnmsub231ss, new IntrinsicInfo(X86Instruction.Vfnmsub231ss, IntrinsicType.Fma)); + Add(Intrinsic.X86Vpternlogd, new IntrinsicInfo(X86Instruction.Vpternlogd, IntrinsicType.TernaryImm)); Add(Intrinsic.X86Xorpd, new IntrinsicInfo(X86Instruction.Xorpd, IntrinsicType.Binary)); Add(Intrinsic.X86Xorps, new IntrinsicInfo(X86Instruction.Xorps, IntrinsicType.Binary)); } diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs index b024394e16..ecfc432d70 100644 --- a/ARMeilleure/CodeGen/X86/X86Instruction.cs +++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs @@ -219,6 +219,7 @@ namespace ARMeilleure.CodeGen.X86 Vfnmsub231sd, Vfnmsub231ss, Vpblendvb, + Vpternlogd, Xor, Xorpd, Xorps, diff --git a/ARMeilleure/Instructions/InstEmitSimdLogical.cs b/ARMeilleure/Instructions/InstEmitSimdLogical.cs index 8ca815801a..2bf531e6c7 100644 --- a/ARMeilleure/Instructions/InstEmitSimdLogical.cs +++ b/ARMeilleure/Instructions/InstEmitSimdLogical.cs @@ -254,7 +254,22 @@ namespace ARMeilleure.Instructions public static void Not_V(ArmEmitterContext context) { - if (Optimizations.UseSse2) + if (Optimizations.UseAvx512Ortho) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Operand res = context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, n, Const(~0b10101010)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } + else if (Optimizations.UseSse2) { OpCodeSimd op = (OpCodeSimd)context.CurrOp; @@ -283,6 +298,22 @@ namespace ARMeilleure.Instructions { InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64OrnV); } + else if (Optimizations.UseAvx512Ortho) + { + OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; + + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + Operand res = context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, m, Const(0b11001100 | ~0b10101010)); + + if (op.RegisterSize == RegisterSize.Simd64) + { + res = context.VectorZeroUpper64(res); + } + + context.Copy(GetVec(op.Rd), res); + } else if (Optimizations.UseSse2) { OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; diff --git a/ARMeilleure/Instructions/InstEmitSimdLogical32.cs b/ARMeilleure/Instructions/InstEmitSimdLogical32.cs index c2a04778bb..68ef4ed175 100644 --- a/ARMeilleure/Instructions/InstEmitSimdLogical32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdLogical32.cs @@ -151,6 +151,13 @@ namespace ARMeilleure.Instructions { InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64OrnV | Intrinsic.Arm64V128, n, m)); } + else if (Optimizations.UseAvx512Ortho) + { + EmitVectorBinaryOpSimd32(context, (n, m) => + { + return context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, m, Const(0b11001100 | ~0b10101010)); + }); + } else if (Optimizations.UseSse2) { Operand mask = context.VectorOne(); diff --git a/ARMeilleure/Instructions/InstEmitSimdMove32.cs b/ARMeilleure/Instructions/InstEmitSimdMove32.cs index 17100eb9c8..b8b91b31d2 100644 --- a/ARMeilleure/Instructions/InstEmitSimdMove32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdMove32.cs @@ -34,7 +34,14 @@ namespace ARMeilleure.Instructions public static void Vmvn_I(ArmEmitterContext context) { - if (Optimizations.UseSse2) + if (Optimizations.UseAvx512Ortho) + { + EmitVectorUnaryOpSimd32(context, (op1) => + { + return context.AddIntrinsic(Intrinsic.X86Vpternlogd, op1, op1, Const(0b01010101)); + }); + } + else if (Optimizations.UseSse2) { EmitVectorUnaryOpSimd32(context, (op1) => { diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs index a665e4b7a1..b629345ee4 100644 --- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs +++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs @@ -173,6 +173,7 @@ namespace ARMeilleure.IntermediateRepresentation X86Vfnmadd231ss, X86Vfnmsub231sd, X86Vfnmsub231ss, + X86Vpternlogd, X86Xorpd, X86Xorps, diff --git a/ARMeilleure/Optimizations.cs b/ARMeilleure/Optimizations.cs index 9044314f60..a84a4dc4f5 100644 --- a/ARMeilleure/Optimizations.cs +++ b/ARMeilleure/Optimizations.cs @@ -23,6 +23,10 @@ namespace ARMeilleure public static bool UseSse42IfAvailable { get; set; } = true; public static bool UsePopCntIfAvailable { get; set; } = true; public static bool UseAvxIfAvailable { get; set; } = true; + public static bool UseAvx512FIfAvailable { get; set; } = true; + public static bool UseAvx512VlIfAvailable { get; set; } = true; + public static bool UseAvx512BwIfAvailable { get; set; } = true; + public static bool UseAvx512DqIfAvailable { get; set; } = true; public static bool UseF16cIfAvailable { get; set; } = true; public static bool UseFmaIfAvailable { get; set; } = true; public static bool UseAesniIfAvailable { get; set; } = true; @@ -47,11 +51,18 @@ namespace ARMeilleure internal static bool UseSse42 => UseSse42IfAvailable && X86HardwareCapabilities.SupportsSse42; internal static bool UsePopCnt => UsePopCntIfAvailable && X86HardwareCapabilities.SupportsPopcnt; internal static bool UseAvx => UseAvxIfAvailable && X86HardwareCapabilities.SupportsAvx && !ForceLegacySse; + internal static bool UseAvx512F => UseAvx512FIfAvailable && X86HardwareCapabilities.SupportsAvx512F && !ForceLegacySse; + internal static bool UseAvx512Vl => UseAvx512VlIfAvailable && X86HardwareCapabilities.SupportsAvx512Vl && !ForceLegacySse; + internal static bool UseAvx512Bw => UseAvx512BwIfAvailable && X86HardwareCapabilities.SupportsAvx512Bw && !ForceLegacySse; + internal static bool UseAvx512Dq => UseAvx512DqIfAvailable && X86HardwareCapabilities.SupportsAvx512Dq && !ForceLegacySse; internal static bool UseF16c => UseF16cIfAvailable && X86HardwareCapabilities.SupportsF16c; internal static bool UseFma => UseFmaIfAvailable && X86HardwareCapabilities.SupportsFma; internal static bool UseAesni => UseAesniIfAvailable && X86HardwareCapabilities.SupportsAesni; internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && X86HardwareCapabilities.SupportsPclmulqdq; internal static bool UseSha => UseShaIfAvailable && X86HardwareCapabilities.SupportsSha; internal static bool UseGfni => UseGfniIfAvailable && X86HardwareCapabilities.SupportsGfni; + + internal static bool UseAvx512Ortho => UseAvx512F && UseAvx512Vl; + internal static bool UseAvx512OrthoFloat => UseAvx512Ortho && UseAvx512Dq; } } diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs index 0b23fd043b..17f6870623 100644 --- a/ARMeilleure/Translation/PTC/Ptc.cs +++ b/ARMeilleure/Translation/PTC/Ptc.cs @@ -30,7 +30,7 @@ namespace ARMeilleure.Translation.PTC private const string OuterHeaderMagicString = "PTCohd\0\0"; private const string InnerHeaderMagicString = "PTCihd\0\0"; - private const uint InternalVersion = 4484; //! To be incremented manually for each change to the ARMeilleure project. + private const uint InternalVersion = 4485; //! To be incremented manually for each change to the ARMeilleure project. private const string ActualDir = "0"; private const string BackupDir = "1"; @@ -969,6 +969,7 @@ namespace ARMeilleure.Translation.PTC (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap, (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap2, (ulong)Arm64HardwareCapabilities.MacOsFeatureInfo, + 0, 0); } else if (RuntimeInformation.ProcessArchitecture == Architecture.X64) @@ -977,11 +978,12 @@ namespace ARMeilleure.Translation.PTC (ulong)X86HardwareCapabilities.FeatureInfo1Ecx, (ulong)X86HardwareCapabilities.FeatureInfo1Edx, (ulong)X86HardwareCapabilities.FeatureInfo7Ebx, - (ulong)X86HardwareCapabilities.FeatureInfo7Ecx); + (ulong)X86HardwareCapabilities.FeatureInfo7Ecx, + (ulong)X86HardwareCapabilities.Xcr0InfoEax); } else { - return new FeatureInfo(0, 0, 0, 0); + return new FeatureInfo(0, 0, 0, 0, 0); } } @@ -1002,7 +1004,7 @@ namespace ARMeilleure.Translation.PTC return osPlatform; } - [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 78*/)] + [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 86*/)] private struct OuterHeader { public ulong Magic; @@ -1034,8 +1036,8 @@ namespace ARMeilleure.Translation.PTC } } - [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 32*/)] - private record struct FeatureInfo(ulong FeatureInfo0, ulong FeatureInfo1, ulong FeatureInfo2, ulong FeatureInfo3); + [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 40*/)] + private record struct FeatureInfo(ulong FeatureInfo0, ulong FeatureInfo1, ulong FeatureInfo2, ulong FeatureInfo3, ulong FeatureInfo4); [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 128*/)] private struct InnerHeader