aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ARMeilleure/ARMeilleure.csproj1
-rw-r--r--ARMeilleure/CodeGen/X86/Assembler.cs105
-rw-r--r--ARMeilleure/CodeGen/X86/AssemblerTable.cs2
-rw-r--r--ARMeilleure/CodeGen/X86/HardwareCapabilities.cs52
-rw-r--r--ARMeilleure/CodeGen/X86/IntrinsicTable.cs1
-rw-r--r--ARMeilleure/CodeGen/X86/X86Instruction.cs1
-rw-r--r--ARMeilleure/Instructions/InstEmitSimdLogical.cs33
-rw-r--r--ARMeilleure/Instructions/InstEmitSimdLogical32.cs7
-rw-r--r--ARMeilleure/Instructions/InstEmitSimdMove32.cs9
-rw-r--r--ARMeilleure/IntermediateRepresentation/Intrinsic.cs1
-rw-r--r--ARMeilleure/Optimizations.cs11
-rw-r--r--ARMeilleure/Translation/PTC/Ptc.cs14
12 files changed, 226 insertions, 11 deletions
diff --git a/ARMeilleure/ARMeilleure.csproj b/ARMeilleure/ARMeilleure.csproj
index 1c2135ed5..fa5551154 100644
--- a/ARMeilleure/ARMeilleure.csproj
+++ b/ARMeilleure/ARMeilleure.csproj
@@ -7,6 +7,7 @@
7 7
8 <ItemGroup> 8 <ItemGroup>
9 <ProjectReference Include="..\Ryujinx.Common\Ryujinx.Common.csproj" /> 9 <ProjectReference Include="..\Ryujinx.Common\Ryujinx.Common.csproj" />
10 <ProjectReference Include="..\Ryujinx.Memory\Ryujinx.Memory.csproj" />
10 </ItemGroup> 11 </ItemGroup>
11 12
12 <ItemGroup> 13 <ItemGroup>
diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs
index 2ea4208b3..67736a31f 100644
--- a/ARMeilleure/CodeGen/X86/Assembler.cs
+++ b/ARMeilleure/CodeGen/X86/Assembler.cs
@@ -1034,7 +1034,13 @@ namespace ARMeilleure.CodeGen.X86
1034 1034
1035 Debug.Assert(opCode != BadOp, "Invalid opcode value."); 1035 Debug.Assert(opCode != BadOp, "Invalid opcode value.");
1036 1036
1037 if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding) 1037 if ((flags & InstructionFlags.Evex) != 0 && HardwareCapabilities.SupportsEvexEncoding)
1038 {
1039 WriteEvexInst(dest, src1, src2, type, flags, opCode);
1040
1041 opCode &= 0xff;
1042 }
1043 else if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding)
1038 { 1044 {
1039 // In a vex encoding, only one prefix can be active at a time. The active prefix is encoded in the second byte using two bits. 1045 // In a vex encoding, only one prefix can be active at a time. The active prefix is encoded in the second byte using two bits.
1040 1046
@@ -1153,6 +1159,103 @@ namespace ARMeilleure.CodeGen.X86
1153 } 1159 }
1154 } 1160 }
1155 1161
/// <summary>
/// Writes the four-byte EVEX prefix (the 0x62 escape byte followed by the P0, P1 and P2 payload bytes)
/// for an instruction that takes three register operands.
/// Only the prefix is written here; the opcode byte and the remaining bytes are left to the caller.
/// </summary>
/// <param name="dest">Operand 1; its register index supplies the R and R' extension bits</param>
/// <param name="src1">Operand 2; its register index is stored inverted in vvvv and V'</param>
/// <param name="src2">Operand 3; its register index supplies the X and B extension bits</param>
/// <param name="type">Operand type, used to select the W (64-bit lane) bit</param>
/// <param name="flags">Instruction flags; the prefix part selects the pp field</param>
/// <param name="opCode">Opcode including its escape bytes (0x0f, 0x0f38 or 0x0f3a) above the low byte</param>
/// <param name="broadcast">Value of the embedded broadcast bit</param>
/// <param name="registerWidth">Vector register width in bits: 128, 256 or 512</param>
/// <param name="maskRegisterIdx">Mask register index, must be in the range 0 to 7</param>
/// <param name="zeroElements">Value of the z bit (zero masked elements rather than merge them)</param>
private void WriteEvexInst(
    Operand dest,
    Operand src1,
    Operand src2,
    OperandType type,
    InstructionFlags flags,
    int opCode,
    bool broadcast = false,
    int registerWidth = 128,
    int maskRegisterIdx = 0,
    bool zeroElements = false)
{
    int op1Idx = dest.GetRegister().Index;
    int op2Idx = src1.GetRegister().Index;
    int op3Idx = src2.GetRegister().Index;

    WriteByte(0x62);

    // P0
    // All register extension bits are stored inverted: the bit is set when the index bit is clear.
    // Extend operand 1 register (index bit 3)
    bool r = (op1Idx & 8) == 0;
    // Extend operand 3 register (index bit 4)
    bool x = (op3Idx & 16) == 0;
    // Extend operand 3 register (index bit 3)
    bool b = (op3Idx & 8) == 0;
    // Extend operand 1 register (index bit 4)
    bool rp = (op1Idx & 16) == 0;
    // Escape code index
    byte mm = 0b00;

    // The escape bytes sit directly above the low opcode byte, so a two-byte opcode such as 0x0f57
    // yields 0x0f here (not 0xf00), while three-byte opcodes yield 0x0f38 or 0x0f3a.
    switch ((ushort)(opCode >> 8))
    {
        case 0xf: mm = 0b01; break;
        case 0xf38: mm = 0b10; break;
        case 0xf3a: mm = 0b11; break;

        default: Debug.Fail($"Failed to EVEX encode opcode 0x{opCode:X}."); break;
    }

    WriteByte(
        (byte)(
            (r ? 0x80 : 0) |
            (x ? 0x40 : 0) |
            (b ? 0x20 : 0) |
            (rp ? 0x10 : 0) |
            mm));

    // P1
    // Specify 64-bit lane mode
    bool w = Is64Bits(type);
    // Operand 2 register index (low four bits, stored inverted)
    byte vvvv = (byte)(~op2Idx & 0b1111);
    // Opcode prefix
    byte pp = (flags & InstructionFlags.PrefixMask) switch
    {
        InstructionFlags.Prefix66 => 0b01,
        InstructionFlags.PrefixF3 => 0b10,
        InstructionFlags.PrefixF2 => 0b11,
        _ => 0
    };

    // Bit 2 of P1 is always written as 1.
    WriteByte(
        (byte)(
            (w ? 0x80 : 0) |
            (vvvv << 3) |
            0b100 |
            pp));

    // P2
    // Mask register determines what elements to zero, rather than what elements to merge
    bool z = zeroElements;
    // Specifies register-width
    byte ll = 0b00;
    switch (registerWidth)
    {
        case 128: ll = 0b00; break;
        case 256: ll = 0b01; break;
        case 512: ll = 0b10; break;

        default: Debug.Fail($"Invalid EVEX vector register width {registerWidth}."); break;
    }
    // Embedded broadcast in the case of a memory operand
    bool bcast = broadcast;
    // Extend operand 2 register (index bit 4, stored inverted)
    bool vp = (op2Idx & 16) == 0;
    // Mask register index; the unsigned comparison also rejects negative values
    Debug.Assert((uint)maskRegisterIdx < 8, $"Invalid mask register index {maskRegisterIdx}.");
    byte aaa = (byte)(maskRegisterIdx & 0b111);

    WriteByte(
        (byte)(
            (z ? 0x80 : 0) |
            (ll << 5) |
            (bcast ? 0x10 : 0) |
            (vp ? 8 : 0) |
            aaa));
}
1258
1156 private void WriteCompactInst(Operand operand, int opCode) 1259 private void WriteCompactInst(Operand operand, int opCode)
1157 { 1260 {
1158 int regIndex = operand.GetRegister().Index; 1261 int regIndex = operand.GetRegister().Index;
diff --git a/ARMeilleure/CodeGen/X86/AssemblerTable.cs b/ARMeilleure/CodeGen/X86/AssemblerTable.cs
index ecdc029f9..b47b3ecd1 100644
--- a/ARMeilleure/CodeGen/X86/AssemblerTable.cs
+++ b/ARMeilleure/CodeGen/X86/AssemblerTable.cs
@@ -20,6 +20,7 @@ namespace ARMeilleure.CodeGen.X86
20 Reg8Dest = 1 << 2, 20 Reg8Dest = 1 << 2,
21 RexW = 1 << 3, 21 RexW = 1 << 3,
22 Vex = 1 << 4, 22 Vex = 1 << 4,
23 Evex = 1 << 5,
23 24
24 PrefixBit = 16, 25 PrefixBit = 16,
25 PrefixMask = 7 << PrefixBit, 26 PrefixMask = 7 << PrefixBit,
@@ -278,6 +279,7 @@ namespace ARMeilleure.CodeGen.X86
278 Add(X86Instruction.Vfnmsub231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); 279 Add(X86Instruction.Vfnmsub231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
279 Add(X86Instruction.Vfnmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66)); 280 Add(X86Instruction.Vfnmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bf, InstructionFlags.Vex | InstructionFlags.Prefix66));
280 Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66)); 281 Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66));
282 Add(X86Instruction.Vpternlogd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a25, InstructionFlags.Evex | InstructionFlags.Prefix66));
281 Add(X86Instruction.Xor, new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None)); 283 Add(X86Instruction.Xor, new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None));
282 Add(X86Instruction.Xorpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66)); 284 Add(X86Instruction.Xorpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66));
283 Add(X86Instruction.Xorps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex)); 285 Add(X86Instruction.Xorps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex));
diff --git a/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs b/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs
index c12a4e28b..63a9e46a2 100644
--- a/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs
+++ b/ARMeilleure/CodeGen/X86/HardwareCapabilities.cs
@@ -1,10 +1,14 @@
1using Ryujinx.Memory;
1using System; 2using System;
3using System.Runtime.InteropServices;
2using System.Runtime.Intrinsics.X86; 4using System.Runtime.Intrinsics.X86;
3 5
4namespace ARMeilleure.CodeGen.X86 6namespace ARMeilleure.CodeGen.X86
5{ 7{
6 static class HardwareCapabilities 8 static class HardwareCapabilities
7 { 9 {
10 private delegate uint GetXcr0();
11
8 static HardwareCapabilities() 12 static HardwareCapabilities()
9 { 13 {
10 if (!X86Base.IsSupported) 14 if (!X86Base.IsSupported)
@@ -24,6 +28,28 @@ namespace ARMeilleure.CodeGen.X86
24 FeatureInfo7Ebx = (FeatureFlags7Ebx)ebx7; 28 FeatureInfo7Ebx = (FeatureFlags7Ebx)ebx7;
25 FeatureInfo7Ecx = (FeatureFlags7Ecx)ecx7; 29 FeatureInfo7Ecx = (FeatureFlags7Ecx)ecx7;
26 } 30 }
31
32 Xcr0InfoEax = (Xcr0FlagsEax)GetXcr0Eax();
33 }
34
/// <summary>
/// Reads the low 32 bits of extended control register 0 (XCR0) by running a small
/// machine code stub that executes xgetbv with ECX = 0.
/// </summary>
/// <returns>The value left in EAX by xgetbv, or 0 when the OSXSAVE feature bit is not set</returns>
private static uint GetXcr0Eax()
{
    // xgetbv faults with an invalid opcode exception unless the OS has enabled XSAVE,
    // which CPUID reports through the OSXSAVE bit. Without it no extended state is
    // enabled, so report an empty XCR0 instead of executing the stub.
    // NOTE(review): relies on FeatureInfo1Ecx being assigned by the static constructor before this call - confirm.
    if (!FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Osxsave))
    {
        return 0;
    }

    ReadOnlySpan<byte> asmGetXcr0 = new byte[]
    {
        0x31, 0xc9, // xor ecx, ecx
        0xf, 0x01, 0xd0, // xgetbv
        0xc3, // ret
    };

    using MemoryBlock memGetXcr0 = new MemoryBlock((ulong)asmGetXcr0.Length);

    memGetXcr0.Write(0, asmGetXcr0);

    // The stub is written first and only then made executable.
    memGetXcr0.Reprotect(0, (ulong)asmGetXcr0.Length, MemoryPermission.ReadAndExecute);

    var fGetXcr0 = Marshal.GetDelegateForFunctionPointer<GetXcr0>(memGetXcr0.Pointer);

    return fGetXcr0();
}
28 54
29 [Flags] 55 [Flags]
@@ -44,6 +70,7 @@ namespace ARMeilleure.CodeGen.X86
44 Sse42 = 1 << 20, 70 Sse42 = 1 << 20,
45 Popcnt = 1 << 23, 71 Popcnt = 1 << 23,
46 Aes = 1 << 25, 72 Aes = 1 << 25,
73 Osxsave = 1 << 27,
47 Avx = 1 << 28, 74 Avx = 1 << 28,
48 F16c = 1 << 29 75 F16c = 1 << 29
49 } 76 }
@@ -52,7 +79,11 @@ namespace ARMeilleure.CodeGen.X86
52 public enum FeatureFlags7Ebx 79 public enum FeatureFlags7Ebx
53 { 80 {
54 Avx2 = 1 << 5, 81 Avx2 = 1 << 5,
55 Sha = 1 << 29 82 Avx512f = 1 << 16,
83 Avx512dq = 1 << 17,
84 Sha = 1 << 29,
85 Avx512bw = 1 << 30,
86 Avx512vl = 1 << 31
56 } 87 }
57 88
58 [Flags] 89 [Flags]
@@ -61,10 +92,21 @@ namespace ARMeilleure.CodeGen.X86
61 Gfni = 1 << 8, 92 Gfni = 1 << 8,
62 } 93 }
63 94
/// <summary>
/// State component bits found in the low 32 bits of XCR0 (the EAX result of xgetbv with ECX = 0).
/// </summary>
[Flags]
public enum Xcr0FlagsEax
{
    // Bit 1
    Sse = 0x02,
    // Bit 2
    YmmHi128 = 0x04,
    // Bit 5
    Opmask = 0x20,
    // Bit 6
    ZmmHi256 = 0x40,
    // Bit 7
    Hi16Zmm = 0x80
}
104
64 public static FeatureFlags1Edx FeatureInfo1Edx { get; } 105 public static FeatureFlags1Edx FeatureInfo1Edx { get; }
65 public static FeatureFlags1Ecx FeatureInfo1Ecx { get; } 106 public static FeatureFlags1Ecx FeatureInfo1Ecx { get; }
66 public static FeatureFlags7Ebx FeatureInfo7Ebx { get; } = 0; 107 public static FeatureFlags7Ebx FeatureInfo7Ebx { get; } = 0;
67 public static FeatureFlags7Ecx FeatureInfo7Ecx { get; } = 0; 108 public static FeatureFlags7Ecx FeatureInfo7Ecx { get; } = 0;
109 public static Xcr0FlagsEax Xcr0InfoEax { get; } = 0;
68 110
69 public static bool SupportsSse => FeatureInfo1Edx.HasFlag(FeatureFlags1Edx.Sse); 111 public static bool SupportsSse => FeatureInfo1Edx.HasFlag(FeatureFlags1Edx.Sse);
70 public static bool SupportsSse2 => FeatureInfo1Edx.HasFlag(FeatureFlags1Edx.Sse2); 112 public static bool SupportsSse2 => FeatureInfo1Edx.HasFlag(FeatureFlags1Edx.Sse2);
@@ -76,8 +118,13 @@ namespace ARMeilleure.CodeGen.X86
76 public static bool SupportsSse42 => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Sse42); 118 public static bool SupportsSse42 => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Sse42);
77 public static bool SupportsPopcnt => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Popcnt); 119 public static bool SupportsPopcnt => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Popcnt);
78 public static bool SupportsAesni => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Aes); 120 public static bool SupportsAesni => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Aes);
79 public static bool SupportsAvx => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Avx); 121 public static bool SupportsAvx => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Avx | FeatureFlags1Ecx.Osxsave) && Xcr0InfoEax.HasFlag(Xcr0FlagsEax.Sse | Xcr0FlagsEax.YmmHi128);
80 public static bool SupportsAvx2 => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx2) && SupportsAvx; 122 public static bool SupportsAvx2 => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx2) && SupportsAvx;
123 public static bool SupportsAvx512F => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512f) && FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.Osxsave)
124 && Xcr0InfoEax.HasFlag(Xcr0FlagsEax.Sse | Xcr0FlagsEax.YmmHi128 | Xcr0FlagsEax.Opmask | Xcr0FlagsEax.ZmmHi256 | Xcr0FlagsEax.Hi16Zmm);
125 public static bool SupportsAvx512Vl => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512vl) && SupportsAvx512F;
126 public static bool SupportsAvx512Bw => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512bw) && SupportsAvx512F;
127 public static bool SupportsAvx512Dq => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Avx512dq) && SupportsAvx512F;
81 public static bool SupportsF16c => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.F16c); 128 public static bool SupportsF16c => FeatureInfo1Ecx.HasFlag(FeatureFlags1Ecx.F16c);
82 public static bool SupportsSha => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Sha); 129 public static bool SupportsSha => FeatureInfo7Ebx.HasFlag(FeatureFlags7Ebx.Sha);
83 public static bool SupportsGfni => FeatureInfo7Ecx.HasFlag(FeatureFlags7Ecx.Gfni); 130 public static bool SupportsGfni => FeatureInfo7Ecx.HasFlag(FeatureFlags7Ecx.Gfni);
@@ -85,5 +132,6 @@ namespace ARMeilleure.CodeGen.X86
85 public static bool ForceLegacySse { get; set; } 132 public static bool ForceLegacySse { get; set; }
86 133
87 public static bool SupportsVexEncoding => SupportsAvx && !ForceLegacySse; 134 public static bool SupportsVexEncoding => SupportsAvx && !ForceLegacySse;
135 public static bool SupportsEvexEncoding => SupportsAvx512F && !ForceLegacySse;
88 } 136 }
89} \ No newline at end of file 137} \ No newline at end of file
diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
index 8c909ac13..c788fa442 100644
--- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
@@ -180,6 +180,7 @@ namespace ARMeilleure.CodeGen.X86
180 Add(Intrinsic.X86Vfnmadd231ss, new IntrinsicInfo(X86Instruction.Vfnmadd231ss, IntrinsicType.Fma)); 180 Add(Intrinsic.X86Vfnmadd231ss, new IntrinsicInfo(X86Instruction.Vfnmadd231ss, IntrinsicType.Fma));
181 Add(Intrinsic.X86Vfnmsub231sd, new IntrinsicInfo(X86Instruction.Vfnmsub231sd, IntrinsicType.Fma)); 181 Add(Intrinsic.X86Vfnmsub231sd, new IntrinsicInfo(X86Instruction.Vfnmsub231sd, IntrinsicType.Fma));
182 Add(Intrinsic.X86Vfnmsub231ss, new IntrinsicInfo(X86Instruction.Vfnmsub231ss, IntrinsicType.Fma)); 182 Add(Intrinsic.X86Vfnmsub231ss, new IntrinsicInfo(X86Instruction.Vfnmsub231ss, IntrinsicType.Fma));
183 Add(Intrinsic.X86Vpternlogd, new IntrinsicInfo(X86Instruction.Vpternlogd, IntrinsicType.TernaryImm));
183 Add(Intrinsic.X86Xorpd, new IntrinsicInfo(X86Instruction.Xorpd, IntrinsicType.Binary)); 184 Add(Intrinsic.X86Xorpd, new IntrinsicInfo(X86Instruction.Xorpd, IntrinsicType.Binary));
184 Add(Intrinsic.X86Xorps, new IntrinsicInfo(X86Instruction.Xorps, IntrinsicType.Binary)); 185 Add(Intrinsic.X86Xorps, new IntrinsicInfo(X86Instruction.Xorps, IntrinsicType.Binary));
185 } 186 }
diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs
index b024394e1..ecfc432d7 100644
--- a/ARMeilleure/CodeGen/X86/X86Instruction.cs
+++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs
@@ -219,6 +219,7 @@ namespace ARMeilleure.CodeGen.X86
219 Vfnmsub231sd, 219 Vfnmsub231sd,
220 Vfnmsub231ss, 220 Vfnmsub231ss,
221 Vpblendvb, 221 Vpblendvb,
222 Vpternlogd,
222 Xor, 223 Xor,
223 Xorpd, 224 Xorpd,
224 Xorps, 225 Xorps,
diff --git a/ARMeilleure/Instructions/InstEmitSimdLogical.cs b/ARMeilleure/Instructions/InstEmitSimdLogical.cs
index 8ca815801..2bf531e6c 100644
--- a/ARMeilleure/Instructions/InstEmitSimdLogical.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdLogical.cs
@@ -254,7 +254,22 @@ namespace ARMeilleure.Instructions
254 254
255 public static void Not_V(ArmEmitterContext context) 255 public static void Not_V(ArmEmitterContext context)
256 { 256 {
257 if (Optimizations.UseSse2) 257 if (Optimizations.UseAvx512Ortho)
258 {
259 OpCodeSimd op = (OpCodeSimd)context.CurrOp;
260
261 Operand n = GetVec(op.Rn);
262
263 Operand res = context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, n, Const(~0b10101010));
264
265 if (op.RegisterSize == RegisterSize.Simd64)
266 {
267 res = context.VectorZeroUpper64(res);
268 }
269
270 context.Copy(GetVec(op.Rd), res);
271 }
272 else if (Optimizations.UseSse2)
258 { 273 {
259 OpCodeSimd op = (OpCodeSimd)context.CurrOp; 274 OpCodeSimd op = (OpCodeSimd)context.CurrOp;
260 275
@@ -283,6 +298,22 @@ namespace ARMeilleure.Instructions
283 { 298 {
284 InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64OrnV); 299 InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64OrnV);
285 } 300 }
301 else if (Optimizations.UseAvx512Ortho)
302 {
303 OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
304
305 Operand n = GetVec(op.Rn);
306 Operand m = GetVec(op.Rm);
307
308 Operand res = context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, m, Const(0b11001100 | ~0b10101010));
309
310 if (op.RegisterSize == RegisterSize.Simd64)
311 {
312 res = context.VectorZeroUpper64(res);
313 }
314
315 context.Copy(GetVec(op.Rd), res);
316 }
286 else if (Optimizations.UseSse2) 317 else if (Optimizations.UseSse2)
287 { 318 {
288 OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; 319 OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
diff --git a/ARMeilleure/Instructions/InstEmitSimdLogical32.cs b/ARMeilleure/Instructions/InstEmitSimdLogical32.cs
index c2a04778b..68ef4ed17 100644
--- a/ARMeilleure/Instructions/InstEmitSimdLogical32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdLogical32.cs
@@ -151,6 +151,13 @@ namespace ARMeilleure.Instructions
151 { 151 {
152 InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64OrnV | Intrinsic.Arm64V128, n, m)); 152 InstEmitSimdHelper32Arm64.EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(Intrinsic.Arm64OrnV | Intrinsic.Arm64V128, n, m));
153 } 153 }
154 else if (Optimizations.UseAvx512Ortho)
155 {
156 EmitVectorBinaryOpSimd32(context, (n, m) =>
157 {
158 return context.AddIntrinsic(Intrinsic.X86Vpternlogd, n, m, Const(0b11001100 | ~0b10101010));
159 });
160 }
154 else if (Optimizations.UseSse2) 161 else if (Optimizations.UseSse2)
155 { 162 {
156 Operand mask = context.VectorOne(); 163 Operand mask = context.VectorOne();
diff --git a/ARMeilleure/Instructions/InstEmitSimdMove32.cs b/ARMeilleure/Instructions/InstEmitSimdMove32.cs
index 17100eb9c..b8b91b31d 100644
--- a/ARMeilleure/Instructions/InstEmitSimdMove32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdMove32.cs
@@ -34,7 +34,14 @@ namespace ARMeilleure.Instructions
34 34
35 public static void Vmvn_I(ArmEmitterContext context) 35 public static void Vmvn_I(ArmEmitterContext context)
36 { 36 {
37 if (Optimizations.UseSse2) 37 if (Optimizations.UseAvx512Ortho)
38 {
39 EmitVectorUnaryOpSimd32(context, (op1) =>
40 {
41 return context.AddIntrinsic(Intrinsic.X86Vpternlogd, op1, op1, Const(0b01010101));
42 });
43 }
44 else if (Optimizations.UseSse2)
38 { 45 {
39 EmitVectorUnaryOpSimd32(context, (op1) => 46 EmitVectorUnaryOpSimd32(context, (op1) =>
40 { 47 {
diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
index a665e4b7a..b629345ee 100644
--- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
+++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
@@ -173,6 +173,7 @@ namespace ARMeilleure.IntermediateRepresentation
173 X86Vfnmadd231ss, 173 X86Vfnmadd231ss,
174 X86Vfnmsub231sd, 174 X86Vfnmsub231sd,
175 X86Vfnmsub231ss, 175 X86Vfnmsub231ss,
176 X86Vpternlogd,
176 X86Xorpd, 177 X86Xorpd,
177 X86Xorps, 178 X86Xorps,
178 179
diff --git a/ARMeilleure/Optimizations.cs b/ARMeilleure/Optimizations.cs
index 9044314f6..a84a4dc4f 100644
--- a/ARMeilleure/Optimizations.cs
+++ b/ARMeilleure/Optimizations.cs
@@ -23,6 +23,10 @@ namespace ARMeilleure
23 public static bool UseSse42IfAvailable { get; set; } = true; 23 public static bool UseSse42IfAvailable { get; set; } = true;
24 public static bool UsePopCntIfAvailable { get; set; } = true; 24 public static bool UsePopCntIfAvailable { get; set; } = true;
25 public static bool UseAvxIfAvailable { get; set; } = true; 25 public static bool UseAvxIfAvailable { get; set; } = true;
26 public static bool UseAvx512FIfAvailable { get; set; } = true;
27 public static bool UseAvx512VlIfAvailable { get; set; } = true;
28 public static bool UseAvx512BwIfAvailable { get; set; } = true;
29 public static bool UseAvx512DqIfAvailable { get; set; } = true;
26 public static bool UseF16cIfAvailable { get; set; } = true; 30 public static bool UseF16cIfAvailable { get; set; } = true;
27 public static bool UseFmaIfAvailable { get; set; } = true; 31 public static bool UseFmaIfAvailable { get; set; } = true;
28 public static bool UseAesniIfAvailable { get; set; } = true; 32 public static bool UseAesniIfAvailable { get; set; } = true;
@@ -47,11 +51,18 @@ namespace ARMeilleure
47 internal static bool UseSse42 => UseSse42IfAvailable && X86HardwareCapabilities.SupportsSse42; 51 internal static bool UseSse42 => UseSse42IfAvailable && X86HardwareCapabilities.SupportsSse42;
48 internal static bool UsePopCnt => UsePopCntIfAvailable && X86HardwareCapabilities.SupportsPopcnt; 52 internal static bool UsePopCnt => UsePopCntIfAvailable && X86HardwareCapabilities.SupportsPopcnt;
49 internal static bool UseAvx => UseAvxIfAvailable && X86HardwareCapabilities.SupportsAvx && !ForceLegacySse; 53 internal static bool UseAvx => UseAvxIfAvailable && X86HardwareCapabilities.SupportsAvx && !ForceLegacySse;
54 internal static bool UseAvx512F => UseAvx512FIfAvailable && X86HardwareCapabilities.SupportsAvx512F && !ForceLegacySse;
55 internal static bool UseAvx512Vl => UseAvx512VlIfAvailable && X86HardwareCapabilities.SupportsAvx512Vl && !ForceLegacySse;
56 internal static bool UseAvx512Bw => UseAvx512BwIfAvailable && X86HardwareCapabilities.SupportsAvx512Bw && !ForceLegacySse;
57 internal static bool UseAvx512Dq => UseAvx512DqIfAvailable && X86HardwareCapabilities.SupportsAvx512Dq && !ForceLegacySse;
50 internal static bool UseF16c => UseF16cIfAvailable && X86HardwareCapabilities.SupportsF16c; 58 internal static bool UseF16c => UseF16cIfAvailable && X86HardwareCapabilities.SupportsF16c;
51 internal static bool UseFma => UseFmaIfAvailable && X86HardwareCapabilities.SupportsFma; 59 internal static bool UseFma => UseFmaIfAvailable && X86HardwareCapabilities.SupportsFma;
52 internal static bool UseAesni => UseAesniIfAvailable && X86HardwareCapabilities.SupportsAesni; 60 internal static bool UseAesni => UseAesniIfAvailable && X86HardwareCapabilities.SupportsAesni;
53 internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && X86HardwareCapabilities.SupportsPclmulqdq; 61 internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && X86HardwareCapabilities.SupportsPclmulqdq;
54 internal static bool UseSha => UseShaIfAvailable && X86HardwareCapabilities.SupportsSha; 62 internal static bool UseSha => UseShaIfAvailable && X86HardwareCapabilities.SupportsSha;
55 internal static bool UseGfni => UseGfniIfAvailable && X86HardwareCapabilities.SupportsGfni; 63 internal static bool UseGfni => UseGfniIfAvailable && X86HardwareCapabilities.SupportsGfni;
64
65 internal static bool UseAvx512Ortho => UseAvx512F && UseAvx512Vl;
66 internal static bool UseAvx512OrthoFloat => UseAvx512Ortho && UseAvx512Dq;
56 } 67 }
57} 68}
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 0b23fd043..17f687062 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -30,7 +30,7 @@ namespace ARMeilleure.Translation.PTC
30 private const string OuterHeaderMagicString = "PTCohd\0\0"; 30 private const string OuterHeaderMagicString = "PTCohd\0\0";
31 private const string InnerHeaderMagicString = "PTCihd\0\0"; 31 private const string InnerHeaderMagicString = "PTCihd\0\0";
32 32
33 private const uint InternalVersion = 4484; //! To be incremented manually for each change to the ARMeilleure project. 33 private const uint InternalVersion = 4485; //! To be incremented manually for each change to the ARMeilleure project.
34 34
35 private const string ActualDir = "0"; 35 private const string ActualDir = "0";
36 private const string BackupDir = "1"; 36 private const string BackupDir = "1";
@@ -969,6 +969,7 @@ namespace ARMeilleure.Translation.PTC
969 (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap, 969 (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap,
970 (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap2, 970 (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap2,
971 (ulong)Arm64HardwareCapabilities.MacOsFeatureInfo, 971 (ulong)Arm64HardwareCapabilities.MacOsFeatureInfo,
972 0,
972 0); 973 0);
973 } 974 }
974 else if (RuntimeInformation.ProcessArchitecture == Architecture.X64) 975 else if (RuntimeInformation.ProcessArchitecture == Architecture.X64)
@@ -977,11 +978,12 @@ namespace ARMeilleure.Translation.PTC
977 (ulong)X86HardwareCapabilities.FeatureInfo1Ecx, 978 (ulong)X86HardwareCapabilities.FeatureInfo1Ecx,
978 (ulong)X86HardwareCapabilities.FeatureInfo1Edx, 979 (ulong)X86HardwareCapabilities.FeatureInfo1Edx,
979 (ulong)X86HardwareCapabilities.FeatureInfo7Ebx, 980 (ulong)X86HardwareCapabilities.FeatureInfo7Ebx,
980 (ulong)X86HardwareCapabilities.FeatureInfo7Ecx); 981 (ulong)X86HardwareCapabilities.FeatureInfo7Ecx,
982 (ulong)X86HardwareCapabilities.Xcr0InfoEax);
981 } 983 }
982 else 984 else
983 { 985 {
984 return new FeatureInfo(0, 0, 0, 0); 986 return new FeatureInfo(0, 0, 0, 0, 0);
985 } 987 }
986 } 988 }
987 989
@@ -1002,7 +1004,7 @@ namespace ARMeilleure.Translation.PTC
1002 return osPlatform; 1004 return osPlatform;
1003 } 1005 }
1004 1006
1005 [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 78*/)] 1007 [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 86*/)]
1006 private struct OuterHeader 1008 private struct OuterHeader
1007 { 1009 {
1008 public ulong Magic; 1010 public ulong Magic;
@@ -1034,8 +1036,8 @@ namespace ARMeilleure.Translation.PTC
1034 } 1036 }
1035 } 1037 }
1036 1038
1037 [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 32*/)] 1039 [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 40*/)]
1038 private record struct FeatureInfo(ulong FeatureInfo0, ulong FeatureInfo1, ulong FeatureInfo2, ulong FeatureInfo3); 1040 private record struct FeatureInfo(ulong FeatureInfo0, ulong FeatureInfo1, ulong FeatureInfo2, ulong FeatureInfo3, ulong FeatureInfo4);
1039 1041
1040 [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 128*/)] 1042 [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 128*/)]
1041 private struct InnerHeader 1043 private struct InnerHeader