mirror of
				https://github.com/Ryujinx/Ryujinx.git
				synced 2025-10-24 19:50:29 -07:00 
			
		
		
		
	Faster crc32 implementation (#1294)
* Add Pclmulqdq intrinsic
* Implement crc32 in terms of pclmulqdq
* Address PR comments
This commit is contained in:
		| @@ -165,6 +165,7 @@ namespace ARMeilleure.CodeGen.X86 | ||||
|             Add(X86Instruction.Pavgb,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000fe0, InstructionFlags.Vex | InstructionFlags.Prefix66)); | ||||
|             Add(X86Instruction.Pavgw,      new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000fe3, InstructionFlags.Vex | InstructionFlags.Prefix66)); | ||||
|             Add(X86Instruction.Pblendvb,   new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3810, InstructionFlags.Prefix66)); | ||||
|             Add(X86Instruction.Pclmulqdq,  new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3a44, InstructionFlags.Vex | InstructionFlags.Prefix66)); | ||||
|             Add(X86Instruction.Pcmpeqb,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f74, InstructionFlags.Vex | InstructionFlags.Prefix66)); | ||||
|             Add(X86Instruction.Pcmpeqd,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x00000f76, InstructionFlags.Vex | InstructionFlags.Prefix66)); | ||||
|             Add(X86Instruction.Pcmpeqq,    new InstructionInfo(BadOp,      BadOp,      BadOp,      BadOp,      0x000f3829, InstructionFlags.Vex | InstructionFlags.Prefix66)); | ||||
| @@ -633,6 +634,13 @@ namespace ARMeilleure.CodeGen.X86 | ||||
|             WriteInstruction(dest, source, type, X86Instruction.Or); | ||||
|         } | ||||
|  | ||||
|         public void Pclmulqdq(Operand dest, Operand source, byte imm) | ||||
|         { | ||||
              // Emits a PCLMULQDQ instruction: the opcode and operands are written by
              // WriteInstruction (dest, null as the second operand argument, source),
              // and imm is then appended as a single trailing byte.
              // NOTE(review): imm presumably selects which 64-bit halves are multiplied - confirm against the Intel SDM.
|             WriteInstruction(dest, null, source, X86Instruction.Pclmulqdq); | ||||
|  | ||||
|             WriteByte(imm); | ||||
|         } | ||||
|  | ||||
|         public void Pcmpeqw(Operand dest, Operand src1, Operand src2) | ||||
|         { | ||||
|             WriteInstruction(dest, src1, src2, X86Instruction.Pcmpeqw); | ||||
|   | ||||
| @@ -82,6 +82,7 @@ namespace ARMeilleure.CodeGen.X86 | ||||
|             Add(Intrinsic.X86Pavgb,      new IntrinsicInfo(X86Instruction.Pavgb,      IntrinsicType.Binary)); | ||||
|             Add(Intrinsic.X86Pavgw,      new IntrinsicInfo(X86Instruction.Pavgw,      IntrinsicType.Binary)); | ||||
|             Add(Intrinsic.X86Pblendvb,   new IntrinsicInfo(X86Instruction.Pblendvb,   IntrinsicType.Ternary)); | ||||
|             Add(Intrinsic.X86Pclmulqdq,  new IntrinsicInfo(X86Instruction.Pclmulqdq,  IntrinsicType.TernaryImm)); | ||||
|             Add(Intrinsic.X86Pcmpeqb,    new IntrinsicInfo(X86Instruction.Pcmpeqb,    IntrinsicType.Binary)); | ||||
|             Add(Intrinsic.X86Pcmpeqd,    new IntrinsicInfo(X86Instruction.Pcmpeqd,    IntrinsicType.Binary)); | ||||
|             Add(Intrinsic.X86Pcmpeqq,    new IntrinsicInfo(X86Instruction.Pcmpeqq,    IntrinsicType.Binary)); | ||||
|   | ||||
| @@ -98,6 +98,7 @@ namespace ARMeilleure.CodeGen.X86 | ||||
|         Pavgb, | ||||
|         Pavgw, | ||||
|         Pblendvb, | ||||
|         Pclmulqdq, | ||||
|         Pcmpeqb, | ||||
|         Pcmpeqd, | ||||
|         Pcmpeqq, | ||||
|   | ||||
| @@ -1,9 +1,13 @@ | ||||
| // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf | ||||
|  | ||||
| using ARMeilleure.Decoders; | ||||
| using ARMeilleure.IntermediateRepresentation; | ||||
| using ARMeilleure.Translation; | ||||
| using System; | ||||
|  | ||||
| using static ARMeilleure.Instructions.InstEmitHelper; | ||||
| using static ARMeilleure.Instructions.InstEmitSimdHelper; | ||||
| using static ARMeilleure.IntermediateRepresentation.OperandHelper; | ||||
|  | ||||
| namespace ARMeilleure.Instructions | ||||
| { | ||||
| @@ -11,42 +15,159 @@ namespace ARMeilleure.Instructions | ||||
|     { | ||||
|         public static void Crc32b(ArmEmitterContext context) | ||||
|         { | ||||
              // Emits Crc32b: when Optimizations.UsePclmulqdq is set, uses EmitCrc32Optimized
              // (castagnoli = false, bitsize = 8); otherwise calls SoftFallback.Crc32b.
              // NOTE(review): this text is a rendered diff with +/- markers stripped; the
              // unconditional EmitCrc32Call directly below appears to be the removed
              // pre-change line, replaced by the if/else - confirm against the repository.
|             EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32b)); | ||||
|             if (Optimizations.UsePclmulqdq) | ||||
|             { | ||||
|                 EmitCrc32Optimized(context, false, 8); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32b)); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         public static void Crc32h(ArmEmitterContext context) | ||||
|         { | ||||
              // Emits Crc32h: when Optimizations.UsePclmulqdq is set, uses EmitCrc32Optimized
              // (castagnoli = false, bitsize = 16); otherwise calls SoftFallback.Crc32h.
              // NOTE(review): this text is a rendered diff with +/- markers stripped; the
              // unconditional EmitCrc32Call directly below appears to be the removed
              // pre-change line, replaced by the if/else - confirm against the repository.
|             EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32h)); | ||||
|             if (Optimizations.UsePclmulqdq) | ||||
|             { | ||||
|                 EmitCrc32Optimized(context, false, 16); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32h)); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         public static void Crc32w(ArmEmitterContext context) | ||||
|         { | ||||
              // Emits Crc32w: when Optimizations.UsePclmulqdq is set, uses EmitCrc32Optimized
              // (castagnoli = false, bitsize = 32); otherwise calls SoftFallback.Crc32w.
              // NOTE(review): this text is a rendered diff with +/- markers stripped; the
              // unconditional EmitCrc32Call directly below appears to be the removed
              // pre-change line, replaced by the if/else - confirm against the repository.
|             EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32w)); | ||||
|             if (Optimizations.UsePclmulqdq) | ||||
|             { | ||||
|                 EmitCrc32Optimized(context, false, 32); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32w)); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         public static void Crc32x(ArmEmitterContext context) | ||||
|         { | ||||
              // Emits Crc32x: when Optimizations.UsePclmulqdq is set, uses EmitCrc32Optimized64
              // (castagnoli = false); otherwise calls SoftFallback.Crc32x.
              // NOTE(review): this text is a rendered diff with +/- markers stripped; the
              // unconditional EmitCrc32Call directly below appears to be the removed
              // pre-change line, replaced by the if/else - confirm against the repository.
|             EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32x)); | ||||
|             if (Optimizations.UsePclmulqdq) | ||||
|             { | ||||
|                 EmitCrc32Optimized64(context, false); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32x)); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         public static void Crc32cb(ArmEmitterContext context) | ||||
|         { | ||||
              // Emits Crc32cb: when Optimizations.UsePclmulqdq is set, uses EmitCrc32Optimized
              // (castagnoli = true, bitsize = 8); otherwise calls SoftFallback.Crc32cb.
              // NOTE(review): this text is a rendered diff with +/- markers stripped; the
              // unconditional EmitCrc32Call directly below appears to be the removed
              // pre-change line, replaced by the if/else - confirm against the repository.
|             EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32cb)); | ||||
|             if (Optimizations.UsePclmulqdq) | ||||
|             { | ||||
|                 EmitCrc32Optimized(context, true, 8); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32cb)); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         public static void Crc32ch(ArmEmitterContext context) | ||||
|         { | ||||
              // Emits Crc32ch: when Optimizations.UsePclmulqdq is set, uses EmitCrc32Optimized
              // (castagnoli = true, bitsize = 16); otherwise calls SoftFallback.Crc32ch.
              // NOTE(review): this text is a rendered diff with +/- markers stripped; the
              // unconditional EmitCrc32Call directly below appears to be the removed
              // pre-change line, replaced by the if/else - confirm against the repository.
|             EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32ch)); | ||||
|             if (Optimizations.UsePclmulqdq) | ||||
|             { | ||||
|                 EmitCrc32Optimized(context, true, 16); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32ch)); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         public static void Crc32cw(ArmEmitterContext context) | ||||
|         { | ||||
              // Emits Crc32cw: when Optimizations.UsePclmulqdq is set, uses EmitCrc32Optimized
              // (castagnoli = true, bitsize = 32); otherwise calls SoftFallback.Crc32cw.
              // NOTE(review): this text is a rendered diff with +/- markers stripped; the
              // unconditional EmitCrc32Call directly below appears to be the removed
              // pre-change line, replaced by the if/else - confirm against the repository.
|             EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32cw)); | ||||
|             if (Optimizations.UsePclmulqdq) | ||||
|             { | ||||
|                 EmitCrc32Optimized(context, true, 32); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32cw)); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         public static void Crc32cx(ArmEmitterContext context) | ||||
|         { | ||||
              // Emits Crc32cx: when Optimizations.UsePclmulqdq is set, uses EmitCrc32Optimized64
              // (castagnoli = true); otherwise calls SoftFallback.Crc32cx.
              // NOTE(review): this text is a rendered diff with +/- markers stripped; the
              // unconditional EmitCrc32Call directly below appears to be the removed
              // pre-change line, replaced by the if/else - confirm against the repository.
|             EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32cx)); | ||||
|             if (Optimizations.UsePclmulqdq) | ||||
|             { | ||||
|                 EmitCrc32Optimized64(context, true); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32cx)); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         private static void EmitCrc32Optimized(ArmEmitterContext context, bool castagnoli, int bitsize) | ||||
|         { | ||||
              // Emits a CRC32 step built from two X86Pclmulqdq intrinsics.
              //   castagnoli: selects which pair of constants (mu, polynomial) is used.
              //   bitsize:    width of the data operand; the callers visible here pass 8, 16 or 32.
              // The result is the 32-bit element at index 2 of the final vector, written to Rd.
|             OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp; | ||||
|  | ||||
|             long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' | ||||
|             long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1 | ||||
|  | ||||
              // Rn supplies the running CRC, Rm supplies the data.
|             Operand crc = GetIntOrZR(context, op.Rn); | ||||
|             Operand data = GetIntOrZR(context, op.Rm); | ||||
|  | ||||
              // Place the CRC in element 0 of an otherwise zero vector.
|             crc = context.VectorInsert(context.VectorZero(), crc, 0); | ||||
|  | ||||
              // Place the data in element 0 of a zero vector using the insert that matches
              // bitsize. There is no default case: any other bitsize leaves data untouched.
|             switch (bitsize) | ||||
|             { | ||||
|                 case 8: data = context.VectorInsert8(context.VectorZero(), data, 0); break; | ||||
|                 case 16: data = context.VectorInsert16(context.VectorZero(), data, 0); break; | ||||
|                 case 32: data = context.VectorInsert(context.VectorZero(), data, 0); break; | ||||
|             } | ||||
|  | ||||
              // XOR crc with data, shift left by (64 - bitsize), then multiply by mu and by
              // the polynomial, each with immediate 0.
|             Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); | ||||
|             tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize)); | ||||
|             tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0)); | ||||
|             tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); | ||||
|  | ||||
              // For 8- and 16-bit data the shifted original CRC is XORed back into the result.
              // NOTE(review): the (64 - bitsize) / 8 operand suggests X86Pslldq shifts by bytes - confirm.
|             if (bitsize < 32) | ||||
|             { | ||||
|                 crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8)); | ||||
|                 tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc); | ||||
|             } | ||||
|  | ||||
|             SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2)); | ||||
|         } | ||||
|  | ||||
|         private static void EmitCrc32Optimized64(ArmEmitterContext context, bool castagnoli) | ||||
|         { | ||||
              // Emits the CRC32 step used by Crc32x / Crc32cx, built from two rounds of
              // X86Pclmulqdq pairs.
              //   castagnoli: selects which pair of constants (mu, polynomial) is used.
              // The result is the 32-bit element at index 2 of the final vector, written to Rd.
|             OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp; | ||||
|  | ||||
|             long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' | ||||
|             long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1 | ||||
|  | ||||
              // Rn supplies the running CRC, Rm supplies the data.
|             Operand crc = GetIntOrZR(context, op.Rn); | ||||
|             Operand data = GetIntOrZR(context, op.Rm); | ||||
|  | ||||
              // Place each value in element 0 of an otherwise zero vector.
|             crc = context.VectorInsert(context.VectorZero(), crc, 0); | ||||
|             data = context.VectorInsert(context.VectorZero(), data, 0); | ||||
|  | ||||
              // XOR crc with data and shift the result with X86Pslldq by 4; this value (res)
              // is reused after the first multiply pair.
|             Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); | ||||
|             Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4)); | ||||
|  | ||||
              // First round: multiply by mu, then by the polynomial, both with immediate 0.
|             tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0)); | ||||
|             tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); | ||||
|  | ||||
              // Fold res back in and shift left by 32 before the second round.
|             tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res); | ||||
|             tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32)); | ||||
|  | ||||
              // Second round: the mu multiply uses immediate 1, the polynomial multiply uses 0.
              // NOTE(review): immediate 1 presumably selects the upper 64-bit half of tmp - confirm against the Intel SDM.
|             tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1)); | ||||
|             tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); | ||||
|  | ||||
|             SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2)); | ||||
|         } | ||||
|  | ||||
|         private static void EmitCrc32Call(ArmEmitterContext context, Delegate dlg) | ||||
|   | ||||
| @@ -71,6 +71,7 @@ namespace ARMeilleure.IntermediateRepresentation | ||||
|         X86Pavgb, | ||||
|         X86Pavgw, | ||||
|         X86Pblendvb, | ||||
|         X86Pclmulqdq, | ||||
|         X86Pcmpeqb, | ||||
|         X86Pcmpeqd, | ||||
|         X86Pcmpeqq, | ||||
|   | ||||
| @@ -8,15 +8,16 @@ namespace ARMeilleure | ||||
|  | ||||
|         public static bool FastFP { get; set; } = true; | ||||
|  | ||||
          // Public, settable feature toggles; each one defaults to true.
          // NOTE(review): this text is a rendered diff with +/- markers stripped; the first
          // nine declarations appear to be the removed pre-change lines and the re-aligned
          // ten below them (adding UsePclmulqdqIfAvailable) the replacement - confirm
          // against the repository.
|         public static bool UseSseIfAvailable    { get; set; } = true; | ||||
|         public static bool UseSse2IfAvailable   { get; set; } = true; | ||||
|         public static bool UseSse3IfAvailable   { get; set; } = true; | ||||
|         public static bool UseSsse3IfAvailable  { get; set; } = true; | ||||
|         public static bool UseSse41IfAvailable  { get; set; } = true; | ||||
|         public static bool UseSse42IfAvailable  { get; set; } = true; | ||||
|         public static bool UsePopCntIfAvailable { get; set; } = true; | ||||
|         public static bool UseAvxIfAvailable    { get; set; } = true; | ||||
|         public static bool UseAesniIfAvailable  { get; set; } = true; | ||||
|         public static bool UseSseIfAvailable       { get; set; } = true; | ||||
|         public static bool UseSse2IfAvailable      { get; set; } = true; | ||||
|         public static bool UseSse3IfAvailable      { get; set; } = true; | ||||
|         public static bool UseSsse3IfAvailable     { get; set; } = true; | ||||
|         public static bool UseSse41IfAvailable     { get; set; } = true; | ||||
|         public static bool UseSse42IfAvailable     { get; set; } = true; | ||||
|         public static bool UsePopCntIfAvailable    { get; set; } = true; | ||||
|         public static bool UseAvxIfAvailable       { get; set; } = true; | ||||
|         public static bool UseAesniIfAvailable     { get; set; } = true; | ||||
|         public static bool UsePclmulqdqIfAvailable { get; set; } = true; | ||||
|  | ||||
|         public static bool ForceLegacySse | ||||
|         { | ||||
| @@ -24,14 +25,15 @@ namespace ARMeilleure | ||||
|             set => HardwareCapabilities.ForceLegacySse = value; | ||||
|         } | ||||
|  | ||||
          // Effective feature flags: each is true only when its ...IfAvailable toggle is set
          // and HardwareCapabilities reports support. UseAvx additionally requires
          // ForceLegacySse to be false.
          // NOTE(review): this text is a rendered diff with +/- markers stripped; the first
          // nine declarations appear to be the removed pre-change lines and the re-aligned
          // ten below them (adding UsePclmulqdq) the replacement - confirm against the
          // repository.
|         internal static bool UseSse    => UseSseIfAvailable    && HardwareCapabilities.SupportsSse; | ||||
|         internal static bool UseSse2   => UseSse2IfAvailable   && HardwareCapabilities.SupportsSse2; | ||||
|         internal static bool UseSse3   => UseSse3IfAvailable   && HardwareCapabilities.SupportsSse3; | ||||
|         internal static bool UseSsse3  => UseSsse3IfAvailable  && HardwareCapabilities.SupportsSsse3; | ||||
|         internal static bool UseSse41  => UseSse41IfAvailable  && HardwareCapabilities.SupportsSse41; | ||||
|         internal static bool UseSse42  => UseSse42IfAvailable  && HardwareCapabilities.SupportsSse42; | ||||
|         internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt; | ||||
|         internal static bool UseAvx    => UseAvxIfAvailable    && HardwareCapabilities.SupportsAvx && !ForceLegacySse; | ||||
|         internal static bool UseAesni  => UseAesniIfAvailable  && HardwareCapabilities.SupportsAesni; | ||||
|         internal static bool UseSse       => UseSseIfAvailable       && HardwareCapabilities.SupportsSse; | ||||
|         internal static bool UseSse2      => UseSse2IfAvailable      && HardwareCapabilities.SupportsSse2; | ||||
|         internal static bool UseSse3      => UseSse3IfAvailable      && HardwareCapabilities.SupportsSse3; | ||||
|         internal static bool UseSsse3     => UseSsse3IfAvailable     && HardwareCapabilities.SupportsSsse3; | ||||
|         internal static bool UseSse41     => UseSse41IfAvailable     && HardwareCapabilities.SupportsSse41; | ||||
|         internal static bool UseSse42     => UseSse42IfAvailable     && HardwareCapabilities.SupportsSse42; | ||||
|         internal static bool UsePopCnt    => UsePopCntIfAvailable    && HardwareCapabilities.SupportsPopcnt; | ||||
|         internal static bool UseAvx       => UseAvxIfAvailable       && HardwareCapabilities.SupportsAvx && !ForceLegacySse; | ||||
|         internal static bool UseAesni     => UseAesniIfAvailable     && HardwareCapabilities.SupportsAesni; | ||||
|         internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq; | ||||
|     } | ||||
| } | ||||
		Reference in New Issue
	
	Block a user