early-access version 1279

This commit is contained in:
pineappleEA
2021-01-02 22:47:26 +01:00
parent 84d5e05316
commit f7b2c59575
35 changed files with 399 additions and 112 deletions

View File

@@ -39,6 +39,9 @@ enum class OptimizationFlag : std::uint32_t {
/// This is an UNSAFE optimization that reduces accuracy of certain floating-point instructions.
/// This allows results of FRECPE and FRSQRTE to have **less** error than spec allows.
Unsafe_ReducedErrorFP = 0x00020000,
+ /// This is an UNSAFE optimization that causes floating-point instructions to not produce correct NaNs.
+ /// This may also result in inaccurate results when instructions are given certain special values.
+ Unsafe_InaccurateNaN = 0x00040000,
};
constexpr OptimizationFlag no_optimizations = static_cast<OptimizationFlag>(0);
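A frontend opts in by setting the new flag on its config. A minimal sketch of that wiring, assuming dynarmic's usual UserConfig::optimizations field and a bitwise operator| defined for OptimizationFlag (names taken from dynarmic's public headers; the include path is illustrative):

    #include <dynarmic/optimization_flags.h>

    // Sketch: enable the new unsafe NaN behaviour on top of an existing config.
    // Unsafe_* flags trade architectural accuracy for shorter emitted code, so
    // they should stay behind a user-facing setting.
    template <typename UserConfig>
    UserConfig WithInaccurateNaN(UserConfig config) {
        config.optimizations = config.optimizations | Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
        return config;
    }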

View File

@@ -1080,29 +1080,40 @@ static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
auto& carry_in = args[2];
+ const bool is_cmp = inst->UseCount() == size_t(!!carry_inst + !!overflow_inst + !!nzcv_inst) && carry_in.IsImmediate() && carry_in.GetImmediateU1();
const Xbyak::Reg64 nzcv = DoNZCV(code, ctx.reg_alloc, nzcv_inst);
- const Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(bitsize);
+ const Xbyak::Reg result = (is_cmp ? ctx.reg_alloc.UseGpr(args[0]) : ctx.reg_alloc.UseScratchGpr(args[0])).changeBit(bitsize);
const Xbyak::Reg8 carry = DoCarry(ctx.reg_alloc, carry_in, carry_inst);
const Xbyak::Reg8 overflow = overflow_inst ? ctx.reg_alloc.ScratchGpr().cvt8() : Xbyak::Reg8{-1};
// TODO: Consider using LEA.
+ // TODO: Optimize CMP case.
// Note that x64 CF is inverse of what the ARM carry flag is here.
- if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
+ bool invert_output_carry = true;
+ if (is_cmp) {
+ if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
+ const u32 op_arg = args[1].GetImmediateU32();
+ code.cmp(result, op_arg);
+ } else {
+ OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+ op_arg.setBit(bitsize);
+ code.cmp(result, *op_arg);
+ }
+ } else if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
const u32 op_arg = args[1].GetImmediateU32();
if (carry_in.IsImmediate()) {
if (carry_in.GetImmediateU1()) {
code.sub(result, op_arg);
} else {
- code.stc();
- code.sbb(result, op_arg);
+ code.add(result, ~op_arg);
+ invert_output_carry = false;
}
} else {
code.bt(carry.cvt32(), 0);
- code.cmc();
- code.sbb(result, op_arg);
+ code.adc(result, ~op_arg);
+ invert_output_carry = false;
}
} else {
OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
@@ -1122,14 +1133,20 @@ static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit
}
if (nzcv_inst) {
- code.cmc();
+ if (invert_output_carry) {
+ code.cmc();
+ }
code.lahf();
code.seto(code.al);
ctx.reg_alloc.DefineValue(nzcv_inst, nzcv);
ctx.EraseInstruction(nzcv_inst);
}
if (carry_inst) {
- code.setnc(carry);
+ if (invert_output_carry) {
+ code.setnc(carry);
+ } else {
+ code.setc(carry);
+ }
ctx.reg_alloc.DefineValue(carry_inst, carry);
ctx.EraseInstruction(carry_inst);
}
@@ -1138,8 +1155,9 @@ static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit
ctx.reg_alloc.DefineValue(overflow_inst, overflow);
ctx.EraseInstruction(overflow_inst);
}
- ctx.reg_alloc.DefineValue(inst, result);
+ if (!is_cmp) {
+ ctx.reg_alloc.DefineValue(inst, result);
+ }
}
void EmitX64::EmitSub32(EmitContext& ctx, IR::Inst* inst) {
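The rewrite above also drops the STC/CMC + SBB sequences: ARM's subtract-with-carry is a - b - (1 - carry_in), which modulo 2^n equals a + ~b + carry_in, so an ADD/ADC against the inverted operand leaves x64's CF already equal to the ARM carry and the usual inversion can be skipped (invert_output_carry = false). A standalone C++ sketch checking that identity (not dynarmic code):

    #include <cassert>
    #include <cstdint>

    int main() {
        const std::uint64_t mask = 0xffffffffu;
        for (std::uint64_t a : {0u, 1u, 0x7fffffffu, 0xffffffffu}) {
            for (std::uint64_t b : {0u, 1u, 0x80000000u, 0xffffffffu}) {
                for (std::uint64_t c : {0u, 1u}) {
                    // ARM-style subtract with carry, truncated to 32 bits.
                    const std::uint64_t sub = (a - b - (1 - c)) & mask;
                    // What ADC with the inverted operand computes: a + ~b + carry_in.
                    const std::uint64_t adc = a + (~b & mask) + c;
                    assert(sub == (adc & mask));
                    // Bit 32 of adc is x64's CF here and already equals the ARM
                    // carry-out, so no CMC is needed on this path.
                }
            }
        }
    }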

View File

@@ -257,7 +257,7 @@ void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
- if (!ctx.FPCR().DN()) {
+ if (!ctx.FPCR().DN() && !ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
end = ProcessNaN<fsize>(code, result);
}
if constexpr (std::is_member_function_pointer_v<Function>) {
@@ -265,7 +265,9 @@ void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
} else {
fn(result);
}
- if (ctx.FPCR().DN()) {
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ // Do nothing
+ } else if (ctx.FPCR().DN()) {
ForceToDefaultNaN<fsize>(code, result);
} else {
PostProcessNaN<fsize>(code, result, ctx.reg_alloc.ScratchXmm());
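For reference, the three behaviours FPTwoOp now selects between: ProcessNaN/PostProcessNaN quieten and propagate input NaNs as the architecture requires, FPCR.DN forces any NaN result to the default NaN, and Unsafe_InaccurateNaN emits the bare SSE operation and keeps whatever the host produces. A standalone C++20 sketch of why that is observable, assuming the typical x64 behaviour of propagating a quiet NaN operand's payload (and no constant-folding surprises):

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // A quiet NaN carrying a nonzero payload.
        const float qnan = std::bit_cast<float>(0x7fc12345u);
        const float sum = qnan + 1.0f;
        // On a typical x64 host the payload survives (7fc12345); an ARM core
        // with FPCR.DN=1 would return the default NaN 7fc00000 instead.
        std::printf("%08x\n", std::bit_cast<std::uint32_t>(sum));
    }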
@@ -281,6 +283,20 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
+ if constexpr (std::is_member_function_pointer_v<Function>) {
+ (code.*fn)(result, operand);
+ } else {
+ fn(result, operand);
+ }
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
if (ctx.FPCR().DN()) {
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
@@ -590,9 +606,20 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
using FPT = mp::unsigned_integer_of_size<fsize>;
if constexpr (fsize != 16) {
- if (code.HasFMA()) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (code.HasFMA() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
+ FCODE(vfmadd231s)(result, operand2, operand3);
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ if (code.HasFMA()) {
Xbyak::Label end, fallback;
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
@@ -641,8 +668,6 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
}
if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
@@ -810,6 +835,22 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
using FPT = mp::unsigned_integer_of_size<fsize>;
if constexpr (fsize != 16) {
+ if (code.HasFMA() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ Xbyak::Label end, fallback;
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.movaps(result, code.MConst(xword, FP::FPValue<FPT, false, 0, 2>()));
+ FCODE(vfnmadd231s)(result, operand1, operand2);
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
if (code.HasFMA()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -998,6 +1039,21 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
using FPT = mp::unsigned_integer_of_size<fsize>;
if constexpr (fsize != 16) {
+ if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ code.vmovaps(result, code.MConst(xword, FP::FPValue<FPT, false, 0, 3>()));
+ FCODE(vfnmadd231s)(result, operand1, operand2);
+ FCODE(vmuls)(result, result, code.MConst(xword, FP::FPValue<FPT, false, -1, 1>()));
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
if (code.HasFMA() && code.HasAVX()) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
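The constants in these two fast paths decode, assuming dynarmic's FPValue<FPT, sign, exponent, value> convention of value * 2^exponent, to 2.0, 3.0 and 0.5, and VFNMADD231 computes dst = dst - src1*src2. So the emitted arithmetic is the ARM step operations FRECPS(a, b) = 2 - a*b and FRSQRTS(a, b) = (3 - a*b) * 0.5, minus their special-case handling. A plain C++ sketch of that arithmetic and the Newton-Raphson refinement guests build from it (illustrative values):

    #include <cstdio>

    static double frecps(double a, double b)  { return 2.0 - a * b; }         // vfnmadd231 against 2.0
    static double frsqrts(double a, double b) { return (3.0 - a * b) * 0.5; } // vfnmadd231 against 3.0, then *0.5

    int main() {
        const double x = 3.0;
        double r = 0.3;          // rough estimate of 1/3
        r *= frecps(x, r);       // one refinement step -> 0.33
        double s = 0.6;          // rough estimate of 1/sqrt(3)
        s *= frsqrts(x * s, s);  // one refinement step -> ~0.576
        std::printf("%f %f\n", r, s);
    }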

View File

@@ -290,7 +290,7 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[fpcr_controlled_arg_index].GetImmediateU1();
- if (ctx.FPCR(fpcr_controlled).DN()) {
+ if (ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
Xbyak::Xmm result;
if constexpr (std::is_member_function_pointer_v<Function>) {
@@ -306,7 +306,9 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
});
}
- ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
+ if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
+ }
ctx.reg_alloc.DefineValue(inst, result);
return;
@@ -342,7 +344,7 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool fpcr_controlled = args[2].GetImmediateU1();
- if (ctx.FPCR(fpcr_controlled).DN()) {
+ if (ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
@@ -356,7 +358,9 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
});
}
- ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), xmm_a);
+ if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), xmm_a);
+ }
ctx.reg_alloc.DefineValue(inst, xmm_a);
return;
@@ -985,11 +989,23 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
};
if constexpr (fsize != 16) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[3].GetImmediateU1();
+ if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+ const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+ const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+ const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
+ FCODE(vfmadd231p)(result, xmm_b, xmm_c);
+ });
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
if (code.HasFMA() && code.HasAVX()) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const bool fpcr_controlled = args[3].GetImmediateU1();
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
@@ -1025,8 +1041,6 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
}
if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
@@ -1233,10 +1247,24 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
};
if constexpr (fsize != 16) {
- if (code.HasFMA() && code.HasAVX()) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const bool fpcr_controlled = args[2].GetImmediateU1();
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+ if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
+ code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code));
+ FCODE(vfnmadd231p)(result, operand1, operand2);
+ });
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ if (code.HasFMA() && code.HasAVX()) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
@@ -1269,8 +1297,6 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
}
if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@@ -1428,10 +1454,25 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
};
if constexpr (fsize != 16) {
- if (code.HasFMA() && code.HasAVX()) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const bool fpcr_controlled = args[2].GetImmediateU1();
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const bool fpcr_controlled = args[2].GetImmediateU1();
+ if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+ const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+ const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+ const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
+ code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code));
+ FCODE(vfnmadd231p)(result, operand1, operand2);
+ FCODE(vmulp)(result, result, GetVectorOf<fsize, false, -1, 1>(code));
+ });
+ ctx.reg_alloc.DefineValue(inst, result);
+ return;
+ }
+ if (code.HasFMA() && code.HasAVX()) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
@@ -1470,8 +1511,6 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
}
if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
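This file also reshuffles the Unsafe_UnfuseFMA paths, which swap a single-rounding fused multiply-add for a separately rounded multiply and add (or, in the vector MulAdd fast path above, the reverse). The two can differ in the last bit; a standalone C++ sketch, assuming the compiler is not contracting a*b + c into an FMA on its own:

    #include <cmath>
    #include <cstdio>

    int main() {
        const double a = 1.0 + 0x1p-52;
        const double b = 1.0 - 0x1p-52;
        const double c = -1.0;
        // Fused: a*b + c is rounded once, giving exactly -0x1p-104.
        std::printf("fused:   %a\n", std::fma(a, b, c));
        // Unfused: a*b first rounds to 1.0, so the sum collapses to 0.
        std::printf("unfused: %a\n", a * b + c);
    }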

View File

@@ -27,7 +27,7 @@ bool TranslatorVisitor::CCMP_reg(bool sf, Reg Rm, Cond cond, Reg Rn, Imm<4> nzcv
const IR::U32U64 operand1 = X(datasize, Rn);
const IR::U32U64 operand2 = X(datasize, Rm);
- const IR::NZCV then_flags = ir.NZCVFrom(ir.AddWithCarry(operand1, ir.Not(operand2), ir.Imm1(1)));
+ const IR::NZCV then_flags = ir.NZCVFrom(ir.SubWithCarry(operand1, operand2, ir.Imm1(1)));
const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags));
ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags));
return true;
@@ -53,7 +53,7 @@ bool TranslatorVisitor::CCMP_imm(bool sf, Imm<5> imm5, Cond cond, Reg Rn, Imm<4>
const IR::U32U64 operand1 = X(datasize, Rn);
const IR::U32U64 operand2 = I(datasize, imm5.ZeroExtend<u32>());
- const IR::NZCV then_flags = ir.NZCVFrom(ir.AddWithCarry(operand1, ir.Not(operand2), ir.Imm1(1)));
+ const IR::NZCV then_flags = ir.NZCVFrom(ir.SubWithCarry(operand1, operand2, ir.Imm1(1)));
const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags));
ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags));
return true;
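Both hunks swap AddWithCarry(operand1, ir.Not(operand2), ir.Imm1(1)) for the equivalent SubWithCarry(operand1, operand2, ir.Imm1(1)); each encodes the compare Rn - operand2, since a - b = a + ~b + 1. As a reference for what CCMP then computes, a simplified standalone sketch (assumed semantics; NZCV packed as bits 3..0):

    #include <cstdint>

    // If the condition holds, NZCV comes from the compare rn - op2;
    // otherwise the 4-bit immediate is used verbatim.
    std::uint32_t ccmp(std::uint64_t rn, std::uint64_t op2, bool cond_holds, std::uint32_t nzcv_imm) {
        if (!cond_holds) {
            return nzcv_imm;
        }
        const std::uint64_t result = rn - op2;
        std::uint32_t flags = 0;
        if (result >> 63) flags |= 0b1000;          // N: negative
        if (result == 0)  flags |= 0b0100;          // Z: zero
        if (rn >= op2)    flags |= 0b0010;          // C: no borrow occurred
        const bool sn = (rn >> 63) != 0;
        const bool so = (op2 >> 63) != 0;
        const bool sr = (result >> 63) != 0;
        if (sn != so && sr != sn) flags |= 0b0001;  // V: signed overflow
        return flags;
    }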