early-access version 1279
This commit is contained in:
@@ -1080,29 +1080,40 @@ static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit
|
||||
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
auto& carry_in = args[2];
|
||||
const bool is_cmp = inst->UseCount() == size_t(!!carry_inst + !!overflow_inst + !!nzcv_inst) && carry_in.IsImmediate() && carry_in.GetImmediateU1();
|
||||
|
||||
const Xbyak::Reg64 nzcv = DoNZCV(code, ctx.reg_alloc, nzcv_inst);
|
||||
const Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(bitsize);
|
||||
const Xbyak::Reg result = (is_cmp ? ctx.reg_alloc.UseGpr(args[0]) : ctx.reg_alloc.UseScratchGpr(args[0])).changeBit(bitsize);
|
||||
const Xbyak::Reg8 carry = DoCarry(ctx.reg_alloc, carry_in, carry_inst);
|
||||
const Xbyak::Reg8 overflow = overflow_inst ? ctx.reg_alloc.ScratchGpr().cvt8() : Xbyak::Reg8{-1};
|
||||
|
||||
// TODO: Consider using LEA.
|
||||
// TODO: Optimize CMP case.
|
||||
// Note that x64 CF is inverse of what the ARM carry flag is here.
|
||||
|
||||
if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
|
||||
bool invert_output_carry = true;
|
||||
|
||||
if (is_cmp) {
|
||||
if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
|
||||
const u32 op_arg = args[1].GetImmediateU32();
|
||||
code.cmp(result, op_arg);
|
||||
} else {
|
||||
OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
|
||||
op_arg.setBit(bitsize);
|
||||
code.cmp(result, *op_arg);
|
||||
}
|
||||
} else if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
|
||||
const u32 op_arg = args[1].GetImmediateU32();
|
||||
if (carry_in.IsImmediate()) {
|
||||
if (carry_in.GetImmediateU1()) {
|
||||
code.sub(result, op_arg);
|
||||
} else {
|
||||
code.stc();
|
||||
code.sbb(result, op_arg);
|
||||
code.add(result, ~op_arg);
|
||||
invert_output_carry = false;
|
||||
}
|
||||
} else {
|
||||
code.bt(carry.cvt32(), 0);
|
||||
code.cmc();
|
||||
code.sbb(result, op_arg);
|
||||
code.adc(result, ~op_arg);
|
||||
invert_output_carry = false;
|
||||
}
|
||||
} else {
|
||||
OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
|
||||
@@ -1122,14 +1133,20 @@ static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit
|
||||
}
|
||||
|
||||
if (nzcv_inst) {
|
||||
code.cmc();
|
||||
if (invert_output_carry) {
|
||||
code.cmc();
|
||||
}
|
||||
code.lahf();
|
||||
code.seto(code.al);
|
||||
ctx.reg_alloc.DefineValue(nzcv_inst, nzcv);
|
||||
ctx.EraseInstruction(nzcv_inst);
|
||||
}
|
||||
if (carry_inst) {
|
||||
code.setnc(carry);
|
||||
if (invert_output_carry) {
|
||||
code.setnc(carry);
|
||||
} else {
|
||||
code.setc(carry);
|
||||
}
|
||||
ctx.reg_alloc.DefineValue(carry_inst, carry);
|
||||
ctx.EraseInstruction(carry_inst);
|
||||
}
|
||||
@@ -1138,8 +1155,9 @@ static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit
|
||||
ctx.reg_alloc.DefineValue(overflow_inst, overflow);
|
||||
ctx.EraseInstruction(overflow_inst);
|
||||
}
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
if (!is_cmp) {
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
}
|
||||
}
|
||||
|
||||
void EmitX64::EmitSub32(EmitContext& ctx, IR::Inst* inst) {
|
||||
|
@@ -257,7 +257,7 @@ void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
||||
|
||||
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
|
||||
if (!ctx.FPCR().DN()) {
|
||||
if (!ctx.FPCR().DN() && !ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
end = ProcessNaN<fsize>(code, result);
|
||||
}
|
||||
if constexpr (std::is_member_function_pointer_v<Function>) {
|
||||
@@ -265,7 +265,9 @@ void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
|
||||
} else {
|
||||
fn(result);
|
||||
}
|
||||
if (ctx.FPCR().DN()) {
|
||||
if (ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
// Do nothing
|
||||
} else if (ctx.FPCR().DN()) {
|
||||
ForceToDefaultNaN<fsize>(code, result);
|
||||
} else {
|
||||
PostProcessNaN<fsize>(code, result, ctx.reg_alloc.ScratchXmm());
|
||||
@@ -281,6 +283,20 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)
|
||||
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
if (ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||
|
||||
if constexpr (std::is_member_function_pointer_v<Function>) {
|
||||
(code.*fn)(result, operand);
|
||||
} else {
|
||||
fn(result, operand);
|
||||
}
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ctx.FPCR().DN()) {
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||
@@ -590,9 +606,20 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
using FPT = mp::unsigned_integer_of_size<fsize>;
|
||||
|
||||
if constexpr (fsize != 16) {
|
||||
if (code.HasFMA()) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
if (code.HasFMA() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||
const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
|
||||
|
||||
FCODE(vfmadd231s)(result, operand2, operand3);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
return;
|
||||
}
|
||||
|
||||
if (code.HasFMA()) {
|
||||
Xbyak::Label end, fallback;
|
||||
|
||||
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
|
||||
@@ -641,8 +668,6 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
}
|
||||
|
||||
if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||
const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
|
||||
@@ -810,6 +835,22 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
|
||||
using FPT = mp::unsigned_integer_of_size<fsize>;
|
||||
|
||||
if constexpr (fsize != 16) {
|
||||
if (code.HasFMA() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
Xbyak::Label end, fallback;
|
||||
|
||||
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
|
||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
|
||||
code.movaps(result, code.MConst(xword, FP::FPValue<FPT, false, 0, 2>()));
|
||||
FCODE(vfnmadd231s)(result, operand1, operand2);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
return;
|
||||
}
|
||||
|
||||
if (code.HasFMA()) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
@@ -998,6 +1039,21 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
|
||||
using FPT = mp::unsigned_integer_of_size<fsize>;
|
||||
|
||||
if constexpr (fsize != 16) {
|
||||
if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
|
||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
|
||||
code.vmovaps(result, code.MConst(xword, FP::FPValue<FPT, false, 0, 3>()));
|
||||
FCODE(vfnmadd231s)(result, operand1, operand2);
|
||||
FCODE(vmuls)(result, result, code.MConst(xword, FP::FPValue<FPT, false, -1, 1>()));
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
return;
|
||||
}
|
||||
|
||||
if (code.HasFMA() && code.HasAVX()) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
|
@@ -290,7 +290,7 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const bool fpcr_controlled = args[fpcr_controlled_arg_index].GetImmediateU1();
|
||||
|
||||
if (ctx.FPCR(fpcr_controlled).DN()) {
|
||||
if (ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
Xbyak::Xmm result;
|
||||
|
||||
if constexpr (std::is_member_function_pointer_v<Function>) {
|
||||
@@ -306,7 +306,9 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
|
||||
});
|
||||
}
|
||||
|
||||
ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
|
||||
if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
|
||||
}
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
return;
|
||||
@@ -342,7 +344,7 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const bool fpcr_controlled = args[2].GetImmediateU1();
|
||||
|
||||
if (ctx.FPCR(fpcr_controlled).DN()) {
|
||||
if (ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
||||
|
||||
@@ -356,7 +358,9 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
|
||||
});
|
||||
}
|
||||
|
||||
ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), xmm_a);
|
||||
if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), xmm_a);
|
||||
}
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, xmm_a);
|
||||
return;
|
||||
@@ -985,11 +989,23 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
};
|
||||
|
||||
if constexpr (fsize != 16) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const bool fpcr_controlled = args[3].GetImmediateU1();
|
||||
|
||||
if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
||||
const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
|
||||
|
||||
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
|
||||
FCODE(vfmadd231p)(result, xmm_b, xmm_c);
|
||||
});
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
return;
|
||||
}
|
||||
|
||||
if (code.HasFMA() && code.HasAVX()) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const bool fpcr_controlled = args[3].GetImmediateU1();
|
||||
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
|
||||
const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
|
||||
@@ -1025,8 +1041,6 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
}
|
||||
|
||||
if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
|
||||
const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
|
||||
@@ -1233,10 +1247,24 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
|
||||
};
|
||||
|
||||
if constexpr (fsize != 16) {
|
||||
if (code.HasFMA() && code.HasAVX()) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const bool fpcr_controlled = args[2].GetImmediateU1();
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const bool fpcr_controlled = args[2].GetImmediateU1();
|
||||
|
||||
if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
|
||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||
|
||||
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
|
||||
code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code));
|
||||
FCODE(vfnmadd231p)(result, operand1, operand2);
|
||||
});
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
return;
|
||||
}
|
||||
|
||||
if (code.HasFMA() && code.HasAVX()) {
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
|
||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||
@@ -1269,8 +1297,6 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
|
||||
}
|
||||
|
||||
if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
@@ -1428,10 +1454,25 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
|
||||
};
|
||||
|
||||
if constexpr (fsize != 16) {
|
||||
if (code.HasFMA() && code.HasAVX()) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const bool fpcr_controlled = args[2].GetImmediateU1();
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const bool fpcr_controlled = args[2].GetImmediateU1();
|
||||
|
||||
if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
|
||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||
|
||||
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
|
||||
code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code));
|
||||
FCODE(vfnmadd231p)(result, operand1, operand2);
|
||||
FCODE(vmulp)(result, result, GetVectorOf<fsize, false, -1, 1>(code));
|
||||
});
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
return;
|
||||
}
|
||||
|
||||
if (code.HasFMA() && code.HasAVX()) {
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
|
||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||
@@ -1470,8 +1511,6 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
|
||||
}
|
||||
|
||||
if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
|
@@ -27,7 +27,7 @@ bool TranslatorVisitor::CCMP_reg(bool sf, Reg Rm, Cond cond, Reg Rn, Imm<4> nzcv
|
||||
const IR::U32U64 operand1 = X(datasize, Rn);
|
||||
const IR::U32U64 operand2 = X(datasize, Rm);
|
||||
|
||||
const IR::NZCV then_flags = ir.NZCVFrom(ir.AddWithCarry(operand1, ir.Not(operand2), ir.Imm1(1)));
|
||||
const IR::NZCV then_flags = ir.NZCVFrom(ir.SubWithCarry(operand1, operand2, ir.Imm1(1)));
|
||||
const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags));
|
||||
ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags));
|
||||
return true;
|
||||
@@ -53,7 +53,7 @@ bool TranslatorVisitor::CCMP_imm(bool sf, Imm<5> imm5, Cond cond, Reg Rn, Imm<4>
|
||||
const IR::U32U64 operand1 = X(datasize, Rn);
|
||||
const IR::U32U64 operand2 = I(datasize, imm5.ZeroExtend<u32>());
|
||||
|
||||
const IR::NZCV then_flags = ir.NZCVFrom(ir.AddWithCarry(operand1, ir.Not(operand2), ir.Imm1(1)));
|
||||
const IR::NZCV then_flags = ir.NZCVFrom(ir.SubWithCarry(operand1, operand2, ir.Imm1(1)));
|
||||
const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags));
|
||||
ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags));
|
||||
return true;
|
||||
|
Reference in New Issue
Block a user