early-access version 1262

pineappleEA
2020-12-30 00:04:04 +00:00
parent a5ba8b4937
commit ac593731e0
118 changed files with 8331 additions and 4646 deletions

View File

@@ -935,7 +935,11 @@ Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, A32EmitContext& ctx, size_t bit
code.mov(tmp, vaddr.cvt32());
code.shr(tmp, static_cast<int>(page_bits));
code.mov(page, qword[r14 + tmp.cvt64() * sizeof(void*)]);
-code.test(page, page);
+if (ctx.conf.page_table_pointer_mask_bits == 0) {
+    code.test(page, page);
+} else {
+    code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits);
+}
code.jz(abort, code.T_NEAR);
if (ctx.conf.absolute_offset_page_table) {
return page + vaddr;
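The change above replaces a plain null test with a mask when `page_table_pointer_mask_bits` is nonzero: the low bits of a page-table entry are reserved for metadata and must be stripped before the pointer is used. Conveniently, `and_` sets ZF exactly when the remaining pointer bits are zero, so the following `jz` still catches unmapped pages (the 32-bit immediate sign-extends to 64 bits, so the high pointer bits survive the and). A standalone sketch of the intended semantics, not part of the diff (the names and mask width are illustrative):

#include <cstdint>

// Hypothetical scalar model of the masked lookup. MASK_BITS stands in for
// ctx.conf.page_table_pointer_mask_bits; 0 would mean "entries are plain pointers".
constexpr unsigned MASK_BITS = 2;

const std::uint8_t* LookupPage(std::uintptr_t entry) {
    // Mirrors code.and_(page, ~u32(0) << MASK_BITS): clear the tag bits,
    // then treat an all-zero result as "no page mapped".
    const std::uintptr_t page = entry & (~std::uintptr_t{0} << MASK_BITS);
    return page ? reinterpret_cast<const std::uint8_t*>(page) : nullptr;
}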

View File

@@ -815,7 +815,11 @@ Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, A64EmitContext& ctx, size_t bit
code.jnz(abort, code.T_NEAR);
}
code.mov(page, qword[r14 + tmp * sizeof(void*)]);
-code.test(page, page);
+if (ctx.conf.page_table_pointer_mask_bits == 0) {
+    code.test(page, page);
+} else {
+    code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits);
+}
code.jz(abort, code.T_NEAR);
if (ctx.conf.absolute_offset_page_table) {
return page + vaddr;

View File

@@ -384,6 +384,21 @@ bool BlockOfCode::HasAVX512_Skylake() const {
&& DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL);
}
+bool BlockOfCode::HasAVX512_Icelake() const {
+    return DoesCpuSupport(Xbyak::util::Cpu::tAVX512F)
+        && DoesCpuSupport(Xbyak::util::Cpu::tAVX512CD)
+        && DoesCpuSupport(Xbyak::util::Cpu::tAVX512BW)
+        && DoesCpuSupport(Xbyak::util::Cpu::tAVX512DQ)
+        && DoesCpuSupport(Xbyak::util::Cpu::tAVX512VL)
+        && DoesCpuSupport(Xbyak::util::Cpu::tAVX512_VPOPCNTDQ)
+        && DoesCpuSupport(Xbyak::util::Cpu::tAVX512_VNNI)
+        && DoesCpuSupport(Xbyak::util::Cpu::tGFNI)
+        && DoesCpuSupport(Xbyak::util::Cpu::tVAES)
+        && DoesCpuSupport(Xbyak::util::Cpu::tAVX512_VBMI2)
+        && DoesCpuSupport(Xbyak::util::Cpu::tAVX512_BITALG)
+        && DoesCpuSupport(Xbyak::util::Cpu::tVPCLMULQDQ);
+}
bool BlockOfCode::HasAVX512_BITALG() const {
return DoesCpuSupport(Xbyak::util::Cpu::tAVX512_BITALG);
}
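`HasAVX512_Icelake()` gates the GFNI fast paths added below on the whole Ice Lake feature cluster rather than probing GFNI alone at each use site. A minimal sketch of the same detection pattern directly against Xbyak's CPU query (assuming only xbyak_util.h, which this file already uses via `DoesCpuSupport`; trimmed to three bits for brevity):

#include <xbyak/xbyak_util.h>

// Query host features once; aggregate the bits treated here as "Ice Lake class".
inline bool HostIsIcelakeClass() {
    static const Xbyak::util::Cpu cpu;
    return cpu.has(Xbyak::util::Cpu::tAVX512F)
        && cpu.has(Xbyak::util::Cpu::tGFNI)
        && cpu.has(Xbyak::util::Cpu::tAVX512_VBMI2);
}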

View File

@@ -158,6 +158,7 @@ public:
bool HasFMA() const;
bool HasAVX2() const;
bool HasAVX512_Skylake() const;
+bool HasAVX512_Icelake() const;
bool HasAVX512_BITALG() const;
private:

View File

@@ -291,8 +291,6 @@ void EmitX64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
auto& shift_arg = args[1];
auto& carry_arg = args[2];
-// TODO: Consider using BMI2 instructions like SHLX when arm-in-host flags is implemented.
if (!carry_inst) {
if (shift_arg.IsImmediate()) {
const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
@@ -304,6 +302,18 @@ void EmitX64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
code.xor_(result, result);
}
ctx.reg_alloc.DefineValue(inst, result);
+} else if (code.HasBMI2()) {
+    const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
+    const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
+    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+    const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32();
+    code.shlx(result, operand, shift);
+    code.xor_(zero, zero);
+    code.cmp(shift.cvt8(), 32);
+    code.cmovnb(result, zero);
+    ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
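The BMI2 branches added above exploit `shlx`, which takes the count from an arbitrary register (no forced round-trip through `cl`) and leaves flags untouched. Like `shl`, it masks a 32-bit count to 5 bits, so the ARM requirement that counts of 32 and above produce zero is patched up by the `cmp`/`cmovnb` pair. A scalar model of the semantics being emitted (hypothetical helper, not from the diff):

#include <cstdint>

// ARM LSL semantics on a 32-bit value. x86 SHL/SHLX mask the count by 0x1F,
// so value << 32 would leave value unchanged; ARM instead demands zero.
std::uint32_t ArmLsl32(std::uint32_t value, std::uint8_t shift) {
    if (shift >= 32) {
        return 0;              // cmp shift, 32 ; cmovnb result, zero
    }
    return value << shift;     // shlx result, operand, shift
}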
@@ -398,6 +408,18 @@ void EmitX64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
code.xor_(result.cvt32(), result.cvt32());
}
ctx.reg_alloc.DefineValue(inst, result);
+} else if (code.HasBMI2()) {
+    const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
+    const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
+    const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+    const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
+    code.shlx(result, operand, shift);
+    code.xor_(zero.cvt32(), zero.cvt32());
+    code.cmp(shift.cvt8(), 64);
+    code.cmovnb(result, zero);
+    ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
@@ -405,7 +427,7 @@ void EmitX64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
// The x64 SHL instruction masks the shift count by 0x3F before performing the shift.
-// ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros.
+// ARM differs from the behaviour: It does not mask the count, so shifts above 63 result in zeros.
code.shl(result, code.cl);
code.xor_(zero.cvt32(), zero.cvt32());
@@ -435,6 +457,18 @@ void EmitX64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
code.xor_(result, result);
}
ctx.reg_alloc.DefineValue(inst, result);
+} else if (code.HasBMI2()) {
+    const Xbyak::Reg32 shift = ctx.reg_alloc.UseGpr(shift_arg).cvt32();
+    const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
+    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+    const Xbyak::Reg32 zero = ctx.reg_alloc.ScratchGpr().cvt32();
+    code.shrx(result, operand, shift);
+    code.xor_(zero, zero);
+    code.cmp(shift.cvt8(), 32);
+    code.cmovnb(result, zero);
+    ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
@@ -530,6 +564,18 @@ void EmitX64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
code.xor_(result.cvt32(), result.cvt32());
}
ctx.reg_alloc.DefineValue(inst, result);
+} else if (code.HasBMI2()) {
+    const Xbyak::Reg64 shift = ctx.reg_alloc.UseGpr(shift_arg);
+    const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
+    const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+    const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
+    code.shrx(result, operand, shift);
+    code.xor_(zero.cvt32(), zero.cvt32());
+    code.cmp(shift.cvt8(), 64);
+    code.cmovnb(result, zero);
+    ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
@@ -537,7 +583,7 @@ void EmitX64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
// The x64 SHR instruction masks the shift count by 0x3F before performing the shift.
-// ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros.
+// ARM differs from the behaviour: It does not mask the count, so shifts above 63 result in zeros.
code.shr(result, code.cl);
code.xor_(zero.cvt32(), zero.cvt32());
@@ -563,6 +609,22 @@ void EmitX64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
code.sar(result, u8(shift < 31 ? shift : 31));
ctx.reg_alloc.DefineValue(inst, result);
+} else if (code.HasBMI2()) {
+    const Xbyak::Reg32 shift = ctx.reg_alloc.UseScratchGpr(shift_arg).cvt32();
+    const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
+    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+    const Xbyak::Reg32 const31 = ctx.reg_alloc.ScratchGpr().cvt32();
+    // The 32-bit x64 SAR instruction masks the shift count by 0x1F before performing the shift.
+    // ARM differs from the behaviour: It does not mask the count.
+    // We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31.
+    code.mov(const31, 31);
+    code.cmp(shift.cvt8(), 31);
+    code.cmovnb(shift, const31);
+    code.sarx(result, operand, shift);
+    ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
@@ -574,9 +636,8 @@ void EmitX64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
// We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31.
code.mov(const31, 31);
code.movzx(code.ecx, code.cl);
-code.cmp(code.ecx, u32(31));
-code.cmovg(code.ecx, const31);
+code.cmp(code.cl, u32(31));
+code.cmova(code.ecx, const31);
code.sar(result, code.cl);
ctx.reg_alloc.DefineValue(inst, result);
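Arithmetic shifts differ from the logical ones above: every count past 31 behaves exactly like 31 (the result is all copies of the sign bit), so both new paths clamp the count rather than zeroing the result. That is also why the `cl` path's compare/cmov was switched to an unsigned `cmova`: the count register holds an unsigned byte. A scalar model (hypothetical helper):

#include <cstdint>

// ARM ASR semantics on a 32-bit value: saturate the count at 31, then do a
// signed shift (a well-defined arithmetic shift since C++20).
std::uint32_t ArmAsr32(std::uint32_t value, std::uint8_t shift) {
    const std::uint8_t clamped = shift < 31 ? shift : 31;   // mov const31 / cmp / cmov
    return static_cast<std::uint32_t>(static_cast<std::int32_t>(value) >> clamped);
}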
@@ -647,6 +708,18 @@ void EmitX64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
code.sar(result, u8(shift < 63 ? shift : 63));
ctx.reg_alloc.DefineValue(inst, result);
+} else if (code.HasBMI2()) {
+    const Xbyak::Reg64 shift = ctx.reg_alloc.UseScratchGpr(shift_arg);
+    const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
+    const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+    const Xbyak::Reg64 const63 = ctx.reg_alloc.ScratchGpr();
+    code.mov(const63.cvt32(), 63);
+    code.cmp(shift.cvt8(), 63);
+    code.cmovnb(shift, const63);
+    code.sarx(result, operand, shift);
+    ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
@@ -658,8 +731,7 @@ void EmitX64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
// We note that all shift values above 63 have the same behaviour as 63 does, so we saturate `shift` to 63.
code.mov(const63, 63);
code.movzx(code.ecx, code.cl);
-code.cmp(code.ecx, u32(63));
-code.cmovg(code.ecx, const63);
+code.cmp(code.cl, u32(63));
+code.cmova(code.ecx, const63);
code.sar(result, code.cl);
@@ -676,7 +748,15 @@ void EmitX64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) {
auto& carry_arg = args[2];
if (!carry_inst) {
-if (shift_arg.IsImmediate()) {
+if (shift_arg.IsImmediate() && code.HasBMI2()) {
+    const u8 shift = shift_arg.GetImmediateU8();
+    const Xbyak::Reg32 operand = ctx.reg_alloc.UseGpr(operand_arg).cvt32();
+    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+    code.rorx(result, operand, shift);
+    ctx.reg_alloc.DefineValue(inst, result);
+} else if (shift_arg.IsImmediate()) {
const u8 shift = shift_arg.GetImmediateU8();
const Xbyak::Reg32 result = ctx.reg_alloc.UseScratchGpr(operand_arg).cvt32();
@@ -751,7 +831,15 @@ void EmitX64::EmitRotateRight64(EmitContext& ctx, IR::Inst* inst) {
auto& operand_arg = args[0];
auto& shift_arg = args[1];
-if (shift_arg.IsImmediate()) {
+if (shift_arg.IsImmediate() && code.HasBMI2()) {
+    const u8 shift = shift_arg.GetImmediateU8();
+    const Xbyak::Reg64 operand = ctx.reg_alloc.UseGpr(operand_arg);
+    const Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+    code.rorx(result, operand, shift);
+    ctx.reg_alloc.DefineValue(inst, result);
+} else if (shift_arg.IsImmediate()) {
const u8 shift = shift_arg.GetImmediateU8();
const Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
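`rorx` rotates by an immediate through a non-destructive three-operand form and never touches flags, which is why the immediate case can now keep the operand in a plain `UseGpr` instead of a scratch copy. No count fixup is needed because a rotate is total in its count (taken modulo the width). Scalar model (hypothetical helper; C++20's std::rotr is the library equivalent):

#include <cstdint>

// ROR on a 32-bit value; counts are taken modulo 32, matching rorx.
std::uint32_t Ror32(std::uint32_t value, unsigned amount) {
    amount %= 32;
    return amount == 0 ? value
                       : (value >> amount) | (value << (32 - amount));
}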

View File

@@ -439,6 +439,14 @@ void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
}
static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) {
+if (code.HasAVX512_Icelake()) {
+    // Do a logical shift-right of the bits via the 8x8 bit-matrix, but shift
+    // `0x80` rows in so the vacated positions repeat the most significant bit.
+    const u64 zero_extend = ~(0xFFFFFFFFFFFFFFFF << (shift_amount * 8)) & 0x8080808080808080;
+    const u64 shift_matrix = (0x0102040810204080 << (shift_amount * 8)) | zero_extend;
+    code.vgf2p8affineqb(result, result, code.MConst(xword_b, shift_matrix), 0);
+    return;
+}
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
code.punpckhbw(tmp, result);
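The GFNI paths in this file all lean on one fact: `vgf2p8affineqb` multiplies every byte of the source, viewed as a vector of 8 bits over GF(2), by an 8x8 bit-matrix packed into a 64-bit constant (AND for multiply, XOR, i.e. parity, for add), so any fixed per-byte bit permutation or shift is a single instruction. The arithmetic variant above additionally ORs `0x80` rows into the vacated bytes so the emptied destination bits replicate the sign bit. A scalar model of one byte lane with imm8 = 0, following Intel's row ordering (row j, counted from the least significant byte of the matrix, produces destination bit 7 - j):

#include <bit>
#include <cstdint>

// One byte lane of GF2P8AFFINEQB with imm8 = 0.
std::uint8_t Gf2p8Affine(std::uint8_t x, std::uint64_t matrix) {
    std::uint8_t result = 0;
    for (int j = 0; j < 8; ++j) {
        const auto row = static_cast<std::uint8_t>(matrix >> (8 * j));
        const int bit = std::popcount(static_cast<unsigned>(row & x)) & 1;  // GF(2) dot product
        result |= static_cast<std::uint8_t>(bit << (7 - j));
    }
    return result;
}
// matrix == 0x0102040810204080 is the identity; shifting that constant by
// whole bytes shifts every bit of x, which is exactly what these hunks use.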
@@ -1460,11 +1468,17 @@ void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
if (shift_amount == 1) {
code.paddb(result, result);
} else if (shift_amount > 0) {
-const u64 replicand = (0xFFULL << shift_amount) & 0xFF;
-const u64 mask = Common::Replicate(replicand, Common::BitSize<u8>());
+if (code.HasAVX512_Icelake()) {
+    // Galois 8x8 identity matrix, bit-shifted by the shift-amount
+    const u64 shift_matrix = 0x0102040810204080 >> (shift_amount * 8);
+    code.vgf2p8affineqb(result, result, code.MConst(xword_b, shift_matrix), 0);
+} else {
+    const u64 replicand = (0xFFULL << shift_amount) & 0xFF;
+    const u64 mask = Common::Replicate(replicand, Common::BitSize<u8>());
-code.psllw(result, shift_amount);
-code.pand(result, code.MConst(xword, mask, mask));
+    code.psllw(result, shift_amount);
+    code.pand(result, code.MConst(xword, mask, mask));
+}
}
ctx.reg_alloc.DefineValue(inst, result);
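On the pre-Ice-Lake path there is no byte-granular SIMD shift, so the code shifts 16-bit lanes and masks away the bits that crossed a byte boundary; `Common::Replicate` tiles the per-byte mask across the qword. A sketch of the mask computation (the `ReplicateByte` helper is a hypothetical stand-in for `Common::Replicate`):

#include <cstdint>

// Stand-in for Common::Replicate(value, 8): tile one byte across a qword.
constexpr std::uint64_t ReplicateByte(std::uint8_t pattern) {
    return 0x0101010101010101ULL * pattern;
}

// Example, shift_amount = 3: psllw shifts each 16-bit lane, then the mask
// (0xFF << 3) & 0xFF = 0xF8 clears the bits that leaked in from the byte below.
constexpr std::uint64_t mask = ReplicateByte((0xFFu << 3) & 0xFFu);
static_assert(mask == 0xF8F8F8F8F8F8F8F8ULL);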
@@ -1510,11 +1524,17 @@ void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
const u8 shift_amount = args[1].GetImmediateU8();
if (shift_amount > 0) {
-const u64 replicand = 0xFEULL >> shift_amount;
-const u64 mask = Common::Replicate(replicand, Common::BitSize<u8>());
+if (code.HasAVX512_Icelake()) {
+    // Galois 8x8 identity matrix, bit-shifted by the shift-amount
+    const u64 shift_matrix = 0x0102040810204080 << (shift_amount * 8);
+    code.vgf2p8affineqb(result, result, code.MConst(xword_b, shift_matrix), 0);
+} else {
+    const u64 replicand = 0xFEULL >> shift_amount;
+    const u64 mask = Common::Replicate(replicand, Common::BitSize<u8>());
-code.psrlw(result, shift_amount);
-code.pand(result, code.MConst(xword, mask, mask));
+    code.psrlw(result, shift_amount);
+    code.pand(result, code.MConst(xword, mask, mask));
+}
}
ctx.reg_alloc.DefineValue(inst, result);
@@ -2747,40 +2767,48 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
-const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm();
-code.movdqa(high_nibble_reg, code.MConst(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
-code.pand(high_nibble_reg, data);
-code.pxor(data, high_nibble_reg);
-code.psrld(high_nibble_reg, 4);
-if (code.HasSSSE3()) {
-    // High lookup
-    const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm();
-    code.movdqa(high_reversed_reg, code.MConst(xword, 0xE060A020C0408000, 0xF070B030D0509010));
-    code.pshufb(high_reversed_reg, data);
-    // Low lookup (low nibble equivalent of the above)
-    code.movdqa(data, code.MConst(xword, 0x0E060A020C040800, 0x0F070B030D050901));
-    code.pshufb(data, high_nibble_reg);
-    code.por(data, high_reversed_reg);
+if (code.HasAVX512_Icelake() && code.HasSSSE3()) {
+    // GFNI (vgf2p8affineqb) and SSSE3 (pshufb)
+    // Reverse bits within bytes
+    code.vgf2p8affineqb(data, data, code.MConst(xword_b, 0x8040201008040201), 0);
+    // Reverse bytes within vector
+    code.pshufb(data, code.MConst(xword, 0x0001020304050607, 0x08090a0b0c0d0e0f));
} else {
-    code.pslld(data, 4);
-    code.por(data, high_nibble_reg);
-    code.movdqa(high_nibble_reg, code.MConst(xword, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC));
+    const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm();
+    code.movdqa(high_nibble_reg, code.MConst(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
+    code.pand(high_nibble_reg, data);
+    code.pxor(data, high_nibble_reg);
-    code.psrld(high_nibble_reg, 2);
-    code.pslld(data, 2);
-    code.por(data, high_nibble_reg);
+    code.psrld(high_nibble_reg, 4);
-    code.movdqa(high_nibble_reg, code.MConst(xword, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA));
-    code.pand(high_nibble_reg, data);
-    code.pxor(data, high_nibble_reg);
-    code.psrld(high_nibble_reg, 1);
-    code.paddd(data, data);
-    code.por(data, high_nibble_reg);
+    if (code.HasSSSE3()) {
+        // High lookup
+        const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm();
+        code.movdqa(high_reversed_reg, code.MConst(xword, 0xE060A020C0408000, 0xF070B030D0509010));
+        code.pshufb(high_reversed_reg, data);
+        // Low lookup (low nibble equivalent of the above)
+        code.movdqa(data, code.MConst(xword, 0x0E060A020C040800, 0x0F070B030D050901));
+        code.pshufb(data, high_nibble_reg);
+        code.por(data, high_reversed_reg);
+    } else {
+        code.pslld(data, 4);
+        code.por(data, high_nibble_reg);
+        code.movdqa(high_nibble_reg, code.MConst(xword, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC));
+        code.pand(high_nibble_reg, data);
+        code.pxor(data, high_nibble_reg);
+        code.psrld(high_nibble_reg, 2);
+        code.pslld(data, 2);
+        code.por(data, high_nibble_reg);
+        code.movdqa(high_nibble_reg, code.MConst(xword, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA));
+        code.pand(high_nibble_reg, data);
+        code.pxor(data, high_nibble_reg);
+        code.psrld(high_nibble_reg, 1);
+        code.paddd(data, data);
+        code.por(data, high_nibble_reg);
+    }
}
}
ctx.reg_alloc.DefineValue(inst, data);
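The fast path factors full bit reversal into two halves: the affine multiply with the anti-diagonal matrix 0x8040201008040201 reverses the bits inside every byte, and `pshufb` with a descending index vector then reverses the byte order. Using the `Gf2p8Affine` model sketched earlier, the matrix constant can be checked against a plain loop:

#include <cstdint>

// Reference bit reversal for one byte.
std::uint8_t ReverseBits8(std::uint8_t x) {
    std::uint8_t r = 0;
    for (int i = 0; i < 8; ++i) {
        r = static_cast<std::uint8_t>(r | (((x >> i) & 1u) << (7 - i)));
    }
    return r;
}

// For every byte b: Gf2p8Affine(b, 0x8040201008040201) == ReverseBits8(b).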

View File

@@ -35,6 +35,7 @@ struct CodeBlockInfo {
class SigHandler {
public:
SigHandler();
+~SigHandler();
void AddCodeBlock(CodeBlockInfo info);
void RemoveCodeBlock(u64 rip);
@@ -48,6 +49,8 @@ private:
bool supports_fast_mem = true;
+void* signal_stack_memory = nullptr;
std::vector<CodeBlockInfo> code_block_infos;
std::mutex code_block_infos_mutex;
@@ -62,8 +65,10 @@ SigHandler sig_handler;
SigHandler::SigHandler() {
constexpr size_t signal_stack_size = std::max(SIGSTKSZ, 2 * 1024 * 1024);
+signal_stack_memory = std::malloc(signal_stack_size);
stack_t signal_stack;
-signal_stack.ss_sp = std::malloc(signal_stack_size);
+signal_stack.ss_sp = signal_stack_memory;
signal_stack.ss_size = signal_stack_size;
signal_stack.ss_flags = 0;
if (sigaltstack(&signal_stack, nullptr) != 0) {
@@ -91,6 +96,10 @@ SigHandler::SigHandler() {
#endif
}
+SigHandler::~SigHandler() {
+    std::free(signal_stack_memory);
+}
void SigHandler::AddCodeBlock(CodeBlockInfo cbi) {
std::lock_guard<std::mutex> guard(code_block_infos_mutex);
if (auto iter = FindCodeBlockInfo(cbi.code_begin); iter != code_block_infos.end()) {
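The new member plus destructor fix a small leak: the malloc'd alternate signal stack used to live only in the local `stack_t`, so nothing could free it; since `sig_handler` is a static object, the memory was held for the whole process lifetime and is now released at shutdown. A minimal sketch of the ownership pattern (standalone, POSIX only; production code would also install SS_DISABLE before freeing):

#include <algorithm>
#include <csignal>
#include <cstddef>
#include <cstdlib>

// Own the alternate signal stack so teardown can release it.
class AltStack {
public:
    AltStack() {
        const std::size_t size = std::max<std::size_t>(SIGSTKSZ, 2 * 1024 * 1024);
        memory = std::malloc(size);
        stack_t ss{};
        ss.ss_sp = memory;      // the kept member is what makes ~AltStack possible
        ss.ss_size = size;
        ss.ss_flags = 0;
        sigaltstack(&ss, nullptr);
    }
    ~AltStack() {
        std::free(memory);
    }
private:
    void* memory = nullptr;
};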

View File

@@ -55,7 +55,7 @@ public:
bool SingleStepping() const { return single_stepping; }
bool operator == (const LocationDescriptor& o) const {
-return std::tie(arm_pc, cpsr, fpscr, single_stepping) == std::tie(o.arm_pc, o.cpsr, o.fpscr, single_stepping);
+return std::tie(arm_pc, cpsr, fpscr, single_stepping) == std::tie(o.arm_pc, o.cpsr, o.fpscr, o.single_stepping);
}
bool operator != (const LocationDescriptor& o) const {

View File

@@ -45,7 +45,7 @@ public:
bool SingleStepping() const { return single_stepping; }
bool operator == (const LocationDescriptor& o) const {
-return std::tie(pc, fpcr, single_stepping) == std::tie(o.pc, o.fpcr, single_stepping);
+return std::tie(pc, fpcr, single_stepping) == std::tie(o.pc, o.fpcr, o.single_stepping);
}
bool operator != (const LocationDescriptor& o) const {
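Both LocationDescriptor fixes correct the same copy-paste slip: the right-hand `std::tie` captured the object's own `single_stepping` instead of `o.single_stepping`, and a value always compares equal to itself, so two descriptors differing only in single-stepping state compared as equal. In miniature (hypothetical struct):

#include <tuple>

struct Desc {
    int pc;
    bool single_stepping;

    bool BuggyEq(const Desc& o) const {
        // Last element compares single_stepping with itself: always true.
        return std::tie(pc, single_stepping) == std::tie(o.pc, single_stepping);
    }
    bool FixedEq(const Desc& o) const {
        return std::tie(pc, single_stepping) == std::tie(o.pc, o.single_stepping);
    }
};

// Desc a{0x1000, false}, b{0x1000, true};
// a.BuggyEq(b) -> true (wrong), a.FixedEq(b) -> false (right).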

View File

@@ -63,15 +63,12 @@ private:
*/
template<size_t N>
static auto GetArgInfo(const char* const bitstring) {
-const auto one = static_cast<opcode_type>(1);
std::array<opcode_type, N> masks = {};
std::array<size_t, N> shifts = {};
size_t arg_index = 0;
char ch = 0;
for (size_t i = 0; i < opcode_bitsize; i++) {
-const size_t bit_position = opcode_bitsize - i - 1;
if (bitstring[i] == '0' || bitstring[i] == '1' || bitstring[i] == '-') {
if (ch != 0) {
ch = 0;
@@ -85,9 +82,15 @@ private:
arg_index++;
}
-ASSERT(arg_index < N);
-masks[arg_index] |= one << bit_position;
-shifts[arg_index] = bit_position;
+if constexpr (N > 0) {
+    const size_t bit_position = opcode_bitsize - i - 1;
+    ASSERT(arg_index < N);
+    masks[arg_index] |= static_cast<opcode_type>(1) << bit_position;
+    shifts[arg_index] = bit_position;
+} else {
+    ASSERT_FALSE();
+}
}
}
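For context on what `GetArgInfo` computes: each distinct letter in a decoder bitstring names one argument; the loop ORs a bit into that argument's mask for every occurrence and records the lowest bit position as its shift, so an argument is later recovered as `(opcode & mask) >> shift`. The `if constexpr (N > 0)` guard keeps the array stores out of instantiations with zero arguments, where `masks`/`shifts` are zero-length and any indexing would be ill-formed. A worked miniature (hypothetical 8-bit pattern):

#include <cstdint>

// For the 8-bit bitstring "01vvv0ss" (made up for illustration), the loop
// would produce:
//   masks  = { 0b0011'1000 /* v */, 0b0000'0011 /* s */ }
//   shifts = { 3, 0 }
constexpr std::uint8_t ExtractV(std::uint8_t opcode) {
    return static_cast<std::uint8_t>((opcode & 0b0011'1000) >> 3);
}

// ExtractV(0b0110'1001) == 0b101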