early-access version 2126

This commit is contained in:
pineappleEA
2021-10-12 22:47:55 +02:00
parent c2a51b4c36
commit 925a78d02c
20 changed files with 316 additions and 49 deletions

View File

@@ -364,6 +364,10 @@ include(CreateDirectoryGroups)
create_target_directory_groups(dynarmic)
target_include_directories(dynarmic PUBLIC ..)
set_target_properties(dynarmic PROPERTIES
VERSION ${dynarmic_VERSION}
SOVERSION ${dynarmic_VERSION_MAJOR}
)
target_compile_options(dynarmic PRIVATE ${DYNARMIC_CXX_FLAGS})
target_link_libraries(dynarmic
PUBLIC

View File

@@ -318,8 +318,12 @@ void Jit::LoadContext(const Context& ctx) {
}
// Debugging helper: prints a disassembly of the emitted host code to the console.
// The byte count handed to the disassembler is the distance between
// block_of_code.GetCodeBegin() and block_of_code.getCurr().
void Jit::DumpDisassembly() const {
// NOTE(review): this excerpt looks like a rendered diff, so the next two lines appear to be
// the removed (C-style cast) and added (reinterpret_cast) forms of one statement -- confirm
// against the real file; only one of them can exist in compilable code.
const size_t size = (const char*)impl->block_of_code.getCurr() - (const char*)impl->block_of_code.GetCodeBegin();
const size_t size = reinterpret_cast<const char*>(impl->block_of_code.getCurr()) - reinterpret_cast<const char*>(impl->block_of_code.GetCodeBegin());
Common::DumpDisassembledX64(impl->block_of_code.GetCodeBegin(), size);
}
/// Returns a textual disassembly of the emitted host code, one string per instruction.
/// The disassembled range runs from block_of_code.GetCodeBegin() up to block_of_code.getCurr().
std::vector<std::string> Jit::Disassemble() const {
    const auto* const code_begin = reinterpret_cast<const char*>(impl->block_of_code.GetCodeBegin());
    const auto* const code_end = reinterpret_cast<const char*>(impl->block_of_code.getCurr());

    // Number of bytes between the start of the code buffer and the current emit position.
    const size_t size = code_end - code_begin;

    return Common::DisassembleX64(impl->block_of_code.GetCodeBegin(), size);
}
} // namespace Dynarmic::A32

View File

@@ -5,6 +5,7 @@
#include <cstring>
#include <memory>
#include <mutex>
#include <boost/icl/interval_set.hpp>
@@ -57,6 +58,8 @@ public:
void Run() {
ASSERT(!is_executing);
PerformRequestedCacheInvalidation();
is_executing = true;
SCOPE_EXIT { this->is_executing = false; };
jit_state.halt_requested = false;
@@ -80,6 +83,8 @@ public:
void Step() {
ASSERT(!is_executing);
PerformRequestedCacheInvalidation();
is_executing = true;
SCOPE_EXIT { this->is_executing = false; };
jit_state.halt_requested = true;
@@ -90,15 +95,21 @@ public:
}
/// Flags the whole code cache as stale and requests an invalidation.
/// The flag is consumed later by PerformRequestedCacheInvalidation(), which takes the same mutex.
void ClearCache() {
    // Held for the whole function: the pending-invalidation state is guarded by this mutex.
    std::unique_lock guard{invalidation_mutex};

    invalidate_entire_cache = true;
    RequestCacheInvalidation();

    // Only a JIT that is currently executing is asked to halt.
    if (!is_executing) {
        return;
    }
    jit_state.halt_requested = true;
}
/// Queues the closed address range [start_address, start_address + length - 1] for
/// invalidation and requests that the invalidation be carried out.
/// @param start_address First address of the range.
/// @param length        Size of the range in bytes.
void InvalidateCacheRange(u64 start_address, size_t length) {
    // Held for the whole function: the pending-invalidation state is guarded by this mutex.
    std::unique_lock guard{invalidation_mutex};

    // The interval is closed, hence the -1 to obtain the last address it covers.
    const u64 last_address = static_cast<u64>(start_address + length - 1);
    invalid_cache_ranges.add(boost::icl::discrete_interval<u64>::closed(start_address, last_address));

    RequestCacheInvalidation();

    // Only a JIT that is currently executing is asked to halt.
    if (!is_executing) {
        return;
    }
    jit_state.halt_requested = true;
}
void Reset() {
@@ -200,10 +211,15 @@ public:
}
// Debugging helper: passes the emitted host code to Common::DumpDisassembledX64.
// The byte count is the distance between block_of_code.GetCodeBegin() and
// block_of_code.getCurr().
void DumpDisassembly() const {
// NOTE(review): this excerpt looks like a rendered diff, so the next two lines appear to be
// the removed (C-style cast) and added (reinterpret_cast) forms of one statement -- confirm
// against the real file; only one of them can exist in compilable code.
const size_t size = (const char*)block_of_code.getCurr() - (const char*)block_of_code.GetCodeBegin();
const size_t size = reinterpret_cast<const char*>(block_of_code.getCurr()) - reinterpret_cast<const char*>(block_of_code.GetCodeBegin());
Common::DumpDisassembledX64(block_of_code.GetCodeBegin(), size);
}
/// Returns a textual disassembly of the emitted host code, one string per instruction.
/// The disassembled range runs from block_of_code.GetCodeBegin() up to block_of_code.getCurr().
std::vector<std::string> Disassemble() const {
    const auto* const code_begin = reinterpret_cast<const char*>(block_of_code.GetCodeBegin());
    const auto* const code_end = reinterpret_cast<const char*>(block_of_code.getCurr());

    // Number of bytes between the start of the code buffer and the current emit position.
    const size_t size = code_end - code_begin;

    return Common::DisassembleX64(block_of_code.GetCodeBegin(), size);
}
private:
static CodePtr GetCurrentBlockThunk(void* thisptr) {
Jit::Impl* this_ = static_cast<Jit::Impl*>(thisptr);
@@ -263,6 +279,7 @@ private:
}
void PerformRequestedCacheInvalidation() {
std::unique_lock lock{invalidation_mutex};
if (!invalidate_entire_cache && invalid_cache_ranges.empty()) {
return;
}
@@ -287,6 +304,7 @@ private:
bool invalidate_entire_cache = false;
boost::icl::interval_set<u64> invalid_cache_ranges;
std::mutex invalidation_mutex;
};
Jit::Jit(UserConfig conf)
@@ -402,4 +420,8 @@ void Jit::DumpDisassembly() const {
return impl->DumpDisassembly();
}
/// Public entry point: forwards to the implementation object's Disassemble().
std::vector<std::string> Jit::Disassemble() const {
    std::vector<std::string> lines = impl->Disassemble();
    return lines;
}
} // namespace Dynarmic::A64

View File

@@ -2990,6 +2990,98 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(inst, data);
}
// Emits host code that adds together every 8-bit element of the operand vector and
// defines the result as a vector holding the 8-bit sum in its lowest byte, all other
// bytes zero.
void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Xmm acc = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm scratch = xmm0;

    // Swap the two 64-bit halves into scratch and fold them onto acc byte by byte
    code.pshufd(scratch, acc, 0b01'00'11'10);
    code.paddb(acc, scratch);

    // psadbw against an all-zero register sums the bytes of each 64-bit lane
    code.pxor(scratch, scratch);
    code.psadbw(acc, scratch);

    // Discard everything but the lowest byte: shift it to the top, then back down
    code.pslldq(acc, 15);
    code.psrldq(acc, 15);

    ctx.reg_alloc.DefineValue(inst, acc);
}
// Emits host code that adds together every 16-bit element of the operand vector and
// defines the result as a vector holding the 16-bit sum in its lowest element, all
// other elements zero.
void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Xmm acc = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm scratch = xmm0;  // same register the horizontal adds below read from

    if (code.HasHostFeature(HostFeature::SSSE3)) {
        // Three horizontal adds against a zeroed register: 8 -> 4 -> 2 -> 1 partial sums,
        // with zeros filling the upper elements each round.
        code.pxor(scratch, scratch);
        for (int round = 0; round < 3; ++round) {
            code.phaddw(acc, scratch);
        }
    } else {
        // Fold the 32-bit lanes in reversed order onto acc, 16 bits at a time
        code.pshufd(scratch, acc, 0b00'01'10'11);
        code.paddw(acc, scratch);

        // Multiply-add by a vector of 16-bit ones: sums each pair of 16-bit values into a 32-bit lane
        code.movdqa(scratch, code.MConst(xword, 0x0001000100010001, 0x0001000100010001));
        code.pmaddwd(acc, scratch);

        // Add each 32-bit lane to its neighbour
        code.pshufd(scratch, acc, 0b10'11'00'01);
        code.paddd(acc, scratch);

        // Discard everything but the lowest 16 bits: shift them to the top, then back down
        code.pslldq(acc, 14);
        code.psrldq(acc, 14);
    }

    ctx.reg_alloc.DefineValue(inst, acc);
}
// Emits host code that adds together every 32-bit element of the operand vector and
// defines the result as a vector holding the 32-bit sum in its lowest element, all
// other elements zero.
void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Xmm acc = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm scratch = xmm0;

    // Fold the 32-bit lanes, taken in reversed order, onto acc
    code.pshufd(scratch, acc, 0b00'01'10'11);
    code.paddd(acc, scratch);

    // Add each 32-bit lane to its neighbour
    if (code.HasHostFeature(HostFeature::SSSE3)) {
        code.phaddd(acc, acc);
    } else {
        code.pshufd(scratch, acc, 0b10'11'00'01);
        code.paddd(acc, scratch);
    }

    // Move the top lane down to the bottom; the byte shift zero-fills the rest
    code.psrldq(acc, 12);

    ctx.reg_alloc.DefineValue(inst, acc);
}
// Emits host code that adds the two 64-bit elements of the operand vector and defines
// the result as a vector holding the 64-bit sum in its lower half, upper half zero.
void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    const Xbyak::Xmm acc = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm scratch = xmm0;

    // Swap the two 64-bit halves into scratch and add them onto acc
    code.pshufd(scratch, acc, 0b01'00'11'10);
    code.paddq(acc, scratch);

    // movq onto itself keeps the low 64 bits and clears the high 64 bits
    code.movq(acc, acc);

    ctx.reg_alloc.DefineValue(inst, acc);
}
static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);

View File

@@ -64,7 +64,7 @@ private:
SigHandler sig_handler;
SigHandler::SigHandler() {
constexpr size_t signal_stack_size = std::max(SIGSTKSZ, 2 * 1024 * 1024);
const size_t signal_stack_size = std::max<size_t>(SIGSTKSZ, 2 * 1024 * 1024);
signal_stack_memory = std::malloc(signal_stack_size);

View File

@@ -46,7 +46,7 @@ constexpr FPUnpacked ToNormalized(bool sign, int exponent, u64 value) {
const int highest_bit = Common::HighestSetBit(value);
const int offset = static_cast<int>(normalized_point_position) - highest_bit;
value <<= offset;
exponent -= offset - normalized_point_position;
exponent -= offset - static_cast<int>(normalized_point_position);
return {sign, exponent, value};
}

View File

@@ -21,15 +21,36 @@ void DumpDisassembledX64(const void* ptr, size_t size) {
size_t offset = 0;
ZydisDecodedInstruction instruction;
while (ZYAN_SUCCESS(ZydisDecoderDecodeBuffer(&decoder, (const char*)ptr + offset, size - offset, &instruction))) {
while (ZYAN_SUCCESS(ZydisDecoderDecodeBuffer(&decoder, static_cast<const char*>(ptr) + offset, size - offset, &instruction))) {
fmt::print("{:016x} ", (u64)ptr + offset);
char buffer[256];
ZydisFormatterFormatInstruction(&formatter, &instruction, buffer, sizeof(buffer), (u64)ptr + offset);
ZydisFormatterFormatInstruction(&formatter, &instruction, buffer, sizeof(buffer), reinterpret_cast<u64>(ptr) + offset);
puts(buffer);
offset += instruction.length;
}
}
std::vector<std::string> DisassembleX64(const void* ptr, size_t size) {
std::vector<std::string> result;
ZydisDecoder decoder;
ZydisDecoderInit(&decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_ADDRESS_WIDTH_64);
ZydisFormatter formatter;
ZydisFormatterInit(&formatter, ZYDIS_FORMATTER_STYLE_INTEL);
size_t offset = 0;
ZydisDecodedInstruction instruction;
while (ZYAN_SUCCESS(ZydisDecoderDecodeBuffer(&decoder, static_cast<const char*>(ptr) + offset, size - offset, &instruction))) {
char buffer[256];
ZydisFormatterFormatInstruction(&formatter, &instruction, buffer, sizeof(buffer), reinterpret_cast<u64>(ptr) + offset);
result.push_back(fmt::format("{:016x} {}", (u64)ptr + offset, buffer));
offset += instruction.length;
}
return result;
}
} // namespace Dynarmic::Common

View File

@@ -5,10 +5,17 @@
#pragma once
#include <string>
#include <vector>
#include "dynarmic/common/common_types.h"
namespace Dynarmic::Common {
void DumpDisassembledX64(const void* ptr, size_t size);
/**
* Disassemble `size' bytes from `ptr' and return the disassembled lines as a vector
* of strings.
*/
std::vector<std::string> DisassembleX64(const void* ptr, size_t size);
} // namespace Dynarmic::Common

View File

@@ -171,27 +171,10 @@ bool TranslatorVisitor::ADDV(bool Q, Imm<2> size, Vec Vn, Vec Vd) {
const size_t esize = 8 << size.ZeroExtend();
const size_t datasize = Q ? 128 : 64;
const size_t elements = datasize / esize;
const IR::U128 operand = V(datasize, Vn);
const auto get_element = [&](IR::U128 vec, size_t element) {
return ir.ZeroExtendToWord(ir.VectorGetElement(esize, vec, element));
};
IR::U32 sum = get_element(operand, 0);
for (size_t i = 1; i < elements; i++) {
sum = ir.Add(sum, get_element(operand, i));
}
if (size == 0b00) {
V(datasize, Vd, ir.ZeroExtendToQuad(ir.LeastSignificantByte(sum)));
} else if (size == 0b01) {
V(datasize, Vd, ir.ZeroExtendToQuad(ir.LeastSignificantHalf(sum)));
} else {
V(datasize, Vd, ir.ZeroExtendToQuad(sum));
}
V(128, Vd, ir.VectorReduceAdd(esize, operand));
return true;
}

View File

@@ -9,6 +9,7 @@
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "dynarmic/interface/A32/config.h"
@@ -91,6 +92,12 @@ public:
/// Debugging: Dump a disassembly of all compiled code to the console.
void DumpDisassembly() const;
/**
* Disassemble all compiled code and return the resulting
* instructions as a vector of their string representations.
*/
std::vector<std::string> Disassemble() const;
private:
bool is_executing = false;

View File

@@ -10,6 +10,7 @@
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "dynarmic/interface/A64/config.h"
@@ -117,6 +118,12 @@ public:
/// Debugging: Dump a disassembly of all compiled code to the console.
void DumpDisassembly() const;
/**
* Disassemble all compiled code and return the resulting
* instructions as a vector of their string representations.
*/
std::vector<std::string> Disassemble() const;
private:
struct Impl;
std::unique_ptr<Impl> impl;

View File

@@ -1526,6 +1526,21 @@ U128 IREmitter::VectorReverseBits(const U128& a) {
return Inst<U128>(Opcode::VectorReverseBits, a);
}
/// Emits the reduce-add IR instruction matching the element width.
/// @param esize Element size in bits; 8, 16, 32 and 64 each have a dedicated opcode.
/// @param a     Vector operand.
/// @return The U128 value produced by the emitted instruction.
U128 IREmitter::VectorReduceAdd(size_t esize, const U128& a) {
    if (esize == 8) {
        return Inst<U128>(Opcode::VectorReduceAdd8, a);
    }
    if (esize == 16) {
        return Inst<U128>(Opcode::VectorReduceAdd16, a);
    }
    if (esize == 32) {
        return Inst<U128>(Opcode::VectorReduceAdd32, a);
    }
    if (esize == 64) {
        return Inst<U128>(Opcode::VectorReduceAdd64, a);
    }

    // No opcode is selected for any other element size.
    UNREACHABLE();
}
U128 IREmitter::VectorRotateLeft(size_t esize, const U128& a, u8 amount) {
ASSERT(amount < esize);

View File

@@ -294,6 +294,7 @@ public:
U128 VectorPolynomialMultiplyLong(size_t esize, const U128& a, const U128& b);
U128 VectorPopulationCount(const U128& a);
U128 VectorReverseBits(const U128& a);
U128 VectorReduceAdd(size_t esize, const U128& a);
U128 VectorRotateLeft(size_t esize, const U128& a, u8 amount);
U128 VectorRotateRight(size_t esize, const U128& a, u8 amount);
U128 VectorRoundingHalvingAddSigned(size_t esize, const U128& a, const U128& b);

View File

@@ -431,6 +431,10 @@ OPCODE(VectorPolynomialMultiplyLong8, U128, U128
OPCODE(VectorPolynomialMultiplyLong64, U128, U128, U128 )
OPCODE(VectorPopulationCount, U128, U128 )
OPCODE(VectorReverseBits, U128, U128 )
OPCODE(VectorReduceAdd8, U128, U128 )
OPCODE(VectorReduceAdd16, U128, U128 )
OPCODE(VectorReduceAdd32, U128, U128 )
OPCODE(VectorReduceAdd64, U128, U128 )
OPCODE(VectorRoundingHalvingAddS8, U128, U128, U128 )
OPCODE(VectorRoundingHalvingAddS16, U128, U128, U128 )
OPCODE(VectorRoundingHalvingAddS32, U128, U128, U128 )