early-access version 2126

2021-10-12 22:47:55 +02:00
parent c2a51b4c36
commit 925a78d02c
20 changed files with 316 additions and 49 deletions
--- a/externals/dynarmic/src/dynarmic/CMakeLists.txt
+++ b/externals/dynarmic/src/dynarmic/CMakeLists.txt
@@ -364,6 +364,10 @@ include(CreateDirectoryGroups)
 create_target_directory_groups(dynarmic)

 target_include_directories(dynarmic PUBLIC ..)
+set_target_properties(dynarmic PROPERTIES
+    VERSION ${dynarmic_VERSION}
+    SOVERSION ${dynarmic_VERSION_MAJOR}
+)
 target_compile_options(dynarmic PRIVATE ${DYNARMIC_CXX_FLAGS})
 target_link_libraries(dynarmic
    PUBLIC
--- a/externals/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp
@@ -318,8 +318,12 @@ void Jit::LoadContext(const Context& ctx) {
 }

 void Jit::DumpDisassembly() const {
-    const size_t size = (const char*)impl->block_of_code.getCurr() - (const char*)impl->block_of_code.GetCodeBegin();
+    const size_t size = reinterpret_cast<const char*>(impl->block_of_code.getCurr()) - reinterpret_cast<const char*>(impl->block_of_code.GetCodeBegin());
    Common::DumpDisassembledX64(impl->block_of_code.GetCodeBegin(), size);
 }

+std::vector<std::string> Jit::Disassemble() const {  // Disassembles the code block from GetCodeBegin() up to getCurr().
+    const size_t size = reinterpret_cast<const char*>(impl->block_of_code.getCurr()) - reinterpret_cast<const char*>(impl->block_of_code.GetCodeBegin());
+    return Common::DisassembleX64(impl->block_of_code.GetCodeBegin(), size);
+}
 }  // namespace Dynarmic::A32
--- a/externals/dynarmic/src/dynarmic/backend/x64/a64_interface.cpp
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_interface.cpp
@@ -5,6 +5,7 @@

 #include <cstring>
 #include <memory>
+#include <mutex>

 #include <boost/icl/interval_set.hpp>

@@ -57,6 +58,8 @@ public:

    void Run() {
        ASSERT(!is_executing);
+        PerformRequestedCacheInvalidation();
+
        is_executing = true;
        SCOPE_EXIT { this->is_executing = false; };
        jit_state.halt_requested = false;
@@ -80,6 +83,8 @@ public:

    void Step() {
        ASSERT(!is_executing);
+        PerformRequestedCacheInvalidation();
+
        is_executing = true;
        SCOPE_EXIT { this->is_executing = false; };
        jit_state.halt_requested = true;
@@ -90,15 +95,21 @@ public:
    }

    void ClearCache() {
+        std::unique_lock lock{invalidation_mutex};
        invalidate_entire_cache = true;
-        RequestCacheInvalidation();
+        if (is_executing) {
+            jit_state.halt_requested = true;
+        }
    }

    void InvalidateCacheRange(u64 start_address, size_t length) {
+        std::unique_lock lock{invalidation_mutex};
        const auto end_address = static_cast<u64>(start_address + length - 1);
        const auto range = boost::icl::discrete_interval<u64>::closed(start_address, end_address);
        invalid_cache_ranges.add(range);
-        RequestCacheInvalidation();
+        if (is_executing) {
+            jit_state.halt_requested = true;
+        }
    }

    void Reset() {
@@ -200,10 +211,15 @@ public:
    }

    void DumpDisassembly() const {
-        const size_t size = (const char*)block_of_code.getCurr() - (const char*)block_of_code.GetCodeBegin();
+        const size_t size = reinterpret_cast<const char*>(block_of_code.getCurr()) - reinterpret_cast<const char*>(block_of_code.GetCodeBegin());
        Common::DumpDisassembledX64(block_of_code.GetCodeBegin(), size);
    }

+    std::vector<std::string> Disassemble() const {  // Disassembles the code block from GetCodeBegin() up to getCurr().
+        const size_t size = reinterpret_cast<const char*>(block_of_code.getCurr()) - reinterpret_cast<const char*>(block_of_code.GetCodeBegin());
+        return Common::DisassembleX64(block_of_code.GetCodeBegin(), size);
+    }
+
 private:
    static CodePtr GetCurrentBlockThunk(void* thisptr) {
        Jit::Impl* this_ = static_cast<Jit::Impl*>(thisptr);
@@ -263,6 +279,7 @@ private:
    }

    void PerformRequestedCacheInvalidation() {
+        std::unique_lock lock{invalidation_mutex};
        if (!invalidate_entire_cache && invalid_cache_ranges.empty()) {
            return;
        }
@@ -287,6 +304,7 @@ private:

    bool invalidate_entire_cache = false;
    boost::icl::interval_set<u64> invalid_cache_ranges;
+    std::mutex invalidation_mutex;
 };

 Jit::Jit(UserConfig conf)
@@ -402,4 +420,8 @@ void Jit::DumpDisassembly() const {
    return impl->DumpDisassembly();
 }

+std::vector<std::string> Jit::Disassemble() const {  // Forwards to the implementation object's Disassemble().
+    return impl->Disassemble();
+}
+
 }  // namespace Dynarmic::A64
--- a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -2990,6 +2990,98 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
    ctx.reg_alloc.DefineValue(inst, data);
 }

+void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) {
+    // Emits code leaving the wrapping 8-bit sum of all sixteen bytes of args[0] in byte 0, with bytes 1-15 zero.
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm temp = xmm0;
+
+    // Swap the 64-bit halves into temp and add byte-wise: byte i of data becomes b[i] + b[i ^ 8] (mod 256)
+    code.pshufd(temp, data, 0b01'00'11'10);
+    code.paddb(data, temp);
+
+    // psadbw against zero sums the eight bytes of each 64-bit half into the low 16 bits of that half
+    code.pxor(temp, temp);
+    code.psadbw(data, temp);
+
+    // Zero-extend lower 8-bits: shifting left then right by 15 bytes clears everything except byte 0
+    code.pslldq(data, 15);
+    code.psrldq(data, 15);
+
+    ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) {
+    // Emits code leaving the wrapping 16-bit sum of all eight words of args[0] in word 0, with words 1-7 zero.
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm temp = xmm0;
+
+    if (code.HasHostFeature(HostFeature::SSSE3)) {
+        // temp stays zero, so each phaddw packs the pairwise word sums of data into its low half and zeros
+        // above them: 8 -> 4 -> 2 -> 1 live words. Word 0 ends up as the total and words 1-7 are already zero.
+        code.pxor(temp, temp);
+        code.phaddw(data, temp);
+        code.phaddw(data, temp);
+        code.phaddw(data, temp);
+    } else {
+        // Add the dword-reversed vector word-wise, so dwords 0 and 1 together cover every input word once
+        code.pshufd(temp, data, 0b00'01'10'11);
+        code.paddw(data, temp);
+        // Add pairs of 16-bit values into 32-bit lanes
+        code.movdqa(temp, code.MConst(xword, 0x0001000100010001, 0x0001000100010001));
+        code.pmaddwd(data, temp);
+        // Sum adjacent 32-bit lanes
+        code.pshufd(temp, data, 0b10'11'00'01);
+        code.paddd(data, temp);
+        // Zero-extend lower 16-bits
+        code.pslldq(data, 14);
+        code.psrldq(data, 14);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) {
+    // Emits code leaving the wrapping 32-bit sum of the four dwords of args[0] in dword 0, with dwords 1-3 zero.
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm temp = xmm0;
+
+    // Add the dword-reversed vector: the lanes become (d0+d3, d1+d2, d2+d1, d3+d0)
+    code.pshufd(temp, data, 0b00'01'10'11);
+    code.paddd(data, temp);
+
+    // Sum adjacent 32-bit lanes; on either path every lane then holds the full sum
+    if (code.HasHostFeature(HostFeature::SSSE3)) {
+        code.phaddd(data, data);
+    } else {
+        code.pshufd(temp, data, 0b10'11'00'01);
+        code.paddd(data, temp);
+    }
+
+    // Shift the upper-most lane down into the lower-most lane, which zeroes the other three lanes
+    code.psrldq(data, 12);
+
+    ctx.reg_alloc.DefineValue(inst, data);
+}
+
+void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) {
+    // Emits code leaving the wrapping 64-bit sum of the two qwords of args[0] in qword 0, with qword 1 zero.
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm temp = xmm0;
+
+    // Swap the 64-bit halves into temp and add, so both halves hold the sum
+    code.pshufd(temp, data, 0b01'00'11'10);
+    code.paddq(data, temp);
+
+    // Zero-extend lower 64-bits: movq copies the low qword and clears the upper one
+    code.movq(data, data);
+
+    ctx.reg_alloc.DefineValue(inst, data);
+}
+
 static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

--- a/externals/dynarmic/src/dynarmic/backend/x64/exception_handler_posix.cpp
+++ b/externals/dynarmic/src/dynarmic/backend/x64/exception_handler_posix.cpp
@@ -64,7 +64,7 @@ private:
 SigHandler sig_handler;

 SigHandler::SigHandler() {
-    constexpr size_t signal_stack_size = std::max(SIGSTKSZ, 2 * 1024 * 1024);
+    const size_t signal_stack_size = std::max<size_t>(SIGSTKSZ, 2 * 1024 * 1024);

    signal_stack_memory = std::malloc(signal_stack_size);

--- a/externals/dynarmic/src/dynarmic/common/fp/unpacked.h
+++ b/externals/dynarmic/src/dynarmic/common/fp/unpacked.h
@@ -46,7 +46,7 @@ constexpr FPUnpacked ToNormalized(bool sign, int exponent, u64 value) {
    const int highest_bit = Common::HighestSetBit(value);
    const int offset = static_cast<int>(normalized_point_position) - highest_bit;
    value <<= offset;
-    exponent -= offset - normalized_point_position;
+    exponent -= offset - static_cast<int>(normalized_point_position);
    return {sign, exponent, value};
 }

--- a/externals/dynarmic/src/dynarmic/common/x64_disassemble.cpp
+++ b/externals/dynarmic/src/dynarmic/common/x64_disassemble.cpp
@@ -21,15 +21,36 @@ void DumpDisassembledX64(const void* ptr, size_t size) {

    size_t offset = 0;
    ZydisDecodedInstruction instruction;
-    while (ZYAN_SUCCESS(ZydisDecoderDecodeBuffer(&decoder, (const char*)ptr + offset, size - offset, &instruction))) {
+    while (ZYAN_SUCCESS(ZydisDecoderDecodeBuffer(&decoder, static_cast<const char*>(ptr) + offset, size - offset, &instruction))) {
        fmt::print("{:016x}  ", (u64)ptr + offset);

        char buffer[256];
-        ZydisFormatterFormatInstruction(&formatter, &instruction, buffer, sizeof(buffer), (u64)ptr + offset);
+        ZydisFormatterFormatInstruction(&formatter, &instruction, buffer, sizeof(buffer), reinterpret_cast<u64>(ptr) + offset);
        puts(buffer);

        offset += instruction.length;
    }
 }

+std::vector<std::string> DisassembleX64(const void* ptr, size_t size) {
+    // Returns one string per decoded instruction: 16-digit hex address, two spaces, Intel-syntax text.
+    // Decoding starts at ptr and ends as soon as ZydisDecoderDecodeBuffer reports failure on the remaining bytes.
+    std::vector<std::string> result;
+    ZydisDecoder decoder;
+    ZydisDecoderInit(&decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_ADDRESS_WIDTH_64);
+
+    ZydisFormatter formatter;
+    ZydisFormatterInit(&formatter, ZYDIS_FORMATTER_STYLE_INTEL);
+
+    size_t offset = 0;
+    ZydisDecodedInstruction instruction;
+    while (ZYAN_SUCCESS(ZydisDecoderDecodeBuffer(&decoder, static_cast<const char*>(ptr) + offset, size - offset, &instruction))) {
+        const u64 address = reinterpret_cast<u64>(ptr) + offset;  // shared by the formatter and the line prefix
+        char buffer[256];
+        ZydisFormatterFormatInstruction(&formatter, &instruction, buffer, sizeof(buffer), address);
+        result.push_back(fmt::format("{:016x}  {}", address, buffer));
+        offset += instruction.length;
+    }
+    return result;
+}
 }  // namespace Dynarmic::Common
--- a/externals/dynarmic/src/dynarmic/common/x64_disassemble.h
+++ b/externals/dynarmic/src/dynarmic/common/x64_disassemble.h
@@ -5,10 +5,17 @@

 #pragma once

+#include <string>
+#include <vector>
+
 #include "dynarmic/common/common_types.h"

 namespace Dynarmic::Common {

 void DumpDisassembledX64(const void* ptr, size_t size);
-
+/**
+ * Disassemble `size' bytes from `ptr' and return the disassembled lines as a vector
+ * of strings.
+ */
+std::vector<std::string> DisassembleX64(const void* ptr, size_t size);
 }  // namespace Dynarmic::Common
--- a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_across_lanes.cpp
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_across_lanes.cpp
@@ -171,27 +171,10 @@ bool TranslatorVisitor::ADDV(bool Q, Imm<2> size, Vec Vn, Vec Vd) {

    const size_t esize = 8 << size.ZeroExtend();
    const size_t datasize = Q ? 128 : 64;
-    const size_t elements = datasize / esize;

    const IR::U128 operand = V(datasize, Vn);

-    const auto get_element = [&](IR::U128 vec, size_t element) {
-        return ir.ZeroExtendToWord(ir.VectorGetElement(esize, vec, element));
-    };
-
-    IR::U32 sum = get_element(operand, 0);
-    for (size_t i = 1; i < elements; i++) {
-        sum = ir.Add(sum, get_element(operand, i));
-    }
-
-    if (size == 0b00) {
-        V(datasize, Vd, ir.ZeroExtendToQuad(ir.LeastSignificantByte(sum)));
-    } else if (size == 0b01) {
-        V(datasize, Vd, ir.ZeroExtendToQuad(ir.LeastSignificantHalf(sum)));
-    } else {
-        V(datasize, Vd, ir.ZeroExtendToQuad(sum));
-    }
-
+    V(128, Vd, ir.VectorReduceAdd(esize, operand));
    return true;
 }

--- a/externals/dynarmic/src/dynarmic/interface/A32/a32.h
+++ b/externals/dynarmic/src/dynarmic/interface/A32/a32.h
@@ -9,6 +9,7 @@
 #include <cstdint>
 #include <memory>
 #include <string>
+#include <vector>

 #include "dynarmic/interface/A32/config.h"

@@ -91,6 +92,12 @@ public:
    /// Debugging: Dump a disassembly all compiled code to the console.
    void DumpDisassembly() const;

+    /**
+     * Debugging: Disassemble all compiled code and return the resulting
+     * host instructions as a vector of their string representations.
+     */
+    std::vector<std::string> Disassemble() const;
+
 private:
    bool is_executing = false;

--- a/externals/dynarmic/src/dynarmic/interface/A64/a64.h
+++ b/externals/dynarmic/src/dynarmic/interface/A64/a64.h
@@ -10,6 +10,7 @@
 #include <cstdint>
 #include <memory>
 #include <string>
+#include <vector>

 #include "dynarmic/interface/A64/config.h"

@@ -117,6 +118,12 @@ public:
    /// Debugging: Dump a disassembly all of compiled code to the console.
    void DumpDisassembly() const;

+    /**
+     * Debugging: Disassemble all compiled code and return the resulting
+     * host instructions as a vector of their string representations.
+     */
+    std::vector<std::string> Disassemble() const;
+
 private:
    struct Impl;
    std::unique_ptr<Impl> impl;
--- a/externals/dynarmic/src/dynarmic/ir/ir_emitter.cpp
+++ b/externals/dynarmic/src/dynarmic/ir/ir_emitter.cpp
@@ -1526,6 +1526,21 @@ U128 IREmitter::VectorReverseBits(const U128& a) {
    return Inst<U128>(Opcode::VectorReverseBits, a);
 }

+U128 IREmitter::VectorReduceAdd(size_t esize, const U128& a) {
+    // Selects the VectorReduceAdd opcode for element size esize; only 8, 16, 32 and 64 have one.
+    switch (esize) {
+    case 8:
+        return Inst<U128>(Opcode::VectorReduceAdd8, a);
+    case 16:
+        return Inst<U128>(Opcode::VectorReduceAdd16, a);
+    case 32:
+        return Inst<U128>(Opcode::VectorReduceAdd32, a);
+    case 64:
+        return Inst<U128>(Opcode::VectorReduceAdd64, a);
+    }
+    UNREACHABLE();
+}
+
 U128 IREmitter::VectorRotateLeft(size_t esize, const U128& a, u8 amount) {
    ASSERT(amount < esize);

--- a/externals/dynarmic/src/dynarmic/ir/ir_emitter.h
+++ b/externals/dynarmic/src/dynarmic/ir/ir_emitter.h
@@ -294,6 +294,7 @@ public:
    U128 VectorPolynomialMultiplyLong(size_t esize, const U128& a, const U128& b);
    U128 VectorPopulationCount(const U128& a);
    U128 VectorReverseBits(const U128& a);
+    U128 VectorReduceAdd(size_t esize, const U128& a);
    U128 VectorRotateLeft(size_t esize, const U128& a, u8 amount);
    U128 VectorRotateRight(size_t esize, const U128& a, u8 amount);
    U128 VectorRoundingHalvingAddSigned(size_t esize, const U128& a, const U128& b);
--- a/externals/dynarmic/src/dynarmic/ir/opcodes.inc
+++ b/externals/dynarmic/src/dynarmic/ir/opcodes.inc
@@ -431,6 +431,10 @@ OPCODE(VectorPolynomialMultiplyLong8,                       U128,           U128
 OPCODE(VectorPolynomialMultiplyLong64,                      U128,           U128,           U128                                            )
 OPCODE(VectorPopulationCount,                               U128,           U128                                                            )
 OPCODE(VectorReverseBits,                                   U128,           U128                                                            )
+OPCODE(VectorReduceAdd8,                                    U128,           U128                                                            )
+OPCODE(VectorReduceAdd16,                                   U128,           U128                                                            )
+OPCODE(VectorReduceAdd32,                                   U128,           U128                                                            )
+OPCODE(VectorReduceAdd64,                                   U128,           U128                                                            )
 OPCODE(VectorRoundingHalvingAddS8,                          U128,           U128,           U128                                            )
 OPCODE(VectorRoundingHalvingAddS16,                         U128,           U128,           U128                                            )
 OPCODE(VectorRoundingHalvingAddS32,                         U128,           U128,           U128                                            )