From e2ba36a19ff230342cba6c5611c512a3b229f837 Mon Sep 17 00:00:00 2001
From: pineappleEA
Date: Tue, 29 Mar 2022 23:49:35 +0200
Subject: [PATCH] early-access version 2621

---
 README.md                                     |   2 +-
 .../dynarmic/src/dynarmic/CMakeLists.txt      |   1 +
 .../src/dynarmic/backend/x64/a32_emit_x64.h   |  19 +-
 .../backend/x64/a32_emit_x64_memory.cpp       | 506 +++----
 .../src/dynarmic/backend/x64/a64_emit_x64.h   |   6 +-
 .../backend/x64/a64_emit_x64_memory.cpp       | 677 ++++--------
 .../backend/x64/emit_x64_memory.cpp.inc       | 497 +++++++++
 .../dynarmic/backend/x64/emit_x64_memory.h    |  93 ++-
 .../src/dynarmic/backend/x64/reg_alloc.cpp    |   5 +
 .../src/dynarmic/backend/x64/reg_alloc.h      |   5 +
 10 files changed, 816 insertions(+), 995 deletions(-)
 create mode 100755 externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc

diff --git a/README.md b/README.md
index 685904b23..f347ba8bf 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 yuzu emulator early access
 =============
 
-This is the source code for early-access 2620.
+This is the source code for early-access 2621.
 
 ## Legal Notice
 
diff --git a/externals/dynarmic/src/dynarmic/CMakeLists.txt b/externals/dynarmic/src/dynarmic/CMakeLists.txt
index c64233371..38b5b43ea 100755
--- a/externals/dynarmic/src/dynarmic/CMakeLists.txt
+++ b/externals/dynarmic/src/dynarmic/CMakeLists.txt
@@ -285,6 +285,7 @@ if (ARCHITECTURE STREQUAL "x86_64")
         backend/x64/emit_x64_data_processing.cpp
         backend/x64/emit_x64_floating_point.cpp
         backend/x64/emit_x64_memory.h
+        backend/x64/emit_x64_memory.cpp.inc
         backend/x64/emit_x64_packed.cpp
         backend/x64/emit_x64_saturation.cpp
         backend/x64/emit_x64_sm4.cpp
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
index 43f90f6a1..7bdc7e023 100755
--- a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
@@ -71,9 +71,12 @@ protected:
     std::array<FastDispatchEntry, fast_dispatch_table_size> fast_dispatch_table;
     void ClearFastDispatchTable();
 
-    std::map<std::tuple<size_t, int, int>, void (*)()> read_fallbacks;
-    std::map<std::tuple<size_t, int, int>, void (*)()> write_fallbacks;
-    std::map<std::tuple<size_t, int, int>, void (*)()> exclusive_write_fallbacks;
+    void (*memory_read_128)() = nullptr;  // Dummy
+    void (*memory_write_128)() = nullptr;  // Dummy
+
+    std::map<std::tuple<bool, size_t, int, int>, void (*)()> read_fallbacks;
+    std::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks;
+    std::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks;
     void GenFastmemFallbacks();
 
     const void* terminal_handler_pop_rsb_hint;
@@ -99,7 +102,7 @@
         u64 resume_rip;
         u64 callback;
         DoNotFastmemMarker marker;
-        bool compile;
+        bool recompile;
     };
     tsl::robin_map<u64, FastmemPatchInfo> fastmem_patch_info;
     std::set<DoNotFastmemMarker> do_not_fastmem;
@@ -112,13 +115,13 @@
     template<size_t bitsize, auto callback>
     void EmitMemoryRead(A32EmitContext& ctx, IR::Inst* inst);
     template<size_t bitsize, auto callback>
     void EmitMemoryWrite(A32EmitContext& ctx, IR::Inst* inst);
     template<size_t bitsize, auto callback>
-    void ExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst);
+    void EmitExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst);
     template<size_t bitsize, auto callback>
-    void ExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst);
+    void EmitExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst);
     template<size_t bitsize, auto callback>
-    void ExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst);
+    void EmitExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst);
     template<size_t bitsize, auto callback>
-    void ExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst);
+    void EmitExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst);
 
     // Terminal instruction emitters
     void EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location);
diff --git
a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp index 43d7c0c7f..adbe00e21 100755 --- a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp +++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp @@ -47,222 +47,93 @@ void A32EmitX64::GenFastmemFallbacks() { {64, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, }}; - for (int vaddr_idx : idxes) { - for (int value_idx : idxes) { - for (const auto& [bitsize, callback] : read_callbacks) { - code.align(); - read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - callback.EmitCall(code); - if (value_idx != code.ABI_RETURN.getIdx()) { - code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); - } - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); - code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); - code.ret(); - PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_read_fallback_{}", bitsize)); - } - - for (const auto& [bitsize, callback] : write_callbacks) { - code.align(); - write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStack(code); - if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { - code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); - } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - } else { - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } + for (bool ordered : {false, true}) { + for (int vaddr_idx : idxes) { + for (int value_idx : idxes) { + for (const auto& [bitsize, callback] : read_callbacks) { + code.align(); + read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); } + if (ordered) { + code.mfence(); + } + callback.EmitCall(code); + if (value_idx != code.ABI_RETURN.getIdx()) { + code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); + code.ret(); + PerfMapRegister(read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_read_fallback_{}", bitsize)); } - code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); - callback.EmitCall(code); - ABI_PopCallerSaveRegistersAndAdjustStack(code); - code.ret(); - PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_write_fallback_{}", bitsize)); - } - for (const auto& [bitsize, callback] : exclusive_write_callbacks) { - code.align(); - exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { - 
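
// A quick standalone sketch of the new fallback-thunk keying introduced in the
// hunk above: the leading bool marks ordered (acquire/release) accesses, which
// get fence-wrapped thunks, so every (ordering, size, vaddr register, value
// register) combination has its own pre-generated routine. Simplified types,
// not dynarmic's exact aliases.
#include <cstddef>
#include <map>
#include <tuple>

using Thunk = void (*)();
static std::map<std::tuple<bool, std::size_t, int, int>, Thunk> read_fallbacks;

static Thunk LookupReadFallback(bool ordered, std::size_t bitsize, int vaddr_idx, int value_idx) {
    // One thunk per combination, generated once up front in GenFastmemFallbacks().
    return read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)];
}
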
code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); - } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - } else { - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + for (const auto& [bitsize, callback] : write_callbacks) { + code.align(); + write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } } + code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + callback.EmitCall(code); + if (ordered) { + code.mfence(); + } + ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_write_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : exclusive_write_callbacks) { + code.align(); + exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + code.mov(code.ABI_PARAM4, rax); + code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_exclusive_write_fallback_{}", bitsize)); } - code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); - code.mov(code.ABI_PARAM4, rax); - code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); - callback.EmitCall(code); - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - code.ret(); - PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_exclusive_write_fallback_{}", bitsize)); } } } } -std::optional A32EmitX64::ShouldFastmem(A32EmitContext& ctx, IR::Inst* inst) const { - if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) { - return std::nullopt; - } - - const auto marker = std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst)); - if (do_not_fastmem.count(marker) > 0) { - return std::nullopt; - 
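
// The ordered thunks generated above bracket the user callback with a full
// fence: before the call for reads, after the call for writes. In portable
// C++ the same placement looks roughly like this (illustrative only; the JIT
// emits a literal mfence instruction).
#include <atomic>
#include <cstdint>

static std::uint32_t OrderedReadFallback(std::uint32_t (*read_cb)(std::uint32_t), std::uint32_t vaddr) {
    std::atomic_thread_fence(std::memory_order_seq_cst);  // mfence, then the read
    return read_cb(vaddr);
}

static void OrderedWriteFallback(void (*write_cb)(std::uint32_t, std::uint32_t), std::uint32_t vaddr, std::uint32_t value) {
    write_cb(vaddr, value);
    std::atomic_thread_fence(std::memory_order_seq_cst);  // the write, then mfence
}
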
} - return marker; -} - -FakeCall A32EmitX64::FastmemCallback(u64 rip_) { - const auto iter = fastmem_patch_info.find(rip_); - - if (iter == fastmem_patch_info.end()) { - fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_); - fmt::print("Segfault wasn't at a fastmem patch location!\n"); - fmt::print("Now dumping code.......\n\n"); - Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000); - ASSERT_FALSE("iter != fastmem_patch_info.end()"); - } - - if (iter->second.compile) { - const auto marker = iter->second.marker; - do_not_fastmem.emplace(marker); - InvalidateBasicBlocks({std::get<0>(marker)}); - } - - return FakeCall{ - .call_rip = iter->second.callback, - .ret_rip = iter->second.resume_rip, - }; -} - -template -void A32EmitX64::EmitMemoryRead(A32EmitContext& ctx, IR::Inst* inst) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const auto fastmem_marker = ShouldFastmem(ctx, inst); - - if (!conf.page_table && !fastmem_marker) { - // Neither fastmem nor page table: Use callbacks - ctx.reg_alloc.HostCall(inst, {}, args[0]); - Devirtualize(conf.callbacks).EmitCall(code); - code.ZeroExtendFrom(bitsize, code.ABI_RETURN); - return; - } - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const Xbyak::Reg64 value = ctx.reg_alloc.ScratchGpr(); - - const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; - - if (fastmem_marker) { - // Use fastmem - const auto src_ptr = r13 + vaddr; - - const auto location = code.getCurr(); - EmitReadMemoryMov(code, value.getIdx(), src_ptr); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - conf.recompile_on_fastmem_failure, - }); - - ctx.reg_alloc.DefineValue(inst, value); - return; - } - - // Use page table - ASSERT(conf.page_table); - Xbyak::Label abort, end; - - const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); - EmitReadMemoryMov(code, value.getIdx(), src_ptr); - code.L(end); - - code.SwitchToFarCode(); - code.L(abort); - code.call(wrapped_fn); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); - - ctx.reg_alloc.DefineValue(inst, value); -} - -template -void A32EmitX64::EmitMemoryWrite(A32EmitContext& ctx, IR::Inst* inst) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const auto fastmem_marker = ShouldFastmem(ctx, inst); - - if (!conf.page_table && !fastmem_marker) { - // Neither fastmem nor page table: Use callbacks - ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); - Devirtualize(conf.callbacks).EmitCall(code); - return; - } - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); - - const auto wrapped_fn = write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; - - if (fastmem_marker) { - // Use fastmem - const auto dest_ptr = r13 + vaddr; - - const auto location = code.getCurr(); - EmitWriteMemoryMov(code, dest_ptr, value.getIdx()); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - conf.recompile_on_fastmem_failure, - }); - - return; - } - - // Use page table - ASSERT(conf.page_table); - Xbyak::Label abort, end; - - const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); - EmitWriteMemoryMov(code, dest_ptr, value.getIdx()); - code.L(end); - - code.SwitchToFarCode(); - code.L(abort); 
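
// For context while reading the deletions above (this logic reappears in
// emit_x64_memory.cpp.inc): fastmem speculatively emits a raw host load/store;
// if it faults, FastmemCallback looks up the faulting RIP in fastmem_patch_info
// and returns a FakeCall that redirects execution to the slow-path thunk.
// Field meanings taken from the code above; the alias is spelled out so the
// sketch stands alone.
#include <cstdint>
using u64 = std::uint64_t;  // dynarmic's alias

struct FakeCall {
    u64 call_rip;  // slow-path thunk to execute instead of the faulting access
    u64 ret_rip;   // where to resume inside the compiled block afterwards
};
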
- code.call(wrapped_fn); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); -} +#define Axx A32 +#include "emit_x64_memory.cpp.inc" +#undef Axx void A32EmitX64::EmitA32ReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { EmitMemoryRead<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); @@ -296,268 +167,71 @@ void A32EmitX64::EmitA32WriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { EmitMemoryWrite<64, &A32::UserCallbacks::MemoryWrite64>(ctx, inst); } -template -void A32EmitX64::ExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst) { - using T = mp::unsigned_integer_of_size; - - ASSERT(conf.global_monitor != nullptr); - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - ctx.reg_alloc.HostCall(inst, {}, args[0]); - - code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(1)); - code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); - code.CallLambda( - [](A32::UserConfig& conf, u32 vaddr) -> T { - return conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> T { - return (conf.callbacks->*callback)(vaddr); - }); - }); - code.ZeroExtendFrom(bitsize, code.ABI_RETURN); -} - -template -void A32EmitX64::ExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst) { - using T = mp::unsigned_integer_of_size; - - ASSERT(conf.global_monitor != nullptr); - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); - - Xbyak::Label end; - - code.mov(code.ABI_RETURN, u32(1)); - code.cmp(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); - code.je(end); - code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); - code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); - code.CallLambda( - [](A32::UserConfig& conf, u32 vaddr, T value) -> u32 { - return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr, - [&](T expected) -> bool { - return (conf.callbacks->*callback)(vaddr, value, expected); - }) - ? 
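
// The `#define Axx A32` / `#include "emit_x64_memory.cpp.inc"` above stamps the
// shared memory emitters out for this frontend: dynarmic's CONCATENATE_TOKENS
// macro turns AxxEmitX64 into A32EmitX64 here and into A64EmitX64 in the A64
// translation unit. A minimal self-contained illustration of the mechanism
// (toy class and local macro names, not the real ones):
#include <cstdio>

#define CONCAT_IMPL(a, b) a##b
#define MY_CONCATENATE_TOKENS(a, b) CONCAT_IMPL(a, b)  // two steps so `a` expands first

struct A32EmitX64 {
    static void Hello() { std::puts("A32EmitX64"); }
};

#define Axx A32
using Emitter = MY_CONCATENATE_TOKENS(Axx, EmitX64);  // expands to A32EmitX64
#undef Axx

int main() {
    Emitter::Hello();
}
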
0 - : 1; - }); - code.L(end); -} - -template -void A32EmitX64::ExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst) { - ASSERT(conf.global_monitor && conf.fastmem_pointer); - if (!exception_handler.SupportsFastmem()) { - ExclusiveReadMemory(ctx, inst); - return; - } - - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const Xbyak::Reg64 value = ctx.reg_alloc.ScratchGpr(); - const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); - const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); - - const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; - - EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); - - code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(1)); - code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); - code.mov(qword[tmp], vaddr); - - const auto fastmem_marker = ShouldFastmem(ctx, inst); - if (fastmem_marker) { - Xbyak::Label end; - - const auto src_ptr = r13 + vaddr; - - const auto location = code.getCurr(); - EmitReadMemoryMov(code, value.getIdx(), src_ptr); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - conf.recompile_on_exclusive_fastmem_failure, - }); - - code.L(end); - } else { - code.call(wrapped_fn); - } - - code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); - EmitWriteMemoryMov(code, tmp, value.getIdx()); - - EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32()); - - ctx.reg_alloc.DefineValue(inst, value); -} - -template -void A32EmitX64::ExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst) { - ASSERT(conf.global_monitor && conf.fastmem_pointer); - if (!exception_handler.SupportsFastmem()) { - ExclusiveWriteMemory(ctx, inst); - return; - } - - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - ctx.reg_alloc.ScratchGpr(HostLoc::RAX); - const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32(); - const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); - - const auto fallback_fn = exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; - - EmitExclusiveLock(code, conf, tmp, eax); - - Xbyak::Label end; - - code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); - code.mov(status, u32(1)); - code.cmp(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); - code.je(end, code.T_NEAR); - code.cmp(qword[tmp], vaddr); - code.jne(end, code.T_NEAR); - - EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax); - - code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); - code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); - - EmitReadMemoryMov(code, rax.getIdx(), tmp); - - const auto fastmem_marker = ShouldFastmem(ctx, inst); - if (fastmem_marker) { - const auto dest_ptr = r13 + vaddr; - - const auto location = code.getCurr(); - - switch (bitsize) { - case 8: - code.lock(); - code.cmpxchg(code.byte[dest_ptr], value.cvt8()); - break; - case 16: - code.lock(); - code.cmpxchg(word[dest_ptr], value.cvt16()); - break; - case 32: - code.lock(); - code.cmpxchg(dword[dest_ptr], value.cvt32()); - break; - case 64: - code.lock(); - 
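
// Shape of the slow-path result convention visible above: the CallLambda body
// returns 0 when the global monitor accepts the exclusive store and 1 when it
// does not. A simplified model with a hypothetical helper, not the real
// ExclusiveMonitor API:
#include <cstdint>

template<typename T>
static std::uint32_t ExclusiveWriteStatus(bool still_exclusive, T expected, T observed) {
    // still_exclusive: this processor's reservation survived in the monitor.
    // observed must equal expected for the compare-and-swap to commit.
    return (still_exclusive && observed == expected) ? 0 : 1;
}
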
code.cmpxchg(qword[dest_ptr], value.cvt64()); - break; - default: - UNREACHABLE(); - } - - code.setnz(status.cvt8()); - - code.SwitchToFarCode(); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(fallback_fn), - *fastmem_marker, - conf.recompile_on_exclusive_fastmem_failure, - }); - - code.cmp(al, 0); - code.setz(status.cvt8()); - code.movzx(status.cvt32(), status.cvt8()); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); - } else { - code.call(fallback_fn); - code.cmp(al, 0); - code.setz(status.cvt8()); - code.movzx(status.cvt32(), status.cvt8()); - } - - code.L(end); - - EmitExclusiveUnlock(code, conf, tmp, eax); - - ctx.reg_alloc.DefineValue(inst, status); -} - void A32EmitX64::EmitA32ClearExclusive(A32EmitContext&, IR::Inst*) { code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); } void A32EmitX64::EmitA32ExclusiveReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { if (conf.fastmem_exclusive_access) { - ExclusiveReadMemoryInline<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); + EmitExclusiveReadMemoryInline<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); } else { - ExclusiveReadMemory<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); + EmitExclusiveReadMemory<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); } } void A32EmitX64::EmitA32ExclusiveReadMemory16(A32EmitContext& ctx, IR::Inst* inst) { if (conf.fastmem_exclusive_access) { - ExclusiveReadMemoryInline<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); + EmitExclusiveReadMemoryInline<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); } else { - ExclusiveReadMemory<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); + EmitExclusiveReadMemory<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); } } void A32EmitX64::EmitA32ExclusiveReadMemory32(A32EmitContext& ctx, IR::Inst* inst) { if (conf.fastmem_exclusive_access) { - ExclusiveReadMemoryInline<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); + EmitExclusiveReadMemoryInline<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); } else { - ExclusiveReadMemory<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); + EmitExclusiveReadMemory<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); } } void A32EmitX64::EmitA32ExclusiveReadMemory64(A32EmitContext& ctx, IR::Inst* inst) { if (conf.fastmem_exclusive_access) { - ExclusiveReadMemoryInline<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); + EmitExclusiveReadMemoryInline<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); } else { - ExclusiveReadMemory<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); + EmitExclusiveReadMemory<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); } } void A32EmitX64::EmitA32ExclusiveWriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { if (conf.fastmem_exclusive_access) { - ExclusiveWriteMemoryInline<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + EmitExclusiveWriteMemoryInline<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); } else { - ExclusiveWriteMemory<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + EmitExclusiveWriteMemory<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); } } void A32EmitX64::EmitA32ExclusiveWriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { if (conf.fastmem_exclusive_access) { - ExclusiveWriteMemoryInline<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + EmitExclusiveWriteMemoryInline<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); } else { - ExclusiveWriteMemory<16, 
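
// The switch above picks a lock cmpxchg of the matching operand width; the
// status protocol is the same as this portable equivalent (0 = store
// committed, 1 = comparison failed and the exclusive write is rejected).
#include <atomic>
#include <cstdint>

static std::uint32_t InlineExclusiveWrite32(std::atomic<std::uint32_t>& mem,
                                            std::uint32_t expected,
                                            std::uint32_t desired) {
    // compare_exchange_strong compiles to lock cmpxchg on x86-64.
    return mem.compare_exchange_strong(expected, desired) ? 0 : 1;
}
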
&A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + EmitExclusiveWriteMemory<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); } } void A32EmitX64::EmitA32ExclusiveWriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { if (conf.fastmem_exclusive_access) { - ExclusiveWriteMemoryInline<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + EmitExclusiveWriteMemoryInline<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); } else { - ExclusiveWriteMemory<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + EmitExclusiveWriteMemory<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); } } void A32EmitX64::EmitA32ExclusiveWriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { if (conf.fastmem_exclusive_access) { - ExclusiveWriteMemoryInline<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + EmitExclusiveWriteMemoryInline<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); } else { - ExclusiveWriteMemory<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + EmitExclusiveWriteMemory<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); } } diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h index cbfa28bd5..f5d1a9174 100755 --- a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h +++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h @@ -71,9 +71,9 @@ protected: void (*memory_exclusive_write_128)(); void GenMemory128Accessors(); - std::map, void (*)()> read_fallbacks; - std::map, void (*)()> write_fallbacks; - std::map, void (*)()> exclusive_write_fallbacks; + std::map, void (*)()> read_fallbacks; + std::map, void (*)()> write_fallbacks; + std::map, void (*)()> exclusive_write_fallbacks; void GenFastmemFallbacks(); const void* terminal_handler_pop_rsb_hint; diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp index 7755c9ff3..0ab2ff9b5 100755 --- a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp +++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp @@ -131,301 +131,157 @@ void A64EmitX64::GenFastmemFallbacks() { {64, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, }}; - for (int vaddr_idx : idxes) { - if (vaddr_idx == 4 || vaddr_idx == 15) { - continue; - } - - for (int value_idx : idxes) { - code.align(); - read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - code.call(memory_read_128); - if (value_idx != 1) { - code.movaps(Xbyak::Xmm{value_idx}, xmm1); - } - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); - code.ret(); - PerfMapRegister(read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128"); - - code.align(); - write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStack(code); - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - if (value_idx != 1) { - code.movaps(xmm1, Xbyak::Xmm{value_idx}); - } - code.call(memory_write_128); - ABI_PopCallerSaveRegistersAndAdjustStack(code); - code.ret(); - PerfMapRegister(write_fallbacks[std::make_tuple(128, 
vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); - - code.align(); - exclusive_write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - if (value_idx != 1) { - code.movaps(xmm1, Xbyak::Xmm{value_idx}); - } - if (code.HasHostFeature(HostFeature::SSE41)) { - code.movq(xmm2, rax); - code.pinsrq(xmm2, rdx, 1); - } else { - code.movq(xmm2, rax); - code.movq(xmm0, rdx); - code.punpcklqdq(xmm2, xmm0); - } - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - code.call(memory_exclusive_write_128); - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - code.ret(); - PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); - - if (value_idx == 4 || value_idx == 15) { + for (bool ordered : {false, true}) { + for (int vaddr_idx : idxes) { + if (vaddr_idx == 4 || vaddr_idx == 15) { continue; } - for (const auto& [bitsize, callback] : read_callbacks) { + for (int value_idx : idxes) { code.align(); - read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + read_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); } - callback.EmitCall(code); - if (value_idx != code.ABI_RETURN.getIdx()) { - code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); + if (ordered) { + code.mfence(); } - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); - code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); + code.call(memory_read_128); + if (value_idx != 1) { + code.movaps(Xbyak::Xmm{value_idx}, xmm1); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); code.ret(); - PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize)); - } + PerfMapRegister(read_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128"); - for (const auto& [bitsize, callback] : write_callbacks) { code.align(); - write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); + write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr(); ABI_PushCallerSaveRegistersAndAdjustStack(code); - if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { - code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); - } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - } else { - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } } - code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); - callback.EmitCall(code); + if (value_idx != 1) { + code.movaps(xmm1, Xbyak::Xmm{value_idx}); + } + code.call(memory_write_128); + if (ordered) { + code.mfence(); + } ABI_PopCallerSaveRegistersAndAdjustStack(code); code.ret(); 
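
// Intrinsic-level picture of the 128-bit marshalling just above: the expected
// value sits in rdx:rax (the cmpxchg16b convention) and is packed into an XMM
// register, via pinsrq when SSE4.1 is available or punpcklqdq otherwise.
// Illustration only; the emitter writes these instructions directly.
#include <emmintrin.h>
#include <smmintrin.h>
#include <cstdint>

static __m128i PackExpected(std::uint64_t lo, std::uint64_t hi, bool has_sse41) {
    const __m128i l = _mm_cvtsi64_si128(static_cast<long long>(lo));  // movq xmm2, rax
    if (has_sse41) {
        return _mm_insert_epi64(l, static_cast<long long>(hi), 1);    // pinsrq xmm2, rdx, 1
    }
    const __m128i h = _mm_cvtsi64_si128(static_cast<long long>(hi));  // movq xmm0, rdx
    return _mm_unpacklo_epi64(l, h);                                  // punpcklqdq xmm2, xmm0
}
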
- PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize)); - } + PerfMapRegister(write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); - for (const auto& [bitsize, callback] : exclusive_write_callbacks) { code.align(); - exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); + exclusive_write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr(); ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { - code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); - } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } + if (value_idx != 1) { + code.movaps(xmm1, Xbyak::Xmm{value_idx}); + } + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(xmm2, rax); + code.pinsrq(xmm2, rdx, 1); } else { - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } + code.movq(xmm2, rax); + code.movq(xmm0, rdx); + code.punpcklqdq(xmm2, xmm0); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + code.call(memory_exclusive_write_128); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_exclusive_write_fallback_128"); + + if (value_idx == 4 || value_idx == 15) { + continue; + } + + for (const auto& [bitsize, callback] : read_callbacks) { + code.align(); + read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); if (vaddr_idx != code.ABI_PARAM2.getIdx()) { code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); } + if (ordered) { + code.mfence(); + } + callback.EmitCall(code); + if (value_idx != code.ABI_RETURN.getIdx()) { + code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); + code.ret(); + PerfMapRegister(read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : write_callbacks) { + code.align(); + write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + callback.EmitCall(code); + if (ordered) { + code.mfence(); + } + 
ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : exclusive_write_callbacks) { + code.align(); + exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); + code.mov(code.ABI_PARAM4, rax); + code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_exclusive_write_fallback_{}", bitsize)); } - code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); - code.mov(code.ABI_PARAM4, rax); - code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); - callback.EmitCall(code); - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); - code.ret(); - PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_exclusive_write_fallback_{}", bitsize)); } } } } -std::optional A64EmitX64::ShouldFastmem(A64EmitContext& ctx, IR::Inst* inst) const { - if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) { - return std::nullopt; - } - - const auto marker = std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst)); - if (do_not_fastmem.count(marker) > 0) { - return std::nullopt; - } - return marker; -} - -FakeCall A64EmitX64::FastmemCallback(u64 rip_) { - const auto iter = fastmem_patch_info.find(rip_); - - if (iter == fastmem_patch_info.end()) { - fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_); - fmt::print("Segfault wasn't at a fastmem patch location!\n"); - fmt::print("Now dumping code.......\n\n"); - Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000); - ASSERT_FALSE("iter != fastmem_patch_info.end()"); - } - - if (iter->second.recompile) { - const auto marker = iter->second.marker; - do_not_fastmem.emplace(marker); - InvalidateBasicBlocks({std::get<0>(marker)}); - } - - return FakeCall{ - .call_rip = iter->second.callback, - .ret_rip = iter->second.resume_rip, - }; -} - -template -void A64EmitX64::EmitMemoryRead(A64EmitContext& ctx, IR::Inst* inst) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const auto fastmem_marker = ShouldFastmem(ctx, inst); - - if (!conf.page_table && !fastmem_marker) { - // Neither fastmem nor page table: Use callbacks - if constexpr (bitsize == 128) { - ctx.reg_alloc.HostCall(nullptr, {}, args[0]); - code.CallFunction(memory_read_128); - ctx.reg_alloc.DefineValue(inst, xmm1); - } else { - ctx.reg_alloc.HostCall(inst, {}, args[0]); - Devirtualize(conf.callbacks).EmitCall(code); - 
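
// The mov/xchg ladder repeated in each fallback above is a two-element
// parallel move: place vaddr in ABI_PARAM2 and value in ABI_PARAM3 without
// clobbering a source that currently lives in one of the destinations. A
// sketch that prints the instructions it would emit, with plain ints standing
// in for register indices:
#include <cstdio>

static void EmitParallelMove(int param2, int param3, int vaddr, int value) {
    if (vaddr == param3 && value == param2) {
        std::printf("xchg r%d, r%d\n", param2, param3);    // pure swap cycle
    } else if (vaddr == param3) {
        std::printf("mov r%d, r%d\n", param2, vaddr);      // rescue vaddr out of PARAM3 first
        if (value != param3) std::printf("mov r%d, r%d\n", param3, value);
    } else {
        if (value != param3) std::printf("mov r%d, r%d\n", param3, value);
        if (vaddr != param2) std::printf("mov r%d, r%d\n", param2, vaddr);  // safe: value already read
    }
}
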
code.ZeroExtendFrom(bitsize, code.ABI_RETURN); - } - return; - } - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); - - const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; - - Xbyak::Label abort, end; - bool require_abort_handling = false; - - if (fastmem_marker) { - // Use fastmem - const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); - - const auto location = code.getCurr(); - EmitReadMemoryMov(code, value_idx, src_ptr); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - conf.recompile_on_fastmem_failure, - }); - } else { - // Use page table - ASSERT(conf.page_table); - const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); - require_abort_handling = true; - EmitReadMemoryMov(code, value_idx, src_ptr); - } - code.L(end); - - if (require_abort_handling) { - code.SwitchToFarCode(); - code.L(abort); - code.call(wrapped_fn); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); - } - - if constexpr (bitsize == 128) { - ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); - } else { - ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); - } -} - -template -void A64EmitX64::EmitMemoryWrite(A64EmitContext& ctx, IR::Inst* inst) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const auto fastmem_marker = ShouldFastmem(ctx, inst); - - if (!conf.page_table && !fastmem_marker) { - // Neither fastmem nor page table: Use callbacks - if constexpr (bitsize == 128) { - ctx.reg_alloc.Use(args[0], ABI_PARAM2); - ctx.reg_alloc.Use(args[1], HostLoc::XMM1); - ctx.reg_alloc.EndOfAllocScope(); - ctx.reg_alloc.HostCall(nullptr); - code.CallFunction(memory_write_128); - } else { - ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); - Devirtualize(conf.callbacks).EmitCall(code); - } - return; - } - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const int value_idx = bitsize == 128 ? 
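
// What the EmitVAddrLookup fast path referenced above compiles down to,
// modelled in plain C++. Assumes a flat table of host pointers indexed by
// guest page; the 4 KiB page size is an assumption for illustration.
#include <cstdint>

static constexpr int kPageBits = 12;  // assumed page granularity

static std::uint8_t* HostPointer(std::uint8_t** page_table, std::uint64_t vaddr) {
    std::uint8_t* const page = page_table[vaddr >> kPageBits];
    if (page == nullptr) {
        return nullptr;  // the JIT-ed code branches to the abort label instead
    }
    return page + (vaddr & ((std::uint64_t{1} << kPageBits) - 1));
}
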
ctx.reg_alloc.UseXmm(args[1]).getIdx() : ctx.reg_alloc.UseGpr(args[1]).getIdx(); - - const auto wrapped_fn = write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; - - Xbyak::Label abort, end; - bool require_abort_handling = false; - - if (fastmem_marker) { - // Use fastmem - const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); - - const auto location = code.getCurr(); - EmitWriteMemoryMov(code, dest_ptr, value_idx); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - conf.recompile_on_fastmem_failure, - }); - } else { - // Use page table - ASSERT(conf.page_table); - const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); - require_abort_handling = true; - EmitWriteMemoryMov(code, dest_ptr, value_idx); - } - code.L(end); - - if (require_abort_handling) { - code.SwitchToFarCode(); - code.L(abort); - code.call(wrapped_fn); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); - } -} +#define Axx A64 +#include "emit_x64_memory.cpp.inc" +#undef Axx void A64EmitX64::EmitA64ReadMemory8(A64EmitContext& ctx, IR::Inst* inst) { EmitMemoryRead<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); @@ -467,295 +323,6 @@ void A64EmitX64::EmitA64WriteMemory128(A64EmitContext& ctx, IR::Inst* inst) { EmitMemoryWrite<128, &A64::UserCallbacks::MemoryWrite64>(ctx, inst); } -template -void A64EmitX64::EmitExclusiveReadMemory(A64EmitContext& ctx, IR::Inst* inst) { - ASSERT(conf.global_monitor != nullptr); - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - if constexpr (bitsize != 128) { - using T = mp::unsigned_integer_of_size; - - ctx.reg_alloc.HostCall(inst, {}, args[0]); - - code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); - code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); - code.CallLambda( - [](A64::UserConfig& conf, u64 vaddr) -> T { - return conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> T { - return (conf.callbacks->*callback)(vaddr); - }); - }); - code.ZeroExtendFrom(bitsize, code.ABI_RETURN); - } else { - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); - ctx.reg_alloc.Use(args[0], ABI_PARAM2); - ctx.reg_alloc.EndOfAllocScope(); - ctx.reg_alloc.HostCall(nullptr); - - code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); - code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); - ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); - code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); - code.CallLambda( - [](A64::UserConfig& conf, u64 vaddr, A64::Vector& ret) { - ret = conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> A64::Vector { - return (conf.callbacks->*callback)(vaddr); - }); - }); - code.movups(result, xword[rsp + ABI_SHADOW_SPACE]); - ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); - - ctx.reg_alloc.DefineValue(inst, result); - } -} - -template -void A64EmitX64::EmitExclusiveWriteMemory(A64EmitContext& ctx, IR::Inst* inst) { - ASSERT(conf.global_monitor != nullptr); - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - if constexpr (bitsize != 128) { - ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); - } else { - ctx.reg_alloc.Use(args[0], ABI_PARAM2); - ctx.reg_alloc.Use(args[1], HostLoc::XMM1); - ctx.reg_alloc.EndOfAllocScope(); - ctx.reg_alloc.HostCall(inst); - } - - Xbyak::Label end; - - code.mov(code.ABI_RETURN, u32(1)); - code.cmp(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); - 
code.je(end); - code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); - code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); - if constexpr (bitsize != 128) { - using T = mp::unsigned_integer_of_size; - - code.CallLambda( - [](A64::UserConfig& conf, u64 vaddr, T value) -> u32 { - return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr, - [&](T expected) -> bool { - return (conf.callbacks->*callback)(vaddr, value, expected); - }) - ? 0 - : 1; - }); - } else { - ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); - code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); - code.movaps(xword[code.ABI_PARAM3], xmm1); - code.CallLambda( - [](A64::UserConfig& conf, u64 vaddr, A64::Vector& value) -> u32 { - return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr, - [&](A64::Vector expected) -> bool { - return (conf.callbacks->*callback)(vaddr, value, expected); - }) - ? 0 - : 1; - }); - ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); - } - code.L(end); -} - -template -void A64EmitX64::EmitExclusiveReadMemoryInline(A64EmitContext& ctx, IR::Inst* inst) { - ASSERT(conf.global_monitor && conf.fastmem_pointer); - if (!exception_handler.SupportsFastmem()) { - EmitExclusiveReadMemory(ctx, inst); - return; - } - - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); - const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); - const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); - - const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; - - EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); - - code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); - code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); - code.mov(qword[tmp], vaddr); - - const auto fastmem_marker = ShouldFastmem(ctx, inst); - if (fastmem_marker) { - Xbyak::Label abort, end; - bool require_abort_handling = false; - - const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); - - const auto location = code.getCurr(); - EmitReadMemoryMov(code, value_idx, src_ptr); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - conf.recompile_on_exclusive_fastmem_failure, - }); - - code.L(end); - - if (require_abort_handling) { - code.SwitchToFarCode(); - code.L(abort); - code.call(wrapped_fn); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); - } - } else { - code.call(wrapped_fn); - } - - code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); - EmitWriteMemoryMov(code, tmp, value_idx); - - EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32()); - - if constexpr (bitsize == 128) { - ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); - } else { - ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); - } -} - -template -void A64EmitX64::EmitExclusiveWriteMemoryInline(A64EmitContext& ctx, IR::Inst* inst) { - ASSERT(conf.global_monitor && conf.fastmem_pointer); - if (!exception_handler.SupportsFastmem()) { - EmitExclusiveWriteMemory(ctx, inst); - return; - } - - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const auto value = [&] { - if constexpr (bitsize == 128) { - ctx.reg_alloc.ScratchGpr(HostLoc::RAX); - 
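
// Why the 128-bit path above pins RAX/RBX/RCX/RDX as scratch: lock cmpxchg16b
// compares RDX:RAX with the destination and stores RCX:RBX on a match.
// GCC/Clang expose the same operation on unsigned __int128 (requires -mcx16;
// shown only to document the register convention, not dynarmic code):
#include <cstdint>

static bool CompareExchange128(unsigned __int128* mem,
                               unsigned __int128& expected,  // goes in RDX:RAX
                               unsigned __int128 desired) {  // goes in RCX:RBX
    return __atomic_compare_exchange_n(mem, &expected, desired,
                                       false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}
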
ctx.reg_alloc.ScratchGpr(HostLoc::RBX); - ctx.reg_alloc.ScratchGpr(HostLoc::RCX); - ctx.reg_alloc.ScratchGpr(HostLoc::RDX); - return ctx.reg_alloc.UseXmm(args[1]); - } else { - ctx.reg_alloc.ScratchGpr(HostLoc::RAX); - return ctx.reg_alloc.UseGpr(args[1]); - } - }(); - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32(); - const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); - - const auto fallback_fn = exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; - - EmitExclusiveLock(code, conf, tmp, eax); - - Xbyak::Label end; - - code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); - code.mov(status, u32(1)); - code.cmp(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); - code.je(end, code.T_NEAR); - code.cmp(qword[tmp], vaddr); - code.jne(end, code.T_NEAR); - - EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax); - - code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); - code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); - - if constexpr (bitsize == 128) { - code.mov(rax, qword[tmp + 0]); - code.mov(rdx, qword[tmp + 8]); - if (code.HasHostFeature(HostFeature::SSE41)) { - code.movq(rbx, value); - code.pextrq(rcx, value, 1); - } else { - code.movaps(xmm0, value); - code.movq(rbx, xmm0); - code.punpckhqdq(xmm0, xmm0); - code.movq(rcx, xmm0); - } - } else { - EmitReadMemoryMov(code, rax.getIdx(), tmp); - } - - const auto fastmem_marker = ShouldFastmem(ctx, inst); - if (fastmem_marker) { - Xbyak::Label abort; - bool require_abort_handling = false; - - const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling, tmp); - - const auto location = code.getCurr(); - - if constexpr (bitsize == 128) { - code.lock(); - code.cmpxchg16b(ptr[dest_ptr]); - } else { - switch (bitsize) { - case 8: - code.lock(); - code.cmpxchg(code.byte[dest_ptr], value.cvt8()); - break; - case 16: - code.lock(); - code.cmpxchg(word[dest_ptr], value.cvt16()); - break; - case 32: - code.lock(); - code.cmpxchg(dword[dest_ptr], value.cvt32()); - break; - case 64: - code.lock(); - code.cmpxchg(qword[dest_ptr], value.cvt64()); - break; - default: - UNREACHABLE(); - } - } - - code.setnz(status.cvt8()); - - code.SwitchToFarCode(); - code.L(abort); - code.call(fallback_fn); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(fallback_fn), - *fastmem_marker, - conf.recompile_on_exclusive_fastmem_failure, - }); - - code.cmp(al, 0); - code.setz(status.cvt8()); - code.movzx(status.cvt32(), status.cvt8()); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); - } else { - code.call(fallback_fn); - code.cmp(al, 0); - code.setz(status.cvt8()); - code.movzx(status.cvt32(), status.cvt8()); - } - - code.L(end); - - EmitExclusiveUnlock(code, conf, tmp, eax); - - ctx.reg_alloc.DefineValue(inst, status); -} - void A64EmitX64::EmitA64ClearExclusive(A64EmitContext&, IR::Inst*) { code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); } diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc new file mode 100755 index 000000000..74899034c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc @@ -0,0 +1,497 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/common/macro_util.h"
+
+#define AxxEmitX64 CONCATENATE_TOKENS(Axx, EmitX64)
+#define AxxEmitContext CONCATENATE_TOKENS(Axx, EmitContext)
+#define AxxJitState CONCATENATE_TOKENS(Axx, JitState)
+#define AxxUserConfig Axx::UserConfig
+
+namespace {
+using Vector = std::array<u64, 2>;
+}
+
+std::optional<AxxEmitX64::DoNotFastmemMarker> AxxEmitX64::ShouldFastmem(AxxEmitContext& ctx, IR::Inst* inst) const {
+    if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) {
+        return std::nullopt;
+    }
+
+    const auto marker = std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst));
+    if (do_not_fastmem.count(marker) > 0) {
+        return std::nullopt;
+    }
+    return marker;
+}
+
+FakeCall AxxEmitX64::FastmemCallback(u64 rip_) {
+    const auto iter = fastmem_patch_info.find(rip_);
+
+    if (iter == fastmem_patch_info.end()) {
+        fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_);
+        fmt::print("Segfault wasn't at a fastmem patch location!\n");
+        fmt::print("Now dumping code.......\n\n");
+        Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000);
+        ASSERT_FALSE("iter != fastmem_patch_info.end()");
+    }
+
+    if (iter->second.recompile) {
+        const auto marker = iter->second.marker;
+        do_not_fastmem.emplace(marker);
+        InvalidateBasicBlocks({std::get<0>(marker)});
+    }
+
+    return FakeCall{
+        .call_rip = iter->second.callback,
+        .ret_rip = iter->second.resume_rip,
+    };
+}
+
+template<size_t bitsize, auto callback>
+void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const bool ordered = IsOrdered(args[1].GetImmediateAccType());
+    const auto fastmem_marker = ShouldFastmem(ctx, inst);
+
+    if (!conf.page_table && !fastmem_marker) {
+        // Neither fastmem nor page table: Use callbacks
+        if constexpr (bitsize == 128) {
+            ctx.reg_alloc.HostCall(nullptr, {}, args[0]);
+            if (ordered) {
+                code.mfence();
+            }
+            code.CallFunction(memory_read_128);
+            ctx.reg_alloc.DefineValue(inst, xmm1);
+        } else {
+            ctx.reg_alloc.HostCall(inst, {}, args[0]);
+            if (ordered) {
+                code.mfence();
+            }
+            Devirtualize<callback>(conf.callbacks).EmitCall(code);
+            code.ZeroExtendFrom(bitsize, code.ABI_RETURN);
+        }
+        return;
+    }
+
+    const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]);
+    const int value_idx = bitsize == 128 ?
ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); + + const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; + + Xbyak::Label abort, end; + bool require_abort_handling = false; + + if (fastmem_marker) { + // Use fastmem + const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); + + const auto location = EmitReadMemoryMov(code, value_idx, src_ptr, ordered); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(wrapped_fn), + *fastmem_marker, + conf.recompile_on_fastmem_failure, + }); + } else { + // Use page table + ASSERT(conf.page_table); + const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); + require_abort_handling = true; + EmitReadMemoryMov(code, value_idx, src_ptr, ordered); + } + code.L(end); + + if (require_abort_handling) { + code.SwitchToFarCode(); + code.L(abort); + code.call(wrapped_fn); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); + } + + if constexpr (bitsize == 128) { + ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); + } else { + ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); + } +} + +template +void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[2].GetImmediateAccType()); + const auto fastmem_marker = ShouldFastmem(ctx, inst); + + if (!conf.page_table && !fastmem_marker) { + // Neither fastmem nor page table: Use callbacks + if constexpr (bitsize == 128) { + ctx.reg_alloc.Use(args[0], ABI_PARAM2); + ctx.reg_alloc.Use(args[1], HostLoc::XMM1); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + code.CallFunction(memory_write_128); + } else { + ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); + Devirtualize(conf.callbacks).EmitCall(code); + } + if (ordered) { + code.mfence(); + } + return; + } + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const int value_idx = bitsize == 128 + ? ctx.reg_alloc.UseXmm(args[1]).getIdx() + : (ordered ? 
ctx.reg_alloc.UseScratchGpr(args[1]).getIdx() : ctx.reg_alloc.UseGpr(args[1]).getIdx()); + + const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; + + Xbyak::Label abort, end; + bool require_abort_handling = false; + + if (fastmem_marker) { + // Use fastmem + const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); + + const auto location = EmitWriteMemoryMov(code, dest_ptr, value_idx, ordered); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(wrapped_fn), + *fastmem_marker, + conf.recompile_on_fastmem_failure, + }); + } else { + // Use page table + ASSERT(conf.page_table); + const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); + require_abort_handling = true; + EmitWriteMemoryMov(code, dest_ptr, value_idx, ordered); + } + code.L(end); + + if (require_abort_handling) { + code.SwitchToFarCode(); + code.L(abort); + code.call(wrapped_fn); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); + } +} + +template +void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor != nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[1].GetImmediateAccType()); + + if constexpr (bitsize != 128) { + using T = mp::unsigned_integer_of_size; + + ctx.reg_alloc.HostCall(inst, {}, args[0]); + + code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1)); + code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); + if (ordered) { + code.mfence(); + } + code.CallLambda( + [](AxxUserConfig& conf, Axx::VAddr vaddr) -> T { + return conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> T { + return (conf.callbacks->*callback)(vaddr); + }); + }); + code.ZeroExtendFrom(bitsize, code.ABI_RETURN); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.Use(args[0], ABI_PARAM2); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + + code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1)); + code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); + ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); + if (ordered) { + code.mfence(); + } + code.CallLambda( + [](AxxUserConfig& conf, Axx::VAddr vaddr, Vector& ret) { + ret = conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> Vector { + return (conf.callbacks->*callback)(vaddr); + }); + }); + code.movups(result, xword[rsp + ABI_SHADOW_SPACE]); + ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +template +void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor != nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const bool ordered = IsOrdered(args[2].GetImmediateAccType()); + + if constexpr (bitsize != 128) { + ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); + } else { + ctx.reg_alloc.Use(args[0], ABI_PARAM2); + ctx.reg_alloc.Use(args[1], HostLoc::XMM1); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(inst); + } + + Xbyak::Label end; + + code.mov(code.ABI_RETURN, u32(1)); + code.cmp(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0)); + code.je(end); + code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0)); + code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); + if 
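
// Note on the write path above: when the store is ordered, the value register
// is taken as a scratch use (UseScratchGpr) rather than a plain use. Our
// reading, not stated in the patch: a sequentially consistent store on x86 is
// cheapest as xchg, which is implicitly locked and destroys the register it
// stores, so the allocator must be free to clobber it.
#include <atomic>
#include <cstdint>

static void StoreOrdered(std::atomic<std::uint64_t>& mem, std::uint64_t v) {
    mem.store(v, std::memory_order_seq_cst);  // typically compiles to xchg
}
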
+template<std::size_t bitsize, auto callback>
+void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) {
+    ASSERT(conf.global_monitor != nullptr);
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const bool ordered = IsOrdered(args[2].GetImmediateAccType());
+
+    if constexpr (bitsize != 128) {
+        ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]);
+    } else {
+        ctx.reg_alloc.Use(args[0], ABI_PARAM2);
+        ctx.reg_alloc.Use(args[1], HostLoc::XMM1);
+        ctx.reg_alloc.EndOfAllocScope();
+        ctx.reg_alloc.HostCall(inst);
+    }
+
+    Xbyak::Label end;
+
+    code.mov(code.ABI_RETURN, u32(1));
+    code.cmp(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
+    code.je(end);
+    code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
+    code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf));
+    if constexpr (bitsize != 128) {
+        using T = mp::unsigned_integer_of_size<bitsize>;
+
+        code.CallLambda(
+            [](AxxUserConfig& conf, Axx::VAddr vaddr, T value) -> u32 {
+                return conf.global_monitor->DoExclusiveOperation<T>(conf.processor_id, vaddr,
+                                                                    [&](T expected) -> bool {
+                                                                        return (conf.callbacks->*callback)(vaddr, value, expected);
+                                                                    })
+                         ? 0
+                         : 1;
+            });
+        if (ordered) {
+            code.mfence();
+        }
+    } else {
+        ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE);
+        code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]);
+        code.movaps(xword[code.ABI_PARAM3], xmm1);
+        code.CallLambda(
+            [](AxxUserConfig& conf, Axx::VAddr vaddr, Vector& value) -> u32 {
+                return conf.global_monitor->DoExclusiveOperation<Vector>(conf.processor_id, vaddr,
+                                                                         [&](Vector expected) -> bool {
+                                                                             return (conf.callbacks->*callback)(vaddr, value, expected);
+                                                                         })
+                         ? 0
+                         : 1;
+            });
+        if (ordered) {
+            code.mfence();
+        }
+        ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE);
+    }
+    code.L(end);
+}
+
+template<std::size_t bitsize, auto callback>
+void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* inst) {
+    ASSERT(conf.global_monitor && conf.fastmem_pointer);
+    if (!exception_handler.SupportsFastmem()) {
+        EmitExclusiveReadMemory<bitsize, callback>(ctx, inst);
+        return;
+    }
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const bool ordered = IsOrdered(args[1].GetImmediateAccType());
+
+    const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]);
+    const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx();
+    const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
+    const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr();
+
+    const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
+
+    EmitExclusiveLock(code, conf, tmp, tmp2.cvt32());
+
+    code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1));
+    code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id)));
+    code.mov(qword[tmp], vaddr);
+
+    const auto fastmem_marker = ShouldFastmem(ctx, inst);
+    if (fastmem_marker) {
+        Xbyak::Label abort, end;
+        bool require_abort_handling = false;
+
+        const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling);
+
+        const auto location = EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr, ordered);
+
+        fastmem_patch_info.emplace(
+            Common::BitCast<u64>(location),
+            FastmemPatchInfo{
+                Common::BitCast<u64>(code.getCurr()),
+                Common::BitCast<u64>(wrapped_fn),
+                *fastmem_marker,
+                conf.recompile_on_exclusive_fastmem_failure,
+            });
+
+        code.L(end);
+
+        if (require_abort_handling) {
+            code.SwitchToFarCode();
+            code.L(abort);
+            code.call(wrapped_fn);
+            code.jmp(end, code.T_NEAR);
+            code.SwitchToNearCode();
+        }
+    } else {
+        code.call(wrapped_fn);
+    }
+
+    code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id)));
+    EmitWriteMemoryMov<bitsize>(code, tmp, value_idx, false);
+
+    EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32());
+
+    if constexpr (bitsize == 128) {
+        ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx});
+    } else {
+        ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx});
+    }
+}
+
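+// Inline store-exclusive. This re-checks exclusive_state and the monitored
+// address in generated code, loads the expected value from the monitor into
+// RAX (RDX:RAX for 128-bit), and attempts a lock cmpxchg / cmpxchg16b
+// directly against guest memory; `status` ends up 0 on success, 1 otherwise.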
+template<std::size_t bitsize, auto callback>
+void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* inst) {
+    ASSERT(conf.global_monitor && conf.fastmem_pointer);
+    if (!exception_handler.SupportsFastmem()) {
+        EmitExclusiveWriteMemory<bitsize, callback>(ctx, inst);
+        return;
+    }
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const bool ordered = IsOrdered(args[2].GetImmediateAccType());
+
+    const auto value = [&] {
+        if constexpr (bitsize == 128) {
+            ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+            ctx.reg_alloc.ScratchGpr(HostLoc::RBX);
+            ctx.reg_alloc.ScratchGpr(HostLoc::RCX);
+            ctx.reg_alloc.ScratchGpr(HostLoc::RDX);
+            return ctx.reg_alloc.UseXmm(args[1]);
+        } else {
+            ctx.reg_alloc.ScratchGpr(HostLoc::RAX);
+            return ctx.reg_alloc.UseGpr(args[1]);
+        }
+    }();
+    const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]);
+    const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32();
+    const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr();
+
+    const auto wrapped_fn = exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value.getIdx())];
+
+    EmitExclusiveLock(code, conf, tmp, eax);
+
+    Xbyak::Label end;
+
+    code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id)));
+    code.mov(status, u32(1));
+    code.cmp(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
+    code.je(end, code.T_NEAR);
+    code.cmp(qword[tmp], vaddr);
+    code.jne(end, code.T_NEAR);
+
+    EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax);
+
+    code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0));
+    code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id)));
+
+    if constexpr (bitsize == 128) {
+        code.mov(rax, qword[tmp + 0]);
+        code.mov(rdx, qword[tmp + 8]);
+        if (code.HasHostFeature(HostFeature::SSE41)) {
+            code.movq(rbx, value);
+            code.pextrq(rcx, value, 1);
+        } else {
+            code.movaps(xmm0, value);
+            code.movq(rbx, xmm0);
+            code.punpckhqdq(xmm0, xmm0);
+            code.movq(rcx, xmm0);
+        }
+    } else {
+        EmitReadMemoryMov<bitsize>(code, rax.getIdx(), tmp, false);
+    }
+
+    const auto fastmem_marker = ShouldFastmem(ctx, inst);
+    if (fastmem_marker) {
+        Xbyak::Label abort;
+        bool require_abort_handling = false;
+
+        const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling, tmp);
+
+        const auto location = code.getCurr();
+
+        if constexpr (bitsize == 128) {
+            code.lock();
+            code.cmpxchg16b(ptr[dest_ptr]);
+        } else {
+            switch (bitsize) {
+            case 8:
+                code.lock();
+                code.cmpxchg(code.byte[dest_ptr], value.cvt8());
+                break;
+            case 16:
+                code.lock();
+                code.cmpxchg(word[dest_ptr], value.cvt16());
+                break;
+            case 32:
+                code.lock();
+                code.cmpxchg(dword[dest_ptr], value.cvt32());
+                break;
+            case 64:
+                code.lock();
+                code.cmpxchg(qword[dest_ptr], value.cvt64());
+                break;
+            default:
+                UNREACHABLE();
+            }
+        }
+
+        code.setnz(status.cvt8());
+
+        code.SwitchToFarCode();
+        code.L(abort);
+        code.call(wrapped_fn);
+
+        fastmem_patch_info.emplace(
+            Common::BitCast<u64>(location),
+            FastmemPatchInfo{
+                Common::BitCast<u64>(code.getCurr()),
+                Common::BitCast<u64>(wrapped_fn),
+                *fastmem_marker,
+                conf.recompile_on_exclusive_fastmem_failure,
+            });
+
+        code.cmp(al, 0);
+        code.setz(status.cvt8());
+        code.movzx(status.cvt32(), status.cvt8());
+        code.jmp(end, code.T_NEAR);
+        code.SwitchToNearCode();
+    } else {
+        code.call(wrapped_fn);
+        code.cmp(al, 0);
+        code.setz(status.cvt8());
+        code.movzx(status.cvt32(), status.cvt8());
+    }
+
+    code.L(end);
+
+    EmitExclusiveUnlock(code, conf, tmp, eax);
+
+    ctx.reg_alloc.DefineValue(inst, status);
+}
+
+#undef AxxEmitX64
+#undef AxxEmitContext
+#undef AxxJitState
+#undef AxxUserConfig
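A note on the scheme implemented by the two inline functions above: they emulate the guest's load-/store-exclusive pairs with a compare-and-swap against the value captured at load time. A minimal standalone sketch of that contract (hypothetical names, one 64-bit location, no cross-core monitor locking) might look like:

    #include <atomic>
    #include <cstdint>

    thread_local bool excl_state = false;      // mirrors JitState::exclusive_state
    thread_local std::uint64_t excl_addr = 0;  // mirrors the monitor's address slot
    thread_local std::uint64_t excl_value = 0; // mirrors the monitor's value slot

    std::uint64_t load_exclusive(std::atomic<std::uint64_t>& mem, std::uint64_t vaddr) {
        excl_state = true;
        excl_addr = vaddr;                     // record where and...
        excl_value = mem.load();               // ...what we read
        return excl_value;
    }

    // Returns 0 on success and 1 on failure, matching the emitted status value.
    std::uint32_t store_exclusive(std::atomic<std::uint64_t>& mem, std::uint64_t vaddr,
                                  std::uint64_t desired) {
        if (!excl_state || excl_addr != vaddr)
            return 1;
        excl_state = false;
        return mem.compare_exchange_strong(excl_value, desired) ? 0 : 1;  // lock cmpxchg
    }

The 128-bit case is the same idea via lock cmpxchg16b, which is why the emitter pins RAX/RBX/RCX/RDX: that instruction takes the expected value in RDX:RAX, the desired value in RCX:RBX, and reports success in ZF.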
"dynarmic/backend/x64/exclusive_monitor_friend.h" #include "dynarmic/common/spin_lock_x64.h" #include "dynarmic/interface/exclusive_monitor.h" +#include "dynarmic/ir/acc_type.h" namespace Dynarmic::Backend::X64 { @@ -198,49 +199,113 @@ template<> } template -void EmitReadMemoryMov(BlockOfCode& code, int value_idx, const Xbyak::RegExp& addr) { +const void* EmitReadMemoryMov(BlockOfCode& code, int value_idx, const Xbyak::RegExp& addr, bool ordered) { + if (ordered) { + if constexpr (bitsize == 128) { + code.mfence(); + } else { + code.xor_(Xbyak::Reg32{value_idx}, Xbyak::Reg32{value_idx}); + } + + const void* fastmem_location = code.getCurr(); + switch (bitsize) { + case 8: + code.lock(); + code.xadd(code.byte[addr], Xbyak::Reg32{value_idx}.cvt8()); + break; + case 16: + code.lock(); + code.xadd(word[addr], Xbyak::Reg32{value_idx}); + break; + case 32: + code.lock(); + code.xadd(dword[addr], Xbyak::Reg32{value_idx}); + break; + case 64: + code.lock(); + code.xadd(qword[addr], Xbyak::Reg64{value_idx}); + break; + case 128: + // TODO (HACK): Detect CPUs where this load is not atomic + code.movaps(Xbyak::Xmm{value_idx}, xword[addr]); + break; + default: + ASSERT_FALSE("Invalid bitsize"); + } + return fastmem_location; + } + + const void* fastmem_location = code.getCurr(); switch (bitsize) { case 8: code.movzx(Xbyak::Reg32{value_idx}, code.byte[addr]); - return; + break; case 16: code.movzx(Xbyak::Reg32{value_idx}, word[addr]); - return; + break; case 32: code.mov(Xbyak::Reg32{value_idx}, dword[addr]); - return; + break; case 64: code.mov(Xbyak::Reg64{value_idx}, qword[addr]); - return; + break; case 128: code.movups(Xbyak::Xmm{value_idx}, xword[addr]); - return; + break; default: ASSERT_FALSE("Invalid bitsize"); } + return fastmem_location; } template -void EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int value_idx) { +const void* EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int value_idx, bool ordered) { + if (ordered) { + const void* fastmem_location = code.getCurr(); + switch (bitsize) { + case 8: + code.xchg(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8()); + break; + case 16: + code.xchg(word[addr], Xbyak::Reg16{value_idx}); + break; + case 32: + code.xchg(dword[addr], Xbyak::Reg32{value_idx}); + break; + case 64: + code.xchg(qword[addr], Xbyak::Reg64{value_idx}); + break; + case 128: + code.movaps(xword[addr], Xbyak::Xmm{value_idx}); + code.mfence(); + break; + default: + ASSERT_FALSE("Invalid bitsize"); + } + return fastmem_location; + } + + const void* fastmem_location = code.getCurr(); switch (bitsize) { case 8: code.mov(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8()); - return; + break; case 16: code.mov(word[addr], Xbyak::Reg16{value_idx}); - return; + break; case 32: code.mov(dword[addr], Xbyak::Reg32{value_idx}); - return; + break; case 64: code.mov(qword[addr], Xbyak::Reg64{value_idx}); - return; + break; case 128: code.movups(xword[addr], Xbyak::Xmm{value_idx}); - return; + break; default: ASSERT_FALSE("Invalid bitsize"); } + return fastmem_location; } template @@ -284,6 +349,10 @@ void EmitExclusiveTestAndClear(BlockOfCode& code, const UserConfig& conf, Xbyak: } } +inline bool IsOrdered(IR::AccType acctype) { + return acctype == IR::AccType::ORDERED || acctype == IR::AccType::ORDEREDRW || acctype == IR::AccType::LIMITEDORDERED; +} + } // namespace } // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp index 
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
index ae53a620a..7ff8c7807 100755
--- a/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
+++ b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
@@ -208,6 +208,11 @@ IR::Cond Argument::GetImmediateCond() const {
     return value.GetCond();
 }
 
+IR::AccType Argument::GetImmediateAccType() const {
+    ASSERT(IsImmediate() && GetType() == IR::Type::AccType);
+    return value.GetAccType();
+}
+
 bool Argument::IsInGpr() const {
     if (IsImmediate())
         return false;
diff --git a/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.h b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.h
index 58bfe87f3..5002932c5 100755
--- a/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.h
+++ b/externals/dynarmic/src/dynarmic/backend/x64/reg_alloc.h
@@ -21,6 +21,10 @@
 #include "dynarmic/ir/microinstruction.h"
 #include "dynarmic/ir/value.h"
 
+namespace Dynarmic::IR {
+enum class AccType;
+}  // namespace Dynarmic::IR
+
 namespace Dynarmic::Backend::X64 {
 
 class RegAlloc;
@@ -75,6 +79,7 @@ public:
     u64 GetImmediateS32() const;
     u64 GetImmediateU64() const;
     IR::Cond GetImmediateCond() const;
+    IR::AccType GetImmediateAccType() const;
 
     /// Is this value currently in a GPR?
     bool IsInGpr() const;
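A small C++ detail makes the reg_alloc.h hunk work: a scoped enum may be forward-declared, because enum class AccType has a fixed underlying type (int by default) and is therefore a complete type from the declaration alone. The header can thus expose GetImmediateAccType() without pulling in dynarmic/ir/acc_type.h. A minimal illustration (simplified, not the actual headers):

    namespace Dynarmic::IR {
    enum class AccType;  // opaque declaration; definition lives in ir/acc_type.h
    }  // namespace Dynarmic::IR

    struct Argument {
        // Fine to declare, and even to return by value, without the definition:
        Dynarmic::IR::AccType GetImmediateAccType() const;
    };

Only translation units that inspect individual enumerators, such as reg_alloc.cpp and the memory emitters via IsOrdered, need the full definition.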