early-access version 2621
		| @@ -1,7 +1,7 @@ | ||||
| yuzu emulator early access | ||||
| ============= | ||||
|  | ||||
| This is the source code for early-access 2620. | ||||
| This is the source code for early-access 2621. | ||||
|  | ||||
| ## Legal Notice | ||||
|  | ||||
|   | ||||
| @@ -285,6 +285,7 @@ if (ARCHITECTURE STREQUAL "x86_64") | ||||
|         backend/x64/emit_x64_data_processing.cpp | ||||
|         backend/x64/emit_x64_floating_point.cpp | ||||
|         backend/x64/emit_x64_memory.h | ||||
|         backend/x64/emit_x64_memory.cpp.inc | ||||
|         backend/x64/emit_x64_packed.cpp | ||||
|         backend/x64/emit_x64_saturation.cpp | ||||
|         backend/x64/emit_x64_sm4.cpp | ||||
|   | ||||
| @@ -71,9 +71,12 @@ protected: | ||||
|     std::array<FastDispatchEntry, fast_dispatch_table_size> fast_dispatch_table; | ||||
|     void ClearFastDispatchTable(); | ||||
|  | ||||
|     std::map<std::tuple<size_t, int, int>, void (*)()> read_fallbacks; | ||||
|     std::map<std::tuple<size_t, int, int>, void (*)()> write_fallbacks; | ||||
|     std::map<std::tuple<size_t, int, int>, void (*)()> exclusive_write_fallbacks; | ||||
|     void (*memory_read_128)() = nullptr;   // Dummy | ||||
|     void (*memory_write_128)() = nullptr;  // Dummy | ||||
|  | ||||
|     std::map<std::tuple<bool, size_t, int, int>, void (*)()> read_fallbacks; | ||||
|     std::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks; | ||||
|     std::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks; | ||||
|     void GenFastmemFallbacks(); | ||||
|  | ||||
|     const void* terminal_handler_pop_rsb_hint; | ||||
| @@ -99,7 +102,7 @@ protected: | ||||
|         u64 resume_rip; | ||||
|         u64 callback; | ||||
|         DoNotFastmemMarker marker; | ||||
|         bool compile; | ||||
|         bool recompile; | ||||
|     }; | ||||
|     tsl::robin_map<u64, FastmemPatchInfo> fastmem_patch_info; | ||||
|     std::set<DoNotFastmemMarker> do_not_fastmem; | ||||
| @@ -112,13 +115,13 @@ protected: | ||||
|     template<std::size_t bitsize, auto callback> | ||||
|     void EmitMemoryWrite(A32EmitContext& ctx, IR::Inst* inst); | ||||
|     template<std::size_t bitsize, auto callback> | ||||
|     void ExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst); | ||||
|     void EmitExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst); | ||||
|     template<std::size_t bitsize, auto callback> | ||||
|     void ExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst); | ||||
|     void EmitExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst); | ||||
|     template<std::size_t bitsize, auto callback> | ||||
|     void ExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst); | ||||
|     void EmitExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst); | ||||
|     template<std::size_t bitsize, auto callback> | ||||
|     void ExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst); | ||||
|     void EmitExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst); | ||||
|  | ||||
|     // Terminal instruction emitters | ||||
|     void EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location); | ||||
|   | ||||
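The header hunk above gives every fallback-thunk table a new leading bool in its key: each (bitsize, vaddr register, value register) combination now exists in an unordered and an ordered flavour, the latter presumably for guest accesses that need acquire/release ordering. A minimal sketch of a lookup keyed this way, with an assumed wrapper name rather than the actual call sites:

    // Sketch only: the wrapper and globals here are illustrative.
    #include <cstddef>
    #include <map>
    #include <tuple>

    using Thunk = void (*)();
    std::map<std::tuple<bool, std::size_t, int, int>, Thunk> read_fallbacks;

    Thunk GetReadFallback(bool ordered, std::size_t bitsize, int vaddr_idx, int value_idx) {
        // ordered == true selects the variant generated with a fence
        // (see the GenFastmemFallbacks hunks below).
        return read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)];
    }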
| @@ -47,222 +47,93 @@ void A32EmitX64::GenFastmemFallbacks() { | ||||
|         {64, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, | ||||
|     }}; | ||||
|  | ||||
|     for (int vaddr_idx : idxes) { | ||||
|         for (int value_idx : idxes) { | ||||
|             for (const auto& [bitsize, callback] : read_callbacks) { | ||||
|                 code.align(); | ||||
|                 read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                 ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); | ||||
|                 if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                     code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                 } | ||||
|                 callback.EmitCall(code); | ||||
|                 if (value_idx != code.ABI_RETURN.getIdx()) { | ||||
|                     code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); | ||||
|                 } | ||||
|                 ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); | ||||
|                 code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); | ||||
|                 code.ret(); | ||||
|                 PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_read_fallback_{}", bitsize)); | ||||
|             } | ||||
|  | ||||
|             for (const auto& [bitsize, callback] : write_callbacks) { | ||||
|                 code.align(); | ||||
|                 write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                 ABI_PushCallerSaveRegistersAndAdjustStack(code); | ||||
|                 if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { | ||||
|                     code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); | ||||
|                 } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { | ||||
|                     code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                     if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                     } | ||||
|                 } else { | ||||
|                     if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                     } | ||||
|     for (bool ordered : {false, true}) { | ||||
|         for (int vaddr_idx : idxes) { | ||||
|             for (int value_idx : idxes) { | ||||
|                 for (const auto& [bitsize, callback] : read_callbacks) { | ||||
|                     code.align(); | ||||
|                     read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                     ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); | ||||
|                     if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                     } | ||||
|                     if (ordered) { | ||||
|                         code.mfence(); | ||||
|                     } | ||||
|                     callback.EmitCall(code); | ||||
|                     if (value_idx != code.ABI_RETURN.getIdx()) { | ||||
|                         code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); | ||||
|                     } | ||||
|                     ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); | ||||
|                     code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); | ||||
|                     code.ret(); | ||||
|                     PerfMapRegister(read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_read_fallback_{}", bitsize)); | ||||
|                 } | ||||
|                 code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); | ||||
|                 callback.EmitCall(code); | ||||
|                 ABI_PopCallerSaveRegistersAndAdjustStack(code); | ||||
|                 code.ret(); | ||||
|                 PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_write_fallback_{}", bitsize)); | ||||
|             } | ||||
|  | ||||
|             for (const auto& [bitsize, callback] : exclusive_write_callbacks) { | ||||
|                 code.align(); | ||||
|                 exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                 ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); | ||||
|                 if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { | ||||
|                     code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); | ||||
|                 } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { | ||||
|                     code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                     if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                     } | ||||
|                 } else { | ||||
|                     if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                     } | ||||
|                     if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                 for (const auto& [bitsize, callback] : write_callbacks) { | ||||
|                     code.align(); | ||||
|                     write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                     ABI_PushCallerSaveRegistersAndAdjustStack(code); | ||||
|                     if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { | ||||
|                         code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); | ||||
|                     } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                         if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                         } | ||||
|                     } else { | ||||
|                         if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                         } | ||||
|                         if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                         } | ||||
|                     } | ||||
|                     code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); | ||||
|                     callback.EmitCall(code); | ||||
|                     if (ordered) { | ||||
|                         code.mfence(); | ||||
|                     } | ||||
|                     ABI_PopCallerSaveRegistersAndAdjustStack(code); | ||||
|                     code.ret(); | ||||
|                     PerfMapRegister(write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_write_fallback_{}", bitsize)); | ||||
|                 } | ||||
|  | ||||
|                 for (const auto& [bitsize, callback] : exclusive_write_callbacks) { | ||||
|                     code.align(); | ||||
|                     exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                     ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); | ||||
|                     if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { | ||||
|                         code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); | ||||
|                     } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                         if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                         } | ||||
|                     } else { | ||||
|                         if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                         } | ||||
|                         if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                         } | ||||
|                     } | ||||
|                     code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); | ||||
|                     code.mov(code.ABI_PARAM4, rax); | ||||
|                     code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); | ||||
|                     callback.EmitCall(code); | ||||
|                     ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); | ||||
|                     code.ret(); | ||||
|                     PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_exclusive_write_fallback_{}", bitsize)); | ||||
|                 } | ||||
|                 code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); | ||||
|                 code.mov(code.ABI_PARAM4, rax); | ||||
|                 code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); | ||||
|                 callback.EmitCall(code); | ||||
|                 ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); | ||||
|                 code.ret(); | ||||
|                 PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_exclusive_write_fallback_{}", bitsize)); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
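In the regenerated A32 fallbacks, the new outer for (bool ordered : {false, true}) loop emits every thunk twice; the ordered variants add an mfence before the read callback and after the write callback, a conservative way to give the guest access acquire/release-like ordering on x86. A rough C++ analogue of the intent, not the emitted machine code:

    #include <atomic>
    #include <cstdint>

    std::uint32_t OrderedRead32(std::uint32_t (*read)(std::uint32_t), std::uint32_t vaddr) {
        std::atomic_thread_fence(std::memory_order_seq_cst);  // mfence before the load
        return read(vaddr);
    }

    void OrderedWrite32(void (*write)(std::uint32_t, std::uint32_t),
                        std::uint32_t vaddr, std::uint32_t value) {
        write(vaddr, value);
        std::atomic_thread_fence(std::memory_order_seq_cst);  // mfence after the store
    }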
|  | ||||
| std::optional<A32EmitX64::DoNotFastmemMarker> A32EmitX64::ShouldFastmem(A32EmitContext& ctx, IR::Inst* inst) const { | ||||
|     if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) { | ||||
|         return std::nullopt; | ||||
|     } | ||||
|  | ||||
|     const auto marker = std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst)); | ||||
|     if (do_not_fastmem.count(marker) > 0) { | ||||
|         return std::nullopt; | ||||
|     } | ||||
|     return marker; | ||||
| } | ||||
|  | ||||
| FakeCall A32EmitX64::FastmemCallback(u64 rip_) { | ||||
|     const auto iter = fastmem_patch_info.find(rip_); | ||||
|  | ||||
|     if (iter == fastmem_patch_info.end()) { | ||||
|         fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_); | ||||
|         fmt::print("Segfault wasn't at a fastmem patch location!\n"); | ||||
|         fmt::print("Now dumping code.......\n\n"); | ||||
|         Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000); | ||||
|         ASSERT_FALSE("iter != fastmem_patch_info.end()"); | ||||
|     } | ||||
|  | ||||
|     if (iter->second.compile) { | ||||
|         const auto marker = iter->second.marker; | ||||
|         do_not_fastmem.emplace(marker); | ||||
|         InvalidateBasicBlocks({std::get<0>(marker)}); | ||||
|     } | ||||
|  | ||||
|     return FakeCall{ | ||||
|         .call_rip = iter->second.callback, | ||||
|         .ret_rip = iter->second.resume_rip, | ||||
|     }; | ||||
| } | ||||
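FastmemCallback, shown here in its pre-split A32 form, is the fault handler's policy hook: the renamed recompile flag, captured from conf.recompile_on_fastmem_failure at emit time, decides whether a faulting fastmem access also blacklists its location and invalidates the block, or is merely redirected to the fallback thunk. A self-contained sketch of that policy with illustrative types, not the dynarmic API:

    #include <cstdint>
    #include <set>
    #include <utility>

    struct PatchInfoSketch {
        std::uint64_t resume_rip;  // where to continue after the fallback
        std::uint64_t callback;    // fallback thunk to call instead
        std::uint64_t marker;      // stands in for the (location, inst offset) tuple
        bool recompile;            // conf.recompile_on_fastmem_failure at emit time
    };

    std::set<std::uint64_t> do_not_fastmem_sketch;

    // Returns {address to call, address to resume at} for the faulting access.
    std::pair<std::uint64_t, std::uint64_t> OnFastmemFault(const PatchInfoSketch& info) {
        if (info.recompile) {
            // Blacklist the location; the block is recompiled without fastmem.
            do_not_fastmem_sketch.insert(info.marker);
        }
        return {info.callback, info.resume_rip};
    }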
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void A32EmitX64::EmitMemoryRead(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|  | ||||
|     if (!conf.page_table && !fastmem_marker) { | ||||
|         // Neither fastmem nor page table: Use callbacks | ||||
|         ctx.reg_alloc.HostCall(inst, {}, args[0]); | ||||
|         Devirtualize<callback>(conf.callbacks).EmitCall(code); | ||||
|         code.ZeroExtendFrom(bitsize, code.ABI_RETURN); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const Xbyak::Reg64 value = ctx.reg_alloc.ScratchGpr(); | ||||
|  | ||||
|     const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; | ||||
|  | ||||
|     if (fastmem_marker) { | ||||
|         // Use fastmem | ||||
|         const auto src_ptr = r13 + vaddr; | ||||
|  | ||||
|         const auto location = code.getCurr(); | ||||
|         EmitReadMemoryMov<bitsize>(code, value.getIdx(), src_ptr); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(wrapped_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_fastmem_failure, | ||||
|             }); | ||||
|  | ||||
|         ctx.reg_alloc.DefineValue(inst, value); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     // Use page table | ||||
|     ASSERT(conf.page_table); | ||||
|     Xbyak::Label abort, end; | ||||
|  | ||||
|     const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); | ||||
|     EmitReadMemoryMov<bitsize>(code, value.getIdx(), src_ptr); | ||||
|     code.L(end); | ||||
|  | ||||
|     code.SwitchToFarCode(); | ||||
|     code.L(abort); | ||||
|     code.call(wrapped_fn); | ||||
|     code.jmp(end, code.T_NEAR); | ||||
|     code.SwitchToNearCode(); | ||||
|  | ||||
|     ctx.reg_alloc.DefineValue(inst, value); | ||||
| } | ||||
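EmitMemoryRead above, in its old A32-only form now moved into the shared include, picks between two address paths: with a fastmem marker it dereferences r13 + vaddr directly and relies on the signal handler to patch faults, otherwise it walks the page table and branches to the fallback on a miss. A conceptual sketch of the two lookups, simplified and not the emitted code:

    #include <cstdint>

    // Fastmem: the guest address is an offset into one large host mapping;
    // unmapped guest pages fault and are handled by FastmemCallback.
    std::uint8_t* FastmemPtr(std::uint8_t* fastmem_base, std::uint64_t vaddr) {
        return fastmem_base + vaddr;
    }

    // Page table: index a table of host pointers; a null entry means the JIT
    // must take the slow (callback) path.
    std::uint8_t* PageTablePtr(std::uint8_t** page_table, std::uint64_t vaddr) {
        constexpr std::uint64_t page_bits = 12;  // assumed page size for the sketch
        std::uint8_t* const page = page_table[vaddr >> page_bits];
        return page ? page + (vaddr & ((std::uint64_t{1} << page_bits) - 1)) : nullptr;
    }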
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void A32EmitX64::EmitMemoryWrite(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|  | ||||
|     if (!conf.page_table && !fastmem_marker) { | ||||
|         // Neither fastmem nor page table: Use callbacks | ||||
|         ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); | ||||
|         Devirtualize<callback>(conf.callbacks).EmitCall(code); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); | ||||
|  | ||||
|     const auto wrapped_fn = write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; | ||||
|  | ||||
|     if (fastmem_marker) { | ||||
|         // Use fastmem | ||||
|         const auto dest_ptr = r13 + vaddr; | ||||
|  | ||||
|         const auto location = code.getCurr(); | ||||
|         EmitWriteMemoryMov<bitsize>(code, dest_ptr, value.getIdx()); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(wrapped_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_fastmem_failure, | ||||
|             }); | ||||
|  | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     // Use page table | ||||
|     ASSERT(conf.page_table); | ||||
|     Xbyak::Label abort, end; | ||||
|  | ||||
|     const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); | ||||
|     EmitWriteMemoryMov<bitsize>(code, dest_ptr, value.getIdx()); | ||||
|     code.L(end); | ||||
|  | ||||
|     code.SwitchToFarCode(); | ||||
|     code.L(abort); | ||||
|     code.call(wrapped_fn); | ||||
|     code.jmp(end, code.T_NEAR); | ||||
|     code.SwitchToNearCode(); | ||||
| } | ||||
| #define Axx A32 | ||||
| #include "emit_x64_memory.cpp.inc" | ||||
| #undef Axx | ||||
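The deleted EmitMemoryRead/EmitMemoryWrite bodies above now live in the new backend/x64/emit_x64_memory.cpp.inc, which each frontend pulls in with Axx defined to A32 or A64, as shown here. A toy sketch of that include-twice idiom; the file name, member name, and concatenation macro are illustrative, not the real file contents:

    // emit_memory_sketch.cpp.inc -- shared source, included (not compiled) per frontend
    #define CONCAT_(a, b) a##b
    #define CONCAT(a, b) CONCAT_(a, b)
    #define AxxEmitX64 CONCAT(Axx, EmitX64)

    void AxxEmitX64::EmitMemoryNop() {
        // one shared body, instantiated once for A32EmitX64 and once for A64EmitX64
    }

    #undef AxxEmitX64
    #undef CONCAT
    #undef CONCAT_

    // a32_emit_x64_memory.cpp
    #define Axx A32
    #include "emit_memory_sketch.cpp.inc"
    #undef Axx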
|  | ||||
| void A32EmitX64::EmitA32ReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     EmitMemoryRead<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); | ||||
| @@ -296,268 +167,71 @@ void A32EmitX64::EmitA32WriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     EmitMemoryWrite<64, &A32::UserCallbacks::MemoryWrite64>(ctx, inst); | ||||
| } | ||||
|  | ||||
| template<size_t bitsize, auto callback> | ||||
| void A32EmitX64::ExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     using T = mp::unsigned_integer_of_size<bitsize>; | ||||
|  | ||||
|     ASSERT(conf.global_monitor != nullptr); | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|  | ||||
|     ctx.reg_alloc.HostCall(inst, {}, args[0]); | ||||
|  | ||||
|     code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(1)); | ||||
|     code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf)); | ||||
|     code.CallLambda( | ||||
|         [](A32::UserConfig& conf, u32 vaddr) -> T { | ||||
|             return conf.global_monitor->ReadAndMark<T>(conf.processor_id, vaddr, [&]() -> T { | ||||
|                 return (conf.callbacks->*callback)(vaddr); | ||||
|             }); | ||||
|         }); | ||||
|     code.ZeroExtendFrom(bitsize, code.ABI_RETURN); | ||||
| } | ||||
|  | ||||
| template<size_t bitsize, auto callback> | ||||
| void A32EmitX64::ExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     using T = mp::unsigned_integer_of_size<bitsize>; | ||||
|  | ||||
|     ASSERT(conf.global_monitor != nullptr); | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|  | ||||
|     ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); | ||||
|  | ||||
|     Xbyak::Label end; | ||||
|  | ||||
|     code.mov(code.ABI_RETURN, u32(1)); | ||||
|     code.cmp(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); | ||||
|     code.je(end); | ||||
|     code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); | ||||
|     code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf)); | ||||
|     code.CallLambda( | ||||
|         [](A32::UserConfig& conf, u32 vaddr, T value) -> u32 { | ||||
|             return conf.global_monitor->DoExclusiveOperation<T>(conf.processor_id, vaddr, | ||||
|                                                                 [&](T expected) -> bool { | ||||
|                                                                     return (conf.callbacks->*callback)(vaddr, value, expected); | ||||
|                                                                 }) | ||||
|                      ? 0 | ||||
|                      : 1; | ||||
|         }); | ||||
|     code.L(end); | ||||
| } | ||||
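The non-inline exclusive helpers above route every exclusive access through conf.global_monitor: the exclusive read marks the address and value for this core, and the exclusive write only commits if that mark is still held and the compare in the memory callback succeeds, reporting 0 for success and 1 for failure. A much-simplified, self-contained sketch of that contract, not dynarmic's ExclusiveMonitor:

    #include <cstdint>
    #include <mutex>
    #include <optional>
    #include <utility>

    class MonitorSketch {
    public:
        // Exclusive load: remember (vaddr, value) for this core.
        std::uint32_t ReadAndMark(std::uint32_t vaddr, std::uint32_t value) {
            std::scoped_lock lock{mutex};
            marked.emplace(vaddr, value);
            return value;
        }

        // Exclusive store: succeed only if the mark for vaddr is still present.
        bool DoExclusiveOperation(std::uint32_t vaddr, bool (*write_if)(std::uint32_t expected)) {
            std::scoped_lock lock{mutex};
            if (!marked || marked->first != vaddr) {
                return false;
            }
            const bool ok = write_if(marked->second);
            marked.reset();
            return ok;
        }

    private:
        std::mutex mutex;
        std::optional<std::pair<std::uint32_t, std::uint32_t>> marked;
    };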
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void A32EmitX64::ExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     ASSERT(conf.global_monitor && conf.fastmem_pointer); | ||||
|     if (!exception_handler.SupportsFastmem()) { | ||||
|         ExclusiveReadMemory<bitsize, callback>(ctx, inst); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|  | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const Xbyak::Reg64 value = ctx.reg_alloc.ScratchGpr(); | ||||
|     const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); | ||||
|     const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); | ||||
|  | ||||
|     const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; | ||||
|  | ||||
|     EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); | ||||
|  | ||||
|     code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(1)); | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); | ||||
|     code.mov(qword[tmp], vaddr); | ||||
|  | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|     if (fastmem_marker) { | ||||
|         Xbyak::Label end; | ||||
|  | ||||
|         const auto src_ptr = r13 + vaddr; | ||||
|  | ||||
|         const auto location = code.getCurr(); | ||||
|         EmitReadMemoryMov<bitsize>(code, value.getIdx(), src_ptr); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(wrapped_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_exclusive_fastmem_failure, | ||||
|             }); | ||||
|  | ||||
|         code.L(end); | ||||
|     } else { | ||||
|         code.call(wrapped_fn); | ||||
|     } | ||||
|  | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); | ||||
|     EmitWriteMemoryMov<bitsize>(code, tmp, value.getIdx()); | ||||
|  | ||||
|     EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32()); | ||||
|  | ||||
|     ctx.reg_alloc.DefineValue(inst, value); | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void A32EmitX64::ExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     ASSERT(conf.global_monitor && conf.fastmem_pointer); | ||||
|     if (!exception_handler.SupportsFastmem()) { | ||||
|         ExclusiveWriteMemory<bitsize, callback>(ctx, inst); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|  | ||||
|     ctx.reg_alloc.ScratchGpr(HostLoc::RAX); | ||||
|     const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32(); | ||||
|     const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); | ||||
|  | ||||
|     const auto fallback_fn = exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; | ||||
|  | ||||
|     EmitExclusiveLock(code, conf, tmp, eax); | ||||
|  | ||||
|     Xbyak::Label end; | ||||
|  | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); | ||||
|     code.mov(status, u32(1)); | ||||
|     code.cmp(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); | ||||
|     code.je(end, code.T_NEAR); | ||||
|     code.cmp(qword[tmp], vaddr); | ||||
|     code.jne(end, code.T_NEAR); | ||||
|  | ||||
|     EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax); | ||||
|  | ||||
|     code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); | ||||
|  | ||||
|     EmitReadMemoryMov<bitsize>(code, rax.getIdx(), tmp); | ||||
|  | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|     if (fastmem_marker) { | ||||
|         const auto dest_ptr = r13 + vaddr; | ||||
|  | ||||
|         const auto location = code.getCurr(); | ||||
|  | ||||
|         switch (bitsize) { | ||||
|         case 8: | ||||
|             code.lock(); | ||||
|             code.cmpxchg(code.byte[dest_ptr], value.cvt8()); | ||||
|             break; | ||||
|         case 16: | ||||
|             code.lock(); | ||||
|             code.cmpxchg(word[dest_ptr], value.cvt16()); | ||||
|             break; | ||||
|         case 32: | ||||
|             code.lock(); | ||||
|             code.cmpxchg(dword[dest_ptr], value.cvt32()); | ||||
|             break; | ||||
|         case 64: | ||||
|             code.lock(); | ||||
|             code.cmpxchg(qword[dest_ptr], value.cvt64()); | ||||
|             break; | ||||
|         default: | ||||
|             UNREACHABLE(); | ||||
|         } | ||||
|  | ||||
|         code.setnz(status.cvt8()); | ||||
|  | ||||
|         code.SwitchToFarCode(); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(fallback_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_exclusive_fastmem_failure, | ||||
|             }); | ||||
|  | ||||
|         code.cmp(al, 0); | ||||
|         code.setz(status.cvt8()); | ||||
|         code.movzx(status.cvt32(), status.cvt8()); | ||||
|         code.jmp(end, code.T_NEAR); | ||||
|         code.SwitchToNearCode(); | ||||
|     } else { | ||||
|         code.call(fallback_fn); | ||||
|         code.cmp(al, 0); | ||||
|         code.setz(status.cvt8()); | ||||
|         code.movzx(status.cvt32(), status.cvt8()); | ||||
|     } | ||||
|  | ||||
|     code.L(end); | ||||
|  | ||||
|     EmitExclusiveUnlock(code, conf, tmp, eax); | ||||
|  | ||||
|     ctx.reg_alloc.DefineValue(inst, status); | ||||
| } | ||||
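ExclusiveWriteMemoryInline, replaced by its Emit-prefixed successor in the shared include, is the lock-free path: the value captured by the matching exclusive read sits in rax, and a lock cmpxchg commits the store only if memory still holds that value, with ZF turned into the 0 = success / 1 = failure status register. The same idea in portable C++, as a sketch of intent rather than the emitted x86:

    #include <atomic>
    #include <cstdint>

    // Returns 0 on success, 1 on failure, mirroring the status convention above.
    std::uint32_t InlineExclusiveStore32(std::atomic<std::uint32_t>& mem,
                                         std::uint32_t expected,  // monitor's saved value (rax)
                                         std::uint32_t desired) { // value being stored
        return mem.compare_exchange_strong(expected, desired) ? 0u : 1u;
    }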
|  | ||||
| void A32EmitX64::EmitA32ClearExclusive(A32EmitContext&, IR::Inst*) { | ||||
|     code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); | ||||
| } | ||||
|  | ||||
| void A32EmitX64::EmitA32ExclusiveReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     if (conf.fastmem_exclusive_access) { | ||||
|         ExclusiveReadMemoryInline<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); | ||||
|         EmitExclusiveReadMemoryInline<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); | ||||
|     } else { | ||||
|         ExclusiveReadMemory<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); | ||||
|         EmitExclusiveReadMemory<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void A32EmitX64::EmitA32ExclusiveReadMemory16(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     if (conf.fastmem_exclusive_access) { | ||||
|         ExclusiveReadMemoryInline<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); | ||||
|         EmitExclusiveReadMemoryInline<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); | ||||
|     } else { | ||||
|         ExclusiveReadMemory<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); | ||||
|         EmitExclusiveReadMemory<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void A32EmitX64::EmitA32ExclusiveReadMemory32(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     if (conf.fastmem_exclusive_access) { | ||||
|         ExclusiveReadMemoryInline<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); | ||||
|         EmitExclusiveReadMemoryInline<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); | ||||
|     } else { | ||||
|         ExclusiveReadMemory<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); | ||||
|         EmitExclusiveReadMemory<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void A32EmitX64::EmitA32ExclusiveReadMemory64(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     if (conf.fastmem_exclusive_access) { | ||||
|         ExclusiveReadMemoryInline<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); | ||||
|         EmitExclusiveReadMemoryInline<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); | ||||
|     } else { | ||||
|         ExclusiveReadMemory<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); | ||||
|         EmitExclusiveReadMemory<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void A32EmitX64::EmitA32ExclusiveWriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     if (conf.fastmem_exclusive_access) { | ||||
|         ExclusiveWriteMemoryInline<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); | ||||
|         EmitExclusiveWriteMemoryInline<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); | ||||
|     } else { | ||||
|         ExclusiveWriteMemory<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); | ||||
|         EmitExclusiveWriteMemory<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void A32EmitX64::EmitA32ExclusiveWriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     if (conf.fastmem_exclusive_access) { | ||||
|         ExclusiveWriteMemoryInline<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); | ||||
|         EmitExclusiveWriteMemoryInline<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); | ||||
|     } else { | ||||
|         ExclusiveWriteMemory<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); | ||||
|         EmitExclusiveWriteMemory<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void A32EmitX64::EmitA32ExclusiveWriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     if (conf.fastmem_exclusive_access) { | ||||
|         ExclusiveWriteMemoryInline<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); | ||||
|         EmitExclusiveWriteMemoryInline<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); | ||||
|     } else { | ||||
|         ExclusiveWriteMemory<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); | ||||
|         EmitExclusiveWriteMemory<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void A32EmitX64::EmitA32ExclusiveWriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { | ||||
|     if (conf.fastmem_exclusive_access) { | ||||
|         ExclusiveWriteMemoryInline<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); | ||||
|         EmitExclusiveWriteMemoryInline<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); | ||||
|     } else { | ||||
|         ExclusiveWriteMemory<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); | ||||
|         EmitExclusiveWriteMemory<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); | ||||
|     } | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -71,9 +71,9 @@ protected: | ||||
|     void (*memory_exclusive_write_128)(); | ||||
|     void GenMemory128Accessors(); | ||||
|  | ||||
|     std::map<std::tuple<size_t, int, int>, void (*)()> read_fallbacks; | ||||
|     std::map<std::tuple<size_t, int, int>, void (*)()> write_fallbacks; | ||||
|     std::map<std::tuple<size_t, int, int>, void (*)()> exclusive_write_fallbacks; | ||||
|     std::map<std::tuple<bool, size_t, int, int>, void (*)()> read_fallbacks; | ||||
|     std::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks; | ||||
|     std::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks; | ||||
|     void GenFastmemFallbacks(); | ||||
|  | ||||
|     const void* terminal_handler_pop_rsb_hint; | ||||
|   | ||||
| @@ -131,301 +131,157 @@ void A64EmitX64::GenFastmemFallbacks() { | ||||
|         {64, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, | ||||
|     }}; | ||||
|  | ||||
|     for (int vaddr_idx : idxes) { | ||||
|         if (vaddr_idx == 4 || vaddr_idx == 15) { | ||||
|             continue; | ||||
|         } | ||||
|  | ||||
|         for (int value_idx : idxes) { | ||||
|             code.align(); | ||||
|             read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|             ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); | ||||
|             if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                 code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|             } | ||||
|             code.call(memory_read_128); | ||||
|             if (value_idx != 1) { | ||||
|                 code.movaps(Xbyak::Xmm{value_idx}, xmm1); | ||||
|             } | ||||
|             ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); | ||||
|             code.ret(); | ||||
|             PerfMapRegister(read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128"); | ||||
|  | ||||
|             code.align(); | ||||
|             write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|             ABI_PushCallerSaveRegistersAndAdjustStack(code); | ||||
|             if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                 code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|             } | ||||
|             if (value_idx != 1) { | ||||
|                 code.movaps(xmm1, Xbyak::Xmm{value_idx}); | ||||
|             } | ||||
|             code.call(memory_write_128); | ||||
|             ABI_PopCallerSaveRegistersAndAdjustStack(code); | ||||
|             code.ret(); | ||||
|             PerfMapRegister(write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); | ||||
|  | ||||
|             code.align(); | ||||
|             exclusive_write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|             ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); | ||||
|             if (value_idx != 1) { | ||||
|                 code.movaps(xmm1, Xbyak::Xmm{value_idx}); | ||||
|             } | ||||
|             if (code.HasHostFeature(HostFeature::SSE41)) { | ||||
|                 code.movq(xmm2, rax); | ||||
|                 code.pinsrq(xmm2, rdx, 1); | ||||
|             } else { | ||||
|                 code.movq(xmm2, rax); | ||||
|                 code.movq(xmm0, rdx); | ||||
|                 code.punpcklqdq(xmm2, xmm0); | ||||
|             } | ||||
|             if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                 code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|             } | ||||
|             code.call(memory_exclusive_write_128); | ||||
|             ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); | ||||
|             code.ret(); | ||||
|             PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); | ||||
|  | ||||
|             if (value_idx == 4 || value_idx == 15) { | ||||
|     for (bool ordered : {false, true}) { | ||||
|         for (int vaddr_idx : idxes) { | ||||
|             if (vaddr_idx == 4 || vaddr_idx == 15) { | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             for (const auto& [bitsize, callback] : read_callbacks) { | ||||
|             for (int value_idx : idxes) { | ||||
|                 code.align(); | ||||
|                 read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                 ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); | ||||
|                 read_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                 ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); | ||||
|                 if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                     code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                 } | ||||
|                 callback.EmitCall(code); | ||||
|                 if (value_idx != code.ABI_RETURN.getIdx()) { | ||||
|                     code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); | ||||
|                 if (ordered) { | ||||
|                     code.mfence(); | ||||
|                 } | ||||
|                 ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); | ||||
|                 code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); | ||||
|                 code.call(memory_read_128); | ||||
|                 if (value_idx != 1) { | ||||
|                     code.movaps(Xbyak::Xmm{value_idx}, xmm1); | ||||
|                 } | ||||
|                 ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); | ||||
|                 code.ret(); | ||||
|                 PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize)); | ||||
|             } | ||||
|                 PerfMapRegister(read_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128"); | ||||
|  | ||||
|             for (const auto& [bitsize, callback] : write_callbacks) { | ||||
|                 code.align(); | ||||
|                 write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                 write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                 ABI_PushCallerSaveRegistersAndAdjustStack(code); | ||||
|                 if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { | ||||
|                     code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); | ||||
|                 } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { | ||||
|                 if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                     code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                     if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                     } | ||||
|                 } else { | ||||
|                     if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                     } | ||||
|                     if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                     } | ||||
|                 } | ||||
|                 code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); | ||||
|                 callback.EmitCall(code); | ||||
|                 if (value_idx != 1) { | ||||
|                     code.movaps(xmm1, Xbyak::Xmm{value_idx}); | ||||
|                 } | ||||
|                 code.call(memory_write_128); | ||||
|                 if (ordered) { | ||||
|                     code.mfence(); | ||||
|                 } | ||||
|                 ABI_PopCallerSaveRegistersAndAdjustStack(code); | ||||
|                 code.ret(); | ||||
|                 PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize)); | ||||
|             } | ||||
|                 PerfMapRegister(write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); | ||||
|  | ||||
|             for (const auto& [bitsize, callback] : exclusive_write_callbacks) { | ||||
|                 code.align(); | ||||
|                 exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                 exclusive_write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                 ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); | ||||
|                 if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { | ||||
|                     code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); | ||||
|                 } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { | ||||
|                     code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                     if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                     } | ||||
|                 if (value_idx != 1) { | ||||
|                     code.movaps(xmm1, Xbyak::Xmm{value_idx}); | ||||
|                 } | ||||
|                 if (code.HasHostFeature(HostFeature::SSE41)) { | ||||
|                     code.movq(xmm2, rax); | ||||
|                     code.pinsrq(xmm2, rdx, 1); | ||||
|                 } else { | ||||
|                     if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                     } | ||||
|                     code.movq(xmm2, rax); | ||||
|                     code.movq(xmm0, rdx); | ||||
|                     code.punpcklqdq(xmm2, xmm0); | ||||
|                 } | ||||
|                 if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                     code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                 } | ||||
|                 code.call(memory_exclusive_write_128); | ||||
|                 ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); | ||||
|                 code.ret(); | ||||
|                 PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, 128, vaddr_idx, value_idx)], code.getCurr(), "a64_exclusive_write_fallback_128"); | ||||
|  | ||||
|                 if (value_idx == 4 || value_idx == 15) { | ||||
|                     continue; | ||||
|                 } | ||||
|  | ||||
|                 for (const auto& [bitsize, callback] : read_callbacks) { | ||||
|                     code.align(); | ||||
|                     read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                     ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); | ||||
|                     if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                     } | ||||
|                     if (ordered) { | ||||
|                         code.mfence(); | ||||
|                     } | ||||
|                     callback.EmitCall(code); | ||||
|                     if (value_idx != code.ABI_RETURN.getIdx()) { | ||||
|                         code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); | ||||
|                     } | ||||
|                     ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); | ||||
|                     code.ZeroExtendFrom(bitsize, Xbyak::Reg64{value_idx}); | ||||
|                     code.ret(); | ||||
|                     PerfMapRegister(read_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize)); | ||||
|                 } | ||||
|  | ||||
|                 for (const auto& [bitsize, callback] : write_callbacks) { | ||||
|                     code.align(); | ||||
|                     write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                     ABI_PushCallerSaveRegistersAndAdjustStack(code); | ||||
|                     if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { | ||||
|                         code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); | ||||
|                     } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                         if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                         } | ||||
|                     } else { | ||||
|                         if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                         } | ||||
|                         if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                         } | ||||
|                     } | ||||
|                     code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); | ||||
|                     callback.EmitCall(code); | ||||
|                     if (ordered) { | ||||
|                         code.mfence(); | ||||
|                     } | ||||
|                     ABI_PopCallerSaveRegistersAndAdjustStack(code); | ||||
|                     code.ret(); | ||||
|                     PerfMapRegister(write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize)); | ||||
|                 } | ||||
|  | ||||
|                 for (const auto& [bitsize, callback] : exclusive_write_callbacks) { | ||||
|                     code.align(); | ||||
|                     exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)] = code.getCurr<void (*)()>(); | ||||
|                     ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); | ||||
|                     if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { | ||||
|                         code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); | ||||
|                     } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { | ||||
|                         code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                         if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                         } | ||||
|                     } else { | ||||
|                         if (value_idx != code.ABI_PARAM3.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); | ||||
|                         } | ||||
|                         if (vaddr_idx != code.ABI_PARAM2.getIdx()) { | ||||
|                             code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); | ||||
|                         } | ||||
|                     } | ||||
|                     code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); | ||||
|                     code.mov(code.ABI_PARAM4, rax); | ||||
|                     code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); | ||||
|                     callback.EmitCall(code); | ||||
|                     ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); | ||||
|                     code.ret(); | ||||
|                     PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_exclusive_write_fallback_{}", bitsize)); | ||||
|                 } | ||||
|                 code.ZeroExtendFrom(bitsize, code.ABI_PARAM3); | ||||
|                 code.mov(code.ABI_PARAM4, rax); | ||||
|                 code.ZeroExtendFrom(bitsize, code.ABI_PARAM4); | ||||
|                 callback.EmitCall(code); | ||||
|                 ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); | ||||
|                 code.ret(); | ||||
|                 PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_exclusive_write_fallback_{}", bitsize)); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
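The regenerated A64 fallbacks keep the 128-bit special cases: values travel in xmm1 through memory_read_128/memory_write_128, and the exclusive-write thunk first packs the expected 128-bit value out of rax:rdx into xmm2, using pinsrq when SSE4.1 is available and movq plus punpcklqdq otherwise. The same packing expressed with intrinsics, as an illustration rather than the JIT's emitted code:

    #include <cstdint>
    #include <emmintrin.h>   // SSE2: _mm_cvtsi64_si128, _mm_unpacklo_epi64
    #if defined(__SSE4_1__)
    #include <smmintrin.h>   // SSE4.1: _mm_insert_epi64
    #endif

    __m128i PackExpected128(std::uint64_t lo /* rax */, std::uint64_t hi /* rdx */) {
    #if defined(__SSE4_1__)
        return _mm_insert_epi64(_mm_cvtsi64_si128(static_cast<long long>(lo)),
                                static_cast<long long>(hi), 1);                   // movq + pinsrq
    #else
        return _mm_unpacklo_epi64(_mm_cvtsi64_si128(static_cast<long long>(lo)),
                                  _mm_cvtsi64_si128(static_cast<long long>(hi))); // punpcklqdq
    #endif
    }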
|  | ||||
| std::optional<A64EmitX64::DoNotFastmemMarker> A64EmitX64::ShouldFastmem(A64EmitContext& ctx, IR::Inst* inst) const { | ||||
|     if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) { | ||||
|         return std::nullopt; | ||||
|     } | ||||
|  | ||||
|     const auto marker = std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst)); | ||||
|     if (do_not_fastmem.count(marker) > 0) { | ||||
|         return std::nullopt; | ||||
|     } | ||||
|     return marker; | ||||
| } | ||||
|  | ||||
| FakeCall A64EmitX64::FastmemCallback(u64 rip_) { | ||||
|     const auto iter = fastmem_patch_info.find(rip_); | ||||
|  | ||||
|     if (iter == fastmem_patch_info.end()) { | ||||
|         fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_); | ||||
|         fmt::print("Segfault wasn't at a fastmem patch location!\n"); | ||||
|         fmt::print("Now dumping code.......\n\n"); | ||||
|         Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000); | ||||
|         ASSERT_FALSE("iter != fastmem_patch_info.end()"); | ||||
|     } | ||||
|  | ||||
|     if (iter->second.recompile) { | ||||
|         const auto marker = iter->second.marker; | ||||
|         do_not_fastmem.emplace(marker); | ||||
|         InvalidateBasicBlocks({std::get<0>(marker)}); | ||||
|     } | ||||
|  | ||||
|     return FakeCall{ | ||||
|         .call_rip = iter->second.callback, | ||||
|         .ret_rip = iter->second.resume_rip, | ||||
|     }; | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void A64EmitX64::EmitMemoryRead(A64EmitContext& ctx, IR::Inst* inst) { | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|  | ||||
|     if (!conf.page_table && !fastmem_marker) { | ||||
|         // Neither fastmem nor page table: Use callbacks | ||||
|         if constexpr (bitsize == 128) { | ||||
|             ctx.reg_alloc.HostCall(nullptr, {}, args[0]); | ||||
|             code.CallFunction(memory_read_128); | ||||
|             ctx.reg_alloc.DefineValue(inst, xmm1); | ||||
|         } else { | ||||
|             ctx.reg_alloc.HostCall(inst, {}, args[0]); | ||||
|             Devirtualize<callback>(conf.callbacks).EmitCall(code); | ||||
|             code.ZeroExtendFrom(bitsize, code.ABI_RETURN); | ||||
|         } | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); | ||||
|  | ||||
|     const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; | ||||
|  | ||||
|     Xbyak::Label abort, end; | ||||
|     bool require_abort_handling = false; | ||||
|  | ||||
|     if (fastmem_marker) { | ||||
|         // Use fastmem | ||||
|         const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); | ||||
|  | ||||
|         const auto location = code.getCurr(); | ||||
|         EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(wrapped_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_fastmem_failure, | ||||
|             }); | ||||
|     } else { | ||||
|         // Use page table | ||||
|         ASSERT(conf.page_table); | ||||
|         const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); | ||||
|         require_abort_handling = true; | ||||
|         EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr); | ||||
|     } | ||||
|     code.L(end); | ||||
|  | ||||
|     if (require_abort_handling) { | ||||
|         code.SwitchToFarCode(); | ||||
|         code.L(abort); | ||||
|         code.call(wrapped_fn); | ||||
|         code.jmp(end, code.T_NEAR); | ||||
|         code.SwitchToNearCode(); | ||||
|     } | ||||
|  | ||||
|     if constexpr (bitsize == 128) { | ||||
|         ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); | ||||
|     } else { | ||||
|         ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); | ||||
|     } | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void A64EmitX64::EmitMemoryWrite(A64EmitContext& ctx, IR::Inst* inst) { | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|  | ||||
|     if (!conf.page_table && !fastmem_marker) { | ||||
|         // Neither fastmem nor page table: Use callbacks | ||||
|         if constexpr (bitsize == 128) { | ||||
|             ctx.reg_alloc.Use(args[0], ABI_PARAM2); | ||||
|             ctx.reg_alloc.Use(args[1], HostLoc::XMM1); | ||||
|             ctx.reg_alloc.EndOfAllocScope(); | ||||
|             ctx.reg_alloc.HostCall(nullptr); | ||||
|             code.CallFunction(memory_write_128); | ||||
|         } else { | ||||
|             ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); | ||||
|             Devirtualize<callback>(conf.callbacks).EmitCall(code); | ||||
|         } | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const int value_idx = bitsize == 128 ? ctx.reg_alloc.UseXmm(args[1]).getIdx() : ctx.reg_alloc.UseGpr(args[1]).getIdx(); | ||||
|  | ||||
|     const auto wrapped_fn = write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; | ||||
|  | ||||
|     Xbyak::Label abort, end; | ||||
|     bool require_abort_handling = false; | ||||
|  | ||||
|     if (fastmem_marker) { | ||||
|         // Use fastmem | ||||
|         const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); | ||||
|  | ||||
|         const auto location = code.getCurr(); | ||||
|         EmitWriteMemoryMov<bitsize>(code, dest_ptr, value_idx); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(wrapped_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_fastmem_failure, | ||||
|             }); | ||||
|     } else { | ||||
|         // Use page table | ||||
|         ASSERT(conf.page_table); | ||||
|         const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); | ||||
|         require_abort_handling = true; | ||||
|         EmitWriteMemoryMov<bitsize>(code, dest_ptr, value_idx); | ||||
|     } | ||||
|     code.L(end); | ||||
|  | ||||
|     if (require_abort_handling) { | ||||
|         code.SwitchToFarCode(); | ||||
|         code.L(abort); | ||||
|         code.call(wrapped_fn); | ||||
|         code.jmp(end, code.T_NEAR); | ||||
|         code.SwitchToNearCode(); | ||||
|     } | ||||
| } | ||||
| #define Axx A64 | ||||
| #include "emit_x64_memory.cpp.inc" | ||||
| #undef Axx | ||||
|  | ||||
| void A64EmitX64::EmitA64ReadMemory8(A64EmitContext& ctx, IR::Inst* inst) { | ||||
|     EmitMemoryRead<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); | ||||
| @@ -467,295 +323,6 @@ void A64EmitX64::EmitA64WriteMemory128(A64EmitContext& ctx, IR::Inst* inst) { | ||||
|     EmitMemoryWrite<128, &A64::UserCallbacks::MemoryWrite128>(ctx, inst); | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void A64EmitX64::EmitExclusiveReadMemory(A64EmitContext& ctx, IR::Inst* inst) { | ||||
|     ASSERT(conf.global_monitor != nullptr); | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|  | ||||
|     if constexpr (bitsize != 128) { | ||||
|         using T = mp::unsigned_integer_of_size<bitsize>; | ||||
|  | ||||
|         ctx.reg_alloc.HostCall(inst, {}, args[0]); | ||||
|  | ||||
|         code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); | ||||
|         code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf)); | ||||
|         code.CallLambda( | ||||
|             [](A64::UserConfig& conf, u64 vaddr) -> T { | ||||
|                 return conf.global_monitor->ReadAndMark<T>(conf.processor_id, vaddr, [&]() -> T { | ||||
|                     return (conf.callbacks->*callback)(vaddr); | ||||
|                 }); | ||||
|             }); | ||||
|         code.ZeroExtendFrom(bitsize, code.ABI_RETURN); | ||||
|     } else { | ||||
|         const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); | ||||
|         ctx.reg_alloc.Use(args[0], ABI_PARAM2); | ||||
|         ctx.reg_alloc.EndOfAllocScope(); | ||||
|         ctx.reg_alloc.HostCall(nullptr); | ||||
|  | ||||
|         code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); | ||||
|         code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf)); | ||||
|         ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); | ||||
|         code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); | ||||
|         code.CallLambda( | ||||
|             [](A64::UserConfig& conf, u64 vaddr, A64::Vector& ret) { | ||||
|                 ret = conf.global_monitor->ReadAndMark<A64::Vector>(conf.processor_id, vaddr, [&]() -> A64::Vector { | ||||
|                     return (conf.callbacks->*callback)(vaddr); | ||||
|                 }); | ||||
|             }); | ||||
|         code.movups(result, xword[rsp + ABI_SHADOW_SPACE]); | ||||
|         ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); | ||||
|  | ||||
|         ctx.reg_alloc.DefineValue(inst, result); | ||||
|     } | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void A64EmitX64::EmitExclusiveWriteMemory(A64EmitContext& ctx, IR::Inst* inst) { | ||||
|     ASSERT(conf.global_monitor != nullptr); | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|  | ||||
|     if constexpr (bitsize != 128) { | ||||
|         ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); | ||||
|     } else { | ||||
|         ctx.reg_alloc.Use(args[0], ABI_PARAM2); | ||||
|         ctx.reg_alloc.Use(args[1], HostLoc::XMM1); | ||||
|         ctx.reg_alloc.EndOfAllocScope(); | ||||
|         ctx.reg_alloc.HostCall(inst); | ||||
|     } | ||||
|  | ||||
|     Xbyak::Label end; | ||||
|  | ||||
|     code.mov(code.ABI_RETURN, u32(1)); | ||||
|     code.cmp(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); | ||||
|     code.je(end); | ||||
|     code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); | ||||
|     code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf)); | ||||
|     if constexpr (bitsize != 128) { | ||||
|         using T = mp::unsigned_integer_of_size<bitsize>; | ||||
|  | ||||
|         code.CallLambda( | ||||
|             [](A64::UserConfig& conf, u64 vaddr, T value) -> u32 { | ||||
|                 return conf.global_monitor->DoExclusiveOperation<T>(conf.processor_id, vaddr, | ||||
|                                                                     [&](T expected) -> bool { | ||||
|                                                                         return (conf.callbacks->*callback)(vaddr, value, expected); | ||||
|                                                                     }) | ||||
|                          ? 0 | ||||
|                          : 1; | ||||
|             }); | ||||
|     } else { | ||||
|         ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); | ||||
|         code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); | ||||
|         code.movaps(xword[code.ABI_PARAM3], xmm1); | ||||
|         code.CallLambda( | ||||
|             [](A64::UserConfig& conf, u64 vaddr, A64::Vector& value) -> u32 { | ||||
|                 return conf.global_monitor->DoExclusiveOperation<A64::Vector>(conf.processor_id, vaddr, | ||||
|                                                                               [&](A64::Vector expected) -> bool { | ||||
|                                                                                   return (conf.callbacks->*callback)(vaddr, value, expected); | ||||
|                                                                               }) | ||||
|                          ? 0 | ||||
|                          : 1; | ||||
|             }); | ||||
|         ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); | ||||
|     } | ||||
|     code.L(end); | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void A64EmitX64::EmitExclusiveReadMemoryInline(A64EmitContext& ctx, IR::Inst* inst) { | ||||
|     ASSERT(conf.global_monitor && conf.fastmem_pointer); | ||||
|     if (!exception_handler.SupportsFastmem()) { | ||||
|         EmitExclusiveReadMemory<bitsize, callback>(ctx, inst); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|  | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); | ||||
|     const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); | ||||
|     const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); | ||||
|  | ||||
|     const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; | ||||
|  | ||||
|     EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); | ||||
|  | ||||
|     code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); | ||||
|     code.mov(qword[tmp], vaddr); | ||||
|  | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|     if (fastmem_marker) { | ||||
|         Xbyak::Label abort, end; | ||||
|         bool require_abort_handling = false; | ||||
|  | ||||
|         const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); | ||||
|  | ||||
|         const auto location = code.getCurr(); | ||||
|         EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(wrapped_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_exclusive_fastmem_failure, | ||||
|             }); | ||||
|  | ||||
|         code.L(end); | ||||
|  | ||||
|         if (require_abort_handling) { | ||||
|             code.SwitchToFarCode(); | ||||
|             code.L(abort); | ||||
|             code.call(wrapped_fn); | ||||
|             code.jmp(end, code.T_NEAR); | ||||
|             code.SwitchToNearCode(); | ||||
|         } | ||||
|     } else { | ||||
|         code.call(wrapped_fn); | ||||
|     } | ||||
|  | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); | ||||
|     EmitWriteMemoryMov<bitsize>(code, tmp, value_idx); | ||||
|  | ||||
|     EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32()); | ||||
|  | ||||
|     if constexpr (bitsize == 128) { | ||||
|         ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); | ||||
|     } else { | ||||
|         ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); | ||||
|     } | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void A64EmitX64::EmitExclusiveWriteMemoryInline(A64EmitContext& ctx, IR::Inst* inst) { | ||||
|     ASSERT(conf.global_monitor && conf.fastmem_pointer); | ||||
|     if (!exception_handler.SupportsFastmem()) { | ||||
|         EmitExclusiveWriteMemory<bitsize, callback>(ctx, inst); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|  | ||||
|     const auto value = [&] { | ||||
|         if constexpr (bitsize == 128) { | ||||
|             ctx.reg_alloc.ScratchGpr(HostLoc::RAX); | ||||
|             ctx.reg_alloc.ScratchGpr(HostLoc::RBX); | ||||
|             ctx.reg_alloc.ScratchGpr(HostLoc::RCX); | ||||
|             ctx.reg_alloc.ScratchGpr(HostLoc::RDX); | ||||
|             return ctx.reg_alloc.UseXmm(args[1]); | ||||
|         } else { | ||||
|             ctx.reg_alloc.ScratchGpr(HostLoc::RAX); | ||||
|             return ctx.reg_alloc.UseGpr(args[1]); | ||||
|         } | ||||
|     }(); | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32(); | ||||
|     const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); | ||||
|  | ||||
|     const auto fallback_fn = exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; | ||||
|  | ||||
|     EmitExclusiveLock(code, conf, tmp, eax); | ||||
|  | ||||
|     Xbyak::Label end; | ||||
|  | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); | ||||
|     code.mov(status, u32(1)); | ||||
|     code.cmp(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); | ||||
|     code.je(end, code.T_NEAR); | ||||
|     code.cmp(qword[tmp], vaddr); | ||||
|     code.jne(end, code.T_NEAR); | ||||
|  | ||||
|     EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax); | ||||
|  | ||||
|     code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); | ||||
|  | ||||
|     if constexpr (bitsize == 128) { | ||||
|         code.mov(rax, qword[tmp + 0]); | ||||
|         code.mov(rdx, qword[tmp + 8]); | ||||
|         if (code.HasHostFeature(HostFeature::SSE41)) { | ||||
|             code.movq(rbx, value); | ||||
|             code.pextrq(rcx, value, 1); | ||||
|         } else { | ||||
|             code.movaps(xmm0, value); | ||||
|             code.movq(rbx, xmm0); | ||||
|             code.punpckhqdq(xmm0, xmm0); | ||||
|             code.movq(rcx, xmm0); | ||||
|         } | ||||
|     } else { | ||||
|         EmitReadMemoryMov<bitsize>(code, rax.getIdx(), tmp); | ||||
|     } | ||||
|  | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|     if (fastmem_marker) { | ||||
|         Xbyak::Label abort; | ||||
|         bool require_abort_handling = false; | ||||
|  | ||||
|         const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling, tmp); | ||||
|  | ||||
|         const auto location = code.getCurr(); | ||||
|  | ||||
|         if constexpr (bitsize == 128) { | ||||
|             code.lock(); | ||||
|             code.cmpxchg16b(ptr[dest_ptr]); | ||||
|         } else { | ||||
|             switch (bitsize) { | ||||
|             case 8: | ||||
|                 code.lock(); | ||||
|                 code.cmpxchg(code.byte[dest_ptr], value.cvt8()); | ||||
|                 break; | ||||
|             case 16: | ||||
|                 code.lock(); | ||||
|                 code.cmpxchg(word[dest_ptr], value.cvt16()); | ||||
|                 break; | ||||
|             case 32: | ||||
|                 code.lock(); | ||||
|                 code.cmpxchg(dword[dest_ptr], value.cvt32()); | ||||
|                 break; | ||||
|             case 64: | ||||
|                 code.lock(); | ||||
|                 code.cmpxchg(qword[dest_ptr], value.cvt64()); | ||||
|                 break; | ||||
|             default: | ||||
|                 UNREACHABLE(); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         code.setnz(status.cvt8()); | ||||
|  | ||||
|         code.SwitchToFarCode(); | ||||
|         code.L(abort); | ||||
|         code.call(fallback_fn); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(fallback_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_exclusive_fastmem_failure, | ||||
|             }); | ||||
|  | ||||
|         code.cmp(al, 0); | ||||
|         code.setz(status.cvt8()); | ||||
|         code.movzx(status.cvt32(), status.cvt8()); | ||||
|         code.jmp(end, code.T_NEAR); | ||||
|         code.SwitchToNearCode(); | ||||
|     } else { | ||||
|         code.call(fallback_fn); | ||||
|         code.cmp(al, 0); | ||||
|         code.setz(status.cvt8()); | ||||
|         code.movzx(status.cvt32(), status.cvt8()); | ||||
|     } | ||||
|  | ||||
|     code.L(end); | ||||
|  | ||||
|     EmitExclusiveUnlock(code, conf, tmp, eax); | ||||
|  | ||||
|     ctx.reg_alloc.DefineValue(inst, status); | ||||
| } | ||||
|  | ||||
| void A64EmitX64::EmitA64ClearExclusive(A64EmitContext&, IR::Inst*) { | ||||
|     code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); | ||||
| } | ||||
|   | ||||
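The deletions above are the heart of this refactor: the A64-specific memory emitters are dropped in favour of `#define Axx A64` / `#include "emit_x64_memory.cpp.inc"`, with the A32 backend doing the same under `Axx A32`. Below is a minimal, self-contained sketch of that token-pasting pattern; the function names are invented for illustration, and only `CONCATENATE_TOKENS` (pulled in from `dynarmic/common/macro_util.h` in the real file) mirrors the actual code.

```cpp
// Minimal sketch of the Axx token-pasting pattern, assuming a two-level
// concatenation macro like the one in dynarmic/common/macro_util.h.
#include <cstdio>

#define CONCATENATE_TOKENS_IMPL(x, y) x##y
#define CONCATENATE_TOKENS(x, y) CONCATENATE_TOKENS_IMPL(x, y)

// In the real .inc this technique defines members of A32EmitX64 / A64EmitX64;
// here a free function per front end shows how one body yields two symbols.
#define Axx A32
void CONCATENATE_TOKENS(Axx, PrintFrontendName)() { std::puts("A32 front end"); }
#undef Axx

#define Axx A64
void CONCATENATE_TOKENS(Axx, PrintFrontendName)() { std::puts("A64 front end"); }
#undef Axx

int main() {
    A32PrintFrontendName();  // expanded from the first definition
    A64PrintFrontendName();  // expanded from the second definition
}
```

Because `Axx` is expanded before the paste, the same include body produces `A32EmitX64::...` and `A64EmitX64::...` definitions when each backend includes the shared file, as the new `emit_x64_memory.cpp.inc` below does.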
							
								
								
									
externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc (new executable file, vendored, 497 lines)
							| @@ -0,0 +1,497 @@ | ||||
| /* This file is part of the dynarmic project. | ||||
|  * Copyright (c) 2022 MerryMage | ||||
|  * SPDX-License-Identifier: 0BSD | ||||
|  */ | ||||
|  | ||||
| #include "dynarmic/common/macro_util.h" | ||||
|  | ||||
| #define AxxEmitX64 CONCATENATE_TOKENS(Axx, EmitX64) | ||||
| #define AxxEmitContext CONCATENATE_TOKENS(Axx, EmitContext) | ||||
| #define AxxJitState CONCATENATE_TOKENS(Axx, JitState) | ||||
| #define AxxUserConfig Axx::UserConfig | ||||
|  | ||||
| namespace { | ||||
| using Vector = std::array<u64, 2>; | ||||
| } | ||||
|  | ||||
| std::optional<AxxEmitX64::DoNotFastmemMarker> AxxEmitX64::ShouldFastmem(AxxEmitContext& ctx, IR::Inst* inst) const { | ||||
|     if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) { | ||||
|         return std::nullopt; | ||||
|     } | ||||
|  | ||||
|     const auto marker = std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst)); | ||||
|     if (do_not_fastmem.count(marker) > 0) { | ||||
|         return std::nullopt; | ||||
|     } | ||||
|     return marker; | ||||
| } | ||||
|  | ||||
| FakeCall AxxEmitX64::FastmemCallback(u64 rip_) { | ||||
|     const auto iter = fastmem_patch_info.find(rip_); | ||||
|  | ||||
|     if (iter == fastmem_patch_info.end()) { | ||||
|         fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_); | ||||
|         fmt::print("Segfault wasn't at a fastmem patch location!\n"); | ||||
|         fmt::print("Now dumping code.......\n\n"); | ||||
|         Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000); | ||||
|         ASSERT_FALSE("iter != fastmem_patch_info.end()"); | ||||
|     } | ||||
|  | ||||
|     if (iter->second.recompile) { | ||||
|         const auto marker = iter->second.marker; | ||||
|         do_not_fastmem.emplace(marker); | ||||
|         InvalidateBasicBlocks({std::get<0>(marker)}); | ||||
|     } | ||||
|  | ||||
|     return FakeCall{ | ||||
|         .call_rip = iter->second.callback, | ||||
|         .ret_rip = iter->second.resume_rip, | ||||
|     }; | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|     const bool ordered = IsOrdered(args[1].GetImmediateAccType()); | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|  | ||||
|     if (!conf.page_table && !fastmem_marker) { | ||||
|         // Neither fastmem nor page table: Use callbacks | ||||
|         if constexpr (bitsize == 128) { | ||||
|             ctx.reg_alloc.HostCall(nullptr, {}, args[0]); | ||||
|             if (ordered) { | ||||
|                 code.mfence(); | ||||
|             } | ||||
|             code.CallFunction(memory_read_128); | ||||
|             ctx.reg_alloc.DefineValue(inst, xmm1); | ||||
|         } else { | ||||
|             ctx.reg_alloc.HostCall(inst, {}, args[0]); | ||||
|             if (ordered) { | ||||
|                 code.mfence(); | ||||
|             } | ||||
|             Devirtualize<callback>(conf.callbacks).EmitCall(code); | ||||
|             code.ZeroExtendFrom(bitsize, code.ABI_RETURN); | ||||
|         } | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); | ||||
|  | ||||
|     const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; | ||||
|  | ||||
|     Xbyak::Label abort, end; | ||||
|     bool require_abort_handling = false; | ||||
|  | ||||
|     if (fastmem_marker) { | ||||
|         // Use fastmem | ||||
|         const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); | ||||
|  | ||||
|         const auto location = EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr, ordered); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(wrapped_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_fastmem_failure, | ||||
|             }); | ||||
|     } else { | ||||
|         // Use page table | ||||
|         ASSERT(conf.page_table); | ||||
|         const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); | ||||
|         require_abort_handling = true; | ||||
|         EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr, ordered); | ||||
|     } | ||||
|     code.L(end); | ||||
|  | ||||
|     if (require_abort_handling) { | ||||
|         code.SwitchToFarCode(); | ||||
|         code.L(abort); | ||||
|         code.call(wrapped_fn); | ||||
|         code.jmp(end, code.T_NEAR); | ||||
|         code.SwitchToNearCode(); | ||||
|     } | ||||
|  | ||||
|     if constexpr (bitsize == 128) { | ||||
|         ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); | ||||
|     } else { | ||||
|         ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); | ||||
|     } | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|     const bool ordered = IsOrdered(args[2].GetImmediateAccType()); | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|  | ||||
|     if (!conf.page_table && !fastmem_marker) { | ||||
|         // Neither fastmem nor page table: Use callbacks | ||||
|         if constexpr (bitsize == 128) { | ||||
|             ctx.reg_alloc.Use(args[0], ABI_PARAM2); | ||||
|             ctx.reg_alloc.Use(args[1], HostLoc::XMM1); | ||||
|             ctx.reg_alloc.EndOfAllocScope(); | ||||
|             ctx.reg_alloc.HostCall(nullptr); | ||||
|             code.CallFunction(memory_write_128); | ||||
|         } else { | ||||
|             ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); | ||||
|             Devirtualize<callback>(conf.callbacks).EmitCall(code); | ||||
|         } | ||||
|         if (ordered) { | ||||
|             code.mfence(); | ||||
|         } | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const int value_idx = bitsize == 128 | ||||
|                             ? ctx.reg_alloc.UseXmm(args[1]).getIdx() | ||||
|                             : (ordered ? ctx.reg_alloc.UseScratchGpr(args[1]).getIdx() : ctx.reg_alloc.UseGpr(args[1]).getIdx()); | ||||
|  | ||||
|     const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; | ||||
|  | ||||
|     Xbyak::Label abort, end; | ||||
|     bool require_abort_handling = false; | ||||
|  | ||||
|     if (fastmem_marker) { | ||||
|         // Use fastmem | ||||
|         const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); | ||||
|  | ||||
|         const auto location = EmitWriteMemoryMov<bitsize>(code, dest_ptr, value_idx, ordered); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(wrapped_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_fastmem_failure, | ||||
|             }); | ||||
|     } else { | ||||
|         // Use page table | ||||
|         ASSERT(conf.page_table); | ||||
|         const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); | ||||
|         require_abort_handling = true; | ||||
|         EmitWriteMemoryMov<bitsize>(code, dest_ptr, value_idx, ordered); | ||||
|     } | ||||
|     code.L(end); | ||||
|  | ||||
|     if (require_abort_handling) { | ||||
|         code.SwitchToFarCode(); | ||||
|         code.L(abort); | ||||
|         code.call(wrapped_fn); | ||||
|         code.jmp(end, code.T_NEAR); | ||||
|         code.SwitchToNearCode(); | ||||
|     } | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void AxxEmitX64::EmitExclusiveReadMemory(AxxEmitContext& ctx, IR::Inst* inst) { | ||||
|     ASSERT(conf.global_monitor != nullptr); | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|     const bool ordered = IsOrdered(args[1].GetImmediateAccType()); | ||||
|  | ||||
|     if constexpr (bitsize != 128) { | ||||
|         using T = mp::unsigned_integer_of_size<bitsize>; | ||||
|  | ||||
|         ctx.reg_alloc.HostCall(inst, {}, args[0]); | ||||
|  | ||||
|         code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1)); | ||||
|         code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf)); | ||||
|         if (ordered) { | ||||
|             code.mfence(); | ||||
|         } | ||||
|         code.CallLambda( | ||||
|             [](AxxUserConfig& conf, Axx::VAddr vaddr) -> T { | ||||
|                 return conf.global_monitor->ReadAndMark<T>(conf.processor_id, vaddr, [&]() -> T { | ||||
|                     return (conf.callbacks->*callback)(vaddr); | ||||
|                 }); | ||||
|             }); | ||||
|         code.ZeroExtendFrom(bitsize, code.ABI_RETURN); | ||||
|     } else { | ||||
|         const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); | ||||
|         ctx.reg_alloc.Use(args[0], ABI_PARAM2); | ||||
|         ctx.reg_alloc.EndOfAllocScope(); | ||||
|         ctx.reg_alloc.HostCall(nullptr); | ||||
|  | ||||
|         code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1)); | ||||
|         code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf)); | ||||
|         ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); | ||||
|         code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); | ||||
|         if (ordered) { | ||||
|             code.mfence(); | ||||
|         } | ||||
|         code.CallLambda( | ||||
|             [](AxxUserConfig& conf, Axx::VAddr vaddr, Vector& ret) { | ||||
|                 ret = conf.global_monitor->ReadAndMark<Vector>(conf.processor_id, vaddr, [&]() -> Vector { | ||||
|                     return (conf.callbacks->*callback)(vaddr); | ||||
|                 }); | ||||
|             }); | ||||
|         code.movups(result, xword[rsp + ABI_SHADOW_SPACE]); | ||||
|         ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); | ||||
|  | ||||
|         ctx.reg_alloc.DefineValue(inst, result); | ||||
|     } | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void AxxEmitX64::EmitExclusiveWriteMemory(AxxEmitContext& ctx, IR::Inst* inst) { | ||||
|     ASSERT(conf.global_monitor != nullptr); | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|     const bool ordered = IsOrdered(args[2].GetImmediateAccType()); | ||||
|  | ||||
|     if constexpr (bitsize != 128) { | ||||
|         ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); | ||||
|     } else { | ||||
|         ctx.reg_alloc.Use(args[0], ABI_PARAM2); | ||||
|         ctx.reg_alloc.Use(args[1], HostLoc::XMM1); | ||||
|         ctx.reg_alloc.EndOfAllocScope(); | ||||
|         ctx.reg_alloc.HostCall(inst); | ||||
|     } | ||||
|  | ||||
|     Xbyak::Label end; | ||||
|  | ||||
|     code.mov(code.ABI_RETURN, u32(1)); | ||||
|     code.cmp(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0)); | ||||
|     code.je(end); | ||||
|     code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0)); | ||||
|     code.mov(code.ABI_PARAM1, reinterpret_cast<u64>(&conf)); | ||||
|     if constexpr (bitsize != 128) { | ||||
|         using T = mp::unsigned_integer_of_size<bitsize>; | ||||
|  | ||||
|         code.CallLambda( | ||||
|             [](AxxUserConfig& conf, Axx::VAddr vaddr, T value) -> u32 { | ||||
|                 return conf.global_monitor->DoExclusiveOperation<T>(conf.processor_id, vaddr, | ||||
|                                                                     [&](T expected) -> bool { | ||||
|                                                                         return (conf.callbacks->*callback)(vaddr, value, expected); | ||||
|                                                                     }) | ||||
|                          ? 0 | ||||
|                          : 1; | ||||
|             }); | ||||
|         if (ordered) { | ||||
|             code.mfence(); | ||||
|         } | ||||
|     } else { | ||||
|         ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); | ||||
|         code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); | ||||
|         code.movaps(xword[code.ABI_PARAM3], xmm1); | ||||
|         code.CallLambda( | ||||
|             [](AxxUserConfig& conf, Axx::VAddr vaddr, Vector& value) -> u32 { | ||||
|                 return conf.global_monitor->DoExclusiveOperation<Vector>(conf.processor_id, vaddr, | ||||
|                                                                          [&](Vector expected) -> bool { | ||||
|                                                                              return (conf.callbacks->*callback)(vaddr, value, expected); | ||||
|                                                                          }) | ||||
|                          ? 0 | ||||
|                          : 1; | ||||
|             }); | ||||
|         if (ordered) { | ||||
|             code.mfence(); | ||||
|         } | ||||
|         ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); | ||||
|     } | ||||
|     code.L(end); | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* inst) { | ||||
|     ASSERT(conf.global_monitor && conf.fastmem_pointer); | ||||
|     if (!exception_handler.SupportsFastmem()) { | ||||
|         EmitExclusiveReadMemory<bitsize, callback>(ctx, inst); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|     const bool ordered = IsOrdered(args[1].GetImmediateAccType()); | ||||
|  | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); | ||||
|     const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); | ||||
|     const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); | ||||
|  | ||||
|     const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; | ||||
|  | ||||
|     EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); | ||||
|  | ||||
|     code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(1)); | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); | ||||
|     code.mov(qword[tmp], vaddr); | ||||
|  | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|     if (fastmem_marker) { | ||||
|         Xbyak::Label abort, end; | ||||
|         bool require_abort_handling = false; | ||||
|  | ||||
|         const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); | ||||
|  | ||||
|         const auto location = EmitReadMemoryMov<bitsize>(code, value_idx, src_ptr, ordered); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(wrapped_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_exclusive_fastmem_failure, | ||||
|             }); | ||||
|  | ||||
|         code.L(end); | ||||
|  | ||||
|         if (require_abort_handling) { | ||||
|             code.SwitchToFarCode(); | ||||
|             code.L(abort); | ||||
|             code.call(wrapped_fn); | ||||
|             code.jmp(end, code.T_NEAR); | ||||
|             code.SwitchToNearCode(); | ||||
|         } | ||||
|     } else { | ||||
|         code.call(wrapped_fn); | ||||
|     } | ||||
|  | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); | ||||
|     EmitWriteMemoryMov<bitsize>(code, tmp, value_idx, false); | ||||
|  | ||||
|     EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32()); | ||||
|  | ||||
|     if constexpr (bitsize == 128) { | ||||
|         ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); | ||||
|     } else { | ||||
|         ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); | ||||
|     } | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize, auto callback> | ||||
| void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* inst) { | ||||
|     ASSERT(conf.global_monitor && conf.fastmem_pointer); | ||||
|     if (!exception_handler.SupportsFastmem()) { | ||||
|         EmitExclusiveWriteMemory<bitsize, callback>(ctx, inst); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     auto args = ctx.reg_alloc.GetArgumentInfo(inst); | ||||
|     const bool ordered = IsOrdered(args[2].GetImmediateAccType()); | ||||
|  | ||||
|     const auto value = [&] { | ||||
|         if constexpr (bitsize == 128) { | ||||
|             ctx.reg_alloc.ScratchGpr(HostLoc::RAX); | ||||
|             ctx.reg_alloc.ScratchGpr(HostLoc::RBX); | ||||
|             ctx.reg_alloc.ScratchGpr(HostLoc::RCX); | ||||
|             ctx.reg_alloc.ScratchGpr(HostLoc::RDX); | ||||
|             return ctx.reg_alloc.UseXmm(args[1]); | ||||
|         } else { | ||||
|             ctx.reg_alloc.ScratchGpr(HostLoc::RAX); | ||||
|             return ctx.reg_alloc.UseGpr(args[1]); | ||||
|         } | ||||
|     }(); | ||||
|     const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); | ||||
|     const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32(); | ||||
|     const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); | ||||
|  | ||||
|     const auto wrapped_fn = exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value.getIdx())]; | ||||
|  | ||||
|     EmitExclusiveLock(code, conf, tmp, eax); | ||||
|  | ||||
|     Xbyak::Label end; | ||||
|  | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); | ||||
|     code.mov(status, u32(1)); | ||||
|     code.cmp(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0)); | ||||
|     code.je(end, code.T_NEAR); | ||||
|     code.cmp(qword[tmp], vaddr); | ||||
|     code.jne(end, code.T_NEAR); | ||||
|  | ||||
|     EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax); | ||||
|  | ||||
|     code.mov(code.byte[r15 + offsetof(AxxJitState, exclusive_state)], u8(0)); | ||||
|     code.mov(tmp, Common::BitCast<u64>(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); | ||||
|  | ||||
|     if constexpr (bitsize == 128) { | ||||
|         code.mov(rax, qword[tmp + 0]); | ||||
|         code.mov(rdx, qword[tmp + 8]); | ||||
|         if (code.HasHostFeature(HostFeature::SSE41)) { | ||||
|             code.movq(rbx, value); | ||||
|             code.pextrq(rcx, value, 1); | ||||
|         } else { | ||||
|             code.movaps(xmm0, value); | ||||
|             code.movq(rbx, xmm0); | ||||
|             code.punpckhqdq(xmm0, xmm0); | ||||
|             code.movq(rcx, xmm0); | ||||
|         } | ||||
|     } else { | ||||
|         EmitReadMemoryMov<bitsize>(code, rax.getIdx(), tmp, false); | ||||
|     } | ||||
|  | ||||
|     const auto fastmem_marker = ShouldFastmem(ctx, inst); | ||||
|     if (fastmem_marker) { | ||||
|         Xbyak::Label abort; | ||||
|         bool require_abort_handling = false; | ||||
|  | ||||
|         const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling, tmp); | ||||
|  | ||||
|         const auto location = code.getCurr(); | ||||
|  | ||||
|         if constexpr (bitsize == 128) { | ||||
|             code.lock(); | ||||
|             code.cmpxchg16b(ptr[dest_ptr]); | ||||
|         } else { | ||||
|             switch (bitsize) { | ||||
|             case 8: | ||||
|                 code.lock(); | ||||
|                 code.cmpxchg(code.byte[dest_ptr], value.cvt8()); | ||||
|                 break; | ||||
|             case 16: | ||||
|                 code.lock(); | ||||
|                 code.cmpxchg(word[dest_ptr], value.cvt16()); | ||||
|                 break; | ||||
|             case 32: | ||||
|                 code.lock(); | ||||
|                 code.cmpxchg(dword[dest_ptr], value.cvt32()); | ||||
|                 break; | ||||
|             case 64: | ||||
|                 code.lock(); | ||||
|                 code.cmpxchg(qword[dest_ptr], value.cvt64()); | ||||
|                 break; | ||||
|             default: | ||||
|                 UNREACHABLE(); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         code.setnz(status.cvt8()); | ||||
|  | ||||
|         code.SwitchToFarCode(); | ||||
|         code.L(abort); | ||||
|         code.call(wrapped_fn); | ||||
|  | ||||
|         fastmem_patch_info.emplace( | ||||
|             Common::BitCast<u64>(location), | ||||
|             FastmemPatchInfo{ | ||||
|                 Common::BitCast<u64>(code.getCurr()), | ||||
|                 Common::BitCast<u64>(wrapped_fn), | ||||
|                 *fastmem_marker, | ||||
|                 conf.recompile_on_exclusive_fastmem_failure, | ||||
|             }); | ||||
|  | ||||
|         code.cmp(al, 0); | ||||
|         code.setz(status.cvt8()); | ||||
|         code.movzx(status.cvt32(), status.cvt8()); | ||||
|         code.jmp(end, code.T_NEAR); | ||||
|         code.SwitchToNearCode(); | ||||
|     } else { | ||||
|         code.call(wrapped_fn); | ||||
|         code.cmp(al, 0); | ||||
|         code.setz(status.cvt8()); | ||||
|         code.movzx(status.cvt32(), status.cvt8()); | ||||
|     } | ||||
|  | ||||
|     code.L(end); | ||||
|  | ||||
|     EmitExclusiveUnlock(code, conf, tmp, eax); | ||||
|  | ||||
|     ctx.reg_alloc.DefineValue(inst, status); | ||||
| } | ||||
|  | ||||
| #undef AxxEmitX64 | ||||
| #undef AxxEmitContext | ||||
| #undef AxxJitState | ||||
| #undef AxxUserConfig | ||||
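In the 128-bit inline exclusive store above, the emitter pins RAX/RBX/RCX/RDX, loads the value observed by the exclusive read into RDX:RAX and the new value into RCX:RBX, then issues `lock cmpxchg16b`, which only writes when memory still matches. The following is a hedged portable sketch of the same semantics; `U128` and `StoreExclusive128` are invented names, and a lock-free 16-byte `std::atomic` on x86-64 generally needs `-mcx16`.

```cpp
// Portable analogue of the lock cmpxchg16b path (illustrative, not JIT code).
#include <atomic>
#include <cstdint>

struct alignas(16) U128 {
    std::uint64_t lo;  // corresponds to RAX (expected) / RBX (desired)
    std::uint64_t hi;  // corresponds to RDX (expected) / RCX (desired)
};

bool StoreExclusive128(std::atomic<U128>& mem, U128 expected, U128 desired) {
    // cmpxchg16b compares RDX:RAX with the memory operand and, on a match,
    // writes RCX:RBX and sets ZF; the emitter folds that flag into the
    // 0-means-success status value returned to the IR.
    return mem.compare_exchange_strong(expected, desired);
}
```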
| @@ -10,6 +10,7 @@ | ||||
| #include "dynarmic/backend/x64/exclusive_monitor_friend.h" | ||||
| #include "dynarmic/common/spin_lock_x64.h" | ||||
| #include "dynarmic/interface/exclusive_monitor.h" | ||||
| #include "dynarmic/ir/acc_type.h" | ||||
|  | ||||
| namespace Dynarmic::Backend::X64 { | ||||
|  | ||||
| @@ -198,49 +199,113 @@ template<> | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize> | ||||
| void EmitReadMemoryMov(BlockOfCode& code, int value_idx, const Xbyak::RegExp& addr) { | ||||
| const void* EmitReadMemoryMov(BlockOfCode& code, int value_idx, const Xbyak::RegExp& addr, bool ordered) { | ||||
|     if (ordered) { | ||||
|         if constexpr (bitsize == 128) { | ||||
|             code.mfence(); | ||||
|         } else { | ||||
|             code.xor_(Xbyak::Reg32{value_idx}, Xbyak::Reg32{value_idx}); | ||||
|         } | ||||
|  | ||||
|         const void* fastmem_location = code.getCurr(); | ||||
|         switch (bitsize) { | ||||
|         case 8: | ||||
|             code.lock(); | ||||
|             code.xadd(code.byte[addr], Xbyak::Reg32{value_idx}.cvt8()); | ||||
|             break; | ||||
|         case 16: | ||||
|             code.lock(); | ||||
|             code.xadd(word[addr], Xbyak::Reg32{value_idx}); | ||||
|             break; | ||||
|         case 32: | ||||
|             code.lock(); | ||||
|             code.xadd(dword[addr], Xbyak::Reg32{value_idx}); | ||||
|             break; | ||||
|         case 64: | ||||
|             code.lock(); | ||||
|             code.xadd(qword[addr], Xbyak::Reg64{value_idx}); | ||||
|             break; | ||||
|         case 128: | ||||
|             // TODO (HACK): Detect CPUs where this load is not atomic | ||||
|             code.movaps(Xbyak::Xmm{value_idx}, xword[addr]); | ||||
|             break; | ||||
|         default: | ||||
|             ASSERT_FALSE("Invalid bitsize"); | ||||
|         } | ||||
|         return fastmem_location; | ||||
|     } | ||||
|  | ||||
|     const void* fastmem_location = code.getCurr(); | ||||
|     switch (bitsize) { | ||||
|     case 8: | ||||
|         code.movzx(Xbyak::Reg32{value_idx}, code.byte[addr]); | ||||
|         return; | ||||
|         break; | ||||
|     case 16: | ||||
|         code.movzx(Xbyak::Reg32{value_idx}, word[addr]); | ||||
|         return; | ||||
|         break; | ||||
|     case 32: | ||||
|         code.mov(Xbyak::Reg32{value_idx}, dword[addr]); | ||||
|         return; | ||||
|         break; | ||||
|     case 64: | ||||
|         code.mov(Xbyak::Reg64{value_idx}, qword[addr]); | ||||
|         return; | ||||
|         break; | ||||
|     case 128: | ||||
|         code.movups(Xbyak::Xmm{value_idx}, xword[addr]); | ||||
|         return; | ||||
|         break; | ||||
|     default: | ||||
|         ASSERT_FALSE("Invalid bitsize"); | ||||
|     } | ||||
|     return fastmem_location; | ||||
| } | ||||
|  | ||||
| template<std::size_t bitsize> | ||||
| void EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int value_idx) { | ||||
| const void* EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int value_idx, bool ordered) { | ||||
|     if (ordered) { | ||||
|         const void* fastmem_location = code.getCurr(); | ||||
|         switch (bitsize) { | ||||
|         case 8: | ||||
|             code.xchg(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8()); | ||||
|             break; | ||||
|         case 16: | ||||
|             code.xchg(word[addr], Xbyak::Reg16{value_idx}); | ||||
|             break; | ||||
|         case 32: | ||||
|             code.xchg(dword[addr], Xbyak::Reg32{value_idx}); | ||||
|             break; | ||||
|         case 64: | ||||
|             code.xchg(qword[addr], Xbyak::Reg64{value_idx}); | ||||
|             break; | ||||
|         case 128: | ||||
|             code.movaps(xword[addr], Xbyak::Xmm{value_idx}); | ||||
|             code.mfence(); | ||||
|             break; | ||||
|         default: | ||||
|             ASSERT_FALSE("Invalid bitsize"); | ||||
|         } | ||||
|         return fastmem_location; | ||||
|     } | ||||
|  | ||||
|     const void* fastmem_location = code.getCurr(); | ||||
|     switch (bitsize) { | ||||
|     case 8: | ||||
|         code.mov(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8()); | ||||
|         return; | ||||
|         break; | ||||
|     case 16: | ||||
|         code.mov(word[addr], Xbyak::Reg16{value_idx}); | ||||
|         return; | ||||
|         break; | ||||
|     case 32: | ||||
|         code.mov(dword[addr], Xbyak::Reg32{value_idx}); | ||||
|         return; | ||||
|         break; | ||||
|     case 64: | ||||
|         code.mov(qword[addr], Xbyak::Reg64{value_idx}); | ||||
|         return; | ||||
|         break; | ||||
|     case 128: | ||||
|         code.movups(xword[addr], Xbyak::Xmm{value_idx}); | ||||
|         return; | ||||
|         break; | ||||
|     default: | ||||
|         ASSERT_FALSE("Invalid bitsize"); | ||||
|     } | ||||
|     return fastmem_location; | ||||
| } | ||||
|  | ||||
| template<typename UserConfig> | ||||
| @@ -284,6 +349,10 @@ void EmitExclusiveTestAndClear(BlockOfCode& code, const UserConfig& conf, Xbyak: | ||||
|     } | ||||
| } | ||||
|  | ||||
| inline bool IsOrdered(IR::AccType acctype) { | ||||
|     return acctype == IR::AccType::ORDERED || acctype == IR::AccType::ORDEREDRW || acctype == IR::AccType::LIMITEDORDERED; | ||||
| } | ||||
|  | ||||
| }  // namespace | ||||
|  | ||||
| }  // namespace Dynarmic::Backend::X64 | ||||
|   | ||||
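The header changes above add the `ordered` variants: `IsOrdered` classifies the IR access type, and the ordered load/store helpers replace plain `mov`s with `lock xadd` of a zeroed register, `xchg`, and `movaps` plus `mfence`, so the access itself carries the required barrier on x86-64. A rough `std::atomic` analogue follows (illustrative only, not dynarmic API).

```cpp
// "lock xadd" of a zeroed register returns the old contents while acting as a
// full barrier; "xchg" with a memory operand has an implicit LOCK prefix.
// Both therefore behave like sequentially consistent accesses on x86-64.
#include <atomic>
#include <cstdint>

std::uint64_t OrderedRead64(const std::atomic<std::uint64_t>& mem) {
    // xor value, value ; lock xadd [addr], value
    return mem.load(std::memory_order_seq_cst);
}

void OrderedWrite64(std::atomic<std::uint64_t>& mem, std::uint64_t value) {
    // xchg [addr], value
    mem.store(value, std::memory_order_seq_cst);
}
```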
| @@ -208,6 +208,11 @@ IR::Cond Argument::GetImmediateCond() const { | ||||
|     return value.GetCond(); | ||||
| } | ||||
|  | ||||
| IR::AccType Argument::GetImmediateAccType() const { | ||||
|     ASSERT(IsImmediate() && GetType() == IR::Type::AccType); | ||||
|     return value.GetAccType(); | ||||
| } | ||||
|  | ||||
| bool Argument::IsInGpr() const { | ||||
|     if (IsImmediate()) | ||||
|         return false; | ||||
|   | ||||
| @@ -21,6 +21,10 @@ | ||||
| #include "dynarmic/ir/microinstruction.h" | ||||
| #include "dynarmic/ir/value.h" | ||||
|  | ||||
| namespace Dynarmic::IR { | ||||
| enum class AccType; | ||||
| }  // namespace Dynarmic::IR | ||||
|  | ||||
| namespace Dynarmic::Backend::X64 { | ||||
|  | ||||
| class RegAlloc; | ||||
| @@ -75,6 +79,7 @@ public: | ||||
|     u64 GetImmediateS32() const; | ||||
|     u64 GetImmediateU64() const; | ||||
|     IR::Cond GetImmediateCond() const; | ||||
|     IR::AccType GetImmediateAccType() const; | ||||
|  | ||||
|     /// Is this value currently in a GPR? | ||||
|     bool IsInGpr() const; | ||||
|   | ||||