early-access version 2639

2022-04-03 03:41:15 +02:00
parent 0397c1ff98
commit d15d58f409
25 changed files with 272 additions and 203 deletions
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 yuzu emulator early access
 =============
-This is the source code for early-access 2637.
+This is the source code for early-access 2639.
 ## Legal Notice
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -68,9 +68,6 @@ if (YUZU_USE_EXTERNAL_SDL2)
    add_library(SDL2 ALIAS SDL2-static)
 endif()
 # SoundTouch
 add_subdirectory(soundtouch)
 # Cubeb
 if(ENABLE_CUBEB)
    set(BUILD_TESTS OFF CACHE BOOL "")
--- a/src/audio_core/CMakeLists.txt
+++ b/src/audio_core/CMakeLists.txt
@@ -36,8 +36,6 @@ add_library(audio_core STATIC
    splitter_context.h
    stream.cpp
    stream.h
    time_stretch.cpp
    time_stretch.h
    voice_context.cpp
    voice_context.h
@@ -63,7 +61,6 @@ if (NOT MSVC)
 endif()
 target_link_libraries(audio_core PUBLIC common core)
 target_link_libraries(audio_core PRIVATE SoundTouch)
 if(ENABLE_CUBEB)
    target_link_libraries(audio_core PRIVATE cubeb)
--- a/src/audio_core/cubeb_sink.cpp
+++ b/src/audio_core/cubeb_sink.cpp
@@ -7,7 +7,6 @@
 #include <cstring>
 #include "audio_core/cubeb_sink.h"
 #include "audio_core/stream.h"
 #include "audio_core/time_stretch.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/ring_buffer.h"
@@ -23,8 +22,7 @@ class CubebSinkStream final : public SinkStream {
 public:
    CubebSinkStream(cubeb* ctx_, u32 sample_rate, u32 num_channels_, cubeb_devid output_device,
                    const std::string& name)
-        : ctx{ctx_}, num_channels{std::min(num_channels_, 6u)}, time_stretch{sample_rate,
+        : ctx{ctx_}, num_channels{std::min(num_channels_, 6u)} {
                                                                             num_channels} {
        cubeb_stream_params params{};
        params.rate = sample_rate;
@@ -131,7 +129,6 @@ private:
    Common::RingBuffer<s16, 0x10000> queue;
    std::array<s16, 2> last_frame{};
    std::atomic<bool> should_flush{};
    TimeStretcher time_stretch;
    static long DataCallback(cubeb_stream* stream, void* user_data, const void* input_buffer,
                             void* output_buffer, long num_frames);
@@ -205,25 +202,7 @@ long CubebSinkStream::DataCallback([[maybe_unused]] cubeb_stream* stream, void*
    const std::size_t num_channels = impl->GetNumChannels();
    const std::size_t samples_to_write = num_channels * num_frames;
-    std::size_t samples_written;
+    const std::size_t samples_written = impl->queue.Pop(buffer, samples_to_write);
    /*
    if (Settings::values.enable_audio_stretching.GetValue()) {
        const std::vector<s16> in{impl->queue.Pop()};
        const std::size_t num_in{in.size() / num_channels};
        s16* const out{reinterpret_cast<s16*>(buffer)};
        const std::size_t out_frames =
            impl->time_stretch.Process(in.data(), num_in, out, num_frames);
        samples_written = out_frames * num_channels;
        if (impl->should_flush) {
            impl->time_stretch.Flush();
            impl->should_flush = false;
        }
    } else {
        samples_written = impl->queue.Pop(buffer, samples_to_write);
    }*/
    samples_written = impl->queue.Pop(buffer, samples_to_write);
    if (samples_written >= num_channels) {
        std::memcpy(&impl->last_frame[0], buffer + (samples_written - num_channels) * sizeof(s16),
--- a/src/audio_core/sdl2_sink.cpp
+++ b/src/audio_core/sdl2_sink.cpp
@@ -7,7 +7,6 @@
 #include <cstring>
 #include "audio_core/sdl2_sink.h"
 #include "audio_core/stream.h"
 #include "audio_core/time_stretch.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
 //#include "common/settings.h"
@@ -27,7 +26,7 @@ namespace AudioCore {
 class SDLSinkStream final : public SinkStream {
 public:
    SDLSinkStream(u32 sample_rate, u32 num_channels_, const std::string& output_device)
-        : num_channels{std::min(num_channels_, 6u)}, time_stretch{sample_rate, num_channels} {
+        : num_channels{std::min(num_channels_, 6u)} {
        SDL_AudioSpec spec;
        spec.freq = sample_rate;
@@ -116,7 +115,6 @@ private:
    SDL_AudioDeviceID dev = 0;
    u32 num_channels{};
    std::atomic<bool> should_flush{};
    TimeStretcher time_stretch;
 };
 SDLSink::SDLSink(std::string_view target_device_name) {
--- a/src/common/atomic_ops.h
+++ b/src/common/atomic_ops.h
@@ -46,6 +46,43 @@ namespace Common {
                                          reinterpret_cast<__int64*>(expected.data())) != 0;
 }
 /// 8-bit compare-and-swap built on the `_InterlockedCompareExchange8` intrinsic.
 /// @param pointer  Location to operate on.
 /// @param value    Value passed to the intrinsic as the replacement.
 /// @param expected Value passed to the intrinsic as the comparand.
 /// @param actual   Receives the value returned by the intrinsic.
 /// @return true when the value returned by the intrinsic equals `expected`.
 [[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected,
                                               u8& actual) {
    actual =
        _InterlockedCompareExchange8(reinterpret_cast<volatile char*>(pointer), value, expected);
    return actual == expected;
 }
 /// 16-bit compare-and-swap built on the `_InterlockedCompareExchange16` intrinsic.
 /// @param pointer  Location to operate on.
 /// @param value    Value passed to the intrinsic as the replacement.
 /// @param expected Value passed to the intrinsic as the comparand.
 /// @param actual   Receives the value returned by the intrinsic.
 /// @return true when the value returned by the intrinsic equals `expected`.
 [[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected,
                                               u16& actual) {
    actual =
        _InterlockedCompareExchange16(reinterpret_cast<volatile short*>(pointer), value, expected);
    return actual == expected;
 }
 /// 32-bit compare-and-swap built on the `_InterlockedCompareExchange` intrinsic.
 /// @param pointer  Location to operate on.
 /// @param value    Value passed to the intrinsic as the replacement.
 /// @param expected Value passed to the intrinsic as the comparand.
 /// @param actual   Receives the value returned by the intrinsic.
 /// @return true when the value returned by the intrinsic equals `expected`.
 [[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected,
                                               u32& actual) {
    actual =
        _InterlockedCompareExchange(reinterpret_cast<volatile long*>(pointer), value, expected);
    return actual == expected;
 }
 /// 64-bit compare-and-swap built on the `_InterlockedCompareExchange64` intrinsic.
 /// @param pointer  Location to operate on.
 /// @param value    Value passed to the intrinsic as the replacement.
 /// @param expected Value passed to the intrinsic as the comparand.
 /// @param actual   Receives the value returned by the intrinsic.
 /// @return true when the value returned by the intrinsic equals `expected`.
 [[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected,
                                               u64& actual) {
    actual = _InterlockedCompareExchange64(reinterpret_cast<volatile __int64*>(pointer), value,
                                           expected);
    return actual == expected;
 }
 /// 128-bit compare-and-swap built on the `_InterlockedCompareExchange128` intrinsic.
 /// @param pointer  Location to operate on; reinterpreted as a pointer to `__int64`.
 /// @param value    Replacement; `value[1]` and `value[0]` are passed as the two 64-bit halves.
 /// @param expected Comparand; taken by value, so the caller's copy is not modified.
 /// @param actual   Receives the contents of the local `expected` after the intrinsic has run.
 /// @return true when the intrinsic returns a non-zero result.
 [[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected,
                                               u128& actual) {
    const bool result =
        _InterlockedCompareExchange128(reinterpret_cast<volatile __int64*>(pointer), value[1],
                                       value[0], reinterpret_cast<__int64*>(expected.data())) != 0;
    // The intrinsic receives a writable pointer to `expected`; per the MSVC documentation it
    // stores the destination's original value there, so copying it out yields the observed value.
    actual = expected;
    return result;
 }
 [[nodiscard]] inline u128 AtomicLoad128(volatile u64* pointer) {
    u128 result{};
    _InterlockedCompareExchange128(reinterpret_cast<volatile __int64*>(pointer), result[1],
@@ -79,6 +116,42 @@ namespace Common {
    return __sync_bool_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a);
 }
 /// 8-bit compare-and-swap built on the `__sync_val_compare_and_swap` builtin.
 /// @param pointer  Location to operate on.
 /// @param value    Value passed to the builtin as the new value.
 /// @param expected Value passed to the builtin as the old (comparison) value.
 /// @param actual   Receives the value returned by the builtin.
 /// @return true when the value returned by the builtin equals `expected`.
 [[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected,
                                               u8& actual) {
    // Note the argument order: the builtin takes (ptr, oldval, newval).
    actual = __sync_val_compare_and_swap(pointer, expected, value);
    return actual == expected;
 }
 /// 16-bit compare-and-swap built on the `__sync_val_compare_and_swap` builtin.
 /// @param pointer  Location to operate on.
 /// @param value    Value passed to the builtin as the new value.
 /// @param expected Value passed to the builtin as the old (comparison) value.
 /// @param actual   Receives the value returned by the builtin.
 /// @return true when the value returned by the builtin equals `expected`.
 [[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected,
                                               u16& actual) {
    // Note the argument order: the builtin takes (ptr, oldval, newval).
    actual = __sync_val_compare_and_swap(pointer, expected, value);
    return actual == expected;
 }
 /// 32-bit compare-and-swap built on the `__sync_val_compare_and_swap` builtin.
 /// @param pointer  Location to operate on.
 /// @param value    Value passed to the builtin as the new value.
 /// @param expected Value passed to the builtin as the old (comparison) value.
 /// @param actual   Receives the value returned by the builtin.
 /// @return true when the value returned by the builtin equals `expected`.
 [[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected,
                                               u32& actual) {
    // Note the argument order: the builtin takes (ptr, oldval, newval).
    actual = __sync_val_compare_and_swap(pointer, expected, value);
    return actual == expected;
 }
 /// 64-bit compare-and-swap built on the `__sync_val_compare_and_swap` builtin.
 /// @param pointer  Location to operate on.
 /// @param value    Value passed to the builtin as the new value.
 /// @param expected Value passed to the builtin as the old (comparison) value.
 /// @param actual   Receives the value returned by the builtin.
 /// @return true when the value returned by the builtin equals `expected`.
 [[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected,
                                               u64& actual) {
    // Note the argument order: the builtin takes (ptr, oldval, newval).
    actual = __sync_val_compare_and_swap(pointer, expected, value);
    return actual == expected;
 }
 /// 128-bit compare-and-swap built on `__sync_val_compare_and_swap` over `unsigned __int128`.
 /// @param pointer  Location to operate on; cast to `unsigned __int128*` for the builtin.
 /// @param value    Value passed to the builtin as the new value.
 /// @param expected Value passed to the builtin as the old (comparison) value.
 /// @param actual   Receives the 128-bit value returned by the builtin.
 /// @return true when the value returned by the builtin equals `expected`.
 [[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected,
                                               u128& actual) {
    unsigned __int128 value_a;
    unsigned __int128 expected_a;
    unsigned __int128 actual_a;
    // Move the u128 operands into native 128-bit integers by copying their bytes.
    std::memcpy(&value_a, value.data(), sizeof(u128));
    std::memcpy(&expected_a, expected.data(), sizeof(u128));
    actual_a = __sync_val_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a);
    // Copy the builtin's result back out into the caller-visible u128.
    std::memcpy(actual.data(), &actual_a, sizeof(u128));
    return actual_a == expected_a;
 }
 [[nodiscard]] inline u128 AtomicLoad128(volatile u64* pointer) {
    unsigned __int128 zeros_a = 0;
    unsigned __int128 result_a =
--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -38,6 +38,7 @@ enum class CPUAccuracy : u32 {
    Auto = 0,
    Accurate = 1,
    Unsafe = 2,
    Paranoid = 3,
 };
 enum class FullscreenMode : u32 {
@@ -470,7 +471,7 @@ struct Values {
    // Cpu
    RangedSetting<CPUAccuracy> cpu_accuracy{CPUAccuracy::Auto, CPUAccuracy::Auto,
-                                            CPUAccuracy::Unsafe, "cpu_accuracy"};
+                                            CPUAccuracy::Paranoid, "cpu_accuracy"};
    // TODO: remove cpu_accuracy_first_time, migration setting added 8 July 2021
    BasicSetting<bool> cpu_accuracy_first_time{true, "cpu_accuracy_first_time"};
    BasicSetting<bool> cpu_debug_mode{false, "cpu_debug_mode"};
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -55,8 +55,9 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen
 u64 NativeClock::GetRTSC() {
    TimePoint new_time_point{};
    TimePoint current_time_point{};
    current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
    do {
        current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
        _mm_mfence();
        const u64 current_measure = __rdtsc();
        u64 diff = current_measure - current_time_point.inner.last_measure;
@@ -66,7 +67,7 @@ u64 NativeClock::GetRTSC() {
                                                : current_time_point.inner.last_measure;
        new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff;
    } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
-                                           current_time_point.pack));
+                                           current_time_point.pack, current_time_point.pack));
    /// The clock cannot be more precise than the guest timer, remove the lower bits
    return new_time_point.inner.accumulated_ticks & inaccuracy_mask;
 }
@@ -75,13 +76,14 @@ void NativeClock::Pause(bool is_paused) {
    if (!is_paused) {
        TimePoint current_time_point{};
        TimePoint new_time_point{};
        current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
        do {
            current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
            new_time_point.pack = current_time_point.pack;
            _mm_mfence();
            new_time_point.inner.last_measure = __rdtsc();
        } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
-                                               current_time_point.pack));
+                                               current_time_point.pack, current_time_point.pack));
    }
 }
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -186,35 +186,41 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
        if (!Settings::values.cpuopt_recompile_exclusives) {
            config.recompile_on_exclusive_fastmem_failure = false;
        }
-    }
+    } else {
        // Unsafe optimizations
        if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::Unsafe) {
            config.unsafe_optimizations = true;
            if (Settings::values.cpuopt_unsafe_unfuse_fma) {
                config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_UnfuseFMA;
            }
            if (Settings::values.cpuopt_unsafe_reduce_fp_error) {
                config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_ReducedErrorFP;
            }
            if (Settings::values.cpuopt_unsafe_ignore_standard_fpcr) {
                config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreStandardFPCRValue;
            }
            if (Settings::values.cpuopt_unsafe_inaccurate_nan) {
                config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
            }
            if (Settings::values.cpuopt_unsafe_ignore_global_monitor) {
                config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor;
            }
        }
-    // Unsafe optimizations
+        // Curated optimizations
-    if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::Unsafe) {
+        if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::Auto) {
-        config.unsafe_optimizations = true;
+            config.unsafe_optimizations = true;
        if (Settings::values.cpuopt_unsafe_unfuse_fma) {
            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_UnfuseFMA;
        }
        if (Settings::values.cpuopt_unsafe_reduce_fp_error) {
            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_ReducedErrorFP;
        }
        if (Settings::values.cpuopt_unsafe_ignore_standard_fpcr) {
            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreStandardFPCRValue;
        }
        if (Settings::values.cpuopt_unsafe_inaccurate_nan) {
            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
        }
        if (Settings::values.cpuopt_unsafe_ignore_global_monitor) {
            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor;
        }
    }
-    // Curated optimizations
+        // Paranoia mode for debugging optimizations
-    if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::Auto) {
+        if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::Paranoid) {
-        config.unsafe_optimizations = true;
+            config.unsafe_optimizations = false;
-        config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_UnfuseFMA;
+            config.optimizations = Dynarmic::no_optimizations;
-        config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreStandardFPCRValue;
+        }
        config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
        config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor;
    }
    return std::make_unique<Dynarmic::A32::Jit>(config);
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -248,35 +248,41 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
        if (!Settings::values.cpuopt_recompile_exclusives) {
            config.recompile_on_exclusive_fastmem_failure = false;
        }
-    }
+    } else {
        // Unsafe optimizations
        if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::Unsafe) {
            config.unsafe_optimizations = true;
            if (Settings::values.cpuopt_unsafe_unfuse_fma) {
                config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_UnfuseFMA;
            }
            if (Settings::values.cpuopt_unsafe_reduce_fp_error) {
                config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_ReducedErrorFP;
            }
            if (Settings::values.cpuopt_unsafe_inaccurate_nan) {
                config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
            }
            if (Settings::values.cpuopt_unsafe_fastmem_check) {
                config.fastmem_address_space_bits = 64;
            }
            if (Settings::values.cpuopt_unsafe_ignore_global_monitor) {
                config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor;
            }
        }
-    // Unsafe optimizations
+        // Curated optimizations
-    if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::Unsafe) {
+        if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::Auto) {
-        config.unsafe_optimizations = true;
+            config.unsafe_optimizations = true;
        if (Settings::values.cpuopt_unsafe_unfuse_fma) {
            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_UnfuseFMA;
        }
        if (Settings::values.cpuopt_unsafe_reduce_fp_error) {
            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_ReducedErrorFP;
        }
        if (Settings::values.cpuopt_unsafe_inaccurate_nan) {
            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
        }
        if (Settings::values.cpuopt_unsafe_fastmem_check) {
            config.fastmem_address_space_bits = 64;
        }
        if (Settings::values.cpuopt_unsafe_ignore_global_monitor) {
            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor;
        }
    }
-    // Curated optimizations
+        // Paranoia mode for debugging optimizations
-    if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::Auto) {
+        if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::Paranoid) {
-        config.unsafe_optimizations = true;
+            config.unsafe_optimizations = false;
-        config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_UnfuseFMA;
+            config.optimizations = Dynarmic::no_optimizations;
-        config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
+        }
        config.fastmem_address_space_bits = 64;
        config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor;
    }
    return std::make_shared<Dynarmic::A64::Jit>(config);
--- a/src/core/hle/service/nvflinger/buffer_queue_consumer.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue_consumer.cpp
@@ -18,8 +18,7 @@ BufferQueueConsumer::BufferQueueConsumer(std::shared_ptr<BufferQueueCore> core_)
 BufferQueueConsumer::~BufferQueueConsumer() = default;
 Status BufferQueueConsumer::AcquireBuffer(BufferItem* out_buffer,
-                                          std::chrono::nanoseconds expected_present,
+                                          std::chrono::nanoseconds expected_present) {
                                          u64 max_frame_number) {
    std::scoped_lock lock(core->mutex);
    // Check that the consumer doesn't currently have the maximum number of buffers acquired.
@@ -50,12 +49,6 @@ Status BufferQueueConsumer::AcquireBuffer(BufferItem* out_buffer,
        while (core->queue.size() > 1 && !core->queue[0].is_auto_timestamp) {
            const auto& buffer_item{core->queue[1]};
            // If dropping entry[0] would leave us with a buffer that the consumer is not yet ready
            // for, don't drop it.
            if (max_frame_number && buffer_item.frame_number > max_frame_number) {
                break;
            }
            // If entry[1] is timely, drop entry[0] (and repeat).
            const auto desired_present = buffer_item.timestamp;
            if (desired_present < expected_present.count() - MAX_REASONABLE_NSEC ||
@@ -200,4 +193,39 @@ Status BufferQueueConsumer::Connect(std::shared_ptr<IConsumerListener> consumer_
    return Status::NoError;
 }
 /// Builds a bitmask with one bit per buffer slot and writes it to `out_slot_mask`.
 /// A slot's bit is set when `slots[s].acquire_called` is false; bits are then cleared for
 /// slots referenced by queued items whose `acquire_called` is true.
 /// @param out_slot_mask Destination for the mask; only written on success.
 /// @return Status::BadValue if `out_slot_mask` is null, Status::NoInit if the core is
 ///         abandoned, otherwise Status::NoError.
 Status BufferQueueConsumer::GetReleasedBuffers(u64* out_slot_mask) {
    if (out_slot_mask == nullptr) {
        LOG_ERROR(Service_NVFlinger, "out_slot_mask may not be nullptr");
        return Status::BadValue;
    }
    // The core mutex is held for the rest of the function.
    std::scoped_lock lock(core->mutex);
    if (core->is_abandoned) {
        LOG_ERROR(Service_NVFlinger, "BufferQueue has been abandoned");
        return Status::NoInit;
    }
    // Start with every slot on which acquire has not been called.
    // NOTE(review): `1ULL << s` assumes NUM_BUFFER_SLOTS is at most 64 - confirm.
    u64 mask = 0;
    for (int s = 0; s < BufferQueueDefs::NUM_BUFFER_SLOTS; ++s) {
        if (!slots[s].acquire_called) {
            mask |= (1ULL << s);
        }
    }
    // Remove from the mask queued buffers for which acquire has been called, since the consumer
    // will not receive their buffer addresses and so must retain their cached information
    auto current(core->queue.begin());
    while (current != core->queue.end()) {
        if (current->acquire_called) {
            mask &= ~(1ULL << current->slot);
        }
        ++current;
    }
    LOG_DEBUG(Service_NVFlinger, "returning mask {}", mask);
    *out_slot_mask = mask;
    return Status::NoError;
 }
 } // namespace Service::android
--- a/src/core/hle/service/nvflinger/buffer_queue_consumer.h
+++ b/src/core/hle/service/nvflinger/buffer_queue_consumer.h
@@ -24,10 +24,10 @@ public:
    explicit BufferQueueConsumer(std::shared_ptr<BufferQueueCore> core_);
    ~BufferQueueConsumer();
-    Status AcquireBuffer(BufferItem* out_buffer, std::chrono::nanoseconds expected_present,
+    Status AcquireBuffer(BufferItem* out_buffer, std::chrono::nanoseconds expected_present);
                         u64 max_frame_number = 0);
    Status ReleaseBuffer(s32 slot, u64 frame_number, const Fence& release_fence);
    Status Connect(std::shared_ptr<IConsumerListener> consumer_listener, bool controlled_by_app);
    Status GetReleasedBuffers(u64* out_slot_mask);
 private:
    std::shared_ptr<BufferQueueCore> core;
--- a/src/core/hle/service/nvflinger/buffer_queue_core.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue_core.cpp
@@ -95,7 +95,6 @@ void BufferQueueCore::FreeBufferLocked(s32 slot) {
 }
 void BufferQueueCore::FreeAllBuffersLocked() {
    queue.clear();
    buffer_has_been_queued = false;
    for (s32 slot = 0; slot < BufferQueueDefs::NUM_BUFFER_SLOTS; ++slot) {
--- a/src/core/hle/service/nvflinger/buffer_queue_core.h
+++ b/src/core/hle/service/nvflinger/buffer_queue_core.h
@@ -73,8 +73,6 @@ private:
    u32 transform_hint{};
    bool is_allocating{};
    mutable std::condition_variable_any is_allocating_condition;
    bool allow_allocation{true};
    u64 buffer_age{};
    bool is_shutting_down{};
 };
--- a/src/core/hle/service/nvflinger/buffer_queue_producer.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue_producer.cpp
@@ -62,11 +62,12 @@ Status BufferQueueProducer::RequestBuffer(s32 slot, std::shared_ptr<GraphicBuffe
 Status BufferQueueProducer::SetBufferCount(s32 buffer_count) {
    LOG_DEBUG(Service_NVFlinger, "count = {}", buffer_count);
    std::shared_ptr<IConsumerListener> listener;
    std::shared_ptr<IConsumerListener> listener;
    {
        std::scoped_lock lock(core->mutex);
        core->WaitWhileAllocatingLocked();
        if (core->is_abandoned) {
            LOG_ERROR(Service_NVFlinger, "BufferQueue has been abandoned");
            return Status::NoInit;
@@ -120,7 +121,7 @@ Status BufferQueueProducer::SetBufferCount(s32 buffer_count) {
 }
 Status BufferQueueProducer::WaitForFreeSlotThenRelock(bool async, s32* found,
-                                                      Status* returnFlags) const {
+                                                      Status* return_flags) const {
    bool try_again = true;
    while (try_again) {
@@ -142,10 +143,12 @@ Status BufferQueueProducer::WaitForFreeSlotThenRelock(bool async, s32* found,
            ASSERT(slots[s].buffer_state == BufferState::Free);
            if (slots[s].graphic_buffer != nullptr) {
                core->FreeBufferLocked(s);
-                *returnFlags |= Status::ReleaseAllBuffers;
+                *return_flags |= Status::ReleaseAllBuffers;
            }
        }
        // Look for a free buffer to give to the client
        *found = BufferQueueCore::INVALID_BUFFER_SLOT;
        s32 dequeued_count{};
        s32 acquired_count{};
        for (s32 s{}; s < max_buffer_count; ++s) {
@@ -235,68 +238,50 @@ Status BufferQueueProducer::DequeueBuffer(s32* out_slot, Fence* out_fence, bool
    {
        std::scoped_lock lock(core->mutex);
        core->WaitWhileAllocatingLocked();
        if (format == PixelFormat::NoFormat) {
            format = core->default_buffer_format;
        }
        // Enable the usage bits the consumer requested
        usage |= core->consumer_usage_bit;
        s32 found{};
        Status status = WaitForFreeSlotThenRelock(async, &found, &return_flags);
        if (status != Status::NoError) {
            return status;
        }
        // This should not happen
        if (found == BufferQueueCore::INVALID_BUFFER_SLOT) {
            LOG_ERROR(Service_NVFlinger, "no available buffer slots");
            return Status::Busy;
        }
        *out_slot = found;
        attached_by_consumer = slots[found].attached_by_consumer;
        const bool use_default_size = !width && !height;
        if (use_default_size) {
            width = core->default_width;
            height = core->default_height;
        }
        s32 found = BufferItem::INVALID_BUFFER_SLOT;
        while (found == BufferItem::INVALID_BUFFER_SLOT) {
            Status status = WaitForFreeSlotThenRelock(async, &found, &return_flags);
            if (status != Status::NoError) {
                return status;
            }
            // This should not happen
            if (found == BufferQueueCore::INVALID_BUFFER_SLOT) {
                LOG_DEBUG(Service_NVFlinger, "no available buffer slots");
                return Status::Busy;
            }
            const std::shared_ptr<GraphicBuffer>& buffer(slots[found].graphic_buffer);
            // If we are not allowed to allocate new buffers, WaitForFreeSlotThenRelock must have
            // returned a slot containing a buffer. If this buffer would require reallocation to
            // meet the requested attributes, we free it and attempt to get another one.
            if (!core->allow_allocation) {
                if (buffer->NeedsReallocation(width, height, format, usage)) {
                    core->FreeBufferLocked(found);
                    found = BufferItem::INVALID_BUFFER_SLOT;
                    continue;
                }
            }
        }
        *out_slot = found;
        attached_by_consumer = slots[found].attached_by_consumer;
        slots[found].buffer_state = BufferState::Dequeued;
        const std::shared_ptr<GraphicBuffer>& buffer(slots[found].graphic_buffer);
-
+        if ((buffer == nullptr) || (buffer->Width() != width) || (buffer->Height() != height) ||
-        if ((buffer == nullptr) || buffer->NeedsReallocation(width, height, format, usage)) {
+            (buffer->Format() != format) || ((buffer->Usage() & usage) != usage)) {
            slots[found].acquire_called = false;
            slots[found].graphic_buffer = nullptr;
            slots[found].request_buffer_called = false;
            slots[found].fence = Fence::NoFence();
-            core->buffer_age = 0;
+
            return_flags |= Status::BufferNeedsReallocation;
        } else {
            // We add 1 because that will be the frame number when this buffer
            // is queued
            core->buffer_age = core->frame_counter + 1 - slots[found].frame_number;
        }
        LOG_DEBUG(Service_NVFlinger, "setting buffer age to {}", core->buffer_age);
        *out_fence = slots[found].fence;
        slots[found].fence = Fence::NoFence();
    }
@@ -311,6 +296,7 @@ Status BufferQueueProducer::DequeueBuffer(s32* out_slot, Fence* out_fence, bool
        {
            std::scoped_lock lock(core->mutex);
            if (core->is_abandoned) {
                LOG_ERROR(Service_NVFlinger, "BufferQueue has been abandoned");
                return Status::NoInit;
@@ -327,6 +313,7 @@ Status BufferQueueProducer::DequeueBuffer(s32* out_slot, Fence* out_fence, bool
    LOG_DEBUG(Service_NVFlinger, "returning slot={} frame={}, flags={}", *out_slot,
              slots[*out_slot].frame_number, return_flags);
    return return_flags;
 }
@@ -334,6 +321,7 @@ Status BufferQueueProducer::DetachBuffer(s32 slot) {
    LOG_DEBUG(Service_NVFlinger, "slot {}", slot);
    std::scoped_lock lock(core->mutex);
    if (core->is_abandoned) {
        LOG_ERROR(Service_NVFlinger, "BufferQueue has been abandoned");
        return Status::NoInit;
@@ -369,7 +357,6 @@ Status BufferQueueProducer::DetachNextBuffer(std::shared_ptr<GraphicBuffer>* out
    }
    std::scoped_lock lock(core->mutex);
    core->WaitWhileAllocatingLocked();
    if (core->is_abandoned) {
@@ -423,6 +410,7 @@ Status BufferQueueProducer::AttachBuffer(s32* out_slot,
        return status;
    }
    // This should not happen
    if (found == BufferQueueCore::INVALID_BUFFER_SLOT) {
        LOG_ERROR(Service_NVFlinger, "No available buffer slots");
        return Status::Busy;
@@ -466,8 +454,8 @@ Status BufferQueueProducer::QueueBuffer(s32 slot, const QueueBufferInput& input,
        return Status::BadValue;
    }
-    std::shared_ptr<IConsumerListener> frameAvailableListener;
+    std::shared_ptr<IConsumerListener> frame_available_listener;
-    std::shared_ptr<IConsumerListener> frameReplacedListener;
+    std::shared_ptr<IConsumerListener> frame_replaced_listener;
    s32 callback_ticket{};
    BufferItem item;
@@ -541,12 +529,13 @@ Status BufferQueueProducer::QueueBuffer(s32 slot, const QueueBufferInput& input,
        item.fence = fence;
        item.is_droppable = core->dequeue_buffer_cannot_block || async;
        item.swap_interval = swap_interval;
        sticky_transform = sticky_transform_;
        if (core->queue.empty()) {
            // When the queue is empty, we can simply queue this buffer
            core->queue.push_back(item);
-            frameAvailableListener = core->consumer_listener;
+            frame_available_listener = core->consumer_listener;
        } else {
            // When the queue is not empty, we need to look at the front buffer
            // state to see if we need to replace it
@@ -563,10 +552,10 @@ Status BufferQueueProducer::QueueBuffer(s32 slot, const QueueBufferInput& input,
                }
                // Overwrite the droppable buffer with the incoming one
                *front = item;
-                frameReplacedListener = core->consumer_listener;
+                frame_replaced_listener = core->consumer_listener;
            } else {
                core->queue.push_back(item);
-                frameAvailableListener = core->consumer_listener;
+                frame_available_listener = core->consumer_listener;
            }
        }
@@ -592,10 +581,10 @@ Status BufferQueueProducer::QueueBuffer(s32 slot, const QueueBufferInput& input,
            callback_condition.wait(callback_mutex);
        }
-        if (frameAvailableListener != nullptr) {
+        if (frame_available_listener != nullptr) {
-            frameAvailableListener->OnFrameAvailable(item);
+            frame_available_listener->OnFrameAvailable(item);
-        } else if (frameReplacedListener != nullptr) {
+        } else if (frame_replaced_listener != nullptr) {
-            frameReplacedListener->OnFrameReplaced(item);
+            frame_replaced_listener->OnFrameReplaced(item);
        }
        ++current_callback_ticket;
@@ -669,13 +658,6 @@ Status BufferQueueProducer::Query(NativeWindow what, s32* out_value) {
    case NativeWindow::ConsumerUsageBits:
        value = core->consumer_usage_bit;
        break;
    case NativeWindow::BufferAge:
        if (core->buffer_age > INT32_MAX) {
            value = 0;
        } else {
            value = static_cast<u32>(core->buffer_age);
        }
        break;
    default:
        UNREACHABLE();
        return Status::BadValue;
@@ -737,7 +719,6 @@ Status BufferQueueProducer::Connect(const std::shared_ptr<IProducerListener>& li
    core->buffer_has_been_queued = false;
    core->dequeue_buffer_cannot_block =
        core->consumer_controlled_by_app && producer_controlled_by_app;
    core->allow_allocation = true;
    return status;
 }
@@ -770,7 +751,7 @@ Status BufferQueueProducer::Disconnect(NativeWindowApi api) {
                core->SignalDequeueCondition();
                buffer_wait_event->GetWritableEvent().Signal();
                listener = core->consumer_listener;
-            } else if (core->connected_api != NativeWindowApi::NoConnectedApi) {
+            } else {
                LOG_ERROR(Service_NVFlinger, "still connected to another api (cur = {} req = {})",
                          core->connected_api, api);
                status = Status::BadValue;
--- a/src/core/hle/service/nvflinger/buffer_queue_producer.h
+++ b/src/core/hle/service/nvflinger/buffer_queue_producer.h
@@ -66,7 +66,7 @@ public:
 private:
    BufferQueueProducer(const BufferQueueProducer&) = delete;
-    Status WaitForFreeSlotThenRelock(bool async, s32* found, Status* returnFlags) const;
+    Status WaitForFreeSlotThenRelock(bool async, s32* found, Status* return_flags) const;
    Kernel::KEvent* buffer_wait_event{};
    Service::KernelHelpers::ServiceContext& service_context;
--- a/src/core/hle/service/nvflinger/consumer_base.cpp
+++ b/src/core/hle/service/nvflinger/consumer_base.cpp
@@ -36,38 +36,41 @@ void ConsumerBase::FreeBufferLocked(s32 slot_index) {
 }
 void ConsumerBase::OnFrameAvailable(const BufferItem& item) {
    std::scoped_lock lock(mutex);
    LOG_DEBUG(Service_NVFlinger, "called");
 }
 void ConsumerBase::OnFrameReplaced(const BufferItem& item) {
    std::scoped_lock lock(mutex);
    LOG_DEBUG(Service_NVFlinger, "called");
 }
 void ConsumerBase::OnBuffersReleased() {
    std::scoped_lock lock(mutex);
    LOG_DEBUG(Service_NVFlinger, "called");
    if (is_abandoned) {
        // Nothing to do if we're already abandoned.
        return;
    }
    u64 mask = 0;
    consumer->GetReleasedBuffers(&mask);
    for (int i = 0; i < BufferQueueDefs::NUM_BUFFER_SLOTS; i++) {
        if (mask & (1ULL << i)) {
            FreeBufferLocked(i);
        }
    }
 }
 void ConsumerBase::OnSidebandStreamChanged() {}
-Status ConsumerBase::AcquireBufferLocked(BufferItem* item, std::chrono::nanoseconds present_when,
+Status ConsumerBase::AcquireBufferLocked(BufferItem* item, std::chrono::nanoseconds present_when) {
-                                         u64 max_frame_number) {
+    Status err = consumer->AcquireBuffer(item, present_when);
    if (is_abandoned) {
        LOG_ERROR(Service_NVFlinger, "consumer is abandoned!");
        return Status::NoInit;
    }
    Status err = consumer->AcquireBuffer(item, present_when, max_frame_number);
    if (err != Status::NoError) {
        return err;
    }
    if (item->graphic_buffer != nullptr) {
        if (slots[item->slot].graphic_buffer != nullptr) {
            FreeBufferLocked(item->slot);
        }
        slots[item->slot].graphic_buffer = item->graphic_buffer;
    }
--- a/src/core/hle/service/nvflinger/consumer_base.h
+++ b/src/core/hle/service/nvflinger/consumer_base.h
@@ -35,8 +35,7 @@ protected:
    virtual void OnSidebandStreamChanged() override;
    void FreeBufferLocked(s32 slot_index);
-    Status AcquireBufferLocked(BufferItem* item, std::chrono::nanoseconds present_when,
+    Status AcquireBufferLocked(BufferItem* item, std::chrono::nanoseconds present_when);
                               u64 max_frame_number = 0);
    Status ReleaseBufferLocked(s32 slot, const std::shared_ptr<GraphicBuffer> graphic_buffer);
    bool StillTracking(s32 slot, const std::shared_ptr<GraphicBuffer> graphic_buffer) const;
    Status AddReleaseFenceLocked(s32 slot, const std::shared_ptr<GraphicBuffer> graphic_buffer,
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -104,7 +104,7 @@ void NVFlinger::SetNVDrvInstance(std::shared_ptr<Nvidia::Module> instance) {
 std::optional<u64> NVFlinger::OpenDisplay(std::string_view name) {
    const auto lock_guard = Lock();
-    LOG_DEBUG(Service, "Opening \"{}\" display", name);
+    LOG_DEBUG(Service_NVFlinger, "Opening \"{}\" display", name);
    const auto itr =
        std::find_if(displays.begin(), displays.end(),
@@ -219,7 +219,7 @@ VI::Layer* NVFlinger::FindOrCreateLayer(u64 display_id, u64 layer_id) {
    auto* layer = display->FindLayer(layer_id);
    if (layer == nullptr) {
-        LOG_DEBUG(Service, "Layer at id {} not found. Trying to create it.", layer_id);
+        LOG_DEBUG(Service_NVFlinger, "Layer at id {} not found. Trying to create it.", layer_id);
        CreateLayerAtId(*display, layer_id);
        return display->FindLayer(layer_id);
    }
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -39,21 +39,9 @@ GPUVAddr MemoryManager::UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std
    return gpu_addr;
 }
 void MemoryManager::UnmapSubmappedRanges(GPUVAddr gpu_addr, std::size_t size) {
    const auto submapped_ranges = GetSubmappedRange(gpu_addr, size);
    for (const auto& [map_addr, map_size] : submapped_ranges) {
        // Flush and invalidate through the GPU interface, to be asynchronous if possible.
        const std::optional<VAddr> cpu_vaddr = GpuToCpuAddress(map_addr);
        if (!cpu_vaddr) {
            continue;
        }
        rasterizer->UnmapMemory(*cpu_vaddr, map_size);
    }
 }
 GPUVAddr MemoryManager::Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size) {
-    // Unmap any pre-existing rasterizer memory in this range
+    // Mark any pre-existing rasterizer memory in this range as remapped
-    UnmapSubmappedRanges(gpu_addr, size);
+    rasterizer->ModifyGPUMemory(gpu_addr, size);
    const auto it = std::ranges::lower_bound(map_ranges, gpu_addr, {}, &MapRange::first);
    if (it != map_ranges.end() && it->first == gpu_addr) {
@@ -85,8 +73,16 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
    } else {
        UNREACHABLE_MSG("Unmapping non-existent GPU address=0x{:x}", gpu_addr);
    }
    const auto submapped_ranges = GetSubmappedRange(gpu_addr, size);
    for (const auto& [map_addr, map_size] : submapped_ranges) {
        // Flush and invalidate through the GPU interface, to be asynchronous if possible.
        const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map_addr);
        ASSERT(cpu_addr);
        rasterizer->UnmapMemory(*cpu_addr, map_size);
    }
    UnmapSubmappedRanges(gpu_addr, size);
    UpdateRange(gpu_addr, PageEntry::State::Unmapped, size);
 }
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -149,7 +149,6 @@ private:
    [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const;
    void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size);
    GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size);
    void UnmapSubmappedRanges(GPUVAddr gpu_addr, std::size_t size);
    [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align,
                                                        bool start_32bit_address = false) const;
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -454,15 +454,20 @@ void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
        return slot_images[lhs].modification_tick < slot_images[rhs].modification_tick;
    });
    for (const ImageId image_id : images) {
-        Image& image = slot_images[image_id];
+        DownloadImage(image_id);
        auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
        const auto copies = FullDownloadCopies(image.info);
        image.DownloadMemory(map, copies);
        runtime.Finish();
        SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span);
    }
 }
 template <class P>
 void TextureCache<P>::DownloadImage(ImageId image_id) {
    Image& image = slot_images[image_id];
    auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
    const auto copies = FullDownloadCopies(image.info);
    image.DownloadMemory(map, copies);
    runtime.Finish();
    SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span);
 }
 template <class P>
 void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) {
    std::vector<ImageId> deleted_images;
@@ -1058,7 +1063,7 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
    for (const ImageId overlap_id : ignore_textures) {
        Image& overlap = slot_images[overlap_id];
        if (True(overlap.flags & ImageFlagBits::GpuModified)) {
-            UNIMPLEMENTED();
+            DownloadImage(overlap_id);
        }
        if (True(overlap.flags & ImageFlagBits::Tracked)) {
            UntrackImage(overlap, overlap_id);
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -139,6 +139,9 @@ public:
    /// Download contents of host images to guest memory in a region
    void DownloadMemory(VAddr cpu_addr, size_t size);
    /// Download contents of host images to guest memory
    void DownloadImage(ImageId image_id);
    /// Remove images in a region
    void UnmapMemory(VAddr cpu_addr, size_t size);
--- a/src/yuzu/configuration/configure_cpu.ui
+++ b/src/yuzu/configuration/configure_cpu.ui
@@ -52,6 +52,11 @@
               <string>Unsafe</string>
              </property>
             </item>
             <item>
              <property name="text">
               <string>Paranoid (disables most optimizations)</string>
              </property>
             </item>
            </widget>
           </item>
          </layout>
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -342,12 +342,6 @@ fps_cap =
 # null: No audio output
 output_engine =
 # Whether or not to enable the audio-stretching post-processing effect.
 # This effect adjusts audio speed to match emulation speed and helps prevent audio stutter,
 # at the cost of increasing audio latency.
 # 0: No, 1 (default): Yes
 enable_audio_stretching =
 # Which audio device to use.
 # auto (default): Auto-select
 output_device =