early-access version 1332

2021-01-17 03:19:34 +01:00
parent 233493ad87
commit f70af7672d
126 changed files with 3856 additions and 3241 deletions
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 yuzu emulator early access
 =============

-This is the source code for early-access 1331.
+This is the source code for early-access 1332.

 ## Legal Notice

--- a/dist/icons/controller/controller.qrc
+++ b/dist/icons/controller/controller.qrc
@@ -1,26 +1,5 @@
 <RCC>
  <qresource prefix="controller">
-    <file alias="dual_joycon">dual_joycon.png</file>
-    <file alias="dual_joycon_dark">dual_joycon_dark.png</file>
-    <file alias="dual_joycon_midnight">dual_joycon_midnight.png</file>
-    <file alias="handheld">handheld.png</file>
-    <file alias="handheld_dark">handheld_dark.png</file>
-    <file alias="handheld_midnight">handheld_midnight.png</file>
-    <file alias="pro_controller">pro_controller.png</file>
-    <file alias="pro_controller_dark">pro_controller_dark.png</file>
-    <file alias="pro_controller_midnight">pro_controller_midnight.png</file>
-    <file alias="single_joycon_left">single_joycon_left.png</file>
-    <file alias="single_joycon_left_dark">single_joycon_left_dark.png</file>
-    <file alias="single_joycon_left_midnight">single_joycon_left_midnight.png</file>
-    <file alias="single_joycon_right">single_joycon_right.png</file>
-    <file alias="single_joycon_right_dark">single_joycon_right_dark.png</file>
-    <file alias="single_joycon_right_midnight">single_joycon_right_midnight.png</file>
-    <file alias="single_joycon_left_vertical">single_joycon_left_vertical.png</file>
-    <file alias="single_joycon_left_vertical_dark">single_joycon_left_vertical_dark.png</file>
-    <file alias="single_joycon_left_vertical_midnight">single_joycon_left_vertical_midnight.png</file>
-    <file alias="single_joycon_right_vertical">single_joycon_right_vertical.png</file>
-    <file alias="single_joycon_right_vertical_dark">single_joycon_right_vertical_dark.png</file>
-    <file alias="single_joycon_right_vertical_midnight">single_joycon_right_vertical_midnight.png</file>
    <file alias="applet_dual_joycon">applet_dual_joycon.png</file>
    <file alias="applet_dual_joycon_dark">applet_dual_joycon_dark.png</file>	
    <file alias="applet_dual_joycon_midnight">applet_dual_joycon_midnight.png</file>	
--- a/src/common/uint128.cpp
+++ b/src/common/uint128.cpp
@@ -38,34 +38,4 @@ u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) {

 #endif

-u128 Multiply64Into128(u64 a, u64 b) {
-    u128 result;
-#ifdef _MSC_VER
-    result[0] = _umul128(a, b, &result[1]);
-#else
-    unsigned __int128 tmp = a;
-    tmp *= b;
-    std::memcpy(&result, &tmp, sizeof(u128));
-#endif
-    return result;
-}
-
-std::pair<u64, u64> Divide128On32(u128 dividend, u32 divisor) {
-    u64 remainder = dividend[0] % divisor;
-    u64 accum = dividend[0] / divisor;
-    if (dividend[1] == 0)
-        return {accum, remainder};
-    // We ignore dividend[1] / divisor as that overflows
-    const u64 first_segment = (dividend[1] % divisor) << 32;
-    accum += (first_segment / divisor) << 32;
-    const u64 second_segment = (first_segment % divisor) << 32;
-    accum += (second_segment / divisor);
-    remainder += second_segment % divisor;
-    if (remainder >= divisor) {
-        accum++;
-        remainder -= divisor;
-    }
-    return {accum, remainder};
-}
-
 } // namespace Common
--- a/src/common/uint128.h
+++ b/src/common/uint128.h
@@ -12,11 +12,4 @@ namespace Common {
 // This function multiplies 2 u64 values and divides it by a u64 value.
 [[nodiscard]] u64 MultiplyAndDivide64(u64 a, u64 b, u64 d);

-// This function multiplies 2 u64 values and produces a u128 value;
-[[nodiscard]] u128 Multiply64Into128(u64 a, u64 b);
-
-// This function divides a u128 by a u32 value and produces two u64 values:
-// the result of division and the remainder
-[[nodiscard]] std::pair<u64, u64> Divide128On32(u128 dividend, u32 divisor);
-
 } // namespace Common
--- a/src/common/wall_clock.cpp
+++ b/src/common/wall_clock.cpp
@@ -2,7 +2,6 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include "common/uint128.h"
 #include "common/wall_clock.h"

 #ifdef ARCHITECTURE_x86_64
@@ -41,16 +40,11 @@ public:
    }

    u64 GetClockCycles() override {
-        std::chrono::nanoseconds time_now = GetTimeNS();
-        const u128 temporary =
-            Common::Multiply64Into128(time_now.count(), emulated_clock_frequency);
-        return Common::Divide128On32(temporary, 1000000000).first;
+        return GetTimeNS().count() * (emulated_clock_frequency / 1000) / 1000000;
    }

    u64 GetCPUCycles() override {
-        std::chrono::nanoseconds time_now = GetTimeNS();
-        const u128 temporary = Common::Multiply64Into128(time_now.count(), emulated_cpu_frequency);
-        return Common::Divide128On32(temporary, 1000000000).first;
+        return GetTimeNS().count() * (emulated_cpu_frequency / 1000) / 1000000;
    }

    void Pause([[maybe_unused]] bool is_paused) override {
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -19,7 +19,6 @@ add_library(core STATIC
    core.h
    core_timing.cpp
    core_timing.h
-    core_timing_util.cpp
    core_timing_util.h
    cpu_manager.cpp
    cpu_manager.h
--- a/src/core/core_timing_util.h
+++ b/src/core/core_timing_util.h
@@ -1,24 +1,59 @@
-// Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project
-// Licensed under GPLv2+
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

 #pragma once

 #include <chrono>
+
 #include "common/common_types.h"
+#include "core/hardware_properties.h"

 namespace Core::Timing {

-s64 msToCycles(std::chrono::milliseconds ms);
-s64 usToCycles(std::chrono::microseconds us);
-s64 nsToCycles(std::chrono::nanoseconds ns);
-u64 msToClockCycles(std::chrono::milliseconds ns);
-u64 usToClockCycles(std::chrono::microseconds ns);
-u64 nsToClockCycles(std::chrono::nanoseconds ns);
-std::chrono::milliseconds CyclesToMs(s64 cycles);
-std::chrono::nanoseconds CyclesToNs(s64 cycles);
-std::chrono::microseconds CyclesToUs(s64 cycles);
+namespace detail {
+constexpr u64 CNTFREQ_ADJUSTED = Hardware::CNTFREQ / 1000;
+constexpr u64 BASE_CLOCK_RATE_ADJUSTED = Hardware::BASE_CLOCK_RATE / 1000;
+} // namespace detail

-u64 CpuCyclesToClockCycles(u64 ticks);
+[[nodiscard]] constexpr s64 msToCycles(std::chrono::milliseconds ms) {
+    return ms.count() * detail::BASE_CLOCK_RATE_ADJUSTED;
+}
+
+[[nodiscard]] constexpr s64 usToCycles(std::chrono::microseconds us) {
+    return us.count() * detail::BASE_CLOCK_RATE_ADJUSTED / 1000;
+}
+
+[[nodiscard]] constexpr s64 nsToCycles(std::chrono::nanoseconds ns) {
+    return ns.count() * detail::BASE_CLOCK_RATE_ADJUSTED / 1000000;
+}
+
+[[nodiscard]] constexpr u64 msToClockCycles(std::chrono::milliseconds ms) {
+    return static_cast<u64>(ms.count()) * detail::CNTFREQ_ADJUSTED;
+}
+
+[[nodiscard]] constexpr u64 usToClockCycles(std::chrono::microseconds us) {
+    return us.count() * detail::CNTFREQ_ADJUSTED / 1000;
+}
+
+[[nodiscard]] constexpr u64 nsToClockCycles(std::chrono::nanoseconds ns) {
+    return ns.count() * detail::CNTFREQ_ADJUSTED / 1000000;
+}
+
+[[nodiscard]] constexpr u64 CpuCyclesToClockCycles(u64 ticks) {
+    return ticks * detail::CNTFREQ_ADJUSTED / detail::BASE_CLOCK_RATE_ADJUSTED;
+}
+
+[[nodiscard]] constexpr std::chrono::milliseconds CyclesToMs(s64 cycles) {
+    return std::chrono::milliseconds(cycles / detail::BASE_CLOCK_RATE_ADJUSTED);
+}
+
+[[nodiscard]] constexpr std::chrono::nanoseconds CyclesToNs(s64 cycles) {
+    return std::chrono::nanoseconds(cycles * 1000000 / detail::BASE_CLOCK_RATE_ADJUSTED);
+}
+
+[[nodiscard]] constexpr std::chrono::microseconds CyclesToUs(s64 cycles) {
+    return std::chrono::microseconds(cycles * 1000 / detail::BASE_CLOCK_RATE_ADJUSTED);
+}

 } // namespace Core::Timing
--- a/src/core/frontend/emu_window.cpp
+++ b/src/core/frontend/emu_window.cpp
@@ -21,21 +21,18 @@ public:

    std::mutex mutex;

-    bool touch_pressed = false; ///< True if touchpad area is currently pressed, otherwise false
-
-    float touch_x = 0.0f; ///< Touchpad X-position
-    float touch_y = 0.0f; ///< Touchpad Y-position
+    Input::TouchStatus status;

 private:
    class Device : public Input::TouchDevice {
    public:
        explicit Device(std::weak_ptr<TouchState>&& touch_state) : touch_state(touch_state) {}
-        std::tuple<float, float, bool> GetStatus() const override {
+        Input::TouchStatus GetStatus() const override {
            if (auto state = touch_state.lock()) {
                std::lock_guard guard{state->mutex};
-                return std::make_tuple(state->touch_x, state->touch_y, state->touch_pressed);
+                return state->status;
            }
-            return std::make_tuple(0.0f, 0.0f, false);
+            return {};
        }

    private:
@@ -79,36 +76,44 @@ std::tuple<unsigned, unsigned> EmuWindow::ClipToTouchScreen(unsigned new_x, unsi
    return std::make_tuple(new_x, new_y);
 }

-void EmuWindow::TouchPressed(unsigned framebuffer_x, unsigned framebuffer_y) {
-    if (!IsWithinTouchscreen(framebuffer_layout, framebuffer_x, framebuffer_y))
+void EmuWindow::TouchPressed(unsigned framebuffer_x, unsigned framebuffer_y, std::size_t id) {
+    if (!IsWithinTouchscreen(framebuffer_layout, framebuffer_x, framebuffer_y)) {
        return;
+    }
+    if (id >= touch_state->status.size()) {
+        return;
+    }

    std::lock_guard guard{touch_state->mutex};
-    touch_state->touch_x =
+    const float x =
        static_cast<float>(framebuffer_x - framebuffer_layout.screen.left) /
        static_cast<float>(framebuffer_layout.screen.right - framebuffer_layout.screen.left);
-    touch_state->touch_y =
+    const float y =
        static_cast<float>(framebuffer_y - framebuffer_layout.screen.top) /
        static_cast<float>(framebuffer_layout.screen.bottom - framebuffer_layout.screen.top);

-    touch_state->touch_pressed = true;
+    touch_state->status[id] = std::make_tuple(x, y, true);
 }

-void EmuWindow::TouchReleased() {
+void EmuWindow::TouchReleased(std::size_t id) {
+    if (id >= touch_state->status.size()) {
+        return;
+    }
    std::lock_guard guard{touch_state->mutex};
-    touch_state->touch_pressed = false;
-    touch_state->touch_x = 0;
-    touch_state->touch_y = 0;
+    touch_state->status[id] = std::make_tuple(0.0f, 0.0f, false);
 }

-void EmuWindow::TouchMoved(unsigned framebuffer_x, unsigned framebuffer_y) {
-    if (!touch_state->touch_pressed)
+void EmuWindow::TouchMoved(unsigned framebuffer_x, unsigned framebuffer_y, std::size_t id) {
+    if (id >= touch_state->status.size()) {
+        return;
+    }
+    if (!std::get<2>(touch_state->status[id]))
        return;

    if (!IsWithinTouchscreen(framebuffer_layout, framebuffer_x, framebuffer_y))
        std::tie(framebuffer_x, framebuffer_y) = ClipToTouchScreen(framebuffer_x, framebuffer_y);

-    TouchPressed(framebuffer_x, framebuffer_y);
+    TouchPressed(framebuffer_x, framebuffer_y, id);
 }

 void EmuWindow::UpdateCurrentFramebufferLayout(unsigned width, unsigned height) {
--- a/src/core/frontend/emu_window.h
+++ b/src/core/frontend/emu_window.h
@@ -117,18 +117,23 @@ public:
     * Signal that a touch pressed event has occurred (e.g. mouse click pressed)
     * @param framebuffer_x Framebuffer x-coordinate that was pressed
     * @param framebuffer_y Framebuffer y-coordinate that was pressed
+     * @param id Touch event ID
     */
-    void TouchPressed(unsigned framebuffer_x, unsigned framebuffer_y);
+    void TouchPressed(unsigned framebuffer_x, unsigned framebuffer_y, std::size_t id);

-    /// Signal that a touch released event has occurred (e.g. mouse click released)
-    void TouchReleased();
+    /**
+     * Signal that a touch released event has occurred (e.g. mouse click released)
+     * @param id Touch event ID
+     */
+    void TouchReleased(std::size_t id);

    /**
     * Signal that a touch movement event has occurred (e.g. mouse was moved over the emu window)
     * @param framebuffer_x Framebuffer x-coordinate
     * @param framebuffer_y Framebuffer y-coordinate
+     * @param id Touch event ID
     */
-    void TouchMoved(unsigned framebuffer_x, unsigned framebuffer_y);
+    void TouchMoved(unsigned framebuffer_x, unsigned framebuffer_y, std::size_t id);

    /**
     * Returns currently active configuration.
--- a/src/core/frontend/input.h
+++ b/src/core/frontend/input.h
@@ -21,6 +21,11 @@ enum class AnalogDirection : u8 {
    UP,
    DOWN,
 };
+struct AnalogProperties {
+    float deadzone;
+    float range;
+    float threshold;
+};

 /// An abstract class template for an input device (a button, an analog input, etc.).
 template <typename StatusType>
@@ -30,6 +35,12 @@ public:
    virtual StatusType GetStatus() const {
        return {};
    }
+    virtual StatusType GetRawStatus() const {
+        return GetStatus();
+    }
+    virtual AnalogProperties GetAnalogProperties() const {
+        return {};
+    }
    virtual bool GetAnalogDirectionStatus([[maybe_unused]] AnalogDirection direction) const {
        return {};
    }
@@ -163,10 +174,11 @@ using MotionStatus = std::tuple<Common::Vec3<float>, Common::Vec3<float>, Common
 using MotionDevice = InputDevice<MotionStatus>;

 /**
- * A touch status is an object that returns a tuple of two floats and a bool. The floats are
- * x and y coordinates in the range 0.0 - 1.0, and the bool indicates whether it is pressed.
+ * A touch status is an array of 16 tuples, one per touch point, each holding two floats and a
+ * bool. The floats are x and y coordinates in the range 0.0 - 1.0, and the bool indicates whether
+ * that touch point is pressed.
 */
-using TouchStatus = std::tuple<float, float, bool>;
+using TouchStatus = std::array<std::tuple<float, float, bool>, 16>;

 /**
 * A touch device is an input device that returns a touch status object
--- a/src/core/hle/service/hid/controllers/touchscreen.cpp
+++ b/src/core/hle/service/hid/controllers/touchscreen.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <algorithm>
 #include <cstring>
 #include "common/common_types.h"
 #include "core/core_timing.h"
@@ -16,7 +17,13 @@ constexpr std::size_t SHARED_MEMORY_OFFSET = 0x400;
 Controller_Touchscreen::Controller_Touchscreen(Core::System& system) : ControllerBase(system) {}
 Controller_Touchscreen::~Controller_Touchscreen() = default;

-void Controller_Touchscreen::OnInit() {}
+void Controller_Touchscreen::OnInit() {
+    for (std::size_t id = 0; id < MAX_FINGERS; ++id) {
+        mouse_finger_id[id] = MAX_FINGERS;
+        keyboard_finger_id[id] = MAX_FINGERS;
+        udp_finger_id[id] = MAX_FINGERS;
+    }
+}

 void Controller_Touchscreen::OnRelease() {}

@@ -40,38 +47,106 @@ void Controller_Touchscreen::OnUpdate(const Core::Timing::CoreTiming& core_timin
    cur_entry.sampling_number = last_entry.sampling_number + 1;
    cur_entry.sampling_number2 = cur_entry.sampling_number;

-    bool pressed = false;
-    float x, y;
-    std::tie(x, y, pressed) = touch_device->GetStatus();
-    auto& touch_entry = cur_entry.states[0];
-    touch_entry.attribute.raw = 0;
-    if (!pressed && touch_btn_device) {
-        std::tie(x, y, pressed) = touch_btn_device->GetStatus();
-    }
-    if (pressed && Settings::values.touchscreen.enabled) {
-        touch_entry.x = static_cast<u16>(x * Layout::ScreenUndocked::Width);
-        touch_entry.y = static_cast<u16>(y * Layout::ScreenUndocked::Height);
-        touch_entry.diameter_x = Settings::values.touchscreen.diameter_x;
-        touch_entry.diameter_y = Settings::values.touchscreen.diameter_y;
-        touch_entry.rotation_angle = Settings::values.touchscreen.rotation_angle;
-        const u64 tick = core_timing.GetCPUTicks();
-        touch_entry.delta_time = tick - last_touch;
-        last_touch = tick;
-        touch_entry.finger = Settings::values.touchscreen.finger;
-        cur_entry.entry_count = 1;
-    } else {
-        cur_entry.entry_count = 0;
+    const Input::TouchStatus& mouse_status = touch_mouse_device->GetStatus();
+    const Input::TouchStatus& udp_status = touch_udp_device->GetStatus();
+    for (std::size_t id = 0; id < mouse_status.size(); ++id) {
+        mouse_finger_id[id] = UpdateTouchInputEvent(mouse_status[id], mouse_finger_id[id]);
+        udp_finger_id[id] = UpdateTouchInputEvent(udp_status[id], udp_finger_id[id]);
    }

+    if (Settings::values.use_touch_from_button) {
+        const Input::TouchStatus& keyboard_status = touch_btn_device->GetStatus();
+        for (std::size_t id = 0; id < mouse_status.size(); ++id) {
+            keyboard_finger_id[id] =
+                UpdateTouchInputEvent(keyboard_status[id], keyboard_finger_id[id]);
+        }
+    }
+
+    std::array<Finger, 16> active_fingers;
+    const auto end_iter = std::copy_if(fingers.begin(), fingers.end(), active_fingers.begin(),
+                                       [](const auto& finger) { return finger.pressed; });
+    const auto active_fingers_count =
+        static_cast<std::size_t>(std::distance(active_fingers.begin(), end_iter));
+
+    const u64 tick = core_timing.GetCPUTicks();
+    cur_entry.entry_count = static_cast<s32_le>(active_fingers_count);
+    for (std::size_t id = 0; id < MAX_FINGERS; ++id) {
+        auto& touch_entry = cur_entry.states[id];
+        if (id < active_fingers_count) {
+            touch_entry.x = static_cast<u16>(active_fingers[id].x * Layout::ScreenUndocked::Width);
+            touch_entry.y = static_cast<u16>(active_fingers[id].y * Layout::ScreenUndocked::Height);
+            touch_entry.diameter_x = Settings::values.touchscreen.diameter_x;
+            touch_entry.diameter_y = Settings::values.touchscreen.diameter_y;
+            touch_entry.rotation_angle = Settings::values.touchscreen.rotation_angle;
+            touch_entry.delta_time = tick - active_fingers[id].last_touch;
+            fingers[active_fingers[id].id].last_touch = tick;
+            touch_entry.finger = active_fingers[id].id;
+            touch_entry.attribute.raw = active_fingers[id].attribute.raw;
+        } else {
+            // Clear touch entry
+            touch_entry.attribute.raw = 0;
+            touch_entry.x = 0;
+            touch_entry.y = 0;
+            touch_entry.diameter_x = 0;
+            touch_entry.diameter_y = 0;
+            touch_entry.rotation_angle = 0;
+            touch_entry.delta_time = 0;
+            touch_entry.finger = 0;
+        }
+    }
    std::memcpy(data + SHARED_MEMORY_OFFSET, &shared_memory, sizeof(TouchScreenSharedMemory));
 }

 void Controller_Touchscreen::OnLoadInputDevices() {
-    touch_device = Input::CreateDevice<Input::TouchDevice>(Settings::values.touchscreen.device);
-    if (Settings::values.use_touch_from_button) {
-        touch_btn_device = Input::CreateDevice<Input::TouchDevice>("engine:touch_from_button");
-    } else {
-        touch_btn_device.reset();
-    }
+    touch_mouse_device = Input::CreateDevice<Input::TouchDevice>("engine:emu_window");
+    touch_udp_device = Input::CreateDevice<Input::TouchDevice>("engine:cemuhookudp");
+    touch_btn_device = Input::CreateDevice<Input::TouchDevice>("engine:touch_from_button");
 }
+
+std::optional<std::size_t> Controller_Touchscreen::GetUnusedFingerID() const {
+    std::size_t first_free_id = 0;
+    while (first_free_id < MAX_FINGERS) {
+        if (!fingers[first_free_id].pressed) {
+            return first_free_id;
+        } else {
+            first_free_id++;
+        }
+    }
+    return std::nullopt;
+}
+
+std::size_t Controller_Touchscreen::UpdateTouchInputEvent(
+    const std::tuple<float, float, bool>& touch_input, std::size_t finger_id) {
+    const auto& [x, y, pressed] = touch_input;
+    if (pressed) {
+        Attributes attribute{};
+        if (finger_id == MAX_FINGERS) {
+            const auto first_free_id = GetUnusedFingerID();
+            if (!first_free_id) {
+                // No free finger id is available, so do nothing
+                return MAX_FINGERS;
+            }
+            finger_id = first_free_id.value();
+            fingers[finger_id].pressed = true;
+            fingers[finger_id].id = static_cast<u32_le>(finger_id);
+            attribute.start_touch.Assign(1);
+        }
+        fingers[finger_id].x = x;
+        fingers[finger_id].y = y;
+        fingers[finger_id].attribute = attribute;
+        return finger_id;
+    }
+
+    if (finger_id != MAX_FINGERS) {
+        if (!fingers[finger_id].attribute.end_touch) {
+            fingers[finger_id].attribute.end_touch.Assign(1);
+            fingers[finger_id].attribute.start_touch.Assign(0);
+            return finger_id;
+        }
+        fingers[finger_id].pressed = false;
+    }
+
+    return MAX_FINGERS;
+}
+
 } // namespace Service::HID
--- a/src/core/hle/service/hid/controllers/touchscreen.h
+++ b/src/core/hle/service/hid/controllers/touchscreen.h
@@ -30,6 +30,18 @@ public:
    void OnLoadInputDevices() override;

 private:
+    static constexpr std::size_t MAX_FINGERS = 16;
+
+    // Returns an unused finger id; if no fingers are available, std::nullopt is returned
+    std::optional<std::size_t> GetUnusedFingerID() const;
+
+    // If the touch is new, tries to assign a new finger id; if no fingers are available, no
+    // changes are made. Updates the coordinates if the finger id is already set. When the touch
+    // ends, delays the output by one frame to set the end_touch flag before finally freeing the
+    // finger id.
+    std::size_t UpdateTouchInputEvent(const std::tuple<float, float, bool>& touch_input,
+                                      std::size_t finger_id);
+
    struct Attributes {
        union {
            u32 raw{};
@@ -55,7 +67,7 @@ private:
        s64_le sampling_number;
        s64_le sampling_number2;
        s32_le entry_count;
-        std::array<TouchState, 16> states;
+        std::array<TouchState, MAX_FINGERS> states;
    };
    static_assert(sizeof(TouchScreenEntry) == 0x298, "TouchScreenEntry is an invalid size");

@@ -66,9 +78,23 @@ private:
    };
    static_assert(sizeof(TouchScreenSharedMemory) == 0x3000,
                  "TouchScreenSharedMemory is an invalid size");
+
+    struct Finger {
+        u64_le last_touch{};
+        float x{};
+        float y{};
+        u32_le id{};
+        bool pressed{};
+        Attributes attribute;
+    };
+
    TouchScreenSharedMemory shared_memory{};
-    std::unique_ptr<Input::TouchDevice> touch_device;
+    std::unique_ptr<Input::TouchDevice> touch_mouse_device;
+    std::unique_ptr<Input::TouchDevice> touch_udp_device;
    std::unique_ptr<Input::TouchDevice> touch_btn_device;
-    s64_le last_touch{};
+    std::array<std::size_t, MAX_FINGERS> mouse_finger_id;
+    std::array<std::size_t, MAX_FINGERS> keyboard_finger_id;
+    std::array<std::size_t, MAX_FINGERS> udp_finger_id;
+    std::array<Finger, MAX_FINGERS> fingers;
 };
 } // namespace Service::HID
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@@ -37,6 +37,7 @@ NvResult nvhost_nvdec::Ioctl1(Ioctl command, const std::vector<u8>& input,
                Tegra::ChCommandHeaderList cmdlist(1);
                cmdlist[0] = Tegra::ChCommandHeader{0xDEADB33F};
                system.GPU().PushCommandBuffer(cmdlist);
+                system.GPU().MemoryManager().InvalidateQueuedCaches();
            }
            return UnmapBuffer(input, output);
        }
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
@@ -193,7 +193,13 @@ NvResult nvhost_nvdec_common::UnmapBuffer(const std::vector<u8>& input, std::vec
            return NvResult::InvalidState;
        }
        if (const auto size{RemoveBufferMap(object->dma_map_addr)}; size) {
-            gpu.MemoryManager().Unmap(object->dma_map_addr, *size);
+            if (vic_device) {
+                // UnmapVicFrame defers texture_cache invalidation of the frame address until
+                // the stream is over
+                gpu.MemoryManager().UnmapVicFrame(object->dma_map_addr, *size);
+            } else {
+                gpu.MemoryManager().Unmap(object->dma_map_addr, *size);
+            }
        } else {
            // This occurs quite frequently, however does not seem to impact functionality
            LOG_DEBUG(Service_NVDRV, "invalid offset=0x{:X} dma=0x{:X}", object->addr,
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
@@ -160,6 +160,7 @@ protected:

    s32_le nvmap_fd{};
    u32_le submit_timeout{};
+    bool vic_device{};
    std::shared_ptr<nvmap> nvmap_dev;
    SyncpointManager& syncpoint_manager;
    std::array<u32, MaxSyncPoints> device_syncpoints{};
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -12,8 +12,9 @@
 namespace Service::Nvidia::Devices {
 nvhost_vic::nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev,
                       SyncpointManager& syncpoint_manager)
-    : nvhost_nvdec_common(system, std::move(nvmap_dev), syncpoint_manager) {}
-
+    : nvhost_nvdec_common(system, std::move(nvmap_dev), syncpoint_manager) {
+    vic_device = true;
+}
 nvhost_vic::~nvhost_vic() = default;

 NvResult nvhost_vic::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -195,8 +195,9 @@ struct Memory::Impl {
            switch (type) {
            case Common::PageType::Unmapped: {
                LOG_ERROR(HW_Memory,
-                          "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
-                          current_vaddr, src_addr, size);
+                          "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) "
+                          "at PC 0x{:08X}",
+                          current_vaddr, src_addr, size, system.CurrentArmInterface().GetPC());
                std::memset(dest_buffer, 0, copy_amount);
                break;
            }
@@ -240,8 +241,9 @@ struct Memory::Impl {
            switch (type) {
            case Common::PageType::Unmapped: {
                LOG_ERROR(HW_Memory,
-                          "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
-                          current_vaddr, src_addr, size);
+                          "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) "
+                          "at PC 0x{:08X}",
+                          current_vaddr, src_addr, size, system.CurrentArmInterface().GetPC());
                std::memset(dest_buffer, 0, copy_amount);
                break;
            }
@@ -291,8 +293,9 @@ struct Memory::Impl {
            switch (type) {
            case Common::PageType::Unmapped: {
                LOG_ERROR(HW_Memory,
-                          "Unmapped WriteBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
-                          current_vaddr, dest_addr, size);
+                          "Unmapped WriteBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) "
+                          "at PC 0x{:08X}",
+                          current_vaddr, dest_addr, size, system.CurrentArmInterface().GetPC());
                break;
            }
            case Common::PageType::Memory: {
@@ -334,8 +337,9 @@ struct Memory::Impl {
            switch (type) {
            case Common::PageType::Unmapped: {
                LOG_ERROR(HW_Memory,
-                          "Unmapped WriteBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
-                          current_vaddr, dest_addr, size);
+                          "Unmapped WriteBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) "
+                          "at PC 0x{:08X}",
+                          current_vaddr, dest_addr, size, system.CurrentArmInterface().GetPC());
                break;
            }
            case Common::PageType::Memory: {
@@ -383,8 +387,9 @@ struct Memory::Impl {
            switch (type) {
            case Common::PageType::Unmapped: {
                LOG_ERROR(HW_Memory,
-                          "Unmapped ZeroBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
-                          current_vaddr, dest_addr, size);
+                          "Unmapped ZeroBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) "
+                          "at PC 0x{:08X}",
+                          current_vaddr, dest_addr, size, system.CurrentArmInterface().GetPC());
                break;
            }
            case Common::PageType::Memory: {
@@ -429,8 +434,9 @@ struct Memory::Impl {
            switch (type) {
            case Common::PageType::Unmapped: {
                LOG_ERROR(HW_Memory,
-                          "Unmapped CopyBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
-                          current_vaddr, src_addr, size);
+                          "Unmapped CopyBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) "
+                          "at PC 0x{:08X}",
+                          current_vaddr, src_addr, size, system.CurrentArmInterface().GetPC());
                ZeroBlock(process, dest_addr, copy_amount);
                break;
            }
@@ -601,7 +607,8 @@ struct Memory::Impl {
        }
        switch (Common::PageTable::PageInfo::ExtractType(raw_pointer)) {
        case Common::PageType::Unmapped:
-            LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:08X}", sizeof(T) * 8, vaddr);
+            LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:08X} at PC 0x{:08X}", sizeof(T) * 8, vaddr,
+                      system.CurrentArmInterface().GetPC());
            return 0;
        case Common::PageType::Memory:
            ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr);
@@ -638,8 +645,9 @@ struct Memory::Impl {
        }
        switch (Common::PageTable::PageInfo::ExtractType(raw_pointer)) {
        case Common::PageType::Unmapped:
-            LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8,
-                      static_cast<u32>(data), vaddr);
+            LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:08X} @ 0x{:016X} at PC 0x{:08X}",
+                      sizeof(data) * 8, static_cast<u32>(data), vaddr,
+                      system.CurrentArmInterface().GetPC());
            return;
        case Common::PageType::Memory:
            ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr);
--- a/src/input_common/gcadapter/gc_poller.cpp
+++ b/src/input_common/gcadapter/gc_poller.cpp
@@ -185,6 +185,16 @@ public:
        return {0.0f, 0.0f};
    }

+    std::tuple<float, float> GetRawStatus() const override {
+        const float x = GetAxis(axis_x);
+        const float y = GetAxis(axis_y);
+        return {x, y};
+    }
+
+    Input::AnalogProperties GetAnalogProperties() const override {
+        return {deadzone, range, 0.5f};
+    }
+
    bool GetAnalogDirectionStatus(Input::AnalogDirection direction) const override {
        const auto [x, y] = GetStatus();
        const float directional_deadzone = 0.5f;
--- a/src/input_common/sdl/sdl_impl.cpp
+++ b/src/input_common/sdl/sdl_impl.cpp
@@ -373,6 +373,16 @@ public:
        return {};
    }

+    std::tuple<float, float> GetRawStatus() const override {
+        const float x = joystick->GetAxis(axis_x, range);
+        const float y = joystick->GetAxis(axis_y, range);
+        return {x, -y};
+    }
+
+    Input::AnalogProperties GetAnalogProperties() const override {
+        return {deadzone, range, 0.5f};
+    }
+
    bool GetAnalogDirectionStatus(Input::AnalogDirection direction) const override {
        const auto [x, y] = GetStatus();
        const float directional_deadzone = 0.5f;
--- a/src/input_common/touch_from_button.cpp
+++ b/src/input_common/touch_from_button.cpp
@@ -25,18 +25,19 @@ public:
        }
    }

-    std::tuple<float, float, bool> GetStatus() const override {
-        for (const auto& m : map) {
-            const bool state = std::get<0>(m)->GetStatus();
+    Input::TouchStatus GetStatus() const override {
+        Input::TouchStatus touch_status{};
+        for (std::size_t id = 0; id < map.size() && id < touch_status.size(); ++id) {
+            const bool state = std::get<0>(map[id])->GetStatus();
            if (state) {
-                const float x = static_cast<float>(std::get<1>(m)) /
+                const float x = static_cast<float>(std::get<1>(map[id])) /
                                static_cast<int>(Layout::ScreenUndocked::Width);
-                const float y = static_cast<float>(std::get<2>(m)) /
+                const float y = static_cast<float>(std::get<2>(map[id])) /
                                static_cast<int>(Layout::ScreenUndocked::Height);
-                return {x, y, true};
+                touch_status[id] = {x, y, true};
            }
        }
-        return {};
+        return touch_status;
    }

 private:
--- a/src/input_common/udp/client.cpp
+++ b/src/input_common/udp/client.cpp
@@ -136,6 +136,7 @@ static void SocketLoop(Socket* socket) {

 Client::Client() {
    LOG_INFO(Input, "Udp Initialization started");
+    finger_id.fill(MAX_TOUCH_FINGERS);
    ReloadSockets();
 }

@@ -176,7 +177,7 @@ void Client::ReloadSockets() {
    std::string server_token;
    std::size_t client = 0;
    while (std::getline(servers_ss, server_token, ',')) {
-        if (client == max_udp_clients) {
+        if (client == MAX_UDP_CLIENTS) {
            break;
        }
        std::stringstream server_ss(server_token);
@@ -194,7 +195,7 @@ void Client::ReloadSockets() {
        for (std::size_t pad = 0; pad < 4; ++pad) {
            const std::size_t client_number =
                GetClientNumber(udp_input_address, udp_input_port, pad);
-            if (client_number != max_udp_clients) {
+            if (client_number != MAX_UDP_CLIENTS) {
                LOG_ERROR(Input, "Duplicated UDP servers found");
                continue;
            }
@@ -213,7 +214,7 @@ std::size_t Client::GetClientNumber(std::string_view host, u16 port, std::size_t
            return client;
        }
    }
-    return max_udp_clients;
+    return MAX_UDP_CLIENTS;
 }

 void Client::OnVersion([[maybe_unused]] Response::Version data) {
@@ -259,33 +260,14 @@ void Client::OnPadData(Response::PadData data, std::size_t client) {
        std::lock_guard guard(clients[client].status.update_mutex);
        clients[client].status.motion_status = clients[client].motion.GetMotion();

-        // TODO: add a setting for "click" touch. Click touch refers to a device that differentiates
-        // between a simple "tap" and a hard press that causes the touch screen to click.
-        const bool is_active = data.touch_1.is_active != 0;
-
-        float x = 0;
-        float y = 0;
-
-        if (is_active && clients[client].status.touch_calibration) {
-            const u16 min_x = clients[client].status.touch_calibration->min_x;
-            const u16 max_x = clients[client].status.touch_calibration->max_x;
-            const u16 min_y = clients[client].status.touch_calibration->min_y;
-            const u16 max_y = clients[client].status.touch_calibration->max_y;
-
-            x = static_cast<float>(std::clamp(static_cast<u16>(data.touch_1.x), min_x, max_x) -
-                                   min_x) /
-                static_cast<float>(max_x - min_x);
-            y = static_cast<float>(std::clamp(static_cast<u16>(data.touch_1.y), min_y, max_y) -
-                                   min_y) /
-                static_cast<float>(max_y - min_y);
+        for (std::size_t id = 0; id < data.touch.size(); ++id) {
+            UpdateTouchInput(data.touch[id], client, id);
        }

-        clients[client].status.touch_status = {x, y, is_active};
-
        if (configuring) {
            const Common::Vec3f gyroscope = clients[client].motion.GetGyroscope();
            const Common::Vec3f accelerometer = clients[client].motion.GetAcceleration();
-            UpdateYuzuSettings(client, accelerometer, gyroscope, is_active);
+            UpdateYuzuSettings(client, accelerometer, gyroscope);
        }
    }
 }
@@ -320,21 +302,17 @@ void Client::Reset() {
 }

 void Client::UpdateYuzuSettings(std::size_t client, const Common::Vec3<float>& acc,
-                                const Common::Vec3<float>& gyro, bool touch) {
+                                const Common::Vec3<float>& gyro) {
    if (gyro.Length() > 0.2f) {
-        LOG_DEBUG(Input, "UDP Controller {}: gyro=({}, {}, {}), accel=({}, {}, {}), touch={}",
-                  client, gyro[0], gyro[1], gyro[2], acc[0], acc[1], acc[2], touch);
+        LOG_DEBUG(Input, "UDP Controller {}: gyro=({}, {}, {}), accel=({}, {}, {})", client,
+                  gyro[0], gyro[1], gyro[2], acc[0], acc[1], acc[2]);
    }
    UDPPadStatus pad{
        .host = clients[client].host,
        .port = clients[client].port,
        .pad_index = clients[client].pad_index,
    };
-    if (touch) {
-        pad.touch = PadTouch::Click;
-        pad_queue.Push(pad);
-    }
-    for (size_t i = 0; i < 3; ++i) {
+    for (std::size_t i = 0; i < 3; ++i) {
        if (gyro[i] > 5.0f || gyro[i] < -5.0f) {
            pad.motion = static_cast<PadMotion>(i);
            pad.motion_value = gyro[i];
@@ -348,6 +326,50 @@ void Client::UpdateYuzuSettings(std::size_t client, const Common::Vec3<float>& a
    }
 }

+std::optional<std::size_t> Client::GetUnusedFingerID() const {
+    // Only slots that exist in touch_status may be probed, and a valid slot must stay
+    // below MAX_TOUCH_FINGERS because finger_id stores that value as its "no finger
+    // assigned" marker. The third tuple element is the pressed flag.
+    for (std::size_t id = 0; id < MAX_TOUCH_FINGERS && id < touch_status.size(); ++id) {
+        if (!std::get<2>(touch_status[id])) {
+            return id;
+        }
+    }
+    return std::nullopt;
+}
+
+void Client::UpdateTouchInput(Response::TouchPad& touch_pad, std::size_t client, std::size_t id) {
+    // TODO: Use custom calibration per device
+    const Common::ParamPackage touch_param(Settings::values.touch_device);
+    const u16 min_x = static_cast<u16>(touch_param.Get("min_x", 100));
+    const u16 min_y = static_cast<u16>(touch_param.Get("min_y", 50));
+    const u16 max_x = static_cast<u16>(touch_param.Get("max_x", 1800));
+    const u16 max_y = static_cast<u16>(touch_param.Get("max_y", 850));
+    const std::size_t touch_id = client * 2 + id; // Two touch slots per client
+    if (touch_pad.is_active) {
+        if (finger_id[touch_id] == MAX_TOUCH_FINGERS) { // No finger assigned to this slot yet
+            const auto first_free_id = GetUnusedFingerID();
+            if (!first_free_id) {
+                // No unused finger id is left; skip this input
+                return;
+            }
+            finger_id[touch_id] = *first_free_id;
+        }
+        auto& [x, y, pressed] = touch_status[finger_id[touch_id]];
+        x = static_cast<float>(std::clamp(static_cast<u16>(touch_pad.x), min_x, max_x) - min_x) /
+            static_cast<float>(max_x - min_x);
+        y = static_cast<float>(std::clamp(static_cast<u16>(touch_pad.y), min_y, max_y) - min_y) /
+            static_cast<float>(max_y - min_y);
+        pressed = true;
+        return;
+    }
+
+    if (finger_id[touch_id] != MAX_TOUCH_FINGERS) {
+        touch_status[finger_id[touch_id]] = {};
+        finger_id[touch_id] = MAX_TOUCH_FINGERS;
+    }
+}
+
 void Client::BeginConfiguration() {
    pad_queue.Clear();
    configuring = true;
@@ -360,7 +382,7 @@ void Client::EndConfiguration() {

 DeviceStatus& Client::GetPadState(const std::string& host, u16 port, std::size_t pad) {
    const std::size_t client_number = GetClientNumber(host, port, pad);
-    if (client_number == max_udp_clients) {
+    if (client_number == MAX_UDP_CLIENTS) {
        return clients[0].status;
    }
    return clients[client_number].status;
@@ -368,12 +390,20 @@ DeviceStatus& Client::GetPadState(const std::string& host, u16 port, std::size_t

 const DeviceStatus& Client::GetPadState(const std::string& host, u16 port, std::size_t pad) const {
    const std::size_t client_number = GetClientNumber(host, port, pad);
-    if (client_number == max_udp_clients) {
+    if (client_number == MAX_UDP_CLIENTS) {
        return clients[0].status;
    }
    return clients[client_number].status;
 }

+Input::TouchStatus& Client::GetTouchState() {
+    return touch_status;
+}
+
+const Input::TouchStatus& Client::GetTouchState() const {
+    return touch_status;
+}
+
 Common::SPSCQueue<UDPPadStatus>& Client::GetPadQueue() {
    return pad_queue;
 }
@@ -426,24 +456,24 @@ CalibrationConfigurationJob::CalibrationConfigurationJob(
                                        current_status = Status::Ready;
                                        status_callback(current_status);
                                    }
-                                    if (data.touch_1.is_active == 0) {
+                                    if (data.touch[0].is_active == 0) {
                                        return;
                                    }
-                                    LOG_DEBUG(Input, "Current touch: {} {}", data.touch_1.x,
-                                              data.touch_1.y);
-                                    min_x = std::min(min_x, static_cast<u16>(data.touch_1.x));
-                                    min_y = std::min(min_y, static_cast<u16>(data.touch_1.y));
+                                    LOG_DEBUG(Input, "Current touch: {} {}", data.touch[0].x,
+                                              data.touch[0].y);
+                                    min_x = std::min(min_x, static_cast<u16>(data.touch[0].x));
+                                    min_y = std::min(min_y, static_cast<u16>(data.touch[0].y));
                                    if (current_status == Status::Ready) {
                                        // First touch - min data (min_x/min_y)
                                        current_status = Status::Stage1Completed;
                                        status_callback(current_status);
                                    }
-                                    if (data.touch_1.x - min_x > CALIBRATION_THRESHOLD &&
-                                        data.touch_1.y - min_y > CALIBRATION_THRESHOLD) {
+                                    if (data.touch[0].x - min_x > CALIBRATION_THRESHOLD &&
+                                        data.touch[0].y - min_y > CALIBRATION_THRESHOLD) {
                                        // Set the current position as max value and finishes
                                        // configuration
-                                        max_x = data.touch_1.x;
-                                        max_y = data.touch_1.y;
+                                        max_x = data.touch[0].x;
+                                        max_y = data.touch[0].y;
                                        current_status = Status::Completed;
                                        data_callback(min_x, min_y, max_x, max_y);
                                        status_callback(current_status);
--- a/src/input_common/udp/client.h
+++ b/src/input_common/udp/client.h
@@ -28,6 +28,7 @@ class Socket;
 namespace Response {
 struct PadData;
 struct PortInfo;
+struct TouchPad;
 struct Version;
 } // namespace Response

@@ -50,7 +51,6 @@ struct UDPPadStatus {
    std::string host{"127.0.0.1"};
    u16 port{26760};
    std::size_t pad_index{};
-    PadTouch touch{PadTouch::Undefined};
    PadMotion motion{PadMotion::Undefined};
    f32 motion_value{0.0f};
 };
@@ -93,6 +93,9 @@ public:
    DeviceStatus& GetPadState(const std::string& host, u16 port, std::size_t pad);
    const DeviceStatus& GetPadState(const std::string& host, u16 port, std::size_t pad) const;

+    Input::TouchStatus& GetTouchState();
+    const Input::TouchStatus& GetTouchState() const;
+
 private:
    struct ClientData {
        std::string host{"127.0.0.1"};
@@ -122,14 +125,25 @@ private:
    void StartCommunication(std::size_t client, const std::string& host, u16 port,
                            std::size_t pad_index, u32 client_id);
    void UpdateYuzuSettings(std::size_t client, const Common::Vec3<float>& acc,
-                            const Common::Vec3<float>& gyro, bool touch);
+                            const Common::Vec3<float>& gyro);
+
+    // Returns an unused finger id; if no finger is available, std::nullopt will be
+    // returned
+    std::optional<std::size_t> GetUnusedFingerID() const;
+
+    // Merges and updates all touch inputs into the touch_status array
+    void UpdateTouchInput(Response::TouchPad& touch_pad, std::size_t client, std::size_t id);

    bool configuring = false;

    // Allocate clients for 8 udp servers
-    const std::size_t max_udp_clients = 32;
-    std::array<ClientData, 4 * 8> clients;
-    Common::SPSCQueue<UDPPadStatus> pad_queue;
+    static constexpr std::size_t MAX_UDP_CLIENTS = 4 * 8;
+    // Each client can have up to 2 touch inputs
+    static constexpr std::size_t MAX_TOUCH_FINGERS = MAX_UDP_CLIENTS * 2;
+    std::array<ClientData, MAX_UDP_CLIENTS> clients{};
+    Common::SPSCQueue<UDPPadStatus> pad_queue{};
+    Input::TouchStatus touch_status{};
+    std::array<std::size_t, MAX_TOUCH_FINGERS> finger_id{};
 };

 /// An async job allowing configuration of the touchpad calibration.
--- a/src/input_common/udp/protocol.h
+++ b/src/input_common/udp/protocol.h
@@ -140,6 +140,14 @@ static_assert(sizeof(PortInfo) == 12, "UDP Response PortInfo struct has wrong si
 static_assert(std::is_trivially_copyable_v<PortInfo>,
              "UDP Response PortInfo is not trivially copyable");

+struct TouchPad {
+    u8 is_active{};
+    u8 id{};
+    u16_le x{};
+    u16_le y{};
+};
+static_assert(sizeof(TouchPad) == 6, "UDP Response TouchPad struct has wrong size ");
+
 #pragma pack(push, 1)
 struct PadData {
    PortInfo info{};
@@ -190,12 +198,7 @@ struct PadData {
        u8 button_13{};
    } analog_button;

-    struct TouchPad {
-        u8 is_active{};
-        u8 id{};
-        u16_le x{};
-        u16_le y{};
-    } touch_1, touch_2;
+    std::array<TouchPad, 2> touch;

    u64_le motion_timestamp;

@@ -222,7 +225,6 @@ static_assert(sizeof(Message<PadData>) == MAX_PACKET_SIZE,

 static_assert(sizeof(PadData::AnalogButton) == 12,
              "UDP Response AnalogButton struct has wrong size ");
-static_assert(sizeof(PadData::TouchPad) == 6, "UDP Response TouchPad struct has wrong size ");
 static_assert(sizeof(PadData::Accelerometer) == 12,
              "UDP Response Accelerometer struct has wrong size ");
 static_assert(sizeof(PadData::Gyroscope) == 12, "UDP Response Gyroscope struct has wrong size ");
--- a/src/input_common/udp/udp.cpp
+++ b/src/input_common/udp/udp.cpp
@@ -78,8 +78,8 @@ public:
    explicit UDPTouch(std::string ip_, u16 port_, u16 pad_, CemuhookUDP::Client* client_)
        : ip(std::move(ip_)), port(port_), pad(pad_), client(client_) {}

-    std::tuple<float, float, bool> GetStatus() const override {
-        return client->GetPadState(ip, port, pad).touch_status;
+    Input::TouchStatus GetStatus() const override {
+        return client->GetTouchState();
    }

 private:
@@ -107,32 +107,4 @@ std::unique_ptr<Input::TouchDevice> UDPTouchFactory::Create(const Common::ParamP
    return std::make_unique<UDPTouch>(std::move(ip), port, pad, client.get());
 }

-void UDPTouchFactory::BeginConfiguration() {
-    polling = true;
-    client->BeginConfiguration();
-}
-
-void UDPTouchFactory::EndConfiguration() {
-    polling = false;
-    client->EndConfiguration();
-}
-
-Common::ParamPackage UDPTouchFactory::GetNextInput() {
-    Common::ParamPackage params;
-    CemuhookUDP::UDPPadStatus pad;
-    auto& queue = client->GetPadQueue();
-    while (queue.Pop(pad)) {
-        if (pad.touch == CemuhookUDP::PadTouch::Undefined) {
-            continue;
-        }
-        params.Set("engine", "cemuhookudp");
-        params.Set("ip", pad.host);
-        params.Set("port", static_cast<u16>(pad.port));
-        params.Set("pad_index", static_cast<u16>(pad.pad_index));
-        params.Set("touch", static_cast<u16>(pad.touch));
-        return params;
-    }
-    return params;
-}
-
 } // namespace InputCommon
--- a/src/tests/video_core/buffer_base.cpp
+++ b/src/tests/video_core/buffer_base.cpp
@@ -471,3 +471,79 @@ TEST_CASE("BufferBase: Unaligned page region query") {
    REQUIRE(buffer.IsRegionCpuModified(c + 4000, 1000));
    REQUIRE(buffer.IsRegionCpuModified(c + 4000, 1));
 }
+
+TEST_CASE("BufferBase: Cached write") {
+    RasterizerInterface rasterizer;
+    BufferBase buffer(rasterizer, c, WORD);
+    buffer.UnmarkRegionAsCpuModified(c, WORD);
+    buffer.CachedCpuWrite(c + PAGE, PAGE);
+    REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
+    buffer.FlushCachedWrites();
+    REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE));
+    buffer.MarkRegionAsCpuModified(c, WORD);
+    REQUIRE(rasterizer.Count() == 0);
+}
+
+TEST_CASE("BufferBase: Multiple cached write") {
+    RasterizerInterface rasterizer;
+    BufferBase buffer(rasterizer, c, WORD);
+    buffer.UnmarkRegionAsCpuModified(c, WORD);
+    buffer.CachedCpuWrite(c + PAGE, PAGE);
+    buffer.CachedCpuWrite(c + PAGE * 3, PAGE);
+    REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
+    REQUIRE(!buffer.IsRegionCpuModified(c + PAGE * 3, PAGE));
+    buffer.FlushCachedWrites();
+    REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE));
+    REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 3, PAGE));
+    buffer.MarkRegionAsCpuModified(c, WORD);
+    REQUIRE(rasterizer.Count() == 0);
+}
+
+TEST_CASE("BufferBase: Cached write unmarked") {
+    RasterizerInterface rasterizer;
+    BufferBase buffer(rasterizer, c, WORD);
+    buffer.UnmarkRegionAsCpuModified(c, WORD);
+    buffer.CachedCpuWrite(c + PAGE, PAGE);
+    buffer.UnmarkRegionAsCpuModified(c + PAGE, PAGE);
+    REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
+    buffer.FlushCachedWrites();
+    REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE));
+    buffer.MarkRegionAsCpuModified(c, WORD);
+    REQUIRE(rasterizer.Count() == 0);
+}
+
+TEST_CASE("BufferBase: Cached write iterated") {
+    RasterizerInterface rasterizer;
+    BufferBase buffer(rasterizer, c, WORD);
+    buffer.UnmarkRegionAsCpuModified(c, WORD);
+    buffer.CachedCpuWrite(c + PAGE, PAGE);
+    int num = 0;
+    buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
+    REQUIRE(num == 0);
+    REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
+    buffer.FlushCachedWrites();
+    REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE));
+    buffer.MarkRegionAsCpuModified(c, WORD);
+    REQUIRE(rasterizer.Count() == 0);
+}
+
+TEST_CASE("BufferBase: Cached write downloads") {
+    RasterizerInterface rasterizer;
+    BufferBase buffer(rasterizer, c, WORD);
+    buffer.UnmarkRegionAsCpuModified(c, WORD);
+    REQUIRE(rasterizer.Count() == 64);
+    buffer.CachedCpuWrite(c + PAGE, PAGE);
+    REQUIRE(rasterizer.Count() == 63);
+    buffer.MarkRegionAsGpuModified(c + PAGE, PAGE);
+    int num = 0;
+    buffer.ForEachDownloadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
+    buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
+    REQUIRE(num == 0);
+    REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
+    REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE));
+    buffer.FlushCachedWrites();
+    REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE));
+    REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE));
+    buffer.MarkRegionAsCpuModified(c, WORD);
+    REQUIRE(rasterizer.Count() == 0);
+}
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -2,10 +2,8 @@ add_subdirectory(host_shaders)

 add_library(video_core STATIC
    buffer_cache/buffer_base.h
-    buffer_cache/buffer_block.h
+    buffer_cache/buffer_cache.cpp
    buffer_cache/buffer_cache.h
-    buffer_cache/map_interval.cpp
-    buffer_cache/map_interval.h
    cdma_pusher.cpp
    cdma_pusher.h
    command_classes/codecs/codec.cpp
@@ -154,8 +152,6 @@ add_library(video_core STATIC
    renderer_vulkan/vk_staging_buffer_pool.h
    renderer_vulkan/vk_state_tracker.cpp
    renderer_vulkan/vk_state_tracker.h
-    renderer_vulkan/vk_stream_buffer.cpp
-    renderer_vulkan/vk_stream_buffer.h
    renderer_vulkan/vk_swapchain.cpp
    renderer_vulkan/vk_swapchain.h
    renderer_vulkan/vk_texture_cache.cpp
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -19,6 +19,7 @@ namespace VideoCommon {

 enum class BufferFlagBits {
    Picked = 1 << 0,
+    CachedWrites = 1 << 1,
 };
 DECLARE_ENUM_FLAG_OPERATORS(BufferFlagBits)

@@ -40,7 +41,7 @@ class BufferBase {
    static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE;

    /// Vector tracking modified pages tightly packed with small vector optimization
-    union WrittenWords {
+    union WordsArray {
        /// Returns the pointer to the words state
        [[nodiscard]] const u64* Pointer(bool is_short) const noexcept {
            return is_short ? &stack : heap;
@@ -55,49 +56,59 @@ class BufferBase {
        u64* heap;     ///< Not-small buffers pointer to the storage
    };

-    struct GpuCpuWords {
-        explicit GpuCpuWords() = default;
-        explicit GpuCpuWords(u64 size_bytes_) : size_bytes{size_bytes_} {
+    struct Words {
+        explicit Words() = default;
+        explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} {
            if (IsShort()) {
                cpu.stack = ~u64{0};
                gpu.stack = 0;
+                cached_cpu.stack = 0;
+                untracked.stack = ~u64{0};
            } else {
                // Share allocation between CPU and GPU pages and set their default values
                const size_t num_words = NumWords();
-                u64* const alloc = new u64[num_words * 2];
+                u64* const alloc = new u64[num_words * 4];
                cpu.heap = alloc;
                gpu.heap = alloc + num_words;
+                cached_cpu.heap = alloc + num_words * 2;
+                untracked.heap = alloc + num_words * 3;
                std::fill_n(cpu.heap, num_words, ~u64{0});
                std::fill_n(gpu.heap, num_words, 0);
+                std::fill_n(cached_cpu.heap, num_words, 0);
+                std::fill_n(untracked.heap, num_words, ~u64{0});
            }
            // Clean up tailing bits
-            const u64 last_local_page =
-                Common::DivCeil(size_bytes % BYTES_PER_WORD, BYTES_PER_PAGE);
+            const u64 last_word_size = size_bytes % BYTES_PER_WORD;
+            const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE);
            const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD;
-            u64& last_word = cpu.Pointer(IsShort())[NumWords() - 1];
-            last_word = (last_word << shift) >> shift;
+            const u64 last_word = (~u64{0} << shift) >> shift;
+            cpu.Pointer(IsShort())[NumWords() - 1] = last_word;
+            untracked.Pointer(IsShort())[NumWords() - 1] = last_word;
        }

-        ~GpuCpuWords() {
+        ~Words() {
            Release();
        }

-        GpuCpuWords& operator=(GpuCpuWords&& rhs) noexcept {
+        Words& operator=(Words&& rhs) noexcept {
            Release();
            size_bytes = rhs.size_bytes;
            cpu = rhs.cpu;
            gpu = rhs.gpu;
+            cached_cpu = rhs.cached_cpu;
+            untracked = rhs.untracked;
            rhs.cpu.heap = nullptr;
            return *this;
        }

-        GpuCpuWords(GpuCpuWords&& rhs) noexcept
-            : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu} {
+        Words(Words&& rhs) noexcept
+            : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu},
+              cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} {
            rhs.cpu.heap = nullptr;
        }

-        GpuCpuWords& operator=(const GpuCpuWords&) = delete;
-        GpuCpuWords(const GpuCpuWords&) = delete;
+        Words& operator=(const Words&) = delete;
+        Words(const Words&) = delete;

        /// Returns true when the buffer fits in the small vector optimization
        [[nodiscard]] bool IsShort() const noexcept {
@@ -118,8 +129,17 @@ class BufferBase {
        }

        u64 size_bytes = 0;
-        WrittenWords cpu;
-        WrittenWords gpu;
+        WordsArray cpu;
+        WordsArray gpu;
+        WordsArray cached_cpu;
+        WordsArray untracked;
+    };
+
+    enum class Type {
+        CPU,
+        GPU,
+        CachedCPU,
+        Untracked,
    };

 public:
@@ -132,68 +152,93 @@ public:
    BufferBase& operator=(const BufferBase&) = delete;
    BufferBase(const BufferBase&) = delete;

+    BufferBase& operator=(BufferBase&&) = default;
+    BufferBase(BufferBase&&) = default;
+
    /// Returns the inclusive CPU modified range in a begin end pair
    [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr,
                                                        u64 query_size) const noexcept {
        const u64 offset = query_cpu_addr - cpu_addr;
-        return ModifiedRegion<false>(offset, query_size);
+        return ModifiedRegion<Type::CPU>(offset, query_size);
    }

    /// Returns the inclusive GPU modified range in a begin end pair
    [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr,
                                                        u64 query_size) const noexcept {
        const u64 offset = query_cpu_addr - cpu_addr;
-        return ModifiedRegion<true>(offset, query_size);
+        return ModifiedRegion<Type::GPU>(offset, query_size);
    }

    /// Returns true if a region has been modified from the CPU
    [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
        const u64 offset = query_cpu_addr - cpu_addr;
-        return IsRegionModified<false>(offset, query_size);
+        return IsRegionModified<Type::CPU>(offset, query_size);
    }

    /// Returns true if a region has been modified from the GPU
    [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
        const u64 offset = query_cpu_addr - cpu_addr;
-        return IsRegionModified<true>(offset, query_size);
+        return IsRegionModified<Type::GPU>(offset, query_size);
    }

    /// Mark region as CPU modified, notifying the rasterizer about this change
    void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
-        ChangeRegionState<true, true>(words.cpu, dirty_cpu_addr, size);
+        ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size);
    }

    /// Unmark region as CPU modified, notifying the rasterizer about this change
    void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
-        ChangeRegionState<false, true>(words.cpu, dirty_cpu_addr, size);
+        ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size);
    }

    /// Mark region as modified from the host GPU
    void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
-        ChangeRegionState<true, false>(words.gpu, dirty_cpu_addr, size);
+        ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size);
    }

    /// Unmark region as modified from the host GPU
    void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
-        ChangeRegionState<false, false>(words.gpu, dirty_cpu_addr, size);
+        ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size);
+    }
+
+    /// Mark region as written from the CPU,
+    /// but don't report it as CPU modified until FlushCachedWrites is called.
+    void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) {
+        flags |= BufferFlagBits::CachedWrites;
+        ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size);
+    }
+
+    /// Flushes cached CPU writes and notifies the rasterizer about the deltas
+    void FlushCachedWrites() noexcept {
+        flags &= ~BufferFlagBits::CachedWrites;
+        const u64 num_words = NumWords();
+        const u64* const cached_words = Array<Type::CachedCPU>();
+        u64* const untracked_words = Array<Type::Untracked>();
+        u64* const cpu_words = Array<Type::CPU>();
+        for (u64 word_index = 0; word_index < num_words; ++word_index) {
+            const u64 cached_bits = cached_words[word_index]; // Read only, not cleared here
+            NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits);
+            untracked_words[word_index] |= cached_bits;
+            cpu_words[word_index] |= cached_bits;
+        }
+    }

    /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
    template <typename Func>
    void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) {
-        ForEachModifiedRange<false, true>(query_cpu_range, size, func);
+        ForEachModifiedRange<Type::CPU>(query_cpu_range, size, func);
    }

    /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
    template <typename Func>
    void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) {
-        ForEachModifiedRange<true, false>(query_cpu_range, size, func);
+        ForEachModifiedRange<Type::GPU>(query_cpu_range, size, func);
    }

    /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
    template <typename Func>
    void ForEachDownloadRange(Func&& func) {
-        ForEachModifiedRange<true, false>(cpu_addr, SizeBytes(), func);
+        ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), func);
    }

    /// Mark buffer as picked
@@ -216,6 +261,11 @@ public:
        return True(flags & BufferFlagBits::Picked);
    }

+    /// Returns true when the buffer has pending cached writes
+    [[nodiscard]] bool HasCachedWrites() const noexcept {
+        return True(flags & BufferFlagBits::CachedWrites);
+    }
+
    /// Returns the base CPU address of the buffer
    [[nodiscard]] VAddr CpuAddr() const noexcept {
        return cpu_addr;
@@ -233,26 +283,48 @@ public:
    }

 private:
+    template <Type type>
+    u64* Array() noexcept {
+        if constexpr (type == Type::CPU) {
+            return words.cpu.Pointer(IsShort());
+        } else if constexpr (type == Type::GPU) {
+            return words.gpu.Pointer(IsShort());
+        } else if constexpr (type == Type::CachedCPU) {
+            return words.cached_cpu.Pointer(IsShort());
+        } else if constexpr (type == Type::Untracked) {
+            return words.untracked.Pointer(IsShort());
+        }
+    }
+
+    template <Type type>
+    const u64* Array() const noexcept {
+        if constexpr (type == Type::CPU) {
+            return words.cpu.Pointer(IsShort());
+        } else if constexpr (type == Type::GPU) {
+            return words.gpu.Pointer(IsShort());
+        } else if constexpr (type == Type::CachedCPU) {
+            return words.cached_cpu.Pointer(IsShort());
+        } else if constexpr (type == Type::Untracked) {
+            return words.untracked.Pointer(IsShort());
+        }
+    }
+
    /**
     * Change the state of a range of pages
     *
-     * @param written_words Pages to be marked or unmarked as modified
     * @param dirty_addr    Base address to mark or unmark as modified
     * @param size          Size in bytes to mark or unmark as modified
-     *
-     * @tparam enable            True when the bits will be set to one, false for zero
-     * @tparam notify_rasterizer True when the rasterizer has to be notified about the changes
     */
-    template <bool enable, bool notify_rasterizer>
-    void ChangeRegionState(WrittenWords& written_words, u64 dirty_addr,
-                           s64 size) noexcept(!notify_rasterizer) {
+    template <Type type, bool enable>
+    void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) {
        const s64 difference = dirty_addr - cpu_addr;
        const u64 offset = std::max<s64>(difference, 0);
        size += std::min<s64>(difference, 0);
        if (offset >= SizeBytes() || size < 0) {
            return;
        }
-        u64* const state_words = written_words.Pointer(IsShort());
+        u64* const untracked_words = Array<Type::Untracked>();
+        u64* const state_words = Array<type>();
        const u64 offset_end = std::min(offset + size, SizeBytes());
        const u64 begin_page_index = offset / BYTES_PER_PAGE;
        const u64 begin_word_index = begin_page_index / PAGES_PER_WORD;
@@ -268,13 +340,19 @@ private:
            u64 bits = ~u64{0};
            bits = (bits >> right_offset) << right_offset;
            bits = (bits << left_offset) >> left_offset;
-            if constexpr (notify_rasterizer) {
-                NotifyRasterizer<!enable>(word_index, state_words[word_index], bits);
+            if constexpr (type == Type::CPU || type == Type::CachedCPU) {
+                NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits);
            }
            if constexpr (enable) {
                state_words[word_index] |= bits;
+                if constexpr (type == Type::CPU || type == Type::CachedCPU) {
+                    untracked_words[word_index] |= bits;
+                }
            } else {
                state_words[word_index] &= ~bits;
+                if constexpr (type == Type::CPU || type == Type::CachedCPU) {
+                    untracked_words[word_index] &= ~bits;
+                }
            }
            page_index = 0;
            ++word_index;
@@ -291,7 +369,7 @@ private:
     * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages
     */
    template <bool add_to_rasterizer>
-    void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) {
+    void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
        u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits;
        VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
        while (changed_bits != 0) {
@@ -315,21 +393,20 @@ private:
     * @param query_cpu_range Base CPU address to loop over
     * @param size            Size in bytes of the CPU range to loop over
     * @param func            Function to call for each turned off region
-     *
-     * @tparam gpu               True for host GPU pages, false for CPU pages
-     * @tparam notify_rasterizer True when the rasterizer should be notified about state changes
     */
-    template <bool gpu, bool notify_rasterizer, typename Func>
+    template <Type type, typename Func>
    void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) {
+        static_assert(type != Type::Untracked);
+
        const s64 difference = query_cpu_range - cpu_addr;
        const u64 query_begin = std::max<s64>(difference, 0);
        size += std::min<s64>(difference, 0);
        if (query_begin >= SizeBytes() || size < 0) {
            return;
        }
-        const u64* const cpu_words = words.cpu.Pointer(IsShort());
+        u64* const untracked_words = Array<Type::Untracked>();
+        u64* const state_words = Array<type>();
        const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
-        u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort());
        u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
        u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD);

@@ -345,7 +422,8 @@ private:
        const u64 word_index_end = std::distance(state_words, last_modified_word);

        const unsigned local_page_begin = std::countr_zero(*first_modified_word);
-        const unsigned local_page_end = PAGES_PER_WORD - std::countl_zero(last_modified_word[-1]);
+        const unsigned local_page_end =
+            static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]);
        const u64 word_page_begin = word_index_begin * PAGES_PER_WORD;
        const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD;
        const u64 query_page_begin = query_begin / BYTES_PER_PAGE;
@@ -371,11 +449,13 @@ private:
            const u64 current_word = state_words[word_index] & bits;
            state_words[word_index] &= ~bits;

-            // Exclude CPU modified pages when visiting GPU pages
-            const u64 word = current_word & ~(gpu ? cpu_words[word_index] : 0);
-            if constexpr (notify_rasterizer) {
-                NotifyRasterizer<true>(word_index, word, ~u64{0});
+            if constexpr (type == Type::CPU) {
+                const u64 current_bits = untracked_words[word_index] & bits;
+                untracked_words[word_index] &= ~bits;
+                NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
            }
+            // Exclude CPU modified pages when visiting GPU pages
+            const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
            u64 page = page_begin;
            page_begin = 0;

@@ -416,17 +496,20 @@ private:
     * @param offset Offset in bytes from the start of the buffer
     * @param size   Size in bytes of the region to query for modifications
     */
-    template <bool gpu>
+    template <Type type>
    [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
-        const u64* const cpu_words = words.cpu.Pointer(IsShort());
-        const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort());
+        static_assert(type != Type::Untracked);
+
+        const u64* const untracked_words = Array<Type::Untracked>();
+        const u64* const state_words = Array<type>();
        const u64 num_query_words = size / BYTES_PER_WORD + 1;
        const u64 word_begin = offset / BYTES_PER_WORD;
        const u64 word_end = std::min(word_begin + num_query_words, NumWords());
        const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
        u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
        for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
-            const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0);
+            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
+            const u64 word = state_words[word_index] & ~off_word;
            if (word == 0) {
                continue;
            }
@@ -445,13 +528,13 @@ private:
     *
     * @param offset Offset in bytes from the start of the buffer
     * @param size   Size in bytes of the region to query for modifications
-     *
-     * @tparam True to query GPU modified pages, false for CPU pages
     */
-    template <bool gpu>
+    template <Type type>
    [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
-        const u64* const cpu_words = words.cpu.Pointer(IsShort());
-        const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort());
+        static_assert(type != Type::Untracked);
+
+        const u64* const untracked_words = Array<Type::Untracked>();
+        const u64* const state_words = Array<type>();
        const u64 num_query_words = size / BYTES_PER_WORD + 1;
        const u64 word_begin = offset / BYTES_PER_WORD;
        const u64 word_end = std::min(word_begin + num_query_words, NumWords());
@@ -460,7 +543,8 @@ private:
        u64 begin = std::numeric_limits<u64>::max();
        u64 end = 0;
        for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
-            const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0);
+            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
+            const u64 word = state_words[word_index] & ~off_word;
            if (word == 0) {
                continue;
            }
@@ -488,7 +572,7 @@ private:

    RasterizerInterface* rasterizer = nullptr;
    VAddr cpu_addr = 0;
-    GpuCpuWords words;
+    Words words;
    BufferFlagBits flags{};
 };

--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -0,0 +1,13 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/microprofile.h"
+
+namespace VideoCommon {
+
+MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128));
+
+} // namespace VideoCommon
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@@ -110,12 +110,10 @@ void Vic::Execute() {
                                           converted_frame_buffer.get(), block_height, 0, 0);

            gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
-            gpu.Maxwell3D().OnMemoryWrite();
        } else {
            // send pitch linear frame
            gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
                                           linear_size);
-            gpu.Maxwell3D().OnMemoryWrite();
        }
        break;
    }
@@ -163,7 +161,6 @@ void Vic::Execute() {
        }
        gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
                                       chroma_buffer.size());
-        gpu.Maxwell3D().OnMemoryWrite();
        break;
    }
    default:
--- a/src/video_core/dirty_flags.cpp
+++ b/src/video_core/dirty_flags.cpp
@@ -12,13 +12,30 @@
 #define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32)))

 namespace VideoCommon::Dirty {
-
+namespace {
 using Tegra::Engines::Maxwell3D;

-void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) {
+void SetupDirtyVertexBuffers(Maxwell3D::DirtyState::Tables& tables) {
+    static constexpr std::size_t num_array = 3; // leading words of each vertex_array entry to tag
+    static constexpr std::size_t num_limit = NUM(vertex_array_limit[0]); // one entry, not all
+    for (std::size_t i = 0; i < Maxwell3D::Regs::NumVertexArrays; ++i) {
+        const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]);
+        const std::size_t limit_offset = OFF(vertex_array_limit) + i * num_limit;
+        FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers);
+        FillBlock(tables, limit_offset, num_limit, VertexBuffer0 + i, VertexBuffers);
+    }
+}
+
+void SetupIndexBuffer(Maxwell3D::DirtyState::Tables& tables) {
+    FillBlock(tables[0], OFF(index_array), NUM(index_array), IndexBuffer);
+}
+
+void SetupDirtyDescriptors(Maxwell3D::DirtyState::Tables& tables) {
    FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors);
    FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors);
+}

+void SetupDirtyRenderTargets(Maxwell3D::DirtyState::Tables& tables) {
    static constexpr std::size_t num_per_rt = NUM(rt[0]);
    static constexpr std::size_t begin = OFF(rt);
    static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets;
@@ -41,5 +58,13 @@ void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tabl
        FillBlock(table, OFF(zeta), NUM(zeta), flag);
    }
 }
+} // Anonymous namespace
+
+void SetupDirtyFlags(Maxwell3D::DirtyState::Tables& tables) {
+    SetupDirtyVertexBuffers(tables);
+    SetupIndexBuffer(tables);
+    SetupDirtyDescriptors(tables);
+    SetupDirtyRenderTargets(tables);
+}

 } // namespace VideoCommon::Dirty
--- a/src/video_core/dirty_flags.h
+++ b/src/video_core/dirty_flags.h
@@ -30,6 +30,12 @@ enum : u8 {
    ColorBuffer7,
    ZetaBuffer,

+    VertexBuffers,
+    VertexBuffer0,
+    VertexBuffer31 = VertexBuffer0 + 31,
+
+    IndexBuffer,
+
    LastCommonEntry,
 };

@@ -47,6 +53,6 @@ void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_
    FillBlock(tables[1], begin, num, index_b);
 }

-void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables);
+void SetupDirtyFlags(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables);

 } // namespace VideoCommon::Dirty
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -23,8 +23,6 @@ void DmaPusher::DispatchCalls() {
    MICROPROFILE_SCOPE(DispatchCalls);

    gpu.SyncGuestHost();
-    // On entering GPU code, assume all memory may be touched by the ARM core.
-    gpu.Maxwell3D().OnMemoryWrite();

    dma_pushbuffer_subindex = 0;

--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -18,8 +18,8 @@ Fermi2D::Fermi2D() {

 Fermi2D::~Fermi2D() = default;

-void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
-    rasterizer = &rasterizer_;
+void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
+    rasterizer = rasterizer_;
 }

 void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -38,7 +38,7 @@ public:
    ~Fermi2D();

    /// Binds a rasterizer to this engine.
-    void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
+    void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);

    /// Write the value to the register identified by method.
    void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -21,8 +21,8 @@ KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manage

 KeplerCompute::~KeplerCompute() = default;

-void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
-    rasterizer = &rasterizer_;
+void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
+    rasterizer = rasterizer_;
 }

 void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
@@ -39,7 +39,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
    case KEPLER_COMPUTE_REG_INDEX(data_upload): {
        upload_state.ProcessData(method_argument, is_last_call);
        if (is_last_call) {
-            system.GPU().Maxwell3D().OnMemoryWrite();
        }
        break;
    }
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -46,7 +46,7 @@ public:
    ~KeplerCompute();

    /// Binds a rasterizer to this engine.
-    void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
+    void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);

    static constexpr std::size_t NumConstBuffers = 8;

--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -33,7 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
    case KEPLERMEMORY_REG_INDEX(data): {
        upload_state.ProcessData(method_argument, is_last_call);
        if (is_last_call) {
-            system.GPU().Maxwell3D().OnMemoryWrite();
        }
        break;
    }
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -30,8 +30,8 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)

 Maxwell3D::~Maxwell3D() = default;

-void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
-    rasterizer = &rasterizer_;
+void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
+    rasterizer = rasterizer_;
 }

 void Maxwell3D::InitializeRegisterDefaults() {
@@ -223,7 +223,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
    case MAXWELL3D_REG_INDEX(data_upload):
        upload_state.ProcessData(argument, is_last_call);
        if (is_last_call) {
-            OnMemoryWrite();
        }
        return;
    case MAXWELL3D_REG_INDEX(fragment_barrier):
@@ -570,17 +569,18 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
    }
 }

-void Maxwell3D::ProcessCBBind(std::size_t stage_index) {
+void Maxwell3D::ProcessCBBind(size_t stage_index) {
    // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
-    auto& shader = state.shader_stages[stage_index];
-    auto& bind_data = regs.cb_bind[stage_index];
-
-    ASSERT(bind_data.index < Regs::MaxConstBuffers);
-    auto& buffer = shader.const_buffers[bind_data.index];
-
+    const auto& bind_data = regs.cb_bind[stage_index];
+    auto& buffer = state.shader_stages[stage_index].const_buffers[bind_data.index];
    buffer.enabled = bind_data.valid.Value() != 0;
    buffer.address = regs.const_buffer.BufferAddress();
    buffer.size = regs.const_buffer.cb_size;
+
+    const bool is_enabled = bind_data.valid.Value() != 0;
+    const GPUVAddr gpu_addr = is_enabled ? regs.const_buffer.BufferAddress() : 0;
+    const u32 size = is_enabled ? regs.const_buffer.cb_size : 0;
+    rasterizer->BindGraphicsUniformBuffer(stage_index, bind_data.index, gpu_addr, size);
 }

 void Maxwell3D::ProcessCBData(u32 value) {
@@ -635,7 +635,6 @@ void Maxwell3D::FinishCBData() {

    const u32 id = cb_data_state.id;
    memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size);
-    OnMemoryWrite();

    cb_data_state.id = null_cb_data;
    cb_data_state.current = null_cb_data;
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -55,7 +55,7 @@ public:
    ~Maxwell3D();

    /// Binds a rasterizer to this engine.
-    void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
+    void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);

    /// Register structure of the Maxwell3D engine.
    /// TODO(Subv): This structure will need to be made bigger as more registers are discovered.
@@ -1314,8 +1314,7 @@ public:

                    GPUVAddr LimitAddress() const {
                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_high) << 32) |
-                                                     limit_low) +
-                               1;
+                                                     limit_low);
                    }
                } vertex_array_limit[NumVertexArrays];

@@ -1403,6 +1402,7 @@ public:
        };

        std::array<ShaderStageInfo, Regs::MaxShaderStage> shader_stages;
+
        u32 current_instance = 0; ///< Current instance to be used to simulate instanced rendering.
    };

@@ -1452,11 +1452,6 @@ public:
        return *rasterizer;
    }

-    /// Notify a memory write has happened.
-    void OnMemoryWrite() {
-        dirty.flags |= dirty.on_write_stores;
-    }
-
    enum class MMEDrawMode : u32 {
        Undefined,
        Array,
@@ -1478,7 +1473,6 @@ public:
        using Tables = std::array<Table, 2>;

        Flags flags;
-        Flags on_write_stores;
        Tables tables{};
    } dirty;

@@ -1541,7 +1535,7 @@ private:
    void FinishCBData();

    /// Handles a write to the CB_BIND register.
-    void ProcessCBBind(std::size_t stage_index);
+    void ProcessCBBind(size_t stage_index);

    /// Handles a write to the VERTEX_END_GL register, triggering a draw.
    void DrawArrays();
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -60,9 +60,6 @@ void MaxwellDMA::Launch() {
        return;
    }

-    // All copies here update the main memory, so mark all rasterizer states as invalid.
-    system.GPU().Maxwell3D().OnMemoryWrite();
-
    if (is_src_pitch && is_dst_pitch) {
        CopyPitchToPitch();
    } else {
--- a/src/video_core/fence_manager.h
+++ b/src/video_core/fence_manager.h
@@ -143,22 +143,26 @@ private:
    }

    bool ShouldWait() const {
+        std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
        return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() ||
               query_cache.ShouldWaitAsyncFlushes();
    }

    bool ShouldFlush() const {
+        std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
        return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() ||
               query_cache.HasUncommittedFlushes();
    }

    void PopAsyncFlushes() {
+        std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
        texture_cache.PopAsyncFlushes();
        buffer_cache.PopAsyncFlushes();
        query_cache.PopAsyncFlushes();
    }

    void CommitAsyncFlushes() {
+        std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
        texture_cache.CommitAsyncFlushes();
        buffer_cache.CommitAsyncFlushes();
        query_cache.CommitAsyncFlushes();
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -44,8 +44,8 @@ GPU::~GPU() = default;

 void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) {
    renderer = std::move(renderer_);
+    rasterizer = renderer->ReadRasterizer();

-    VideoCore::RasterizerInterface& rasterizer = renderer->Rasterizer();
    memory_manager->BindRasterizer(rasterizer);
    maxwell_3d->BindRasterizer(rasterizer);
    fermi_2d->BindRasterizer(rasterizer);
@@ -171,7 +171,7 @@ void GPU::TickWork() {
        const std::size_t size = request.size;
        flush_requests.pop_front();
        flush_request_mutex.unlock();
-        renderer->Rasterizer().FlushRegion(addr, size);
+        rasterizer->FlushRegion(addr, size);
        current_flush_fence.store(fence);
        flush_request_mutex.lock();
    }
@@ -193,11 +193,11 @@ u64 GPU::GetTicks() const {
 }

 void GPU::FlushCommands() {
-    renderer->Rasterizer().FlushCommands();
+    rasterizer->FlushCommands();
 }

 void GPU::SyncGuestHost() {
-    renderer->Rasterizer().SyncGuestHost();
+    rasterizer->SyncGuestHost();
 }

 enum class GpuSemaphoreOperation {
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -366,6 +366,7 @@ protected:
    std::unique_ptr<Tegra::DmaPusher> dma_pusher;
    std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
    std::unique_ptr<VideoCore::RendererBase> renderer;
+    VideoCore::RasterizerInterface* rasterizer = nullptr;
    const bool use_nvdec;

 private:
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -38,6 +38,7 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
    }

    auto current_context = context.Acquire();
+    VideoCore::RasterizerInterface* const rasterizer = renderer.ReadRasterizer();

    CommandDataContainer next;
    while (state.is_running) {
@@ -52,13 +53,13 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
        } else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) {
            renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
        } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {
-            renderer.Rasterizer().ReleaseFences();
+            rasterizer->ReleaseFences();
        } else if (std::holds_alternative<GPUTickCommand>(next.data)) {
            system.GPU().TickWork();
        } else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) {
-            renderer.Rasterizer().FlushRegion(flush->addr, flush->size);
+            rasterizer->FlushRegion(flush->addr, flush->size);
        } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
-            renderer.Rasterizer().OnCPUWrite(invalidate->addr, invalidate->size);
+            rasterizer->OnCPUWrite(invalidate->addr, invalidate->size);
        } else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
            return;
        } else {
@@ -84,6 +85,7 @@ ThreadManager::~ThreadManager() {
 void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
                                Core::Frontend::GraphicsContext& context,
                                Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) {
+    rasterizer = renderer.ReadRasterizer();
    thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
                         std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher));
 }
@@ -129,12 +131,12 @@ void ThreadManager::FlushRegion(VAddr addr, u64 size) {
 }

 void ThreadManager::InvalidateRegion(VAddr addr, u64 size) {
-    system.Renderer().Rasterizer().OnCPUWrite(addr, size);
+    rasterizer->OnCPUWrite(addr, size);
 }

 void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
    // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important
-    system.Renderer().Rasterizer().OnCPUWrite(addr, size);
+    rasterizer->OnCPUWrite(addr, size);
 }

 void ThreadManager::WaitIdle() const {
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -27,6 +27,7 @@ class System;
 } // namespace Core

 namespace VideoCore {
+class RasterizerInterface;
 class RendererBase;
 } // namespace VideoCore

@@ -151,11 +152,12 @@ private:
    /// Pushes a command to be executed by the GPU thread
    u64 PushCommand(CommandData&& command_data);

-    SynchState state;
    Core::System& system;
-    std::thread thread;
-    std::thread::id thread_id;
    const bool is_async;
+    VideoCore::RasterizerInterface* rasterizer = nullptr;
+
+    SynchState state;
+    std::thread thread;
 };

 } // namespace VideoCommon::GPUThread
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -12,7 +12,6 @@ set(SHADER_FILES
    vulkan_blit_depth_stencil.frag
    vulkan_present.frag
    vulkan_present.vert
-    vulkan_quad_array.comp
    vulkan_quad_indexed.comp
    vulkan_uint8.comp
 )
--- a/src/video_core/host_shaders/vulkan_uint8.comp
+++ b/src/video_core/host_shaders/vulkan_uint8.comp
@@ -16,9 +16,16 @@ layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer {
    uint16_t output_indexes[];
 };

+uint AssembleIndex(uint id) {
+    // Most primitive restart indices are 0xFF
+    // Hardcode this to 0xFF for now
+    uint index = uint(input_indexes[id]);
+    return index == 0xFF ? 0xFFFF : index;
+}
+
 void main() {
    uint id = gl_GlobalInvocationID.x;
    if (id < input_indexes.length()) {
-        output_indexes[id] = uint16_t(input_indexes[id]);
+        output_indexes[id] = uint16_t(AssembleIndex(id));
    }
 }
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -20,8 +20,8 @@ MemoryManager::MemoryManager(Core::System& system_)

 MemoryManager::~MemoryManager() = default;

-void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
-    rasterizer = &rasterizer_;
+void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
+    rasterizer = rasterizer_;
 }

 GPUVAddr MemoryManager::UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) {
@@ -101,6 +101,25 @@ void MemoryManager::TryUnlockPage(PageEntry page_entry, std::size_t size) {
               .IsSuccess());
 }

+void MemoryManager::UnmapVicFrame(GPUVAddr gpu_addr, std::size_t size) {
+    if (size == 0) {
+        return;
+    }
+    // The texture cache is not invalidated here; the range is queued for InvalidateQueuedCaches.
+    const std::optional<VAddr> cpu_addr = GpuToCpuAddress(gpu_addr);
+    ASSERT(cpu_addr);
+    const VAddr frame_addr = *cpu_addr;
+    rasterizer->InvalidateExceptTextureCache(frame_addr, size);
+    cache_invalidate_queue.emplace_back(frame_addr, size);
+    UpdateRange(gpu_addr, PageEntry::State::Unmapped, size);
+}
+
+void MemoryManager::InvalidateQueuedCaches() {
+    for (const auto& [cpu_addr, size] : cache_invalidate_queue) {
+        rasterizer->InvalidateTextureCache(cpu_addr, size); // deferred by UnmapVicFrame
+    }
+    cache_invalidate_queue.clear();
+}
 PageEntry MemoryManager::GetPageEntry(GPUVAddr gpu_addr) const {
    return page_table[PageEntryIndex(gpu_addr)];
 }
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -72,7 +72,7 @@ public:
    ~MemoryManager();

    /// Binds a renderer to the memory manager.
-    void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
+    void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);

    [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const;

@@ -121,6 +121,14 @@ public:
    [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align);
    void Unmap(GPUVAddr gpu_addr, std::size_t size);

+    /**
+     * Some Decoded NVDEC frames require that texture cache does not get invalidated.
+     * UnmapVicFrame defers the texture cache invalidation until the stream ends
+     * by invoking InvalidateQueuedCaches to invalidate all frame texture caches.
+     */
+    void UnmapVicFrame(GPUVAddr gpu_addr, std::size_t size);
+    void InvalidateQueuedCaches();
+
 private:
    [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const;
    void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size);
@@ -150,6 +158,7 @@ private:
    VideoCore::RasterizerInterface* rasterizer = nullptr;

    std::vector<PageEntry> page_table;
+    std::vector<std::pair<VAddr, std::size_t>> cache_invalidate_queue;
 };

 } // namespace Tegra
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -7,6 +7,7 @@
 #include <atomic>
 #include <functional>
 #include <optional>
+#include <span>
 #include "common/common_types.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/gpu.h"
@@ -49,6 +50,10 @@ public:
    /// Records a GPU query and caches it
    virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0;

+    /// Signal a uniform buffer binding
+    virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
+                                           u32 size) = 0;
+
    /// Signal a GPU based semaphore as a fence
    virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0;

@@ -64,6 +69,12 @@ public:
    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
    virtual void FlushRegion(VAddr addr, u64 size) = 0;

+    /// Notify rasterizer to invalidate every cache except the texture cache for the region
+    virtual void InvalidateExceptTextureCache(VAddr addr, u64 size) = 0;
+
+    /// Notify rasterizer to invalidate the texture cache
+    virtual void InvalidateTextureCache(VAddr addr, u64 size) = 0;
+
    /// Check if the the specified memory area requires flushing to CPU Memory.
    virtual bool MustFlushRegion(VAddr addr, u64 size) = 0;

--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -37,15 +37,11 @@ public:
                          std::unique_ptr<Core::Frontend::GraphicsContext> context);
    virtual ~RendererBase();

-    /// Initialize the renderer
-    [[nodiscard]] virtual bool Init() = 0;
-
-    /// Shutdown the renderer
-    virtual void ShutDown() = 0;
-
    /// Finalize rendering the guest frame and draw into the presentation texture
    virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;

+    [[nodiscard]] virtual RasterizerInterface* ReadRasterizer() = 0;
+
    // Getter/setter functions:
    // ------------------------

@@ -57,14 +53,6 @@ public:
        return m_current_frame;
    }

-    [[nodiscard]] RasterizerInterface& Rasterizer() {
-        return *rasterizer;
-    }
-
-    [[nodiscard]] const RasterizerInterface& Rasterizer() const {
-        return *rasterizer;
-    }
-
    [[nodiscard]] Core::Frontend::GraphicsContext& Context() {
        return *context;
    }
@@ -98,7 +86,6 @@ public:

 protected:
    Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle.
-    std::unique_ptr<RasterizerInterface> rasterizer;
    std::unique_ptr<Core::Frontend::GraphicsContext> context;
    f32 m_current_fps = 0.0f; ///< Current framerate, should be set by the renderer
    int m_current_frame = 0;  ///< Current frame, should be set by the renderer
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -2,98 +2,235 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <memory>
+#include <span>

-#include <glad/glad.h>
-
-#include "common/assert.h"
-#include "common/microprofile.h"
 #include "video_core/buffer_cache/buffer_cache.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
-#include "video_core/renderer_opengl/gl_rasterizer.h"
-#include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_instance.h"
+#include "video_core/vulkan_common/vulkan_library.h"
+#include "video_core/vulkan_common/vulkan_memory_allocator.h"

 namespace OpenGL {
+namespace {
+// Bindless storage buffer descriptor for the assembly-shader path.
+// Instances are reinterpreted as four GLuint words and handed to
+// glProgramLocalParametersI4uivNV, which is why the size is pinned by the static_assert below.
+struct BindlessSSBO {
+    GLuint64EXT address; ///< Host GPU address of the bound range (buffer address plus offset)
+    GLsizei length;      ///< Size of the bound range, taken from the bind call's `size`
+    GLsizei padding;     ///< Always written as zero; fills the fourth word
+};
+static_assert(sizeof(BindlessSSBO) == sizeof(GLuint) * 4);

-using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+// NV assembly program targets for the five graphics stages.
+// Indexed by the `stage` argument of BufferCacheRuntime::BindStorageBuffer, so the order
+// (vertex, tess control, tess evaluation, geometry, fragment) must match that stage index.
+constexpr std::array PROGRAM_LUT{
+    GL_VERTEX_PROGRAM_NV,   GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
+    GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
+};
+} // Anonymous namespace

-MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
+// Null-buffer constructor: forwards the tag to the common base and creates no OpenGL object,
+// so `buffer.handle` and `address` keep their default values. The runtime is unused.
+Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params)
+    : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {}

-Buffer::Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_)
-    : BufferBlock{cpu_addr_, size_} {
-    gl_buffer.Create();
-    glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size_), nullptr, GL_DYNAMIC_DRAW);
-    if (device_.UseAssemblyShaders() || device_.HasVertexBufferUnifiedMemory()) {
-        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
-        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
+               VAddr cpu_addr_, u64 size_bytes_)
+    : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_) {
+    buffer.Create();
+    const std::string name = fmt::format("Buffer 0x{:x}", CpuAddr());
+    glObjectLabel(GL_BUFFER, buffer.handle, static_cast<GLsizei>(name.size()), name.data());
+    if (runtime.device.UseAssemblyShaders()) {
+        CreateMemoryObjects(runtime);
+        glNamedBufferStorageMemEXT(buffer.handle, SizeBytes(), memory_commit.ExportOpenGLHandle(),
+                                   memory_commit.Offset());
+    } else {
+        glNamedBufferData(buffer.handle, SizeBytes(), nullptr, GL_DYNAMIC_DRAW);
+    }
+    if (runtime.has_unified_vertex_buffers) {
+        glGetNamedBufferParameterui64vNV(buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &address);
    }
 }

-Buffer::~Buffer() = default;
-
-void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) {
-    glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset),
-                         static_cast<GLsizeiptr>(data_size), data);
+void Buffer::ImmediateUpload(size_t offset, std::span<const u8> data) noexcept {
+    glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+                         static_cast<GLsizeiptr>(data.size_bytes()), data.data());
 }

-void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) {
-    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
-    const GLsizeiptr gl_size = static_cast<GLsizeiptr>(data_size);
-    const GLintptr gl_offset = static_cast<GLintptr>(offset);
-    if (read_buffer.handle == 0) {
-        read_buffer.Create();
-        glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr,
-                          GL_STREAM_READ);
-    }
-    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
-    glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size);
-    glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data);
+void Buffer::ImmediateDownload(size_t offset, std::span<u8> data) noexcept {
+    glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+                            static_cast<GLsizeiptr>(data.size_bytes()), data.data());
 }

-void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
-                      std::size_t copy_size) {
-    glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
-                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(copy_size));
-}
-
-OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer_,
-                               Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
-                               const Device& device_, OGLStreamBuffer& stream_buffer_,
-                               StateTracker& state_tracker)
-    : GenericBufferCache{rasterizer_, gpu_memory_, cpu_memory_, stream_buffer_}, device{device_} {
-    if (!device.HasFastBufferSubData()) {
+void Buffer::MakeResident(GLenum access) noexcept {
+    // Abuse GLenum's order to exit early
+    // GL_NONE (default) < GL_READ_ONLY < GL_READ_WRITE
+    if (access <= current_residency_access || buffer.handle == 0) {
        return;
    }
+    if (std::exchange(current_residency_access, access) != GL_NONE) {
+        // If the buffer is already resident, remove its residency before promoting it
+        glMakeNamedBufferNonResidentNV(buffer.handle);
+    }
+    glMakeNamedBufferResidentNV(buffer.handle, access);
+}

-    static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
-    glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
-    for (const GLuint cbuf : cbufs) {
-        glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
+/// Returns a GL buffer name whose storage begins `offset` into this buffer's memory commit.
+/// Offset zero maps to the main buffer. Any other offset gets an aliasing buffer that is
+/// created on first request over the same exported memory object and cached in `subs`.
+GLuint Buffer::SubBuffer(u32 offset) {
+    if (offset == 0) {
+        return buffer.handle;
+    }
+    // Reuse the alias created by an earlier request for the same offset, if there is one
+    for (const std::pair<OGLBuffer, u32>& entry : subs) {
+        if (entry.second == offset) {
+            return entry.first.handle;
+        }
+    }
+    // No alias yet: back a new buffer with the tail of the commit that starts at `offset`
+    OGLBuffer alias;
+    alias.Create();
+    glNamedBufferStorageMemEXT(alias.handle, SizeBytes() - offset,
+                               memory_commit.ExportOpenGLHandle(), memory_commit.Offset() + offset);
+    subs.emplace_back(std::move(alias), offset);
+    return subs.back().first.handle;
+}
+
+// Commits device-local Vulkan memory for this buffer and stores it in `memory_commit`.
+// The constructor calls this only on the assembly-shader path and then imports the commit
+// into OpenGL through memory_commit.ExportOpenGLHandle().
+// Both runtime.vulkan_device and runtime.vulkan_memory_allocator are dereferenced without
+// a null check, so they must be valid whenever assembly shaders are enabled.
+void Buffer::CreateMemoryObjects(BufferCacheRuntime& runtime) {
+    auto& allocator = runtime.vulkan_memory_allocator;
+    auto& device = runtime.vulkan_device->GetLogical();
+    // This Vulkan buffer is a local used to query memory requirements for SizeBytes() bytes
+    // with every usage the cache may need; it is not stored anywhere by this function.
+    auto vulkan_buffer = device.CreateBuffer(VkBufferCreateInfo{
+        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .size = SizeBytes(),
+        .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
+                 VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT |
+                 VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
+                 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
+                 VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 0,
+        .pQueueFamilyIndices = nullptr,
+    });
+    const VkMemoryRequirements requirements = device.GetBufferMemoryRequirements(*vulkan_buffer);
+    // NOTE(review): the commit is never bound to `vulkan_buffer` here; presumably the wrapper
+    // releases the Vulkan buffer at scope exit and only the memory is kept - confirm.
+    memory_commit = allocator->Commit(requirements, Vulkan::MemoryUsage::DeviceLocal);
+}
+
+BufferCacheRuntime::BufferCacheRuntime(const Device& device_, const Vulkan::Device* vulkan_device_,
+                                       Vulkan::MemoryAllocator* vulkan_memory_allocator_)
+    : device{device_}, vulkan_device{vulkan_device_},
+      vulkan_memory_allocator{vulkan_memory_allocator_},
+      stream_buffer{device.HasFastBufferSubData() ? std::nullopt
+                                                  : std::make_optional<StreamBuffer>()} {
+    GLint gl_max_attributes;
+    glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &gl_max_attributes);
+    max_attributes = static_cast<u32>(gl_max_attributes);
+    use_assembly_shaders = device.UseAssemblyShaders();
+    has_unified_vertex_buffers = device.HasVertexBufferUnifiedMemory();
+
+    for (auto& stage_uniforms : fast_uniforms) {
+        for (OGLBuffer& buffer : stage_uniforms) {
+            buffer.Create();
+            glNamedBufferData(buffer.handle, BufferCache::SKIP_CACHE_SIZE, nullptr, GL_STREAM_DRAW);
+        }
    }
 }

-OGLBufferCache::~OGLBufferCache() {
-    glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
+void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
+                                    std::span<const VideoCommon::BufferCopy> copies) {
+    for (const VideoCommon::BufferCopy& copy : copies) {
+        glCopyNamedBufferSubData(
+            src_buffer.Handle(), dst_buffer.Handle(), static_cast<GLintptr>(copy.src_offset),
+            static_cast<GLintptr>(copy.dst_offset), static_cast<GLsizeiptr>(copy.size));
+    }
 }

-std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<Buffer>(device, cpu_addr, size);
+void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) {
+    if (has_unified_vertex_buffers) {
+        buffer.MakeResident(GL_READ_ONLY);
+        glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, buffer.HostGpuAddr() + offset,
+                               static_cast<GLsizeiptr>(size));
+    } else {
+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer.Handle());
+        index_buffer_offset = offset;
+    }
 }

-OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
-    return {0, 0, 0};
+void BufferCacheRuntime::BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size,
+                                          u32 stride) {
+    if (index >= max_attributes) {
+        return;
+    }
+    if (has_unified_vertex_buffers) {
+        buffer.MakeResident(GL_READ_ONLY);
+        glBindVertexBuffer(index, 0, 0, static_cast<GLsizei>(stride));
+        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, index,
+                               buffer.HostGpuAddr() + offset, static_cast<GLsizeiptr>(size));
+    } else {
+        glBindVertexBuffer(index, buffer.Handle(), static_cast<GLintptr>(offset),
+                           static_cast<GLsizei>(stride));
+    }
 }

-OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
-                                                             std::size_t size) {
-    DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
-    const GLuint cbuf = cbufs[cbuf_cursor++];
+/// Binds the range [offset, offset + size) of `buffer` as uniform buffer `binding_index` of
+/// graphics stage `stage`.
+void BufferCacheRuntime::BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer,
+                                           u32 offset, u32 size) {
+    const auto gl_size = static_cast<GLsizeiptr>(size);
+    if (!use_assembly_shaders) {
+        // GLSL path: stage-relative binding index, range selected on the main buffer
+        const GLuint binding = device.GetBaseBindings(stage).uniform_buffer + binding_index;
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer.Handle(),
+                          static_cast<GLintptr>(offset), gl_size);
+        return;
+    }
+    // Assembly path: the offset is expressed by binding an alias buffer from offset zero
+    const GLuint alias = buffer.SubBuffer(offset);
+    glBindBufferRangeNV(PABO_LUT[stage], binding_index, alias, 0, gl_size);
+}

-    glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
-    return {cbuf, 0, 0};
+/// Binds the range [offset, offset + size) of `buffer` as compute uniform buffer
+/// `binding_index`.
+void BufferCacheRuntime::BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset,
+                                                  u32 size) {
+    const auto gl_size = static_cast<GLsizeiptr>(size);
+    if (!use_assembly_shaders) {
+        // GLSL path: compute bindings are used as-is, without a per-stage base
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding_index, buffer.Handle(),
+                          static_cast<GLintptr>(offset), gl_size);
+        return;
+    }
+    // Assembly path: the offset is expressed by binding an alias buffer from offset zero
+    const GLuint alias = buffer.SubBuffer(offset);
+    glBindBufferRangeNV(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding_index, alias, 0,
+                        gl_size);
+}
+
+/// Binds the range [offset, offset + size) of `buffer` as storage buffer `binding_index` of
+/// graphics stage `stage`. `is_written` selects the residency access on the assembly path.
+/// NOTE(review): unlike BindComputeStorageBuffer, a zero `size` is not special-cased on the
+/// GLSL path here - confirm that this asymmetry is intended.
+void BufferCacheRuntime::BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer,
+                                           u32 offset, u32 size, bool is_written) {
+    if (!use_assembly_shaders) {
+        const GLuint stage_base = device.GetBaseBindings(stage).shader_storage_buffer;
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, stage_base + binding_index, buffer.Handle(),
+                          static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
+        return;
+    }
+    // Assembly path: pass a bindless descriptor through the program's local parameters
+    const BindlessSSBO descriptor{
+        .address = buffer.HostGpuAddr() + offset,
+        .length = static_cast<GLsizei>(size),
+        .padding = 0,
+    };
+    const GLenum access = is_written ? GL_READ_WRITE : GL_READ_ONLY;
+    buffer.MakeResident(access);
+    glProgramLocalParametersI4uivNV(PROGRAM_LUT[stage], binding_index, 1,
+                                    reinterpret_cast<const GLuint*>(&descriptor));
+}
+
+/// Binds the range [offset, offset + size) of `buffer` as compute storage buffer
+/// `binding_index`. `is_written` selects the residency access on the assembly path.
+void BufferCacheRuntime::BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset,
+                                                  u32 size, bool is_written) {
+    if (use_assembly_shaders) {
+        // Assembly path: pass a bindless descriptor through the program's local parameters
+        const BindlessSSBO descriptor{
+            .address = buffer.HostGpuAddr() + offset,
+            .length = static_cast<GLsizei>(size),
+            .padding = 0,
+        };
+        const GLenum access = is_written ? GL_READ_WRITE : GL_READ_ONLY;
+        buffer.MakeResident(access);
+        glProgramLocalParametersI4uivNV(GL_COMPUTE_PROGRAM_NV, binding_index, 1,
+                                        reinterpret_cast<const GLuint*>(&descriptor));
+        return;
+    }
+    if (size == 0) {
+        // An empty range binds buffer zero at this index instead of `buffer`
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, 0, 0, 0);
+        return;
+    }
+    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, buffer.Handle(),
+                      static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
+}
+
+void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset,
+                                                     u32 size) {
+    glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, index, buffer.Handle(),
+                      static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
 }

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -5,79 +5,167 @@
 #pragma once

 #include <array>
-#include <memory>
+#include <span>

+#include "common/alignment.h"
 #include "common/common_types.h"
+#include "common/dynamic_library.h"
 #include "video_core/buffer_cache/buffer_cache.h"
-#include "video_core/engines/maxwell_3d.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_memory_allocator.h"

-namespace Core {
-class System;
-}
+namespace Vulkan {
+class Device;
+class MemoryAllocator;
+} // namespace Vulkan

 namespace OpenGL {

-class Device;
-class OGLStreamBuffer;
-class RasterizerOpenGL;
-class StateTracker;
+class BufferCacheRuntime;

-class Buffer : public VideoCommon::BufferBlock {
+class Buffer : public VideoCommon::BufferBase<VideoCore::RasterizerInterface> {
 public:
-    explicit Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_);
-    ~Buffer();
+    explicit Buffer(BufferCacheRuntime&, VideoCore::RasterizerInterface& rasterizer, VAddr cpu_addr,
+                    u64 size_bytes);
+    explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams);

-    void Upload(std::size_t offset, std::size_t data_size, const u8* data);
+    void ImmediateUpload(size_t offset, std::span<const u8> data) noexcept;

-    void Download(std::size_t offset, std::size_t data_size, u8* data);
+    void ImmediateDownload(size_t offset, std::span<u8> data) noexcept;

-    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
-                  std::size_t copy_size);
+    void MakeResident(GLenum access) noexcept;

-    GLuint Handle() const noexcept {
-        return gl_buffer.handle;
+    [[nodiscard]] GLuint SubBuffer(u32 offset);
+
+    [[nodiscard]] GLuint64EXT HostGpuAddr() const noexcept {
+        return address;
    }

-    u64 Address() const noexcept {
-        return gpu_address;
+    [[nodiscard]] GLuint Handle() const noexcept {
+        return buffer.handle;
    }

 private:
-    OGLBuffer gl_buffer;
-    OGLBuffer read_buffer;
-    u64 gpu_address = 0;
+    void CreateMemoryObjects(BufferCacheRuntime& runtime);
+
+    GLuint64EXT address = 0;
+    Vulkan::MemoryCommit memory_commit;
+    OGLBuffer buffer;
+    GLenum current_residency_access = GL_NONE;
+    std::vector<std::pair<OGLBuffer, u32>> subs;
 };

-using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
-class OGLBufferCache final : public GenericBufferCache {
+class BufferCacheRuntime {
+    friend Buffer;
+
 public:
-    explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer,
-                            Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
-                            const Device& device, OGLStreamBuffer& stream_buffer,
-                            StateTracker& state_tracker);
-    ~OGLBufferCache();
+    static constexpr u8 INVALID_BINDING = std::numeric_limits<u8>::max();

-    BufferInfo GetEmptyBuffer(std::size_t) override;
+    explicit BufferCacheRuntime(const Device& device_, const Vulkan::Device* vulkan_device_,
+                                Vulkan::MemoryAllocator* vulkan_memory_allocator_);

-    void Acquire() noexcept {
-        cbuf_cursor = 0;
+    void CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
+                    std::span<const VideoCommon::BufferCopy> copies);
+
+    void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size);
+
+    void BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, u32 stride);
+
+    void BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size);
+
+    void BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size);
+
+    void BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size,
+                           bool is_written);
+
+    void BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size,
+                                  bool is_written);
+
+    void BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, u32 size);
+
+    void BindFastUniformBuffer(size_t stage, u32 binding_index, u32 size) {
+        if (use_assembly_shaders) {
+            const GLuint handle = fast_uniforms[stage][binding_index].handle;
+            const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size);
+            glBindBufferRangeNV(PABO_LUT[stage], binding_index, handle, 0, gl_size);
+        } else {
+            const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer;
+            const GLuint binding = base_binding + binding_index;
+            glBindBufferRange(GL_UNIFORM_BUFFER, binding,
+                              fast_uniforms[stage][binding_index].handle, 0,
+                              static_cast<GLsizeiptr>(size));
+        }
    }

-protected:
-    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
+    void PushFastUniformBuffer(size_t stage, u32 binding_index, std::span<const u8> data) {
+        if (use_assembly_shaders) {
+            glProgramBufferParametersIuivNV(
+                PABO_LUT[stage], binding_index, 0,
+                static_cast<GLsizei>(data.size_bytes() / sizeof(GLuint)),
+                reinterpret_cast<const GLuint*>(data.data()));
+        } else {
+            glNamedBufferSubData(fast_uniforms[stage][binding_index].handle, 0,
+                                 static_cast<GLsizeiptr>(data.size_bytes()), data.data());
+        }
+    }

-    BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
+    /// Requests `size` bytes from the stream buffer, binds that range as uniform buffer
+    /// `binding_index` of graphics stage `stage`, and returns the span handed back by the
+    /// stream buffer for the caller to fill.
+    /// `stream_buffer` is dereferenced unchecked. The constructor leaves it empty when
+    /// device.HasFastBufferSubData() is true, so this must only be called otherwise.
+    std::span<u8> BindMappedUniformBuffer(size_t stage, u32 binding_index, u32 size) noexcept {
+        const auto [mapped_span, offset] = stream_buffer->Request(static_cast<size_t>(size));
+        const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer;
+        const GLuint binding = base_binding + binding_index;
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, stream_buffer->Handle(),
+                          static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
+        return mapped_span;
+    }
+
+    [[nodiscard]] const GLvoid* IndexOffset() const noexcept {
+        return reinterpret_cast<const GLvoid*>(static_cast<uintptr_t>(index_buffer_offset));
+    }
+
+    [[nodiscard]] bool HasFastBufferSubData() const noexcept {
+        return device.HasFastBufferSubData();
+    }

 private:
-    static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
-                                             Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+    static constexpr std::array PABO_LUT{
+        GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV,          GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
+        GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
+        GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
+    };

    const Device& device;
+    const Vulkan::Device* vulkan_device;
+    Vulkan::MemoryAllocator* vulkan_memory_allocator;
+    std::optional<StreamBuffer> stream_buffer;

-    std::size_t cbuf_cursor = 0;
-    std::array<GLuint, NUM_CBUFS> cbufs{};
+    u32 max_attributes = 0;
+
+    bool use_assembly_shaders = false;
+    bool has_unified_vertex_buffers = false;
+
+    std::array<std::array<OGLBuffer, VideoCommon::NUM_GRAPHICS_UNIFORM_BUFFERS>,
+               VideoCommon::NUM_STAGES>
+        fast_uniforms;
+
+    u32 index_buffer_offset = 0;
 };

+/// Compile-time parameter bundle that instantiates VideoCommon::BufferCache for OpenGL.
+/// It names the backend runtime and buffer types and sets the feature flags the generic
+/// cache reads; the exact meaning of each flag is defined by VideoCommon::BufferCache.
+struct BufferCacheParams {
+    using Runtime = OpenGL::BufferCacheRuntime;
+    using Buffer = OpenGL::Buffer;
+
+    static constexpr bool IS_OPENGL = true;
+    static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true;
+    static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = true;
+    static constexpr bool NEEDS_BIND_UNIFORM_INDEX = true;
+    static constexpr bool NEEDS_BIND_STORAGE_INDEX = true;
+    static constexpr bool USE_MEMORY_MAPS = false;
+};
+
+using BufferCache = VideoCommon::BufferCache<BufferCacheParams>;
+
 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -21,9 +21,7 @@
 #include "video_core/renderer_opengl/gl_resource_manager.h"

 namespace OpenGL {
-
 namespace {
-
 // One uniform block is reserved for emulation purposes
 constexpr u32 ReservedUniformBlocks = 1;

@@ -198,10 +196,18 @@ bool IsASTCSupported() {
    return nsight || HasExtension(extensions, "GL_EXT_debug_tool");
 }

+/// Formats a 16-byte UUID as lowercase hexadecimal in the canonical 8-4-4-4-12 digit layout.
+/// Every byte is printed as exactly two digits (`{:02x}`) so bytes below 0x10 keep their
+/// leading zero, and the final group covers six bytes so that uuid[15] is included.
+/// @param uuid The GL_UUID_SIZE_EXT bytes of the identifier
+/// @return The 36-character textual UUID
+[[nodiscard]] std::string UuidString(std::span<const GLubyte, GL_UUID_SIZE_EXT> uuid) {
+    return fmt::format("{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-"
+                       "{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
+                       uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], uuid[6], uuid[7],
+                       uuid[8], uuid[9], uuid[10], uuid[11], uuid[12], uuid[13], uuid[14],
+                       uuid[15]);
+}
 } // Anonymous namespace

-Device::Device()
-    : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
+Device::Device(bool has_vulkan_instance) {
+    if (!GLAD_GL_VERSION_4_3) {
+        LOG_ERROR(Render_OpenGL, "OpenGL 4.3 is not available");
+        throw std::runtime_error{"Insufficient version"};
+    }
    const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
    const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
    const std::vector extensions = GetExtensions();
@@ -217,6 +223,9 @@ Device::Device()
            "Beta driver 443.24 is known to have issues. There might be performance issues.");
        disable_fast_buffer_sub_data = true;
    }
+
+    max_uniform_buffers = BuildMaxUniformBuffers();
+    base_bindings = BuildBaseBindings();
    uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
    shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
    max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
@@ -243,7 +252,8 @@ Device::Device()

    use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() &&
                           GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 &&
-                           GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2;
+                           GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2 &&
+                           has_vulkan_instance;

    use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue();

--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -10,18 +10,16 @@

 namespace OpenGL {

-static constexpr u32 EmulationUniformBlockBinding = 0;
-
-class Device final {
+class Device {
 public:
-    struct BaseBindings final {
+    struct BaseBindings {
        u32 uniform_buffer{};
        u32 shader_storage_buffer{};
        u32 sampler{};
        u32 image{};
    };

-    explicit Device();
+    explicit Device(bool has_vulkan_instance);
    explicit Device(std::nullptr_t);

    u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
--- a/src/video_core/renderer_opengl/gl_fence_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp
@@ -47,7 +47,7 @@ void GLInnerFence::Wait() {

 FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_,
                                       Tegra::GPU& gpu_, TextureCache& texture_cache_,
-                                       OGLBufferCache& buffer_cache_, QueryCache& query_cache_)
+                                       BufferCache& buffer_cache_, QueryCache& query_cache_)
    : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {}

 Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) {
--- a/src/video_core/renderer_opengl/gl_fence_manager.h
+++ b/src/video_core/renderer_opengl/gl_fence_manager.h
@@ -32,14 +32,13 @@ private:
 };

 using Fence = std::shared_ptr<GLInnerFence>;
-using GenericFenceManager =
-    VideoCommon::FenceManager<Fence, TextureCache, OGLBufferCache, QueryCache>;
+using GenericFenceManager = VideoCommon::FenceManager<Fence, TextureCache, BufferCache, QueryCache>;

 class FenceManagerOpenGL final : public GenericFenceManager {
 public:
-    explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
-                                TextureCache& texture_cache_, OGLBufferCache& buffer_cache_,
-                                QueryCache& query_cache_);
+    explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
+                                TextureCache& texture_cache, BufferCache& buffer_cache,
+                                QueryCache& query_cache);

 protected:
    Fence CreateFence(u32 value, bool is_stubbed) override;
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -44,17 +44,10 @@ using VideoCore::Surface::PixelFormat;
 using VideoCore::Surface::SurfaceTarget;
 using VideoCore::Surface::SurfaceType;

-MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Format Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_VB, "OpenGL", "Vertex Buffer Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_Shader, "OpenGL", "Shader Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_UBO, "OpenGL", "Const Buffer Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_Index, "OpenGL", "Index Buffer Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_Texture, "OpenGL", "Texture Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_Framebuffer, "OpenGL", "Framebuffer Setup", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
+MICROPROFILE_DEFINE(OpenGL_Clears, "OpenGL", "Clears", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
-MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255, 100, 100));
+MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Management", MP_RGB(100, 255, 100));

 namespace {

@@ -104,20 +97,6 @@ TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const
    return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index);
 }

-std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
-                               const ConstBufferEntry& entry) {
-    if (!entry.IsIndirect()) {
-        return entry.GetSize();
-    }
-    if (buffer.size > Maxwell::MaxConstBufferSize) {
-        LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size,
-                    Maxwell::MaxConstBufferSize);
-        return Maxwell::MaxConstBufferSize;
-    }
-
-    return buffer.size;
-}
-
 /// Translates hardware transform feedback indices
 /// @param location Hardware location
 /// @return Pair of ARB_transform_feedback3 token stream first and third arguments
@@ -150,14 +129,6 @@ void oglEnable(GLenum cap, bool state) {
    (state ? glEnable : glDisable)(cap);
 }

-void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) {
-    if (num_ssbos == 0) {
-        return;
-    }
-    glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos),
-                                    reinterpret_cast<const GLuint*>(ssbos));
-}
-
 ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) {
    if (entry.is_buffer) {
        return ImageViewType::Buffer;
@@ -199,49 +170,35 @@ ImageViewType ImageViewTypeFromEntry(const ImageEntry& entry) {

 RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
                                   Core::Memory::Memory& cpu_memory_, const Device& device_,
+                                   const Vulkan::Device* vulkan_device,
+                                   Vulkan::MemoryAllocator* vulkan_memory_allocator,
                                   ScreenInfo& screen_info_, ProgramManager& program_manager_,
                                   StateTracker& state_tracker_)
    : RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()),
      kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_),
      screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_),
-      stream_buffer(device, state_tracker),
      texture_cache_runtime(device, program_manager, state_tracker),
      texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
+      buffer_cache_runtime(device, vulkan_device, vulkan_memory_allocator),
+      buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime),
      shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device),
      query_cache(*this, maxwell3d, gpu_memory),
-      buffer_cache(*this, gpu_memory, cpu_memory_, device, stream_buffer, state_tracker),
      fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache),
      async_shaders(emu_window_) {
-    unified_uniform_buffer.Create();
-    glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
-
-    if (device.UseAssemblyShaders()) {
-        glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
-        for (const GLuint cbuf : staging_cbufs) {
-            glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
-                                 nullptr, 0);
-        }
-    }
    if (device.UseAsynchronousShaders()) {
        async_shaders.AllocateWorkers();
    }
 }

-RasterizerOpenGL::~RasterizerOpenGL() {
-    if (device.UseAssemblyShaders()) {
-        glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
-    }
-}
+RasterizerOpenGL::~RasterizerOpenGL() = default;

-void RasterizerOpenGL::SetupVertexFormat() {
+void RasterizerOpenGL::SyncVertexFormats() {
    auto& flags = maxwell3d.dirty.flags;
    if (!flags[Dirty::VertexFormats]) {
        return;
    }
    flags[Dirty::VertexFormats] = false;

-    MICROPROFILE_SCOPE(OpenGL_VAO);
-
    // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. Enables
    // the first 16 vertex attributes always, as we don't know which ones are actually used until
    // shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now to
@@ -277,55 +234,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
    }
 }

-void RasterizerOpenGL::SetupVertexBuffer() {
-    auto& flags = maxwell3d.dirty.flags;
-    if (!flags[Dirty::VertexBuffers]) {
-        return;
-    }
-    flags[Dirty::VertexBuffers] = false;
-
-    MICROPROFILE_SCOPE(OpenGL_VB);
-
-    const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
-
-    // Upload all guest vertex arrays sequentially to our buffer
-    const auto& regs = maxwell3d.regs;
-    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
-        if (!flags[Dirty::VertexBuffer0 + index]) {
-            continue;
-        }
-        flags[Dirty::VertexBuffer0 + index] = false;
-
-        const auto& vertex_array = regs.vertex_array[index];
-        if (!vertex_array.IsEnabled()) {
-            continue;
-        }
-
-        const GPUVAddr start = vertex_array.StartAddress();
-        const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-        ASSERT(end >= start);
-
-        const GLuint gl_index = static_cast<GLuint>(index);
-        const u64 size = end - start;
-        if (size == 0) {
-            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
-            if (use_unified_memory) {
-                glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
-            }
-            continue;
-        }
-        const auto info = buffer_cache.UploadMemory(start, size);
-        if (use_unified_memory) {
-            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
-            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
-                                   info.address + info.offset, size);
-        } else {
-            glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
-        }
-    }
-}
-
-void RasterizerOpenGL::SetupVertexInstances() {
+void RasterizerOpenGL::SyncVertexInstances() {
    auto& flags = maxwell3d.dirty.flags;
    if (!flags[Dirty::VertexInstances]) {
        return;
@@ -346,17 +255,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
    }
 }

-GLintptr RasterizerOpenGL::SetupIndexBuffer() {
-    MICROPROFILE_SCOPE(OpenGL_Index);
-    const auto& regs = maxwell3d.regs;
-    const std::size_t size = CalculateIndexBufferSize();
-    const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
-    return info.offset;
-}
-
-void RasterizerOpenGL::SetupShaders() {
-    MICROPROFILE_SCOPE(OpenGL_Shader);
+void RasterizerOpenGL::SetupShaders(bool is_indexed) {
    u32 clip_distances = 0;

    std::array<Shader*, Maxwell::MaxShaderStage> shaders{};
@@ -413,11 +312,19 @@ void RasterizerOpenGL::SetupShaders() {
        const size_t stage = index == 0 ? 0 : index - 1;
        shaders[stage] = shader;

-        SetupDrawConstBuffers(stage, shader);
-        SetupDrawGlobalMemory(stage, shader);
        SetupDrawTextures(shader, stage);
        SetupDrawImages(shader, stage);

+        buffer_cache.SetEnabledUniformBuffers(stage, shader->GetEntries().enabled_uniform_buffers);
+
+        buffer_cache.UnbindGraphicsStorageBuffers(stage);
+        u32 ssbo_index = 0;
+        for (const auto& buffer : shader->GetEntries().global_memory_entries) {
+            buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index,
+                                                   buffer.cbuf_offset, buffer.is_written);
+            ++ssbo_index;
+        }
+
        // Workaround for Intel drivers.
        // When a clip distance is enabled but not set in the shader it crops parts of the screen
        // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the
@@ -433,43 +340,26 @@ void RasterizerOpenGL::SetupShaders() {
    SyncClipEnabled(clip_distances);
    maxwell3d.dirty.flags[Dirty::Shaders] = false;

+    buffer_cache.UpdateGraphicsBuffers(is_indexed);
+
    const std::span indices_span(image_view_indices.data(), image_view_indices.size());
    texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);

+    buffer_cache.BindHostGeometryBuffers(is_indexed);
+
    size_t image_view_index = 0;
    size_t texture_index = 0;
    size_t image_index = 0;
    for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
        const Shader* const shader = shaders[stage];
-        if (shader) {
-            const auto base = device.GetBaseBindings(stage);
-            BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index,
-                         texture_index, image_index);
-        }
-    }
-}
-
-std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
-    const auto& regs = maxwell3d.regs;
-
-    std::size_t size = 0;
-    for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
-        if (!regs.vertex_array[index].IsEnabled())
+        if (!shader) {
            continue;
-
-        const GPUVAddr start = regs.vertex_array[index].StartAddress();
-        const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-
-        size += end - start;
-        ASSERT(end >= start);
+        }
+        buffer_cache.BindHostStageBuffers(stage);
+        const auto& base = device.GetBaseBindings(stage);
+        BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index,
+                     texture_index, image_index);
    }
-
-    return size;
-}
-
-std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const {
-    return static_cast<std::size_t>(maxwell3d.regs.index_array.count) *
-           static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes());
 }

 void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading,
@@ -478,6 +368,7 @@ void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& s
 }

 void RasterizerOpenGL::Clear() {
+    MICROPROFILE_SCOPE(OpenGL_Clears);
    if (!maxwell3d.ShouldExecute()) {
        return;
    }
@@ -528,11 +419,9 @@ void RasterizerOpenGL::Clear() {
    }
    UNIMPLEMENTED_IF(regs.clear_flags.viewport);

-    {
-        auto lock = texture_cache.AcquireLock();
-        texture_cache.UpdateRenderTargets(true);
-        state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
-    }
+    std::scoped_lock lock{texture_cache.mutex};
+    texture_cache.UpdateRenderTargets(true);
+    state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());

    if (use_color) {
        glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
@@ -544,7 +433,6 @@ void RasterizerOpenGL::Clear() {
    } else if (use_stencil) {
        glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
    }
-
    ++num_queued_commands;
 }

@@ -553,75 +441,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {

    query_cache.UpdateCounters();

-    SyncViewport();
-    SyncRasterizeEnable();
-    SyncPolygonModes();
-    SyncColorMask();
-    SyncFragmentColorClampState();
-    SyncMultiSampleState();
-    SyncDepthTestState();
-    SyncDepthClamp();
-    SyncStencilTestState();
-    SyncBlendState();
-    SyncLogicOpState();
-    SyncCullMode();
-    SyncPrimitiveRestart();
-    SyncScissorTest();
-    SyncPointState();
-    SyncLineState();
-    SyncPolygonOffset();
-    SyncAlphaTest();
-    SyncFramebufferSRGB();
-
-    buffer_cache.Acquire();
-    current_cbuf = 0;
-
-    std::size_t buffer_size = CalculateVertexArraysSize();
-
-    // Add space for index buffer
-    if (is_indexed) {
-        buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
-    }
-
-    // Uniform space for the 5 shader stages
-    buffer_size =
-        Common::AlignUp<std::size_t>(buffer_size, 4) +
-        (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;
-
-    // Add space for at least 18 constant buffers
-    buffer_size += Maxwell::MaxConstBuffers *
-                   (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
-
-    // Prepare the vertex array.
-    buffer_cache.Map(buffer_size);
-
-    // Prepare vertex array format.
-    SetupVertexFormat();
-
-    // Upload vertex and index data.
-    SetupVertexBuffer();
-    SetupVertexInstances();
-    GLintptr index_buffer_offset = 0;
-    if (is_indexed) {
-        index_buffer_offset = SetupIndexBuffer();
-    }
-
-    // Setup emulation uniform buffer.
-    if (!device.UseAssemblyShaders()) {
-        MaxwellUniformData ubo;
-        ubo.SetFromRegs(maxwell3d);
-        const auto info =
-            buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
-        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
-                          static_cast<GLsizeiptr>(sizeof(ubo)));
-    }
+    SyncState();

    // Setup shaders and their used resources.
-    auto lock = texture_cache.AcquireLock();
-    SetupShaders();
+    std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
+    SetupShaders(is_indexed);

-    // Signal the buffer cache that we are not going to upload more things.
-    buffer_cache.Unmap();
    texture_cache.UpdateRenderTargets(false);
    state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
    program_manager.BindGraphicsPipeline();
@@ -635,7 +460,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
    if (is_indexed) {
        const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base);
        const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count);
-        const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset);
+        const GLvoid* const offset = buffer_cache_runtime.IndexOffset();
        const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format);
        if (num_instances == 1 && base_instance == 0 && base_vertex == 0) {
            glDrawElements(primitive_mode, num_vertices, format, offset);
@@ -675,22 +500,22 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
 }

 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
-    buffer_cache.Acquire();
-    current_cbuf = 0;
-
    Shader* const kernel = shader_cache.GetComputeKernel(code_addr);

-    auto lock = texture_cache.AcquireLock();
+    std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
    BindComputeTextures(kernel);

-    const size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers *
-                               (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
-    buffer_cache.Map(buffer_size);
-
-    SetupComputeConstBuffers(kernel);
-    SetupComputeGlobalMemory(kernel);
-
-    buffer_cache.Unmap();
+    const auto& entries = kernel->GetEntries();
+    buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers);
+    buffer_cache.UnbindComputeStorageBuffers();
+    u32 ssbo_index = 0;
+    for (const auto& buffer : entries.global_memory_entries) {
+        buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset,
+                                              buffer.is_written);
+        ++ssbo_index;
+    }
+    buffer_cache.UpdateComputeBuffers();
+    buffer_cache.BindHostComputeBuffers();

    const auto& launch_desc = kepler_compute.launch_description;
    glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
@@ -706,6 +531,12 @@ void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
    query_cache.Query(gpu_addr, type, timestamp);
 }

+void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
+                                                 u32 size) {
+    std::scoped_lock lock{buffer_cache.mutex};
+    buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size);
+}
+
 void RasterizerOpenGL::FlushAll() {}

 void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
@@ -714,19 +545,43 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
        return;
    }
    {
-        auto lock = texture_cache.AcquireLock();
+        std::scoped_lock lock{texture_cache.mutex};
        texture_cache.DownloadMemory(addr, size);
    }
-    buffer_cache.FlushRegion(addr, size);
+    {
+        std::scoped_lock lock{buffer_cache.mutex};
+        buffer_cache.DownloadMemory(addr, size);
+    }
    query_cache.FlushRegion(addr, size);
 }

+void RasterizerOpenGL::InvalidateExceptTextureCache(VAddr addr, u64 size) {
+    if (addr == 0 || size == 0) {
+        return;
+    }
+    shader_cache.InvalidateRegion(addr, size);
+    {
+        std::scoped_lock lock{buffer_cache.mutex};
+        buffer_cache.WriteMemory(addr, size);
+    }
+    query_cache.InvalidateRegion(addr, size);
+}
+
+void RasterizerOpenGL::InvalidateTextureCache(VAddr addr, u64 size) {
+    if (addr == 0 || size == 0) {
+        return;
+    }
+    std::scoped_lock lock{texture_cache.mutex};
+    texture_cache.UnmapMemory(addr, size);
+}
+
 bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) {
+    std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
    if (!Settings::IsGPULevelHigh()) {
-        return buffer_cache.MustFlushRegion(addr, size);
+        return buffer_cache.IsRegionGpuModified(addr, size);
    }
    return texture_cache.IsRegionGpuModified(addr, size) ||
-           buffer_cache.MustFlushRegion(addr, size);
+           buffer_cache.IsRegionGpuModified(addr, size);
 }

 void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
@@ -735,11 +590,14 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
        return;
    }
    {
-        auto lock = texture_cache.AcquireLock();
+        std::scoped_lock lock{texture_cache.mutex};
        texture_cache.WriteMemory(addr, size);
    }
+    {
+        std::scoped_lock lock{buffer_cache.mutex};
+        buffer_cache.WriteMemory(addr, size);
+    }
    shader_cache.InvalidateRegion(addr, size);
-    buffer_cache.InvalidateRegion(addr, size);
    query_cache.InvalidateRegion(addr, size);
 }

@@ -748,26 +606,35 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
    if (addr == 0 || size == 0) {
        return;
    }
+    shader_cache.OnCPUWrite(addr, size);
    {
-        auto lock = texture_cache.AcquireLock();
+        std::scoped_lock lock{texture_cache.mutex};
        texture_cache.WriteMemory(addr, size);
    }
-    shader_cache.OnCPUWrite(addr, size);
-    buffer_cache.OnCPUWrite(addr, size);
+    {
+        std::scoped_lock lock{buffer_cache.mutex};
+        buffer_cache.CachedWriteMemory(addr, size);
+    }
 }

 void RasterizerOpenGL::SyncGuestHost() {
    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
-    buffer_cache.SyncGuestHost();
    shader_cache.SyncGuestHost();
+    {
+        std::scoped_lock lock{buffer_cache.mutex};
+        buffer_cache.FlushCachedWrites();
+    }
 }

 void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) {
    {
-        auto lock = texture_cache.AcquireLock();
+        std::scoped_lock lock{texture_cache.mutex};
        texture_cache.UnmapMemory(addr, size);
    }
-    buffer_cache.OnCPUWrite(addr, size);
+    {
+        std::scoped_lock lock{buffer_cache.mutex};
+        buffer_cache.WriteMemory(addr, size);
+    }
    shader_cache.OnCPUWrite(addr, size);
 }

@@ -802,14 +669,7 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
 }

 void RasterizerOpenGL::WaitForIdle() {
-    // Place a barrier on everything that is not framebuffer related.
-    // This is related to another flag that is not currently implemented.
-    glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT |
-                    GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT |
-                    GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT |
-                    GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT |
-                    GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT |
-                    GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT);
+    glMemoryBarrier(GL_ALL_BARRIER_BITS);
 }

 void RasterizerOpenGL::FragmentBarrier() {
@@ -834,18 +694,21 @@ void RasterizerOpenGL::TickFrame() {
    num_queued_commands = 0;

    fence_manager.TickFrame();
-    buffer_cache.TickFrame();
    {
-        auto lock = texture_cache.AcquireLock();
+        std::scoped_lock lock{texture_cache.mutex};
        texture_cache.TickFrame();
    }
+    {
+        std::scoped_lock lock{buffer_cache.mutex};
+        buffer_cache.TickFrame();
+    }
 }

 bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
                                             const Tegra::Engines::Fermi2D::Surface& dst,
                                             const Tegra::Engines::Fermi2D::Config& copy_config) {
    MICROPROFILE_SCOPE(OpenGL_Blits);
-    auto lock = texture_cache.AcquireLock();
+    std::scoped_lock lock{texture_cache.mutex};
    texture_cache.BlitImage(dst, src, copy_config);
    return true;
 }
@@ -857,7 +720,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
    }
    MICROPROFILE_SCOPE(OpenGL_CacheManagement);

-    auto lock = texture_cache.AcquireLock();
+    std::scoped_lock lock{texture_cache.mutex};
    ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)};
    if (!image_view) {
        return false;
@@ -924,166 +787,6 @@ void RasterizerOpenGL::BindTextures(const ShaderEntries& entries, GLuint base_te
    }
 }

-void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
-    static constexpr std::array PARAMETER_LUT{
-        GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV,          GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
-        GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
-        GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
-    };
-    MICROPROFILE_SCOPE(OpenGL_UBO);
-    const auto& stages = maxwell3d.state.shader_stages;
-    const auto& shader_stage = stages[stage_index];
-    const auto& entries = shader->GetEntries();
-    const bool use_unified = entries.use_unified_uniforms;
-    const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
-
-    const auto base_bindings = device.GetBaseBindings(stage_index);
-    u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
-    for (const auto& entry : entries.const_buffers) {
-        const u32 index = entry.GetIndex();
-        const auto& buffer = shader_stage.const_buffers[index];
-        SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
-                         base_unified_offset + index * Maxwell::MaxConstBufferSize);
-        ++binding;
-    }
-    if (use_unified) {
-        const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
-                                           entries.global_memory_entries.size());
-        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
-                          base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
-    }
-}
-
-void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {
-    MICROPROFILE_SCOPE(OpenGL_UBO);
-    const auto& launch_desc = kepler_compute.launch_description;
-    const auto& entries = kernel->GetEntries();
-    const bool use_unified = entries.use_unified_uniforms;
-
-    u32 binding = 0;
-    for (const auto& entry : entries.const_buffers) {
-        const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
-        const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
-        Tegra::Engines::ConstBufferInfo buffer;
-        buffer.address = config.Address();
-        buffer.size = config.size;
-        buffer.enabled = mask[entry.GetIndex()];
-        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
-                         use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
-        ++binding;
-    }
-    if (use_unified) {
-        const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
-        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
-                          NUM_CONST_BUFFERS_BYTES_PER_STAGE);
-    }
-}
-
-void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
-                                        const Tegra::Engines::ConstBufferInfo& buffer,
-                                        const ConstBufferEntry& entry, bool use_unified,
-                                        std::size_t unified_offset) {
-    if (!buffer.enabled) {
-        // Set values to zero to unbind buffers
-        if (device.UseAssemblyShaders()) {
-            glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
-        } else {
-            glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
-        }
-        return;
-    }
-
-    // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
-    // UBO alignment requirements.
-    const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
-
-    const bool fast_upload = !use_unified && device.HasFastBufferSubData();
-
-    const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
-    const GPUVAddr gpu_addr = buffer.address;
-    auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
-
-    if (device.UseAssemblyShaders()) {
-        UNIMPLEMENTED_IF(use_unified);
-        if (info.offset != 0) {
-            const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
-            glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
-            info.handle = staging_cbuf;
-            info.offset = 0;
-        }
-        glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
-        return;
-    }
-
-    if (use_unified) {
-        glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
-                                 unified_offset, size);
-    } else {
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
-    }
-}
-
-void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
-    static constexpr std::array TARGET_LUT = {
-        GL_VERTEX_PROGRAM_NV,   GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
-        GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
-    };
-    const auto& cbufs{maxwell3d.state.shader_stages[stage_index]};
-    const auto& entries{shader->GetEntries().global_memory_entries};
-
-    std::array<BindlessSSBO, 32> ssbos;
-    ASSERT(entries.size() < ssbos.size());
-
-    const bool assembly_shaders = device.UseAssemblyShaders();
-    u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
-    for (const auto& entry : entries) {
-        const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
-        const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
-        const u32 size{gpu_memory.Read<u32>(addr + 8)};
-        SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
-        ++binding;
-    }
-    if (assembly_shaders) {
-        UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size());
-    }
-}
-
-void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
-    const auto& cbufs{kepler_compute.launch_description.const_buffer_config};
-    const auto& entries{kernel->GetEntries().global_memory_entries};
-
-    std::array<BindlessSSBO, 32> ssbos;
-    ASSERT(entries.size() < ssbos.size());
-
-    u32 binding = 0;
-    for (const auto& entry : entries) {
-        const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset};
-        const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
-        const u32 size{gpu_memory.Read<u32>(addr + 8)};
-        SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
-        ++binding;
-    }
-    if (device.UseAssemblyShaders()) {
-        UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size());
-    }
-}
-
-void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
-                                         GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) {
-    const size_t alignment{device.GetShaderStorageBufferAlignment()};
-    const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
-    if (device.UseAssemblyShaders()) {
-        *ssbo = BindlessSSBO{
-            .address = static_cast<GLuint64EXT>(info.address + info.offset),
-            .length = static_cast<GLsizei>(size),
-            .padding = 0,
-        };
-    } else {
-        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
-                          static_cast<GLsizeiptr>(size));
-    }
-}
-
 void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) {
    const bool via_header_index =
        maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
@@ -1131,6 +834,30 @@ void RasterizerOpenGL::SetupComputeImages(const Shader* shader) {
    }
 }

+void RasterizerOpenGL::SyncState() {
+    SyncViewport();
+    SyncRasterizeEnable();
+    SyncPolygonModes();
+    SyncColorMask();
+    SyncFragmentColorClampState();
+    SyncMultiSampleState();
+    SyncDepthTestState();
+    SyncDepthClamp();
+    SyncStencilTestState();
+    SyncBlendState();
+    SyncLogicOpState();
+    SyncCullMode();
+    SyncPrimitiveRestart();
+    SyncScissorTest();
+    SyncPointState();
+    SyncLineState();
+    SyncPolygonOffset();
+    SyncAlphaTest();
+    SyncFramebufferSRGB();
+    SyncVertexFormats();
+    SyncVertexInstances();
+}
+
 void RasterizerOpenGL::SyncViewport() {
    auto& flags = maxwell3d.dirty.flags;
    const auto& regs = maxwell3d.regs;
@@ -1166,9 +893,11 @@ void RasterizerOpenGL::SyncViewport() {
        if (regs.screen_y_control.y_negate != 0) {
            flip_y = !flip_y;
        }
-        glClipControl(flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT,
-                      regs.depth_mode == Maxwell::DepthMode::ZeroToOne ? GL_ZERO_TO_ONE
-                                                                       : GL_NEGATIVE_ONE_TO_ONE);
+        const bool is_zero_to_one = regs.depth_mode == Maxwell::DepthMode::ZeroToOne;
+        const GLenum origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT;
+        const GLenum depth = is_zero_to_one ? GL_ZERO_TO_ONE : GL_NEGATIVE_ONE_TO_ONE;
+        state_tracker.ClipControl(origin, depth);
+        state_tracker.SetYNegate(regs.screen_y_control.y_negate != 0);
    }

    if (dirty_viewport) {
@@ -1652,36 +1381,13 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
    if (regs.tfb_enabled == 0) {
        return;
    }
-
    if (device.UseAssemblyShaders()) {
        SyncTransformFeedback();
    }
-
    UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
-
-    for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
-        const auto& binding = regs.tfb_bindings[index];
-        if (!binding.buffer_enable) {
-            if (enabled_transform_feedback_buffers[index]) {
-                glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0,
-                                  0);
-            }
-            enabled_transform_feedback_buffers[index] = false;
-            continue;
-        }
-        enabled_transform_feedback_buffers[index] = true;
-
-        auto& tfb_buffer = transform_feedback_buffers[index];
-        tfb_buffer.Create();
-
-        const GLuint handle = tfb_buffer.handle;
-        const std::size_t size = binding.buffer_size;
-        glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY);
-        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0,
-                          static_cast<GLsizeiptr>(size));
-    }
+    UNIMPLEMENTED_IF(primitive_mode != GL_POINTS);

    // We may have to call BeginTransformFeedbackNV here since they seem to call different
    // implementations on Nvidia's driver (the pointer is different) but we are using
@@ -1695,23 +1401,7 @@ void RasterizerOpenGL::EndTransformFeedback() {
    if (regs.tfb_enabled == 0) {
        return;
    }
-
    glEndTransformFeedback();
-
-    for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
-        const auto& binding = regs.tfb_bindings[index];
-        if (!binding.buffer_enable) {
-            continue;
-        }
-        UNIMPLEMENTED_IF(binding.buffer_offset != 0);
-
-        const GLuint handle = transform_feedback_buffers[index].handle;
-        const GPUVAddr gpu_addr = binding.Address();
-        const std::size_t size = binding.buffer_size;
-        const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
-        glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
-                                 static_cast<GLsizeiptr>(size));
-    }
 }

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -30,7 +30,6 @@
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/gl_state_tracker.h"
-#include "video_core/renderer_opengl/gl_stream_buffer.h"
 #include "video_core/renderer_opengl/gl_texture_cache.h"
 #include "video_core/shader/async_shaders.h"
 #include "video_core/textures/texture.h"
@@ -47,6 +46,11 @@ namespace Tegra {
 class MemoryManager;
 }

+namespace Vulkan {
+class Device;
+class MemoryAllocator;
+} // namespace Vulkan
+
 namespace OpenGL {

 struct ScreenInfo;
@@ -63,6 +67,8 @@ class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
 public:
    explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
                              Core::Memory::Memory& cpu_memory_, const Device& device_,
+                              const Vulkan::Device* vulkan_device,
+                              Vulkan::MemoryAllocator* vulkan_memory_allocator,
                              ScreenInfo& screen_info_, ProgramManager& program_manager_,
                              StateTracker& state_tracker_);
    ~RasterizerOpenGL() override;
@@ -72,8 +78,11 @@ public:
    void DispatchCompute(GPUVAddr code_addr) override;
    void ResetCounter(VideoCore::QueryType type) override;
    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
+    void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
    void FlushAll() override;
    void FlushRegion(VAddr addr, u64 size) override;
+    void InvalidateExceptTextureCache(VAddr addr, u64 size) override;
+    void InvalidateTextureCache(VAddr addr, u64 size) override;
    bool MustFlushRegion(VAddr addr, u64 size) override;
    void InvalidateRegion(VAddr addr, u64 size) override;
    void OnCPUWrite(VAddr addr, u64 size) override;
@@ -119,27 +128,6 @@ private:
    void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image,
                      size_t& image_view_index, size_t& texture_index, size_t& image_index);

-    /// Configures the current constbuffers to use for the draw command.
-    void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
-
-    /// Configures the current constbuffers to use for the kernel invocation.
-    void SetupComputeConstBuffers(Shader* kernel);
-
-    /// Configures a constant buffer.
-    void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                          const ConstBufferEntry& entry, bool use_unified,
-                          std::size_t unified_offset);
-
-    /// Configures the current global memory entries to use for the draw command.
-    void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);
-
-    /// Configures the current global memory entries to use for the kernel invocation.
-    void SetupComputeGlobalMemory(Shader* kernel);
-
-    /// Configures a global memory buffer.
-    void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
-                           size_t size, BindlessSSBO* ssbo);
-
    /// Configures the current textures to use for the draw command.
    void SetupDrawTextures(const Shader* shader, size_t stage_index);

@@ -152,6 +140,9 @@ private:
    /// Configures images in a compute shader.
    void SetupComputeImages(const Shader* shader);

+    /// Syncs state to match guest's
+    void SyncState();
+
    /// Syncs the viewport and depth range to match the guest state
    void SyncViewport();

@@ -215,6 +206,12 @@ private:
    /// Syncs the framebuffer sRGB state to match the guest state
    void SyncFramebufferSRGB();

+    /// Syncs vertex formats to match the guest state
+    void SyncVertexFormats();
+
+    /// Syncs vertex instances to match the guest state
+    void SyncVertexInstances();
+
    /// Syncs transform feedback state to match guest state
    /// @note Only valid on assembly shaders
    void SyncTransformFeedback();
@@ -225,19 +222,7 @@ private:
    /// End a transform feedback
    void EndTransformFeedback();

-    std::size_t CalculateVertexArraysSize() const;
-
-    std::size_t CalculateIndexBufferSize() const;
-
-    /// Updates the current vertex format
-    void SetupVertexFormat();
-
-    void SetupVertexBuffer();
-    void SetupVertexInstances();
-
-    GLintptr SetupIndexBuffer();
-
-    void SetupShaders();
+    void SetupShaders(bool is_indexed);

    Tegra::GPU& gpu;
    Tegra::Engines::Maxwell3D& maxwell3d;
@@ -249,12 +234,12 @@ private:
    ProgramManager& program_manager;
    StateTracker& state_tracker;

-    OGLStreamBuffer stream_buffer;
    TextureCacheRuntime texture_cache_runtime;
    TextureCache texture_cache;
+    BufferCacheRuntime buffer_cache_runtime;
+    BufferCache buffer_cache;
    ShaderCacheOpenGL shader_cache;
    QueryCache query_cache;
-    OGLBufferCache buffer_cache;
    FenceManagerOpenGL fence_manager;

    VideoCommon::Shader::AsyncShaders async_shaders;
@@ -262,20 +247,8 @@ private:
    boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices;
    std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids;
    boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles;
-    std::array<GLuint, MAX_TEXTURES> texture_handles;
-    std::array<GLuint, MAX_IMAGES> image_handles;
-
-    std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
-        transform_feedback_buffers;
-    std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
-        enabled_transform_feedback_buffers;
-
-    static constexpr std::size_t NUM_CONSTANT_BUFFERS =
-        Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
-        Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
-    std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
-    std::size_t current_cbuf = 0;
-    OGLBuffer unified_uniform_buffer;
+    std::array<GLuint, MAX_TEXTURES> texture_handles{};
+    std::array<GLuint, MAX_IMAGES> image_handles{};

    /// Number of commands queued to the OpenGL driver. Resetted on flush.
    std::size_t num_queued_commands = 0;
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -171,12 +171,6 @@ void OGLBuffer::Release() {
    handle = 0;
 }

-void OGLBuffer::MakeStreamCopy(std::size_t buffer_size) {
-    ASSERT_OR_EXECUTE((handle != 0 && buffer_size != 0), { return; });
-
-    glNamedBufferData(handle, buffer_size, nullptr, GL_STREAM_COPY);
-}
-
 void OGLSync::Create() {
    if (handle != 0)
        return;
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -234,9 +234,6 @@ public:
    /// Deletes the internal OpenGL resource
    void Release();

-    // Converts the buffer into a stream copy buffer with a fixed size
-    void MakeStreamCopy(std::size_t buffer_size);
-
    GLuint handle = 0;
 };

--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -63,7 +63,7 @@ using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>
 constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
 constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);

-constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
+constexpr std::string_view COMMON_DECLARATIONS = R"(#define ftoi floatBitsToInt
 #define ftou floatBitsToUint
 #define itof intBitsToFloat
 #define utof uintBitsToFloat
@@ -76,10 +76,6 @@ bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{

 const float fswzadd_modifiers_a[] = float[4](-1.0f,  1.0f, -1.0f,  0.0f );
 const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f,  1.0f, -1.0f );
-
-layout (std140, binding = {}) uniform vs_config {{
-    float y_direction;
-}};
 )";

 class ShaderWriter final {
@@ -401,13 +397,6 @@ std::string FlowStackTopName(MetaStackClass stack) {
    return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
 }

-bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
-    const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
-    // We waste one UBO for emulation
-    const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
-    return num_ubos > num_available_ubos;
-}
-
 struct GenericVaryingDescription {
    std::string name;
    u8 first_element = 0;
@@ -419,9 +408,8 @@ public:
    explicit GLSLDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_,
                            ShaderType stage_, std::string_view identifier_,
                            std::string_view suffix_)
-        : device{device_}, ir{ir_}, registry{registry_}, stage{stage_}, identifier{identifier_},
-          suffix{suffix_}, header{ir.GetHeader()}, use_unified_uniforms{
-                                                       UseUnifiedUniforms(device_, ir_, stage_)} {
+        : device{device_}, ir{ir_}, registry{registry_}, stage{stage_},
+          identifier{identifier_}, suffix{suffix_}, header{ir.GetHeader()} {
        if (stage != ShaderType::Compute) {
            transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
        }
@@ -515,7 +503,8 @@ private:
        if (!identifier.empty()) {
            code.AddLine("// {}", identifier);
        }
-        code.AddLine("#version 440 {}", ir.UsesLegacyVaryings() ? "compatibility" : "core");
+        const bool use_compatibility = ir.UsesLegacyVaryings() || ir.UsesYNegate();
+        code.AddLine("#version 440 {}", use_compatibility ? "compatibility" : "core");
        code.AddLine("#extension GL_ARB_separate_shader_objects : enable");
        if (device.HasShaderBallot()) {
            code.AddLine("#extension GL_ARB_shader_ballot : require");
@@ -541,7 +530,7 @@ private:

        code.AddNewLine();

-        code.AddLine(CommonDeclarations, EmulationUniformBlockBinding);
+        code.AddLine(COMMON_DECLARATIONS);
    }

    void DeclareVertex() {
@@ -864,17 +853,6 @@ private:
    }

    void DeclareConstantBuffers() {
-        if (use_unified_uniforms) {
-            const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
-                                static_cast<u32>(ir.GetGlobalMemory().size());
-            code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
-                         binding);
-            code.AddLine("    uint cbufs[];");
-            code.AddLine("}};");
-            code.AddNewLine();
-            return;
-        }
-
        u32 binding = device.GetBaseBindings(stage).uniform_buffer;
        for (const auto& [index, info] : ir.GetConstantBuffers()) {
            const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
@@ -1080,29 +1058,17 @@ private:

        if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
            const Node offset = cbuf->GetOffset();
-            const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;

            if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
                // Direct access
                const u32 offset_imm = immediate->GetValue();
                ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
-                if (use_unified_uniforms) {
-                    return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
-                            Type::Uint};
-                } else {
-                    return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
-                                        offset_imm / (4 * 4), (offset_imm / 4) % 4),
-                            Type::Uint};
-                }
-            }
-
-            // Indirect access
-            if (use_unified_uniforms) {
-                return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
-                                    Visit(offset).AsUint()),
+                return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
+                                    offset_imm / (4 * 4), (offset_imm / 4) % 4),
                        Type::Uint};
            }

+            // Indirect access
            const std::string final_offset = code.GenerateTemporary();
            code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());

@@ -2292,7 +2258,6 @@ private:
                }
            }
        }
-
        if (header.ps.omap.depth) {
            // The depth output is always 2 registers after the last color output, and current_reg
            // already contains one past the last color register.
@@ -2336,7 +2301,8 @@ private:
    }

    Expression YNegate(Operation operation) {
-        return {"y_direction", Type::Float};
+        // Y_NEGATE is read from the alpha of the built-in front material ambient color, which
+        // StateTracker::SetYNegate uploads through glMaterialfv
+        return {"gl_FrontMaterial.ambient.a", Type::Float};
    }

    template <u32 element>
@@ -2786,7 +2752,6 @@ private:
    const std::string_view identifier;
    const std::string_view suffix;
    const Header header;
-    const bool use_unified_uniforms;
    std::unordered_map<u8, VaryingTFB> transform_feedback;

    ShaderWriter code;
@@ -3002,8 +2967,10 @@ ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType s
    for (std::size_t i = 0; i < std::size(clip_distances); ++i) {
-        entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
+        entries.clip_distances |= (clip_distances[i] ? 1U : 0U) << i; // OR keeps earlier bits
    }
+    for (const auto& buffer : entries.const_buffers) {
+        entries.enabled_uniform_buffers |= 1U << buffer.GetIndex();
+    }
    entries.shader_length = ir.GetLength();
-    entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
    return entries;
 }

--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -55,7 +55,7 @@ struct ShaderEntries {
    std::vector<ImageEntry> images;
    std::size_t shader_length{};
    u32 clip_distances{};
-    bool use_unified_uniforms{};
+    u32 enabled_uniform_buffers{};
 };

 ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
--- a/src/video_core/renderer_opengl/gl_state_tracker.cpp
+++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp
@@ -36,16 +36,10 @@ void SetupDirtyColorMasks(Tables& tables) {
    FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks);
 }

-void SetupDirtyVertexArrays(Tables& tables) {
-    static constexpr std::size_t num_array = 3;
+void SetupDirtyVertexInstances(Tables& tables) {
    static constexpr std::size_t instance_base_offset = 3;
    for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) {
        const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]);
-        const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]);
-
-        FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers);
-        FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers);
-
        const std::size_t instance_array_offset = array_offset + instance_base_offset;
        tables[0][instance_array_offset] = static_cast<u8>(VertexInstance0 + i);
        tables[1][instance_array_offset] = VertexInstances;
@@ -217,11 +211,11 @@ void SetupDirtyMisc(Tables& tables) {
 StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} {
    auto& dirty = gpu.Maxwell3D().dirty;
    auto& tables = dirty.tables;
-    SetupDirtyRenderTargets(tables);
+    SetupDirtyFlags(tables);
    SetupDirtyColorMasks(tables);
    SetupDirtyViewports(tables);
    SetupDirtyScissors(tables);
-    SetupDirtyVertexArrays(tables);
+    SetupDirtyVertexInstances(tables);
    SetupDirtyVertexFormat(tables);
    SetupDirtyShaders(tables);
    SetupDirtyPolygonModes(tables);
@@ -241,19 +235,6 @@ StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags}
    SetupDirtyClipControl(tables);
    SetupDirtyDepthClampEnabled(tables);
    SetupDirtyMisc(tables);
-
-    auto& store = dirty.on_write_stores;
-    store[VertexBuffers] = true;
-    for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) {
-        store[VertexBuffer0 + i] = true;
-    }
-}
-
-void StateTracker::InvalidateStreamBuffer() {
-    flags[Dirty::VertexBuffers] = true;
-    for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
-        flags[index] = true;
-    }
 }

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_state_tracker.h
+++ b/src/video_core/renderer_opengl/gl_state_tracker.h
@@ -28,10 +28,6 @@ enum : u8 {
    VertexFormat0,
    VertexFormat31 = VertexFormat0 + 31,

-    VertexBuffers,
-    VertexBuffer0,
-    VertexBuffer31 = VertexBuffer0 + 31,
-
    VertexInstances,
    VertexInstance0,
    VertexInstance31 = VertexInstance0 + 31,
@@ -92,8 +88,6 @@ class StateTracker {
 public:
    explicit StateTracker(Tegra::GPU& gpu);

-    void InvalidateStreamBuffer();
-
    void BindIndexBuffer(GLuint new_index_buffer) {
        if (index_buffer == new_index_buffer) {
            return;
@@ -110,13 +104,32 @@ public:
        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
    }

+    /// Forwards clip control to OpenGL, skipping the call when both cached values already match
+    void ClipControl(GLenum new_origin, GLenum new_depth) {
+        if (new_origin != origin || new_depth != depth) {
+            origin = new_origin;
+            depth = new_depth;
+            glClipControl(origin, depth);
+        }
+    }
+
+    /// Caches Y_NEGATE and uploads it where shaders read it: gl_FrontMaterial.ambient.a
+    void SetYNegate(bool new_y_negate) {
+        if (new_y_negate != y_negate) {
+            y_negate = new_y_negate;
+            const float sign = y_negate ? -1.0f : 1.0f;
+            const std::array ambient{0.0f, 0.0f, 0.0f, sign};
+            glMaterialfv(GL_FRONT, GL_AMBIENT, ambient.data());
+        }
+    }
+
    void NotifyScreenDrawVertexArray() {
        flags[OpenGL::Dirty::VertexFormats] = true;
        flags[OpenGL::Dirty::VertexFormat0 + 0] = true;
        flags[OpenGL::Dirty::VertexFormat0 + 1] = true;

-        flags[OpenGL::Dirty::VertexBuffers] = true;
-        flags[OpenGL::Dirty::VertexBuffer0] = true;
+        flags[VideoCommon::Dirty::VertexBuffers] = true;
+        flags[VideoCommon::Dirty::VertexBuffer0] = true;

        flags[OpenGL::Dirty::VertexInstances] = true;
        flags[OpenGL::Dirty::VertexInstance0 + 0] = true;
@@ -202,6 +215,9 @@ private:

    GLuint framebuffer = 0;
    GLuint index_buffer = 0;
+    GLenum origin = GL_LOWER_LEFT;
+    GLenum depth = GL_NEGATIVE_ONE_TO_ONE;
+    bool y_negate = false;
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -1,70 +1,64 @@
-// Copyright 2018 Citra Emulator Project
+// Copyright 2021 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <tuple>
-#include <vector>
+#include <array>
+#include <memory>
+#include <span>
+
+#include <glad/glad.h>

 #include "common/alignment.h"
 #include "common/assert.h"
-#include "common/microprofile.h"
-#include "video_core/renderer_opengl/gl_device.h"
-#include "video_core/renderer_opengl/gl_state_tracker.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"

-MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
-                    MP_RGB(128, 128, 192));
-
 namespace OpenGL {

-OGLStreamBuffer::OGLStreamBuffer(const Device& device, StateTracker& state_tracker_)
-    : state_tracker{state_tracker_} {
-    gl_buffer.Create();
-
-    static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
-    glNamedBufferStorage(gl_buffer.handle, BUFFER_SIZE, nullptr, flags);
-    mapped_ptr = static_cast<u8*>(
-        glMapNamedBufferRange(gl_buffer.handle, 0, BUFFER_SIZE, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
-
-    if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
-        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
-        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+// Creates the buffer storage, maps the whole buffer for persistent coherent writes and creates
+// every fence in the fences array.
+// NOTE(review): OGLSync::Create returns early when the handle is already set, so a later Create()
+// on a fence that was never released keeps the object created here - confirm this is intended.
+StreamBuffer::StreamBuffer() {
+    static constexpr GLenum flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT;
+    buffer.Create();
+    glObjectLabel(GL_BUFFER, buffer.handle, -1, "Stream Buffer");
+    glNamedBufferStorage(buffer.handle, STREAM_BUFFER_SIZE, nullptr, flags);
+    // The same flag set is used for storage creation and for the mapping itself
+    mapped_pointer =
+        static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, STREAM_BUFFER_SIZE, flags));
+    for (OGLSync& sync : fences) {
+        sync.Create();
    }
 }

-OGLStreamBuffer::~OGLStreamBuffer() {
-    glUnmapNamedBuffer(gl_buffer.handle);
-    gl_buffer.Release();
-}
-
-std::pair<u8*, GLintptr> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
-    ASSERT(size <= BUFFER_SIZE);
-    ASSERT(alignment <= BUFFER_SIZE);
-    mapped_size = size;
-
-    if (alignment > 0) {
-        buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
+// Reserves `size` bytes of the mapped buffer and returns the writable span together with its
+// byte offset inside the buffer. `size` is asserted to be smaller than REGION_SIZE.
+std::pair<std::span<u8>, size_t> StreamBuffer::Request(size_t size) noexcept {
+    ASSERT(size < REGION_SIZE);
+    // Create the fences of the regions that were fully passed since the previous request
+    for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end;
+         ++region) {
+        fences[region].Create();
    }
+    used_iterator = iterator;

-    if (buffer_pos + size > BUFFER_SIZE) {
-        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
-        glInvalidateBufferData(gl_buffer.handle);
-        state_tracker.InvalidateStreamBuffer();
-
-        buffer_pos = 0;
+    // Wait on and release the fences of the regions this request newly reaches, starting after
+    // the region of free_iterator and clamped to the last region of the buffer
+    for (size_t region = Region(free_iterator) + 1,
+                region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS);
+         region < region_end; ++region) {
+        glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED);
+        fences[region].Release();
    }
-
-    return std::make_pair(mapped_ptr + buffer_pos, buffer_pos);
-}
-
-void OGLStreamBuffer::Unmap(GLsizeiptr size) {
-    ASSERT(size <= mapped_size);
-
-    if (size > 0) {
-        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
+    // free_iterator only moves forward here: it records the furthest end reached so far
+    if (iterator + size > free_iterator) {
+        free_iterator = iterator + size;
    }
+    // The request would run past the end of the buffer: create the fences from the current
+    // region up to the last one and restart the allocation at offset zero
+    if (iterator + size > STREAM_BUFFER_SIZE) {
+        for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) {
+            fences[region].Create();
+        }
+        used_iterator = 0;
+        iterator = 0;
+        free_iterator = size;

-    buffer_pos += size;
+        // Wait on and release the fences of every region the restarted request overlaps
+        for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) {
+            glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED);
+            fences[region].Release();
+        }
+    }
+    // Hand out the current offset and keep the next one aligned to MAX_ALIGNMENT
+    const size_t offset = iterator;
+    iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT);
+    return {std::span(mapped_pointer + offset, size), offset};
 }

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -1,9 +1,12 @@
-// Copyright 2018 Citra Emulator Project
+// Copyright 2021 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

 #pragma once

+#include <array>
+#include <memory>
+#include <span>
 #include <utility>

 #include <glad/glad.h>
@@ -13,48 +16,35 @@

 namespace OpenGL {

-class Device;
-class StateTracker;
+/// Mapped OpenGL buffer that hands out sub-ranges through Request(). The buffer is split into
+/// NUM_SYNCS regions of REGION_SIZE bytes, each with its own sync object in `fences`.
+class StreamBuffer {
+    /// Total size of the buffer in bytes
+    static constexpr size_t STREAM_BUFFER_SIZE = 64 * 1024 * 1024;
+    /// Number of regions, and of sync objects, the buffer is divided into
+    static constexpr size_t NUM_SYNCS = 16;
+    /// Size in bytes of each region
+    static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS;
+    /// Alignment applied to the offset that follows each request
+    static constexpr size_t MAX_ALIGNMENT = 256;
+    static_assert(STREAM_BUFFER_SIZE % MAX_ALIGNMENT == 0);
+    static_assert(STREAM_BUFFER_SIZE % NUM_SYNCS == 0);
+    static_assert(REGION_SIZE % MAX_ALIGNMENT == 0);

-class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(const Device& device, StateTracker& state_tracker_);
-    ~OGLStreamBuffer();
+    explicit StreamBuffer();

-    /*
-     * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
-     * and the optional alignment requirement.
-     * If the buffer is full, the whole buffer is reallocated which invalidates old chunks.
-     * The return values are the pointer to the new chunk, and the offset within the buffer.
-     * The actual used size must be specified on unmapping the chunk.
-     */
-    std::pair<u8*, GLintptr> Map(GLsizeiptr size, GLintptr alignment = 0);
+    /// Returns a writable span of `size` bytes and its byte offset within the buffer
+    [[nodiscard]] std::pair<std::span<u8>, size_t> Request(size_t size) noexcept;

-    void Unmap(GLsizeiptr size);
-
-    GLuint Handle() const {
-        return gl_buffer.handle;
-    }
-
-    u64 Address() const {
-        return gpu_address;
-    }
-
-    GLsizeiptr Size() const noexcept {
-        return BUFFER_SIZE;
+    /// Returns the OpenGL name of the underlying buffer object
+    [[nodiscard]] GLuint Handle() const noexcept {
+        return buffer.handle;
    }

 private:
-    static constexpr GLsizeiptr BUFFER_SIZE = 256 * 1024 * 1024;
+    /// Returns the index of the region that contains the given byte offset
+    [[nodiscard]] static size_t Region(size_t offset) noexcept {
+        return offset / REGION_SIZE;
+    }

-    StateTracker& state_tracker;
-
-    OGLBuffer gl_buffer;
-
-    GLuint64EXT gpu_address = 0;
-    GLintptr buffer_pos = 0;
-    GLsizeiptr mapped_size = 0;
-    u8* mapped_ptr = nullptr;
+    size_t iterator = 0;      ///< Offset the next request starts at
+    size_t used_iterator = 0; ///< Value of iterator when the previous request began
+    size_t free_iterator = 0; ///< Furthest end offset reached by a request since the last restart
+    u8* mapped_pointer = nullptr; ///< Base address of the mapping returned by OpenGL
+    OGLBuffer buffer;
+    std::array<OGLSync, NUM_SYNCS> fences;
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -398,9 +398,6 @@ void AttachTexture(GLuint fbo, GLenum attachment, const ImageView* image_view) {

 } // Anonymous namespace

-ImageBufferMap::ImageBufferMap(GLuint handle_, u8* map, size_t size, OGLSync* sync_)
-    : span(map, size), sync{sync_}, handle{handle_} {}
-
 ImageBufferMap::~ImageBufferMap() {
    if (sync) {
        sync->Create();
@@ -487,11 +484,11 @@ void TextureCacheRuntime::Finish() {
    glFinish();
 }

-ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) {
+ImageBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) {
    return upload_buffers.RequestMap(size, true);
 }

-ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) {
+ImageBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) {
    return download_buffers.RequestMap(size, false);
 }

@@ -596,7 +593,11 @@ ImageBufferMap TextureCacheRuntime::StagingBuffers::RequestMap(size_t requested_
                                                               bool insert_fence) {
    const size_t index = RequestBuffer(requested_size);
    OGLSync* const sync = insert_fence ? &syncs[index] : nullptr;
-    return ImageBufferMap(buffers[index].handle, maps[index], requested_size, sync);
+    return ImageBufferMap{
+        .mapped_span = std::span(maps[index], requested_size),
+        .sync = sync,
+        .buffer = buffers[index].handle,
+    };
 }

 size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) {
@@ -711,7 +712,7 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_,

 void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
                         std::span<const VideoCommon::BufferImageCopy> copies) {
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.Handle());
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer);
    glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes);

    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
@@ -735,7 +736,7 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
 void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
                         std::span<const VideoCommon::BufferCopy> copies) {
    for (const VideoCommon::BufferCopy& copy : copies) {
-        glCopyNamedBufferSubData(map.Handle(), buffer.handle, copy.src_offset + buffer_offset,
+        glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + buffer_offset,
                                 copy.dst_offset, copy.size);
    }
 }
@@ -744,7 +745,7 @@ void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset,
                           std::span<const VideoCommon::BufferImageCopy> copies) {
    glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API

-    glBindBuffer(GL_PIXEL_PACK_BUFFER, map.Handle());
+    glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer);
    glPixelStorei(GL_PACK_ALIGNMENT, 1);

    u32 current_row_length = std::numeric_limits<u32>::max();
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -31,23 +31,12 @@ using VideoCommon::NUM_RT;
 using VideoCommon::Offset2D;
 using VideoCommon::RenderTargets;

-class ImageBufferMap {
-public:
-    explicit ImageBufferMap(GLuint handle, u8* map, size_t size, OGLSync* sync);
+/// Mapped range of a staging buffer, as filled in by TextureCacheRuntime::StagingBuffers::RequestMap
+struct ImageBufferMap {
    ~ImageBufferMap();

-    GLuint Handle() const noexcept {
-        return handle;
-    }
-
-    std::span<u8> Span() const noexcept {
-        return span;
-    }
-
-private:
-    std::span<u8> span;
+    /// Bytes of the mapped staging buffer, sized to the requested size
+    std::span<u8> mapped_span;
+    /// Sync object of the staging buffer; null when the map was requested without a fence
    OGLSync* sync;
-    GLuint handle;
+    /// OpenGL name of the staging buffer behind mapped_span
+    GLuint buffer;
 };

 struct FormatProperties {
@@ -69,9 +58,9 @@ public:

    void Finish();

-    ImageBufferMap MapUploadBuffer(size_t size);
+    ImageBufferMap UploadStagingBuffer(size_t size);

-    ImageBufferMap MapDownloadBuffer(size_t size);
+    ImageBufferMap DownloadStagingBuffer(size_t size);

    void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);

--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -27,11 +27,14 @@
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
 #include "video_core/textures/decoders.h"
+#include "video_core/vulkan_common/vulkan_debug_callback.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_instance.h"
+#include "video_core/vulkan_common/vulkan_library.h"
+#include "video_core/vulkan_common/vulkan_memory_allocator.h"

 namespace OpenGL {
-
 namespace {
-
 constexpr GLint PositionLocation = 0;
 constexpr GLint TexCoordLocation = 1;
 constexpr GLint ModelViewMatrixLocation = 0;
@@ -125,14 +128,100 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit
    }
 }

+/// Finds the Vulkan physical device whose driver/device UUIDs match those reported by OpenGL.
+/// Throws vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER) when no enumerated device matches.
+Vulkan::vk::PhysicalDevice FindPhysicalDevice(Vulkan::vk::Instance& instance) {
+    using namespace Vulkan;
+    using UUID = std::array<GLubyte, GL_UUID_SIZE_EXT>;
+    GLint num_device_uuids = 0; // Stays zero if the driver rejects the query
+    glGetIntegerv(GL_NUM_DEVICE_UUIDS_EXT, &num_device_uuids);
+    std::vector<UUID> device_uuids(num_device_uuids);
+    for (GLint index = 0; index < num_device_uuids; ++index) {
+        // Each UUID lives at its own index; asking for index 0 every time only sees one device
+        glGetUnsignedBytei_vEXT(GL_DEVICE_UUID_EXT, index, device_uuids[index].data());
+    }
+    UUID driver_uuid{};
+    glGetUnsignedBytevEXT(GL_DRIVER_UUID_EXT, driver_uuid.data());
+
+    for (const VkPhysicalDevice raw_physical_device : instance.EnumeratePhysicalDevices()) {
+        VkPhysicalDeviceIDProperties device_id_properties{};
+        device_id_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES;
+        VkPhysicalDeviceProperties2KHR properties{
+            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+            .pNext = &device_id_properties,
+            .properties{},
+        };
+        vk::PhysicalDevice physical_device(raw_physical_device, instance.Dispatch());
+        physical_device.GetProperties2KHR(properties);
+        if (!std::ranges::equal(device_id_properties.driverUUID, driver_uuid)) {
+            continue;
+        }
+        const auto matches_device = [&device_id_properties](const UUID& uuid) {
+            return std::ranges::equal(device_id_properties.deviceUUID, uuid);
+        };
+        if (std::ranges::any_of(device_uuids, matches_device)) {
+            return physical_device;
+        }
+    }
+    throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER);
+}
 } // Anonymous namespace

+struct VulkanObjects {
+    static std::unique_ptr<VulkanObjects> TryCreate() { // nullptr when interop is not used
+        if (!GLAD_GL_EXT_memory_object) {
+            // GL_EXT_memory_object is missing, so interop is not present
+            return nullptr;
+        }
+        const std::string_view vendor{reinterpret_cast<const char*>(glGetString(GL_VENDOR))};
+        if (vendor == "ATI Technologies Inc.") {
+            // Avoid using GL_EXT_memory_object on AMD, as it makes the GL driver crash
+            return nullptr;
+        }
+        if (!Settings::values.use_assembly_shaders.GetValue()) {
+            // We only need interop when assembly shaders are enabled
+            return nullptr;
+        }
+#ifdef __linux__
+        LOG_WARNING(Render_OpenGL, "Interop doesn't work on Linux at the moment");
+        return nullptr;
+#endif
+        try {
+            return std::make_unique<VulkanObjects>(); // Runs the member initializers below
+        } catch (const Vulkan::vk::Exception& exception) {
+            LOG_ERROR(Render_OpenGL, "Failed to initialize Vulkan objects with error: {}",
+                      exception.what());
+            return nullptr;
+        }
+    }
+
+    Common::DynamicLibrary library{Vulkan::OpenLibrary()}; // Each member uses the ones above it
+    Vulkan::vk::InstanceDispatch dld;
+    Vulkan::vk::Instance instance{Vulkan::CreateInstance(library, dld, VK_API_VERSION_1_1)};
+    Vulkan::Device device{*instance, FindPhysicalDevice(instance), nullptr, dld};
+    Vulkan::MemoryAllocator memory_allocator{device, true};
+};
+
 RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_,
                               Core::Frontend::EmuWindow& emu_window_,
                               Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
                               std::unique_ptr<Core::Frontend::GraphicsContext> context_)
    : RendererBase{emu_window_, std::move(context_)}, telemetry_session{telemetry_session_},
-      emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device} {}
+      emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_},
+      vulkan_objects{VulkanObjects::TryCreate()}, device{vulkan_objects != nullptr},
+      state_tracker{gpu}, program_manager{device},
+      rasterizer(emu_window, gpu, cpu_memory, device,
+                 vulkan_objects ? &vulkan_objects->device : nullptr,
+                 vulkan_objects ? &vulkan_objects->memory_allocator : nullptr, screen_info,
+                 program_manager, state_tracker) {
+    if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
+        glEnable(GL_DEBUG_OUTPUT);
+        glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
+        glDebugMessageCallback(DebugHandler, nullptr);
+    }
+    AddTelemetryFields();
+    InitOpenGLObjects();
+}

 RendererOpenGL::~RendererOpenGL() = default;

@@ -148,7 +237,7 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {

    ++m_current_frame;

-    rasterizer->TickFrame();
+    rasterizer.TickFrame();

    context->SwapBuffers();
    render_window.OnFrameDisplayed();
@@ -179,7 +268,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
    framebuffer_crop_rect = framebuffer.crop_rect;

    const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset};
-    if (rasterizer->AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) {
+    if (rasterizer.AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) {
        return;
    }

@@ -267,6 +356,7 @@ void RendererOpenGL::InitOpenGLObjects() {
    // Enable unified vertex attributes and query vertex buffer address when the driver supports it
    if (device.HasVertexBufferUnifiedMemory()) {
        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+        glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);

        glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
        glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
@@ -289,14 +379,6 @@ void RendererOpenGL::AddTelemetryFields() {
    telemetry_session.AddField(user_system, "GPU_OpenGL_Version", std::string(gl_version));
 }

-void RendererOpenGL::CreateRasterizer() {
-    if (rasterizer) {
-        return;
-    }
-    rasterizer = std::make_unique<RasterizerOpenGL>(emu_window, gpu, cpu_memory, device,
-                                                    screen_info, program_manager, state_tracker);
-}
-
 void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
                                                 const Tegra::FramebufferConfig& framebuffer) {
    texture.width = framebuffer.width;
@@ -407,6 +489,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {

    program_manager.BindHostPipeline(pipeline.handle);

+    state_tracker.ClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
    glEnable(GL_CULL_FACE);
    if (screen_info.display_srgb) {
        glEnable(GL_FRAMEBUFFER_SRGB);
@@ -425,7 +508,6 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
    glCullFace(GL_BACK);
    glFrontFace(GL_CW);
    glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
-    glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
    glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width),
                       static_cast<GLfloat>(layout.height));
    glDepthRangeIndexed(0, 0.0, 0.0);
@@ -497,25 +579,4 @@ void RendererOpenGL::RenderScreenshot() {
    renderer_settings.screenshot_requested = false;
 }

-bool RendererOpenGL::Init() {
-    if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
-        glEnable(GL_DEBUG_OUTPUT);
-        glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
-        glDebugMessageCallback(DebugHandler, nullptr);
-    }
-
-    AddTelemetryFields();
-
-    if (!GLAD_GL_VERSION_4_3) {
-        return false;
-    }
-
-    InitOpenGLObjects();
-    CreateRasterizer();
-
-    return true;
-}
-
-void RendererOpenGL::ShutDown() {}
-
 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -10,6 +10,7 @@
 #include "common/math_util.h"
 #include "video_core/renderer_base.h"
 #include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/gl_state_tracker.h"
@@ -37,6 +38,8 @@ class GPU;

 namespace OpenGL {

+struct VulkanObjects;
+
 /// Structure used for storing information about the textures for the Switch screen
 struct TextureInfo {
    OGLTexture resource;
@@ -63,18 +66,18 @@ public:
                            std::unique_ptr<Core::Frontend::GraphicsContext> context_);
    ~RendererOpenGL() override;

-    bool Init() override;
-    void ShutDown() override;
    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;

+    VideoCore::RasterizerInterface* ReadRasterizer() override {
+        return &rasterizer; // Non-owning; the rasterizer is a member of this renderer
+    }
+
 private:
    /// Initializes the OpenGL state and creates persistent objects.
    void InitOpenGLObjects();

    void AddTelemetryFields();

-    void CreateRasterizer();
-
    void ConfigureFramebufferTexture(TextureInfo& texture,
                                     const Tegra::FramebufferConfig& framebuffer);

@@ -98,8 +101,11 @@ private:
    Core::Memory::Memory& cpu_memory;
    Tegra::GPU& gpu;

-    const Device device;
-    StateTracker state_tracker{gpu};
+    std::unique_ptr<VulkanObjects> vulkan_objects;
+    Device device;
+    StateTracker state_tracker;
+    ProgramManager program_manager;
+    RasterizerOpenGL rasterizer;

    // OpenGL object IDs
    OGLSampler present_sampler;
@@ -115,9 +121,6 @@ private:
    /// Display information for Switch screen
    ScreenInfo screen_info;

-    /// Global dummy shader pipeline
-    ProgramManager program_manager;
-
    /// OpenGL framebuffer data
    std::vector<u8> gl_framebuffer_data;

--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -71,7 +71,7 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
    static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;

    program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle);
-    glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes);
+    glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);

    const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format));
@@ -91,8 +91,8 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
        glUniform1ui(5, params.x_shift);
        glUniform1ui(6, params.block_height);
        glUniform1ui(7, params.block_height_mask);
-        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(),
-                          input_offset, image.guest_size_bytes - swizzle.buffer_offset);
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
+                          image.guest_size_bytes - swizzle.buffer_offset);
        glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0,
                           GL_WRITE_ONLY, store_format);
        glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers);
@@ -108,7 +108,7 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
    static constexpr GLuint BINDING_INPUT_BUFFER = 1;
    static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;

-    glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes);
+    glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
    program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);

@@ -132,8 +132,8 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
        glUniform1ui(7, params.block_height_mask);
        glUniform1ui(8, params.block_depth);
        glUniform1ui(9, params.block_depth_mask);
-        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(),
-                          input_offset, image.guest_size_bytes - swizzle.buffer_offset);
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
+                          image.guest_size_bytes - swizzle.buffer_offset);
        glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0,
                           GL_WRITE_ONLY, store_format);
        glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z);
@@ -159,7 +159,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
                         "Non-power of two images are not implemented");

    program_manager.BindHostCompute(pitch_unswizzle_program.handle);
-    glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes);
+    glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
    glUniform2ui(LOC_ORIGIN, 0, 0);
    glUniform2i(LOC_DESTINATION, 0, 0);
    glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block);
@@ -172,8 +172,8 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
        const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
        const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);

-        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(),
-                          input_offset, image.guest_size_bytes - swizzle.buffer_offset);
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
+                          image.guest_size_bytes - swizzle.buffer_offset);
        glDispatchCompute(num_dispatches_x, num_dispatches_y, 1);
    }
    program_manager.RestoreGuestCompute();
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -15,9 +15,10 @@
 namespace OpenGL {

 class Image;
-class ImageBufferMap;
 class ProgramManager;

+struct ImageBufferMap;
+
 class UtilShaders {
 public:
    explicit UtilShaders(ProgramManager& program_manager);
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -526,13 +526,9 @@ VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison) {
    return {};
 }

-VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format) {
+VkIndexType IndexFormat(Maxwell::IndexFormat index_format) {
    switch (index_format) {
    case Maxwell::IndexFormat::UnsignedByte:
-        if (!device.IsExtIndexTypeUint8Supported()) {
-            UNIMPLEMENTED_MSG("Native uint8 indices are not supported on this device");
-            return VK_INDEX_TYPE_UINT16;
-        }
        return VK_INDEX_TYPE_UINT8_EXT;
    case Maxwell::IndexFormat::UnsignedShort:
        return VK_INDEX_TYPE_UINT16;
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.h
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h
@@ -45,7 +45,7 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib

 VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison);

-VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format);
+VkIndexType IndexFormat(Maxwell::IndexFormat index_format);

 VkStencilOp StencilOp(Maxwell::StencilOp stencil_op);

--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -80,17 +80,50 @@ std::string BuildCommaSeparatedExtensions(std::vector<std::string> available_ext
    return separated_extensions;
 }

+// Builds the Device for the physical device index chosen in the settings; throws if out of range
+Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld,
+                    VkSurfaceKHR surface) {
+    const std::vector<VkPhysicalDevice> adapters = instance.EnumeratePhysicalDevices();
+    const s32 selected = Settings::values.vulkan_device.GetValue();
+    if (selected < 0 || static_cast<s32>(adapters.size()) <= selected) {
+        LOG_ERROR(Render_Vulkan, "Invalid device index {}!", selected);
+        throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED);
+    }
+    return Device(*instance, vk::PhysicalDevice(adapters[selected], dld), surface, dld);
+}
 } // Anonymous namespace

 RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_,
                               Core::Frontend::EmuWindow& emu_window,
                               Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
-                               std::unique_ptr<Core::Frontend::GraphicsContext> context_)
-    : RendererBase{emu_window, std::move(context_)}, telemetry_session{telemetry_session_},
-      cpu_memory{cpu_memory_}, gpu{gpu_} {}
+                               std::unique_ptr<Core::Frontend::GraphicsContext> context_) try
+    : RendererBase(emu_window, std::move(context_)),
+      telemetry_session(telemetry_session_),
+      cpu_memory(cpu_memory_),
+      gpu(gpu_),
+      library(OpenLibrary()),
+      instance(CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type,
+                              true, Settings::values.renderer_debug)),
+      debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr),
+      surface(CreateSurface(instance, render_window)),
+      device(CreateDevice(instance, dld, *surface)),
+      memory_allocator(device, false),
+      state_tracker(gpu),
+      scheduler(device, state_tracker),
+      swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width,
+                render_window.GetFramebufferLayout().height, false),
+      blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, scheduler,
+                  screen_info),
+      rasterizer(render_window, gpu, gpu.MemoryManager(), cpu_memory, screen_info, device,
+                 memory_allocator, state_tracker, scheduler) {
+    Report();
+} catch (const vk::Exception& exception) {
+    LOG_ERROR(Render_Vulkan, "Vulkan initialization failed with error: {}", exception.what());
+    throw std::runtime_error{fmt::format("Vulkan initialization error {}", exception.what())};
+}

 RendererVulkan::~RendererVulkan() {
-    ShutDown();
+    void(device.GetLogical().WaitIdle());
 }

 void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
@@ -101,101 +134,38 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
    if (layout.width > 0 && layout.height > 0 && render_window.IsShown()) {
        const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset;
        const bool use_accelerated =
-            rasterizer->AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride);
+            rasterizer.AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride);
        const bool is_srgb = use_accelerated && screen_info.is_srgb;
-        if (swapchain->HasFramebufferChanged(layout) || swapchain->GetSrgbState() != is_srgb) {
-            swapchain->Create(layout.width, layout.height, is_srgb);
-            blit_screen->Recreate();
+        if (swapchain.HasFramebufferChanged(layout) || swapchain.GetSrgbState() != is_srgb) {
+            swapchain.Create(layout.width, layout.height, is_srgb);
+            blit_screen.Recreate();
        }

-        scheduler->WaitWorker();
+        scheduler.WaitWorker();

-        swapchain->AcquireNextImage();
-        const VkSemaphore render_semaphore = blit_screen->Draw(*framebuffer, use_accelerated);
+        swapchain.AcquireNextImage();
+        const VkSemaphore render_semaphore = blit_screen.Draw(*framebuffer, use_accelerated);

-        scheduler->Flush(render_semaphore);
+        scheduler.Flush(render_semaphore);

-        if (swapchain->Present(render_semaphore)) {
-            blit_screen->Recreate();
+        if (swapchain.Present(render_semaphore)) {
+            blit_screen.Recreate();
        }
-
-        rasterizer->TickFrame();
+        rasterizer.TickFrame();
    }

    render_window.OnFrameDisplayed();
 }

-bool RendererVulkan::Init() try {
-    library = OpenLibrary();
-    instance = CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type,
-                              true, Settings::values.renderer_debug);
-    if (Settings::values.renderer_debug) {
-        debug_callback = CreateDebugCallback(instance);
-    }
-    surface = CreateSurface(instance, render_window);
-
-    InitializeDevice();
-    Report();
-
-    memory_allocator = std::make_unique<MemoryAllocator>(*device);
-
-    state_tracker = std::make_unique<StateTracker>(gpu);
-
-    scheduler = std::make_unique<VKScheduler>(*device, *state_tracker);
-
-    const auto& framebuffer = render_window.GetFramebufferLayout();
-    swapchain = std::make_unique<VKSwapchain>(*surface, *device, *scheduler);
-    swapchain->Create(framebuffer.width, framebuffer.height, false);
-
-    rasterizer = std::make_unique<RasterizerVulkan>(render_window, gpu, gpu.MemoryManager(),
-                                                    cpu_memory, screen_info, *device,
-                                                    *memory_allocator, *state_tracker, *scheduler);
-
-    blit_screen =
-        std::make_unique<VKBlitScreen>(cpu_memory, render_window, *rasterizer, *device,
-                                       *memory_allocator, *swapchain, *scheduler, screen_info);
-    return true;
-
-} catch (const vk::Exception& exception) {
-    LOG_ERROR(Render_Vulkan, "Vulkan initialization failed with error: {}", exception.what());
-    return false;
-}
-
-void RendererVulkan::ShutDown() {
-    if (!device) {
-        return;
-    }
-    if (const auto& dev = device->GetLogical()) {
-        dev.WaitIdle();
-    }
-    rasterizer.reset();
-    blit_screen.reset();
-    scheduler.reset();
-    swapchain.reset();
-    memory_allocator.reset();
-    device.reset();
-}
-
-void RendererVulkan::InitializeDevice() {
-    const std::vector<VkPhysicalDevice> devices = instance.EnumeratePhysicalDevices();
-    const s32 device_index = Settings::values.vulkan_device.GetValue();
-    if (device_index < 0 || device_index >= static_cast<s32>(devices.size())) {
-        LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index);
-        throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED);
-    }
-    const vk::PhysicalDevice physical_device(devices[static_cast<size_t>(device_index)], dld);
-    device = std::make_unique<Device>(*instance, physical_device, *surface, dld);
-}
-
 void RendererVulkan::Report() const {
-    const std::string vendor_name{device->GetVendorName()};
-    const std::string model_name{device->GetModelName()};
-    const std::string driver_version = GetDriverVersion(*device);
+    const std::string vendor_name{device.GetVendorName()};
+    const std::string model_name{device.GetModelName()};
+    const std::string driver_version = GetDriverVersion(device);
    const std::string driver_name = fmt::format("{} {}", vendor_name, driver_version);

-    const std::string api_version = GetReadableVersion(device->ApiVersion());
+    const std::string api_version = GetReadableVersion(device.ApiVersion());

-    const std::string extensions = BuildCommaSeparatedExtensions(device->GetAvailableExtensions());
+    const std::string extensions = BuildCommaSeparatedExtensions(device.GetAvailableExtensions());

    LOG_INFO(Render_Vulkan, "Driver: {}", driver_name);
    LOG_INFO(Render_Vulkan, "Device: {}", model_name);
@@ -209,21 +179,4 @@ void RendererVulkan::Report() const {
    telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions);
 }

-std::vector<std::string> RendererVulkan::EnumerateDevices() try {
-    vk::InstanceDispatch dld;
-    const Common::DynamicLibrary library = OpenLibrary();
-    const vk::Instance instance = CreateInstance(library, dld, VK_API_VERSION_1_0);
-    const std::vector<VkPhysicalDevice> physical_devices = instance.EnumeratePhysicalDevices();
-    std::vector<std::string> names;
-    names.reserve(physical_devices.size());
-    for (const VkPhysicalDevice device : physical_devices) {
-        names.push_back(vk::PhysicalDevice(device, dld).GetProperties().deviceName);
-    }
-    return names;
-
-} catch (const vk::Exception& exception) {
-    LOG_ERROR(Render_Vulkan, "Failed to enumerate devices with error: {}", exception.what());
-    return {};
-}
-
 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/renderer_vulkan.h
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.h
@@ -9,8 +9,14 @@
 #include <vector>

 #include "common/dynamic_library.h"
-
 #include "video_core/renderer_base.h"
+#include "video_core/renderer_vulkan/vk_blit_screen.h"
+#include "video_core/renderer_vulkan/vk_rasterizer.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/renderer_vulkan/vk_state_tracker.h"
+#include "video_core/renderer_vulkan/vk_swapchain.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_memory_allocator.h"
 #include "video_core/vulkan_common/vulkan_wrapper.h"

 namespace Core {
@@ -27,20 +33,6 @@ class GPU;

 namespace Vulkan {

-class Device;
-class StateTracker;
-class MemoryAllocator;
-class VKBlitScreen;
-class VKSwapchain;
-class VKScheduler;
-
-struct VKScreenInfo {
-    VkImageView image_view{};
-    u32 width{};
-    u32 height{};
-    bool is_srgb{};
-};
-
 class RendererVulkan final : public VideoCore::RendererBase {
 public:
    explicit RendererVulkan(Core::TelemetrySession& telemtry_session,
@@ -49,15 +41,13 @@ public:
                            std::unique_ptr<Core::Frontend::GraphicsContext> context_);
    ~RendererVulkan() override;

-    bool Init() override;
-    void ShutDown() override;
    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;

-    static std::vector<std::string> EnumerateDevices();
+    VideoCore::RasterizerInterface* ReadRasterizer() override {
+        return &rasterizer; // Non-owning; the rasterizer is a member of this renderer
+    }

 private:
-    void InitializeDevice();
-
    void Report() const;

    Core::TelemetrySession& telemetry_session;
@@ -68,18 +58,18 @@ private:
    vk::InstanceDispatch dld;

    vk::Instance instance;
-
+    vk::DebugUtilsMessenger debug_callback;
    vk::SurfaceKHR surface;

    VKScreenInfo screen_info;

-    vk::DebugUtilsMessenger debug_callback;
-    std::unique_ptr<Device> device;
-    std::unique_ptr<MemoryAllocator> memory_allocator;
-    std::unique_ptr<StateTracker> state_tracker;
-    std::unique_ptr<VKScheduler> scheduler;
-    std::unique_ptr<VKSwapchain> swapchain;
-    std::unique_ptr<VKBlitScreen> blit_screen;
+    Device device;
+    MemoryAllocator memory_allocator;
+    StateTracker state_tracker;
+    VKScheduler scheduler;
+    VKSwapchain swapchain;
+    VKBlitScreen blit_screen;
+    RasterizerVulkan rasterizer;
 };

 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
@@ -18,7 +18,6 @@
 #include "video_core/gpu.h"
 #include "video_core/host_shaders/vulkan_present_frag_spv.h"
 #include "video_core/host_shaders/vulkan_present_vert_spv.h"
-#include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_vulkan/renderer_vulkan.h"
 #include "video_core/renderer_vulkan/vk_blit_screen.h"
 #include "video_core/renderer_vulkan/vk_master_semaphore.h"
@@ -113,13 +112,12 @@ struct VKBlitScreen::BufferData {
 };

 VKBlitScreen::VKBlitScreen(Core::Memory::Memory& cpu_memory_,
-                           Core::Frontend::EmuWindow& render_window_,
-                           VideoCore::RasterizerInterface& rasterizer_, const Device& device_,
+                           Core::Frontend::EmuWindow& render_window_, const Device& device_,
                           MemoryAllocator& memory_allocator_, VKSwapchain& swapchain_,
                           VKScheduler& scheduler_, const VKScreenInfo& screen_info_)
-    : cpu_memory{cpu_memory_}, render_window{render_window_}, rasterizer{rasterizer_},
-      device{device_}, memory_allocator{memory_allocator_}, swapchain{swapchain_},
-      scheduler{scheduler_}, image_count{swapchain.GetImageCount()}, screen_info{screen_info_} {
+    : cpu_memory{cpu_memory_}, render_window{render_window_}, device{device_},
+      memory_allocator{memory_allocator_}, swapchain{swapchain_}, scheduler{scheduler_},
+      image_count{swapchain.GetImageCount()}, screen_info{screen_info_} {
    resource_ticks.resize(image_count);

    CreateStaticResources();
@@ -150,8 +148,8 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
    SetUniformData(data, framebuffer);
    SetVertexData(data, framebuffer);

-    const std::span<u8> map = buffer_commit.Map();
-    std::memcpy(map.data(), &data, sizeof(data));
+    const std::span<u8> mapped_span = buffer_commit.Map();
+    std::memcpy(mapped_span.data(), &data, sizeof(data));

    if (!use_accelerated) {
        const u64 image_offset = GetRawImageOffset(framebuffer, image_index);
@@ -159,14 +157,13 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
        const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset;
        const u8* const host_ptr = cpu_memory.GetPointer(framebuffer_addr);
        const size_t size_bytes = GetSizeInBytes(framebuffer);
-        rasterizer.FlushRegion(ToCacheAddr(host_ptr), size_bytes);

        // TODO(Rodrigo): Read this from HLE
        constexpr u32 block_height_log2 = 4;
        const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer);
        Tegra::Texture::UnswizzleTexture(
-            map.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes), bytes_per_pixel,
-            framebuffer.width, framebuffer.height, 1, block_height_log2, 0);
+            mapped_span.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes),
+            bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0);

        const VkBufferImageCopy copy{
            .bufferOffset = image_offset,
@@ -266,7 +263,6 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
        cmdbuf.Draw(4, 1, 0, 0);
        cmdbuf.EndRenderPass();
    });
-
    return *semaphores[image_index];
 }

--- a/src/video_core/renderer_vulkan/vk_blit_screen.h
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.h
@@ -38,12 +38,18 @@ class RasterizerVulkan;
 class VKScheduler;
 class VKSwapchain;

-class VKBlitScreen final {
+struct VKScreenInfo {
+    VkImageView image_view{};
+    u32 width{};
+    u32 height{};
+    bool is_srgb{};
+};
+
+class VKBlitScreen {
 public:
    explicit VKBlitScreen(Core::Memory::Memory& cpu_memory,
-                          Core::Frontend::EmuWindow& render_window,
-                          VideoCore::RasterizerInterface& rasterizer, const Device& device,
-                          MemoryAllocator& memory_allocator, VKSwapchain& swapchain,
+                          Core::Frontend::EmuWindow& render_window, const Device& device,
+                          MemoryAllocator& memory_manager, VKSwapchain& swapchain,
                          VKScheduler& scheduler, const VKScreenInfo& screen_info);
    ~VKBlitScreen();

@@ -84,7 +90,6 @@ private:

    Core::Memory::Memory& cpu_memory;
    Core::Frontend::EmuWindow& render_window;
-    VideoCore::RasterizerInterface& rasterizer;
    const Device& device;
    MemoryAllocator& memory_allocator;
    VKSwapchain& swapchain;
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -3,188 +3,276 @@
 // Refer to the license.txt file included.

 #include <algorithm>
+#include <array>
 #include <cstring>
-#include <memory>
+#include <span>
+#include <vector>

-#include "core/core.h"
 #include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/renderer_vulkan/maxwell_to_vk.h"
 #include "video_core/renderer_vulkan/vk_buffer_cache.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
-#include "video_core/renderer_vulkan/vk_stream_buffer.h"
+#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
+#include "video_core/renderer_vulkan/vk_update_descriptor.h"
 #include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_memory_allocator.h"
 #include "video_core/vulkan_common/vulkan_wrapper.h"

 namespace Vulkan {
-
 namespace {
+VkBufferCopy MakeBufferCopy(const VideoCommon::BufferCopy& copy) {
+    return VkBufferCopy{
+        .srcOffset = copy.src_offset,
+        .dstOffset = copy.dst_offset,
+        .size = copy.size,
+    };
+}

-constexpr VkBufferUsageFlags BUFFER_USAGE =
-    VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
-    VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+// Picks the narrowest usable index type; uint8 additionally requires device support.
+VkIndexType IndexTypeFromNumElements(const Device& device, u32 num_elements) {
+    const bool fits_u8 = num_elements <= 0xff;
+    const bool fits_u16 = num_elements <= 0xffff;
+    if (fits_u8 && device.IsExtIndexTypeUint8Supported()) {
+        return VK_INDEX_TYPE_UINT8_EXT;
+    }
+    return fits_u16 ? VK_INDEX_TYPE_UINT16 : VK_INDEX_TYPE_UINT32;
+}

-constexpr VkPipelineStageFlags UPLOAD_PIPELINE_STAGE =
-    VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
-    VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
-    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
-
-constexpr VkAccessFlags UPLOAD_ACCESS_BARRIERS =
-    VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT |
-    VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_INDEX_READ_BIT;
-
-constexpr VkAccessFlags TRANSFORM_FEEDBACK_WRITE_ACCESS =
-    VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT;
+// Size in bytes of one index of the given type; other values hit UNREACHABLE_MSG and yield 1.
+size_t BytesPerIndex(VkIndexType index_type) {
+    if (index_type == VK_INDEX_TYPE_UINT8_EXT) {
+        return 1;
+    } else if (index_type == VK_INDEX_TYPE_UINT16) {
+        return 2;
+    } else if (index_type == VK_INDEX_TYPE_UINT32) {
+        return 4;
+    }
+    // Not one of the three index types handled above
+    UNREACHABLE_MSG("Invalid index type={}", index_type);
+    return 1;
+}

+template <typename T>
+std::array<T, 6> MakeQuadIndices(u32 quad, u32 first) {
+    // A quad (v0, v1, v2, v3) is emitted as the triangles (v0, v1, v2) and (v0, v2, v3).
+    // Indices are computed in u32 and narrowed to T explicitly instead of implicitly.
+    const auto at = [base = first + quad * 4](u32 i) { return static_cast<T>(base + i); };
+    return {at(0), at(1), at(2), at(0), at(2), at(3)};
+}
 } // Anonymous namespace

-Buffer::Buffer(const Device& device_, MemoryAllocator& memory_allocator, VKScheduler& scheduler_,
-               StagingBufferPool& staging_pool_, VAddr cpu_addr_, std::size_t size_)
-    : BufferBlock{cpu_addr_, size_}, device{device_}, scheduler{scheduler_}, staging_pool{
-                                                                                 staging_pool_} {
-    buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
+Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params)
+    : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {}
+
+Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
+               VAddr cpu_addr_, u64 size_bytes_)
+    : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_) {
+    buffer = runtime.device.GetLogical().CreateBuffer(VkBufferCreateInfo{
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .pNext = nullptr,
        .flags = 0,
-        .size = static_cast<VkDeviceSize>(size_),
-        .usage = BUFFER_USAGE | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+        .size = SizeBytes(),
+        .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
+                 VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT |
+                 VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
+                 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
+                 VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
        .queueFamilyIndexCount = 0,
        .pQueueFamilyIndices = nullptr,
    });
-    commit = memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal);
+    if (runtime.device.HasDebuggingToolAttached()) {
+        buffer.SetObjectNameEXT(fmt::format("Buffer 0x{:x}", CpuAddr()).c_str());
+    }
+    commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal);
 }

-Buffer::~Buffer() = default;
+BufferCacheRuntime::BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_allocator_,
+                                       VKScheduler& scheduler_, StagingBufferPool& staging_pool_,
+                                       VKUpdateDescriptorQueue& update_descriptor_queue_,
+                                       VKDescriptorPool& descriptor_pool)
+    : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
+      staging_pool{staging_pool_}, update_descriptor_queue{update_descriptor_queue_},
+      uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
+      quad_index_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue) {}

-void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) {
-    const auto& staging = staging_pool.Request(data_size, MemoryUsage::Upload);
-    std::memcpy(staging.mapped_span.data(), data, data_size);
+StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) {
+    return staging_pool.Request(size, MemoryUsage::Upload);
+}

+StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) {
+    return staging_pool.Request(size, MemoryUsage::Download);
+}
+
+void BufferCacheRuntime::Finish() {
+    scheduler.Finish();
+}
+
+void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
+                                    std::span<const VideoCommon::BufferCopy> copies) {
+    static constexpr VkMemoryBarrier READ_BARRIER{
+        .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+        .pNext = nullptr,
+        .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+        .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT,
+    };
+    static constexpr VkMemoryBarrier WRITE_BARRIER{
+        .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+        .pNext = nullptr,
+        .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+        .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
+    };
+    // Measured on a popular game: the copy count never exceeds this inline capacity after warm-up
+    boost::container::small_vector<VkBufferCopy, 3> vk_copies(copies.size());
+    std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy);
    scheduler.RequestOutsideRenderPassOperationContext();
+    scheduler.Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) {
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                               0, READ_BARRIER);
+        cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies);
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                               0, WRITE_BARRIER);
+    });
+}

-    const VkBuffer handle = Handle();
-    scheduler.Record([staging = staging.buffer, handle, offset, data_size,
-                      &device = device](vk::CommandBuffer cmdbuf) {
-        const VkBufferMemoryBarrier read_barrier{
-            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
-            .pNext = nullptr,
-            .srcAccessMask =
-                VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT |
-                VK_ACCESS_HOST_WRITE_BIT |
-                (device.IsExtTransformFeedbackSupported() ? TRANSFORM_FEEDBACK_WRITE_ACCESS : 0),
-            .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
-            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = handle,
-            .offset = offset,
-            .size = data_size,
+void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format,
+                                         u32 base_vertex, u32 num_indices, VkBuffer buffer,
+                                         u32 offset, [[maybe_unused]] u32 size) {
+    VkIndexType index_type = MaxwellToVK::IndexFormat(index_format);
+    if (topology == PrimitiveTopology::Quads) {
+        index_type = VK_INDEX_TYPE_UINT32;
+        std::tie(buffer, offset) =
+            quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset);
+    } else if (index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) {
+        index_type = VK_INDEX_TYPE_UINT16;
+        std::tie(buffer, offset) = uint8_pass.Assemble(num_indices, buffer, offset);
+    }
+    scheduler.Record([buffer, offset, index_type](vk::CommandBuffer cmdbuf) {
+        cmdbuf.BindIndexBuffer(buffer, offset, index_type);
+    });
+}
+
+void BufferCacheRuntime::BindQuadArrayIndexBuffer(u32 first, u32 count) {
+    ReserveQuadArrayLUT(first + count, true);
+
+    // The LUT stores four consecutive tables, one for each value of 'first % 4'.
+    // Select the table with the modulus, then skip 'first / 4' whole quads inside it.
+    const VkIndexType index_type = quad_array_lut_index_type;
+    const size_t sub_first_offset = static_cast<size_t>(first % 4) * (current_num_indices / 4);
+    const size_t offset = (sub_first_offset + first / 4) * 6ULL * BytesPerIndex(index_type);
+    scheduler.Record([buffer = *quad_array_lut, index_type, offset](vk::CommandBuffer cmdbuf) {
+        cmdbuf.BindIndexBuffer(buffer, offset, index_type);
+    });
+}
+
+void BufferCacheRuntime::BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size,
+                                          u32 stride) {
+    if (!device.IsExtExtendedDynamicStateSupported()) {
+        scheduler.Record([index, buffer, offset](vk::CommandBuffer cmdbuf) {
+            cmdbuf.BindVertexBuffer(index, buffer, offset);
+        });
+        return; // size and stride are not used on this path
+    }
+    const VkDeviceSize vk_offset = offset;
+    const VkDeviceSize vk_size = buffer != VK_NULL_HANDLE ? size : VK_WHOLE_SIZE;
+    const VkDeviceSize vk_stride = stride;
+    scheduler.Record([index, buffer, vk_offset, vk_size, vk_stride](vk::CommandBuffer cmdbuf) {
+        cmdbuf.BindVertexBuffers2EXT(index, 1, &buffer, &vk_offset, &vk_size, &vk_stride);
+    });
+}
+
+// Binds a transform feedback buffer range; does nothing when the device lacks support.
+void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset,
+                                                     u32 size) {
+    if (!device.IsExtTransformFeedbackSupported()) {
+        return; // Already logged in the rasterizer
+    }
+    const VkDeviceSize vk_offset = offset;
+    const VkDeviceSize vk_size = size;
+    scheduler.Record([index, buffer, vk_offset, vk_size](vk::CommandBuffer cmdbuf) {
+        cmdbuf.BindTransformFeedbackBuffersEXT(index, 1, &buffer, &vk_offset, &vk_size);
+    });
+}
+
+void BufferCacheRuntime::BindBuffer(VkBuffer buffer, u32 offset, u32 size) {
+    update_descriptor_queue.AddBuffer(buffer, offset, size);
+}
+
+void BufferCacheRuntime::ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle) {
+    if (num_indices <= current_num_indices) {
+        return;
+    }
+    if (wait_for_idle) {
+        scheduler.Finish();
+    }
+    current_num_indices = num_indices;
+    quad_array_lut_index_type = IndexTypeFromNumElements(device, num_indices);
+
+    const u32 num_quads = num_indices / 4;
+    const u32 num_triangle_indices = num_quads * 6;
+    const u32 num_first_offset_copies = 4;
+    const size_t bytes_per_index = BytesPerIndex(quad_array_lut_index_type);
+    const size_t size_bytes = num_triangle_indices * bytes_per_index * num_first_offset_copies;
+    quad_array_lut = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
+        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .size = size_bytes,
+        .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 0,
+        .pQueueFamilyIndices = nullptr,
+    });
+    if (device.HasDebuggingToolAttached()) {
+        quad_array_lut.SetObjectNameEXT("Quad LUT");
+    }
+    quad_array_lut_commit = memory_allocator.Commit(quad_array_lut, MemoryUsage::DeviceLocal);
+
+    const StagingBufferRef staging = staging_pool.Request(size_bytes, MemoryUsage::Upload);
+    u8* staging_data = staging.mapped_span.data();
+    const size_t quad_size = bytes_per_index * 6;
+    for (u32 first = 0; first < num_first_offset_copies; ++first) {
+        for (u32 quad = 0; quad < num_quads; ++quad) {
+            switch (quad_array_lut_index_type) {
+            case VK_INDEX_TYPE_UINT8_EXT:
+                std::memcpy(staging_data, MakeQuadIndices<u8>(quad, first).data(), quad_size);
+                break;
+            case VK_INDEX_TYPE_UINT16:
+                std::memcpy(staging_data, MakeQuadIndices<u16>(quad, first).data(), quad_size);
+                break;
+            case VK_INDEX_TYPE_UINT32:
+                std::memcpy(staging_data, MakeQuadIndices<u32>(quad, first).data(), quad_size);
+                break;
+            default:
+                UNREACHABLE();
+                break;
+            }
+            staging_data += quad_size;
+        }
+    }
+    scheduler.RequestOutsideRenderPassOperationContext();
+    scheduler.Record([src_buffer = staging.buffer, dst_buffer = *quad_array_lut,
+                      size_bytes](vk::CommandBuffer cmdbuf) {
+        const VkBufferCopy copy{
+            .srcOffset = 0,
+            .dstOffset = 0,
+            .size = size_bytes,
        };
        const VkBufferMemoryBarrier write_barrier{
            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-            .dstAccessMask = UPLOAD_ACCESS_BARRIERS,
+            .dstAccessMask = VK_ACCESS_INDEX_READ_BIT,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = handle,
-            .offset = offset,
-            .size = data_size,
+            .buffer = dst_buffer,
+            .offset = 0,
+            .size = size_bytes,
        };
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
-                               0, read_barrier);
-        cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, data_size});
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0,
-                               write_barrier);
+        cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy);
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
+                               0, write_barrier);
    });
 }

-void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) {
-    auto staging = staging_pool.Request(data_size, MemoryUsage::Download);
-    scheduler.RequestOutsideRenderPassOperationContext();
-
-    const VkBuffer handle = Handle();
-    scheduler.Record(
-        [staging = staging.buffer, handle, offset, data_size](vk::CommandBuffer cmdbuf) {
-            const VkBufferMemoryBarrier barrier{
-                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
-                .pNext = nullptr,
-                .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
-                .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
-                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                .buffer = handle,
-                .offset = offset,
-                .size = data_size,
-            };
-
-            cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
-                                       VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
-                                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                                   VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {});
-            cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, data_size});
-        });
-    scheduler.Finish();
-
-    std::memcpy(data, staging.mapped_span.data(), data_size);
-}
-
-void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
-                      std::size_t copy_size) {
-    scheduler.RequestOutsideRenderPassOperationContext();
-
-    const VkBuffer dst_buffer = Handle();
-    scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset,
-                      copy_size](vk::CommandBuffer cmdbuf) {
-        cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, copy_size});
-
-        std::array<VkBufferMemoryBarrier, 2> barriers;
-        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
-        barriers[0].pNext = nullptr;
-        barriers[0].srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
-        barriers[0].dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
-        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barriers[0].buffer = src_buffer;
-        barriers[0].offset = src_offset;
-        barriers[0].size = copy_size;
-        barriers[1].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
-        barriers[1].pNext = nullptr;
-        barriers[1].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
-        barriers[1].dstAccessMask = UPLOAD_ACCESS_BARRIERS;
-        barriers[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barriers[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barriers[1].buffer = dst_buffer;
-        barriers[1].offset = dst_offset;
-        barriers[1].size = copy_size;
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {},
-                               barriers, {});
-    });
-}
-
-VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer_,
-                             Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
-                             const Device& device_, MemoryAllocator& memory_allocator_,
-                             VKScheduler& scheduler_, VKStreamBuffer& stream_buffer_,
-                             StagingBufferPool& staging_pool_)
-    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer_, gpu_memory_,
-                                                                 cpu_memory_, stream_buffer_},
-      device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
-      staging_pool{staging_pool_} {}
-
-VKBufferCache::~VKBufferCache() = default;
-
-std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<Buffer>(device, memory_allocator, scheduler, staging_pool, cpu_addr,
-                                    size);
-}
-
-VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) {
-    size = std::max(size, std::size_t(4));
-    const auto& empty = staging_pool.Request(size, MemoryUsage::DeviceLocal);
-    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([size, buffer = empty.buffer](vk::CommandBuffer cmdbuf) {
-        cmdbuf.FillBuffer(buffer, 0, size, 0);
-    });
-    return {empty.buffer, 0, 0};
-}
-
 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -4,69 +4,112 @@

 #pragma once

-#include <memory>
-
-#include "common/common_types.h"
 #include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_vulkan/vk_compute_pass.h"
 #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
-#include "video_core/renderer_vulkan/vk_stream_buffer.h"
 #include "video_core/vulkan_common/vulkan_memory_allocator.h"
 #include "video_core/vulkan_common/vulkan_wrapper.h"

 namespace Vulkan {

 class Device;
+class VKDescriptorPool;
 class VKScheduler;
+class VKUpdateDescriptorQueue;

-class Buffer final : public VideoCommon::BufferBlock {
+class BufferCacheRuntime;
+
+class Buffer : public VideoCommon::BufferBase<VideoCore::RasterizerInterface> {
 public:
-    explicit Buffer(const Device& device, MemoryAllocator& memory_allocator, VKScheduler& scheduler,
-                    StagingBufferPool& staging_pool, VAddr cpu_addr_, std::size_t size_);
-    ~Buffer();
+    explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params);
+    explicit Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
+                    VAddr cpu_addr_, u64 size_bytes_);

-    void Upload(std::size_t offset, std::size_t data_size, const u8* data);
-
-    void Download(std::size_t offset, std::size_t data_size, u8* data);
-
-    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
-                  std::size_t copy_size);
-
-    VkBuffer Handle() const {
+    [[nodiscard]] VkBuffer Handle() const noexcept {
        return *buffer;
    }

-    u64 Address() const {
-        return 0;
+    operator VkBuffer() const noexcept {
+        return *buffer;
    }

 private:
-    const Device& device;
-    VKScheduler& scheduler;
-    StagingBufferPool& staging_pool;
-
    vk::Buffer buffer;
    MemoryCommit commit;
 };

-class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> {
+class BufferCacheRuntime {
+    friend Buffer;
+
+    using PrimitiveTopology = Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology;
+    using IndexFormat = Tegra::Engines::Maxwell3D::Regs::IndexFormat;
+
 public:
-    explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer,
-                           Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
-                           const Device& device, MemoryAllocator& memory_allocator,
-                           VKScheduler& scheduler, VKStreamBuffer& stream_buffer,
-                           StagingBufferPool& staging_pool);
-    ~VKBufferCache();
+    explicit BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_allocator_,
+                                VKScheduler& scheduler_, StagingBufferPool& staging_pool_,
+                                VKUpdateDescriptorQueue& update_descriptor_queue_,
+                                VKDescriptorPool& descriptor_pool);

-    BufferInfo GetEmptyBuffer(std::size_t size) override;
+    void Finish();

-protected:
-    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
+    [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size);
+
+    [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size);
+
+    void CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, // note: destination first
+                    std::span<const VideoCommon::BufferCopy> copies);
+
+    void BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format, u32 base_vertex,
+                         u32 num_indices, VkBuffer buffer, u32 offset, u32 size);
+
+    void BindQuadArrayIndexBuffer(u32 first, u32 count);
+
+    void BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size, u32 stride);
+
+    void BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size);
+
+    void BindUniformBuffer(VkBuffer buffer, u32 offset, u32 size) {
+        BindBuffer(buffer, offset, size);
+    }
+
+    void BindStorageBuffer(VkBuffer buffer, u32 offset, u32 size,
+                           [[maybe_unused]] bool is_written) {
+        BindBuffer(buffer, offset, size);
+    }

 private:
+    void BindBuffer(VkBuffer buffer, u32 offset, u32 size);
+
+    void ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle);
+
    const Device& device;
    MemoryAllocator& memory_allocator;
    VKScheduler& scheduler;
    StagingBufferPool& staging_pool;
+    VKUpdateDescriptorQueue& update_descriptor_queue;
+
+    vk::Buffer quad_array_lut;
+    MemoryCommit quad_array_lut_commit;
+    VkIndexType quad_array_lut_index_type{};
+    u32 current_num_indices = 0;
+
+    Uint8Pass uint8_pass;
+    QuadIndexedPass quad_index_pass;
 };

+struct BufferCacheParams {
+    using Runtime = Vulkan::BufferCacheRuntime;
+    using Buffer = Vulkan::Buffer;
+
+    static constexpr bool IS_OPENGL = false;
+    static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false;
+    static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = false;
+    static constexpr bool NEEDS_BIND_UNIFORM_INDEX = false;
+    static constexpr bool NEEDS_BIND_STORAGE_INDEX = false;
+    static constexpr bool USE_MEMORY_MAPS = true;
+};
+
+using BufferCache = VideoCommon::BufferCache<BufferCacheParams>;
+
 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -10,7 +10,6 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/common_types.h"
-#include "video_core/host_shaders/vulkan_quad_array_comp_spv.h"
 #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
 #include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
 #include "video_core/renderer_vulkan/vk_compute_pass.h"
@@ -22,19 +21,7 @@
 #include "video_core/vulkan_common/vulkan_wrapper.h"

 namespace Vulkan {
-
 namespace {
-
-VkDescriptorSetLayoutBinding BuildQuadArrayPassDescriptorSetLayoutBinding() {
-    return {
-        .binding = 0,
-        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-        .descriptorCount = 1,
-        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
-        .pImmutableSamplers = nullptr,
-    };
-}
-
 VkDescriptorUpdateTemplateEntryKHR BuildQuadArrayPassDescriptorUpdateTemplateEntry() {
    return {
        .dstBinding = 0,
@@ -162,55 +149,6 @@ VkDescriptorSet VKComputePass::CommitDescriptorSet(
    return set;
 }

-QuadArrayPass::QuadArrayPass(const Device& device_, VKScheduler& scheduler_,
-                             VKDescriptorPool& descriptor_pool_,
-                             StagingBufferPool& staging_buffer_pool_,
-                             VKUpdateDescriptorQueue& update_descriptor_queue_)
-    : VKComputePass(device_, descriptor_pool_, BuildQuadArrayPassDescriptorSetLayoutBinding(),
-                    BuildQuadArrayPassDescriptorUpdateTemplateEntry(),
-                    BuildComputePushConstantRange(sizeof(u32)), VULKAN_QUAD_ARRAY_COMP_SPV),
-      scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_},
-      update_descriptor_queue{update_descriptor_queue_} {}
-
-QuadArrayPass::~QuadArrayPass() = default;
-
-std::pair<VkBuffer, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertices, u32 first) {
-    const u32 num_triangle_vertices = (num_vertices / 4) * 6;
-    const std::size_t staging_size = num_triangle_vertices * sizeof(u32);
-    const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
-
-    update_descriptor_queue.Acquire();
-    update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size);
-    const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
-
-    scheduler.RequestOutsideRenderPassOperationContext();
-
-    ASSERT(num_vertices % 4 == 0);
-    const u32 num_quads = num_vertices / 4;
-    scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer,
-                      num_quads, first, set](vk::CommandBuffer cmdbuf) {
-        constexpr u32 dispatch_size = 1024;
-        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
-        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
-        cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(first), &first);
-        cmdbuf.Dispatch(Common::AlignUp(num_quads, dispatch_size) / dispatch_size, 1, 1);
-
-        VkBufferMemoryBarrier barrier;
-        barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
-        barrier.pNext = nullptr;
-        barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
-        barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
-        barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
-        barrier.offset = 0;
-        barrier.size = static_cast<VkDeviceSize>(num_quads) * 6 * sizeof(u32);
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                               VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, {barrier}, {});
-    });
-    return {staging_ref.buffer, 0};
-}
-
 Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_,
                     VKDescriptorPool& descriptor_pool, StagingBufferPool& staging_buffer_pool_,
                     VKUpdateDescriptorQueue& update_descriptor_queue_)
@@ -221,18 +159,18 @@ Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_,

 Uint8Pass::~Uint8Pass() = default;

-std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer,
-                                             u64 src_offset) {
+std::pair<VkBuffer, u32> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer,
+                                             u32 src_offset) {
    const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16));
-    const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
+    const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);

    update_descriptor_queue.Acquire();
    update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices);
-    update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size);
+    update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size);
    const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);

    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer, set,
+    scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
                      num_vertices](vk::CommandBuffer cmdbuf) {
        constexpr u32 dispatch_size = 1024;
        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
@@ -252,7 +190,7 @@ std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buff
        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                               VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {});
    });
-    return {staging_ref.buffer, 0};
+    return {staging.buffer, 0};
 }

 QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
@@ -267,9 +205,9 @@ QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,

 QuadIndexedPass::~QuadIndexedPass() = default;

-std::pair<VkBuffer, u64> QuadIndexedPass::Assemble(
+std::pair<VkBuffer, u32> QuadIndexedPass::Assemble(
    Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex,
-    VkBuffer src_buffer, u64 src_offset) {
+    VkBuffer src_buffer, u32 src_offset) {
    const u32 index_shift = [index_format] {
        switch (index_format) {
        case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedByte:
@@ -286,15 +224,15 @@ std::pair<VkBuffer, u64> QuadIndexedPass::Assemble(
    const u32 num_tri_vertices = (num_vertices / 4) * 6;

    const std::size_t staging_size = num_tri_vertices * sizeof(u32);
-    const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
+    const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);

    update_descriptor_queue.Acquire();
    update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size);
-    update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size);
+    update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size);
    const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);

    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer, set,
+    scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
                      num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) {
        static constexpr u32 dispatch_size = 1024;
        const std::array push_constants = {base_vertex, index_shift};
@@ -317,7 +255,7 @@ std::pair<VkBuffer, u64> QuadIndexedPass::Assemble(
        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                               VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {});
    });
-    return {staging_ref.buffer, 0};
+    return {staging.buffer, 0};
 }

 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -41,22 +41,6 @@ private:
    vk::ShaderModule module;
 };

-class QuadArrayPass final : public VKComputePass {
-public:
-    explicit QuadArrayPass(const Device& device_, VKScheduler& scheduler_,
-                           VKDescriptorPool& descriptor_pool_,
-                           StagingBufferPool& staging_buffer_pool_,
-                           VKUpdateDescriptorQueue& update_descriptor_queue_);
-    ~QuadArrayPass();
-
-    std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, u32 first);
-
-private:
-    VKScheduler& scheduler;
-    StagingBufferPool& staging_buffer_pool;
-    VKUpdateDescriptorQueue& update_descriptor_queue;
-};
-
 class Uint8Pass final : public VKComputePass {
 public:
    explicit Uint8Pass(const Device& device_, VKScheduler& scheduler_,
@@ -64,7 +48,9 @@ public:
                       VKUpdateDescriptorQueue& update_descriptor_queue_);
    ~Uint8Pass();

-    std::pair<VkBuffer, u64> Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset);
+    /// Assemble uint8 indices into an uint16 index buffer
+    /// Returns a pair with the staging buffer, and the offset where the assembled data is
+    std::pair<VkBuffer, u32> Assemble(u32 num_vertices, VkBuffer src_buffer, u32 src_offset);

 private:
    VKScheduler& scheduler;
@@ -80,9 +66,9 @@ public:
                             VKUpdateDescriptorQueue& update_descriptor_queue_);
    ~QuadIndexedPass();

-    std::pair<VkBuffer, u64> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format,
+    std::pair<VkBuffer, u32> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format,
                                      u32 num_vertices, u32 base_vertex, VkBuffer src_buffer,
-                                      u64 src_offset);
+                                      u32 src_offset);

 private:
    VKScheduler& scheduler;
--- a/src/video_core/renderer_vulkan/vk_fence_manager.cpp
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp
@@ -45,8 +45,8 @@ void InnerFence::Wait() {
 }

 VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
-                               Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_,
-                               VKBufferCache& buffer_cache_, VKQueryCache& query_cache_,
+                               TextureCache& texture_cache_, BufferCache& buffer_cache_,
+                               VKQueryCache& query_cache_, const Device& device_,
                               VKScheduler& scheduler_)
    : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_},
      scheduler{scheduler_} {}
--- a/src/video_core/renderer_vulkan/vk_fence_manager.h
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.h
@@ -22,7 +22,6 @@ class RasterizerInterface;
 namespace Vulkan {

 class Device;
-class VKBufferCache;
 class VKQueryCache;
 class VKScheduler;

@@ -45,14 +44,14 @@ private:
 using Fence = std::shared_ptr<InnerFence>;

 using GenericFenceManager =
-    VideoCommon::FenceManager<Fence, TextureCache, VKBufferCache, VKQueryCache>;
+    VideoCommon::FenceManager<Fence, TextureCache, BufferCache, VKQueryCache>;

 class VKFenceManager final : public GenericFenceManager {
 public:
-    explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
-                            Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_,
-                            VKBufferCache& buffer_cache_, VKQueryCache& query_cache_,
-                            VKScheduler& scheduler_);
+    explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
+                            TextureCache& texture_cache, BufferCache& buffer_cache,
+                            VKQueryCache& query_cache, const Device& device,
+                            VKScheduler& scheduler);

 protected:
    Fence CreateFence(u32 value, bool is_stubbed) override;
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -18,14 +18,12 @@
 #include "video_core/renderer_vulkan/blit_image.h"
 #include "video_core/renderer_vulkan/fixed_pipeline_state.h"
 #include "video_core/renderer_vulkan/vk_buffer_cache.h"
-#include "video_core/renderer_vulkan/vk_compute_pass.h"
 #include "video_core/renderer_vulkan/vk_descriptor_pool.h"
 #include "video_core/renderer_vulkan/vk_fence_manager.h"
 #include "video_core/renderer_vulkan/vk_pipeline_cache.h"
 #include "video_core/renderer_vulkan/vk_query_cache.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
 #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
-#include "video_core/renderer_vulkan/vk_stream_buffer.h"
 #include "video_core/renderer_vulkan/vk_texture_cache.h"
 #include "video_core/renderer_vulkan/vk_update_descriptor.h"
 #include "video_core/shader/async_shaders.h"
@@ -49,7 +47,6 @@ namespace Vulkan {
 struct VKScreenInfo;

 class StateTracker;
-class BufferBindings;

 class RasterizerVulkan final : public VideoCore::RasterizerAccelerated {
 public:
@@ -65,8 +62,11 @@ public:
    void DispatchCompute(GPUVAddr code_addr) override;
    void ResetCounter(VideoCore::QueryType type) override;
    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
+    void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
    void FlushAll() override;
    void FlushRegion(VAddr addr, u64 size) override;
+    void InvalidateExceptTextureCache(VAddr addr, u64 size) override;
+    void InvalidateTextureCache(VAddr addr, u64 size) override;
    bool MustFlushRegion(VAddr addr, u64 size) override;
    void InvalidateRegion(VAddr addr, u64 size) override;
    void OnCPUWrite(VAddr addr, u64 size) override;
@@ -107,24 +107,11 @@ private:

    static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float);

-    struct DrawParameters {
-        void Draw(vk::CommandBuffer cmdbuf) const;
-
-        u32 base_instance = 0;
-        u32 num_instances = 0;
-        u32 base_vertex = 0;
-        u32 num_vertices = 0;
-        bool is_indexed = 0;
-    };
-
    void FlushWork();

-    /// Setups geometry buffers and state.
-    DrawParameters SetupGeometry(FixedPipelineState& fixed_state, BufferBindings& buffer_bindings,
-                                 bool is_indexed, bool is_instanced);
-
    /// Setup descriptors in the graphics pipeline.
-    void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders);
+    void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders,
+                                bool is_indexed);

    void UpdateDynamicStates();

@@ -132,16 +119,6 @@ private:

    void EndTransformFeedback();

-    void SetupVertexArrays(BufferBindings& buffer_bindings);
-
-    void SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed);
-
-    /// Setup constant buffers in the graphics pipeline.
-    void SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage);
-
-    /// Setup global buffers in the graphics pipeline.
-    void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage);
-
    /// Setup uniform texels in the graphics pipeline.
    void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage);

@@ -154,12 +131,6 @@ private:
    /// Setup images in the graphics pipeline.
    void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage);

-    /// Setup constant buffers in the compute pipeline.
-    void SetupComputeConstBuffers(const ShaderEntries& entries);
-
-    /// Setup global buffers in the compute pipeline.
-    void SetupComputeGlobalBuffers(const ShaderEntries& entries);
-
    /// Setup texel buffers in the compute pipeline.
    void SetupComputeUniformTexels(const ShaderEntries& entries);

@@ -172,11 +143,6 @@ private:
    /// Setup images in the compute pipeline.
    void SetupComputeImages(const ShaderEntries& entries);

-    void SetupConstBuffer(const ConstBufferEntry& entry,
-                          const Tegra::Engines::ConstBufferInfo& buffer);
-
-    void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address);
-
    void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
    void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs);
    void UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs);
@@ -193,19 +159,6 @@ private:
    void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs);
    void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs);

-    size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const;
-
-    size_t CalculateComputeStreamBufferSize() const;
-
-    size_t CalculateVertexArraysSize() const;
-
-    size_t CalculateIndexBufferSize() const;
-
-    size_t CalculateConstBufferSize(const ConstBufferEntry& entry,
-                                    const Tegra::Engines::ConstBufferInfo& buffer) const;
-
-    VkBuffer DefaultBuffer();
-
    Tegra::GPU& gpu;
    Tegra::MemoryManager& gpu_memory;
    Tegra::Engines::Maxwell3D& maxwell3d;
@@ -217,24 +170,19 @@ private:
    StateTracker& state_tracker;
    VKScheduler& scheduler;

-    VKStreamBuffer stream_buffer;
    StagingBufferPool staging_pool;
    VKDescriptorPool descriptor_pool;
    VKUpdateDescriptorQueue update_descriptor_queue;
    BlitImageHelper blit_image;
-    QuadArrayPass quad_array_pass;
-    QuadIndexedPass quad_indexed_pass;
-    Uint8Pass uint8_pass;

    TextureCacheRuntime texture_cache_runtime;
    TextureCache texture_cache;
+    BufferCacheRuntime buffer_cache_runtime;
+    BufferCache buffer_cache;
    VKPipelineCache pipeline_cache;
-    VKBufferCache buffer_cache;
    VKQueryCache query_cache;
    VKFenceManager fence_manager;

-    vk::Buffer default_buffer;
-    MemoryCommit default_buffer_commit;
    vk::Event wfi_event;
    VideoCommon::Shader::AsyncShaders async_shaders;

--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -52,18 +52,6 @@ VKScheduler::~VKScheduler() {
    worker_thread.join();
 }

-u64 VKScheduler::CurrentTick() const noexcept {
-    return master_semaphore->CurrentTick();
-}
-
-bool VKScheduler::IsFree(u64 tick) const noexcept {
-    return master_semaphore->IsFree(tick);
-}
-
-void VKScheduler::Wait(u64 tick) {
-    master_semaphore->Wait(tick);
-}
-
 void VKScheduler::Flush(VkSemaphore semaphore) {
    SubmitExecution(semaphore);
    AllocateNewContext();
@@ -269,7 +257,7 @@ void VKScheduler::EndRenderPass() {
        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
                                   VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
                                   VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
-                               VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, 0, nullptr, nullptr,
+                               VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, nullptr, nullptr,
                               vk::Span(barriers.data(), num_images));
    });
    state.renderpass = nullptr;
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -12,6 +12,7 @@
 #include <utility>
 #include "common/common_types.h"
 #include "common/threadsafe_queue.h"
+#include "video_core/renderer_vulkan/vk_master_semaphore.h"
 #include "video_core/vulkan_common/vulkan_wrapper.h"

 namespace Vulkan {
@@ -19,7 +20,6 @@ namespace Vulkan {
 class CommandPool;
 class Device;
 class Framebuffer;
-class MasterSemaphore;
 class StateTracker;
 class VKQueryCache;

@@ -30,15 +30,6 @@ public:
    explicit VKScheduler(const Device& device, StateTracker& state_tracker);
    ~VKScheduler();

-    /// Returns the current command buffer tick.
-    [[nodiscard]] u64 CurrentTick() const noexcept;
-
-    /// Returns true when a tick has been triggered by the GPU.
-    [[nodiscard]] bool IsFree(u64 tick) const noexcept;
-
-    /// Waits for the given tick to trigger on the GPU.
-    void Wait(u64 tick);
-
    /// Sends the current execution context to the GPU.
    void Flush(VkSemaphore semaphore = nullptr);

@@ -80,6 +71,21 @@ public:
        (void)chunk->Record(command);
    }

+    /// Returns the current command buffer tick.
+    [[nodiscard]] u64 CurrentTick() const noexcept {
+        return master_semaphore->CurrentTick();
+    }
+
+    /// Returns true when a tick has been triggered by the GPU.
+    [[nodiscard]] bool IsFree(u64 tick) const noexcept {
+        return master_semaphore->IsFree(tick);
+    }
+
+    /// Waits for the given tick to trigger on the GPU.
+    void Wait(u64 tick) {
+        master_semaphore->Wait(tick);
+    }
+
    /// Returns the master timeline semaphore.
    [[nodiscard]] MasterSemaphore& GetMasterSemaphore() const noexcept {
        return *master_semaphore;
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -3124,6 +3124,9 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
            entries.attributes.insert(GetGenericAttributeLocation(attribute));
        }
    }
+    for (const auto& buffer : entries.const_buffers) {
+        entries.enabled_uniform_buffers |= 1U << buffer.GetIndex();
+    }
    entries.clip_distances = ir.GetClipDistances();
    entries.shader_length = ir.GetLength();
    entries.uses_warps = ir.UsesWarps();
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -39,24 +39,7 @@ private:
    u32 index{};
 };

-class GlobalBufferEntry {
-public:
-    constexpr explicit GlobalBufferEntry(u32 cbuf_index_, u32 cbuf_offset_, bool is_written_)
-        : cbuf_index{cbuf_index_}, cbuf_offset{cbuf_offset_}, is_written{is_written_} {}
-
-    constexpr u32 GetCbufIndex() const {
-        return cbuf_index;
-    }
-
-    constexpr u32 GetCbufOffset() const {
-        return cbuf_offset;
-    }
-
-    constexpr bool IsWritten() const {
-        return is_written;
-    }
-
-private:
+struct GlobalBufferEntry {
    u32 cbuf_index{};
    u32 cbuf_offset{};
    bool is_written{};
@@ -78,6 +61,7 @@ struct ShaderEntries {
    std::set<u32> attributes;
    std::array<bool, Maxwell::NumClipDistances> clip_distances{};
    std::size_t shader_length{};
+    u32 enabled_uniform_buffers{};
    bool uses_warps{};
 };

--- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp
@@ -30,15 +30,18 @@ using Table = Maxwell3D::DirtyState::Table;
 using Flags = Maxwell3D::DirtyState::Flags;

 Flags MakeInvalidationFlags() {
-    static constexpr std::array INVALIDATION_FLAGS{
+    static constexpr int INVALIDATION_FLAGS[]{
        Viewports,         Scissors,  DepthBias,         BlendConstants,    DepthBounds,
        StencilProperties, CullMode,  DepthBoundsEnable, DepthTestEnable,   DepthWriteEnable,
-        DepthCompareOp,    FrontFace, StencilOp,         StencilTestEnable,
+        DepthCompareOp,    FrontFace, StencilOp,         StencilTestEnable, VertexBuffers,
    };
    Flags flags{};
    for (const int flag : INVALIDATION_FLAGS) {
        flags[flag] = true;
    }
+    for (int index = VertexBuffer0; index <= VertexBuffer31; ++index) {
+        flags[index] = true;
+    }
    return flags;
 }

@@ -130,7 +133,7 @@ void SetupDirtyStencilTestEnable(Tables& tables) {
 StateTracker::StateTracker(Tegra::GPU& gpu)
    : flags{gpu.Maxwell3D().dirty.flags}, invalidation_flags{MakeInvalidationFlags()} {
    auto& tables = gpu.Maxwell3D().dirty.tables;
-    SetupDirtyRenderTargets(tables);
+    SetupDirtyFlags(tables);
    SetupDirtyViewports(tables);
    SetupDirtyScissors(tables);
    SetupDirtyDepthBias(tables);
--- a/src/video_core/renderer_vulkan/vk_swapchain.cpp
+++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp
@@ -56,8 +56,11 @@ VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 wi

 } // Anonymous namespace

-VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const Device& device_, VKScheduler& scheduler_)
-    : surface{surface_}, device{device_}, scheduler{scheduler_} {}
+VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const Device& device_, VKScheduler& scheduler_,
+                         u32 width, u32 height, bool srgb)
+    : surface{surface_}, device{device_}, scheduler{scheduler_} {
+    Create(width, height, srgb);
+}

 VKSwapchain::~VKSwapchain() = default;

--- a/src/video_core/renderer_vulkan/vk_swapchain.h
+++ b/src/video_core/renderer_vulkan/vk_swapchain.h
@@ -20,7 +20,8 @@ class VKScheduler;

 class VKSwapchain {
 public:
-    explicit VKSwapchain(VkSurfaceKHR surface, const Device& device, VKScheduler& scheduler);
+    explicit VKSwapchain(VkSurfaceKHR surface, const Device& device, VKScheduler& scheduler,
+                         u32 width, u32 height, bool srgb);
    ~VKSwapchain();

    /// Creates (or recreates) the swapchain with a given size.
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -410,46 +410,47 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
 void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage image,
                       VkImageAspectFlags aspect_mask, bool is_initialized,
                       std::span<const VkBufferImageCopy> copies) {
-    static constexpr VkAccessFlags ACCESS_FLAGS = VK_ACCESS_SHADER_WRITE_BIT |
-                                                  VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
-                                                  VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
+    static constexpr VkAccessFlags WRITE_ACCESS_FLAGS =
+        VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
+        VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
+    static constexpr VkAccessFlags READ_ACCESS_FLAGS = VK_ACCESS_SHADER_READ_BIT |
+                                                       VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
+                                                       VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT;
    const VkImageMemoryBarrier read_barrier{
        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
        .pNext = nullptr,
-        .srcAccessMask = ACCESS_FLAGS,
+        .srcAccessMask = WRITE_ACCESS_FLAGS,
        .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
        .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED,
        .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .image = image,
-        .subresourceRange =
-            {
-                .aspectMask = aspect_mask,
-                .baseMipLevel = 0,
-                .levelCount = VK_REMAINING_MIP_LEVELS,
-                .baseArrayLayer = 0,
-                .layerCount = VK_REMAINING_ARRAY_LAYERS,
-            },
+        .subresourceRange{
+            .aspectMask = aspect_mask,
+            .baseMipLevel = 0,
+            .levelCount = VK_REMAINING_MIP_LEVELS,
+            .baseArrayLayer = 0,
+            .layerCount = VK_REMAINING_ARRAY_LAYERS,
+        },
    };
    const VkImageMemoryBarrier write_barrier{
        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
        .pNext = nullptr,
        .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-        .dstAccessMask = ACCESS_FLAGS,
+        .dstAccessMask = WRITE_ACCESS_FLAGS | READ_ACCESS_FLAGS,
        .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
        .newLayout = VK_IMAGE_LAYOUT_GENERAL,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .image = image,
-        .subresourceRange =
-            {
-                .aspectMask = aspect_mask,
-                .baseMipLevel = 0,
-                .levelCount = VK_REMAINING_MIP_LEVELS,
-                .baseArrayLayer = 0,
-                .layerCount = VK_REMAINING_ARRAY_LAYERS,
-            },
+        .subresourceRange{
+            .aspectMask = aspect_mask,
+            .baseMipLevel = 0,
+            .levelCount = VK_REMAINING_MIP_LEVELS,
+            .baseArrayLayer = 0,
+            .layerCount = VK_REMAINING_ARRAY_LAYERS,
+        },
    };
    cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
                           read_barrier);
@@ -553,20 +554,12 @@ void TextureCacheRuntime::Finish() {
    scheduler.Finish();
 }

-ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) {
-    const auto staging_ref = staging_buffer_pool.Request(size, MemoryUsage::Upload);
-    return {
-        .handle = staging_ref.buffer,
-        .span = staging_ref.mapped_span,
-    };
+StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) {
+    return staging_buffer_pool.Request(size, MemoryUsage::Upload);
 }

-ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) {
-    const auto staging_ref = staging_buffer_pool.Request(size, MemoryUsage::Download);
-    return {
-        .handle = staging_ref.buffer,
-        .span = staging_ref.mapped_span,
-    };
+StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size) {
+    return staging_buffer_pool.Request(size, MemoryUsage::Download);
 }

 void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src,
@@ -738,7 +731,7 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
                .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
                                 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
                                 VK_ACCESS_TRANSFER_WRITE_BIT,
-                .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+                .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
                .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
                .newLayout = VK_IMAGE_LAYOUT_GENERAL,
                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@@ -749,12 +742,9 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
            VkImageMemoryBarrier{
                .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
                .pNext = nullptr,
-                .srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT |
-                                 VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
-                                 VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
-                                 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
+                .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
                                 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
-                                 VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT,
+                                 VK_ACCESS_TRANSFER_WRITE_BIT,
                .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
                .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
                .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
@@ -812,12 +802,12 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
    }
 }

-void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
                         std::span<const BufferImageCopy> copies) {
    // TODO: Move this to another API
    scheduler->RequestOutsideRenderPassOperationContext();
    std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask);
-    const VkBuffer src_buffer = map.handle;
+    const VkBuffer src_buffer = map.buffer;
    const VkImage vk_image = *image;
    const VkImageAspectFlags vk_aspect_mask = aspect_mask;
    const bool is_initialized = std::exchange(initialized, true);
@@ -827,12 +817,12 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
    });
 }

-void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
                         std::span<const VideoCommon::BufferCopy> copies) {
    // TODO: Move this to another API
    scheduler->RequestOutsideRenderPassOperationContext();
    std::vector vk_copies = TransformBufferCopies(copies, buffer_offset);
-    const VkBuffer src_buffer = map.handle;
+    const VkBuffer src_buffer = map.buffer;
    const VkBuffer dst_buffer = *buffer;
    scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) {
        // TODO: Barriers
@@ -840,13 +830,58 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
    });
 }

-void Image::DownloadMemory(const ImageBufferMap& map, size_t buffer_offset,
+void Image::DownloadMemory(const StagingBufferRef& map, size_t buffer_offset,
                           std::span<const BufferImageCopy> copies) {
    std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask);
-    scheduler->Record([buffer = map.handle, image = *image, aspect_mask = aspect_mask,
+    scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask,
                       vk_copies](vk::CommandBuffer cmdbuf) {
-        // TODO: Barriers
-        cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_GENERAL, buffer, vk_copies);
+        const VkImageMemoryBarrier read_barrier{
+            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
+            .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
+            .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .image = image,
+            .subresourceRange{
+                .aspectMask = aspect_mask,
+                .baseMipLevel = 0,
+                .levelCount = VK_REMAINING_MIP_LEVELS,
+                .baseArrayLayer = 0,
+                .layerCount = VK_REMAINING_ARRAY_LAYERS,
+            },
+        };
+        const VkImageMemoryBarrier image_write_barrier{
+            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = 0,
+            .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+            .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+            .newLayout = VK_IMAGE_LAYOUT_GENERAL,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .image = image,
+            .subresourceRange{
+                .aspectMask = aspect_mask,
+                .baseMipLevel = 0,
+                .levelCount = VK_REMAINING_MIP_LEVELS,
+                .baseArrayLayer = 0,
+                .layerCount = VK_REMAINING_ARRAY_LAYERS,
+            },
+        };
+        const VkMemoryBarrier memory_write_barrier{
+            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
+        };
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                               0, read_barrier);
+        cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, vk_copies);
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                               0, memory_write_barrier, nullptr, image_write_barrier);
    });
 }

@@ -1106,7 +1141,7 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
        .pAttachments = attachments.data(),
        .width = key.size.width,
        .height = key.size.height,
-        .layers = static_cast<u32>(num_layers),
+        .layers = static_cast<u32>(std::max(num_layers, 1)),
    });
    if (runtime.device.HasDebuggingToolAttached()) {
        framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str());
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -7,6 +7,7 @@
 #include <compare>
 #include <span>

+#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
 #include "video_core/texture_cache/texture_cache.h"
 #include "video_core/vulkan_common/vulkan_memory_allocator.h"
 #include "video_core/vulkan_common/vulkan_wrapper.h"
@@ -53,19 +54,6 @@ struct hash<Vulkan::RenderPassKey> {

 namespace Vulkan {

-struct ImageBufferMap {
-    [[nodiscard]] VkBuffer Handle() const noexcept {
-        return handle;
-    }
-
-    [[nodiscard]] std::span<u8> Span() const noexcept {
-        return span;
-    }
-
-    VkBuffer handle;
-    std::span<u8> span;
-};
-
 struct TextureCacheRuntime {
    const Device& device;
    VKScheduler& scheduler;
@@ -76,9 +64,9 @@ struct TextureCacheRuntime {

    void Finish();

-    [[nodiscard]] ImageBufferMap MapUploadBuffer(size_t size);
+    [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size);

-    [[nodiscard]] ImageBufferMap MapDownloadBuffer(size_t size);
+    [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size);

    void BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src,
                   const std::array<Offset2D, 2>& dst_region,
@@ -94,7 +82,7 @@ struct TextureCacheRuntime {
        return false;
    }

-    void AccelerateImageUpload(Image&, const ImageBufferMap&, size_t,
+    void AccelerateImageUpload(Image&, const StagingBufferRef&, size_t,
                               std::span<const VideoCommon::SwizzleParameters>) {
        UNREACHABLE();
    }
@@ -112,13 +100,13 @@ public:
    explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
                   VAddr cpu_addr);

-    void UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+    void UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
                      std::span<const VideoCommon::BufferImageCopy> copies);

-    void UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+    void UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
                      std::span<const VideoCommon::BufferCopy> copies);

-    void DownloadMemory(const ImageBufferMap& map, size_t buffer_offset,
+    void DownloadMemory(const StagingBufferRef& map, size_t buffer_offset,
                        std::span<const VideoCommon::BufferImageCopy> copies);

    [[nodiscard]] VkImage Handle() const noexcept {
--- a/src/video_core/shader/async_shaders.h
+++ b/src/video_core/shader/async_shaders.h
@@ -9,16 +9,7 @@
 #include <shared_mutex>
 #include <thread>

-// This header includes both Vulkan and OpenGL headers, this has to be fixed
-// Unfortunately, including OpenGL will include Windows.h that defines macros that can cause issues.
-// Forcefully include glad early and undefine macros
 #include <glad/glad.h>
-#ifdef CreateEvent
-#undef CreateEvent
-#endif
-#ifdef CreateSemaphore
-#undef CreateSemaphore
-#endif

 #include "common/common_types.h"
 #include "video_core/renderer_opengl/gl_device.h"
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -76,6 +76,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
            case SystemVariable::InvocationId:
                return Operation(OperationCode::InvocationId);
            case SystemVariable::Ydirection:
+                uses_y_negate = true;
                return Operation(OperationCode::YNegate);
            case SystemVariable::InvocationInfo:
                LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete");
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -139,6 +139,10 @@ public:
        return uses_legacy_varyings;
    }

+    bool UsesYNegate() const {
+        return uses_y_negate;
+    }
+
    bool UsesWarps() const {
        return uses_warps;
    }
@@ -465,6 +469,7 @@ private:
    bool uses_instance_id{};
    bool uses_vertex_id{};
    bool uses_legacy_varyings{};
+    bool uses_y_negate{};
    bool uses_warps{};
    bool uses_indexed_samplers{};

--- a/Show More
+++ b/Show More