early-access version 3290

2023-01-06 04:23:54 +01:00
parent 69c608cf26
commit f001eddbf3
155 changed files with 2064 additions and 998 deletions
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -85,6 +85,7 @@ add_library(video_core STATIC
    gpu.h
    gpu_thread.cpp
    gpu_thread.h
+    invalidation_accumulator.h
    memory_manager.cpp
    memory_manager.h
    precompiled_headers.h
@@ -99,6 +100,8 @@ add_library(video_core STATIC
    renderer_null/null_rasterizer.h
    renderer_null/renderer_null.cpp
    renderer_null/renderer_null.h
+    renderer_opengl/blit_image.cpp
+    renderer_opengl/blit_image.h
    renderer_opengl/gl_buffer_cache.cpp
    renderer_opengl/gl_buffer_cache.h
    renderer_opengl/gl_compute_pipeline.cpp
@@ -190,6 +193,8 @@ add_library(video_core STATIC
    renderer_vulkan/vk_texture_cache.cpp
    renderer_vulkan/vk_texture_cache.h
    renderer_vulkan/vk_texture_cache_base.cpp
+    renderer_vulkan/vk_turbo_mode.cpp
+    renderer_vulkan/vk_turbo_mode.h
    renderer_vulkan/vk_update_descriptor.cpp
    renderer_vulkan/vk_update_descriptor.h
    shader_cache.cpp
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -430,7 +430,7 @@ private:
        if (query_begin >= SizeBytes() || size < 0) {
            return;
        }
-        u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>();
        u64* const state_words = Array<type>();
        const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
        u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
@@ -483,7 +483,7 @@ private:
                NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
            }
            // Exclude CPU modified pages when visiting GPU pages
-            const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
+            const u64 word = current_word;
            u64 page = page_begin;
            page_begin = 0;

@@ -531,7 +531,7 @@ private:
    [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
        static_assert(type != Type::Untracked);

-        const u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
        const u64* const state_words = Array<type>();
        const u64 num_query_words = size / BYTES_PER_WORD + 1;
        const u64 word_begin = offset / BYTES_PER_WORD;
@@ -539,8 +539,7 @@ private:
        const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
        u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
        for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
-            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
-            const u64 word = state_words[word_index] & ~off_word;
+            const u64 word = state_words[word_index];
            if (word == 0) {
                continue;
            }
@@ -564,7 +563,7 @@ private:
    [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
        static_assert(type != Type::Untracked);

-        const u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
        const u64* const state_words = Array<type>();
        const u64 num_query_words = size / BYTES_PER_WORD + 1;
        const u64 word_begin = offset / BYTES_PER_WORD;
@@ -574,8 +573,7 @@ private:
        u64 begin = std::numeric_limits<u64>::max();
        u64 end = 0;
        for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
-            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
-            const u64 word = state_words[word_index] & ~off_word;
+            const u64 word = state_words[word_index];
            if (word == 0) {
                continue;
            }
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -1938,14 +1938,21 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s
                                                                      bool is_written) const {
    const GPUVAddr gpu_addr = gpu_memory->Read<u64>(ssbo_addr);
    const u32 size = gpu_memory->Read<u32>(ssbo_addr + 8);
-    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+    const u32 alignment = runtime.GetStorageBufferAlignment();
+
+    const GPUVAddr aligned_gpu_addr = Common::AlignDown(gpu_addr, alignment);
+    const u32 aligned_size =
+        Common::AlignUp(static_cast<u32>(gpu_addr - aligned_gpu_addr) + size, alignment);
+
+    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(aligned_gpu_addr);
    if (!cpu_addr || size == 0) {
        return NULL_BINDING;
    }
-    const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE);
+
+    const VAddr cpu_end = Common::AlignUp(*cpu_addr + aligned_size, Core::Memory::YUZU_PAGESIZE);
    const Binding binding{
        .cpu_addr = *cpu_addr,
-        .size = is_written ? size : static_cast<u32>(cpu_end - *cpu_addr),
+        .size = is_written ? aligned_size : static_cast<u32>(cpu_end - *cpu_addr),
        .buffer_id = BufferId{},
    };
    return binding;
--- a/src/video_core/engines/draw_manager.cpp
+++ b/src/video_core/engines/draw_manager.cpp
@@ -51,6 +51,10 @@ void DrawManager::ProcessMethodCall(u32 method, u32 argument) {
        LOG_WARNING(HW_GPU, "(STUBBED) called");
        break;
    }
+    case MAXWELL3D_REG_INDEX(draw_texture.src_y0): {
+        DrawTexture();
+        break;
+    }
    default:
        break;
    }
@@ -179,6 +183,33 @@ void DrawManager::DrawIndexSmall(u32 argument) {
    ProcessDraw(true, 1);
 }

+void DrawManager::DrawTexture() { // Builds draw_texture_state from regs, then draws.
+    const auto& regs{maxwell3d->regs}; // dst_* and src_x0/src_y0 are register values / 4096.
+    draw_texture_state.dst_x0 = static_cast<float>(regs.draw_texture.dst_x0) / 4096.f;
+    draw_texture_state.dst_y0 = static_cast<float>(regs.draw_texture.dst_y0) / 4096.f;
+    const auto dst_width = static_cast<float>(regs.draw_texture.dst_width) / 4096.f;
+    const auto dst_height = static_cast<float>(regs.draw_texture.dst_height) / 4096.f;
+    const bool lower_left{regs.window_origin.mode !=
+                          Maxwell3D::Regs::WindowOrigin::Mode::UpperLeft};
+    if (lower_left) { // Window origin is not UpperLeft: move dst_y0 back by the height.
+        draw_texture_state.dst_y0 -= dst_height;
+    }
+    draw_texture_state.dst_x1 = draw_texture_state.dst_x0 + dst_width;
+    draw_texture_state.dst_y1 = draw_texture_state.dst_y0 + dst_height;
+    draw_texture_state.src_x0 = static_cast<float>(regs.draw_texture.src_x0) / 4096.f;
+    draw_texture_state.src_y0 = static_cast<float>(regs.draw_texture.src_y0) / 4096.f;
+    draw_texture_state.src_x1 = // src_x0 + (dx_du / 4294967295) * dst_width
+        (static_cast<float>(regs.draw_texture.dx_du) / 4294967295.f) * dst_width +
+        draw_texture_state.src_x0;
+    draw_texture_state.src_y1 = // src_y0 + (dy_dv / 4294967295) * dst_height
+        (static_cast<float>(regs.draw_texture.dy_dv) / 4294967295.f) * dst_height +
+        draw_texture_state.src_y0;
+    draw_texture_state.src_sampler = regs.draw_texture.src_sampler;
+    draw_texture_state.src_texture = regs.draw_texture.src_texture;
+    // DrawTexture() takes no arguments; the state is exposed via GetDrawTextureState().
+    maxwell3d->rasterizer->DrawTexture();
+}
+
 void DrawManager::UpdateTopology() {
    const auto& regs{maxwell3d->regs};
    switch (regs.primitive_topology_control) {
--- a/src/video_core/engines/draw_manager.h
+++ b/src/video_core/engines/draw_manager.h
@@ -32,6 +32,19 @@ public:
        std::vector<u8> inline_index_draw_indexes;
    };

+    struct DrawTextureState { // Float parameters filled in by DrawManager::DrawTexture().
+        f32 dst_x0; ///< regs.draw_texture.dst_x0 / 4096
+        f32 dst_y0; ///< regs.draw_texture.dst_y0 / 4096, minus height unless UpperLeft
+        f32 dst_x1; ///< dst_x0 + (regs.draw_texture.dst_width / 4096)
+        f32 dst_y1; ///< dst_y0 + (regs.draw_texture.dst_height / 4096)
+        f32 src_x0; ///< regs.draw_texture.src_x0 / 4096
+        f32 src_y0; ///< regs.draw_texture.src_y0 / 4096
+        f32 src_x1; ///< src_x0 + (dx_du / 4294967295) * destination width
+        f32 src_y1; ///< src_y0 + (dy_dv / 4294967295) * destination height
+        u32 src_sampler; ///< Copied unchanged from regs.draw_texture.src_sampler
+        u32 src_texture; ///< Copied unchanged from regs.draw_texture.src_texture
+    };
+
    struct IndirectParams {
        bool is_indexed;
        bool include_count;
@@ -64,6 +77,10 @@ public:
        return draw_state;
    }

+    const DrawTextureState& GetDrawTextureState() const { // Read-only access, no copy.
+        return draw_texture_state; // Member written by DrawTexture().
+    }
+
    IndirectParams& GetIndirectParams() {
        return indirect_state;
    }
@@ -81,6 +98,8 @@ private:

    void DrawIndexSmall(u32 argument);

+    void DrawTexture();
+
    void UpdateTopology();

    void ProcessDraw(bool draw_indexed, u32 instance_count);
@@ -89,6 +108,7 @@ private:

    Maxwell3D* maxwell3d{};
    State draw_state{};
+    DrawTextureState draw_texture_state{};
    IndirectParams indirect_state{};
 };
 } // namespace Tegra::Engines
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -76,7 +76,7 @@ void State::ProcessData(std::span<const u8> read_buffer) {
                                       regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
                                       x_elements, regs.line_count, regs.dest.BlockHeight(),
                                       regs.dest.BlockDepth(), regs.line_length_in);
-        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+        memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size);
    }
 }

--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -6,6 +6,7 @@
 #include "common/microprofile.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/sw_blitter/blitter.h"
+#include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/surface.h"
 #include "video_core/textures/decoders.h"
@@ -20,8 +21,8 @@ namespace Tegra::Engines {

 using namespace Texture;

-Fermi2D::Fermi2D(MemoryManager& memory_manager_) {
-    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager_);
+Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager_} {
+    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager);
    // Nvidia's OpenGL driver seems to assume these values
    regs.src.depth = 1;
    regs.dst.depth = 1;
@@ -104,6 +105,7 @@ void Fermi2D::Blit() {
        config.src_x0 = 0;
    }

+    memory_manager.FlushCaching();
    if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
        sw_blitter->Blit(src, regs.dst, config);
    }
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -305,6 +305,7 @@ public:
 private:
    VideoCore::RasterizerInterface* rasterizer = nullptr;
    std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter;
+    MemoryManager& memory_manager;

    /// Performs the copy from the source surface to the destination surface as configured in the
    /// registers.
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -149,6 +149,7 @@ bool Maxwell3D::IsMethodExecutable(u32 method) {
    case MAXWELL3D_REG_INDEX(inline_index_4x8.index0):
    case MAXWELL3D_REG_INDEX(vertex_array_instance_first):
    case MAXWELL3D_REG_INDEX(vertex_array_instance_subsequent):
+    case MAXWELL3D_REG_INDEX(draw_texture.src_y0):
    case MAXWELL3D_REG_INDEX(wait_for_idle):
    case MAXWELL3D_REG_INDEX(shadow_ram_control):
    case MAXWELL3D_REG_INDEX(load_mme.instruction_ptr):
@@ -485,11 +486,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
 }

 void Maxwell3D::ProcessQueryGet() {
-    // TODO(Subv): Support the other query units.
-    if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) {
-        LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented");
-    }
-
    switch (regs.report_semaphore.query.operation) {
    case Regs::ReportSemaphore::Operation::Release:
        if (regs.report_semaphore.query.short_query != 0) {
@@ -649,7 +645,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) {

    const GPUVAddr address{buffer_address + regs.const_buffer.offset};
    const size_t copy_size = amount * sizeof(u32);
-    memory_manager.WriteBlock(address, start_base, copy_size);
+    memory_manager.WriteBlockCached(address, start_base, copy_size);

    // Increment the current buffer position.
    regs.const_buffer.offset += static_cast<u32>(copy_size);
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1599,6 +1599,20 @@ public:
        };
        static_assert(sizeof(TIRModulationCoeff) == 0x4);

+        struct DrawTexture { // Raw register block consumed by DrawManager::DrawTexture().
+            s32 dst_x0; ///< Divided by 4096 before use
+            s32 dst_y0; ///< Divided by 4096 before use
+            s32 dst_width; ///< Divided by 4096 before use
+            s32 dst_height; ///< Divided by 4096 before use
+            s64 dx_du; ///< Divided by 4294967295, then scaled by the width
+            s64 dy_dv; ///< Divided by 4294967295, then scaled by the height
+            u32 src_sampler; ///< Passed through unchanged
+            u32 src_texture; ///< Passed through unchanged
+            s32 src_x0; ///< Divided by 4096 before use
+            s32 src_y0; ///< Divided by 4096; a method call here runs DrawTexture()
+        };
+        static_assert(sizeof(DrawTexture) == 0x30); // Same size as the padding it replaces.
+
        struct ReduceColorThreshold {
            union {
                BitField<0, 8, u32> all_hit_once;
@@ -2751,7 +2765,7 @@ public:
                u32 reserved_sw_method2;                                               ///< 0x102C
                std::array<TIRModulationCoeff, 5> tir_modulation_coeff;                ///< 0x1030
                std::array<u32, 15> spare_nop;                                         ///< 0x1044
-                INSERT_PADDING_BYTES_NOINIT(0x30);
+                DrawTexture draw_texture;                                              ///< 0x1080
                std::array<u32, 7> reserved_sw_method3_to_7;                           ///< 0x10B0
                ReduceColorThreshold reduce_color_thresholds_unorm8;                   ///< 0x10CC
                std::array<u32, 4> reserved_sw_method10_to_13;                         ///< 0x10D0
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -69,7 +69,7 @@ void MaxwellDMA::Launch() {
    if (launch.multi_line_enable) {
        const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
        const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
-
+        memory_manager.FlushCaching();
        if (!is_src_pitch && !is_dst_pitch) {
            // If both the source and the destination are in block layout, assert.
            CopyBlockLinearToBlockLinear();
@@ -104,6 +104,7 @@ void MaxwellDMA::Launch() {
                                            reinterpret_cast<u8*>(tmp_buffer.data()),
                                            regs.line_length_in * sizeof(u32));
        } else {
+            memory_manager.FlushCaching();
            const auto convert_linear_2_blocklinear_addr = [](u64 address) {
                return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) |
                       ((address & 0x180) >> 1) | ((address & 0x20) << 3);
@@ -121,8 +122,8 @@ void MaxwellDMA::Launch() {
                    memory_manager.ReadBlockUnsafe(
                        convert_linear_2_blocklinear_addr(regs.offset_in + offset),
                        tmp_buffer.data(), tmp_buffer.size());
-                    memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(),
-                                              tmp_buffer.size());
+                    memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(),
+                                                    tmp_buffer.size());
                }
            } else if (is_src_pitch && !is_dst_pitch) {
                UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0);
@@ -132,7 +133,7 @@ void MaxwellDMA::Launch() {
                for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
                    memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(),
                                                   tmp_buffer.size());
-                    memory_manager.WriteBlock(
+                    memory_manager.WriteBlockCached(
                        convert_linear_2_blocklinear_addr(regs.offset_out + offset),
                        tmp_buffer.data(), tmp_buffer.size());
                }
@@ -141,8 +142,8 @@ void MaxwellDMA::Launch() {
                    std::vector<u8> tmp_buffer(regs.line_length_in);
                    memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),
                                                   regs.line_length_in);
-                    memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(),
-                                              regs.line_length_in);
+                    memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(),
+                                                    regs.line_length_in);
                }
            }
        }
@@ -204,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
                     src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                     regs.pitch_out);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::CopyPitchToBlockLinear() {
@@ -256,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
                   dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                   regs.pitch_in);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::FastCopyBlockLinearToPitch() {
@@ -287,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
                     regs.src_params.block_size.height, regs.src_params.block_size.depth,
                     regs.pitch_out);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::CopyBlockLinearToBlockLinear() {
@@ -347,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
                   dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
                   dst.block_size.height, dst.block_size.depth, pitch);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::ReleaseSemaphore() {
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -11,6 +11,7 @@ set(GLSL_INCLUDES

 set(SHADER_FILES
    astc_decoder.comp
+    blit_color_float.frag
    block_linear_unswizzle_2d.comp
    block_linear_unswizzle_3d.comp
    convert_abgr8_to_d24s8.frag
@@ -36,7 +37,6 @@ set(SHADER_FILES
    smaa_blending_weight_calculation.frag
    smaa_neighborhood_blending.vert
    smaa_neighborhood_blending.frag
-    vulkan_blit_color_float.frag
    vulkan_blit_depth_stencil.frag
    vulkan_fidelityfx_fsr_easu_fp16.comp
    vulkan_fidelityfx_fsr_easu_fp32.comp
@@ -47,6 +47,7 @@ set(SHADER_FILES
    vulkan_present_scaleforce_fp16.frag
    vulkan_present_scaleforce_fp32.frag
    vulkan_quad_indexed.comp
+    vulkan_turbo_mode.comp
    vulkan_uint8.comp
 )

--- a/src/video_core/host_shaders/full_screen_triangle.vert
+++ b/src/video_core/host_shaders/full_screen_triangle.vert
@@ -4,13 +4,20 @@
 #version 450

 #ifdef VULKAN
+#define VERTEX_ID gl_VertexIndex
 #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
 #define END_PUSH_CONSTANTS };
 #define UNIFORM(n)
+#define FLIPY 1
 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+#define VERTEX_ID gl_VertexID
 #define BEGIN_PUSH_CONSTANTS
 #define END_PUSH_CONSTANTS
+#define FLIPY -1
 #define UNIFORM(n) layout (location = n) uniform
+out gl_PerVertex {
+    vec4 gl_Position;
+};
 #endif

 BEGIN_PUSH_CONSTANTS
@@ -21,8 +28,8 @@ END_PUSH_CONSTANTS
 layout(location = 0) out vec2 texcoord;

 void main() {
-    float x = float((gl_VertexIndex & 1) << 2);
-    float y = float((gl_VertexIndex & 2) << 1);
-    gl_Position = vec4(x - 1.0, y - 1.0, 0.0, 1.0);
+    float x = float((VERTEX_ID & 1) << 2);
+    float y = float((VERTEX_ID & 2) << 1);
+    gl_Position = vec4(x - 1.0, FLIPY * (y - 1.0), 0.0, 1.0);
    texcoord = fma(vec2(x, y) / 2.0, tex_scale, tex_offset);
-}
+}
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -50,38 +50,6 @@ protected:
    Maxwell3D& maxwell3d;
 };

-class HLE_DrawArrays final : public HLEMacroImpl {
-public:
-    explicit HLE_DrawArrays(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        maxwell3d.RefreshParameters();
-
-        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
-        maxwell3d.draw_manager->DrawArray(topology, parameters[1], parameters[2],
-                                          maxwell3d.regs.global_base_instance_index, 1);
-    }
-};
-
-class HLE_DrawIndexed final : public HLEMacroImpl {
-public:
-    explicit HLE_DrawIndexed(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
-
-    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        maxwell3d.RefreshParameters();
-        maxwell3d.regs.index_buffer.start_addr_high = parameters[1];
-        maxwell3d.regs.index_buffer.start_addr_low = parameters[2];
-        maxwell3d.regs.index_buffer.format =
-            static_cast<Engines::Maxwell3D::Regs::IndexFormat>(parameters[3]);
-        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-
-        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
-        maxwell3d.draw_manager->DrawIndex(topology, 0, parameters[4],
-                                          maxwell3d.regs.global_base_vertex_index,
-                                          maxwell3d.regs.global_base_instance_index, 1);
-    }
-};
-
 /*
 * @note: these macros have two versions, a normal and extended version, with the extended version
 * also assigning the base vertex/instance.
@@ -497,11 +465,6 @@ public:
 } // Anonymous namespace

 HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {
-    builders.emplace(0xDD6A7FA92A7D2674ULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_DrawArrays>(maxwell3d__);
-                         }));
    builders.emplace(0x0D61FC9FAAC9FCADULL,
                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
@@ -512,11 +475,6 @@ HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {
                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
                             return std::make_unique<HLE_DrawArraysIndirect<true>>(maxwell3d__);
                         }));
-    builders.emplace(0x2DB33AADB741839CULL,
-                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
-                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
-                             return std::make_unique<HLE_DrawIndexed>(maxwell3d__);
-                         }));
    builders.emplace(0x771BB18C62444DA0ULL,
                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -6,11 +6,13 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/settings.h"
 #include "core/core.h"
 #include "core/device_memory.h"
 #include "core/hle/kernel/k_page_table.h"
 #include "core/hle/kernel/k_process.h"
 #include "core/memory.h"
+#include "video_core/invalidation_accumulator.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
@@ -26,7 +28,8 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
      entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38,
                                           page_bits != big_page_bits ? page_bits : 0},
      kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add(
-                                      1, std::memory_order_acq_rel)} {
+                                      1, std::memory_order_acq_rel)},
+      accumulator{std::make_unique<VideoCommon::InvalidationAccumulator>()} {
    address_space_size = 1ULL << address_space_bits;
    page_size = 1ULL << page_bits;
    page_mask = page_size - 1ULL;
@@ -43,6 +46,11 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
    big_page_table_cpu.resize(big_page_table_size);
    big_page_continous.resize(big_page_table_size / continous_bits, 0);
    entries.resize(page_table_size / 32, 0);
+    if (!Settings::IsGPULevelExtreme() && Settings::IsFastmemEnabled()) {
+        fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer();
+    } else {
+        fastmem_arena = nullptr;
+    }
 }

 MemoryManager::~MemoryManager() = default;
@@ -185,15 +193,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
    if (size == 0) {
        return;
    }
-    const auto submapped_ranges = GetSubmappedRange(gpu_addr, size);
+    GetSubmappedRangeImpl<false>(gpu_addr, size, page_stash);

-    for (const auto& [map_addr, map_size] : submapped_ranges) {
-        // Flush and invalidate through the GPU interface, to be asynchronous if possible.
-        const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map_addr);
-        ASSERT(cpu_addr);
-
-        rasterizer->UnmapMemory(*cpu_addr, map_size);
+    for (const auto& [map_addr, map_size] : page_stash) {
+        rasterizer->UnmapMemory(map_addr, map_size);
    }
+    page_stash.clear();

    BigPageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
    PageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
@@ -355,7 +360,7 @@ inline void MemoryManager::MemoryOperation(GPUVAddr gpu_src_addr, std::size_t si
    }
 }

-template <bool is_safe>
+template <bool is_safe, bool use_fastmem>
 void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
                                  [[maybe_unused]] VideoCommon::CacheType which) const {
    auto set_to_zero = [&]([[maybe_unused]] std::size_t page_index,
@@ -369,8 +374,12 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
        if constexpr (is_safe) {
            rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
        }
-        u8* physical = memory.GetPointer(cpu_addr_base);
-        std::memcpy(dest_buffer, physical, copy_amount);
+        if constexpr (use_fastmem) {
+            std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
+        } else {
+            u8* physical = memory.GetPointer(cpu_addr_base);
+            std::memcpy(dest_buffer, physical, copy_amount);
+        }
        dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
    };
    auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
@@ -379,11 +388,15 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
        if constexpr (is_safe) {
            rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
        }
-        if (!IsBigPageContinous(page_index)) [[unlikely]] {
-            memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount);
+        if constexpr (use_fastmem) {
+            std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
        } else {
-            u8* physical = memory.GetPointer(cpu_addr_base);
-            std::memcpy(dest_buffer, physical, copy_amount);
+            if (!IsBigPageContinous(page_index)) [[unlikely]] {
+                memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount);
+            } else {
+                u8* physical = memory.GetPointer(cpu_addr_base);
+                std::memcpy(dest_buffer, physical, copy_amount);
+            }
        }
        dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
    };
@@ -397,12 +410,20 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:

 void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
                              VideoCommon::CacheType which) const {
-    ReadBlockImpl<true>(gpu_src_addr, dest_buffer, size, which);
+    if (fastmem_arena) [[likely]] {
+        ReadBlockImpl<true, true>(gpu_src_addr, dest_buffer, size, which);
+        return;
+    }
+    ReadBlockImpl<true, false>(gpu_src_addr, dest_buffer, size, which);
 }

 void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
                                    const std::size_t size) const {
-    ReadBlockImpl<false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
+    if (fastmem_arena) [[likely]] {
+        ReadBlockImpl<false, true>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
+        return;
+    }
+    ReadBlockImpl<false, false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
 }

 template <bool is_safe>
@@ -454,6 +475,12 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buf
    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
 }

+void MemoryManager::WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer,
+                                     std::size_t size) { // Write now, invalidate later.
+    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
+    accumulator->Add(gpu_dest_addr, size); // Range is handed over in FlushCaching().
+}
+
 void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size,
                                VideoCommon::CacheType which) const {
    auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
@@ -663,7 +690,17 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons
 std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
    GPUVAddr gpu_addr, std::size_t size) const {
    std::vector<std::pair<GPUVAddr, std::size_t>> result{};
-    std::optional<std::pair<GPUVAddr, std::size_t>> last_segment{};
+    GetSubmappedRangeImpl<true>(gpu_addr, size, result);
+    return result;
+}
+
+template <bool is_gpu_address>
+void MemoryManager::GetSubmappedRangeImpl(
+    GPUVAddr gpu_addr, std::size_t size,
+    std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+        result) const {
+    std::optional<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>
+        last_segment{};
    std::optional<VAddr> old_page_addr{};
    const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index,
                                                [[maybe_unused]] std::size_t offset,
@@ -685,8 +722,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
        }
        old_page_addr = {cpu_addr_base + copy_amount};
        if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
        } else {
            last_segment->second += copy_amount;
        }
@@ -703,8 +744,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
        }
        old_page_addr = {cpu_addr_base + copy_amount};
        if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
        } else {
            last_segment->second += copy_amount;
        }
@@ -715,7 +760,18 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
    };
    MemoryOperation<true>(gpu_addr, size, extend_size_big, split, do_short_pages);
    split(0, 0, 0);
-    return result;
+}
+
+void MemoryManager::FlushCaching() { // Forwards all accumulated ranges to the rasterizer.
+    if (!accumulator->AnyAccumulated()) { // Nothing pending: skip the rasterizer call.
+        return;
+    }
+    accumulator->Callback([this](GPUVAddr addr, size_t size) { // presumably once per range - confirm
+        GetSubmappedRangeImpl<false>(addr, size, page_stash); // <false>: appends CPU (VAddr) ranges.
+    });
+    rasterizer->InnerInvalidation(page_stash);
+    page_stash.clear(); // page_stash is a member; empty it so the next flush starts clean.
+    accumulator->Clear();
 }

 } // namespace Tegra
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -19,6 +19,10 @@ namespace VideoCore {
 class RasterizerInterface;
 }

+namespace VideoCommon {
+class InvalidationAccumulator;
+}
+
 namespace Core {
 class DeviceMemory;
 namespace Memory {
@@ -80,6 +84,7 @@ public:
     */
    void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
    void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);

    /**
     * Checks if a gpu region can be simply read with a pointer.
@@ -129,12 +134,14 @@ public:
    size_t GetMemoryLayoutSize(GPUVAddr gpu_addr,
                               size_t max_size = std::numeric_limits<size_t>::max()) const;

+    void FlushCaching();
+
 private:
    template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
    inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
                                FuncReserved&& func_reserved, FuncUnmapped&& func_unmapped) const;

-    template <bool is_safe>
+    template <bool is_safe, bool use_fastmem>
    void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
                       VideoCommon::CacheType which) const;

@@ -154,6 +161,12 @@ private:
    inline bool IsBigPageContinous(size_t big_page_index) const;
    inline void SetBigPageContinous(size_t big_page_index, bool value);

+    template <bool is_gpu_address>
+    void GetSubmappedRangeImpl(
+        GPUVAddr gpu_addr, std::size_t size,
+        std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+            result) const;
+
    Core::System& system;
    Core::Memory::Memory& memory;
    Core::DeviceMemory& device_memory;
@@ -201,10 +214,13 @@ private:
    Common::VirtualBuffer<u32> big_page_table_cpu;

    std::vector<u64> big_page_continous;
+    std::vector<std::pair<VAddr, std::size_t>> page_stash{};
+    u8* fastmem_arena{};

    constexpr static size_t continous_bits = 64;

    const size_t unique_identifier;
+    std::unique_ptr<VideoCommon::InvalidationAccumulator> accumulator;

    static std::atomic<size_t> unique_identifier_generator;
 };
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -6,6 +6,7 @@
 #include <functional>
 #include <optional>
 #include <span>
+#include <utility>
 #include "common/common_types.h"
 #include "common/polyfill_thread.h"
 #include "video_core/cache_types.h"
@@ -46,6 +47,9 @@ public:
    /// Dispatches an indirect draw invocation
    virtual void DrawIndirect() {}

+    /// Dispatches a draw texture invocation
+    virtual void DrawTexture() = 0;
+
    /// Clear the current framebuffer
    virtual void Clear(u32 layer_count) = 0;

@@ -95,6 +99,12 @@ public:
    virtual void InvalidateRegion(VAddr addr, u64 size,
                                  VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;

+    virtual void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
+        for (const auto& [cpu_addr, size] : sequences) { // Default: handle each (address, size) pair.
+            InvalidateRegion(cpu_addr, size); // `which` keeps its default, CacheType::All.
+        }
+    }
+
    /// Notify rasterizer that any caches of the specified region are desync with guest
    virtual void OnCPUWrite(VAddr addr, u64 size) = 0;

--- a/src/video_core/renderer_null/null_rasterizer.cpp
+++ b/src/video_core/renderer_null/null_rasterizer.cpp
@@ -21,6 +21,7 @@ RasterizerNull::RasterizerNull(Core::Memory::Memory& cpu_memory_, Tegra::GPU& gp
 RasterizerNull::~RasterizerNull() = default;

 void RasterizerNull::Draw(bool is_indexed, u32 instance_count) {}
+void RasterizerNull::DrawTexture() {} // No-op, like the neighbouring Draw/Clear overrides.
 void RasterizerNull::Clear(u32 layer_count) {}
 void RasterizerNull::DispatchCompute() {}
 void RasterizerNull::ResetCounter(VideoCore::QueryType type) {}
--- a/src/video_core/renderer_null/null_rasterizer.h
+++ b/src/video_core/renderer_null/null_rasterizer.h
@@ -31,6 +31,7 @@ public:
    ~RasterizerNull() override;

    void Draw(bool is_indexed, u32 instance_count) override;
+    void DrawTexture() override;
    void Clear(u32 layer_count) override;
    void DispatchCompute() override;
    void ResetCounter(VideoCore::QueryType type) override;
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -160,6 +160,10 @@ public:
        return device.CanReportMemoryUsage();
    }

+    u32 GetStorageBufferAlignment() const { // Device SSBO alignment, narrowed to u32.
+        return static_cast<u32>(device.GetShaderStorageBufferAlignment());
+    }
+
 private:
    static constexpr std::array PABO_LUT{
        GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV,          GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -166,6 +166,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) {
    has_shader_int64 = HasExtension(extensions, "GL_ARB_gpu_shader_int64");
    has_amd_shader_half_float = GLAD_GL_AMD_gpu_shader_half_float;
    has_sparse_texture_2 = GLAD_GL_ARB_sparse_texture2;
+    has_draw_texture = GLAD_GL_NV_draw_texture;
    warp_size_potentially_larger_than_guest = !is_nvidia && !is_intel;
    need_fastmath_off = is_nvidia;
    can_report_memory = GLAD_GL_NVX_gpu_memory_info;
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -4,6 +4,8 @@
 #pragma once

 #include <cstddef>
+#include <string>
+
 #include "common/common_types.h"
 #include "core/frontend/emu_window.h"
 #include "shader_recompiler/stage.h"
@@ -146,6 +148,10 @@ public:
        return has_sparse_texture_2;
    }

+    bool HasDrawTexture() const { // True when GL_NV_draw_texture was detected by the constructor.
+        return has_draw_texture;
+    }
+
    bool IsWarpSizePotentiallyLargerThanGuest() const {
        return warp_size_potentially_larger_than_guest;
    }
@@ -216,6 +222,7 @@ private:
    bool has_shader_int64{};
    bool has_amd_shader_half_float{};
    bool has_sparse_texture_2{};
+    bool has_draw_texture{};
    bool warp_size_potentially_larger_than_guest{};
    bool need_fastmath_off{};
    bool has_cbuf_ftou_bug{};
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -64,7 +64,8 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra
      shader_cache(*this, emu_window_, device, texture_cache, buffer_cache, program_manager,
                   state_tracker, gpu.ShaderNotify()),
      query_cache(*this), accelerate_dma(buffer_cache),
-      fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache) {}
+      fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache),
+      blit_image(program_manager_) {}

 RasterizerOpenGL::~RasterizerOpenGL() = default;

@@ -318,6 +319,47 @@ void RasterizerOpenGL::DrawIndirect() {
    buffer_cache.SetDrawIndirect(nullptr);
 }

+void RasterizerOpenGL::DrawTexture() { // Draws the draw-texture state's source into the framebuffer.
+    MICROPROFILE_SCOPE(OpenGL_Drawing);
+
+    SCOPE_EXIT({ gpu.TickWork(); });
+    query_cache.UpdateCounters();
+
+    texture_cache.SynchronizeGraphicsDescriptors();
+    texture_cache.UpdateRenderTargets(false);
+
+    SyncState();
+
+    const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState();
+    const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler);
+    const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture);
+
+    if (device.HasDrawTexture()) { // Use the NV_draw_texture entry point when available.
+        state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
+
+        glDrawTextureNV(texture.DefaultHandle(), sampler->Handle(), draw_texture_state.dst_x0,
+                        draw_texture_state.dst_y0, draw_texture_state.dst_x1,
+                        draw_texture_state.dst_y1, 0, // 0 is the z argument of glDrawTextureNV.
+                        draw_texture_state.src_x0 / static_cast<float>(texture.size.width), // s0
+                        draw_texture_state.src_y0 / static_cast<float>(texture.size.height), // t0
+                        draw_texture_state.src_x1 / static_cast<float>(texture.size.width), // s1
+                        draw_texture_state.src_y1 / static_cast<float>(texture.size.height)); // t1
+    } else { // Fallback: blit via BlitImageHelper; coordinates are cast to s32 here.
+        Region2D dst_region = {Offset2D{.x = static_cast<s32>(draw_texture_state.dst_x0),
+                                        .y = static_cast<s32>(draw_texture_state.dst_y0)},
+                               Offset2D{.x = static_cast<s32>(draw_texture_state.dst_x1),
+                                        .y = static_cast<s32>(draw_texture_state.dst_y1)}};
+        Region2D src_region = {Offset2D{.x = static_cast<s32>(draw_texture_state.src_x0),
+                                        .y = static_cast<s32>(draw_texture_state.src_y0)},
+                               Offset2D{.x = static_cast<s32>(draw_texture_state.src_x1),
+                                        .y = static_cast<s32>(draw_texture_state.src_y1)}};
+        blit_image.BlitColor(texture_cache.GetFramebuffer()->Handle(), texture.DefaultHandle(),
+                             sampler->Handle(), dst_region, src_region, texture.size);
+    }
+
+    ++num_queued_commands;
+}
+
 void RasterizerOpenGL::DispatchCompute() {
    ComputePipeline* const pipeline{shader_cache.CurrentComputePipeline()};
    if (!pipeline) {
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -16,6 +16,7 @@
 #include "video_core/engines/maxwell_dma.h"
 #include "video_core/rasterizer_accelerated.h"
 #include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_opengl/blit_image.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_fence_manager.h"
@@ -70,6 +71,7 @@ public:

    void Draw(bool is_indexed, u32 instance_count) override;
    void DrawIndirect() override;
+    void DrawTexture() override;
    void Clear(u32 layer_count) override;
    void DispatchCompute() override;
    void ResetCounter(VideoCore::QueryType type) override;
@@ -224,6 +226,8 @@ private:
    AccelerateDMA accelerate_dma;
    FenceManagerOpenGL fence_manager;

+    BlitImageHelper blit_image;
+
    boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices;
    std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids;
    boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles;
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -236,6 +236,8 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
          .needs_demote_reorder = device.IsAmd(),
          .support_snorm_render_buffer = false,
          .support_viewport_index_layer = device.HasVertexViewportLayer(),
+          .min_ssbo_alignment = static_cast<u32>(device.GetShaderStorageBufferAlignment()),
+          .support_geometry_shader_passthrough = device.HasGeometryShaderPassthrough(),
      } {
    if (use_asynchronous_shaders) {
        workers = CreateWorkers();
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -1,2 +1,123 @@
 // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <glad/glad.h>
+
+#include "video_core/renderer_opengl/gl_shader_manager.h"
+
+namespace OpenGL {
+
+static constexpr std::array ASSEMBLY_PROGRAM_ENUMS{ // Assembly program targets, indexed by stage.
+    GL_VERTEX_PROGRAM_NV,   GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
+    GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
+};
+
+ProgramManager::ProgramManager(const Device& device) {
+    glCreateProgramPipelines(1, &pipeline.handle); // The one pipeline object used by BindPipeline().
+    if (device.UseAssemblyShaders()) {
+        glEnable(GL_COMPUTE_PROGRAM_NV); // Enabled once here; nothing in this file disables it.
+    }
+}
+
+void ProgramManager::BindComputeProgram(GLuint program) {
+    glUseProgram(program); // Not cached: issued on every call.
+    is_compute_bound = true; // Tells UnbindCompute() that a glUseProgram(0) is needed later.
+}
+
+void ProgramManager::BindComputeAssemblyProgram(GLuint program) {
+    if (current_assembly_compute_program != program) { // Skip the GL call if already bound.
+        current_assembly_compute_program = program;
+        glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program);
+    }
+    UnbindPipeline(); // Also drops any glUseProgram binding, via UnbindCompute().
+}
+
+void ProgramManager::BindSourcePrograms(std::span<const OGLProgram, NUM_STAGES> programs) {
+    static constexpr std::array<GLenum, 5> stage_enums{ // Stage bit for each index of `programs`.
+        GL_VERTEX_SHADER_BIT,   GL_TESS_CONTROL_SHADER_BIT, GL_TESS_EVALUATION_SHADER_BIT,
+        GL_GEOMETRY_SHADER_BIT, GL_FRAGMENT_SHADER_BIT,
+    };
+    for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
+        if (current_programs[stage] != programs[stage].handle) { // Only update changed stages.
+            current_programs[stage] = programs[stage].handle;
+            glUseProgramStages(pipeline.handle, stage_enums[stage], programs[stage].handle);
+        }
+    }
+    BindPipeline();
+}
+
+void ProgramManager::BindPresentPrograms(GLuint vertex, GLuint fragment) {
+    if (current_programs[0] != vertex) { // Slot 0 caches the vertex stage.
+        current_programs[0] = vertex;
+        glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex);
+    }
+    if (current_programs[4] != fragment) { // Slot 4 caches the fragment stage.
+        current_programs[4] = fragment;
+        glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment);
+    }
+    glUseProgramStages(
+        pipeline.handle,
+        GL_TESS_CONTROL_SHADER_BIT | GL_TESS_EVALUATION_SHADER_BIT | GL_GEOMETRY_SHADER_BIT, 0);
+    current_programs[1] = 0; // Keep the cache in sync with the stages cleared just above.
+    current_programs[2] = 0;
+    current_programs[3] = 0;
+
+    if (current_stage_mask != 0) { // Some assembly stage is still enabled: disable all of them.
+        current_stage_mask = 0;
+        for (const GLenum program_type : ASSEMBLY_PROGRAM_ENUMS) {
+            glDisable(program_type);
+        }
+    }
+    BindPipeline();
+}
+
+void ProgramManager::BindAssemblyPrograms(std::span<const OGLAssemblyProgram, NUM_STAGES> programs,
+                                          u32 stage_mask) {
+    const u32 changed_mask = current_stage_mask ^ stage_mask; // Bits whose enabled state differs.
+    current_stage_mask = stage_mask;
+
+    if (changed_mask != 0) {
+        for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
+            if (((changed_mask >> stage) & 1) != 0) { // Only toggle stages that actually changed.
+                if (((stage_mask >> stage) & 1) != 0) {
+                    glEnable(ASSEMBLY_PROGRAM_ENUMS[stage]);
+                } else {
+                    glDisable(ASSEMBLY_PROGRAM_ENUMS[stage]);
+                }
+            }
+        }
+    }
+    for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
+        // NOTE(review): current_programs is shared with the GLSL paths - confirm handles cannot alias.
+        if (current_programs[stage] != programs[stage].handle) { // Rebind only changed handles.
+            current_programs[stage] = programs[stage].handle;
+            glBindProgramARB(ASSEMBLY_PROGRAM_ENUMS[stage], programs[stage].handle);
+        }
+    }
+    UnbindPipeline();
+}
+
+void ProgramManager::RestoreGuestCompute() {} // No-op.
+
+void ProgramManager::BindPipeline() {
+    if (!is_pipeline_bound) { // Cached flag avoids a redundant glBindProgramPipeline.
+        is_pipeline_bound = true;
+        glBindProgramPipeline(pipeline.handle);
+    }
+    UnbindCompute(); // Runs on every call, whether or not the pipeline was already bound.
+}
+
+void ProgramManager::UnbindPipeline() {
+    if (is_pipeline_bound) { // Only issue the GL call when the pipeline is currently bound.
+        is_pipeline_bound = false;
+        glBindProgramPipeline(0);
+    }
+    UnbindCompute(); // Runs on every call, whether or not the pipeline was bound.
+}
+
+void ProgramManager::UnbindCompute() {
+    if (is_compute_bound) { // Set only by BindComputeProgram().
+        is_compute_bound = false;
+        glUseProgram(0);
+    }
+}
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -6,8 +6,6 @@
 #include <array>
 #include <span>

-#include <glad/glad.h>
-
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"

@@ -16,121 +14,28 @@ namespace OpenGL {
 class ProgramManager {
    static constexpr size_t NUM_STAGES = 5;

-    static constexpr std::array ASSEMBLY_PROGRAM_ENUMS{
-        GL_VERTEX_PROGRAM_NV,   GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
-        GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
-    };
-
 public:
-    explicit ProgramManager(const Device& device) {
-        glCreateProgramPipelines(1, &pipeline.handle);
-        if (device.UseAssemblyShaders()) {
-            glEnable(GL_COMPUTE_PROGRAM_NV);
-        }
-    }
+    explicit ProgramManager(const Device& device);

-    void BindComputeProgram(GLuint program) {
-        glUseProgram(program);
-        is_compute_bound = true;
-    }
+    void BindComputeProgram(GLuint program);

-    void BindComputeAssemblyProgram(GLuint program) {
-        if (current_assembly_compute_program != program) {
-            current_assembly_compute_program = program;
-            glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program);
-        }
-        UnbindPipeline();
-    }
+    void BindComputeAssemblyProgram(GLuint program);

-    void BindSourcePrograms(std::span<const OGLProgram, NUM_STAGES> programs) {
-        static constexpr std::array<GLenum, 5> stage_enums{
-            GL_VERTEX_SHADER_BIT,   GL_TESS_CONTROL_SHADER_BIT, GL_TESS_EVALUATION_SHADER_BIT,
-            GL_GEOMETRY_SHADER_BIT, GL_FRAGMENT_SHADER_BIT,
-        };
-        for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
-            if (current_programs[stage] != programs[stage].handle) {
-                current_programs[stage] = programs[stage].handle;
-                glUseProgramStages(pipeline.handle, stage_enums[stage], programs[stage].handle);
-            }
-        }
-        BindPipeline();
-    }
+    void BindSourcePrograms(std::span<const OGLProgram, NUM_STAGES> programs);

-    void BindPresentPrograms(GLuint vertex, GLuint fragment) {
-        if (current_programs[0] != vertex) {
-            current_programs[0] = vertex;
-            glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex);
-        }
-        if (current_programs[4] != fragment) {
-            current_programs[4] = fragment;
-            glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment);
-        }
-        glUseProgramStages(
-            pipeline.handle,
-            GL_TESS_CONTROL_SHADER_BIT | GL_TESS_EVALUATION_SHADER_BIT | GL_GEOMETRY_SHADER_BIT, 0);
-        current_programs[1] = 0;
-        current_programs[2] = 0;
-        current_programs[3] = 0;
-
-        if (current_stage_mask != 0) {
-            current_stage_mask = 0;
-            for (const GLenum program_type : ASSEMBLY_PROGRAM_ENUMS) {
-                glDisable(program_type);
-            }
-        }
-        BindPipeline();
-    }
+    void BindPresentPrograms(GLuint vertex, GLuint fragment);

    void BindAssemblyPrograms(std::span<const OGLAssemblyProgram, NUM_STAGES> programs,
-                              u32 stage_mask) {
-        const u32 changed_mask = current_stage_mask ^ stage_mask;
-        current_stage_mask = stage_mask;
+                              u32 stage_mask);

-        if (changed_mask != 0) {
-            for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
-                if (((changed_mask >> stage) & 1) != 0) {
-                    if (((stage_mask >> stage) & 1) != 0) {
-                        glEnable(ASSEMBLY_PROGRAM_ENUMS[stage]);
-                    } else {
-                        glDisable(ASSEMBLY_PROGRAM_ENUMS[stage]);
-                    }
-                }
-            }
-        }
-        for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
-            if (current_programs[stage] != programs[stage].handle) {
-                current_programs[stage] = programs[stage].handle;
-                glBindProgramARB(ASSEMBLY_PROGRAM_ENUMS[stage], programs[stage].handle);
-            }
-        }
-        UnbindPipeline();
-    }
-
-    void RestoreGuestCompute() {}
+    void RestoreGuestCompute();

 private:
-    void BindPipeline() {
-        if (!is_pipeline_bound) {
-            is_pipeline_bound = true;
-            glBindProgramPipeline(pipeline.handle);
-        }
-        UnbindCompute();
-    }
+    void BindPipeline();

-    void UnbindPipeline() {
-        if (is_pipeline_bound) {
-            is_pipeline_bound = false;
-            glBindProgramPipeline(0);
-        }
-        UnbindCompute();
-    }
+    void UnbindPipeline();

-    void UnbindCompute() {
-        if (is_compute_bound) {
-            is_compute_bound = false;
-            glUseProgram(0);
-        }
-    }
+    void UnbindCompute();

    OGLPipeline pipeline;
    bool is_pipeline_bound{};
--- a/src/video_core/renderer_vulkan/blit_image.cpp
+++ b/src/video_core/renderer_vulkan/blit_image.cpp
@@ -4,13 +4,13 @@
 #include <algorithm>

 #include "common/settings.h"
+#include "video_core/host_shaders/blit_color_float_frag_spv.h"
 #include "video_core/host_shaders/convert_abgr8_to_d24s8_frag_spv.h"
 #include "video_core/host_shaders/convert_d24s8_to_abgr8_frag_spv.h"
 #include "video_core/host_shaders/convert_depth_to_float_frag_spv.h"
 #include "video_core/host_shaders/convert_float_to_depth_frag_spv.h"
 #include "video_core/host_shaders/convert_s8d24_to_abgr8_frag_spv.h"
 #include "video_core/host_shaders/full_screen_triangle_vert_spv.h"
-#include "video_core/host_shaders/vulkan_blit_color_float_frag_spv.h"
 #include "video_core/host_shaders/vulkan_blit_depth_stencil_frag_spv.h"
 #include "video_core/renderer_vulkan/blit_image.h"
 #include "video_core/renderer_vulkan/maxwell_to_vk.h"
@@ -303,7 +303,7 @@ void UpdateTwoTexturesDescriptorSet(const Device& device, VkDescriptorSet descri
 }

 void BindBlitState(vk::CommandBuffer cmdbuf, VkPipelineLayout layout, const Region2D& dst_region,
-                   const Region2D& src_region) {
+                   const Region2D& src_region, const Extent3D& src_size = {1, 1, 1}) {
    const VkOffset2D offset{
        .x = std::min(dst_region.start.x, dst_region.end.x),
        .y = std::min(dst_region.start.y, dst_region.end.y),
@@ -325,12 +325,15 @@ void BindBlitState(vk::CommandBuffer cmdbuf, VkPipelineLayout layout, const Regi
        .offset = offset,
        .extent = extent,
    };
-    const float scale_x = static_cast<float>(src_region.end.x - src_region.start.x);
-    const float scale_y = static_cast<float>(src_region.end.y - src_region.start.y);
+    const float scale_x = static_cast<float>(src_region.end.x - src_region.start.x) /
+                          static_cast<float>(src_size.width);
+    const float scale_y = static_cast<float>(src_region.end.y - src_region.start.y) /
+                          static_cast<float>(src_size.height);
    const PushConstants push_constants{
        .tex_scale = {scale_x, scale_y},
-        .tex_offset = {static_cast<float>(src_region.start.x),
-                       static_cast<float>(src_region.start.y)},
+        .tex_offset = {static_cast<float>(src_region.start.x) / static_cast<float>(src_size.width),
+                       static_cast<float>(src_region.start.y) /
+                           static_cast<float>(src_size.height)},
    };
    cmdbuf.SetViewport(0, viewport);
    cmdbuf.SetScissor(0, scissor);
@@ -365,7 +368,7 @@ BlitImageHelper::BlitImageHelper(const Device& device_, Scheduler& scheduler_,
      two_textures_pipeline_layout(device.GetLogical().CreatePipelineLayout(
          PipelineLayoutCreateInfo(two_textures_set_layout.address()))),
      full_screen_vert(BuildShader(device, FULL_SCREEN_TRIANGLE_VERT_SPV)),
-      blit_color_to_color_frag(BuildShader(device, VULKAN_BLIT_COLOR_FLOAT_FRAG_SPV)),
+      blit_color_to_color_frag(BuildShader(device, BLIT_COLOR_FLOAT_FRAG_SPV)),
      blit_depth_stencil_frag(BuildShader(device, VULKAN_BLIT_DEPTH_STENCIL_FRAG_SPV)),
      convert_depth_to_float_frag(BuildShader(device, CONVERT_DEPTH_TO_FLOAT_FRAG_SPV)),
      convert_float_to_depth_frag(BuildShader(device, CONVERT_FLOAT_TO_DEPTH_FRAG_SPV)),
@@ -404,6 +407,30 @@ void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, VkImageView
    scheduler.InvalidateState();
 }

+void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, VkImageView src_image_view,
+                                VkSampler src_sampler, const Region2D& dst_region,
+                                const Region2D& src_region, const Extent3D& src_size) {
+    const BlitImagePipelineKey key{
+        .renderpass = dst_framebuffer->RenderPass(),
+        .operation = Tegra::Engines::Fermi2D::Operation::SrcCopy, // This overload always uses SrcCopy.
+    };
+    const VkPipelineLayout layout = *one_texture_pipeline_layout;
+    const VkPipeline pipeline = FindOrEmplaceColorPipeline(key);
+    scheduler.RequestRenderpass(dst_framebuffer);
+    scheduler.Record([this, dst_region, src_region, src_size, pipeline, layout, src_sampler,
+                      src_image_view](vk::CommandBuffer cmdbuf) { // Everything but `this` is copied.
+        // TODO: Barriers
+        const VkDescriptorSet descriptor_set = one_texture_descriptor_allocator.Commit();
+        UpdateOneTextureDescriptorSet(device, descriptor_set, src_sampler, src_image_view);
+        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
+        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, layout, 0, descriptor_set,
+                                  nullptr);
+        BindBlitState(cmdbuf, layout, dst_region, src_region, src_size); // Scales src by src_size.
+        cmdbuf.Draw(3, 1, 0, 0); // presumably 3 vertices, 1 instance (full-screen triangle) - confirm
+    });
+    scheduler.InvalidateState();
+}
+
 void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer,
                                       VkImageView src_depth_view, VkImageView src_stencil_view,
                                       const Region2D& dst_region, const Region2D& src_region,
--- a/src/video_core/renderer_vulkan/blit_image.h
+++ b/src/video_core/renderer_vulkan/blit_image.h
@@ -10,6 +10,8 @@

 namespace Vulkan {

+using VideoCommon::Extent3D;
+using VideoCommon::Offset2D;
 using VideoCommon::Region2D;

 class Device;
@@ -36,6 +38,10 @@ public:
                   Tegra::Engines::Fermi2D::Filter filter,
                   Tegra::Engines::Fermi2D::Operation operation);

+    void BlitColor(const Framebuffer* dst_framebuffer, VkImageView src_image_view,
+                   VkSampler src_sampler, const Region2D& dst_region, const Region2D& src_region,
+                   const Extent3D& src_size);
+
    void BlitDepthStencil(const Framebuffer* dst_framebuffer, VkImageView src_depth_view,
                          VkImageView src_stencil_view, const Region2D& dst_region,
                          const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter,
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -78,6 +78,8 @@ std::string BuildCommaSeparatedExtensions(std::vector<std::string> available_ext
    return separated_extensions;
 }

+} // Anonymous namespace
+
 Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld,
                    VkSurfaceKHR surface) {
    const std::vector<VkPhysicalDevice> devices = instance.EnumeratePhysicalDevices();
@@ -89,7 +91,6 @@ Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dl
    const vk::PhysicalDevice physical_device(devices[device_index], dld);
    return Device(*instance, physical_device, surface, dld);
 }
-} // Anonymous namespace

 RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_,
                               Core::Frontend::EmuWindow& emu_window,
@@ -109,6 +110,9 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_,
                  screen_info),
      rasterizer(render_window, gpu, cpu_memory, screen_info, device, memory_allocator,
                 state_tracker, scheduler) {
+    if (Settings::values.renderer_force_max_clock.GetValue()) {
+        turbo_mode.emplace(instance, dld);
+    }
    Report();
 } catch (const vk::Exception& exception) {
    LOG_ERROR(Render_Vulkan, "Vulkan initialization failed with error: {}", exception.what());
--- a/src/video_core/renderer_vulkan/renderer_vulkan.h
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.h
@@ -13,6 +13,7 @@
 #include "video_core/renderer_vulkan/vk_scheduler.h"
 #include "video_core/renderer_vulkan/vk_state_tracker.h"
 #include "video_core/renderer_vulkan/vk_swapchain.h"
+#include "video_core/renderer_vulkan/vk_turbo_mode.h"
 #include "video_core/vulkan_common/vulkan_device.h"
 #include "video_core/vulkan_common/vulkan_memory_allocator.h"
 #include "video_core/vulkan_common/vulkan_wrapper.h"
@@ -31,6 +32,9 @@ class GPU;

 namespace Vulkan {

+Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld,
+                    VkSurfaceKHR surface);
+
 class RendererVulkan final : public VideoCore::RendererBase {
 public:
    explicit RendererVulkan(Core::TelemetrySession& telemtry_session,
@@ -74,6 +78,7 @@ private:
    Swapchain swapchain;
    BlitScreen blit_screen;
    RasterizerVulkan rasterizer;
+    std::optional<TurboMode> turbo_mode;
 };

 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -330,6 +330,10 @@ bool BufferCacheRuntime::CanReportMemoryUsage() const {
    return device.CanReportMemoryUsage();
 }

+u32 BufferCacheRuntime::GetStorageBufferAlignment() const { // Device value, narrowed to u32.
+    return static_cast<u32>(device.GetStorageBufferAlignment());
+}
+
 void BufferCacheRuntime::Finish() {
    scheduler.Finish();
 }
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -73,6 +73,8 @@ public:

    bool CanReportMemoryUsage() const;

+    u32 GetStorageBufferAlignment() const;
+
    [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size);

    [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size);
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -331,6 +331,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device
        .need_declared_frag_colors = false,

        .has_broken_spirv_clamp = driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS,
+        .has_broken_spirv_position_input = driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY,
        .has_broken_unsigned_image_offsets = false,
        .has_broken_signed_operations = false,
        .has_broken_fp16_float_controls = driver_id == VK_DRIVER_ID_NVIDIA_PROPRIETARY,
@@ -343,6 +344,8 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device
            driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE,
        .support_snorm_render_buffer = true,
        .support_viewport_index_layer = device.IsExtShaderViewportIndexLayerSupported(),
+        .min_ssbo_alignment = static_cast<u32>(device.GetStorageBufferAlignment()),
+        .support_geometry_shader_passthrough = device.IsNvGeometryShaderPassthroughSupported(),
    };

    if (device.GetMaxVertexInputAttributes() < Maxwell::NumVertexAttributes) {
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -186,6 +186,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {

    SCOPE_EXIT({ gpu.TickWork(); });
    FlushWork();
+    gpu_memory->FlushCaching();

    query_cache.UpdateCounters();

@@ -265,6 +266,34 @@ void RasterizerVulkan::DrawIndirect() {
    buffer_cache.SetDrawIndirect(nullptr);
 }

+void RasterizerVulkan::DrawTexture() { // Handles a draw-texture request via blit_image
+    MICROPROFILE_SCOPE(Vulkan_Drawing);
+
+    SCOPE_EXIT({ gpu.TickWork(); }); // gpu.TickWork() is deferred until scope exit
+    FlushWork(); // NOTE(review): no gpu_memory->FlushCaching(), unlike PrepareDraw - confirm
+
+    query_cache.UpdateCounters();
+
+    texture_cache.SynchronizeGraphicsDescriptors();
+    texture_cache.UpdateRenderTargets(false);
+
+    UpdateDynamicStates();
+    // Sampler, image view and both rectangles come from the engine's draw-texture state.
+    const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState();
+    const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler);
+    const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture);
+    Region2D dst_region = {Offset2D{.x = static_cast<s32>(draw_texture_state.dst_x0),
+                                    .y = static_cast<s32>(draw_texture_state.dst_y0)},
+                           Offset2D{.x = static_cast<s32>(draw_texture_state.dst_x1),
+                                    .y = static_cast<s32>(draw_texture_state.dst_y1)}};
+    Region2D src_region = {Offset2D{.x = static_cast<s32>(draw_texture_state.src_x0),
+                                    .y = static_cast<s32>(draw_texture_state.src_y0)},
+                           Offset2D{.x = static_cast<s32>(draw_texture_state.src_x1),
+                                    .y = static_cast<s32>(draw_texture_state.src_y1)}};
+    blit_image.BlitColor(texture_cache.GetFramebuffer(), texture.RenderTarget(), sampler->Handle(),
+                         dst_region, src_region, texture.size);
+}
+
 void RasterizerVulkan::Clear(u32 layer_count) {
    MICROPROFILE_SCOPE(Vulkan_Clearing);

@@ -393,6 +422,7 @@ void RasterizerVulkan::Clear(u32 layer_count) {

 void RasterizerVulkan::DispatchCompute() {
    FlushWork();
+    gpu_memory->FlushCaching();

    ComputePipeline* const pipeline{pipeline_cache.CurrentComputePipeline()};
    if (!pipeline) {
@@ -481,6 +511,27 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache
    }
 }

+void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
+    { // Every (addr, size) pair goes to each cache in turn; textures first, under their mutex
+        std::scoped_lock lock{texture_cache.mutex};
+        for (const auto& [addr, size] : sequences) {
+            texture_cache.WriteMemory(addr, size);
+        }
+    }
+    { // Separate scope: the texture lock is released before the buffer lock is taken
+        std::scoped_lock lock{buffer_cache.mutex};
+        for (const auto& [addr, size] : sequences) {
+            buffer_cache.WriteMemory(addr, size);
+        }
+    }
+    { // Query and pipeline caches: no lock is taken in this function
+        for (const auto& [addr, size] : sequences) {
+            query_cache.InvalidateRegion(addr, size);
+            pipeline_cache.InvalidateRegion(addr, size);
+        }
+    }
+}
+
 void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
    if (addr == 0 || size == 0) {
        return;
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -66,6 +66,7 @@ public:

    void Draw(bool is_indexed, u32 instance_count) override;
    void DrawIndirect() override;
+    void DrawTexture() override;
    void Clear(u32 layer_count) override;
    void DispatchCompute() override;
    void ResetCounter(VideoCore::QueryType type) override;
@@ -79,6 +80,7 @@ public:
                         VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
    void InvalidateRegion(VAddr addr, u64 size,
                          VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) override;
    void OnCPUWrite(VAddr addr, u64 size) override;
    void InvalidateGPUCache() override;
    void UnmapMemory(VAddr addr, u64 size) override;
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -148,6 +148,13 @@ typename P::ImageView& TextureCache<P>::GetImageView(ImageViewId id) noexcept {
    return slot_image_views[id];
 }

+template <class P> // Looks up an image view by graphics descriptor-table index
+typename P::ImageView& TextureCache<P>::GetImageView(u32 index) noexcept {
+    const auto image_view_id = VisitImageView(channel_state->graphics_image_table,
+                                              channel_state->graphics_image_view_ids, index);
+    return slot_image_views[image_view_id]; // id is used as a slot index without further checks
+}
+
 template <class P>
 void TextureCache<P>::MarkModification(ImageId id) noexcept {
    MarkModification(slot_images[id]);
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -129,6 +129,9 @@ public:
    /// Return a reference to the given image view id
    [[nodiscard]] ImageView& GetImageView(ImageViewId id) noexcept;

+    /// Return a reference to the image view at the given index of the graphics descriptor table
+    [[nodiscard]] ImageView& GetImageView(u32 index) noexcept;
+
    /// Mark an image as modified from the GPU
    void MarkModification(ImageId id) noexcept;

--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -1472,7 +1472,7 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
        is_patch_list_restart_supported =
            primitive_topology_list_restart.primitiveTopologyPatchListRestart;
    }
-    if (has_khr_image_format_list && has_khr_swapchain_mutable_format) {
+    if (requires_surface && has_khr_image_format_list && has_khr_swapchain_mutable_format) {
        extensions.push_back(VK_KHR_IMAGE_FORMAT_LIST_EXTENSION_NAME);
        extensions.push_back(VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME);
        khr_swapchain_mutable_format = true;