early-access version 1255

2020-12-28 15:15:37 +00:00
parent 84b39492d1
commit 78b48028e1
6254 changed files with 1868140 additions and 0 deletions
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -0,0 +1,314 @@
+add_subdirectory(host_shaders) # video_core depends on this target (see add_dependencies below)
+# Static library target; the relative paths listed here are resolved against this directory
+add_library(video_core STATIC
+    buffer_cache/buffer_block.h
+    buffer_cache/buffer_cache.h
+    buffer_cache/map_interval.cpp
+    buffer_cache/map_interval.h
+    cdma_pusher.cpp
+    cdma_pusher.h
+    command_classes/codecs/codec.cpp
+    command_classes/codecs/codec.h
+    command_classes/codecs/h264.cpp
+    command_classes/codecs/h264.h
+    command_classes/codecs/vp9.cpp
+    command_classes/codecs/vp9.h
+    command_classes/codecs/vp9_types.h
+    command_classes/host1x.cpp
+    command_classes/host1x.h
+    command_classes/nvdec.cpp
+    command_classes/nvdec.h
+    command_classes/nvdec_common.h
+    command_classes/sync_manager.cpp
+    command_classes/sync_manager.h
+    command_classes/vic.cpp
+    command_classes/vic.h
+    compatible_formats.cpp
+    compatible_formats.h
+    delayed_destruction_ring.h
+    dirty_flags.cpp
+    dirty_flags.h
+    dma_pusher.cpp
+    dma_pusher.h
+    engines/const_buffer_engine_interface.h
+    engines/const_buffer_info.h
+    engines/engine_interface.h
+    engines/engine_upload.cpp
+    engines/engine_upload.h
+    engines/fermi_2d.cpp
+    engines/fermi_2d.h
+    engines/kepler_compute.cpp
+    engines/kepler_compute.h
+    engines/kepler_memory.cpp
+    engines/kepler_memory.h
+    engines/maxwell_3d.cpp
+    engines/maxwell_3d.h
+    engines/maxwell_dma.cpp
+    engines/maxwell_dma.h
+    engines/shader_bytecode.h
+    engines/shader_header.h
+    engines/shader_type.h
+    framebuffer_config.h
+    macro/macro.cpp
+    macro/macro.h
+    macro/macro_hle.cpp
+    macro/macro_hle.h
+    macro/macro_interpreter.cpp
+    macro/macro_interpreter.h
+    macro/macro_jit_x64.cpp
+    macro/macro_jit_x64.h
+    fence_manager.h
+    gpu.cpp
+    gpu.h
+    gpu_thread.cpp
+    gpu_thread.h
+    guest_driver.cpp
+    guest_driver.h
+    memory_manager.cpp
+    memory_manager.h
+    morton.cpp
+    morton.h
+    query_cache.h
+    rasterizer_accelerated.cpp
+    rasterizer_accelerated.h
+    rasterizer_interface.h
+    renderer_base.cpp
+    renderer_base.h
+    renderer_opengl/gl_arb_decompiler.cpp
+    renderer_opengl/gl_arb_decompiler.h
+    renderer_opengl/gl_buffer_cache.cpp
+    renderer_opengl/gl_buffer_cache.h
+    renderer_opengl/gl_device.cpp
+    renderer_opengl/gl_device.h
+    renderer_opengl/gl_fence_manager.cpp
+    renderer_opengl/gl_fence_manager.h
+    renderer_opengl/gl_rasterizer.cpp
+    renderer_opengl/gl_rasterizer.h
+    renderer_opengl/gl_resource_manager.cpp
+    renderer_opengl/gl_resource_manager.h
+    renderer_opengl/gl_shader_cache.cpp
+    renderer_opengl/gl_shader_cache.h
+    renderer_opengl/gl_shader_decompiler.cpp
+    renderer_opengl/gl_shader_decompiler.h
+    renderer_opengl/gl_shader_disk_cache.cpp
+    renderer_opengl/gl_shader_disk_cache.h
+    renderer_opengl/gl_shader_manager.cpp
+    renderer_opengl/gl_shader_manager.h
+    renderer_opengl/gl_shader_util.cpp
+    renderer_opengl/gl_shader_util.h
+    renderer_opengl/gl_state_tracker.cpp
+    renderer_opengl/gl_state_tracker.h
+    renderer_opengl/gl_stream_buffer.cpp
+    renderer_opengl/gl_stream_buffer.h
+    renderer_opengl/gl_texture_cache.cpp
+    renderer_opengl/gl_texture_cache.h
+    renderer_opengl/gl_query_cache.cpp
+    renderer_opengl/gl_query_cache.h
+    renderer_opengl/maxwell_to_gl.h
+    renderer_opengl/renderer_opengl.cpp
+    renderer_opengl/renderer_opengl.h
+    renderer_opengl/util_shaders.cpp
+    renderer_opengl/util_shaders.h
+    renderer_vulkan/blit_image.cpp
+    renderer_vulkan/blit_image.h
+    renderer_vulkan/fixed_pipeline_state.cpp
+    renderer_vulkan/fixed_pipeline_state.h
+    renderer_vulkan/maxwell_to_vk.cpp
+    renderer_vulkan/maxwell_to_vk.h
+    renderer_vulkan/nsight_aftermath_tracker.cpp
+    renderer_vulkan/nsight_aftermath_tracker.h
+    renderer_vulkan/renderer_vulkan.h
+    renderer_vulkan/renderer_vulkan.cpp
+    renderer_vulkan/vk_blit_screen.cpp
+    renderer_vulkan/vk_blit_screen.h
+    renderer_vulkan/vk_buffer_cache.cpp
+    renderer_vulkan/vk_buffer_cache.h
+    renderer_vulkan/vk_command_pool.cpp
+    renderer_vulkan/vk_command_pool.h
+    renderer_vulkan/vk_compute_pass.cpp
+    renderer_vulkan/vk_compute_pass.h
+    renderer_vulkan/vk_compute_pipeline.cpp
+    renderer_vulkan/vk_compute_pipeline.h
+    renderer_vulkan/vk_descriptor_pool.cpp
+    renderer_vulkan/vk_descriptor_pool.h
+    renderer_vulkan/vk_device.cpp
+    renderer_vulkan/vk_device.h
+    renderer_vulkan/vk_fence_manager.cpp
+    renderer_vulkan/vk_fence_manager.h
+    renderer_vulkan/vk_graphics_pipeline.cpp
+    renderer_vulkan/vk_graphics_pipeline.h
+    renderer_vulkan/vk_master_semaphore.cpp
+    renderer_vulkan/vk_master_semaphore.h
+    renderer_vulkan/vk_memory_manager.cpp
+    renderer_vulkan/vk_memory_manager.h
+    renderer_vulkan/vk_pipeline_cache.cpp
+    renderer_vulkan/vk_pipeline_cache.h
+    renderer_vulkan/vk_query_cache.cpp
+    renderer_vulkan/vk_query_cache.h
+    renderer_vulkan/vk_rasterizer.cpp
+    renderer_vulkan/vk_rasterizer.h
+    renderer_vulkan/vk_resource_pool.cpp
+    renderer_vulkan/vk_resource_pool.h
+    renderer_vulkan/vk_scheduler.cpp
+    renderer_vulkan/vk_scheduler.h
+    renderer_vulkan/vk_shader_decompiler.cpp
+    renderer_vulkan/vk_shader_decompiler.h
+    renderer_vulkan/vk_shader_util.cpp
+    renderer_vulkan/vk_shader_util.h
+    renderer_vulkan/vk_staging_buffer_pool.cpp
+    renderer_vulkan/vk_staging_buffer_pool.h
+    renderer_vulkan/vk_state_tracker.cpp
+    renderer_vulkan/vk_state_tracker.h
+    renderer_vulkan/vk_stream_buffer.cpp
+    renderer_vulkan/vk_stream_buffer.h
+    renderer_vulkan/vk_swapchain.cpp
+    renderer_vulkan/vk_swapchain.h
+    renderer_vulkan/vk_texture_cache.cpp
+    renderer_vulkan/vk_texture_cache.h
+    renderer_vulkan/vk_update_descriptor.cpp
+    renderer_vulkan/vk_update_descriptor.h
+    renderer_vulkan/wrapper.cpp
+    renderer_vulkan/wrapper.h
+    shader_cache.h
+    shader_notify.cpp
+    shader_notify.h
+    shader/decode/arithmetic.cpp
+    shader/decode/arithmetic_immediate.cpp
+    shader/decode/bfe.cpp
+    shader/decode/bfi.cpp
+    shader/decode/shift.cpp
+    shader/decode/arithmetic_integer.cpp
+    shader/decode/arithmetic_integer_immediate.cpp
+    shader/decode/arithmetic_half.cpp
+    shader/decode/arithmetic_half_immediate.cpp
+    shader/decode/ffma.cpp
+    shader/decode/hfma2.cpp
+    shader/decode/conversion.cpp
+    shader/decode/memory.cpp
+    shader/decode/texture.cpp
+    shader/decode/image.cpp
+    shader/decode/float_set_predicate.cpp
+    shader/decode/integer_set_predicate.cpp
+    shader/decode/half_set_predicate.cpp
+    shader/decode/predicate_set_register.cpp
+    shader/decode/predicate_set_predicate.cpp
+    shader/decode/register_set_predicate.cpp
+    shader/decode/float_set.cpp
+    shader/decode/integer_set.cpp
+    shader/decode/half_set.cpp
+    shader/decode/video.cpp
+    shader/decode/warp.cpp
+    shader/decode/xmad.cpp
+    shader/decode/other.cpp
+    shader/ast.cpp
+    shader/ast.h
+    shader/async_shaders.cpp
+    shader/async_shaders.h
+    shader/compiler_settings.cpp
+    shader/compiler_settings.h
+    shader/control_flow.cpp
+    shader/control_flow.h
+    shader/decode.cpp
+    shader/expr.cpp
+    shader/expr.h
+    shader/memory_util.cpp
+    shader/memory_util.h
+    shader/node_helper.cpp
+    shader/node_helper.h
+    shader/node.h
+    shader/registry.cpp
+    shader/registry.h
+    shader/shader_ir.cpp
+    shader/shader_ir.h
+    shader/track.cpp
+    shader/transform_feedback.cpp
+    shader/transform_feedback.h
+    surface.cpp
+    surface.h
+    texture_cache/decode_bc4.cpp
+    texture_cache/decode_bc4.h
+    texture_cache/descriptor_table.h
+    texture_cache/formatter.cpp
+    texture_cache/formatter.h
+    texture_cache/format_lookup_table.cpp
+    texture_cache/format_lookup_table.h
+    texture_cache/image_base.cpp
+    texture_cache/image_base.h
+    texture_cache/image_info.cpp
+    texture_cache/image_info.h
+    texture_cache/image_view_base.cpp
+    texture_cache/image_view_base.h
+    texture_cache/image_view_info.cpp
+    texture_cache/image_view_info.h
+    texture_cache/render_targets.h
+    texture_cache/samples_helper.h
+    texture_cache/slot_vector.h
+    texture_cache/texture_cache.h
+    texture_cache/texture_cache.inl
+    texture_cache/types.h
+    texture_cache/util.cpp
+    texture_cache/util.h
+    textures/astc.cpp
+    textures/astc.h
+    textures/decoders.cpp
+    textures/decoders.h
+    textures/texture.cpp
+    textures/texture.h
+    video_core.cpp
+    video_core.h
+)
+
+create_target_directory_groups(video_core)
+
+# common and core belong to the public link interface; glad and xbyak are linked PRIVATE
+target_link_libraries(video_core PUBLIC common core PRIVATE glad xbyak)
+
+# The FFmpeg include directory is the same for every toolchain; only the library spelling differs
+target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
+if (MSVC)
+    target_link_libraries(video_core PUBLIC ${FFMPEG_LIBRARY_DIR}/swscale.lib ${FFMPEG_LIBRARY_DIR}/avcodec.lib ${FFMPEG_LIBRARY_DIR}/avutil.lib)
+else()
+    target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES})
+endif()
+
+# Build host_shaders first; add its include directory plus the sirit and Vulkan-Headers paths
+add_dependencies(video_core host_shaders)
+target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE} sirit ../../externals/Vulkan-Headers/include)
+target_link_libraries(video_core PRIVATE sirit)
+
+if (ENABLE_NSIGHT_AFTERMATH) # Opt-in; needs the NSIGHT_AFTERMATH_SDK env variable and Windows
+    if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK}) # FATAL_ERROR stops configuration right here
+        message(FATAL_ERROR "Environment variable NSIGHT_AFTERMATH_SDK has to be provided")
+    endif()
+    if (NOT WIN32)
+        message(FATAL_ERROR "Nsight Aftermath doesn't support non-Windows platforms")
+    endif()
+    target_compile_definitions(video_core PRIVATE HAS_NSIGHT_AFTERMATH)
+    target_include_directories(video_core PRIVATE "$ENV{NSIGHT_AFTERMATH_SDK}/include")
+endif()
+
+if (MSVC) # /weNNNN turns the given warning into an error, for this target only (PRIVATE)
+    target_compile_options(video_core PRIVATE
+        /we4267 # 'var' : conversion from 'size_t' to 'type', possible loss of data
+        /we4456 # Declaration of 'identifier' hides previous local declaration
+        /we4457 # Declaration of 'identifier' hides function parameter
+        /we4458 # Declaration of 'identifier' hides class member
+        /we4459 # Declaration of 'identifier' hides global declaration
+        /we4715 # 'function' : not all control paths return a value
+    )
+else() # Other compilers: promote the listed diagnostics to errors for this target only
+    target_compile_options(video_core PRIVATE
+        -Werror=conversion # Implicit conversion that may alter a value
+        -Wno-error=sign-conversion # Sign conversions are never promoted to errors
+        -Werror=pessimizing-move # std::move that prevents copy elision
+        -Werror=redundant-move # std::move where a move already happens implicitly
+        -Werror=shadow # Declaration hides another variable or member
+        -Werror=switch # switch over an enumeration misses an enumerator
+        -Werror=type-limits # Comparison is always true or false due to the range of the type
+        -Werror=unused-variable # Variable is declared but never used
+        # The generator expressions below only emit their flag when the C++ compiler is GCC
+        $<$<CXX_COMPILER_ID:GNU>:-Werror=class-memaccess>
+        $<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-parameter>
+        $<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-variable>
+    )
+endif()
--- a/src/video_core/buffer_cache/buffer_block.h
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -0,0 +1,62 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+class BufferBlock { // Bookkeeping base: CPU address range, byte size and epoch of one block
+public:
+    /// Returns true when [start, end) shares at least one address with [CpuAddr(), CpuAddrEnd())
+    [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
+        return start < cpu_addr_end && cpu_addr < end;
+    }
+    /// Returns true when other_start..other_end lies completely within CpuAddr()..CpuAddrEnd()
+    [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
+        return other_start >= cpu_addr && cpu_addr_end >= other_end;
+    }
+    /// Byte distance from the start of the block to in_addr; the address is not range checked
+    [[nodiscard]] std::size_t Offset(VAddr in_addr) const {
+        return static_cast<std::size_t>(in_addr - cpu_addr);
+    }
+    /// First CPU address covered by the block
+    [[nodiscard]] VAddr CpuAddr() const {
+        return cpu_addr;
+    }
+    /// CPU address one past the last covered byte (CpuAddr() + Size())
+    [[nodiscard]] VAddr CpuAddrEnd() const {
+        return cpu_addr_end;
+    }
+    /// Size of the block in bytes, fixed at construction
+    [[nodiscard]] std::size_t Size() const {
+        return size;
+    }
+    /// Epoch value last stored with SetEpoch(); zero until then
+    [[nodiscard]] u64 Epoch() const {
+        return epoch;
+    }
+    /// Moves the block to new_addr; the end address follows, the size is unchanged
+    void SetCpuAddr(VAddr new_addr) {
+        cpu_addr = new_addr;
+        cpu_addr_end = cpu_addr + size;
+    }
+    /// Stores an epoch stamp for later retrieval through Epoch()
+    void SetEpoch(u64 new_epoch) {
+        epoch = new_epoch;
+    }
+protected:
+    /// Only derived classes construct blocks; size is initialised before SetCpuAddr() reads it
+    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
+        SetCpuAddr(cpu_addr_);
+    }
+private:
+    VAddr cpu_addr{};
+    VAddr cpu_addr_end{};
+    std::size_t size{};
+    u64 epoch{};
+};
+
+} // namespace VideoCommon
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -0,0 +1,594 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <queue>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <boost/container/small_vector.hpp>
+#include <boost/icl/interval_set.hpp>
+#include <boost/intrusive/set.hpp>
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "core/memory.h"
+#include "core/settings.h"
+#include "video_core/buffer_cache/buffer_block.h"
+#include "video_core/buffer_cache/map_interval.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+template <typename Buffer, typename BufferType, typename StreamBuffer>
+class BufferCache { // Caches guest memory in backend buffer blocks; backends derive from it
+    using IntervalSet = boost::icl::interval_set<VAddr>;
+    using IntervalType = typename IntervalSet::interval_type;
+    using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
+    // log2 page sizes: 2 KiB pages track written regions, 2 MiB pages index the buffer blocks
+    static constexpr u64 WRITE_PAGE_BIT = 11;
+    static constexpr u64 BLOCK_PAGE_BITS = 21;
+    static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
+
+public:
+    struct BufferInfo { // Describes where uploaded data can be found
+        BufferType handle; // Backend handle of the buffer holding the data
+        u64 offset; // Offset of the data inside that buffer
+        u64 address; // Value reported by the buffer's Address()
+    };
+    /// Makes the guest data at gpu_addr available in a backend buffer and returns its location
+    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
+                            bool is_written = false, bool use_fast_cbuf = false) {
+        std::lock_guard lock{mutex};
+
+        const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+        if (!cpu_addr) {
+            return GetEmptyBuffer(size); // The GPU address is not mapped
+        }
+
+        // Cache management is a big overhead, so small or fast-cbuf reads bypass the cache.
+        // TODO: Figure out which size is the best for given games.
+        constexpr std::size_t max_stream_size = 0x800;
+        if (use_fast_cbuf || size < max_stream_size) {
+            if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) {
+                const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size);
+                if (use_fast_cbuf) {
+                    u8* dest;
+                    if (is_granular) {
+                        dest = gpu_memory.GetPointer(gpu_addr);
+                    } else {
+                        staging_buffer.resize(size);
+                        dest = staging_buffer.data();
+                        gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
+                    }
+                    return ConstBufferUpload(dest, size);
+                }
+                if (is_granular) {
+                    u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
+                    return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
+                        std::memcpy(dest, host_ptr, size);
+                    });
+                } else {
+                    return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) {
+                        gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
+                    });
+                }
+            }
+        }
+        // Cached path: locate the backing block, then the map covering the requested range
+        Buffer* const block = GetBlock(*cpu_addr, size);
+        MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size);
+        if (!map) {
+            return GetEmptyBuffer(size);
+        }
+        if (is_written) {
+            map->MarkAsModified(true, GetModifiedTicks());
+            if (Settings::IsGPULevelHigh() &&
+                Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
+                MarkForAsyncFlush(map);
+            }
+            if (!map->is_written) {
+                map->is_written = true;
+                MarkRegionAsWritten(map->start, map->end - 1);
+            }
+        }
+
+        return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()};
+    }
+
+    /// Copies host memory into the stream buffer. Returns its handle and the data's offset.
+    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
+                                std::size_t alignment = 4) {
+        std::lock_guard lock{mutex};
+        return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
+            std::memcpy(dest, raw_pointer, size);
+        });
+    }
+
+    /// Prepares the buffer cache for data uploading by mapping the stream buffer
+    /// @param max_size Maximum number of bytes that will be uploaded
+    /// @note Nothing is returned; the mapped pointer and base offset are kept for later uploads
+    void Map(std::size_t max_size) {
+        std::lock_guard lock{mutex};
+
+        std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4);
+        buffer_offset = buffer_offset_base;
+    }
+
+    /// Finishes the upload stream, passing the bytes consumed since Map() to the stream buffer
+    void Unmap() {
+        std::lock_guard lock{mutex};
+        stream_buffer.Unmap(buffer_offset - buffer_offset_base);
+    }
+
+    /// Function called at the end of each frame, intended for deferred operations
+    void TickFrame() {
+        ++epoch;
+
+        while (!pending_destruction.empty()) {
+            // Delay at least 4 frames before destruction.
+            // This is due to triple buffering happening on some drivers.
+            static constexpr u64 epochs_to_destroy = 5;
+            if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
+                break;
+            }
+            pending_destruction.pop();
+        }
+    }
+
+    /// Write any cached resources overlapping the specified region back to memory
+    void FlushRegion(VAddr addr, std::size_t size) {
+        std::lock_guard lock{mutex};
+
+        VectorMapInterval objects = GetMapsInRange(addr, size);
+        std::sort(objects.begin(), objects.end(),
+                  [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
+        for (MapInterval* object : objects) {
+            if (object->is_modified && object->is_registered) {
+                mutex.unlock(); // NOTE(review): lock released while flushing - confirm safety
+                FlushMap(object);
+                mutex.lock();
+            }
+        }
+    }
+    /// Returns true when a registered, modified map overlaps the specified region
+    bool MustFlushRegion(VAddr addr, std::size_t size) {
+        std::lock_guard lock{mutex};
+
+        const VectorMapInterval objects = GetMapsInRange(addr, size);
+        return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
+            return map->is_modified && map->is_registered;
+        });
+    }
+
+    /// Mark the specified region as being invalidated, unregistering every map that overlaps it
+    void InvalidateRegion(VAddr addr, u64 size) {
+        std::lock_guard lock{mutex};
+
+        for (auto& object : GetMapsInRange(addr, size)) {
+            if (object->is_registered) {
+                Unregister(object);
+            }
+        }
+    }
+    /// Unmarks the tracked maps overlapping the region and queues them for SyncGuestHost()
+    void OnCPUWrite(VAddr addr, std::size_t size) {
+        std::lock_guard lock{mutex};
+
+        for (MapInterval* object : GetMapsInRange(addr, size)) {
+            if (object->is_memory_marked && object->is_registered) {
+                UnmarkMemory(object);
+                object->is_sync_pending = true;
+                marked_for_unregister.emplace_back(object);
+            }
+        }
+    }
+    /// Unregisters the maps queued by OnCPUWrite() that are still registered and empties the queue
+    void SyncGuestHost() {
+        std::lock_guard lock{mutex};
+
+        for (auto& object : marked_for_unregister) {
+            if (object->is_registered) {
+                object->is_sync_pending = false;
+                Unregister(object);
+            }
+        }
+        marked_for_unregister.clear();
+    }
+    /// Moves the registered, modified maps marked for async flush into a new committed entry
+    void CommitAsyncFlushes() {
+        if (uncommitted_flushes) {
+            auto commit_list = std::make_shared<std::list<MapInterval*>>();
+            for (MapInterval* map : *uncommitted_flushes) {
+                if (map->is_registered && map->is_modified) {
+                    // TODO(Blinkhawk): Implement backend asynchronous flushing
+                    // AsyncFlushMap(map)
+                    commit_list->push_back(map);
+                }
+            }
+            if (!commit_list->empty()) {
+                committed_flushes.push_back(commit_list);
+            } else {
+                committed_flushes.emplace_back(); // Null entry: nothing to flush for this commit
+            }
+        } else {
+            committed_flushes.emplace_back(); // Null entry: nothing was marked since last commit
+        }
+        uncommitted_flushes.reset();
+    }
+    /// Returns true when the oldest committed entry holds maps that still have to be flushed
+    bool ShouldWaitAsyncFlushes() const {
+        return !committed_flushes.empty() && committed_flushes.front() != nullptr;
+    }
+    /// Returns true when maps were marked for async flush since the last CommitAsyncFlushes()
+    bool HasUncommittedFlushes() const {
+        return uncommitted_flushes != nullptr;
+    }
+    /// Flushes the still registered maps of the oldest committed entry and removes that entry
+    void PopAsyncFlushes() {
+        if (committed_flushes.empty()) {
+            return;
+        }
+        auto& flush_list = committed_flushes.front();
+        if (!flush_list) {
+            committed_flushes.pop_front();
+            return;
+        }
+        for (MapInterval* map : *flush_list) {
+            if (map->is_registered) {
+                // TODO(Blinkhawk): Replace this for reading the asynchronous flush
+                FlushMap(map);
+            }
+        }
+        committed_flushes.pop_front();
+    }
+    /// Backend hook: buffer returned when the address is unmapped or a map cannot be registered
+    virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0;
+
+protected:
+    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
+                         Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
+                         StreamBuffer& stream_buffer_)
+        : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
+          stream_buffer{stream_buffer_} {}
+
+    ~BufferCache() = default;
+    /// Backend hook: creates the buffer backing `size` bytes of CPU memory starting at cpu_addr
+    virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
+    /// Backend hook used for use_fast_cbuf uploads; this default returns an empty BufferInfo
+    virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
+        return {};
+    }
+
+    /// Register a copy of new_map into the cache; returns nullptr when its start address is 0
+    MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
+        const VAddr cpu_addr = new_map.start;
+        if (!cpu_addr) {
+            LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
+                         new_map.gpu_addr);
+            return nullptr;
+        }
+        const std::size_t size = new_map.end - new_map.start;
+        new_map.is_registered = true;
+        rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
+        new_map.is_memory_marked = true;
+        if (inherit_written) {
+            MarkRegionAsWritten(new_map.start, new_map.end - 1);
+            new_map.is_written = true;
+        }
+        MapInterval* const storage = mapped_addresses_allocator.Allocate();
+        *storage = new_map;
+        mapped_addresses.insert(*storage);
+        return storage;
+    }
+    /// Undoes the cached pages count of a map that is still memory marked; no-op otherwise
+    void UnmarkMemory(MapInterval* map) {
+        if (!map->is_memory_marked) {
+            return;
+        }
+        const std::size_t size = map->end - map->start;
+        rasterizer.UpdatePagesCachedCount(map->start, size, -1);
+        map->is_memory_marked = false;
+    }
+
+    /// Unregisters an object from the cache and releases its storage back to the allocator
+    void Unregister(MapInterval* map) {
+        UnmarkMemory(map);
+        map->is_registered = false;
+        if (map->is_sync_pending) {
+            map->is_sync_pending = false;
+            marked_for_unregister.remove(map);
+        }
+        if (map->is_written) {
+            UnmarkRegionAsWritten(map->start, map->end - 1);
+        }
+        const auto it = mapped_addresses.find(*map);
+        ASSERT(it != mapped_addresses.end());
+        mapped_addresses.erase(it);
+        mapped_addresses_allocator.Release(map);
+    }
+
+private:
+    MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) {
+        const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
+        if (overlaps.empty()) { // Nothing cached in this range yet: upload it and register a map
+            const VAddr cpu_addr_end = cpu_addr + size;
+            if (gpu_memory.IsGranularRange(gpu_addr, size)) {
+                u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
+                block->Upload(block->Offset(cpu_addr), size, host_ptr);
+            } else {
+                staging_buffer.resize(size);
+                gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
+                block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
+            }
+            return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
+        }
+        // A single overlap that already contains the request can be reused as is
+        const VAddr cpu_addr_end = cpu_addr + size;
+        if (overlaps.size() == 1) {
+            MapInterval* const current_map = overlaps[0];
+            if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
+                return current_map;
+            }
+        }
+        VAddr new_start = cpu_addr;
+        VAddr new_end = cpu_addr_end;
+        bool write_inheritance = false;
+        bool modified_inheritance = false;
+        // Calculate new buffer parameters: the union of all ranges and the inherited flags
+        for (MapInterval* overlap : overlaps) {
+            new_start = std::min(overlap->start, new_start);
+            new_end = std::max(overlap->end, new_end);
+            write_inheritance |= overlap->is_written;
+            modified_inheritance |= overlap->is_modified;
+        }
+        GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr;
+        for (auto& overlap : overlaps) {
+            Unregister(overlap);
+        }
+        UpdateBlock(block, new_start, new_end, overlaps); // NOTE(review): maps were released above
+
+        const MapInterval new_map{new_start, new_end, new_gpu_addr};
+        MapInterval* const map = Register(new_map, write_inheritance);
+        if (!map) {
+            return nullptr;
+        }
+        if (modified_inheritance) {
+            map->MarkAsModified(true, GetModifiedTicks());
+            if (Settings::IsGPULevelHigh() &&
+                Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
+                MarkForAsyncFlush(map);
+            }
+        }
+        return map;
+    }
+    /// Uploads from CPU memory the parts of [start, end) that no map in `overlaps` covers
+    void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) {
+        const IntervalType base_interval{start, end};
+        IntervalSet interval_set{};
+        interval_set.add(base_interval);
+        for (auto& overlap : overlaps) {
+            const IntervalType subtract{overlap->start, overlap->end};
+            interval_set.subtract(subtract);
+        }
+        for (auto& interval : interval_set) {
+            const std::size_t size = interval.upper() - interval.lower();
+            if (size == 0) {
+                continue;
+            }
+            staging_buffer.resize(size);
+            cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
+            block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
+        }
+    }
+    /// Returns the maps in mapped_addresses overlapping [addr, addr + size); none when size is 0
+    VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) {
+        VectorMapInterval result;
+        if (size == 0) {
+            return result;
+        }
+
+        const VAddr addr_end = addr + size;
+        auto it = mapped_addresses.lower_bound(addr);
+        if (it != mapped_addresses.begin()) {
+            --it; // The preceding map may begin before addr and still reach into the range
+        }
+        while (it != mapped_addresses.end() && it->start < addr_end) {
+            if (it->Overlaps(addr, addr_end)) {
+                result.push_back(&*it);
+            }
+            ++it;
+        }
+        return result;
+    }
+
+    /// Returns a ticks counter used for tracking when cached objects were last modified
+    u64 GetModifiedTicks() {
+        return ++modified_ticks;
+    }
+    /// Downloads the map's range from its block, writes it to CPU memory and marks it unmodified
+    void FlushMap(MapInterval* map) {
+        const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
+        ASSERT_OR_EXECUTE(it != blocks.end(), return;);
+
+        std::shared_ptr<Buffer> block = it->second;
+
+        const std::size_t size = map->end - map->start;
+        staging_buffer.resize(size);
+        block->Download(block->Offset(map->start), size, staging_buffer.data());
+        cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size);
+        map->MarkAsModified(false, 0);
+    }
+    /// Aligns the stream cursor, lets `callable` write `size` bytes there and advances the cursor
+    template <typename Callable>
+    BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) {
+        AlignBuffer(alignment);
+        const std::size_t uploaded_offset = buffer_offset;
+        callable(buffer_ptr);
+
+        buffer_ptr += size;
+        buffer_offset += size;
+        return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
+    }
+    /// Rounds buffer_offset up with Common::AlignUp and moves buffer_ptr by the same amount
+    void AlignBuffer(std::size_t alignment) {
+        // Align the offset, not the mapped pointer
+        const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
+        buffer_ptr += offset_aligned - buffer_offset;
+        buffer_offset = offset_aligned;
+    }
+    /// Replaces `buffer` by a block BLOCK_PAGE_SIZE bytes larger with the same start and contents
+    std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
+        const std::size_t old_size = buffer->Size();
+        const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
+        const VAddr cpu_addr = buffer->CpuAddr();
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
+        new_buffer->CopyFrom(*buffer, 0, 0, old_size);
+        QueueDestruction(std::move(buffer));
+
+        const VAddr cpu_addr_end = cpu_addr + new_size - 1;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
+        }
+
+        return new_buffer;
+    }
+    /// Replaces both blocks by one new block that holds the contents of each at its CPU offset
+    std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
+                                        std::shared_ptr<Buffer> second) {
+        const std::size_t size_1 = first->Size();
+        const std::size_t size_2 = second->Size();
+        const VAddr first_addr = first->CpuAddr();
+        const VAddr second_addr = second->CpuAddr();
+        const VAddr new_addr = std::min(first_addr, second_addr);
+        const std::size_t new_size = size_1 + size_2;
+
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
+        new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
+        new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
+        QueueDestruction(std::move(first));
+        QueueDestruction(std::move(second));
+
+        const VAddr cpu_addr_end = new_addr + new_size - 1;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
+        }
+        return new_buffer;
+    }
+    /// Returns one block covering every block page of the range, creating or combining as needed
+    Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
+        std::shared_ptr<Buffer> found;
+
+        const VAddr cpu_addr_end = cpu_addr + size - 1;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            auto it = blocks.find(page_start);
+            if (it == blocks.end()) {
+                if (found) {
+                    found = EnlargeBlock(found);
+                    continue;
+                }
+                const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
+                found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
+                blocks.insert_or_assign(page_start, found);
+                continue;
+            }
+            if (!found) {
+                found = it->second;
+                continue;
+            }
+            if (found != it->second) {
+                found = MergeBlocks(std::move(found), it->second);
+            }
+        }
+        return found.get();
+    }
+    /// Adds one write reference to every write page touched by [start, end]; end is inclusive
+    void MarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
+            if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) {
+                ++it->second;
+            }
+        }
+    }
+    /// Drops one write reference from every write page in [start, end], erasing exhausted pages
+    void UnmarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
+            auto it = written_pages.find(page_start);
+            if (it != written_pages.end()) {
+                if (it->second > 1) {
+                    --it->second;
+                } else {
+                    written_pages.erase(it);
+                }
+            }
+        }
+    }
+    /// Returns true when any write page in [start, end] is currently referenced
+    bool IsRegionWritten(VAddr start, VAddr end) const {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
+            if (written_pages.contains(page_start)) {
+                return true;
+            }
+        }
+        return false;
+    }
+    /// Stamps the buffer with the current epoch and keeps it alive until TickFrame() releases it
+    void QueueDestruction(std::shared_ptr<Buffer> buffer) {
+        buffer->SetEpoch(epoch);
+        pending_destruction.push(std::move(buffer));
+    }
+    /// Adds the map to the set picked up by the next CommitAsyncFlushes()
+    void MarkForAsyncFlush(MapInterval* map) {
+        if (!uncommitted_flushes) {
+            uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
+        }
+        uncommitted_flushes->insert(map);
+    }
+
+    VideoCore::RasterizerInterface& rasterizer;
+    Tegra::MemoryManager& gpu_memory;
+    Core::Memory::Memory& cpu_memory;
+    StreamBuffer& stream_buffer;
+    // Stream upload cursor: mapped pointer, current offset and offset at the time of Map()
+    u8* buffer_ptr = nullptr;
+    u64 buffer_offset = 0;
+    u64 buffer_offset_base = 0;
+    // Allocator and intrusive set holding the registered maps
+    MapIntervalAllocator mapped_addresses_allocator;
+    boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
+        mapped_addresses;
+    // Reference counts per write page and owning block per block page
+    std::unordered_map<u64, u32> written_pages;
+    std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
+    // Replaced blocks waiting for destruction, and the frame/modification counters
+    std::queue<std::shared_ptr<Buffer>> pending_destruction;
+    u64 epoch = 0;
+    u64 modified_ticks = 0;
+    // Scratch memory reused for transfers that need an intermediate copy
+    std::vector<u8> staging_buffer;
+    // Maps hit by OnCPUWrite() that SyncGuestHost() still has to unregister
+    std::list<MapInterval*> marked_for_unregister;
+    // Async flush state: maps marked since the last commit and the queue of committed lists
+    std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
+    std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
+
+    std::recursive_mutex mutex;
+};
+
+} // namespace VideoCommon
--- a/src/video_core/buffer_cache/map_interval.cpp
+++ b/src/video_core/buffer_cache/map_interval.cpp
@@ -0,0 +1,33 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <memory>
+
+#include "video_core/buffer_cache/map_interval.h"
+
+namespace VideoCommon {
+
+MapIntervalAllocator::MapIntervalAllocator() { // Seeds the free list with every slot of the embedded first_chunk.
+    FillFreeList(first_chunk);
+}
+
+MapIntervalAllocator::~MapIntervalAllocator() = default; // Later chunks are owned through the unique_ptr `next` links.
+
+/// Appends a freshly allocated chunk to the end of the chunk chain, publishes its slots to the
+/// free list and moves new_chunk on to the `next` link of the chunk that was just added.
+void MapIntervalAllocator::AllocateNewChunk() {
+    std::unique_ptr<Chunk>& tail_link = *new_chunk;
+    tail_link = std::make_unique<Chunk>();
+    Chunk& fresh = *tail_link;
+    FillFreeList(fresh);
+    new_chunk = &fresh.next;
+}
+
+/// Pushes the address of every slot in the chunk onto the free list. Slots are appended from
+/// last to first, so Allocate(), which pops from the back, hands out the chunk's first slot
+/// before the others.
+void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
+    free_list.reserve(free_list.size() + chunk.data.size());
+    for (auto slot = chunk.data.rbegin(); slot != chunk.data.rend(); ++slot) {
+        free_list.push_back(&*slot);
+    }
+}
+
+} // namespace VideoCommon
--- a/src/video_core/buffer_cache/map_interval.h
+++ b/src/video_core/buffer_cache/map_interval.h
@@ -0,0 +1,93 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include <boost/intrusive/set_hook.hpp>
+
+#include "common/common_types.h"
+#include "video_core/gpu.h"
+
+namespace VideoCommon {
+
+/// Describes one mapped address range together with its bookkeeping flags. The type carries
+/// both an inherited Boost.Intrusive set base hook and a set member hook.
+struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
+    MapInterval() = default;
+
+    /// Deliberately implicit: lets a bare start address convert to an interval.
+    /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
+
+    explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
+        : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
+
+    /// True when the other range does not begin before `start` and does not finish after `end`.
+    bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
+        const bool begins_before = other_start < start;
+        const bool finishes_after = end < other_end;
+        return !(begins_before || finishes_after);
+    }
+
+    /// True when the two ranges share at least one address; ranges that merely touch at a
+    /// boundary do not count.
+    bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
+        const bool entirely_before = other_end <= start;
+        const bool entirely_after = end <= other_start;
+        return !(entirely_before || entirely_after);
+    }
+
+    /// Updates the modified flag and the tick value stored with it.
+    void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
+        ticks = ticks_;
+        is_modified = is_modified_;
+    }
+
+    boost::intrusive::set_member_hook<> member_hook_;
+    VAddr start{0};
+    VAddr end{0};
+    GPUVAddr gpu_addr{0};
+    u64 ticks{0};
+    bool is_written{false};
+    bool is_modified{false};
+    bool is_registered{false};
+    bool is_memory_marked{false};
+    bool is_sync_pending{false};
+};
+
+/// Orders intervals by their start address only; end and GPU address are ignored.
+struct MapIntervalCompare {
+    constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
+        return rhs.start > lhs.start;
+    }
+};
+
+/// Pool of MapInterval objects stored in fixed-size chunks and recycled through a free list.
+class MapIntervalAllocator {
+public:
+    MapIntervalAllocator();
+    ~MapIntervalAllocator();
+
+    /// Takes a slot from the back of the free list, first growing the pool by one chunk if the
+    /// list is empty. The slot is returned as-is; nothing here resets its fields.
+    MapInterval* Allocate() {
+        if (free_list.empty()) {
+            AllocateNewChunk();
+        }
+        auto* const slot = free_list.back();
+        free_list.pop_back();
+        return slot;
+    }
+
+    /// Puts a slot back on the free list so that a later Allocate() can return it again.
+    void Release(MapInterval* interval) {
+        free_list.emplace_back(interval);
+    }
+
+private:
+    /// Number of MapInterval slots held by each chunk.
+    static constexpr std::size_t CHUNK_CAPACITY = 0x8000;
+
+    /// One link of the singly linked chunk chain.
+    struct Chunk {
+        std::unique_ptr<Chunk> next;
+        std::array<MapInterval, CHUNK_CAPACITY> data;
+    };
+
+    /// Adds one more chunk to the chain and exposes its slots through the free list.
+    void AllocateNewChunk();
+
+    /// Appends the address of every slot of the chunk to the free list.
+    void FillFreeList(Chunk& chunk);
+
+    /// Slots that are currently available.
+    std::vector<MapInterval*> free_list;
+
+    /// First chunk, stored inline in the allocator.
+    Chunk first_chunk;
+
+    /// Link that will receive the next chunk created by AllocateNewChunk.
+    std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
+};
+
+} // namespace VideoCommon
--- a/src/video_core/cdma_pusher.cpp
+++ b/src/video_core/cdma_pusher.cpp
@@ -0,0 +1,171 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#include "command_classes/host1x.h"
+#include "command_classes/nvdec.h"
+#include "command_classes/vic.h"
+#include "common/bit_util.h"
+#include "video_core/cdma_pusher.h"
+#include "video_core/command_classes/nvdec_common.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra {
+CDmaPusher::CDmaPusher(GPU& gpu_)
+    : gpu{gpu_}, nvdec_processor(std::make_shared<Nvdec>(gpu)),
+      vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)),
+      host1x_processor(std::make_unique<Host1x>(gpu)),
+      nvdec_sync(std::make_unique<SyncptIncrManager>(gpu)),
+      vic_sync(std::make_unique<SyncptIncrManager>(gpu)) {}
+
+CDmaPusher::~CDmaPusher() = default;
+
+/// Moves the given command list to the back of the pending queue.
+void CDmaPusher::Push(ChCommandHeaderList&& entries) {
+    cdma_queue.emplace(std::move(entries));
+}
+
+/// Runs Step() until the pending queue has been drained.
+void CDmaPusher::DispatchCalls() {
+    if (cdma_queue.empty()) {
+        return;
+    }
+    do {
+        Step();
+    } while (!cdma_queue.empty());
+}
+
+/// Pops the oldest command list from the queue and decodes it word by word.
+///
+/// Decoding is driven by mask/count/offset/incrementing, which persist across calls:
+///  - while mask is non-zero, the word is a payload written to offset + (lowest set mask bit);
+///  - otherwise, while count is non-zero, the word is a payload written to offset, which
+///    advances after each word in incrementing mode;
+///  - otherwise the word is a header whose top four bits select the submission mode.
+/// Does nothing when the queue is empty.
+void CDmaPusher::Step() {
+    if (cdma_queue.empty()) {
+        return;
+    }
+
+    // Take ownership of the front list instead of copying it (and then copying it again into a
+    // scratch vector): it is popped right away and every word is read exactly once.
+    const ChCommandHeaderList entries = std::move(cdma_queue.front());
+    cdma_queue.pop();
+
+    for (const ChCommandHeader& entry : entries) {
+        const u32 value = entry.raw;
+        if (mask != 0) {
+            const u32 lbs = Common::CountTrailingZeroes32(mask);
+            mask &= ~(1U << lbs);
+            ExecuteCommand(static_cast<u32>(offset + lbs), value);
+            continue;
+        } else if (count != 0) {
+            --count;
+            ExecuteCommand(static_cast<u32>(offset), value);
+            if (incrementing) {
+                ++offset;
+            }
+            continue;
+        }
+        const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf);
+        switch (mode) {
+        case ChSubmissionMode::SetClass: {
+            mask = value & 0x3f;
+            offset = (value >> 16) & 0xfff;
+            current_class = static_cast<ChClassId>((value >> 6) & 0x3ff);
+            break;
+        }
+        case ChSubmissionMode::Incrementing:
+        case ChSubmissionMode::NonIncrementing:
+            count = value & 0xffff;
+            offset = (value >> 16) & 0xfff;
+            incrementing = mode == ChSubmissionMode::Incrementing;
+            break;
+        case ChSubmissionMode::Mask:
+            mask = value & 0xffff;
+            offset = (value >> 16) & 0xfff;
+            break;
+        case ChSubmissionMode::Immediate: {
+            // The payload is carried by the header itself.
+            const u32 data = value & 0xfff;
+            offset = (value >> 16) & 0xfff;
+            ExecuteCommand(static_cast<u32>(offset), data);
+            break;
+        }
+        default:
+            UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode));
+            break;
+        }
+    }
+}
+
+/// Routes one register write to the device selected by current_class.
+///
+/// For NVDEC and VIC the value is first stored in the matching THI register block; a write to
+/// the syncpoint-increment register then drives the class' syncpoint manager, and a write to
+/// method_1 forwards the value to the class processor using the method held in method_0.
+/// Host1x writes go straight to the Host1x processor.
+void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) {
+    const auto thi_method = static_cast<ThiMethod>(state_offset);
+
+    switch (current_class) {
+    case ChClassId::NvDec: {
+        ThiStateWrite(nvdec_thi_state, state_offset, {data});
+        if (thi_method == ThiMethod::IncSyncpt) {
+            LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method");
+            const u32 syncpoint_id = data & 0xFF;
+            const u32 cond = (data >> 8) & 0xFF;
+            if (cond != 0) {
+                nvdec_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id);
+                nvdec_sync->SignalDone(syncpoint_id);
+            } else {
+                nvdec_sync->Increment(syncpoint_id);
+            }
+        } else if (thi_method == ThiMethod::SetMethod1) {
+            LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}",
+                      static_cast<u32>(nvdec_thi_state.method_0));
+            nvdec_processor->ProcessMethod(static_cast<Nvdec::Method>(nvdec_thi_state.method_0),
+                                           {data});
+        }
+        break;
+    }
+    case ChClassId::GraphicsVic: {
+        ThiStateWrite(vic_thi_state, state_offset, {data});
+        if (thi_method == ThiMethod::IncSyncpt) {
+            LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method");
+            const u32 syncpoint_id = data & 0xFF;
+            const u32 cond = (data >> 8) & 0xFF;
+            if (cond != 0) {
+                vic_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id);
+                vic_sync->SignalDone(syncpoint_id);
+            } else {
+                vic_sync->Increment(syncpoint_id);
+            }
+        } else if (thi_method == ThiMethod::SetMethod1) {
+            LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})",
+                      static_cast<u32>(vic_thi_state.method_0), data);
+            vic_processor->ProcessMethod(static_cast<Vic::Method>(vic_thi_state.method_0), {data});
+        }
+        break;
+    }
+    case ChClassId::Host1x:
+        // This device is mainly for syncpoint synchronization
+        LOG_DEBUG(Service_NVDRV, "Host1X Class Method");
+        host1x_processor->ProcessMethod(static_cast<Host1x::Method>(state_offset), {data});
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class));
+        break;
+    }
+}
+
+/// Copies `arguments` into `state`, starting at the 32-bit word index `state_offset`.
+///
+/// The word index is taken from the guest command stream (a 12-bit offset, plus a bit index
+/// in mask mode), so it can point far past the end of ThiRegisters. Writes that do not fit
+/// entirely inside the register block are logged and dropped rather than being allowed to
+/// overwrite neighbouring memory.
+void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset,
+                               const std::vector<u32>& arguments) {
+    if (arguments.empty()) {
+        return;
+    }
+
+    const u64 byte_offset = static_cast<u64>(state_offset) * sizeof(u32);
+    const u64 byte_count = static_cast<u64>(arguments.size()) * sizeof(u32);
+    // Written as two comparisons so that the subtraction can never wrap.
+    if (byte_offset > sizeof(ThiRegisters) || byte_count > sizeof(ThiRegisters) - byte_offset) {
+        LOG_ERROR(Service_NVDRV, "THI register write out of range, offset=0x{:X} words={}",
+                  state_offset, arguments.size());
+        return;
+    }
+
+    u8* const destination = reinterpret_cast<u8*>(&state) + byte_offset;
+    std::memcpy(destination, arguments.data(), byte_count);
+}
+
+} // namespace Tegra
--- a/src/video_core/cdma_pusher.h
+++ b/src/video_core/cdma_pusher.h
@@ -0,0 +1,138 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include <queue>
+
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "video_core/command_classes/sync_manager.h"
+
+namespace Tegra {
+
+class GPU;
+class Nvdec;
+class Vic;
+class Host1x;
+
+/// Submission mode stored in the top four bits of a command header word. The notes describe
+/// how CDmaPusher::Step treats each mode.
+enum class ChSubmissionMode : u32 {
+    /// Selects the class (bits 6-15), the offset (bits 16-27) and a 6-bit payload mask.
+    SetClass = 0x0,
+    /// `count` payload words (bits 0-15) follow; the offset advances after each one.
+    Incrementing = 0x1,
+    /// `count` payload words (bits 0-15) follow; all of them go to the same offset.
+    NonIncrementing = 0x2,
+    /// 16-bit mask; one payload word follows per set bit, written to offset + bit index.
+    Mask = 0x3,
+    /// The low 12 bits of the header are written to the offset directly.
+    Immediate = 0x4,
+    /// Reaches Step's default case and is reported as unimplemented.
+    Restart = 0x5,
+    /// Reaches Step's default case and is reported as unimplemented.
+    Gather = 0x6,
+};
+
+/// Class identifiers carried by SetClass headers, listed in ascending numeric order.
+/// CDmaPusher::ExecuteCommand only handles Host1x, GraphicsVic and NvDec.
+enum class ChClassId : u32 {
+    NoClass = 0x00,
+    Host1x = 0x01,
+    VideoEncodeMpeg = 0x20,
+    VideoEncodeNvEnc = 0x21,
+    VideoStreamingVi = 0x30,
+    VideoStreamingIsp = 0x32,
+    VideoStreamingIspB = 0x34,
+    VideoStreamingViI2c = 0x36,
+    GraphicsVic = 0x5d,
+    Graphics3D = 0x60,
+    GraphicsGpu = 0x61,
+    NvJpg = 0xc0,
+    Tsec = 0xe0,
+    TsecB = 0xe1,
+    NvDec = 0xf0,
+};
+
+/// Method offsets used as the type of ChCommandHeader::method_offset.
+enum class ChMethod : u32 {
+    Empty = 0x00,
+    SetMethod = 0x10,
+    SetData = 0x11,
+};
+
+union ChCommandHeader { // One 32-bit command word, readable whole or through its bit fields.
+    u32 raw; // The entire word; CDmaPusher::Step decodes this with shifts and masks.
+    BitField<0, 16, u32> value; // Bits 0-15.
+    BitField<16, 12, ChMethod> method_offset; // Bits 16-27.
+    BitField<28, 4, ChSubmissionMode> submission_mode; // Bits 28-31.
+};
+static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size"); // Lists of headers are copied as raw u32 words.
+
+/// A command addressed to a class: the class, a method offset and the argument words.
+struct ChCommand {
+    ChClassId class_id = ChClassId::NoClass;
+    int method_offset = 0;
+    std::vector<u32> arguments;
+};
+
+/// Raw command words as queued on the pusher.
+using ChCommandHeaderList = std::vector<ChCommandHeader>;
+/// Sequence of ChCommand entries.
+using ChCommandList = std::vector<ChCommand>;
+
+struct ThiRegisters { // Register block written by CDmaPusher::ThiStateWrite. Word indices below assume INSERT_PADDING_WORDS(n) reserves n 32-bit words - confirm.
+    u32_le increment_syncpt{}; // Word 0x00 (ThiMethod::IncSyncpt).
+    INSERT_PADDING_WORDS(1);
+    u32_le increment_syncpt_error{}; // Word 0x02.
+    u32_le ctx_switch_incremement_syncpt{}; // Word 0x03. The name is misspelled but is part of the public layout.
+    INSERT_PADDING_WORDS(4);
+    u32_le ctx_switch{}; // Word 0x08.
+    INSERT_PADDING_WORDS(1);
+    u32_le ctx_syncpt_eof{}; // Word 0x0A.
+    INSERT_PADDING_WORDS(5);
+    u32_le method_0{}; // Word 0x10 (ThiMethod::SetMethod0); read back as the method id when method_1 is written.
+    u32_le method_1{}; // Word 0x11 (ThiMethod::SetMethod1); writing it triggers ProcessMethod in ExecuteCommand.
+    INSERT_PADDING_WORDS(12);
+    u32_le int_status{}; // Word 0x1E.
+    u32_le int_mask{}; // Word 0x1F.
+};
+
+enum class ThiMethod : u32 { // Word indices of the ThiRegisters members that ExecuteCommand reacts to.
+    IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32), // First member, so index 0.
+    SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32),
+    SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32), // The word directly after method_0.
+};
+
+/// Decodes queued command lists and forwards the resulting register writes to the NVDEC, VIC
+/// and Host1x command classes.
+class CDmaPusher {
+public:
+    explicit CDmaPusher(GPU& gpu_);
+    ~CDmaPusher();
+
+    /// Appends a command list to the pending queue
+    void Push(ChCommandHeaderList&& entries);
+
+    /// Processes every command list that is currently queued
+    void DispatchCalls();
+
+    /// Processes the command list at the front of the queue
+    void Step();
+
+    /// Hands a register write to the device selected by the current class
+    void ExecuteCommand(u32 state_offset, u32 data);
+
+private:
+    /// Stores the argument words in a ThiRegisters block, starting at the given word offset
+    void ThiStateWrite(ThiRegisters& state, u32 state_offset, const std::vector<u32>& arguments);
+
+    GPU& gpu;
+
+    std::shared_ptr<Nvdec> nvdec_processor;
+    std::unique_ptr<Vic> vic_processor;
+    std::unique_ptr<Host1x> host1x_processor;
+    std::unique_ptr<SyncptIncrManager> nvdec_sync;
+    std::unique_ptr<SyncptIncrManager> vic_sync;
+    ChClassId current_class = ChClassId::NoClass;
+    ThiRegisters vic_thi_state{};
+    ThiRegisters nvdec_thi_state{};
+
+    // Decoder state that Step() carries from one command word to the next
+    s32 count = 0;
+    s32 offset = 0;
+    s32 mask = 0;
+    bool incrementing = false;
+
+    // Command lists waiting to be processed
+    std::queue<ChCommandHeaderList> cdma_queue;
+};
+
+} // namespace Tegra
--- a/src/video_core/command_classes/codecs/codec.cpp
+++ b/src/video_core/command_classes/codecs/codec.cpp
@@ -0,0 +1,129 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+#include <fstream>
+#include <vector>
+#include "common/assert.h"
+#include "video_core/command_classes/codecs/codec.h"
+#include "video_core/command_classes/codecs/h264.h"
+#include "video_core/command_classes/codecs/vp9.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+extern "C" {
+#include <libavutil/opt.h>
+}
+
+namespace Tegra {
+
+/// Deleter used by AVFramePtr: releases the data referenced by the frame and the frame itself.
+void AVFrameDeleter(AVFrame* ptr) {
+    // av_frame_free() is the documented counterpart of av_frame_alloc(): it unreferences the
+    // frame and frees the structure in a single call.
+    av_frame_free(&ptr);
+}
+
+Codec::Codec(GPU& gpu_)
+    : gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)),
+      vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {}
+
+/// Drains the decoder if it was opened and releases the libavcodec context.
+Codec::~Codec() {
+    if (av_codec_ctx == nullptr) {
+        return;
+    }
+
+    if (initialized) {
+        // A null packet puts the decoder into draining mode; pull one frame and flush.
+        avcodec_send_packet(av_codec_ctx, nullptr);
+        if (AVFrame* drained = av_frame_alloc(); drained != nullptr) {
+            avcodec_receive_frame(av_codec_ctx, drained);
+            av_frame_free(&drained);
+        }
+        avcodec_flush_buffers(av_codec_ctx);
+    }
+
+    // avcodec_close() on its own leaves the context allocated. avcodec_free_context() closes
+    // the codec, frees the context and resets the pointer; it also covers a context that was
+    // allocated but never opened successfully.
+    avcodec_free_context(&av_codec_ctx);
+}
+
+/// Remembers which codec later Decode() calls should use and logs the selection.
+void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
+    current_codec = codec;
+    LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", codec);
+}
+
+/// Stores `arguments` in the NVDEC register state at the 64-bit slot index `offset`.
+///
+/// The index is not validated by this function's signature and the destination is a plain
+/// struct, so an index beyond the struct would overwrite unrelated memory. Such writes are
+/// logged and ignored.
+void Codec::StateWrite(u32 offset, u64 arguments) {
+    constexpr u64 slot_count = sizeof(NvdecCommon::NvdecRegisters) / sizeof(u64);
+    if (offset >= slot_count) {
+        LOG_ERROR(Service_NVDRV, "NVDEC register write out of range, offset=0x{:X}", offset);
+        return;
+    }
+
+    u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64);
+    std::memcpy(state_offset, &arguments, sizeof(u64));
+}
+
+/// Decodes the frame described by the current NVDEC register state.
+///
+/// On first use the libavcodec decoder matching current_codec is created and opened; if any
+/// step of that fails the call returns and the next call tries again. The data composed by
+/// the H264/VP9 helper is then submitted as one packet and, unless the VP9 helper reports a
+/// hidden frame, one decoded frame is fetched and appended to av_frames. Nothing is queued
+/// when the decoder does not return a frame.
+void Codec::Decode() {
+    bool is_first_frame = false;
+
+    if (!initialized) {
+        if (current_codec == NvdecCommon::VideoCodec::H264) {
+            av_codec = avcodec_find_decoder(AV_CODEC_ID_H264);
+        } else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
+            av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9);
+        } else {
+            LOG_ERROR(Service_NVDRV, "Unknown video codec {}", current_codec);
+            return;
+        }
+        if (av_codec == nullptr) {
+            LOG_ERROR(Service_NVDRV, "avcodec_find_decoder() Failed.");
+            return;
+        }
+
+        av_codec_ctx = avcodec_alloc_context3(av_codec);
+        if (av_codec_ctx == nullptr) {
+            LOG_ERROR(Service_NVDRV, "avcodec_alloc_context3() Failed.");
+            return;
+        }
+        av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
+
+        // TODO(ameerj): libavcodec gpu hw acceleration
+
+        const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);
+        if (av_error < 0) {
+            LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");
+            // avcodec_close() would leave the context allocated, and the retry on the next
+            // call would overwrite the pointer. Free it and reset the pointer instead.
+            avcodec_free_context(&av_codec_ctx);
+            return;
+        }
+        initialized = true;
+        is_first_frame = true;
+    }
+    bool vp9_hidden_frame = false;
+
+    AVPacket packet{};
+    av_init_packet(&packet);
+    std::vector<u8> frame_data;
+
+    if (current_codec == NvdecCommon::VideoCodec::H264) {
+        frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame);
+    } else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
+        frame_data = vp9_decoder->ComposeFrameHeader(state);
+        vp9_hidden_frame = vp9_decoder->WasFrameHidden();
+    }
+
+    packet.data = frame_data.data();
+    packet.size = static_cast<int>(frame_data.size());
+
+    // A rejected packet is only reported; the receive below is still attempted so that any
+    // output the decoder is holding can be collected.
+    if (const int send_error = avcodec_send_packet(av_codec_ctx, &packet); send_error < 0) {
+        LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", send_error);
+    }
+
+    if (vp9_hidden_frame) {
+        // Only receive/store visible frames
+        return;
+    }
+
+    AVFramePtr frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter};
+    if (!frame) {
+        LOG_ERROR(Service_NVDRV, "av_frame_alloc() Failed.");
+        return;
+    }
+    if (const int receive_error = avcodec_receive_frame(av_codec_ctx, frame.get());
+        receive_error < 0) {
+        // No picture was produced, so do not queue an empty frame. GetCurrentFrame() already
+        // returns null when the queue is empty.
+        LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame error {}", receive_error);
+        return;
+    }
+    av_frames.push(std::move(frame));
+    // Limit queue to 10 frames. Workaround for ZLA decode and queue spam
+    if (av_frames.size() > 10) {
+        av_frames.pop();
+    }
+}
+
+/// Removes and returns the oldest decoded frame, or a null AVFramePtr when none is queued.
+AVFramePtr Codec::GetCurrentFrame() {
+    AVFramePtr next{nullptr, AVFrameDeleter};
+    // Sometimes VIC will request more frames than have been decoded.
+    // In that case the null pointer is returned so previous frame data is not overwritten.
+    if (!av_frames.empty()) {
+        next = std::move(av_frames.front());
+        av_frames.pop();
+    }
+    return next;
+}
+
+NvdecCommon::VideoCodec Codec::GetCurrentCodec() const { // Returns the codec stored by SetTargetCodec (VideoCodec::None until it is called).
+    return current_codec;
+}
+
+} // namespace Tegra
--- a/src/video_core/command_classes/codecs/codec.h
+++ b/src/video_core/command_classes/codecs/codec.h
@@ -0,0 +1,70 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <queue>
+#include "common/common_types.h"
+#include "video_core/command_classes/nvdec_common.h"
+
+extern "C" {
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+#include <libavcodec/avcodec.h>
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+namespace Tegra {
+class GPU;
+struct VicRegisters;
+
+/// Releases an AVFrame; used as the deleter of AVFramePtr.
+void AVFrameDeleter(AVFrame* ptr);
+/// Owning AVFrame pointer whose deleter is a plain function pointer.
+using AVFramePtr = std::unique_ptr<AVFrame, void (*)(AVFrame*)>;
+
+namespace Decoder {
+class H264;
+class VP9;
+} // namespace Decoder
+
+/// Wraps a libavcodec decoder fed by the NVDEC register state and queues the decoded frames.
+class Codec {
+public:
+    explicit Codec(GPU& gpu_);
+    ~Codec();
+
+    /// Selects the codec used by subsequent Decode() calls
+    void SetTargetCodec(NvdecCommon::VideoCodec codec);
+
+    /// Writes the argument value into the NvdecRegisters state at the provided offset
+    void StateWrite(u32 offset, u64 arguments);
+
+    /// Builds the frame data through the codec helpers and decodes it with ffmpeg
+    void Decode();
+
+    /// Pops the next decoded frame; the result is null when no frame is queued
+    [[nodiscard]] AVFramePtr GetCurrentFrame();
+
+    /// Returns the codec selected through SetTargetCodec
+    [[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const;
+
+private:
+    bool initialized = false;
+    NvdecCommon::VideoCodec current_codec = NvdecCommon::VideoCodec::None;
+
+    AVCodec* av_codec = nullptr;
+    AVCodecContext* av_codec_ctx = nullptr;
+
+    GPU& gpu;
+    std::unique_ptr<Decoder::H264> h264_decoder;
+    std::unique_ptr<Decoder::VP9> vp9_decoder;
+
+    NvdecCommon::NvdecRegisters state{};
+    std::queue<AVFramePtr> av_frames;
+};
+
+} // namespace Tegra
--- a/src/video_core/command_classes/codecs/h264.cpp
+++ b/src/video_core/command_classes/codecs/h264.cpp
@@ -0,0 +1,293 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#include <array>
+#include "common/bit_util.h"
+#include "video_core/command_classes/codecs/h264.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra::Decoder {
+namespace {
+// ZigZag LUTs from libavcodec.
+constexpr std::array<u8, 64> zig_zag_direct{ // 64-entry table; WriteScalingList uses it whenever count != 16.
+    0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, 18, 11, 4,  5,  12, 19, 26, 33, 40, 48,
+    41, 34, 27, 20, 13, 6,  7,  14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23,
+    30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
+};
+
+constexpr std::array<u8, 16> zig_zag_scan{ // 16-entry table, each entry spelled as x + y * 4; WriteScalingList uses it when count == 16.
+    0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4,
+    1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4,
+};
+} // Anonymous namespace
+
+H264::H264(GPU& gpu_) : gpu(gpu_) {} // Keeps the GPU reference that ComposeFrameHeader uses to read guest memory.
+
+H264::~H264() = default;
+
+/// Builds the byte stream handed to the decoder for the current frame and returns a reference
+/// to the internal `frame` buffer, which is overwritten by the next call.
+///
+/// The picture info block is read from GPU memory at state.picture_info_offset. When this is
+/// not the first frame and the frame number decoded from the flags is non-zero, only the guest
+/// bitstream at state.frame_bitstream_offset is copied. Otherwise two header units are
+/// synthesized from the picture info (unit types 7 and 8) and placed in front of the guest
+/// bitstream.
+///
+/// NOTE(review): the syntax element names in the comments below follow the SPS/PPS tables of
+/// the H.264 specification and are inferred from the write order - confirm against the spec.
+const std::vector<u8>& H264::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state,
+                                                bool is_first_frame) {
+    H264DecoderContext context{};
+    gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext));
+
+    const s32 frame_number = static_cast<s32>((context.h264_parameter_set.flags >> 46) & 0x1ffff);
+    if (!is_first_frame && frame_number != 0) {
+        frame.resize(context.frame_data_size);
+
+        gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
+    } else {
+        /// Encode header
+        H264BitWriter writer{};
+        writer.WriteU(1, 24); // Start code 0x000001
+        writer.WriteU(0, 1);
+        writer.WriteU(3, 2);
+        writer.WriteU(7, 5); // Unit type 7 (presumably the SPS)
+        writer.WriteU(100, 8);
+        writer.WriteU(0, 8);
+        writer.WriteU(31, 8);
+        writer.WriteUe(0);
+        const auto chroma_format_idc =
+            static_cast<u32>((context.h264_parameter_set.flags >> 12) & 3);
+        writer.WriteUe(chroma_format_idc);
+        if (chroma_format_idc == 3) {
+            writer.WriteBit(false);
+        }
+
+        writer.WriteUe(0);
+        writer.WriteUe(0);
+        writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag
+        writer.WriteBit(false); // Scaling matrix present flag
+
+        const auto order_cnt_type = static_cast<u32>((context.h264_parameter_set.flags >> 14) & 3);
+        writer.WriteUe(static_cast<u32>((context.h264_parameter_set.flags >> 8) & 0xf));
+        writer.WriteUe(order_cnt_type);
+        if (order_cnt_type == 0) {
+            writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt);
+        } else if (order_cnt_type == 1) {
+            writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0);
+
+            writer.WriteSe(0);
+            writer.WriteSe(0);
+            writer.WriteUe(0);
+        }
+
+        // The stored height is halved when the stream is not frame-only.
+        const s32 pic_height = context.h264_parameter_set.pic_height_in_map_units /
+                               (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
+
+        writer.WriteUe(16);
+        writer.WriteBit(false);
+        writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1);
+        writer.WriteUe(pic_height - 1);
+        writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0);
+
+        if (!context.h264_parameter_set.frame_mbs_only_flag) {
+            writer.WriteBit(((context.h264_parameter_set.flags >> 0) & 1) != 0);
+        }
+
+        writer.WriteBit(((context.h264_parameter_set.flags >> 1) & 1) != 0);
+        writer.WriteBit(false); // Frame cropping flag
+        writer.WriteBit(false); // VUI parameter present flag
+
+        writer.End();
+
+        // H264 PPS
+        writer.WriteU(1, 24); // Start code 0x000001
+        writer.WriteU(0, 1);
+        writer.WriteU(3, 2);
+        writer.WriteU(8, 5); // Unit type 8
+
+        writer.WriteUe(0);
+        writer.WriteUe(0);
+
+        writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0);
+        writer.WriteBit(false);
+        writer.WriteUe(0);
+        writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active);
+        writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active);
+        writer.WriteBit(((context.h264_parameter_set.flags >> 2) & 1) != 0);
+        writer.WriteU(static_cast<s32>((context.h264_parameter_set.flags >> 32) & 0x3), 2);
+        s32 pic_init_qp = static_cast<s32>((context.h264_parameter_set.flags >> 16) & 0x3f);
+        // Shift up and back down to sign-extend the 6-bit field.
+        pic_init_qp = (pic_init_qp << 26) >> 26;
+        writer.WriteSe(pic_init_qp);
+        writer.WriteSe(0);
+        s32 chroma_qp_index_offset =
+            static_cast<s32>((context.h264_parameter_set.flags >> 22) & 0x1f);
+        // Shift up and back down to sign-extend the 5-bit field.
+        chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27;
+
+        writer.WriteSe(chroma_qp_index_offset);
+        writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_flag != 0);
+        writer.WriteBit(((context.h264_parameter_set.flags >> 3) & 1) != 0);
+        writer.WriteBit(context.h264_parameter_set.redundant_pic_count_flag != 0);
+        writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0);
+
+        writer.WriteBit(true);
+
+        // The flat copy does not depend on the loop index, so build it once; every iteration
+        // reads a different 16-entry window of it.
+        const auto matrix_x4 =
+            std::vector<u8>(context.scaling_matrix_4.begin(), context.scaling_matrix_4.end());
+        for (s32 index = 0; index < 6; index++) {
+            writer.WriteBit(true);
+            writer.WriteScalingList(matrix_x4, index * 16, 16);
+        }
+
+        if (context.h264_parameter_set.transform_8x8_mode_flag) {
+            // Same as above, with 64-entry windows.
+            const auto matrix_x8 =
+                std::vector<u8>(context.scaling_matrix_8.begin(), context.scaling_matrix_8.end());
+            for (s32 index = 0; index < 2; index++) {
+                writer.WriteBit(true);
+                writer.WriteScalingList(matrix_x8, index * 64, 64);
+            }
+        }
+
+        s32 chroma_qp_index_offset2 =
+            static_cast<s32>((context.h264_parameter_set.flags >> 27) & 0x1f);
+        // Shift up and back down to sign-extend the 5-bit field.
+        chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27;
+
+        writer.WriteSe(chroma_qp_index_offset2);
+
+        writer.End();
+
+        // Synthesized headers first, guest bitstream directly behind them.
+        const auto& encoded_header = writer.GetByteArray();
+        frame.resize(encoded_header.size() + context.frame_data_size);
+        std::memcpy(frame.data(), encoded_header.data(), encoded_header.size());
+
+        gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset,
+                                      frame.data() + encoded_header.size(),
+                                      context.frame_data_size);
+    }
+
+    return frame;
+}
+
+H264BitWriter::H264BitWriter() = default; // Defined out of line; members keep their in-class initial values.
+
+H264BitWriter::~H264BitWriter() = default;
+
+void H264BitWriter::WriteU(s32 value, s32 value_sz) { // Writes the low value_sz bits of value, most significant bit first.
+    WriteBits(value, value_sz);
+}
+
+void H264BitWriter::WriteSe(s32 value) { // Writes value as a signed Exp-Golomb code.
+    WriteExpGolombCodedInt(value);
+}
+
+void H264BitWriter::WriteUe(u32 value) { // Writes value as an unsigned Exp-Golomb code.
+    WriteExpGolombCodedUInt(value);
+}
+
+void H264BitWriter::End() { // Terminates the current unit: appends a single 1 bit, then flushes the partially filled byte.
+    WriteBit(true);
+    Flush(); // Unused low bits of the flushed byte stay zero because Flush() clears the buffer after every byte.
+}
+
+/// Appends one bit to the stream: 1 when `state` is true, 0 otherwise.
+void H264BitWriter::WriteBit(bool state) {
+    const s32 bit = state ? 1 : 0;
+    WriteBits(bit, 1);
+}
+
+/// Writes `count` scaling-list entries taken from `list`, beginning at index `start`.
+///
+/// Entries are visited in zig-zag order and each one is written as the signed Exp-Golomb
+/// difference from the previously written entry; the first difference is taken against 8.
+/// A count of 16 selects the 16-entry scan table, any other count the 64-entry one, so
+/// callers are expected to pass 16 or 64.
+void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) {
+    // Index the constexpr tables directly instead of copying one of them into a freshly
+    // allocated vector on every call.
+    const u8* const scan = count == 16 ? zig_zag_scan.data() : zig_zag_direct.data();
+    u8 last_scale = 8;
+
+    for (s32 index = 0; index < count; index++) {
+        const u8 value = list[start + scan[index]];
+        const s32 delta_scale = static_cast<s32>(value - last_scale);
+
+        WriteSe(delta_scale);
+
+        last_scale = value;
+    }
+}
+
+std::vector<u8>& H264BitWriter::GetByteArray() { // Mutable access to the bytes flushed so far.
+    return byte_array;
+}
+
+const std::vector<u8>& H264BitWriter::GetByteArray() const { // Read-only access to the bytes flushed so far.
+    return byte_array;
+}
+
+/// Appends the low `bit_count` bits of `value` to the stream, most significant bit first.
+/// The bits are packed into `buffer` from its high end down; whenever the buffer is full,
+/// GetFreeBufferBits() flushes it before more bits are added.
+void H264BitWriter::WriteBits(s32 value, s32 bit_count) {
+    for (s32 pending = bit_count; pending > 0;) {
+        // Room left in the staging buffer (flushes first when the buffer is full).
+        const s32 room = GetFreeBufferBits();
+        const s32 chunk = pending > room ? room : pending;
+        const s32 chunk_mask = (1 << chunk) - 1;
+
+        // Take the topmost not-yet-written bits of value and place them directly below the
+        // bits already held by the buffer.
+        const s32 src_shift = pending - chunk;
+        const s32 dst_shift = room - chunk;
+        buffer |= ((value >> src_shift) & chunk_mask) << dst_shift;
+
+        buffer_pos += chunk;
+        pending -= chunk;
+    }
+}
+
+/// Writes a signed value as an Exp-Golomb code: the value is mapped to 2 * |value|, minus one
+/// when the value is positive, and the result is written as an unsigned code.
+void H264BitWriter::WriteExpGolombCodedInt(s32 value) {
+    const bool positive = value > 0;
+    const s32 magnitude = value < 0 ? -value : value;
+    const s32 code = (magnitude << 1) - (positive ? 1 : 0);
+    WriteExpGolombCodedUInt(code);
+}
+
+/// Writes an unsigned value as an Exp-Golomb code: a prefix consisting of zero bits followed
+/// by a single 1, then a suffix holding the rest of value + 1.
+/// NOTE(review): for value == 0xFFFFFFFF, value + 1 wraps to 0 - confirm callers never pass it.
+void H264BitWriter::WriteExpGolombCodedUInt(u32 value) {
+    const u32 code = value + 1;
+    const s32 prefix_bits = 32 - Common::CountLeadingZeroes32(static_cast<s32>(code));
+    const s32 suffix_bits = prefix_bits - 1;
+    WriteBits(1, prefix_bits);
+
+    const u32 suffix = value - ((1U << suffix_bits) - 1);
+    WriteBits(static_cast<s32>(suffix), suffix_bits);
+}
+
+/// Returns how many bits still fit into the staging buffer, flushing it first when it is full.
+s32 H264BitWriter::GetFreeBufferBits() {
+    const bool buffer_full = buffer_pos == buffer_size;
+    if (buffer_full) {
+        Flush();
+    }
+
+    return buffer_size - buffer_pos;
+}
+
+/// Appends the staging buffer to the byte array as one byte and clears it. Does nothing when
+/// the buffer holds no bits.
+void H264BitWriter::Flush() {
+    if (buffer_pos != 0) {
+        byte_array.push_back(static_cast<u8>(buffer));
+
+        buffer = 0;
+        buffer_pos = 0;
+    }
+}
+} // namespace Tegra::Decoder
--- a/src/video_core/command_classes/codecs/h264.h
+++ b/src/video_core/command_classes/codecs/h264.h
@@ -0,0 +1,118 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#pragma once
+
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/command_classes/nvdec_common.h"
+
+namespace Tegra {
+class GPU;
+namespace Decoder {
+
+/// Accumulates individual bits into a byte vector. Bits are staged in a small integer buffer
+/// (buffer_size bits wide) and appended to byte_array one byte at a time.
+class H264BitWriter {
+public:
+    H264BitWriter();
+    ~H264BitWriter();
+
+    /// The following Write methods are based on clause 9.1 in the H.264 specification.
+    /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax
+    void WriteU(s32 value, s32 value_sz);
+    void WriteSe(s32 value);
+    void WriteUe(u32 value);
+
+    /// Finalize the bitstream
+    void End();
+
+    /// Append a bit to the stream, equivalent value to the state parameter
+    void WriteBit(bool state);
+
+    /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification
+    /// Writes the scaling matrices of the stream
+    void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count);
+
+    /// Return the bitstream as a vector.
+    [[nodiscard]] std::vector<u8>& GetByteArray();
+    [[nodiscard]] const std::vector<u8>& GetByteArray() const;
+
+private:
+    /// Appends the low bit_count bits of value, most significant bit first.
+    void WriteBits(s32 value, s32 bit_count);
+    /// Writes value as a signed Exp-Golomb code.
+    void WriteExpGolombCodedInt(s32 value);
+    /// Writes value as an unsigned Exp-Golomb code.
+    void WriteExpGolombCodedUInt(u32 value);
+    /// Returns the number of unused bits in the staging buffer, flushing it first when full.
+    [[nodiscard]] s32 GetFreeBufferBits();
+    /// Moves the staged bits into byte_array as one byte and clears the staging buffer.
+    void Flush();
+
+    /// Capacity of the staging buffer in bits (one byte).
+    s32 buffer_size{8};
+
+    /// Staging buffer holding the bits that have not been flushed yet.
+    s32 buffer{};
+    /// Number of bits currently stored in buffer.
+    s32 buffer_pos{};
+    /// Completed bytes of the bitstream.
+    std::vector<u8> byte_array;
+};
+
+/// Composes the H.264 frame header that is handed to FFmpeg for decoding.
+class H264 {
+public:
+    explicit H264(GPU& gpu);
+    ~H264();
+
+    /// Compose the H264 header of the frame for FFmpeg decoding
+    [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(
+        const NvdecCommon::NvdecRegisters& state, bool is_first_frame = false);
+
+private:
+    /// Parameter set fields embedded in H264DecoderContext. The padding members together with
+    /// the static_assert below pin the layout to exactly 0x68 bytes.
+    struct H264ParameterSet {
+        u32 log2_max_pic_order_cnt{};
+        u32 delta_pic_order_always_zero_flag{};
+        u32 frame_mbs_only_flag{};
+        u32 pic_width_in_mbs{};
+        u32 pic_height_in_map_units{};
+        INSERT_PADDING_WORDS(1);
+        u32 entropy_coding_mode_flag{};
+        u32 bottom_field_pic_order_flag{};
+        u32 num_refidx_l0_default_active{};
+        u32 num_refidx_l1_default_active{};
+        u32 deblocking_filter_control_flag{};
+        u32 redundant_pic_count_flag{};
+        u32 transform_8x8_mode_flag{};
+        INSERT_PADDING_WORDS(9);
+        u64 flags{};
+        u32 frame_number{};
+        u32 frame_number2{};
+    };
+    static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size");
+
+    /// Decoder context block holding the frame data size, the parameter set and both scaling
+    /// matrices; pinned to 0x2a0 bytes by the static_assert below.
+    /// NOTE(review): presumably filled verbatim from guest memory by ComposeFrameHeader, which
+    /// would explain the fixed layout and the uninitialized matrices - confirm in h264.cpp.
+    struct H264DecoderContext {
+        INSERT_PADDING_BYTES(0x48);
+        u32 frame_data_size{};
+        INSERT_PADDING_BYTES(0xc);
+        H264ParameterSet h264_parameter_set{};
+        INSERT_PADDING_BYTES(0x100);
+        std::array<u8, 0x60> scaling_matrix_4;
+        std::array<u8, 0x80> scaling_matrix_8;
+    };
+    static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size");
+
+    /// NOTE(review): presumably the storage behind the reference returned by
+    /// ComposeFrameHeader - confirm in h264.cpp.
+    std::vector<u8> frame;
+    /// GPU this decoder was constructed with.
+    GPU& gpu;
+};
+
+} // namespace Decoder
+} // namespace Tegra
--- a/src/video_core/command_classes/codecs/vp9.cpp
+++ b/src/video_core/command_classes/codecs/vp9.cpp
@@ -0,0 +1,989 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring> // for std::memcpy
+#include <numeric>
+#include "video_core/command_classes/codecs/vp9.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra::Decoder {
+namespace {
+// Default compressed header probabilities once frame context resets.
+// Every designated initializer below fills one field of Vp9EntropyProbs with a flat list of
+// probability bytes.
+constexpr Vp9EntropyProbs default_probs{
+    .y_mode_prob{
+        65,  32, 18, 144, 162, 194, 41, 51, 98, 132, 68,  18, 165, 217, 196, 45, 40, 78,
+        173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29,
+    },
+    // Stored in groups of four: the fourth entry of every group is 0 and is never written by
+    // WriteProbabilityUpdateAligned4, which only emits the first three entries of a group.
+    .partition_prob{
+        199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0,
+        174, 73,  87,  0, 92,  41, 83,  0, 82,  99,  50,  0, 53,  39,  39,  0,
+        177, 58,  59,  0, 68,  26, 63,  0, 52,  79,  25,  0, 17,  14,  12,  0,
+        222, 34,  30,  0, 72,  16, 44,  0, 58,  32,  12,  0, 10,  7,   6,   0,
+    },
+    // 1728 bytes, laid out as WriteCoefProbabilityUpdate walks them: four blocks of
+    // 2 * 2 * 6 * 6 * 3 entries. Each source row below holds 6 * 3 values; in every sixth row
+    // (the k == 0 case of that function) only the first nine values are used, the rest are 0.
+    .coef_probs{
+        195, 29,  183, 84,  49,  136, 8,   42,  71,  0,   0,   0,   0,  0,   0,   0,  0,  0,
+        31,  107, 169, 35,  99,  159, 17,  82,  140, 8,   66,  114, 2,  44,  76,  1,  19, 32,
+        40,  132, 201, 29,  114, 187, 13,  91,  157, 7,   75,  127, 3,  58,  95,  1,  28, 47,
+        69,  142, 221, 42,  122, 201, 15,  91,  159, 6,   67,  121, 1,  42,  77,  1,  17, 31,
+        102, 148, 228, 67,  117, 204, 17,  82,  154, 6,   59,  114, 2,  39,  75,  1,  15, 29,
+        156, 57,  233, 119, 57,  212, 58,  48,  163, 29,  40,  124, 12, 30,  81,  3,  12, 31,
+        191, 107, 226, 124, 117, 204, 25,  99,  155, 0,   0,   0,   0,  0,   0,   0,  0,  0,
+        29,  148, 210, 37,  126, 194, 8,   93,  157, 2,   68,  118, 1,  39,  69,  1,  17, 33,
+        41,  151, 213, 27,  123, 193, 3,   82,  144, 1,   58,  105, 1,  32,  60,  1,  13, 26,
+        59,  159, 220, 23,  126, 198, 4,   88,  151, 1,   66,  114, 1,  38,  71,  1,  18, 34,
+        114, 136, 232, 51,  114, 207, 11,  83,  155, 3,   56,  105, 1,  33,  65,  1,  17, 34,
+        149, 65,  234, 121, 57,  215, 61,  49,  166, 28,  36,  114, 12, 25,  76,  3,  16, 42,
+        214, 49,  220, 132, 63,  188, 42,  65,  137, 0,   0,   0,   0,  0,   0,   0,  0,  0,
+        85,  137, 221, 104, 131, 216, 49,  111, 192, 21,  87,  155, 2,  49,  87,  1,  16, 28,
+        89,  163, 230, 90,  137, 220, 29,  100, 183, 10,  70,  135, 2,  42,  81,  1,  17, 33,
+        108, 167, 237, 55,  133, 222, 15,  97,  179, 4,   72,  135, 1,  45,  85,  1,  19, 38,
+        124, 146, 240, 66,  124, 224, 17,  88,  175, 4,   58,  122, 1,  36,  75,  1,  18, 37,
+        141, 79,  241, 126, 70,  227, 66,  58,  182, 30,  44,  136, 12, 34,  96,  2,  20, 47,
+        229, 99,  249, 143, 111, 235, 46,  109, 192, 0,   0,   0,   0,  0,   0,   0,  0,  0,
+        82,  158, 236, 94,  146, 224, 25,  117, 191, 9,   87,  149, 3,  56,  99,  1,  33, 57,
+        83,  167, 237, 68,  145, 222, 10,  103, 177, 2,   72,  131, 1,  41,  79,  1,  20, 39,
+        99,  167, 239, 47,  141, 224, 10,  104, 178, 2,   73,  133, 1,  44,  85,  1,  22, 47,
+        127, 145, 243, 71,  129, 228, 17,  93,  177, 3,   61,  124, 1,  41,  84,  1,  21, 52,
+        157, 78,  244, 140, 72,  231, 69,  58,  184, 31,  44,  137, 14, 38,  105, 8,  23, 61,
+        125, 34,  187, 52,  41,  133, 6,   31,  56,  0,   0,   0,   0,  0,   0,   0,  0,  0,
+        37,  109, 153, 51,  102, 147, 23,  87,  128, 8,   67,  101, 1,  41,  63,  1,  19, 29,
+        31,  154, 185, 17,  127, 175, 6,   96,  145, 2,   73,  114, 1,  51,  82,  1,  28, 45,
+        23,  163, 200, 10,  131, 185, 2,   93,  148, 1,   67,  111, 1,  41,  69,  1,  14, 24,
+        29,  176, 217, 12,  145, 201, 3,   101, 156, 1,   69,  111, 1,  39,  63,  1,  14, 23,
+        57,  192, 233, 25,  154, 215, 6,   109, 167, 3,   78,  118, 1,  48,  69,  1,  21, 29,
+        202, 105, 245, 108, 106, 216, 18,  90,  144, 0,   0,   0,   0,  0,   0,   0,  0,  0,
+        33,  172, 219, 64,  149, 206, 14,  117, 177, 5,   90,  141, 2,  61,  95,  1,  37, 57,
+        33,  179, 220, 11,  140, 198, 1,   89,  148, 1,   60,  104, 1,  33,  57,  1,  12, 21,
+        30,  181, 221, 8,   141, 198, 1,   87,  145, 1,   58,  100, 1,  31,  55,  1,  12, 20,
+        32,  186, 224, 7,   142, 198, 1,   86,  143, 1,   58,  100, 1,  31,  55,  1,  12, 22,
+        57,  192, 227, 20,  143, 204, 3,   96,  154, 1,   68,  112, 1,  42,  69,  1,  19, 32,
+        212, 35,  215, 113, 47,  169, 29,  48,  105, 0,   0,   0,   0,  0,   0,   0,  0,  0,
+        74,  129, 203, 106, 120, 203, 49,  107, 178, 19,  84,  144, 4,  50,  84,  1,  15, 25,
+        71,  172, 217, 44,  141, 209, 15,  102, 173, 6,   76,  133, 2,  51,  89,  1,  24, 42,
+        64,  185, 231, 31,  148, 216, 8,   103, 175, 3,   74,  131, 1,  46,  81,  1,  18, 30,
+        65,  196, 235, 25,  157, 221, 5,   105, 174, 1,   67,  120, 1,  38,  69,  1,  15, 30,
+        65,  204, 238, 30,  156, 224, 7,   107, 177, 2,   70,  124, 1,  42,  73,  1,  18, 34,
+        225, 86,  251, 144, 104, 235, 42,  99,  181, 0,   0,   0,   0,  0,   0,   0,  0,  0,
+        85,  175, 239, 112, 165, 229, 29,  136, 200, 12,  103, 162, 6,  77,  123, 2,  53, 84,
+        75,  183, 239, 30,  155, 221, 3,   106, 171, 1,   74,  128, 1,  44,  76,  1,  17, 28,
+        73,  185, 240, 27,  159, 222, 2,   107, 172, 1,   75,  127, 1,  42,  73,  1,  17, 29,
+        62,  190, 238, 21,  159, 222, 2,   107, 172, 1,   72,  122, 1,  40,  71,  1,  18, 32,
+        61,  199, 240, 27,  161, 226, 4,   113, 180, 1,   76,  129, 1,  46,  80,  1,  23, 41,
+        7,   27,  153, 5,   30,  95,  1,   16,  30,  0,   0,   0,   0,  0,   0,   0,  0,  0,
+        50,  75,  127, 57,  75,  124, 27,  67,  108, 10,  54,  86,  1,  33,  52,  1,  12, 18,
+        43,  125, 151, 26,  108, 148, 7,   83,  122, 2,   59,  89,  1,  38,  60,  1,  17, 27,
+        23,  144, 163, 13,  112, 154, 2,   75,  117, 1,   50,  81,  1,  31,  51,  1,  14, 23,
+        18,  162, 185, 6,   123, 171, 1,   78,  125, 1,   51,  86,  1,  31,  54,  1,  14, 23,
+        15,  199, 227, 3,   150, 204, 1,   91,  146, 1,   55,  95,  1,  30,  53,  1,  11, 20,
+        19,  55,  240, 19,  59,  196, 3,   52,  105, 0,   0,   0,   0,  0,   0,   0,  0,  0,
+        41,  166, 207, 104, 153, 199, 31,  123, 181, 14,  101, 152, 5,  72,  106, 1,  36, 52,
+        35,  176, 211, 12,  131, 190, 2,   88,  144, 1,   60,  101, 1,  36,  60,  1,  16, 28,
+        28,  183, 213, 8,   134, 191, 1,   86,  142, 1,   56,  96,  1,  30,  53,  1,  12, 20,
+        20,  190, 215, 4,   135, 192, 1,   84,  139, 1,   53,  91,  1,  28,  49,  1,  11, 20,
+        13,  196, 216, 2,   137, 192, 1,   86,  143, 1,   57,  99,  1,  32,  56,  1,  13, 24,
+        211, 29,  217, 96,  47,  156, 22,  43,  87,  0,   0,   0,   0,  0,   0,   0,  0,  0,
+        78,  120, 193, 111, 116, 186, 46,  102, 164, 15,  80,  128, 2,  49,  76,  1,  18, 28,
+        71,  161, 203, 42,  132, 192, 10,  98,  150, 3,   69,  109, 1,  44,  70,  1,  18, 29,
+        57,  186, 211, 30,  140, 196, 4,   93,  146, 1,   62,  102, 1,  38,  65,  1,  16, 27,
+        47,  199, 217, 14,  145, 196, 1,   88,  142, 1,   57,  98,  1,  36,  62,  1,  15, 26,
+        26,  219, 229, 5,   155, 207, 1,   94,  151, 1,   60,  104, 1,  36,  62,  1,  16, 28,
+        233, 29,  248, 146, 47,  220, 43,  52,  140, 0,   0,   0,   0,  0,   0,   0,  0,  0,
+        100, 163, 232, 179, 161, 222, 63,  142, 204, 37,  113, 174, 26, 89,  137, 18, 68, 97,
+        85,  181, 230, 32,  146, 209, 7,   100, 164, 3,   71,  121, 1,  45,  77,  1,  18, 30,
+        65,  187, 230, 20,  148, 207, 2,   97,  159, 1,   68,  116, 1,  40,  70,  1,  14, 29,
+        40,  194, 227, 8,   147, 204, 1,   94,  155, 1,   65,  112, 1,  39,  66,  1,  14, 26,
+        16,  208, 228, 3,   151, 207, 1,   98,  160, 1,   67,  117, 1,  41,  74,  1,  17, 31,
+        17,  38,  140, 7,   34,  80,  1,   17,  29,  0,   0,   0,   0,  0,   0,   0,  0,  0,
+        37,  75,  128, 41,  76,  128, 26,  66,  116, 12,  52,  94,  2,  32,  55,  1,  10, 16,
+        50,  127, 154, 37,  109, 152, 16,  82,  121, 5,   59,  85,  1,  35,  54,  1,  13, 20,
+        40,  142, 167, 17,  110, 157, 2,   71,  112, 1,   44,  72,  1,  27,  45,  1,  11, 17,
+        30,  175, 188, 9,   124, 169, 1,   74,  116, 1,   48,  78,  1,  30,  49,  1,  11, 18,
+        10,  222, 223, 2,   150, 194, 1,   83,  128, 1,   48,  79,  1,  27,  45,  1,  11, 17,
+        36,  41,  235, 29,  36,  193, 10,  27,  111, 0,   0,   0,   0,  0,   0,   0,  0,  0,
+        85,  165, 222, 177, 162, 215, 110, 135, 195, 57,  113, 168, 23, 83,  120, 10, 49, 61,
+        85,  190, 223, 36,  139, 200, 5,   90,  146, 1,   60,  103, 1,  38,  65,  1,  18, 30,
+        72,  202, 223, 23,  141, 199, 2,   86,  140, 1,   56,  97,  1,  36,  61,  1,  16, 27,
+        55,  218, 225, 13,  145, 200, 1,   86,  141, 1,   57,  99,  1,  35,  61,  1,  13, 22,
+        15,  235, 212, 1,   132, 184, 1,   84,  139, 1,   57,  97,  1,  34,  56,  1,  14, 23,
+        181, 21,  201, 61,  37,  123, 10,  38,  71,  0,   0,   0,   0,  0,   0,   0,  0,  0,
+        47,  106, 172, 95,  104, 173, 42,  93,  159, 18,  77,  131, 4,  50,  81,  1,  17, 23,
+        62,  147, 199, 44,  130, 189, 28,  102, 154, 18,  75,  115, 2,  44,  65,  1,  12, 19,
+        55,  153, 210, 24,  130, 194, 3,   93,  146, 1,   61,  97,  1,  31,  50,  1,  10, 16,
+        49,  186, 223, 17,  148, 204, 1,   96,  142, 1,   53,  83,  1,  26,  44,  1,  11, 17,
+        13,  217, 212, 2,   136, 180, 1,   78,  124, 1,   50,  83,  1,  29,  49,  1,  14, 23,
+        197, 13,  247, 82,  17,  222, 25,  17,  162, 0,   0,   0,   0,  0,   0,   0,  0,  0,
+        126, 186, 247, 234, 191, 243, 176, 177, 234, 104, 158, 220, 66, 128, 186, 55, 90, 137,
+        111, 197, 242, 46,  158, 219, 9,   104, 171, 2,   65,  125, 1,  44,  80,  1,  17, 91,
+        104, 208, 245, 39,  168, 224, 3,   109, 162, 1,   79,  124, 1,  50,  102, 1,  43, 102,
+        84,  220, 246, 31,  177, 231, 2,   115, 180, 1,   79,  134, 1,  55,  77,  1,  60, 79,
+        43,  243, 240, 8,   180, 217, 1,   115, 166, 1,   84,  121, 1,  51,  67,  1,  16, 6,
+    },
+    .switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144},
+    // Groups of four like partition_prob: the fourth entry of each group is 0 and is skipped
+    // by WriteProbabilityUpdateAligned4.
+    .inter_mode_prob{
+        2,  173, 34, 0,  7,  145, 85, 0,  7,  166, 63, 0,  7,  94,
+        66, 0,   8,  64, 46, 0,   17, 81, 31, 0,   25, 29, 30, 0,
+    },
+    .intra_inter_prob{9, 102, 187, 225},
+    // NOTE(review): this repeats intra_inter_prob with a trailing 0, which looks like a
+    // copy/paste slip. The VP9 reference defaults for comp_inter are believed to be
+    // {239, 183, 119, 96, 41} - verify against the default probability tables of the VP9
+    // specification before changing it.
+    .comp_inter_prob{9, 102, 187, 225, 0},
+    .single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247},
+    .comp_ref_prob{50, 126, 123, 221, 226},
+    .tx_32x32_prob{3, 136, 37, 5, 52, 13},
+    .tx_16x16_prob{20, 152, 15, 101},
+    .tx_8x8_prob{100, 66},
+    .skip_probs{192, 128, 64},
+    // The remaining fields are the ones written through WriteMvProbabilityUpdate.
+    .joints{32, 64, 96},
+    .sign{128, 128},
+    .classes{
+        224, 144, 192, 168, 192, 176, 192, 198, 198, 245,
+        216, 128, 176, 160, 176, 176, 192, 198, 198, 208,
+    },
+    .class_0{216, 208},
+    .prob_bits{
+        136, 140, 148, 160, 176, 192, 224, 234, 234, 240,
+        136, 140, 148, 160, 176, 192, 224, 234, 234, 240,
+    },
+    .class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64},
+    .fr{64, 96, 64, 64, 96, 64},
+    .class_0_hp{160, 160},
+    .high_precision{128, 128},
+};
+
+// 256-entry lookup table. For every index i >= 1 the entry equals the number of leading zero
+// bits of i when viewed as an 8-bit value (1 -> 7, 2..3 -> 6, ..., 128..255 -> 0); entry 0 is 0.
+// NOTE(review): presumably the normalization shift table of the range encoder - confirm at
+// its point of use.
+constexpr std::array<s32, 256> norm_lut{
+    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+// Lookup table used by RemapProbability to turn a recentered probability difference into the
+// delta index that gets encoded. Every 13th slot starting at index 6 (6, 19, 32, ..., 253)
+// holds the values 0..19 in order; all remaining slots hold 20..253 in ascending order.
+constexpr std::array<s32, 254> map_lut{
+    20,  21,  22,  23,  24,  25,  0,   26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
+    1,   38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  2,   50,  51,  52,  53,  54,
+    55,  56,  57,  58,  59,  60,  61,  3,   62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,
+    73,  4,   74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  5,   86,  87,  88,  89,
+    90,  91,  92,  93,  94,  95,  96,  97,  6,   98,  99,  100, 101, 102, 103, 104, 105, 106, 107,
+    108, 109, 7,   110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8,   122, 123, 124,
+    125, 126, 127, 128, 129, 130, 131, 132, 133, 9,   134, 135, 136, 137, 138, 139, 140, 141, 142,
+    143, 144, 145, 10,  146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11,  158, 159,
+    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12,  170, 171, 172, 173, 174, 175, 176, 177,
+    178, 179, 180, 181, 13,  182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14,  194,
+    195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15,  206, 207, 208, 209, 210, 211, 212,
+    213, 214, 215, 216, 217, 16,  218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17,
+    230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18,  242, 243, 244, 245, 246, 247,
+    248, 249, 250, 251, 252, 253, 19,
+};
+
+// 6.2.14 Tile size calculation
+
+// Returns the smallest exponent k for which (64 << k) is not less than the frame width
+// measured in 64-pixel columns (rounded up).
+[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) {
+    constexpr s32 max_tile_width_b64 = 64;
+    const s32 sb64_cols = (frame_width + 63) / 64;
+
+    s32 log2_cols = 0;
+    while ((max_tile_width_b64 << log2_cols) < sb64_cols) {
+        ++log2_cols;
+    }
+    return log2_cols;
+}
+
+// Returns the largest exponent k for which the frame width in 64-pixel columns (rounded up),
+// shifted right by k, is still at least 4. Returns 0 when the frame is narrower than that.
+[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) {
+    constexpr s32 min_tile_width_b64 = 4;
+    const s32 sb64_cols = (frame_width + 63) / 64;
+
+    // Keep growing the exponent for as long as the next larger one is still acceptable.
+    s32 log2_cols = 0;
+    while ((sb64_cols >> (log2_cols + 1)) >= min_tile_width_b64) {
+        ++log2_cols;
+    }
+    return log2_cols;
+}
+
+// Recenters new_prob around old_prob so that values close to old_prob produce small results.
+// Based on section 6.3.6 of VP9 Specification
+[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) {
+    // More than twice the reference: the value is passed through unchanged.
+    if (new_prob > 2 * old_prob) {
+        return new_prob;
+    }
+
+    // Otherwise interleave: at or above the reference -> even results, below it -> odd results.
+    const s32 distance = new_prob - old_prob;
+    return distance >= 0 ? distance * 2 : (-distance) * 2 - 1;
+}
+
+// Maps the change from old_prob to new_prob onto the delta value that is written to the
+// bitstream. Based on section 6.3.5 of VP9 Specification
+[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) {
+    const s32 target = new_prob - 1;
+    const s32 reference = old_prob - 1;
+
+    // References in the upper half of the range are mirrored before recentering.
+    const s32 recentered = reference * 2 <= 0xff
+                               ? RecenterNonNeg(target, reference)
+                               : RecenterNonNeg(0xff - 1 - target, 0xff - 1 - reference);
+
+    // Clamp so that a recentered value of 0 still selects a valid table slot.
+    const auto index = static_cast<std::size_t>(std::max(0, recentered - 1));
+    return map_lut[index];
+}
+} // Anonymous namespace
+
+// Stores a reference to the GPU; it is used later to reach the GPU memory manager.
+VP9::VP9(GPU& gpu_) : gpu{gpu_} {}
+
+// Defaulted: members are released by their own destructors.
+VP9::~VP9() = default;
+
+// Signals with one bool (coded with diff_update_probability) whether the probability changed
+// and, only if it did, follows it with the encoded delta between old and new value.
+void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
+    const bool changed = old_prob != new_prob;
+    writer.Write(changed, diff_update_probability);
+
+    if (!changed) {
+        return;
+    }
+    WriteProbabilityDelta(writer, new_prob, old_prob);
+}
+// Array overload: emits one probability update per entry, in index order.
+template <typename T, std::size_t N>
+void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                 const std::array<T, N>& old_prob) {
+    std::size_t index = 0;
+    while (index < N) {
+        WriteProbabilityUpdate(writer, new_prob[index], old_prob[index]);
+        ++index;
+    }
+}
+
+// Writes updates for arrays laid out in groups of four entries in which only the first three
+// entries of every group carry a probability; the fourth entry of each group is skipped.
+template <typename T, std::size_t N>
+void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                         const std::array<T, N>& old_prob) {
+    // Every group is indexed up to offset + 2, so an incomplete trailing group could be read
+    // past the end of the arrays. Only accept sizes made of whole groups.
+    static_assert(N % 4 == 0, "Aligned4 probability arrays must consist of whole groups of 4");
+
+    constexpr std::size_t group_size = 4;
+    constexpr std::size_t used_per_group = 3;
+    for (std::size_t offset = 0; offset < N; offset += group_size) {
+        for (std::size_t lane = 0; lane < used_per_group; ++lane) {
+            WriteProbabilityUpdate(writer, new_prob[offset + lane], old_prob[offset + lane]);
+        }
+    }
+}
+
+// Remaps the probability change with RemapProbability and writes the result through
+// EncodeTermSubExp.
+void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
+    EncodeTermSubExp(writer, RemapProbability(new_prob, old_prob));
+}
+
+// Writes value with a tiered code. Three range checks (< 16, < 32, < 64) are each signalled
+// with one flag via WriteLessThan; the first one that holds is followed by a 4, 4 or 5 bit
+// literal holding the offset inside that tier. Values of 64 and above use the tail code below.
+void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) {
+    if (WriteLessThan(writer, value, 16)) {
+        writer.Write(value, 4);
+        return;
+    }
+    if (WriteLessThan(writer, value, 32)) {
+        writer.Write(value - 16, 4);
+        return;
+    }
+    if (WriteLessThan(writer, value, 64)) {
+        writer.Write(value - 32, 5);
+        return;
+    }
+
+    // Tail: remainders below the threshold fit in 7 bits, larger ones get one extra low bit.
+    constexpr s32 literal_bits = 8;
+    constexpr s32 threshold = (1 << literal_bits) - 191;
+    const s32 remainder = value - 64;
+
+    if (remainder < threshold) {
+        writer.Write(remainder, literal_bits - 1);
+        return;
+    }
+
+    const s32 excess = remainder - threshold;
+    writer.Write(excess / 2 + threshold, literal_bits - 1);
+    writer.Write(excess & 1, 1);
+}
+
+// Writes a single flag that is set when value >= test, and returns whether value < test.
+bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) {
+    if (value < test) {
+        writer.Write(false);
+        return true;
+    }
+    writer.Write(true);
+    return false;
+}
+
+// Writes the coefficient probability updates block by block. Blocks 0 up to and including
+// tx_mode are processed (all four blocks when tx_mode is outside 0..3). Each block starts
+// with a flag telling whether any of its entries differs from old_prob; only flagged blocks
+// are followed by per-entry updates.
+void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
+                                     const std::array<u8, 1728>& new_prob,
+                                     const std::array<u8, 1728>& old_prob) {
+    constexpr u32 triple = 3;
+    constexpr u32 rows = 6;
+    constexpr u32 cols = 6;
+    constexpr u32 outer_groups = 2 * 2;
+    constexpr u32 block_size = outer_groups * rows * cols * triple;
+    constexpr u32 block_count = 4;
+
+    const u32 last_block = std::min<u32>(static_cast<u32>(tx_mode), block_count - 1);
+
+    for (u32 block = 0; block <= last_block; ++block) {
+        const u32 block_begin = block * block_size;
+        const auto new_first = new_prob.begin() + block_begin;
+        const auto old_first = old_prob.begin() + block_begin;
+
+        const bool changed = !std::equal(new_first, new_first + block_size, old_first);
+        writer.Write(changed);
+        if (!changed) {
+            continue;
+        }
+
+        // The cursor advances by one triple per (row, col) cell, including skipped cells.
+        u32 cursor = block_begin;
+        for (u32 group = 0; group < outer_groups; ++group) {
+            for (u32 row = 0; row < rows; ++row) {
+                for (u32 col = 0; col < cols; ++col, cursor += triple) {
+                    // In the first row of every group only the first three cells are coded.
+                    if (row == 0 && col >= 3) {
+                        continue;
+                    }
+                    for (u32 n = 0; n < triple; ++n) {
+                        WriteProbabilityUpdate(writer, new_prob[cursor + n], old_prob[cursor + n]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Signals whether a motion vector probability changed and, if so, writes new_prob shifted
+// right by one as a 7-bit literal (its lowest bit is dropped).
+void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
+    const bool changed = old_prob != new_prob;
+    writer.Write(changed, diff_update_probability);
+
+    if (!changed) {
+        return;
+    }
+    writer.Write(new_prob >> 1, 7);
+}
+
+// Reads the raw picture info block from GPU memory, converts it, and completes the result
+// with the entropy probabilities and the four frame offsets taken from the registers.
+Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) {
+    PictureInfo raw_info{};
+    gpu.MemoryManager().ReadBlock(state.picture_info_offset, &raw_info, sizeof(raw_info));
+
+    Vp9PictureInfo result = raw_info.Convert();
+    InsertEntropy(state.vp9_entropy_probs_offset, result.entropy);
+
+    // surface_luma_offset[0:3] holds the addresses of the reference frame offsets in this
+    // order: last, golden, altref, current. Tracking the updates done here might make it
+    // possible to stop buffering the frame data that header composition needs for reference
+    // frame updating.
+    constexpr std::size_t frame_offset_count = 4;
+    std::memcpy(result.frame_offsets.data(), state.surface_luma_offset.data(),
+                frame_offset_count * sizeof(u64));
+
+    return result;
+}
+
+// Reads the raw entropy probabilities stored at `offset` in GPU memory and converts them
+// into dst.
+void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) {
+    EntropyProbs raw_probs{};
+    gpu.MemoryManager().ReadBlock(offset, &raw_probs, sizeof(raw_probs));
+    raw_probs.Convert(dst);
+}
+
+// Reads the newest frame (picture info + bitstream) from GPU memory and pushes it into a
+// two-stage delay line made of next_next_frame and next_frame. Once both stages are filled,
+// the frame returned is the one that was read two calls earlier, with its show_frame field
+// overwritten by the last_frame_shown value of the frame that followed it.
+// NOTE(review): while the stages are still filling up, the returned container's bit_stream
+// has been moved from (presumably empty) - confirm that callers handle this.
+Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) {
+    Vp9FrameContainer current_frame{};
+    {
+        // Synchronize before reading guest memory, then fetch info and raw bitstream.
+        gpu.SyncGuestHost();
+        current_frame.info = GetVp9PictureInfo(state);
+        current_frame.bit_stream.resize(current_frame.info.bitstream_size);
+        gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(),
+                                      current_frame.info.bitstream_size);
+    }
+    // Buffer two frames, saving the last show frame info
+    if (!next_next_frame.bit_stream.empty()) {
+        // First stage occupied: swap the new frame in and take the buffered one out.
+        Vp9FrameContainer temp{
+            .info = current_frame.info,
+            .bit_stream = std::move(current_frame.bit_stream),
+        };
+        // The buffered frame learns from its successor whether it was shown.
+        next_next_frame.info.show_frame = current_frame.info.last_frame_shown;
+        current_frame.info = next_next_frame.info;
+        current_frame.bit_stream = std::move(next_next_frame.bit_stream);
+        next_next_frame = std::move(temp);
+
+        if (!next_frame.bit_stream.empty()) {
+            // Second stage occupied as well: repeat the same exchange with next_frame.
+            Vp9FrameContainer temp2{
+                .info = current_frame.info,
+                .bit_stream = std::move(current_frame.bit_stream),
+            };
+            next_frame.info.show_frame = current_frame.info.last_frame_shown;
+            current_frame.info = next_frame.info;
+            current_frame.bit_stream = std::move(next_frame.bit_stream);
+            next_frame = std::move(temp2);
+        } else {
+            // Second stage still empty: park the frame there; current_frame keeps only the info.
+            next_frame.info = current_frame.info;
+            next_frame.bit_stream = std::move(current_frame.bit_stream);
+        }
+    } else {
+        // Very first frame: park it in the first stage.
+        next_next_frame.info = current_frame.info;
+        next_next_frame.bit_stream = std::move(current_frame.bit_stream);
+    }
+    return current_frame;
+}
+
+std::vector<u8> VP9::ComposeCompressedHeader() {
+    VpxRangeEncoder writer{};
+    const bool update_probs = current_frame_info.show_frame && !current_frame_info.is_key_frame;
+    if (!current_frame_info.lossless) {
+        if (static_cast<u32>(current_frame_info.transform_mode) >= 3) {
+            writer.Write(3, 2);
+            writer.Write(current_frame_info.transform_mode == 4);
+        } else {
+            writer.Write(current_frame_info.transform_mode, 2);
+        }
+    }
+
+    if (current_frame_info.transform_mode == 4) {
+        // tx_mode_probs() in the spec
+        WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob,
+                               prev_frame_probs.tx_8x8_prob);
+        WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob,
+                               prev_frame_probs.tx_16x16_prob);
+        WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob,
+                               prev_frame_probs.tx_32x32_prob);
+        if (update_probs) {
+            prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob;
+            prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob;
+            prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob;
+        }
+    }
+    // read_coef_probs()  in the spec
+    WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode,
+                               current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs);
+    // read_skip_probs()  in the spec
+    WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs,
+                           prev_frame_probs.skip_probs);
+
+    if (update_probs) {
+        prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs;
+        prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs;
+    }
+
+    if (!current_frame_info.intra_only) {
+        // read_inter_probs() in the spec
+        WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob,
+                                       prev_frame_probs.inter_mode_prob);
+
+        if (current_frame_info.interp_filter == 4) {
+            // read_interp_filter_probs() in the spec
+            WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob,
+                                   prev_frame_probs.switchable_interp_prob);
+            if (update_probs) {
+                prev_frame_probs.switchable_interp_prob =
+                    current_frame_info.entropy.switchable_interp_prob;
+            }
+        }
+
+        // read_is_inter_probs() in the spec
+        WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob,
+                               prev_frame_probs.intra_inter_prob);
+
+        // frame_reference_mode() in the spec
+        if ((current_frame_info.ref_frame_sign_bias[1] & 1) !=
+                (current_frame_info.ref_frame_sign_bias[2] & 1) ||
+            (current_frame_info.ref_frame_sign_bias[1] & 1) !=
+                (current_frame_info.ref_frame_sign_bias[3] & 1)) {
+            if (current_frame_info.reference_mode >= 1) {
+                writer.Write(1, 1);
+                writer.Write(current_frame_info.reference_mode == 2);
+            } else {
+                writer.Write(0, 1);
+            }
+        }
+
+        // frame_reference_mode_probs() in the spec
+        if (current_frame_info.reference_mode == 2) {
+            WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob,
+                                   prev_frame_probs.comp_inter_prob);
+            if (update_probs) {
+                prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob;
+            }
+        }
+
+        if (current_frame_info.reference_mode != 1) {
+            WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob,
+                                   prev_frame_probs.single_ref_prob);
+            if (update_probs) {
+                prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob;
+            }
+        }
+
+        if (current_frame_info.reference_mode != 0) {
+            WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob,
+                                   prev_frame_probs.comp_ref_prob);
+            if (update_probs) {
+                prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob;
+            }
+        }
+
+        // read_y_mode_probs
+        for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size();
+             ++index) {
+            WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index],
+                                   prev_frame_probs.y_mode_prob[index]);
+        }
+
+        // read_partition_probs
+        WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob,
+                                       prev_frame_probs.partition_prob);
+
+        // mv_probs
+        for (s32 i = 0; i < 3; i++) {
+            WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i],
+                                     prev_frame_probs.joints[i]);
+        }
+        if (update_probs) {
+            prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob;
+            prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob;
+            prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob;
+            prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob;
+            prev_frame_probs.joints = current_frame_info.entropy.joints;
+        }
+
+        for (s32 i = 0; i < 2; i++) {
+            WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i],
+                                     prev_frame_probs.sign[i]);
+            for (s32 j = 0; j < 10; j++) {
+                const int index = i * 10 + j;
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index],
+                                         prev_frame_probs.classes[index]);
+            }
+            WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i],
+                                     prev_frame_probs.class_0[i]);
+
+            for (s32 j = 0; j < 10; j++) {
+                const int index = i * 10 + j;
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index],
+                                         prev_frame_probs.prob_bits[index]);
+            }
+        }
+
+        for (s32 i = 0; i < 2; i++) {
+            for (s32 j = 0; j < 2; j++) {
+                for (s32 k = 0; k < 3; k++) {
+                    const int index = i * 2 * 3 + j * 3 + k;
+                    WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index],
+                                             prev_frame_probs.class_0_fr[index]);
+                }
+            }
+
+            for (s32 j = 0; j < 3; j++) {
+                const int index = i * 3 + j;
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index],
+                                         prev_frame_probs.fr[index]);
+            }
+        }
+
+        if (current_frame_info.allow_high_precision_mv) {
+            for (s32 index = 0; index < 2; index++) {
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index],
+                                         prev_frame_probs.class_0_hp[index]);
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index],
+                                         prev_frame_probs.high_precision[index]);
+            }
+        }
+
+        // save previous probs
+        if (update_probs) {
+            prev_frame_probs.sign = current_frame_info.entropy.sign;
+            prev_frame_probs.classes = current_frame_info.entropy.classes;
+            prev_frame_probs.class_0 = current_frame_info.entropy.class_0;
+            prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits;
+            prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr;
+            prev_frame_probs.fr = current_frame_info.entropy.fr;
+            prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp;
+            prev_frame_probs.high_precision = current_frame_info.entropy.high_precision;
+        }
+    }
+    writer.End();
+    return writer.GetBuffer();
+}
+
+/// Composes the uncompressed frame header for current_frame_info.
+/// Besides emitting bits this also updates persistent state (prev_frame_probs, frame_ctxs, the
+/// saved loop filter deltas, swap_next_golden and grace_period), so it has to run before
+/// ComposeCompressedHeader(). The returned writer is not flushed: the caller still appends the
+/// compressed header size before flushing.
+VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
+    VpxBitStreamWriter uncomp_writer{};
+
+    uncomp_writer.WriteU(2, 2);                                      // Frame marker.
+    uncomp_writer.WriteU(0, 2);                                      // Profile.
+    uncomp_writer.WriteBit(false);                                   // Show existing frame.
+    uncomp_writer.WriteBit(!current_frame_info.is_key_frame);        // 0 on key frames.
+    uncomp_writer.WriteBit(current_frame_info.show_frame);           // Show frame.
+    uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // Error resilient mode.
+
+    if (current_frame_info.is_key_frame) {
+        uncomp_writer.WriteU(frame_sync_code, 24);
+        uncomp_writer.WriteU(0, 3); // Color space.
+        uncomp_writer.WriteU(0, 1); // Color range.
+        uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
+        uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
+        uncomp_writer.WriteBit(false); // Render and frame size different.
+
+        // Reset context
+        prev_frame_probs = default_probs;
+        swap_next_golden = false;
+        loop_filter_ref_deltas.fill(0);
+        loop_filter_mode_deltas.fill(0);
+
+        // allow frames offsets to stabilize before checking for golden frames
+        grace_period = 4;
+
+        // On key frames, all frame slots are set to the current frame,
+        // so the value of the selected slot doesn't really matter.
+        frame_ctxs.fill({current_frame_number, false, default_probs});
+
+        // intra only, meaning the frame can be recreated with no other references
+        current_frame_info.intra_only = true;
+
+    } else {
+
+        // The intra-only bit is only written for hidden frames; shown inter frames are forced
+        // to intra_only = false.
+        if (!current_frame_info.show_frame) {
+            uncomp_writer.WriteBit(current_frame_info.intra_only);
+            // Each hidden frame that does not follow a key frame toggles which of slots 1/2 is
+            // treated as golden and which as altref below.
+            if (!current_frame_info.last_frame_was_key) {
+                swap_next_golden = !swap_next_golden;
+            }
+        } else {
+            current_frame_info.intra_only = false;
+        }
+        if (!current_frame_info.error_resilient_mode) {
+            uncomp_writer.WriteU(0, 2); // Reset frame context.
+        }
+
+        // Last, Golden, Altref frames
+        std::array<s32, 3> ref_frame_index{0, 1, 2};
+
+        // Set when next frame is hidden
+        // altref and golden references are swapped
+        if (swap_next_golden) {
+            ref_frame_index = std::array<s32, 3>{0, 2, 1};
+        }
+
+        // update Last Frame
+        u64 refresh_frame_flags = 1;
+
+        // golden frame may refresh, determined if the next golden frame offset is changed
+        bool golden_refresh = false;
+        if (grace_period <= 0) {
+            // Only indices 1 and 2 are compared against the buffered next frame.
+            for (s32 index = 1; index < 3; ++index) {
+                if (current_frame_info.frame_offsets[index] !=
+                    next_frame.info.frame_offsets[index]) {
+                    current_frame_info.refresh_frame[index] = true;
+                    golden_refresh = true;
+                    grace_period = 3;
+                }
+            }
+        }
+
+        if (current_frame_info.show_frame &&
+            (!next_frame.info.show_frame || next_frame.info.is_key_frame)) {
+            // Update golden frame
+            refresh_frame_flags = swap_next_golden ? 2 : 4;
+        }
+
+        // These assignments run after the one above and therefore take precedence over it.
+        if (!current_frame_info.show_frame) {
+            // Update altref
+            refresh_frame_flags = swap_next_golden ? 2 : 4;
+        } else if (golden_refresh) {
+            refresh_frame_flags = 3;
+        }
+
+        if (current_frame_info.intra_only) {
+            uncomp_writer.WriteU(frame_sync_code, 24);
+            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
+            uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
+            uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
+            uncomp_writer.WriteBit(false); // Render and frame size different.
+        } else {
+            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
+
+            // ref_frame_sign_bias is indexed 1..3 while ref_frame_index is indexed 0..2.
+            for (s32 index = 1; index < 4; index++) {
+                uncomp_writer.WriteU(ref_frame_index[index - 1], 3);
+                uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1);
+            }
+
+            uncomp_writer.WriteBit(true);  // Frame size with refs.
+            uncomp_writer.WriteBit(false); // Render and frame size different.
+            uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv);
+            // An interp_filter value of 4 is signalled with this single bit; any other value is
+            // written explicitly as a 2-bit field.
+            uncomp_writer.WriteBit(current_frame_info.interp_filter == 4);
+
+            if (current_frame_info.interp_filter != 4) {
+                uncomp_writer.WriteU(current_frame_info.interp_filter, 2);
+            }
+        }
+    }
+
+    if (!current_frame_info.error_resilient_mode) {
+        // TODO: the refresh-frame-context bit is hard-coded; its real source is unknown.
+        uncomp_writer.WriteBit(true); // Refresh frame context. where do i get this info from?
+        uncomp_writer.WriteBit(true); // Frame parallel decoding mode.
+    }
+
+    // Shown frames use context slot 0, hidden frames use slot 1.
+    int frame_ctx_idx = 0;
+    if (!current_frame_info.show_frame) {
+        frame_ctx_idx = 1;
+    }
+
+    uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index.
+    // Read the slot's previous probabilities first, then overwrite the slot with this frame's.
+    prev_frame_probs =
+        frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header
+    frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy};
+
+    uncomp_writer.WriteU(current_frame_info.first_level, 6);
+    uncomp_writer.WriteU(current_frame_info.sharpness_level, 3);
+    uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled);
+
+    if (current_frame_info.mode_ref_delta_enabled) {
+        // check if ref deltas are different, update accordingly
+        // Both arrays are fully written by the two loops below before they are read.
+        std::array<bool, 4> update_loop_filter_ref_deltas;
+        std::array<bool, 2> update_loop_filter_mode_deltas;
+
+        bool loop_filter_delta_update = false;
+
+        for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) {
+            const s8 old_deltas = loop_filter_ref_deltas[index];
+            const s8 new_deltas = current_frame_info.ref_deltas[index];
+            const bool differing_delta = old_deltas != new_deltas;
+
+            update_loop_filter_ref_deltas[index] = differing_delta;
+            loop_filter_delta_update |= differing_delta;
+        }
+
+        for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) {
+            const s8 old_deltas = loop_filter_mode_deltas[index];
+            const s8 new_deltas = current_frame_info.mode_deltas[index];
+            const bool differing_delta = old_deltas != new_deltas;
+
+            update_loop_filter_mode_deltas[index] = differing_delta;
+            loop_filter_delta_update |= differing_delta;
+        }
+
+        uncomp_writer.WriteBit(loop_filter_delta_update);
+
+        if (loop_filter_delta_update) {
+            // Each delta gets an update bit, followed by the signed 6-bit value when it changed.
+            for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) {
+                uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]);
+
+                if (update_loop_filter_ref_deltas[index]) {
+                    uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6);
+                }
+            }
+
+            for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) {
+                uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]);
+
+                if (update_loop_filter_mode_deltas[index]) {
+                    uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6);
+                }
+            }
+            // save new deltas
+            loop_filter_ref_deltas = current_frame_info.ref_deltas;
+            loop_filter_mode_deltas = current_frame_info.mode_deltas;
+        }
+    }
+
+    uncomp_writer.WriteU(current_frame_info.base_q_index, 8);
+
+    uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q);
+    uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q);
+    uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q);
+
+    uncomp_writer.WriteBit(false); // Segmentation enabled (TODO).
+
+    const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width);
+    const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width);
+
+    // log2_tile_cols is written as one '1' bit per step above the minimum.
+    // NOTE(review): assumes log2_tile_cols >= min_tile_cols_log2; a negative difference would
+    // make the shift below invalid.
+    const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2;
+    const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1;
+
+    // If it's less than the maximum, we need to add an extra 0 on the bitstream
+    // to indicate that it should stop reading.
+    if (current_frame_info.log2_tile_cols < max_tile_cols_log2) {
+        uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1);
+    } else {
+        uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff);
+    }
+
+    // log2_tile_rows takes one bit when zero; otherwise a second bit tells 1 apart from larger
+    // values.
+    const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0;
+
+    uncomp_writer.WriteBit(tile_rows_log2_is_nonzero);
+
+    if (tile_rows_log2_is_nonzero) {
+        uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1);
+    }
+
+    return uncomp_writer;
+}
+
+/// Assembles the complete frame for the current NVDEC state: uncompressed header, compressed
+/// header and the raw bitstream, in that order. The result lives in the `frame` member and is
+/// returned by reference. Also advances the frame counter and records whether the frame is
+/// hidden.
+const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state) {
+    // Fetch the buffered frame, keep its picture info and take ownership of its bitstream.
+    std::vector<u8> bitstream = [this, &state] {
+        Vp9FrameContainer curr_frame = GetCurrentFrame(state);
+        current_frame_info = curr_frame.info;
+        return std::move(curr_frame.bit_stream);
+    }();
+
+    // Order matters: the uncompressed header routine sets the previous-probability state that
+    // the compressed header is encoded against.
+    VpxBitStreamWriter uncomp_writer = ComposeUncompressedHeader();
+    const std::vector<u8> compressed_header = ComposeCompressedHeader();
+
+    // The uncompressed header ends with the 16-bit size of the compressed header.
+    uncomp_writer.WriteU(static_cast<s32>(compressed_header.size()), 16);
+    uncomp_writer.Flush();
+    const std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray();
+
+    // Lay the three pieces out back to back in the output buffer.
+    const std::size_t uncompressed_size = uncompressed_header.size();
+    const std::size_t compressed_size = compressed_header.size();
+    const std::size_t bitstream_size = bitstream.size();
+    frame.resize(uncompressed_size + compressed_size + bitstream_size);
+
+    u8* const headers_begin = frame.data();
+    u8* const compressed_begin = frame.data() + uncompressed_size;
+    u8* const bitstream_begin = frame.data() + uncompressed_size + compressed_size;
+    std::memcpy(headers_begin, uncompressed_header.data(), uncompressed_size);
+    std::memcpy(compressed_begin, compressed_header.data(), compressed_size);
+    std::memcpy(bitstream_begin, bitstream.data(), bitstream_size);
+
+    // Per-frame bookkeeping.
+    ++current_frame_number;
+    --grace_period;
+
+    // Hidden frames must not be displayed; expose that through WasFrameHidden().
+    hidden = !current_frame_info.show_frame;
+    return frame;
+}
+
+/// Starts every encoded stream with a single half-probability zero bit.
+VpxRangeEncoder::VpxRangeEncoder() {
+    Write(false);
+}
+
+/// Defined out of line; no custom cleanup is performed.
+VpxRangeEncoder::~VpxRangeEncoder() = default;
+
+/// Encodes the lowest value_size bits of value, most significant bit first, each with half
+/// probability. Nothing is written when value_size is not positive.
+void VpxRangeEncoder::Write(s32 value, s32 value_size) {
+    s32 bit_index = value_size;
+    while (bit_index-- > 0) {
+        const bool is_set = ((value >> bit_index) & 1) != 0;
+        Write(is_set);
+    }
+}
+
+/// Encodes one bit using half_probability, i.e. both outcomes are treated as equally likely.
+void VpxRangeEncoder::Write(bool bit) {
+    Write(bit, half_probability);
+}
+
+/// Encodes one bit with the given probability (scaled by 1/256 through the >> 8 below) and
+/// updates the encoder state (low_value, range, count). A byte is appended to base_stream
+/// whenever count reaches zero or above.
+void VpxRangeEncoder::Write(bool bit, s32 probability) {
+    u32 local_range = range;
+    // Split the current range in proportion to the probability; the lower part encodes a
+    // false bit.
+    const u32 split = 1 + (((local_range - 1) * static_cast<u32>(probability)) >> 8);
+    local_range = split;
+
+    // A true bit selects the upper part: move the low bound up and keep the remainder.
+    if (bit) {
+        low_value += split;
+        local_range = range - split;
+    }
+
+    // norm_lut (defined elsewhere in this file) yields the shift used to renormalize the range.
+    s32 shift = norm_lut[local_range];
+    local_range <<= shift;
+    count += shift;
+
+    if (count >= 0) {
+        // offset is the part of the shift to apply before the byte is emitted; the remaining
+        // `count` bits are applied afterwards.
+        const s32 offset = shift - count;
+
+        // Carry out of low_value: propagate it into the bytes already written.
+        if (((low_value << (offset - 1)) >> 31) != 0) {
+            const s32 current_pos = static_cast<s32>(base_stream.GetPosition());
+            base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos);
+            // Walk backwards over 0xff bytes, turning each into 0 ...
+            while (PeekByte() == 0xff) {
+                base_stream.WriteByte(0);
+
+                // WriteByte advanced by one; step back to the byte before the one just written.
+                base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos);
+            }
+            // ... then increment the first byte that is not 0xff and restore the write position.
+            base_stream.WriteByte(static_cast<u8>((PeekByte() + 1)));
+            base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin);
+        }
+        base_stream.WriteByte(static_cast<u8>((low_value >> (24 - offset))));
+
+        // Drop the emitted byte and keep only the low 24 bits.
+        low_value <<= offset;
+        shift = count;
+        low_value &= 0xffffff;
+        count -= 8;
+    }
+
+    low_value <<= shift;
+    range = local_range;
+}
+
+/// Terminates the stream by encoding 32 half-probability zero bits.
+void VpxRangeEncoder::End() {
+    constexpr s32 padding_bits = 32;
+    for (s32 remaining = padding_bits; remaining > 0; --remaining) {
+        Write(false);
+    }
+}
+
+/// Returns the byte at the current stream position and leaves the position unchanged.
+u8 VpxRangeEncoder::PeekByte() {
+    const u8 peeked = base_stream.ReadByte();
+    // ReadByte moved the position forward by one; step back again.
+    base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos);
+    return peeked;
+}
+
+/// Defaulted: all state comes from the in-class member initializers.
+VpxBitStreamWriter::VpxBitStreamWriter() = default;
+
+/// Defaulted: pending bits are not flushed on destruction.
+VpxBitStreamWriter::~VpxBitStreamWriter() = default;
+
+/// Writes the lowest value_size bits of value, most significant bit first.
+void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) {
+    WriteBits(value, value_size);
+}
+
+/// Writes a signed value as value_size magnitude bits followed by one sign bit (1 = negative),
+/// for a total of value_size + 1 bits.
+void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) {
+    const bool is_negative = value < 0;
+    const s32 magnitude = is_negative ? -value : value;
+    const u32 sign_bit = is_negative ? 1 : 0;
+
+    WriteBits(static_cast<u32>(magnitude << 1) | sign_bit, value_size + 1);
+}
+
+/// Writes a delta-coded quantizer offset: one presence bit, followed when value is non-zero by
+/// a signed 4-bit literal (four magnitude bits plus one sign bit).
+/// The parameter stays u32 for existing callers and is reinterpreted as a signed value.
+void VpxBitStreamWriter::WriteDeltaQ(u32 value) {
+    const bool delta_coded = value != 0;
+    WriteBit(delta_coded);
+
+    if (delta_coded) {
+        // The header stores delta_q as su(4): magnitude bits, then a sign bit. Writing only the
+        // four magnitude bits would leave a decoder one bit out of step for everything after.
+        // NOTE(review): confirm that negative deltas reach this function with their sign intact.
+        WriteS(static_cast<s32>(value), 4);
+    }
+}
+
+/// Appends the lowest bit_count bits of value to the stream, most significant bit first.
+/// Bits are packed into `buffer` from its high end downwards; a full buffer is flushed to
+/// byte_array by GetFreeBufferBits() before more bits are added.
+void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) {
+    s32 bits_left = bit_count;
+
+    while (bits_left > 0) {
+        // Room in the current byte; this call flushes first when the byte is already full.
+        const s32 room = GetFreeBufferBits();
+        const s32 chunk = bits_left > room ? room : bits_left;
+
+        // Take the next `chunk` bits from the top of what is still unwritten ...
+        const s32 src_shift = bits_left - chunk;
+        // ... and place them directly below the bits already held in the buffer.
+        const s32 dst_shift = room - chunk;
+        const s32 chunk_mask = (1 << chunk) - 1;
+
+        buffer |= ((value >> src_shift) & chunk_mask) << dst_shift;
+
+        buffer_pos += chunk;
+        bits_left -= chunk;
+    }
+}
+
+/// Appends a single bit: 1 when state is true, 0 otherwise.
+void VpxBitStreamWriter::WriteBit(bool state) {
+    const u32 bit_value = state ? 1 : 0;
+    WriteBits(bit_value, 1);
+}
+
+/// Returns how many bits still fit into the current buffer, flushing it first when it is full
+/// so that the result is never zero after a flush.
+s32 VpxBitStreamWriter::GetFreeBufferBits() {
+    const bool buffer_full = buffer_pos == buffer_size;
+    if (buffer_full) {
+        Flush();
+    }
+
+    const s32 free_bits = buffer_size - buffer_pos;
+    return free_bits;
+}
+
+/// Appends the pending bits to byte_array as one byte and clears the buffer.
+/// Does nothing when no bits are pending. A partially filled byte is emitted as is, with its
+/// unused low bits left at zero.
+void VpxBitStreamWriter::Flush() {
+    const bool has_pending_bits = buffer_pos != 0;
+    if (has_pending_bits) {
+        byte_array.push_back(static_cast<u8>(buffer));
+        buffer = 0;
+        buffer_pos = 0;
+    }
+}
+
+/// Returns the bytes flushed so far. Bits still pending in the buffer are not included until
+/// Flush() is called.
+std::vector<u8>& VpxBitStreamWriter::GetByteArray() {
+    return byte_array;
+}
+
+/// Const overload of GetByteArray().
+const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const {
+    return byte_array;
+}
+
+} // namespace Tegra::Decoder
--- a/src/video_core/command_classes/codecs/vp9.h
+++ b/src/video_core/command_classes/codecs/vp9.h
@@ -0,0 +1,197 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <vector>
+
+#include "common/common_types.h"
+#include "common/stream.h"
+#include "video_core/command_classes/codecs/vp9_types.h"
+#include "video_core/command_classes/nvdec_common.h"
+
+namespace Tegra {
+class GPU;
+/// Distinguishes key frames from inter frames.
+enum class FrameType { KeyFrame = 0, InterFrame = 1 };
+namespace Decoder {
+
+/// The VpxRangeEncoder and VpxBitStreamWriter classes are used to compose the
+/// VP9 header bitstreams.
+
+/// Range encoder that writes probability-coded bits into an internal byte stream.
+/// Movable but not copyable.
+class VpxRangeEncoder {
+public:
+    VpxRangeEncoder();
+    ~VpxRangeEncoder();
+
+    VpxRangeEncoder(const VpxRangeEncoder&) = delete;
+    VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete;
+
+    VpxRangeEncoder(VpxRangeEncoder&&) = default;
+    VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default;
+
+    /// Writes the rightmost value_size bits from value into the stream
+    void Write(s32 value, s32 value_size);
+
+    /// Writes a single bit with half probability
+    void Write(bool bit);
+
+    /// Writes a bit to the base_stream encoded with probability
+    void Write(bool bit, s32 probability);
+
+    /// Signal the end of the bitstream
+    void End();
+
+    /// Returns the buffer backing base_stream
+    [[nodiscard]] std::vector<u8>& GetBuffer() {
+        return base_stream.GetBuffer();
+    }
+
+    /// Returns the buffer backing base_stream (const overload)
+    [[nodiscard]] const std::vector<u8>& GetBuffer() const {
+        return base_stream.GetBuffer();
+    }
+
+private:
+    /// Reads the byte at the current stream position without advancing it
+    u8 PeekByte();
+    /// Destination of the encoded bytes
+    Common::Stream base_stream{};
+    // Encoder state, updated by Write(bool, s32)
+    u32 low_value{};
+    u32 range{0xff};
+    s32 count{-24};
+    /// Probability used by Write(bool)
+    s32 half_probability{128};
+};
+
+/// Packs individual bits and small integers, most significant bit first, into a byte vector.
+/// Movable but not copyable.
+class VpxBitStreamWriter {
+public:
+    VpxBitStreamWriter();
+    ~VpxBitStreamWriter();
+
+    VpxBitStreamWriter(const VpxBitStreamWriter&) = delete;
+    VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete;
+
+    VpxBitStreamWriter(VpxBitStreamWriter&&) = default;
+    VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default;
+
+    /// Write an unsigned integer value
+    void WriteU(u32 value, u32 value_size);
+
+    /// Write a signed integer value
+    void WriteS(s32 value, u32 value_size);
+
+    /// Based on 6.2.10 of VP9 Spec, writes a delta coded value
+    void WriteDeltaQ(u32 value);
+
+    /// Write a single bit.
+    void WriteBit(bool state);
+
+    /// Pushes current buffer into byte_array, resets buffer
+    void Flush();
+
+    /// Returns byte_array
+    [[nodiscard]] std::vector<u8>& GetByteArray();
+
+    /// Returns const byte_array
+    [[nodiscard]] const std::vector<u8>& GetByteArray() const;
+
+private:
+    /// Write bit_count bits from value into buffer
+    void WriteBits(u32 value, u32 bit_count);
+
+    /// Gets the number of free bits left in buffer, invokes Flush() if buffer is full
+    s32 GetFreeBufferBits();
+
+    /// Capacity of buffer in bits; one byte is pushed per flush
+    s32 buffer_size{8};
+
+    /// Bits accumulated for the byte currently being built
+    s32 buffer{};
+    /// Number of bits already stored in buffer
+    s32 buffer_pos{};
+    /// Completed bytes
+    std::vector<u8> byte_array;
+};
+
+/// Rebuilds VP9 frame headers from the NVDEC register state and prepends them to the frame's
+/// bitstream. Not copyable; move construction only.
+class VP9 {
+public:
+    explicit VP9(GPU& gpu_);
+    ~VP9();
+
+    VP9(const VP9&) = delete;
+    VP9& operator=(const VP9&) = delete;
+
+    VP9(VP9&&) = default;
+    VP9& operator=(VP9&&) = delete;
+
+    /// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec
+    /// documentation
+    [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(
+        const NvdecCommon::NvdecRegisters& state);
+
+    /// Returns true if the most recent frame was a hidden frame.
+    [[nodiscard]] bool WasFrameHidden() const {
+        return hidden;
+    }
+
+private:
+    /// Generates compressed header probability updates in the bitstream writer
+    template <typename T, std::size_t N>
+    void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                const std::array<T, N>& old_prob);
+
+    /// Generates compressed header probability updates in the bitstream writer
+    /// If probs are not equal, WriteProbabilityDelta is invoked
+    void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// Generates compressed header probability deltas in the bitstream writer
+    void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// Inverse of 6.3.4 Decode term subexp
+    void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value);
+
+    /// Writes if the value is less than the test value
+    bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test);
+
+    /// Writes probability updates for the Coef probabilities
+    void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
+                                    const std::array<u8, 1728>& new_prob,
+                                    const std::array<u8, 1728>& old_prob);
+
+    /// Write probabilities for 4-byte aligned structures
+    template <typename T, std::size_t N>
+    void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                        const std::array<T, N>& old_prob);
+
+    /// Write motion vector probability updates. 6.3.17 in the spec
+    void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// Returns VP9 information from NVDEC provided offset and size
+    [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state);
+
+    /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct
+    void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);
+
+    /// Returns frame to be decoded after buffering
+    [[nodiscard]] Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state);
+
+    /// Use NVDEC provided information to compose the headers for the current frame.
+    /// ComposeUncompressedHeader() must run first: it sets the previous-frame probabilities
+    /// that ComposeCompressedHeader() encodes against.
+    [[nodiscard]] std::vector<u8> ComposeCompressedHeader();
+    [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader();
+
+    GPU& gpu;
+    /// Output of the last ComposeFrameHeader() call (headers followed by the bitstream)
+    std::vector<u8> frame;
+
+    /// Loop filter deltas last written to a header; used to detect changes
+    std::array<s8, 4> loop_filter_ref_deltas{};
+    std::array<s8, 2> loop_filter_mode_deltas{};
+
+    /// True when the most recently composed frame had show_frame unset
+    bool hidden = false;
+    s64 current_frame_number = -2; // since we buffer 2 frames
+    s32 grace_period = 6;          // frame offsets need to stabilize
+    /// Saved entropy contexts, indexed by the frame context index written to the header
+    std::array<FrameContexts, 4> frame_ctxs{};
+    /// Buffered upcoming frames
+    Vp9FrameContainer next_frame{};
+    Vp9FrameContainer next_next_frame{};
+    /// When set, golden and altref reference slots are swapped in the header
+    bool swap_next_golden{};
+
+    /// Picture info of the frame currently being composed
+    Vp9PictureInfo current_frame_info{};
+    /// Probabilities the compressed header updates are encoded against
+    Vp9EntropyProbs prev_frame_probs{};
+
+    s32 diff_update_probability = 252;
+    /// 24-bit sync code written in key frame and intra-only headers
+    s32 frame_sync_code = 0x498342;
+};
+
+} // namespace Decoder
+} // namespace Tegra
--- a/src/video_core/command_classes/codecs/vp9_types.h
+++ b/src/video_core/command_classes/codecs/vp9_types.h
@@ -0,0 +1,302 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstring>
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+class GPU;
+
+namespace Decoder {
+/// Frame dimensions and pitches, four 16-bit fields packed into 8 bytes.
+struct Vp9FrameDimensions {
+    s16 width{};
+    s16 height{};
+    s16 luma_pitch{};
+    s16 chroma_pitch{};
+};
+static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size");
+
+/// Single-bit flags stored in PictureInfo::vp9_flags; test them with a bitwise AND.
+enum FrameFlags : u32 {
+    IsKeyFrame = 1 << 0,
+    LastFrameIsKeyFrame = 1 << 1,
+    FrameSizeChanged = 1 << 2,
+    ErrorResilientMode = 1 << 3,
+    LastShowFrame = 1 << 4,
+    IntraOnly = 1 << 5,
+};
+
+/// Transform block sizes. TxSizes is the number of real entries, not a size itself.
+enum class TxSize {
+    Tx4x4 = 0,   // 4x4 transform
+    Tx8x8 = 1,   // 8x8 transform
+    Tx16x16 = 2, // 16x16 transform
+    Tx32x32 = 3, // 32x32 transform
+    TxSizes = 4
+};
+
+/// Frame-level transform mode. TxModes is the number of real entries, not a mode itself.
+enum class TxMode {
+    Only4X4 = 0,      // Only 4x4 transform used
+    Allow8X8 = 1,     // Allow block transform size up to 8x8
+    Allow16X16 = 2,   // Allow block transform size up to 16x16
+    Allow32X32 = 3,   // Allow block transform size up to 32x32
+    TxModeSelect = 4, // Transform specified for each block
+    TxModes = 5
+};
+
+/// Segmentation parameters as embedded in PictureInfo; the size is fixed at 0x64 bytes.
+struct Segmentation {
+    u8 enabled{};
+    u8 update_map{};
+    u8 temporal_update{};
+    u8 abs_delta{};
+    // One mask and one row of four values per segment (8 segments).
+    std::array<u32, 8> feature_mask{};
+    std::array<std::array<s16, 4>, 8> feature_data{};
+};
+static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size");
+
+/// Loop filter delta parameters as embedded in PictureInfo; the size is fixed at 7 bytes.
+struct LoopFilter {
+    u8 mode_ref_delta_enabled{};
+    std::array<s8, 4> ref_deltas{};
+    std::array<s8, 2> mode_deltas{};
+};
+static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size");
+
+/// Tightly packed probability tables used when composing the compressed header.
+/// Filled from the hardware-layout EntropyProbs by EntropyProbs::Convert().
+struct Vp9EntropyProbs {
+    std::array<u8, 36> y_mode_prob{};
+    std::array<u8, 64> partition_prob{};
+    std::array<u8, 1728> coef_probs{};
+    std::array<u8, 8> switchable_interp_prob{};
+    std::array<u8, 28> inter_mode_prob{};
+    std::array<u8, 4> intra_inter_prob{};
+    std::array<u8, 5> comp_inter_prob{};
+    std::array<u8, 10> single_ref_prob{};
+    std::array<u8, 5> comp_ref_prob{};
+    std::array<u8, 6> tx_32x32_prob{};
+    std::array<u8, 4> tx_16x16_prob{};
+    std::array<u8, 2> tx_8x8_prob{};
+    std::array<u8, 3> skip_probs{};
+    // Motion vector probabilities
+    std::array<u8, 3> joints{};
+    std::array<u8, 2> sign{};
+    std::array<u8, 20> classes{};
+    std::array<u8, 2> class_0{};
+    std::array<u8, 20> prob_bits{};
+    std::array<u8, 12> class_0_fr{};
+    std::array<u8, 6> fr{};
+    std::array<u8, 2> class_0_hp{};
+    std::array<u8, 2> high_precision{};
+};
+static_assert(sizeof(Vp9EntropyProbs) == 0x7B4, "Vp9EntropyProbs is an invalid size");
+
+/// Per-frame parameters in a convenient form, produced by PictureInfo::Convert().
+/// Convert() leaves show_frame, entropy, frame_offsets and refresh_frame at their defaults.
+struct Vp9PictureInfo {
+    bool is_key_frame{};
+    bool intra_only{};
+    bool last_frame_was_key{};
+    bool frame_size_changed{};
+    bool error_resilient_mode{};
+    bool last_frame_shown{};
+    bool show_frame{};
+    std::array<s8, 4> ref_frame_sign_bias{};
+    s32 base_q_index{};
+    s32 y_dc_delta_q{};
+    s32 uv_dc_delta_q{};
+    s32 uv_ac_delta_q{};
+    bool lossless{};
+    s32 transform_mode{};
+    bool allow_high_precision_mv{};
+    s32 interp_filter{};
+    s32 reference_mode{};
+    s8 comp_fixed_ref{};
+    std::array<s8, 2> comp_var_ref{};
+    s32 log2_tile_cols{};
+    s32 log2_tile_rows{};
+    bool segment_enabled{};
+    bool segment_map_update{};
+    bool segment_map_temporal_update{};
+    s32 segment_abs_delta{};
+    std::array<u32, 8> segment_feature_enable{};
+    std::array<std::array<s16, 4>, 8> segment_feature_data{};
+    bool mode_ref_delta_enabled{};
+    bool use_prev_in_find_mv_refs{};
+    std::array<s8, 4> ref_deltas{};
+    std::array<s8, 2> mode_deltas{};
+    Vp9EntropyProbs entropy{};
+    Vp9FrameDimensions frame_size{};
+    u8 first_level{};
+    u8 sharpness_level{};
+    u32 bitstream_size{};
+    std::array<u64, 4> frame_offsets{};
+    std::array<bool, 4> refresh_frame{};
+};
+
+/// A frame's picture info together with its raw bitstream bytes.
+struct Vp9FrameContainer {
+    Vp9PictureInfo info{};
+    std::vector<u8> bit_stream;
+};
+
+/// Picture parameters in their fixed 0x100-byte layout (padding included).
+/// Use Convert() to obtain the unpacked Vp9PictureInfo form.
+struct PictureInfo {
+    INSERT_PADDING_WORDS(12);
+    u32 bitstream_size{};
+    INSERT_PADDING_WORDS(5);
+    Vp9FrameDimensions last_frame_size{};
+    Vp9FrameDimensions golden_frame_size{};
+    Vp9FrameDimensions alt_frame_size{};
+    Vp9FrameDimensions current_frame_size{};
+    u32 vp9_flags{}; ///< Combination of FrameFlags bits
+    std::array<s8, 4> ref_frame_sign_bias{};
+    u8 first_level{};
+    u8 sharpness_level{};
+    u8 base_q_index{};
+    u8 y_dc_delta_q{};
+    u8 uv_ac_delta_q{};
+    u8 uv_dc_delta_q{};
+    u8 lossless{};
+    u8 tx_mode{};
+    u8 allow_high_precision_mv{};
+    u8 interp_filter{};
+    u8 reference_mode{};
+    s8 comp_fixed_ref{};
+    std::array<s8, 2> comp_var_ref{};
+    u8 log2_tile_cols{};
+    u8 log2_tile_rows{};
+    Segmentation segmentation{};
+    LoopFilter loop_filter{};
+    INSERT_PADDING_BYTES(5);
+    u32 surface_params{};
+    INSERT_PADDING_WORDS(3);
+
+    /// Unpacks this structure into a Vp9PictureInfo.
+    /// Flags are decoded from vp9_flags one bit at a time. show_frame, entropy, frame_offsets
+    /// and refresh_frame are not set here and keep their default values.
+    [[nodiscard]] Vp9PictureInfo Convert() const {
+        return {
+            .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0,
+            .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0,
+            .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0,
+            .frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0,
+            .error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0,
+            .last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0,
+            .ref_frame_sign_bias = ref_frame_sign_bias,
+            .base_q_index = base_q_index,
+            .y_dc_delta_q = y_dc_delta_q,
+            .uv_dc_delta_q = uv_dc_delta_q,
+            .uv_ac_delta_q = uv_ac_delta_q,
+            .lossless = lossless != 0,
+            .transform_mode = tx_mode,
+            .allow_high_precision_mv = allow_high_precision_mv != 0,
+            .interp_filter = interp_filter,
+            .reference_mode = reference_mode,
+            .comp_fixed_ref = comp_fixed_ref,
+            .comp_var_ref = comp_var_ref,
+            .log2_tile_cols = log2_tile_cols,
+            .log2_tile_rows = log2_tile_rows,
+            .segment_enabled = segmentation.enabled != 0,
+            .segment_map_update = segmentation.update_map != 0,
+            .segment_map_temporal_update = segmentation.temporal_update != 0,
+            .segment_abs_delta = segmentation.abs_delta,
+            .segment_feature_enable = segmentation.feature_mask,
+            .segment_feature_data = segmentation.feature_data,
+            .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0,
+            // Each flag has to be tested as a bit. Comparing the whole word with == would only
+            // be true when LastShowFrame is the sole bit set in vp9_flags.
+            .use_prev_in_find_mv_refs = (vp9_flags & FrameFlags::ErrorResilientMode) == 0 &&
+                                        (vp9_flags & FrameFlags::FrameSizeChanged) == 0 &&
+                                        (vp9_flags & FrameFlags::IntraOnly) == 0 &&
+                                        (vp9_flags & FrameFlags::LastShowFrame) != 0 &&
+                                        (vp9_flags & FrameFlags::LastFrameIsKeyFrame) == 0,
+            .ref_deltas = loop_filter.ref_deltas,
+            .mode_deltas = loop_filter.mode_deltas,
+            .frame_size = current_frame_size,
+            .first_level = first_level,
+            .sharpness_level = sharpness_level,
+            .bitstream_size = bitstream_size,
+        };
+    }
+};
+static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size");
+
+/// Probability tables in their fixed 0xEA0-byte layout (padding included).
+/// Convert() repacks them into the tightly packed Vp9EntropyProbs form.
+struct EntropyProbs {
+    INSERT_PADDING_BYTES(1024);
+    std::array<u8, 28> inter_mode_prob{};
+    std::array<u8, 4> intra_inter_prob{};
+    INSERT_PADDING_BYTES(80);
+    std::array<u8, 2> tx_8x8_prob{};
+    std::array<u8, 4> tx_16x16_prob{};
+    std::array<u8, 6> tx_32x32_prob{};
+    std::array<u8, 4> y_mode_prob_e8{};
+    std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{};
+    INSERT_PADDING_BYTES(64);
+    std::array<u8, 64> partition_prob{};
+    INSERT_PADDING_BYTES(10);
+    std::array<u8, 8> switchable_interp_prob{};
+    std::array<u8, 5> comp_inter_prob{};
+    std::array<u8, 3> skip_probs{};
+    INSERT_PADDING_BYTES(1);
+    std::array<u8, 3> joints{};
+    std::array<u8, 2> sign{};
+    std::array<u8, 2> class_0{};
+    std::array<u8, 6> fr{};
+    std::array<u8, 2> class_0_hp{};
+    std::array<u8, 2> high_precision{};
+    std::array<u8, 20> classes{};
+    std::array<u8, 12> class_0_fr{};
+    std::array<u8, 20> pred_bits{};
+    std::array<u8, 10> single_ref_prob{};
+    std::array<u8, 5> comp_ref_prob{};
+    INSERT_PADDING_BYTES(17);
+    std::array<u8, 2304> coef_probs{};
+
+    /// Copies every table into fc, repacking the two tables whose layout differs.
+    void Convert(Vp9EntropyProbs& fc) {
+        // Y mode probabilities: each of the 4 rows in fc holds 9 entries, the first 8 from
+        // y_mode_prob_e0e7 and the 9th from y_mode_prob_e8.
+        constexpr std::size_t y_modes_per_row = 9;
+        for (std::size_t row = 0; row < y_mode_prob_e0e7.size(); ++row) {
+            const std::size_t base = row * y_modes_per_row;
+            for (std::size_t col = 0; col < y_mode_prob_e0e7[row].size(); ++col) {
+                fc.y_mode_prob[base + col] = y_mode_prob_e0e7[row][col];
+            }
+            fc.y_mode_prob[base + 8] = y_mode_prob_e8[row];
+        }
+
+        // Coefficient probabilities: the source stores groups of 4 bytes of which only the
+        // first 3 are used, so each group shrinks to 3 bytes in fc.
+        const std::size_t group_count = coef_probs.size() / 4;
+        for (std::size_t group = 0; group < group_count; ++group) {
+            for (std::size_t k = 0; k < 3; ++k) {
+                fc.coef_probs[group * 3 + k] = coef_probs[group * 4 + k];
+            }
+        }
+
+        // Everything else has the same shape on both sides and is copied as is.
+        fc.inter_mode_prob = inter_mode_prob;
+        fc.intra_inter_prob = intra_inter_prob;
+        fc.tx_8x8_prob = tx_8x8_prob;
+        fc.tx_16x16_prob = tx_16x16_prob;
+        fc.tx_32x32_prob = tx_32x32_prob;
+        fc.partition_prob = partition_prob;
+        fc.switchable_interp_prob = switchable_interp_prob;
+        fc.comp_inter_prob = comp_inter_prob;
+        fc.skip_probs = skip_probs;
+        fc.single_ref_prob = single_ref_prob;
+        fc.comp_ref_prob = comp_ref_prob;
+
+        // Motion vector tables; pred_bits is named prob_bits in the destination.
+        fc.joints = joints;
+        fc.sign = sign;
+        fc.classes = classes;
+        fc.class_0 = class_0;
+        fc.prob_bits = pred_bits;
+        fc.class_0_fr = class_0_fr;
+        fc.fr = fr;
+        fc.class_0_hp = class_0_hp;
+        fc.high_precision = high_precision;
+    }
+};
+static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size");
+
+/// The three reference frame kinds.
+enum class Ref { Last, Golden, AltRef };
+
+/// One reference pool entry: a frame number, the reference kind and a refresh flag.
+struct RefPoolElement {
+    s64 frame{};
+    Ref ref{};
+    bool refresh{};
+};
+
+/// A saved entropy context slot.
+struct FrameContexts {
+    s64 from{};              ///< Frame number the probabilities were taken from
+    bool adapted{};
+    Vp9EntropyProbs probs{}; ///< Probabilities stored in this slot
+};
+
+}; // namespace Decoder
+}; // namespace Tegra
--- a/src/video_core/command_classes/host1x.cpp
+++ b/src/video_core/command_classes/host1x.cpp
@@ -0,0 +1,39 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "video_core/command_classes/host1x.h"
+#include "video_core/gpu.h"
+
+/// Stores a reference to the owning GPU; the register state starts value-initialized.
+Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {}
+
+/// Nothing to release explicitly.
+Tegra::Host1x::~Host1x() = default;
+
+/// Stores a 32-bit argument into the register state.
+/// @param offset    Index of the register, counted in 32-bit words from the start of `state`.
+/// @param arguments Value to store.
+/// Offsets beyond the register block are reported and ignored; without this check the copy
+/// would overwrite whatever follows `state` in memory.
+void Tegra::Host1x::StateWrite(u32 offset, u32 arguments) {
+    if (offset >= sizeof(state) / sizeof(u32)) {
+        UNIMPLEMENTED_MSG("Host1x state write out of bounds, offset=0x{:X}", offset);
+        return;
+    }
+    u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u32);
+    std::memcpy(state_offset, &arguments, sizeof(u32));
+}
+
+/// Records the first argument in the register state and then dispatches on the method:
+/// syncpoint waits call Execute(), payload loads update `syncpoint_value`, and anything else is
+/// reported as unimplemented.
+void Tegra::Host1x::ProcessMethod(Method method, const std::vector<u32>& arguments) {
+    const u32 method_index = static_cast<u32>(method);
+    StateWrite(method_index, arguments[0]);
+
+    if (method == Method::LoadSyncptPayload32) {
+        syncpoint_value = arguments[0];
+        return;
+    }
+    if (method == Method::WaitSyncpt || method == Method::WaitSyncpt32) {
+        Execute(arguments[0]);
+        return;
+    }
+    UNIMPLEMENTED_MSG("Host1x method 0x{:X}", method_index);
+}
+
+/// Placeholder for waiting on a syncpoint; currently does nothing with `data`.
+void Tegra::Host1x::Execute([[maybe_unused]] u32 data) {
+    // This method waits on a valid syncpoint.
+    // TODO: Implement when proper Async is in place
+}
--- a/src/video_core/command_classes/host1x.h
+++ b/src/video_core/command_classes/host1x.h
@@ -0,0 +1,78 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+class GPU;
+class Nvdec;
+
+/// Command class that keeps a copy of the Host1x class registers and reacts to a small set of
+/// syncpoint related methods.
+class Host1x {
+public:
+    /// Register block written by StateWrite(). Every member is one 32-bit word; the padding
+    /// entries keep the named registers at their word offsets (total size is asserted below).
+    struct Host1xClassRegisters {
+        u32 incr_syncpt{};
+        u32 incr_syncpt_ctrl{};
+        u32 incr_syncpt_error{};
+        INSERT_PADDING_WORDS(5);
+        u32 wait_syncpt{};
+        u32 wait_syncpt_base{};
+        u32 wait_syncpt_incr{};
+        u32 load_syncpt_base{};
+        u32 incr_syncpt_base{};
+        u32 clear{};
+        u32 wait{};
+        u32 wait_with_interrupt{};
+        u32 delay_use{};
+        u32 tick_count_high{};
+        u32 tick_count_low{};
+        u32 tick_ctrl{};
+        INSERT_PADDING_WORDS(23);
+        u32 ind_ctrl{};
+        u32 ind_off2{};
+        u32 ind_off{};
+        std::array<u32, 31> ind_data{};
+        INSERT_PADDING_WORDS(1);
+        u32 load_syncpoint_payload32{};
+        u32 stall_ctrl{};
+        u32 wait_syncpt32{};
+        u32 wait_syncpt_base32{};
+        u32 load_syncpt_base32{};
+        u32 incr_syncpt_base32{};
+        u32 stall_count_high{};
+        u32 stall_count_low{};
+        u32 xref_ctrl{};
+        u32 channel_xref_high{};
+        u32 channel_xref_low{};
+    };
+    static_assert(sizeof(Host1xClassRegisters) == 0x164, "Host1xClassRegisters is an invalid size");
+
+    /// Methods handled explicitly. Each value is the word index of the matching register
+    /// inside Host1xClassRegisters.
+    enum class Method : u32 {
+        WaitSyncpt = offsetof(Host1xClassRegisters, wait_syncpt) / 4,
+        LoadSyncptPayload32 = offsetof(Host1xClassRegisters, load_syncpoint_payload32) / 4,
+        WaitSyncpt32 = offsetof(Host1xClassRegisters, wait_syncpt32) / 4,
+    };
+
+    explicit Host1x(GPU& gpu);
+    ~Host1x();
+
+    /// Writes the method into the state, Invoke Execute() if encountered
+    /// Only the first element of `arguments` is read.
+    void ProcessMethod(Method method, const std::vector<u32>& arguments);
+
+private:
+    /// For Host1x, execute is waiting on a syncpoint previously written into the state
+    void Execute(u32 data);
+
+    /// Write argument into the provided offset
+    /// The offset is counted in 32-bit words from the start of `state`.
+    void StateWrite(u32 offset, u32 arguments);
+
+    /// Last value received through Method::LoadSyncptPayload32.
+    u32 syncpoint_value{};
+    Host1xClassRegisters state{};
+    GPU& gpu;
+};
+
+} // namespace Tegra
--- a/src/video_core/command_classes/nvdec.cpp
+++ b/src/video_core/command_classes/nvdec.cpp
@@ -0,0 +1,48 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "video_core/command_classes/nvdec.h"
+#include "video_core/gpu.h"
+
+namespace Tegra {
+
+/// Keeps a reference to the GPU and creates the codec object that performs the decoding.
+Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {}
+
+/// The codec is released by its unique_ptr.
+Nvdec::~Nvdec() = default;
+
+/// Forwards the first argument to the codec state and reacts to the methods that have side
+/// effects. The codec selection is stored as-is; every other argument is stored shifted left by
+/// eight bits.
+void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) {
+    const u32 method_index = static_cast<u32>(method);
+
+    if (method == Method::SetVideoCodec) {
+        codec->StateWrite(method_index, arguments[0]);
+        codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0]));
+        return;
+    }
+
+    codec->StateWrite(method_index, static_cast<u64>(arguments[0]) << 8);
+    if (method == Method::Execute) {
+        Execute();
+    }
+}
+
+/// Returns whatever the codec reports as its current frame.
+AVFramePtr Nvdec::GetFrame() {
+    return codec->GetCurrentFrame();
+}
+
+/// Asks the codec to decode when the selected codec is H264 or VP9; any other selection is
+/// reported as unimplemented.
+void Nvdec::Execute() {
+    const auto selected = codec->GetCurrentCodec();
+    const bool can_decode = selected == NvdecCommon::VideoCodec::H264 ||
+                            selected == NvdecCommon::VideoCodec::Vp9;
+    if (can_decode) {
+        codec->Decode();
+        return;
+    }
+    UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec()));
+}
+
+} // namespace Tegra
--- a/src/video_core/command_classes/nvdec.h
+++ b/src/video_core/command_classes/nvdec.h
@@ -0,0 +1,38 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "common/common_types.h"
+#include "video_core/command_classes/codecs/codec.h"
+
+namespace Tegra {
+class GPU;
+
+/// Command class that drives a Codec object: it records method arguments in the codec state,
+/// selects the codec and triggers decoding.
+class Nvdec {
+public:
+    /// Methods that ProcessMethod() reacts to beyond storing the argument.
+    enum class Method : u32 {
+        SetVideoCodec = 0x80,
+        Execute = 0xc0,
+    };
+
+    explicit Nvdec(GPU& gpu);
+    ~Nvdec();
+
+    /// Writes the method into the state, Invoke Execute() if encountered
+    /// Only the first element of `arguments` is read.
+    void ProcessMethod(Method method, const std::vector<u32>& arguments);
+
+    /// Return most recently decoded frame
+    [[nodiscard]] AVFramePtr GetFrame();
+
+private:
+    /// Invoke codec to decode a frame
+    void Execute();
+
+    GPU& gpu;
+    std::unique_ptr<Codec> codec;
+};
+} // namespace Tegra
--- a/src/video_core/command_classes/nvdec_common.h
+++ b/src/video_core/command_classes/nvdec_common.h
@@ -0,0 +1,48 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra::NvdecCommon {
+
+/// Register block of the NVDEC class. Named registers are 64 bits wide while padding is
+/// expressed in 32-bit words; the total size is checked by the static_assert below.
+struct NvdecRegisters {
+    INSERT_PADDING_WORDS(256);
+    u64 set_codec_id{};
+    INSERT_PADDING_WORDS(254);
+    u64 set_platform_id{};
+    u64 picture_info_offset{};
+    u64 frame_bitstream_offset{};
+    u64 frame_number{};
+    u64 h264_slice_data_offsets{};
+    u64 h264_mv_dump_offset{};
+    INSERT_PADDING_WORDS(6);
+    u64 frame_stats_offset{};
+    u64 h264_last_surface_luma_offset{};
+    u64 h264_last_surface_chroma_offset{};
+    std::array<u64, 17> surface_luma_offset{};
+    std::array<u64, 17> surface_chroma_offset{};
+    INSERT_PADDING_WORDS(132);
+    u64 vp9_entropy_probs_offset{};
+    u64 vp9_backward_updates_offset{};
+    u64 vp9_last_frame_segmap_offset{};
+    u64 vp9_curr_frame_segmap_offset{};
+    INSERT_PADDING_WORDS(2);
+    u64 vp9_last_frame_mvs_offset{};
+    u64 vp9_curr_frame_mvs_offset{};
+    INSERT_PADDING_WORDS(2);
+};
+static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size");
+
+/// Codec identifiers as 32-bit values.
+/// NOTE(review): presumably these are the ids the guest writes to `set_codec_id` - confirm.
+enum class VideoCodec : u32 {
+    None = 0x0,
+    H264 = 0x3,
+    Vp8 = 0x5,
+    H265 = 0x7,
+    Vp9 = 0x9,
+};
+
+} // namespace Tegra::NvdecCommon
--- a/src/video_core/command_classes/sync_manager.cpp
+++ b/src/video_core/command_classes/sync_manager.cpp
@@ -0,0 +1,60 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#include <algorithm>
+#include "sync_manager.h"
+#include "video_core/gpu.h"
+
+namespace Tegra {
+/// Stores a reference to the GPU whose syncpoints get incremented.
+SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {}
+/// Nothing to release explicitly.
+SyncptIncrManager::~SyncptIncrManager() = default;
+
+/// Queues an increment for syncpoint `id` that is already complete and flushes every leading
+/// completed increment.
+void SyncptIncrManager::Increment(u32 id) {
+    // Immediate increments carry no handle or class id and are born complete.
+    constexpr u32 no_handle = 0;
+    constexpr u32 no_class_id = 0;
+    constexpr bool already_complete = true;
+    increments.emplace_back(no_handle, no_class_id, id, already_complete);
+    IncrementAllDone();
+}
+
+/// Queues a pending increment for syncpoint `id` and returns the handle that SignalDone()
+/// expects once the work has finished.
+u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) {
+    const u32 handle = current_id;
+    ++current_id;
+    increments.emplace_back(handle, class_id, id);
+    return handle;
+}
+
+/// Marks the first queued increment whose handle matches as complete (if any) and flushes every
+/// leading completed increment.
+void SyncptIncrManager::SignalDone(u32 handle) {
+    for (SyncptIncr& pending : increments) {
+        if (pending.id == handle) {
+            pending.complete = true;
+            break;
+        }
+    }
+    IncrementAllDone();
+}
+
+/// Increments the GPU syncpoint of every completed entry at the front of the queue, stopping at
+/// the first entry that is still pending, then removes the processed entries.
+void SyncptIncrManager::IncrementAllDone() {
+    std::size_t num_done = 0;
+    while (num_done < increments.size() && increments[num_done].complete) {
+        gpu.IncrementSyncPoint(increments[num_done].syncpt_id);
+        ++num_done;
+    }
+    increments.erase(increments.begin(), increments.begin() + num_done);
+}
+} // namespace Tegra
--- a/src/video_core/command_classes/sync_manager.h
+++ b/src/video_core/command_classes/sync_manager.h
@@ -0,0 +1,64 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#pragma once
+
+#include <mutex>
+#include <vector>
+#include "common/common_types.h"
+
+namespace Tegra {
+class GPU;
+/// One queued syncpoint increment.
+struct SyncptIncr {
+    /// Handle used by SyncptIncrManager::SignalDone to find this entry.
+    u32 id;
+    u32 class_id;
+    /// Syncpoint passed to GPU::IncrementSyncPoint once the entry is flushed.
+    u32 syncpt_id;
+    /// Whether the entry may be flushed.
+    bool complete;
+
+    SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false)
+        : id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
+};
+
+/// Keeps an ordered queue of syncpoint increments and applies them to the GPU in queue order,
+/// never past an entry that is still pending.
+class SyncptIncrManager {
+public:
+    explicit SyncptIncrManager(GPU& gpu);
+    ~SyncptIncrManager();
+
+    /// Add syncpoint id and increment all
+    void Increment(u32 id);
+
+    /// Returns a handle to increment later
+    u32 IncrementWhenDone(u32 class_id, u32 id);
+
+    /// Marks the increment identified by handle as done, then calls IncrementAllDone
+    void SignalDone(u32 handle);
+
+    /// Increment all sequential pending increments that are already done.
+    void IncrementAllDone();
+
+private:
+    std::vector<SyncptIncr> increments;
+    // NOTE(review): none of the methods defined in sync_manager.cpp lock this mutex.
+    std::mutex increment_lock;
+    /// Next handle returned by IncrementWhenDone.
+    u32 current_id{};
+
+    GPU& gpu;
+};
+
+} // namespace Tegra
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@@ -0,0 +1,175 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include "common/assert.h"
+#include "video_core/command_classes/nvdec.h"
+#include "video_core/command_classes/vic.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/textures/decoders.h"
+
+extern "C" {
+#include <libswscale/swscale.h>
+}
+
+namespace Tegra {
+
+/// Keeps a reference to the GPU and shares ownership of the NVDEC processor whose frames are
+/// converted by Execute().
+Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
+    : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {}
+
+/// Releases the scaler context cached by Execute(), which would otherwise leak.
+Vic::~Vic() {
+    // sws_freeContext does nothing when given a null context.
+    sws_freeContext(scaler_ctx);
+}
+
+/// Stores a 32-bit argument into the VIC register state.
+/// @param offset    Index of the register, counted in 32-bit words from the start of `vic_state`.
+/// @param arguments Value to store.
+/// Offsets beyond the register block are logged and ignored; without this check the copy would
+/// overwrite whatever follows `vic_state` in memory.
+void Vic::VicStateWrite(u32 offset, u32 arguments) {
+    if (offset >= sizeof(vic_state) / sizeof(u32)) {
+        LOG_ERROR(HW_GPU, "Vic state write out of bounds, offset=0x{:X}", offset);
+        return;
+    }
+    u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32);
+    std::memcpy(state_offset, &arguments, sizeof(u32));
+}
+
+/// Records the first argument in the register state and handles the methods that carry
+/// addresses or trigger execution. Address arguments are stored shifted left by eight bits.
+void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) {
+    // Log the numeric value: a scoped enum has no formatter of its own, and the other command
+    // classes cast before formatting as well.
+    LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method));
+    VicStateWrite(static_cast<u32>(method), arguments[0]);
+    const u64 arg = static_cast<u64>(arguments[0]) << 8;
+    switch (method) {
+    case Method::Execute:
+        Execute();
+        break;
+    case Method::SetConfigStructOffset:
+        config_struct_address = arg;
+        break;
+    case Method::SetOutputSurfaceLumaOffset:
+        output_surface_luma_address = arg;
+        break;
+    case Method::SetOutputSurfaceChromaUOffset:
+        output_surface_chroma_u_address = arg;
+        break;
+    case Method::SetOutputSurfaceChromaVOffset:
+        output_surface_chroma_v_address = arg;
+        break;
+    default:
+        break;
+    }
+}
+
+/// Takes the most recent NVDEC frame, converts it to the pixel format requested by the config
+/// structure and writes the result to guest memory at the output surface address(es).
+/// Does nothing when no luma address was set or when no valid frame is available.
+void Vic::Execute() {
+    if (output_surface_luma_address == 0) {
+        LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Recieved 0x{:X}",
+                  vic_state.output_surface.luma_offset);
+        return;
+    }
+    const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
+    const AVFramePtr frame_ptr = nvdec_processor->GetFrame();
+    const auto* frame = frame_ptr.get();
+    if (!frame || frame->width == 0 || frame->height == 0) {
+        return;
+    }
+    const VideoPixelFormat pixel_format =
+        static_cast<VideoPixelFormat>(config.pixel_format.Value());
+    switch (pixel_format) {
+    case VideoPixelFormat::BGRA8:
+    case VideoPixelFormat::RGBA8: {
+        LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
+
+        // NOTE(review): the cached scaler is only rebuilt when the frame size changes, so a
+        // switch between RGBA8 and BGRA8 at the same size keeps the previous target format.
+        if (scaler_ctx == nullptr || frame->width != scaler_width ||
+            frame->height != scaler_height) {
+            const AVPixelFormat target_format =
+                (pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA;
+
+            sws_freeContext(scaler_ctx);
+            scaler_ctx = nullptr;
+
+            // FFmpeg returns all frames in YUV420, convert it into expected format
+            scaler_ctx =
+                sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width,
+                               frame->height, target_format, 0, nullptr, nullptr, nullptr);
+            if (scaler_ctx == nullptr) {
+                // Leave the cached size untouched so that the next call tries again.
+                LOG_ERROR(Service_NVDRV, "Failed to create scaler context for {}x{} frame",
+                          frame->width, frame->height);
+                return;
+            }
+
+            scaler_width = frame->width;
+            scaler_height = frame->height;
+        }
+        // Get Converted frame
+        const std::size_t linear_size = frame->width * frame->height * 4;
+
+        using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
+        AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free};
+        if (converted_frame_buffer == nullptr) {
+            LOG_ERROR(Service_NVDRV, "Failed to allocate {} bytes for converted frame",
+                      linear_size);
+            return;
+        }
+
+        const int converted_stride{frame->width * 4};
+        u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
+
+        sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height,
+                  &converted_frame_buf_addr, &converted_stride);
+
+        const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
+        if (blk_kind != 0) {
+            // swizzle pitch linear to block linear
+            const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
+            const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
+                                                            block_height, 0);
+            std::vector<u8> swizzled_data(size);
+            Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4,
+                                           frame->width, 4, swizzled_data.data(),
+                                           converted_frame_buffer.get(), block_height, 0, 0);
+
+            gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
+            gpu.Maxwell3D().OnMemoryWrite();
+        } else {
+            // send pitch linear frame
+            gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
+                                           linear_size);
+            gpu.Maxwell3D().OnMemoryWrite();
+        }
+        break;
+    }
+    case VideoPixelFormat::Yuv420: {
+        LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
+
+        // NOTE(review): the copies below are sized from the configured surface, not from the
+        // decoded frame; confirm the surface can never be larger than the frame planes.
+        const std::size_t surface_width = config.surface_width_minus1 + 1;
+        const std::size_t surface_height = config.surface_height_minus1 + 1;
+        const std::size_t half_width = surface_width / 2;
+        const std::size_t half_height = config.surface_height_minus1 / 2;
+        const std::size_t aligned_width = (surface_width + 0xff) & ~0xff;
+
+        const auto* luma_ptr = frame->data[0];
+        const auto* chroma_b_ptr = frame->data[1];
+        const auto* chroma_r_ptr = frame->data[2];
+        const auto stride = frame->linesize[0];
+        const auto half_stride = frame->linesize[1];
+
+        std::vector<u8> luma_buffer(aligned_width * surface_height);
+        std::vector<u8> chroma_buffer(aligned_width * half_height);
+
+        // Populate luma buffer
+        // Note that the last surface row is not copied and stays zero-filled.
+        for (std::size_t y = 0; y < surface_height - 1; ++y) {
+            std::size_t src = y * stride;
+            std::size_t dst = y * aligned_width;
+
+            std::size_t size = surface_width;
+
+            for (std::size_t offset = 0; offset < size; ++offset) {
+                luma_buffer[dst + offset] = luma_ptr[src + offset];
+            }
+        }
+        gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
+                                       luma_buffer.size());
+
+        // Populate chroma buffer from both channels with interleaving.
+        for (std::size_t y = 0; y < half_height; ++y) {
+            std::size_t src = y * half_stride;
+            std::size_t dst = y * aligned_width;
+
+            for (std::size_t x = 0; x < half_width; ++x) {
+                chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
+                chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];
+            }
+        }
+        gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
+                                       chroma_buffer.size());
+        gpu.Maxwell3D().OnMemoryWrite();
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value());
+        break;
+    }
+}
+
+} // namespace Tegra
--- a/src/video_core/command_classes/vic.h
+++ b/src/video_core/command_classes/vic.h
@@ -0,0 +1,110 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+struct SwsContext;
+
+namespace Tegra {
+class GPU;
+class Nvdec;
+
+/// Three consecutive 32-bit registers holding the luma and the two chroma plane offsets of a
+/// surface.
+struct PlaneOffsets {
+    u32 luma_offset{};
+    u32 chroma_u_offset{};
+    u32 chroma_v_offset{};
+};
+
+/// Register block of the VIC class. Every named register is one 32-bit word and the padding
+/// entries keep them at their word offsets; the total size is checked by the static_assert
+/// below.
+struct VicRegisters {
+    INSERT_PADDING_WORDS(64);
+    u32 nop{};
+    INSERT_PADDING_WORDS(15);
+    u32 pm_trigger{};
+    INSERT_PADDING_WORDS(47);
+    u32 set_application_id{};
+    u32 set_watchdog_timer{};
+    INSERT_PADDING_WORDS(17);
+    u32 context_save_area{};
+    u32 context_switch{};
+    INSERT_PADDING_WORDS(43);
+    u32 execute{};
+    INSERT_PADDING_WORDS(63);
+    std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{};
+    u32 picture_index{};
+    u32 control_params{};
+    u32 config_struct_offset{};
+    u32 filter_struct_offset{};
+    u32 palette_offset{};
+    u32 hist_offset{};
+    u32 context_id{};
+    u32 fce_ucode_size{};
+    PlaneOffsets output_surface{};
+    u32 fce_ucode_offset{};
+    INSERT_PADDING_WORDS(4);
+    std::array<u32, 8> slot_context_id{};
+    INSERT_PADDING_WORDS(16);
+};
+static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size");
+
+/// Command class that converts frames decoded by NVDEC into the requested output format and
+/// writes them to guest memory.
+class Vic {
+public:
+    /// Methods handled explicitly by ProcessMethod(); the values are register word indices.
+    enum class Method : u32 {
+        Execute = 0xc0,
+        SetControlParams = 0x1c1,
+        SetConfigStructOffset = 0x1c2,
+        SetOutputSurfaceLumaOffset = 0x1c8,
+        SetOutputSurfaceChromaUOffset = 0x1c9,
+        SetOutputSurfaceChromaVOffset = 0x1ca
+    };
+
+    explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor);
+    ~Vic();
+
+    /// Write to the device state.
+    /// Only the first element of `arguments` is read.
+    void ProcessMethod(Method method, const std::vector<u32>& arguments);
+
+private:
+    /// Converts the current NVDEC frame and writes it to the output surface.
+    void Execute();
+
+    /// Stores `arguments` at word index `offset` of `vic_state`.
+    void VicStateWrite(u32 offset, u32 arguments);
+    VicRegisters vic_state{};
+
+    /// Output formats that Execute() knows how to produce.
+    enum class VideoPixelFormat : u64_le {
+        RGBA8 = 0x1f,
+        BGRA8 = 0x20,
+        Yuv420 = 0x44,
+    };
+
+    /// Bit layout of the 64-bit word that Execute() reads at offset 0x20 of the config structure.
+    union VicConfig {
+        u64_le raw{};
+        BitField<0, 7, u64_le> pixel_format;
+        BitField<7, 2, u64_le> chroma_loc_horiz;
+        BitField<9, 2, u64_le> chroma_loc_vert;
+        BitField<11, 4, u64_le> block_linear_kind;
+        BitField<15, 4, u64_le> block_linear_height_log2;
+        BitField<19, 3, u64_le> reserved0;
+        BitField<22, 10, u64_le> reserved1;
+        BitField<32, 14, u64_le> surface_width_minus1;
+        BitField<46, 14, u64_le> surface_height_minus1;
+    };
+
+    GPU& gpu;
+    std::shared_ptr<Tegra::Nvdec> nvdec_processor;
+
+    // Addresses received through ProcessMethod(), stored as the argument shifted left by 8.
+    GPUVAddr config_struct_address{};
+    GPUVAddr output_surface_luma_address{};
+    GPUVAddr output_surface_chroma_u_address{};
+    GPUVAddr output_surface_chroma_v_address{};
+
+    // Scaler context cached by Execute() together with the frame size it was created for.
+    SwsContext* scaler_ctx{};
+    s32 scaler_width{};
+    s32 scaler_height{};
+};
+
+} // namespace Tegra
--- a/src/video_core/compatible_formats.cpp
+++ b/src/video_core/compatible_formats.cpp
@@ -0,0 +1,249 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <cstddef>
+
+#include "common/common_types.h"
+#include "video_core/compatible_formats.h"
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+namespace {
+
+// One row per pixel format; each row is a bitset of two 64-bit words in which bit N tells
+// whether the row's format is compatible with the format whose index is N.
+using Table = std::array<std::array<u64, 2>, MaxPixelFormat>;
+
+// Compatibility table taken from Table 3.X.2 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt
+// Formats listed in the same array are marked as mutually view compatible by MakeViewTable.
+
+constexpr std::array VIEW_CLASS_128_BITS{
+    PixelFormat::R32G32B32A32_FLOAT,
+    PixelFormat::R32G32B32A32_UINT,
+    PixelFormat::R32G32B32A32_SINT,
+};
+
+constexpr std::array VIEW_CLASS_96_BITS{
+    PixelFormat::R32G32B32_FLOAT,
+};
+// Missing formats:
+// PixelFormat::RGB32UI,
+// PixelFormat::RGB32I,
+
+constexpr std::array VIEW_CLASS_64_BITS{
+    PixelFormat::R32G32_FLOAT,       PixelFormat::R32G32_UINT,
+    PixelFormat::R32G32_SINT,        PixelFormat::R16G16B16A16_FLOAT,
+    PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM,
+    PixelFormat::R16G16B16A16_UINT,  PixelFormat::R16G16B16A16_SINT,
+};
+
+// TODO: How should we handle 48 bits?
+
+constexpr std::array VIEW_CLASS_32_BITS{
+    PixelFormat::R16G16_FLOAT,      PixelFormat::B10G11R11_FLOAT, PixelFormat::R32_FLOAT,
+    PixelFormat::A2B10G10R10_UNORM, PixelFormat::R16G16_UINT,     PixelFormat::R32_UINT,
+    PixelFormat::R16G16_SINT,       PixelFormat::R32_SINT,        PixelFormat::A8B8G8R8_UNORM,
+    PixelFormat::R16G16_UNORM,      PixelFormat::A8B8G8R8_SNORM,  PixelFormat::R16G16_SNORM,
+    PixelFormat::A8B8G8R8_SRGB,     PixelFormat::E5B9G9R9_FLOAT,  PixelFormat::B8G8R8A8_UNORM,
+    PixelFormat::B8G8R8A8_SRGB,     PixelFormat::A8B8G8R8_UINT,   PixelFormat::A8B8G8R8_SINT,
+    PixelFormat::A2B10G10R10_UINT,
+};
+
+// TODO: How should we handle 24 bits?
+
+constexpr std::array VIEW_CLASS_16_BITS{
+    PixelFormat::R16_FLOAT,  PixelFormat::R8G8_UINT,  PixelFormat::R16_UINT,
+    PixelFormat::R16_SINT,   PixelFormat::R8G8_UNORM, PixelFormat::R16_UNORM,
+    PixelFormat::R8G8_SNORM, PixelFormat::R16_SNORM,  PixelFormat::R8G8_SINT,
+};
+
+constexpr std::array VIEW_CLASS_8_BITS{
+    PixelFormat::R8_UINT,
+    PixelFormat::R8_UNORM,
+    PixelFormat::R8_SINT,
+    PixelFormat::R8_SNORM,
+};
+
+// View classes of the block-compressed formats. Each array pairs the variants of one
+// compressed format (e.g. UNORM with SNORM or SRGB).
+constexpr std::array VIEW_CLASS_RGTC1_RED{
+    PixelFormat::BC4_UNORM,
+    PixelFormat::BC4_SNORM,
+};
+
+constexpr std::array VIEW_CLASS_RGTC2_RG{
+    PixelFormat::BC5_UNORM,
+    PixelFormat::BC5_SNORM,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_UNORM{
+    PixelFormat::BC7_UNORM,
+    PixelFormat::BC7_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_FLOAT{
+    PixelFormat::BC6H_SFLOAT,
+    PixelFormat::BC6H_UFLOAT,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_4x4_RGBA{
+    PixelFormat::ASTC_2D_4X4_UNORM,
+    PixelFormat::ASTC_2D_4X4_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_5x4_RGBA{
+    PixelFormat::ASTC_2D_5X4_UNORM,
+    PixelFormat::ASTC_2D_5X4_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_5x5_RGBA{
+    PixelFormat::ASTC_2D_5X5_UNORM,
+    PixelFormat::ASTC_2D_5X5_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_6x5_RGBA{
+    PixelFormat::ASTC_2D_6X5_UNORM,
+    PixelFormat::ASTC_2D_6X5_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_6x6_RGBA{
+    PixelFormat::ASTC_2D_6X6_UNORM,
+    PixelFormat::ASTC_2D_6X6_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_8x5_RGBA{
+    PixelFormat::ASTC_2D_8X5_UNORM,
+    PixelFormat::ASTC_2D_8X5_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_8x8_RGBA{
+    PixelFormat::ASTC_2D_8X8_UNORM,
+    PixelFormat::ASTC_2D_8X8_SRGB,
+};
+
+// Missing formats:
+// PixelFormat::ASTC_2D_10X5_UNORM
+// PixelFormat::ASTC_2D_10X5_SRGB
+
+// Missing formats:
+// PixelFormat::ASTC_2D_10X6_UNORM
+// PixelFormat::ASTC_2D_10X6_SRGB
+
+constexpr std::array VIEW_CLASS_ASTC_10x8_RGBA{
+    PixelFormat::ASTC_2D_10X8_UNORM,
+    PixelFormat::ASTC_2D_10X8_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_10x10_RGBA{
+    PixelFormat::ASTC_2D_10X10_UNORM,
+    PixelFormat::ASTC_2D_10X10_SRGB,
+};
+
+// Missing formats
+// ASTC_2D_12X10_UNORM,
+// ASTC_2D_12X10_SRGB,
+
+constexpr std::array VIEW_CLASS_ASTC_12x12_RGBA{
+    PixelFormat::ASTC_2D_12X12_UNORM,
+    PixelFormat::ASTC_2D_12X12_SRGB,
+};
+
+// Compatibility table taken from Table 4.X.1 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt
+
+// Copy classes: MakeCopyTable marks the formats of each array as mutually copy compatible on
+// top of everything that is already view compatible.
+constexpr std::array COPY_CLASS_128_BITS{
+    PixelFormat::R32G32B32A32_UINT, PixelFormat::R32G32B32A32_FLOAT, PixelFormat::R32G32B32A32_SINT,
+    PixelFormat::BC2_UNORM,         PixelFormat::BC2_SRGB,           PixelFormat::BC3_UNORM,
+    PixelFormat::BC3_SRGB,          PixelFormat::BC5_UNORM,          PixelFormat::BC5_SNORM,
+    PixelFormat::BC7_UNORM,         PixelFormat::BC7_SRGB,           PixelFormat::BC6H_SFLOAT,
+    PixelFormat::BC6H_UFLOAT,
+};
+// Missing formats:
+// PixelFormat::RGBA32I
+// COMPRESSED_RG_RGTC2
+
+constexpr std::array COPY_CLASS_64_BITS{
+    PixelFormat::R16G16B16A16_FLOAT, PixelFormat::R16G16B16A16_UINT,
+    PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM,
+    PixelFormat::R16G16B16A16_SINT,  PixelFormat::R32G32_UINT,
+    PixelFormat::R32G32_FLOAT,       PixelFormat::R32G32_SINT,
+    PixelFormat::BC1_RGBA_UNORM,     PixelFormat::BC1_RGBA_SRGB,
+};
+// Missing formats:
+// COMPRESSED_RGB_S3TC_DXT1_EXT
+// COMPRESSED_SRGB_S3TC_DXT1_EXT
+// COMPRESSED_RGBA_S3TC_DXT1_EXT
+// COMPRESSED_SIGNED_RED_RGTC1
+
+/// Marks two formats, given by index, as compatible with each other in both directions.
+constexpr void Enable(Table& table, size_t format_a, size_t format_b) {
+    constexpr size_t bits_per_word = 64;
+    const u64 bit_of_b = u64{1} << (format_b % bits_per_word);
+    const u64 bit_of_a = u64{1} << (format_a % bits_per_word);
+    table[format_a][format_b / bits_per_word] |= bit_of_b;
+    table[format_b][format_a / bits_per_word] |= bit_of_a;
+}
+
+/// Convenience overload taking the formats themselves instead of their indices.
+constexpr void Enable(Table& table, PixelFormat format_a, PixelFormat format_b) {
+    const auto index_a = static_cast<size_t>(format_a);
+    const auto index_b = static_cast<size_t>(format_b);
+    Enable(table, index_a, index_b);
+}
+
+/// Marks every pair of formats in `range` (including each format with itself) as compatible.
+template <typename Range>
+constexpr void EnableRange(Table& table, const Range& range) {
+    const auto last = range.end();
+    for (auto outer = range.begin(); outer != last; ++outer) {
+        // Enable() is symmetric, so only pairs starting at the current element are needed.
+        auto inner = outer;
+        while (inner != last) {
+            Enable(table, *outer, *inner);
+            ++inner;
+        }
+    }
+}
+
+/// Returns whether `table` has the bit for `format_b` set in the row of `format_a`.
+constexpr bool IsSupported(const Table& table, PixelFormat format_a, PixelFormat format_b) {
+    const auto row = static_cast<size_t>(format_a);
+    const auto column = static_cast<size_t>(format_b);
+    const u64 word = table[row][column / 64];
+    const u64 mask = u64{1} << (column % 64);
+    return (word & mask) != 0;
+}
+
+/// Builds the view compatibility table: every format is compatible with itself and with the
+/// other members of each view class it is listed in.
+constexpr Table MakeViewTable() {
+    Table table{};
+
+    // Identity is allowed
+    for (size_t format = 0; format < MaxPixelFormat; ++format) {
+        Enable(table, format, format);
+    }
+
+    // Size based classes
+    EnableRange(table, VIEW_CLASS_128_BITS);
+    EnableRange(table, VIEW_CLASS_96_BITS);
+    EnableRange(table, VIEW_CLASS_64_BITS);
+    EnableRange(table, VIEW_CLASS_32_BITS);
+    EnableRange(table, VIEW_CLASS_16_BITS);
+    EnableRange(table, VIEW_CLASS_8_BITS);
+
+    // Block-compressed classes
+    EnableRange(table, VIEW_CLASS_RGTC1_RED);
+    EnableRange(table, VIEW_CLASS_RGTC2_RG);
+    EnableRange(table, VIEW_CLASS_BPTC_UNORM);
+    EnableRange(table, VIEW_CLASS_BPTC_FLOAT);
+
+    // ASTC classes
+    EnableRange(table, VIEW_CLASS_ASTC_4x4_RGBA);
+    EnableRange(table, VIEW_CLASS_ASTC_5x4_RGBA);
+    EnableRange(table, VIEW_CLASS_ASTC_5x5_RGBA);
+    EnableRange(table, VIEW_CLASS_ASTC_6x5_RGBA);
+    EnableRange(table, VIEW_CLASS_ASTC_6x6_RGBA);
+    EnableRange(table, VIEW_CLASS_ASTC_8x5_RGBA);
+    EnableRange(table, VIEW_CLASS_ASTC_8x8_RGBA);
+    EnableRange(table, VIEW_CLASS_ASTC_10x8_RGBA);
+    EnableRange(table, VIEW_CLASS_ASTC_10x10_RGBA);
+    EnableRange(table, VIEW_CLASS_ASTC_12x12_RGBA);
+
+    return table;
+}
+
+/// Builds the copy compatibility table: everything that is view compatible plus the members of
+/// the copy classes.
+constexpr Table MakeCopyTable() {
+    Table table = MakeViewTable();
+    EnableRange(table, COPY_CLASS_128_BITS);
+    EnableRange(table, COPY_CLASS_64_BITS);
+    return table;
+}
+
+} // Anonymous namespace
+
+/// Looks the pair up in the view compatibility table, which is computed at compile time.
+bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b) {
+    static constexpr Table VIEW_TABLE = MakeViewTable();
+    const bool compatible = IsSupported(VIEW_TABLE, format_a, format_b);
+    return compatible;
+}
+
+/// Looks the pair up in the copy compatibility table, which is computed at compile time.
+bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b) {
+    static constexpr Table COPY_TABLE = MakeCopyTable();
+    const bool compatible = IsSupported(COPY_TABLE, format_a, format_b);
+    return compatible;
+}
+
+} // namespace VideoCore::Surface
--- a/src/video_core/compatible_formats.h
+++ b/src/video_core/compatible_formats.h
@@ -0,0 +1,15 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+/// Returns true when the two formats are marked as compatible in the texture view table
+/// (a format is always view compatible with itself).
+bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b);
+
+/// Returns true when the two formats are marked as compatible in the image copy table, which
+/// contains every view compatible pair plus the copy classes.
+bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b);
+
+} // namespace VideoCore::Surface
--- a/src/video_core/delayed_destruction_ring.h
+++ b/src/video_core/delayed_destruction_ring.h
@@ -0,0 +1,31 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+namespace VideoCommon {
+
+/**
+ * Ring of buckets that defers destruction of objects by a fixed number of ticks.
+ *
+ * Push() stores an object in the current bucket. Tick() advances to the next bucket and destroys
+ * everything stored there, so an object pushed now is destroyed on the TICKS_TO_DESTROY-th
+ * subsequent call to Tick().
+ *
+ * @tparam T                Type of the stored objects; they are moved into the ring.
+ * @tparam TICKS_TO_DESTROY Number of buckets in the ring; must be greater than zero.
+ */
+template <typename T, std::size_t TICKS_TO_DESTROY>
+class DelayedDestructionRing {
+    // Tick() computes `index % TICKS_TO_DESTROY`; a zero-sized ring would divide by zero.
+    static_assert(TICKS_TO_DESTROY > 0, "DelayedDestructionRing needs at least one bucket");
+
+public:
+    /// Advances to the next bucket and destroys every object that was stored in it.
+    void Tick() {
+        index = (index + 1) % TICKS_TO_DESTROY;
+        elements[index].clear();
+    }
+
+    /// Takes ownership of object and keeps it alive until its bucket is recycled by Tick().
+    void Push(T&& object) {
+        elements[index].push_back(std::move(object));
+    }
+
+private:
+    std::size_t index = 0;                                ///< Bucket receiving new objects
+    std::array<std::vector<T>, TICKS_TO_DESTROY> elements; ///< One bucket per tick of delay
+};
+
+} // namespace VideoCommon
--- a/src/video_core/dirty_flags.cpp
+++ b/src/video_core/dirty_flags.cpp
@@ -0,0 +1,45 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <cstddef>
+
+#include "common/common_types.h"
+#include "video_core/dirty_flags.h"
+
+// Register index of a Maxwell3D register field (forwards to MAXWELL3D_REG_INDEX).
+#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name)
+// Number of 32-bit words occupied by a field of Maxwell3D::Regs.
+#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32)))
+
+namespace VideoCommon::Dirty {
+
+using Tegra::Engines::Maxwell3D;
+
+/// Fills the first two dirty tables with the flags shared by all backends: descriptors,
+/// per-render-target color buffers, render target control and the zeta buffer.
+void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) {
+    // The tic and tsc register blocks both raise the Descriptors flag.
+    FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors);
+    FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors);
+
+    static constexpr std::size_t rt_begin = OFF(rt);
+    static constexpr std::size_t regs_per_rt = NUM(rt[0]);
+    static constexpr std::size_t rt_total = regs_per_rt * Maxwell3D::Regs::NumRenderTargets;
+
+    // Table 0 gets one flag per render target, table 1 a single flag covering all of them.
+    for (std::size_t target = 0; target < Maxwell3D::Regs::NumRenderTargets; ++target) {
+        const std::size_t target_begin = rt_begin + target * regs_per_rt;
+        FillBlock(tables[0], target_begin, regs_per_rt, ColorBuffer0 + target);
+    }
+    FillBlock(tables[1], rt_begin, rt_total, RenderTargets);
+    FillBlock(tables[0], OFF(render_area), NUM(render_area), RenderTargets);
+
+    tables[0][OFF(rt_control)] = RenderTargets;
+    tables[1][OFF(rt_control)] = RenderTargetControl;
+
+    // Zeta registers raise ZetaBuffer in table 0 and RenderTargets in table 1.
+    static constexpr std::array zeta_flags{ZetaBuffer, RenderTargets};
+    std::size_t table_index = 0;
+    for (const u8 flag : zeta_flags) {
+        auto& table = tables[table_index];
+        ++table_index;
+        table[OFF(zeta_enable)] = flag;
+        table[OFF(zeta_width)] = flag;
+        table[OFF(zeta_height)] = flag;
+        FillBlock(table, OFF(zeta), NUM(zeta), flag);
+    }
+}
+
+} // namespace VideoCommon::Dirty
--- a/src/video_core/dirty_flags.h
+++ b/src/video_core/dirty_flags.h
@@ -0,0 +1,52 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+
+#include "common/common_types.h"
+#include "video_core/engines/maxwell_3d.h"
+
+namespace VideoCommon::Dirty {
+
+/// Dirty flag indices shared by all users of the dirty tables.
+/// The values are consecutive; SetupDirtyRenderTargets relies on ColorBuffer0..ColorBuffer7
+/// being contiguous because it computes `ColorBuffer0 + rt`.
+enum : u8 {
+    NullEntry = 0,
+
+    // Assigned to the tic and tsc register blocks
+    Descriptors,
+
+    // Render target related flags
+    RenderTargets,
+    RenderTargetControl,
+    ColorBuffer0,
+    ColorBuffer1,
+    ColorBuffer2,
+    ColorBuffer3,
+    ColorBuffer4,
+    ColorBuffer5,
+    ColorBuffer6,
+    ColorBuffer7,
+    ZetaBuffer,
+
+    // One past the last common flag
+    // NOTE(review): presumably the first index free for backend-specific flags - confirm
+    LastCommonEntry,
+};
+
+/// Assigns dirty_index (converted to u8) to `num` consecutive entries of table, starting at
+/// entry `begin`.
+template <typename Integer>
+void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Table& table, std::size_t begin,
+               std::size_t num, Integer dirty_index) {
+    const u8 flag = static_cast<u8>(dirty_index);
+    std::fill_n(std::begin(table) + begin, num, flag);
+}
+
+/// Fills the same register range in the first two tables: index_a goes to tables[0] and
+/// index_b goes to tables[1].
+template <typename Integer1, typename Integer2>
+void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_t begin,
+               std::size_t num, Integer1 index_a, Integer2 index_b) {
+    auto& first_table = tables[0];
+    auto& second_table = tables[1];
+    FillBlock(first_table, begin, num, index_a);
+    FillBlock(second_table, begin, num, index_b);
+}
+
+/// Populates tables[0] and tables[1] with the common dirty flags for descriptors, render
+/// targets, render target control and the zeta buffer.
+void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables);
+
+} // namespace VideoCommon::Dirty
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -0,0 +1,177 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/cityhash.h"
+#include "common/microprofile.h"
+#include "core/core.h"
+#include "core/memory.h"
+#include "video_core/dma_pusher.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra {
+
+// Only stores references to the owning system and GPU; both must outlive the pusher.
+DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_) : gpu{gpu_}, system{system_} {}
+
+DmaPusher::~DmaPusher() = default;
+
+MICROPROFILE_DEFINE(DispatchCalls, "GPU", "Execute command buffer", MP_RGB(128, 128, 192));
+
+/// Drains the queued command lists, then flushes and synchronizes the GPU.
+void DmaPusher::DispatchCalls() {
+    MICROPROFILE_SCOPE(DispatchCalls);
+
+    gpu.SyncGuestHost();
+    // On entering GPU code, assume all memory may be touched by the ARM core.
+    gpu.Maxwell3D().OnMemoryWrite();
+
+    dma_state.is_last_call = true;
+    dma_pushbuffer_subindex = 0;
+
+    // Keep stepping until Step() reports there is nothing left or the system powers off.
+    while (system.IsPoweredOn() && Step()) {
+    }
+
+    gpu.FlushCommands();
+    gpu.SyncGuestHost();
+    gpu.OnCommandListEnd();
+}
+
+/// Processes one unit of queued work: either a whole prefetched command list, or a single
+/// CommandListHeader entry of the command list at the front of the queue.
+/// @returns false when there is nothing to process (IB disabled or queue empty), true otherwise.
+bool DmaPusher::Step() {
+    if (!ib_enable || dma_pushbuffer.empty()) {
+        // pushbuffer empty and IB empty or nonexistent - nothing to do
+        return false;
+    }
+
+    // Reference into the queue: it must not be used after dma_pushbuffer.pop().
+    CommandList& command_list{dma_pushbuffer.front()};
+
+    ASSERT_OR_EXECUTE(
+        command_list.command_lists.size() || command_list.prefetch_command_list.size(), {
+            // Somehow the command_list is empty, in order to avoid a crash
+            // We ignore it and assume its size is 0.
+            dma_pushbuffer.pop();
+            dma_pushbuffer_subindex = 0;
+            return true;
+        });
+
+    if (command_list.prefetch_command_list.size()) {
+        // Prefetched command list from nvdrv, used for things like synchronization
+        command_headers = std::move(command_list.prefetch_command_list);
+        dma_pushbuffer.pop();
+    } else {
+        // The header is copied by value, so it stays valid if the list is popped below.
+        const CommandListHeader command_list_header{
+            command_list.command_lists[dma_pushbuffer_subindex++]};
+        const GPUVAddr dma_get = command_list_header.addr;
+
+        if (dma_pushbuffer_subindex >= command_list.command_lists.size()) {
+            // We've gone through the current list, remove it from the queue
+            dma_pushbuffer.pop();
+            dma_pushbuffer_subindex = 0;
+        }
+
+        if (command_list_header.size == 0) {
+            return true;
+        }
+
+        // Push buffer non-empty, read all of its words (size counts 32-bit words)
+        command_headers.resize(command_list_header.size);
+        gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
+                                            command_list_header.size * sizeof(u32));
+    }
+    // dma_state is a member, so a command whose data words do not all fit in this buffer
+    // continues with the remaining method_count on the next buffer.
+    for (std::size_t index = 0; index < command_headers.size();) {
+        const CommandHeader& command_header = command_headers[index];
+
+        if (dma_state.method_count) {
+            // Data word of methods command
+            if (dma_state.non_incrementing) {
+                // All remaining words target the same method: hand over as many as this
+                // buffer still holds in a single call.
+                const u32 max_write = static_cast<u32>(
+                    std::min<std::size_t>(index + dma_state.method_count, command_headers.size()) -
+                    index);
+                CallMultiMethod(&command_header.argument, max_write);
+                dma_state.method_count -= max_write;
+                dma_state.is_last_call = true;
+                index += max_write;
+                continue;
+            } else {
+                dma_state.is_last_call = dma_state.method_count <= 1;
+                CallMethod(command_header.argument);
+            }
+
+            if (!dma_state.non_incrementing) {
+                dma_state.method++;
+            }
+
+            // IncreaseOnce: the method was incremented once above, from now on it stays fixed
+            if (dma_increment_once) {
+                dma_state.non_incrementing = true;
+            }
+
+            dma_state.method_count--;
+        } else {
+            // No command active - this is the first word of a new one
+            switch (command_header.mode) {
+            case SubmissionMode::Increasing:
+                SetState(command_header);
+                dma_state.non_incrementing = false;
+                dma_increment_once = false;
+                break;
+            case SubmissionMode::NonIncreasing:
+                SetState(command_header);
+                dma_state.non_incrementing = true;
+                dma_increment_once = false;
+                break;
+            case SubmissionMode::Inline:
+                // The argument is carried in the header itself (arg_count field), so the
+                // method is called immediately and no data words follow.
+                dma_state.method = command_header.method;
+                dma_state.subchannel = command_header.subchannel;
+                CallMethod(command_header.arg_count);
+                dma_state.non_incrementing = true;
+                dma_increment_once = false;
+                break;
+            case SubmissionMode::IncreaseOnce:
+                SetState(command_header);
+                dma_state.non_incrementing = false;
+                dma_increment_once = true;
+                break;
+            default:
+                // Any other mode (e.g. IncreasingOld, NonIncreasingOld) is ignored
+                break;
+            }
+        }
+        index++;
+    }
+
+    return true;
+}
+
+/// Latches the target of a new command (subchannel, method and number of data words) from
+/// its header word.
+void DmaPusher::SetState(const CommandHeader& command_header) {
+    dma_state.subchannel = command_header.subchannel;
+    dma_state.method = command_header.method;
+    dma_state.method_count = command_header.method_count;
+}
+
+/// Dispatches a single argument for the current method: methods at or above non_puller_methods
+/// go to the engine bound to the current subchannel, the rest go to the GPU itself.
+void DmaPusher::CallMethod(u32 argument) const {
+    if (dma_state.method >= non_puller_methods) {
+        Engines::EngineInterface* const engine = subchannels[dma_state.subchannel];
+        engine->CallMethod(dma_state.method, argument, dma_state.is_last_call);
+        return;
+    }
+
+    gpu.CallMethod(GPU::MethodCall{
+        dma_state.method,
+        argument,
+        dma_state.subchannel,
+        dma_state.method_count,
+    });
+}
+
+/// Dispatches num_methods consecutive arguments for the current method, using the same
+/// routing rule as CallMethod.
+void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const {
+    if (dma_state.method >= non_puller_methods) {
+        Engines::EngineInterface* const engine = subchannels[dma_state.subchannel];
+        engine->CallMultiMethod(dma_state.method, base_start, num_methods,
+                                dma_state.method_count);
+        return;
+    }
+
+    gpu.CallMultiMethod(dma_state.method, dma_state.subchannel, base_start, num_methods,
+                        dma_state.method_count);
+}
+
+} // namespace Tegra
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -0,0 +1,154 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <vector>
+#include <queue>
+
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "video_core/engines/engine_interface.h"
+
+namespace Core {
+class System;
+}
+
+namespace Tegra {
+
+class GPU;
+
+/// Submission mode encoded in the top bits of a CommandHeader.
+/// DmaPusher::Step handles Increasing, NonIncreasing, Inline and IncreaseOnce; headers with
+/// any other mode are ignored there.
+enum class SubmissionMode : u32 {
+    IncreasingOld = 0,
+    Increasing = 1, ///< The method is incremented after every data word
+    NonIncreasingOld = 2,
+    NonIncreasing = 3, ///< Every data word goes to the same method
+    Inline = 4,        ///< The argument is stored in the header; no data words follow
+    IncreaseOnce = 5   ///< The method is incremented after the first data word only
+};
+
+// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
+// their numbers are written down multiplied by 4 in the docs. Here we do not multiply by 4,
+// so the values you see in the docs might be these values multiplied by 4.
+/// Method numbers below NonPullerMethods.
+enum class BufferMethods : u32 {
+    BindObject = 0x0,
+    Nop = 0x2,
+    SemaphoreAddressHigh = 0x4,
+    SemaphoreAddressLow = 0x5,
+    SemaphoreSequence = 0x6,
+    SemaphoreTrigger = 0x7,
+    NotifyIntr = 0x8,
+    WrcacheFlush = 0x9,
+    Unk28 = 0xA,
+    UnkCacheFlush = 0xB,
+    RefCnt = 0x14,
+    SemaphoreAcquire = 0x1A,
+    SemaphoreRelease = 0x1B,
+    FenceValue = 0x1C,
+    FenceAction = 0x1D,
+    WaitForInterrupt = 0x1E,
+    Unk7c = 0x1F,
+    Yield = 0x20,
+    // Same value as DmaPusher::non_puller_methods: methods from here on are forwarded to the
+    // engine bound to the subchannel rather than to the GPU itself.
+    NonPullerMethods = 0x40,
+};
+
+/// 64-bit entry of a command list, describing one block of command words in GPU memory.
+struct CommandListHeader {
+    union {
+        u64 raw;
+        // Bits 0-39: GPU virtual address the command words are read from
+        BitField<0, 40, GPUVAddr> addr;
+        // Bit 41 (bit 40 is not mapped to any field here)
+        BitField<41, 1, u64> is_non_main;
+        // Bits 42-62: number of 32-bit command words at addr
+        BitField<42, 21, u64> size;
+    };
+};
+static_assert(sizeof(CommandListHeader) == sizeof(u64), "CommandListHeader is incorrect size");
+
+/// One 32-bit word of the command stream. Depending on the pusher state it is read either as
+/// a command header (method, subchannel, count, mode) or as a raw data word (argument).
+union CommandHeader {
+    // The whole word, used when it is a data word of an active command
+    u32 argument;
+    // Bits 0-12: method number
+    BitField<0, 13, u32> method;
+    // Bits 0-23; overlaps method and subchannel and is not used in dma_pusher.cpp
+    BitField<0, 24, u32> method_count_;
+    // Bits 13-15: subchannel the method is sent to
+    BitField<13, 3, u32> subchannel;
+    // Bits 16-28: two names for the same bits. Read as arg_count it holds the immediate
+    // argument of an Inline command; read as method_count it holds the number of data words.
+    BitField<16, 13, u32> arg_count;
+    BitField<16, 13, u32> method_count;
+    // Bits 29-31: submission mode
+    BitField<29, 3, SubmissionMode> mode;
+};
+static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout");
+static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");
+
+/// Assembles a command header word from a method, the value of the arg_count field and a
+/// submission mode. All bits not covered by these three fields are left at zero.
+inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) {
+    CommandHeader header{};
+    // The three fields occupy disjoint bit ranges of the word.
+    header.mode.Assign(mode);
+    header.arg_count.Assign(arg_count);
+    header.method.Assign(static_cast<u32>(method));
+    return header;
+}
+
+/// A unit of work for the DmaPusher. It carries either headers pointing at command words in
+/// GPU memory (command_lists) or command words that were already fetched
+/// (prefetch_command_list).
+struct CommandList final {
+    CommandList() = default;
+
+    /// Creates a list holding `size` value-initialized CommandListHeader entries.
+    explicit CommandList(std::size_t size) : command_lists(size) {}
+
+    /// Creates a list that owns an already fetched sequence of command words.
+    explicit CommandList(std::vector<CommandHeader>&& prefetch_command_list_)
+        : prefetch_command_list(std::move(prefetch_command_list_)) {}
+
+    std::vector<CommandListHeader> command_lists;     ///< Blocks to fetch from GPU memory
+    std::vector<CommandHeader> prefetch_command_list; ///< Command words provided directly
+};
+
+/**
+ * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the
+ * emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled
+ * into a "command stream" consisting of 32-bit words that make up "commands".
+ * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for
+ * details on this implementation.
+ */
+class DmaPusher final {
+public:
+    explicit DmaPusher(Core::System& system_, GPU& gpu_);
+    ~DmaPusher();
+
+    /// Queues a command list; it is processed by a later call to DispatchCalls().
+    void Push(CommandList&& entries) {
+        dma_pushbuffer.push(std::move(entries));
+    }
+
+    /// Processes queued command lists until none is left or the system is powered off.
+    void DispatchCalls();
+
+    /// Registers the engine that receives non-puller methods sent to subchannel_id.
+    /// No bounds check is done here: subchannel_id must be below max_subchannels.
+    void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id) {
+        subchannels[subchannel_id] = engine;
+    }
+
+private:
+    /// Methods below this value are sent to the GPU, the rest to the bound subchannel engine.
+    static constexpr u32 non_puller_methods = 0x40;
+    static constexpr u32 max_subchannels = 8;
+
+    /// Processes one entry of the queue; returns false when there is nothing to do.
+    bool Step();
+
+    /// Loads method, subchannel and method count of a new command into dma_state.
+    void SetState(const CommandHeader& command_header);
+
+    /// Dispatch one argument / several consecutive arguments for the current method.
+    void CallMethod(u32 argument) const;
+    void CallMultiMethod(const u32* base_start, u32 num_methods) const;
+
+    std::vector<CommandHeader> command_headers; ///< Buffer for list of commands fetched at once
+
+    std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed
+    std::size_t dma_pushbuffer_subindex{};  ///< Index within a command list within the pushbuffer
+
+    struct DmaState {
+        u32 method;            ///< Current method
+        u32 subchannel;        ///< Current subchannel
+        u32 method_count;      ///< Current method count
+        u32 length_pending;    ///< Large NI command length pending
+        bool non_incrementing; ///< Current command's NI flag
+        bool is_last_call;     ///< Passed to the engine as the is_last_call argument
+    };
+
+    DmaState dma_state{};
+    bool dma_increment_once{}; ///< Set while an IncreaseOnce command is active
+
+    bool ib_enable{true}; ///< IB mode enabled
+
+    /// Engines bound through BindSubchannel(); entries are null until bound.
+    std::array<Engines::EngineInterface*, max_subchannels> subchannels{};
+
+    GPU& gpu;
+    Core::System& system;
+};
+
+} // namespace Tegra
--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -0,0 +1,103 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <type_traits>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/guest_driver.h"
+#include "video_core/textures/texture.h"
+
+namespace Tegra::Engines {
+
+/// Compact 32-bit description of a sampler: shader-side texture type, array/buffer/shadow
+/// flags, per-component types and texture format.
+struct SamplerDescriptor {
+    union {
+        u32 raw = 0;
+        BitField<0, 2, Tegra::Shader::TextureType> texture_type;
+        BitField<2, 3, Tegra::Texture::ComponentType> r_type;
+        BitField<5, 1, u32> is_array;
+        BitField<6, 1, u32> is_buffer;
+        BitField<7, 1, u32> is_shadow;
+        BitField<8, 3, Tegra::Texture::ComponentType> g_type;
+        BitField<11, 3, Tegra::Texture::ComponentType> b_type;
+        BitField<14, 3, Tegra::Texture::ComponentType> a_type;
+        BitField<17, 7, Tegra::Texture::TextureFormat> format;
+    };
+
+    /// Two descriptors are equal when their packed representation is identical.
+    bool operator==(const SamplerDescriptor& rhs) const noexcept {
+        return raw == rhs.raw;
+    }
+
+    bool operator!=(const SamplerDescriptor& rhs) const noexcept {
+        return raw != rhs.raw;
+    }
+
+    /// Builds a descriptor from a TIC entry. Format and component types are copied as-is and
+    /// the guest texture type is mapped to a shader texture type plus the array/buffer flags.
+    /// Unknown texture types are treated as Texture2D; is_shadow is never set here.
+    static SamplerDescriptor FromTIC(const Tegra::Texture::TICEntry& tic) {
+        using GuestType = Tegra::Texture::TextureType;
+        using ShaderType = Tegra::Shader::TextureType;
+
+        SamplerDescriptor descriptor;
+        descriptor.format.Assign(tic.format.Value());
+        descriptor.r_type.Assign(tic.r_type.Value());
+        descriptor.g_type.Assign(tic.g_type.Value());
+        descriptor.b_type.Assign(tic.b_type.Value());
+        descriptor.a_type.Assign(tic.a_type.Value());
+
+        switch (tic.texture_type.Value()) {
+        case GuestType::Texture1D:
+            descriptor.texture_type.Assign(ShaderType::Texture1D);
+            break;
+        case GuestType::Texture2D:
+            descriptor.texture_type.Assign(ShaderType::Texture2D);
+            break;
+        case GuestType::Texture3D:
+            descriptor.texture_type.Assign(ShaderType::Texture3D);
+            break;
+        case GuestType::TextureCubemap:
+            descriptor.texture_type.Assign(ShaderType::TextureCube);
+            break;
+        case GuestType::Texture1DArray:
+            descriptor.texture_type.Assign(ShaderType::Texture1D);
+            descriptor.is_array.Assign(1);
+            break;
+        case GuestType::Texture2DArray:
+            descriptor.texture_type.Assign(ShaderType::Texture2D);
+            descriptor.is_array.Assign(1);
+            break;
+        case GuestType::Texture1DBuffer:
+            descriptor.texture_type.Assign(ShaderType::Texture1D);
+            descriptor.is_buffer.Assign(1);
+            break;
+        case GuestType::Texture2DNoMipmap:
+            descriptor.texture_type.Assign(ShaderType::Texture2D);
+            break;
+        case GuestType::TextureCubeArray:
+            descriptor.texture_type.Assign(ShaderType::TextureCube);
+            descriptor.is_array.Assign(1);
+            break;
+        default:
+            descriptor.texture_type.Assign(ShaderType::Texture2D);
+            break;
+        }
+        return descriptor;
+    }
+};
+static_assert(std::is_trivially_copyable_v<SamplerDescriptor>);
+
+/// Abstract interface for reading const buffer values and sampler descriptors, and for
+/// accessing the guest driver profile.
+/// NOTE(review): the exact meaning of stage/const_buffer/offset/handle is defined by the
+/// implementers, which are not visible in this file - confirm against them.
+class ConstBufferEngineInterface {
+public:
+    virtual ~ConstBufferEngineInterface() = default;
+
+    /// Returns a 32-bit value read from the given const buffer of a shader stage.
+    virtual u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const = 0;
+
+    /// Returns the descriptor of a sampler addressed by an offset for a shader stage.
+    virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0;
+
+    /// Returns the descriptor of a sampler addressed through a const buffer and offset.
+    virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                                    u64 offset) const = 0;
+
+    /// Returns the descriptor of the sampler identified by handle.
+    virtual SamplerDescriptor AccessSampler(u32 handle) const = 0;
+
+    virtual u32 GetBoundBuffer() const = 0;
+
+    /// Mutable and read-only access to the guest driver profile.
+    virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0;
+    virtual const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const = 0;
+};
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/const_buffer_info.h
+++ b/src/video_core/engines/const_buffer_info.h
@@ -0,0 +1,17 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+
+/// Plain description of a const buffer binding.
+struct ConstBufferInfo {
+    GPUVAddr address; ///< GPU virtual address of the buffer
+    u32 size;         ///< Buffer size (presumably in bytes - TODO confirm against users)
+    bool enabled;     ///< Whether the binding is enabled
+};
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/engine_interface.h
+++ b/src/video_core/engines/engine_interface.h
@@ -0,0 +1,22 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <type_traits>
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+
+/// Interface implemented by engines that receive method calls from the command stream.
+/// The DmaPusher keeps engines as EngineInterface pointers and calls them through it.
+class EngineInterface {
+public:
+    /// Virtual so that destroying an engine through an EngineInterface pointer is well defined.
+    virtual ~EngineInterface() = default;
+
+    /// Write the value to the register identified by method.
+    virtual void CallMethod(u32 method, u32 method_argument, bool is_last_call) = 0;
+
+    /// Write multiple values to the register identified by method.
+    virtual void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                                 u32 methods_pending) = 0;
+};
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -0,0 +1,52 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+
+#include "common/assert.h"
+#include "video_core/engines/engine_upload.h"
+#include "video_core/memory_manager.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra::Engines::Upload {
+
+// Only stores references; the memory manager and register block must outlive the State.
+State::State(MemoryManager& memory_manager_, Registers& regs_)
+    : regs{regs_}, memory_manager{memory_manager_} {}
+
+State::~State() = default;
+
+/// Starts a new upload: records the layout mode, resets the write position and sizes the
+/// staging buffer to line_length_in * line_count bytes.
+void State::ProcessExec(const bool is_linear_) {
+    is_linear = is_linear_;
+    write_offset = 0;
+
+    const u32 total_size = regs.line_length_in * regs.line_count;
+    copy_size = total_size;
+    inner_buffer.resize(total_size);
+}
+
+/// Appends one 32-bit data word to the staging buffer and, on the last call, writes the
+/// staged data to the destination described by regs.dest.
+/// @param data         Data word to append; only the bytes that still fit are copied.
+/// @param is_last_call When true, the staged data is committed to GPU memory.
+void State::ProcessData(const u32 data, const bool is_last_call) {
+    // Clamp so a trailing partial word, or words beyond copy_size, never overrun the buffer.
+    const u32 sub_copy_size = std::min(4U, copy_size - write_offset);
+    if (sub_copy_size != 0) {
+        // Address through data(): once the buffer is full (or when it is empty) write_offset
+        // equals inner_buffer.size(), and inner_buffer[write_offset] would be out of range.
+        std::memcpy(inner_buffer.data() + write_offset, &data, sub_copy_size);
+        write_offset += sub_copy_size;
+    }
+    if (!is_last_call) {
+        return;
+    }
+    const GPUVAddr address{regs.dest.Address()};
+    if (is_linear) {
+        memory_manager.WriteBlock(address, inner_buffer.data(), copy_size);
+    } else {
+        UNIMPLEMENTED_IF(regs.dest.z != 0);
+        UNIMPLEMENTED_IF(regs.dest.depth != 1);
+        UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 0);
+        UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 0);
+        // Read-modify-write: fetch the destination, swizzle the staged data into it, then
+        // write the whole destination back.
+        const std::size_t dst_size = Tegra::Texture::CalculateSize(
+            true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 0);
+        tmp_buffer.resize(dst_size);
+        memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
+        Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y,
+                                      regs.dest.BlockHeight(), copy_size, inner_buffer.data(),
+                                      tmp_buffer.data());
+        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+    }
+}
+
+} // namespace Tegra::Engines::Upload
--- a/src/video_core/engines/engine_upload.h
+++ b/src/video_core/engines/engine_upload.h
@@ -0,0 +1,73 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace Tegra::Engines::Upload {
+
+/// Register block describing an upload: the amount of incoming data and its destination.
+struct Registers {
+    // The upload size is line_length_in * line_count (see State::ProcessExec)
+    u32 line_length_in;
+    u32 line_count;
+
+    struct {
+        // Upper and lower 32 bits of the destination address, combined by Address()
+        u32 address_high;
+        u32 address_low;
+        u32 pitch;
+        // Block dimensions packed in one word, four bits each
+        union {
+            BitField<0, 4, u32> block_width;
+            BitField<4, 4, u32> block_height;
+            BitField<8, 4, u32> block_depth;
+        };
+        u32 width;
+        u32 height;
+        u32 depth;
+        u32 z;
+        u32 x;
+        u32 y;
+
+        /// Returns the 64-bit destination address built from address_high and address_low.
+        GPUVAddr Address() const {
+            return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low);
+        }
+
+        /// Raw value of the block_width field.
+        u32 BlockWidth() const {
+            return block_width.Value();
+        }
+
+        /// Raw value of the block_height field.
+        u32 BlockHeight() const {
+            return block_height.Value();
+        }
+
+        /// Raw value of the block_depth field.
+        u32 BlockDepth() const {
+            return block_depth.Value();
+        }
+    } dest;
+};
+
+/// Accumulates the data words of an upload and commits them to GPU memory.
+class State {
+public:
+    explicit State(MemoryManager& memory_manager_, Registers& regs_);
+    ~State();
+
+    /// Begins an upload; is_linear_ selects a plain write instead of a swizzled one.
+    void ProcessExec(bool is_linear_);
+
+    /// Appends one data word; when is_last_call is set the upload is written out.
+    void ProcessData(u32 data, bool is_last_call);
+
+private:
+    u32 write_offset = 0;         ///< Number of bytes staged so far
+    u32 copy_size = 0;            ///< Total size of the upload in bytes
+    std::vector<u8> inner_buffer; ///< Staging buffer for incoming data
+    std::vector<u8> tmp_buffer;   ///< Scratch copy of the destination for swizzled uploads
+    bool is_linear = false;
+    Registers& regs;
+    MemoryManager& memory_manager;
+};
+
+} // namespace Tegra::Engines::Upload
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -0,0 +1,69 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "video_core/engines/fermi_2d.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace Tegra::Engines {
+
+Fermi2D::Fermi2D() {
+    // Both surfaces start with a depth of one; Nvidia's OpenGL driver seems to assume this.
+    regs.dst.depth = 1;
+    regs.src.depth = 1;
+}
+
+Fermi2D::~Fermi2D() = default;
+
+// Stores a pointer to the rasterizer; Blit() dereferences it, so one must be bound before a
+// blit is triggered.
+void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
+    rasterizer = &rasterizer_;
+}
+
+/// Stores method_argument in the register selected by method and starts a blit when the
+/// register right after the first word of pixels_from_memory.src_y0 is written.
+void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid Fermi2D register, increase the size of the Regs structure");
+
+    const auto blit_trigger = FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1;
+
+    regs.reg_array[method] = method_argument;
+    if (method != blit_trigger) {
+        return;
+    }
+    Blit();
+}
+
+/// Writes `amount` values to the same method, one CallMethod per value. A write is flagged
+/// as the last call when at most one method is still pending at that point.
+void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) {
+    for (u32 index = 0; index != amount; ++index) {
+        const bool is_last_call = methods_pending - index <= 1;
+        CallMethod(method, base_start[index], is_last_call);
+    }
+}
+
+/// Builds a blit configuration from the pixels_from_memory registers and forwards the copy
+/// from regs.src to regs.dst to the rasterizer.
+void Fermi2D::Blit() {
+    LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
+              regs.src.Address(), regs.dst.Address());
+
+    // Configurations that are not handled are reported but do not stop the blit here.
+    UNIMPLEMENTED_IF_MSG(regs.operation != Operation::SrcCopy, "Operation is not copy");
+    UNIMPLEMENTED_IF_MSG(regs.src.layer != 0, "Source layer is not zero");
+    UNIMPLEMENTED_IF_MSG(regs.dst.layer != 0, "Destination layer is not zero");
+    UNIMPLEMENTED_IF_MSG(regs.src.depth != 1, "Source depth is not one");
+    UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled");
+
+    const auto& args = regs.pixels_from_memory;
+    // Destination rectangle: origin plus size. Source rectangle: the upper 32 bits of the
+    // source registers are kept (presumably the integer part of a fixed-point value - TODO
+    // confirm), with the far corner derived from du_dx/dv_dy scaled by the destination size.
+    const Config config{
+        .operation = regs.operation,
+        .filter = args.sample_mode.filter,
+        .dst_x0 = args.dst_x0,
+        .dst_y0 = args.dst_y0,
+        .dst_x1 = args.dst_x0 + args.dst_width,
+        .dst_y1 = args.dst_y0 + args.dst_height,
+        .src_x0 = static_cast<s32>(args.src_x0 >> 32),
+        .src_y0 = static_cast<s32>(args.src_y0 >> 32),
+        .src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32),
+        .src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32),
+    };
+    // There is no fallback path: a copy the rasterizer declines is reported as unimplemented.
+    if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, config)) {
+        UNIMPLEMENTED();
+    }
+}
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -0,0 +1,352 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "common/math_util.h"
+#include "video_core/engines/engine_interface.h"
+#include "video_core/gpu.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace VideoCore {
+class RasterizerInterface;
+}
+
+namespace Tegra::Engines {
+
+/**
+ * This Engine is known as G80_2D. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h
+ */
+
+// Index, in 32-bit words, of a field inside Fermi2D::Regs.
+#define FERMI2D_REG_INDEX(field_name)                                                              \
+    (offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32))
+
+class Fermi2D final : public EngineInterface {
+public:
+    explicit Fermi2D();
+    ~Fermi2D();
+
+    /// Binds a rasterizer to this engine.
+    void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
+
+    /// Write the value to the register identified by method.
+    void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
+
+    /// Write multiple values to the register identified by method.
+    void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                         u32 methods_pending) override;
+
+    enum class Origin : u32 {
+        Center = 0,
+        Corner = 1,
+    };
+
+    enum class Filter : u32 {
+        Point = 0,
+        Bilinear = 1,
+    };
+
+    enum class Operation : u32 {
+        SrcCopyAnd = 0,
+        ROPAnd = 1,
+        Blend = 2,
+        SrcCopy = 3,
+        ROP = 4,
+        SrcCopyPremult = 5,
+        BlendPremult = 6,
+    };
+
+    enum class MemoryLayout : u32 {
+        BlockLinear = 0,
+        Pitch = 1,
+    };
+
+    enum class CpuIndexWrap : u32 {
+        Wrap = 0,
+        NoWrap = 1,
+    };
+
+    struct Surface {
+        RenderTargetFormat format;
+        MemoryLayout linear;
+        union {
+            BitField<0, 4, u32> block_width;
+            BitField<4, 4, u32> block_height;
+            BitField<8, 4, u32> block_depth;
+        };
+        u32 depth;
+        u32 layer;
+        u32 pitch;
+        u32 width;
+        u32 height;
+        u32 addr_upper;
+        u32 addr_lower;
+
+        [[nodiscard]] constexpr GPUVAddr Address() const noexcept {
+            return (static_cast<GPUVAddr>(addr_upper) << 32) | static_cast<GPUVAddr>(addr_lower);
+        }
+    };
+    static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
+
+    enum class SectorPromotion : u32 {
+        NoPromotion = 0,
+        PromoteTo2V = 1,
+        PromoteTo2H = 2,
+        PromoteTo4 = 3,
+    };
+
+    enum class NumTpcs : u32 {
+        All = 0,
+        One = 1,
+    };
+
+    enum class RenderEnableMode : u32 {
+        False = 0,
+        True = 1,
+        Conditional = 2,
+        RenderIfEqual = 3,
+        RenderIfNotEqual = 4,
+    };
+
+    enum class ColorKeyFormat : u32 {
+        A16R56G6B5 = 0,
+        A1R5G55B5 = 1,
+        A8R8G8B8 = 2,
+        A2R10G10B10 = 3,
+        Y8 = 4,
+        Y16 = 5,
+        Y32 = 6,
+    };
+
+    union Beta4 {
+        BitField<0, 8, u32> b;
+        BitField<8, 8, u32> g;
+        BitField<16, 8, u32> r;
+        BitField<24, 8, u32> a;
+    };
+
+    struct Point {
+        u32 x;
+        u32 y;
+    };
+
+    enum class PatternSelect : u32 {
+        MonoChrome8x8 = 0,
+        MonoChrome64x1 = 1,
+        MonoChrome1x64 = 2,
+        Color = 3,
+    };
+
+    enum class NotifyType : u32 {
+        WriteOnly = 0,
+        WriteThenAwaken = 1,
+    };
+
+    enum class MonochromePatternColorFormat : u32 {
+        A8X8R8G6B5 = 0,
+        A1R5G5B5 = 1,
+        A8R8G8B8 = 2,
+        A8Y8 = 3,
+        A8X8Y16 = 4,
+        Y32 = 5,
+    };
+
+    enum class MonochromePatternFormat : u32 {
+        CGA6_M1 = 0,
+        LE_M1 = 1,
+    };
+
+    union Regs { // Named register fields overlaid on reg_array.
+        static constexpr std::size_t NUM_REGS = 0x258; // Number of u32 entries in reg_array.
+        struct {
+            u32 object;
+            INSERT_UNION_PADDING_WORDS(0x3F); // Padding keeps fields at their asserted offsets.
+            u32 no_operation;
+            NotifyType notify;
+            INSERT_UNION_PADDING_WORDS(0x2);
+            u32 wait_for_idle;
+            INSERT_UNION_PADDING_WORDS(0xB);
+            u32 pm_trigger;
+            INSERT_UNION_PADDING_WORDS(0xF);
+            u32 context_dma_notify;
+            u32 dst_context_dma;
+            u32 src_context_dma;
+            u32 semaphore_context_dma;
+            INSERT_UNION_PADDING_WORDS(0x1C);
+            Surface dst;
+            CpuIndexWrap pixels_from_cpu_index_wrap;
+            u32 kind2d_check_enable;
+            Surface src;
+            SectorPromotion pixels_from_memory_sector_promotion;
+            INSERT_UNION_PADDING_WORDS(0x1);
+            NumTpcs num_tpcs;
+            u32 render_enable_addr_upper;
+            u32 render_enable_addr_lower;
+            RenderEnableMode render_enable_mode;
+            INSERT_UNION_PADDING_WORDS(0x4);
+            u32 clip_x0;
+            u32 clip_y0;
+            u32 clip_width;
+            u32 clip_height;
+            BitField<0, 1, u32> clip_enable; // Bare BitField members each take a full u32 word.
+            BitField<0, 3, ColorKeyFormat> color_key_format;
+            u32 color_key;
+            BitField<0, 1, u32> color_key_enable;
+            BitField<0, 8, u32> rop;
+            u32 beta1;
+            Beta4 beta4;
+            Operation operation;
+            union {
+                BitField<0, 6, u32> x;
+                BitField<8, 6, u32> y;
+            } pattern_offset;
+            BitField<0, 2, PatternSelect> pattern_select;
+            INSERT_UNION_PADDING_WORDS(0xC);
+            struct {
+                BitField<0, 3, MonochromePatternColorFormat> color_format;
+                BitField<0, 1, MonochromePatternFormat> format;
+                u32 color0;
+                u32 color1;
+                u32 pattern0;
+                u32 pattern1;
+            } monochrome_pattern;
+            struct {
+                std::array<u32, 0x40> X8R8G8B8;
+                std::array<u32, 0x20> R5G6B5;
+                std::array<u32, 0x20> X1R5G5B5;
+                std::array<u32, 0x10> Y8;
+            } color_pattern;
+            INSERT_UNION_PADDING_WORDS(0x10);
+            struct {
+                u32 prim_mode;
+                u32 prim_color_format;
+                u32 prim_color;
+                u32 line_tie_break_bits;
+                INSERT_UNION_PADDING_WORDS(0x14);
+                u32 prim_point_xy;
+                INSERT_UNION_PADDING_WORDS(0x7);
+                std::array<Point, 0x40> prim_point;
+            } render_solid;
+            struct {
+                u32 data_type;
+                u32 color_format;
+                u32 index_format;
+                u32 mono_format;
+                u32 wrap;
+                u32 color0;
+                u32 color1;
+                u32 mono_opacity;
+                INSERT_UNION_PADDING_WORDS(0x6);
+                u32 src_width;
+                u32 src_height;
+                u32 dx_du_frac;
+                u32 dx_du_int;
+                u32 dx_dv_frac; // NOTE(review): possibly meant dy_dv_frac (pairs with dy_dv_int).
+                u32 dy_dv_int;
+                u32 dst_x0_frac;
+                u32 dst_x0_int;
+                u32 dst_y0_frac;
+                u32 dst_y0_int;
+                u32 data;
+            } pixels_from_cpu;
+            INSERT_UNION_PADDING_WORDS(0x3);
+            u32 big_endian_control;
+            INSERT_UNION_PADDING_WORDS(0x3);
+            struct {
+                BitField<0, 3, u32> block_shape;
+                BitField<0, 5, u32> corral_size;
+                BitField<0, 1, u32> safe_overlap;
+                union {
+                    BitField<0, 1, Origin> origin;
+                    BitField<4, 1, Filter> filter;
+                } sample_mode;
+                INSERT_UNION_PADDING_WORDS(0x8);
+                s32 dst_x0;
+                s32 dst_y0;
+                s32 dst_width;
+                s32 dst_height;
+                s64 du_dx;
+                s64 dv_dy;
+                s64 src_x0;
+                s64 src_y0;
+            } pixels_from_memory;
+        };
+        std::array<u32, NUM_REGS> reg_array; // Raw view, one u32 per register.
+    } regs{};
+
+    struct Config { // Plain aggregate: an Operation, a Filter and eight s32 dst/src coordinates.
+        Operation operation;
+        Filter filter;
+        s32 dst_x0;
+        s32 dst_y0;
+        s32 dst_x1;
+        s32 dst_y1;
+        s32 src_x0;
+        s32 src_y0;
+        s32 src_x1;
+        s32 src_y1;
+    };
+
+private:
+    VideoCore::RasterizerInterface* rasterizer;
+
+    /// Performs the copy from the source surface to the destination surface as configured in the
+    /// registers.
+    void Blit();
+};
+
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(Fermi2D::Regs, field_name) == position,                                 \
+                  "Field " #field_name " has invalid position") // position is a byte offset.
+
+ASSERT_REG_POSITION(object, 0x0);
+ASSERT_REG_POSITION(no_operation, 0x100);
+ASSERT_REG_POSITION(notify, 0x104);
+ASSERT_REG_POSITION(wait_for_idle, 0x110);
+ASSERT_REG_POSITION(pm_trigger, 0x140);
+ASSERT_REG_POSITION(context_dma_notify, 0x180);
+ASSERT_REG_POSITION(dst_context_dma, 0x184);
+ASSERT_REG_POSITION(src_context_dma, 0x188);
+ASSERT_REG_POSITION(semaphore_context_dma, 0x18C);
+ASSERT_REG_POSITION(dst, 0x200);
+ASSERT_REG_POSITION(pixels_from_cpu_index_wrap, 0x228);
+ASSERT_REG_POSITION(kind2d_check_enable, 0x22C);
+ASSERT_REG_POSITION(src, 0x230);
+ASSERT_REG_POSITION(pixels_from_memory_sector_promotion, 0x258);
+ASSERT_REG_POSITION(num_tpcs, 0x260);
+ASSERT_REG_POSITION(render_enable_addr_upper, 0x264);
+ASSERT_REG_POSITION(render_enable_addr_lower, 0x268);
+ASSERT_REG_POSITION(clip_x0, 0x280);
+ASSERT_REG_POSITION(clip_y0, 0x284);
+ASSERT_REG_POSITION(clip_width, 0x288);
+ASSERT_REG_POSITION(clip_height, 0x28c);
+ASSERT_REG_POSITION(clip_enable, 0x290);
+ASSERT_REG_POSITION(color_key_format, 0x294);
+ASSERT_REG_POSITION(color_key, 0x298);
+ASSERT_REG_POSITION(rop, 0x2A0);
+ASSERT_REG_POSITION(beta1, 0x2A4);
+ASSERT_REG_POSITION(beta4, 0x2A8);
+ASSERT_REG_POSITION(operation, 0x2AC);
+ASSERT_REG_POSITION(pattern_offset, 0x2B0);
+ASSERT_REG_POSITION(pattern_select, 0x2B4);
+ASSERT_REG_POSITION(monochrome_pattern, 0x2E8);
+ASSERT_REG_POSITION(color_pattern, 0x300);
+ASSERT_REG_POSITION(render_solid, 0x580);
+ASSERT_REG_POSITION(pixels_from_cpu, 0x800);
+ASSERT_REG_POSITION(big_endian_control, 0x870);
+ASSERT_REG_POSITION(pixels_from_memory, 0x880);
+
+#undef ASSERT_REG_POSITION // The helper is only needed for the layout checks above.
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -0,0 +1,127 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <bitset>
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "video_core/engines/kepler_compute.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra::Engines {
+
+KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manager_) // Keeps both refs.
+    : system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} {}
+
+KeplerCompute::~KeplerCompute() = default;
+
+void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
+    rasterizer = &rasterizer_; // Kept as a raw pointer; later calls dereference it unchecked.
+}
+
+void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid KeplerCompute register, increase the size of the Regs structure");
+
+    // Mirror the write into reg_array before dispatching on the register index.
+    regs.reg_array[method] = method_argument;
+
+    if (method == KEPLER_COMPUTE_REG_INDEX(exec_upload)) {
+        // Bit 0 of the value just written is the linear flag handed to ProcessExec.
+        const bool is_linear = regs.exec_upload.linear != 0;
+        upload_state.ProcessExec(is_linear);
+        return;
+    }
+    if (method == KEPLER_COMPUTE_REG_INDEX(data_upload)) {
+        upload_state.ProcessData(method_argument, is_last_call);
+        if (is_last_call) {
+            // Maxwell3D is only notified once, on the call flagged as the last one.
+            system.GPU().Maxwell3D().OnMemoryWrite();
+        }
+        return;
+    }
+    if (method == KEPLER_COMPUTE_REG_INDEX(launch)) {
+        ProcessLaunch();
+    }
+}
+
+void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                                    u32 methods_pending) {
+    for (u32 index = 0; index < amount; ++index) { // is_last_call once <= 1 method is pending.
+        CallMethod(method, base_start[index], methods_pending - index <= 1);
+    }
+}
+
+u32 KeplerCompute::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const {
+    ASSERT(stage == ShaderType::Compute);
+    const auto& buffer = launch_description.const_buffer_config[const_buffer];
+    // Read through MemoryManager::Read, as AccessBindlessSampler does, rather than copying from
+    // the raw result of GetPointer, which was dereferenced without a null check.
+    return memory_manager.Read<u32>(buffer.Address() + offset);
+}
+
+SamplerDescriptor KeplerCompute::AccessBoundSampler(ShaderType stage, u64 offset) const {
+    return AccessBindlessSampler(stage, regs.tex_cb_index, offset * sizeof(Texture::TextureHandle));
+} // offset counts TextureHandle-sized entries in the const buffer indexed by regs.tex_cb_index.
+
+SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                                       u64 offset) const {
+    ASSERT(stage == ShaderType::Compute);
+    // Fetch the 32-bit handle stored `offset` bytes past the selected const buffer's address.
+    const GPUVAddr buffer_base = launch_description.const_buffer_config[const_buffer].Address();
+    return AccessSampler(memory_manager.Read<u32>(buffer_base + offset));
+}
+
+SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
+    const Texture::TextureHandle unpacked{handle};
+    const auto tic_entry = GetTICEntry(unpacked.tic_id);
+    const auto tsc_entry = GetTSCEntry(unpacked.tsc_id);
+    // Built from the TIC entry; only is_shadow is taken from the TSC entry's depth-compare flag.
+    SamplerDescriptor descriptor = SamplerDescriptor::FromTIC(tic_entry);
+    descriptor.is_shadow.Assign(tsc_entry.depth_compare_enabled.Value());
+    return descriptor;
+}
+
+VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() {
+    return rasterizer->AccessGuestDriverProfile(); // Forwards to the bound rasterizer, unchecked.
+}
+
+const VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() const {
+    return rasterizer->AccessGuestDriverProfile(); // Forwards to the bound rasterizer, unchecked.
+}
+
+void KeplerCompute::ProcessLaunch() {
+    // Refresh launch_description from the GPU address programmed into launch_desc_loc.
+    constexpr std::size_t launch_desc_size = LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32);
+    memory_manager.ReadBlockUnsafe(regs.launch_desc_loc.Address(), &launch_description,
+                                   launch_desc_size);
+    // The address dispatched is the code base plus the description's program_start.
+    const GPUVAddr entry_address = regs.code_loc.Address() + launch_description.program_start;
+    LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", entry_address);
+    rasterizer->DispatchCompute(entry_address);
+}
+
+Texture::TICEntry KeplerCompute::GetTICEntry(u32 tic_index) const {
+    // Entries are read at regs.tic's address plus tic_index whole TICEntry sizes.
+    const std::size_t byte_offset = tic_index * sizeof(Texture::TICEntry);
+    const GPUVAddr entry_address{regs.tic.Address() + byte_offset};
+    Texture::TICEntry entry;
+    memory_manager.ReadBlockUnsafe(entry_address, &entry, sizeof(entry));
+    return entry;
+}
+
+Texture::TSCEntry KeplerCompute::GetTSCEntry(u32 tsc_index) const {
+    // Entries are read at regs.tsc's address plus tsc_index whole TSCEntry sizes.
+    const std::size_t byte_offset = tsc_index * sizeof(Texture::TSCEntry);
+    Texture::TSCEntry entry;
+    memory_manager.ReadBlockUnsafe(regs.tsc.Address() + byte_offset, &entry, sizeof(entry));
+    return entry;
+}
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -0,0 +1,269 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/engines/const_buffer_engine_interface.h"
+#include "video_core/engines/engine_interface.h"
+#include "video_core/engines/engine_upload.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/gpu.h"
+#include "video_core/textures/texture.h"
+
+namespace Core {
+class System;
+}
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace VideoCore {
+class RasterizerInterface;
+}
+
+namespace Tegra::Engines {
+
+/**
+ * This Engine is known as GK104_Compute. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_compute.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h
+ */
+
+#define KEPLER_COMPUTE_REG_INDEX(field_name)                                                       \
+    (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32)) // u32 word index.
+
+class KeplerCompute final : public ConstBufferEngineInterface, public EngineInterface {
+public:
+    explicit KeplerCompute(Core::System& system, MemoryManager& memory_manager);
+    ~KeplerCompute();
+
+    /// Binds a rasterizer to this engine.
+    void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
+
+    static constexpr std::size_t NumConstBuffers = 8; // Length of const_buffer_config below.
+
+    struct Regs {
+        static constexpr std::size_t NUM_REGS = 0xCF8; // Number of u32 entries in reg_array.
+
+        union {
+            struct {
+                INSERT_UNION_PADDING_WORDS(0x60);
+
+                Upload::Registers upload;
+
+                struct {
+                    union {
+                        BitField<0, 1, u32> linear;
+                    };
+                } exec_upload;
+
+                u32 data_upload;
+
+                INSERT_UNION_PADDING_WORDS(0x3F);
+
+                struct {
+                    u32 address;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address) << 8)); // x256.
+                    }
+                } launch_desc_loc;
+
+                INSERT_UNION_PADDING_WORDS(0x1);
+
+                u32 launch;
+
+                INSERT_UNION_PADDING_WORDS(0x4A7);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    u32 limit;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } tsc;
+
+                INSERT_UNION_PADDING_WORDS(0x3);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    u32 limit;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } tic;
+
+                INSERT_UNION_PADDING_WORDS(0x22);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } code_loc;
+
+                INSERT_UNION_PADDING_WORDS(0x3FE);
+
+                u32 tex_cb_index;
+
+                INSERT_UNION_PADDING_WORDS(0x374); // NOTE(review): seems to end at word 0xCF7, not 0xCF8.
+            };
+            std::array<u32, NUM_REGS> reg_array; // Raw view, one u32 per register.
+        };
+    } regs{};
+
+    struct LaunchParams {
+        static constexpr std::size_t NUM_LAUNCH_PARAMETERS = 0x40; // Struct size in u32 words.
+
+        INSERT_PADDING_WORDS(0x8);
+
+        u32 program_start;
+
+        INSERT_PADDING_WORDS(0x2);
+
+        BitField<30, 1, u32> linked_tsc; // Bare BitField members each take a full u32 word.
+
+        BitField<0, 31, u32> grid_dim_x;
+        union {
+            BitField<0, 16, u32> grid_dim_y;
+            BitField<16, 16, u32> grid_dim_z;
+        };
+
+        INSERT_PADDING_WORDS(0x3);
+
+        BitField<0, 18, u32> shared_alloc;
+
+        BitField<16, 16, u32> block_dim_x;
+        union {
+            BitField<0, 16, u32> block_dim_y;
+            BitField<16, 16, u32> block_dim_z;
+        };
+
+        union {
+            BitField<0, 8, u32> const_buffer_enable_mask;
+            BitField<29, 2, u32> cache_layout;
+        };
+
+        INSERT_PADDING_WORDS(0x8);
+
+        struct ConstBufferConfig {
+            u32 address_low;
+            union {
+                BitField<0, 8, u32> address_high; // Only these 8 bits feed Address().
+                BitField<15, 17, u32> size;
+            };
+            GPUVAddr Address() const {
+                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) |
+                                             address_low);
+            }
+        };
+        std::array<ConstBufferConfig, NumConstBuffers> const_buffer_config;
+
+        union {
+            BitField<0, 20, u32> local_pos_alloc;
+            BitField<27, 5, u32> barrier_alloc;
+        };
+
+        union {
+            BitField<0, 20, u32> local_neg_alloc;
+            BitField<24, 5, u32> gpr_alloc;
+        };
+
+        union {
+            BitField<0, 20, u32> local_crs_alloc;
+            BitField<24, 5, u32> sass_version;
+        };
+
+        INSERT_PADDING_WORDS(0x10);
+    } launch_description{}; // Overwritten from guest memory by ProcessLaunch.
+
+    struct { // NOTE(review): not referenced by the kepler_compute.cpp code - possibly unused.
+        u32 write_offset = 0;
+        u32 copy_size = 0;
+        std::vector<u8> inner_buffer;
+    } state{};
+
+    static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32),
+                  "KeplerCompute Regs has wrong size");
+
+    static_assert(sizeof(LaunchParams) == LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32),
+                  "KeplerCompute LaunchParams has wrong size");
+
+    /// Write the value to the register identified by method.
+    void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
+
+    /// Write multiple values to the register identified by method.
+    void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                         u32 methods_pending) override;
+
+    u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override;
+
+    SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override;
+
+    SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                            u64 offset) const override;
+
+    SamplerDescriptor AccessSampler(u32 handle) const override;
+
+    u32 GetBoundBuffer() const override {
+        return regs.tex_cb_index;
+    }
+
+    VideoCore::GuestDriverProfile& AccessGuestDriverProfile() override;
+
+    const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override;
+
+private:
+    void ProcessLaunch(); // Invoked by CallMethod when the launch register is written.
+
+    /// Retrieves information about a specific TIC entry from the TIC buffer.
+    Texture::TICEntry GetTICEntry(u32 tic_index) const;
+
+    /// Retrieves information about a specific TSC entry from the TSC buffer.
+    Texture::TSCEntry GetTSCEntry(u32 tsc_index) const;
+
+    Core::System& system;
+    MemoryManager& memory_manager;
+    VideoCore::RasterizerInterface* rasterizer = nullptr; // Set by BindRasterizer; null until then.
+    Upload::State upload_state;
+};
+
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(KeplerCompute::Regs, field_name) == position * 4,                       \
+                  "Field " #field_name " has invalid position") // position is a u32 word index.
+
+#define ASSERT_LAUNCH_PARAM_POSITION(field_name, position)                                         \
+    static_assert(offsetof(KeplerCompute::LaunchParams, field_name) == position * 4,               \
+                  "Field " #field_name " has invalid position") // position is a u32 word index.
+
+ASSERT_REG_POSITION(upload, 0x60);
+ASSERT_REG_POSITION(exec_upload, 0x6C);
+ASSERT_REG_POSITION(data_upload, 0x6D);
+ASSERT_REG_POSITION(launch, 0xAF);
+ASSERT_REG_POSITION(tsc, 0x557);
+ASSERT_REG_POSITION(tic, 0x55D);
+ASSERT_REG_POSITION(code_loc, 0x582);
+ASSERT_REG_POSITION(tex_cb_index, 0x982);
+ASSERT_LAUNCH_PARAM_POSITION(program_start, 0x8);
+ASSERT_LAUNCH_PARAM_POSITION(grid_dim_x, 0xC);
+ASSERT_LAUNCH_PARAM_POSITION(shared_alloc, 0x11);
+ASSERT_LAUNCH_PARAM_POSITION(block_dim_x, 0x12);
+ASSERT_LAUNCH_PARAM_POSITION(const_buffer_enable_mask, 0x14);
+ASSERT_LAUNCH_PARAM_POSITION(const_buffer_config, 0x1D);
+#undef ASSERT_REG_POSITION
+#undef ASSERT_LAUNCH_PARAM_POSITION // Previously leaked into every file including this header.
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -0,0 +1,50 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "video_core/engines/kepler_memory.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra::Engines {
+
+KeplerMemory::KeplerMemory(Core::System& system_, MemoryManager& memory_manager) // Not stored here.
+    : system{system_}, upload_state{memory_manager, regs.upload} {}
+
+KeplerMemory::~KeplerMemory() = default;
+
+void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid KeplerMemory register, increase the size of the Regs structure");
+
+    // Mirror the write into reg_array before dispatching on the register index.
+    regs.reg_array[method] = method_argument;
+
+    if (method == KEPLERMEMORY_REG_INDEX(exec)) {
+        const bool is_linear = regs.exec.linear != 0;
+        upload_state.ProcessExec(is_linear);
+        return;
+    }
+    if (method != KEPLERMEMORY_REG_INDEX(data)) {
+        return; // Every other register is only stored.
+    }
+    upload_state.ProcessData(method_argument, is_last_call);
+    if (is_last_call) {
+        system.GPU().Maxwell3D().OnMemoryWrite();
+    }
+}
+
+void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                                   u32 methods_pending) {
+    for (u32 index = 0; index < amount; ++index) { // is_last_call once <= 1 method is pending.
+        CallMethod(method, base_start[index], methods_pending - index <= 1);
+    }
+}
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -0,0 +1,85 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/engines/engine_interface.h"
+#include "video_core/engines/engine_upload.h"
+#include "video_core/gpu.h"
+
+namespace Core {
+class System;
+}
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace Tegra::Engines {
+
+/**
+ * This Engine is known as P2MF. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_p2mf.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h
+ */
+
+#define KEPLERMEMORY_REG_INDEX(field_name)                                                         \
+    (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32)) // u32 word index.
+
+class KeplerMemory final : public EngineInterface {
+public:
+    explicit KeplerMemory(Core::System& system_, MemoryManager& memory_manager);
+    ~KeplerMemory();
+
+    /// Write the value to the register identified by method.
+    void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
+
+    /// Write multiple values to the register identified by method.
+    void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                         u32 methods_pending) override;
+
+    struct Regs {
+        static constexpr size_t NUM_REGS = 0x7F; // Number of u32 entries in reg_array.
+
+        union {
+            struct {
+                INSERT_UNION_PADDING_WORDS(0x60);
+
+                Upload::Registers upload;
+
+                struct {
+                    union {
+                        BitField<0, 1, u32> linear; // Read by CallMethod when exec is written.
+                    };
+                } exec;
+
+                u32 data;
+
+                INSERT_UNION_PADDING_WORDS(0x11);
+            };
+            std::array<u32, NUM_REGS> reg_array; // Raw view, one u32 per register.
+        };
+    } regs{};
+
+private:
+    Core::System& system;
+    Upload::State upload_state; // Constructed from the memory manager and regs.upload.
+};
+
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4,                        \
+                  "Field " #field_name " has invalid position") // position is a u32 word index.
+
+ASSERT_REG_POSITION(upload, 0x60);
+ASSERT_REG_POSITION(exec, 0x6C);
+ASSERT_REG_POSITION(data, 0x6D);
+#undef ASSERT_REG_POSITION // The helper is only needed for the layout checks above.
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -0,0 +1,708 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+#include <optional>
+#include "common/assert.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/textures/texture.h"
+
+namespace Tegra::Engines {
+
+using VideoCore::QueryType;
+
+/// First register id that is actually a Macro call. Methods at or above this value are routed to
+/// ProcessMacro instead of being stored in the register file.
+constexpr u32 MacroRegistersStart = 0xE00;
+
+/// Constructs the 3D engine: keeps references to the system and the GPU memory manager, obtains a
+/// macro engine through GetMacroEngine and binds the upload state to regs.upload.
+Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
+    : system{system_}, memory_manager{memory_manager_}, macro_engine{GetMacroEngine(*this)},
+      upload_state{memory_manager, regs.upload} {
+    // Invert every dirty flag (presumably from all-clear to all-set, so that all state is treated
+    // as changed on first use - confirm against the declaration of dirty.flags).
+    dirty.flags.flip();
+    InitializeRegisterDefaults();
+}
+
+/// Nothing to release explicitly; members clean up after themselves.
+Maxwell3D::~Maxwell3D() = default;
+
+/// Stores the address of the rasterizer that later register writes are forwarded to. The engine
+/// does not take ownership; the caller must keep `rasterizer_` alive while it is bound.
+void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
+    rasterizer = &rasterizer_;
+}
+
+/// Resets the register file to the state games expect at boot.
+///
+/// All registers are zeroed first, then the registers that games are known to rely on without
+/// writing them are given explicit defaults. The result is copied into shadow_state, and the
+/// registers that the macro engine handles inline (see CallMethodFromMME) are flagged in
+/// mme_inline.
+void Maxwell3D::InitializeRegisterDefaults() {
+    // Initializes registers to their default values - what games expect them to be at boot. This is
+    // for certain registers that may not be explicitly set by games.
+
+    // Reset all registers to zero
+    std::memset(&regs, 0, sizeof(regs));
+
+    // Depth range near/far is not always set, but is expected to be the default 0.0f, 1.0f. This is
+    // needed for ARMS.
+    for (auto& viewport : regs.viewports) {
+        viewport.depth_range_near = 0.0f;
+        viewport.depth_range_far = 1.0f;
+    }
+    // Identity swizzle: every output component maps to the matching positive input component.
+    for (auto& viewport : regs.viewport_transform) {
+        viewport.swizzle.x.Assign(Regs::ViewportSwizzle::PositiveX);
+        viewport.swizzle.y.Assign(Regs::ViewportSwizzle::PositiveY);
+        viewport.swizzle.z.Assign(Regs::ViewportSwizzle::PositiveZ);
+        viewport.swizzle.w.Assign(Regs::ViewportSwizzle::PositiveW);
+    }
+
+    // Doom and Bomberman seem to rely on the uninitialized blend registers and only enable
+    // blending, so initialize the blend registers with sane values (source * 1 + dest * 0).
+    regs.blend.equation_rgb = Regs::Blend::Equation::Add;
+    regs.blend.factor_source_rgb = Regs::Blend::Factor::One;
+    regs.blend.factor_dest_rgb = Regs::Blend::Factor::Zero;
+    regs.blend.equation_a = Regs::Blend::Equation::Add;
+    regs.blend.factor_source_a = Regs::Blend::Factor::One;
+    regs.blend.factor_dest_a = Regs::Blend::Factor::Zero;
+    for (auto& blend : regs.independent_blend) {
+        blend.equation_rgb = Regs::Blend::Equation::Add;
+        blend.factor_source_rgb = Regs::Blend::Factor::One;
+        blend.factor_dest_rgb = Regs::Blend::Factor::Zero;
+        blend.equation_a = Regs::Blend::Equation::Add;
+        blend.factor_source_a = Regs::Blend::Factor::One;
+        blend.factor_dest_a = Regs::Blend::Factor::Zero;
+    }
+
+    // Stencil: keep the buffer contents and always pass, with all mask bits set, on both faces.
+    regs.stencil_front_op_fail = Regs::StencilOp::Keep;
+    regs.stencil_front_op_zfail = Regs::StencilOp::Keep;
+    regs.stencil_front_op_zpass = Regs::StencilOp::Keep;
+    regs.stencil_front_func_func = Regs::ComparisonOp::Always;
+    regs.stencil_front_func_mask = 0xFFFFFFFF;
+    regs.stencil_front_mask = 0xFFFFFFFF;
+    regs.stencil_two_side_enable = 1;
+    regs.stencil_back_op_fail = Regs::StencilOp::Keep;
+    regs.stencil_back_op_zfail = Regs::StencilOp::Keep;
+    regs.stencil_back_op_zpass = Regs::StencilOp::Keep;
+    regs.stencil_back_func_func = Regs::ComparisonOp::Always;
+    regs.stencil_back_func_mask = 0xFFFFFFFF;
+    regs.stencil_back_mask = 0xFFFFFFFF;
+
+    regs.depth_test_func = Regs::ComparisonOp::Always;
+    // front_face is deliberately not set here: its boot value is assigned once, together with the
+    // other NVN defaults below.
+    regs.cull_face = Regs::CullFace::Back;
+
+    // TODO(Rodrigo): Most games do not set a point size. I think this is a case of a
+    // register carrying a default value. Assume it's OpenGL's default (1).
+    regs.point_size = 1.0f;
+
+    // TODO(bunnei): Some games do not initialize the color masks (e.g. Sonic Mania). Assuming a
+    // default of enabled fixes rendering here.
+    for (auto& color_mask : regs.color_mask) {
+        color_mask.R.Assign(1);
+        color_mask.G.Assign(1);
+        color_mask.B.Assign(1);
+        color_mask.A.Assign(1);
+    }
+
+    for (auto& format : regs.vertex_attrib_format) {
+        format.constant.Assign(1);
+    }
+
+    // NVN games expect these values to be enabled at boot
+    regs.rasterize_enable = 1;
+    regs.rt_separate_frag_data = 1;
+    regs.framebuffer_srgb = 1;
+    regs.line_width_aliased = 1.0f;
+    regs.line_width_smooth = 1.0f;
+    regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise;
+    regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill;
+    regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill;
+
+    // The shadow copy starts out identical to the live registers.
+    shadow_state = regs;
+
+    // Registers that CallMethodFromMME handles itself instead of forwarding to CallMethod.
+    mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_end_gl)] = true;
+    mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)] = true;
+    mme_inline[MAXWELL3D_REG_INDEX(vertex_buffer.count)] = true;
+    mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
+}
+
+/// Accumulates the parameters of a macro invocation and runs the macro once the command buffer
+/// has no more parameters for it.
+///
+/// @param method       Macro register being written.
+/// @param base_start   First parameter word of this write.
+/// @param amount       Number of parameter words starting at base_start.
+/// @param is_last_call True when no further parameters follow in the command buffer.
+void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
+    const bool starts_new_macro = executing_macro == 0;
+    if (starts_new_macro) {
+        // The first write of a macro call has to target the (even) method register; odd registers
+        // are the ARGS registers.
+        ASSERT_MSG((method % 2) == 0,
+                   "Can't start macro execution by writing to the ARGS register");
+        executing_macro = method;
+    }
+
+    const u32* const params_end = base_start + amount;
+    macro_params.insert(macro_params.end(), base_start, params_end);
+
+    if (!is_last_call) {
+        // More parameters are still pending; keep collecting.
+        return;
+    }
+
+    // Every parameter has arrived: run the macro and drop the collected words.
+    CallMacroMethod(executing_macro, macro_params);
+    macro_params.clear();
+}
+
+/// Applies the current shadow RAM mode to a register write.
+///
+/// @param method   Register index being written.
+/// @param argument Value supplied by the command stream.
+/// @return The value that should actually be written: the recorded shadow value in Replay mode,
+///         otherwise `argument` (which is also recorded in the tracking modes).
+u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
+    switch (shadow_state.shadow_ram_control) {
+    case Regs::ShadowRamControl::Track:
+    case Regs::ShadowRamControl::TrackWithFilter:
+        // Remember the value so it can be replayed later.
+        shadow_state.reg_array[method] = argument;
+        return argument;
+    case Regs::ShadowRamControl::Replay:
+        // Ignore the incoming value and hand back the recorded one.
+        return shadow_state.reg_array[method];
+    default:
+        // Any other mode passes the value through untouched.
+        return argument;
+    }
+}
+
+/// Stores `argument` into register `method` and, when the value actually changed, raises the
+/// dirty flag that each dirty table associates with that register.
+void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) {
+    auto& current_value = regs.reg_array[method];
+    if (current_value == argument) {
+        // Redundant write: nothing becomes dirty.
+        return;
+    }
+    current_value = argument;
+
+    for (const auto& table : dirty.tables) {
+        const auto flag_index = table[method];
+        dirty.flags[flag_index] = true;
+    }
+}
+
+/// Runs the side effect attached to a register write, if the register has one. Registers that
+/// are not listed in the switch are plain state and need no action here.
+///
+/// @param method             Register index that was written.
+/// @param argument           Value after shadow RAM processing.
+/// @param nonshadow_argument Value exactly as it appeared in the command stream.
+/// @param is_last_call       True when this is the last write of the current command.
+void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument,
+                                  bool is_last_call) {
+    switch (method) {
+    case MAXWELL3D_REG_INDEX(wait_for_idle):
+        return rasterizer->WaitForIdle();
+    case MAXWELL3D_REG_INDEX(shadow_ram_control):
+        // The control register itself is taken from the raw value, not the shadowed one.
+        shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(nonshadow_argument);
+        return;
+    case MAXWELL3D_REG_INDEX(macros.data):
+        // NOTE(review): unlike ProcessMacroUpload, this does not advance upload_address;
+        // presumably the macro engine appends code words per address - confirm in MacroEngine.
+        return macro_engine->AddCode(regs.macros.upload_address, argument);
+    case MAXWELL3D_REG_INDEX(macros.bind):
+        return ProcessMacroBind(argument);
+    case MAXWELL3D_REG_INDEX(firmware[4]):
+        return ProcessFirmwareCall4();
+    // Any of the sixteen const buffer data registers starts an inline const buffer update.
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
+        return StartCBData(method);
+    // One bind register per shader stage; the argument to ProcessCBBind is the stage index.
+    case MAXWELL3D_REG_INDEX(cb_bind[0]):
+        return ProcessCBBind(0);
+    case MAXWELL3D_REG_INDEX(cb_bind[1]):
+        return ProcessCBBind(1);
+    case MAXWELL3D_REG_INDEX(cb_bind[2]):
+        return ProcessCBBind(2);
+    case MAXWELL3D_REG_INDEX(cb_bind[3]):
+        return ProcessCBBind(3);
+    case MAXWELL3D_REG_INDEX(cb_bind[4]):
+        return ProcessCBBind(4);
+    case MAXWELL3D_REG_INDEX(draw.vertex_end_gl):
+        return DrawArrays();
+    case MAXWELL3D_REG_INDEX(clear_buffers):
+        return ProcessClearBuffers();
+    case MAXWELL3D_REG_INDEX(query.query_get):
+        return ProcessQueryGet();
+    case MAXWELL3D_REG_INDEX(condition.mode):
+        return ProcessQueryCondition();
+    case MAXWELL3D_REG_INDEX(counter_reset):
+        return ProcessCounterReset();
+    case MAXWELL3D_REG_INDEX(sync_info):
+        return ProcessSyncPoint();
+    case MAXWELL3D_REG_INDEX(exec_upload):
+        return upload_state.ProcessExec(regs.exec_upload.linear != 0);
+    case MAXWELL3D_REG_INDEX(data_upload):
+        upload_state.ProcessData(argument, is_last_call);
+        // Only report the memory write once the whole upload has been received.
+        if (is_last_call) {
+            OnMemoryWrite();
+        }
+        return;
+    case MAXWELL3D_REG_INDEX(fragment_barrier):
+        return rasterizer->FragmentBarrier();
+    case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
+        return rasterizer->TiledCacheBarrier();
+    }
+}
+
+/// Executes the macro bound to `method` with the collected `parameters`, then flushes any draw
+/// the macro left pending in mme_draw.
+void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
+    // The macro is considered finished from the engine's point of view as soon as it is started.
+    executing_macro = 0;
+
+    // Two registers (method + ARGS) belong to each macro, hence the shift; the result is wrapped
+    // to the size of the position table.
+    const u32 table_size = static_cast<u32>(macro_positions.size());
+    const u32 macro_number = (method - MacroRegistersStart) >> 1;
+    const u32 entry = macro_number % table_size;
+
+    macro_engine->Execute(*this, macro_positions[entry], parameters);
+
+    const bool draw_pending = mme_draw.current_mode != MMEDrawMode::Undefined;
+    if (draw_pending) {
+        FlushMMEInlineDraw();
+    }
+}
+
+/// Entry point for a single register write coming from the command stream.
+///
+/// @param method          Register index (or macro register when >= MacroRegistersStart).
+/// @param method_argument Value to write.
+/// @param is_last_call    True when this is the last write of the current command.
+void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
+    // Fast path: another word for the const buffer update that is already in flight.
+    if (method == cb_data_state.current) {
+        regs.reg_array[method] = method_argument;
+        ProcessCBData(method_argument);
+        return;
+    }
+
+    // A different register ends the in-flight const buffer update, if there is one.
+    if (cb_data_state.current != null_cb_data) {
+        FinishCBData();
+    }
+
+    // While a macro is collecting parameters, only its ARG register (method + 1) may be written.
+    if (executing_macro != 0) {
+        ASSERT(method == executing_macro + 1);
+    }
+
+    // Registers from 0xE00 upwards do not hold state: writing them triggers microcode that was
+    // uploaded to the GPU during initialization.
+    const bool is_macro_register = method >= MacroRegistersStart;
+    if (is_macro_register) {
+        ProcessMacro(method, &method_argument, 1, is_last_call);
+        return;
+    }
+
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid Maxwell3D register, increase the size of the Regs structure");
+
+    // Regular register: shadow RAM first, then the store plus dirty tracking, then side effects.
+    const u32 argument = ProcessShadowRam(method, method_argument);
+    ProcessDirtyRegisters(method, argument);
+    ProcessMethodCall(method, argument, method_argument, is_last_call);
+}
+
+/// Entry point for a burst of writes that all target the same register.
+///
+/// @param method          Register index (or macro register when >= MacroRegistersStart).
+/// @param base_start      First value of the burst.
+/// @param amount          Number of values available at base_start.
+/// @param methods_pending Number of values the command still expects, including this burst.
+void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                                u32 methods_pending) {
+    // Methods after 0xE00 are special, they're actually triggers for some microcode that was
+    // uploaded to the GPU during initialization.
+    if (method >= MacroRegistersStart) {
+        // The burst is the last one when it covers everything that is still pending.
+        ProcessMacro(method, base_start, amount, amount == methods_pending);
+        return;
+    }
+    switch (method) {
+    // Const buffer data is buffered in bulk rather than word by word.
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
+        ProcessCBMultiData(method, base_start, amount);
+        break;
+    default:
+        // Everything else is replayed as individual writes; a write is flagged as the last call
+        // when at most one value remains pending at that point.
+        for (std::size_t i = 0; i < amount; i++) {
+            CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
+        }
+        break;
+    }
+}
+
+/// Folds a count write issued by a macro into the draw that is being batched in mme_draw.
+///
+/// A new batch is opened when none is active and a begin was seen; a compatible write extends
+/// the current batch by one instance; anything else flushes the batch and is then retried.
+///
+/// @param expected_mode Draw mode implied by the register that was written.
+/// @param count         Vertex or index count that was written.
+void Maxwell3D::StepInstance(const MMEDrawMode expected_mode, const u32 count) {
+    for (;;) {
+        if (mme_draw.current_mode == MMEDrawMode::Undefined) {
+            // No batch is active: open one, but only if a begin is waiting to be consumed.
+            if (mme_draw.gl_begin_consume) {
+                mme_draw.current_mode = expected_mode;
+                mme_draw.current_count = count;
+                mme_draw.instance_count = 1;
+                mme_draw.gl_begin_consume = false;
+                mme_draw.gl_end_count = 0;
+            }
+            return;
+        }
+
+        const bool extends_batch =
+            mme_draw.current_mode == expected_mode && count == mme_draw.current_count &&
+            mme_draw.instance_mode && mme_draw.gl_begin_consume;
+        if (extends_batch) {
+            mme_draw.instance_count++;
+            mme_draw.gl_begin_consume = false;
+            return;
+        }
+
+        // Incompatible with the active batch: emit it and go around again to retry.
+        FlushMMEInlineDraw();
+    }
+}
+
+/// Register write issued by a running macro.
+///
+/// Registers flagged in mme_inline (draw begin/end and the vertex/index counts) are handled here
+/// so that consecutive draws can be batched in mme_draw; every other write flushes the pending
+/// batch and goes through the regular CallMethod path.
+///
+/// @param method          Register index written by the macro.
+/// @param method_argument Value written by the macro.
+void Maxwell3D::CallMethodFromMME(u32 method, u32 method_argument) {
+    // The method number comes from macro code, so make sure it addresses a real register before
+    // it is used to index mme_inline and the register file. Out-of-range methods take the regular
+    // path, where CallMethod routes macro registers and validates everything else.
+    const bool handled_inline = method < Regs::NUM_REGS && mme_inline[method];
+    if (!handled_inline) {
+        if (mme_draw.current_mode != MMEDrawMode::Undefined) {
+            FlushMMEInlineDraw();
+        }
+        CallMethod(method, method_argument, true);
+        return;
+    }
+
+    regs.reg_array[method] = method_argument;
+    if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count) ||
+        method == MAXWELL3D_REG_INDEX(index_array.count)) {
+        // A count write selects the draw mode: vertex count -> array draw, index count -> indexed.
+        const MMEDrawMode expected_mode = method == MAXWELL3D_REG_INDEX(vertex_buffer.count)
+                                              ? MMEDrawMode::Array
+                                              : MMEDrawMode::Indexed;
+        StepInstance(expected_mode, method_argument);
+    } else if (method == MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)) {
+        // Begin: note whether instancing is requested and arm the next count write.
+        mme_draw.instance_mode =
+            (regs.draw.instance_next != 0) || (regs.draw.instance_cont != 0);
+        mme_draw.gl_begin_consume = true;
+    } else {
+        // The only inline register left is draw.vertex_end_gl.
+        mme_draw.gl_end_count++;
+    }
+}
+
+/// Emits the draw that has been batched in mme_draw and resets the batching state.
+void Maxwell3D::FlushMMEInlineDraw() {
+    LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(),
+              regs.vertex_buffer.count);
+    ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
+    ASSERT(mme_draw.instance_count == mme_draw.gl_end_count);
+
+    // instance_next and instance_cont are mutually exclusive.
+    ASSERT_MSG(!regs.draw.instance_next || !regs.draw.instance_cont,
+               "Illegal combination of instancing parameters");
+
+    const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed;
+    const bool should_draw = ShouldExecute();
+    if (should_draw) {
+        // The second argument distinguishes this batched path from DrawArrays, which passes false.
+        rasterizer->Draw(is_indexed, true);
+    }
+
+    // TODO(bunnei): The count of the mode that was just drawn is cleared so that these registers
+    // can be used to tell indexed from direct draws. This still needs to be verified on HW - it
+    // is possible that it is incorrect and that some other register specifies the drawing mode.
+    if (!is_indexed) {
+        regs.vertex_buffer.count = 0;
+    } else {
+        regs.index_array.count = 0;
+    }
+
+    // Return the batching state to "no draw pending".
+    mme_draw.gl_end_count = 0;
+    mme_draw.gl_begin_consume = false;
+    mme_draw.instance_mode = false;
+    mme_draw.instance_count = 0;
+    mme_draw.current_count = 0;
+    mme_draw.current_mode = MMEDrawMode::Undefined;
+}
+
+/// Hands one macro code word to the macro engine at the current upload address and advances the
+/// upload address register by one.
+// NOTE(review): the macros.data case in ProcessMethodCall calls AddCode directly without
+// advancing the address; confirm which of the two behaviors callers of this helper expect.
+void Maxwell3D::ProcessMacroUpload(u32 data) {
+    macro_engine->AddCode(regs.macros.upload_address++, data);
+}
+
+/// Records `data` as the start position of the macro selected by regs.macros.entry and advances
+/// the entry register so that consecutive binds fill consecutive slots.
+///
+/// @param data Macro start position written by the guest.
+void Maxwell3D::ProcessMacroBind(u32 data) {
+    // The entry register is guest-controlled. Wrap it to the table size, the same way
+    // CallMacroMethod wraps its lookup, so an out-of-range value cannot write past the end of
+    // macro_positions.
+    const std::size_t slot = regs.macros.entry++ % macro_positions.size();
+    macro_positions[slot] = data;
+}
+
+/// Stub for firmware call 4.
+///
+/// The real call is a blob that modifies some registers depending on its parameters. Those
+/// registers do not affect emulation, so the call is emulated by writing 1 to register 0xd00.
+void Maxwell3D::ProcessFirmwareCall4() {
+    constexpr std::size_t stub_register = 0xd00;
+
+    LOG_WARNING(HW_GPU, "(STUBBED) called");
+    regs.reg_array[stub_register] = 1;
+}
+
+/// Writes a query result to the address held in the query registers.
+///
+/// @param payload    Result value.
+/// @param long_query True to write the 128-bit {value, timestamp} record, false to write only
+///                   the low 32 bits of the value.
+void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
+    struct LongQueryResult {
+        u64_le value;
+        u64_le timestamp;
+    };
+    static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size");
+
+    const GPUVAddr sequence_address{regs.query.QueryAddress()};
+    if (!long_query) {
+        // Short form: a single 32-bit word.
+        memory_manager.Write<u32>(sequence_address, static_cast<u32>(payload));
+        return;
+    }
+
+    // Long form: value followed by the current GPU tick count. Note: an infinitely fast GPU is
+    // emulated here; on real hardware this command may take a while to complete because of GPU
+    // wait queues.
+    LongQueryResult query_result{payload, system.GPU().GetTicks()};
+    memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
+}
+
+/// Handles a write to query.query_get by performing the requested query operation.
+void Maxwell3D::ProcessQueryGet() {
+    // TODO(Subv): Support the other query units.
+    if (regs.query.query_get.unit != Regs::QueryUnit::Crop) {
+        LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented");
+    }
+
+    // Stamps `payload` in the format (long or short) selected by the query registers.
+    const auto stamp = [this](u64 payload) {
+        StampQueryResult(payload, regs.query.query_get.short_query == 0);
+    };
+
+    switch (regs.query.query_get.operation) {
+    case Regs::QueryOperation::Release: {
+        const bool is_fence = regs.query.query_get.fence == 1;
+        if (is_fence) {
+            rasterizer->SignalSemaphore(regs.query.QueryAddress(), regs.query.query_sequence);
+        } else {
+            stamp(regs.query.query_sequence);
+        }
+        break;
+    }
+    case Regs::QueryOperation::Acquire:
+        // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that
+        // matches the current payload.
+        UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");
+        break;
+    case Regs::QueryOperation::Counter: {
+        // An empty optional means the query is cached and deferred; only an immediate result is
+        // stamped here.
+        const std::optional<u64> result = GetQueryResult();
+        if (result) {
+            stamp(*result);
+        }
+        break;
+    }
+    case Regs::QueryOperation::Trap:
+        UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unknown query operation");
+        break;
+    }
+}
+
+/// Re-evaluates the conditional rendering predicate (execute_on) after a write to condition.mode.
+///
+/// The data-dependent modes read a Regs::QueryCompare record from the condition address in GPU
+/// memory. Unknown modes are reported and treated as "always execute".
+void Maxwell3D::ProcessQueryCondition() {
+    const GPUVAddr condition_address{regs.condition.Address()};
+
+    // Fetches the comparison record the data-dependent modes operate on.
+    const auto read_compare = [this, condition_address] {
+        Regs::QueryCompare cmp;
+        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
+        return cmp;
+    };
+
+    switch (regs.condition.mode) {
+    case Regs::ConditionMode::Always: {
+        execute_on = true;
+        break;
+    }
+    case Regs::ConditionMode::Never: {
+        execute_on = false;
+        break;
+    }
+    case Regs::ConditionMode::ResNonZero: {
+        const Regs::QueryCompare cmp = read_compare();
+        execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
+        break;
+    }
+    case Regs::ConditionMode::Equal: {
+        const Regs::QueryCompare cmp = read_compare();
+        execute_on =
+            cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
+        break;
+    }
+    case Regs::ConditionMode::NotEqual: {
+        const Regs::QueryCompare cmp = read_compare();
+        execute_on =
+            cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
+        break;
+    }
+    default: {
+        UNIMPLEMENTED_MSG("Unimplemented Condition Mode!");
+        // Fall back to executing so that rendering is not silently dropped.
+        execute_on = true;
+        break;
+    }
+    }
+}
+
+/// Handles a write to counter_reset. Only the sample counter is supported; other counters are
+/// reported and ignored.
+void Maxwell3D::ProcessCounterReset() {
+    switch (regs.counter_reset) {
+    case Regs::CounterReset::SampleCnt:
+        rasterizer->ResetCounter(QueryType::SamplesPassed);
+        break;
+    default:
+        // This engine is not tied to a specific renderer, so log under HW_GPU like the rest of
+        // the file rather than under a backend-specific class.
+        LOG_DEBUG(HW_GPU, "Unimplemented counter reset={}", regs.counter_reset);
+        break;
+    }
+}
+
+/// Handles a write to sync_info: signals the selected sync point when the increment field is
+/// non-zero.
+void Maxwell3D::ProcessSyncPoint() {
+    const auto& info = regs.sync_info;
+
+    // Read but not acted upon yet.
+    [[maybe_unused]] const u32 cache_flush = info.unknown.Value();
+    const u32 increment = info.increment.Value();
+    const u32 sync_point = info.sync_point.Value();
+
+    if (increment == 0) {
+        return;
+    }
+    rasterizer->SignalSyncPoint(sync_point);
+}
+
+/// Performs a draw triggered directly by the command stream (a write to draw.vertex_end_gl).
+void Maxwell3D::DrawArrays() {
+    LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(),
+              regs.vertex_buffer.count);
+    ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
+
+    // instance_next and instance_cont are mutually exclusive.
+    ASSERT_MSG(!regs.draw.instance_next || !regs.draw.instance_cont,
+               "Illegal combination of instancing parameters");
+
+    // Update the running instance index *before* drawing: "next" advances it, "cont" keeps it
+    // and neither flag restarts it at zero.
+    const bool advance_instance = regs.draw.instance_next != 0;
+    const bool keep_instance = regs.draw.instance_cont != 0;
+    if (advance_instance) {
+        state.current_instance += 1;
+    } else if (!keep_instance) {
+        state.current_instance = 0;
+    }
+
+    const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count};
+    const bool should_draw = ShouldExecute();
+    if (should_draw) {
+        rasterizer->Draw(is_indexed, false);
+    }
+
+    // TODO(bunnei): The count of the mode that was just drawn is cleared so that these registers
+    // can be used to tell indexed from direct draws. This still needs to be verified on HW - it
+    // is possible that it is incorrect and that some other register specifies the drawing mode.
+    if (!is_indexed) {
+        regs.vertex_buffer.count = 0;
+    } else {
+        regs.index_array.count = 0;
+    }
+}
+
+/// Produces the value for a counter query selected by query_get.select.
+///
+/// @return The value to stamp immediately, or std::nullopt when the query was handed to the
+///         rasterizer and nothing should be stamped by the caller.
+std::optional<u64> Maxwell3D::GetQueryResult() {
+    switch (regs.query.query_get.select) {
+    case Regs::QuerySelect::Zero:
+        return 0;
+    case Regs::QuerySelect::SamplesPassed:
+        // Deferred.
+        rasterizer->Query(regs.query.QueryAddress(), QueryType::SamplesPassed,
+                          system.GPU().GetTicks());
+        return std::nullopt;
+    default:
+        LOG_DEBUG(HW_GPU, "Unimplemented query select type {}",
+                  regs.query.query_get.select.Value());
+        // Unsupported selections report a fixed placeholder value of 1.
+        return 1;
+    }
+}
+
+/// Binds the const buffer currently described by the const_buffer registers to the slot named in
+/// cb_bind[stage_index] of the given shader stage.
+///
+/// @param stage_index Index of the shader stage whose bind register was written.
+void Maxwell3D::ProcessCBBind(std::size_t stage_index) {
+    const auto& bind_data = regs.cb_bind[stage_index];
+    ASSERT(bind_data.index < Regs::MaxConstBuffers);
+
+    auto& slot = state.shader_stages[stage_index].const_buffers[bind_data.index];
+    slot.size = regs.const_buffer.cb_size;
+    slot.address = regs.const_buffer.BufferAddress();
+    slot.enabled = bind_data.valid.Value() != 0;
+}
+
+/// Appends one word to the staging buffer of the const buffer update in flight and advances the
+/// const buffer position by the size of that word.
+void Maxwell3D::ProcessCBData(u32 value) {
+    auto& staging = cb_data_state.buffer[cb_data_state.id];
+    staging[cb_data_state.counter++] = value;
+
+    // cb_pos is a byte offset, so one 32-bit word moves it by four.
+    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
+}
+
+/// Begins an inline const buffer update through data register `method` and stages the word that
+/// was just written to that register.
+void Maxwell3D::StartCBData(u32 method) {
+    constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]);
+    const u32 data_slot = method - first_cb_data;
+
+    cb_data_state.counter = 0;
+    cb_data_state.current = method;
+    cb_data_state.id = data_slot;
+    // Remember where the update started; FinishCBData writes from this offset.
+    cb_data_state.start_pos = regs.const_buffer.cb_pos;
+
+    ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]);
+}
+
+/// Stages a burst of const buffer data words written through data register `method`.
+///
+/// @param method     Const buffer data register the burst targets.
+/// @param start_base First word of the burst.
+/// @param amount     Number of words in the burst.
+void Maxwell3D::ProcessCBMultiData(u32 method, const u32* start_base, u32 amount) {
+    const bool continues_update = cb_data_state.current == method;
+    if (!continues_update) {
+        // A different register was in flight (or none): close it and start a new update here.
+        if (cb_data_state.current != null_cb_data) {
+            FinishCBData();
+        }
+        constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]);
+        cb_data_state.start_pos = regs.const_buffer.cb_pos;
+        cb_data_state.id = method - first_cb_data;
+        cb_data_state.current = method;
+        cb_data_state.counter = 0;
+    }
+
+    auto& staging = cb_data_state.buffer[cb_data_state.id];
+    const u32* const burst_end = start_base + amount;
+    for (const u32* word = start_base; word != burst_end; ++word) {
+        staging[cb_data_state.counter++] = *word;
+    }
+
+    // cb_pos is a byte offset: advance it by four bytes per staged word.
+    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4 * amount;
+}
+
+/// Ends the inline const buffer update in flight: writes the staged words to GPU memory,
+/// reports the memory write and marks the update state as idle.
+void Maxwell3D::FinishCBData() {
+    const GPUVAddr buffer_address = regs.const_buffer.BufferAddress();
+    ASSERT(buffer_address != 0);
+
+    // The update must not run past the end of the buffer.
+    ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size);
+
+    // The staged data covers the bytes between the start position and the current position.
+    const std::size_t size = regs.const_buffer.cb_pos - cb_data_state.start_pos;
+    const GPUVAddr address{buffer_address + cb_data_state.start_pos};
+    const auto& staging = cb_data_state.buffer[cb_data_state.id];
+
+    memory_manager.WriteBlock(address, staging.data(), size);
+    OnMemoryWrite();
+
+    cb_data_state.current = null_cb_data;
+    cb_data_state.id = null_cb_data;
+}
+
+/// Reads entry `tic_index` of the TIC table, whose base address is held in regs.tic.
+Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
+    constexpr std::size_t entry_size = sizeof(Texture::TICEntry);
+    const GPUVAddr tic_address_gpu{regs.tic.Address() + tic_index * entry_size};
+
+    Texture::TICEntry entry;
+    memory_manager.ReadBlockUnsafe(tic_address_gpu, &entry, entry_size);
+    return entry;
+}
+
+/// Reads entry `tsc_index` of the TSC table, whose base address is held in regs.tsc.
+Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
+    constexpr std::size_t entry_size = sizeof(Texture::TSCEntry);
+    const GPUVAddr tsc_address_gpu{regs.tsc.Address() + tsc_index * entry_size};
+
+    Texture::TSCEntry entry;
+    memory_manager.ReadBlockUnsafe(tsc_address_gpu, &entry, entry_size);
+    return entry;
+}
+
+/// Returns the current value of register `method`. The index is expected to be below
+/// Regs::NUM_REGS, which is checked with an assertion.
+u32 Maxwell3D::GetRegisterValue(u32 method) const {
+    ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register");
+    return regs.reg_array[method];
+}
+
+/// Handles a write to clear_buffers by asking the rasterizer to perform the clear.
+void Maxwell3D::ProcessClearBuffers() {
+    rasterizer->Clear();
+}
+
+/// Reads a 32-bit word from a const buffer bound to a graphics shader stage.
+///
+/// @param stage        Shader stage; must not be ShaderType::Compute.
+/// @param const_buffer Index of the const buffer within the stage.
+/// @param offset       Byte offset added to the buffer's address.
+/// @return The word read from GPU memory at that location.
+u32 Maxwell3D::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const {
+    ASSERT(stage != ShaderType::Compute);
+
+    const auto stage_index = static_cast<std::size_t>(stage);
+    const auto& buffer = state.shader_stages[stage_index].const_buffers[const_buffer];
+    const GPUVAddr word_address = buffer.address + offset;
+    return memory_manager.Read<u32>(word_address);
+}
+
+/// Looks up a sampler through the const buffer selected by regs.tex_cb_index.
+///
+/// @param stage  Shader stage the sampler belongs to.
+/// @param offset Index of the texture handle inside that const buffer, in handle-sized units.
+SamplerDescriptor Maxwell3D::AccessBoundSampler(ShaderType stage, u64 offset) const {
+    // Handles are stored back to back, so the handle index scales to a byte offset.
+    const u64 byte_offset = offset * sizeof(Texture::TextureHandle);
+    return AccessBindlessSampler(stage, regs.tex_cb_index, byte_offset);
+}
+
+/// Looks up a sampler whose texture handle is stored in a const buffer.
+///
+/// @param stage        Shader stage; must not be ShaderType::Compute.
+/// @param const_buffer Index of the const buffer that holds the handle.
+/// @param offset       Byte offset of the handle inside that buffer.
+SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                                   u64 offset) const {
+    ASSERT(stage != ShaderType::Compute);
+
+    const auto stage_index = static_cast<std::size_t>(stage);
+    const auto& handle_buffer = state.shader_stages[stage_index].const_buffers[const_buffer];
+    const GPUVAddr handle_address = handle_buffer.address + offset;
+
+    const u32 handle = memory_manager.Read<u32>(handle_address);
+    return AccessSampler(handle);
+}
+
+/// Builds a sampler descriptor from a raw texture handle: the descriptor is derived from the TIC
+/// entry and its shadow flag from the TSC entry's depth-compare setting.
+SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
+    const Texture::TextureHandle tex_handle{handle};
+
+    // Both table entries are fetched up front, TIC first.
+    const Texture::TICEntry image_entry = GetTICEntry(tex_handle.tic_id);
+    const Texture::TSCEntry sampler_entry = GetTSCEntry(tex_handle.tsc_id);
+
+    SamplerDescriptor descriptor = SamplerDescriptor::FromTIC(image_entry);
+    descriptor.is_shadow.Assign(sampler_entry.depth_compare_enabled.Value());
+    return descriptor;
+}
+
+/// Returns the guest driver profile kept by the bound rasterizer (mutable access).
+VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() {
+    return rasterizer->AccessGuestDriverProfile();
+}
+
+/// Returns the guest driver profile kept by the bound rasterizer (read-only access).
+const VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() const {
+    return rasterizer->AccessGuestDriverProfile();
+}
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -0,0 +1,219 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "core/settings.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/maxwell_dma.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_base.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra::Engines {
+
+using namespace Texture;
+
+/// Stores references to the system and to the GPU memory manager; no other work is done here.
+MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_)
+    : system{system_}, memory_manager{memory_manager_} {}
+
+MaxwellDMA::~MaxwellDMA() = default;
+
+/// Stores `method_argument` into register `method` and starts a copy when that register is
+/// launch_dma. `is_last_call` is not used by this engine.
+void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
+    // Register index (in 32-bit words) whose write triggers the transfer.
+    constexpr std::size_t launch_dma_method = offsetof(Regs, launch_dma) / sizeof(u32);
+
+    ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register");
+    regs.reg_array[method] = method_argument;
+
+    if (method != launch_dma_method) {
+        return;
+    }
+    Launch();
+}
+
+/// Writes `amount` consecutive arguments from `base_start` to the same register `method`.
+/// A write is flagged as the last call once at most one method remains pending.
+void MaxwellDMA::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                                 u32 methods_pending) {
+    for (u32 index = 0; index < amount; ++index) {
+        const bool is_last_call = methods_pending - index <= 1;
+        CallMethod(method, base_start[index], is_last_call);
+    }
+}
+
+/// Executes the copy described by the current register state.
+/// Asserts that no unsupported launch feature is requested, then dispatches on the
+/// source/destination memory layouts.
+void MaxwellDMA::Launch() {
+    LOG_TRACE(Render_OpenGL, "DMA copy 0x{:x} -> 0x{:x}", static_cast<GPUVAddr>(regs.offset_in),
+              static_cast<GPUVAddr>(regs.offset_out));
+
+    // TODO(Subv): Perform more research and implement all features of this engine.
+    const LaunchDMA& params = regs.launch_dma;
+    ASSERT(params.remap_enable == 0);
+    ASSERT(params.semaphore_type == LaunchDMA::SemaphoreType::NONE);
+    ASSERT(params.interrupt_type == LaunchDMA::InterruptType::NONE);
+    ASSERT(params.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED);
+    ASSERT(regs.dst_params.origin.x == 0);
+    ASSERT(regs.dst_params.origin.y == 0);
+
+    const bool src_linear = params.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
+    const bool dst_linear = params.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
+
+    if (!(src_linear || dst_linear)) {
+        // Neither side is pitch linear: block linear to block linear is not supported.
+        UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented");
+        return;
+    }
+
+    // All copies here update the main memory, so mark all rasterizer states as invalid.
+    system.GPU().Maxwell3D().OnMemoryWrite();
+
+    if (src_linear && dst_linear) {
+        CopyPitchToPitch();
+        return;
+    }
+
+    // Exactly one side is block linear from here on.
+    ASSERT(params.multi_line_enable == 1);
+    if (dst_linear) {
+        CopyBlockLinearToPitch();
+    } else {
+        CopyPitchToBlockLinear();
+    }
+}
+
+/// Copies between two pitch linear buffers, one line at a time.
+void MaxwellDMA::CopyPitchToPitch() {
+    // With `multi_line_enable` cleared the transfer is a 1D buffer of `line_length_in` bytes,
+    // which is the same as a single line starting at the base addresses. Otherwise a 2D image of
+    // dimensions (line_length_in, line_count) is taken from the source rectangle.
+    const u32 num_lines = regs.launch_dma.multi_line_enable != 0 ? regs.line_count : 1;
+
+    // There is no need to manually flush/invalidate the regions because CopyBlock does that for us.
+    for (u32 row = 0; row < num_lines; ++row) {
+        const size_t row_index = row;
+        const GPUVAddr src_row = regs.offset_in + row_index * regs.pitch_in;
+        const GPUVAddr dst_row = regs.offset_out + row_index * regs.pitch_out;
+        memory_manager.CopyBlock(dst_row, src_row, regs.line_length_in);
+    }
+}
+
+/// Copies a sub-rectangle from a block linear source into a pitch linear destination.
+/// Small copies are delegated to FastCopyBlockLinearToPitch(). Otherwise the source surface and
+/// the current destination contents are read, the rectangle is unswizzled into the destination
+/// buffer and the result is written back to `offset_out`.
+void MaxwellDMA::CopyBlockLinearToPitch() {
+    UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
+    UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0);
+    UNIMPLEMENTED_IF(regs.src_params.layer != 0);
+
+    // line_length_in is a guest-written register used as a divisor below and in the fast path.
+    // A zero-length line describes an empty copy, so skip it instead of dividing by zero.
+    if (regs.line_length_in == 0) {
+        LOG_ERROR(HW_GPU, "Skipping block linear to pitch copy with zero line_length_in");
+        return;
+    }
+
+    // Optimized path for micro copies.
+    const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
+    if (dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X) {
+        FastCopyBlockLinearToPitch();
+        return;
+    }
+
+    // Deswizzle the input and copy it over.
+    const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in;
+    const Parameters& src_params = regs.src_params;
+    const u32 width = src_params.width;
+    const u32 height = src_params.height;
+    const u32 depth = src_params.depth;
+    const u32 block_height = src_params.block_size.height;
+    const u32 block_depth = src_params.block_size.depth;
+    const size_t src_size =
+        CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
+
+    // The scratch buffers are members and only ever grow, so they are reused across launches.
+    if (read_buffer.size() < src_size) {
+        read_buffer.resize(src_size);
+    }
+    if (write_buffer.size() < dst_size) {
+        write_buffer.resize(dst_size);
+    }
+
+    // The destination is read first so that bytes the unswizzle does not touch are preserved.
+    memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
+    memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+
+    UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel,
+                     block_height, src_params.origin.x, src_params.origin.y, write_buffer.data(),
+                     read_buffer.data());
+
+    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+}
+
+/// Copies a pitch linear source rectangle into a block linear destination.
+/// The source lines and the current destination surface are read, the source is swizzled into
+/// the destination buffer (as a voxel slice when the destination block depth is non-zero,
+/// otherwise into the layer selected by dst_params.layer) and the surface is written back.
+void MaxwellDMA::CopyPitchToBlockLinear() {
+    UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
+
+    // line_length_in is a guest-written register used as a divisor below.
+    // A zero-length line describes an empty copy, so skip it instead of dividing by zero.
+    if (regs.line_length_in == 0) {
+        LOG_ERROR(HW_GPU, "Skipping pitch to block linear copy with zero line_length_in");
+        return;
+    }
+
+    const auto& dst_params = regs.dst_params;
+    const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in;
+    const u32 width = dst_params.width;
+    const u32 height = dst_params.height;
+    const u32 depth = dst_params.depth;
+    const u32 block_height = dst_params.block_size.height;
+    const u32 block_depth = dst_params.block_size.depth;
+    const size_t dst_size =
+        CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
+    const size_t dst_layer_size =
+        CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth);
+
+    const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count;
+
+    // The scratch buffers are members and only ever grow, so they are reused across launches.
+    if (read_buffer.size() < src_size) {
+        read_buffer.resize(src_size);
+    }
+    if (write_buffer.size() < dst_size) {
+        write_buffer.resize(dst_size);
+    }
+
+    // The "unsafe" reads are only used below the extreme GPU accuracy level.
+    if (Settings::IsGPULevelExtreme()) {
+        memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
+        memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+    } else {
+        memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size);
+        memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
+    }
+
+    // If the input is linear and the output is tiled, swizzle the input and copy it over.
+    if (regs.dst_params.block_size.depth > 0) {
+        ASSERT(dst_params.layer == 0);
+        SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height,
+                            bytes_per_pixel, block_height, block_depth, dst_params.origin.x,
+                            dst_params.origin.y, write_buffer.data(), read_buffer.data());
+    } else {
+        SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel,
+                       write_buffer.data() + dst_layer_size * dst_params.layer, read_buffer.data(),
+                       block_height, dst_params.origin.x, dst_params.origin.y);
+    }
+
+    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+}
+
+/// Fast path of CopyBlockLinearToPitch() for small copies.
+/// Reads GOB_SIZE bytes of the source starting at the offset returned by GetGOBOffset() for the
+/// source origin, then unswizzles the requested rectangle into the destination with the origin
+/// reduced to coordinates inside that GOB.
+void MaxwellDMA::FastCopyBlockLinearToPitch() {
+    // Both divisions below use guest-written registers. A zero line length, or a pitch smaller
+    // than the line length (which yields zero bytes per pixel), would divide by zero.
+    if (regs.line_length_in == 0 || regs.pitch_out < regs.line_length_in) {
+        LOG_ERROR(HW_GPU, "Skipping block linear to pitch copy, pitch_out={} line_length_in={}",
+                  regs.pitch_out, regs.line_length_in);
+        return;
+    }
+
+    const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in;
+    const size_t src_size = GOB_SIZE;
+    const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
+    u32 pos_x = regs.src_params.origin.x;
+    u32 pos_y = regs.src_params.origin.y;
+    const u64 offset = GetGOBOffset(regs.src_params.width, regs.src_params.height, pos_x, pos_y,
+                                    regs.src_params.block_size.height, bytes_per_pixel);
+    // Reduce the origin to a position inside the GOB that was selected above.
+    const u32 x_in_gob = 64 / bytes_per_pixel;
+    pos_x = pos_x % x_in_gob;
+    pos_y = pos_y % 8;
+
+    // The scratch buffers are members and only ever grow, so they are reused across launches.
+    if (read_buffer.size() < src_size) {
+        read_buffer.resize(src_size);
+    }
+    if (write_buffer.size() < dst_size) {
+        write_buffer.resize(dst_size);
+    }
+
+    // The "unsafe" reads are only used below the extreme GPU accuracy level.
+    if (Settings::IsGPULevelExtreme()) {
+        memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), src_size);
+        memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+    } else {
+        memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), src_size);
+        memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
+    }
+
+    UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, regs.src_params.width,
+                     bytes_per_pixel, regs.src_params.block_size.height, pos_x, pos_y,
+                     write_buffer.data(), read_buffer.data());
+
+    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+}
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -0,0 +1,274 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/engines/engine_interface.h"
+#include "video_core/gpu.h"
+
+namespace Core {
+class System;
+}
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace Tegra::Engines {
+
+/**
+ * This engine is known as gk104_copy. Documentation can be found in:
+ * https://github.com/NVIDIA/open-gpu-doc/blob/master/classes/dma-copy/clb0b5.h
+ * https://github.com/envytools/envytools/blob/master/rnndb/fifo/gk104_copy.xml
+ */
+
+class MaxwellDMA final : public EngineInterface {
+public:
+    /// GPU virtual address split across two consecutive 32-bit words (upper word first).
+    struct PackedGPUVAddr {
+        u32 upper;
+        u32 lower;
+
+        /// Joins both words into a GPUVAddr. Only the low 8 bits of `upper` are kept, so the
+        /// resulting address is at most 40 bits wide.
+        constexpr operator GPUVAddr() const noexcept {
+            return (static_cast<GPUVAddr>(upper & 0xff) << 32) | lower;
+        }
+    };
+
+    union BlockSize {
+        BitField<0, 4, u32> width;
+        BitField<4, 4, u32> height;
+        BitField<8, 4, u32> depth;
+        BitField<12, 4, u32> gob_height;
+    };
+    static_assert(sizeof(BlockSize) == 4);
+
+    union Origin {
+        BitField<0, 16, u32> x;
+        BitField<16, 16, u32> y;
+    };
+    static_assert(sizeof(Origin) == 4);
+
+    struct Parameters {
+        BlockSize block_size;
+        u32 width;
+        u32 height;
+        u32 depth;
+        u32 layer;
+        Origin origin;
+    };
+    static_assert(sizeof(Parameters) == 24);
+
+    struct Semaphore {
+        PackedGPUVAddr address;
+        u32 payload;
+    };
+    static_assert(sizeof(Semaphore) == 12);
+
+    struct RenderEnable {
+        enum class Mode : u32 {
+            // Note: This uses Pascal case in order to avoid the identifiers
+            // FALSE and TRUE, which are reserved on Darwin.
+            False = 0,
+            True = 1,
+            Conditional = 2,
+            RenderIfEqual = 3,
+            RenderIfNotEqual = 4,
+        };
+
+        PackedGPUVAddr address;
+        BitField<0, 3, Mode> mode;
+    };
+    static_assert(sizeof(RenderEnable) == 12);
+
+    enum class PhysModeTarget : u32 {
+        LOCAL_FB = 0,
+        COHERENT_SYSMEM = 1,
+        NONCOHERENT_SYSMEM = 2,
+    };
+    using PhysMode = BitField<0, 2, PhysModeTarget>;
+
+    union LaunchDMA {
+        enum class DataTransferType : u32 {
+            NONE = 0,
+            PIPELINED = 1,
+            NON_PIPELINED = 2,
+        };
+
+        enum class SemaphoreType : u32 {
+            NONE = 0,
+            RELEASE_ONE_WORD_SEMAPHORE = 1,
+            RELEASE_FOUR_WORD_SEMAPHORE = 2,
+        };
+
+        enum class InterruptType : u32 {
+            NONE = 0,
+            BLOCKING = 1,
+            NON_BLOCKING = 2,
+        };
+
+        enum class MemoryLayout : u32 {
+            BLOCKLINEAR = 0,
+            PITCH = 1,
+        };
+
+        enum class Type : u32 {
+            VIRTUAL = 0,
+            PHYSICAL = 1,
+        };
+
+        enum class SemaphoreReduction : u32 {
+            IMIN = 0,
+            IMAX = 1,
+            IXOR = 2,
+            IAND = 3,
+            IOR = 4,
+            IADD = 5,
+            INC = 6,
+            DEC = 7,
+            FADD = 0xA,
+        };
+
+        enum class SemaphoreReductionSign : u32 {
+            SIGNED = 0,
+            UNSIGNED = 1,
+        };
+
+        enum class BypassL2 : u32 {
+            USE_PTE_SETTING = 0,
+            FORCE_VOLATILE = 1,
+        };
+
+        BitField<0, 2, DataTransferType> data_transfer_type;
+        BitField<2, 1, u32> flush_enable;
+        BitField<3, 2, SemaphoreType> semaphore_type;
+        BitField<5, 2, InterruptType> interrupt_type;
+        BitField<7, 1, MemoryLayout> src_memory_layout;
+        BitField<8, 1, MemoryLayout> dst_memory_layout;
+        BitField<9, 1, u32> multi_line_enable;
+        BitField<10, 1, u32> remap_enable;
+        BitField<11, 1, u32> rmwdisable;
+        BitField<12, 1, Type> src_type;
+        BitField<13, 1, Type> dst_type;
+        BitField<14, 4, SemaphoreReduction> semaphore_reduction;
+        BitField<18, 1, SemaphoreReductionSign> semaphore_reduction_sign;
+        BitField<19, 1, u32> reduction_enable;
+        BitField<20, 1, BypassL2> bypass_l2;
+    };
+    static_assert(sizeof(LaunchDMA) == 4);
+
+    struct RemapConst {
+        enum Swizzle : u32 {
+            SRC_X = 0,
+            SRC_Y = 1,
+            SRC_Z = 2,
+            SRC_W = 3,
+            CONST_A = 4,
+            CONST_B = 5,
+            NO_WRITE = 6,
+        };
+
+        PackedGPUVAddr address;
+
+        union {
+            BitField<0, 3, Swizzle> dst_x;
+            BitField<4, 3, Swizzle> dst_y;
+            BitField<8, 3, Swizzle> dst_z;
+            BitField<12, 3, Swizzle> dst_w;
+            BitField<16, 2, u32> component_size_minus_one;
+            BitField<20, 2, u32> num_src_components_minus_one;
+            BitField<24, 2, u32> num_dst_components_minus_one;
+        };
+    };
+    static_assert(sizeof(RemapConst) == 12);
+
+    explicit MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_);
+    ~MaxwellDMA();
+
+    /// Write the value to the register identified by method.
+    void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
+
+    /// Write multiple values to the register identified by method.
+    void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                         u32 methods_pending) override;
+
+private:
+    /// Performs the copy from the source buffer to the destination buffer as configured in the
+    /// registers.
+    void Launch();
+
+    void CopyPitchToPitch();
+
+    void CopyBlockLinearToPitch();
+
+    void CopyPitchToBlockLinear();
+
+    void FastCopyBlockLinearToPitch();
+
+    Core::System& system;
+
+    MemoryManager& memory_manager;
+
+    std::vector<u8> read_buffer;
+    std::vector<u8> write_buffer;
+
+    static constexpr std::size_t NUM_REGS = 0x800;
+    /// Register file of the engine.
+    /// The named fields and `reg_array` share storage through the union: CallMethod() writes
+    /// registers by index through `reg_array` and the engine reads them back through the named
+    /// fields. The `reserved*` arrays only pad the named fields to their register positions,
+    /// which are verified by the ASSERT_REG_POSITION checks following this struct.
+    struct Regs {
+        union {
+            struct {
+                u32 reserved[0x40];
+                u32 nop;
+                u32 reserved01[0xf];
+                u32 pm_trigger;
+                u32 reserved02[0x3f];
+                Semaphore semaphore;
+                u32 reserved03[0x2];
+                RenderEnable render_enable;
+                PhysMode src_phys_mode;
+                PhysMode dst_phys_mode;
+                u32 reserved04[0x26];
+                LaunchDMA launch_dma; // Writing this register starts the copy (see CallMethod).
+                u32 reserved05[0x3f];
+                PackedGPUVAddr offset_in;  // Source GPU address of the copy.
+                PackedGPUVAddr offset_out; // Destination GPU address of the copy.
+                u32 pitch_in;              // Distance between source lines in pitch layout.
+                u32 pitch_out;             // Distance between destination lines in pitch layout.
+                u32 line_length_in;        // Length of one copied line.
+                u32 line_count;            // Number of lines when multi_line_enable is set.
+                u32 reserved06[0xb8];
+                RemapConst remap_const;
+                Parameters dst_params;
+                u32 reserved07[0x1];
+                Parameters src_params;
+                u32 reserved08[0x275];
+                u32 pm_trigger_end;
+                u32 reserved09[0x3ba];
+            };
+            std::array<u32, NUM_REGS> reg_array;
+        };
+    } regs{};
+
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(MaxwellDMA::Regs, field_name) == position * 4,                          \
+                  "Field " #field_name " has invalid position")
+
+    ASSERT_REG_POSITION(launch_dma, 0xC0);
+    ASSERT_REG_POSITION(offset_in, 0x100);
+    ASSERT_REG_POSITION(offset_out, 0x102);
+    ASSERT_REG_POSITION(pitch_in, 0x104);
+    ASSERT_REG_POSITION(pitch_out, 0x105);
+    ASSERT_REG_POSITION(line_length_in, 0x106);
+    ASSERT_REG_POSITION(line_count, 0x107);
+    ASSERT_REG_POSITION(remap_const, 0x1C0);
+    ASSERT_REG_POSITION(dst_params, 0x1C3);
+    ASSERT_REG_POSITION(src_params, 0x1CA);
+
+#undef ASSERT_REG_POSITION
+};
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
--- a/src/video_core/engines/shader_header.h
+++ b/src/video_core/engines/shader_header.h
@@ -0,0 +1,158 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <optional>
+
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra::Shader {
+
+/// Output primitive topology values as stored in the 4-bit Header::common3.output_topology field.
+enum class OutputTopology : u32 {
+    PointList = 1,
+    LineStrip = 6,
+    TriangleStrip = 7,
+};
+
+/// Per-component input mode of a pixel shader generic attribute. Each value fits in the 2-bit
+/// fields of Header::ps::imap_generic_vector; Header::ps::GetPixelImap() reports it as the
+/// interpolation mode.
+enum class PixelImap : u8 {
+    Unused = 0,
+    Constant = 1,
+    Perspective = 2,
+    ScreenLinear = 3,
+};
+
+// Documentation in:
+// http://download.nvidia.com/open-gpu-doc/Shader-Program-Header/1/Shader-Program-Header.html
+struct Header {
+    union {
+        BitField<0, 5, u32> sph_type;
+        BitField<5, 5, u32> version;
+        BitField<10, 4, u32> shader_type;
+        BitField<14, 1, u32> mrt_enable;
+        BitField<15, 1, u32> kills_pixels;
+        BitField<16, 1, u32> does_global_store;
+        BitField<17, 4, u32> sass_version;
+        BitField<21, 5, u32> reserved;
+        BitField<26, 1, u32> does_load_or_store;
+        BitField<27, 1, u32> does_fp64;
+        BitField<28, 4, u32> stream_out_mask;
+    } common0;
+
+    union {
+        BitField<0, 24, u32> shader_local_memory_low_size;
+        BitField<24, 8, u32> per_patch_attribute_count;
+    } common1;
+
+    union {
+        BitField<0, 24, u32> shader_local_memory_high_size;
+        BitField<24, 8, u32> threads_per_input_primitive;
+    } common2;
+
+    union {
+        BitField<0, 24, u32> shader_local_memory_crs_size;
+        BitField<24, 4, OutputTopology> output_topology;
+        BitField<28, 4, u32> reserved;
+    } common3;
+
+    union {
+        BitField<0, 12, u32> max_output_vertices;
+        BitField<12, 8, u32> store_req_start; // NOTE: not used by geometry shaders.
+        BitField<20, 4, u32> reserved;
+        BitField<24, 8, u32> store_req_end; // NOTE: not used by geometry shaders.
+    } common4;
+
+    union {
+        struct {
+            INSERT_UNION_PADDING_BYTES(3);  // ImapSystemValuesA
+            INSERT_UNION_PADDING_BYTES(1);  // ImapSystemValuesB
+            INSERT_UNION_PADDING_BYTES(16); // ImapGenericVector[32]
+            INSERT_UNION_PADDING_BYTES(2);  // ImapColor
+            union {
+                BitField<0, 8, u16> clip_distances;
+                BitField<8, 1, u16> point_sprite_s;
+                BitField<9, 1, u16> point_sprite_t;
+                BitField<10, 1, u16> fog_coordinate;
+                BitField<12, 1, u16> tessellation_eval_point_u;
+                BitField<13, 1, u16> tessellation_eval_point_v;
+                BitField<14, 1, u16> instance_id;
+                BitField<15, 1, u16> vertex_id;
+            };
+            INSERT_UNION_PADDING_BYTES(5);  // ImapFixedFncTexture[10]
+            INSERT_UNION_PADDING_BYTES(1);  // ImapReserved
+            INSERT_UNION_PADDING_BYTES(3);  // OmapSystemValuesA
+            INSERT_UNION_PADDING_BYTES(1);  // OmapSystemValuesB
+            INSERT_UNION_PADDING_BYTES(16); // OmapGenericVector[32]
+            INSERT_UNION_PADDING_BYTES(2);  // OmapColor
+            INSERT_UNION_PADDING_BYTES(2);  // OmapSystemValuesC
+            INSERT_UNION_PADDING_BYTES(5);  // OmapFixedFncTexture[10]
+            INSERT_UNION_PADDING_BYTES(1);  // OmapReserved
+        } vtg;
+
+        struct {
+            INSERT_UNION_PADDING_BYTES(3); // ImapSystemValuesA
+            INSERT_UNION_PADDING_BYTES(1); // ImapSystemValuesB
+
+            union {
+                BitField<0, 2, PixelImap> x;
+                BitField<2, 2, PixelImap> y;
+                BitField<4, 2, PixelImap> z;
+                BitField<6, 2, PixelImap> w;
+                u8 raw;
+            } imap_generic_vector[32];
+
+            INSERT_UNION_PADDING_BYTES(2);  // ImapColor
+            INSERT_UNION_PADDING_BYTES(2);  // ImapSystemValuesC
+            INSERT_UNION_PADDING_BYTES(10); // ImapFixedFncTexture[10]
+            INSERT_UNION_PADDING_BYTES(2);  // ImapReserved
+
+            struct {
+                u32 target;
+                union {
+                    BitField<0, 1, u32> sample_mask;
+                    BitField<1, 1, u32> depth;
+                    BitField<2, 30, u32> reserved;
+                };
+            } omap;
+
+            /// Returns true when the bit for `component` of `render_target` is set in
+            /// omap.target. Every render target owns four consecutive bits of the 32-bit mask,
+            /// so callers must keep `render_target * 4 + component` below 32.
+            bool IsColorComponentOutputEnabled(u32 render_target, u32 component) const {
+                const u32 bit = render_target * 4 + component;
+                // Shift an unsigned one: with a signed `1`, testing bit 31 would shift into the
+                // sign bit of an int.
+                return (omap.target & (u32{1} << bit)) != 0;
+            }
+
+            /// Returns the interpolation mode of generic attribute `attribute`.
+            /// Components marked Unused are skipped; the mode of the last used component is
+            /// returned (Unused when no component is used) and a mismatch between used
+            /// components is logged.
+            PixelImap GetPixelImap(u32 attribute) const {
+                // Four 2-bit modes (x, y, z, w) packed into one byte.
+                const u32 packed_modes = imap_generic_vector[attribute].raw;
+
+                PixelImap selected = PixelImap::Unused;
+                for (u32 shift = 0; shift < 8; shift += 2) {
+                    const auto mode = static_cast<PixelImap>((packed_modes >> shift) & 3);
+                    if (mode == PixelImap::Unused) {
+                        continue;
+                    }
+                    if (selected != PixelImap::Unused && selected != mode) {
+                        LOG_CRITICAL(HW_GPU, "Generic attribute conflict in interpolation mode");
+                    }
+                    selected = mode;
+                }
+                return selected;
+            }
+        } ps;
+
+        std::array<u32, 0xF> raw;
+    };
+
+    /// Returns the shader local memory size assembled from the 24-bit low (common1) and high
+    /// (common2) size fields, with the high field placed above the low one.
+    /// NOTE(review): confirm against the Shader Program Header documentation that the two fields
+    /// are meant to be concatenated like this.
+    u64 GetLocalMemorySize() const {
+        // Widen before shifting: both fields are read as u32, and shifting the 24-bit high part
+        // left by 24 in 32-bit arithmetic would drop its upper 16 bits.
+        const u64 low_size = common1.shader_local_memory_low_size.Value();
+        const u64 high_size = common2.shader_local_memory_high_size.Value();
+        return low_size | (high_size << 24);
+    }
+};
+static_assert(sizeof(Header) == 0x50, "Incorrect structure size");
+
+} // namespace Tegra::Shader
--- a/src/video_core/engines/shader_type.h
+++ b/src/video_core/engines/shader_type.h
@@ -0,0 +1,21 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+
+/// Shader stages known to the engines. The values are contiguous from zero, and stage values
+/// are cast to std::size_t to index per-stage arrays (e.g. Maxwell3D's shader_stages).
+enum class ShaderType : u32 {
+    Vertex = 0,
+    TesselationControl = 1,
+    TesselationEval = 2,
+    Geometry = 3,
+    Fragment = 4,
+    Compute = 5,
+};
+static constexpr std::size_t MaxShaderTypes = 6;
+
+} // namespace Tegra::Engines
--- a/src/video_core/fence_manager.h
+++ b/src/video_core/fence_manager.h
@@ -0,0 +1,177 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <queue>
+
+#include "common/common_types.h"
+#include "core/core.h"
+#include "video_core/delayed_destruction_ring.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+/// State common to all backend fences handled by FenceManager.
+/// A fence is either a semaphore fence (GPU address + payload) or a non-semaphore fence that
+/// only carries a payload; FenceManager uses that payload as a syncpoint id.
+class FenceBase {
+public:
+    /// Creates a non-semaphore fence: the address is zero-initialized and only `payload_` is kept.
+    explicit FenceBase(u32 payload_, bool is_stubbed_)
+        : address{}, payload{payload_}, is_semaphore{false}, is_stubbed{is_stubbed_} {}
+
+    /// Creates a semaphore fence carrying both a GPU address and a payload.
+    explicit FenceBase(GPUVAddr address_, u32 payload_, bool is_stubbed_)
+        : address{address_}, payload{payload_}, is_semaphore{true}, is_stubbed{is_stubbed_} {}
+
+    /// GPU address given at construction; zero for non-semaphore fences.
+    GPUVAddr GetAddress() const {
+        return address;
+    }
+
+    /// Payload given at construction.
+    u32 GetPayload() const {
+        return payload;
+    }
+
+    /// True when the fence was created with an address.
+    bool IsSemaphore() const {
+        return is_semaphore;
+    }
+
+private:
+    GPUVAddr address;
+    u32 payload;
+    bool is_semaphore;
+
+protected:
+    /// Exposed to derived fences; set from the constructor's `is_stubbed_` argument.
+    bool is_stubbed;
+};
+
+/**
+ * Keeps guest fences in submission order and releases them from the front of the queue.
+ * Releasing a fence pops the caches' async flushes and then either writes the fence payload to
+ * its GPU address (semaphore fences) or increments the syncpoint named by the payload.
+ * Backends derive from this class and provide fence creation, queueing and waiting.
+ */
+template <typename TFence, typename TTextureCache, typename TTBufferCache, typename TQueryCache>
+class FenceManager {
+public:
+    /// Notify the fence manager about a new frame
+    void TickFrame() {
+        delayed_destruction_ring.Tick();
+    }
+
+    /// Queues a semaphore fence that writes `value` to `addr` once it is released.
+    void SignalSemaphore(GPUVAddr addr, u32 value) {
+        SignalFence([this, addr, value](bool is_stubbed) {
+            return CreateFence(addr, value, is_stubbed);
+        });
+    }
+
+    /// Queues a fence that increments syncpoint `value` once it is released.
+    void SignalSyncPoint(u32 value) {
+        SignalFence([this, value](bool is_stubbed) { return CreateFence(value, is_stubbed); });
+    }
+
+    /// Releases every queued fence, waiting on each one when the caches ask for it.
+    void WaitPendingFences() {
+        while (!fences.empty()) {
+            if (ShouldWait()) {
+                WaitFence(fences.front());
+            }
+            ReleaseFrontFence();
+        }
+    }
+
+protected:
+    explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
+                          TTextureCache& texture_cache_, TTBufferCache& buffer_cache_,
+                          TQueryCache& query_cache_)
+        : rasterizer{rasterizer_}, gpu{gpu_}, gpu_memory{gpu.MemoryManager()},
+          texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, query_cache{query_cache_} {}
+
+    virtual ~FenceManager() = default;
+
+    /// Creates a Sync Point Fence Interface, does not create a backend fence if 'is_stubbed' is
+    /// true
+    virtual TFence CreateFence(u32 value, bool is_stubbed) = 0;
+    /// Creates a Semaphore Fence Interface, does not create a backend fence if 'is_stubbed' is true
+    virtual TFence CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) = 0;
+    /// Queues a fence into the backend if the fence isn't stubbed.
+    virtual void QueueFence(TFence& fence) = 0;
+    /// Notifies that the backend fence has been signaled/reached in host GPU.
+    virtual bool IsFenceSignaled(TFence& fence) const = 0;
+    /// Waits until a fence has been signalled by the host GPU.
+    virtual void WaitFence(TFence& fence) = 0;
+
+    VideoCore::RasterizerInterface& rasterizer;
+    Tegra::GPU& gpu;
+    Tegra::MemoryManager& gpu_memory;
+    TTextureCache& texture_cache;
+    TTBufferCache& buffer_cache;
+    TQueryCache& query_cache;
+
+private:
+    /// Shared body of SignalSemaphore() and SignalSyncPoint().
+    /// `create_fence` receives the 'is_stubbed' flag and returns the new fence. The fence is
+    /// stubbed when no cache has uncommitted flushes; commands are flushed only otherwise.
+    template <typename CreateFunc>
+    void SignalFence(CreateFunc&& create_fence) {
+        TryReleasePendingFences();
+        // Sample this before committing, the commit below is what consumes the pending flushes.
+        const bool should_flush = ShouldFlush();
+        CommitAsyncFlushes();
+        TFence new_fence = create_fence(!should_flush);
+        fences.push(new_fence);
+        QueueFence(new_fence);
+        if (should_flush) {
+            rasterizer.FlushCommands();
+        }
+        rasterizer.SyncGuestHost();
+    }
+
+    /// Releases queued fences from the front until one is found that must be waited on and has
+    /// not been signalled yet.
+    void TryReleasePendingFences() {
+        while (!fences.empty()) {
+            if (ShouldWait() && !IsFenceSignaled(fences.front())) {
+                return;
+            }
+            ReleaseFrontFence();
+        }
+    }
+
+    /// Applies the guest-visible effect of the front fence and removes it from the queue.
+    /// The queue must not be empty.
+    void ReleaseFrontFence() {
+        PopAsyncFlushes();
+        TFence& current_fence = fences.front();
+        if (current_fence->IsSemaphore()) {
+            gpu_memory.template Write<u32>(current_fence->GetAddress(),
+                                           current_fence->GetPayload());
+        } else {
+            gpu.IncrementSyncPoint(current_fence->GetPayload());
+        }
+        PopFence();
+    }
+
+    /// True when any cache reports that its async flushes should be waited on.
+    bool ShouldWait() const {
+        return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() ||
+               query_cache.ShouldWaitAsyncFlushes();
+    }
+
+    /// True when any cache has flushes that have not been committed yet.
+    bool ShouldFlush() const {
+        return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() ||
+               query_cache.HasUncommittedFlushes();
+    }
+
+    void PopAsyncFlushes() {
+        texture_cache.PopAsyncFlushes();
+        buffer_cache.PopAsyncFlushes();
+        query_cache.PopAsyncFlushes();
+    }
+
+    void CommitAsyncFlushes() {
+        texture_cache.CommitAsyncFlushes();
+        buffer_cache.CommitAsyncFlushes();
+        query_cache.CommitAsyncFlushes();
+    }
+
+    /// Removes the front fence, handing it to the destruction ring instead of destroying it here.
+    void PopFence() {
+        delayed_destruction_ring.Push(std::move(fences.front()));
+        fences.pop();
+    }
+
+    std::queue<TFence> fences;
+
+    DelayedDestructionRing<TFence, 6> delayed_destruction_ring;
+};
+
+} // namespace VideoCommon
--- a/src/video_core/framebuffer_config.h
+++ b/src/video_core/framebuffer_config.h
@@ -0,0 +1,31 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+namespace Tegra {
+
+/**
+ * Struct describing framebuffer configuration
+ */
+struct FramebufferConfig {
+    /// Pixel formats a framebuffer can use; the numeric values are fixed explicitly.
+    enum class PixelFormat : u32 {
+        A8B8G8R8_UNORM = 1,
+        RGB565_UNORM = 4,
+        B8G8R8A8_UNORM = 5,
+    };
+
+    // Every member below except crop_rect has a `{}` default member initializer.
+    VAddr address{};
+    u32 offset{};
+    u32 width{};
+    u32 height{};
+    u32 stride{};
+    PixelFormat pixel_format{};
+
+    using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
+    TransformFlags transform_flags{};
+    // NOTE(review): crop_rect has no initializer here; its initial value depends on
+    // Common::Rectangle's default constructor.
+    Common::Rectangle<int> crop_rect;
+};
+
+} // namespace Tegra
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -0,0 +1,533 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <chrono>
+
+#include "common/assert.h"
+#include "common/microprofile.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/core_timing_util.h"
+#include "core/frontend/emu_window.h"
+#include "core/hardware_interrupt_manager.h"
+#include "core/memory.h"
+#include "core/settings.h"
+#include "video_core/engines/fermi_2d.h"
+#include "video_core/engines/kepler_compute.h"
+#include "video_core/engines/kepler_memory.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/maxwell_dma.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_base.h"
+#include "video_core/shader_notify.h"
+#include "video_core/video_core.h"
+
+namespace Tegra {
+
+MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
+
+/// Constructs the GPU together with its memory manager, DMA/CDMA pushers, all engines,
+/// the shader notifier and the GPU thread manager. No renderer is bound yet; see
+/// BindRenderer().
+GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
+    : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
+      dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)},
+      cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_},
+      maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
+      fermi_2d{std::make_unique<Engines::Fermi2D>()},
+      kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
+      maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
+      kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
+      shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
+      gpu_thread{system_, is_async_} {}
+
+// Defined out of line in this translation unit, which includes the engine, memory manager
+// and renderer headers that gpu.h only forward-declares.
+GPU::~GPU() = default;
+
+/// Takes ownership of the renderer and passes its rasterizer on to the memory manager
+/// and to the 3D, 2D and compute engines.
+void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) {
+    renderer = std::move(renderer_);
+
+    auto& bound_rasterizer = renderer->Rasterizer();
+    memory_manager->BindRasterizer(bound_rasterizer);
+    maxwell_3d->BindRasterizer(bound_rasterizer);
+    fermi_2d->BindRasterizer(bound_rasterizer);
+    kepler_compute->BindRasterizer(bound_rasterizer);
+}
+
+/// Returns the 3D engine owned by this GPU.
+Engines::Maxwell3D& GPU::Maxwell3D() {
+    return *maxwell_3d;
+}
+
+/// Returns the 3D engine owned by this GPU (const overload).
+const Engines::Maxwell3D& GPU::Maxwell3D() const {
+    return *maxwell_3d;
+}
+
+/// Returns the compute engine owned by this GPU.
+Engines::KeplerCompute& GPU::KeplerCompute() {
+    return *kepler_compute;
+}
+
+/// Returns the compute engine owned by this GPU (const overload).
+const Engines::KeplerCompute& GPU::KeplerCompute() const {
+    return *kepler_compute;
+}
+
+/// Returns the GPU memory manager.
+MemoryManager& GPU::MemoryManager() {
+    return *memory_manager;
+}
+
+/// Returns the GPU memory manager (const overload).
+const MemoryManager& GPU::MemoryManager() const {
+    return *memory_manager;
+}
+
+/// Returns the DMA pusher.
+DmaPusher& GPU::DmaPusher() {
+    return *dma_pusher;
+}
+
+/// Returns the CDMA pusher.
+/// NOTE(review): PushCommandBuffer() can reset cdma_pusher to null at the end of a video
+/// stream; dereferencing here does not check for that - confirm callers cannot hit it.
+Tegra::CDmaPusher& GPU::CDmaPusher() {
+    return *cdma_pusher;
+}
+
+/// Returns the DMA pusher (const overload).
+const DmaPusher& GPU::DmaPusher() const {
+    return *dma_pusher;
+}
+
+/// Returns the CDMA pusher (const overload); the null caveat on the non-const overload
+/// applies here as well.
+const Tegra::CDmaPusher& GPU::CDmaPusher() const {
+    return *cdma_pusher;
+}
+
+/// Blocks the caller until the given syncpoint has reached at least `value`.
+/// Returns immediately when the GPU is synchronous, and (after logging an error) for the
+/// unimplemented syncpoint id -1.
+void GPU::WaitFence(u32 syncpoint_id, u32 value) {
+    if (!is_async) {
+        // Synced GPU, is always in sync
+        return;
+    }
+    if (syncpoint_id == UINT32_MAX) {
+        // TODO: Research what this does.
+        LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented");
+        return;
+    }
+    MICROPROFILE_SCOPE(GPU_wait);
+    const auto has_reached_value = [=, this] {
+        return syncpoints.at(syncpoint_id).load() >= value;
+    };
+    std::unique_lock lock{sync_mutex};
+    sync_cv.wait(lock, has_reached_value);
+}
+
+/// Increments the syncpoint, wakes every thread blocked in WaitFence() and fires (then
+/// removes) each registered interrupt threshold the new value has reached.
+void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
+    auto& syncpoint = syncpoints.at(syncpoint_id);
+    syncpoint++;
+    std::lock_guard lock{sync_mutex};
+    sync_cv.notify_all();
+    auto& pending_thresholds = syncpt_interrupts.at(syncpoint_id);
+    if (pending_thresholds.empty()) {
+        return;
+    }
+    const u32 current_value = syncpoint.load();
+    for (auto threshold = pending_thresholds.begin(); threshold != pending_thresholds.end();) {
+        if (current_value >= *threshold) {
+            TriggerCpuInterrupt(syncpoint_id, *threshold);
+            // erase() returns the iterator following the removed element.
+            threshold = pending_thresholds.erase(threshold);
+        } else {
+            ++threshold;
+        }
+    }
+}
+
+/// Returns the current value of the given syncpoint. Uses std::array::at(), so an
+/// out-of-range id throws std::out_of_range instead of reading out of bounds.
+u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
+    return syncpoints.at(syncpoint_id).load();
+}
+
+/// Records `value` as an interrupt threshold for the syncpoint unless it is already
+/// registered.
+/// NOTE(review): unlike CancelSyncptInterrupt() this does not take sync_mutex itself;
+/// presumably callers already hold it through LockSync() - confirm.
+void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+    auto& thresholds = syncpt_interrupts.at(syncpoint_id);
+    const bool already_registered =
+        std::find(thresholds.begin(), thresholds.end(), value) != thresholds.end();
+    if (!already_registered) {
+        thresholds.emplace_back(value);
+    }
+}
+
+/// Removes a previously registered interrupt threshold for the syncpoint.
+/// Takes sync_mutex for the duration of the call.
+/// @returns true if the threshold was registered and has been removed, false otherwise.
+bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+    std::lock_guard lock{sync_mutex};
+    auto& thresholds = syncpt_interrupts.at(syncpoint_id);
+    const auto position = std::find(thresholds.begin(), thresholds.end(), value);
+    const bool was_registered = position != thresholds.end();
+    if (was_registered) {
+        thresholds.erase(position);
+    }
+    return was_registered;
+}
+
+/// Queues a flush of [addr, addr + size) to be executed by TickWork().
+/// @returns the fence id assigned to the request; CurrentFlushRequestFence() reaches this
+///          value once the flush has run.
+u64 GPU::RequestFlush(VAddr addr, std::size_t size) {
+    std::lock_guard lock{flush_request_mutex};
+    ++last_flush_fence;
+    flush_requests.emplace_back(last_flush_fence, addr, size);
+    return last_flush_fence;
+}
+
+/// Executes every pending flush request in FIFO order and publishes the fence of each
+/// completed request through current_flush_fence.
+///
+/// The request mutex is released while the rasterizer flush runs so RequestFlush() can keep
+/// queueing. Unlocking and relocking go through the std::unique_lock rather than the raw
+/// mutex so the guard always knows whether it owns the lock; if FlushRegion() throws, the
+/// guard's destructor will not unlock a mutex it no longer holds.
+void GPU::TickWork() {
+    std::unique_lock lock{flush_request_mutex};
+    while (!flush_requests.empty()) {
+        // Copy the request out before popping; the list node is destroyed by pop_front().
+        const auto request = flush_requests.front();
+        flush_requests.pop_front();
+
+        lock.unlock();
+        renderer->Rasterizer().FlushRegion(request.addr, request.size);
+        current_flush_fence.store(request.fence);
+        lock.lock();
+    }
+}
+
+/// Converts the global core timing clock (nanoseconds) into GPU ticks.
+u64 GPU::GetTicks() const {
+    // These values were reverse engineered by fincs from NVN.
+    // GPU ticks = nanoseconds * 384 / 625.
+    constexpr u64 gpu_ticks_num = 384;
+    constexpr u64 gpu_ticks_den = 625;
+
+    u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
+    if (Settings::values.use_fast_gpu_time.GetValue()) {
+        // With the fast GPU time setting enabled the reported time advances 256x slower.
+        nanoseconds /= 256;
+    }
+    // Scale the quotient and the remainder separately: same result as
+    // nanoseconds * num / den, without multiplying the full nanosecond count by num.
+    const u64 nanoseconds_num = nanoseconds / gpu_ticks_den;
+    const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den;
+    return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den;
+}
+
+/// Forwards to the bound renderer's rasterizer FlushCommands().
+/// Requires a renderer to have been bound with BindRenderer().
+void GPU::FlushCommands() {
+    renderer->Rasterizer().FlushCommands();
+}
+
+/// Forwards to the bound renderer's rasterizer SyncGuestHost().
+/// Requires a renderer to have been bound with BindRenderer().
+void GPU::SyncGuestHost() {
+    renderer->Rasterizer().SyncGuestHost();
+}
+
+/// Operation encoded in the low four bits of the semaphore_trigger register, as decoded by
+/// GPU::ProcessSemaphoreTriggerMethod().
+enum class GpuSemaphoreOperation {
+    AcquireEqual = 0x1,
+    WriteLong = 0x2,
+    AcquireGequal = 0x4,
+    AcquireMask = 0x8,
+};
+
+/// Dispatches a single method call either to the puller or to the engine bound to the
+/// call's subchannel, depending on the method number.
+void GPU::CallMethod(const MethodCall& method_call) {
+    LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method,
+              method_call.subchannel);
+
+    ASSERT(method_call.subchannel < bound_engines.size());
+
+    if (!ExecuteMethodOnEngine(method_call.method)) {
+        CallPullerMethod(method_call);
+        return;
+    }
+    CallEngineMethod(method_call);
+}
+
+/// Dispatches `amount` arguments for the same method. Engine methods are handed over in one
+/// batch; puller methods are replayed one argument at a time, with the pending count
+/// decreasing by one for each argument.
+void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
+                          u32 methods_pending) {
+    LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel);
+
+    ASSERT(subchannel < bound_engines.size());
+
+    if (ExecuteMethodOnEngine(method)) {
+        CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending);
+        return;
+    }
+
+    for (u32 index = 0; index < amount; ++index) {
+        const MethodCall puller_call{method, base_start[index], subchannel,
+                                     methods_pending - index};
+        CallPullerMethod(puller_call);
+    }
+}
+
+/// Returns true when the method number lies at or above BufferMethods::NonPullerMethods,
+/// i.e. it is handled by a bound engine rather than by the puller.
+bool GPU::ExecuteMethodOnEngine(u32 method) {
+    return static_cast<BufferMethods>(method) >= BufferMethods::NonPullerMethods;
+}
+
+/// Handles a method addressed to the puller: the argument is first stored in the puller
+/// register file, then the method-specific handler (if any) runs.
+void GPU::CallPullerMethod(const MethodCall& method_call) {
+    regs.reg_array[method_call.method] = method_call.argument;
+    const auto method = static_cast<BufferMethods>(method_call.method);
+
+    switch (method) {
+    case BufferMethods::BindObject:
+        ProcessBindMethod(method_call);
+        break;
+    // Pure register writes: storing the argument above is all that is needed.
+    case BufferMethods::Nop:
+    case BufferMethods::SemaphoreAddressHigh:
+    case BufferMethods::SemaphoreAddressLow:
+    case BufferMethods::SemaphoreSequence:
+    case BufferMethods::RefCnt:
+    case BufferMethods::UnkCacheFlush:
+    case BufferMethods::WrcacheFlush:
+    case BufferMethods::FenceValue:
+        break;
+    case BufferMethods::FenceAction:
+        ProcessFenceActionMethod();
+        break;
+    case BufferMethods::WaitForInterrupt:
+        ProcessWaitForInterruptMethod();
+        break;
+    case BufferMethods::SemaphoreTrigger:
+        ProcessSemaphoreTriggerMethod();
+        break;
+    case BufferMethods::NotifyIntr:
+        // TODO(Kmather73): Research and implement this method.
+        LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented");
+        break;
+    case BufferMethods::Unk28:
+        // TODO(Kmather73): Research and implement this method.
+        LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented");
+        break;
+    case BufferMethods::SemaphoreAcquire:
+        ProcessSemaphoreAcquire();
+        break;
+    case BufferMethods::SemaphoreRelease:
+        ProcessSemaphoreRelease();
+        break;
+    case BufferMethods::Yield:
+        // TODO(Kmather73): Research and implement this method.
+        LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented");
+        break;
+    default:
+        LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method);
+        break;
+    }
+}
+
+/// Forwards a single method call to whichever engine is bound to the call's subchannel.
+void GPU::CallEngineMethod(const MethodCall& method_call) {
+    const u32 method = method_call.method;
+    const u32 argument = method_call.argument;
+    const bool is_last_call = method_call.IsLastCall();
+
+    switch (bound_engines[method_call.subchannel]) {
+    case EngineID::FERMI_TWOD_A:
+        fermi_2d->CallMethod(method, argument, is_last_call);
+        break;
+    case EngineID::MAXWELL_B:
+        maxwell_3d->CallMethod(method, argument, is_last_call);
+        break;
+    case EngineID::KEPLER_COMPUTE_B:
+        kepler_compute->CallMethod(method, argument, is_last_call);
+        break;
+    case EngineID::MAXWELL_DMA_COPY_A:
+        maxwell_dma->CallMethod(method, argument, is_last_call);
+        break;
+    case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+        kepler_memory->CallMethod(method, argument, is_last_call);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented engine");
+    }
+}
+
+/// Forwards a batch of `amount` arguments for one method to whichever engine is bound to
+/// the given subchannel.
+void GPU::CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
+                                u32 methods_pending) {
+    switch (bound_engines[subchannel]) {
+    case EngineID::FERMI_TWOD_A:
+        fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending);
+        break;
+    case EngineID::MAXWELL_B:
+        maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending);
+        break;
+    case EngineID::KEPLER_COMPUTE_B:
+        kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending);
+        break;
+    case EngineID::MAXWELL_DMA_COPY_A:
+        maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending);
+        break;
+    case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+        kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented engine");
+    }
+}
+
+/// Binds the call's subchannel to the engine id given in the argument, both in
+/// bound_engines and (for known engines) in the DMA pusher.
+void GPU::ProcessBindMethod(const MethodCall& method_call) {
+    const u32 subchannel = method_call.subchannel;
+    LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, method_call.argument);
+
+    const auto engine_id = static_cast<EngineID>(method_call.argument);
+    // The id is recorded even when it is unknown; the switch below then reports it.
+    bound_engines[subchannel] = engine_id;
+
+    switch (engine_id) {
+    case EngineID::FERMI_TWOD_A:
+        dma_pusher->BindSubchannel(fermi_2d.get(), subchannel);
+        break;
+    case EngineID::MAXWELL_B:
+        dma_pusher->BindSubchannel(maxwell_3d.get(), subchannel);
+        break;
+    case EngineID::KEPLER_COMPUTE_B:
+        dma_pusher->BindSubchannel(kepler_compute.get(), subchannel);
+        break;
+    case EngineID::MAXWELL_DMA_COPY_A:
+        dma_pusher->BindSubchannel(maxwell_dma.get(), subchannel);
+        break;
+    case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+        dma_pusher->BindSubchannel(kepler_memory.get(), subchannel);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);
+    }
+}
+
+/// Executes the operation stored in the fence_action register: Acquire waits for the
+/// syncpoint to reach fence_value, Increment bumps the syncpoint.
+void GPU::ProcessFenceActionMethod() {
+    const auto& action = regs.fence_action;
+    switch (action.op) {
+    case FenceOperation::Acquire:
+        WaitFence(action.syncpoint_id, regs.fence_value);
+        break;
+    case FenceOperation::Increment:
+        IncrementSyncPoint(action.syncpoint_id);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented operation {}", action.op.Value());
+    }
+}
+
+/// Stub for the WaitForInterrupt puller method: only logs a warning, no waiting happens.
+void GPU::ProcessWaitForInterruptMethod() {
+    // TODO(bunnei) ImplementMe
+    LOG_WARNING(HW_GPU, "(STUBBED) called");
+}
+
+/// Executes the semaphore operation selected by the low four bits of semaphore_trigger.
+///
+/// WriteLong writes a 16-byte block {sequence, 0, timestamp} to the semaphore address.
+/// The acquire operations read the 32-bit word at the semaphore address and compare it with
+/// semaphore_sequence; when the condition is not yet met, the pending acquire is recorded
+/// in the puller state registers (acquire_*).
+void GPU::ProcessSemaphoreTriggerMethod() {
+    constexpr u32 semaphore_operation_mask = 0xF;
+    const auto op =
+        static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphore_operation_mask);
+    if (op == GpuSemaphoreOperation::WriteLong) {
+        struct Block {
+            u32 sequence;
+            u32 zeros = 0;
+            u64 timestamp;
+        };
+
+        Block block{};
+        block.sequence = regs.semaphore_sequence;
+        // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of
+        // CoreTiming
+        block.timestamp = GetTicks();
+        memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block,
+                                   sizeof(block));
+    } else {
+        const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())};
+        // The difference is taken in unsigned arithmetic and reinterpreted as signed so the
+        // comparison stays correct across wrap-around. "Greater or equal" includes equality,
+        // hence >= 0 (a strict > 0 would treat an exactly matching value as unsatisfied).
+        if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) ||
+            (op == GpuSemaphoreOperation::AcquireGequal &&
+             static_cast<s32>(word - regs.semaphore_sequence) >= 0) ||
+            (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) {
+            // Nothing to do in this case
+        } else {
+            regs.acquire_source = true;
+            regs.acquire_value = regs.semaphore_sequence;
+            if (op == GpuSemaphoreOperation::AcquireEqual) {
+                regs.acquire_active = true;
+                regs.acquire_mode = false;
+            } else if (op == GpuSemaphoreOperation::AcquireGequal) {
+                regs.acquire_active = true;
+                regs.acquire_mode = true;
+            } else if (op == GpuSemaphoreOperation::AcquireMask) {
+                // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with
+                // semaphore_sequence, gives a non-0 result
+                LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented");
+            } else {
+                LOG_ERROR(HW_GPU, "Invalid semaphore operation");
+            }
+        }
+    }
+}
+
+/// Writes the semaphore_release register value as a 32-bit word to the semaphore address.
+void GPU::ProcessSemaphoreRelease() {
+    memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(), regs.semaphore_release);
+}
+
+/// Compares the 32-bit word at the semaphore address with semaphore_acquire. When they
+/// differ, the pending acquire is recorded in the puller state registers.
+void GPU::ProcessSemaphoreAcquire() {
+    const u32 expected = regs.semaphore_acquire;
+    const u32 current = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress());
+    if (current == expected) {
+        // Already satisfied, no state to record.
+        return;
+    }
+    regs.acquire_active = true;
+    regs.acquire_value = expected;
+    // TODO(kemathe73) figure out how to do the acquire_timeout
+    regs.acquire_mode = false;
+    regs.acquire_source = false;
+}
+
+/// Starts the GPU thread and creates a shared graphics context, which is made current on
+/// the calling thread. Requires a renderer to have been bound with BindRenderer().
+void GPU::Start() {
+    gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher);
+    cpu_context = renderer->GetRenderWindow().CreateSharedContext();
+    cpu_context->MakeCurrent();
+}
+
+/// Makes the shared context created in Start() current on the calling thread.
+void GPU::ObtainContext() {
+    cpu_context->MakeCurrent();
+}
+
+/// Releases the shared context created in Start() from the calling thread.
+void GPU::ReleaseContext() {
+    cpu_context->DoneCurrent();
+}
+
+/// Hands the command list over to the GPU thread manager; `entries` is moved from.
+void GPU::PushGPUEntries(Tegra::CommandList&& entries) {
+    gpu_thread.SubmitList(std::move(entries));
+}
+
+/// Pushes an NVDEC command buffer to the CDMA pusher and dispatches it synchronously.
+///
+/// Does nothing when NVDEC emulation is disabled or when `entries` is empty. A first entry
+/// of 0xDEADB33F marks the end of a video stream and destroys the CDMA pusher; it is
+/// recreated lazily on the next regular submission. `entries` is moved from.
+void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
+    if (!use_nvdec) {
+        return;
+    }
+    // An empty list has nothing to dispatch, and entries[0] below must not be read from it.
+    if (entries.empty()) {
+        return;
+    }
+    // This condition fires when a video stream ends, clear all intermediary data
+    constexpr u32 end_of_stream_marker = 0xDEADB33F;
+    if (entries[0].raw == end_of_stream_marker) {
+        cdma_pusher.reset();
+        return;
+    }
+    if (!cdma_pusher) {
+        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
+    }
+
+    // SubmitCommandBuffer would make the nvdec operations async, this is not currently working
+    // TODO(ameerj): RE proper async nvdec operation
+    // gpu_thread.SubmitCommandBuffer(std::move(entries));
+
+    cdma_pusher->Push(std::move(entries));
+    cdma_pusher->DispatchCalls();
+}
+
+/// Forwards the swap request (render frame) to the GPU thread manager.
+void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    gpu_thread.SwapBuffers(framebuffer);
+}
+
+/// Forwards a flush request for [addr, addr + size) to the GPU thread manager.
+void GPU::FlushRegion(VAddr addr, u64 size) {
+    gpu_thread.FlushRegion(addr, size);
+}
+
+/// Forwards an invalidate request for [addr, addr + size) to the GPU thread manager.
+void GPU::InvalidateRegion(VAddr addr, u64 size) {
+    gpu_thread.InvalidateRegion(addr, size);
+}
+
+/// Forwards a combined flush and invalidate request for [addr, addr + size) to the GPU
+/// thread manager.
+void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    gpu_thread.FlushAndInvalidateRegion(addr, size);
+}
+
+/// Raises a GPU syncpoint interrupt for (syncpoint_id, value) on the system's interrupt
+/// manager.
+void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
+    system.InterruptManager().GPUInterruptSyncpt(syncpoint_id, value);
+}
+
+/// Forwards to the GPU thread manager's WaitIdle().
+void GPU::WaitIdle() const {
+    gpu_thread.WaitIdle();
+}
+
+/// Signals the end of a command list to the GPU thread. No-op on a synchronous GPU.
+void GPU::OnCommandListEnd() {
+    if (!is_async) {
+        // This command only applies to asynchronous GPU mode
+        return;
+    }
+    gpu_thread.OnCommandListEnd();
+}
+
+} // namespace Tegra
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -0,0 +1,436 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <atomic>
+#include <condition_variable>
+#include <list>
+#include <memory>
+#include <mutex>
+#include "common/common_types.h"
+#include "core/hle/service/nvdrv/nvdata.h"
+#include "core/hle/service/nvflinger/buffer_queue.h"
+#include "video_core/cdma_pusher.h"
+#include "video_core/dma_pusher.h"
+#include "video_core/framebuffer_config.h"
+#include "video_core/gpu_thread.h"
+
+/// Integer representation of a host pointer.
+using CacheAddr = std::uintptr_t;
+
+/// Converts a host pointer into its integer CacheAddr representation.
+[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) {
+    return reinterpret_cast<CacheAddr>(host_ptr);
+}
+
+/// Converts a CacheAddr back into a byte pointer. Note that the result is non-const even
+/// though ToCacheAddr() accepts a pointer to const.
+[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) {
+    return reinterpret_cast<u8*>(cache_addr);
+}
+
+namespace Core {
+namespace Frontend {
+class EmuWindow;
+}
+class System;
+} // namespace Core
+
+namespace VideoCore {
+class RendererBase;
+class ShaderNotify;
+} // namespace VideoCore
+
+namespace Tegra {
+
+/// Render target (color) formats with their numeric identifiers.
+enum class RenderTargetFormat : u32 {
+    NONE = 0x0,
+    // NOTE(review): the component order in this name (R32B32G32A32) differs from the sibling
+    // R32G32B32A32_SINT/UINT entries below - possibly a typo. Renaming it would affect every
+    // user of the enumerator, so it is left as is.
+    R32B32G32A32_FLOAT = 0xC0,
+    R32G32B32A32_SINT = 0xC1,
+    R32G32B32A32_UINT = 0xC2,
+    R16G16B16A16_UNORM = 0xC6,
+    R16G16B16A16_SNORM = 0xC7,
+    R16G16B16A16_SINT = 0xC8,
+    R16G16B16A16_UINT = 0xC9,
+    R16G16B16A16_FLOAT = 0xCA,
+    R32G32_FLOAT = 0xCB,
+    R32G32_SINT = 0xCC,
+    R32G32_UINT = 0xCD,
+    R16G16B16X16_FLOAT = 0xCE,
+    B8G8R8A8_UNORM = 0xCF,
+    B8G8R8A8_SRGB = 0xD0,
+    A2B10G10R10_UNORM = 0xD1,
+    A2B10G10R10_UINT = 0xD2,
+    A8B8G8R8_UNORM = 0xD5,
+    A8B8G8R8_SRGB = 0xD6,
+    A8B8G8R8_SNORM = 0xD7,
+    A8B8G8R8_SINT = 0xD8,
+    A8B8G8R8_UINT = 0xD9,
+    R16G16_UNORM = 0xDA,
+    R16G16_SNORM = 0xDB,
+    R16G16_SINT = 0xDC,
+    R16G16_UINT = 0xDD,
+    R16G16_FLOAT = 0xDE,
+    B10G11R11_FLOAT = 0xE0,
+    R32_SINT = 0xE3,
+    R32_UINT = 0xE4,
+    R32_FLOAT = 0xE5,
+    R5G6B5_UNORM = 0xE8,
+    A1R5G5B5_UNORM = 0xE9,
+    R8G8_UNORM = 0xEA,
+    R8G8_SNORM = 0xEB,
+    R8G8_SINT = 0xEC,
+    R8G8_UINT = 0xED,
+    R16_UNORM = 0xEE,
+    R16_SNORM = 0xEF,
+    R16_SINT = 0xF0,
+    R16_UINT = 0xF1,
+    R16_FLOAT = 0xF2,
+    R8_UNORM = 0xF3,
+    R8_SNORM = 0xF4,
+    R8_SINT = 0xF5,
+    R8_UINT = 0xF6,
+};
+
+/// Depth/stencil buffer formats with their numeric identifiers.
+enum class DepthFormat : u32 {
+    D32_FLOAT = 0xA,
+    D16_UNORM = 0x13,
+    S8_UINT_Z24_UNORM = 0x14,
+    D24X8_UNORM = 0x15,
+    D24S8_UNORM = 0x16,
+    D24C8_UNORM = 0x18,
+    D32_FLOAT_S8X24_UINT = 0x19,
+};
+
+struct CommandListHeader;
+class DebugContext;
+
+namespace Engines {
+class Fermi2D;
+class Maxwell3D;
+class MaxwellDMA;
+class KeplerCompute;
+class KeplerMemory;
+} // namespace Engines
+
+/// Engine class ids a command subchannel can be bound to (see GPU::ProcessBindMethod).
+enum class EngineID {
+    FERMI_TWOD_A = 0x902D, // 2D Engine
+    MAXWELL_B = 0xB197,    // 3D Engine
+    KEPLER_COMPUTE_B = 0xB1C0, // Compute engine
+    KEPLER_INLINE_TO_MEMORY_B = 0xA140, // Inline memory engine
+    MAXWELL_DMA_COPY_A = 0xB0B5, // DMA engine
+};
+
+class MemoryManager;
+
+/// Emulated GPU front end: owns the memory manager, the DMA/CDMA pushers, the engines, the
+/// renderer and the GPU thread manager, and implements the puller register state.
+class GPU final {
+public:
+    /// A single method write: method number, argument, target subchannel and the number
+    /// of method calls still pending in the current batch.
+    struct MethodCall {
+        u32 method{};
+        u32 argument{};
+        u32 subchannel{};
+        u32 method_count{};
+
+        explicit MethodCall(u32 method_, u32 argument_, u32 subchannel_ = 0, u32 method_count_ = 0)
+            : method(method_), argument(argument_), subchannel(subchannel_),
+              method_count(method_count_) {}
+
+        /// True when at most one call is left in the batch.
+        [[nodiscard]] bool IsLastCall() const {
+            return method_count <= 1;
+        }
+    };
+
+    explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_);
+    ~GPU();
+
+    /// Binds a renderer to the GPU.
+    void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer);
+
+    /// Calls a GPU method.
+    void CallMethod(const MethodCall& method_call);
+
+    /// Calls a GPU multivalue method.
+    void CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
+                         u32 methods_pending);
+
+    /// Flush all current written commands into the host GPU for execution.
+    void FlushCommands();
+    /// Synchronizes CPU writes with Host GPU memory.
+    void SyncGuestHost();
+    /// Signal the ending of command list.
+    void OnCommandListEnd();
+
+    /// Request a host GPU memory flush from the CPU.
+    [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size);
+
+    /// Obtains current flush request fence id.
+    [[nodiscard]] u64 CurrentFlushRequestFence() const {
+        return current_flush_fence.load(std::memory_order_relaxed);
+    }
+
+    /// Tick pending requests within the GPU.
+    void TickWork();
+
+    /// Returns a reference to the Maxwell3D GPU engine.
+    [[nodiscard]] Engines::Maxwell3D& Maxwell3D();
+
+    /// Returns a const reference to the Maxwell3D GPU engine.
+    [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const;
+
+    /// Returns a reference to the KeplerCompute GPU engine.
+    [[nodiscard]] Engines::KeplerCompute& KeplerCompute();
+
+    /// Returns a const reference to the KeplerCompute GPU engine.
+    [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const;
+
+    /// Returns a reference to the GPU memory manager.
+    [[nodiscard]] Tegra::MemoryManager& MemoryManager();
+
+    /// Returns a const reference to the GPU memory manager.
+    [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const;
+
+    /// Returns a reference to the GPU DMA pusher.
+    [[nodiscard]] Tegra::DmaPusher& DmaPusher();
+
+    /// Returns a const reference to the GPU DMA pusher.
+    [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const;
+
+    /// Returns a reference to the GPU CDMA pusher.
+    [[nodiscard]] Tegra::CDmaPusher& CDmaPusher();
+
+    /// Returns a const reference to the GPU CDMA pusher.
+    [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const;
+
+    /// Returns a reference to the underlying renderer.
+    [[nodiscard]] VideoCore::RendererBase& Renderer() {
+        return *renderer;
+    }
+
+    /// Returns a const reference to the underlying renderer.
+    [[nodiscard]] const VideoCore::RendererBase& Renderer() const {
+        return *renderer;
+    }
+
+    /// Returns a reference to the shader notifier.
+    [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() {
+        return *shader_notify;
+    }
+
+    /// Returns a const reference to the shader notifier.
+    [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const {
+        return *shader_notify;
+    }
+
+    /// Waits for the GPU to finish working
+    void WaitIdle() const;
+
+    /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
+    void WaitFence(u32 syncpoint_id, u32 value);
+
+    /// Increments a syncpoint, wakes waiters and fires the interrupts it has reached.
+    void IncrementSyncPoint(u32 syncpoint_id);
+
+    /// Returns the current value of a syncpoint.
+    [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const;
+
+    /// Registers an interrupt to be raised once the syncpoint reaches `value`.
+    void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+    /// Removes a registered syncpoint interrupt; returns whether one was found.
+    [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+    /// Returns the current time in GPU ticks.
+    [[nodiscard]] u64 GetTicks() const;
+
+    /// Locks sync_mutex and returns the owning lock. sync_mutex is a plain std::mutex, so
+    /// member functions that lock it themselves must not be called while this is held.
+    [[nodiscard]] std::unique_lock<std::mutex> LockSync() {
+        return std::unique_lock{sync_mutex};
+    }
+
+    /// Returns true when the GPU was constructed in asynchronous mode.
+    [[nodiscard]] bool IsAsync() const {
+        return is_async;
+    }
+
+    /// Returns true when NVDEC emulation was enabled at construction.
+    [[nodiscard]] bool UseNvdec() const {
+        return use_nvdec;
+    }
+
+    /// Operation stored in bit 0 of the fence_action register.
+    enum class FenceOperation : u32 {
+        Acquire = 0,
+        Increment = 1,
+    };
+
+    /// Layout of the fence_action register: bit 0 is the operation, bits 8-31 the
+    /// syncpoint id.
+    union FenceAction {
+        u32 raw;
+        BitField<0, 1, FenceOperation> op;
+        BitField<8, 24, u32> syncpoint_id;
+
+        /// Builds a command header whose raw value encodes the given operation and
+        /// syncpoint id.
+        [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
+            FenceAction result{};
+            result.op.Assign(op);
+            result.syncpoint_id.Assign(syncpoint_id);
+            return {result.raw};
+        }
+    };
+
+    /// Puller register file. The named fields sit at fixed 32-bit word indices, which are
+    /// checked by the ASSERT_REG_POSITION static_asserts after this class.
+    struct Regs {
+        // Note: reg_array only spans the first NUM_REGS words; the puller state fields
+        // (acquire_*, at word 0x100 and above) lie outside of it.
+        static constexpr size_t NUM_REGS = 0x40;
+
+        union {
+            struct {
+                INSERT_UNION_PADDING_WORDS(0x4);
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+
+                    /// Combines the high and low words into a 64-bit GPU virtual address.
+                    [[nodiscard]] GPUVAddr SemaphoreAddress() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } semaphore_address;
+
+                u32 semaphore_sequence;
+                u32 semaphore_trigger;
+                INSERT_UNION_PADDING_WORDS(0xC);
+
+                // The pusher and the puller share the reference counter, the pusher only has read
+                // access
+                u32 reference_count;
+                INSERT_UNION_PADDING_WORDS(0x5);
+
+                u32 semaphore_acquire;
+                u32 semaphore_release;
+                u32 fence_value;
+                FenceAction fence_action;
+                INSERT_UNION_PADDING_WORDS(0xE2);
+
+                // Puller state
+                u32 acquire_mode;
+                u32 acquire_source;
+                u32 acquire_active;
+                u32 acquire_timeout;
+                u32 acquire_value;
+            };
+            std::array<u32, NUM_REGS> reg_array;
+        };
+    } regs{};
+
+    /// Performs any additional setup necessary in order to begin GPU emulation.
+    /// This can be used to launch any necessary threads and register any necessary
+    /// core timing events.
+    void Start();
+
+    /// Obtain the CPU Context
+    void ObtainContext();
+
+    /// Release the CPU Context
+    void ReleaseContext();
+
+    /// Push GPU command entries to be processed
+    void PushGPUEntries(Tegra::CommandList&& entries);
+
+    /// Push GPU command buffer entries to be processed
+    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries);
+
+    /// Swap buffers (render frame)
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
+
+    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
+    void FlushRegion(VAddr addr, u64 size);
+
+    /// Notify rasterizer that any caches of the specified region should be invalidated
+    void InvalidateRegion(VAddr addr, u64 size);
+
+    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
+    void FlushAndInvalidateRegion(VAddr addr, u64 size);
+
+protected:
+    /// Raises a syncpoint interrupt on the system's interrupt manager.
+    void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const;
+
+private:
+    // Handlers for the individual puller methods, dispatched from CallPullerMethod().
+    void ProcessBindMethod(const MethodCall& method_call);
+    void ProcessFenceActionMethod();
+    void ProcessWaitForInterruptMethod();
+    void ProcessSemaphoreTriggerMethod();
+    void ProcessSemaphoreRelease();
+    void ProcessSemaphoreAcquire();
+
+    /// Calls a GPU puller method.
+    void CallPullerMethod(const MethodCall& method_call);
+
+    /// Calls a GPU engine method.
+    void CallEngineMethod(const MethodCall& method_call);
+
+    /// Calls a GPU engine multivalue method.
+    void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
+                               u32 methods_pending);
+
+    /// Determines where the method should be executed.
+    [[nodiscard]] bool ExecuteMethodOnEngine(u32 method);
+
+protected:
+    Core::System& system;
+    std::unique_ptr<Tegra::MemoryManager> memory_manager;
+    std::unique_ptr<Tegra::DmaPusher> dma_pusher;
+    /// May be reset to null by PushCommandBuffer() when a video stream ends.
+    std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
+    /// Null until BindRenderer() has been called.
+    std::unique_ptr<VideoCore::RendererBase> renderer;
+    const bool use_nvdec;
+
+private:
+    /// Mapping of command subchannels to their bound engine ids
+    std::array<EngineID, 8> bound_engines = {};
+    /// 3D engine
+    std::unique_ptr<Engines::Maxwell3D> maxwell_3d;
+    /// 2D engine
+    std::unique_ptr<Engines::Fermi2D> fermi_2d;
+    /// Compute engine
+    std::unique_ptr<Engines::KeplerCompute> kepler_compute;
+    /// DMA engine
+    std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
+    /// Inline memory engine
+    std::unique_ptr<Engines::KeplerMemory> kepler_memory;
+    /// Shader build notifier
+    std::unique_ptr<VideoCore::ShaderNotify> shader_notify;
+
+    /// Current value of every syncpoint.
+    std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
+
+    /// Per-syncpoint list of values at which a CPU interrupt should be raised.
+    std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
+
+    /// Taken by IncrementSyncPoint(), CancelSyncptInterrupt(), WaitFence() and LockSync().
+    std::mutex sync_mutex;
+    std::mutex device_mutex;
+
+    /// Notified by IncrementSyncPoint(); waited on by WaitFence().
+    std::condition_variable sync_cv;
+
+    /// A flush of [addr, addr + size) queued by RequestFlush(), tagged with its fence id.
+    struct FlushRequest {
+        explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_)
+            : fence{fence_}, addr{addr_}, size{size_} {}
+        u64 fence;
+        VAddr addr;
+        std::size_t size;
+    };
+
+    /// Pending flush requests, consumed in order by TickWork().
+    std::list<FlushRequest> flush_requests;
+    /// Fence id of the most recently completed flush request.
+    std::atomic<u64> current_flush_fence{};
+    /// Fence id handed out to the most recently queued flush request.
+    u64 last_flush_fence{};
+    /// Guards flush_requests and last_flush_fence.
+    std::mutex flush_request_mutex;
+
+    const bool is_async;
+
+    VideoCommon::GPUThread::ThreadManager gpu_thread;
+    /// Shared graphics context created in Start().
+    std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
+};
+
+// Compile-time layout checks: each GPU::Regs field must sit at the given 32-bit word index
+// (byte offset = position * 4). The helper macro is undefined again once the checks are done.
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(GPU::Regs, field_name) == position * 4,                                 \
+                  "Field " #field_name " has invalid position")
+
+ASSERT_REG_POSITION(semaphore_address, 0x4);
+ASSERT_REG_POSITION(semaphore_sequence, 0x6);
+ASSERT_REG_POSITION(semaphore_trigger, 0x7);
+ASSERT_REG_POSITION(reference_count, 0x14);
+ASSERT_REG_POSITION(semaphore_acquire, 0x1A);
+ASSERT_REG_POSITION(semaphore_release, 0x1B);
+ASSERT_REG_POSITION(fence_value, 0x1C);
+ASSERT_REG_POSITION(fence_action, 0x1D);
+
+// Puller state registers.
+ASSERT_REG_POSITION(acquire_mode, 0x100);
+ASSERT_REG_POSITION(acquire_source, 0x101);
+ASSERT_REG_POSITION(acquire_active, 0x102);
+ASSERT_REG_POSITION(acquire_timeout, 0x103);
+ASSERT_REG_POSITION(acquire_value, 0x104);
+
+#undef ASSERT_REG_POSITION
+
+} // namespace Tegra
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -0,0 +1,162 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/microprofile.h"
+#include "common/scope_exit.h"
+#include "common/thread.h"
+#include "core/core.h"
+#include "core/frontend/emu_window.h"
+#include "core/settings.h"
+#include "video_core/dma_pusher.h"
+#include "video_core/gpu.h"
+#include "video_core/gpu_thread.h"
+#include "video_core/renderer_base.h"
+
+namespace VideoCommon::GPUThread {
+
+/// Runs the GPU thread.
+///
+/// Names and registers the host thread, waits for the first queued command, acquires the graphics
+/// context and then pops and executes commands until the thread is told to stop. After every
+/// executed command its fence is published through state.signaled_fence.
+///
+/// @param system      Used to register the host thread and to reach the GPU for tick requests.
+/// @param renderer    Receives swap, flush, invalidate and fence-release requests.
+/// @param context     Graphics context acquired by this thread before commands are executed.
+/// @param dma_pusher  Consumes submitted GPU command lists.
+/// @param state       Command queue and fence state shared with ThreadManager.
+/// @param cdma_pusher Consumes submitted channel (CDMA) command lists.
+static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
+                      Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
+                      SynchState& state, Tegra::CDmaPusher& cdma_pusher) {
+    std::string name = "yuzu:GPU";
+    MicroProfileOnThreadCreate(name.c_str());
+    SCOPE_EXIT({ MicroProfileOnThreadExit(); });
+
+    Common::SetCurrentThreadName(name.c_str());
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
+    system.RegisterHostThread();
+
+    // Wait for first GPU command before acquiring the window context. Yield between polls so
+    // that this (potentially long) wait does not keep a host core fully busy.
+    while (state.queue.Empty()) {
+        std::this_thread::yield();
+    }
+
+    // If emulation was stopped during disk shader loading, abort before trying to acquire context
+    if (!state.is_running) {
+        return;
+    }
+
+    // Keep the context acquired for as long as commands are being processed
+    auto current_context = context.Acquire();
+
+    CommandDataContainer next;
+    while (state.is_running) {
+        next = state.queue.PopWait();
+        if (auto* submit_list = std::get_if<SubmitListCommand>(&next.data)) {
+            dma_pusher.Push(std::move(submit_list->entries));
+            dma_pusher.DispatchCalls();
+        } else if (auto* command_list = std::get_if<SubmitChCommandEntries>(&next.data)) {
+            // NVDEC
+            cdma_pusher.Push(std::move(command_list->entries));
+            cdma_pusher.DispatchCalls();
+        } else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) {
+            renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
+        } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {
+            renderer.Rasterizer().ReleaseFences();
+        } else if (std::holds_alternative<GPUTickCommand>(next.data)) {
+            system.GPU().TickWork();
+        } else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) {
+            renderer.Rasterizer().FlushRegion(flush->addr, flush->size);
+        } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
+            renderer.Rasterizer().OnCPUWrite(invalidate->addr, invalidate->size);
+        } else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
+            // Publish the fence before leaving: in synchronous mode the producer that pushed
+            // this command is blocked in WaitIdle() until the fence is signaled.
+            state.signaled_fence.store(next.fence);
+            return;
+        } else {
+            UNREACHABLE();
+        }
+        state.signaled_fence.store(next.fence);
+    }
+}
+
+/// Stores the system reference and the GPU mode. No thread is created here; see StartThread().
+ThreadManager::ThreadManager(Core::System& system_, bool is_async_)
+    : system{system_}, is_async{is_async_} {}
+
+/// Stops and joins the GPU thread, if one was started.
+ThreadManager::~ThreadManager() {
+    if (thread.joinable()) {
+        // Ask the GPU thread to leave its processing loop, then wait for it to finish
+        PushCommand(EndProcessingCommand());
+        thread.join();
+    }
+}
+
+/// Creates the GPU thread running RunThread.
+/// All arguments are handed to the thread by reference (std::ref), so the referenced objects
+/// have to stay alive for as long as the thread runs.
+void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
+                                Core::Frontend::GraphicsContext& context,
+                                Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) {
+    thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
+                         std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher));
+}
+
+/// Queues a GPU command list for the GPU thread; ownership of the entries is moved into the queue.
+void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
+    PushCommand(SubmitListCommand(std::move(entries)));
+}
+
+/// Queues a CDMA command buffer for the GPU thread; ownership of the entries is moved into the
+/// queue.
+void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) {
+    PushCommand(SubmitChCommandEntries(std::move(entries)));
+}
+
+/// Queues a buffer swap. The framebuffer configuration is copied into the command, so the
+/// pointed-to object does not have to outlive this call; a null pointer queues an empty optional.
+void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    std::optional<Tegra::FramebufferConfig> config_copy;
+    if (framebuffer != nullptr) {
+        config_copy.emplace(*framebuffer);
+    }
+    PushCommand(SwapBuffersCommand(std::move(config_copy)));
+}
+
+/// Requests a flush of the given region.
+/// Synchronous mode always queues a flush command. In asynchronous mode the action depends on
+/// the configured GPU accuracy: Normal queues the flush, High does nothing, and Extreme files a
+/// flush request with the GPU, queues a tick and busy-waits until the request fence is reached.
+void ThreadManager::FlushRegion(VAddr addr, u64 size) {
+    if (!is_async) {
+        // Always flush with synchronous GPU mode
+        PushCommand(FlushRegionCommand(addr, size));
+        return;
+    }
+
+    // Asynchronous GPU mode
+    const auto accuracy = Settings::values.gpu_accuracy.GetValue();
+    if (accuracy == Settings::GPUAccuracy::Normal) {
+        PushCommand(FlushRegionCommand(addr, size));
+        return;
+    }
+    if (accuracy == Settings::GPUAccuracy::High) {
+        // TODO(bunnei): Is this right? Preserving existing behavior for now
+        return;
+    }
+    if (accuracy == Settings::GPUAccuracy::Extreme) {
+        auto& gpu = system.GPU();
+        const u64 requested_fence = gpu.RequestFlush(addr, size);
+        PushCommand(GPUTickCommand());
+        // Spin until the GPU's current flush fence has caught up with the requested one
+        while (requested_fence > gpu.CurrentFlushRequestFence()) {
+        }
+        return;
+    }
+    UNIMPLEMENTED_MSG("Unsupported gpu_accuracy {}", accuracy);
+}
+
+/// Reports a CPU write of the given region to the rasterizer.
+/// The rasterizer is called directly on the calling thread; no command is queued.
+void ThreadManager::InvalidateRegion(VAddr addr, u64 size) {
+    system.Renderer().Rasterizer().OnCPUWrite(addr, size);
+}
+
+/// Reports a CPU write of the given region to the rasterizer, without flushing it first.
+/// The rasterizer is called directly on the calling thread; no command is queued.
+void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    // The flush is skipped, as FlushAndInvalidateRegion is not used for anything too important.
+    // NOTE(review): is_async is not checked, so the flush is skipped in synchronous mode as well.
+    system.Renderer().Rasterizer().OnCPUWrite(addr, size);
+}
+
+/// Busy-waits until the GPU thread has signaled the fence of the last pushed command, or until
+/// the system is no longer powered on.
+void ThreadManager::WaitIdle() const {
+    const auto has_unsignaled_commands = [this] {
+        return state.last_fence > state.signaled_fence.load(std::memory_order_relaxed);
+    };
+    while (has_unsignaled_commands() && system.IsPoweredOn()) {
+    }
+}
+
+/// Queues an OnCommandListEndCommand; the GPU thread reacts to it by releasing the rasterizer's
+/// fences.
+void ThreadManager::OnCommandListEnd() {
+    PushCommand(OnCommandListEndCommand());
+}
+
+/// Enqueues a command for the GPU thread.
+/// The command is tagged with the next fence value (the incremented state.last_fence) and that
+/// fence is returned. In synchronous mode the call does not return before WaitIdle() does.
+u64 ThreadManager::PushCommand(CommandData&& command_data) {
+    // The fence has to be taken before the push so that the queued container carries it
+    const u64 fence{++state.last_fence};
+    state.queue.Push(CommandDataContainer(std::move(command_data), fence));
+
+    if (!is_async) {
+        // In synchronous GPU mode, block the caller until the command has executed
+        WaitIdle();
+    }
+
+    return fence;
+}
+
+} // namespace VideoCommon::GPUThread
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -0,0 +1,161 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+#include <optional>
+#include <thread>
+#include <variant>
+
+#include "common/threadsafe_queue.h"
+#include "video_core/framebuffer_config.h"
+
+namespace Tegra {
+struct FramebufferConfig;
+class DmaPusher;
+} // namespace Tegra
+
+namespace Core {
+namespace Frontend {
+class GraphicsContext;
+}
+class System;
+} // namespace Core
+
+namespace VideoCore {
+class RendererBase;
+} // namespace VideoCore
+
+namespace VideoCommon::GPUThread {
+
+/// Command to signal to the GPU thread that processing has ended.
+/// Pushed by ~ThreadManager; the GPU thread function returns when it pops this command.
+struct EndProcessingCommand final {};
+
+/// Command to signal to the GPU thread that a command list is ready for processing.
+/// Owns the command list; the GPU thread moves the entries out when it executes the command.
+struct SubmitListCommand final {
+    explicit SubmitListCommand(Tegra::CommandList&& entries_) : entries{std::move(entries_)} {}
+
+    Tegra::CommandList entries;
+};
+
+/// Command to signal to the GPU thread that a cdma command list is ready for processing.
+/// Owns the command list; the GPU thread moves the entries out when it executes the command.
+struct SubmitChCommandEntries final {
+    explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries_)
+        : entries{std::move(entries_)} {}
+
+    Tegra::ChCommandHeaderList entries;
+};
+
+/// Command to signal to the GPU thread that a swap buffers is pending.
+/// Holds its own copy of the framebuffer configuration; an empty optional is forwarded to the
+/// renderer as a null framebuffer pointer.
+struct SwapBuffersCommand final {
+    // NOTE: the parameter holds a const value, so the std::move below copies the configuration
+    explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer_)
+        : framebuffer{std::move(framebuffer_)} {}
+
+    std::optional<Tegra::FramebufferConfig> framebuffer;
+};
+
+/// Command to signal to the GPU thread to flush a region.
+/// The GPU thread passes addr and size to the rasterizer's FlushRegion.
+struct FlushRegionCommand final {
+    explicit constexpr FlushRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {}
+
+    VAddr addr; ///< Start address of the region
+    u64 size;   ///< Size of the region
+};
+
+/// Command to signal to the GPU thread to invalidate a region.
+/// The GPU thread passes addr and size to the rasterizer's OnCPUWrite.
+/// NOTE(review): ThreadManager::InvalidateRegion calls the rasterizer directly and does not
+/// appear to push this command.
+struct InvalidateRegionCommand final {
+    explicit constexpr InvalidateRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {}
+
+    VAddr addr; ///< Start address of the region
+    u64 size;   ///< Size of the region
+};
+
+/// Command to signal to the GPU thread to flush and invalidate a region.
+/// NOTE(review): the GPU thread loop in gpu_thread.cpp has no branch for this alternative (it
+/// would reach UNREACHABLE) and ThreadManager does not appear to push it - confirm whether it
+/// is still needed.
+struct FlushAndInvalidateRegionCommand final {
+    explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr_, u64 size_)
+        : addr{addr_}, size{size_} {}
+
+    VAddr addr; ///< Start address of the region
+    u64 size;   ///< Size of the region
+};
+
+/// Command called within the gpu, to schedule actions after a command list end.
+/// The GPU thread handles it by calling the rasterizer's ReleaseFences.
+struct OnCommandListEndCommand final {};
+
+/// Command to make the gpu look into pending requests.
+/// The GPU thread handles it by calling GPU::TickWork; it is pushed by
+/// ThreadManager::FlushRegion after a flush request has been filed (Extreme accuracy).
+struct GPUTickCommand final {};
+
+/// Every command type that can be queued for the GPU thread.
+/// EndProcessingCommand is the first alternative, so a default-constructed CommandData holds it.
+using CommandData =
+    std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries,
+                 SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand,
+                 FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>;
+
+/// Queue element: a command together with the fence value assigned to it when it was pushed.
+struct CommandDataContainer {
+    CommandDataContainer() = default;
+
+    explicit CommandDataContainer(CommandData&& data_, u64 next_fence_)
+        : data{std::move(data_)}, fence{next_fence_} {}
+
+    CommandData data; ///< The queued command
+    u64 fence{};      ///< Stored into SynchState::signaled_fence once the command was executed
+};
+
+/// Struct used to synchronize the GPU thread
+struct SynchState final {
+    /// Checked by the GPU thread before and while it processes commands
+    std::atomic_bool is_running{true};
+
+    using CommandQueue = Common::MPSCQueue<CommandDataContainer>;
+    /// Commands waiting to be executed by the GPU thread
+    CommandQueue queue;
+    /// Fence assigned to the most recently pushed command. It is incremented by every push and
+    /// polled while waiting for idle without any lock, and the queue is a multi-producer one, so
+    /// it has to be atomic to avoid a data race between pushing threads.
+    std::atomic<u64> last_fence{};
+    /// Fence of the last command the GPU thread has finished executing
+    std::atomic<u64> signaled_fence{};
+};
+
+/// Class used to manage the GPU thread.
+/// Requests are turned into commands, tagged with an increasing fence value and queued for the
+/// GPU thread. When constructed with is_async_ == false, pushing a command blocks the caller
+/// until the GPU thread is idle again.
+class ThreadManager final {
+public:
+    explicit ThreadManager(Core::System& system_, bool is_async_);
+    /// Pushes an EndProcessingCommand and joins the GPU thread, if it was started.
+    ~ThreadManager();
+
+    /// Creates and starts the GPU thread.
+    void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context,
+                     Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher);
+
+    /// Push GPU command entries to be processed
+    void SubmitList(Tegra::CommandList&& entries);
+
+    /// Push GPU CDMA command buffer entries to be processed
+    void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries);
+
+    /// Swap buffers (render frame)
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
+
+    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
+    void FlushRegion(VAddr addr, u64 size);
+
+    /// Notify rasterizer that any caches of the specified region should be invalidated
+    void InvalidateRegion(VAddr addr, u64 size);
+
+    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
+    void FlushAndInvalidateRegion(VAddr addr, u64 size);
+
+    /// Wait until the gpu thread is idle.
+    void WaitIdle() const;
+
+    /// Notify the GPU thread that a command list has ended
+    void OnCommandListEnd();
+
+private:
+    /// Pushes a command to be executed by the GPU thread
+    u64 PushCommand(CommandData&& command_data);
+
+    SynchState state;          ///< Queue and fences shared with the GPU thread
+    Core::System& system;      ///< System this manager belongs to
+    std::thread thread;        ///< The GPU thread; not joinable until StartThread() was called
+    std::thread::id thread_id; ///< NOTE(review): not referenced in gpu_thread.cpp
+    const bool is_async;       ///< False when every pushed command has to block the caller
+};
+
+} // namespace VideoCommon::GPUThread
--- a/src/video_core/guest_driver.cpp
+++ b/src/video_core/guest_driver.cpp
@@ -0,0 +1,37 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/guest_driver.h"
+
+namespace VideoCore {
+
+/// Tries to deduce the texture handler size from a set of bound offsets.
+/// Does nothing when the size is already known or fewer than two offsets are given. Otherwise
+/// the smallest non-zero distance between sorted neighbouring offsets is computed; when it is
+/// at most 2, the size becomes min_texture_handler_size times that distance.
+void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32> bound_offsets) {
+    if (texture_handler_size.has_value() || bound_offsets.size() < 2) {
+        return;
+    }
+    std::sort(bound_offsets.begin(), bound_offsets.end());
+
+    u32 smallest_gap = std::numeric_limits<u32>::max();
+    for (std::size_t index = 0; index + 1 < bound_offsets.size(); ++index) {
+        const u32 gap = bound_offsets[index + 1] - bound_offsets[index];
+        // Duplicated offsets carry no information about the handler size
+        if (gap != 0) {
+            smallest_gap = std::min(smallest_gap, gap);
+        }
+    }
+
+    if (smallest_gap <= 2) {
+        texture_handler_size = min_texture_handler_size * smallest_gap;
+    }
+}
+
+} // namespace VideoCore
--- a/src/video_core/guest_driver.h
+++ b/src/video_core/guest_driver.h
@@ -0,0 +1,46 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <optional>
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace VideoCore {
+
+/**
+ * The GuestDriverProfile class is used to learn about the behavior of the guest GPU driver and
+ * to collect the information needed by HLE methods that cannot be avoided, such as shader
+ * tracking, because the problems they solve are undecidable in general.
+ */
+class GuestDriverProfile {
+public:
+    explicit GuestDriverProfile() = default;
+    explicit GuestDriverProfile(std::optional<u32> texture_handler_size_)
+        : texture_handler_size{texture_handler_size_} {}
+
+    /// Tries to deduce the texture handler size from the given offsets (taken by value, as the
+    /// implementation sorts them). Has no effect when the size is already known.
+    void DeduceTextureHandlerSize(std::vector<u32> bound_offsets);
+
+    /// Returns the known texture handler size, or default_texture_handler_size when unknown.
+    u32 GetTextureHandlerSize() const {
+        return texture_handler_size.value_or(default_texture_handler_size);
+    }
+
+    /// Returns true when a texture handler size is stored.
+    bool IsTextureHandlerSizeKnown() const {
+        return texture_handler_size.has_value();
+    }
+
+private:
+    // Minimum size of texture handler any driver can use.
+    static constexpr u32 min_texture_handler_size = 4;
+
+    // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily use 4 bytes instead.
+    // Thus, certain drivers may squish the size.
+    static constexpr u32 default_texture_handler_size = 8;
+
+    // Starts out holding the default, so a default-constructed profile already reports the size
+    // as known; it is only empty when constructed explicitly from an empty optional.
+    std::optional<u32> texture_handler_size = default_texture_handler_size;
+};
+
+} // namespace VideoCore
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -0,0 +1,71 @@
+# Host shader sources. The file name decides what is generated for a shader: the loop below
+# skips the source-string header for names matching "vulkan" and skips SPIR-V compilation for
+# names matching "opengl".
+set(SHADER_FILES
+    block_linear_unswizzle_2d.comp
+    block_linear_unswizzle_3d.comp
+    convert_depth_to_float.frag
+    convert_float_to_depth.frag
+    full_screen_triangle.vert
+    opengl_copy_bc4.comp
+    opengl_present.frag
+    opengl_present.vert
+    pitch_unswizzle.comp
+    vulkan_blit_color_float.frag
+    vulkan_blit_depth_stencil.frag
+    vulkan_present.frag
+    vulkan_present.vert
+    vulkan_quad_array.comp
+    vulkan_quad_indexed.comp
+    vulkan_uint8.comp
+)
+
+# GLSL to SPIR-V compiler; REQUIRED makes the configure step fail when it cannot be found
+find_program(GLSLANGVALIDATOR "glslangValidator" REQUIRED)
+
+# Extra flags passed to glslangValidator
+set(GLSL_FLAGS "")
+
+# Generated headers are written to <binary dir>/include/video_core/host_shaders; the include
+# root is exported to the parent scope
+set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
+set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders)
+set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE)
+
+# Template and script used to generate the source-string headers
+set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in)
+set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake)
+
+# For every shader, register the commands that generate its headers and collect the outputs in
+# SHADER_HEADERS. API-exclusive shaders are recognized by their file name prefix; the patterns
+# are anchored so that a name merely containing "vulkan" or "opengl" is not skipped by accident.
+foreach(FILENAME IN ITEMS ${SHADER_FILES})
+    # e.g. opengl_present.frag -> opengl_present_frag
+    string(REPLACE "." "_" SHADER_NAME ${FILENAME})
+    set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME})
+    # Skip generating source headers on Vulkan exclusive files
+    if (NOT ${FILENAME} MATCHES "^vulkan")
+        set(SOURCE_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h)
+        add_custom_command(
+            OUTPUT
+                ${SOURCE_HEADER_FILE}
+            COMMAND
+                ${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${SOURCE_HEADER_FILE} ${INPUT_FILE}
+            MAIN_DEPENDENCY
+                ${SOURCE_FILE}
+            DEPENDS
+                ${INPUT_FILE}
+                # HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified
+        )
+        list(APPEND SHADER_HEADERS ${SOURCE_HEADER_FILE})
+    endif()
+    # Skip compiling to SPIR-V OpenGL exclusive files
+    if (NOT ${FILENAME} MATCHES "^opengl")
+        # Name of the array emitted into the generated header, e.g. VULKAN_PRESENT_FRAG_SPV
+        string(TOUPPER ${SHADER_NAME}_SPV SPIRV_VARIABLE_NAME)
+        set(SPIRV_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}_spv.h)
+        add_custom_command(
+            OUTPUT
+                ${SPIRV_HEADER_FILE}
+            COMMAND
+                ${GLSLANGVALIDATOR} -V ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE}
+            MAIN_DEPENDENCY
+                ${SOURCE_FILE}
+        )
+        list(APPEND SHADER_HEADERS ${SPIRV_HEADER_FILE})
+    endif()
+endforeach()
+
+# Target that depends on all generated shader headers; the shader sources are attached as
+# SOURCES of the target.
+add_custom_target(host_shaders
+    DEPENDS
+        ${SHADER_HEADERS}
+    SOURCES
+        ${SHADER_FILES}
+)
--- a/src/video_core/host_shaders/StringShaderHeader.cmake
+++ b/src/video_core/host_shaders/StringShaderHeader.cmake
@@ -0,0 +1,13 @@
+# Script mode usage:
+#   cmake -P StringShaderHeader.cmake <shader source> <output header> <header template>
+# CMAKE_ARGV0..2 are the cmake executable, "-P" and this script, so the arguments start at 3.
+set(SOURCE_FILE ${CMAKE_ARGV3})
+set(HEADER_FILE ${CMAKE_ARGV4})
+set(INPUT_FILE ${CMAKE_ARGV5})
+
+# Derive an upper-case identifier from the file name, e.g. opengl_present.frag becomes
+# OPENGL_PRESENT_FRAG
+get_filename_component(CONTENTS_NAME ${SOURCE_FILE} NAME)
+string(REPLACE "." "_" CONTENTS_NAME ${CONTENTS_NAME})
+string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME)
+
+# CONTENTS and CONTENTS_NAME are set for the template (presumably referenced there as
+# @CONTENTS@ and @CONTENTS_NAME@)
+file(READ ${SOURCE_FILE} CONTENTS)
+
+# Make sure the output directory exists; file(MAKE_DIRECTORY) replaces the deprecated
+# make_directory() command
+get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY)
+file(MAKE_DIRECTORY ${OUTPUT_DIR})
+configure_file(${INPUT_FILE} ${HEADER_FILE} @ONLY)
--- a/src/video_core/host_shaders/block_linear_unswizzle_2d.comp
+++ b/src/video_core/host_shaders/block_linear_unswizzle_2d.comp
@@ -0,0 +1,122 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430
+
+#ifdef VULKAN
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_8bit_storage : require
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_OUTPUT_IMAGE 2
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#extension GL_NV_gpu_shader5 : enable
+#ifdef GL_NV_gpu_shader5
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout (location = n) uniform
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uvec3 origin;
+UNIFORM(1) ivec3 destination;
+UNIFORM(2) uint bytes_per_block_log2;
+UNIFORM(3) uint layer_stride;
+UNIFORM(4) uint block_size;
+UNIFORM(5) uint x_shift;
+UNIFORM(6) uint block_height;
+UNIFORM(7) uint block_height_mask;
+END_PUSH_CONSTANTS
+
+layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
+    uint swizzle_table[];
+};
+
+#if HAS_EXTENDED_TYPES
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
+#endif
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
+
+layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage2DArray output_image;
+
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+
+const uint GOB_SIZE_X = 64;
+const uint GOB_SIZE_Y = 8;
+const uint GOB_SIZE_Z = 1;
+const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
+
+const uint GOB_SIZE_X_SHIFT = 6;
+const uint GOB_SIZE_Y_SHIFT = 3;
+const uint GOB_SIZE_Z_SHIFT = 0;
+const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
+
+const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
+
+// Looks up the swizzle table entry for a position; only the bits selected by SWIZZLE_MASK are
+// used, and the table is indexed as rows of 64 entries
+uint SwizzleOffset(uvec2 pos) {
+    const uvec2 masked_pos = pos & SWIZZLE_MASK;
+    const uint table_index = masked_pos.y * 64 + masked_pos.x;
+    return swizzle_table[table_index];
+}
+
+// Reads one element from the input buffer and returns it in a uvec4, unused components zeroed.
+// `offset` is divided by the element size to index the typed views, i.e. it is a byte offset.
+// bytes_per_block_log2 selects the element size (0..4 => 1, 2, 4, 8 or 16 bytes); any other
+// value returns uvec4(0).
+uvec4 ReadTexel(uint offset) {
+    switch (bytes_per_block_log2) {
+#if HAS_EXTENDED_TYPES
+    // 8-bit and 16-bit views of the buffer are available and can be indexed directly
+    case 0:
+        return uvec4(u8data[offset], 0, 0, 0);
+    case 1:
+        return uvec4(u16data[offset / 2], 0, 0, 0);
+#else
+    // No 8/16-bit views: extract the element from the 32-bit word that contains it. The bit
+    // position inside the word is (offset * 8) masked to a multiple of the element width.
+    case 0:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
+    case 1:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
+#endif
+    case 2:
+        return uvec4(u32data[offset / 4], 0, 0, 0);
+    case 3:
+        return uvec4(u64data[offset / 8], 0, 0);
+    case 4:
+        return u128data[offset / 16];
+    }
+    return uvec4(0);
+}
+
+// Each invocation reads one element from the block linear input buffer and stores it into the
+// output image array at the invocation's position plus `destination`.
+void main() {
+    // Source position; x is scaled by the element size (1 << bytes_per_block_log2)
+    uvec3 pos = gl_GlobalInvocationID + origin;
+    pos.x <<= bytes_per_block_log2;
+
+    // Read as soon as possible due to its latency
+    const uint swizzle = SwizzleOffset(pos.xy);
+
+    // Index of the GOB row (GOB_SIZE_Y lines each) that contains pos.y
+    const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
+
+    // Accumulate the offset of the element inside the input buffer
+    uint offset = 0;
+    // Layer selected by z
+    offset += pos.z * layer_stride;
+    // Row of blocks, each block being (1 << block_height) GOBs tall
+    offset += (block_y >> block_height) * block_size;
+    // GOB inside the block
+    offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
+    // Horizontal step, one per GOB_SIZE_X units of x
+    offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
+    // Position inside the GOB, taken from the swizzle table
+    offset += swizzle;
+
+    const uvec4 texel = ReadTexel(offset);
+    const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination;
+    imageStore(output_image, coord, texel);
+}
--- a/src/video_core/host_shaders/block_linear_unswizzle_3d.comp
+++ b/src/video_core/host_shaders/block_linear_unswizzle_3d.comp
@@ -0,0 +1,125 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430
+
+#ifdef VULKAN
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_8bit_storage : require
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_OUTPUT_IMAGE 2
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#extension GL_NV_gpu_shader5 : enable
+#ifdef GL_NV_gpu_shader5
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout (location = n) uniform
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uvec3 origin;
+UNIFORM(1) ivec3 destination;
+UNIFORM(2) uint bytes_per_block_log2;
+UNIFORM(3) uint slice_size;
+UNIFORM(4) uint block_size;
+UNIFORM(5) uint x_shift;
+UNIFORM(6) uint block_height;
+UNIFORM(7) uint block_height_mask;
+UNIFORM(8) uint block_depth;
+UNIFORM(9) uint block_depth_mask;
+END_PUSH_CONSTANTS
+
+layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
+    uint swizzle_table[];
+};
+
+#if HAS_EXTENDED_TYPES
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
+#endif
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
+
+layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage3D output_image;
+
+layout(local_size_x = 16, local_size_y = 8, local_size_z = 8) in;
+
+const uint GOB_SIZE_X = 64;
+const uint GOB_SIZE_Y = 8;
+const uint GOB_SIZE_Z = 1;
+const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
+
+const uint GOB_SIZE_X_SHIFT = 6;
+const uint GOB_SIZE_Y_SHIFT = 3;
+const uint GOB_SIZE_Z_SHIFT = 0;
+const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
+
+const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
+
+// Looks up the swizzle table entry for a position; only the bits selected by SWIZZLE_MASK are
+// used, and the table is indexed as rows of 64 entries
+uint SwizzleOffset(uvec2 pos) {
+    const uvec2 masked_pos = pos & SWIZZLE_MASK;
+    const uint table_index = masked_pos.y * 64 + masked_pos.x;
+    return swizzle_table[table_index];
+}
+
+// Reads one element from the input buffer and returns it in a uvec4, unused components zeroed.
+// `offset` is divided by the element size to index the typed views, i.e. it is a byte offset.
+// bytes_per_block_log2 selects the element size (0..4 => 1, 2, 4, 8 or 16 bytes); any other
+// value returns uvec4(0).
+uvec4 ReadTexel(uint offset) {
+    switch (bytes_per_block_log2) {
+#if HAS_EXTENDED_TYPES
+    // 8-bit and 16-bit views of the buffer are available and can be indexed directly
+    case 0:
+        return uvec4(u8data[offset], 0, 0, 0);
+    case 1:
+        return uvec4(u16data[offset / 2], 0, 0, 0);
+#else
+    // No 8/16-bit views: extract the element from the 32-bit word that contains it. The bit
+    // position inside the word is (offset * 8) masked to a multiple of the element width.
+    case 0:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
+    case 1:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
+#endif
+    case 2:
+        return uvec4(u32data[offset / 4], 0, 0, 0);
+    case 3:
+        return uvec4(u64data[offset / 8], 0, 0);
+    case 4:
+        return u128data[offset / 16];
+    }
+    return uvec4(0);
+}
+
+// Each invocation reads one element from the block linear input buffer and stores it into the
+// 3D output image at the invocation's position plus `destination`.
+void main() {
+    // Source position; x is scaled by the element size (1 << bytes_per_block_log2)
+    uvec3 pos = gl_GlobalInvocationID + origin;
+    pos.x <<= bytes_per_block_log2;
+
+    // Read as soon as possible due to its latency
+    const uint swizzle = SwizzleOffset(pos.xy);
+
+    // Index of the GOB row (GOB_SIZE_Y lines each) that contains pos.y
+    const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
+
+    // Accumulate the offset of the element inside the input buffer
+    uint offset = 0;
+    // Group of (1 << block_depth) slices
+    offset += (pos.z >> block_depth) * slice_size;
+    // Slice inside that group
+    offset += (pos.z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height);
+    // Row of blocks, each block being (1 << block_height) GOBs tall
+    offset += (block_y >> block_height) * block_size;
+    // GOB inside the block
+    offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
+    // Horizontal step, one per GOB_SIZE_X units of x
+    offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
+    // Position inside the GOB, taken from the swizzle table
+    offset += swizzle;
+
+    const uvec4 texel = ReadTexel(offset);
+    const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination;
+    imageStore(output_image, coord, texel);
+}
--- a/src/video_core/host_shaders/convert_depth_to_float.frag
+++ b/src/video_core/host_shaders/convert_depth_to_float.frag
@@ -0,0 +1,13 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+
+layout(binding = 0) uniform sampler2D depth_texture;
+layout(location = 0) out float output_color;
+
+// Writes the red component of the depth texture's texel under this fragment as output color
+void main() {
+    output_color = texelFetch(depth_texture, ivec2(gl_FragCoord.xy), 0).r;
+}
--- a/src/video_core/host_shaders/convert_float_to_depth.frag
+++ b/src/video_core/host_shaders/convert_float_to_depth.frag
@@ -0,0 +1,13 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+
+layout(binding = 0) uniform sampler2D color_texture;
+
+// Writes the red component of the color texture's texel under this fragment as fragment depth
+void main() {
+    const ivec2 texel_pos = ivec2(gl_FragCoord.xy);
+    gl_FragDepth = texelFetch(color_texture, texel_pos, 0).r;
+}
--- a/src/video_core/host_shaders/full_screen_triangle.vert
+++ b/src/video_core/host_shaders/full_screen_triangle.vert
@@ -0,0 +1,29 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+
+#ifdef VULKAN
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout (location = n) uniform
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) vec2 tex_scale;
+UNIFORM(1) vec2 tex_offset;
+END_PUSH_CONSTANTS
+
+layout(location = 0) out vec2 texcoord;
+
+// The vertex index built-in has a different name in Vulkan GLSL (gl_VertexIndex) than in
+// OpenGL GLSL (gl_VertexID); select the one that exists for the API being compiled for.
+#ifdef VULKAN
+#define VERTEX_ID gl_VertexIndex
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+#define VERTEX_ID gl_VertexID
+#endif
+
+// Emits a single triangle that covers the whole screen. Vertices 0, 1 and 2 are placed at
+// (-1, -1), (3, -1) and (-1, 3); the matching texture coordinates 0 and 2 are scaled by
+// tex_scale and offset by tex_offset.
+void main() {
+    // x and y are either 0 or 4
+    float x = float((VERTEX_ID & 1) << 2);
+    float y = float((VERTEX_ID & 2) << 1);
+    gl_Position = vec4(x - 1.0, y - 1.0, 0.0, 1.0);
+    texcoord = fma(vec2(x, y) / 2.0, tex_scale, tex_offset);
+}
--- a/src/video_core/host_shaders/opengl_copy_bc4.comp
+++ b/src/video_core/host_shaders/opengl_copy_bc4.comp
@@ -0,0 +1,70 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430 core
+#extension GL_ARB_gpu_shader_int64 : require
+
+layout (local_size_x = 4, local_size_y = 4) in;
+
+layout(binding = 0, rg32ui) readonly uniform uimage3D bc4_input;
+layout(binding = 1, rgba8ui) writeonly uniform uimage3D bc4_output;
+
+layout(location = 0) uniform uvec3 src_offset;
+layout(location = 1) uniform uvec3 dst_offset;
+
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt
+// Decodes the red value of the texel at `coord` from a 64-bit block: two 8-bit endpoints in
+// the low 16 bits, followed by one 3-bit selector per texel, four texels per row.
+uint DecompressBlock(uint64_t bits, uvec2 coord) {
+    const uint red0 = uint(bits) & 0xff;
+    const uint red1 = uint(bits >> 8) & 0xff;
+    const uint texel_index = 4 * coord.y + coord.x;
+    const uint code = uint(bits >> (16 + 3 * texel_index)) & 7;
+
+    // Selectors 0 and 1 pick the endpoints themselves in both modes
+    if (code == 0) {
+        return red0;
+    }
+    if (code == 1) {
+        return red1;
+    }
+    if (red0 > red1) {
+        // Six interpolated values: the weights go from (6, 1) for code 2 to (1, 6) for code 7
+        return ((8 - code) * red0 + (code - 1) * red1) / 7;
+    }
+    // Otherwise codes 6 and 7 are the constants 0 and 255...
+    if (code == 6) {
+        return 0;
+    }
+    if (code == 7) {
+        return 0xff;
+    }
+    // ...and codes 2 to 5 are interpolated with weights from (4, 1) to (1, 4)
+    return ((6 - code) * red0 + (code - 1) * red1) / 5;
+}
+
+// Decodes one texel per invocation. The work group size is 4x4, so the work group ID selects
+// the compressed block to load and the local invocation ID selects the texel inside it.
+void main() {
+    // The block is stored in the two 32-bit components of one input texel
+    uvec2 packed_bits = imageLoad(bc4_input, ivec3(gl_WorkGroupID + src_offset)).rg;
+    uint64_t bits = packUint2x32(packed_bits);
+    uint red = DecompressBlock(bits, gl_LocalInvocationID.xy);
+    // Only the red channel carries data; alpha is written as 0xff
+    uvec4 color = uvec4(red & 0xff, 0, 0, 0xff);
+    imageStore(bc4_output, ivec3(gl_GlobalInvocationID + dst_offset), color);
+}
--- a/src/video_core/host_shaders/opengl_present.frag
+++ b/src/video_core/host_shaders/opengl_present.frag
@@ -0,0 +1,14 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430 core
+
+layout (location = 0) in vec2 frag_tex_coord;
+layout (location = 0) out vec4 color;
+
+layout (binding = 0) uniform sampler2D color_texture;
+
+// Samples the color texture and outputs its RGB with alpha forced to 1
+void main() {
+    const vec3 sampled_rgb = texture(color_texture, frag_tex_coord).rgb;
+    color = vec4(sampled_rgb, 1.0f);
+}
--- a/src/video_core/host_shaders/opengl_present.vert
+++ b/src/video_core/host_shaders/opengl_present.vert
@@ -0,0 +1,28 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430 core
+
+out gl_PerVertex {
+    vec4 gl_Position;
+};
+
+layout (location = 0) in vec2 vert_position;
+layout (location = 1) in vec2 vert_tex_coord;
+layout (location = 0) out vec2 frag_tex_coord;
+
+// This is a truncated 3x3 matrix for 2D transformations:
+// The upper-left 2x2 submatrix performs scaling/rotation/mirroring.
+// The third column performs translation.
+// The third row could be used for projection, which we don't need in 2D. It hence is assumed to
+// implicitly be [0, 0, 1]
+layout (location = 0) uniform mat3x2 modelview_matrix;
+
+void main() {
+    // Apply the rotscale part of the matrix (its upper-left 2x2) to the input position, then
+    // translate by the last column. This is equivalent to using a full 3x3 matrix and expanding
+    // the vector to `vec3(vert_position.xy, 1.0)`
+    const mat2 rotscale = mat2(modelview_matrix);
+    const vec2 translation = modelview_matrix[2];
+    gl_Position = vec4(rotscale * vert_position + translation, 0.0, 1.0);
+
+    // The texture coordinate is passed through unchanged
+    frag_tex_coord = vert_tex_coord;
+}
--- a/src/video_core/host_shaders/pitch_unswizzle.comp
+++ b/src/video_core/host_shaders/pitch_unswizzle.comp
@@ -0,0 +1,86 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430
+
+#ifdef VULKAN
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_8bit_storage : require
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 1
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#extension GL_NV_gpu_shader5 : enable
+#ifdef GL_NV_gpu_shader5
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout (location = n) uniform
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uvec2 origin;
+UNIFORM(1) ivec2 destination;
+UNIFORM(2) uint bytes_per_block;
+UNIFORM(3) uint pitch;
+END_PUSH_CONSTANTS
+
+#if HAS_EXTENDED_TYPES
+layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU8 { uint8_t u8data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU16 { uint16_t u16data[]; };
+#endif
+layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { uint u32data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU64 { uvec2 u64data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU128 { uvec4 u128data[]; };
+
+layout(binding = BINDING_OUTPUT_IMAGE) writeonly uniform uimage2D output_image;
+
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+
+// Loads one block of `bytes_per_block` bytes located `offset` bytes into the input buffer.
+// Blocks narrower than 128 bits are zero-extended into the low components of the result.
+// Any block size other than 1, 2, 4, 8 or 16 yields an all-zero vector.
+uvec4 ReadTexel(uint offset) {
+    switch (bytes_per_block) {
+#if HAS_EXTENDED_TYPES
+    // 8-bit and 16-bit views of the buffer are available: index them directly
+    case 1:
+        return uvec4(u8data[offset], 0, 0, 0);
+    case 2:
+        return uvec4(u16data[offset / 2], 0, 0, 0);
+#else
+    // No narrow views: fetch the containing 32-bit word and extract the byte/halfword from it.
+    // `(offset * 8) & 24` is the bit position of the byte, `(offset * 8) & 16` of the halfword.
+    case 1:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
+    case 2:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
+#endif
+    case 4:
+        return uvec4(u32data[offset / 4], 0, 0, 0);
+    case 8:
+        return uvec4(u64data[offset / 8], 0, 0);
+    case 16:
+        return u128data[offset / 16];
+    }
+    return uvec4(0);
+}
+
+void main() {
+    const uvec2 src_texel = origin + gl_GlobalInvocationID.xy;
+
+    // Byte address of the block in the input buffer: rows are `pitch` bytes apart
+    const uint byte_offset = src_texel.y * pitch + src_texel.x * bytes_per_block;
+
+    const ivec2 dst_coord = destination + ivec2(gl_GlobalInvocationID.xy);
+    imageStore(output_image, dst_coord, ReadTexel(byte_offset));
+}
--- a/src/video_core/host_shaders/source_shader.h.in
+++ b/src/video_core/host_shaders/source_shader.h.in
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <string_view>
+
+namespace HostShaders {
+
+// Shader source text; the placeholders on the next line are filled in when this template is
+// configured. A delimited raw string literal is used so that a `)"` sequence inside the shader
+// text (for example in a comment) cannot terminate the literal early.
+constexpr std::string_view @CONTENTS_NAME@ = R"shader(@CONTENTS@)shader";
+
+} // namespace HostShaders
--- a/src/video_core/host_shaders/vulkan_blit_color_float.frag
+++ b/src/video_core/host_shaders/vulkan_blit_color_float.frag
@@ -0,0 +1,14 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+
+layout(binding = 0) uniform sampler2D tex;
+
+layout(location = 0) in vec2 texcoord;
+layout(location = 0) out vec4 color;
+
+void main() {
+    // Always sample mip level 0 of the source
+    const vec4 texel = textureLod(tex, texcoord, 0);
+    color = texel;
+}
--- a/src/video_core/host_shaders/vulkan_blit_depth_stencil.frag
+++ b/src/video_core/host_shaders/vulkan_blit_depth_stencil.frag
@@ -0,0 +1,16 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+#extension GL_ARB_shader_stencil_export : require
+
+layout(binding = 0) uniform sampler2D depth_tex;
+layout(binding = 1) uniform isampler2D stencil_tex;
+
+layout(location = 0) in vec2 texcoord;
+
+void main() {
+    // Read both aspects from mip level 0 and export them as this fragment's depth and stencil
+    const float depth = textureLod(depth_tex, texcoord, 0).r;
+    const int stencil = textureLod(stencil_tex, texcoord, 0).r;
+    gl_FragDepth = depth;
+    gl_FragStencilRefARB = stencil;
+}
--- a/src/video_core/host_shaders/vulkan_present.frag
+++ b/src/video_core/host_shaders/vulkan_present.frag
@@ -0,0 +1,15 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 460 core
+
+layout (location = 0) in vec2 frag_tex_coord;
+
+layout (location = 0) out vec4 color;
+
+layout (binding = 1) uniform sampler2D color_texture;
+
+void main() {
+    // Pass the sampled texel through unchanged, alpha included
+    const vec4 texel = texture(color_texture, frag_tex_coord);
+    color = texel;
+}
--- a/src/video_core/host_shaders/vulkan_present.vert
+++ b/src/video_core/host_shaders/vulkan_present.vert
@@ -0,0 +1,19 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 460 core
+
+layout (location = 0) in vec2 vert_position;
+layout (location = 1) in vec2 vert_tex_coord;
+
+layout (location = 0) out vec2 frag_tex_coord;
+
+layout (set = 0, binding = 0) uniform MatrixBlock {
+    mat4 modelview_matrix;
+};
+
+void main() {
+    // Promote the 2D position to homogeneous coordinates (z = 0, w = 1) before transforming it
+    const vec4 position = vec4(vert_position, 0.0, 1.0);
+    gl_Position = modelview_matrix * position;
+    frag_tex_coord = vert_tex_coord;
+}
--- a/src/video_core/host_shaders/vulkan_quad_array.comp
+++ b/src/video_core/host_shaders/vulkan_quad_array.comp
@@ -0,0 +1,28 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 460 core
+
+layout (local_size_x = 1024) in;
+
+layout (std430, set = 0, binding = 0) buffer OutputBuffer {
+    uint output_indexes[];
+};
+
+layout (push_constant) uniform PushConstants {
+    uint first;
+};
+
+void main() {
+    const uint quad = gl_GlobalInvocationID.x;
+    const uint first_output = quad * 6;
+    if (first_output >= output_indexes.length()) {
+        return;
+    }
+
+    // Each quad (v0, v1, v2, v3) is written as the two triangles (v0, v1, v2) and (v0, v2, v3)
+    const uint corner_order[6] = uint[](0, 1, 2, 0, 2, 3);
+    const uint first_vertex = first + quad * 4;
+    for (uint i = 0; i < 6; ++i) {
+        output_indexes[first_output + i] = first_vertex + corner_order[i];
+    }
+}
--- a/src/video_core/host_shaders/vulkan_quad_indexed.comp
+++ b/src/video_core/host_shaders/vulkan_quad_indexed.comp
@@ -0,0 +1,41 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 460 core
+
+layout (local_size_x = 1024) in;
+
+layout (std430, set = 0, binding = 0) readonly buffer InputBuffer {
+    uint input_indexes[];
+};
+
+layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer {
+    uint output_indexes[];
+};
+
+layout (push_constant) uniform PushConstants {
+    uint base_vertex;
+    int index_shift; // 0: uint8, 1: uint16, 2: uint32
+};
+
+void main() {
+    const int quad = int(gl_GlobalInvocationID.x);
+    if (quad * 6 >= output_indexes.length()) {
+        return;
+    }
+
+    // Input indices are packed into 32-bit words: 4, 2 or 1 per word depending on index_shift
+    const int bits_per_index = 8 << index_shift;
+    const int word_shift = 2 - index_shift;
+    const int lane_mask = (1 << word_shift) - 1;
+
+    // Each quad (v0, v1, v2, v3) is written as the two triangles (v0, v1, v2) and (v0, v2, v3)
+    const int corner_order[6] = int[](0, 1, 2, 0, 2, 3);
+    for (uint i = 0; i < 6; ++i) {
+        const int element = quad * 4 + corner_order[i];
+        const int first_bit = (element & lane_mask) * bits_per_index;
+        const uint word = input_indexes[element >> word_shift];
+        const uint index = bitfieldExtract(word, first_bit, bits_per_index);
+        output_indexes[quad * 6 + i] = index + base_vertex;
+    }
+}
--- a/src/video_core/host_shaders/vulkan_uint8.comp
+++ b/src/video_core/host_shaders/vulkan_uint8.comp
@@ -0,0 +1,24 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 460 core
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_8bit_storage : require
+
+layout (local_size_x = 1024) in;
+
+layout (std430, set = 0, binding = 0) readonly buffer InputBuffer {
+    uint8_t input_indexes[];
+};
+
+layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer {
+    uint16_t output_indexes[];
+};
+
+void main() {
+    const uint id = gl_GlobalInvocationID.x;
+    // Invocations past the end of the input have nothing to convert
+    if (id >= input_indexes.length()) {
+        return;
+    }
+    // Widen the 8-bit index to 16 bits
+    output_indexes[id] = uint16_t(input_indexes[id]);
+}
--- a/src/video_core/macro/macro.cpp
+++ b/src/video_core/macro/macro.cpp
@@ -0,0 +1,91 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <optional>
+#include <boost/container_hash/hash.hpp>
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/settings.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro.h"
+#include "video_core/macro/macro_hle.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+namespace Tegra {
+
+/// Creates the engine together with the HLEMacro helper used to look up HLE replacements.
+MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
+    : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {}
+
+// Defined out of line: the header only forward-declares HLEMacro, which hle_macros owns.
+MacroEngine::~MacroEngine() = default;
+
+/// Appends one word of uploaded macro code to the code stored for `method`.
+void MacroEngine::AddCode(u32 method, u32 data) {
+    // operator[] creates an empty code vector the first time a method is seen
+    std::vector<u32>& macro_words = uploaded_macro_code[method];
+    macro_words.push_back(data);
+}
+
+/**
+ * Executes the macro bound to `method`, compiling and caching it on first use.
+ *
+ * If no code was uploaded for `method` itself, the uploaded macros are searched for one whose
+ * range contains `method`; the tail of that macro, starting at `method`, is then registered as
+ * the code of `method`. When the hash of the code matches a known HLE replacement, the
+ * replacement is executed instead of the compiled program.
+ *
+ * @param maxwell3d  Unused here; the compiled programs hold their own engine reference
+ * @param method     The macro method being called
+ * @param parameters The parameters passed to the macro
+ */
+void MacroEngine::Execute([[maybe_unused]] Engines::Maxwell3D& maxwell3d, u32 method,
+                          const std::vector<u32>& parameters) {
+    // Runs the HLE replacement when one was matched, the compiled (LLE) program otherwise.
+    const auto run = [&parameters, method](const CacheInfo& info) {
+        if (info.has_hle_program) {
+            info.hle_program->Execute(parameters, method);
+        } else {
+            info.lle_program->Execute(parameters, method);
+        }
+    };
+
+    if (const auto cached = macro_cache.find(method); cached != macro_cache.end()) {
+        run(cached->second);
+        return;
+    }
+
+    // Macro not compiled, check if it's uploaded and if so, compile it
+    const std::vector<u32>* macro_code = nullptr;
+    if (const auto uploaded = uploaded_macro_code.find(method);
+        uploaded != uploaded_macro_code.end()) {
+        macro_code = &uploaded->second;
+    } else {
+        // The method may point into the middle of a macro uploaded at a lower method.
+        std::optional<u32> mid_method;
+        for (const auto& [method_base, code] : uploaded_macro_code) {
+            if (method >= method_base && (method - method_base) < code.size()) {
+                mid_method = method_base;
+                break;
+            }
+        }
+        if (!mid_method.has_value()) {
+            UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
+            return;
+        }
+        // Register the tail of the enclosing macro as this method's own code. References into
+        // an unordered_map stay valid across the insertion performed by operator[].
+        const std::vector<u32>& enclosing = uploaded_macro_code[mid_method.value()];
+        const auto rebased_method =
+            static_cast<std::vector<u32>::difference_type>(method - mid_method.value());
+        std::vector<u32>& tail = uploaded_macro_code[method];
+        tail.assign(enclosing.begin() + rebased_method, enclosing.end());
+        macro_code = &tail;
+    }
+
+    CacheInfo& cache_info = macro_cache[method];
+    cache_info.lle_program = Compile(*macro_code);
+    cache_info.hash = boost::hash_value(*macro_code);
+
+    auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
+    if (hle_program.has_value()) {
+        cache_info.has_hle_program = true;
+        cache_info.hle_program = std::move(hle_program.value());
+    }
+    run(cache_info);
+}
+
+/// Creates the macro engine: the x64 JIT when built for x86-64 and not disabled in the
+/// settings, the interpreter in every other case.
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
+#ifdef ARCHITECTURE_x86_64
+    if (!Settings::values.disable_macro_jit) {
+        return std::make_unique<MacroJITx64>(maxwell3d);
+    }
+#endif
+    return std::make_unique<MacroInterpreter>(maxwell3d);
+}
+
+} // namespace Tegra
--- a/src/video_core/macro/macro.h
+++ b/src/video_core/macro/macro.h
@@ -0,0 +1,142 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+namespace Macro {
+constexpr std::size_t NUM_MACRO_REGISTERS = 8;
+/// Primary operation of a macro instruction (decoded from Opcode::operation).
+enum class Operation : u32 {
+    // src_a <alu_operation> src_b
+    ALU = 0,
+    // src_a + immediate
+    AddImmediate = 1,
+    // Copies a bit field of src_b into a bit field of src_a
+    ExtractInsert = 2,
+    // Extracts a bit field of src_b at a position given by src_a, shifted left by an immediate
+    ExtractShiftLeftImmediate = 3,
+    // Extracts a bit field of src_b at an immediate position, shifted left by src_a
+    ExtractShiftLeftRegister = 4,
+    // Reads the engine register at src_a + immediate
+    Read = 5,
+    Unused = 6, // This operation doesn't seem to be a valid encoding.
+    // Conditional branch on the value of src_a
+    Branch = 7,
+};
+
+/// Sub-operation of Operation::ALU (decoded from Opcode::alu_operation). The arithmetic
+/// operations update the carry flag; the logical ones do not.
+enum class ALUOperation : u32 {
+    // src_a + src_b
+    Add = 0,
+    // src_a + src_b + carry
+    AddWithCarry = 1,
+    // src_a - src_b
+    Subtract = 2,
+    // src_a - src_b, minus one more when the carry flag is clear
+    SubtractWithBorrow = 3,
+    // Operations 4-7 don't seem to be valid encodings.
+    Xor = 8,
+    Or = 9,
+    And = 10,
+    // src_a & ~src_b
+    AndNot = 11,
+    // ~(src_a & src_b)
+    Nand = 12
+};
+
+/// What happens with the result of an instruction (decoded from Opcode::result_operation).
+/// "Fetch" reads the next macro parameter, "Send" issues a method call with a value, and
+/// "SetMethod" loads the method address used by later sends.
+enum class ResultOperation : u32 {
+    // dst = next parameter; the result is discarded
+    IgnoreAndFetch = 0,
+    // dst = result
+    Move = 1,
+    // dst = result; method address = result
+    MoveAndSetMethod = 2,
+    // dst = next parameter; send result
+    FetchAndSend = 3,
+    // dst = result; send result
+    MoveAndSend = 4,
+    // dst = next parameter; method address = result
+    FetchAndSetMethod = 5,
+    // dst = result; method address = result; send next parameter
+    MoveAndSetMethodFetchAndSend = 6,
+    // dst = result; method address = result; send bits 12:17 of result
+    MoveAndSetMethodSend = 7
+};
+
+/// Condition tested on the source register of a branch instruction.
+enum class BranchCondition : u32 {
+    // Branch when the register value is zero
+    Zero = 0,
+    // Branch when the register value is not zero
+    NotZero = 1,
+};
+
+/// One 32-bit macro instruction. Every member aliases the same word; which fields are
+/// meaningful depends on `operation`.
+union Opcode {
+    u32 raw;
+    BitField<0, 3, Operation> operation;
+    BitField<4, 3, ResultOperation> result_operation;
+    BitField<4, 1, BranchCondition> branch_condition;
+    // If set on a branch, then the branch doesn't have a delay slot.
+    BitField<5, 1, u32> branch_annul;
+    BitField<7, 1, u32> is_exit;
+    BitField<8, 3, u32> dst;
+    BitField<11, 3, u32> src_a;
+    BitField<14, 3, u32> src_b;
+    // The signed immediate overlaps the second source operand and the alu operation.
+    BitField<14, 18, s32> immediate;
+
+    BitField<17, 5, ALUOperation> alu_operation;
+
+    // Bitfield instructions data
+    BitField<17, 5, u32> bf_src_bit;
+    BitField<22, 5, u32> bf_size;
+    BitField<27, 5, u32> bf_dst_bit;
+
+    /// Returns a mask with the low `bf_size` bits set.
+    u32 GetBitfieldMask() const {
+        // The shift is done on an unsigned value: with a signed literal, a field size of 31
+        // would shift into the sign bit and the following decrement would overflow.
+        return (1U << bf_size) - 1U;
+    }
+
+    /// Returns the branch displacement in bytes (the immediate counts 32-bit words).
+    s32 GetBranchTarget() const {
+        return static_cast<s32>(immediate * sizeof(u32));
+    }
+};
+
+/// Destination of the values a macro sends: the current method plus the amount it advances by
+/// after each send.
+union MethodAddress {
+    u32 raw;
+    // Method that receives the next sent value
+    BitField<0, 12, u32> address;
+    // Added to `address` after every send
+    BitField<12, 6, u32> increment;
+};
+
+} // namespace Macro
+
+class HLEMacro;
+
+/// Interface of an executable macro program, as stored in the macro cache. It is implemented
+/// both by compiled/interpreted uploaded code and by HLE replacements.
+class CachedMacro {
+public:
+    virtual ~CachedMacro() = default;
+    /**
+     * Executes the macro code with the specified input parameters.
+     *
+     * @param parameters The parameters of the macro
+     * @param method     The method to execute
+     */
+    virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
+};
+
+/// Stores uploaded macro code per method and executes macros on demand. Backends derive from
+/// this class and provide Compile() to turn macro code into an executable program.
+class MacroEngine {
+public:
+    explicit MacroEngine(Engines::Maxwell3D& maxwell3d);
+    virtual ~MacroEngine();
+
+    // Store the uploaded macro code to compile them when they're called.
+    void AddCode(u32 method, u32 data);
+
+    // Compiles the macro if it's not in the cache, and executes the compiled macro
+    void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters);
+
+protected:
+    // Builds an executable program from the given macro code.
+    virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
+
+private:
+    // Cache entry for one macro method.
+    struct CacheInfo {
+        // Program produced by Compile() from the uploaded code
+        std::unique_ptr<CachedMacro> lle_program{};
+        // HLE replacement, only set when has_hle_program is true
+        std::unique_ptr<CachedMacro> hle_program{};
+        // Hash of the macro code, used to look up an HLE replacement
+        u64 hash{};
+        bool has_hle_program{};
+    };
+
+    // Compiled macros, keyed by method
+    std::unordered_map<u32, CacheInfo> macro_cache;
+    // Raw uploaded code words, keyed by method
+    std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
+    std::unique_ptr<HLEMacro> hle_macros;
+};
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -0,0 +1,109 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <vector>
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_hle.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace Tegra {
+
+namespace {
+// HLE'd functions
+// Replacement for the uploaded macro whose code hashes to 0x771BB18C62444DA0 (see hle_funcs).
+// Programs the index array and instancing state from the parameters, issues a draw and then
+// clears the draw state again. Parameter layout as read below:
+//   [0] primitive topology (low 26 bits), [1] index count, [2] instance count (masked with
+//   register 0xD1B), [3] vertex element base, [4] first index, [5] base instance.
+// NOTE(review): Draw(true, true) presumably means indexed + instanced - confirm against
+// the rasterizer interface.
+void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
+    const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
+
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0x3ffffff));
+    maxwell3d.regs.vb_base_instance = parameters[5];
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.regs.vb_element_base = parameters[3];
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.index_array.first = parameters[4];
+
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.Rasterizer().Draw(true, true);
+    }
+    // Reset the per-draw state so it does not leak into later draws
+    maxwell3d.regs.index_array.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+// Replacement for the uploaded macro whose code hashes to 0x0D61FC9FAAC9FCAD (see hle_funcs).
+// Programs the vertex buffer range and instancing state from the parameters, issues a draw and
+// then clears the draw state again. Parameter layout as read below:
+//   [0] primitive topology, [1] vertex count, [2] instance count (masked with register 0xD1B),
+//   [3] first vertex, [4] base instance.
+// NOTE(review): Draw(false, true) presumably means non-indexed + instanced - confirm against
+// the rasterizer interface.
+void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
+    const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+
+    maxwell3d.regs.vertex_buffer.first = parameters[3];
+    maxwell3d.regs.vertex_buffer.count = parameters[1];
+    maxwell3d.regs.vb_base_instance = parameters[4];
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+    maxwell3d.mme_draw.instance_count = count;
+
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.Rasterizer().Draw(false, true);
+    }
+    // Reset the per-draw state so it does not leak into later draws
+    maxwell3d.regs.vertex_buffer.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+// Replacement for the uploaded macro whose code hashes to 0x0217920100488FF7 (see hle_funcs).
+// Like the other indexed draw replacement, but additionally writes the element base to
+// register 0x446 and forwards element base and base instance through methods 0x8e3-0x8e5,
+// then restores all of them to zero after the draw. Parameter layout as read below:
+//   [0] primitive topology, [1] index count, [2] instance count (masked with register 0xD1B),
+//   [3] first index, [4] vertex element base, [5] base instance.
+// NOTE(review): the meaning of methods 0x8e3-0x8e5 and of the value 0x640 is not visible
+// here - confirm against the Maxwell3D register definitions.
+void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
+    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+    const u32 element_base = parameters[4];
+    const u32 base_instance = parameters[5];
+    maxwell3d.regs.index_array.first = parameters[3];
+    maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base?
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.vb_element_base = element_base;
+    maxwell3d.regs.vb_base_instance = base_instance;
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+    maxwell3d.CallMethodFromMME(0x8e4, element_base);
+    maxwell3d.CallMethodFromMME(0x8e5, base_instance);
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.Rasterizer().Draw(true, true);
+    }
+    // Reset everything that was programmed above back to zero
+    maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base?
+    maxwell3d.regs.index_array.count = 0;
+    maxwell3d.regs.vb_element_base = 0x0;
+    maxwell3d.regs.vb_base_instance = 0x0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+    maxwell3d.CallMethodFromMME(0x8e4, 0x0);
+    maxwell3d.CallMethodFromMME(0x8e5, 0x0);
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+} // Anonymous namespace
+
+// Table of known macros that have an HLE replacement: hash of the macro code paired with the
+// function that replaces it. Searched linearly by HLEMacro::GetHLEProgram.
+constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
+    {0x771BB18C62444DA0, &HLE_771BB18C62444DA0},
+    {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD},
+    {0x0217920100488FF7, &HLE_0217920100488FF7},
+}};
+
+// Keeps a reference to the engine; it is handed to every HLE program created later.
+HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {}
+HLEMacro::~HLEMacro() = default;
+
+/**
+ * Looks up an HLE replacement for a macro.
+ *
+ * @param hash Hash of the macro code
+ * @return A new program wrapping the matching HLE function, or std::nullopt when the hash is
+ *         not in the table of known macros.
+ */
+std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
+    for (const auto& [macro_hash, function] : hle_funcs) {
+        if (macro_hash == hash) {
+            return std::make_unique<HLEMacroImpl>(maxwell3d, function);
+        }
+    }
+    return std::nullopt;
+}
+
+HLEMacroImpl::~HLEMacroImpl() = default;
+
+// Binds an HLE function to the engine it will be run against.
+HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d_, HLEFunction func_)
+    : maxwell3d{maxwell3d_}, func{func_} {}
+
+/// Runs the bound HLE function with the macro parameters. The method is not needed by HLE
+/// functions and is ignored.
+void HLEMacroImpl::Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) {
+    func(maxwell3d, parameters);
+}
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_hle.h
+++ b/src/video_core/macro/macro_hle.h
@@ -0,0 +1,44 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <vector>
+#include "common/common_types.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
+
+/// Lookup of HLE replacements for known macros, identified by the hash of their code.
+class HLEMacro {
+public:
+    explicit HLEMacro(Engines::Maxwell3D& maxwell3d_);
+    ~HLEMacro();
+
+    /// Returns a program for the macro with the given code hash, or std::nullopt when no HLE
+    /// replacement is known for it.
+    std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+};
+
+/// CachedMacro that runs an HLE function instead of uploaded macro code.
+class HLEMacroImpl : public CachedMacro {
+public:
+    explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func);
+    ~HLEMacroImpl();
+
+    /// Calls the HLE function with the engine and the given parameters.
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+    // Function invoked by Execute()
+    HLEFunction func;
+};
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -0,0 +1,287 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/microprofile.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_interpreter.h"
+
+MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
+
+namespace Tegra {
+// Passes the engine to the MacroEngine base and keeps a reference for the programs that
+// Compile() creates.
+MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d_)
+    : MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
+
+// For the interpreter, "compiling" only wraps the macro code in an interpreter instance; the
+// code itself is decoded while it executes.
+std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
+    return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
+}
+
+// Binds the interpreter to the engine and to the macro code it will run.
+// NOTE(review): if the `code` member is declared as a reference, `code_` must outlive this
+// object - confirm against the member declaration in the header.
+MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_,
+                                           const std::vector<u32>& code_)
+    : maxwell3d{maxwell3d_}, code{code_} {}
+
+/**
+ * Interprets the macro from its first instruction until an exit condition is reached.
+ *
+ * @param params The macro parameters. The first one is preloaded into $r1, so the vector must
+ *               not be empty; the remaining ones are consumed by fetch operations.
+ * @param method The method that triggered the macro; not needed by the interpreter.
+ */
+void MacroInterpreterImpl::Execute(const std::vector<u32>& params, [[maybe_unused]] u32 method) {
+    MICROPROFILE_SCOPE(MacroInterp);
+    Reset();
+
+    registers[1] = params[0];
+    num_parameters = params.size();
+
+    // Keep a private copy of the parameters, growing the buffer only when it is too small.
+    if (num_parameters > parameters_capacity) {
+        parameters_capacity = num_parameters;
+        parameters = std::make_unique<u32[]>(num_parameters);
+    }
+    std::memcpy(parameters.get(), params.data(), num_parameters * sizeof(u32));
+
+    // Execute the code until we hit an exit condition.
+    while (Step(false)) {
+    }
+
+    // Assert that the macro used all the input parameters
+    ASSERT(next_parameter_index == num_parameters);
+}
+
+/// Puts the interpreter back into its initial state before a macro is executed.
+void MacroInterpreterImpl::Reset() {
+    // Execution state
+    pc = 0;
+    delayed_pc = {};
+    carry_flag = false;
+    registers = {};
+    method_address.raw = 0;
+
+    // Parameter state. Fetching starts at index 1 because the first parameter is handed to
+    // the macro in $r1 instead of being fetched.
+    num_parameters = 0;
+    next_parameter_index = 1;
+}
+
+// Executes the instruction at the current program counter and returns whether execution
+// should continue. Taken branches without the annul bit and instructions carrying the exit
+// flag execute one more instruction (their delay slot) through a recursive call with
+// is_delay_slot set.
+bool MacroInterpreterImpl::Step(bool is_delay_slot) {
+    // Address of this instruction; branch targets are relative to it
+    u32 base_address = pc;
+
+    Macro::Opcode opcode = GetOpcode();
+    pc += 4;
+
+    // Update the program counter if we were delayed
+    if (delayed_pc) {
+        ASSERT(is_delay_slot);
+        pc = *delayed_pc;
+        delayed_pc = {};
+    }
+
+    switch (opcode.operation) {
+    case Macro::Operation::ALU: {
+        u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
+                                  GetRegister(opcode.src_b));
+        ProcessResult(opcode.result_operation, opcode.dst, result);
+        break;
+    }
+    case Macro::Operation::AddImmediate: {
+        ProcessResult(opcode.result_operation, opcode.dst,
+                      GetRegister(opcode.src_a) + opcode.immediate);
+        break;
+    }
+    case Macro::Operation::ExtractInsert: {
+        u32 dst = GetRegister(opcode.src_a);
+        u32 src = GetRegister(opcode.src_b);
+
+        // Replace the destination bit field of src_a with the source bit field of src_b
+        src = (src >> opcode.bf_src_bit) & opcode.GetBitfieldMask();
+        dst &= ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
+        dst |= src << opcode.bf_dst_bit;
+        ProcessResult(opcode.result_operation, opcode.dst, dst);
+        break;
+    }
+    case Macro::Operation::ExtractShiftLeftImmediate: {
+        u32 dst = GetRegister(opcode.src_a);
+        u32 src = GetRegister(opcode.src_b);
+
+        u32 result = ((src >> dst) & opcode.GetBitfieldMask()) << opcode.bf_dst_bit;
+
+        ProcessResult(opcode.result_operation, opcode.dst, result);
+        break;
+    }
+    case Macro::Operation::ExtractShiftLeftRegister: {
+        u32 dst = GetRegister(opcode.src_a);
+        u32 src = GetRegister(opcode.src_b);
+
+        u32 result = ((src >> opcode.bf_src_bit) & opcode.GetBitfieldMask()) << dst;
+
+        ProcessResult(opcode.result_operation, opcode.dst, result);
+        break;
+    }
+    case Macro::Operation::Read: {
+        u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
+        ProcessResult(opcode.result_operation, opcode.dst, result);
+        break;
+    }
+    case Macro::Operation::Branch: {
+        ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
+        u32 value = GetRegister(opcode.src_a);
+        bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
+        if (taken) {
+            // Ignore the delay slot if the branch has the annul bit.
+            if (opcode.branch_annul) {
+                pc = base_address + opcode.GetBranchTarget();
+                return true;
+            }
+
+            // The jump takes effect once the delay slot instruction has been fetched
+            delayed_pc = base_address + opcode.GetBranchTarget();
+            // Execute one more instruction due to the delay slot.
+            return Step(true);
+        }
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", opcode.operation.Value());
+    }
+
+    // An instruction with the Exit flag will not actually
+    // cause an exit if it's executed inside a delay slot.
+    if (opcode.is_exit && !is_delay_slot) {
+        // Exit has a delay slot, execute the next instruction
+        Step(true);
+        return false;
+    }
+
+    return true;
+}
+
+/// Computes `src_a OP src_b` for an ALU instruction. The arithmetic operations also update the
+/// carry flag; the logical operations leave it untouched.
+u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
+    // Arithmetic is carried out in 64 bits so the carry/borrow out of bit 31 can be observed.
+    const u64 lhs = src_a;
+    const u64 rhs = src_b;
+
+    switch (operation) {
+    case Macro::ALUOperation::Add: {
+        const u64 sum = lhs + rhs;
+        carry_flag = sum > 0xffffffff;
+        return static_cast<u32>(sum);
+    }
+    case Macro::ALUOperation::AddWithCarry: {
+        const u64 carry_in = carry_flag ? 1ULL : 0ULL;
+        const u64 sum = lhs + rhs + carry_in;
+        carry_flag = sum > 0xffffffff;
+        return static_cast<u32>(sum);
+    }
+    case Macro::ALUOperation::Subtract: {
+        const u64 difference = lhs - rhs;
+        // The flag is set when the 64-bit difference did not wrap, i.e. no borrow occurred.
+        carry_flag = difference < 0x100000000;
+        return static_cast<u32>(difference);
+    }
+    case Macro::ALUOperation::SubtractWithBorrow: {
+        // A clear carry flag means a borrow is pending from the previous subtraction.
+        const u64 borrow_in = carry_flag ? 0ULL : 1ULL;
+        const u64 difference = lhs - rhs - borrow_in;
+        carry_flag = difference < 0x100000000;
+        return static_cast<u32>(difference);
+    }
+    case Macro::ALUOperation::Xor:
+        return src_a ^ src_b;
+    case Macro::ALUOperation::Or:
+        return src_a | src_b;
+    case Macro::ALUOperation::And:
+        return src_a & src_b;
+    case Macro::ALUOperation::AndNot:
+        return src_a & ~src_b;
+    case Macro::ALUOperation::Nand:
+        return ~(src_a & src_b);
+
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", operation);
+        return 0;
+    }
+}
+
+// Applies the result operation of an instruction: depending on `operation`, register `reg`
+// receives either `result` or the next macro parameter, and `result` may additionally be
+// loaded as the method address and/or a value may be sent to the engine.
+void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
+    switch (operation) {
+    case Macro::ResultOperation::IgnoreAndFetch:
+        // Fetch parameter and ignore result.
+        SetRegister(reg, FetchParameter());
+        break;
+    case Macro::ResultOperation::Move:
+        // Move result.
+        SetRegister(reg, result);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethod:
+        // Move result and use as Method Address.
+        SetRegister(reg, result);
+        SetMethodAddress(result);
+        break;
+    case Macro::ResultOperation::FetchAndSend:
+        // Fetch parameter and send result.
+        SetRegister(reg, FetchParameter());
+        Send(result);
+        break;
+    case Macro::ResultOperation::MoveAndSend:
+        // Move and send result.
+        SetRegister(reg, result);
+        Send(result);
+        break;
+    case Macro::ResultOperation::FetchAndSetMethod:
+        // Fetch parameter and use result as Method Address.
+        SetRegister(reg, FetchParameter());
+        SetMethodAddress(result);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
+        // Move result and use as Method Address, then fetch and send parameter.
+        SetRegister(reg, result);
+        SetMethodAddress(result);
+        Send(FetchParameter());
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodSend:
+        // Move result and use as Method Address, then send bits 12:17 of result.
+        SetRegister(reg, result);
+        SetMethodAddress(result);
+        Send((result >> 12) & 0b111111);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented result operation {}", operation);
+    }
+}
+
+/// Returns whether a branch with condition `cond` is taken for the register value `value`.
+bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
+    const bool is_zero = value == 0;
+    switch (cond) {
+    case Macro::BranchCondition::Zero:
+        return is_zero;
+    case Macro::BranchCondition::NotZero:
+        return !is_zero;
+    }
+    // Both enumerators are handled above
+    UNREACHABLE();
+    return true;
+}
+
+/// Reads the instruction word at the current program counter, which is a byte offset into the
+/// macro code.
+Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
+    ASSERT((pc % sizeof(u32)) == 0);
+    ASSERT(pc < code.size() * sizeof(u32));
+    const u32 instruction = code[pc / sizeof(u32)];
+    return Macro::Opcode{instruction};
+}
+
+// Returns the value of a register. The access is bounds-checked through at(). Register 0
+// reads as zero because Reset() clears it and SetRegister() never writes it.
+u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
+    return registers.at(register_id);
+}
+
+/// Writes `value` to a register; the access is bounds-checked through at().
+void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
+    // $r0 is the hardwired zero register: writes to it are silently dropped.
+    if (register_id != 0) {
+        registers.at(register_id) = value;
+    }
+}
+
+// Loads the raw method address word, which holds both the method and its increment (see
+// Macro::MethodAddress).
+void MacroInterpreterImpl::SetMethodAddress(u32 address) {
+    method_address.raw = address;
+}
+
+/// Calls the current method on the engine with `value`, then advances the method address.
+void MacroInterpreterImpl::Send(u32 value) {
+    maxwell3d.CallMethodFromMME(method_address.address, value);
+
+    // Step the method by the configured increment so consecutive sends hit consecutive methods.
+    const u32 current = method_address.address.Value();
+    const u32 step = method_address.increment.Value();
+    method_address.address.Assign(current + step);
+}
+
+// Returns the engine's current value for the given method/register.
+u32 MacroInterpreterImpl::Read(u32 method) const {
+    return maxwell3d.GetRegisterValue(method);
+}
+
+/// Returns the next unread macro parameter and advances the read position.
+u32 MacroInterpreterImpl::FetchParameter() {
+    ASSERT(next_parameter_index < num_parameters);
+    const u32 parameter = parameters[next_parameter_index];
+    ++next_parameter_index;
+    return parameter;
+}
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_interpreter.h
+++ b/src/video_core/macro/macro_interpreter.h
@@ -0,0 +1,102 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+#include <array>
+#include <optional>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+namespace Engines {
+class Maxwell3D;
+}
+
+class MacroInterpreter final : public MacroEngine { // MacroEngine backend, interpreter flavour
+public:
+    explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d_);
+
+protected:
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; // MacroEngine hook
+
+private:
+    Engines::Maxwell3D& maxwell3d; // non-owning reference, so it has to outlive this object
+};
+
+class MacroInterpreterImpl : public CachedMacro {
+public:
+    explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_);
+    void Execute(const std::vector<u32>& params, u32 method) override;
+
+private:
+    /// Resets the execution engine state, zeroing registers, etc.
+    void Reset();
+
+    /**
+     * Executes a single macro instruction located at the current program counter. Returns whether
+     * the interpreter should keep running.
+     *
+     * @param is_delay_slot Whether the current step is being executed due to a delay slot in a
+     *                      previous instruction.
+     */
+    bool Step(bool is_delay_slot);
+
+    /// Calculates the result of an ALU operation: src_a OP src_b.
+    u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
+
+    /// Performs the result operation on the input result and stores it in the specified register
+    /// (if necessary).
+    void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
+
+    /// Evaluates the branch condition and returns whether the branch should be taken or not.
+    bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
+
+    /// Reads the opcode at the current program counter location (pc is a byte offset into code).
+    Macro::Opcode GetOpcode() const;
+
+    /// Returns the specified register's value. The index is bounds-checked.
+    u32 GetRegister(u32 register_id) const;
+
+    /// Sets the register to the input value. Writes to register 0 are ignored.
+    void SetRegister(u32 register_id, u32 value);
+
+    /// Sets the method address (and increment) to use for the next Send instruction.
+    void SetMethodAddress(u32 address);
+
+    /// Calls a GPU Engine method with the input parameter, then advances the method address.
+    void Send(u32 value);
+
+    /// Reads the value of the GPU register identified by `method`.
+    u32 Read(u32 method) const;
+
+    /// Returns the next parameter in the parameter queue.
+    u32 FetchParameter();
+
+    Engines::Maxwell3D& maxwell3d;
+
+    /// Current program counter. Initialised so that it never holds an indeterminate value.
+    u32 pc = 0;
+    /// Program counter to execute at after the delay slot is executed.
+    std::optional<u32> delayed_pc;
+
+    /// General purpose macro registers.
+    std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
+
+    /// Method address to use for the next Send instruction.
+    Macro::MethodAddress method_address = {};
+
+    /// Input parameters of the current macro.
+    std::unique_ptr<u32[]> parameters;
+    std::size_t num_parameters = 0;
+    std::size_t parameters_capacity = 0;
+    /// Index of the next parameter that will be fetched by the 'parm' instruction.
+    u32 next_parameter_index = 0;
+
+    bool carry_flag = false;
+    /// Macro code being interpreted. Held by reference: the vector must outlive this object.
+    const std::vector<u32>& code;
+};
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_jit_x64.cpp
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -0,0 +1,619 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/microprofile.h"
+#include "common/x64/xbyak_util.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47));
+MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
+
+namespace Tegra {
+constexpr Xbyak::Reg64 STATE = Xbyak::util::rbx; // JITState pointer passed to the program
+constexpr Xbyak::Reg32 RESULT = Xbyak::util::ebp; // result of the instruction being compiled
+constexpr Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; // cursor into the parameter array
+constexpr Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; // method address plus increment bits
+constexpr Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; // pending jump target / exit marker
+
+static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
+    STATE,
+    RESULT,
+    PARAMETERS,
+    METHOD_ADDRESS,
+    BRANCH_HOLDER,
+}); // the caller-saved subset of these is saved around calls made from generated code
+
+MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d_)
+    : MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {} // keeps its own reference for Compile()
+
+std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
+    return std::make_unique<MacroJITx64Impl>(maxwell3d, code); // the constructor does the JIT work
+}
+
+MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_)
+    : CodeGenerator{MAX_CODE_SIZE}, code{code_}, maxwell3d{maxwell3d_} {
+    Compile(); // machine code is generated eagerly, once, at construction time
+}
+
+MacroJITx64Impl::~MacroJITx64Impl() = default; // defined out of line, in the .cpp file
+
+void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
+    MICROPROFILE_SCOPE(MacroJitExecute);
+    ASSERT_OR_EXECUTE(program != nullptr, { return; });
+    // Every run starts from a value-initialised state (zeroed registers, cleared carry flag);
+    // `method` is not forwarded to the generated program.
+    JITState state{};
+    state.maxwell3d = &maxwell3d;
+    program(&state, parameters.data());
+}
+
+// Emits x64 code for a macro ALU instruction: RESULT = src_a OP src_b, followed by the
+// instruction's result operation.
+//
+// Both source operands are materialised before the operation is emitted (register 0 is loaded as
+// zero), so an operand that is the zero register can never leave stale data in RESULT or eax.
+// The only code that is elided is `x OP r0` for operations where a zero right-hand side leaves
+// `x` unchanged and no carry has to be recorded.
+void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
+    const auto operation = opcode.alu_operation.Value();
+    // The carry flag only has to be stored when some instruction of this macro consumes it.
+    const bool store_carry = !optimizer.can_skip_carry;
+
+    // Operations for which `x OP 0 == x`. Add and Subtract also define the carry flag, so they
+    // may only be dropped when nothing reads that flag.
+    bool zero_rhs_is_identity = false;
+    switch (operation) {
+    case Macro::ALUOperation::Add:
+    case Macro::ALUOperation::Subtract:
+        zero_rhs_is_identity = !store_carry;
+        break;
+    case Macro::ALUOperation::Xor:
+    case Macro::ALUOperation::Or:
+    case Macro::ALUOperation::AndNot:
+        zero_rhs_is_identity = true;
+        break;
+    default:
+        break;
+    }
+
+    const Xbyak::Reg32 src_a = Compile_GetRegister(opcode.src_a, RESULT);
+    if (optimizer.zero_reg_skip && opcode.src_b == 0 && zero_rhs_is_identity) {
+        // RESULT already holds src_a, which is the final value.
+        Compile_ProcessResult(opcode.result_operation, opcode.dst);
+        return;
+    }
+    const Xbyak::Reg32 src_b = Compile_GetRegister(opcode.src_b, eax);
+
+    switch (operation) {
+    case Macro::ALUOperation::Add:
+        add(src_a, src_b);
+        if (store_carry) {
+            setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        }
+        break;
+    case Macro::ALUOperation::AddWithCarry:
+        // Bring the saved carry back into CF before the add-with-carry.
+        bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
+        adc(src_a, src_b);
+        setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        break;
+    case Macro::ALUOperation::Subtract:
+        sub(src_a, src_b);
+        if (store_carry) {
+            setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        }
+        break;
+    case Macro::ALUOperation::SubtractWithBorrow:
+        // Bring the saved carry back into CF before the subtract-with-borrow.
+        bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
+        sbb(src_a, src_b);
+        setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        break;
+    case Macro::ALUOperation::Xor:
+        xor_(src_a, src_b);
+        break;
+    case Macro::ALUOperation::Or:
+        or_(src_a, src_b);
+        break;
+    case Macro::ALUOperation::And:
+        and_(src_a, src_b);
+        break;
+    case Macro::ALUOperation::AndNot:
+        // src_a & ~src_b
+        not_(src_b);
+        and_(src_a, src_b);
+        break;
+    case Macro::ALUOperation::Nand:
+        // ~(src_a & src_b)
+        and_(src_a, src_b);
+        not_(src_a);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", opcode.alu_operation.Value());
+        break;
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+// Emits code for an AddImmediate instruction: RESULT = src_a + immediate, followed by the
+// instruction's result operation.
+void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
+    if (optimizer.skip_dummy_addimmediate) {
+        // Games tend to use this as an exit instruction placeholder. It's to encode an instruction
+        // without doing anything. In our case we can just not emit anything.
+        if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
+            return;
+        }
+    }
+    // Check for redundant moves: when the next instruction overwrites both the same destination
+    // register and the method address, this instruction has no lasting effect, provided the next
+    // instruction does not read the value written here.
+    if (optimizer.optimize_for_method_move &&
+        opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod &&
+        next_opcode.has_value()) {
+        const auto next = *next_opcode;
+        const u32 dst = opcode.dst;
+        const bool next_overwrites =
+            next.result_operation == Macro::ResultOperation::MoveAndSetMethod && dst == next.dst;
+        // Conservative check: src_b is compared even for instructions that may not use it.
+        // Register 0 always reads as zero, so it can never carry a value between instructions.
+        const bool next_reads_dst = dst != 0 && (next.src_a == dst || next.src_b == dst);
+        if (next_overwrites && !next_reads_dst) {
+            return;
+        }
+    }
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+        // 0 + immediate: load the constant directly.
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        // An immediate of zero needs no code; every other value must change the register.
+        if (opcode.immediate >= 2) {
+            add(result, opcode.immediate);
+        } else if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        }
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+// Emits code for ExtractInsert: a bitfield is taken from src_b starting at bf_src_bit and
+// inserted into src_a at bf_dst_bit; the combined value is left in RESULT.
+void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
+    auto dst = Compile_GetRegister(opcode.src_a, RESULT);
+    auto src = Compile_GetRegister(opcode.src_b, eax);
+    const u32 field_mask = opcode.GetBitfieldMask();
+
+    // src = ((src >> bf_src_bit) & field_mask) << bf_dst_bit
+    if (field_mask == 0) {
+        // An empty field contributes no bits at all.
+        xor_(src, src);
+    } else {
+        // A shift count of 31 is an ordinary shift and must not be replaced by zeroing: it still
+        // moves one bit. Likewise the mask is always applied, so no stray high bits survive.
+        if (opcode.bf_src_bit != 0) {
+            shr(src, opcode.bf_src_bit);
+        }
+        and_(src, field_mask);
+        if (opcode.bf_dst_bit != 0) {
+            shl(src, opcode.bf_dst_bit);
+        }
+    }
+
+    // Clear the destination field before merging the extracted bits into it.
+    const u32 mask = ~(field_mask << opcode.bf_dst_bit);
+    if (mask != 0xffffffff) {
+        and_(dst, mask);
+    }
+    or_(dst, src);
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+// Emits code for ExtractShiftLeftImmediate:
+// RESULT = ((src_b >> src_a) & bitfield mask) << bf_dst_bit.
+void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
+    // The variable shift count has to live in cl, hence src_a is loaded into ecx.
+    const auto dst = Compile_GetRegister(opcode.src_a, ecx);
+    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
+    const u32 field_mask = opcode.GetBitfieldMask();
+
+    if (field_mask == 0) {
+        // An empty field always yields zero.
+        xor_(src, src);
+    } else {
+        shr(src, dst.cvt8());
+        // Always mask, and treat a destination bit of 31 as an ordinary shift rather than
+        // zeroing the value.
+        and_(src, field_mask);
+        if (opcode.bf_dst_bit != 0) {
+            shl(src, opcode.bf_dst_bit);
+        }
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+// Emits code for ExtractShiftLeftRegister:
+// RESULT = ((src_b >> bf_src_bit) & bitfield mask) << src_a.
+void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
+    // The variable shift count has to live in cl, hence src_a is loaded into ecx.
+    const auto dst = Compile_GetRegister(opcode.src_a, ecx);
+    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+    if (opcode.bf_src_bit != 0) {
+        shr(src, opcode.bf_src_bit);
+    }
+
+    // The mask is applied for every field size so that bits above the field never leak through.
+    and_(src, opcode.GetBitfieldMask());
+    shl(src, dst.cvt8());
+
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+// Emits code for a Read instruction: the method index src_a + immediate is computed into RESULT
+// and then replaced by the value of that Maxwell3D register.
+void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+        // 0 + immediate: load the constant directly.
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        // An immediate of zero needs no code; every other value must change the register.
+        if (opcode.immediate >= 2) {
+            add(result, opcode.immediate);
+        } else if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        }
+    }
+
+    // Equivalent to Engines::Maxwell3D::GetRegisterValue:
+    if (optimizer.enable_asserts) {
+        // Trap when the computed index lies outside the register array.
+        Xbyak::Label pass_range_check;
+        cmp(RESULT, static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS));
+        jb(pass_range_check);
+        int3();
+        L(pass_range_check);
+    }
+    // rax = Maxwell3D pointer (first member of JITState); then index its register array.
+    mov(rax, qword[STATE]);
+    mov(RESULT,
+        dword[rax + offsetof(Engines::Maxwell3D, regs) +
+              offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]);
+
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
+    maxwell3d->CallMethodFromMME(method_address.address, value); // only the address field is used
+}
+
+void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(Common::X64::ABI_PARAM1, qword[STATE]); // Maxwell3D pointer, stored at offset 0 of JITState
+    mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
+    mov(Common::X64::ABI_PARAM3, value);
+    Common::X64::CallFarFunction(*this, &Send);
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+
+    Xbyak::Label dont_process{};
+    // Test the increment bits (mask 0x3f000) of the method address
+    test(METHOD_ADDRESS, 0x3f000);
+    // If the increment is zero the method address doesn't change, so skip the update
+    je(dont_process);
+
+    mov(ecx, METHOD_ADDRESS);
+    and_(METHOD_ADDRESS, 0xfff); // keep only the address bits
+    shr(ecx, 12);
+    and_(ecx, 0x3f); // ecx = increment
+    lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]); // eax = address + increment
+    sal(ecx, 12);
+    or_(eax, ecx); // put the increment back above the address
+
+    mov(METHOD_ADDRESS, eax); // NOTE(review): a sum above 0xfff spills into the increment bits
+
+    L(dont_process);
+}
+
+void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
+    ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
+    const s32 jump_address =
+        static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32)); // pc-relative
+
+    Xbyak::Label end;
+    auto value = Compile_GetRegister(opcode.src_a, eax);
+    test(value, value); // sets ZF for the conditional jumps below
+    if (optimizer.has_delayed_pc) {
+        switch (opcode.branch_condition) { // inverted tests: leave through `end` when not taken
+        case Macro::BranchCondition::Zero:
+            jne(end, T_NEAR);
+            break;
+        case Macro::BranchCondition::NotZero:
+            je(end, T_NEAR);
+            break;
+        }
+
+        if (opcode.branch_annul) {
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER); // annulled: no delay slot, jump straight away
+            jmp(labels[jump_address], T_NEAR);
+        } else {
+            Xbyak::Label handle_post_exit{};
+            Xbyak::Label skip{};
+            jmp(skip, T_NEAR); // the stub below is only entered through BRANCH_HOLDER
+            if (opcode.is_exit) {
+                L(handle_post_exit);
+                // Execute 1 instruction
+                mov(BRANCH_HOLDER, end_of_code);
+                // Jump to next instruction to skip delay slot check
+                jmp(labels[jump_address], T_NEAR);
+            } else {
+                L(handle_post_exit);
+                xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+                jmp(labels[jump_address], T_NEAR);
+            }
+            L(skip);
+            mov(BRANCH_HOLDER, handle_post_exit); // the stub runs once the delay slot has executed
+            jmp(delay_skip[pc], T_NEAR); // skip this instruction's own pending-branch check
+        }
+    } else {
+        switch (opcode.branch_condition) { // no delayed branches in this macro: jump directly
+        case Macro::BranchCondition::Zero:
+            je(labels[jump_address], T_NEAR);
+            break;
+        case Macro::BranchCondition::NotZero:
+            jne(labels[jump_address], T_NEAR);
+            break;
+        }
+    }
+
+    L(end);
+}
+
+void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() {
+    // Walk the whole macro once and record which optional machinery it actually requires.
+    bool reads_carry = false;
+    bool has_delay_slot_branch = false;
+    for (const u32 word : code) {
+        Macro::Opcode op{};
+        op.raw = word;
+
+        // Carry bookkeeping is only needed when some ALU operation consumes the carry flag.
+        const bool is_alu = op.operation == Macro::Operation::ALU;
+        if (is_alu && (op.alu_operation == Macro::ALUOperation::AddWithCarry ||
+                       op.alu_operation == Macro::ALUOperation::SubtractWithBorrow)) {
+            reads_carry = true;
+        }
+
+        // A branch that is not annulled executes a delay slot, which needs the delayed pc logic.
+        const bool is_branch = op.operation == Macro::Operation::Branch;
+        if (is_branch && !op.branch_annul) {
+            has_delay_slot_branch = true;
+        }
+    }
+    optimizer.can_skip_carry = !reads_carry;
+    optimizer.has_delayed_pc = has_delay_slot_branch;
+}
+
+void MacroJITx64Impl::Compile() {
+    MICROPROFILE_SCOPE(MacroJitCompile);
+    labels.fill(Xbyak::Label());
+
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    // Prologue: keep the JIT state and parameter pointers, then clear the working registers
+    mov(STATE, Common::X64::ABI_PARAM1);
+    mov(PARAMETERS, Common::X64::ABI_PARAM2);
+    xor_(RESULT, RESULT);
+    xor_(METHOD_ADDRESS, METHOD_ADDRESS);
+    xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+
+    mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter()); // r1 = param 0
+
+    // Track get register for zero registers and mark it as no-op
+    optimizer.zero_reg_skip = true;
+
+    // AddImmediate tends to be used as a NOP instruction, if we detect this we can
+    // completely skip the entire code path and not emit anything
+    optimizer.skip_dummy_addimmediate = true;
+
+    // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting
+    // one if our register isn't "dirty"
+    optimizer.optimize_for_method_move = true;
+
+    // Enable run-time assertions in JITted code
+    optimizer.enable_asserts = false;
+
+    // Check to see if we can skip emitting certain instructions
+    Optimizer_ScanFlags();
+
+    const u32 op_count = static_cast<u32>(code.size());
+    for (u32 i = 0; i < op_count; i++) {
+        if (i < op_count - 1) {
+            pc = i + 1; // temporarily point at the following instruction to peek at it
+            next_opcode = GetOpCode();
+        } else {
+            next_opcode = {}; // the last instruction has no successor
+        }
+        pc = i;
+        Compile_NextInstruction();
+    }
+
+    L(end_of_code);
+
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    ret();
+    ready();
+    program = getCode<ProgramType>(); // entry point that Execute() calls
+}
+
+bool MacroJITx64Impl::Compile_NextInstruction() {
+    const auto opcode = GetOpCode();
+    if (labels[pc].getAddress()) { // label already bound: nothing is emitted for this pc
+        return false;
+    }
+
+    L(labels[pc]); // branches to this instruction land here
+
+    switch (opcode.operation) {
+    case Macro::Operation::ALU:
+        Compile_ALU(opcode);
+        break;
+    case Macro::Operation::AddImmediate:
+        Compile_AddImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractInsert:
+        Compile_ExtractInsert(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftImmediate:
+        Compile_ExtractShiftLeftImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftRegister:
+        Compile_ExtractShiftLeftRegister(opcode);
+        break;
+    case Macro::Operation::Read:
+        Compile_Read(opcode);
+        break;
+    case Macro::Operation::Branch:
+        Compile_Branch(opcode);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
+        break;
+    }
+
+    if (optimizer.has_delayed_pc) {
+        if (opcode.is_exit) {
+            mov(rax, end_of_code);
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            cmove(BRANCH_HOLDER, rax); // nothing pending: schedule a jump to end_of_code
+            // Jump to next instruction to skip delay slot check
+            je(labels[pc + 1], T_NEAR);
+        } else {
+            // TODO(ogniK): Optimize delay slot branching
+            Xbyak::Label no_delay_slot{};
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            je(no_delay_slot, T_NEAR);
+            mov(rax, BRANCH_HOLDER); // a jump is pending: consume it and take it
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(rax);
+            L(no_delay_slot);
+        }
+        L(delay_skip[pc]); // Compile_Branch jumps here to bypass the check above
+        if (opcode.is_exit) {
+            return false;
+        }
+    } else {
+        test(BRANCH_HOLDER, BRANCH_HOLDER);
+        jne(end_of_code, T_NEAR); // non-zero BRANCH_HOLDER marks a previously executed exit
+        if (opcode.is_exit) {
+            inc(BRANCH_HOLDER); // mark the exit; the check after the next instruction leaves
+            return false;
+        }
+    }
+    return true;
+}
+
+Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
+    mov(eax, dword[PARAMETERS]); // load the parameter the cursor points at
+    add(PARAMETERS, sizeof(u32)); // advance the cursor to the next parameter
+    return eax; // the fetched value always ends up in eax
+}
+
+Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
+    // Emits code that loads macro register `index` into `dst`, and returns `dst`.
+    const bool is_zero_register = index == 0;
+    if (is_zero_register) {
+        // Register 0 is always zero, so it is produced without touching memory.
+        xor_(dst, dst);
+        return dst;
+    }
+    mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]);
+    return dst;
+}
+
+void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
+    const auto SetRegister = [this](u32 reg_index, const Xbyak::Reg32& result) {
+        // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
+        // register, so no store is emitted for it.
+        if (reg_index == 0) {
+            return;
+        }
+        mov(dword[STATE + offsetof(JITState, registers) + reg_index * sizeof(u32)], result);
+    };
+    const auto SetMethodAddress = [this](const Xbyak::Reg32& reg32) { mov(METHOD_ADDRESS, reg32); };
+
+    switch (operation) {
+    case Macro::ResultOperation::IgnoreAndFetch:
+        SetRegister(reg, Compile_FetchParameter()); // RESULT is discarded
+        break;
+    case Macro::ResultOperation::Move:
+        SetRegister(reg, RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethod:
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        break;
+    case Macro::ResultOperation::FetchAndSend:
+        // Fetch parameter into the register and send result.
+        SetRegister(reg, Compile_FetchParameter());
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSend:
+        // Move and send result.
+        SetRegister(reg, RESULT);
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::FetchAndSetMethod:
+        // Fetch parameter into the register and use result as Method Address.
+        SetRegister(reg, Compile_FetchParameter());
+        SetMethodAddress(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
+        // Move result and use as Method Address, then fetch and send parameter.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        Compile_Send(Compile_FetchParameter());
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodSend:
+        // Move result and use as Method Address, then send bits 12:17 of result.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        shr(RESULT, 12); // RESULT itself is modified here, after it has been stored
+        and_(RESULT, 0b111111);
+        Compile_Send(RESULT);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", operation);
+    }
+}
+
+Macro::Opcode MacroJITx64Impl::GetOpCode() const {
+    ASSERT(pc < code.size()); // in the JIT, pc is an instruction index into code
+    return {code[pc]};
+}
+
+std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const {
+    return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; // intersection of both sets
+}
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_jit_x64.h
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -0,0 +1,98 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <bitset>
+#include <xbyak.h>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "common/x64/xbyak_abi.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+/// Size given to Xbyak::CodeGenerator and length of the label tables; arbitrarily chosen based on current booting games
+constexpr size_t MAX_CODE_SIZE = 0x10000;
+
+class MacroJITx64 final : public MacroEngine { // MacroEngine backend that JIT-compiles to x64
+public:
+    explicit MacroJITx64(Engines::Maxwell3D& maxwell3d_);
+
+protected:
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; // MacroJITx64Impl
+
+private:
+    Engines::Maxwell3D& maxwell3d; // non-owning reference, so it has to outlive this object
+};
+
+class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
+public:
+    explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_);
+    ~MacroJITx64Impl();
+
+    void Execute(const std::vector<u32>& parameters, u32 method) override; // runs the compiled code
+
+    void Compile_ALU(Macro::Opcode opcode); // one emitter per macro operation follows
+    void Compile_AddImmediate(Macro::Opcode opcode);
+    void Compile_ExtractInsert(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
+    void Compile_Read(Macro::Opcode opcode);
+    void Compile_Branch(Macro::Opcode opcode);
+
+private:
+    void Optimizer_ScanFlags(); // fills can_skip_carry and has_delayed_pc from the macro code
+
+    void Compile(); // emits the whole program and sets `program`
+    bool Compile_NextInstruction(); // emits the instruction at `pc`
+
+    Xbyak::Reg32 Compile_FetchParameter(); // result is in eax
+    Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); // returns dst
+
+    void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
+    void Compile_Send(Xbyak::Reg32 value);
+
+    Macro::Opcode GetOpCode() const; // opcode at instruction index `pc`
+    std::bitset<32> PersistentCallerSavedRegs() const;
+
+    struct JITState { // run-time state the generated code works on
+        Engines::Maxwell3D* maxwell3d{}; // read by generated code through [STATE + 0]
+        std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
+        u32 carry_flag{};
+    };
+    static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
+    using ProgramType = void (*)(JITState*, const u32*); // (state, parameters)
+
+    struct OptimizerState { // switches consulted while emitting code
+        bool can_skip_carry{}; // no instruction consumes the carry flag
+        bool has_delayed_pc{}; // some branch is not annulled
+        bool zero_reg_skip{}; // special-case operands that are register 0
+        bool skip_dummy_addimmediate{}; // emit nothing for AddImmediate used as a NOP
+        bool optimize_for_method_move{}; // drop method moves that are overwritten right away
+        bool enable_asserts{}; // emit run-time checks into the generated code
+    };
+    OptimizerState optimizer{};
+
+    std::optional<Macro::Opcode> next_opcode{}; // opcode after the one being compiled, if any
+    ProgramType program{nullptr}; // entry point of the generated code
+
+    std::array<Xbyak::Label, MAX_CODE_SIZE> labels; // start of each instruction, indexed by pc
+    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip; // point after each pending-branch check
+    Xbyak::Label end_of_code{}; // epilogue of the generated program
+
+    bool is_delay_slot{}; // only read by an assert in Compile_Branch; never assigned in the .cpp
+    u32 pc{}; // index of the instruction being compiled
+    std::optional<u32> delayed_pc; // NOTE(review): not referenced by macro_jit_x64.cpp
+
+    const std::vector<u32>& code; // held by reference: the vector must outlive this object
+    Engines::Maxwell3D& maxwell3d;
+};
+
+} // namespace Tegra
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -0,0 +1,358 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "core/core.h"
+#include "core/hle/kernel/memory/page_table.h"
+#include "core/hle/kernel/process.h"
+#include "core/memory.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
+
+namespace Tegra {
+
+/// Builds the GPU memory manager with a page table of page_table_size default-constructed
+/// entries. No rasterizer is bound yet; see BindRasterizer.
+MemoryManager::MemoryManager(Core::System& system_)
+    : system{system_}, page_table(page_table_size) {}
+
+MemoryManager::~MemoryManager() = default;
+
+/// Stores a pointer to the rasterizer that later receives the unmap, flush and invalidate
+/// notifications issued by this class. The reference must outlive those calls.
+void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
+    rasterizer = &rasterizer_;
+}
+
+/// Writes page_entry (advanced by the running offset) into every page table slot covering
+/// [gpu_addr, gpu_addr + size) and returns gpu_addr unchanged.
+GPUVAddr MemoryManager::UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) {
+    for (u64 offset{}; offset < size; offset += page_size) {
+        const u64 bytes_left{size - offset};
+        if (bytes_left >= page_size) {
+            // Full page: rely on SetPageEntry's default size.
+            SetPageEntry(gpu_addr + offset, page_entry + offset);
+            continue;
+        }
+        // Trailing partial page: forward the exact number of bytes that are left.
+        SetPageEntry(gpu_addr + offset, page_entry + offset, bytes_left);
+    }
+    return gpu_addr;
+}
+
+/// Maps size bytes starting at cpu_addr to the GPU range starting at gpu_addr and returns
+/// gpu_addr. The CPU address is implicitly converted into a PageEntry.
+GPUVAddr MemoryManager::Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size) {
+    return UpdateRange(gpu_addr, cpu_addr, size);
+}
+
+/// Finds a free GPU range of size bytes honoring align, maps cpu_addr into it and returns the
+/// chosen GPU address.
+GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align) {
+    // FindFreeRange returns std::nullopt when the address space is exhausted; assert on that
+    // instead of silently dereferencing an empty optional (mirrors MapAllocate32).
+    const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, align);
+    ASSERT(gpu_addr);
+    return Map(cpu_addr, *gpu_addr, size);
+}
+
+/// Same as MapAllocate, but the search for a free range begins at the low start address
+/// (FindFreeRange is called with start_32bit_address set) and uses an alignment of 1.
+GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) {
+    const auto gpu_addr{FindFreeRange(size, 1, true)};
+    ASSERT(gpu_addr);
+
+    return Map(cpu_addr, *gpu_addr, size);
+}
+
+/// Unmaps size bytes starting at gpu_addr. Empty ranges are ignored.
+void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
+    if (size == 0) {
+        return;
+    }
+
+    const auto cpu_addr{GpuToCpuAddress(gpu_addr)};
+    ASSERT(cpu_addr);
+
+    // Notify the rasterizer first so the flush and invalidation go through the GPU interface
+    // (asynchronous when possible), then mark the pages as unmapped.
+    rasterizer->UnmapMemory(*cpu_addr, size);
+    UpdateRange(gpu_addr, PageEntry::State::Unmapped, size);
+}
+
+/// Reserves [gpu_addr, gpu_addr + size) as Allocated. Returns std::nullopt, touching nothing,
+/// when any page in the range is not currently unmapped.
+std::optional<GPUVAddr> MemoryManager::AllocateFixed(GPUVAddr gpu_addr, std::size_t size) {
+    for (u64 offset{}; offset < size; offset += page_size) {
+        const bool is_free{GetPageEntry(gpu_addr + offset).IsUnmapped()};
+        if (is_free) {
+            continue;
+        }
+        return std::nullopt;
+    }
+
+    return UpdateRange(gpu_addr, PageEntry::State::Allocated, size);
+}
+
+/// Finds a free GPU range of size bytes honoring align, marks it as Allocated and returns its
+/// start address.
+GPUVAddr MemoryManager::Allocate(std::size_t size, std::size_t align) {
+    // Both steps can fail with std::nullopt; assert on each instead of dereferencing an empty
+    // optional (same convention as MapAllocate32).
+    const std::optional<GPUVAddr> free_range = FindFreeRange(size, align);
+    ASSERT(free_range);
+
+    const std::optional<GPUVAddr> gpu_addr = AllocateFixed(*free_range, size);
+    ASSERT(gpu_addr);
+    return *gpu_addr;
+}
+
+/// Locks the CPU range backing page_entry for device address space use. Entries that do not
+/// hold a CPU address (unmapped or allocated) are ignored.
+void MemoryManager::TryLockPage(PageEntry page_entry, std::size_t size) {
+    if (page_entry.IsValid()) {
+        ASSERT(system.CurrentProcess()
+                   ->PageTable()
+                   .LockForDeviceAddressSpace(page_entry.ToAddress(), size)
+                   .IsSuccess());
+    }
+}
+
+/// Releases the device address space lock on the CPU range backing page_entry. Entries that
+/// do not hold a CPU address (unmapped or allocated) are ignored.
+void MemoryManager::TryUnlockPage(PageEntry page_entry, std::size_t size) {
+    if (page_entry.IsValid()) {
+        ASSERT(system.CurrentProcess()
+                   ->PageTable()
+                   .UnlockForDeviceAddressSpace(page_entry.ToAddress(), size)
+                   .IsSuccess());
+    }
+}
+
+/// Unmaps a VIC frame. Everything except the texture cache is invalidated immediately; the
+/// texture cache invalidation is queued until InvalidateQueuedCaches runs.
+void MemoryManager::UnmapVicFrame(GPUVAddr gpu_addr, std::size_t size) {
+    if (size == 0) {
+        return;
+    }
+
+    const auto cpu_addr{GpuToCpuAddress(gpu_addr)};
+    ASSERT(cpu_addr);
+
+    auto& active_rasterizer{system.GPU().Renderer().Rasterizer()};
+    active_rasterizer.InvalidateExceptTextureCache(*cpu_addr, size);
+    cache_invalidate_queue.emplace_back(*cpu_addr, size);
+
+    UpdateRange(gpu_addr, PageEntry::State::Unmapped, size);
+}
+
+/// Invalidates the texture cache for every range queued by UnmapVicFrame, then empties the
+/// queue.
+void MemoryManager::InvalidateQueuedCaches() {
+    for (const auto& [cpu_addr, size] : cache_invalidate_queue) {
+        rasterizer->InvalidateTextureCache(cpu_addr, size);
+    }
+    cache_invalidate_queue.clear();
+}
+/// Returns a copy of the page table entry that covers gpu_addr.
+PageEntry MemoryManager::GetPageEntry(GPUVAddr gpu_addr) const {
+    return page_table[PageEntryIndex(gpu_addr)];
+}
+
+/// Overwrites the page table entry that covers gpu_addr with page_entry.
+/// The size parameter is only referenced by the disabled lock/unlock code below and is
+/// otherwise unused for now.
+void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) {
+    // TODO(bunnei): We should lock/unlock device regions. This currently causes issues due to
+    // improper tracking, but should be fixed in the future.
+
+    //// Unlock the old page
+    // TryUnlockPage(page_table[PageEntryIndex(gpu_addr)], size);
+
+    //// Lock the new page
+    // TryLockPage(page_entry, size);
+
+    page_table[PageEntryIndex(gpu_addr)] = page_entry;
+}
+
+/// Linearly scans the page table for the first run of unmapped pages of at least size bytes.
+/// @param size                Number of bytes required.
+/// @param align               Requested alignment; zero means page_size, anything else is
+///                            rounded up to a multiple of page_size.
+/// @param start_32bit_address Starts the scan at address_space_start_low instead of
+///                            address_space_start.
+/// @returns The start of the free run, or std::nullopt if the scan reaches address_space_size.
+std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align,
+                                                     bool start_32bit_address) const {
+    align = align ? Common::AlignUp(align, page_size) : page_size;
+
+    GPUVAddr gpu_addr{start_32bit_address ? address_space_start_low : address_space_start};
+    u64 available_size{};
+    while (gpu_addr + available_size < address_space_size) {
+        if (!GetPageEntry(gpu_addr + available_size).IsUnmapped()) {
+            // The candidate run is interrupted: restart just past the occupied page, rounded
+            // up to the next aligned address.
+            gpu_addr += available_size + page_size;
+            available_size = 0;
+
+            if (const auto misalignment{gpu_addr % align}; misalignment) {
+                gpu_addr += align - misalignment;
+            }
+            continue;
+        }
+
+        available_size += page_size;
+        if (available_size >= size) {
+            return gpu_addr;
+        }
+    }
+
+    return std::nullopt;
+}
+
+/// Translates gpu_addr into the CPU address it is mapped to, keeping the offset inside the
+/// page. Returns std::nullopt when the covering entry does not hold a CPU address.
+std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const {
+    const PageEntry entry = GetPageEntry(gpu_addr);
+    if (entry.IsValid()) {
+        return entry.ToAddress() + (gpu_addr & page_mask);
+    }
+    return std::nullopt;
+}
+
+/// Reads a T from GPU virtual memory at addr. An address that cannot be resolved to a host
+/// pointer hits UNREACHABLE() and yields a value-initialized T.
+template <typename T>
+T MemoryManager::Read(GPUVAddr addr) const {
+    const auto page_pointer{GetPointer(addr)};
+    if (!page_pointer) {
+        UNREACHABLE();
+
+        return {};
+    }
+
+    // NOTE: Avoid adding any extra logic to this fast path
+    T value;
+    std::memcpy(&value, page_pointer, sizeof(T));
+    return value;
+}
+
+/// Writes data to GPU virtual memory at addr. An address that cannot be resolved to a host
+/// pointer hits UNREACHABLE() and nothing is written.
+template <typename T>
+void MemoryManager::Write(GPUVAddr addr, T data) {
+    const auto page_pointer{GetPointer(addr)};
+    if (!page_pointer) {
+        UNREACHABLE();
+        return;
+    }
+
+    // NOTE: Avoid adding any extra logic to this fast path
+    std::memcpy(page_pointer, &data, sizeof(T));
+}
+
+// Explicit instantiations: Read and Write are defined in this translation unit, so the widths
+// listed here (8, 16, 32 and 64 bits) are the ones emitted from it.
+template u8 MemoryManager::Read<u8>(GPUVAddr addr) const;
+template u16 MemoryManager::Read<u16>(GPUVAddr addr) const;
+template u32 MemoryManager::Read<u32>(GPUVAddr addr) const;
+template u64 MemoryManager::Read<u64>(GPUVAddr addr) const;
+template void MemoryManager::Write<u8>(GPUVAddr addr, u8 data);
+template void MemoryManager::Write<u16>(GPUVAddr addr, u16 data);
+template void MemoryManager::Write<u32>(GPUVAddr addr, u32 data);
+template void MemoryManager::Write<u64>(GPUVAddr addr, u64 data);
+
+/// Returns a writable host pointer for gpu_addr, or nullptr when the address is not mapped to
+/// CPU memory.
+u8* MemoryManager::GetPointer(GPUVAddr gpu_addr) {
+    // GpuToCpuAddress already returns std::nullopt for entries that are not valid, so a
+    // separate IsValid() check here would only repeat the page table lookup.
+    const auto address{GpuToCpuAddress(gpu_addr)};
+    if (!address) {
+        return {};
+    }
+
+    return system.Memory().GetPointer(*address);
+}
+
+/// Returns a read-only host pointer for gpu_addr, or nullptr when the address is not mapped to
+/// CPU memory.
+const u8* MemoryManager::GetPointer(GPUVAddr gpu_addr) const {
+    // GpuToCpuAddress already returns std::nullopt for entries that are not valid, so a
+    // separate IsValid() check here would only repeat the page table lookup.
+    const auto address{GpuToCpuAddress(gpu_addr)};
+    if (!address) {
+        return {};
+    }
+
+    return system.Memory().GetPointer(*address);
+}
+
+/// Copies size bytes of GPU virtual memory starting at gpu_src_addr into dest_buffer, one GPU
+/// page at a time, flushing each mapped region through the rasterizer before it is read.
+/// Pages that are not mapped to CPU memory are returned as zeroes.
+void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const {
+    std::size_t remaining_size{size};
+    std::size_t page_index{gpu_src_addr >> page_bits};
+    std::size_t page_offset{gpu_src_addr & page_mask};
+
+    while (remaining_size > 0) {
+        // Never read past the end of the current GPU page.
+        const std::size_t copy_amount{
+            std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+        if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) {
+            const auto src_addr{*page_addr + page_offset};
+
+            // Flush must happen on the rasterizer interface, such that memory is always synchronous
+            // when it is read (even when in asynchronous GPU mode). Fixes Dead Cells title menu.
+            rasterizer->FlushRegion(src_addr, copy_amount);
+            system.Memory().ReadBlockUnsafe(src_addr, dest_buffer, copy_amount);
+        } else {
+            // Keep the output deterministic for unmapped pages instead of leaving whatever the
+            // caller's buffer happened to contain (same policy as ReadBlockUnsafe).
+            std::memset(dest_buffer, 0, copy_amount);
+        }
+
+        page_index++;
+        page_offset = 0;
+        dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
+        remaining_size -= copy_amount;
+    }
+}
+
+/// Copies size bytes of GPU virtual memory starting at gpu_src_addr into dest_buffer without
+/// notifying the rasterizer. Pages that are not mapped to CPU memory are returned as zeroes.
+void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
+                                    const std::size_t size) const {
+    std::size_t bytes_left{size};
+    std::size_t offset_in_page{gpu_src_addr & page_mask};
+
+    for (std::size_t page{gpu_src_addr >> page_bits}; bytes_left > 0; ++page) {
+        // Clamp each chunk to the end of the current GPU page.
+        const std::size_t page_capacity{static_cast<std::size_t>(page_size) - offset_in_page};
+        const std::size_t chunk{std::min(page_capacity, bytes_left)};
+
+        const auto cpu_page{GpuToCpuAddress(page << page_bits)};
+        if (cpu_page) {
+            system.Memory().ReadBlockUnsafe(*cpu_page + offset_in_page, dest_buffer, chunk);
+        } else {
+            std::memset(dest_buffer, 0, chunk);
+        }
+
+        // Only the first page can start at a non-zero offset.
+        offset_in_page = 0;
+        dest_buffer = static_cast<u8*>(dest_buffer) + chunk;
+        bytes_left -= chunk;
+    }
+}
+
+/// Copies size bytes from src_buffer into GPU virtual memory starting at gpu_dest_addr, one
+/// GPU page at a time, invalidating each mapped region through the rasterizer before it is
+/// written. Pages that are not mapped to CPU memory are skipped.
+void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size) {
+    std::size_t bytes_left{size};
+    std::size_t offset_in_page{gpu_dest_addr & page_mask};
+
+    for (std::size_t page{gpu_dest_addr >> page_bits}; bytes_left > 0; ++page) {
+        // Clamp each chunk to the end of the current GPU page.
+        const std::size_t page_capacity{static_cast<std::size_t>(page_size) - offset_in_page};
+        const std::size_t chunk{std::min(page_capacity, bytes_left)};
+
+        const auto cpu_page{GpuToCpuAddress(page << page_bits)};
+        if (cpu_page) {
+            const auto dest_addr{*cpu_page + offset_in_page};
+
+            // The invalidation goes through the rasterizer interface so that memory stays
+            // synchronous when it is written (even in asynchronous GPU mode).
+            rasterizer->InvalidateRegion(dest_addr, chunk);
+            system.Memory().WriteBlockUnsafe(dest_addr, src_buffer, chunk);
+        }
+
+        // Only the first page can start at a non-zero offset.
+        offset_in_page = 0;
+        src_buffer = static_cast<const u8*>(src_buffer) + chunk;
+        bytes_left -= chunk;
+    }
+}
+
+/// Copies size bytes from src_buffer into GPU virtual memory starting at gpu_dest_addr without
+/// notifying the rasterizer. Pages that are not mapped to CPU memory are skipped.
+void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer,
+                                     std::size_t size) {
+    std::size_t bytes_left{size};
+    std::size_t offset_in_page{gpu_dest_addr & page_mask};
+
+    for (std::size_t page{gpu_dest_addr >> page_bits}; bytes_left > 0; ++page) {
+        // Clamp each chunk to the end of the current GPU page.
+        const std::size_t page_capacity{static_cast<std::size_t>(page_size) - offset_in_page};
+        const std::size_t chunk{std::min(page_capacity, bytes_left)};
+
+        const auto cpu_page{GpuToCpuAddress(page << page_bits)};
+        if (cpu_page) {
+            system.Memory().WriteBlockUnsafe(*cpu_page + offset_in_page, src_buffer, chunk);
+        }
+
+        // Only the first page can start at a non-zero offset.
+        offset_in_page = 0;
+        src_buffer = static_cast<const u8*>(src_buffer) + chunk;
+        bytes_left -= chunk;
+    }
+}
+
+/// Copies size bytes between two GPU virtual ranges through a temporary host buffer, using
+/// ReadBlock for the source and WriteBlock for the destination.
+void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) {
+    std::vector<u8> staging(size);
+    u8* const staging_data{staging.data()};
+
+    ReadBlock(gpu_src_addr, staging_data, size);
+    WriteBlock(gpu_dest_addr, staging_data, size);
+}
+
+/// Copies size bytes between two GPU virtual ranges through a temporary host buffer, using
+/// ReadBlockUnsafe for the source and WriteBlockUnsafe for the destination.
+void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
+                                    std::size_t size) {
+    std::vector<u8> staging(size);
+    u8* const staging_data{staging.data()};
+
+    ReadBlockUnsafe(gpu_src_addr, staging_data, size);
+    WriteBlockUnsafe(gpu_dest_addr, staging_data, size);
+}
+
+/// Returns true when gpu_addr is mapped and the size bytes starting there end within the same
+/// CPU page (Core::Memory::PAGE_SIZE) as the translated start address.
+bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const {
+    if (const auto cpu_addr{GpuToCpuAddress(gpu_addr)}; cpu_addr) {
+        // Offset of the first byte past the range, measured from the start of its CPU page.
+        const std::size_t end_offset{(*cpu_addr & Core::Memory::PAGE_MASK) + size};
+        return end_offset <= Core::Memory::PAGE_SIZE;
+    }
+    return false;
+}
+
+} // namespace Tegra
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -0,0 +1,164 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <map>
+#include <optional>
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace VideoCore {
+class RasterizerInterface;
+}
+
+namespace Core {
+class System;
+}
+
+namespace Tegra {
+
+/// Compact 32-bit page table entry. It holds either one of the reserved State values or a CPU
+/// address shifted right by ShiftBits.
+class PageEntry final {
+public:
+    /// Reserved encodings that do not describe a CPU address.
+    enum class State : u32 {
+        Unmapped = static_cast<u32>(-1),
+        Allocated = static_cast<u32>(-2),
+    };
+
+    /// Creates an unmapped entry.
+    constexpr PageEntry() = default;
+    /// Creates an entry holding a reserved state.
+    constexpr PageEntry(State state_) : state{state_} {}
+    /// Creates an entry from a CPU address; the low ShiftBits bits are discarded.
+    constexpr PageEntry(VAddr addr) : state{static_cast<State>(addr >> ShiftBits)} {}
+
+    [[nodiscard]] constexpr bool IsUnmapped() const {
+        return State::Unmapped == state;
+    }
+
+    [[nodiscard]] constexpr bool IsAllocated() const {
+        return State::Allocated == state;
+    }
+
+    /// True when the entry stores a CPU address rather than a reserved state.
+    [[nodiscard]] constexpr bool IsValid() const {
+        return !(IsUnmapped() || IsAllocated());
+    }
+
+    /// Returns the stored CPU address, or zero for reserved states.
+    [[nodiscard]] constexpr VAddr ToAddress() const {
+        return IsValid() ? static_cast<VAddr>(state) << ShiftBits : VAddr{};
+    }
+
+    /// Returns an entry for the stored address advanced by offset bytes. Reserved states are
+    /// returned unchanged because offsets do not apply to them.
+    [[nodiscard]] constexpr PageEntry operator+(u64 offset) const {
+        if (IsValid()) {
+            return PageEntry{(static_cast<VAddr>(state) << ShiftBits) + offset};
+        }
+        return *this;
+    }
+
+private:
+    static constexpr std::size_t ShiftBits{12};
+
+    State state{State::Unmapped};
+};
+static_assert(sizeof(PageEntry) == 4, "PageEntry is too large");
+
+/// Software page table for the GPU virtual address space. Each entry covers one page of
+/// page_size bytes and is either unmapped, allocated (reserved) or mapped to a CPU address.
+class MemoryManager final {
+public:
+    explicit MemoryManager(Core::System& system_);
+    ~MemoryManager();
+
+    /// Binds a renderer to the memory manager.
+    void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
+
+    /// Translates a GPU virtual address into a CPU address; empty when the page is not mapped.
+    [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const;
+
+    /// Reads a value of type T from the given GPU virtual address.
+    template <typename T>
+    [[nodiscard]] T Read(GPUVAddr addr) const;
+
+    /// Writes a value of type T to the given GPU virtual address.
+    template <typename T>
+    void Write(GPUVAddr addr, T data);
+
+    /// Returns a host pointer for the given GPU virtual address, or nullptr when unmapped.
+    [[nodiscard]] u8* GetPointer(GPUVAddr addr);
+    [[nodiscard]] const u8* GetPointer(GPUVAddr addr) const;
+
+    /**
+     * ReadBlock and WriteBlock are full read and write operations over virtual
+     * GPU Memory. It's important to use these when GPU memory may not be contiguous
+     * in the Host Memory counterpart. Note: These functions cause Host GPU Memory
+     * Flushes and Invalidations, respectively.
+     */
+    void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
+
+    /**
+     * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
+     * WriteBlock respectively. In these versions, no flushing or invalidation is actually
+     * done and their performance is similar to a memcpy. These functions can be used
+     * in either of these 2 scenarios instead of their safe counterparts:
+     * - Memory which is sure to never be represented in the Host GPU.
+     * - Memory Managed by a Cache Manager. Example: Texture Flushing should use
+     * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture
+     * being flushed.
+     */
+    void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
+
+    /**
+     * IsGranularRange checks if a gpu region can be simply read with a pointer.
+     */
+    [[nodiscard]] bool IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const;
+
+    /// Maps a CPU range to the given GPU address and returns that GPU address.
+    [[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size);
+    /// Maps a CPU range to a free GPU range chosen by the manager.
+    [[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align);
+    /// Like MapAllocate, but the search for a free range starts at address_space_start_low.
+    [[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size);
+    /// Reserves the given GPU range; empty when part of it is not unmapped.
+    [[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size);
+    /// Reserves a free GPU range chosen by the manager.
+    [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align);
+    /// Notifies the rasterizer and marks the given GPU range as unmapped.
+    void Unmap(GPUVAddr gpu_addr, std::size_t size);
+
+    /**
+     * Some Decoded NVDEC frames require that texture cache does not get invalidated.
+     * UnmapVicFrame defers the texture cache invalidation until the stream ends
+     * by invoking InvalidateQueuedCaches to invalidate all frame texture caches.
+     */
+    void UnmapVicFrame(GPUVAddr gpu_addr, std::size_t size);
+    void InvalidateQueuedCaches();
+
+private:
+    [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const;
+    void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size);
+    /// Applies page_entry (advanced per page) to every page in the range; returns gpu_addr.
+    GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size);
+    /// Searches for a run of unmapped pages of at least size bytes; empty when none is found.
+    [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align,
+                                                        bool start_32bit_address = false) const;
+
+    void TryLockPage(PageEntry page_entry, std::size_t size);
+    void TryUnlockPage(PageEntry page_entry, std::size_t size);
+
+    /// Index of the page table slot covering gpu_addr; wraps at page_table_size entries.
+    [[nodiscard]] static constexpr std::size_t PageEntryIndex(GPUVAddr gpu_addr) {
+        return (gpu_addr >> page_bits) & page_table_mask;
+    }
+
+    // Address space layout: a 2^40 byte space split into 2^24 pages of 2^16 bytes each.
+    static constexpr u64 address_space_size = 1ULL << 40;
+    static constexpr u64 address_space_start = 1ULL << 32;    // Default start of free searches.
+    static constexpr u64 address_space_start_low = 1ULL << 16; // Start used by MapAllocate32.
+    static constexpr u64 page_bits{16};
+    static constexpr u64 page_size{1 << page_bits};
+    static constexpr u64 page_mask{page_size - 1};
+    static constexpr u64 page_table_bits{24};
+    static constexpr u64 page_table_size{1 << page_table_bits};
+    static constexpr u64 page_table_mask{page_table_size - 1};
+
+    Core::System& system;
+
+    VideoCore::RasterizerInterface* rasterizer = nullptr; // Set by BindRasterizer.
+
+    std::vector<PageEntry> page_table;
+    // CPU ranges whose texture cache invalidation was deferred by UnmapVicFrame.
+    std::vector<std::pair<VAddr, std::size_t>> cache_invalidate_queue;
+};
+
+} // namespace Tegra
--- a/src/video_core/morton.cpp
+++ b/src/video_core/morton.cpp
--- a/src/video_core/morton.h
+++ b/src/video_core/morton.h
--- a/src/video_core/query_cache.h
+++ b/src/video_core/query_cache.h
@@ -0,0 +1,399 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <iterator>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "common/assert.h"
+#include "core/core.h"
+#include "core/settings.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+/// Tracks the currently running host counter of one query type together with the counter that
+/// preceded it, so consecutive counters can be chained through the cache.
+template <class QueryCache, class HostCounter>
+class CounterStreamBase {
+public:
+    explicit CounterStreamBase(QueryCache& cache_, VideoCore::QueryType type_)
+        : cache{cache_}, type{type_} {}
+
+    /// Updates the state of the stream, enabling or disabling as needed.
+    void Update(bool enabled) {
+        if (!enabled) {
+            Disable();
+            return;
+        }
+        Enable();
+    }
+
+    /// Resets the stream to zero. It doesn't disable the query after resetting.
+    void Reset() {
+        if (IsEnabled()) {
+            current->EndQuery();
+
+            // Start a fresh counter right away, without a dependency, so the stream stays
+            // enabled.
+            current = cache.Counter(nullptr, type);
+        }
+        last.reset();
+    }
+
+    /// Returns the current counter slicing as needed.
+    std::shared_ptr<HostCounter> Current() {
+        if (!IsEnabled()) {
+            return nullptr;
+        }
+        // Close the running counter, hand it out, and continue with a new one chained to it.
+        current->EndQuery();
+        last = std::move(current);
+        current = cache.Counter(last, type);
+        return last;
+    }
+
+    /// Returns true when the counter stream is enabled.
+    bool IsEnabled() const {
+        return current != nullptr;
+    }
+
+private:
+    /// Enables the stream.
+    void Enable() {
+        if (!current) {
+            current = cache.Counter(last, type);
+        }
+    }
+
+    /// Disables the stream, remembering the counter that was running (if any) as the last one.
+    void Disable() {
+        if (current) {
+            current->EndQuery();
+        }
+        last = std::move(current);
+        current.reset();
+    }
+
+    QueryCache& cache;
+    const VideoCore::QueryType type;
+
+    std::shared_ptr<HostCounter> current; ///< Counter being recorded; null when disabled.
+    std::shared_ptr<HostCounter> last;    ///< Previously sliced or disabled counter.
+};
+
+/// Backend-independent cache of GPU queries written to guest memory. Cached queries are
+/// bucketed by the 4 KiB page (PAGE_BITS) of their CPU address.
+template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter>
+class QueryCacheBase {
+public:
+    explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_,
+                            Tegra::Engines::Maxwell3D& maxwell3d_,
+                            Tegra::MemoryManager& gpu_memory_)
+        : rasterizer{rasterizer_}, maxwell3d{maxwell3d_},
+          gpu_memory{gpu_memory_}, streams{{CounterStream{static_cast<QueryCache&>(*this),
+                                                          VideoCore::QueryType::SamplesPassed}}} {}
+
+    /// Flushes and drops every cached query overlapping [addr, addr + size).
+    void InvalidateRegion(VAddr addr, std::size_t size) {
+        std::unique_lock lock{mutex};
+        FlushAndRemoveRegion(addr, size);
+    }
+
+    /// Flushes and drops every cached query overlapping [addr, addr + size).
+    void FlushRegion(VAddr addr, std::size_t size) {
+        std::unique_lock lock{mutex};
+        FlushAndRemoveRegion(addr, size);
+    }
+
+    /**
+     * Records a query in GPU mapped memory, potentially marked with a timestamp.
+     * @param gpu_addr  GPU address to flush to when the mapped memory is read.
+     * @param type      Query type, e.g. SamplesPassed.
+     * @param timestamp Timestamp, when empty the flushed query is assumed to be short.
+     */
+    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) {
+        std::unique_lock lock{mutex};
+        const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+        // Bail out before the first dereference: every use of *cpu_addr below requires the
+        // GPU address to be mapped.
+        ASSERT_OR_EXECUTE(cpu_addr, return;);
+
+        CachedQuery* query = TryGet(*cpu_addr);
+        if (!query) {
+            u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
+
+            query = Register(type, *cpu_addr, host_ptr, timestamp.has_value());
+        }
+
+        query->BindCounter(Stream(type).Current(), timestamp);
+        if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
+            AsyncFlushQuery(*cpu_addr);
+        }
+    }
+
+    /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch.
+    void UpdateCounters() {
+        std::unique_lock lock{mutex};
+        const auto& regs = maxwell3d.regs;
+        Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable);
+    }
+
+    /// Resets a counter to zero. It doesn't disable the query after resetting.
+    void ResetCounter(VideoCore::QueryType type) {
+        std::unique_lock lock{mutex};
+        Stream(type).Reset();
+    }
+
+    /// Disable all active streams. Expected to be called at the end of a command buffer.
+    void DisableStreams() {
+        std::unique_lock lock{mutex};
+        for (auto& stream : streams) {
+            stream.Update(false);
+        }
+    }
+
+    /// Returns a new host counter.
+    std::shared_ptr<HostCounter> Counter(std::shared_ptr<HostCounter> dependency,
+                                         VideoCore::QueryType type) {
+        return std::make_shared<HostCounter>(static_cast<QueryCache&>(*this), std::move(dependency),
+                                             type);
+    }
+
+    /// Returns the counter stream of the specified type.
+    CounterStream& Stream(VideoCore::QueryType type) {
+        return streams[static_cast<std::size_t>(type)];
+    }
+
+    /// Returns the counter stream of the specified type.
+    const CounterStream& Stream(VideoCore::QueryType type) const {
+        return streams[static_cast<std::size_t>(type)];
+    }
+
+    /// Moves the pending set of asynchronous flushes (possibly null) to the committed queue.
+    void CommitAsyncFlushes() {
+        committed_flushes.push_back(uncommitted_flushes);
+        uncommitted_flushes.reset();
+    }
+
+    /// Returns true when at least one query was queued since the last commit.
+    bool HasUncommittedFlushes() const {
+        return uncommitted_flushes != nullptr;
+    }
+
+    /// Returns true when the oldest committed batch contains queries to flush.
+    bool ShouldWaitAsyncFlushes() const {
+        if (committed_flushes.empty()) {
+            return false;
+        }
+        return committed_flushes.front() != nullptr;
+    }
+
+    /// Flushes and removes the oldest committed batch, if there is one.
+    void PopAsyncFlushes() {
+        if (committed_flushes.empty()) {
+            return;
+        }
+        auto& flush_list = committed_flushes.front();
+        if (!flush_list) {
+            committed_flushes.pop_front();
+            return;
+        }
+        for (VAddr query_address : *flush_list) {
+            FlushAndRemoveRegion(query_address, 4);
+        }
+        committed_flushes.pop_front();
+    }
+
+private:
+    /// Flushes a memory range to guest memory and removes it from the cache.
+    void FlushAndRemoveRegion(VAddr addr, std::size_t size) {
+        const u64 addr_begin = static_cast<u64>(addr);
+        const u64 addr_end = addr_begin + static_cast<u64>(size);
+        // Half-open interval overlap test between the requested range and a cached query.
+        const auto in_range = [addr_begin, addr_end](CachedQuery& query) {
+            const u64 cache_begin = query.GetCpuAddr();
+            const u64 cache_end = cache_begin + query.SizeInBytes();
+            return cache_begin < addr_end && addr_begin < cache_end;
+        };
+
+        const u64 page_end = addr_end >> PAGE_BITS;
+        for (u64 page = addr_begin >> PAGE_BITS; page <= page_end; ++page) {
+            const auto it = cached_queries.find(page);
+            if (it == std::end(cached_queries)) {
+                continue;
+            }
+            auto& contents = it->second;
+            // First pass flushes the matching queries, second pass erases them.
+            for (auto& query : contents) {
+                if (!in_range(query)) {
+                    continue;
+                }
+                rasterizer.UpdatePagesCachedCount(query.GetCpuAddr(), query.SizeInBytes(), -1);
+                query.Flush();
+            }
+            contents.erase(std::remove_if(std::begin(contents), std::end(contents), in_range),
+                           std::end(contents));
+        }
+    }
+
+    /// Registers the passed parameters as cached and returns a pointer to the stored cached query.
+    CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) {
+        rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1);
+        const u64 page = static_cast<u64>(cpu_addr) >> PAGE_BITS;
+        return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr,
+                                                  host_ptr);
+    }
+
+    /// Tries to get a cached query. Returns nullptr on failure.
+    CachedQuery* TryGet(VAddr addr) {
+        const u64 page = static_cast<u64>(addr) >> PAGE_BITS;
+        const auto it = cached_queries.find(page);
+        if (it == std::end(cached_queries)) {
+            return nullptr;
+        }
+        auto& contents = it->second;
+        const auto found = std::find_if(std::begin(contents), std::end(contents),
+                                        [addr](auto& query) { return query.GetCpuAddr() == addr; });
+        return found != std::end(contents) ? &*found : nullptr;
+    }
+
+    /// Queues a query address for the next CommitAsyncFlushes, creating the set on first use.
+    void AsyncFlushQuery(VAddr addr) {
+        if (!uncommitted_flushes) {
+            uncommitted_flushes = std::make_shared<std::unordered_set<VAddr>>();
+        }
+        uncommitted_flushes->insert(addr);
+    }
+
+    static constexpr std::uintptr_t PAGE_SIZE = 4096;
+    static constexpr unsigned PAGE_BITS = 12;
+
+    VideoCore::RasterizerInterface& rasterizer;
+    Tegra::Engines::Maxwell3D& maxwell3d;
+    Tegra::MemoryManager& gpu_memory;
+
+    std::recursive_mutex mutex;
+
+    /// Cached queries keyed by CPU page number (cpu_addr >> PAGE_BITS).
+    std::unordered_map<u64, std::vector<CachedQuery>> cached_queries;
+
+    std::array<CounterStream, VideoCore::NumQueryTypes> streams;
+
+    /// Query addresses queued since the last commit; null when nothing was queued.
+    std::shared_ptr<std::unordered_set<VAddr>> uncommitted_flushes{};
+    /// Committed batches in FIFO order; a null entry is a commit with nothing to flush.
+    std::list<std::shared_ptr<std::unordered_set<VAddr>>> committed_flushes;
+};
+
+/// Base class for backend host counters. A counter may depend on a previous counter, whose
+/// value is added to its own when the result is queried.
+template <class QueryCache, class HostCounter>
+class HostCounterBase {
+public:
+    explicit HostCounterBase(std::shared_ptr<HostCounter> dependency_)
+        : dependency{std::move(dependency_)}, depth{dependency ? (dependency->Depth() + 1) : 0} {
+        // Avoid nesting too many dependencies to avoid a stack overflow when these are deleted.
+        constexpr u64 depth_threshold = 96;
+        if (depth <= depth_threshold) {
+            return;
+        }
+        // Too deep: resolve the chain now and keep only its accumulated value.
+        base_result = dependency->Query();
+        dependency = nullptr;
+        depth = 0;
+    }
+    virtual ~HostCounterBase() = default;
+
+    /// Returns the current value of the query. The value is computed once and then cached.
+    u64 Query() {
+        if (!result) {
+            u64 total = BlockingQuery() + base_result;
+            if (dependency) {
+                // Fold in the dependency and release it; its value is now part of the result.
+                total += dependency->Query();
+                dependency = nullptr;
+            }
+            result = total;
+        }
+        return *result;
+    }
+
+    /// Returns true when flushing this query will potentially wait.
+    // NOTE(review): this returns true once the result has already been cached, which reads as
+    // the opposite of the description above - confirm the intended polarity against callers.
+    bool WaitPending() const noexcept {
+        return result.has_value();
+    }
+
+    /// Returns the number of nested dependencies behind this counter.
+    u64 Depth() const noexcept {
+        return depth;
+    }
+
+protected:
+    /// Returns the value of query from the backend API blocking as needed.
+    virtual u64 BlockingQuery() const = 0;
+
+private:
+    std::shared_ptr<HostCounter> dependency; ///< Counter to add to this value.
+    std::optional<u64> result;               ///< Filled with the already returned value.
+    u64 depth;                               ///< Number of nested dependencies.
+    u64 base_result = 0;                     ///< Equivalent to nested dependencies value.
+};
+
+/// Base class for a query cached at a guest CPU address. It owns the host counter providing
+/// the value and knows how to write that value (and an optional timestamp) to host memory.
+template <class HostCounter>
+class CachedQueryBase {
+public:
+    explicit CachedQueryBase(VAddr cpu_addr_, u8* host_ptr_)
+        : cpu_addr{cpu_addr_}, host_ptr{host_ptr_} {}
+    virtual ~CachedQueryBase() = default;
+
+    // Move-only type.
+    CachedQueryBase(CachedQueryBase&&) noexcept = default;
+    CachedQueryBase(const CachedQueryBase&) = delete;
+
+    CachedQueryBase& operator=(CachedQueryBase&&) noexcept = default;
+    CachedQueryBase& operator=(const CachedQueryBase&) = delete;
+
+    /// Flushes the query to guest memory.
+    virtual void Flush() {
+        // A null counter means the query has just been reset; a zero is written in that case.
+        u64 value = 0;
+        if (counter) {
+            value = counter->Query();
+        }
+        std::memcpy(host_ptr, &value, sizeof(u64));
+
+        if (!timestamp) {
+            return;
+        }
+        // Large queries store the timestamp right after the counter value.
+        const u64 stamp = *timestamp;
+        std::memcpy(host_ptr + TIMESTAMP_OFFSET, &stamp, sizeof(u64));
+    }
+
+    /// Binds a counter to this query.
+    void BindCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) {
+        // An existing counter means the game is rewriting this query; flush the old value
+        // first so it is not lost.
+        if (counter) {
+            Flush();
+        }
+        timestamp = timestamp_;
+        counter = std::move(counter_);
+    }
+
+    VAddr GetCpuAddr() const noexcept {
+        return cpu_addr;
+    }
+
+    /// Size of this query in guest memory, depending on whether it carries a timestamp.
+    u64 SizeInBytes() const noexcept {
+        return SizeInBytes(timestamp.has_value());
+    }
+
+    static constexpr u64 SizeInBytes(bool with_timestamp) noexcept {
+        if (with_timestamp) {
+            return LARGE_QUERY_SIZE;
+        }
+        return SMALL_QUERY_SIZE;
+    }
+
+protected:
+    /// Forwards to the bound counter's WaitPending(); false when no counter is bound.
+    bool WaitPending() const noexcept {
+        return counter && counter->WaitPending();
+    }
+
+private:
+    static constexpr std::size_t SMALL_QUERY_SIZE = 8;   // Query size without timestamp.
+    static constexpr std::size_t LARGE_QUERY_SIZE = 16;  // Query size with timestamp.
+    static constexpr std::intptr_t TIMESTAMP_OFFSET = 8; // Timestamp offset in a large query.
+
+    VAddr cpu_addr;                       ///< Guest CPU address.
+    u8* host_ptr;                         ///< Writable host pointer.
+    std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree.
+    std::optional<u64> timestamp;         ///< Timestamp to flush to guest memory.
+};
+
+} // namespace VideoCommon
--- a/src/video_core/rasterizer_accelerated.cpp
+++ b/src/video_core/rasterizer_accelerated.cpp
@@ -0,0 +1,65 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <mutex>
+
+#include <boost/icl/interval_map.hpp>
+#include <boost/range/iterator_range.hpp>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "core/memory.h"
+#include "video_core/rasterizer_accelerated.h"
+
+namespace VideoCore {
+
+namespace {
+
+/// Wraps the iterator pair returned by map.equal_range(interval) in a boost iterator range so
+/// that it can be walked with a range-based for loop.
+template <typename Map, typename Interval>
+constexpr auto RangeFromInterval(Map& map, const Interval& interval) {
+    return boost::make_iterator_range(map.equal_range(interval));
+}
+
+} // Anonymous namespace
+
+/// Stores a reference to the CPU memory object; it is used later to mark regions as cached.
+RasterizerAccelerated::RasterizerAccelerated(Core::Memory::Memory& cpu_memory_)
+    : cpu_memory{cpu_memory_} {}
+
+// Defaulted destructor, defined out of line.
+RasterizerAccelerated::~RasterizerAccelerated() = default;
+
+/// Adds delta to the cached-object count of every page touched by [addr, addr + size) and
+/// tells CPU memory which page ranges became cached or stop being cached.
+void RasterizerAccelerated::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
+    std::lock_guard lock{pages_mutex};
+
+    // Pages [first_page, end_page) cover the byte range; the end is rounded up to a full page.
+    const u64 first_page = addr >> Core::Memory::PAGE_BITS;
+    const u64 end_page = (addr + size + Core::Memory::PAGE_SIZE - 1) >> Core::Memory::PAGE_BITS;
+    const auto touched_pages = CachedPageMap::interval_type::right_open(first_page, end_page);
+
+    // Interval maps erase segments whose count reaches 0, so increments are applied before
+    // walking the segments and decrements only afterwards.
+    if (delta > 0) {
+        cached_pages.add({touched_pages, delta});
+    }
+
+    for (const auto& [segment, count] : RangeFromInterval(cached_pages, touched_pages)) {
+        // Clip the stored segment to the pages touched by this call.
+        const auto clipped = segment & touched_pages;
+
+        const VAddr begin_addr = boost::icl::first(clipped) << Core::Memory::PAGE_BITS;
+        const VAddr end_addr = boost::icl::last_next(clipped) << Core::Memory::PAGE_BITS;
+        const u64 region_size = end_addr - begin_addr;
+
+        // The count went from 0 to delta (already added above).
+        const bool became_cached = delta > 0 && count == delta;
+        // The count will drop to 0 once delta is added below.
+        const bool becomes_uncached = delta < 0 && count == -delta;
+
+        if (became_cached || becomes_uncached) {
+            cpu_memory.RasterizerMarkRegionCached(begin_addr, region_size, became_cached);
+        } else {
+            ASSERT(count >= 0);
+        }
+    }
+
+    if (delta < 0) {
+        cached_pages.add({touched_pages, delta});
+    }
+}
+
+} // namespace VideoCore
--- a/src/video_core/rasterizer_accelerated.h
+++ b/src/video_core/rasterizer_accelerated.h
@@ -0,0 +1,36 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <mutex>
+
+#include <boost/icl/interval_map.hpp>
+
+#include "common/common_types.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace Core::Memory {
+class Memory;
+}
+
+namespace VideoCore {
+
+/// Implements the shared part in GPU accelerated rasterizers in RasterizerInterface.
+/// It keeps a per-page count of cached objects and forwards cached/uncached transitions to
+/// the CPU memory object.
+class RasterizerAccelerated : public RasterizerInterface {
+public:
+    explicit RasterizerAccelerated(Core::Memory::Memory& cpu_memory_);
+    ~RasterizerAccelerated() override;
+
+    /// Adds delta to the count of every page touched by [addr, addr + size).
+    void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) override;
+
+private:
+    /// Maps intervals of page indices to a count.
+    using CachedPageMap = boost::icl::interval_map<u64, int>;
+    CachedPageMap cached_pages;
+    /// Locked by UpdatePagesCachedCount before it touches cached_pages.
+    std::mutex pages_mutex;
+
+    Core::Memory::Memory& cpu_memory;
+};
+
+} // namespace VideoCore
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -0,0 +1,140 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <atomic>
+#include <functional>
+#include <optional>
+#include "common/common_types.h"
+#include "video_core/engines/fermi_2d.h"
+#include "video_core/gpu.h"
+#include "video_core/guest_driver.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace VideoCore {
+
+/// Kinds of query accepted by RasterizerInterface::ResetCounter and RasterizerInterface::Query.
+enum class QueryType {
+    SamplesPassed,
+};
+/// Matches the number of enumerators currently declared in QueryType.
+constexpr std::size_t NumQueryTypes = 1;
+
+/// Stage value passed as the first argument of a DiskResourceLoadCallback.
+enum class LoadCallbackStage {
+    Prepare,
+    Build,
+    Complete,
+};
+/// Callback type taken by RasterizerInterface::LoadDiskResources.
+/// NOTE(review): the two std::size_t arguments look like a progress value and a total; confirm
+/// against the callers.
+using DiskResourceLoadCallback = std::function<void(LoadCallbackStage, std::size_t, std::size_t)>;
+
+/// Abstract interface that every rasterizer backend implements.
+/// Most members are pure virtual; the few with a body here provide a do-nothing default.
+class RasterizerInterface {
+public:
+    virtual ~RasterizerInterface() = default;
+
+    /// Dispatches a draw invocation
+    virtual void Draw(bool is_indexed, bool is_instanced) = 0;
+
+    /// Clear the current framebuffer
+    virtual void Clear() = 0;
+
+    /// Dispatches a compute shader invocation
+    virtual void DispatchCompute(GPUVAddr code_addr) = 0;
+
+    /// Resets the counter of a query
+    virtual void ResetCounter(QueryType type) = 0;
+
+    /// Records a GPU query and caches it
+    virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0;
+
+    /// Signal a GPU based semaphore as a fence
+    virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0;
+
+    /// Signal a GPU based syncpoint as a fence
+    virtual void SignalSyncPoint(u32 value) = 0;
+
+    /// Release all pending fences.
+    virtual void ReleaseFences() = 0;
+
+    /// Notify rasterizer that all caches should be flushed to Switch memory
+    virtual void FlushAll() = 0;
+
+    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
+    virtual void FlushRegion(VAddr addr, u64 size) = 0;
+
+    /// Notify rasterizer to invalidate the caches of the specified region, leaving the texture
+    /// cache untouched.
+    /// NOTE(review): inferred from the method name; the previous comment described a texture
+    /// cache flush. Confirm against the implementations.
+    virtual void InvalidateExceptTextureCache(VAddr addr, u64 size) = 0;
+
+    /// Notify rasterizer to invalidate the texture cache
+    virtual void InvalidateTextureCache(VAddr addr, u64 size) = 0;
+
+    /// Check if the specified memory area requires flushing to CPU Memory.
+    virtual bool MustFlushRegion(VAddr addr, u64 size) = 0;
+
+    /// Notify rasterizer that any caches of the specified region should be invalidated
+    virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
+
+    /// Notify rasterizer that any caches of the specified region are out of sync with the guest
+    virtual void OnCPUWrite(VAddr addr, u64 size) = 0;
+
+    /// Sync memory between guest and host.
+    virtual void SyncGuestHost() = 0;
+
+    /// Unmap memory range
+    virtual void UnmapMemory(VAddr addr, u64 size) = 0;
+
+    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
+    /// and invalidated
+    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
+
+    /// Notify the host renderer to wait for previous primitive and compute operations.
+    virtual void WaitForIdle() = 0;
+
+    /// Notify the host renderer to wait for reads and writes to render targets and flush caches.
+    virtual void FragmentBarrier() = 0;
+
+    /// Notify the host renderer to make available previous render target writes.
+    virtual void TiledCacheBarrier() = 0;
+
+    /// Notify the rasterizer to send all written commands to the host GPU.
+    virtual void FlushCommands() = 0;
+
+    /// Notify rasterizer that a frame is about to finish
+    virtual void TickFrame() = 0;
+
+    /// Attempt to use a faster method to perform a surface copy.
+    /// The default implementation does nothing and returns false.
+    [[nodiscard]] virtual bool AccelerateSurfaceCopy(
+        const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst,
+        const Tegra::Engines::Fermi2D::Config& copy_config) {
+        return false;
+    }
+
+    /// Attempt to use a faster method to display the framebuffer to screen.
+    /// The default implementation does nothing and returns false.
+    [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
+                                                 VAddr framebuffer_addr, u32 pixel_stride) {
+        return false;
+    }
+
+    /// Increase/decrease the number of objects in pages touching the specified region.
+    /// The default implementation does nothing.
+    virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {}
+
+    /// Initialize disk cached resources for the game being emulated.
+    /// The default implementation does nothing.
+    virtual void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading,
+                                   const DiskResourceLoadCallback& callback) {}
+
+    /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver.
+    [[nodiscard]] GuestDriverProfile& AccessGuestDriverProfile() {
+        return guest_driver_profile;
+    }
+
+    /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver.
+    [[nodiscard]] const GuestDriverProfile& AccessGuestDriverProfile() const {
+        return guest_driver_profile;
+    }
+
+private:
+    GuestDriverProfile guest_driver_profile{};
+};
+} // namespace VideoCore
--- a/src/video_core/renderer_base.cpp
+++ b/src/video_core/renderer_base.cpp
@@ -0,0 +1,45 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/logging/log.h"
+#include "core/frontend/emu_window.h"
+#include "core/settings.h"
+#include "video_core/renderer_base.h"
+
+namespace VideoCore {
+
+/// Stores the window reference, takes ownership of the graphics context and performs an initial
+/// settings refresh.
+RendererBase::RendererBase(Core::Frontend::EmuWindow& window_,
+                           std::unique_ptr<Core::Frontend::GraphicsContext> context_)
+    : render_window{window_}, context{std::move(context_)} {
+    RefreshBaseSettings();
+}
+
+// Defaulted destructor, defined out of line.
+RendererBase::~RendererBase() = default;
+
+/// Updates the framebuffer layout and re-reads the settings shared by all renderers.
+void RendererBase::RefreshBaseSettings() {
+    UpdateCurrentFramebufferLayout();
+
+    const bool frame_limit_enabled = Settings::values.use_frame_limit.GetValue();
+    renderer_settings.use_framelimiter.store(frame_limit_enabled);
+    renderer_settings.set_background_color.store(true);
+}
+
+/// Re-applies the render window's current framebuffer width and height to the window.
+void RendererBase::UpdateCurrentFramebufferLayout() {
+    const auto& current_layout = render_window.GetFramebufferLayout();
+    render_window.UpdateCurrentFramebufferLayout(current_layout.width, current_layout.height);
+}
+
+void RendererBase::RequestScreenshot(void* data, std::function<void()> callback,
+                                     const Layout::FramebufferLayout& layout) {
+    if (renderer_settings.screenshot_requested) {
+        LOG_ERROR(Render, "A screenshot is already requested or in progress, ignoring the request");
+        return;
+    }
+    renderer_settings.screenshot_bits = data;
+    renderer_settings.screenshot_complete_callback = std::move(callback);
+    renderer_settings.screenshot_framebuffer_layout = layout;
+    renderer_settings.screenshot_requested = true;
+}
+
+} // namespace VideoCore
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -0,0 +1,113 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <optional>
+
+#include "common/common_types.h"
+#include "core/frontend/emu_window.h"
+#include "video_core/gpu.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace Core::Frontend {
+class EmuWindow;
+class GraphicsContext;
+} // namespace Core::Frontend
+
+namespace VideoCore {
+
+/// Settings shared by all renderers, plus the state of a pending screenshot request.
+struct RendererSettings {
+    std::atomic_bool use_framelimiter{false};
+    std::atomic_bool set_background_color{false};
+
+    // Screenshot
+    // The fields below are filled in by RendererBase::RequestScreenshot.
+    std::atomic<bool> screenshot_requested{false}; // Set last, after the other fields.
+    void* screenshot_bits{};                       // The `data` pointer given by the requester.
+    std::function<void()> screenshot_complete_callback;
+    Layout::FramebufferLayout screenshot_framebuffer_layout;
+};
+
+/// Base class of the renderer backends. It holds the render window reference, the graphics
+/// context, the rasterizer and the settings shared by all renderers.
+class RendererBase : NonCopyable {
+public:
+    explicit RendererBase(Core::Frontend::EmuWindow& window,
+                          std::unique_ptr<Core::Frontend::GraphicsContext> context);
+    virtual ~RendererBase();
+
+    /// Initialize the renderer
+    [[nodiscard]] virtual bool Init() = 0;
+
+    /// Shutdown the renderer
+    virtual void ShutDown() = 0;
+
+    /// Finalize rendering the guest frame and draw into the presentation texture
+    virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
+
+    // Getter/setter functions:
+    // ------------------------
+
+    /// Returns the framerate last stored in m_current_fps.
+    [[nodiscard]] f32 GetCurrentFPS() const {
+        return m_current_fps;
+    }
+
+    /// Returns the frame number last stored in m_current_frame.
+    [[nodiscard]] int GetCurrentFrame() const {
+        return m_current_frame;
+    }
+
+    /// Returns the rasterizer. Dereferences `rasterizer`, which is not assigned by this class's
+    /// constructor, so it must have been set before calling this.
+    [[nodiscard]] RasterizerInterface& Rasterizer() {
+        return *rasterizer;
+    }
+
+    [[nodiscard]] const RasterizerInterface& Rasterizer() const {
+        return *rasterizer;
+    }
+
+    /// Returns the graphics context handed to the constructor.
+    [[nodiscard]] Core::Frontend::GraphicsContext& Context() {
+        return *context;
+    }
+
+    [[nodiscard]] const Core::Frontend::GraphicsContext& Context() const {
+        return *context;
+    }
+
+    /// Returns the render window handed to the constructor.
+    [[nodiscard]] Core::Frontend::EmuWindow& GetRenderWindow() {
+        return render_window;
+    }
+
+    [[nodiscard]] const Core::Frontend::EmuWindow& GetRenderWindow() const {
+        return render_window;
+    }
+
+    /// Returns the settings shared by all renderers.
+    [[nodiscard]] RendererSettings& Settings() {
+        return renderer_settings;
+    }
+
+    [[nodiscard]] const RendererSettings& Settings() const {
+        return renderer_settings;
+    }
+
+    /// Refreshes the settings common to all renderers
+    void RefreshBaseSettings();
+
+    /// Request a screenshot of the next frame
+    void RequestScreenshot(void* data, std::function<void()> callback,
+                           const Layout::FramebufferLayout& layout);
+
+protected:
+    Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle.
+    std::unique_ptr<RasterizerInterface> rasterizer;
+    std::unique_ptr<Core::Frontend::GraphicsContext> context;
+    f32 m_current_fps = 0.0f; ///< Current framerate, should be set by the renderer
+    int m_current_frame = 0;  ///< Current frame, should be set by the renderer
+
+    RendererSettings renderer_settings;
+
+private:
+    /// Updates the framebuffer layout of the contained render window handle.
+    void UpdateCurrentFramebufferLayout();
+};
+
+} // namespace VideoCore
--- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
--- a/src/video_core/renderer_opengl/gl_arb_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h
@@ -0,0 +1,29 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <string>
+#include <string_view>
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+enum class ShaderType : u32;
+}
+
+namespace VideoCommon::Shader {
+class ShaderIR;
+class Registry;
+} // namespace VideoCommon::Shader
+
+namespace OpenGL {
+
+class Device;
+
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                                    const VideoCommon::Shader::Registry& registry,
+                                    Tegra::Engines::ShaderType stage, std::string_view identifier);
+
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -0,0 +1,99 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <memory>
+
+#include <glad/glad.h>
+
+#include "common/assert.h"
+#include "common/microprofile.h"
+#include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_opengl/gl_buffer_cache.h"
+#include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/renderer_opengl/gl_rasterizer.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+
+namespace OpenGL {
+
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
+MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
+
+/// Creates the GL buffer backing this block with size_ bytes of uninitialized storage.
+/// When the device uses assembly shaders or has vertex buffer unified memory, the buffer is
+/// also made resident and its GPU address is queried into gpu_address.
+Buffer::Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_)
+    : BufferBlock{cpu_addr_, size_} {
+    gl_buffer.Create();
+    glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size_), nullptr, GL_DYNAMIC_DRAW);
+    if (device_.UseAssemblyShaders() || device_.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+    }
+}
+
+// Defaulted destructor, defined out of line.
+Buffer::~Buffer() = default;
+
+/// Writes data_size bytes from data into this buffer, starting at byte offset.
+void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) {
+    const auto gl_offset = static_cast<GLintptr>(offset);
+    const auto gl_size = static_cast<GLsizeiptr>(data_size);
+    glNamedBufferSubData(Handle(), gl_offset, gl_size, data);
+}
+
+/// Reads data_size bytes starting at byte offset out of this buffer into data.
+/// The range is first copied into a separate read-back buffer at the same offset and then
+/// fetched from there.
+void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) {
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    const GLsizeiptr gl_size = static_cast<GLsizeiptr>(data_size);
+    const GLintptr gl_offset = static_cast<GLintptr>(offset);
+    // The read-back buffer is created on first use and sized like the whole block.
+    if (read_buffer.handle == 0) {
+        read_buffer.Create();
+        glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr,
+                          GL_STREAM_READ);
+    }
+    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+    glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size);
+    glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data);
+}
+
+/// Copies copy_size bytes from src at src_offset into this buffer at dst_offset.
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t copy_size) {
+    const auto read_offset = static_cast<GLintptr>(src_offset);
+    const auto write_offset = static_cast<GLintptr>(dst_offset);
+    const auto byte_count = static_cast<GLsizeiptr>(copy_size);
+    glCopyNamedBufferSubData(src.Handle(), Handle(), read_offset, write_offset, byte_count);
+}
+
+/// Builds the buffer cache. When the device reports fast BufferSubData, one GL buffer of the
+/// maximum constant buffer size is also allocated for every entry of cbufs.
+OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer_,
+                               Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
+                               const Device& device_, OGLStreamBuffer& stream_buffer_,
+                               StateTracker& state_tracker)
+    : GenericBufferCache{rasterizer_, gpu_memory_, cpu_memory_, stream_buffer_}, device{device_} {
+    if (device.HasFastBufferSubData()) {
+        constexpr auto cbuf_size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+        glCreateBuffers(static_cast<GLsizei>(cbufs.size()), cbufs.data());
+        for (const GLuint handle : cbufs) {
+            glNamedBufferData(handle, cbuf_size, nullptr, GL_STREAM_DRAW);
+        }
+    }
+}
+
+/// Deletes the constant buffer handles held in cbufs. If the constructor returned early the
+/// handles are still zero, and glDeleteBuffers silently ignores zero names.
+OGLBufferCache::~OGLBufferCache() {
+    glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
+}
+
+/// Creates a new Buffer of the given size for cpu_addr on this cache's device.
+std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, cpu_addr, size);
+}
+
+/// Returns a BufferInfo with every field zero; the requested size is ignored.
+OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
+    return {0, 0, 0};
+}
+
+/// Uploads size bytes from raw_pointer into the next unused constant buffer and returns it.
+/// The cursor advances on every call; Acquire() rewinds it to the first buffer.
+OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
+                                                             std::size_t size) {
+    DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
+    const std::size_t slot = cbuf_cursor;
+    ++cbuf_cursor;
+    const GLuint cbuf = cbufs[slot];
+
+    const auto upload_size = static_cast<GLsizeiptr>(size);
+    glNamedBufferSubData(cbuf, 0, upload_size, raw_pointer);
+    return {cbuf, 0, 0};
+}
+
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -0,0 +1,83 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <memory>
+
+#include "common/common_types.h"
+#include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/renderer_opengl/gl_stream_buffer.h"
+
+namespace Core {
+class System;
+}
+
+namespace OpenGL {
+
+class Device;
+class OGLStreamBuffer;
+class RasterizerOpenGL;
+class StateTracker;
+
+/// Buffer block backed by an OpenGL buffer object, with a second buffer used for read-backs.
+class Buffer : public VideoCommon::BufferBlock {
+public:
+    explicit Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_);
+    ~Buffer();
+
+    /// Writes data_size bytes from data into the buffer at byte offset.
+    void Upload(std::size_t offset, std::size_t data_size, const u8* data);
+
+    /// Reads data_size bytes at byte offset from the buffer into data.
+    void Download(std::size_t offset, std::size_t data_size, u8* data);
+
+    /// Copies copy_size bytes from src at src_offset into this buffer at dst_offset.
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t copy_size);
+
+    /// Returns the OpenGL name of the main buffer.
+    GLuint Handle() const noexcept {
+        return gl_buffer.handle;
+    }
+
+    /// Returns the GPU address queried in the constructor, or 0 when it was not queried.
+    u64 Address() const noexcept {
+        return gpu_address;
+    }
+
+private:
+    OGLBuffer gl_buffer;   // Main storage of the block.
+    OGLBuffer read_buffer; // Created on the first Download() call.
+    u64 gpu_address = 0;
+};
+
+using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
+/// OpenGL specialization of the generic buffer cache. It additionally owns a fixed set of GL
+/// buffers that ConstBufferUpload hands out one after another.
+class OGLBufferCache final : public GenericBufferCache {
+public:
+    explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer,
+                            Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
+                            const Device& device, OGLStreamBuffer& stream_buffer,
+                            StateTracker& state_tracker);
+    ~OGLBufferCache();
+
+    BufferInfo GetEmptyBuffer(std::size_t) override;
+
+    /// Rewinds the constant buffer cursor so ConstBufferUpload starts again at the first buffer.
+    void Acquire() noexcept {
+        cbuf_cursor = 0;
+    }
+
+protected:
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
+
+    BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
+
+private:
+    /// One constant buffer slot per (shader program, constant buffer index) pair.
+    static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+                                             Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+
+    const Device& device;
+
+    std::size_t cbuf_cursor = 0;           // Index of the next entry of cbufs to hand out.
+    std::array<GLuint, NUM_CBUFS> cbufs{}; // GL buffer names; zero until created.
+};
+
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -0,0 +1,294 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <optional>
+#include <span>
+#include <vector>
+
+#include <glad/glad.h>
+
+#include "common/logging/log.h"
+#include "common/scope_exit.h"
+#include "core/settings.h"
+#include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+
+namespace OpenGL {
+
+namespace {
+
+// One uniform block is reserved for emulation purposes
+constexpr u32 ReservedUniformBlocks = 1;
+
+// Number of stages that share the binding pools split up in BuildBaseBindings (compute, the
+// sixth entry of the tables below, is handled separately there).
+constexpr u32 NumStages = 5;
+
+// Per-stage GL limit names. Every table is ordered vertex, tessellation control, tessellation
+// evaluation, geometry, fragment, compute.
+
+// Maximum uniform blocks per stage.
+constexpr std::array LIMIT_UBOS = {
+    GL_MAX_VERTEX_UNIFORM_BLOCKS,          GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
+    GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
+    GL_MAX_FRAGMENT_UNIFORM_BLOCKS,        GL_MAX_COMPUTE_UNIFORM_BLOCKS,
+};
+// Maximum shader storage blocks per stage.
+constexpr std::array LIMIT_SSBOS = {
+    GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS,          GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
+    GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
+    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS,        GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS,
+};
+// Maximum texture image units per stage.
+constexpr std::array LIMIT_SAMPLERS = {
+    GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
+    GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
+    GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
+    GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
+    GL_MAX_TEXTURE_IMAGE_UNITS,
+    GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS,
+};
+// Maximum image uniforms per stage.
+constexpr std::array LIMIT_IMAGES = {
+    GL_MAX_VERTEX_IMAGE_UNIFORMS,          GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
+    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
+    GL_MAX_FRAGMENT_IMAGE_UNIFORMS,        GL_MAX_COMPUTE_IMAGE_UNIFORMS,
+};
+
+/// Queries a single integer GL state value and converts it to T.
+/// The temporary starts at zero because glGetIntegerv leaves its output untouched when it
+/// raises an error; a failed query therefore yields 0 instead of an indeterminate value.
+template <typename T>
+T GetInteger(GLenum pname) {
+    GLint temporary = 0;
+    glGetIntegerv(pname, &temporary);
+    return static_cast<T>(temporary);
+}
+
+/// Builds glsl as a separable vertex shader program and returns whether it linked.
+/// link_status starts as GL_FALSE so that a failed program creation or a failed status query
+/// (both leave the output untouched) is reported as "did not link" rather than reading an
+/// uninitialized value.
+bool TestProgram(const GLchar* glsl) {
+    const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &glsl)};
+    GLint link_status = GL_FALSE;
+    glGetProgramiv(shader, GL_LINK_STATUS, &link_status);
+    glDeleteProgram(shader);
+    return link_status == GL_TRUE;
+}
+
+/// Returns the names of all extensions reported by the current GL context.
+/// The views point at the strings returned by glGetStringi; they are not copied.
+std::vector<std::string_view> GetExtensions() {
+    // Starts at zero: glGetIntegerv leaves the output untouched when it raises an error.
+    GLint num_extensions = 0;
+    glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions);
+    std::vector<std::string_view> extensions;
+    extensions.reserve(static_cast<std::size_t>(num_extensions));
+    for (GLint index = 0; index < num_extensions; ++index) {
+        const auto* const name =
+            reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, static_cast<GLuint>(index)));
+        // glGetStringi returns a null pointer on error, and building a string_view from a null
+        // pointer is undefined behavior, so such entries are skipped.
+        if (name != nullptr) {
+            extensions.emplace_back(name);
+        }
+    }
+    return extensions;
+}
+
+/// Returns true when extension appears in the given extension list.
+bool HasExtension(std::span<const std::string_view> extensions, std::string_view extension) {
+    const auto match = std::ranges::find(extensions, extension);
+    return match != extensions.end();
+}
+
+/// Takes `amount` bindings out of a pool described by `base` (next free index) and `num`
+/// (bindings left). When `limit` is given, the amount is clamped to that GL limit first.
+/// Returns the base index of the extracted range and advances `base` past it.
+u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
+    // NOTE(review): this check runs before the clamp below, so it validates the requested
+    // share rather than the amount actually taken; confirm this is intended.
+    ASSERT(num >= amount);
+    if (limit.has_value()) {
+        amount = std::min(amount, GetInteger<u32>(*limit));
+    }
+    num -= amount;
+
+    const u32 extracted_base = base;
+    base += amount;
+    return extracted_base;
+}
+
+/// Queries the uniform block limit of every stage listed in LIMIT_UBOS, in table order.
+std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
+    std::array<u32, Tegra::Engines::MaxShaderTypes> limits;
+    for (std::size_t stage = 0; stage < LIMIT_UBOS.size(); ++stage) {
+        limits[stage] = GetInteger<u32>(LIMIT_UBOS[stage]);
+    }
+    return limits;
+}
+
+/// Splits the global uniform buffer, shader storage buffer, sampler and image binding ranges
+/// between the first NumStages stages and returns the first binding index of each stage.
+/// The entry at index 5 (compute) is set to all zeroes.
+std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
+    std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
+
+    // Order in which the stages are given their ranges; currently the identity order.
+    static constexpr std::array<std::size_t, 5> stage_swizzle{0, 1, 2, 3, 4};
+    const u32 total_ubos = GetInteger<u32>(GL_MAX_UNIFORM_BUFFER_BINDINGS);
+    const u32 total_ssbos = GetInteger<u32>(GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS);
+    const u32 total_samplers = GetInteger<u32>(GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS);
+
+    // The reserved uniform block is excluded from the pool that is handed out.
+    u32 num_ubos = total_ubos - ReservedUniformBlocks;
+    u32 num_ssbos = total_ssbos;
+    u32 num_samplers = total_samplers;
+
+    u32 base_ubo = ReservedUniformBlocks;
+    u32 base_ssbo = 0;
+    u32 base_samplers = 0;
+
+    // Each stage requests an equal share of the total, clamped to its own per-stage limit.
+    for (std::size_t i = 0; i < NumStages; ++i) {
+        const std::size_t stage = stage_swizzle[i];
+        bindings[stage] = {
+            Extract(base_ubo, num_ubos, total_ubos / NumStages, LIMIT_UBOS[stage]),
+            Extract(base_ssbo, num_ssbos, total_ssbos / NumStages, LIMIT_SSBOS[stage]),
+            Extract(base_samplers, num_samplers, total_samplers / NumStages,
+                    LIMIT_SAMPLERS[stage])};
+    }
+
+    u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
+    u32 base_images = 0;
+
+    // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8.
+    // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the
+    // fragment stage, and at least 1 for the rest of the stages.
+    // So far games are observed to use 1 image binding on vertex and 4 on fragment stages.
+
+    // Reserve at least 4 image bindings on the fragment stage.
+    // Index 4 is the fragment entry of the per-stage tables.
+    bindings[4].image =
+        Extract(base_images, num_images, std::max(4U, num_images / NumStages), LIMIT_IMAGES[4]);
+
+    // This is guaranteed to be at least 1.
+    // The image units left after the fragment stage are divided between the other four stages.
+    const u32 total_extracted_images = num_images / (NumStages - 1);
+
+    // Reserve the other image bindings.
+    for (std::size_t i = 0; i < NumStages; ++i) {
+        const std::size_t stage = stage_swizzle[i];
+        if (stage == 4) {
+            continue;
+        }
+        bindings[stage].image =
+            Extract(base_images, num_images, total_extracted_images, LIMIT_IMAGES[stage]);
+    }
+
+    // Compute doesn't care about any of this.
+    bindings[5] = {0, 0, 0, 0};
+
+    return bindings;
+}
+
+/// Returns true only when every listed ASTC format reports GL_FULL_SUPPORT for every listed
+/// texture target and shader stage.
+bool IsASTCSupported() {
+    static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY};
+    static constexpr std::array formats = {
+        GL_COMPRESSED_RGBA_ASTC_4x4_KHR,           GL_COMPRESSED_RGBA_ASTC_5x4_KHR,
+        GL_COMPRESSED_RGBA_ASTC_5x5_KHR,           GL_COMPRESSED_RGBA_ASTC_6x5_KHR,
+        GL_COMPRESSED_RGBA_ASTC_6x6_KHR,           GL_COMPRESSED_RGBA_ASTC_8x5_KHR,
+        GL_COMPRESSED_RGBA_ASTC_8x6_KHR,           GL_COMPRESSED_RGBA_ASTC_8x8_KHR,
+        GL_COMPRESSED_RGBA_ASTC_10x5_KHR,          GL_COMPRESSED_RGBA_ASTC_10x6_KHR,
+        GL_COMPRESSED_RGBA_ASTC_10x8_KHR,          GL_COMPRESSED_RGBA_ASTC_10x10_KHR,
+        GL_COMPRESSED_RGBA_ASTC_12x10_KHR,         GL_COMPRESSED_RGBA_ASTC_12x12_KHR,
+        GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR,   GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR,
+        GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR,   GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR,
+        GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR,   GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR,
+        GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR,   GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR,
+        GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR,  GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR,
+        GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR,  GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR,
+        GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR,
+    };
+    static constexpr std::array required_support = {
+        GL_VERTEX_TEXTURE,   GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE,
+        GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE,     GL_COMPUTE_TEXTURE,
+    };
+
+    for (const GLenum target : targets) {
+        for (const GLenum format : formats) {
+            for (const GLenum support : required_support) {
+                // Starts as GL_NONE so that a query that raises a GL error (and therefore
+                // leaves the output untouched) counts as "not supported" instead of comparing
+                // an uninitialized value.
+                GLint value = GL_NONE;
+                glGetInternalformativ(target, format, support, 1, &value);
+                if (value != GL_FULL_SUPPORT) {
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+/// Returns true when a graphics debugging tool appears to be attached to the process.
+[[nodiscard]] bool IsDebugToolAttached(std::span<const std::string_view> extensions) {
+    // Nsight is detected through either of these environment variables.
+    if (std::getenv("NVTX_INJECTION64_PATH") != nullptr) {
+        return true;
+    }
+    if (std::getenv("NSIGHT_LAUNCHED") != nullptr) {
+        return true;
+    }
+    // Any other tool is detected through the GL_EXT_debug_tool extension.
+    return HasExtension(extensions, "GL_EXT_debug_tool");
+}
+
+} // Anonymous namespace
+
+/// Queries the current OpenGL context for its limits, extensions and known driver quirks and
+/// caches the results in this object.
+Device::Device()
+    : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
+    const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+    const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
+    const std::vector extensions = GetExtensions();
+
+    const bool is_nvidia = vendor == "NVIDIA Corporation";
+    const bool is_amd = vendor == "ATI Technologies Inc.";
+
+    // This exact Nvidia driver version gets the fast BufferSubData path turned off below.
+    bool disable_fast_buffer_sub_data = false;
+    if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
+        LOG_WARNING(
+            Render_OpenGL,
+            "Beta driver 443.24 is known to have issues. There might be performance issues.");
+        disable_fast_buffer_sub_data = true;
+    }
+    uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
+    shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
+    max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
+    max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
+    max_compute_shared_memory_size = GetInteger<u32>(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE);
+    has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group &&
+                          GLAD_GL_NV_shader_thread_shuffle;
+    has_shader_ballot = GLAD_GL_ARB_shader_ballot;
+    has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
+    has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted");
+    has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod");
+    has_astc = IsASTCSupported();
+    has_variable_aoffi = TestVariableAoffi();
+    // The component indexing bug is assumed on every AMD driver; it is not probed.
+    has_component_indexing_bug = is_amd;
+    has_precise_bug = TestPreciseBug();
+    has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
+    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
+    has_debugging_tool_attached = IsDebugToolAttached(extensions);
+
+    // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
+    // uniform buffers as "push constants"
+    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+
+    // Assembly shaders need both the user setting and all four NV extensions.
+    use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() &&
+                           GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 &&
+                           GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2;
+
+    use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue();
+
+    LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
+    LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
+    LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
+
+    // Report the case where the setting asked for assembly shaders but the driver lacks them.
+    if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) {
+        LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
+    }
+}
+
+/// Builds a device description from fixed values without calling into OpenGL.
+/// NOTE(review): members not assigned here keep whatever initial value the class declaration
+/// gives them; the full declaration is not visible from this chunk.
+Device::Device(std::nullptr_t) {
+    max_uniform_buffers.fill(std::numeric_limits<u32>::max());
+    uniform_buffer_alignment = 4;
+    shader_storage_alignment = 4;
+    max_vertex_attributes = 16;
+    max_varyings = 15;
+    max_compute_shared_memory_size = 0x10000;
+    has_warp_intrinsics = true;
+    has_shader_ballot = true;
+    has_vertex_viewport_layer = true;
+    has_image_load_formatted = true;
+    has_texture_shadow_lod = true;
+    has_variable_aoffi = true;
+}
+
+/// Returns whether the driver links a test shader that passes a uniform (non-constant) offset
+/// to textureOffset.
+bool Device::TestVariableAoffi() {
+    return TestProgram(R"(#version 430 core
+// This is a unit test, please ignore me on apitrace bug reports.
+uniform sampler2D tex;
+uniform ivec2 variable_offset;
+out vec4 output_attribute;
+void main() {
+    output_attribute = textureOffset(tex, vec2(0), variable_offset);
+})");
+}
+
+/// Returns true when the driver fails to link a test shader that stores a shadow sampler
+/// lookup, converted through vec4, in a `precise` float.
+bool Device::TestPreciseBug() {
+    return !TestProgram(R"(#version 430 core
+in vec3 coords;
+out float out_value;
+uniform sampler2DShadow tex;
+void main() {
+    precise float tmp_value = vec4(texture(tex, coords)).x;
+    out_value = tmp_value;
+})");
+}
+
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -0,0 +1,147 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include "common/common_types.h"
+#include "video_core/engines/shader_type.h"
+
+namespace OpenGL {
+
+// Binding index constant named for the emulation uniform block.
+// NOTE(review): the consumers are outside this header - confirm what is bound at this slot.
+static constexpr u32 EmulationUniformBlockBinding{0};
+
+/// Holds OpenGL device limits, feature availability flags and driver-bug flags and exposes them
+/// through read-only accessors. The values are written by the constructors, which are defined
+/// out of line. Every accessor only returns a stored value and therefore cannot throw.
+class Device final {
+public:
+    /// Four binding indices, one per resource kind, all defaulting to zero.
+    /// NOTE(review): presumably the first binding slot a shader stage may use for each resource
+    /// kind - confirm against the code that fills base_bindings.
+    struct BaseBindings final {
+        u32 uniform_buffer{};
+        u32 shader_storage_buffer{};
+        u32 sampler{};
+        u32 image{};
+    };
+
+    explicit Device();
+    explicit Device(std::nullptr_t);
+
+    /// Returns the uniform buffer limit stored for shader_type.
+    /// The enum value is used directly as an array index and is not range checked.
+    u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
+        return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
+    }
+
+    /// Returns the base bindings stored at stage_index.
+    /// stage_index is not range checked; the table has Tegra::Engines::MaxShaderTypes entries.
+    const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
+        return base_bindings[stage_index];
+    }
+
+    /// Same as the index overload, using the enum value of shader_type as the index.
+    const BaseBindings& GetBaseBindings(Tegra::Engines::ShaderType shader_type) const noexcept {
+        return GetBaseBindings(static_cast<std::size_t>(shader_type));
+    }
+
+    /// Returns the stored uniform buffer alignment.
+    std::size_t GetUniformBufferAlignment() const noexcept {
+        return uniform_buffer_alignment;
+    }
+
+    /// Returns the stored shader storage buffer alignment.
+    std::size_t GetShaderStorageBufferAlignment() const noexcept {
+        return shader_storage_alignment;
+    }
+
+    /// Returns the stored vertex attribute limit.
+    u32 GetMaxVertexAttributes() const noexcept {
+        return max_vertex_attributes;
+    }
+
+    /// Returns the stored varyings limit.
+    u32 GetMaxVaryings() const noexcept {
+        return max_varyings;
+    }
+
+    /// Returns the stored compute shared memory size limit.
+    u32 GetMaxComputeSharedMemorySize() const noexcept {
+        return max_compute_shared_memory_size;
+    }
+
+    /// Returns the stored warp intrinsics flag.
+    bool HasWarpIntrinsics() const noexcept {
+        return has_warp_intrinsics;
+    }
+
+    /// Returns the stored shader ballot flag.
+    bool HasShaderBallot() const noexcept {
+        return has_shader_ballot;
+    }
+
+    /// Returns the stored vertex viewport layer flag.
+    bool HasVertexViewportLayer() const noexcept {
+        return has_vertex_viewport_layer;
+    }
+
+    /// Returns the stored image load formatted flag.
+    bool HasImageLoadFormatted() const noexcept {
+        return has_image_load_formatted;
+    }
+
+    /// Returns the stored texture shadow LOD flag.
+    bool HasTextureShadowLod() const noexcept {
+        return has_texture_shadow_lod;
+    }
+
+    /// Returns the stored vertex buffer unified memory flag.
+    bool HasVertexBufferUnifiedMemory() const noexcept {
+        return has_vertex_buffer_unified_memory;
+    }
+
+    /// Returns the stored ASTC flag.
+    bool HasASTC() const noexcept {
+        return has_astc;
+    }
+
+    /// Returns the stored variable AOFFI flag.
+    bool HasVariableAoffi() const noexcept {
+        return has_variable_aoffi;
+    }
+
+    /// Returns the stored component indexing bug flag.
+    bool HasComponentIndexingBug() const noexcept {
+        return has_component_indexing_bug;
+    }
+
+    /// Returns the stored precise bug flag.
+    bool HasPreciseBug() const noexcept {
+        return has_precise_bug;
+    }
+
+    /// Returns the stored fast buffer sub data flag.
+    bool HasFastBufferSubData() const noexcept {
+        return has_fast_buffer_sub_data;
+    }
+
+    /// Returns the stored NV viewport array2 flag.
+    bool HasNvViewportArray2() const noexcept {
+        return has_nv_viewport_array2;
+    }
+
+    /// Returns the stored debugging tool attached flag.
+    bool HasDebuggingToolAttached() const noexcept {
+        return has_debugging_tool_attached;
+    }
+
+    /// Returns the stored assembly shaders flag.
+    bool UseAssemblyShaders() const noexcept {
+        return use_assembly_shaders;
+    }
+
+    /// Returns the stored asynchronous shaders flag.
+    bool UseAsynchronousShaders() const noexcept {
+        return use_asynchronous_shaders;
+    }
+
+private:
+    // Shader compile probes; both are defined out of line.
+    static bool TestVariableAoffi();
+    static bool TestPreciseBug();
+
+    // Per-shader-type tables, indexed by the numeric value of Tegra::Engines::ShaderType.
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
+    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
+    // Everything below is value-initialized (zero / false) until a constructor assigns it.
+    std::size_t uniform_buffer_alignment{};
+    std::size_t shader_storage_alignment{};
+    u32 max_vertex_attributes{};
+    u32 max_varyings{};
+    u32 max_compute_shared_memory_size{};
+    bool has_warp_intrinsics{};
+    bool has_shader_ballot{};
+    bool has_vertex_viewport_layer{};
+    bool has_image_load_formatted{};
+    bool has_texture_shadow_lod{};
+    bool has_vertex_buffer_unified_memory{};
+    bool has_astc{};
+    bool has_variable_aoffi{};
+    bool has_component_indexing_bug{};
+    bool has_precise_bug{};
+    bool has_fast_buffer_sub_data{};
+    bool has_nv_viewport_array2{};
+    bool has_debugging_tool_attached{};
+    bool use_assembly_shaders{};
+    bool use_asynchronous_shaders{};
+};
+
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_fence_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp
@@ -0,0 +1,73 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+
+#include <glad/glad.h>
+
+#include "video_core/renderer_opengl/gl_buffer_cache.h"
+#include "video_core/renderer_opengl/gl_fence_manager.h"
+
+namespace OpenGL {
+
+// Constructs a fence from a payload only: both arguments are forwarded to the FenceBase
+// initializer and the constructor body is empty.
+GLInnerFence::GLInnerFence(u32 payload_, bool is_stubbed_) : FenceBase{payload_, is_stubbed_} {}
+
+// Constructs a fence that also carries a GPU virtual address: all three arguments are forwarded
+// to the FenceBase initializer and the constructor body is empty.
+GLInnerFence::GLInnerFence(GPUVAddr address_, u32 payload_, bool is_stubbed_)
+    : FenceBase{address_, payload_, is_stubbed_} {}
+
+// Destructor defaulted out of line; no explicit cleanup is performed here.
+GLInnerFence::~GLInnerFence() = default;
+
+// Creates the sync object backing this fence via sync_object.Create().
+// Stubbed fences have no sync object, so the call does nothing for them.
+void GLInnerFence::Queue() {
+    if (!is_stubbed) {
+        // The handle is expected to be unset at this point, i.e. the fence was not queued before.
+        ASSERT(sync_object.handle == 0);
+        sync_object.Create();
+    }
+}
+
+// Reports whether this fence has been signaled.
+// Stubbed fences always count as signaled. Otherwise the GL_SYNC_STATUS property of the sync
+// object is queried and compared against GL_SIGNALED.
+bool GLInnerFence::IsSignaled() const {
+    if (is_stubbed) {
+        return true;
+    }
+    ASSERT(sync_object.handle != 0);
+    // Start from GL_UNSIGNALED: a query that fails with a GL error leaves the output untouched,
+    // and in that case the fence must not be reported as signaled from an indeterminate value.
+    GLint sync_status = GL_UNSIGNALED;
+    // bufSize counts GLint elements rather than bytes, and only one value is stored here.
+    // The number of written values is not needed, so no length output is requested.
+    glGetSynciv(sync_object.handle, GL_SYNC_STATUS, 1, nullptr, &sync_status);
+    return sync_status == GL_SIGNALED;
+}
+
+// Waits on the sync object through glClientWaitSync; stubbed fences return immediately.
+// The call passes no flags and GL_TIMEOUT_IGNORED as the timeout, and its result is discarded.
+void GLInnerFence::Wait() {
+    if (!is_stubbed) {
+        // Waiting requires a sync object created by Queue().
+        ASSERT(sync_object.handle != 0);
+        glClientWaitSync(sync_object.handle, 0, GL_TIMEOUT_IGNORED);
+    }
+}
+
+// Forwards all five references to the GenericFenceManager base initializer; the constructor body
+// is empty and no further state is set up here.
+FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_,
+                                       Tegra::GPU& gpu_, TextureCache& texture_cache_,
+                                       OGLBufferCache& buffer_cache_, QueryCache& query_cache_)
+    : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {}
+
+// Allocates a GLInnerFence from a payload value and the stubbed flag and returns it as a Fence.
+Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) {
+    auto new_fence = std::make_shared<GLInnerFence>(value, is_stubbed);
+    return new_fence;
+}
+
+// Allocates a GLInnerFence from a GPU virtual address, a payload value and the stubbed flag and
+// returns it as a Fence.
+Fence FenceManagerOpenGL::CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) {
+    auto new_fence = std::make_shared<GLInnerFence>(addr, value, is_stubbed);
+    return new_fence;
+}
+
+// Delegates to Queue() on the fence object that `fence` points to.
+void FenceManagerOpenGL::QueueFence(Fence& fence) {
+    auto& inner_fence = *fence;
+    inner_fence.Queue();
+}
+
+// Delegates to IsSignaled() on the fence object that `fence` points to and returns its result.
+bool FenceManagerOpenGL::IsFenceSignaled(Fence& fence) const {
+    const auto& inner_fence = *fence;
+    return inner_fence.IsSignaled();
+}
+
+// Delegates to Wait() on the fence object that `fence` points to.
+void FenceManagerOpenGL::WaitFence(Fence& fence) {
+    auto& inner_fence = *fence;
+    inner_fence.Wait();
+}
+
+} // namespace OpenGL
--- a/Show More
+++ b/Show More