early-access version 4105

2024-02-01 07:01:42 +01:00
parent 7e95a06b53
commit afc99d73c7
36 changed files with 958 additions and 2765 deletions
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 yuzu emulator early access
 =============

-This is the source code for early-access 4104.
+This is the source code for early-access 4105.

 ## Legal Notice

--- a/src/core/device_memory_manager.h
+++ b/src/core/device_memory_manager.h
@@ -43,8 +43,6 @@ public:
    DeviceMemoryManager(const DeviceMemory& device_memory);
    ~DeviceMemoryManager();

-    static constexpr bool HAS_FLUSH_INVALIDATION = true;
-
    void BindInterface(DeviceInterface* device_inter);

    DAddr Allocate(size_t size);
--- a/src/core/guest_memory.h
+++ b/src/core/guest_memory.h
@@ -44,32 +44,15 @@ public:
    GuestMemory() = delete;
    explicit GuestMemory(M& memory, u64 addr, std::size_t size,
                         Common::ScratchBuffer<T>* backup = nullptr)
-        : m_memory{&memory}, m_addr{addr}, m_size{size} {
+        : m_memory{memory}, m_addr{addr}, m_size{size} {
        static_assert(FLAGS & GuestMemoryFlags::Read || FLAGS & GuestMemoryFlags::Write);
-        if constexpr (!(FLAGS & GuestMemoryFlags::Read)) {
-            if (!this->TrySetSpan()) {
-                if (backup) {
-                    backup->resize_destructive(this->size());
-                    m_data_span = *backup;
-                    m_span_valid = true;
-                    m_is_data_copy = true;
-                } else {
-                    m_data_copy.resize(this->size());
-                    m_data_span = std::span(m_data_copy);
-                    m_span_valid = true;
-                    m_is_data_copy = true;
-                }
-            }
-        } else if constexpr (FLAGS & GuestMemoryFlags::Read) {
+        if constexpr (FLAGS & GuestMemoryFlags::Read) {
            Read(addr, size, backup);
        }
    }

    ~GuestMemory() = default;

-    GuestMemory(GuestMemory&& rhs) = default;
-    GuestMemory& operator=(GuestMemory&& rhs) = default;
-
    T* data() noexcept {
        return m_data_span.data();
    }
@@ -126,8 +109,8 @@ public:
        }

        if (this->TrySetSpan()) {
-            if constexpr (FLAGS & GuestMemoryFlags::Safe && M::HAS_FLUSH_INVALIDATION) {
-                m_memory->FlushRegion(m_addr, this->size_bytes());
+            if constexpr (FLAGS & GuestMemoryFlags::Safe) {
+                m_memory.FlushRegion(m_addr, this->size_bytes());
            }
        } else {
            if (backup) {
@@ -140,9 +123,9 @@ public:
            m_is_data_copy = true;
            m_span_valid = true;
            if constexpr (FLAGS & GuestMemoryFlags::Safe) {
-                m_memory->ReadBlock(m_addr, this->data(), this->size_bytes());
+                m_memory.ReadBlock(m_addr, this->data(), this->size_bytes());
            } else {
-                m_memory->ReadBlockUnsafe(m_addr, this->data(), this->size_bytes());
+                m_memory.ReadBlockUnsafe(m_addr, this->data(), this->size_bytes());
            }
        }
        return m_data_span;
@@ -150,19 +133,18 @@ public:

    void Write(std::span<T> write_data) noexcept {
        if constexpr (FLAGS & GuestMemoryFlags::Cached) {
-            m_memory->WriteBlockCached(m_addr, write_data.data(), this->size_bytes());
+            m_memory.WriteBlockCached(m_addr, write_data.data(), this->size_bytes());
        } else if constexpr (FLAGS & GuestMemoryFlags::Safe) {
-            m_memory->WriteBlock(m_addr, write_data.data(), this->size_bytes());
+            m_memory.WriteBlock(m_addr, write_data.data(), this->size_bytes());
        } else {
-            m_memory->WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes());
+            m_memory.WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes());
        }
    }

    bool TrySetSpan() noexcept {
-        if (u8* ptr = m_memory->GetSpan(m_addr, this->size_bytes()); ptr) {
+        if (u8* ptr = m_memory.GetSpan(m_addr, this->size_bytes()); ptr) {
            m_data_span = {reinterpret_cast<T*>(ptr), this->size()};
            m_span_valid = true;
-            m_is_data_copy = false;
            return true;
        }
        return false;
@@ -177,7 +159,7 @@ protected:
        return m_addr_changed;
    }

-    M* m_memory;
+    M& m_memory;
    u64 m_addr{};
    size_t m_size{};
    std::span<T> m_data_span{};
@@ -193,7 +175,17 @@ public:
    GuestMemoryScoped() = delete;
    explicit GuestMemoryScoped(M& memory, u64 addr, std::size_t size,
                               Common::ScratchBuffer<T>* backup = nullptr)
-        : GuestMemory<M, T, FLAGS>(memory, addr, size, backup) {}
+        : GuestMemory<M, T, FLAGS>(memory, addr, size, backup) {
+        if constexpr (!(FLAGS & GuestMemoryFlags::Read)) {
+            if (!this->TrySetSpan()) {
+                if (backup) {
+                    this->m_data_span = *backup;
+                    this->m_span_valid = true;
+                    this->m_is_data_copy = true;
+                }
+            }
+        }
+    }

    ~GuestMemoryScoped() {
        if constexpr (FLAGS & GuestMemoryFlags::Write) {
@@ -204,17 +196,15 @@ public:
            if (this->AddressChanged() || this->IsDataCopy()) {
                ASSERT(this->m_span_valid);
                if constexpr (FLAGS & GuestMemoryFlags::Cached) {
-                    this->m_memory->WriteBlockCached(this->m_addr, this->data(),
-                                                     this->size_bytes());
+                    this->m_memory.WriteBlockCached(this->m_addr, this->data(), this->size_bytes());
                } else if constexpr (FLAGS & GuestMemoryFlags::Safe) {
-                    this->m_memory->WriteBlock(this->m_addr, this->data(), this->size_bytes());
+                    this->m_memory.WriteBlock(this->m_addr, this->data(), this->size_bytes());
                } else {
-                    this->m_memory->WriteBlockUnsafe(this->m_addr, this->data(),
-                                                     this->size_bytes());
+                    this->m_memory.WriteBlockUnsafe(this->m_addr, this->data(), this->size_bytes());
                }
            } else if constexpr ((FLAGS & GuestMemoryFlags::Safe) ||
                                 (FLAGS & GuestMemoryFlags::Cached)) {
-                this->m_memory->InvalidateRegion(this->m_addr, this->size_bytes());
+                this->m_memory.InvalidateRegion(this->m_addr, this->size_bytes());
            }
        }
    }
--- a/src/core/hle/service/nvdrv/core/container.h
+++ b/src/core/hle/service/nvdrv/core/container.h
@@ -67,7 +67,10 @@ public:
    const SyncpointManager& GetSyncpointManager() const;

    struct Host1xDeviceFileData {
+        std::unordered_map<DeviceFD, u32> fd_to_id{};
        std::deque<u32> syncpts_accumulated{};
+        u32 nvdec_next_id{};
+        u32 vic_next_id{};
    };

    Host1xDeviceFileData& Host1xDeviceFile();
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@@ -8,7 +8,6 @@
 #include "core/hle/service/nvdrv/core/container.h"
 #include "core/hle/service/nvdrv/devices/ioctl_serialization.h"
 #include "core/hle/service/nvdrv/devices/nvhost_nvdec.h"
-#include "video_core/host1x/host1x.h"
 #include "video_core/renderer_base.h"

 namespace Service::Nvidia::Devices {
@@ -22,8 +21,13 @@ NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> in
    switch (command.group) {
    case 0x0:
        switch (command.cmd) {
-        case 0x1:
+        case 0x1: {
+            auto& host1x_file = core.Host1xDeviceFile();
+            if (!host1x_file.fd_to_id.contains(fd)) {
+                host1x_file.fd_to_id[fd] = host1x_file.nvdec_next_id++;
+            }
            return WrapFixedVariable(this, &nvhost_nvdec::Submit, input, output, fd);
+        }
        case 0x2:
            return WrapFixed(this, &nvhost_nvdec::GetSyncpoint, input, output);
        case 0x3:
@@ -68,12 +72,15 @@ void nvhost_nvdec::OnOpen(NvCore::SessionId session_id, DeviceFD fd) {
    LOG_INFO(Service_NVDRV, "NVDEC video stream started");
    system.SetNVDECActive(true);
    sessions[fd] = session_id;
-    host1x.StartDevice(fd, Tegra::Host1x::ChannelType::NvDec, channel_syncpoint);
 }

 void nvhost_nvdec::OnClose(DeviceFD fd) {
    LOG_INFO(Service_NVDRV, "NVDEC video stream ended");
-    host1x.StopDevice(fd, Tegra::Host1x::ChannelType::NvDec);
+    auto& host1x_file = core.Host1xDeviceFile();
+    const auto iter = host1x_file.fd_to_id.find(fd);
+    if (iter != host1x_file.fd_to_id.end()) {
+        system.GPU().ClearCdmaInstance(iter->second);
+    }
    system.SetNVDECActive(false);
    auto it = sessions.find(fd);
    if (it != sessions.end()) {
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
@@ -55,9 +55,8 @@ std::size_t WriteVectors(std::span<u8> dst, const std::vector<T>& src, std::size

 nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system_, NvCore::Container& core_,
                                         NvCore::ChannelType channel_type_)
-    : nvdevice{system_}, host1x{system_.Host1x()}, core{core_},
-      syncpoint_manager{core.GetSyncpointManager()}, nvmap{core.GetNvMapFile()},
-      channel_type{channel_type_} {
+    : nvdevice{system_}, core{core_}, syncpoint_manager{core.GetSyncpointManager()},
+      nvmap{core.GetNvMapFile()}, channel_type{channel_type_} {
    auto& syncpts_accumulated = core.Host1xDeviceFile().syncpts_accumulated;
    if (syncpts_accumulated.empty()) {
        channel_syncpoint = syncpoint_manager.AllocateSyncpoint(false);
@@ -96,24 +95,24 @@ NvResult nvhost_nvdec_common::Submit(IoctlSubmit& params, std::span<u8> data, De
    offset += SliceVectors(data, syncpt_increments, params.syncpoint_count, offset);
    offset += SliceVectors(data, fence_thresholds, params.fence_count, offset);

+    auto& gpu = system.GPU();
    auto* session = core.GetSession(sessions[fd]);

-    for (std::size_t i = 0; i < syncpt_increments.size(); i++) {
-        const SyncptIncr& syncpt_incr = syncpt_increments[i];
-        fence_thresholds[i] =
-            syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments);
+    if (gpu.UseNvdec()) {
+        for (std::size_t i = 0; i < syncpt_increments.size(); i++) {
+            const SyncptIncr& syncpt_incr = syncpt_increments[i];
+            fence_thresholds[i] =
+                syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments);
+        }
    }
-
    for (const auto& cmd_buffer : command_buffers) {
        const auto object = nvmap.GetHandle(cmd_buffer.memory_id);
        ASSERT_OR_EXECUTE(object, return NvResult::InvalidState;);
-        Core::Memory::CpuGuestMemory<Tegra::ChCommandHeader,
-                                     Core::Memory::GuestMemoryFlags::SafeRead>
-            cmdlist(session->process->GetMemory(), object->address + cmd_buffer.offset,
-                    cmd_buffer.word_count);
-        host1x.PushEntries(fd, std::move(cmdlist));
+        Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count);
+        session->process->GetMemory().ReadBlock(object->address + cmd_buffer.offset, cmdlist.data(),
+                                                cmdlist.size() * sizeof(u32));
+        gpu.PushCommandBuffer(core.Host1xDeviceFile().fd_to_id[fd], cmdlist);
    }
-
    // Some games expect command_buffers to be written back
    offset = 0;
    offset += WriteVectors(data, command_buffers, offset);
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
@@ -119,7 +119,6 @@ protected:

    Kernel::KEvent* QueryEvent(u32 event_id) override;

-    Tegra::Host1x::Host1x& host1x;
    u32 channel_syncpoint;
    s32_le nvmap_fd{};
    u32_le submit_timeout{};
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -7,7 +7,6 @@
 #include "core/hle/service/nvdrv/core/container.h"
 #include "core/hle/service/nvdrv/devices/ioctl_serialization.h"
 #include "core/hle/service/nvdrv/devices/nvhost_vic.h"
-#include "video_core/host1x/host1x.h"
 #include "video_core/renderer_base.h"

 namespace Service::Nvidia::Devices {
@@ -22,8 +21,13 @@ NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> inpu
    switch (command.group) {
    case 0x0:
        switch (command.cmd) {
-        case 0x1:
+        case 0x1: {
+            auto& host1x_file = core.Host1xDeviceFile();
+            if (!host1x_file.fd_to_id.contains(fd)) {
+                host1x_file.fd_to_id[fd] = host1x_file.vic_next_id++;
+            }
            return WrapFixedVariable(this, &nvhost_vic::Submit, input, output, fd);
+        }
        case 0x2:
            return WrapFixed(this, &nvhost_vic::GetSyncpoint, input, output);
        case 0x3:
@@ -66,11 +70,14 @@ NvResult nvhost_vic::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> inpu

 void nvhost_vic::OnOpen(NvCore::SessionId session_id, DeviceFD fd) {
    sessions[fd] = session_id;
-    host1x.StartDevice(fd, Tegra::Host1x::ChannelType::VIC, channel_syncpoint);
 }

 void nvhost_vic::OnClose(DeviceFD fd) {
-    host1x.StopDevice(fd, Tegra::Host1x::ChannelType::VIC);
+    auto& host1x_file = core.Host1xDeviceFile();
+    const auto iter = host1x_file.fd_to_id.find(fd); // present only if fd issued a Submit ioctl
+    if (iter != host1x_file.fd_to_id.end()) {
+        system.GPU().ClearCdmaInstance(iter->second); // the fd_to_id entry itself is kept
+    }
    sessions.erase(fd);
 }

--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -64,8 +64,6 @@ public:
    Memory(Memory&&) = default;
    Memory& operator=(Memory&&) = delete;

-    static constexpr bool HAS_FLUSH_INVALIDATION = false;
-
    /**
     * Resets the state of the Memory system.
     */
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -58,8 +58,8 @@ add_library(video_core STATIC
    framebuffer_config.h
    fsr.cpp
    fsr.h
-    host1x/codecs/decoder.cpp
-    host1x/codecs/decoder.h
+    host1x/codecs/codec.cpp
+    host1x/codecs/codec.h
    host1x/codecs/h264.cpp
    host1x/codecs/h264.h
    host1x/codecs/vp8.cpp
@@ -78,6 +78,8 @@ add_library(video_core STATIC
    host1x/nvdec.cpp
    host1x/nvdec.h
    host1x/nvdec_common.h
+    host1x/sync_manager.cpp
+    host1x/sync_manager.h
    host1x/syncpoint_manager.cpp
    host1x/syncpoint_manager.h
    host1x/vic.cpp
--- a/src/video_core/cdma_pusher.cpp
+++ b/src/video_core/cdma_pusher.cpp
@@ -2,130 +2,136 @@
 // SPDX-License-Identifier: MIT

 #include <bit>
-
-#include "common/thread.h"
-#include "core/core.h"
 #include "video_core/cdma_pusher.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/host1x/control.h"
 #include "video_core/host1x/host1x.h"
 #include "video_core/host1x/nvdec.h"
 #include "video_core/host1x/nvdec_common.h"
+#include "video_core/host1x/sync_manager.h"
 #include "video_core/host1x/vic.h"
 #include "video_core/memory_manager.h"

 namespace Tegra {
-
-CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_, s32 id)
-    : host1x{host1x_}, memory_manager{host1x.GMMU()},
-      host_processor{std::make_unique<Host1x::Control>(host1x_)}, current_class{
-                                                                      static_cast<ChClassId>(id)} {
-    thread = std::jthread([this](std::stop_token stop_token) { ProcessEntries(stop_token); });
-}
+CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_)
+    : host1x{host1x_}, nvdec_processor(std::make_shared<Host1x::Nvdec>(host1x)),
+      vic_processor(std::make_unique<Host1x::Vic>(host1x, nvdec_processor)),
+      host1x_processor(std::make_unique<Host1x::Control>(host1x)),
+      sync_manager(std::make_unique<Host1x::SyncptIncrManager>(host1x)) {}

 CDmaPusher::~CDmaPusher() = default;

-void CDmaPusher::ProcessEntries(std::stop_token stop_token) {
-    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
-    ChCommandHeaderList command_list{host1x.System().ApplicationMemory(), 0, 0};
-    u32 count{};
-    u32 method_offset{};
-    u32 mask{};
-    bool incrementing{};
-
-    while (!stop_token.stop_requested()) {
-        {
-            std::unique_lock l{command_mutex};
-            Common::CondvarWait(command_cv, l, stop_token,
-                                [this]() { return command_lists.size() > 0; });
-            if (stop_token.stop_requested()) {
-                return;
+void CDmaPusher::ProcessEntries(ChCommandHeaderList&& entries) {
+    for (const auto& value : entries) { // mask/count/offset are members, not reset per call
+        if (mask != 0) { // pending masked write: this word is the argument for the lowest set bit
+            const auto lbs = static_cast<u32>(std::countr_zero(mask));
+            mask &= ~(1U << lbs);
+            ExecuteCommand(offset + lbs, value.raw);
+            continue;
+        } else if (count != 0) { // pending run: this word is the next argument
+            --count;
+            ExecuteCommand(offset, value.raw);
+            if (incrementing) {
+                ++offset;
            }
-
-            command_list = std::move(command_lists.front());
-            command_lists.pop_front();
+            continue;
        }
-
-        size_t i = 0;
-        for (const auto value : command_list) {
-            i++;
-            if (mask != 0) {
-                const auto lbs = static_cast<u32>(std::countr_zero(mask));
-                mask &= ~(1U << lbs);
-                ExecuteCommand(method_offset + lbs, value.raw);
-                continue;
-            } else if (count != 0) {
-                --count;
-                ExecuteCommand(method_offset, value.raw);
-                if (incrementing) {
-                    ++method_offset;
-                }
-                continue;
-            }
-            const auto mode = value.submission_mode.Value();
-            switch (mode) {
-            case ChSubmissionMode::SetClass: {
-                mask = value.value & 0x3f;
-                method_offset = value.method_offset;
-                current_class = static_cast<ChClassId>((value.value >> 6) & 0x3ff);
-                break;
-            }
-            case ChSubmissionMode::Incrementing:
-            case ChSubmissionMode::NonIncrementing:
-                count = value.value;
-                method_offset = value.method_offset;
-                incrementing = mode == ChSubmissionMode::Incrementing;
-                break;
-            case ChSubmissionMode::Mask:
-                mask = value.value;
-                method_offset = value.method_offset;
-                break;
-            case ChSubmissionMode::Immediate: {
-                const u32 data = value.value & 0xfff;
-                method_offset = value.method_offset;
-                ExecuteCommand(method_offset, data);
-                break;
-            }
-            default:
-                LOG_ERROR(HW_GPU, "Bad command at index {} (bytes 0x{:X}), buffer size {}", i - 1,
-                          (i - 1) * sizeof(u32), command_list.size());
-                UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!",
-                                  static_cast<u32>(mode));
-                break;
-            }
+        const auto mode = value.submission_mode.Value(); // nothing pending: word is a header
+        switch (mode) {
+        case ChSubmissionMode::SetClass: {
+            mask = value.value & 0x3f; // low 6 bits: write mask; bits 6..15: class id
+            offset = value.method_offset;
+            current_class = static_cast<ChClassId>((value.value >> 6) & 0x3ff);
+            break;
+        }
+        case ChSubmissionMode::Incrementing:
+        case ChSubmissionMode::NonIncrementing:
+            count = value.value; // number of argument words that follow this header
+            offset = value.method_offset;
+            incrementing = mode == ChSubmissionMode::Incrementing;
+            break;
+        case ChSubmissionMode::Mask:
+            mask = value.value;
+            offset = value.method_offset;
+            break;
+        case ChSubmissionMode::Immediate: {
+            const u32 data = value.value & 0xfff; // argument is carried inside the header word
+            offset = value.method_offset;
+            ExecuteCommand(offset, data);
+            break;
+        }
+        default:
+            UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode));
+            break;
        }
    }
 }

-void CDmaPusher::ExecuteCommand(u32 method, u32 arg) {
+void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { // dispatches on current_class
    switch (current_class) {
-    case ChClassId::Control:
-        LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}",
-                  static_cast<u32>(current_class), method, arg);
-        host_processor->ProcessMethod(static_cast<Host1x::Control::Method>(method), arg);
-        break;
-    default:
-        thi_regs.reg_array[method] = arg;
-        switch (static_cast<ThiMethod>(method)) {
+    case ChClassId::NvDec: // use state_offset, not the member: mask-mode calls pass offset + bit
+        ThiStateWrite(nvdec_thi_state, state_offset, data);
+        switch (static_cast<ThiMethod>(state_offset)) {
        case ThiMethod::IncSyncpt: {
-            const auto syncpoint_id = static_cast<u32>(arg & 0xFF);
-            [[maybe_unused]] const auto cond = static_cast<u32>((arg >> 8) & 0xFF);
-            LOG_TRACE(Service_NVDRV, "Class {} IncSyncpt Method, syncpt {} cond {}",
-                      static_cast<u32>(current_class), syncpoint_id, cond);
-            auto& syncpoint_manager = host1x.GetSyncpointManager();
-            syncpoint_manager.IncrementGuest(syncpoint_id);
-            syncpoint_manager.IncrementHost(syncpoint_id);
+            LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method");
+            const auto syncpoint_id = static_cast<u32>(data & 0xFF);
+            const auto cond = static_cast<u32>((data >> 8) & 0xFF);
+            if (cond == 0) {
+                sync_manager->Increment(syncpoint_id);
+            } else {
+                sync_manager->SignalDone(
+                    sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id));
+            }
            break;
        }
        case ThiMethod::SetMethod1:
-            LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}",
-                      static_cast<u32>(current_class), static_cast<u32>(thi_regs.method_0), arg);
-            ProcessMethod(thi_regs.method_0, arg);
+            LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}",
+                      static_cast<u32>(nvdec_thi_state.method_0));
+            nvdec_processor->ProcessMethod(nvdec_thi_state.method_0, data);
            break;
        default:
            break;
        }
+        break;
+    case ChClassId::GraphicsVic:
+        ThiStateWrite(vic_thi_state, static_cast<u32>(state_offset), {data});
+        switch (static_cast<ThiMethod>(state_offset)) {
+        case ThiMethod::IncSyncpt: {
+            LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method");
+            const auto syncpoint_id = static_cast<u32>(data & 0xFF);
+            const auto cond = static_cast<u32>((data >> 8) & 0xFF);
+            if (cond == 0) {
+                sync_manager->Increment(syncpoint_id);
+            } else {
+                sync_manager->SignalDone(
+                    sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id));
+            }
+            break;
+        }
+        case ThiMethod::SetMethod1:
+            LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})",
+                      static_cast<u32>(vic_thi_state.method_0), data);
+            vic_processor->ProcessMethod(static_cast<Host1x::Vic::Method>(vic_thi_state.method_0),
+                                         data);
+            break;
+        default:
+            break;
+        }
+        break;
+    case ChClassId::Control:
+        // This device is mainly for syncpoint synchronization
+        LOG_DEBUG(Service_NVDRV, "Host1X Class Method");
+        host1x_processor->ProcessMethod(static_cast<Host1x::Control::Method>(state_offset), data);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class));
+        break;
    }
 }

+void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, u32 argument) {
+    if (state_offset >= sizeof(ThiRegisters) / sizeof(u32)) { return; } // guest offset out of range
+    u8* const offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset;
+    std::memcpy(offset_ptr, &argument, sizeof(u32)); // store the word at register index state_offset
+}
 } // namespace Tegra
--- a/src/video_core/cdma_pusher.h
+++ b/src/video_core/cdma_pusher.h
@@ -3,18 +3,12 @@

 #pragma once

-#include <condition_variable>
-#include <deque>
 #include <memory>
-#include <mutex>
-#include <thread>
 #include <vector>

 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
-#include "common/polyfill_thread.h"
-#include "core/memory.h"

 namespace Tegra {

@@ -68,31 +62,23 @@ struct ChCommand {
    std::vector<u32> arguments;
 };

-using ChCommandHeaderList =
-    Core::Memory::CpuGuestMemory<Tegra::ChCommandHeader, Core::Memory::GuestMemoryFlags::SafeRead>;
+using ChCommandHeaderList = std::vector<ChCommandHeader>;

 struct ThiRegisters {
-    static constexpr std::size_t NUM_REGS = 0x20;
-
-    union {
-        struct {
-            u32_le increment_syncpt;
-            INSERT_PADDING_WORDS_NOINIT(1);
-            u32_le increment_syncpt_error;
-            u32_le ctx_switch_incremement_syncpt;
-            INSERT_PADDING_WORDS_NOINIT(4);
-            u32_le ctx_switch;
-            INSERT_PADDING_WORDS_NOINIT(1);
-            u32_le ctx_syncpt_eof;
-            INSERT_PADDING_WORDS_NOINIT(5);
-            u32_le method_0;
-            u32_le method_1;
-            INSERT_PADDING_WORDS_NOINIT(12);
-            u32_le int_status;
-            u32_le int_mask;
-        };
-        std::array<u32, NUM_REGS> reg_array;
-    };
+    u32_le increment_syncpt{};
+    INSERT_PADDING_WORDS(1);
+    u32_le increment_syncpt_error{};
+    u32_le ctx_switch_incremement_syncpt{};
+    INSERT_PADDING_WORDS(4);
+    u32_le ctx_switch{};
+    INSERT_PADDING_WORDS(1);
+    u32_le ctx_syncpt_eof{};
+    INSERT_PADDING_WORDS(5);
+    u32_le method_0{};
+    u32_le method_1{};
+    INSERT_PADDING_WORDS(12);
+    u32_le int_status{};
+    u32_le int_mask{};
 };

 enum class ThiMethod : u32 {
@@ -103,39 +89,32 @@ enum class ThiMethod : u32 {

 class CDmaPusher {
 public:
-    CDmaPusher() = delete;
-    virtual ~CDmaPusher();
+    explicit CDmaPusher(Host1x::Host1x& host1x);
+    ~CDmaPusher();

-    void PushEntries(ChCommandHeaderList&& entries) {
-        std::scoped_lock l{command_mutex};
-        command_lists.push_back(std::move(entries));
-        command_cv.notify_one();
-    }
-
-protected:
-    explicit CDmaPusher(Host1x::Host1x& host1x, s32 id);
-
-    virtual void ProcessMethod(u32 method, u32 arg) = 0;
-
-    Host1x::Host1x& host1x;
-    Tegra::MemoryManager& memory_manager;
+    /// Process the command entry
+    void ProcessEntries(ChCommandHeaderList&& entries);

 private:
-    /// Process the command entry
-    void ProcessEntries(std::stop_token stop_token);
-
    /// Invoke command class devices to execute the command based on the current state
    void ExecuteCommand(u32 state_offset, u32 data);

-    std::unique_ptr<Host1x::Control> host_processor;
+    /// Write arguments value to the ThiRegisters member at the specified offset
+    void ThiStateWrite(ThiRegisters& state, u32 offset, u32 argument);

-    std::mutex command_mutex;
-    std::condition_variable_any command_cv;
-    std::deque<ChCommandHeaderList> command_lists;
-    std::jthread thread;
+    Host1x::Host1x& host1x;
+    std::shared_ptr<Tegra::Host1x::Nvdec> nvdec_processor;
+    std::unique_ptr<Tegra::Host1x::Vic> vic_processor;
+    std::unique_ptr<Tegra::Host1x::Control> host1x_processor;
+    std::unique_ptr<Host1x::SyncptIncrManager> sync_manager;
+    ChClassId current_class{};
+    ThiRegisters vic_thi_state{};
+    ThiRegisters nvdec_thi_state{};

-    ThiRegisters thi_regs{};
-    ChClassId current_class;
+    u32 count{};
+    u32 offset{};
+    u32 mask{};
+    bool incrementing{};
 };

 } // namespace Tegra
--- a/src/video_core/engines/sw_blitter/blitter.cpp
+++ b/src/video_core/engines/sw_blitter/blitter.cpp
@@ -111,6 +111,20 @@ void Bilinear(std::span<const f32> input, std::span<f32> output, size_t src_widt
    }
 }

+template <bool unpack> // true: tightly packed rows -> pitched surface; false: the reverse
+void ProcessPitchLinear(std::span<const u8> input, std::span<u8> output, size_t extent_x,
+                        size_t extent_y, u32 pitch, u32 x0, u32 y0, size_t bpp) {
+    const size_t row_bytes = extent_x * bpp; // bytes copied per row
+    const size_t column_bias = x0 * bpp;     // byte offset of the first column inside a pitched row
+    for (size_t row = 0; row < extent_y; ++row) {
+        const size_t pitched_pos = (row + y0) * pitch + column_bias;
+        const size_t packed_pos = row * row_bytes;
+        const size_t src_pos = unpack ? packed_pos : pitched_pos;
+        const size_t dst_pos = unpack ? pitched_pos : packed_pos;
+        std::memcpy(&output[dst_pos], &input[src_pos], row_bytes);
+    }
+}
+
 } // namespace

 struct SoftwareBlitEngine::BlitEngineImpl {
@@ -138,19 +152,6 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
        }
        return static_cast<size_t>(surface.pitch * surface.height);
    };
-    const auto process_pitch_linear = [](bool unpack, std::span<const u8> input,
-                                         std::span<u8> output, u32 extent_x, u32 extent_y,
-                                         u32 pitch, u32 x0, u32 y0, size_t bpp) {
-        const size_t base_offset = x0 * bpp;
-        const size_t copy_size = extent_x * bpp;
-        for (u32 y = y0; y < extent_y; y++) {
-            const size_t first_offset = y * pitch + base_offset;
-            const size_t second_offset = y * extent_x * bpp;
-            u8* write_to = unpack ? &output[first_offset] : &output[second_offset];
-            const u8* read_from = unpack ? &input[second_offset] : &input[first_offset];
-            std::memcpy(write_to, read_from, copy_size);
-        }
-    };

    const u32 src_extent_x = config.src_x1 - config.src_x0;
    const u32 src_extent_y = config.src_y1 - config.src_y0;
@@ -205,8 +206,8 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
                         src.depth, config.src_x0, config.src_y0, src_extent_x, src_extent_y,
                         src.block_height, src.block_depth, src_extent_x * src_bytes_per_pixel);
    } else {
-        process_pitch_linear(false, tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y,
-                             src.pitch, config.src_x0, config.src_y0, src_bytes_per_pixel);
+        ProcessPitchLinear<false>(tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y,
+                                  src.pitch, config.src_x0, config.src_y0, src_bytes_per_pixel);
    }

    // Conversion Phase
@@ -229,9 +230,9 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
                       dst.depth, config.dst_x0, config.dst_y0, dst_extent_x, dst_extent_y,
                       dst.block_height, dst.block_depth, dst_extent_x * dst_bytes_per_pixel);
    } else {
-        process_pitch_linear(true, impl->dst_buffer, tmp_buffer2, dst_extent_x, dst_extent_y,
-                             dst.pitch, config.dst_x0, config.dst_y0,
-                             static_cast<size_t>(dst_bytes_per_pixel));
+        ProcessPitchLinear<true>(impl->dst_buffer, tmp_buffer2, dst_extent_x, dst_extent_y,
+                                 dst.pitch, config.dst_x0, config.dst_y0,
+                                 static_cast<size_t>(dst_bytes_per_pixel));
    }
    return true;
 }
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -250,6 +250,30 @@ struct GPU::Impl {
        gpu_thread.SubmitList(channel, std::move(entries));
    }

+    /// Push GPU command buffer entries to be processed
+    void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) {
+        if (!use_nvdec) {
+            return;
+        }
+
+        if (!cdma_pushers.contains(id)) {
+            cdma_pushers.insert_or_assign(id, std::make_unique<Tegra::CDmaPusher>(host1x));
+        }
+
+        // SubmitCommandBuffer would make the nvdec operations async, but that is not working yet.
+        // TODO(ameerj): RE proper async nvdec operation
+        // gpu_thread.SubmitCommandBuffer(std::move(entries));
+        cdma_pushers[id]->ProcessEntries(std::move(entries));
+    }
+
+    /// Destroys the CDmaPusher instance for the given id, if one exists, to free up resources
+    void ClearCdmaInstance(u32 id) {
+        const auto iter = cdma_pushers.find(id);
+        if (iter != cdma_pushers.end()) {
+            cdma_pushers.erase(iter);
+        }
+    }
+
    /// Swap buffers (render frame)
    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
        gpu_thread.SwapBuffers(framebuffer);
@@ -332,6 +356,7 @@ struct GPU::Impl {
    Core::System& system;
    Host1x::Host1x& host1x;

+    std::map<u32, std::unique_ptr<Tegra::CDmaPusher>> cdma_pushers;
    std::unique_ptr<VideoCore::RendererBase> renderer;
    VideoCore::RasterizerInterface* rasterizer = nullptr;
    const bool use_nvdec;
@@ -521,6 +546,14 @@ void GPU::PushGPUEntries(s32 channel, Tegra::CommandList&& entries) {
    impl->PushGPUEntries(channel, std::move(entries));
 }

+void GPU::PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) {
+    impl->PushCommandBuffer(id, entries);
+}
+
+void GPU::ClearCdmaInstance(u32 id) {
+    impl->ClearCdmaInstance(id);
+}
+
 void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
    impl->SwapBuffers(framebuffer);
 }
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -232,6 +232,12 @@ public:
    /// Push GPU command entries to be processed
    void PushGPUEntries(s32 channel, Tegra::CommandList&& entries);

+    /// Push GPU command buffer entries to be processed
+    void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries);
+
+    /// Destroys the CDmaPusher instance for the given id to free up resources
+    void ClearCdmaInstance(u32 id);
+
    /// Swap buffers (render frame)
    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);

--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -12,7 +12,6 @@
 #include "video_core/dma_pusher.h"
 #include "video_core/gpu.h"
 #include "video_core/gpu_thread.h"
-#include "video_core/host1x/host1x.h"
 #include "video_core/renderer_base.h"

 namespace VideoCommon::GPUThread {
--- a/src/video_core/host1x/codecs/h264.cpp
+++ b/src/video_core/host1x/codecs/h264.cpp
@@ -10,7 +10,7 @@
 #include "video_core/host1x/host1x.h"
 #include "video_core/memory_manager.h"

-namespace Tegra::Decoders {
+namespace Tegra::Decoder {
 namespace {
 // ZigZag LUTs from libavcodec.
 constexpr std::array<u8, 64> zig_zag_direct{
@@ -25,55 +25,23 @@ constexpr std::array<u8, 16> zig_zag_scan{
 };
 } // Anonymous namespace

-H264::H264(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_)
-    : Decoder{host1x_, id_, regs_} {
-    codec = Host1x::NvdecCommon::VideoCodec::H264;
-    initialized = decode_api.Initialize(codec);
-}
+H264::H264(Host1x::Host1x& host1x_) : host1x{host1x_} {}

 H264::~H264() = default;

-std::tuple<u64, u64> H264::GetProgressiveOffsets() {
-    auto pic_idx{current_context.h264_parameter_set.curr_pic_idx};
-    auto luma{regs.surface_luma_offsets[pic_idx].Address() +
-              current_context.h264_parameter_set.luma_frame_offset.Address()};
-    auto chroma{regs.surface_chroma_offsets[pic_idx].Address() +
-                current_context.h264_parameter_set.chroma_frame_offset.Address()};
-    return {luma, chroma};
-}
+std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state,
+                                       size_t* out_configuration_size, bool is_first_frame) {
+    H264DecoderContext context;
+    host1x.GMMU().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext));

-std::tuple<u64, u64, u64, u64> H264::GetInterlacedOffsets() {
-    auto pic_idx{current_context.h264_parameter_set.curr_pic_idx};
-    auto luma_top{regs.surface_luma_offsets[pic_idx].Address() +
-                  current_context.h264_parameter_set.luma_top_offset.Address()};
-    auto luma_bottom{regs.surface_luma_offsets[pic_idx].Address() +
-                     current_context.h264_parameter_set.luma_bot_offset.Address()};
-    auto chroma_top{regs.surface_chroma_offsets[pic_idx].Address() +
-                    current_context.h264_parameter_set.chroma_top_offset.Address()};
-    auto chroma_bottom{regs.surface_chroma_offsets[pic_idx].Address() +
-                       current_context.h264_parameter_set.chroma_bot_offset.Address()};
-    return {luma_top, luma_bottom, chroma_top, chroma_bottom};
-}
-
-bool H264::IsInterlaced() {
-    return current_context.h264_parameter_set.luma_top_offset.Address() != 0 ||
-           current_context.h264_parameter_set.luma_bot_offset.Address() != 0;
-}
-
-std::span<const u8> H264::ComposeFrame() {
-    memory_manager.ReadBlock(regs.picture_info_offset.Address(), &current_context,
-                             sizeof(H264DecoderContext));
-
-    const s64 frame_number = current_context.h264_parameter_set.frame_number.Value();
+    const s64 frame_number = context.h264_parameter_set.frame_number.Value();
    if (!is_first_frame && frame_number != 0) {
-        frame_scratch.resize_destructive(current_context.stream_len);
-        memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), frame_scratch.data(),
-                                 frame_scratch.size());
-        return frame_scratch;
+        frame.resize_destructive(context.stream_len);
+        host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
+        *out_configuration_size = 0;
+        return frame;
    }

-    is_first_frame = false;
-
    // Encode header
    H264BitWriter writer{};
    writer.WriteU(1, 24);
@@ -85,7 +53,7 @@ std::span<const u8> H264::ComposeFrame() {
    writer.WriteU(31, 8);
    writer.WriteUe(0);
    const u32 chroma_format_idc =
-        static_cast<u32>(current_context.h264_parameter_set.chroma_format_idc.Value());
+        static_cast<u32>(context.h264_parameter_set.chroma_format_idc.Value());
    writer.WriteUe(chroma_format_idc);
    if (chroma_format_idc == 3) {
        writer.WriteBit(false);
@@ -93,44 +61,42 @@ std::span<const u8> H264::ComposeFrame() {

    writer.WriteUe(0);
    writer.WriteUe(0);
-    writer.WriteBit(current_context.qpprime_y_zero_transform_bypass_flag.Value() != 0);
+    writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag
    writer.WriteBit(false); // Scaling matrix present flag

-    writer.WriteUe(
-        static_cast<u32>(current_context.h264_parameter_set.log2_max_frame_num_minus4.Value()));
+    writer.WriteUe(static_cast<u32>(context.h264_parameter_set.log2_max_frame_num_minus4.Value()));

    const auto order_cnt_type =
-        static_cast<u32>(current_context.h264_parameter_set.pic_order_cnt_type.Value());
+        static_cast<u32>(context.h264_parameter_set.pic_order_cnt_type.Value());
    writer.WriteUe(order_cnt_type);
    if (order_cnt_type == 0) {
-        writer.WriteUe(current_context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4);
+        writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4);
    } else if (order_cnt_type == 1) {
-        writer.WriteBit(current_context.h264_parameter_set.delta_pic_order_always_zero_flag != 0);
+        writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0);

        writer.WriteSe(0);
        writer.WriteSe(0);
        writer.WriteUe(0);
    }

-    const s32 pic_height = current_context.h264_parameter_set.frame_height_in_mbs /
-                           (current_context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
+    const s32 pic_height = context.h264_parameter_set.frame_height_in_map_units /
+                           (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);

-    u32 max_num_ref_frames =
-        std::max(std::max(current_context.h264_parameter_set.num_refidx_l0_default_active,
-                          current_context.h264_parameter_set.num_refidx_l1_default_active) +
-                     1,
-                 4);
+    // TODO(ameerj): Where do we get this number? It seems to be specific to each stream.
+    const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue();
+    const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::Gpu;
+    const u32 max_num_ref_frames = uses_gpu_decoding ? 6u : 16u;
    writer.WriteUe(max_num_ref_frames);
    writer.WriteBit(false);
-    writer.WriteUe(current_context.h264_parameter_set.pic_width_in_mbs - 1);
+    writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1);
    writer.WriteUe(pic_height - 1);
-    writer.WriteBit(current_context.h264_parameter_set.frame_mbs_only_flag != 0);
+    writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0);

-    if (!current_context.h264_parameter_set.frame_mbs_only_flag) {
-        writer.WriteBit(current_context.h264_parameter_set.flags.mbaff_frame.Value() != 0);
+    if (!context.h264_parameter_set.frame_mbs_only_flag) {
+        writer.WriteBit(context.h264_parameter_set.flags.mbaff_frame.Value() != 0);
    }

-    writer.WriteBit(current_context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0);
+    writer.WriteBit(context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0);
    writer.WriteBit(false); // Frame cropping flag
    writer.WriteBit(false); // VUI parameter present flag

@@ -145,59 +111,57 @@ std::span<const u8> H264::ComposeFrame() {
    writer.WriteUe(0);
    writer.WriteUe(0);

-    writer.WriteBit(current_context.h264_parameter_set.entropy_coding_mode_flag != 0);
-    writer.WriteBit(current_context.h264_parameter_set.pic_order_present_flag != 0);
+    writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0);
+    writer.WriteBit(context.h264_parameter_set.pic_order_present_flag != 0);
    writer.WriteUe(0);
-    writer.WriteUe(current_context.h264_parameter_set.num_refidx_l0_default_active);
-    writer.WriteUe(current_context.h264_parameter_set.num_refidx_l1_default_active);
-    writer.WriteBit(current_context.h264_parameter_set.flags.weighted_pred.Value() != 0);
-    writer.WriteU(static_cast<s32>(current_context.h264_parameter_set.weighted_bipred_idc.Value()),
-                  2);
-    s32 pic_init_qp =
-        static_cast<s32>(current_context.h264_parameter_set.pic_init_qp_minus26.Value());
+    writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active);
+    writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active);
+    writer.WriteBit(context.h264_parameter_set.flags.weighted_pred.Value() != 0);
+    writer.WriteU(static_cast<s32>(context.h264_parameter_set.weighted_bipred_idc.Value()), 2);
+    s32 pic_init_qp = static_cast<s32>(context.h264_parameter_set.pic_init_qp_minus26.Value());
    writer.WriteSe(pic_init_qp);
    writer.WriteSe(0);
    s32 chroma_qp_index_offset =
-        static_cast<s32>(current_context.h264_parameter_set.chroma_qp_index_offset.Value());
+        static_cast<s32>(context.h264_parameter_set.chroma_qp_index_offset.Value());

    writer.WriteSe(chroma_qp_index_offset);
-    writer.WriteBit(current_context.h264_parameter_set.deblocking_filter_control_present_flag != 0);
-    writer.WriteBit(current_context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0);
-    writer.WriteBit(current_context.h264_parameter_set.redundant_pic_cnt_present_flag != 0);
-    writer.WriteBit(current_context.h264_parameter_set.transform_8x8_mode_flag != 0);
+    writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_present_flag != 0);
+    writer.WriteBit(context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0);
+    writer.WriteBit(context.h264_parameter_set.redundant_pic_cnt_present_flag != 0);
+    writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0);

    writer.WriteBit(true); // pic_scaling_matrix_present_flag

    for (s32 index = 0; index < 6; index++) {
        writer.WriteBit(true);
-        std::span<const u8> matrix{current_context.weight_scale_4x4};
-        writer.WriteScalingList(scan_scratch, matrix, index * 16, 16);
+        std::span<const u8> matrix{context.weight_scale};
+        writer.WriteScalingList(scan, matrix, index * 16, 16);
    }

-    if (current_context.h264_parameter_set.transform_8x8_mode_flag) {
+    if (context.h264_parameter_set.transform_8x8_mode_flag) {
        for (s32 index = 0; index < 2; index++) {
            writer.WriteBit(true);
-            std::span<const u8> matrix{current_context.weight_scale_8x8};
-            writer.WriteScalingList(scan_scratch, matrix, index * 64, 64);
+            std::span<const u8> matrix{context.weight_scale_8x8};
+            writer.WriteScalingList(scan, matrix, index * 64, 64);
        }
    }

    s32 chroma_qp_index_offset2 =
-        static_cast<s32>(current_context.h264_parameter_set.second_chroma_qp_index_offset.Value());
+        static_cast<s32>(context.h264_parameter_set.second_chroma_qp_index_offset.Value());

    writer.WriteSe(chroma_qp_index_offset2);

    writer.End();

    const auto& encoded_header = writer.GetByteArray();
-    frame_scratch.resize(encoded_header.size() + current_context.stream_len);
-    std::memcpy(frame_scratch.data(), encoded_header.data(), encoded_header.size());
+    frame.resize(encoded_header.size() + context.stream_len);
+    std::memcpy(frame.data(), encoded_header.data(), encoded_header.size());

-    memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(),
-                             frame_scratch.data() + encoded_header.size(),
-                             current_context.stream_len);
+    *out_configuration_size = encoded_header.size();
+    host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data() + encoded_header.size(),
+                            context.stream_len);

-    return frame_scratch;
+    return frame;
 }

 H264BitWriter::H264BitWriter() = default;
@@ -314,4 +278,4 @@ void H264BitWriter::Flush() {
    buffer = 0;
    buffer_pos = 0;
 }
-} // namespace Tegra::Decoders
+} // namespace Tegra::Decoder
--- a/src/video_core/host1x/codecs/h264.h
+++ b/src/video_core/host1x/codecs/h264.h
@@ -10,7 +10,6 @@
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/scratch_buffer.h"
-#include "video_core/host1x/codecs/decoder.h"
 #include "video_core/host1x/nvdec_common.h"

 namespace Tegra {
@@ -19,7 +18,7 @@ namespace Host1x {
 class Host1x;
 } // namespace Host1x

-namespace Decoders {
+namespace Decoder {

 class H264BitWriter {
 public:
@@ -61,212 +60,123 @@ private:
    std::vector<u8> byte_array;
 };

-struct Offset {
-    constexpr u32 Address() const noexcept {
-        return offset << 8;
-    }
+class H264 {
+public:
+    explicit H264(Host1x::Host1x& host1x);
+    ~H264();
+
+    /// Compose the H264 frame for FFmpeg decoding
+    [[nodiscard]] std::span<const u8> ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state,
+                                                   size_t* out_configuration_size,
+                                                   bool is_first_frame = false);

 private:
-    u32 offset;
-};
-static_assert(std::is_trivial_v<Offset>, "Offset must be trivial");
-static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!");
+    Common::ScratchBuffer<u8> frame;
+    Common::ScratchBuffer<u8> scan;
+    Host1x::Host1x& host1x;

-struct H264ParameterSet {
-    s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00
-    s32 delta_pic_order_always_zero_flag;  ///< 0x04
-    s32 frame_mbs_only_flag;               ///< 0x08
-    u32 pic_width_in_mbs;                  ///< 0x0C
-    u32 frame_height_in_mbs;               ///< 0x10
-    union {                                ///< 0x14
-        BitField<0, 2, u32> tile_format;
-        BitField<2, 3, u32> gob_height;
-        BitField<5, 27, u32> reserved_surface_format;
+    struct H264ParameterSet {
+        s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00
+        s32 delta_pic_order_always_zero_flag;  ///< 0x04
+        s32 frame_mbs_only_flag;               ///< 0x08
+        u32 pic_width_in_mbs;                  ///< 0x0C
+        u32 frame_height_in_map_units;         ///< 0x10
+        union {                                ///< 0x14
+            BitField<0, 2, u32> tile_format;
+            BitField<2, 3, u32> gob_height;
+        };
+        u32 entropy_coding_mode_flag;               ///< 0x18
+        s32 pic_order_present_flag;                 ///< 0x1C
+        s32 num_refidx_l0_default_active;           ///< 0x20
+        s32 num_refidx_l1_default_active;           ///< 0x24
+        s32 deblocking_filter_control_present_flag; ///< 0x28
+        s32 redundant_pic_cnt_present_flag;         ///< 0x2C
+        u32 transform_8x8_mode_flag;                ///< 0x30
+        u32 pitch_luma;                             ///< 0x34
+        u32 pitch_chroma;                           ///< 0x38
+        u32 luma_top_offset;                        ///< 0x3C
+        u32 luma_bot_offset;                        ///< 0x40
+        u32 luma_frame_offset;                      ///< 0x44
+        u32 chroma_top_offset;                      ///< 0x48
+        u32 chroma_bot_offset;                      ///< 0x4C
+        u32 chroma_frame_offset;                    ///< 0x50
+        u32 hist_buffer_size;                       ///< 0x54
+        union {                                     ///< 0x58
+            union {
+                BitField<0, 1, u64> mbaff_frame;
+                BitField<1, 1, u64> direct_8x8_inference;
+                BitField<2, 1, u64> weighted_pred;
+                BitField<3, 1, u64> constrained_intra_pred;
+                BitField<4, 1, u64> ref_pic;
+                BitField<5, 1, u64> field_pic;
+                BitField<6, 1, u64> bottom_field;
+                BitField<7, 1, u64> second_field;
+            } flags;
+            BitField<8, 4, u64> log2_max_frame_num_minus4;
+            BitField<12, 2, u64> chroma_format_idc;
+            BitField<14, 2, u64> pic_order_cnt_type;
+            BitField<16, 6, s64> pic_init_qp_minus26;
+            BitField<22, 5, s64> chroma_qp_index_offset;
+            BitField<27, 5, s64> second_chroma_qp_index_offset;
+            BitField<32, 2, u64> weighted_bipred_idc;
+            BitField<34, 7, u64> curr_pic_idx;
+            BitField<41, 5, u64> curr_col_idx;
+            BitField<46, 16, u64> frame_number;
+            BitField<62, 1, u64> frame_surfaces;
+            BitField<63, 1, u64> output_memory_layout;
+        };
    };
-    u32 entropy_coding_mode_flag;               ///< 0x18
-    s32 pic_order_present_flag;                 ///< 0x1C
-    s32 num_refidx_l0_default_active;           ///< 0x20
-    s32 num_refidx_l1_default_active;           ///< 0x24
-    s32 deblocking_filter_control_present_flag; ///< 0x28
-    s32 redundant_pic_cnt_present_flag;         ///< 0x2C
-    u32 transform_8x8_mode_flag;                ///< 0x30
-    u32 pitch_luma;                             ///< 0x34
-    u32 pitch_chroma;                           ///< 0x38
-    Offset luma_top_offset;                     ///< 0x3C
-    Offset luma_bot_offset;                     ///< 0x40
-    Offset luma_frame_offset;                   ///< 0x44
-    Offset chroma_top_offset;                   ///< 0x48
-    Offset chroma_bot_offset;                   ///< 0x4C
-    Offset chroma_frame_offset;                 ///< 0x50
-    u32 hist_buffer_size;                       ///< 0x54
-    union {                                     ///< 0x58
-        union {
-            BitField<0, 1, u64> mbaff_frame;
-            BitField<1, 1, u64> direct_8x8_inference;
-            BitField<2, 1, u64> weighted_pred;
-            BitField<3, 1, u64> constrained_intra_pred;
-            BitField<4, 1, u64> ref_pic;
-            BitField<5, 1, u64> field_pic;
-            BitField<6, 1, u64> bottom_field;
-            BitField<7, 1, u64> second_field;
-        } flags;
-        BitField<8, 4, u64> log2_max_frame_num_minus4;
-        BitField<12, 2, u64> chroma_format_idc;
-        BitField<14, 2, u64> pic_order_cnt_type;
-        BitField<16, 6, s64> pic_init_qp_minus26;
-        BitField<22, 5, s64> chroma_qp_index_offset;
-        BitField<27, 5, s64> second_chroma_qp_index_offset;
-        BitField<32, 2, u64> weighted_bipred_idc;
-        BitField<34, 7, u64> curr_pic_idx;
-        BitField<41, 5, u64> curr_col_idx;
-        BitField<46, 16, u64> frame_number;
-        BitField<62, 1, u64> frame_surfaces;
-        BitField<63, 1, u64> output_memory_layout;
+    static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size");
+
+    struct H264DecoderContext {
+        INSERT_PADDING_WORDS_NOINIT(18);       ///< 0x0000
+        u32 stream_len;                        ///< 0x0048
+        INSERT_PADDING_WORDS_NOINIT(3);        ///< 0x004C
+        H264ParameterSet h264_parameter_set;   ///< 0x0058
+        INSERT_PADDING_WORDS_NOINIT(66);       ///< 0x00B8
+        std::array<u8, 0x60> weight_scale;     ///< 0x01C0
+        std::array<u8, 0x80> weight_scale_8x8; ///< 0x0220
    };
-};
-static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size");
+    static_assert(sizeof(H264DecoderContext) == 0x2A0, "H264DecoderContext is an invalid size");

 #define ASSERT_POSITION(field_name, position)                                                      \
    static_assert(offsetof(H264ParameterSet, field_name) == position,                              \
                  "Field " #field_name " has invalid position")

-ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00);
-ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04);
-ASSERT_POSITION(frame_mbs_only_flag, 0x08);
-ASSERT_POSITION(pic_width_in_mbs, 0x0C);
-ASSERT_POSITION(frame_height_in_mbs, 0x10);
-ASSERT_POSITION(tile_format, 0x14);
-ASSERT_POSITION(entropy_coding_mode_flag, 0x18);
-ASSERT_POSITION(pic_order_present_flag, 0x1C);
-ASSERT_POSITION(num_refidx_l0_default_active, 0x20);
-ASSERT_POSITION(num_refidx_l1_default_active, 0x24);
-ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28);
-ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C);
-ASSERT_POSITION(transform_8x8_mode_flag, 0x30);
-ASSERT_POSITION(pitch_luma, 0x34);
-ASSERT_POSITION(pitch_chroma, 0x38);
-ASSERT_POSITION(luma_top_offset, 0x3C);
-ASSERT_POSITION(luma_bot_offset, 0x40);
-ASSERT_POSITION(luma_frame_offset, 0x44);
-ASSERT_POSITION(chroma_top_offset, 0x48);
-ASSERT_POSITION(chroma_bot_offset, 0x4C);
-ASSERT_POSITION(chroma_frame_offset, 0x50);
-ASSERT_POSITION(hist_buffer_size, 0x54);
-ASSERT_POSITION(flags, 0x58);
+    ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00);
+    ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04);
+    ASSERT_POSITION(frame_mbs_only_flag, 0x08);
+    ASSERT_POSITION(pic_width_in_mbs, 0x0C);
+    ASSERT_POSITION(frame_height_in_map_units, 0x10);
+    ASSERT_POSITION(tile_format, 0x14);
+    ASSERT_POSITION(entropy_coding_mode_flag, 0x18);
+    ASSERT_POSITION(pic_order_present_flag, 0x1C);
+    ASSERT_POSITION(num_refidx_l0_default_active, 0x20);
+    ASSERT_POSITION(num_refidx_l1_default_active, 0x24);
+    ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28);
+    ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C);
+    ASSERT_POSITION(transform_8x8_mode_flag, 0x30);
+    ASSERT_POSITION(pitch_luma, 0x34);
+    ASSERT_POSITION(pitch_chroma, 0x38);
+    ASSERT_POSITION(luma_top_offset, 0x3C);
+    ASSERT_POSITION(luma_bot_offset, 0x40);
+    ASSERT_POSITION(luma_frame_offset, 0x44);
+    ASSERT_POSITION(chroma_top_offset, 0x48);
+    ASSERT_POSITION(chroma_bot_offset, 0x4C);
+    ASSERT_POSITION(chroma_frame_offset, 0x50);
+    ASSERT_POSITION(hist_buffer_size, 0x54);
+    ASSERT_POSITION(flags, 0x58);
 #undef ASSERT_POSITION

-struct DpbEntry {
-    union {
-        BitField<0, 7, u32> index;
-        BitField<7, 5, u32> col_idx;
-        BitField<12, 2, u32> state;
-        BitField<14, 1, u32> is_long_term;
-        BitField<15, 1, u32> non_existing;
-        BitField<16, 1, u32> is_field;
-        BitField<17, 4, u32> top_field_marking;
-        BitField<21, 4, u32> bottom_field_marking;
-        BitField<25, 1, u32> output_memory_layout;
-        BitField<26, 6, u32> reserved;
-    } flags;
-    std::array<u32, 2> field_order_cnt;
-    u32 frame_idx;
-};
-static_assert(sizeof(DpbEntry) == 0x10, "DpbEntry has the wrong size!");
-
-struct DisplayParam {
-    union {
-        BitField<0, 1, u32> enable_tf_output;
-        BitField<1, 1, u32> vc1_map_y_flag;
-        BitField<2, 3, u32> map_y_value;
-        BitField<5, 1, u32> vc1_map_uv_flag;
-        BitField<6, 3, u32> map_uv_value;
-        BitField<9, 8, u32> out_stride;
-        BitField<17, 3, u32> tiling_format;
-        BitField<20, 1, u32> output_structure; // 0=frame, 1=field
-        BitField<21, 11, u32> reserved0;
-    };
-    std::array<s32, 2> output_top;
-    std::array<s32, 2> output_bottom;
-    union {
-        BitField<0, 1, u32> enable_histogram;
-        BitField<1, 12, u32> histogram_start_x;
-        BitField<13, 12, u32> histogram_start_y;
-        BitField<25, 7, u32> reserved1;
-    };
-    union {
-        BitField<0, 12, u32> histogram_end_x;
-        BitField<12, 12, u32> histogram_end_y;
-        BitField<24, 8, u32> reserved2;
-    };
-};
-static_assert(sizeof(DisplayParam) == 0x1C, "DisplayParam has the wrong size!");
-
-struct H264DecoderContext {
-    INSERT_PADDING_WORDS_NOINIT(13);                        ///< 0x0000
-    std::array<u8, 16> eos;                                 ///< 0x0034
-    u8 explicit_eos_present_flag;                           ///< 0x0044
-    u8 hint_dump_en;                                        ///< 0x0045
-    INSERT_PADDING_BYTES_NOINIT(2);                         ///< 0x0046
-    u32 stream_len;                                         ///< 0x0048
-    u32 slice_count;                                        ///< 0x004C
-    u32 mbhist_buffer_size;                                 ///< 0x0050
-    u32 gptimer_timeout_value;                              ///< 0x0054
-    H264ParameterSet h264_parameter_set;                    ///< 0x0058
-    std::array<s32, 2> curr_field_order_cnt;                ///< 0x00B8
-    std::array<DpbEntry, 16> dpb;                           ///< 0x00C0
-    std::array<u8, 0x60> weight_scale_4x4;                  ///< 0x01C0
-    std::array<u8, 0x80> weight_scale_8x8;                  ///< 0x0220
-    std::array<u8, 2> num_inter_view_refs_lX;               ///< 0x02A0
-    std::array<u8, 14> reserved2;                           ///< 0x02A2
-    std::array<std::array<s8, 16>, 2> inter_view_refidx_lX; ///< 0x02B0
-    union {                                                 ///< 0x02D0
-        BitField<0, 1, u32> lossless_ipred8x8_filter_enable;
-        BitField<1, 1, u32> qpprime_y_zero_transform_bypass_flag;
-        BitField<2, 30, u32> reserved3;
-    };
-    DisplayParam display_param;   ///< 0x02D4
-    std::array<u32, 3> reserved4; ///< 0x02F0
-};
-static_assert(sizeof(H264DecoderContext) == 0x2FC, "H264DecoderContext is an invalid size");
-
 #define ASSERT_POSITION(field_name, position)                                                      \
    static_assert(offsetof(H264DecoderContext, field_name) == position,                            \
                  "Field " #field_name " has invalid position")

-ASSERT_POSITION(stream_len, 0x48);
-ASSERT_POSITION(h264_parameter_set, 0x58);
-ASSERT_POSITION(dpb, 0xC0);
-ASSERT_POSITION(weight_scale_4x4, 0x1C0);
+    ASSERT_POSITION(stream_len, 0x48);
+    ASSERT_POSITION(h264_parameter_set, 0x58);
+    ASSERT_POSITION(weight_scale, 0x1C0);
 #undef ASSERT_POSITION
-
-class H264 final : public Decoder {
-public:
-    explicit H264(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id);
-    ~H264() override;
-
-    H264(const H264&) = delete;
-    H264& operator=(const H264&) = delete;
-
-    H264(H264&&) = delete;
-    H264& operator=(H264&&) = delete;
-
-    /// Compose the H264 frame for FFmpeg decoding
-    [[nodiscard]] std::span<const u8> ComposeFrame() override;
-
-    std::tuple<u64, u64> GetProgressiveOffsets() override;
-    std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() override;
-    bool IsInterlaced() override;
-
-    std::string_view GetCurrentCodecName() const override {
-        return "H264";
-    }
-
-private:
-    bool is_first_frame{true};
-    Common::ScratchBuffer<u8> frame_scratch;
-    Common::ScratchBuffer<u8> scan_scratch;
-    H264DecoderContext current_context{};
 };

-} // namespace Decoders
+} // namespace Decoder
 } // namespace Tegra
--- a/src/video_core/host1x/codecs/vp8.cpp
+++ b/src/video_core/host1x/codecs/vp8.cpp
@@ -7,69 +7,47 @@
 #include "video_core/host1x/host1x.h"
 #include "video_core/memory_manager.h"

-namespace Tegra::Decoders {
-VP8::VP8(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_)
-    : Decoder{host1x_, id_, regs_} {
-    codec = Host1x::NvdecCommon::VideoCodec::VP8;
-    initialized = decode_api.Initialize(codec);
-}
+namespace Tegra::Decoder {
+VP8::VP8(Host1x::Host1x& host1x_) : host1x{host1x_} {}

 VP8::~VP8() = default;

-std::tuple<u64, u64> VP8::GetProgressiveOffsets() {
-    auto luma{regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
-    auto chroma{regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
-    return {luma, chroma};
-}
+std::span<const u8> VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
+    VP8PictureInfo info;
+    host1x.GMMU().ReadBlock(state.picture_info_offset, &info, sizeof(VP8PictureInfo));

-std::tuple<u64, u64, u64, u64> VP8::GetInterlacedOffsets() {
-    auto luma_top{regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
-    auto luma_bottom{
-        regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
-    auto chroma_top{
-        regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
-    auto chroma_bottom{
-        regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
-    return {luma_top, luma_bottom, chroma_top, chroma_bottom};
-}
-
-std::span<const u8> VP8::ComposeFrame() {
-    memory_manager.ReadBlock(regs.picture_info_offset.Address(), &current_context,
-                             sizeof(VP8PictureInfo));
-
-    const bool is_key_frame = current_context.key_frame == 1u;
-    const auto bitstream_size = static_cast<size_t>(current_context.vld_buffer_size);
+    const bool is_key_frame = info.key_frame == 1u;
+    const auto bitstream_size = static_cast<size_t>(info.vld_buffer_size);
    const size_t header_size = is_key_frame ? 10u : 3u;
-    frame_scratch.resize(header_size + bitstream_size);
+    frame.resize(header_size + bitstream_size);

    // Based on page 30 of the VP8 specification.
    // https://datatracker.ietf.org/doc/rfc6386/
-    frame_scratch[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes).
-    frame_scratch[0] |=
-        static_cast<u8>((current_context.version & 7u) << 1u); // 3-bit version number
-    frame_scratch[0] |= static_cast<u8>(1u << 4u);             // 1-bit show_frame flag
+    frame[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes).
+    frame[0] |= static_cast<u8>((info.version & 7u) << 1u); // 3-bit version number
+    frame[0] |= static_cast<u8>(1u << 4u);                  // 1-bit show_frame flag

    // The next 19-bits are the first partition size
-    frame_scratch[0] |= static_cast<u8>((current_context.first_part_size & 7u) << 5u);
-    frame_scratch[1] = static_cast<u8>((current_context.first_part_size & 0x7f8u) >> 3u);
-    frame_scratch[2] = static_cast<u8>((current_context.first_part_size & 0x7f800u) >> 11u);
+    frame[0] |= static_cast<u8>((info.first_part_size & 7u) << 5u);
+    frame[1] = static_cast<u8>((info.first_part_size & 0x7f8u) >> 3u);
+    frame[2] = static_cast<u8>((info.first_part_size & 0x7f800u) >> 11u);

    if (is_key_frame) {
-        frame_scratch[3] = 0x9du;
-        frame_scratch[4] = 0x01u;
-        frame_scratch[5] = 0x2au;
+        frame[3] = 0x9du;
+        frame[4] = 0x01u;
+        frame[5] = 0x2au;
        // TODO(ameerj): Horizontal/Vertical Scale
        // 16 bits: (2 bits Horizontal Scale << 14) | Width (14 bits)
-        frame_scratch[6] = static_cast<u8>(current_context.frame_width & 0xff);
-        frame_scratch[7] = static_cast<u8>(((current_context.frame_width >> 8) & 0x3f));
+        frame[6] = static_cast<u8>(info.frame_width & 0xff);
+        frame[7] = static_cast<u8>(((info.frame_width >> 8) & 0x3f));
        // 16 bits:(2 bits Vertical Scale << 14) | Height (14 bits)
-        frame_scratch[8] = static_cast<u8>(current_context.frame_height & 0xff);
-        frame_scratch[9] = static_cast<u8>(((current_context.frame_height >> 8) & 0x3f));
+        frame[8] = static_cast<u8>(info.frame_height & 0xff);
+        frame[9] = static_cast<u8>(((info.frame_height >> 8) & 0x3f));
    }
-    const u64 bitstream_offset = regs.frame_bitstream_offset.Address();
-    memory_manager.ReadBlock(bitstream_offset, frame_scratch.data() + header_size, bitstream_size);
+    const u64 bitstream_offset = state.frame_bitstream_offset;
+    host1x.GMMU().ReadBlock(bitstream_offset, frame.data() + header_size, bitstream_size);

-    return frame_scratch;
+    return frame;
 }

-} // namespace Tegra::Decoders
+} // namespace Tegra::Decoder
--- a/src/video_core/host1x/codecs/vp8.h
+++ b/src/video_core/host1x/codecs/vp8.h
@@ -9,7 +9,6 @@
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/scratch_buffer.h"
-#include "video_core/host1x/codecs/decoder.h"
 #include "video_core/host1x/nvdec_common.h"

 namespace Tegra {
@@ -18,40 +17,20 @@ namespace Host1x {
 class Host1x;
 } // namespace Host1x

-namespace Decoders {
-enum class Vp8SurfaceIndex : u32 {
-    Last = 0,
-    Golden = 1,
-    AltRef = 2,
-    Current = 3,
-};
+namespace Decoder {

-class VP8 final : public Decoder {
+class VP8 {
 public:
-    explicit VP8(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id);
-    ~VP8() override;
+    explicit VP8(Host1x::Host1x& host1x);
+    ~VP8();

-    VP8(const VP8&) = delete;
-    VP8& operator=(const VP8&) = delete;
-
-    VP8(VP8&&) = delete;
-    VP8& operator=(VP8&&) = delete;
-
-    [[nodiscard]] std::span<const u8> ComposeFrame() override;
-
-    std::tuple<u64, u64> GetProgressiveOffsets() override;
-    std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() override;
-
-    bool IsInterlaced() override {
-        return false;
-    }
-
-    std::string_view GetCurrentCodecName() const override {
-        return "VP8";
-    }
+    /// Compose the VP8 frame for FFmpeg decoding
+    [[nodiscard]] std::span<const u8> ComposeFrame(
+        const Host1x::NvdecCommon::NvdecRegisters& state);

 private:
-    Common::ScratchBuffer<u8> frame_scratch;
+    Common::ScratchBuffer<u8> frame;
+    Host1x::Host1x& host1x;

    struct VP8PictureInfo {
        INSERT_PADDING_WORDS_NOINIT(14);
@@ -94,9 +73,7 @@ private:
        INSERT_PADDING_WORDS_NOINIT(3);
    };
    static_assert(sizeof(VP8PictureInfo) == 0xc0, "PictureInfo is an invalid size");
-
-    VP8PictureInfo current_context{};
 };

-} // namespace Decoders
+} // namespace Decoder
 } // namespace Tegra
--- a/src/video_core/host1x/codecs/vp9.cpp
+++ b/src/video_core/host1x/codecs/vp9.cpp
@@ -4,13 +4,12 @@
 #include <algorithm> // for std::copy
 #include <numeric>

-#include "common/alignment.h"
 #include "common/assert.h"
 #include "video_core/host1x/codecs/vp9.h"
 #include "video_core/host1x/host1x.h"
 #include "video_core/memory_manager.h"

-namespace Tegra::Decoders {
+namespace Tegra::Decoder {
 namespace {
 constexpr u32 diff_update_probability = 252;
 constexpr u32 frame_sync_code = 0x498342;
@@ -238,11 +237,7 @@ constexpr std::array<u8, 254> map_lut{
 }
 } // Anonymous namespace

-VP9::VP9(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_)
-    : Decoder{host1x_, id_, regs_} {
-    codec = Host1x::NvdecCommon::VideoCodec::VP9;
-    initialized = decode_api.Initialize(codec);
-}
+/// Stores a reference to the Host1x instance; the composer uses it for its GMMU reads.
+/// No other state is set up here.
+VP9::VP9(Host1x::Host1x& host1x_) : host1x{host1x_} {}

 VP9::~VP9() = default;

@@ -361,113 +356,35 @@ void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_
    }
 }

-void VP9::WriteSegmentation(VpxBitStreamWriter& writer) {
-    bool enabled = current_picture_info.segmentation.enabled != 0;
-    writer.WriteBit(enabled);
-    if (!enabled) {
-        return;
-    }
+/// Builds the Vp9PictureInfo for the current frame from the NVDEC register state.
+///
+/// Reads the raw PictureInfo at state.picture_info_offset through the GMMU, converts it,
+/// fills in the entropy probabilities read from state.vp9_entropy_probs_offset, and copies the
+/// first four entries of state.surface_luma_offset into frame_offsets.
+///
+/// @param state NVDEC registers supplying the guest offsets to read from.
+/// @return The converted picture info.
+Vp9PictureInfo VP9::GetVp9PictureInfo(const Host1x::NvdecCommon::NvdecRegisters& state) {
+    PictureInfo picture_info;
+    host1x.GMMU().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo));
+    Vp9PictureInfo vp9_info = picture_info.Convert();

-    auto update_map = current_picture_info.segmentation.update_map != 0;
-    writer.WriteBit(update_map);
-
-    if (update_map) {
-        EntropyProbs entropy_probs{};
-        memory_manager.ReadBlock(regs.vp9_prob_tab_buffer_offset.Address(), &entropy_probs,
-                                 sizeof(entropy_probs));
-
-        auto WriteProb = [&](u8 prob) {
-            bool coded = prob != 255;
-            writer.WriteBit(coded);
-            if (coded) {
-                writer.WriteU(prob, 8);
-            }
-        };
-
-        for (size_t i = 0; i < entropy_probs.mb_segment_tree_probs.size(); i++) {
-            WriteProb(entropy_probs.mb_segment_tree_probs[i]);
-        }
-
-        auto temporal_update = current_picture_info.segmentation.temporal_update != 0;
-        writer.WriteBit(temporal_update);
-
-        if (temporal_update) {
-            for (s32 i = 0; i < 3; i++) {
-                WriteProb(entropy_probs.segment_pred_probs[i]);
-            }
-        }
-    }
-
-    if (last_segmentation == current_picture_info.segmentation) {
-        writer.WriteBit(false);
-        return;
-    }
-
-    last_segmentation = current_picture_info.segmentation;
-    writer.WriteBit(true);
-    writer.WriteBit(current_picture_info.segmentation.abs_delta != 0);
-
-    constexpr s32 MAX_SEGMENTS = 8;
-    constexpr std::array SegmentationFeatureBits = {8, 6, 2, 0};
-
-    for (s32 i = 0; i < MAX_SEGMENTS; i++) {
-        auto q_enabled = current_picture_info.segmentation.feature_enabled[i][0] != 0;
-        writer.WriteBit(q_enabled);
-        if (q_enabled) {
-            writer.WriteS(current_picture_info.segmentation.feature_data[i][0],
-                          SegmentationFeatureBits[0]);
-        }
-
-        auto lf_enabled = current_picture_info.segmentation.feature_enabled[i][1] != 0;
-        writer.WriteBit(lf_enabled);
-        if (lf_enabled) {
-            writer.WriteS(current_picture_info.segmentation.feature_data[i][1],
-                          SegmentationFeatureBits[1]);
-        }
-
-        auto ref_enabled = current_picture_info.segmentation.feature_enabled[i][2] != 0;
-        writer.WriteBit(ref_enabled);
-        if (ref_enabled) {
-            writer.WriteU(current_picture_info.segmentation.feature_data[i][2],
-                          SegmentationFeatureBits[2]);
-        }
-
-        auto skip_enabled = current_picture_info.segmentation.feature_enabled[i][3] != 0;
-        writer.WriteBit(skip_enabled);
-    }
-}
-
-Vp9PictureInfo VP9::GetVp9PictureInfo() {
-    memory_manager.ReadBlock(regs.picture_info_offset.Address(), &current_picture_info,
-                             sizeof(PictureInfo));
-    Vp9PictureInfo vp9_info = current_picture_info.Convert();
-
-    InsertEntropy(regs.vp9_prob_tab_buffer_offset.Address(), vp9_info.entropy);
+    InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy);

    // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following
    // order: last, golden, altref, current.
-    for (size_t i = 0; i < 4; i++) {
-        vp9_info.frame_offsets[i] = regs.surface_luma_offsets[i].Address();
-    }
+    std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4,
+              vp9_info.frame_offsets.begin());

    return vp9_info;
 }

+/// Reads sizeof(EntropyProbs) bytes at `offset` through the GMMU and converts them into `dst`.
 void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) {
    EntropyProbs entropy;
-    memory_manager.ReadBlock(offset, &entropy, sizeof(EntropyProbs));
+    host1x.GMMU().ReadBlock(offset, &entropy, sizeof(EntropyProbs));
    entropy.Convert(dst);
 }

-Vp9FrameContainer VP9::GetCurrentFrame() {
+Vp9FrameContainer VP9::GetCurrentFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
    Vp9FrameContainer current_frame{};
    {
        // gpu.SyncGuestHost(); epic, why?
-        current_frame.info = GetVp9PictureInfo();
+        current_frame.info = GetVp9PictureInfo(state);
        current_frame.bit_stream.resize(current_frame.info.bitstream_size);
-        memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(),
-                                 current_frame.bit_stream.data(),
-                                 current_frame.info.bitstream_size);
+        host1x.GMMU().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(),
+                                current_frame.info.bitstream_size);
    }
    if (!next_frame.bit_stream.empty()) {
        Vp9FrameContainer temp{
@@ -825,7 +742,8 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
    uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q);
    uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q);

-    WriteSegmentation(uncomp_writer);
+    ASSERT(!current_frame_info.segment_enabled);
+    uncomp_writer.WriteBit(false); // Segmentation enabled (TODO).

    const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width);
    const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width);
@@ -852,29 +770,10 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
    return uncomp_writer;
 }

-std::tuple<u64, u64> VP9::GetProgressiveOffsets() {
-    auto luma{regs.surface_luma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
-    auto chroma{regs.surface_chroma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
-    return {luma, chroma};
-}
-
-std::tuple<u64, u64, u64, u64> VP9::GetInterlacedOffsets() {
-    auto luma_top{regs.surface_luma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
-    auto luma_bottom{
-        regs.surface_luma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
-    auto chroma_top{
-        regs.surface_chroma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
-    auto chroma_bottom{
-        regs.surface_chroma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
-    return {luma_top, luma_bottom, chroma_top, chroma_bottom};
-}
-
-std::span<const u8> VP9::ComposeFrame() {
-    vp9_hidden_frame = false;
-
+void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
    std::vector<u8> bitstream;
    {
-        Vp9FrameContainer curr_frame = GetCurrentFrame();
+        Vp9FrameContainer curr_frame = GetCurrentFrame(state);
        current_frame_info = curr_frame.info;
        bitstream = std::move(curr_frame.bit_stream);
    }
@@ -887,16 +786,12 @@ std::span<const u8> VP9::ComposeFrame() {
    std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray();

    // Write headers and frame to buffer
-    frame_scratch.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size());
-    std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame_scratch.begin());
+    frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size());
+    std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame.begin());
    std::copy(compressed_header.begin(), compressed_header.end(),
-              frame_scratch.begin() + uncompressed_header.size());
+              frame.begin() + uncompressed_header.size());
    std::copy(bitstream.begin(), bitstream.end(),
-              frame_scratch.begin() + uncompressed_header.size() + compressed_header.size());
-
-    vp9_hidden_frame = WasFrameHidden();
-
-    return GetFrameBytes();
+              frame.begin() + uncompressed_header.size() + compressed_header.size());
 }

 VpxRangeEncoder::VpxRangeEncoder() {
@@ -1049,4 +944,4 @@ const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const {
    return byte_array;
 }

-} // namespace Tegra::Decoders
+} // namespace Tegra::Decoder
--- a/src/video_core/host1x/codecs/vp9.h
+++ b/src/video_core/host1x/codecs/vp9.h
@@ -10,7 +10,6 @@
 #include "common/common_types.h"
 #include "common/scratch_buffer.h"
 #include "common/stream.h"
-#include "video_core/host1x/codecs/decoder.h"
 #include "video_core/host1x/codecs/vp9_types.h"
 #include "video_core/host1x/nvdec_common.h"

@@ -20,7 +19,7 @@ namespace Host1x {
 class Host1x;
 } // namespace Host1x

-namespace Decoders {
+namespace Decoder {

 /// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the
 /// VP9 header bitstreams.
@@ -111,31 +110,21 @@ private:
    std::vector<u8> byte_array;
 };

-class VP9 final : public Decoder {
+class VP9 {
 public:
-    explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id);
-    ~VP9() override;
+    explicit VP9(Host1x::Host1x& host1x);
+    ~VP9();

    VP9(const VP9&) = delete;
    VP9& operator=(const VP9&) = delete;

-    VP9(VP9&&) = delete;
+    VP9(VP9&&) = default;
    VP9& operator=(VP9&&) = delete;

-    [[nodiscard]] std::span<const u8> ComposeFrame() override;
+    /// Composes the VP9 frame from the GPU state information.
+    /// Based on the official VP9 spec documentation
+    void ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state);

-    std::tuple<u64, u64> GetProgressiveOffsets() override;
-    std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() override;
-
-    bool IsInterlaced() override {
-        return false;
-    }
-
-    std::string_view GetCurrentCodecName() const override {
-        return "VP9";
-    }
-
-private:
    /// Returns true if the most recent frame was a hidden frame.
    [[nodiscard]] bool WasFrameHidden() const {
        return !current_frame_info.show_frame;
@@ -143,9 +132,10 @@ private:

    /// Returns a const span to the composed frame data.
    [[nodiscard]] std::span<const u8> GetFrameBytes() const {
-        return frame_scratch;
+        return frame;
    }

+private:
    /// Generates compressed header probability updates in the bitstream writer
    template <typename T, std::size_t N>
    void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
@@ -177,22 +167,23 @@ private:
    /// Write motion vector probability updates. 6.3.17 in the spec
    void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);

-    void WriteSegmentation(VpxBitStreamWriter& writer);
-
    /// Returns VP9 information from NVDEC provided offset and size
-    [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo();
+    [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(
+        const Host1x::NvdecCommon::NvdecRegisters& state);

    /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct
    void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);

    /// Returns frame to be decoded after buffering
-    [[nodiscard]] Vp9FrameContainer GetCurrentFrame();
+    [[nodiscard]] Vp9FrameContainer GetCurrentFrame(
+        const Host1x::NvdecCommon::NvdecRegisters& state);

    /// Use NVDEC providied information to compose the headers for the current frame
    [[nodiscard]] std::vector<u8> ComposeCompressedHeader();
    [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader();

-    Common::ScratchBuffer<u8> frame_scratch;
+    Host1x::Host1x& host1x;
+    Common::ScratchBuffer<u8> frame;

    std::array<s8, 4> loop_filter_ref_deltas{};
    std::array<s8, 2> loop_filter_mode_deltas{};
@@ -201,11 +192,9 @@ private:
    std::array<Vp9EntropyProbs, 4> frame_ctxs{};
    bool swap_ref_indices{};

-    Segmentation last_segmentation{};
-    PictureInfo current_picture_info{};
    Vp9PictureInfo current_frame_info{};
    Vp9EntropyProbs prev_frame_probs{};
 };

-} // namespace Decoders
+} // namespace Decoder
 } // namespace Tegra
--- a/src/video_core/host1x/codecs/vp9_types.h
+++ b/src/video_core/host1x/codecs/vp9_types.h
@@ -11,14 +11,7 @@

 namespace Tegra {

-namespace Decoders {
-enum class Vp9SurfaceIndex : u32 {
-    Last = 0,
-    Golden = 1,
-    AltRef = 2,
-    Current = 3,
-};
-
+namespace Decoder {
 struct Vp9FrameDimensions {
    s16 width;
    s16 height;
@@ -55,13 +48,11 @@ enum class TxMode {
 };

 struct Segmentation {
-    constexpr bool operator==(const Segmentation& rhs) const = default;
-
    u8 enabled;
    u8 update_map;
    u8 temporal_update;
    u8 abs_delta;
-    std::array<std::array<u8, 4>, 8> feature_enabled;
+    // Eight u32 entries (32 bytes): the same footprint as the 8x4 u8 table it replaces, so the
+    // struct's overall size is unchanged.
+    std::array<u32, 8> feature_mask;
    std::array<std::array<s16, 4>, 8> feature_data;
 };
 static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size");
@@ -199,17 +190,7 @@ struct PictureInfo {
 static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size");

 struct EntropyProbs {
-    std::array<u8, 10 * 10 * 8> kf_bmode_prob;         ///< 0x0000
-    std::array<u8, 10 * 10 * 1> kf_bmode_probB;        ///< 0x0320
-    std::array<u8, 3> ref_pred_probs;                  ///< 0x0384
-    std::array<u8, 7> mb_segment_tree_probs;           ///< 0x0387
-    std::array<u8, 3> segment_pred_probs;              ///< 0x038E
-    std::array<u8, 4> ref_scores;                      ///< 0x0391
-    std::array<u8, 2> prob_comppred;                   ///< 0x0395
-    INSERT_PADDING_BYTES_NOINIT(9);                    ///< 0x0397
-    std::array<u8, 10 * 8> kf_uv_mode_prob;            ///< 0x03A0
-    std::array<u8, 10 * 1> kf_uv_mode_probB;           ///< 0x03F0
-    INSERT_PADDING_BYTES_NOINIT(6);                    ///< 0x03FA
+    INSERT_PADDING_BYTES_NOINIT(1024);                 ///< 0x0000
    std::array<u8, 28> inter_mode_prob;                ///< 0x0400
    std::array<u8, 4> intra_inter_prob;                ///< 0x041C
    INSERT_PADDING_BYTES_NOINIT(80);                   ///< 0x0420
@@ -321,5 +302,5 @@ ASSERT_POSITION(class_0_fr, 0x560);
 ASSERT_POSITION(coef_probs, 0x5A0);
 #undef ASSERT_POSITION

-}; // namespace Decoders
+}; // namespace Decoder
 }; // namespace Tegra
--- a/src/video_core/host1x/control.cpp
+++ b/src/video_core/host1x/control.cpp
@@ -27,7 +27,6 @@ void Control::ProcessMethod(Method method, u32 argument) {
 }

 void Control::Execute(u32 data) {
-    LOG_TRACE(Service_NVDRV, "Control wait syncpt {} value {}", data, syncpoint_value);
+    // `data` is the syncpoint to wait on; syncpoint_value is the value stored earlier in this
+    // object's state.
    host1x.GetSyncpointManager().WaitHost(data, syncpoint_value);
 }

--- a/src/video_core/host1x/control.h
+++ b/src/video_core/host1x/control.h
@@ -6,7 +6,9 @@

 #include "common/common_types.h"

-namespace Tegra::Host1x {
+namespace Tegra {
+
+namespace Host1x {

 class Host1x;
 class Nvdec;
@@ -29,8 +31,10 @@ private:
    /// For Host1x, execute is waiting on a syncpoint previously written into the state
    void Execute(u32 data);

-    Host1x& host1x;
    u32 syncpoint_value{};
+    Host1x& host1x;
 };

-} // namespace Tegra::Host1x
+} // namespace Host1x
+
+} // namespace Tegra
--- a/src/video_core/host1x/ffmpeg/ffmpeg.cpp
+++ b/src/video_core/host1x/ffmpeg/ffmpeg.cpp
@@ -5,9 +5,7 @@
 #include "common/logging/log.h"
 #include "common/scope_exit.h"
 #include "common/settings.h"
-#include "core/memory.h"
 #include "video_core/host1x/ffmpeg/ffmpeg.h"
-#include "video_core/memory_manager.h"

 extern "C" {
 #ifdef LIBVA_FOUND
@@ -185,8 +183,8 @@ bool HardwareContext::InitializeWithType(AVHWDeviceType type) {
    return true;
 }

-DecoderContext::DecoderContext(const Decoder& decoder) : m_decoder{decoder} {
-    m_codec_context = avcodec_alloc_context3(m_decoder.GetCodec());
+DecoderContext::DecoderContext(const Decoder& decoder) {
+    m_codec_context = avcodec_alloc_context3(decoder.GetCodec());
    av_opt_set(m_codec_context->priv_data, "tune", "zerolatency", 0);
    m_codec_context->thread_count = 0;
    m_codec_context->thread_type &= ~FF_THREAD_FRAME;
@@ -218,25 +216,6 @@ bool DecoderContext::OpenContext(const Decoder& decoder) {
 }

 bool DecoderContext::SendPacket(const Packet& packet) {
-    m_temp_frame = std::make_shared<Frame>();
-    m_got_frame = 0;
-
-// Android can randomly crash when calling decode directly, so skip.
-// TODO update ffmpeg and hope that fixes it.
-#ifndef ANDROID
-    if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) {
-        m_decode_order = true;
-        auto* codec{ffcodec(m_decoder.GetCodec())};
-        if (const int ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(),
-                                             &m_got_frame, packet.GetPacket());
-            ret < 0) {
-            LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", AVError(ret));
-            return false;
-        }
-        return true;
-    }
-#endif
-
    if (const int ret = avcodec_send_packet(m_codec_context, packet.GetPacket()); ret < 0) {
        LOG_ERROR(HW_GPU, "avcodec_send_packet error: {}", AVError(ret));
        return false;
@@ -245,73 +224,139 @@ bool DecoderContext::SendPacket(const Packet& packet) {
    return true;
 }

-std::shared_ptr<Frame> DecoderContext::ReceiveFrame() {
-    // Android can randomly crash when calling decode directly, so skip.
-    // TODO update ffmpeg and hope that fixes it.
-#ifndef ANDROID
-    if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) {
-        m_decode_order = true;
-        auto* codec{ffcodec(m_decoder.GetCodec())};
-        int ret{0};
+/// Receives one decoded frame from the codec context.
+///
+/// With a hardware device context the frame is first received into a temporary frame and then
+/// transferred into the returned frame via av_hwframe_transfer_data; otherwise it is received
+/// directly into the returned frame.
+///
+/// @param out_is_interlaced Written with the interlaced state of the received frame. It is
+///                          dereferenced unconditionally on a successful receive, so it must not
+///                          be null. It is left untouched when avcodec_receive_frame fails.
+/// @return The decoded frame, or nullptr if receiving or the hardware transfer failed (logged).
+std::unique_ptr<Frame> DecoderContext::ReceiveFrame(bool* out_is_interlaced) {
+    auto dst_frame = std::make_unique<Frame>();

-        if (m_got_frame == 0) {
-            Packet packet{{}};
-            auto* pkt = packet.GetPacket();
-            pkt->data = nullptr;
-            pkt->size = 0;
-            ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(), &m_got_frame, pkt);
-            m_codec_context->has_b_frames = 0;
+    const auto ReceiveImpl = [&](AVFrame* frame) {
+        if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) {
+            LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret));
+            return false;
        }

-        if (m_got_frame == 0 || ret < 0) {
-            LOG_ERROR(Service_NVDRV, "Failed to receive a frame! error {}", ret);
+        // Depending on the libavutil version, interlacing is read either from the frame flags
+        // or from the interlaced_frame field.
+        *out_is_interlaced =
+#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59
+            (frame->flags & AV_FRAME_FLAG_INTERLACED) != 0;
+#else
+            frame->interlaced_frame != 0;
+#endif
+        return true;
+    };
+
+    if (m_codec_context->hw_device_ctx) {
+        // If we have a hardware context, make a separate frame here to receive the
+        // hardware result before sending it to the output.
+        Frame intermediate_frame;
+
+        if (!ReceiveImpl(intermediate_frame.GetFrame())) {
            return {};
        }
-    } else
-#endif
-    {

-        const auto ReceiveImpl = [&](AVFrame* frame) {
-            if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) {
-                LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret));
-                return false;
-            }
-
-            return true;
-        };
-
-        if (m_codec_context->hw_device_ctx) {
-            // If we have a hardware context, make a separate frame here to receive the
-            // hardware result before sending it to the output.
-            Frame intermediate_frame;
-
-            if (!ReceiveImpl(intermediate_frame.GetFrame())) {
-                return {};
-            }
-
-            m_temp_frame->SetFormat(PreferredGpuFormat);
-            if (const int ret = av_hwframe_transfer_data(m_temp_frame->GetFrame(),
-                                                         intermediate_frame.GetFrame(), 0);
-                ret < 0) {
-                LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret));
-                return {};
-            }
-        } else {
-            // Otherwise, decode the frame as normal.
-            if (!ReceiveImpl(m_temp_frame->GetFrame())) {
-                return {};
-            }
+        dst_frame->SetFormat(PreferredGpuFormat);
+        if (const int ret =
+                av_hwframe_transfer_data(dst_frame->GetFrame(), intermediate_frame.GetFrame(), 0);
+            ret < 0) {
+            LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret));
+            return {};
+        }
+    } else {
+        // Otherwise, decode the frame as normal.
+        if (!ReceiveImpl(dst_frame->GetFrame())) {
+            return {};
        }
    }

-#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59
-    m_temp_frame->GetFrame()->interlaced_frame =
-        (m_temp_frame->GetFrame()->flags & AV_FRAME_FLAG_INTERLACED) != 0;
-#endif
-    return std::move(m_temp_frame);
+    return dst_frame;
+}
+
+/// Builds a "buffer -> yadif -> buffersink" filter graph configured from `frame`.
+///
+/// @param frame Frame whose width, height and pixel format configure the buffer source.
+///
+/// On any failure an error is logged and the constructor returns early, leaving m_initialized
+/// false. Whatever part of the graph was created is still released by the destructor.
+DeinterlaceFilter::DeinterlaceFilter(const Frame& frame) {
+    const AVFilter* buffer_src = avfilter_get_by_name("buffer");
+    const AVFilter* buffer_sink = avfilter_get_by_name("buffersink");
+    AVFilterInOut* inputs = avfilter_inout_alloc();
+    AVFilterInOut* outputs = avfilter_inout_alloc();
+    // The in/out descriptors are only needed while parsing; free them on every exit path.
+    SCOPE_EXIT({
+        avfilter_inout_free(&inputs);
+        avfilter_inout_free(&outputs);
+    });
+
+    // Don't know how to get the accurate time_base but it doesn't matter for yadif filter
+    // so just use 1/1 to make buffer filter happy
+    const std::string args =
+        fmt::format("video_size={}x{}:pix_fmt={}:time_base=1/1", frame.GetWidth(),
+                    frame.GetHeight(), static_cast<int>(frame.GetPixelFormat()));
+
+    m_filter_graph = avfilter_graph_alloc();
+
+    // These allocators return nullptr on failure; bail out before anything dereferences them.
+    if (inputs == nullptr || outputs == nullptr || m_filter_graph == nullptr) {
+        LOG_ERROR(HW_GPU, "Failed to allocate deinterlace filter graph resources");
+        return;
+    }
+
+    int ret = avfilter_graph_create_filter(&m_source_context, buffer_src, "in", args.c_str(),
+                                           nullptr, m_filter_graph);
+    if (ret < 0) {
+        LOG_ERROR(HW_GPU, "avfilter_graph_create_filter source error: {}", AVError(ret));
+        return;
+    }
+
+    ret = avfilter_graph_create_filter(&m_sink_context, buffer_sink, "out", nullptr, nullptr,
+                                       m_filter_graph);
+    if (ret < 0) {
+        LOG_ERROR(HW_GPU, "avfilter_graph_create_filter sink error: {}", AVError(ret));
+        return;
+    }
+
+    // `inputs` describes the open input pad of our sink ("out") ...
+    inputs->name = av_strdup("out");
+    inputs->filter_ctx = m_sink_context;
+    inputs->pad_idx = 0;
+    inputs->next = nullptr;
+
+    // ... and `outputs` describes the open output pad of our source ("in").
+    outputs->name = av_strdup("in");
+    outputs->filter_ctx = m_source_context;
+    outputs->pad_idx = 0;
+    outputs->next = nullptr;
+
+    // yadif arguments are positional: mode=1, parity=-1, deint=0.
+    const char* description = "yadif=1:-1:0";
+    ret = avfilter_graph_parse_ptr(m_filter_graph, description, &inputs, &outputs, nullptr);
+    if (ret < 0) {
+        LOG_ERROR(HW_GPU, "avfilter_graph_parse_ptr error: {}", AVError(ret));
+        return;
+    }
+
+    ret = avfilter_graph_config(m_filter_graph, nullptr);
+    if (ret < 0) {
+        LOG_ERROR(HW_GPU, "avfilter_graph_config error: {}", AVError(ret));
+        return;
+    }
+
+    // Only reached when every step above succeeded.
+    m_initialized = true;
+}
+
+/// Feeds one decoded frame into the filter graph's buffer source.
+///
+/// @param frame Frame to queue. AV_BUFFERSRC_FLAG_KEEP_REF is passed so the caller's frame
+///              keeps its own reference instead of being consumed.
+/// @return true if the frame was accepted; false if the graph was never initialized or FFmpeg
+///         reported an error (which is logged).
+bool DeinterlaceFilter::AddSourceFrame(const Frame& frame) {
+    // The constructor returns early on failure and may leave m_source_context null or the graph
+    // unconfigured. It already logged the cause, so just reject the frame here.
+    if (!m_initialized) {
+        return false;
+    }
+
+    if (const int ret = av_buffersrc_add_frame_flags(m_source_context, frame.GetFrame(),
+                                                     AV_BUFFERSRC_FLAG_KEEP_REF);
+        ret < 0) {
+        LOG_ERROR(HW_GPU, "av_buffersrc_add_frame_flags error: {}", AVError(ret));
+        return false;
+    }
+
+    return true;
+}
+
+/// Pulls the next filtered frame out of the graph's buffer sink.
+///
+/// @return The filtered frame, or nullptr when the sink has nothing available yet (EAGAIN), has
+///         reached end of stream (EOF), the graph was never initialized, or FFmpeg reported an
+///         error (which is logged).
+std::unique_ptr<Frame> DeinterlaceFilter::DrainSinkFrame() {
+    // A failed constructor can leave m_sink_context null; there is nothing to drain then.
+    if (!m_initialized) {
+        return {};
+    }
+
+    auto dst_frame = std::make_unique<Frame>();
+    const int ret = av_buffersink_get_frame(m_sink_context, dst_frame->GetFrame());
+
+    // AVERROR_EOF is already a negative FFmpeg error code and must not be wrapped in AVERROR(),
+    // which is only for POSIX errno values such as EAGAIN. Neither case is a failure, so return
+    // without logging.
+    if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
+        return {};
+    }
+
+    if (ret < 0) {
+        LOG_ERROR(HW_GPU, "av_buffersink_get_frame error: {}", AVError(ret));
+        return {};
+    }
+
+    return dst_frame;
+}
+
+/// Releases the filter graph held in m_filter_graph.
+DeinterlaceFilter::~DeinterlaceFilter() {
+    avfilter_graph_free(&m_filter_graph);
 }

 void DecodeApi::Reset() {
+    m_deinterlace_filter.reset();
    m_hardware_context.reset();
    m_decoder_context.reset();
    m_decoder.reset();
@@ -337,14 +382,43 @@ bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) {
    return true;
 }

-bool DecodeApi::SendPacket(std::span<const u8> packet_data) {
+/// Wraps `packet_data` in an FFmpeg::Packet and submits it to the decoder context.
+/// NOTE(review): `configuration_size` is not used in this body; confirm that is intentional.
+/// @return The result of DecoderContext::SendPacket.
+bool DecodeApi::SendPacket(std::span<const u8> packet_data, size_t configuration_size) {
    FFmpeg::Packet packet(packet_data);
    return m_decoder_context->SendPacket(packet);
 }

-std::shared_ptr<Frame> DecodeApi::ReceiveFrame() {
+/// Receives one frame from the decoder and appends the resulting output frame(s) to
+/// `frame_queue`.
+///
+/// A non-interlaced frame is pushed as is. An interlaced frame is fed through the deinterlace
+/// filter and every frame the filter yields is pushed instead. Nothing is pushed if the decoder
+/// returns no frame or the filter rejects the input.
+///
+/// @param frame_queue Queue that receives the output frames.
+void DecodeApi::ReceiveFrames(std::queue<std::unique_ptr<Frame>>& frame_queue) {
    // Receive raw frame from decoder.
-    return m_decoder_context->ReceiveFrame();
+    // Left uninitialized here; it is only read after the null check on `frame` below.
+    bool is_interlaced;
+    auto frame = m_decoder_context->ReceiveFrame(&is_interlaced);
+    if (!frame) {
+        return;
+    }
+
+    if (!is_interlaced) {
+        // If the frame is not interlaced, we can pend it now.
+        frame_queue.push(std::move(frame));
+    } else {
+        // Create the deinterlacer if needed.
+        // It is built from the first interlaced frame seen and reused until Reset().
+        if (!m_deinterlace_filter) {
+            m_deinterlace_filter.emplace(*frame);
+        }
+
+        // Add the frame we just received.
+        if (!m_deinterlace_filter->AddSourceFrame(*frame)) {
+            return;
+        }
+
+        // Pend output fields.
+        while (true) {
+            auto filter_frame = m_deinterlace_filter->DrainSinkFrame();
+            if (!filter_frame) {
+                break;
+            }
+
+            frame_queue.push(std::move(filter_frame));
+        }
+    }
 }

 } // namespace FFmpeg
--- a/src/video_core/host1x/ffmpeg/ffmpeg.h
+++ b/src/video_core/host1x/ffmpeg/ffmpeg.h
@@ -20,20 +20,17 @@ extern "C" {
 #endif

 #include <libavcodec/avcodec.h>
+#include <libavfilter/avfilter.h>
+#include <libavfilter/buffersink.h>
+#include <libavfilter/buffersrc.h>
+#include <libavutil/avutil.h>
 #include <libavutil/opt.h>
-#ifndef ANDROID
-#include <libavcodec/codec_internal.h>
-#endif

 #if defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic pop
 #endif
 }

-namespace Tegra {
-class MemoryManager;
-}
-
 namespace FFmpeg {

 class Packet;
@@ -93,10 +90,6 @@ public:
        return m_frame->data[plane];
    }

-    const u8* GetPlane(int plane) const {
-        return m_frame->data[plane];
-    }
-
    u8** GetPlanes() const {
        return m_frame->data;
    }
@@ -105,14 +98,6 @@ public:
        m_frame->format = format;
    }

-    bool IsInterlaced() const {
-        return m_frame->interlaced_frame != 0;
-    }
-
-    bool IsHardwareDecoded() const {
-        return m_frame->hw_frames_ctx != nullptr;
-    }
-
    AVFrame* GetFrame() const {
        return m_frame;
    }
@@ -175,22 +160,33 @@ public:
    void InitializeHardwareDecoder(const HardwareContext& context, AVPixelFormat hw_pix_fmt);
    bool OpenContext(const Decoder& decoder);
    bool SendPacket(const Packet& packet);
-    std::shared_ptr<Frame> ReceiveFrame();
+    std::unique_ptr<Frame> ReceiveFrame(bool* out_is_interlaced);

    AVCodecContext* GetCodecContext() const {
        return m_codec_context;
    }

-    bool UsingDecodeOrder() const {
-        return m_decode_order;
-    }
+private:
+    AVCodecContext* m_codec_context{};
+};
+
+// Wraps an AVFilterGraph used to deinterlace decoded frames.
+// Frames go in through AddSourceFrame and come out through DrainSinkFrame.
+class DeinterlaceFilter {
+public:
+    YUZU_NON_COPYABLE(DeinterlaceFilter);
+    YUZU_NON_MOVEABLE(DeinterlaceFilter);
+
+    /// Builds the filter graph using the dimensions and pixel format of `frame`.
+    explicit DeinterlaceFilter(const Frame& frame);
+    ~DeinterlaceFilter();
+
+    /// Queues a frame into the graph's source; returns false on error.
+    bool AddSourceFrame(const Frame& frame);
+    /// Returns the next frame from the graph's sink, or nullptr if none could be retrieved.
+    std::unique_ptr<Frame> DrainSinkFrame();
+
 private:
-    const Decoder& m_decoder;
-    AVCodecContext* m_codec_context{};
-    s32 m_got_frame{};
-    std::shared_ptr<Frame> m_temp_frame{};
-    bool m_decode_order{};
+    AVFilterGraph* m_filter_graph{};
+    AVFilterContext* m_source_context{};
+    AVFilterContext* m_sink_context{};
+    // Set to true only at the end of a fully successful constructor.
+    // NOTE(review): no accessor in this declaration exposes it to callers.
+    bool m_initialized{};
 };

 class DecodeApi {
@@ -204,17 +200,14 @@ public:
    bool Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec);
    void Reset();

-    bool UsingDecodeOrder() const {
-        return m_decoder_context->UsingDecodeOrder();
-    }
-
-    bool SendPacket(std::span<const u8> packet_data);
-    std::shared_ptr<Frame> ReceiveFrame();
+    bool SendPacket(std::span<const u8> packet_data, size_t configuration_size);
+    void ReceiveFrames(std::queue<std::unique_ptr<Frame>>& frame_queue);

 private:
    std::optional<FFmpeg::Decoder> m_decoder;
    std::optional<FFmpeg::DecoderContext> m_decoder_context;
    std::optional<FFmpeg::HardwareContext> m_hardware_context;
+    std::optional<FFmpeg::DeinterlaceFilter> m_deinterlace_filter;
 };

 } // namespace FFmpeg
--- a/src/video_core/host1x/host1x.cpp
+++ b/src/video_core/host1x/host1x.cpp
@@ -3,10 +3,10 @@

 #include "core/core.h"
 #include "video_core/host1x/host1x.h"
-#include "video_core/host1x/nvdec.h"
-#include "video_core/host1x/vic.h"

-namespace Tegra::Host1x {
+namespace Tegra {
+
+namespace Host1x {

 Host1x::Host1x(Core::System& system_)
    : system{system_}, syncpoint_manager{},
@@ -15,23 +15,6 @@ Host1x::Host1x(Core::System& system_)

 Host1x::~Host1x() = default;

-void Host1x::StartDevice(s32 fd, ChannelType type, u32 syncpt) {
-    switch (type) {
-    case ChannelType::NvDec:
-        devices[fd] = std::make_unique<Tegra::Host1x::Nvdec>(*this, fd, syncpt);
-        last_nvdec_fd = fd;
-        break;
-    case ChannelType::VIC:
-        devices[fd] = std::make_unique<Tegra::Host1x::Vic>(*this, fd, GetLastNvdecDevice(), syncpt);
-        break;
-    default:
-        LOG_ERROR(HW_GPU, "Unimplemented host1x device {}", static_cast<u32>(type));
-        break;
-    }
-}
+} // namespace Host1x

-void Host1x::StopDevice(s32 fd, ChannelType type) {
-    devices.erase(fd);
-}
-
-} // namespace Tegra::Host1x
+} // namespace Tegra
--- a/src/video_core/host1x/host1x.h
+++ b/src/video_core/host1x/host1x.h
@@ -6,7 +6,6 @@
 #include "common/common_types.h"

 #include "common/address_space.h"
-#include "video_core/cdma_pusher.h"
 #include "video_core/host1x/gpu_device_memory_manager.h"
 #include "video_core/host1x/syncpoint_manager.h"
 #include "video_core/memory_manager.h"
@@ -15,29 +14,15 @@ namespace Core {
 class System;
 } // namespace Core

-namespace Tegra::Host1x {
-class Nvdec;
+namespace Tegra {

-enum class ChannelType : u32 {
-    MsEnc = 0,
-    VIC = 1,
-    GPU = 2,
-    NvDec = 3,
-    Display = 4,
-    NvJpg = 5,
-    TSec = 6,
-    Max = 7,
-};
+namespace Host1x {

 class Host1x {
 public:
    explicit Host1x(Core::System& system);
    ~Host1x();

-    Core::System& System() {
-        return system;
-    }
-
    SyncpointManager& GetSyncpointManager() {
        return syncpoint_manager;
    }
@@ -70,31 +55,14 @@ public:
        return *allocator;
    }

-    void StartDevice(s32 fd, ChannelType type, u32 syncpt);
-    void StopDevice(s32 fd, ChannelType type);
-
-    void PushEntries(s32 fd, ChCommandHeaderList&& entries) {
-        auto it = devices.find(fd);
-        if (it == devices.end()) {
-            return;
-        }
-        it->second->PushEntries(std::move(entries));
-    }
-
-    Nvdec& GetLastNvdecDevice() {
-        auto it = devices.find(last_nvdec_fd);
-        ASSERT(it->second.get() != nullptr);
-        return *reinterpret_cast<Nvdec*>(it->second.get());
-    }
-
 private:
    Core::System& system;
    SyncpointManager syncpoint_manager;
    Tegra::MaxwellDeviceMemoryManager memory_manager;
    Tegra::MemoryManager gmmu_manager;
    std::unique_ptr<Common::FlatAllocator<u32, 0, 32>> allocator;
-    std::unordered_map<s32, std::unique_ptr<CDmaPusher>> devices;
-    s32 last_nvdec_fd{};
 };

-} // namespace Tegra::Host1x
+} // namespace Host1x
+
+} // namespace Tegra
--- a/src/video_core/host1x/nvdec.cpp
+++ b/src/video_core/host1x/nvdec.cpp
@@ -2,12 +2,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later

 #include "common/assert.h"
-
-#include "common/polyfill_thread.h"
-#include "common/settings.h"
-#include "video_core/host1x/codecs/h264.h"
-#include "video_core/host1x/codecs/vp8.h"
-#include "video_core/host1x/codecs/vp9.h"
 #include "video_core/host1x/host1x.h"
 #include "video_core/host1x/nvdec.h"

@@ -16,68 +10,37 @@ namespace Tegra::Host1x {
 #define NVDEC_REG_INDEX(field_name)                                                                \
    (offsetof(NvdecCommon::NvdecRegisters, field_name) / sizeof(u64))

-Nvdec::Nvdec(Host1x& host1x_, s32 id_, u32 syncpt)
-    : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt} {
-    LOG_INFO(HW_GPU, "Created nvdec {}", id);
-}
+Nvdec::Nvdec(Host1x& host1x_)
+    : host1x(host1x_), state{}, codec(std::make_unique<Codec>(host1x, state)) {}

-Nvdec::~Nvdec() {
-    LOG_INFO(HW_GPU, "Destroying nvdec {}", id);
-}
+Nvdec::~Nvdec() = default;

 void Nvdec::ProcessMethod(u32 method, u32 argument) {
-    regs.reg_array[method] = argument;
+    state.reg_array[method] = static_cast<u64>(argument) << 8;

    switch (method) {
    case NVDEC_REG_INDEX(set_codec_id):
-        CreateDecoder(static_cast<NvdecCommon::VideoCodec>(argument));
+        codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(argument));
        break;
-    case NVDEC_REG_INDEX(execute): {
-        if (wait_needed) {
-            std::this_thread::sleep_for(std::chrono::milliseconds(32));
-            wait_needed = false;
-        }
+    case NVDEC_REG_INDEX(execute):
        Execute();
-    } break;
+        break;
    }
 }

-void Nvdec::CreateDecoder(NvdecCommon::VideoCodec codec) {
-    if (decoder.get()) {
-        return;
-    }
-    switch (codec) {
-    case NvdecCommon::VideoCodec::H264:
-        decoder = std::make_unique<Decoders::H264>(host1x, regs, id);
-        break;
-    case NvdecCommon::VideoCodec::VP8:
-        decoder = std::make_unique<Decoders::VP8>(host1x, regs, id);
-        break;
-    case NvdecCommon::VideoCodec::VP9:
-        decoder = std::make_unique<Decoders::VP9>(host1x, regs, id);
-        break;
-    default:
-        UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName());
-        break;
-    }
-    LOG_INFO(HW_GPU, "Created decoder {} for id {}", decoder->GetCurrentCodecName(), id);
+std::unique_ptr<FFmpeg::Frame> Nvdec::GetFrame() {
+    return codec->GetCurrentFrame();
 }

 void Nvdec::Execute() {
-    if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] {
-        // Signalling syncpts too fast can cause games to get stuck as they don't expect a <1ms
-        // execution time. Sleep for half of a 60 fps frame just in case.
-        std::this_thread::sleep_for(std::chrono::milliseconds(8));
-        return;
-    }
-    switch (decoder->GetCurrentCodec()) {
+    switch (codec->GetCurrentCodec()) {
    case NvdecCommon::VideoCodec::H264:
    case NvdecCommon::VideoCodec::VP8:
    case NvdecCommon::VideoCodec::VP9:
-        decoder->Decode();
+        codec->Decode();
        break;
    default:
-        UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName());
+        UNIMPLEMENTED_MSG("Codec {}", codec->GetCurrentCodecName());
        break;
    }
 }
--- a/src/video_core/host1x/nvdec.h
+++ b/src/video_core/host1x/nvdec.h
@@ -5,49 +5,33 @@

 #include <memory>
 #include <vector>
-
 #include "common/common_types.h"
-#include "video_core/cdma_pusher.h"
-#include "video_core/host1x/codecs/decoder.h"
+#include "video_core/host1x/codecs/codec.h"

 namespace Tegra {

 namespace Host1x {
+
 class Host1x;

-class Nvdec final : public CDmaPusher {
+class Nvdec {
 public:
-    explicit Nvdec(Host1x& host1x, s32 id, u32 syncpt);
+    explicit Nvdec(Host1x& host1x);
    ~Nvdec();

    /// Writes the method into the state, Invoke Execute() if encountered
-    void ProcessMethod(u32 method, u32 arg) override;
+    void ProcessMethod(u32 method, u32 argument);

-    std::shared_ptr<FFmpeg::Frame> GetFrame(u64 luma_offset) {
-        return decoder->GetFrame(luma_offset);
-    }
-
-    u32 GetSyncpoint() const {
-        return syncpoint;
-    }
-
-    void SetWait() {
-        wait_needed = true;
-    }
+    /// Return most recently decoded frame
+    [[nodiscard]] std::unique_ptr<FFmpeg::Frame> GetFrame();

 private:
-    /// Create the decoder when the codec id is set
-    void CreateDecoder(NvdecCommon::VideoCodec codec);
-
    /// Invoke codec to decode a frame
    void Execute();

-    s32 id;
-    u32 syncpoint;
-
-    NvdecCommon::NvdecRegisters regs{};
-    std::unique_ptr<Decoder> decoder;
-    bool wait_needed{false};
+    Host1x& host1x;
+    NvdecCommon::NvdecRegisters state;
+    std::unique_ptr<Codec> codec;
 };

 } // namespace Host1x
--- a/src/video_core/host1x/nvdec_common.h
+++ b/src/video_core/host1x/nvdec_common.h
@@ -17,17 +17,6 @@ enum class VideoCodec : u64 {
    VP9 = 0x9,
 };

-struct Offset {
-    constexpr u64 Address() const noexcept {
-        return offset << 8;
-    }
-
-private:
-    u64 offset;
-};
-static_assert(std::is_trivial_v<Offset>, "Offset must be trivial");
-static_assert(sizeof(Offset) == 0x8, "Offset has the wrong size!");
-
 // NVDEC should use a 32-bit address space, but is mapped to 64-bit,
 // doubling the sizes here is compensating for that.
 struct NvdecRegisters {
@@ -49,40 +38,29 @@ struct NvdecRegisters {
                    BitField<17, 1, u64> all_intra_frame;
                };
            } control_params;
-            Offset picture_info_offset;                    ///< 0x0808
-            Offset frame_bitstream_offset;                 ///< 0x0810
-            u64 frame_number;                              ///< 0x0818
-            Offset h264_slice_data_offsets;                ///< 0x0820
-            Offset h264_mv_dump_offset;                    ///< 0x0828
-            INSERT_PADDING_WORDS_NOINIT(6);                ///< 0x0830
-            Offset frame_stats_offset;                     ///< 0x0848
-            Offset h264_last_surface_luma_offset;          ///< 0x0850
-            Offset h264_last_surface_chroma_offset;        ///< 0x0858
-            std::array<Offset, 17> surface_luma_offsets;   ///< 0x0860
-            std::array<Offset, 17> surface_chroma_offsets; ///< 0x08E8
-            Offset pic_scratch_buf_offset;                 ///< 0x0970
-            Offset external_mvbuffer_offset;               ///< 0x0978
-            INSERT_PADDING_WORDS_NOINIT(32);               ///< 0x0980
-            Offset h264_mbhist_buffer_offset;              ///< 0x0A00
-            INSERT_PADDING_WORDS_NOINIT(30);               ///< 0x0A08
-            Offset vp8_prob_data_offset;                   ///< 0x0A80
-            Offset vp8_header_partition_buf_offset;        ///< 0x0A88
-            INSERT_PADDING_WORDS_NOINIT(28);               ///< 0x0A90
-            Offset hvec_scalist_list_offset;               ///< 0x0B00
-            Offset hvec_tile_sizes_offset;                 ///< 0x0B08
-            Offset hvec_filter_buffer_offset;              ///< 0x0B10
-            Offset hvec_sao_buffer_offset;                 ///< 0x0B18
-            Offset hvec_slice_info_buffer_offset;          ///< 0x0B20
-            Offset hvec_slice_group_index_offset;          ///< 0x0B28
-            INSERT_PADDING_WORDS_NOINIT(20);               ///< 0x0B30
-            Offset vp9_prob_tab_buffer_offset;             ///< 0x0B80
-            Offset vp9_ctx_counter_buffer_offset;          ///< 0x0B88
-            Offset vp9_segment_read_buffer_offset;         ///< 0x0B90
-            Offset vp9_segment_write_buffer_offset;        ///< 0x0B98
-            Offset vp9_tile_size_buffer_offset;            ///< 0x0BA0
-            Offset vp9_col_mvwrite_buffer_offset;          ///< 0x0BA8
-            Offset vp9_col_mvread_buffer_offset;           ///< 0x0BB0
-            Offset vp9_filter_buffer_offset;               ///< 0x0BB8
+            u64 picture_info_offset;                   ///< 0x0808
+            u64 frame_bitstream_offset;                ///< 0x0810
+            u64 frame_number;                          ///< 0x0818
+            u64 h264_slice_data_offsets;               ///< 0x0820
+            u64 h264_mv_dump_offset;                   ///< 0x0828
+            INSERT_PADDING_WORDS_NOINIT(6);            ///< 0x0830
+            u64 frame_stats_offset;                    ///< 0x0848
+            u64 h264_last_surface_luma_offset;         ///< 0x0850
+            u64 h264_last_surface_chroma_offset;       ///< 0x0858
+            std::array<u64, 17> surface_luma_offset;   ///< 0x0860
+            std::array<u64, 17> surface_chroma_offset; ///< 0x08E8
+            INSERT_PADDING_WORDS_NOINIT(68);           ///< 0x0970
+            u64 vp8_prob_data_offset;                  ///< 0x0A80
+            u64 vp8_header_partition_buf_offset;       ///< 0x0A88
+            INSERT_PADDING_WORDS_NOINIT(60);           ///< 0x0A90
+            u64 vp9_entropy_probs_offset;              ///< 0x0B80
+            u64 vp9_backward_updates_offset;           ///< 0x0B88
+            u64 vp9_last_frame_segmap_offset;          ///< 0x0B90
+            u64 vp9_curr_frame_segmap_offset;          ///< 0x0B98
+            INSERT_PADDING_WORDS_NOINIT(2);            ///< 0x0BA0
+            u64 vp9_last_frame_mvs_offset;             ///< 0x0BA8
+            u64 vp9_curr_frame_mvs_offset;             ///< 0x0BB0
+            INSERT_PADDING_WORDS_NOINIT(2);            ///< 0x0BB8
        };
        std::array<u64, NUM_REGS> reg_array;
    };
@@ -103,16 +81,16 @@ ASSERT_REG_POSITION(h264_slice_data_offsets, 0x104);
 ASSERT_REG_POSITION(frame_stats_offset, 0x109);
 ASSERT_REG_POSITION(h264_last_surface_luma_offset, 0x10A);
 ASSERT_REG_POSITION(h264_last_surface_chroma_offset, 0x10B);
-ASSERT_REG_POSITION(surface_luma_offsets, 0x10C);
-ASSERT_REG_POSITION(surface_chroma_offsets, 0x11D);
+ASSERT_REG_POSITION(surface_luma_offset, 0x10C);
+ASSERT_REG_POSITION(surface_chroma_offset, 0x11D);
 ASSERT_REG_POSITION(vp8_prob_data_offset, 0x150);
 ASSERT_REG_POSITION(vp8_header_partition_buf_offset, 0x151);
-ASSERT_REG_POSITION(vp9_prob_tab_buffer_offset, 0x170);
-ASSERT_REG_POSITION(vp9_ctx_counter_buffer_offset, 0x171);
-ASSERT_REG_POSITION(vp9_segment_read_buffer_offset, 0x172);
-ASSERT_REG_POSITION(vp9_segment_write_buffer_offset, 0x173);
-ASSERT_REG_POSITION(vp9_col_mvwrite_buffer_offset, 0x175);
-ASSERT_REG_POSITION(vp9_col_mvread_buffer_offset, 0x176);
+ASSERT_REG_POSITION(vp9_entropy_probs_offset, 0x170);
+ASSERT_REG_POSITION(vp9_backward_updates_offset, 0x171);
+ASSERT_REG_POSITION(vp9_last_frame_segmap_offset, 0x172);
+ASSERT_REG_POSITION(vp9_curr_frame_segmap_offset, 0x173);
+ASSERT_REG_POSITION(vp9_last_frame_mvs_offset, 0x175);
+ASSERT_REG_POSITION(vp9_curr_frame_mvs_offset, 0x176);

 #undef ASSERT_REG_POSITION

--- a/src/video_core/host1x/syncpoint_manager.cpp
+++ b/src/video_core/host1x/syncpoint_manager.cpp
@@ -18,7 +18,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction(
        return {};
    }

-    std::scoped_lock lk(guard);
+    std::unique_lock lk(guard);
    if (syncpoint.load(std::memory_order_relaxed) >= expected_value) {
        action();
        return {};
@@ -35,7 +35,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction(

 void SyncpointManager::DeregisterAction(std::list<RegisteredAction>& action_storage,
                                        const ActionHandle& handle) {
-    std::scoped_lock lk(guard);
+    std::unique_lock lk(guard);

    // We want to ensure the iterator still exists prior to erasing it
    // Otherwise, if an invalid iterator was passed in then it could lead to UB
@@ -78,7 +78,7 @@ void SyncpointManager::Increment(std::atomic<u32>& syncpoint, std::condition_var
                                 std::list<RegisteredAction>& action_storage) {
    auto new_value{syncpoint.fetch_add(1, std::memory_order_acq_rel) + 1};

-    std::scoped_lock lk(guard);
+    std::unique_lock lk(guard);
    auto it = action_storage.begin();
    while (it != action_storage.end()) {
        if (it->expected_value > new_value) {
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
--- a/src/video_core/host1x/vic.h
+++ b/src/video_core/host1x/vic.h
@@ -3,645 +3,65 @@

 #pragma once

-#include <condition_variable>
-#include <functional>
 #include <memory>
-#include <mutex>
-#include <thread>

 #include "common/common_types.h"
 #include "common/scratch_buffer.h"
-#include "video_core/cdma_pusher.h"

-namespace Tegra::Host1x {
+struct SwsContext;
+
+namespace Tegra {
+
+namespace Host1x {
+
 class Host1x;
 class Nvdec;
+union VicConfig;

-struct Pixel {
-    u16 r;
-    u16 g;
-    u16 b;
-    u16 a;
-};
-
-// One underscore represents separate pixels.
-// Double underscore represents separate planes.
-// _N represents chroma subsampling, not a separate pixel.
-enum class VideoPixelFormat : u32 {
-    A8 = 0,
-    L8 = 1,
-    A4L4 = 2,
-    L4A4 = 3,
-    R8 = 4,
-    A8L8 = 5,
-    L8A8 = 6,
-    R8G8 = 7,
-    G8R8 = 8,
-    B5G6R5 = 9,
-    R5G6B5 = 10,
-    B6G5R5 = 11,
-    R5G5B6 = 12,
-    A1B5G5R5 = 13,
-    A1R5G5B5 = 14,
-    B5G5R5A1 = 15,
-    R5G5B5A1 = 16,
-    A5B5G5R1 = 17,
-    A5R1G5B5 = 18,
-    B5G5R1A5 = 19,
-    R1G5B5A5 = 20,
-    X1B5G5R5 = 21,
-    X1R5G5B5 = 22,
-    B5G5R5X1 = 23,
-    R5G5B5X1 = 24,
-    A4B4G5R4 = 25,
-    A4R4G4B4 = 26,
-    B4G4R4A4 = 27,
-    R4G4B4A4 = 28,
-    B8G8R8 = 29,
-    R8G8B8 = 30,
-    A8B8G8R8 = 31,
-    A8R8G8B8 = 32,
-    B8G8R8A8 = 33,
-    R8G8B8A8 = 34,
-    X8B8G8R8 = 35,
-    X8R8G8B8 = 36,
-    B8G8R8X8 = 37,
-    R8G8B8X8 = 38,
-    A8B10G10R10 = 39,
-    A2R10G10B10 = 40,
-    B10G10R10A2 = 41,
-    R10G10B10A2 = 42,
-    A4P4 = 43,
-    P4A4 = 44,
-    P8A8 = 45,
-    A8P8 = 46,
-    P8 = 47,
-    P1 = 48,
-    U8V8 = 49,
-    V8U8 = 50,
-    A8Y8U8V8 = 51,
-    V8U8Y8A8 = 52,
-    Y8U8V8 = 53,
-    Y8V8U8 = 54,
-    U8V8Y8 = 55,
-    V8U8Y8 = 56,
-    Y8U8_Y8V8 = 57,
-    Y8V8_Y8U8 = 58,
-    U8Y8_V8Y8 = 59,
-    V8Y8_U8Y8 = 60,
-    Y8__U8V8_N444 = 61,
-    Y8__V8U8_N444 = 62,
-    Y8__U8V8_N422 = 63,
-    Y8__V8U8_N422 = 64,
-    Y8__U8V8_N422R = 65,
-    Y8__V8U8_N422R = 66,
-    Y8__U8V8_N420 = 67,
-    Y8__V8U8_N420 = 68,
-    Y8__U8__V8_N444 = 69,
-    Y8__U8__V8_N422 = 70,
-    Y8__U8__V8_N422R = 71,
-    Y8__U8__V8_N420 = 72,
-    U8 = 73,
-    V8 = 74,
-};
-
-struct Offset {
-    constexpr u32 Address() const noexcept {
-        return offset << 8;
-    }
-
-private:
-    u32 offset;
-};
-static_assert(std::is_trivial_v<Offset>, "Offset must be trivial");
-static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!");
-
-struct PlaneOffsets {
-    Offset luma;
-    Offset chroma_u;
-    Offset chroma_v;
-};
-static_assert(sizeof(PlaneOffsets) == 0xC, "PlaneOffsets has the wrong size!");
-
-enum SurfaceIndex : u32 {
-    Current = 0,
-    Previous = 1,
-    Next = 2,
-    NextNoiseReduced = 3,
-    CurrentMotion = 4,
-    PreviousMotion = 5,
-    PreviousPreviousMotion = 6,
-    CombinedMotion = 7,
-};
-
-enum class DXVAHD_ALPHA_FILL_MODE : u32 {
-    OPAQUE = 0,
-    BACKGROUND = 1,
-    DESTINATION = 2,
-    SOURCE_STREAM = 3,
-    COMPOSITED = 4,
-    SOURCE_ALPHA = 5,
-};
-
-enum class DXVAHD_FRAME_FORMAT : u64 {
-    PROGRESSIVE = 0,
-    INTERLACED_TOP_FIELD_FIRST = 1,
-    INTERLACED_BOTTOM_FIELD_FIRST = 2,
-    TOP_FIELD = 3,
-    BOTTOM_FIELD = 4,
-    SUBPIC_PROGRESSIVE = 5,
-    SUBPIC_INTERLACED_TOP_FIELD_FIRST = 6,
-    SUBPIC_INTERLACED_BOTTOM_FIELD_FIRST = 7,
-    SUBPIC_TOP_FIELD = 8,
-    SUBPIC_BOTTOM_FIELD = 9,
-    TOP_FIELD_CHROMA_BOTTOM = 10,
-    BOTTOM_FIELD_CHROMA_TOP = 11,
-    SUBPIC_TOP_FIELD_CHROMA_BOTTOM = 12,
-    SUBPIC_BOTTOM_FIELD_CHROMA_TOP = 13,
-};
-
-enum class DXVAHD_DEINTERLACE_MODE_PRIVATE : u64 {
-    WEAVE = 0,
-    BOB_FIELD = 1,
-    BOB = 2,
-    NEWBOB = 3,
-    DISI1 = 4,
-    WEAVE_LUMA_BOB_FIELD_CHROMA = 5,
-    MAX = 0xF,
-};
-
-enum class BLK_KIND {
-    PITCH = 0,
-    GENERIC_16Bx2 = 1,
-    // These are unsupported in the vic
-    BL_NAIVE = 2,
-    BL_KEPLER_XBAR_RAW = 3,
-    VP2_TILED = 15,
-};
-
-enum class BLEND_SRCFACTC : u32 {
-    K1 = 0,
-    K1_TIMES_DST = 1,
-    NEG_K1_TIMES_DST = 2,
-    K1_TIMES_SRC = 3,
-    ZERO = 4,
-};
-
-enum class BLEND_DSTFACTC : u32 {
-    K1 = 0,
-    K2 = 1,
-    K1_TIMES_DST = 2,
-    NEG_K1_TIMES_DST = 3,
-    NEG_K1_TIMES_SRC = 4,
-    ZERO = 5,
-    ONE = 6,
-};
-
-enum class BLEND_SRCFACTA : u32 {
-    K1 = 0,
-    K2 = 1,
-    NEG_K1_TIMES_DST = 2,
-    ZERO = 3,
-    MAX = 7,
-};
-
-enum class BLEND_DSTFACTA : u32 {
-    K2 = 0,
-    NEG_K1_TIMES_SRC = 1,
-    ZERO = 2,
-    ONE = 3,
-    MAX = 7,
-};
-
-struct PipeConfig {
-    union {
-        BitField<0, 11, u32> downsample_horiz;
-        BitField<11, 5, u32> reserved0;
-        BitField<16, 11, u32> downsample_vert;
-        BitField<27, 5, u32> reserved1;
-    };
-    u32 reserved2;
-    u32 reserved3;
-    u32 reserved4;
-};
-static_assert(sizeof(PipeConfig) == 0x10, "PipeConfig has the wrong size!");
-
-struct OutputConfig {
-    union {
-        BitField<0, 3, DXVAHD_ALPHA_FILL_MODE> alpha_fill_mode;
-        BitField<3, 3, u64> alpha_fill_slot;
-        BitField<6, 10, u64> background_a;
-        BitField<16, 10, u64> background_r;
-        BitField<26, 10, u64> background_g;
-        BitField<36, 10, u64> background_b;
-        BitField<46, 2, u64> regamma_mode;
-        BitField<48, 1, u64> output_flip_x;
-        BitField<49, 1, u64> output_flip_y;
-        BitField<50, 1, u64> output_transpose;
-        BitField<51, 1, u64> reserved1;
-        BitField<52, 12, u64> reserved2;
-    };
-    union {
-        BitField<0, 14, u32> target_rect_left;
-        BitField<14, 2, u32> reserved3;
-        BitField<16, 14, u32> target_rect_right;
-        BitField<30, 2, u32> reserved4;
-    };
-    union {
-        BitField<0, 14, u32> target_rect_top;
-        BitField<14, 2, u32> reserved5;
-        BitField<16, 14, u32> target_rect_bottom;
-        BitField<30, 2, u32> reserved6;
-    };
-};
-static_assert(sizeof(OutputConfig) == 0x10, "OutputConfig has the wrong size!");
-
-struct OutputSurfaceConfig {
-    union {
-        BitField<0, 7, VideoPixelFormat> out_pixel_format;
-        BitField<7, 2, u32> out_chroma_loc_horiz;
-        BitField<9, 2, u32> out_chroma_loc_vert;
-        BitField<11, 4, BLK_KIND> out_block_kind;
-        BitField<15, 4, u32> out_block_height; // in gobs, log2
-        BitField<19, 3, u32> reserved0;
-        BitField<22, 10, u32> reserved1;
-    };
-    union {
-        BitField<0, 14, u32> out_surface_width;   // - 1
-        BitField<14, 14, u32> out_surface_height; // - 1
-        BitField<28, 4, u32> reserved2;
-    };
-    union {
-        BitField<0, 14, u32> out_luma_width;   // - 1
-        BitField<14, 14, u32> out_luma_height; // - 1
-        BitField<28, 4, u32> reserved3;
-    };
-    union {
-        BitField<0, 14, u32> out_chroma_width;   // - 1
-        BitField<14, 14, u32> out_chroma_height; // - 1
-        BitField<28, 4, u32> reserved4;
-    };
-};
-static_assert(sizeof(OutputSurfaceConfig) == 0x10, "OutputSurfaceConfig has the wrong size!");
-
-struct MatrixStruct {
-    union {
-        BitField<0, 20, s64> matrix_coeff00;  // (0,0) of 4x3 conversion matrix
-        BitField<20, 20, s64> matrix_coeff10; // (1,0) of 4x3 conversion matrix
-        BitField<40, 20, s64> matrix_coeff20; // (2,0) of 4x3 conversion matrix
-        BitField<60, 4, u64> matrix_r_shift;
-    };
-    union {
-        BitField<0, 20, s64> matrix_coeff01;  // (0,1) of 4x3 conversion matrix
-        BitField<20, 20, s64> matrix_coeff11; // (1,1) of 4x3 conversion matrix
-        BitField<40, 20, s64> matrix_coeff21; // (2,1) of 4x3 conversion matrix
-        BitField<60, 3, u64> reserved0;
-        BitField<63, 1, u64> matrix_enable;
-    };
-    union {
-        BitField<0, 20, s64> matrix_coeff02;  // (0,2) of 4x3 conversion matrix
-        BitField<20, 20, s64> matrix_coeff12; // (1,2) of 4x3 conversion matrix
-        BitField<40, 20, s64> matrix_coeff22; // (2,2) of 4x3 conversion matrix
-        BitField<60, 4, u64> reserved1;
-    };
-    union {
-        BitField<0, 20, s64> matrix_coeff03;  // (0,3) of 4x3 conversion matrix
-        BitField<20, 20, s64> matrix_coeff13; // (1,3) of 4x3 conversion matrix
-        BitField<40, 20, s64> matrix_coeff23; // (2,3) of 4x3 conversion matrix
-        BitField<60, 4, u64> reserved2;
-    };
-};
-static_assert(sizeof(MatrixStruct) == 0x20, "MatrixStruct has the wrong size!");
-
-struct ClearRectStruct {
-    union {
-        BitField<0, 14, u32> clear_rect0_left;
-        BitField<14, 2, u32> reserved0;
-        BitField<16, 14, u32> clear_rect0_right;
-        BitField<30, 2, u32> reserved1;
-    };
-    union {
-        BitField<0, 14, u32> clear_rect0_top;
-        BitField<14, 2, u32> reserved2;
-        BitField<16, 14, u32> clear_rect0_bottom;
-        BitField<30, 2, u32> reserved3;
-    };
-    union {
-        BitField<0, 14, u32> clear_rect1_left;
-        BitField<14, 2, u32> reserved4;
-        BitField<16, 14, u32> clear_rect1_right;
-        BitField<30, 2, u32> reserved5;
-    };
-    union {
-        BitField<0, 14, u32> clear_rect1_top;
-        BitField<14, 2, u32> reserved6;
-        BitField<16, 14, u32> clear_rect1_bottom;
-        BitField<30, 2, u32> reserved7;
-    };
-};
-static_assert(sizeof(ClearRectStruct) == 0x10, "ClearRectStruct has the wrong size!");
-
-struct SlotConfig {
-    union {
-        BitField<0, 1, u64> slot_enable;
-        BitField<1, 1, u64> denoise;
-        BitField<2, 1, u64> advanced_denoise;
-        BitField<3, 1, u64> cadence_detect;
-        BitField<4, 1, u64> motion_map;
-        BitField<5, 1, u64> motion_map_capture;
-        BitField<6, 1, u64> is_even;
-        BitField<7, 1, u64> chroma_even;
-        // fetch control struct
-        BitField<8, 1, u64> current_field_enable;
-        BitField<9, 1, u64> prev_field_enable;
-        BitField<10, 1, u64> next_field_enable;
-        BitField<11, 1, u64> next_nr_field_enable; // noise reduction
-        BitField<12, 1, u64> current_motion_field_enable;
-        BitField<13, 1, u64> prev_motion_field_enable;
-        BitField<14, 1, u64> prev_prev_motion_field_enable;
-        BitField<15, 1, u64> combined_motion_field_enable;
-
-        BitField<16, 4, DXVAHD_FRAME_FORMAT> frame_format;
-        BitField<20, 2, u64> filter_length_y; // 0: 1-tap, 1: 2-tap, 2: 5-tap, 3: 10-tap
-        BitField<22, 2, u64> filter_length_x;
-        BitField<24, 12, u64> panoramic;
-        BitField<36, 22, u64> reserved1;
-        BitField<58, 6, u64> detail_filter_clamp;
-    };
-    union {
-        BitField<0, 10, u64> filter_noise;
-        BitField<10, 10, u64> filter_detail;
-        BitField<20, 10, u64> chroma_noise;
-        BitField<30, 10, u64> chroma_detail;
-        BitField<40, 4, DXVAHD_DEINTERLACE_MODE_PRIVATE> deinterlace_mode;
-        BitField<44, 3, u64> motion_accumulation_weight;
-        BitField<47, 11, u64> noise_iir;
-        BitField<58, 4, u64> light_level;
-        BitField<62, 2, u64> reserved4;
-    };
-    union {
-        BitField<0, 10, u64> soft_clamp_low;
-        BitField<10, 10, u64> soft_clamp_high;
-        BitField<20, 3, u64> reserved5;
-        BitField<23, 9, u64> reserved6;
-        BitField<32, 10, u64> planar_alpha;
-        BitField<42, 1, u64> constant_alpha;
-        BitField<43, 3, u64> stereo_interleave;
-        BitField<46, 1, u64> clip_enabled;
-        BitField<47, 8, u64> clear_rect_mask;
-        BitField<55, 2, u64> degamma_mode;
-        BitField<57, 1, u64> reserved7;
-        BitField<58, 1, u64> decompress_enable;
-        BitField<59, 5, u64> reserved9;
-    };
-    union {
-        BitField<0, 8, u64> decompress_ctb_count;
-        BitField<8, 32, u64> decompress_zbc_count;
-        BitField<40, 24, u64> reserved12;
-    };
-    union {
-        BitField<0, 30, u64> source_rect_left;
-        BitField<30, 2, u64> reserved14;
-        BitField<32, 30, u64> source_rect_right;
-        BitField<62, 2, u64> reserved15;
-    };
-    union {
-        BitField<0, 30, u64> source_rect_top;
-        BitField<30, 2, u64> reserved16;
-        BitField<32, 30, u64> source_rect_bottom;
-        BitField<62, 2, u64> reserved17;
-    };
-    union {
-        BitField<0, 14, u64> dest_rect_left;
-        BitField<14, 2, u64> reserved18;
-        BitField<16, 14, u64> dest_rect_right;
-        BitField<30, 2, u64> reserved19;
-        BitField<32, 14, u64> dest_rect_top;
-        BitField<46, 2, u64> reserved20;
-        BitField<48, 14, u64> dest_rect_bottom;
-        BitField<62, 2, u64> reserved21;
-    };
-    u32 reserved22;
-    u32 reserved23;
-};
-static_assert(sizeof(SlotConfig) == 0x40, "SlotConfig has the wrong size!");
-
-struct SlotSurfaceConfig {
-    union {
-        BitField<0, 7, VideoPixelFormat> slot_pixel_format;
-        BitField<7, 2, u32> slot_chroma_loc_horiz;
-        BitField<9, 2, u32> slot_chroma_loc_vert;
-        BitField<11, 4, u32> slot_block_kind;
-        BitField<15, 4, u32> slot_block_height;
-        BitField<19, 3, u32> slot_cache_width;
-        BitField<22, 10, u32> reserved0;
-    };
-    union {
-        BitField<0, 14, u32> slot_surface_width;   //  - 1
-        BitField<14, 14, u32> slot_surface_height; //  - 1
-        BitField<28, 4, u32> reserved1;
-    };
-    union {
-        BitField<0, 14, u32> slot_luma_width;   // padded, - 1
-        BitField<14, 14, u32> slot_luma_height; // padded, - 1
-        BitField<28, 4, u32> reserved2;
-    };
-    union {
-        BitField<0, 14, u32> slot_chroma_width;   // padded, - 1
-        BitField<14, 14, u32> slot_chroma_height; // padded, - 1
-        BitField<28, 4, u32> reserved3;
-    };
-};
-static_assert(sizeof(SlotSurfaceConfig) == 0x10, "SlotSurfaceConfig has the wrong size!");
-
-struct LumaKeyStruct {
-    union {
-        BitField<0, 20, u64> luma_coeff0;  // (0) of 4x1 conversion matrix, S12.8 format
-        BitField<20, 20, u64> luma_coeff1; // (1) of 4x1 conversion matrix, S12.8 format
-        BitField<40, 20, u64> luma_coeff2; // (2) of 4x1 conversion matrix, S12.8 format
-        BitField<60, 4, u64> luma_r_shift;
-    };
-    union {
-        BitField<0, 20, u64> luma_coeff3; // (3) of 4x1 conversion matrix, S12.8 format
-        BitField<20, 10, u64> luma_key_lower;
-        BitField<30, 10, u64> luma_key_upper;
-        BitField<40, 1, u64> luma_key_enabled;
-        BitField<41, 2, u64> reserved0;
-        BitField<43, 21, u64> reserved1;
-    };
-};
-static_assert(sizeof(LumaKeyStruct) == 0x10, "LumaKeyStruct has the wrong size!");
-
-struct BlendingSlotStruct {
-    union {
-        BitField<0, 10, u32> alpha_k1;
-        BitField<10, 6, u32> reserved0;
-        BitField<16, 10, u32> alpha_k2;
-        BitField<26, 6, u32> reserved1;
-    };
-    union {
-        BitField<0, 3, BLEND_SRCFACTC> src_factor_color_match_select;
-        BitField<3, 1, u32> reserved2;
-        BitField<4, 3, BLEND_DSTFACTC> dst_factor_color_match_select;
-        BitField<7, 1, u32> reserved3;
-        BitField<8, 3, BLEND_SRCFACTA> src_factor_a_match_select;
-        BitField<11, 1, u32> reserved4;
-        BitField<12, 3, BLEND_DSTFACTA> dst_factor_a_match_select;
-        BitField<15, 1, u32> reserved5;
-        BitField<16, 4, u32> reserved6;
-        BitField<20, 4, u32> reserved7;
-        BitField<24, 4, u32> reserved8;
-        BitField<28, 4, u32> reserved9;
-    };
-    union {
-        BitField<0, 2, u32> reserved10;
-        BitField<2, 10, u32> override_r;
-        BitField<12, 10, u32> override_g;
-        BitField<22, 10, u32> override_b;
-    };
-    union {
-        BitField<0, 10, u32> override_a;
-        BitField<10, 2, u32> reserved11;
-        BitField<12, 1, u32> use_override_r;
-        BitField<13, 1, u32> use_override_g;
-        BitField<14, 1, u32> use_override_b;
-        BitField<15, 1, u32> use_override_a;
-        BitField<16, 1, u32> mask_r;
-        BitField<17, 1, u32> mask_g;
-        BitField<18, 1, u32> mask_b;
-        BitField<19, 1, u32> mask_a;
-        BitField<20, 12, u32> reserved12;
-    };
-};
-static_assert(sizeof(BlendingSlotStruct) == 0x10, "BlendingSlotStruct has the wrong size!");
-
-struct SlotStruct {
-    SlotConfig config;
-    SlotSurfaceConfig surface_config;
-    LumaKeyStruct luma_key;
-    MatrixStruct color_matrix;
-    MatrixStruct gamut_matrix;
-    BlendingSlotStruct blending;
-};
-static_assert(sizeof(SlotStruct) == 0xB0, "SlotStruct has the wrong size!");
-
-struct ConfigStruct {
-    PipeConfig pipe_config;
-    OutputConfig output_config;
-    OutputSurfaceConfig output_surface_config;
-    MatrixStruct out_color_matrix;
-    std::array<ClearRectStruct, 4> clear_rects;
-    std::array<SlotStruct, 8> slot_structs;
-};
-static_assert(offsetof(ConfigStruct, pipe_config) == 0x0, "pipe_config is in the wrong place!");
-static_assert(offsetof(ConfigStruct, output_config) == 0x10,
-              "output_config is in the wrong place!");
-static_assert(offsetof(ConfigStruct, output_surface_config) == 0x20,
-              "output_surface_config is in the wrong place!");
-static_assert(offsetof(ConfigStruct, out_color_matrix) == 0x30,
-              "out_color_matrix is in the wrong place!");
-static_assert(offsetof(ConfigStruct, clear_rects) == 0x50, "clear_rects is in the wrong place!");
-static_assert(offsetof(ConfigStruct, slot_structs) == 0x90, "slot_structs is in the wrong place!");
-static_assert(sizeof(ConfigStruct) == 0x610, "ConfigStruct has the wrong size!");
-
-struct VicRegisters {
-    static constexpr std::size_t NUM_REGS = 0x446;
-
-    union {
-        struct {
-            INSERT_PADDING_WORDS_NOINIT(0xC0);
-            u32 execute;
-            INSERT_PADDING_WORDS_NOINIT(0x3F);
-            std::array<std::array<PlaneOffsets, 8>, 8> surfaces;
-            u32 picture_index;
-            u32 control_params;
-            Offset config_struct_offset;
-            Offset filter_struct_offset;
-            Offset palette_offset;
-            Offset hist_offset;
-            u32 context_id;
-            u32 fce_ucode_size;
-            PlaneOffsets output_surface;
-            Offset fce_ucode_offset;
-            INSERT_PADDING_WORDS_NOINIT(0x4);
-            std::array<u32, 8> slot_context_ids;
-            std::array<Offset, 8> comp_tag_buffer_offsets;
-            std::array<Offset, 8> history_buffer_offset;
-            INSERT_PADDING_WORDS_NOINIT(0x25D);
-            u32 pm_trigger_end;
-        };
-        std::array<u32, NUM_REGS> reg_array;
-    };
-};
-static_assert(offsetof(VicRegisters, execute) == 0x300, "execute is in the wrong place!");
-static_assert(offsetof(VicRegisters, surfaces) == 0x400, "surfaces is in the wrong place!");
-static_assert(offsetof(VicRegisters, picture_index) == 0x700,
-              "picture_index is in the wrong place!");
-static_assert(offsetof(VicRegisters, control_params) == 0x704,
-              "control_params is in the wrong place!");
-static_assert(offsetof(VicRegisters, config_struct_offset) == 0x708,
-              "config_struct_offset is in the wrong place!");
-static_assert(offsetof(VicRegisters, output_surface) == 0x720,
-              "output_surface is in the wrong place!");
-static_assert(offsetof(VicRegisters, slot_context_ids) == 0x740,
-              "slot_context_ids is in the wrong place!");
-static_assert(offsetof(VicRegisters, history_buffer_offset) == 0x780,
-              "history_buffer_offset is in the wrong place!");
-static_assert(offsetof(VicRegisters, pm_trigger_end) == 0x1114,
-              "pm_trigger_end is in the wrong place!");
-static_assert(sizeof(VicRegisters) == 0x1118, "VicRegisters has the wrong size!");
-
-class Vic final : public CDmaPusher {
+class Vic {
 public:
    enum class Method : u32 {
-        Execute = offsetof(VicRegisters, execute),
-        SetControlParams = offsetof(VicRegisters, control_params),
-        SetConfigStructOffset = offsetof(VicRegisters, config_struct_offset),
-        SetOutputSurfaceLumaOffset = offsetof(VicRegisters, output_surface.luma),
-        SetOutputSurfaceChromaOffset = offsetof(VicRegisters, output_surface.chroma_u),
-        SetOutputSurfaceChromaUnusedOffset = offsetof(VicRegisters, output_surface.chroma_v)
+        Execute = 0xc0,
+        SetControlParams = 0x1c1,
+        SetConfigStructOffset = 0x1c2,
+        SetOutputSurfaceLumaOffset = 0x1c8,
+        SetOutputSurfaceChromaOffset = 0x1c9,
+        SetOutputSurfaceChromaUnusedOffset = 0x1ca
    };

-    explicit Vic(Host1x& host1x, s32 id, Nvdec& nvdec_processor, u32 syncpt);
+    explicit Vic(Host1x& host1x, std::shared_ptr<Nvdec> nvdec_processor);
+
    ~Vic();

    /// Write to the device state.
-    void ProcessMethod(u32 method, u32 arg) override;
+    void ProcessMethod(Method method, u32 argument);

 private:
    void Execute();

-    void Blend(const ConfigStruct& config, const SlotStruct& slot);
+    void WriteRGBFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config);

-    template <bool Planar, bool Interlaced = false>
-    void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
-                                      std::shared_ptr<const FFmpeg::Frame> frame);
-    template <bool Planar, bool TopField>
-    void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
-                                     std::shared_ptr<const FFmpeg::Frame> frame);
+    void WriteYUVFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config);

-    template <bool Planar>
-    void ReadY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
-                           std::shared_ptr<const FFmpeg::Frame> frame);
+    Host1x& host1x;
+    std::shared_ptr<Tegra::Host1x::Nvdec> nvdec_processor;

-    void WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config);
+    /// Avoid reallocation of the following buffers every frame, as their
+    /// size does not change during a stream
+    using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
+    AVMallocPtr converted_frame_buffer;
+    Common::ScratchBuffer<u8> luma_buffer;
+    Common::ScratchBuffer<u8> chroma_buffer;

-    template <VideoPixelFormat Format>
-    void WriteABGR(const OutputSurfaceConfig& output_surface_config);
+    GPUVAddr config_struct_address{};
+    GPUVAddr output_surface_luma_address{};
+    GPUVAddr output_surface_chroma_address{};

-    Nvdec& nvdec_processor;
-    s32 id;
-    u32 syncpoint;
-
-    VicRegisters regs{};
-
-    const bool has_sse41{false};
-
-    Common::ScratchBuffer<Pixel> output_surface;
-    Common::ScratchBuffer<Pixel> slot_surface;
-    Common::ScratchBuffer<u8> luma_scratch;
-    Common::ScratchBuffer<u8> chroma_scratch;
-    Common::ScratchBuffer<u8> swizzle_scratch;
+    SwsContext* scaler_ctx{};
+    s32 scaler_width{};
+    s32 scaler_height{};
 };

-} // namespace Tegra::Host1x
+} // namespace Host1x
+
+} // namespace Tegra
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -42,8 +42,6 @@ public:
                           u64 page_bits_ = 12);
    ~MemoryManager();

-    static constexpr bool HAS_FLUSH_INVALIDATION = true;
-
    size_t GetID() const {
        return unique_identifier;
    }