From afc99d73c72548c39c27860131906213e7dc09a7 Mon Sep 17 00:00:00 2001
From: pineappleEA
Date: Thu, 1 Feb 2024 07:01:42 +0100
Subject: [PATCH] early-access version 4105

---
 README.md | 2 +-
 src/core/device_memory_manager.h | 2 -
 src/core/guest_memory.h | 62 +-
 src/core/hle/service/nvdrv/core/container.h | 3 +
 .../service/nvdrv/devices/nvhost_nvdec.cpp | 15 +-
 .../nvdrv/devices/nvhost_nvdec_common.cpp | 27 +-
 .../nvdrv/devices/nvhost_nvdec_common.h | 1 -
 .../hle/service/nvdrv/devices/nvhost_vic.cpp | 15 +-
 src/core/memory.h | 2 -
 src/video_core/CMakeLists.txt | 6 +-
 src/video_core/cdma_pusher.cpp | 196 +--
 src/video_core/cdma_pusher.h | 87 +-
 src/video_core/engines/sw_blitter/blitter.cpp | 37 +-
 src/video_core/gpu.cpp | 33 +
 src/video_core/gpu.h | 6 +
 src/video_core/gpu_thread.cpp | 1 -
 src/video_core/host1x/codecs/h264.cpp | 142 +-
 src/video_core/host1x/codecs/h264.h | 292 ++--
 src/video_core/host1x/codecs/vp8.cpp | 72 +-
 src/video_core/host1x/codecs/vp8.h | 43 +-
 src/video_core/host1x/codecs/vp9.cpp | 151 +-
 src/video_core/host1x/codecs/vp9.h | 45 +-
 src/video_core/host1x/codecs/vp9_types.h | 27 +-
 src/video_core/host1x/control.cpp | 1 -
 src/video_core/host1x/control.h | 10 +-
 src/video_core/host1x/ffmpeg/ffmpeg.cpp | 236 ++--
 src/video_core/host1x/ffmpeg/ffmpeg.h | 61 +-
 src/video_core/host1x/host1x.cpp | 27 +-
 src/video_core/host1x/host1x.h | 42 +-
 src/video_core/host1x/nvdec.cpp | 61 +-
 src/video_core/host1x/nvdec.h | 36 +-
 src/video_core/host1x/nvdec_common.h | 84 +-
 src/video_core/host1x/syncpoint_manager.cpp | 6 +-
 src/video_core/host1x/vic.cpp | 1238 +++--------------
 src/video_core/host1x/vic.h | 652 +--------
 src/video_core/memory_manager.h | 2 -
 36 files changed, 958 insertions(+), 2765 deletions(-)

diff --git a/README.md b/README.md
index b50f7d45c..42379b1b9 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 yuzu emulator early access
 =============
 
-This is the source code for early-access 4104.
+This is the source code for early-access 4105.
## Legal Notice diff --git a/src/core/device_memory_manager.h b/src/core/device_memory_manager.h index 6dcf7bb22..0568a821b 100755 --- a/src/core/device_memory_manager.h +++ b/src/core/device_memory_manager.h @@ -43,8 +43,6 @@ public: DeviceMemoryManager(const DeviceMemory& device_memory); ~DeviceMemoryManager(); - static constexpr bool HAS_FLUSH_INVALIDATION = true; - void BindInterface(DeviceInterface* device_inter); DAddr Allocate(size_t size); diff --git a/src/core/guest_memory.h b/src/core/guest_memory.h index 83292f702..7ee18c126 100755 --- a/src/core/guest_memory.h +++ b/src/core/guest_memory.h @@ -44,32 +44,15 @@ public: GuestMemory() = delete; explicit GuestMemory(M& memory, u64 addr, std::size_t size, Common::ScratchBuffer* backup = nullptr) - : m_memory{&memory}, m_addr{addr}, m_size{size} { + : m_memory{memory}, m_addr{addr}, m_size{size} { static_assert(FLAGS & GuestMemoryFlags::Read || FLAGS & GuestMemoryFlags::Write); - if constexpr (!(FLAGS & GuestMemoryFlags::Read)) { - if (!this->TrySetSpan()) { - if (backup) { - backup->resize_destructive(this->size()); - m_data_span = *backup; - m_span_valid = true; - m_is_data_copy = true; - } else { - m_data_copy.resize(this->size()); - m_data_span = std::span(m_data_copy); - m_span_valid = true; - m_is_data_copy = true; - } - } - } else if constexpr (FLAGS & GuestMemoryFlags::Read) { + if constexpr (FLAGS & GuestMemoryFlags::Read) { Read(addr, size, backup); } } ~GuestMemory() = default; - GuestMemory(GuestMemory&& rhs) = default; - GuestMemory& operator=(GuestMemory&& rhs) = default; - T* data() noexcept { return m_data_span.data(); } @@ -126,8 +109,8 @@ public: } if (this->TrySetSpan()) { - if constexpr (FLAGS & GuestMemoryFlags::Safe && M::HAS_FLUSH_INVALIDATION) { - m_memory->FlushRegion(m_addr, this->size_bytes()); + if constexpr (FLAGS & GuestMemoryFlags::Safe) { + m_memory.FlushRegion(m_addr, this->size_bytes()); } } else { if (backup) { @@ -140,9 +123,9 @@ public: m_is_data_copy = true; m_span_valid = true; if constexpr (FLAGS & GuestMemoryFlags::Safe) { - m_memory->ReadBlock(m_addr, this->data(), this->size_bytes()); + m_memory.ReadBlock(m_addr, this->data(), this->size_bytes()); } else { - m_memory->ReadBlockUnsafe(m_addr, this->data(), this->size_bytes()); + m_memory.ReadBlockUnsafe(m_addr, this->data(), this->size_bytes()); } } return m_data_span; @@ -150,19 +133,18 @@ public: void Write(std::span write_data) noexcept { if constexpr (FLAGS & GuestMemoryFlags::Cached) { - m_memory->WriteBlockCached(m_addr, write_data.data(), this->size_bytes()); + m_memory.WriteBlockCached(m_addr, write_data.data(), this->size_bytes()); } else if constexpr (FLAGS & GuestMemoryFlags::Safe) { - m_memory->WriteBlock(m_addr, write_data.data(), this->size_bytes()); + m_memory.WriteBlock(m_addr, write_data.data(), this->size_bytes()); } else { - m_memory->WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes()); + m_memory.WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes()); } } bool TrySetSpan() noexcept { - if (u8* ptr = m_memory->GetSpan(m_addr, this->size_bytes()); ptr) { + if (u8* ptr = m_memory.GetSpan(m_addr, this->size_bytes()); ptr) { m_data_span = {reinterpret_cast(ptr), this->size()}; m_span_valid = true; - m_is_data_copy = false; return true; } return false; @@ -177,7 +159,7 @@ protected: return m_addr_changed; } - M* m_memory; + M& m_memory; u64 m_addr{}; size_t m_size{}; std::span m_data_span{}; @@ -193,7 +175,17 @@ public: GuestMemoryScoped() = delete; explicit GuestMemoryScoped(M& memory, u64 addr, std::size_t 
size, Common::ScratchBuffer* backup = nullptr) - : GuestMemory(memory, addr, size, backup) {} + : GuestMemory(memory, addr, size, backup) { + if constexpr (!(FLAGS & GuestMemoryFlags::Read)) { + if (!this->TrySetSpan()) { + if (backup) { + this->m_data_span = *backup; + this->m_span_valid = true; + this->m_is_data_copy = true; + } + } + } + } ~GuestMemoryScoped() { if constexpr (FLAGS & GuestMemoryFlags::Write) { @@ -204,17 +196,15 @@ public: if (this->AddressChanged() || this->IsDataCopy()) { ASSERT(this->m_span_valid); if constexpr (FLAGS & GuestMemoryFlags::Cached) { - this->m_memory->WriteBlockCached(this->m_addr, this->data(), - this->size_bytes()); + this->m_memory.WriteBlockCached(this->m_addr, this->data(), this->size_bytes()); } else if constexpr (FLAGS & GuestMemoryFlags::Safe) { - this->m_memory->WriteBlock(this->m_addr, this->data(), this->size_bytes()); + this->m_memory.WriteBlock(this->m_addr, this->data(), this->size_bytes()); } else { - this->m_memory->WriteBlockUnsafe(this->m_addr, this->data(), - this->size_bytes()); + this->m_memory.WriteBlockUnsafe(this->m_addr, this->data(), this->size_bytes()); } } else if constexpr ((FLAGS & GuestMemoryFlags::Safe) || (FLAGS & GuestMemoryFlags::Cached)) { - this->m_memory->InvalidateRegion(this->m_addr, this->size_bytes()); + this->m_memory.InvalidateRegion(this->m_addr, this->size_bytes()); } } } diff --git a/src/core/hle/service/nvdrv/core/container.h b/src/core/hle/service/nvdrv/core/container.h index 647514535..c4aa9018f 100755 --- a/src/core/hle/service/nvdrv/core/container.h +++ b/src/core/hle/service/nvdrv/core/container.h @@ -67,7 +67,10 @@ public: const SyncpointManager& GetSyncpointManager() const; struct Host1xDeviceFileData { + std::unordered_map fd_to_id{}; std::deque syncpts_accumulated{}; + u32 nvdec_next_id{}; + u32 vic_next_id{}; }; Host1xDeviceFileData& Host1xDeviceFile(); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp index 0daa97c7f..762abc560 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp @@ -8,7 +8,6 @@ #include "core/hle/service/nvdrv/core/container.h" #include "core/hle/service/nvdrv/devices/ioctl_serialization.h" #include "core/hle/service/nvdrv/devices/nvhost_nvdec.h" -#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace Service::Nvidia::Devices { @@ -22,8 +21,13 @@ NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span in switch (command.group) { case 0x0: switch (command.cmd) { - case 0x1: + case 0x1: { + auto& host1x_file = core.Host1xDeviceFile(); + if (!host1x_file.fd_to_id.contains(fd)) { + host1x_file.fd_to_id[fd] = host1x_file.nvdec_next_id++; + } return WrapFixedVariable(this, &nvhost_nvdec::Submit, input, output, fd); + } case 0x2: return WrapFixed(this, &nvhost_nvdec::GetSyncpoint, input, output); case 0x3: @@ -68,12 +72,15 @@ void nvhost_nvdec::OnOpen(NvCore::SessionId session_id, DeviceFD fd) { LOG_INFO(Service_NVDRV, "NVDEC video stream started"); system.SetNVDECActive(true); sessions[fd] = session_id; - host1x.StartDevice(fd, Tegra::Host1x::ChannelType::NvDec, channel_syncpoint); } void nvhost_nvdec::OnClose(DeviceFD fd) { LOG_INFO(Service_NVDRV, "NVDEC video stream ended"); - host1x.StopDevice(fd, Tegra::Host1x::ChannelType::NvDec); + auto& host1x_file = core.Host1xDeviceFile(); + const auto iter = host1x_file.fd_to_id.find(fd); + if (iter != host1x_file.fd_to_id.end()) { + 
system.GPU().ClearCdmaInstance(iter->second); + } system.SetNVDECActive(false); auto it = sessions.find(fd); if (it != sessions.end()) { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp index da29f332c..38778c714 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp @@ -55,9 +55,8 @@ std::size_t WriteVectors(std::span dst, const std::vector& src, std::size nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system_, NvCore::Container& core_, NvCore::ChannelType channel_type_) - : nvdevice{system_}, host1x{system_.Host1x()}, core{core_}, - syncpoint_manager{core.GetSyncpointManager()}, nvmap{core.GetNvMapFile()}, - channel_type{channel_type_} { + : nvdevice{system_}, core{core_}, syncpoint_manager{core.GetSyncpointManager()}, + nvmap{core.GetNvMapFile()}, channel_type{channel_type_} { auto& syncpts_accumulated = core.Host1xDeviceFile().syncpts_accumulated; if (syncpts_accumulated.empty()) { channel_syncpoint = syncpoint_manager.AllocateSyncpoint(false); @@ -96,24 +95,24 @@ NvResult nvhost_nvdec_common::Submit(IoctlSubmit& params, std::span data, De offset += SliceVectors(data, syncpt_increments, params.syncpoint_count, offset); offset += SliceVectors(data, fence_thresholds, params.fence_count, offset); + auto& gpu = system.GPU(); auto* session = core.GetSession(sessions[fd]); - for (std::size_t i = 0; i < syncpt_increments.size(); i++) { - const SyncptIncr& syncpt_incr = syncpt_increments[i]; - fence_thresholds[i] = - syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments); + if (gpu.UseNvdec()) { + for (std::size_t i = 0; i < syncpt_increments.size(); i++) { + const SyncptIncr& syncpt_incr = syncpt_increments[i]; + fence_thresholds[i] = + syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments); + } } - for (const auto& cmd_buffer : command_buffers) { const auto object = nvmap.GetHandle(cmd_buffer.memory_id); ASSERT_OR_EXECUTE(object, return NvResult::InvalidState;); - Core::Memory::CpuGuestMemory - cmdlist(session->process->GetMemory(), object->address + cmd_buffer.offset, - cmd_buffer.word_count); - host1x.PushEntries(fd, std::move(cmdlist)); + Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count); + session->process->GetMemory().ReadBlock(object->address + cmd_buffer.offset, cmdlist.data(), + cmdlist.size() * sizeof(u32)); + gpu.PushCommandBuffer(core.Host1xDeviceFile().fd_to_id[fd], cmdlist); } - // Some games expect command_buffers to be written back offset = 0; offset += WriteVectors(data, command_buffers, offset); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h index 9e6eefb19..59ed38fa9 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h @@ -119,7 +119,6 @@ protected: Kernel::KEvent* QueryEvent(u32 event_id) override; - Tegra::Host1x::Host1x& host1x; u32 channel_syncpoint; s32_le nvmap_fd{}; u32_le submit_timeout{}; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp index 9d5bb298e..12ffddeb8 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp @@ -7,7 +7,6 @@ #include "core/hle/service/nvdrv/core/container.h" #include "core/hle/service/nvdrv/devices/ioctl_serialization.h" 
#include "core/hle/service/nvdrv/devices/nvhost_vic.h" -#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace Service::Nvidia::Devices { @@ -22,8 +21,13 @@ NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span inpu switch (command.group) { case 0x0: switch (command.cmd) { - case 0x1: + case 0x1: { + auto& host1x_file = core.Host1xDeviceFile(); + if (!host1x_file.fd_to_id.contains(fd)) { + host1x_file.fd_to_id[fd] = host1x_file.vic_next_id++; + } return WrapFixedVariable(this, &nvhost_vic::Submit, input, output, fd); + } case 0x2: return WrapFixed(this, &nvhost_vic::GetSyncpoint, input, output); case 0x3: @@ -66,11 +70,14 @@ NvResult nvhost_vic::Ioctl3(DeviceFD fd, Ioctl command, std::span inpu void nvhost_vic::OnOpen(NvCore::SessionId session_id, DeviceFD fd) { sessions[fd] = session_id; - host1x.StartDevice(fd, Tegra::Host1x::ChannelType::VIC, channel_syncpoint); } void nvhost_vic::OnClose(DeviceFD fd) { - host1x.StopDevice(fd, Tegra::Host1x::ChannelType::VIC); + auto& host1x_file = core.Host1xDeviceFile(); + const auto iter = host1x_file.fd_to_id.find(fd); + if (iter != host1x_file.fd_to_id.end()) { + system.GPU().ClearCdmaInstance(iter->second); + } sessions.erase(fd); } diff --git a/src/core/memory.h b/src/core/memory.h index c8fd99c82..80a93ef90 100755 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -64,8 +64,6 @@ public: Memory(Memory&&) = default; Memory& operator=(Memory&&) = delete; - static constexpr bool HAS_FLUSH_INVALIDATION = false; - /** * Resets the state of the Memory system. */ diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 60e7378be..a61a3d205 100755 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -58,8 +58,8 @@ add_library(video_core STATIC framebuffer_config.h fsr.cpp fsr.h - host1x/codecs/decoder.cpp - host1x/codecs/decoder.h + host1x/codecs/codec.cpp + host1x/codecs/codec.h host1x/codecs/h264.cpp host1x/codecs/h264.h host1x/codecs/vp8.cpp @@ -78,6 +78,8 @@ add_library(video_core STATIC host1x/nvdec.cpp host1x/nvdec.h host1x/nvdec_common.h + host1x/sync_manager.cpp + host1x/sync_manager.h host1x/syncpoint_manager.cpp host1x/syncpoint_manager.h host1x/vic.cpp diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp index 6daca0f0a..2e38410cf 100755 --- a/src/video_core/cdma_pusher.cpp +++ b/src/video_core/cdma_pusher.cpp @@ -2,130 +2,136 @@ // SPDX-License-Identifier: MIT #include - -#include "common/thread.h" -#include "core/core.h" #include "video_core/cdma_pusher.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/host1x/control.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" #include "video_core/host1x/nvdec_common.h" +#include "video_core/host1x/sync_manager.h" #include "video_core/host1x/vic.h" #include "video_core/memory_manager.h" namespace Tegra { - -CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_, s32 id) - : host1x{host1x_}, memory_manager{host1x.GMMU()}, - host_processor{std::make_unique(host1x_)}, current_class{ - static_cast(id)} { - thread = std::jthread([this](std::stop_token stop_token) { ProcessEntries(stop_token); }); -} +CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_) + : host1x{host1x_}, nvdec_processor(std::make_shared(host1x)), + vic_processor(std::make_unique(host1x, nvdec_processor)), + host1x_processor(std::make_unique(host1x)), + sync_manager(std::make_unique(host1x)) {} CDmaPusher::~CDmaPusher() = default; -void CDmaPusher::ProcessEntries(std::stop_token stop_token) { - 
Common::SetCurrentThreadPriority(Common::ThreadPriority::High); - ChCommandHeaderList command_list{host1x.System().ApplicationMemory(), 0, 0}; - u32 count{}; - u32 method_offset{}; - u32 mask{}; - bool incrementing{}; - - while (!stop_token.stop_requested()) { - { - std::unique_lock l{command_mutex}; - Common::CondvarWait(command_cv, l, stop_token, - [this]() { return command_lists.size() > 0; }); - if (stop_token.stop_requested()) { - return; +void CDmaPusher::ProcessEntries(ChCommandHeaderList&& entries) { + for (const auto& value : entries) { + if (mask != 0) { + const auto lbs = static_cast(std::countr_zero(mask)); + mask &= ~(1U << lbs); + ExecuteCommand(offset + lbs, value.raw); + continue; + } else if (count != 0) { + --count; + ExecuteCommand(offset, value.raw); + if (incrementing) { + ++offset; } - - command_list = std::move(command_lists.front()); - command_lists.pop_front(); + continue; } - - size_t i = 0; - for (const auto value : command_list) { - i++; - if (mask != 0) { - const auto lbs = static_cast(std::countr_zero(mask)); - mask &= ~(1U << lbs); - ExecuteCommand(method_offset + lbs, value.raw); - continue; - } else if (count != 0) { - --count; - ExecuteCommand(method_offset, value.raw); - if (incrementing) { - ++method_offset; - } - continue; - } - const auto mode = value.submission_mode.Value(); - switch (mode) { - case ChSubmissionMode::SetClass: { - mask = value.value & 0x3f; - method_offset = value.method_offset; - current_class = static_cast((value.value >> 6) & 0x3ff); - break; - } - case ChSubmissionMode::Incrementing: - case ChSubmissionMode::NonIncrementing: - count = value.value; - method_offset = value.method_offset; - incrementing = mode == ChSubmissionMode::Incrementing; - break; - case ChSubmissionMode::Mask: - mask = value.value; - method_offset = value.method_offset; - break; - case ChSubmissionMode::Immediate: { - const u32 data = value.value & 0xfff; - method_offset = value.method_offset; - ExecuteCommand(method_offset, data); - break; - } - default: - LOG_ERROR(HW_GPU, "Bad command at index {} (bytes 0x{:X}), buffer size {}", i - 1, - (i - 1) * sizeof(u32), command_list.size()); - UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", - static_cast(mode)); - break; - } + const auto mode = value.submission_mode.Value(); + switch (mode) { + case ChSubmissionMode::SetClass: { + mask = value.value & 0x3f; + offset = value.method_offset; + current_class = static_cast((value.value >> 6) & 0x3ff); + break; + } + case ChSubmissionMode::Incrementing: + case ChSubmissionMode::NonIncrementing: + count = value.value; + offset = value.method_offset; + incrementing = mode == ChSubmissionMode::Incrementing; + break; + case ChSubmissionMode::Mask: + mask = value.value; + offset = value.method_offset; + break; + case ChSubmissionMode::Immediate: { + const u32 data = value.value & 0xfff; + offset = value.method_offset; + ExecuteCommand(offset, data); + break; + } + default: + UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast(mode)); + break; } } } -void CDmaPusher::ExecuteCommand(u32 method, u32 arg) { +void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { switch (current_class) { - case ChClassId::Control: - LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}", - static_cast(current_class), method, arg); - host_processor->ProcessMethod(static_cast(method), arg); - break; - default: - thi_regs.reg_array[method] = arg; - switch (static_cast(method)) { + case ChClassId::NvDec: + ThiStateWrite(nvdec_thi_state, offset, data); + 
switch (static_cast(offset)) { case ThiMethod::IncSyncpt: { - const auto syncpoint_id = static_cast(arg & 0xFF); - [[maybe_unused]] const auto cond = static_cast((arg >> 8) & 0xFF); - LOG_TRACE(Service_NVDRV, "Class {} IncSyncpt Method, syncpt {} cond {}", - static_cast(current_class), syncpoint_id, cond); - auto& syncpoint_manager = host1x.GetSyncpointManager(); - syncpoint_manager.IncrementGuest(syncpoint_id); - syncpoint_manager.IncrementHost(syncpoint_id); + LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast(data & 0xFF); + const auto cond = static_cast((data >> 8) & 0xFF); + if (cond == 0) { + sync_manager->Increment(syncpoint_id); + } else { + sync_manager->SignalDone( + sync_manager->IncrementWhenDone(static_cast(current_class), syncpoint_id)); + } break; } case ThiMethod::SetMethod1: - LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}", - static_cast(current_class), static_cast(thi_regs.method_0), arg); - ProcessMethod(thi_regs.method_0, arg); + LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", + static_cast(nvdec_thi_state.method_0)); + nvdec_processor->ProcessMethod(nvdec_thi_state.method_0, data); break; default: break; } + break; + case ChClassId::GraphicsVic: + ThiStateWrite(vic_thi_state, static_cast(state_offset), {data}); + switch (static_cast(state_offset)) { + case ThiMethod::IncSyncpt: { + LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast(data & 0xFF); + const auto cond = static_cast((data >> 8) & 0xFF); + if (cond == 0) { + sync_manager->Increment(syncpoint_id); + } else { + sync_manager->SignalDone( + sync_manager->IncrementWhenDone(static_cast(current_class), syncpoint_id)); + } + break; + } + case ThiMethod::SetMethod1: + LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", + static_cast(vic_thi_state.method_0), data); + vic_processor->ProcessMethod(static_cast(vic_thi_state.method_0), + data); + break; + default: + break; + } + break; + case ChClassId::Control: + // This device is mainly for syncpoint synchronization + LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); + host1x_processor->ProcessMethod(static_cast(offset), data); + break; + default: + UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast(current_class)); + break; } } +void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, u32 argument) { + u8* const offset_ptr = reinterpret_cast(&state) + sizeof(u32) * state_offset; + std::memcpy(offset_ptr, &argument, sizeof(u32)); +} + } // namespace Tegra diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index 553654b8f..2d663cdaa 100755 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h @@ -3,18 +3,12 @@ #pragma once -#include -#include #include -#include -#include #include #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" -#include "common/polyfill_thread.h" -#include "core/memory.h" namespace Tegra { @@ -68,31 +62,23 @@ struct ChCommand { std::vector arguments; }; -using ChCommandHeaderList = - Core::Memory::CpuGuestMemory; +using ChCommandHeaderList = std::vector; struct ThiRegisters { - static constexpr std::size_t NUM_REGS = 0x20; - - union { - struct { - u32_le increment_syncpt; - INSERT_PADDING_WORDS_NOINIT(1); - u32_le increment_syncpt_error; - u32_le ctx_switch_incremement_syncpt; - INSERT_PADDING_WORDS_NOINIT(4); - u32_le ctx_switch; - INSERT_PADDING_WORDS_NOINIT(1); - u32_le ctx_syncpt_eof; - INSERT_PADDING_WORDS_NOINIT(5); - u32_le 
method_0; - u32_le method_1; - INSERT_PADDING_WORDS_NOINIT(12); - u32_le int_status; - u32_le int_mask; - }; - std::array reg_array; - }; + u32_le increment_syncpt{}; + INSERT_PADDING_WORDS(1); + u32_le increment_syncpt_error{}; + u32_le ctx_switch_incremement_syncpt{}; + INSERT_PADDING_WORDS(4); + u32_le ctx_switch{}; + INSERT_PADDING_WORDS(1); + u32_le ctx_syncpt_eof{}; + INSERT_PADDING_WORDS(5); + u32_le method_0{}; + u32_le method_1{}; + INSERT_PADDING_WORDS(12); + u32_le int_status{}; + u32_le int_mask{}; }; enum class ThiMethod : u32 { @@ -103,39 +89,32 @@ enum class ThiMethod : u32 { class CDmaPusher { public: - CDmaPusher() = delete; - virtual ~CDmaPusher(); + explicit CDmaPusher(Host1x::Host1x& host1x); + ~CDmaPusher(); - void PushEntries(ChCommandHeaderList&& entries) { - std::scoped_lock l{command_mutex}; - command_lists.push_back(std::move(entries)); - command_cv.notify_one(); - } - -protected: - explicit CDmaPusher(Host1x::Host1x& host1x, s32 id); - - virtual void ProcessMethod(u32 method, u32 arg) = 0; - - Host1x::Host1x& host1x; - Tegra::MemoryManager& memory_manager; + /// Process the command entry + void ProcessEntries(ChCommandHeaderList&& entries); private: - /// Process the command entry - void ProcessEntries(std::stop_token stop_token); - /// Invoke command class devices to execute the command based on the current state void ExecuteCommand(u32 state_offset, u32 data); - std::unique_ptr host_processor; + /// Write arguments value to the ThiRegisters member at the specified offset + void ThiStateWrite(ThiRegisters& state, u32 offset, u32 argument); - std::mutex command_mutex; - std::condition_variable_any command_cv; - std::deque command_lists; - std::jthread thread; + Host1x::Host1x& host1x; + std::shared_ptr nvdec_processor; + std::unique_ptr vic_processor; + std::unique_ptr host1x_processor; + std::unique_ptr sync_manager; + ChClassId current_class{}; + ThiRegisters vic_thi_state{}; + ThiRegisters nvdec_thi_state{}; - ThiRegisters thi_regs{}; - ChClassId current_class; + u32 count{}; + u32 offset{}; + u32 mask{}; + bool incrementing{}; }; } // namespace Tegra diff --git a/src/video_core/engines/sw_blitter/blitter.cpp b/src/video_core/engines/sw_blitter/blitter.cpp index 4bc079024..8bcc2f7a7 100755 --- a/src/video_core/engines/sw_blitter/blitter.cpp +++ b/src/video_core/engines/sw_blitter/blitter.cpp @@ -111,6 +111,20 @@ void Bilinear(std::span input, std::span output, size_t src_widt } } +template +void ProcessPitchLinear(std::span input, std::span output, size_t extent_x, + size_t extent_y, u32 pitch, u32 x0, u32 y0, size_t bpp) { + const size_t base_offset = x0 * bpp; + const size_t copy_size = extent_x * bpp; + for (size_t y = 0; y < extent_y; y++) { + const size_t first_offset = (y + y0) * pitch + base_offset; + const size_t second_offset = y * extent_x * bpp; + u8* write_to = unpack ? &output[first_offset] : &output[second_offset]; + const u8* read_from = unpack ? 
&input[second_offset] : &input[first_offset]; + std::memcpy(write_to, read_from, copy_size); + } +} + } // namespace struct SoftwareBlitEngine::BlitEngineImpl { @@ -138,19 +152,6 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, } return static_cast(surface.pitch * surface.height); }; - const auto process_pitch_linear = [](bool unpack, std::span input, - std::span output, u32 extent_x, u32 extent_y, - u32 pitch, u32 x0, u32 y0, size_t bpp) { - const size_t base_offset = x0 * bpp; - const size_t copy_size = extent_x * bpp; - for (u32 y = y0; y < extent_y; y++) { - const size_t first_offset = y * pitch + base_offset; - const size_t second_offset = y * extent_x * bpp; - u8* write_to = unpack ? &output[first_offset] : &output[second_offset]; - const u8* read_from = unpack ? &input[second_offset] : &input[first_offset]; - std::memcpy(write_to, read_from, copy_size); - } - }; const u32 src_extent_x = config.src_x1 - config.src_x0; const u32 src_extent_y = config.src_y1 - config.src_y0; @@ -205,8 +206,8 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, src.depth, config.src_x0, config.src_y0, src_extent_x, src_extent_y, src.block_height, src.block_depth, src_extent_x * src_bytes_per_pixel); } else { - process_pitch_linear(false, tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y, - src.pitch, config.src_x0, config.src_y0, src_bytes_per_pixel); + ProcessPitchLinear(tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y, + src.pitch, config.src_x0, config.src_y0, src_bytes_per_pixel); } // Conversion Phase @@ -229,9 +230,9 @@ bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, dst.depth, config.dst_x0, config.dst_y0, dst_extent_x, dst_extent_y, dst.block_height, dst.block_depth, dst_extent_x * dst_bytes_per_pixel); } else { - process_pitch_linear(true, impl->dst_buffer, tmp_buffer2, dst_extent_x, dst_extent_y, - dst.pitch, config.dst_x0, config.dst_y0, - static_cast(dst_bytes_per_pixel)); + ProcessPitchLinear(impl->dst_buffer, tmp_buffer2, dst_extent_x, dst_extent_y, + dst.pitch, config.dst_x0, config.dst_y0, + static_cast(dst_bytes_per_pixel)); } return true; } diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index a45a086f1..007b280fa 100755 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -250,6 +250,30 @@ struct GPU::Impl { gpu_thread.SubmitList(channel, std::move(entries)); } + /// Push GPU command buffer entries to be processed + void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + + if (!cdma_pushers.contains(id)) { + cdma_pushers.insert_or_assign(id, std::make_unique(host1x)); + } + + // SubmitCommandBuffer would make the nvdec operations async, this is not currently working + // TODO(ameerj): RE proper async nvdec operation + // gpu_thread.SubmitCommandBuffer(std::move(entries)); + cdma_pushers[id]->ProcessEntries(std::move(entries)); + } + + /// Frees the CDMAPusher instance to free up resources + void ClearCdmaInstance(u32 id) { + const auto iter = cdma_pushers.find(id); + if (iter != cdma_pushers.end()) { + cdma_pushers.erase(iter); + } + } + /// Swap buffers (render frame) void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { gpu_thread.SwapBuffers(framebuffer); @@ -332,6 +356,7 @@ struct GPU::Impl { Core::System& system; Host1x::Host1x& host1x; + std::map> cdma_pushers; std::unique_ptr renderer; VideoCore::RasterizerInterface* rasterizer = nullptr; const bool use_nvdec; @@ -521,6 +546,14 @@ void 
GPU::PushGPUEntries(s32 channel, Tegra::CommandList&& entries) { impl->PushGPUEntries(channel, std::move(entries)); } +void GPU::PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) { + impl->PushCommandBuffer(id, entries); +} + +void GPU::ClearCdmaInstance(u32 id) { + impl->ClearCdmaInstance(id); +} + void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { impl->SwapBuffers(framebuffer); } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 15b40d80a..6f8851d9f 100755 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -232,6 +232,12 @@ public: /// Push GPU command entries to be processed void PushGPUEntries(s32 channel, Tegra::CommandList&& entries); + /// Push GPU command buffer entries to be processed + void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries); + + /// Frees the CDMAPusher instance to free up resources + void ClearCdmaInstance(u32 id); + /// Swap buffers (render frame) void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 5f91c7a42..0ba5b0e35 100755 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -12,7 +12,6 @@ #include "video_core/dma_pusher.h" #include "video_core/gpu.h" #include "video_core/gpu_thread.h" -#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace VideoCommon::GPUThread { diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp index 64e835846..77eda05c2 100755 --- a/src/video_core/host1x/codecs/h264.cpp +++ b/src/video_core/host1x/codecs/h264.cpp @@ -10,7 +10,7 @@ #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoders { +namespace Tegra::Decoder { namespace { // ZigZag LUTs from libavcodec. 
constexpr std::array zig_zag_direct{ @@ -25,55 +25,23 @@ constexpr std::array zig_zag_scan{ }; } // Anonymous namespace -H264::H264(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_) - : Decoder{host1x_, id_, regs_} { - codec = Host1x::NvdecCommon::VideoCodec::H264; - initialized = decode_api.Initialize(codec); -} +H264::H264(Host1x::Host1x& host1x_) : host1x{host1x_} {} H264::~H264() = default; -std::tuple H264::GetProgressiveOffsets() { - auto pic_idx{current_context.h264_parameter_set.curr_pic_idx}; - auto luma{regs.surface_luma_offsets[pic_idx].Address() + - current_context.h264_parameter_set.luma_frame_offset.Address()}; - auto chroma{regs.surface_chroma_offsets[pic_idx].Address() + - current_context.h264_parameter_set.chroma_frame_offset.Address()}; - return {luma, chroma}; -} +std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, + size_t* out_configuration_size, bool is_first_frame) { + H264DecoderContext context; + host1x.GMMU().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); -std::tuple H264::GetInterlacedOffsets() { - auto pic_idx{current_context.h264_parameter_set.curr_pic_idx}; - auto luma_top{regs.surface_luma_offsets[pic_idx].Address() + - current_context.h264_parameter_set.luma_top_offset.Address()}; - auto luma_bottom{regs.surface_luma_offsets[pic_idx].Address() + - current_context.h264_parameter_set.luma_bot_offset.Address()}; - auto chroma_top{regs.surface_chroma_offsets[pic_idx].Address() + - current_context.h264_parameter_set.chroma_top_offset.Address()}; - auto chroma_bottom{regs.surface_chroma_offsets[pic_idx].Address() + - current_context.h264_parameter_set.chroma_bot_offset.Address()}; - return {luma_top, luma_bottom, chroma_top, chroma_bottom}; -} - -bool H264::IsInterlaced() { - return current_context.h264_parameter_set.luma_top_offset.Address() != 0 || - current_context.h264_parameter_set.luma_bot_offset.Address() != 0; -} - -std::span H264::ComposeFrame() { - memory_manager.ReadBlock(regs.picture_info_offset.Address(), ¤t_context, - sizeof(H264DecoderContext)); - - const s64 frame_number = current_context.h264_parameter_set.frame_number.Value(); + const s64 frame_number = context.h264_parameter_set.frame_number.Value(); if (!is_first_frame && frame_number != 0) { - frame_scratch.resize_destructive(current_context.stream_len); - memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), frame_scratch.data(), - frame_scratch.size()); - return frame_scratch; + frame.resize_destructive(context.stream_len); + host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); + *out_configuration_size = 0; + return frame; } - is_first_frame = false; - // Encode header H264BitWriter writer{}; writer.WriteU(1, 24); @@ -85,7 +53,7 @@ std::span H264::ComposeFrame() { writer.WriteU(31, 8); writer.WriteUe(0); const u32 chroma_format_idc = - static_cast(current_context.h264_parameter_set.chroma_format_idc.Value()); + static_cast(context.h264_parameter_set.chroma_format_idc.Value()); writer.WriteUe(chroma_format_idc); if (chroma_format_idc == 3) { writer.WriteBit(false); @@ -93,44 +61,42 @@ std::span H264::ComposeFrame() { writer.WriteUe(0); writer.WriteUe(0); - writer.WriteBit(current_context.qpprime_y_zero_transform_bypass_flag.Value() != 0); + writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag writer.WriteBit(false); // Scaling matrix present flag - writer.WriteUe( - static_cast(current_context.h264_parameter_set.log2_max_frame_num_minus4.Value())); + 
writer.WriteUe(static_cast(context.h264_parameter_set.log2_max_frame_num_minus4.Value())); const auto order_cnt_type = - static_cast(current_context.h264_parameter_set.pic_order_cnt_type.Value()); + static_cast(context.h264_parameter_set.pic_order_cnt_type.Value()); writer.WriteUe(order_cnt_type); if (order_cnt_type == 0) { - writer.WriteUe(current_context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4); + writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4); } else if (order_cnt_type == 1) { - writer.WriteBit(current_context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); + writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); writer.WriteSe(0); writer.WriteSe(0); writer.WriteUe(0); } - const s32 pic_height = current_context.h264_parameter_set.frame_height_in_mbs / - (current_context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); + const s32 pic_height = context.h264_parameter_set.frame_height_in_map_units / + (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); - u32 max_num_ref_frames = - std::max(std::max(current_context.h264_parameter_set.num_refidx_l0_default_active, - current_context.h264_parameter_set.num_refidx_l1_default_active) + - 1, - 4); + // TODO (ameerj): Where do we get this number, it seems to be particular for each stream + const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue(); + const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::Gpu; + const u32 max_num_ref_frames = uses_gpu_decoding ? 6u : 16u; writer.WriteUe(max_num_ref_frames); writer.WriteBit(false); - writer.WriteUe(current_context.h264_parameter_set.pic_width_in_mbs - 1); + writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); writer.WriteUe(pic_height - 1); - writer.WriteBit(current_context.h264_parameter_set.frame_mbs_only_flag != 0); + writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); - if (!current_context.h264_parameter_set.frame_mbs_only_flag) { - writer.WriteBit(current_context.h264_parameter_set.flags.mbaff_frame.Value() != 0); + if (!context.h264_parameter_set.frame_mbs_only_flag) { + writer.WriteBit(context.h264_parameter_set.flags.mbaff_frame.Value() != 0); } - writer.WriteBit(current_context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0); + writer.WriteBit(context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0); writer.WriteBit(false); // Frame cropping flag writer.WriteBit(false); // VUI parameter present flag @@ -145,59 +111,57 @@ std::span H264::ComposeFrame() { writer.WriteUe(0); writer.WriteUe(0); - writer.WriteBit(current_context.h264_parameter_set.entropy_coding_mode_flag != 0); - writer.WriteBit(current_context.h264_parameter_set.pic_order_present_flag != 0); + writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); + writer.WriteBit(context.h264_parameter_set.pic_order_present_flag != 0); writer.WriteUe(0); - writer.WriteUe(current_context.h264_parameter_set.num_refidx_l0_default_active); - writer.WriteUe(current_context.h264_parameter_set.num_refidx_l1_default_active); - writer.WriteBit(current_context.h264_parameter_set.flags.weighted_pred.Value() != 0); - writer.WriteU(static_cast(current_context.h264_parameter_set.weighted_bipred_idc.Value()), - 2); - s32 pic_init_qp = - static_cast(current_context.h264_parameter_set.pic_init_qp_minus26.Value()); + writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); + writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); + 
writer.WriteBit(context.h264_parameter_set.flags.weighted_pred.Value() != 0); + writer.WriteU(static_cast(context.h264_parameter_set.weighted_bipred_idc.Value()), 2); + s32 pic_init_qp = static_cast(context.h264_parameter_set.pic_init_qp_minus26.Value()); writer.WriteSe(pic_init_qp); writer.WriteSe(0); s32 chroma_qp_index_offset = - static_cast(current_context.h264_parameter_set.chroma_qp_index_offset.Value()); + static_cast(context.h264_parameter_set.chroma_qp_index_offset.Value()); writer.WriteSe(chroma_qp_index_offset); - writer.WriteBit(current_context.h264_parameter_set.deblocking_filter_control_present_flag != 0); - writer.WriteBit(current_context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0); - writer.WriteBit(current_context.h264_parameter_set.redundant_pic_cnt_present_flag != 0); - writer.WriteBit(current_context.h264_parameter_set.transform_8x8_mode_flag != 0); + writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_present_flag != 0); + writer.WriteBit(context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0); + writer.WriteBit(context.h264_parameter_set.redundant_pic_cnt_present_flag != 0); + writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); writer.WriteBit(true); // pic_scaling_matrix_present_flag for (s32 index = 0; index < 6; index++) { writer.WriteBit(true); - std::span matrix{current_context.weight_scale_4x4}; - writer.WriteScalingList(scan_scratch, matrix, index * 16, 16); + std::span matrix{context.weight_scale}; + writer.WriteScalingList(scan, matrix, index * 16, 16); } - if (current_context.h264_parameter_set.transform_8x8_mode_flag) { + if (context.h264_parameter_set.transform_8x8_mode_flag) { for (s32 index = 0; index < 2; index++) { writer.WriteBit(true); - std::span matrix{current_context.weight_scale_8x8}; - writer.WriteScalingList(scan_scratch, matrix, index * 64, 64); + std::span matrix{context.weight_scale_8x8}; + writer.WriteScalingList(scan, matrix, index * 64, 64); } } s32 chroma_qp_index_offset2 = - static_cast(current_context.h264_parameter_set.second_chroma_qp_index_offset.Value()); + static_cast(context.h264_parameter_set.second_chroma_qp_index_offset.Value()); writer.WriteSe(chroma_qp_index_offset2); writer.End(); const auto& encoded_header = writer.GetByteArray(); - frame_scratch.resize(encoded_header.size() + current_context.stream_len); - std::memcpy(frame_scratch.data(), encoded_header.data(), encoded_header.size()); + frame.resize(encoded_header.size() + context.stream_len); + std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); - memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), - frame_scratch.data() + encoded_header.size(), - current_context.stream_len); + *out_configuration_size = encoded_header.size(); + host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data() + encoded_header.size(), + context.stream_len); - return frame_scratch; + return frame; } H264BitWriter::H264BitWriter() = default; @@ -314,4 +278,4 @@ void H264BitWriter::Flush() { buffer = 0; buffer_pos = 0; } -} // namespace Tegra::Decoders +} // namespace Tegra::Decoder diff --git a/src/video_core/host1x/codecs/h264.h b/src/video_core/host1x/codecs/h264.h index 09f9080f3..80dbb4e4b 100755 --- a/src/video_core/host1x/codecs/h264.h +++ b/src/video_core/host1x/codecs/h264.h @@ -10,7 +10,6 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/scratch_buffer.h" -#include "video_core/host1x/codecs/decoder.h" #include 
"video_core/host1x/nvdec_common.h" namespace Tegra { @@ -19,7 +18,7 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoders { +namespace Decoder { class H264BitWriter { public: @@ -61,212 +60,123 @@ private: std::vector byte_array; }; -struct Offset { - constexpr u32 Address() const noexcept { - return offset << 8; - } +class H264 { +public: + explicit H264(Host1x::Host1x& host1x); + ~H264(); + + /// Compose the H264 frame for FFmpeg decoding + [[nodiscard]] std::span ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, + size_t* out_configuration_size, + bool is_first_frame = false); private: - u32 offset; -}; -static_assert(std::is_trivial_v, "Offset must be trivial"); -static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!"); + Common::ScratchBuffer frame; + Common::ScratchBuffer scan; + Host1x::Host1x& host1x; -struct H264ParameterSet { - s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00 - s32 delta_pic_order_always_zero_flag; ///< 0x04 - s32 frame_mbs_only_flag; ///< 0x08 - u32 pic_width_in_mbs; ///< 0x0C - u32 frame_height_in_mbs; ///< 0x10 - union { ///< 0x14 - BitField<0, 2, u32> tile_format; - BitField<2, 3, u32> gob_height; - BitField<5, 27, u32> reserved_surface_format; + struct H264ParameterSet { + s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00 + s32 delta_pic_order_always_zero_flag; ///< 0x04 + s32 frame_mbs_only_flag; ///< 0x08 + u32 pic_width_in_mbs; ///< 0x0C + u32 frame_height_in_map_units; ///< 0x10 + union { ///< 0x14 + BitField<0, 2, u32> tile_format; + BitField<2, 3, u32> gob_height; + }; + u32 entropy_coding_mode_flag; ///< 0x18 + s32 pic_order_present_flag; ///< 0x1C + s32 num_refidx_l0_default_active; ///< 0x20 + s32 num_refidx_l1_default_active; ///< 0x24 + s32 deblocking_filter_control_present_flag; ///< 0x28 + s32 redundant_pic_cnt_present_flag; ///< 0x2C + u32 transform_8x8_mode_flag; ///< 0x30 + u32 pitch_luma; ///< 0x34 + u32 pitch_chroma; ///< 0x38 + u32 luma_top_offset; ///< 0x3C + u32 luma_bot_offset; ///< 0x40 + u32 luma_frame_offset; ///< 0x44 + u32 chroma_top_offset; ///< 0x48 + u32 chroma_bot_offset; ///< 0x4C + u32 chroma_frame_offset; ///< 0x50 + u32 hist_buffer_size; ///< 0x54 + union { ///< 0x58 + union { + BitField<0, 1, u64> mbaff_frame; + BitField<1, 1, u64> direct_8x8_inference; + BitField<2, 1, u64> weighted_pred; + BitField<3, 1, u64> constrained_intra_pred; + BitField<4, 1, u64> ref_pic; + BitField<5, 1, u64> field_pic; + BitField<6, 1, u64> bottom_field; + BitField<7, 1, u64> second_field; + } flags; + BitField<8, 4, u64> log2_max_frame_num_minus4; + BitField<12, 2, u64> chroma_format_idc; + BitField<14, 2, u64> pic_order_cnt_type; + BitField<16, 6, s64> pic_init_qp_minus26; + BitField<22, 5, s64> chroma_qp_index_offset; + BitField<27, 5, s64> second_chroma_qp_index_offset; + BitField<32, 2, u64> weighted_bipred_idc; + BitField<34, 7, u64> curr_pic_idx; + BitField<41, 5, u64> curr_col_idx; + BitField<46, 16, u64> frame_number; + BitField<62, 1, u64> frame_surfaces; + BitField<63, 1, u64> output_memory_layout; + }; }; - u32 entropy_coding_mode_flag; ///< 0x18 - s32 pic_order_present_flag; ///< 0x1C - s32 num_refidx_l0_default_active; ///< 0x20 - s32 num_refidx_l1_default_active; ///< 0x24 - s32 deblocking_filter_control_present_flag; ///< 0x28 - s32 redundant_pic_cnt_present_flag; ///< 0x2C - u32 transform_8x8_mode_flag; ///< 0x30 - u32 pitch_luma; ///< 0x34 - u32 pitch_chroma; ///< 0x38 - Offset luma_top_offset; ///< 0x3C - Offset luma_bot_offset; ///< 0x40 - Offset luma_frame_offset; ///< 0x44 - 
Offset chroma_top_offset; ///< 0x48 - Offset chroma_bot_offset; ///< 0x4C - Offset chroma_frame_offset; ///< 0x50 - u32 hist_buffer_size; ///< 0x54 - union { ///< 0x58 - union { - BitField<0, 1, u64> mbaff_frame; - BitField<1, 1, u64> direct_8x8_inference; - BitField<2, 1, u64> weighted_pred; - BitField<3, 1, u64> constrained_intra_pred; - BitField<4, 1, u64> ref_pic; - BitField<5, 1, u64> field_pic; - BitField<6, 1, u64> bottom_field; - BitField<7, 1, u64> second_field; - } flags; - BitField<8, 4, u64> log2_max_frame_num_minus4; - BitField<12, 2, u64> chroma_format_idc; - BitField<14, 2, u64> pic_order_cnt_type; - BitField<16, 6, s64> pic_init_qp_minus26; - BitField<22, 5, s64> chroma_qp_index_offset; - BitField<27, 5, s64> second_chroma_qp_index_offset; - BitField<32, 2, u64> weighted_bipred_idc; - BitField<34, 7, u64> curr_pic_idx; - BitField<41, 5, u64> curr_col_idx; - BitField<46, 16, u64> frame_number; - BitField<62, 1, u64> frame_surfaces; - BitField<63, 1, u64> output_memory_layout; + static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size"); + + struct H264DecoderContext { + INSERT_PADDING_WORDS_NOINIT(18); ///< 0x0000 + u32 stream_len; ///< 0x0048 + INSERT_PADDING_WORDS_NOINIT(3); ///< 0x004C + H264ParameterSet h264_parameter_set; ///< 0x0058 + INSERT_PADDING_WORDS_NOINIT(66); ///< 0x00B8 + std::array weight_scale; ///< 0x01C0 + std::array weight_scale_8x8; ///< 0x0220 }; -}; -static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size"); + static_assert(sizeof(H264DecoderContext) == 0x2A0, "H264DecoderContext is an invalid size"); #define ASSERT_POSITION(field_name, position) \ static_assert(offsetof(H264ParameterSet, field_name) == position, \ "Field " #field_name " has invalid position") -ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00); -ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04); -ASSERT_POSITION(frame_mbs_only_flag, 0x08); -ASSERT_POSITION(pic_width_in_mbs, 0x0C); -ASSERT_POSITION(frame_height_in_mbs, 0x10); -ASSERT_POSITION(tile_format, 0x14); -ASSERT_POSITION(entropy_coding_mode_flag, 0x18); -ASSERT_POSITION(pic_order_present_flag, 0x1C); -ASSERT_POSITION(num_refidx_l0_default_active, 0x20); -ASSERT_POSITION(num_refidx_l1_default_active, 0x24); -ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28); -ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C); -ASSERT_POSITION(transform_8x8_mode_flag, 0x30); -ASSERT_POSITION(pitch_luma, 0x34); -ASSERT_POSITION(pitch_chroma, 0x38); -ASSERT_POSITION(luma_top_offset, 0x3C); -ASSERT_POSITION(luma_bot_offset, 0x40); -ASSERT_POSITION(luma_frame_offset, 0x44); -ASSERT_POSITION(chroma_top_offset, 0x48); -ASSERT_POSITION(chroma_bot_offset, 0x4C); -ASSERT_POSITION(chroma_frame_offset, 0x50); -ASSERT_POSITION(hist_buffer_size, 0x54); -ASSERT_POSITION(flags, 0x58); + ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00); + ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04); + ASSERT_POSITION(frame_mbs_only_flag, 0x08); + ASSERT_POSITION(pic_width_in_mbs, 0x0C); + ASSERT_POSITION(frame_height_in_map_units, 0x10); + ASSERT_POSITION(tile_format, 0x14); + ASSERT_POSITION(entropy_coding_mode_flag, 0x18); + ASSERT_POSITION(pic_order_present_flag, 0x1C); + ASSERT_POSITION(num_refidx_l0_default_active, 0x20); + ASSERT_POSITION(num_refidx_l1_default_active, 0x24); + ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28); + ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C); + ASSERT_POSITION(transform_8x8_mode_flag, 0x30); + 
ASSERT_POSITION(pitch_luma, 0x34); + ASSERT_POSITION(pitch_chroma, 0x38); + ASSERT_POSITION(luma_top_offset, 0x3C); + ASSERT_POSITION(luma_bot_offset, 0x40); + ASSERT_POSITION(luma_frame_offset, 0x44); + ASSERT_POSITION(chroma_top_offset, 0x48); + ASSERT_POSITION(chroma_bot_offset, 0x4C); + ASSERT_POSITION(chroma_frame_offset, 0x50); + ASSERT_POSITION(hist_buffer_size, 0x54); + ASSERT_POSITION(flags, 0x58); #undef ASSERT_POSITION -struct DpbEntry { - union { - BitField<0, 7, u32> index; - BitField<7, 5, u32> col_idx; - BitField<12, 2, u32> state; - BitField<14, 1, u32> is_long_term; - BitField<15, 1, u32> non_existing; - BitField<16, 1, u32> is_field; - BitField<17, 4, u32> top_field_marking; - BitField<21, 4, u32> bottom_field_marking; - BitField<25, 1, u32> output_memory_layout; - BitField<26, 6, u32> reserved; - } flags; - std::array field_order_cnt; - u32 frame_idx; -}; -static_assert(sizeof(DpbEntry) == 0x10, "DpbEntry has the wrong size!"); - -struct DisplayParam { - union { - BitField<0, 1, u32> enable_tf_output; - BitField<1, 1, u32> vc1_map_y_flag; - BitField<2, 3, u32> map_y_value; - BitField<5, 1, u32> vc1_map_uv_flag; - BitField<6, 3, u32> map_uv_value; - BitField<9, 8, u32> out_stride; - BitField<17, 3, u32> tiling_format; - BitField<20, 1, u32> output_structure; // 0=frame, 1=field - BitField<21, 11, u32> reserved0; - }; - std::array output_top; - std::array output_bottom; - union { - BitField<0, 1, u32> enable_histogram; - BitField<1, 12, u32> histogram_start_x; - BitField<13, 12, u32> histogram_start_y; - BitField<25, 7, u32> reserved1; - }; - union { - BitField<0, 12, u32> histogram_end_x; - BitField<12, 12, u32> histogram_end_y; - BitField<24, 8, u32> reserved2; - }; -}; -static_assert(sizeof(DisplayParam) == 0x1C, "DisplayParam has the wrong size!"); - -struct H264DecoderContext { - INSERT_PADDING_WORDS_NOINIT(13); ///< 0x0000 - std::array eos; ///< 0x0034 - u8 explicit_eos_present_flag; ///< 0x0044 - u8 hint_dump_en; ///< 0x0045 - INSERT_PADDING_BYTES_NOINIT(2); ///< 0x0046 - u32 stream_len; ///< 0x0048 - u32 slice_count; ///< 0x004C - u32 mbhist_buffer_size; ///< 0x0050 - u32 gptimer_timeout_value; ///< 0x0054 - H264ParameterSet h264_parameter_set; ///< 0x0058 - std::array curr_field_order_cnt; ///< 0x00B8 - std::array dpb; ///< 0x00C0 - std::array weight_scale_4x4; ///< 0x01C0 - std::array weight_scale_8x8; ///< 0x0220 - std::array num_inter_view_refs_lX; ///< 0x02A0 - std::array reserved2; ///< 0x02A2 - std::array, 2> inter_view_refidx_lX; ///< 0x02B0 - union { ///< 0x02D0 - BitField<0, 1, u32> lossless_ipred8x8_filter_enable; - BitField<1, 1, u32> qpprime_y_zero_transform_bypass_flag; - BitField<2, 30, u32> reserved3; - }; - DisplayParam display_param; ///< 0x02D4 - std::array reserved4; ///< 0x02F0 -}; -static_assert(sizeof(H264DecoderContext) == 0x2FC, "H264DecoderContext is an invalid size"); - #define ASSERT_POSITION(field_name, position) \ static_assert(offsetof(H264DecoderContext, field_name) == position, \ "Field " #field_name " has invalid position") -ASSERT_POSITION(stream_len, 0x48); -ASSERT_POSITION(h264_parameter_set, 0x58); -ASSERT_POSITION(dpb, 0xC0); -ASSERT_POSITION(weight_scale_4x4, 0x1C0); + ASSERT_POSITION(stream_len, 0x48); + ASSERT_POSITION(h264_parameter_set, 0x58); + ASSERT_POSITION(weight_scale, 0x1C0); #undef ASSERT_POSITION - -class H264 final : public Decoder { -public: - explicit H264(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id); - ~H264() override; - - H264(const H264&) = delete; - H264& 
operator=(const H264&) = delete; - - H264(H264&&) = delete; - H264& operator=(H264&&) = delete; - - /// Compose the H264 frame for FFmpeg decoding - [[nodiscard]] std::span ComposeFrame() override; - - std::tuple GetProgressiveOffsets() override; - std::tuple GetInterlacedOffsets() override; - bool IsInterlaced() override; - - std::string_view GetCurrentCodecName() const override { - return "H264"; - } - -private: - bool is_first_frame{true}; - Common::ScratchBuffer frame_scratch; - Common::ScratchBuffer scan_scratch; - H264DecoderContext current_context{}; }; -} // namespace Decoders +} // namespace Decoder } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp8.cpp b/src/video_core/host1x/codecs/vp8.cpp index ec5a53fbb..e8b9bf6bf 100755 --- a/src/video_core/host1x/codecs/vp8.cpp +++ b/src/video_core/host1x/codecs/vp8.cpp @@ -7,69 +7,47 @@ #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoders { -VP8::VP8(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_) - : Decoder{host1x_, id_, regs_} { - codec = Host1x::NvdecCommon::VideoCodec::VP8; - initialized = decode_api.Initialize(codec); -} +namespace Tegra::Decoder { +VP8::VP8(Host1x::Host1x& host1x_) : host1x{host1x_} {} VP8::~VP8() = default; -std::tuple VP8::GetProgressiveOffsets() { - auto luma{regs.surface_luma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; - auto chroma{regs.surface_chroma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; - return {luma, chroma}; -} +std::span VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { + VP8PictureInfo info; + host1x.GMMU().ReadBlock(state.picture_info_offset, &info, sizeof(VP8PictureInfo)); -std::tuple VP8::GetInterlacedOffsets() { - auto luma_top{regs.surface_luma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; - auto luma_bottom{ - regs.surface_luma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; - auto chroma_top{ - regs.surface_chroma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; - auto chroma_bottom{ - regs.surface_chroma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; - return {luma_top, luma_bottom, chroma_top, chroma_bottom}; -} - -std::span VP8::ComposeFrame() { - memory_manager.ReadBlock(regs.picture_info_offset.Address(), ¤t_context, - sizeof(VP8PictureInfo)); - - const bool is_key_frame = current_context.key_frame == 1u; - const auto bitstream_size = static_cast(current_context.vld_buffer_size); + const bool is_key_frame = info.key_frame == 1u; + const auto bitstream_size = static_cast(info.vld_buffer_size); const size_t header_size = is_key_frame ? 10u : 3u; - frame_scratch.resize(header_size + bitstream_size); + frame.resize(header_size + bitstream_size); // Based on page 30 of the VP8 specification. // https://datatracker.ietf.org/doc/rfc6386/ - frame_scratch[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes). - frame_scratch[0] |= - static_cast((current_context.version & 7u) << 1u); // 3-bit version number - frame_scratch[0] |= static_cast(1u << 4u); // 1-bit show_frame flag + frame[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes). 
+ frame[0] |= static_cast((info.version & 7u) << 1u); // 3-bit version number + frame[0] |= static_cast(1u << 4u); // 1-bit show_frame flag // The next 19-bits are the first partition size - frame_scratch[0] |= static_cast((current_context.first_part_size & 7u) << 5u); - frame_scratch[1] = static_cast((current_context.first_part_size & 0x7f8u) >> 3u); - frame_scratch[2] = static_cast((current_context.first_part_size & 0x7f800u) >> 11u); + frame[0] |= static_cast((info.first_part_size & 7u) << 5u); + frame[1] = static_cast((info.first_part_size & 0x7f8u) >> 3u); + frame[2] = static_cast((info.first_part_size & 0x7f800u) >> 11u); if (is_key_frame) { - frame_scratch[3] = 0x9du; - frame_scratch[4] = 0x01u; - frame_scratch[5] = 0x2au; + frame[3] = 0x9du; + frame[4] = 0x01u; + frame[5] = 0x2au; // TODO(ameerj): Horizontal/Vertical Scale // 16 bits: (2 bits Horizontal Scale << 14) | Width (14 bits) - frame_scratch[6] = static_cast(current_context.frame_width & 0xff); - frame_scratch[7] = static_cast(((current_context.frame_width >> 8) & 0x3f)); + frame[6] = static_cast(info.frame_width & 0xff); + frame[7] = static_cast(((info.frame_width >> 8) & 0x3f)); // 16 bits:(2 bits Vertical Scale << 14) | Height (14 bits) - frame_scratch[8] = static_cast(current_context.frame_height & 0xff); - frame_scratch[9] = static_cast(((current_context.frame_height >> 8) & 0x3f)); + frame[8] = static_cast(info.frame_height & 0xff); + frame[9] = static_cast(((info.frame_height >> 8) & 0x3f)); } - const u64 bitstream_offset = regs.frame_bitstream_offset.Address(); - memory_manager.ReadBlock(bitstream_offset, frame_scratch.data() + header_size, bitstream_size); + const u64 bitstream_offset = state.frame_bitstream_offset; + host1x.GMMU().ReadBlock(bitstream_offset, frame.data() + header_size, bitstream_size); - return frame_scratch; + return frame; } -} // namespace Tegra::Decoders +} // namespace Tegra::Decoder diff --git a/src/video_core/host1x/codecs/vp8.h b/src/video_core/host1x/codecs/vp8.h index 5c5f51fb8..36a7e08e0 100755 --- a/src/video_core/host1x/codecs/vp8.h +++ b/src/video_core/host1x/codecs/vp8.h @@ -9,7 +9,6 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/scratch_buffer.h" -#include "video_core/host1x/codecs/decoder.h" #include "video_core/host1x/nvdec_common.h" namespace Tegra { @@ -18,40 +17,20 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoders { -enum class Vp8SurfaceIndex : u32 { - Last = 0, - Golden = 1, - AltRef = 2, - Current = 3, -}; +namespace Decoder { -class VP8 final : public Decoder { +class VP8 { public: - explicit VP8(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id); - ~VP8() override; + explicit VP8(Host1x::Host1x& host1x); + ~VP8(); - VP8(const VP8&) = delete; - VP8& operator=(const VP8&) = delete; - - VP8(VP8&&) = delete; - VP8& operator=(VP8&&) = delete; - - [[nodiscard]] std::span ComposeFrame() override; - - std::tuple GetProgressiveOffsets() override; - std::tuple GetInterlacedOffsets() override; - - bool IsInterlaced() override { - return false; - } - - std::string_view GetCurrentCodecName() const override { - return "VP8"; - } + /// Compose the VP8 frame for FFmpeg decoding + [[nodiscard]] std::span ComposeFrame( + const Host1x::NvdecCommon::NvdecRegisters& state); private: - Common::ScratchBuffer frame_scratch; + Common::ScratchBuffer frame; + Host1x::Host1x& host1x; struct VP8PictureInfo { INSERT_PADDING_WORDS_NOINIT(14); @@ -94,9 +73,7 @@ private: INSERT_PADDING_WORDS_NOINIT(3); }; 
static_assert(sizeof(VP8PictureInfo) == 0xc0, "PictureInfo is an invalid size"); - - VP8PictureInfo current_context{}; }; -} // namespace Decoders +} // namespace Decoder } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp9.cpp b/src/video_core/host1x/codecs/vp9.cpp index b7228115e..9701b867b 100755 --- a/src/video_core/host1x/codecs/vp9.cpp +++ b/src/video_core/host1x/codecs/vp9.cpp @@ -4,13 +4,12 @@ #include // for std::copy #include -#include "common/alignment.h" #include "common/assert.h" #include "video_core/host1x/codecs/vp9.h" #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoders { +namespace Tegra::Decoder { namespace { constexpr u32 diff_update_probability = 252; constexpr u32 frame_sync_code = 0x498342; @@ -238,11 +237,7 @@ constexpr std::array map_lut{ } } // Anonymous namespace -VP9::VP9(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_) - : Decoder{host1x_, id_, regs_} { - codec = Host1x::NvdecCommon::VideoCodec::VP9; - initialized = decode_api.Initialize(codec); -} +VP9::VP9(Host1x::Host1x& host1x_) : host1x{host1x_} {} VP9::~VP9() = default; @@ -361,113 +356,35 @@ void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_ } } -void VP9::WriteSegmentation(VpxBitStreamWriter& writer) { - bool enabled = current_picture_info.segmentation.enabled != 0; - writer.WriteBit(enabled); - if (!enabled) { - return; - } +Vp9PictureInfo VP9::GetVp9PictureInfo(const Host1x::NvdecCommon::NvdecRegisters& state) { + PictureInfo picture_info; + host1x.GMMU().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); + Vp9PictureInfo vp9_info = picture_info.Convert(); - auto update_map = current_picture_info.segmentation.update_map != 0; - writer.WriteBit(update_map); - - if (update_map) { - EntropyProbs entropy_probs{}; - memory_manager.ReadBlock(regs.vp9_prob_tab_buffer_offset.Address(), &entropy_probs, - sizeof(entropy_probs)); - - auto WriteProb = [&](u8 prob) { - bool coded = prob != 255; - writer.WriteBit(coded); - if (coded) { - writer.WriteU(prob, 8); - } - }; - - for (size_t i = 0; i < entropy_probs.mb_segment_tree_probs.size(); i++) { - WriteProb(entropy_probs.mb_segment_tree_probs[i]); - } - - auto temporal_update = current_picture_info.segmentation.temporal_update != 0; - writer.WriteBit(temporal_update); - - if (temporal_update) { - for (s32 i = 0; i < 3; i++) { - WriteProb(entropy_probs.segment_pred_probs[i]); - } - } - } - - if (last_segmentation == current_picture_info.segmentation) { - writer.WriteBit(false); - return; - } - - last_segmentation = current_picture_info.segmentation; - writer.WriteBit(true); - writer.WriteBit(current_picture_info.segmentation.abs_delta != 0); - - constexpr s32 MAX_SEGMENTS = 8; - constexpr std::array SegmentationFeatureBits = {8, 6, 2, 0}; - - for (s32 i = 0; i < MAX_SEGMENTS; i++) { - auto q_enabled = current_picture_info.segmentation.feature_enabled[i][0] != 0; - writer.WriteBit(q_enabled); - if (q_enabled) { - writer.WriteS(current_picture_info.segmentation.feature_data[i][0], - SegmentationFeatureBits[0]); - } - - auto lf_enabled = current_picture_info.segmentation.feature_enabled[i][1] != 0; - writer.WriteBit(lf_enabled); - if (lf_enabled) { - writer.WriteS(current_picture_info.segmentation.feature_data[i][1], - SegmentationFeatureBits[1]); - } - - auto ref_enabled = current_picture_info.segmentation.feature_enabled[i][2] != 0; - writer.WriteBit(ref_enabled); - if (ref_enabled) { - 
writer.WriteU(current_picture_info.segmentation.feature_data[i][2], - SegmentationFeatureBits[2]); - } - - auto skip_enabled = current_picture_info.segmentation.feature_enabled[i][3] != 0; - writer.WriteBit(skip_enabled); - } -} - -Vp9PictureInfo VP9::GetVp9PictureInfo() { - memory_manager.ReadBlock(regs.picture_info_offset.Address(), ¤t_picture_info, - sizeof(PictureInfo)); - Vp9PictureInfo vp9_info = current_picture_info.Convert(); - - InsertEntropy(regs.vp9_prob_tab_buffer_offset.Address(), vp9_info.entropy); + InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following // order: last, golden, altref, current. - for (size_t i = 0; i < 4; i++) { - vp9_info.frame_offsets[i] = regs.surface_luma_offsets[i].Address(); - } + std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4, + vp9_info.frame_offsets.begin()); return vp9_info; } void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { EntropyProbs entropy; - memory_manager.ReadBlock(offset, &entropy, sizeof(EntropyProbs)); + host1x.GMMU().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); entropy.Convert(dst); } -Vp9FrameContainer VP9::GetCurrentFrame() { +Vp9FrameContainer VP9::GetCurrentFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { Vp9FrameContainer current_frame{}; { // gpu.SyncGuestHost(); epic, why? - current_frame.info = GetVp9PictureInfo(); + current_frame.info = GetVp9PictureInfo(state); current_frame.bit_stream.resize(current_frame.info.bitstream_size); - memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), - current_frame.bit_stream.data(), - current_frame.info.bitstream_size); + host1x.GMMU().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(), + current_frame.info.bitstream_size); } if (!next_frame.bit_stream.empty()) { Vp9FrameContainer temp{ @@ -825,7 +742,8 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); - WriteSegmentation(uncomp_writer); + ASSERT(!current_frame_info.segment_enabled); + uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). 
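The removed WriteSegmentation path is only needed when the guest enables segmentation: in the VP9 uncompressed header a single segmentation_enabled bit gates all of the remaining segmentation syntax, so writing one zero bit (as the replacement does, with an assert that the feature is unused) still yields a conforming header. A minimal sketch of that gating, written against any writer exposing WriteBit (hypothetical helper, not from this patch):

    // Illustrative only: everything under segmentation is gated by one bit.
    template <typename Writer>
    void WriteSegmentationEnabled(Writer& writer, bool segmentation_enabled) {
        writer.WriteBit(segmentation_enabled);
        if (!segmentation_enabled) {
            return; // no further segmentation fields are coded
        }
        // update_map, tree probabilities, temporal_update and per-segment feature
        // data would follow here (what the removed WriteSegmentation emitted).
    }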
const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); @@ -852,29 +770,10 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { return uncomp_writer; } -std::tuple VP9::GetProgressiveOffsets() { - auto luma{regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; - auto chroma{regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; - return {luma, chroma}; -} - -std::tuple VP9::GetInterlacedOffsets() { - auto luma_top{regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; - auto luma_bottom{ - regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; - auto chroma_top{ - regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; - auto chroma_bottom{ - regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; - return {luma_top, luma_bottom, chroma_top, chroma_bottom}; -} - -std::span VP9::ComposeFrame() { - vp9_hidden_frame = false; - +void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { std::vector bitstream; { - Vp9FrameContainer curr_frame = GetCurrentFrame(); + Vp9FrameContainer curr_frame = GetCurrentFrame(state); current_frame_info = curr_frame.info; bitstream = std::move(curr_frame.bit_stream); } @@ -887,16 +786,12 @@ std::span VP9::ComposeFrame() { std::vector uncompressed_header = uncomp_writer.GetByteArray(); // Write headers and frame to buffer - frame_scratch.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); - std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame_scratch.begin()); + frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); + std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame.begin()); std::copy(compressed_header.begin(), compressed_header.end(), - frame_scratch.begin() + uncompressed_header.size()); + frame.begin() + uncompressed_header.size()); std::copy(bitstream.begin(), bitstream.end(), - frame_scratch.begin() + uncompressed_header.size() + compressed_header.size()); - - vp9_hidden_frame = WasFrameHidden(); - - return GetFrameBytes(); + frame.begin() + uncompressed_header.size() + compressed_header.size()); } VpxRangeEncoder::VpxRangeEncoder() { @@ -1049,4 +944,4 @@ const std::vector& VpxBitStreamWriter::GetByteArray() const { return byte_array; } -} // namespace Tegra::Decoders +} // namespace Tegra::Decoder diff --git a/src/video_core/host1x/codecs/vp9.h b/src/video_core/host1x/codecs/vp9.h index 549e38987..23abf4a33 100755 --- a/src/video_core/host1x/codecs/vp9.h +++ b/src/video_core/host1x/codecs/vp9.h @@ -10,7 +10,6 @@ #include "common/common_types.h" #include "common/scratch_buffer.h" #include "common/stream.h" -#include "video_core/host1x/codecs/decoder.h" #include "video_core/host1x/codecs/vp9_types.h" #include "video_core/host1x/nvdec_common.h" @@ -20,7 +19,7 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoders { +namespace Decoder { /// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the /// VP9 header bitstreams. 
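As background for the two writer classes named above: the VP9 uncompressed header is emitted bit by bit, most significant bit first, and padded out to a whole byte at the end. The snippet below is a minimal stand-in for that packing (it is not yuzu's VpxBitStreamWriter, whose full interface lies outside this hunk), showing what calls like WriteBit and WriteU ultimately do:

    // Minimal MSB-first bit packer, illustrative only.
    #include <cstdint>
    #include <vector>

    class BitWriter {
    public:
        void WriteBit(bool bit) {
            WriteU(bit ? 1u : 0u, 1);
        }
        void WriteU(std::uint32_t value, int bits) {
            for (int i = bits - 1; i >= 0; --i) {
                if (shift == 0) {
                    bytes.push_back(0); // start a new byte
                    shift = 8;
                }
                --shift;
                bytes.back() |= static_cast<std::uint8_t>(((value >> i) & 1u) << shift);
            }
        }
        const std::vector<std::uint8_t>& Bytes() const {
            return bytes;
        }

    private:
        std::vector<std::uint8_t> bytes;
        int shift = 0;
    };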
@@ -111,31 +110,21 @@ private: std::vector byte_array; }; -class VP9 final : public Decoder { +class VP9 { public: - explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id); - ~VP9() override; + explicit VP9(Host1x::Host1x& host1x); + ~VP9(); VP9(const VP9&) = delete; VP9& operator=(const VP9&) = delete; - VP9(VP9&&) = delete; + VP9(VP9&&) = default; VP9& operator=(VP9&&) = delete; - [[nodiscard]] std::span ComposeFrame() override; + /// Composes the VP9 frame from the GPU state information. + /// Based on the official VP9 spec documentation + void ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state); - std::tuple GetProgressiveOffsets() override; - std::tuple GetInterlacedOffsets() override; - - bool IsInterlaced() override { - return false; - } - - std::string_view GetCurrentCodecName() const override { - return "VP9"; - } - -private: /// Returns true if the most recent frame was a hidden frame. [[nodiscard]] bool WasFrameHidden() const { return !current_frame_info.show_frame; @@ -143,9 +132,10 @@ private: /// Returns a const span to the composed frame data. [[nodiscard]] std::span GetFrameBytes() const { - return frame_scratch; + return frame; } +private: /// Generates compressed header probability updates in the bitstream writer template void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array& new_prob, @@ -177,22 +167,23 @@ private: /// Write motion vector probability updates. 6.3.17 in the spec void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); - void WriteSegmentation(VpxBitStreamWriter& writer); - /// Returns VP9 information from NVDEC provided offset and size - [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(); + [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo( + const Host1x::NvdecCommon::NvdecRegisters& state); /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct void InsertEntropy(u64 offset, Vp9EntropyProbs& dst); /// Returns frame to be decoded after buffering - [[nodiscard]] Vp9FrameContainer GetCurrentFrame(); + [[nodiscard]] Vp9FrameContainer GetCurrentFrame( + const Host1x::NvdecCommon::NvdecRegisters& state); /// Use NVDEC providied information to compose the headers for the current frame [[nodiscard]] std::vector ComposeCompressedHeader(); [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader(); - Common::ScratchBuffer frame_scratch; + Host1x::Host1x& host1x; + Common::ScratchBuffer frame; std::array loop_filter_ref_deltas{}; std::array loop_filter_mode_deltas{}; @@ -201,11 +192,9 @@ private: std::array frame_ctxs{}; bool swap_ref_indices{}; - Segmentation last_segmentation{}; - PictureInfo current_picture_info{}; Vp9PictureInfo current_frame_info{}; Vp9EntropyProbs prev_frame_probs{}; }; -} // namespace Decoders +} // namespace Decoder } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp9_types.h b/src/video_core/host1x/codecs/vp9_types.h index dad6e8437..591fe73de 100755 --- a/src/video_core/host1x/codecs/vp9_types.h +++ b/src/video_core/host1x/codecs/vp9_types.h @@ -11,14 +11,7 @@ namespace Tegra { -namespace Decoders { -enum class Vp9SurfaceIndex : u32 { - Last = 0, - Golden = 1, - AltRef = 2, - Current = 3, -}; - +namespace Decoder { struct Vp9FrameDimensions { s16 width; s16 height; @@ -55,13 +48,11 @@ enum class TxMode { }; struct Segmentation { - constexpr bool operator==(const Segmentation& rhs) const = default; - u8 enabled; u8 update_map; u8 temporal_update; u8 abs_delta; - std::array, 8> feature_enabled; + std::array 
feature_mask; std::array, 8> feature_data; }; static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size"); @@ -199,17 +190,7 @@ struct PictureInfo { static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size"); struct EntropyProbs { - std::array kf_bmode_prob; ///< 0x0000 - std::array kf_bmode_probB; ///< 0x0320 - std::array ref_pred_probs; ///< 0x0384 - std::array mb_segment_tree_probs; ///< 0x0387 - std::array segment_pred_probs; ///< 0x038E - std::array ref_scores; ///< 0x0391 - std::array prob_comppred; ///< 0x0395 - INSERT_PADDING_BYTES_NOINIT(9); ///< 0x0397 - std::array kf_uv_mode_prob; ///< 0x03A0 - std::array kf_uv_mode_probB; ///< 0x03F0 - INSERT_PADDING_BYTES_NOINIT(6); ///< 0x03FA + INSERT_PADDING_BYTES_NOINIT(1024); ///< 0x0000 std::array inter_mode_prob; ///< 0x0400 std::array intra_inter_prob; ///< 0x041C INSERT_PADDING_BYTES_NOINIT(80); ///< 0x0420 @@ -321,5 +302,5 @@ ASSERT_POSITION(class_0_fr, 0x560); ASSERT_POSITION(coef_probs, 0x5A0); #undef ASSERT_POSITION -}; // namespace Decoders +}; // namespace Decoder }; // namespace Tegra diff --git a/src/video_core/host1x/control.cpp b/src/video_core/host1x/control.cpp index b931e854d..1406bd41e 100755 --- a/src/video_core/host1x/control.cpp +++ b/src/video_core/host1x/control.cpp @@ -27,7 +27,6 @@ void Control::ProcessMethod(Method method, u32 argument) { } void Control::Execute(u32 data) { - LOG_TRACE(Service_NVDRV, "Control wait syncpt {} value {}", data, syncpoint_value); host1x.GetSyncpointManager().WaitHost(data, syncpoint_value); } diff --git a/src/video_core/host1x/control.h b/src/video_core/host1x/control.h index 41dfe8ca0..d76da5ecf 100755 --- a/src/video_core/host1x/control.h +++ b/src/video_core/host1x/control.h @@ -6,7 +6,9 @@ #include "common/common_types.h" -namespace Tegra::Host1x { +namespace Tegra { + +namespace Host1x { class Host1x; class Nvdec; @@ -29,8 +31,10 @@ private: /// For Host1x, execute is waiting on a syncpoint previously written into the state void Execute(u32 data); - Host1x& host1x; u32 syncpoint_value{}; + Host1x& host1x; }; -} // namespace Tegra::Host1x +} // namespace Host1x + +} // namespace Tegra diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.cpp b/src/video_core/host1x/ffmpeg/ffmpeg.cpp index d41b08c53..96686da59 100755 --- a/src/video_core/host1x/ffmpeg/ffmpeg.cpp +++ b/src/video_core/host1x/ffmpeg/ffmpeg.cpp @@ -5,9 +5,7 @@ #include "common/logging/log.h" #include "common/scope_exit.h" #include "common/settings.h" -#include "core/memory.h" #include "video_core/host1x/ffmpeg/ffmpeg.h" -#include "video_core/memory_manager.h" extern "C" { #ifdef LIBVA_FOUND @@ -185,8 +183,8 @@ bool HardwareContext::InitializeWithType(AVHWDeviceType type) { return true; } -DecoderContext::DecoderContext(const Decoder& decoder) : m_decoder{decoder} { - m_codec_context = avcodec_alloc_context3(m_decoder.GetCodec()); +DecoderContext::DecoderContext(const Decoder& decoder) { + m_codec_context = avcodec_alloc_context3(decoder.GetCodec()); av_opt_set(m_codec_context->priv_data, "tune", "zerolatency", 0); m_codec_context->thread_count = 0; m_codec_context->thread_type &= ~FF_THREAD_FRAME; @@ -218,25 +216,6 @@ bool DecoderContext::OpenContext(const Decoder& decoder) { } bool DecoderContext::SendPacket(const Packet& packet) { - m_temp_frame = std::make_shared(); - m_got_frame = 0; - -// Android can randomly crash when calling decode directly, so skip. -// TODO update ffmpeg and hope that fixes it. 
-#ifndef ANDROID - if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) { - m_decode_order = true; - auto* codec{ffcodec(m_decoder.GetCodec())}; - if (const int ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(), - &m_got_frame, packet.GetPacket()); - ret < 0) { - LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", AVError(ret)); - return false; - } - return true; - } -#endif - if (const int ret = avcodec_send_packet(m_codec_context, packet.GetPacket()); ret < 0) { LOG_ERROR(HW_GPU, "avcodec_send_packet error: {}", AVError(ret)); return false; @@ -245,73 +224,139 @@ bool DecoderContext::SendPacket(const Packet& packet) { return true; } -std::shared_ptr DecoderContext::ReceiveFrame() { - // Android can randomly crash when calling decode directly, so skip. - // TODO update ffmpeg and hope that fixes it. -#ifndef ANDROID - if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) { - m_decode_order = true; - auto* codec{ffcodec(m_decoder.GetCodec())}; - int ret{0}; +std::unique_ptr DecoderContext::ReceiveFrame(bool* out_is_interlaced) { + auto dst_frame = std::make_unique(); - if (m_got_frame == 0) { - Packet packet{{}}; - auto* pkt = packet.GetPacket(); - pkt->data = nullptr; - pkt->size = 0; - ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(), &m_got_frame, pkt); - m_codec_context->has_b_frames = 0; + const auto ReceiveImpl = [&](AVFrame* frame) { + if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) { + LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret)); + return false; } - if (m_got_frame == 0 || ret < 0) { - LOG_ERROR(Service_NVDRV, "Failed to receive a frame! error {}", ret); + *out_is_interlaced = +#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59 + (frame->flags & AV_FRAME_FLAG_INTERLACED) != 0; +#else + frame->interlaced_frame != 0; +#endif + return true; + }; + + if (m_codec_context->hw_device_ctx) { + // If we have a hardware context, make a separate frame here to receive the + // hardware result before sending it to the output. + Frame intermediate_frame; + + if (!ReceiveImpl(intermediate_frame.GetFrame())) { return {}; } - } else -#endif - { - const auto ReceiveImpl = [&](AVFrame* frame) { - if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) { - LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret)); - return false; - } - - return true; - }; - - if (m_codec_context->hw_device_ctx) { - // If we have a hardware context, make a separate frame here to receive the - // hardware result before sending it to the output. - Frame intermediate_frame; - - if (!ReceiveImpl(intermediate_frame.GetFrame())) { - return {}; - } - - m_temp_frame->SetFormat(PreferredGpuFormat); - if (const int ret = av_hwframe_transfer_data(m_temp_frame->GetFrame(), - intermediate_frame.GetFrame(), 0); - ret < 0) { - LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret)); - return {}; - } - } else { - // Otherwise, decode the frame as normal. - if (!ReceiveImpl(m_temp_frame->GetFrame())) { - return {}; - } + dst_frame->SetFormat(PreferredGpuFormat); + if (const int ret = + av_hwframe_transfer_data(dst_frame->GetFrame(), intermediate_frame.GetFrame(), 0); + ret < 0) { + LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret)); + return {}; + } + } else { + // Otherwise, decode the frame as normal. 
+ if (!ReceiveImpl(dst_frame->GetFrame())) { + return {}; } } -#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59 - m_temp_frame->GetFrame()->interlaced_frame = - (m_temp_frame->GetFrame()->flags & AV_FRAME_FLAG_INTERLACED) != 0; -#endif - return std::move(m_temp_frame); + return dst_frame; +} + +DeinterlaceFilter::DeinterlaceFilter(const Frame& frame) { + const AVFilter* buffer_src = avfilter_get_by_name("buffer"); + const AVFilter* buffer_sink = avfilter_get_by_name("buffersink"); + AVFilterInOut* inputs = avfilter_inout_alloc(); + AVFilterInOut* outputs = avfilter_inout_alloc(); + SCOPE_EXIT({ + avfilter_inout_free(&inputs); + avfilter_inout_free(&outputs); + }); + + // Don't know how to get the accurate time_base but it doesn't matter for yadif filter + // so just use 1/1 to make buffer filter happy + std::string args = fmt::format("video_size={}x{}:pix_fmt={}:time_base=1/1", frame.GetWidth(), + frame.GetHeight(), static_cast(frame.GetPixelFormat())); + + m_filter_graph = avfilter_graph_alloc(); + int ret = avfilter_graph_create_filter(&m_source_context, buffer_src, "in", args.c_str(), + nullptr, m_filter_graph); + if (ret < 0) { + LOG_ERROR(HW_GPU, "avfilter_graph_create_filter source error: {}", AVError(ret)); + return; + } + + ret = avfilter_graph_create_filter(&m_sink_context, buffer_sink, "out", nullptr, nullptr, + m_filter_graph); + if (ret < 0) { + LOG_ERROR(HW_GPU, "avfilter_graph_create_filter sink error: {}", AVError(ret)); + return; + } + + inputs->name = av_strdup("out"); + inputs->filter_ctx = m_sink_context; + inputs->pad_idx = 0; + inputs->next = nullptr; + + outputs->name = av_strdup("in"); + outputs->filter_ctx = m_source_context; + outputs->pad_idx = 0; + outputs->next = nullptr; + + const char* description = "yadif=1:-1:0"; + ret = avfilter_graph_parse_ptr(m_filter_graph, description, &inputs, &outputs, nullptr); + if (ret < 0) { + LOG_ERROR(HW_GPU, "avfilter_graph_parse_ptr error: {}", AVError(ret)); + return; + } + + ret = avfilter_graph_config(m_filter_graph, nullptr); + if (ret < 0) { + LOG_ERROR(HW_GPU, "avfilter_graph_config error: {}", AVError(ret)); + return; + } + + m_initialized = true; +} + +bool DeinterlaceFilter::AddSourceFrame(const Frame& frame) { + if (const int ret = av_buffersrc_add_frame_flags(m_source_context, frame.GetFrame(), + AV_BUFFERSRC_FLAG_KEEP_REF); + ret < 0) { + LOG_ERROR(HW_GPU, "av_buffersrc_add_frame_flags error: {}", AVError(ret)); + return false; + } + + return true; +} + +std::unique_ptr DeinterlaceFilter::DrainSinkFrame() { + auto dst_frame = std::make_unique(); + const int ret = av_buffersink_get_frame(m_sink_context, dst_frame->GetFrame()); + + if (ret == AVERROR(EAGAIN) || ret == AVERROR(AVERROR_EOF)) { + return {}; + } + + if (ret < 0) { + LOG_ERROR(HW_GPU, "av_buffersink_get_frame error: {}", AVError(ret)); + return {}; + } + + return dst_frame; +} + +DeinterlaceFilter::~DeinterlaceFilter() { + avfilter_graph_free(&m_filter_graph); } void DecodeApi::Reset() { + m_deinterlace_filter.reset(); m_hardware_context.reset(); m_decoder_context.reset(); m_decoder.reset(); @@ -337,14 +382,43 @@ bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) { return true; } -bool DecodeApi::SendPacket(std::span packet_data) { +bool DecodeApi::SendPacket(std::span packet_data, size_t configuration_size) { FFmpeg::Packet packet(packet_data); return m_decoder_context->SendPacket(packet); } -std::shared_ptr DecodeApi::ReceiveFrame() { +void DecodeApi::ReceiveFrames(std::queue>& frame_queue) { // Receive raw 
frame from decoder. - return m_decoder_context->ReceiveFrame(); + bool is_interlaced; + auto frame = m_decoder_context->ReceiveFrame(&is_interlaced); + if (!frame) { + return; + } + + if (!is_interlaced) { + // If the frame is not interlaced, we can pend it now. + frame_queue.push(std::move(frame)); + } else { + // Create the deinterlacer if needed. + if (!m_deinterlace_filter) { + m_deinterlace_filter.emplace(*frame); + } + + // Add the frame we just received. + if (!m_deinterlace_filter->AddSourceFrame(*frame)) { + return; + } + + // Pend output fields. + while (true) { + auto filter_frame = m_deinterlace_filter->DrainSinkFrame(); + if (!filter_frame) { + break; + } + + frame_queue.push(std::move(filter_frame)); + } + } } } // namespace FFmpeg diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.h b/src/video_core/host1x/ffmpeg/ffmpeg.h index a74fcba80..1de0bbd83 100755 --- a/src/video_core/host1x/ffmpeg/ffmpeg.h +++ b/src/video_core/host1x/ffmpeg/ffmpeg.h @@ -20,20 +20,17 @@ extern "C" { #endif #include +#include +#include +#include +#include #include -#ifndef ANDROID -#include -#endif #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif } -namespace Tegra { -class MemoryManager; -} - namespace FFmpeg { class Packet; @@ -93,10 +90,6 @@ public: return m_frame->data[plane]; } - const u8* GetPlane(int plane) const { - return m_frame->data[plane]; - } - u8** GetPlanes() const { return m_frame->data; } @@ -105,14 +98,6 @@ public: m_frame->format = format; } - bool IsInterlaced() const { - return m_frame->interlaced_frame != 0; - } - - bool IsHardwareDecoded() const { - return m_frame->hw_frames_ctx != nullptr; - } - AVFrame* GetFrame() const { return m_frame; } @@ -175,22 +160,33 @@ public: void InitializeHardwareDecoder(const HardwareContext& context, AVPixelFormat hw_pix_fmt); bool OpenContext(const Decoder& decoder); bool SendPacket(const Packet& packet); - std::shared_ptr ReceiveFrame(); + std::unique_ptr ReceiveFrame(bool* out_is_interlaced); AVCodecContext* GetCodecContext() const { return m_codec_context; } - bool UsingDecodeOrder() const { - return m_decode_order; - } +private: + AVCodecContext* m_codec_context{}; +}; + +// Wraps an AVFilterGraph. 
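For reference on the deinterlacer introduced here: the "yadif=1:-1:0" graph description selects the yadif filter with mode 1 (emit one output frame per field, doubling the frame rate, which is why the sink is drained in a loop), parity -1 (auto-detect field order) and deint 0 (process every frame). Assuming a sample interlaced file, the same filter can be tried in isolation from the FFmpeg command line:

    ffmpeg -i interlaced.mkv -vf yadif=1:-1:0 progressive.mkv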
+class DeinterlaceFilter { +public: + YUZU_NON_COPYABLE(DeinterlaceFilter); + YUZU_NON_MOVEABLE(DeinterlaceFilter); + + explicit DeinterlaceFilter(const Frame& frame); + ~DeinterlaceFilter(); + + bool AddSourceFrame(const Frame& frame); + std::unique_ptr DrainSinkFrame(); private: - const Decoder& m_decoder; - AVCodecContext* m_codec_context{}; - s32 m_got_frame{}; - std::shared_ptr m_temp_frame{}; - bool m_decode_order{}; + AVFilterGraph* m_filter_graph{}; + AVFilterContext* m_source_context{}; + AVFilterContext* m_sink_context{}; + bool m_initialized{}; }; class DecodeApi { @@ -204,17 +200,14 @@ public: bool Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec); void Reset(); - bool UsingDecodeOrder() const { - return m_decoder_context->UsingDecodeOrder(); - } - - bool SendPacket(std::span packet_data); - std::shared_ptr ReceiveFrame(); + bool SendPacket(std::span packet_data, size_t configuration_size); + void ReceiveFrames(std::queue>& frame_queue); private: std::optional m_decoder; std::optional m_decoder_context; std::optional m_hardware_context; + std::optional m_deinterlace_filter; }; } // namespace FFmpeg diff --git a/src/video_core/host1x/host1x.cpp b/src/video_core/host1x/host1x.cpp index 1c1e24da3..27c59640b 100755 --- a/src/video_core/host1x/host1x.cpp +++ b/src/video_core/host1x/host1x.cpp @@ -3,10 +3,10 @@ #include "core/core.h" #include "video_core/host1x/host1x.h" -#include "video_core/host1x/nvdec.h" -#include "video_core/host1x/vic.h" -namespace Tegra::Host1x { +namespace Tegra { + +namespace Host1x { Host1x::Host1x(Core::System& system_) : system{system_}, syncpoint_manager{}, @@ -15,23 +15,6 @@ Host1x::Host1x(Core::System& system_) Host1x::~Host1x() = default; -void Host1x::StartDevice(s32 fd, ChannelType type, u32 syncpt) { - switch (type) { - case ChannelType::NvDec: - devices[fd] = std::make_unique(*this, fd, syncpt); - last_nvdec_fd = fd; - break; - case ChannelType::VIC: - devices[fd] = std::make_unique(*this, fd, GetLastNvdecDevice(), syncpt); - break; - default: - LOG_ERROR(HW_GPU, "Unimplemented host1x device {}", static_cast(type)); - break; - } -} +} // namespace Host1x -void Host1x::StopDevice(s32 fd, ChannelType type) { - devices.erase(fd); -} - -} // namespace Tegra::Host1x +} // namespace Tegra diff --git a/src/video_core/host1x/host1x.h b/src/video_core/host1x/host1x.h index 3329943a6..958aeda4f 100755 --- a/src/video_core/host1x/host1x.h +++ b/src/video_core/host1x/host1x.h @@ -6,7 +6,6 @@ #include "common/common_types.h" #include "common/address_space.h" -#include "video_core/cdma_pusher.h" #include "video_core/host1x/gpu_device_memory_manager.h" #include "video_core/host1x/syncpoint_manager.h" #include "video_core/memory_manager.h" @@ -15,29 +14,15 @@ namespace Core { class System; } // namespace Core -namespace Tegra::Host1x { -class Nvdec; +namespace Tegra { -enum class ChannelType : u32 { - MsEnc = 0, - VIC = 1, - GPU = 2, - NvDec = 3, - Display = 4, - NvJpg = 5, - TSec = 6, - Max = 7, -}; +namespace Host1x { class Host1x { public: explicit Host1x(Core::System& system); ~Host1x(); - Core::System& System() { - return system; - } - SyncpointManager& GetSyncpointManager() { return syncpoint_manager; } @@ -70,31 +55,14 @@ public: return *allocator; } - void StartDevice(s32 fd, ChannelType type, u32 syncpt); - void StopDevice(s32 fd, ChannelType type); - - void PushEntries(s32 fd, ChCommandHeaderList&& entries) { - auto it = devices.find(fd); - if (it == devices.end()) { - return; - } - it->second->PushEntries(std::move(entries)); - } - - Nvdec& 
GetLastNvdecDevice() { - auto it = devices.find(last_nvdec_fd); - ASSERT(it->second.get() != nullptr); - return *reinterpret_cast(it->second.get()); - } - private: Core::System& system; SyncpointManager syncpoint_manager; Tegra::MaxwellDeviceMemoryManager memory_manager; Tegra::MemoryManager gmmu_manager; std::unique_ptr> allocator; - std::unordered_map> devices; - s32 last_nvdec_fd{}; }; -} // namespace Tegra::Host1x +} // namespace Host1x + +} // namespace Tegra diff --git a/src/video_core/host1x/nvdec.cpp b/src/video_core/host1x/nvdec.cpp index e543a185f..4507a0d26 100755 --- a/src/video_core/host1x/nvdec.cpp +++ b/src/video_core/host1x/nvdec.cpp @@ -2,12 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/assert.h" - -#include "common/polyfill_thread.h" -#include "common/settings.h" -#include "video_core/host1x/codecs/h264.h" -#include "video_core/host1x/codecs/vp8.h" -#include "video_core/host1x/codecs/vp9.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" @@ -16,68 +10,37 @@ namespace Tegra::Host1x { #define NVDEC_REG_INDEX(field_name) \ (offsetof(NvdecCommon::NvdecRegisters, field_name) / sizeof(u64)) -Nvdec::Nvdec(Host1x& host1x_, s32 id_, u32 syncpt) - : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt} { - LOG_INFO(HW_GPU, "Created nvdec {}", id); -} +Nvdec::Nvdec(Host1x& host1x_) + : host1x(host1x_), state{}, codec(std::make_unique(host1x, state)) {} -Nvdec::~Nvdec() { - LOG_INFO(HW_GPU, "Destroying nvdec {}", id); -} +Nvdec::~Nvdec() = default; void Nvdec::ProcessMethod(u32 method, u32 argument) { - regs.reg_array[method] = argument; + state.reg_array[method] = static_cast(argument) << 8; switch (method) { case NVDEC_REG_INDEX(set_codec_id): - CreateDecoder(static_cast(argument)); + codec->SetTargetCodec(static_cast(argument)); break; - case NVDEC_REG_INDEX(execute): { - if (wait_needed) { - std::this_thread::sleep_for(std::chrono::milliseconds(32)); - wait_needed = false; - } + case NVDEC_REG_INDEX(execute): Execute(); - } break; + break; } } -void Nvdec::CreateDecoder(NvdecCommon::VideoCodec codec) { - if (decoder.get()) { - return; - } - switch (codec) { - case NvdecCommon::VideoCodec::H264: - decoder = std::make_unique(host1x, regs, id); - break; - case NvdecCommon::VideoCodec::VP8: - decoder = std::make_unique(host1x, regs, id); - break; - case NvdecCommon::VideoCodec::VP9: - decoder = std::make_unique(host1x, regs, id); - break; - default: - UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName()); - break; - } - LOG_INFO(HW_GPU, "Created decoder {} for id {}", decoder->GetCurrentCodecName(), id); +std::unique_ptr Nvdec::GetFrame() { + return codec->GetCurrentFrame(); } void Nvdec::Execute() { - if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] { - // Signalling syncpts too fast can cause games to get stuck as they don't expect a <1ms - // execution time. Sleep for half of a 60 fps frame just in case. 
- std::this_thread::sleep_for(std::chrono::milliseconds(8)); - return; - } - switch (decoder->GetCurrentCodec()) { + switch (codec->GetCurrentCodec()) { case NvdecCommon::VideoCodec::H264: case NvdecCommon::VideoCodec::VP8: case NvdecCommon::VideoCodec::VP9: - decoder->Decode(); + codec->Decode(); break; default: - UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName()); + UNIMPLEMENTED_MSG("Codec {}", codec->GetCurrentCodecName()); break; } } diff --git a/src/video_core/host1x/nvdec.h b/src/video_core/host1x/nvdec.h index afbbd311d..806c30f2c 100755 --- a/src/video_core/host1x/nvdec.h +++ b/src/video_core/host1x/nvdec.h @@ -5,49 +5,33 @@ #include #include - #include "common/common_types.h" -#include "video_core/cdma_pusher.h" -#include "video_core/host1x/codecs/decoder.h" +#include "video_core/host1x/codecs/codec.h" namespace Tegra { namespace Host1x { + class Host1x; -class Nvdec final : public CDmaPusher { +class Nvdec { public: - explicit Nvdec(Host1x& host1x, s32 id, u32 syncpt); + explicit Nvdec(Host1x& host1x); ~Nvdec(); /// Writes the method into the state, Invoke Execute() if encountered - void ProcessMethod(u32 method, u32 arg) override; + void ProcessMethod(u32 method, u32 argument); - std::shared_ptr GetFrame(u64 luma_offset) { - return decoder->GetFrame(luma_offset); - } - - u32 GetSyncpoint() const { - return syncpoint; - } - - void SetWait() { - wait_needed = true; - } + /// Return most recently decoded frame + [[nodiscard]] std::unique_ptr GetFrame(); private: - /// Create the decoder when the codec id is set - void CreateDecoder(NvdecCommon::VideoCodec codec); - /// Invoke codec to decode a frame void Execute(); - s32 id; - u32 syncpoint; - - NvdecCommon::NvdecRegisters regs{}; - std::unique_ptr decoder; - bool wait_needed{false}; + Host1x& host1x; + NvdecCommon::NvdecRegisters state; + std::unique_ptr codec; }; } // namespace Host1x diff --git a/src/video_core/host1x/nvdec_common.h b/src/video_core/host1x/nvdec_common.h index 13b96452f..0fd678269 100755 --- a/src/video_core/host1x/nvdec_common.h +++ b/src/video_core/host1x/nvdec_common.h @@ -17,17 +17,6 @@ enum class VideoCodec : u64 { VP9 = 0x9, }; -struct Offset { - constexpr u64 Address() const noexcept { - return offset << 8; - } - -private: - u64 offset; -}; -static_assert(std::is_trivial_v, "Offset must be trivial"); -static_assert(sizeof(Offset) == 0x8, "Offset has the wrong size!"); - // NVDEC should use a 32-bit address space, but is mapped to 64-bit, // doubling the sizes here is compensating for that. 
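Both the removed Offset wrapper (offset << 8 in Address()) and the restored write path in Nvdec::ProcessMethod and Vic::ProcessMethod (argument << 8) encode the same convention: offset registers hold a 32-bit value addressing 256-byte-aligned memory, and the full GPU virtual address is that value shifted left by eight. A small sketch of the round trip, with hypothetical names:

    // Illustrative only: 256-byte-aligned NVDEC/VIC offset registers.
    #include <cstdint>

    constexpr std::uint64_t RegisterToAddress(std::uint32_t reg_value) {
        return static_cast<std::uint64_t>(reg_value) << 8; // register stores address / 256
    }

    constexpr std::uint32_t AddressToRegister(std::uint64_t address) {
        return static_cast<std::uint32_t>(address >> 8); // assumes 256-byte alignment
    }

    static_assert(RegisterToAddress(AddressToRegister(0x12345600)) == 0x12345600);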
struct NvdecRegisters { @@ -49,40 +38,29 @@ struct NvdecRegisters { BitField<17, 1, u64> all_intra_frame; }; } control_params; - Offset picture_info_offset; ///< 0x0808 - Offset frame_bitstream_offset; ///< 0x0810 - u64 frame_number; ///< 0x0818 - Offset h264_slice_data_offsets; ///< 0x0820 - Offset h264_mv_dump_offset; ///< 0x0828 - INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830 - Offset frame_stats_offset; ///< 0x0848 - Offset h264_last_surface_luma_offset; ///< 0x0850 - Offset h264_last_surface_chroma_offset; ///< 0x0858 - std::array surface_luma_offsets; ///< 0x0860 - std::array surface_chroma_offsets; ///< 0x08E8 - Offset pic_scratch_buf_offset; ///< 0x0970 - Offset external_mvbuffer_offset; ///< 0x0978 - INSERT_PADDING_WORDS_NOINIT(32); ///< 0x0980 - Offset h264_mbhist_buffer_offset; ///< 0x0A00 - INSERT_PADDING_WORDS_NOINIT(30); ///< 0x0A08 - Offset vp8_prob_data_offset; ///< 0x0A80 - Offset vp8_header_partition_buf_offset; ///< 0x0A88 - INSERT_PADDING_WORDS_NOINIT(28); ///< 0x0A90 - Offset hvec_scalist_list_offset; ///< 0x0B00 - Offset hvec_tile_sizes_offset; ///< 0x0B08 - Offset hvec_filter_buffer_offset; ///< 0x0B10 - Offset hvec_sao_buffer_offset; ///< 0x0B18 - Offset hvec_slice_info_buffer_offset; ///< 0x0B20 - Offset hvec_slice_group_index_offset; ///< 0x0B28 - INSERT_PADDING_WORDS_NOINIT(20); ///< 0x0B30 - Offset vp9_prob_tab_buffer_offset; ///< 0x0B80 - Offset vp9_ctx_counter_buffer_offset; ///< 0x0B88 - Offset vp9_segment_read_buffer_offset; ///< 0x0B90 - Offset vp9_segment_write_buffer_offset; ///< 0x0B98 - Offset vp9_tile_size_buffer_offset; ///< 0x0BA0 - Offset vp9_col_mvwrite_buffer_offset; ///< 0x0BA8 - Offset vp9_col_mvread_buffer_offset; ///< 0x0BB0 - Offset vp9_filter_buffer_offset; ///< 0x0BB8 + u64 picture_info_offset; ///< 0x0808 + u64 frame_bitstream_offset; ///< 0x0810 + u64 frame_number; ///< 0x0818 + u64 h264_slice_data_offsets; ///< 0x0820 + u64 h264_mv_dump_offset; ///< 0x0828 + INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830 + u64 frame_stats_offset; ///< 0x0848 + u64 h264_last_surface_luma_offset; ///< 0x0850 + u64 h264_last_surface_chroma_offset; ///< 0x0858 + std::array surface_luma_offset; ///< 0x0860 + std::array surface_chroma_offset; ///< 0x08E8 + INSERT_PADDING_WORDS_NOINIT(68); ///< 0x0970 + u64 vp8_prob_data_offset; ///< 0x0A80 + u64 vp8_header_partition_buf_offset; ///< 0x0A88 + INSERT_PADDING_WORDS_NOINIT(60); ///< 0x0A90 + u64 vp9_entropy_probs_offset; ///< 0x0B80 + u64 vp9_backward_updates_offset; ///< 0x0B88 + u64 vp9_last_frame_segmap_offset; ///< 0x0B90 + u64 vp9_curr_frame_segmap_offset; ///< 0x0B98 + INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BA0 + u64 vp9_last_frame_mvs_offset; ///< 0x0BA8 + u64 vp9_curr_frame_mvs_offset; ///< 0x0BB0 + INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BB8 }; std::array reg_array; }; @@ -103,16 +81,16 @@ ASSERT_REG_POSITION(h264_slice_data_offsets, 0x104); ASSERT_REG_POSITION(frame_stats_offset, 0x109); ASSERT_REG_POSITION(h264_last_surface_luma_offset, 0x10A); ASSERT_REG_POSITION(h264_last_surface_chroma_offset, 0x10B); -ASSERT_REG_POSITION(surface_luma_offsets, 0x10C); -ASSERT_REG_POSITION(surface_chroma_offsets, 0x11D); +ASSERT_REG_POSITION(surface_luma_offset, 0x10C); +ASSERT_REG_POSITION(surface_chroma_offset, 0x11D); ASSERT_REG_POSITION(vp8_prob_data_offset, 0x150); ASSERT_REG_POSITION(vp8_header_partition_buf_offset, 0x151); -ASSERT_REG_POSITION(vp9_prob_tab_buffer_offset, 0x170); -ASSERT_REG_POSITION(vp9_ctx_counter_buffer_offset, 0x171); -ASSERT_REG_POSITION(vp9_segment_read_buffer_offset, 0x172); 
-ASSERT_REG_POSITION(vp9_segment_write_buffer_offset, 0x173); -ASSERT_REG_POSITION(vp9_col_mvwrite_buffer_offset, 0x175); -ASSERT_REG_POSITION(vp9_col_mvread_buffer_offset, 0x176); +ASSERT_REG_POSITION(vp9_entropy_probs_offset, 0x170); +ASSERT_REG_POSITION(vp9_backward_updates_offset, 0x171); +ASSERT_REG_POSITION(vp9_last_frame_segmap_offset, 0x172); +ASSERT_REG_POSITION(vp9_curr_frame_segmap_offset, 0x173); +ASSERT_REG_POSITION(vp9_last_frame_mvs_offset, 0x175); +ASSERT_REG_POSITION(vp9_curr_frame_mvs_offset, 0x176); #undef ASSERT_REG_POSITION diff --git a/src/video_core/host1x/syncpoint_manager.cpp b/src/video_core/host1x/syncpoint_manager.cpp index 1b5dd4ba6..eb5dd8d70 100755 --- a/src/video_core/host1x/syncpoint_manager.cpp +++ b/src/video_core/host1x/syncpoint_manager.cpp @@ -18,7 +18,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction( return {}; } - std::scoped_lock lk(guard); + std::unique_lock lk(guard); if (syncpoint.load(std::memory_order_relaxed) >= expected_value) { action(); return {}; @@ -35,7 +35,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction( void SyncpointManager::DeregisterAction(std::list& action_storage, const ActionHandle& handle) { - std::scoped_lock lk(guard); + std::unique_lock lk(guard); // We want to ensure the iterator still exists prior to erasing it // Otherwise, if an invalid iterator was passed in then it could lead to UB @@ -78,7 +78,7 @@ void SyncpointManager::Increment(std::atomic& syncpoint, std::condition_var std::list& action_storage) { auto new_value{syncpoint.fetch_add(1, std::memory_order_acq_rel) + 1}; - std::scoped_lock lk(guard); + std::unique_lock lk(guard); auto it = action_storage.begin(); while (it != action_storage.end()) { if (it->expected_value > new_value) { diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp index 56877b789..34b365803 100755 --- a/src/video_core/host1x/vic.cpp +++ b/src/video_core/host1x/vic.cpp @@ -2,18 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include -#include - -#if defined(ARCHITECTURE_x86_64) -#if defined(_MSC_VER) -#include -#else -#include -#endif -#elif defined(ARCHITECTURE_arm64) -#include -#endif extern "C" { #if defined(__GNUC__) || defined(__clang__) @@ -26,1108 +14,228 @@ extern "C" { #endif } -#include "common/alignment.h" #include "common/assert.h" #include "common/bit_field.h" #include "common/logging/log.h" -#include "common/polyfill_thread.h" -#include "common/settings.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/guest_memory.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" #include "video_core/host1x/vic.h" #include "video_core/memory_manager.h" #include "video_core/textures/decoders.h" -#if defined(ARCHITECTURE_x86_64) -#include "common/x64/cpu_detect.h" -#elif defined(ARCHITECTURE_arm64) -// Some ARM64 detect -#endif +namespace Tegra { + +namespace Host1x { -namespace Tegra::Host1x { namespace { -static bool HasSSE41() { -#if defined(ARCHITECTURE_x86_64) - const auto& cpu_caps{Common::GetCPUCaps()}; - return cpu_caps.sse4_1; -#else - return false; -#endif -} +enum class VideoPixelFormat : u64_le { + RGBA8 = 0x1f, + BGRA8 = 0x20, + RGBX8 = 0x23, + YUV420 = 0x44, +}; +} // Anonymous namespace -void SwizzleSurface(std::span output, u32 out_stride, std::span input, u32 in_stride, - u32 height) { - /* - * Taken from https://github.com/averne/FFmpeg/blob/nvtegra/libavutil/hwcontext_nvtegra.c#L949 - */ - const uint32_t x_mask = 0xFFFFFFD2u; - const uint32_t y_mask = 0x2Cu; 
- uint32_t offs_x{}; - uint32_t offs_y{}; - uint32_t offs_line{}; +union VicConfig { + u64_le raw{}; + BitField<0, 7, VideoPixelFormat> pixel_format; + BitField<7, 2, u64_le> chroma_loc_horiz; + BitField<9, 2, u64_le> chroma_loc_vert; + BitField<11, 4, u64_le> block_linear_kind; + BitField<15, 4, u64_le> block_linear_height_log2; + BitField<32, 14, u64_le> surface_width_minus1; + BitField<46, 14, u64_le> surface_height_minus1; +}; - for (u32 y = 0; y < height; y += 2) { - auto dst_line = output.data() + offs_y * 16; - const auto src_line = input.data() + y * (in_stride / 16) * 16; +Vic::Vic(Host1x& host1x_, std::shared_ptr nvdec_processor_) + : host1x(host1x_), + nvdec_processor(std::move(nvdec_processor_)), converted_frame_buffer{nullptr, av_free} {} - offs_line = offs_x; - for (u32 x = 0; x < in_stride; x += 16) { - std::memcpy(&dst_line[offs_line * 16], &src_line[x], 16); - std::memcpy(&dst_line[offs_line * 16 + 16], &src_line[x + in_stride], 16); - offs_line = (offs_line - x_mask) & x_mask; - } +Vic::~Vic() = default; - offs_y = (offs_y - y_mask) & y_mask; - - /* Wrap into next tile row */ - if (!offs_y) { - offs_x += out_stride; - } - } -} - -} // namespace - -Vic::Vic(Host1x& host1x_, s32 id_, Nvdec& nvdec_processor_, u32 syncpt) - : CDmaPusher{host1x_, id_}, - nvdec_processor{nvdec_processor_}, id{id_}, syncpoint{syncpt}, has_sse41{HasSSE41()} { - LOG_INFO(HW_GPU, "Created vic {}", id); -} - -Vic::~Vic() { - LOG_INFO(HW_GPU, "Destroying vic {}", id); -} - -void Vic::ProcessMethod(u32 method, u32 arg) { - LOG_TRACE(HW_GPU, "Vic method 0x{:X}", static_cast(method)); - regs.reg_array[method] = arg; - - switch (static_cast(method * sizeof(u32))) { - case Method::Execute: { - auto vic_syncpoint = host1x.GetSyncpointManager().GetGuestSyncpointValue(syncpoint); - auto nvdec_syncpoint = - host1x.GetSyncpointManager().GetGuestSyncpointValue(nvdec_processor.GetSyncpoint()); - if (vic_syncpoint + 4 < nvdec_syncpoint) { - nvdec_processor.SetWait(); - } +void Vic::ProcessMethod(Method method, u32 argument) { + LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast(method)); + const u64 arg = static_cast(argument) << 8; + switch (method) { + case Method::Execute: Execute(); - } break; + break; + case Method::SetConfigStructOffset: + config_struct_address = arg; + break; + case Method::SetOutputSurfaceLumaOffset: + output_surface_luma_address = arg; + break; + case Method::SetOutputSurfaceChromaOffset: + output_surface_chroma_address = arg; + break; default: break; } } void Vic::Execute() { - ConfigStruct config{}; - memory_manager.ReadBlock(regs.config_struct_offset.Address(), &config, sizeof(ConfigStruct)); + if (output_surface_luma_address == 0) { + LOG_ERROR(Service_NVDRV, "VIC Luma address not set."); + return; + } + const VicConfig config{host1x.GMMU().Read(config_struct_address + 0x20)}; + auto frame = nvdec_processor->GetFrame(); + if (!frame) { + return; + } + const u64 surface_width = config.surface_width_minus1 + 1; + const u64 surface_height = config.surface_height_minus1 + 1; + if (static_cast(frame->GetWidth()) != surface_width || + static_cast(frame->GetHeight()) != surface_height) { + // TODO: Properly support multiple video streams with differing frame dimensions + LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}", + frame->GetWidth(), frame->GetHeight(), surface_width, surface_height); + } + switch (config.pixel_format) { + case VideoPixelFormat::RGBA8: + case VideoPixelFormat::BGRA8: + case VideoPixelFormat::RGBX8: + 
WriteRGBFrame(std::move(frame), config); + break; + case VideoPixelFormat::YUV420: + WriteYUVFrame(std::move(frame), config); + break; + default: + UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value()); + break; + } +} - auto output_width{config.output_surface_config.out_surface_width + 1}; - auto output_height{config.output_surface_config.out_surface_height + 1}; - output_surface.resize_destructive(output_width * output_height); +void Vic::WriteRGBFrame(std::unique_ptr frame, const VicConfig& config) { + LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); - if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] { - // Fill the frame with black, as otherwise they can have random data and be very glitchy. - std::fill(output_surface.begin(), output_surface.end(), Pixel{}); - } else { - for (size_t i = 0; i < config.slot_structs.size(); i++) { - auto& slot_config{config.slot_structs[i]}; - if (!slot_config.config.slot_enable) { - continue; - } + const auto frame_width = frame->GetWidth(); + const auto frame_height = frame->GetHeight(); + const auto frame_format = frame->GetPixelFormat(); - auto luma_offset{regs.surfaces[i][SurfaceIndex::Current].luma.Address()}; - auto frame = nvdec_processor.GetFrame(luma_offset); - if (!frame.get()) { - LOG_ERROR(HW_GPU, "Vic failed to get frame with offset 0x{:X}", luma_offset); - continue; - } - - switch (frame->GetPixelFormat()) { - case AV_PIX_FMT_YUV420P: - ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); - break; - case AV_PIX_FMT_NV12: - ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); - break; + if (!scaler_ctx || frame_width != scaler_width || frame_height != scaler_height) { + const AVPixelFormat target_format = [pixel_format = config.pixel_format]() { + switch (pixel_format) { + case VideoPixelFormat::RGBA8: + return AV_PIX_FMT_RGBA; + case VideoPixelFormat::BGRA8: + return AV_PIX_FMT_BGRA; + case VideoPixelFormat::RGBX8: + return AV_PIX_FMT_RGB0; default: - UNIMPLEMENTED_MSG( - "Unimplemented slot pixel format {}", - static_cast(slot_config.surface_config.slot_pixel_format.Value())); - break; + return AV_PIX_FMT_RGBA; } + }(); - Blend(config, slot_config); - } + sws_freeContext(scaler_ctx); + // Frames are decoded into either YUV420 or NV12 formats. 
Convert to desired RGB format + scaler_ctx = sws_getContext(frame_width, frame_height, frame_format, frame_width, + frame_height, target_format, 0, nullptr, nullptr, nullptr); + scaler_width = frame_width; + scaler_height = frame_height; + converted_frame_buffer.reset(); } - - switch (config.output_surface_config.out_pixel_format) { - case VideoPixelFormat::A8B8G8R8: - case VideoPixelFormat::X8B8G8R8: - WriteABGR(config.output_surface_config); - break; - case VideoPixelFormat::A8R8G8B8: - WriteABGR(config.output_surface_config); - break; - case VideoPixelFormat::Y8__V8U8_N420: - WriteY8__V8U8_N420(config.output_surface_config); - break; - default: - UNIMPLEMENTED_MSG("Unknown video pixel format {}", - config.output_surface_config.out_pixel_format.Value()); - break; + if (!converted_frame_buffer) { + const size_t frame_size = frame_width * frame_height * 4; + converted_frame_buffer = AVMallocPtr{static_cast(av_malloc(frame_size)), av_free}; } -} + const std::array converted_stride{frame_width * 4, frame_height * 4, 0, 0}; + u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; + sws_scale(scaler_ctx, frame->GetPlanes(), frame->GetStrides(), 0, frame_height, + &converted_frame_buf_addr, converted_stride.data()); -template -void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, - std::span offsets, - std::shared_ptr frame) { - const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; - auto out_luma_height{slot.surface_config.slot_surface_height + 1}; - const auto out_luma_stride{out_luma_width}; + // Use the minimum of surface/frame dimensions to avoid buffer overflow. + const u32 surface_width = static_cast(config.surface_width_minus1) + 1; + const u32 surface_height = static_cast(config.surface_height_minus1) + 1; + const u32 width = std::min(surface_width, static_cast(frame_width)); + const u32 height = std::min(surface_height, static_cast(frame_height)); + const u32 blk_kind = static_cast(config.block_linear_kind); + if (blk_kind != 0) { + // swizzle pitch linear to block linear + const u32 block_height = static_cast(config.block_linear_height_log2); + const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); + luma_buffer.resize_destructive(size); + std::span frame_buff(converted_frame_buf_addr, 4 * width * height); + Texture::SwizzleSubrect(luma_buffer, frame_buff, 4, width, height, 1, 0, 0, width, height, + block_height, 0, width * 4); - if constexpr (Interlaced) { - out_luma_height *= 2; - } - - slot_surface.resize_destructive(out_luma_width * out_luma_height); - - const auto in_luma_width{std::min(frame->GetWidth(), static_cast(out_luma_width))}; - const auto in_luma_height{std::min(frame->GetHeight(), static_cast(out_luma_height))}; - const auto in_luma_stride{frame->GetStride(0)}; - - const auto in_chroma_stride{frame->GetStride(1)}; - - const auto* luma_buffer{frame->GetPlane(0)}; - const auto* chroma_u_buffer{frame->GetPlane(1)}; - const auto* chroma_v_buffer{frame->GetPlane(2)}; - - [[maybe_unused]] auto DecodeLinear = [&]() { - const auto alpha{static_cast(slot.config.planar_alpha.Value())}; - - for (s32 y = 0; y < in_luma_height; y++) { - const auto src_luma{y * in_luma_stride}; - const auto src_chroma{(y / 2) * in_chroma_stride}; - const auto dst{y * out_luma_stride}; - for (s32 x = 0; x < in_luma_width; x++) { - slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); - // Chroma samples are duplicated horizontally and vertically. 
- if constexpr (Planar) { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); - slot_surface[dst + x].b = - static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); - } else { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); - slot_surface[dst + x].b = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); - } - slot_surface[dst + x].a = alpha; - } - } - }; - -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(); - return; - } - - const auto alpha = - _mm_slli_epi64(_mm_set1_epi64x(static_cast(slot.config.planar_alpha.Value())), 48); - - const auto shuffle_mask = _mm_set_epi8(13, 15, 14, 12, 9, 11, 10, 8, 5, 7, 6, 4, 1, 3, 2, 0); - - for (s32 y = 0; y < in_luma_height; y++) { - const auto src_luma{y * in_luma_stride}; - const auto src_chroma{(y / 2) * in_chroma_stride}; - const auto dst{y * out_luma_stride}; - for (s32 x = 0; x < in_luma_width; x += 16) { - // clang-format off - // Prefetch next iteration's memory - _mm_prefetch((const char*)&luma_buffer[src_luma + x + 16], _MM_HINT_T0); - - // Load 8 bytes * 2 of 8-bit luma samples - // luma0 = 00 00 00 00 00 00 00 00 LL LL LL LL LL LL LL LL - auto luma0 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 0]); - auto luma1 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 8]); - - __m128i chroma; - - if constexpr (Planar) { - _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); - _mm_prefetch((const char*)&chroma_v_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); - - // If Chroma is planar, we have separate U and V planes, load 8 bytes of each - // chroma_u0 = 00 00 00 00 00 00 00 00 UU UU UU UU UU UU UU UU - // chroma_v0 = 00 00 00 00 00 00 00 00 VV VV VV VV VV VV VV VV - auto chroma_u0 = _mm_loadl_epi64((__m128i*)&chroma_u_buffer[src_chroma + x / 2]); - auto chroma_v0 = _mm_loadl_epi64((__m128i*)&chroma_v_buffer[src_chroma + x / 2]); - - // Interleave the 8 bytes of U and V into a single 16 byte reg - // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU - chroma = _mm_unpacklo_epi8(chroma_u0, chroma_v0); - } else { - _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); - - // Chroma is already interleaved in semiplanar format, just load 16 bytes - // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU - chroma = _mm_load_si128((__m128i*)&chroma_u_buffer[src_chroma + x]); - } - - // Convert the low 8 bytes of 8-bit luma into 16-bit luma - // luma0 = [00] [00] [00] [00] [00] [00] [00] [00] [LL] [LL] [LL] [LL] [LL] [LL] [LL] [LL] - // -> - // luma0 = [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] - luma0 = _mm_cvtepu8_epi16(luma0); - luma1 = _mm_cvtepu8_epi16(luma1); - - // Treat the 8 bytes of 8-bit chroma as 16-bit channels, this allows us to take both the - // U and V together as one element. Using chroma twice here duplicates the values, as we - // take element 0 from chroma, and then element 0 from chroma again, etc. We need to - // duplicate chroma horitonally as chroma is half the width of luma. - // chroma = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] - // -> - // chroma00 = [VV4 UU4] [VV4 UU4] [VV3 UU3] [VV3 UU3] [VV2 UU2] [VV2 UU2] [VV1 UU1] [VV1 UU1] - // chroma01 = [VV8 UU8] [VV8 UU8] [VV7 UU7] [VV7 UU7] [VV6 UU6] [VV6 UU6] [VV5 UU5] [VV5 UU5] - auto chroma00 = _mm_unpacklo_epi16(chroma, chroma); - auto chroma01 = _mm_unpackhi_epi16(chroma, chroma); - - // Interleave the 16-bit luma and chroma. 
- // luma0 = [008 LL8] [007 LL7] [006 LL6] [005 LL5] [004 LL4] [003 LL3] [002 LL2] [001 LL1] - // chroma00 = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] - // -> - // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] - // yuv1 = [VV8 UU8 008 LL8] [VV7 UU7 007 LL7] [VV6 UU6 006 LL6] [VV5 UU5 005 LL5] - auto yuv0 = _mm_unpacklo_epi16(luma0, chroma00); - auto yuv1 = _mm_unpackhi_epi16(luma0, chroma00); - auto yuv2 = _mm_unpacklo_epi16(luma1, chroma01); - auto yuv3 = _mm_unpackhi_epi16(luma1, chroma01); - - // Shuffle the luma/chroma into the channel ordering we actually want. The high byte of - // the luma which is now a constant 0 after converting 8-bit -> 16-bit is used as the - // alpha. Luma -> R, U -> G, V -> B, 0 -> A - // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] - // -> - // yuv0 = [AA4 VV4 UU4 LL4] [AA3 VV3 UU3 LL3] [AA2 VV2 UU2 LL2] [AA1 VV1 UU1 LL1] - yuv0 = _mm_shuffle_epi8(yuv0, shuffle_mask); - yuv1 = _mm_shuffle_epi8(yuv1, shuffle_mask); - yuv2 = _mm_shuffle_epi8(yuv2, shuffle_mask); - yuv3 = _mm_shuffle_epi8(yuv3, shuffle_mask); - - // Extend the 8-bit channels we have into 16-bits, as that's the target surface format. - // Since this turns just the low 8 bytes into 16 bytes, the second of - // each operation here right shifts the register by 8 to get the high pixels. - // yuv0 = [AA4] [VV4] [UU4] [LL4] [AA3] [VV3] [UU3] [LL3] [AA2] [VV2] [UU2] [LL2] [AA1] [VV1] [UU1] [LL1] - // -> - // yuv01 = [002 AA2] [002 VV2] [002 UU2] [002 LL2] [001 AA1] [001 VV1] [001 UU1] [001 LL1] - // yuv23 = [004 AA4] [004 VV4] [004 UU4] [004 LL4] [003 AA3] [003 VV3] ]003 UU3] [003 LL3] - auto yuv01 = _mm_cvtepu8_epi16(yuv0); - auto yuv23 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv0, 8)); - auto yuv45 = _mm_cvtepu8_epi16(yuv1); - auto yuv67 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv1, 8)); - auto yuv89 = _mm_cvtepu8_epi16(yuv2); - auto yuv1011 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv2, 8)); - auto yuv1213 = _mm_cvtepu8_epi16(yuv3); - auto yuv1415 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv3, 8)); - - // Left-shift all 16-bit channels by 2, this is to get us into a 10-bit format instead - // of 8, which is the format alpha is in, as well as other blending values. - yuv01 = _mm_slli_epi16(yuv01, 2); - yuv23 = _mm_slli_epi16(yuv23, 2); - yuv45 = _mm_slli_epi16(yuv45, 2); - yuv67 = _mm_slli_epi16(yuv67, 2); - yuv89 = _mm_slli_epi16(yuv89, 2); - yuv1011 = _mm_slli_epi16(yuv1011, 2); - yuv1213 = _mm_slli_epi16(yuv1213, 2); - yuv1415 = _mm_slli_epi16(yuv1415, 2); - - // OR in the planar alpha, this has already been duplicated and shifted into position, - // and just fills in the AA channels with the actual alpha value. - yuv01 = _mm_or_si128(yuv01, alpha); - yuv23 = _mm_or_si128(yuv23, alpha); - yuv45 = _mm_or_si128(yuv45, alpha); - yuv67 = _mm_or_si128(yuv67, alpha); - yuv89 = _mm_or_si128(yuv89, alpha); - yuv1011 = _mm_or_si128(yuv1011, alpha); - yuv1213 = _mm_or_si128(yuv1213, alpha); - yuv1415 = _mm_or_si128(yuv1415, alpha); - - // Store out the pixels. One pixel is now 8 bytes, so each store is 2 pixels. 
- // [AA AA] [VV VV] [UU UU] [LL LL] [AA AA] [VV VV] [UU UU] [LL LL] - _mm_store_si128((__m128i*)&slot_surface[dst + x + 0], yuv01); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 2], yuv23); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 4], yuv45); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 6], yuv67); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 8], yuv89); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 10], yuv1011); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 12], yuv1213); - _mm_store_si128((__m128i*)&slot_surface[dst + x + 14], yuv1415); - - // clang-format on - } - } -#elif defined(ARCHITECTURE_arm64) - DecodeLinear(); -#else - DecodeLinear(); -#endif -} - -template -void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame) { - if constexpr (!Planar) { - ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); - return; - } - const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; - const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2}; - const auto out_luma_stride{out_luma_width}; - - slot_surface.resize_destructive(out_luma_width * out_luma_height); - - const auto in_luma_width{std::min(frame->GetWidth(), static_cast(out_luma_width))}; - [[maybe_unused]] const auto in_luma_height{ - std::min(frame->GetHeight(), static_cast(out_luma_height))}; - const auto in_luma_stride{frame->GetStride(0)}; - - [[maybe_unused]] const auto in_chroma_width{(frame->GetWidth() + 1) / 2}; - const auto in_chroma_height{(frame->GetHeight() + 1) / 2}; - const auto in_chroma_stride{frame->GetStride(1)}; - - const auto* luma_buffer{frame->GetPlane(0)}; - const auto* chroma_u_buffer{frame->GetPlane(1)}; - const auto* chroma_v_buffer{frame->GetPlane(2)}; - - [[maybe_unused]] auto DecodeLinear = [&]() { - auto DecodeBobField = [&]() { - const auto alpha{static_cast(slot.config.planar_alpha.Value())}; - - for (s32 y = static_cast(TopField == false); y < in_chroma_height * 2; y += 2) { - const auto src_luma{y * in_luma_stride}; - const auto src_chroma{(y / 2) * in_chroma_stride}; - const auto dst{y * out_luma_stride}; - for (s32 x = 0; x < in_luma_width; x++) { - slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); - if constexpr (Planar) { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); - slot_surface[dst + x].b = - static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); - } else { - slot_surface[dst + x].g = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); - slot_surface[dst + x].b = - static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); - } - slot_surface[dst + x].a = alpha; - } - - s32 other_line{}; - if constexpr (TopField) { - other_line = (y + 1) * out_luma_stride; - } else { - other_line = (y - 1) * out_luma_stride; - } - std::memcpy(&slot_surface[other_line], &slot_surface[dst], - out_luma_width * sizeof(Pixel)); - } - }; - - switch (slot.config.deinterlace_mode) { - case DXVAHD_DEINTERLACE_MODE_PRIVATE::WEAVE: - // Due to the fact that we do not write to memory in nvdec, we cannot use Weave as it - // relies on the previous frame. - DecodeBobField(); - break; - case DXVAHD_DEINTERLACE_MODE_PRIVATE::BOB_FIELD: - DecodeBobField(); - break; - case DXVAHD_DEINTERLACE_MODE_PRIVATE::DISI1: - // Due to the fact that we do not write to memory in nvdec, we cannot use DISI1 as it - // relies on previous/next frames. 
- DecodeBobField(); - break; - default: - UNIMPLEMENTED_MSG("Deinterlace mode {} not implemented!", - static_cast(slot.config.deinterlace_mode.Value())); - break; - } - }; - - DecodeLinear(); -} - -template -void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame) { - switch (slot.config.frame_format) { - case DXVAHD_FRAME_FORMAT::PROGRESSIVE: - ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); - break; - case DXVAHD_FRAME_FORMAT::TOP_FIELD: - ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); - break; - case DXVAHD_FRAME_FORMAT::BOTTOM_FIELD: - ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); - break; - default: - LOG_ERROR(HW_GPU, "Unknown deinterlace format {}", - static_cast(slot.config.frame_format.Value())); - break; - } -} - -void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) { - constexpr auto add_one([](u32 v) -> u32 { return v != 0 ? v + 1 : 0; }); - - auto source_left{add_one(static_cast(slot.config.source_rect_left.Value()))}; - auto source_right{add_one(static_cast(slot.config.source_rect_right.Value()))}; - auto source_top{add_one(static_cast(slot.config.source_rect_top.Value()))}; - auto source_bottom{add_one(static_cast(slot.config.source_rect_bottom.Value()))}; - - const auto dest_left{add_one(static_cast(slot.config.dest_rect_left.Value()))}; - const auto dest_right{add_one(static_cast(slot.config.dest_rect_right.Value()))}; - const auto dest_top{add_one(static_cast(slot.config.dest_rect_top.Value()))}; - const auto dest_bottom{add_one(static_cast(slot.config.dest_rect_bottom.Value()))}; - - auto rect_left{add_one(config.output_config.target_rect_left.Value())}; - auto rect_right{add_one(config.output_config.target_rect_right.Value())}; - auto rect_top{add_one(config.output_config.target_rect_top.Value())}; - auto rect_bottom{add_one(config.output_config.target_rect_bottom.Value())}; - - rect_left = std::max(rect_left, dest_left); - rect_right = std::min(rect_right, dest_right); - rect_top = std::max(rect_top, dest_top); - rect_bottom = std::min(rect_bottom, dest_bottom); - - source_left = std::max(source_left, rect_left); - source_right = std::min(source_right, rect_right); - source_top = std::max(source_top, rect_top); - source_bottom = std::min(source_bottom, rect_bottom); - - if (source_left >= source_right || source_top >= source_bottom) { - return; - } - - const auto out_surface_width{config.output_surface_config.out_surface_width + 1}; - [[maybe_unused]] const auto out_surface_height{config.output_surface_config.out_surface_height + - 1}; - const auto in_surface_width{slot.surface_config.slot_surface_width + 1}; - - source_bottom = std::min(source_bottom, out_surface_height); - source_right = std::min(source_right, out_surface_width); - - // TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha - // below max, so it's ignored for now. 
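// A minimal scalar sketch of what the clamping above amounts to (names here are
// hypothetical, right/bottom are treated as exclusive edges): the target rect is
// intersected with the slot's dest rect, the source rect is then restricted to
// that result, and an empty intersection means the slot contributes no pixels.
#include <algorithm>
#include <cstdint>

struct Rect {
    uint32_t left, right, top, bottom; // right/bottom exclusive
};

// Intersect two rectangles the same way the clipping above clamps target/dest/source.
inline Rect Intersect(const Rect& a, const Rect& b) {
    return Rect{std::max(a.left, b.left), std::min(a.right, b.right),
                std::max(a.top, b.top), std::min(a.bottom, b.bottom)};
}

// An empty result means nothing is blended for this slot.
inline bool IsEmpty(const Rect& r) {
    return r.left >= r.right || r.top >= r.bottom;
}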
- - if (!slot.color_matrix.matrix_enable) { - const auto copy_width = std::min(source_right - source_left, rect_right - rect_left); - - for (u32 y = source_top; y < source_bottom; y++) { - const auto dst_line = y * out_surface_width; - const auto src_line = y * in_surface_width; - std::memcpy(&output_surface[dst_line + rect_left], - &slot_surface[src_line + source_left], copy_width * sizeof(Pixel)); - } + host1x.GMMU().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); } else { - // clang-format off - // Colour conversion is enabled, this is a 3x4 * 4x1 matrix multiplication, resulting in a 3x1 matrix. - // | r0c0 r0c1 r0c2 r0c3 | | R | | R | - // | r1c0 r1c1 r1c2 r1c3 | * | G | = | G | - // | r2c0 r2c1 r2c2 r2c3 | | B | | B | - // | 1 | - // clang-format on - - [[maybe_unused]] auto DecodeLinear = [&]() { - const auto r0c0 = static_cast(slot.color_matrix.matrix_coeff00.Value()); - const auto r0c1 = static_cast(slot.color_matrix.matrix_coeff01.Value()); - const auto r0c2 = static_cast(slot.color_matrix.matrix_coeff02.Value()); - const auto r0c3 = static_cast(slot.color_matrix.matrix_coeff03.Value()); - const auto r1c0 = static_cast(slot.color_matrix.matrix_coeff10.Value()); - const auto r1c1 = static_cast(slot.color_matrix.matrix_coeff11.Value()); - const auto r1c2 = static_cast(slot.color_matrix.matrix_coeff12.Value()); - const auto r1c3 = static_cast(slot.color_matrix.matrix_coeff13.Value()); - const auto r2c0 = static_cast(slot.color_matrix.matrix_coeff20.Value()); - const auto r2c1 = static_cast(slot.color_matrix.matrix_coeff21.Value()); - const auto r2c2 = static_cast(slot.color_matrix.matrix_coeff22.Value()); - const auto r2c3 = static_cast(slot.color_matrix.matrix_coeff23.Value()); - - const auto shift = static_cast(slot.color_matrix.matrix_r_shift.Value()); - const auto clamp_min = static_cast(slot.config.soft_clamp_low.Value()); - const auto clamp_max = static_cast(slot.config.soft_clamp_high.Value()); - - auto MatMul = [&](const Pixel& in_pixel) -> std::tuple { - auto r = static_cast(in_pixel.r); - auto g = static_cast(in_pixel.g); - auto b = static_cast(in_pixel.b); - - r = in_pixel.r * r0c0 + in_pixel.g * r0c1 + in_pixel.b * r0c2; - g = in_pixel.r * r1c0 + in_pixel.g * r1c1 + in_pixel.b * r1c2; - b = in_pixel.r * r2c0 + in_pixel.g * r2c1 + in_pixel.b * r2c2; - - r >>= shift; - g >>= shift; - b >>= shift; - - r += r0c3; - g += r1c3; - b += r2c3; - - r >>= 8; - g >>= 8; - b >>= 8; - - return {r, g, b, static_cast(in_pixel.a)}; - }; - - for (u32 y = source_top; y < source_bottom; y++) { - const auto src{y * in_surface_width + source_left}; - const auto dst{y * out_surface_width + rect_left}; - for (u32 x = source_left; x < source_right; x++) { - auto [r, g, b, a] = MatMul(slot_surface[src + x]); - - r = std::clamp(r, clamp_min, clamp_max); - g = std::clamp(g, clamp_min, clamp_max); - b = std::clamp(b, clamp_min, clamp_max); - a = std::clamp(a, clamp_min, clamp_max); - - output_surface[dst + x] = {static_cast(r), static_cast(g), - static_cast(b), static_cast(a)}; - } - } - }; - -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(); - return; - } - - // Fill the columns, e.g - // c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] - - const auto c0 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff20.Value()), - static_cast(slot.color_matrix.matrix_coeff10.Value()), - static_cast(slot.color_matrix.matrix_coeff00.Value())); - const auto c1 = _mm_set_epi32(0, 
static_cast(slot.color_matrix.matrix_coeff21.Value()), - static_cast(slot.color_matrix.matrix_coeff11.Value()), - static_cast(slot.color_matrix.matrix_coeff01.Value())); - const auto c2 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff22.Value()), - static_cast(slot.color_matrix.matrix_coeff12.Value()), - static_cast(slot.color_matrix.matrix_coeff02.Value())); - const auto c3 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff23.Value()), - static_cast(slot.color_matrix.matrix_coeff13.Value()), - static_cast(slot.color_matrix.matrix_coeff03.Value())); - - // Set the matrix right-shift as a single element. - const auto shift = - _mm_set_epi32(0, 0, 0, static_cast(slot.color_matrix.matrix_r_shift.Value())); - - // Set every 16-bit value to the soft clamp values for clamping every 16-bit channel. - const auto clamp_min = _mm_set1_epi16(static_cast(slot.config.soft_clamp_low.Value())); - const auto clamp_max = - _mm_set1_epi16(static_cast(slot.config.soft_clamp_high.Value())); - - // clang-format off - - auto MatMul = [](__m128i& p, const __m128i& col0, const __m128i& col1, const __m128i& col2, - const __m128i& col3, const __m128i& trm_shift) -> __m128i { - // Duplicate the 32-bit channels, e.g - // p = [AA AA AA AA] [BB BB BB BB] [GG GG GG GG] [RR RR RR RR] - // -> - // r = [RR4 RR4 RR4 RR4] [RR3 RR3 RR3 RR3] [RR2 RR2 RR2 RR2] [RR1 RR1 RR1 RR1] - auto r = _mm_shuffle_epi32(p, 0x0); - auto g = _mm_shuffle_epi32(p, 0x55); - auto b = _mm_shuffle_epi32(p, 0xAA); - - // Multiply the rows and columns c0 * r, c1 * g, c2 * b, e.g - // r = [RR4 RR4 RR4 RR4] [ RR3 RR3 RR3 RR3] [ RR2 RR2 RR2 RR2] [ RR1 RR1 RR1 RR1] - // * - // c0 = [ 00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] - r = _mm_mullo_epi32(r, col0); - g = _mm_mullo_epi32(g, col1); - b = _mm_mullo_epi32(b, col2); - - // Add them all together vertically, such that the 32-bit element - // out[0] = (r[0] * c0[0]) + (g[0] * c1[0]) + (b[0] * c2[0]) - auto out = _mm_add_epi32(_mm_add_epi32(r, g), b); - - // Shift the result by r_shift, as the TRM says - out = _mm_sra_epi32(out, trm_shift); - - // Add the final column. Because the 4x1 matrix has this row as 1, there's no need to - // multiply by it, and as per the TRM this column ignores r_shift, so it's just added - // here after shifting. - out = _mm_add_epi32(out, col3); - - // Shift the result back from S12.8 to integer values - return _mm_srai_epi32(out, 8); - }; - - for (u32 y = source_top; y < source_bottom; y++) { - const auto src{y * in_surface_width + source_left}; - const auto dst{y * out_surface_width + rect_left}; - for (u32 x = source_left; x < source_right; x += 8) { - // clang-format off - // Prefetch the next iteration's memory - _mm_prefetch((const char*)&slot_surface[src + x + 8], _MM_HINT_T0); - - // Load in pixels - // p01 = [AA AA] [BB BB] [GG GG] [RR RR] [AA AA] [BB BB] [GG GG] [RR RR] - auto p01 = _mm_load_si128((__m128i*)&slot_surface[src + x + 0]); - auto p23 = _mm_load_si128((__m128i*)&slot_surface[src + x + 2]); - auto p45 = _mm_load_si128((__m128i*)&slot_surface[src + x + 4]); - auto p67 = _mm_load_si128((__m128i*)&slot_surface[src + x + 6]); - - // Convert the 16-bit channels into 32-bit (unsigned), as the matrix values are - // 32-bit and to avoid overflow. 
- // p01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - // -> - // p01_lo = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] - // p01_hi = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] - auto p01_lo = _mm_cvtepu16_epi32(p01); - auto p01_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p01, 8)); - auto p23_lo = _mm_cvtepu16_epi32(p23); - auto p23_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p23, 8)); - auto p45_lo = _mm_cvtepu16_epi32(p45); - auto p45_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p45, 8)); - auto p67_lo = _mm_cvtepu16_epi32(p67); - auto p67_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p67, 8)); - - // Matrix multiply the pixel, doing the colour conversion. - auto out0 = MatMul(p01_lo, c0, c1, c2, c3, shift); - auto out1 = MatMul(p01_hi, c0, c1, c2, c3, shift); - auto out2 = MatMul(p23_lo, c0, c1, c2, c3, shift); - auto out3 = MatMul(p23_hi, c0, c1, c2, c3, shift); - auto out4 = MatMul(p45_lo, c0, c1, c2, c3, shift); - auto out5 = MatMul(p45_hi, c0, c1, c2, c3, shift); - auto out6 = MatMul(p67_lo, c0, c1, c2, c3, shift); - auto out7 = MatMul(p67_hi, c0, c1, c2, c3, shift); - - // Pack the 32-bit channel pixels back into 16-bit using unsigned saturation - // out0 = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] - // out1 = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] - // -> - // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - auto done0 = _mm_packus_epi32(out0, out1); - auto done1 = _mm_packus_epi32(out2, out3); - auto done2 = _mm_packus_epi32(out4, out5); - auto done3 = _mm_packus_epi32(out6, out7); - - // Blend the original alpha back into the pixel, as the matrix multiply gives us a - // 3-channel output, not 4. - // 0x88 = b10001000, taking RGB from the first argument, A from the second argument. - // done0 = [002 002] [BB2 BB2] [GG2 GG2] [RR2 RR2] [001 001] [BB1 BB1] [GG1 GG1] [RR1 RR1] - // -> - // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - done0 = _mm_blend_epi16(done0, p01, 0x88); - done1 = _mm_blend_epi16(done1, p23, 0x88); - done2 = _mm_blend_epi16(done2, p45, 0x88); - done3 = _mm_blend_epi16(done3, p67, 0x88); - - // Clamp the 16-bit channels to the soft-clamp min/max. - done0 = _mm_max_epu16(done0, clamp_min); - done1 = _mm_max_epu16(done1, clamp_min); - done2 = _mm_max_epu16(done2, clamp_min); - done3 = _mm_max_epu16(done3, clamp_min); - - done0 = _mm_min_epu16(done0, clamp_max); - done1 = _mm_min_epu16(done1, clamp_max); - done2 = _mm_min_epu16(done2, clamp_max); - done3 = _mm_min_epu16(done3, clamp_max); - - // Store the pixels to the output surface. 
- _mm_store_si128((__m128i*)&output_surface[dst + x + 0], done0); - _mm_store_si128((__m128i*)&output_surface[dst + x + 2], done1); - _mm_store_si128((__m128i*)&output_surface[dst + x + 4], done2); - _mm_store_si128((__m128i*)&output_surface[dst + x + 6], done3); - - } - } - // clang-format on -#elif defined(ARCHITECTURE_arm64) - DecodeLinear(); -#else - DecodeLinear(); -#endif + // send pitch linear frame + const size_t linear_size = width * height * 4; + host1x.GMMU().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, + linear_size); } } -void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { - constexpr u32 BytesPerPixel = 1; +void Vic::WriteYUVFrame(std::unique_ptr frame, const VicConfig& config) { + LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); - auto surface_width{output_surface_config.out_surface_width + 1}; - auto surface_height{output_surface_config.out_surface_height + 1}; - const auto surface_stride{surface_width}; + const std::size_t surface_width = config.surface_width_minus1 + 1; + const std::size_t surface_height = config.surface_height_minus1 + 1; + const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL; + // Use the minimum of surface/frame dimensions to avoid buffer overflow. + const auto frame_width = std::min(surface_width, static_cast(frame->GetWidth())); + const auto frame_height = std::min(surface_height, static_cast(frame->GetHeight())); - const auto out_luma_width = output_surface_config.out_luma_width + 1; - const auto out_luma_height = output_surface_config.out_luma_height + 1; - const auto out_luma_stride = Common::AlignUp(out_luma_width * BytesPerPixel, 0x10); - const auto out_luma_size = out_luma_height * out_luma_stride; + const auto stride = static_cast(frame->GetStride(0)); - const auto out_chroma_width = output_surface_config.out_chroma_width + 1; - const auto out_chroma_height = output_surface_config.out_chroma_height + 1; - const auto out_chroma_stride = Common::AlignUp(out_chroma_width * BytesPerPixel * 2, 0x10); - const auto out_chroma_size = out_chroma_height * out_chroma_stride; + luma_buffer.resize_destructive(aligned_width * surface_height); + chroma_buffer.resize_destructive(aligned_width * surface_height / 2); - surface_width = std::min(surface_width, out_luma_width); - surface_height = std::min(surface_height, out_luma_height); + // Populate luma buffer + const u8* luma_src = frame->GetData(0); + for (std::size_t y = 0; y < frame_height; ++y) { + const std::size_t src = y * stride; + const std::size_t dst = y * aligned_width; + std::memcpy(luma_buffer.data() + dst, luma_src + src, frame_width); + } + host1x.GMMU().WriteBlock(output_surface_luma_address, luma_buffer.data(), luma_buffer.size()); - [[maybe_unused]] auto DecodeLinear = [&](std::span out_luma, std::span out_chroma) { - for (u32 y = 0; y < surface_height; ++y) { - const auto src_luma = y * surface_stride; - const auto dst_luma = y * out_luma_stride; - const auto src_chroma = y * surface_stride; - const auto dst_chroma = (y / 2) * out_chroma_stride; - for (u32 x = 0; x < surface_width; x += 2) { - out_luma[dst_luma + x + 0] = - static_cast(output_surface[src_luma + x + 0].r >> 2); - out_luma[dst_luma + x + 1] = - static_cast(output_surface[src_luma + x + 1].r >> 2); - out_chroma[dst_chroma + x + 0] = - static_cast(output_surface[src_chroma + x].g >> 2); - out_chroma[dst_chroma + x + 1] = - static_cast(output_surface[src_chroma + x].b >> 2); + // Chroma + const std::size_t half_height = frame_height / 2; + const auto half_stride = 
static_cast(frame->GetStride(1)); + + switch (frame->GetPixelFormat()) { + case AV_PIX_FMT_YUV420P: { + // Frame from FFmpeg software + // Populate chroma buffer from both channels with interleaving. + const std::size_t half_width = frame_width / 2; + u8* chroma_buffer_data = chroma_buffer.data(); + const u8* chroma_b_src = frame->GetData(1); + const u8* chroma_r_src = frame->GetData(2); + for (std::size_t y = 0; y < half_height; ++y) { + const std::size_t src = y * half_stride; + const std::size_t dst = y * aligned_width; + for (std::size_t x = 0; x < half_width; ++x) { + chroma_buffer_data[dst + x * 2] = chroma_b_src[src + x]; + chroma_buffer_data[dst + x * 2 + 1] = chroma_r_src[src + x]; } } - }; - - auto Decode = [&](std::span out_luma, std::span out_chroma) { -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(out_luma, out_chroma); - return; - } - - // luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF] - const auto luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); - - for (u32 y = 0; y < surface_height; ++y) { - const auto src = y * surface_stride; - const auto dst_luma = y * out_luma_stride; - const auto dst_chroma = (y / 2) * out_chroma_stride; - for (u32 x = 0; x < surface_width; x += 16) { - // clang-format off - // Prefetch the next cache lines, 2 per iteration - _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0); - _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0); - - // Load the 64-bit pixels, 2 per variable. - auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]); - auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]); - auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]); - auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]); - auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]); - auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]); - auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]); - auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]); - - // Split out the luma of each pixel using the luma_mask above. - // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1] - // -> - // l01 = [002 002] [002 002] [002 002] [LL2 LL2] [001 001] [001 001] [001 001] [LL1 LL1] - auto l01 = _mm_and_si128(pixel01, luma_mask); - auto l23 = _mm_and_si128(pixel23, luma_mask); - auto l45 = _mm_and_si128(pixel45, luma_mask); - auto l67 = _mm_and_si128(pixel67, luma_mask); - auto l89 = _mm_and_si128(pixel89, luma_mask); - auto l1011 = _mm_and_si128(pixel1011, luma_mask); - auto l1213 = _mm_and_si128(pixel1213, luma_mask); - auto l1415 = _mm_and_si128(pixel1415, luma_mask); - - // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. - // l01 = [002 002 002 002] [002 002 LL2 LL2] [001 001 001 001] [001 001 LL1 LL1] - // l23 = [004 004 004 004] [004 004 LL4 LL4] [003 003 003 003] [003 003 LL3 LL3] - // -> - // l0123 = [004 004] [LL4 LL4] [003 003] [LL3 LL3] [002 002] [LL2 LL2] [001 001] [LL1 LL1] - auto l0123 = _mm_packus_epi32(l01, l23); - auto l4567 = _mm_packus_epi32(l45, l67); - auto l891011 = _mm_packus_epi32(l89, l1011); - auto l12131415 = _mm_packus_epi32(l1213, l1415); - - // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. 
- // l0123 = [004 004 LL4 LL4] [003 003 LL3 LL3] [002 002 LL2 LL2] [001 001 LL1 LL1] - // l4567 = [008 008 LL8 LL8] [007 007 LL7 LL7] [006 006 LL6 LL6] [005 005 LL5 LL5] - // -> - // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] - auto luma_lo = _mm_packus_epi32(l0123, l4567); - auto luma_hi = _mm_packus_epi32(l891011, l12131415); - - // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read - // and bringing the range back to 8-bit. - luma_lo = _mm_srli_epi16(luma_lo, 2); - luma_hi = _mm_srli_epi16(luma_hi, 2); - - // Pack with unsigned saturation the 16-bit values in 2 registers into 8-bit values in 1 register. - // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] - // luma_hi = [LL16 LL16] [LL15 LL15] [LL14 LL14] [LL13 LL13] [LL12 LL12] [LL11 LL11] [LL10 LL10] [LL9 LL9] - // -> - // luma = [LL16] [LL15] [LL14] [LL13] [LL12] [LL11] [LL10] [LL9] [LL8] [LL7] [LL6] [LL5] [LL4] [LL3] [LL2] [LL1] - auto luma = _mm_packus_epi16(luma_lo, luma_hi); - - // Store the 16 bytes of luma - _mm_store_si128((__m128i*)&out_luma[dst_luma + x], luma); - - if (y % 2 == 0) { - // Chroma, done every other line as it's half the height of luma. - - // Shift the register right by 2 bytes (not bits), to kick out the 16-bit luma. - // We can do this instead of &'ing a mask and then shifting. - // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1] - // -> - // c01 = [ 00 00] [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] - auto c01 = _mm_srli_si128(pixel01, 2); - auto c23 = _mm_srli_si128(pixel23, 2); - auto c45 = _mm_srli_si128(pixel45, 2); - auto c67 = _mm_srli_si128(pixel67, 2); - auto c89 = _mm_srli_si128(pixel89, 2); - auto c1011 = _mm_srli_si128(pixel1011, 2); - auto c1213 = _mm_srli_si128(pixel1213, 2); - auto c1415 = _mm_srli_si128(pixel1415, 2); - - // Interleave the lower 8 bytes as 32-bit elements from 2 registers into 1 register. - // This has the effect of skipping every other chroma value horitonally, - // notice the high pixels UU2/UU4 are skipped. - // This is intended as N420 chroma width is half the luma width. - // c01 = [ 00 00 AA2 AA2] [VV2 VV2 UU2 UU2] [LL2 LL2 AA1 AA1] [VV1 VV1 UU1 UU1] - // c23 = [ 00 00 AA4 AA4] [VV4 VV4 UU4 UU4] [LL4 LL4 AA3 AA3] [VV3 VV3 UU3 UU3] - // -> - // c0123 = [LL4 LL4 AA3 AA3] [LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3] [VV1 VV1 UU1 UU1] - auto c0123 = _mm_unpacklo_epi32(c01, c23); - auto c4567 = _mm_unpacklo_epi32(c45, c67); - auto c891011 = _mm_unpacklo_epi32(c89, c1011); - auto c12131415 = _mm_unpacklo_epi32(c1213, c1415); - - // Interleave the low 64-bit elements from 2 registers into 1. - // c0123 = [LL4 LL4 AA3 AA3 LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1] - // c4567 = [LL8 LL8 AA7 AA7 LL6 LL6 AA5 AA5] [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] - // -> - // chroma_lo = [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1] - auto chroma_lo = _mm_unpacklo_epi64(c0123, c4567); - auto chroma_hi = _mm_unpacklo_epi64(c891011, c12131415); - - // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read - // and bringing the range back to 8-bit. - chroma_lo = _mm_srli_epi16(chroma_lo, 2); - chroma_hi = _mm_srli_epi16(chroma_hi, 2); - - // Pack with unsigned saturation the 16-bit elements from 2 registers into 8-bit elements in 1 register. 
- // chroma_lo = [ VV7 VV7] [ UU7 UU7] [ VV5 VV5] [ UU5 UU5] [ VV3 VV3] [ UU3 UU3] [VV1 VV1] [UU1 UU1] - // chroma_hi = [VV15 VV15] [UU15 UU15] [VV13 VV13] [UU13 UU13] [VV11 VV11] [UU11 UU11] [VV9 VV9] [UU9 UU9] - // -> - // chroma = [VV15] [UU15] [VV13] [UU13] [VV11] [UU11] [VV9] [UU9] [VV7] [UU7] [VV5] [UU5] [VV3] [UU3] [VV1] [UU1] - auto chroma = _mm_packus_epi16(chroma_lo, chroma_hi); - - // Store the 16 bytes of chroma. - _mm_store_si128((__m128i*)&out_chroma[dst_chroma + x + 0], chroma); - } - - // clang-format on - } - } -#elif defined(ARCHITECTURE_arm64) - DecodeLinear(out_luma, out_chroma); -#else - DecodeLinear(out_luma, out_chroma); -#endif - }; - - switch (output_surface_config.out_block_kind) { - case BLK_KIND::GENERIC_16Bx2: { - const u32 block_height = static_cast(output_surface_config.out_block_height); - const auto out_luma_swizzle_size = Texture::CalculateSize( - true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0); - const auto out_chroma_swizzle_size = Texture::CalculateSize( - true, BytesPerPixel * 2, out_chroma_width, out_chroma_height, 1, block_height, 0); - - luma_scratch.resize_destructive(out_luma_size); - chroma_scratch.resize_destructive(out_chroma_size); - - Decode(luma_scratch, chroma_scratch); - - Tegra::Memory::GpuGuestMemoryScoped out_luma( - memory_manager, regs.output_surface.luma.Address(), out_luma_swizzle_size, - &swizzle_scratch); - - SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height); - - Tegra::Memory::GpuGuestMemoryScoped - out_chroma(memory_manager, regs.output_surface.chroma_u.Address(), - out_chroma_swizzle_size, &swizzle_scratch); - - SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride, - out_chroma_height); - } break; - case BLK_KIND::PITCH: { - // Unfortunately due to a driver bug or game bug, the chroma address can be not - // appropriately spaced from the luma, so the luma of size out_stride * height runs into the - // top of the chroma buffer. Unfortunately that removes an optimisation here where we could - // create guest spans and decode into game memory directly to avoid the memory copy from - // scratch to game. Due to this bug, we must write the luma first, and then the chroma - // afterwards to re-overwrite the luma being too large. 
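// A minimal sketch of the overlap described in the comment above (names are
// hypothetical): in a pitch-linear layout the luma write covers
// out_luma_stride * out_luma_height bytes, so an under-spaced chroma address
// would be clobbered unless the chroma plane is written after the luma plane.
#include <cstdint>

inline bool LumaOverlapsChroma(std::uint64_t luma_addr, std::uint64_t chroma_addr,
                               std::uint64_t out_luma_stride,
                               std::uint64_t out_luma_height) {
    const std::uint64_t luma_end = luma_addr + out_luma_stride * out_luma_height;
    return chroma_addr < luma_end; // true when the luma write spills into chroma
}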
- luma_scratch.resize_destructive(out_luma_size); - chroma_scratch.resize_destructive(out_chroma_size); - - Decode(luma_scratch, chroma_scratch); - - memory_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(), - out_luma_size); - memory_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(), - out_chroma_size); - } break; - default: - UNREACHABLE(); break; } -} - -template -void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { - constexpr u32 BytesPerPixel = 4; - - auto surface_width{output_surface_config.out_surface_width + 1}; - auto surface_height{output_surface_config.out_surface_height + 1}; - const auto surface_stride{surface_width}; - - const auto out_luma_width = output_surface_config.out_luma_width + 1; - const auto out_luma_height = output_surface_config.out_luma_height + 1; - const auto out_luma_stride = Common ::AlignUp(out_luma_width * BytesPerPixel, 0x10); - const auto out_luma_size = out_luma_height * out_luma_stride; - - surface_width = std::min(surface_width, out_luma_width); - surface_height = std::min(surface_height, out_luma_height); - - [[maybe_unused]] auto DecodeLinear = [&](std::span out_buffer) { - for (u32 y = 0; y < surface_height; y++) { - const auto src = y * surface_stride; - const auto dst = y * out_luma_stride; - for (u32 x = 0; x < surface_width; x++) { - if constexpr (Format == VideoPixelFormat::A8R8G8B8) { - out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].b >> 2); - out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); - out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].r >> 2); - out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); - } else { - out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].r >> 2); - out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); - out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].b >> 2); - out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); - } - } + case AV_PIX_FMT_NV12: { + // Frame from VA-API hardware + // This is already interleaved so just copy + const u8* chroma_src = frame->GetData(1); + for (std::size_t y = 0; y < half_height; ++y) { + const std::size_t src = y * stride; + const std::size_t dst = y * aligned_width; + std::memcpy(chroma_buffer.data() + dst, chroma_src + src, frame_width); } - }; - - auto Decode = [&](std::span out_buffer) { -#if defined(ARCHITECTURE_x86_64) - if (!has_sse41) { - DecodeLinear(out_buffer); - return; - } - - for (u32 y = 0; y < surface_height; y++) { - const auto src = y * surface_stride; - const auto dst = y * out_luma_stride; - for (u32 x = 0; x < surface_width; x += 16) { - // clang-format off - // Prefetch the next 2 cache lines - _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0); - _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0); - - // Load the pixels, 16-bit channels, 8 bytes per pixel, e.g - // pixel01 = [AA AA BB BB GG GG RR RR AA AA BB BB GG GG RR RR - auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]); - auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]); - auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]); - auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]); - auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]); - auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]); - auto pixel1213 = 
_mm_load_si128((__m128i*)&output_surface[src + x + 12]); - auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]); - - // Right-shift the channels by 16 to un-do the left shit on read and bring the range - // back to 8-bit. - pixel01 = _mm_srli_epi16(pixel01, 2); - pixel23 = _mm_srli_epi16(pixel23, 2); - pixel45 = _mm_srli_epi16(pixel45, 2); - pixel67 = _mm_srli_epi16(pixel67, 2); - pixel89 = _mm_srli_epi16(pixel89, 2); - pixel1011 = _mm_srli_epi16(pixel1011, 2); - pixel1213 = _mm_srli_epi16(pixel1213, 2); - pixel1415 = _mm_srli_epi16(pixel1415, 2); - - // Pack with unsigned saturation 16-bit channels from 2 registers into 8-bit channels in 1 register. - // pixel01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] - // pixel23 = [AA4 AA4] [BB4 BB4] [GG4 GG4] [RR4 RR4] [AA3 AA3] [BB3 BB3] [GG3 GG3] [RR3 RR3] - // -> - // pixels0_lo = [AA4] [BB4] [GG4] [RR4] [AA3] [BB3] [GG3] [RR3] [AA2] [BB2] [GG2] [RR2] [AA1] [BB1] [GG1] [RR1] - auto pixels0_lo = _mm_packus_epi16(pixel01, pixel23); - auto pixels0_hi = _mm_packus_epi16(pixel45, pixel67); - auto pixels1_lo = _mm_packus_epi16(pixel89, pixel1011); - auto pixels1_hi = _mm_packus_epi16(pixel1213, pixel1415); - - if constexpr (Format == VideoPixelFormat::A8R8G8B8) { - const auto shuffle = - _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2); - - // Our pixels are ABGR (big-endian) by default, if ARGB is needed, we need to shuffle. - // pixels0_lo = [AA4 BB4 GG4 RR4] [AA3 BB3 GG3 RR3] [AA2 BB2 GG2 RR2] [AA1 BB1 GG1 RR1] - // -> - // pixels0_lo = [AA4 RR4 GG4 BB4] [AA3 RR3 GG3 BB3] [AA2 RR2 GG2 BB2] [AA1 RR1 GG1 BB1] - pixels0_lo = _mm_shuffle_epi8(pixels0_lo, shuffle); - pixels0_hi = _mm_shuffle_epi8(pixels0_hi, shuffle); - pixels1_lo = _mm_shuffle_epi8(pixels1_lo, shuffle); - pixels1_hi = _mm_shuffle_epi8(pixels1_hi, shuffle); - } - - // Store the pixels - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 0], pixels0_lo); - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 16], pixels0_hi); - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 32], pixels1_lo); - _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 48], pixels1_hi); - - // clang-format on - } - } -#elif defined(ARCHITECTURE_arm64) - DecodeLinear(out_buffer); -#else - DecodeLinear(out_buffer); -#endif - }; - - switch (output_surface_config.out_block_kind) { - case BLK_KIND::GENERIC_16Bx2: { - const u32 block_height = static_cast(output_surface_config.out_block_height); - const auto out_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width, - out_luma_height, 1, block_height, 0); - - luma_scratch.resize_destructive(out_luma_size); - - Decode(luma_scratch); - - Tegra::Memory::GpuGuestMemoryScoped out_luma( - memory_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch); - - SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height); - } break; - case BLK_KIND::PITCH: { - luma_scratch.resize_destructive(out_luma_size); - - Tegra::Memory::GpuGuestMemoryScoped out_luma( - memory_manager, regs.output_surface.luma.Address(), out_luma_size, &luma_scratch); - - Decode(out_luma); - } break; - default: - UNREACHABLE(); break; } + default: + ASSERT(false); + break; + } + host1x.GMMU().WriteBlock(output_surface_chroma_address, chroma_buffer.data(), + chroma_buffer.size()); } -} // namespace Tegra::Host1x +} // namespace Host1x + +} // namespace Tegra diff --git a/src/video_core/host1x/vic.h b/src/video_core/host1x/vic.h index 9d8bc90b8..2de77e71e 
100755 --- a/src/video_core/host1x/vic.h +++ b/src/video_core/host1x/vic.h @@ -3,645 +3,65 @@ #pragma once -#include -#include #include -#include -#include #include "common/common_types.h" #include "common/scratch_buffer.h" -#include "video_core/cdma_pusher.h" -namespace Tegra::Host1x { +struct SwsContext; + +namespace Tegra { + +namespace Host1x { + class Host1x; class Nvdec; +union VicConfig; -struct Pixel { - u16 r; - u16 g; - u16 b; - u16 a; -}; - -// One underscore represents separate pixels. -// Double underscore represents separate planes. -// _N represents chroma subsampling, not a separate pixel. -enum class VideoPixelFormat : u32 { - A8 = 0, - L8 = 1, - A4L4 = 2, - L4A4 = 3, - R8 = 4, - A8L8 = 5, - L8A8 = 6, - R8G8 = 7, - G8R8 = 8, - B5G6R5 = 9, - R5G6B5 = 10, - B6G5R5 = 11, - R5G5B6 = 12, - A1B5G5R5 = 13, - A1R5G5B5 = 14, - B5G5R5A1 = 15, - R5G5B5A1 = 16, - A5B5G5R1 = 17, - A5R1G5B5 = 18, - B5G5R1A5 = 19, - R1G5B5A5 = 20, - X1B5G5R5 = 21, - X1R5G5B5 = 22, - B5G5R5X1 = 23, - R5G5B5X1 = 24, - A4B4G5R4 = 25, - A4R4G4B4 = 26, - B4G4R4A4 = 27, - R4G4B4A4 = 28, - B8G8R8 = 29, - R8G8B8 = 30, - A8B8G8R8 = 31, - A8R8G8B8 = 32, - B8G8R8A8 = 33, - R8G8B8A8 = 34, - X8B8G8R8 = 35, - X8R8G8B8 = 36, - B8G8R8X8 = 37, - R8G8B8X8 = 38, - A8B10G10R10 = 39, - A2R10G10B10 = 40, - B10G10R10A2 = 41, - R10G10B10A2 = 42, - A4P4 = 43, - P4A4 = 44, - P8A8 = 45, - A8P8 = 46, - P8 = 47, - P1 = 48, - U8V8 = 49, - V8U8 = 50, - A8Y8U8V8 = 51, - V8U8Y8A8 = 52, - Y8U8V8 = 53, - Y8V8U8 = 54, - U8V8Y8 = 55, - V8U8Y8 = 56, - Y8U8_Y8V8 = 57, - Y8V8_Y8U8 = 58, - U8Y8_V8Y8 = 59, - V8Y8_U8Y8 = 60, - Y8__U8V8_N444 = 61, - Y8__V8U8_N444 = 62, - Y8__U8V8_N422 = 63, - Y8__V8U8_N422 = 64, - Y8__U8V8_N422R = 65, - Y8__V8U8_N422R = 66, - Y8__U8V8_N420 = 67, - Y8__V8U8_N420 = 68, - Y8__U8__V8_N444 = 69, - Y8__U8__V8_N422 = 70, - Y8__U8__V8_N422R = 71, - Y8__U8__V8_N420 = 72, - U8 = 73, - V8 = 74, -}; - -struct Offset { - constexpr u32 Address() const noexcept { - return offset << 8; - } - -private: - u32 offset; -}; -static_assert(std::is_trivial_v, "Offset must be trivial"); -static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!"); - -struct PlaneOffsets { - Offset luma; - Offset chroma_u; - Offset chroma_v; -}; -static_assert(sizeof(PlaneOffsets) == 0xC, "PlaneOffsets has the wrong size!"); - -enum SurfaceIndex : u32 { - Current = 0, - Previous = 1, - Next = 2, - NextNoiseReduced = 3, - CurrentMotion = 4, - PreviousMotion = 5, - PreviousPreviousMotion = 6, - CombinedMotion = 7, -}; - -enum class DXVAHD_ALPHA_FILL_MODE : u32 { - OPAQUE = 0, - BACKGROUND = 1, - DESTINATION = 2, - SOURCE_STREAM = 3, - COMPOSITED = 4, - SOURCE_ALPHA = 5, -}; - -enum class DXVAHD_FRAME_FORMAT : u64 { - PROGRESSIVE = 0, - INTERLACED_TOP_FIELD_FIRST = 1, - INTERLACED_BOTTOM_FIELD_FIRST = 2, - TOP_FIELD = 3, - BOTTOM_FIELD = 4, - SUBPIC_PROGRESSIVE = 5, - SUBPIC_INTERLACED_TOP_FIELD_FIRST = 6, - SUBPIC_INTERLACED_BOTTOM_FIELD_FIRST = 7, - SUBPIC_TOP_FIELD = 8, - SUBPIC_BOTTOM_FIELD = 9, - TOP_FIELD_CHROMA_BOTTOM = 10, - BOTTOM_FIELD_CHROMA_TOP = 11, - SUBPIC_TOP_FIELD_CHROMA_BOTTOM = 12, - SUBPIC_BOTTOM_FIELD_CHROMA_TOP = 13, -}; - -enum class DXVAHD_DEINTERLACE_MODE_PRIVATE : u64 { - WEAVE = 0, - BOB_FIELD = 1, - BOB = 2, - NEWBOB = 3, - DISI1 = 4, - WEAVE_LUMA_BOB_FIELD_CHROMA = 5, - MAX = 0xF, -}; - -enum class BLK_KIND { - PITCH = 0, - GENERIC_16Bx2 = 1, - // These are unsupported in the vic - BL_NAIVE = 2, - BL_KEPLER_XBAR_RAW = 3, - VP2_TILED = 15, -}; - -enum class BLEND_SRCFACTC : u32 { - K1 = 0, - K1_TIMES_DST = 1, - 
NEG_K1_TIMES_DST = 2, - K1_TIMES_SRC = 3, - ZERO = 4, -}; - -enum class BLEND_DSTFACTC : u32 { - K1 = 0, - K2 = 1, - K1_TIMES_DST = 2, - NEG_K1_TIMES_DST = 3, - NEG_K1_TIMES_SRC = 4, - ZERO = 5, - ONE = 6, -}; - -enum class BLEND_SRCFACTA : u32 { - K1 = 0, - K2 = 1, - NEG_K1_TIMES_DST = 2, - ZERO = 3, - MAX = 7, -}; - -enum class BLEND_DSTFACTA : u32 { - K2 = 0, - NEG_K1_TIMES_SRC = 1, - ZERO = 2, - ONE = 3, - MAX = 7, -}; - -struct PipeConfig { - union { - BitField<0, 11, u32> downsample_horiz; - BitField<11, 5, u32> reserved0; - BitField<16, 11, u32> downsample_vert; - BitField<27, 5, u32> reserved1; - }; - u32 reserved2; - u32 reserved3; - u32 reserved4; -}; -static_assert(sizeof(PipeConfig) == 0x10, "PipeConfig has the wrong size!"); - -struct OutputConfig { - union { - BitField<0, 3, DXVAHD_ALPHA_FILL_MODE> alpha_fill_mode; - BitField<3, 3, u64> alpha_fill_slot; - BitField<6, 10, u64> background_a; - BitField<16, 10, u64> background_r; - BitField<26, 10, u64> background_g; - BitField<36, 10, u64> background_b; - BitField<46, 2, u64> regamma_mode; - BitField<48, 1, u64> output_flip_x; - BitField<49, 1, u64> output_flip_y; - BitField<50, 1, u64> output_transpose; - BitField<51, 1, u64> reserved1; - BitField<52, 12, u64> reserved2; - }; - union { - BitField<0, 14, u32> target_rect_left; - BitField<14, 2, u32> reserved3; - BitField<16, 14, u32> target_rect_right; - BitField<30, 2, u32> reserved4; - }; - union { - BitField<0, 14, u32> target_rect_top; - BitField<14, 2, u32> reserved5; - BitField<16, 14, u32> target_rect_bottom; - BitField<30, 2, u32> reserved6; - }; -}; -static_assert(sizeof(OutputConfig) == 0x10, "OutputConfig has the wrong size!"); - -struct OutputSurfaceConfig { - union { - BitField<0, 7, VideoPixelFormat> out_pixel_format; - BitField<7, 2, u32> out_chroma_loc_horiz; - BitField<9, 2, u32> out_chroma_loc_vert; - BitField<11, 4, BLK_KIND> out_block_kind; - BitField<15, 4, u32> out_block_height; // in gobs, log2 - BitField<19, 3, u32> reserved0; - BitField<22, 10, u32> reserved1; - }; - union { - BitField<0, 14, u32> out_surface_width; // - 1 - BitField<14, 14, u32> out_surface_height; // - 1 - BitField<28, 4, u32> reserved2; - }; - union { - BitField<0, 14, u32> out_luma_width; // - 1 - BitField<14, 14, u32> out_luma_height; // - 1 - BitField<28, 4, u32> reserved3; - }; - union { - BitField<0, 14, u32> out_chroma_width; // - 1 - BitField<14, 14, u32> out_chroma_height; // - 1 - BitField<28, 4, u32> reserved4; - }; -}; -static_assert(sizeof(OutputSurfaceConfig) == 0x10, "OutputSurfaceConfig has the wrong size!"); - -struct MatrixStruct { - union { - BitField<0, 20, s64> matrix_coeff00; // (0,0) of 4x3 conversion matrix - BitField<20, 20, s64> matrix_coeff10; // (1,0) of 4x3 conversion matrix - BitField<40, 20, s64> matrix_coeff20; // (2,0) of 4x3 conversion matrix - BitField<60, 4, u64> matrix_r_shift; - }; - union { - BitField<0, 20, s64> matrix_coeff01; // (0,1) of 4x3 conversion matrix - BitField<20, 20, s64> matrix_coeff11; // (1,1) of 4x3 conversion matrix - BitField<40, 20, s64> matrix_coeff21; // (2,1) of 4x3 conversion matrix - BitField<60, 3, u64> reserved0; - BitField<63, 1, u64> matrix_enable; - }; - union { - BitField<0, 20, s64> matrix_coeff02; // (0,2) of 4x3 conversion matrix - BitField<20, 20, s64> matrix_coeff12; // (1,2) of 4x3 conversion matrix - BitField<40, 20, s64> matrix_coeff22; // (2,2) of 4x3 conversion matrix - BitField<60, 4, u64> reserved1; - }; - union { - BitField<0, 20, s64> matrix_coeff03; // (0,3) of 4x3 conversion matrix - BitField<20, 
20, s64> matrix_coeff13; // (1,3) of 4x3 conversion matrix - BitField<40, 20, s64> matrix_coeff23; // (2,3) of 4x3 conversion matrix - BitField<60, 4, u64> reserved2; - }; -}; -static_assert(sizeof(MatrixStruct) == 0x20, "MatrixStruct has the wrong size!"); - -struct ClearRectStruct { - union { - BitField<0, 14, u32> clear_rect0_left; - BitField<14, 2, u32> reserved0; - BitField<16, 14, u32> clear_rect0_right; - BitField<30, 2, u32> reserved1; - }; - union { - BitField<0, 14, u32> clear_rect0_top; - BitField<14, 2, u32> reserved2; - BitField<16, 14, u32> clear_rect0_bottom; - BitField<30, 2, u32> reserved3; - }; - union { - BitField<0, 14, u32> clear_rect1_left; - BitField<14, 2, u32> reserved4; - BitField<16, 14, u32> clear_rect1_right; - BitField<30, 2, u32> reserved5; - }; - union { - BitField<0, 14, u32> clear_rect1_top; - BitField<14, 2, u32> reserved6; - BitField<16, 14, u32> clear_rect1_bottom; - BitField<30, 2, u32> reserved7; - }; -}; -static_assert(sizeof(ClearRectStruct) == 0x10, "ClearRectStruct has the wrong size!"); - -struct SlotConfig { - union { - BitField<0, 1, u64> slot_enable; - BitField<1, 1, u64> denoise; - BitField<2, 1, u64> advanced_denoise; - BitField<3, 1, u64> cadence_detect; - BitField<4, 1, u64> motion_map; - BitField<5, 1, u64> motion_map_capture; - BitField<6, 1, u64> is_even; - BitField<7, 1, u64> chroma_even; - // fetch control struct - BitField<8, 1, u64> current_field_enable; - BitField<9, 1, u64> prev_field_enable; - BitField<10, 1, u64> next_field_enable; - BitField<11, 1, u64> next_nr_field_enable; // noise reduction - BitField<12, 1, u64> current_motion_field_enable; - BitField<13, 1, u64> prev_motion_field_enable; - BitField<14, 1, u64> prev_prev_motion_field_enable; - BitField<15, 1, u64> combined_motion_field_enable; - - BitField<16, 4, DXVAHD_FRAME_FORMAT> frame_format; - BitField<20, 2, u64> filter_length_y; // 0: 1-tap, 1: 2-tap, 2: 5-tap, 3: 10-tap - BitField<22, 2, u64> filter_length_x; - BitField<24, 12, u64> panoramic; - BitField<36, 22, u64> reserved1; - BitField<58, 6, u64> detail_filter_clamp; - }; - union { - BitField<0, 10, u64> filter_noise; - BitField<10, 10, u64> filter_detail; - BitField<20, 10, u64> chroma_noise; - BitField<30, 10, u64> chroma_detail; - BitField<40, 4, DXVAHD_DEINTERLACE_MODE_PRIVATE> deinterlace_mode; - BitField<44, 3, u64> motion_accumulation_weight; - BitField<47, 11, u64> noise_iir; - BitField<58, 4, u64> light_level; - BitField<62, 2, u64> reserved4; - }; - union { - BitField<0, 10, u64> soft_clamp_low; - BitField<10, 10, u64> soft_clamp_high; - BitField<20, 3, u64> reserved5; - BitField<23, 9, u64> reserved6; - BitField<32, 10, u64> planar_alpha; - BitField<42, 1, u64> constant_alpha; - BitField<43, 3, u64> stereo_interleave; - BitField<46, 1, u64> clip_enabled; - BitField<47, 8, u64> clear_rect_mask; - BitField<55, 2, u64> degamma_mode; - BitField<57, 1, u64> reserved7; - BitField<58, 1, u64> decompress_enable; - BitField<59, 5, u64> reserved9; - }; - union { - BitField<0, 8, u64> decompress_ctb_count; - BitField<8, 32, u64> decompress_zbc_count; - BitField<40, 24, u64> reserved12; - }; - union { - BitField<0, 30, u64> source_rect_left; - BitField<30, 2, u64> reserved14; - BitField<32, 30, u64> source_rect_right; - BitField<62, 2, u64> reserved15; - }; - union { - BitField<0, 30, u64> source_rect_top; - BitField<30, 2, u64> reserved16; - BitField<32, 30, u64> source_rect_bottom; - BitField<62, 2, u64> reserved17; - }; - union { - BitField<0, 14, u64> dest_rect_left; - BitField<14, 2, u64> reserved18; - 
BitField<16, 14, u64> dest_rect_right; - BitField<30, 2, u64> reserved19; - BitField<32, 14, u64> dest_rect_top; - BitField<46, 2, u64> reserved20; - BitField<48, 14, u64> dest_rect_bottom; - BitField<62, 2, u64> reserved21; - }; - u32 reserved22; - u32 reserved23; -}; -static_assert(sizeof(SlotConfig) == 0x40, "SlotConfig has the wrong size!"); - -struct SlotSurfaceConfig { - union { - BitField<0, 7, VideoPixelFormat> slot_pixel_format; - BitField<7, 2, u32> slot_chroma_loc_horiz; - BitField<9, 2, u32> slot_chroma_loc_vert; - BitField<11, 4, u32> slot_block_kind; - BitField<15, 4, u32> slot_block_height; - BitField<19, 3, u32> slot_cache_width; - BitField<22, 10, u32> reserved0; - }; - union { - BitField<0, 14, u32> slot_surface_width; // - 1 - BitField<14, 14, u32> slot_surface_height; // - 1 - BitField<28, 4, u32> reserved1; - }; - union { - BitField<0, 14, u32> slot_luma_width; // padded, - 1 - BitField<14, 14, u32> slot_luma_height; // padded, - 1 - BitField<28, 4, u32> reserved2; - }; - union { - BitField<0, 14, u32> slot_chroma_width; // padded, - 1 - BitField<14, 14, u32> slot_chroma_height; // padded, - 1 - BitField<28, 4, u32> reserved3; - }; -}; -static_assert(sizeof(SlotSurfaceConfig) == 0x10, "SlotSurfaceConfig has the wrong size!"); - -struct LumaKeyStruct { - union { - BitField<0, 20, u64> luma_coeff0; // (0) of 4x1 conversion matrix, S12.8 format - BitField<20, 20, u64> luma_coeff1; // (1) of 4x1 conversion matrix, S12.8 format - BitField<40, 20, u64> luma_coeff2; // (2) of 4x1 conversion matrix, S12.8 format - BitField<60, 4, u64> luma_r_shift; - }; - union { - BitField<0, 20, u64> luma_coeff3; // (3) of 4x1 conversion matrix, S12.8 format - BitField<20, 10, u64> luma_key_lower; - BitField<30, 10, u64> luma_key_upper; - BitField<40, 1, u64> luma_key_enabled; - BitField<41, 2, u64> reserved0; - BitField<43, 21, u64> reserved1; - }; -}; -static_assert(sizeof(LumaKeyStruct) == 0x10, "LumaKeyStruct has the wrong size!"); - -struct BlendingSlotStruct { - union { - BitField<0, 10, u32> alpha_k1; - BitField<10, 6, u32> reserved0; - BitField<16, 10, u32> alpha_k2; - BitField<26, 6, u32> reserved1; - }; - union { - BitField<0, 3, BLEND_SRCFACTC> src_factor_color_match_select; - BitField<3, 1, u32> reserved2; - BitField<4, 3, BLEND_DSTFACTC> dst_factor_color_match_select; - BitField<7, 1, u32> reserved3; - BitField<8, 3, BLEND_SRCFACTA> src_factor_a_match_select; - BitField<11, 1, u32> reserved4; - BitField<12, 3, BLEND_DSTFACTA> dst_factor_a_match_select; - BitField<15, 1, u32> reserved5; - BitField<16, 4, u32> reserved6; - BitField<20, 4, u32> reserved7; - BitField<24, 4, u32> reserved8; - BitField<28, 4, u32> reserved9; - }; - union { - BitField<0, 2, u32> reserved10; - BitField<2, 10, u32> override_r; - BitField<12, 10, u32> override_g; - BitField<22, 10, u32> override_b; - }; - union { - BitField<0, 10, u32> override_a; - BitField<10, 2, u32> reserved11; - BitField<12, 1, u32> use_override_r; - BitField<13, 1, u32> use_override_g; - BitField<14, 1, u32> use_override_b; - BitField<15, 1, u32> use_override_a; - BitField<16, 1, u32> mask_r; - BitField<17, 1, u32> mask_g; - BitField<18, 1, u32> mask_b; - BitField<19, 1, u32> mask_a; - BitField<20, 12, u32> reserved12; - }; -}; -static_assert(sizeof(BlendingSlotStruct) == 0x10, "BlendingSlotStruct has the wrong size!"); - -struct SlotStruct { - SlotConfig config; - SlotSurfaceConfig surface_config; - LumaKeyStruct luma_key; - MatrixStruct color_matrix; - MatrixStruct gamut_matrix; - BlendingSlotStruct blending; -}; 
-static_assert(sizeof(SlotStruct) == 0xB0, "SlotStruct has the wrong size!"); - -struct ConfigStruct { - PipeConfig pipe_config; - OutputConfig output_config; - OutputSurfaceConfig output_surface_config; - MatrixStruct out_color_matrix; - std::array clear_rects; - std::array slot_structs; -}; -static_assert(offsetof(ConfigStruct, pipe_config) == 0x0, "pipe_config is in the wrong place!"); -static_assert(offsetof(ConfigStruct, output_config) == 0x10, - "output_config is in the wrong place!"); -static_assert(offsetof(ConfigStruct, output_surface_config) == 0x20, - "output_surface_config is in the wrong place!"); -static_assert(offsetof(ConfigStruct, out_color_matrix) == 0x30, - "out_color_matrix is in the wrong place!"); -static_assert(offsetof(ConfigStruct, clear_rects) == 0x50, "clear_rects is in the wrong place!"); -static_assert(offsetof(ConfigStruct, slot_structs) == 0x90, "slot_structs is in the wrong place!"); -static_assert(sizeof(ConfigStruct) == 0x610, "ConfigStruct has the wrong size!"); - -struct VicRegisters { - static constexpr std::size_t NUM_REGS = 0x446; - - union { - struct { - INSERT_PADDING_WORDS_NOINIT(0xC0); - u32 execute; - INSERT_PADDING_WORDS_NOINIT(0x3F); - std::array, 8> surfaces; - u32 picture_index; - u32 control_params; - Offset config_struct_offset; - Offset filter_struct_offset; - Offset palette_offset; - Offset hist_offset; - u32 context_id; - u32 fce_ucode_size; - PlaneOffsets output_surface; - Offset fce_ucode_offset; - INSERT_PADDING_WORDS_NOINIT(0x4); - std::array slot_context_ids; - std::array comp_tag_buffer_offsets; - std::array history_buffer_offset; - INSERT_PADDING_WORDS_NOINIT(0x25D); - u32 pm_trigger_end; - }; - std::array reg_array; - }; -}; -static_assert(offsetof(VicRegisters, execute) == 0x300, "execute is in the wrong place!"); -static_assert(offsetof(VicRegisters, surfaces) == 0x400, "surfaces is in the wrong place!"); -static_assert(offsetof(VicRegisters, picture_index) == 0x700, - "picture_index is in the wrong place!"); -static_assert(offsetof(VicRegisters, control_params) == 0x704, - "control_params is in the wrong place!"); -static_assert(offsetof(VicRegisters, config_struct_offset) == 0x708, - "config_struct_offset is in the wrong place!"); -static_assert(offsetof(VicRegisters, output_surface) == 0x720, - "output_surface is in the wrong place!"); -static_assert(offsetof(VicRegisters, slot_context_ids) == 0x740, - "slot_context_ids is in the wrong place!"); -static_assert(offsetof(VicRegisters, history_buffer_offset) == 0x780, - "history_buffer_offset is in the wrong place!"); -static_assert(offsetof(VicRegisters, pm_trigger_end) == 0x1114, - "pm_trigger_end is in the wrong place!"); -static_assert(sizeof(VicRegisters) == 0x1118, "VicRegisters has the wrong size!"); - -class Vic final : public CDmaPusher { +class Vic { public: enum class Method : u32 { - Execute = offsetof(VicRegisters, execute), - SetControlParams = offsetof(VicRegisters, control_params), - SetConfigStructOffset = offsetof(VicRegisters, config_struct_offset), - SetOutputSurfaceLumaOffset = offsetof(VicRegisters, output_surface.luma), - SetOutputSurfaceChromaOffset = offsetof(VicRegisters, output_surface.chroma_u), - SetOutputSurfaceChromaUnusedOffset = offsetof(VicRegisters, output_surface.chroma_v) + Execute = 0xc0, + SetControlParams = 0x1c1, + SetConfigStructOffset = 0x1c2, + SetOutputSurfaceLumaOffset = 0x1c8, + SetOutputSurfaceChromaOffset = 0x1c9, + SetOutputSurfaceChromaUnusedOffset = 0x1ca }; - explicit Vic(Host1x& host1x, s32 id, Nvdec& nvdec_processor, u32 
syncpt); + explicit Vic(Host1x& host1x, std::shared_ptr nvdec_processor); + ~Vic(); /// Write to the device state. - void ProcessMethod(u32 method, u32 arg) override; + void ProcessMethod(Method method, u32 argument); private: void Execute(); - void Blend(const ConfigStruct& config, const SlotStruct& slot); + void WriteRGBFrame(std::unique_ptr frame, const VicConfig& config); - template - void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame); - template - void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame); + void WriteYUVFrame(std::unique_ptr frame, const VicConfig& config); - template - void ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, - std::shared_ptr frame); + Host1x& host1x; + std::shared_ptr nvdec_processor; - void WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config); + /// Avoid reallocation of the following buffers every frame, as their + /// size does not change during a stream + using AVMallocPtr = std::unique_ptr; + AVMallocPtr converted_frame_buffer; + Common::ScratchBuffer luma_buffer; + Common::ScratchBuffer chroma_buffer; - template - void WriteABGR(const OutputSurfaceConfig& output_surface_config); + GPUVAddr config_struct_address{}; + GPUVAddr output_surface_luma_address{}; + GPUVAddr output_surface_chroma_address{}; - Nvdec& nvdec_processor; - s32 id; - u32 syncpoint; - - VicRegisters regs{}; - - const bool has_sse41{false}; - - Common::ScratchBuffer output_surface; - Common::ScratchBuffer slot_surface; - Common::ScratchBuffer luma_scratch; - Common::ScratchBuffer chroma_scratch; - Common::ScratchBuffer swizzle_scratch; + SwsContext* scaler_ctx{}; + s32 scaler_width{}; + s32 scaler_height{}; }; -} // namespace Tegra::Host1x +} // namespace Host1x + +} // namespace Tegra diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 9b312b112..1b31cdcee 100755 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -42,8 +42,6 @@ public: u64 page_bits_ = 12); ~MemoryManager(); - static constexpr bool HAS_FLUSH_INVALIDATION = true; - size_t GetID() const { return unique_identifier; }
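As a minimal sketch of the YUV420P path restored earlier in this patch's WriteYUVFrame (buffer names, strides, and the helper function below are illustrative assumptions): the two planar chroma planes of a software-decoded frame are interleaved into the single UV plane of the semi-planar layout the guest surface uses, while NV12 frames from hardware decoders can be copied row by row as-is.

#include <cstddef>
#include <cstdint>

// Interleave separate U and V planes (YUV420P) into one UV plane (NV12-style).
void InterleaveChroma(const std::uint8_t* u_plane, const std::uint8_t* v_plane,
                      std::size_t half_width, std::size_t half_height,
                      std::size_t src_stride, std::size_t dst_stride,
                      std::uint8_t* dst_uv) {
    for (std::size_t y = 0; y < half_height; ++y) {
        const std::uint8_t* u = u_plane + y * src_stride;
        const std::uint8_t* v = v_plane + y * src_stride;
        std::uint8_t* out = dst_uv + y * dst_stride;
        for (std::size_t x = 0; x < half_width; ++x) {
            out[x * 2 + 0] = u[x]; // Cb/U sample
            out[x * 2 + 1] = v[x]; // Cr/V sample
        }
    }
}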