From 92d13687fef302927d0f2b0826b69ba91df9d18e Mon Sep 17 00:00:00 2001
From: pineappleEA
Date: Sun, 1 Aug 2021 02:08:36 +0200
Subject: [PATCH] early-access version 1952

---
 README.md                                     |   2 +-
 .../service/nvdrv/devices/nvdisp_disp0.cpp    |   9 +-
 .../hle/service/nvdrv/devices/nvdisp_disp0.h  |   2 +-
 .../hle/service/nvflinger/buffer_queue.cpp    |   4 +
 src/core/hle/service/nvflinger/buffer_queue.h |   1 +
 src/core/hle/service/nvflinger/nvflinger.cpp  |  30 ++--
 src/core/hle/service/nvflinger/nvflinger.h    |   2 +
 src/core/hle/service/vi/vi.cpp                |   1 +
 src/video_core/gpu.cpp                        | 133 ++++++++++++++++--
 src/video_core/gpu.h                          |  83 +++++++++--
 src/video_core/gpu_thread.cpp                 |   2 +-
 .../renderer_opengl/gl_rasterizer.cpp         |   5 +-
 .../renderer_vulkan/vk_rasterizer.cpp         |   1 +
 13 files changed, 223 insertions(+), 52 deletions(-)

diff --git a/README.md b/README.md
index 7510f52bc..a100a91e4 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 yuzu emulator early access
 =============
 
-This is the source code for early-access 1951.
+This is the source code for early-access 1952.
 
 ## Legal Notice
 
diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
index ce6065db2..e5eb397c0 100755
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
@@ -5,10 +5,8 @@
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/core.h"
-#include "core/core_timing.h"
 #include "core/hle/service/nvdrv/devices/nvdisp_disp0.h"
 #include "core/hle/service/nvdrv/devices/nvmap.h"
-#include "core/perf_stats.h"
 #include "video_core/gpu.h"
 #include "video_core/renderer_base.h"
 
@@ -41,7 +39,7 @@ void nvdisp_disp0::OnClose(DeviceFD fd) {}
 
 void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u32 height,
                         u32 stride, NVFlinger::BufferQueue::BufferTransformFlags transform,
-                        const Common::Rectangle<int>& crop_rect) {
+                        const Common::Rectangle<int>& crop_rect, const MultiFence& fences) {
     VAddr addr = nvmap_dev->GetObjectAddress(buffer_handle);
     LOG_TRACE(Service,
               "Drawing from address {:X} offset {:08X} Width {} Height {} Stride {} Format {}",
@@ -52,10 +50,7 @@ void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u32 height,
         addr, offset, width, height, stride, static_cast<Tegra::FramebufferConfig::PixelFormat>(format), transform, crop_rect};
 
-    system.GetPerfStats().EndSystemFrame();
-    system.GPU().SwapBuffers(&framebuffer);
-    system.SpeedLimiter().DoSpeedLimiting(system.CoreTiming().GetGlobalTimeUs());
-    system.GetPerfStats().BeginSystemFrame();
+    system.GPU().QueueFrame(&framebuffer, fences);
 }
 
 } // namespace Service::Nvidia::Devices
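Note: with the flip() change above, nvdisp_disp0 no longer paces the frame itself (the EndSystemFrame/DoSpeedLimiting/BeginSystemFrame sequence moves into NVFlinger::Compose, later in this patch). Instead it hands the framebuffer and the buffer's fences to GPU::QueueFrame(), which may only present once every fence has been reached. A minimal sketch of that gating test; the Fence type and syncpoint table here are hypothetical stand-ins, not the patch's exact types:

    #include <array>
    #include <atomic>
    #include <cstdint>
    #include <vector>

    struct Fence {
        std::uint32_t id;    // syncpoint index
        std::uint32_t value; // threshold at which the buffer is free to present
    };

    std::array<std::atomic<std::uint32_t>, 192> syncpoints{};

    // A frame can be presented right away only if no fence is still pending.
    bool CanPresentNow(const std::vector<Fence>& fences) {
        for (const Fence& fence : fences) {
            if (syncpoints[fence.id].load(std::memory_order_acquire) < fence.value) {
                return false; // GPU work producing this buffer has not finished yet
            }
        }
        return true;
    }

Rather than polling this predicate, the gpu.cpp changes further down register per-syncpoint triggers so the decision is re-evaluated exactly when a syncpoint advances.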
diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
index de01e1d5f..a5bf12a12 100755
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
@@ -33,7 +33,7 @@ public:
     /// Performs a screen flip, drawing the buffer pointed to by the handle.
     void flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u32 height, u32 stride,
               NVFlinger::BufferQueue::BufferTransformFlags transform,
-              const Common::Rectangle<int>& crop_rect);
+              const Common::Rectangle<int>& crop_rect, const MultiFence& fence);
 
 private:
     std::shared_ptr<nvmap> nvmap_dev;
diff --git a/src/core/hle/service/nvflinger/buffer_queue.cpp b/src/core/hle/service/nvflinger/buffer_queue.cpp
index 59ddf6298..94f027bd3 100755
--- a/src/core/hle/service/nvflinger/buffer_queue.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue.cpp
@@ -88,6 +88,10 @@ const IGBPBuffer& BufferQueue::RequestBuffer(u32 slot) const {
     return buffers[slot].igbp_buffer;
 }
 
+const BufferQueue::Buffer& BufferQueue::AccessBuffer(u32 slot) const {
+    return buffers[slot];
+}
+
 void BufferQueue::QueueBuffer(u32 slot, BufferTransformFlags transform,
                               const Common::Rectangle<int>& crop_rect, u32 swap_interval,
                               Service::Nvidia::MultiFence& multi_fence) {
diff --git a/src/core/hle/service/nvflinger/buffer_queue.h b/src/core/hle/service/nvflinger/buffer_queue.h
index 61e337ac5..088e13446 100755
--- a/src/core/hle/service/nvflinger/buffer_queue.h
+++ b/src/core/hle/service/nvflinger/buffer_queue.h
@@ -107,6 +107,7 @@ public:
     void Connect();
     void Disconnect();
     u32 Query(QueryType type);
+    const Buffer& AccessBuffer(u32 slot) const;
 
     u32 GetId() const {
         return id;
diff --git a/src/core/hle/service/nvflinger/nvflinger.cpp b/src/core/hle/service/nvflinger/nvflinger.cpp
index 941748970..115f14d35 100755
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -274,8 +274,6 @@ void NVFlinger::Compose() {
             continue;
         }
 
-        const auto& igbp_buffer = buffer->get().igbp_buffer;
-
         if (!system.IsPoweredOn()) {
             return; // We are likely shutting down
         }
@@ -289,23 +287,31 @@ void NVFlinger::Compose() {
         }
 
         guard->lock();
+        system.GetPerfStats().EndSystemFrame();
         MicroProfileFlip();
-
-        // Now send the buffer to the GPU for drawing.
-        // TODO(Subv): Support more than just disp0. The display device selection is probably based
-        // on which display we're drawing (Default, Internal, External, etc)
-        auto nvdisp = nvdrv->GetDevice<Nvidia::Devices::nvdisp_disp0>("/dev/nvdisp_disp0");
-        ASSERT(nvdisp);
-
-        nvdisp->flip(igbp_buffer.gpu_buffer_id, igbp_buffer.offset, igbp_buffer.format,
-                     igbp_buffer.width, igbp_buffer.height, igbp_buffer.stride,
-                     buffer->get().transform, buffer->get().crop_rect);
+        system.SpeedLimiter().DoSpeedLimiting(system.CoreTiming().GetGlobalTimeUs());
+        system.GetPerfStats().BeginSystemFrame();
 
         swap_interval = buffer->get().swap_interval;
         buffer_queue.ReleaseBuffer(buffer->get().slot);
     }
 }
 
+void NVFlinger::PrequeueFrame(u32 buffer_queue_id, u32 slot) {
+    auto& buffer_queue = *FindBufferQueue(buffer_queue_id);
+    const auto& buffer = buffer_queue.AccessBuffer(slot);
+    const auto& igbp_buffer = buffer.igbp_buffer;
+
+    // Now send the buffer to the GPU for drawing.
+    // TODO(Subv): Support more than just disp0. The display device selection is probably based
+    // on which display we're drawing (Default, Internal, External, etc)
+    auto nvdisp = nvdrv->GetDevice<Nvidia::Devices::nvdisp_disp0>("/dev/nvdisp_disp0");
+    ASSERT(nvdisp);
+
+    nvdisp->flip(igbp_buffer.gpu_buffer_id, igbp_buffer.offset, igbp_buffer.format,
+                 igbp_buffer.width, igbp_buffer.height, igbp_buffer.stride, buffer.transform,
+                 buffer.crop_rect, buffer.multi_fence);
+}
+
 s64 NVFlinger::GetNextTicks() const {
     static constexpr s64 max_hertz = 120LL;
diff --git a/src/core/hle/service/nvflinger/nvflinger.h b/src/core/hle/service/nvflinger/nvflinger.h
index d80fd07ef..af6771928 100755
--- a/src/core/hle/service/nvflinger/nvflinger.h
+++ b/src/core/hle/service/nvflinger/nvflinger.h
@@ -77,6 +77,8 @@ public:
     /// Obtains a buffer queue identified by the ID.
     [[nodiscard]] BufferQueue* FindBufferQueue(u32 id);
 
+    void PrequeueFrame(u32 buffer_queue_id, u32 slot);
+
     /// Performs a composition request to the emulated nvidia GPU and triggers the vsync events when
     /// finished.
     void Compose();
diff --git a/src/core/hle/service/vi/vi.cpp b/src/core/hle/service/vi/vi.cpp
index 3e5949d52..c2b6a97ec 100755
--- a/src/core/hle/service/vi/vi.cpp
+++ b/src/core/hle/service/vi/vi.cpp
@@ -592,6 +592,7 @@ private:
             buffer_queue.QueueBuffer(request.data.slot, request.data.transform,
                                      request.data.GetCropRect(), request.data.swap_interval,
                                      request.data.multi_fence);
+            nv_flinger.PrequeueFrame(id, request.data.slot);
 
             IGBPQueueBufferResponseParcel response{1280, 720};
             ctx.WriteBuffer(response.Serialize());
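Note: vi.cpp now prequeues the frame immediately after QueueBuffer(), so a frame reaches the GPU together with its fences, while Compose() is reduced to frame pacing and buffer release. The gpu.cpp changes below do the bookkeeping: a queued frame records how many of its fences are still unsatisfied, and every guest syncpoint increment walks that syncpoint's trigger list, presenting the frame once the count reaches zero. The same mechanism reduced to a standalone sketch, with hypothetical names:

    #include <cstddef>
    #include <cstdint>
    #include <list>
    #include <unordered_map>

    struct PendingFrame {
        std::size_t unsatisfied; // fences this frame still waits on
    };

    struct Trigger {
        std::uint64_t frame_id;  // pending frame to decrement
        std::uint32_t threshold; // syncpoint value that satisfies one fence
    };

    std::unordered_map<std::uint64_t, PendingFrame> pending;
    std::unordered_map<std::uint32_t, std::list<Trigger>> triggers; // keyed by syncpoint

    // Invoked each time syncpoint `id` advances to `value`.
    void OnSyncpointAdvance(std::uint32_t id, std::uint32_t value) {
        auto& list = triggers[id];
        for (auto it = list.begin(); it != list.end();) {
            if (it->threshold > value) {
                ++it;
                continue;
            }
            if (--pending[it->frame_id].unsatisfied == 0) {
                pending.erase(it->frame_id); // all fences hit: present the frame here
            }
            it = list.erase(it);
        }
    }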
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index ff024f530..661b21919 100755
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -114,10 +114,17 @@ void GPU::WaitFence(u32 syncpoint_id, u32 value) {
     });
 }
 
+void GPU::IncrementSyncPointGuest(const u32 syncpoint_id) {
+    std::lock_guard lock{pre_sync_mutex};
+    auto& syncpoint = pre_syncpoints.at(syncpoint_id);
+    syncpoint++;
+    ProcessFrameRequests(syncpoint_id, syncpoint);
+}
+
 void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
+    std::lock_guard lock{sync_mutex};
     auto& syncpoint = syncpoints.at(syncpoint_id);
     syncpoint++;
-    std::lock_guard lock{sync_mutex};
     sync_cv.notify_all();
     auto& interrupt = syncpt_interrupts.at(syncpoint_id);
     if (!interrupt.empty()) {
@@ -162,25 +169,121 @@ bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
     return true;
 }
 
+void GPU::WaitOnWorkRequest(u64 fence) {
+    std::unique_lock lck{work_request_mutex};
+    request_cv.wait(
+        lck, [&] { return fence <= current_request_fence.load(std::memory_order_relaxed); });
+}
+
 u64 GPU::RequestFlush(VAddr addr, std::size_t size) {
-    std::unique_lock lck{flush_request_mutex};
-    const u64 fence = ++last_flush_fence;
-    flush_requests.emplace_back(fence, addr, size);
+    std::unique_lock lck{work_request_mutex};
+    const u64 fence = ++last_request_fence;
+    work_requests.emplace_back(fence, addr, size);
+    return fence;
+}
+
+u64 GPU::RequestQueueFrame(u64 id) {
+    std::unique_lock lck{work_request_mutex};
+    const u64 fence = ++last_request_fence;
+    work_requests.emplace_back(fence, id);
     return fence;
 }
 
 void GPU::TickWork() {
-    std::unique_lock lck{flush_request_mutex};
-    while (!flush_requests.empty()) {
-        auto& request = flush_requests.front();
+    std::unique_lock lck{work_request_mutex};
+    while (!work_requests.empty()) {
+        auto request = work_requests.front();
         const u64 fence = request.fence;
-        const VAddr addr = request.addr;
-        const std::size_t size = request.size;
-        flush_requests.pop_front();
-        flush_request_mutex.unlock();
-        rasterizer->FlushRegion(addr, size);
-        current_flush_fence.store(fence);
-        flush_request_mutex.lock();
+        work_requests.pop_front();
+        work_request_mutex.unlock();
+        switch (request.type) {
+        case RequestType::Flush: {
+            rasterizer->FlushRegion(request.flush.addr, request.flush.size);
+            break;
+        }
+        case RequestType::QueueFrame: {
+            Tegra::FramebufferConfig frame_info;
+            {
+                std::unique_lock lock(frame_requests_mutex);
+                const u64 searching_id = request.queue_frame.id;
+                auto it = std::find_if(
+                    frame_queue_items.begin(), frame_queue_items.end(),
+                    [searching_id](const FrameQueue& item) { return item.id == searching_id; });
+                ASSERT(it != frame_queue_items.end());
+                frame_info = it->frame_info;
+                frame_queue_items.erase(it);
+            }
+            renderer->SwapBuffers(&frame_info);
+            break;
+        }
+        default: {
+            LOG_ERROR(HW_GPU, "Unknown work request type={}", static_cast<u32>(request.type));
+        }
+        }
+        current_request_fence.store(fence, std::memory_order_release);
+        work_request_mutex.lock();
+        request_cv.notify_all();
+    }
+}
+
+void GPU::QueueFrame(const Tegra::FramebufferConfig* framebuffer,
+                     const Service::Nvidia::MultiFence& fences) {
+    std::unique_lock lock(frame_requests_mutex);
+    if (fences.num_fences == 0) {
+        u64 new_queue_id = frame_queue_ids++;
+        FrameQueue item{
+            .frame_info = *framebuffer,
+            .id = new_queue_id,
+        };
+        frame_queue_items.push_back(item);
+        RequestQueueFrame(new_queue_id);
+        return;
+    }
+    u64 new_id = frame_request_ids++;
+    FrameRequest request{
+        .frame_info = *framebuffer,
+        .count = 0,
+        .id = new_id,
+    };
+    std::unique_lock lck{pre_sync_mutex};
+    for (size_t i = 0; i < fences.num_fences; i++) {
+        auto& fence = fences.fences[i];
+        if (pre_syncpoints[fence.id].load(std::memory_order_relaxed) < fence.value) {
+            const FrameTrigger trigger{
+                .id = new_id,
+                .sync_point_value = fence.value,
+            };
+            frame_triggers[fence.id].push_back(trigger);
+            ++request.count;
+        }
+    }
+    if (request.count == 0) {
+        lck.unlock();
+        gpu_thread.SwapBuffers(framebuffer);
+        return;
+    }
+    frame_requests.emplace(new_id, request);
+}
+
+void GPU::ProcessFrameRequests(u32 syncpoint_id, u32 new_value) {
+    auto& list = frame_triggers[syncpoint_id];
+    if (list.empty()) {
+        return;
+    }
+    auto it = list.begin();
+    while (it != list.end()) {
+        if (it->sync_point_value <= new_value) {
+            auto obj = frame_requests.find(it->id);
+            --obj->second.count;
+            if (obj->second.count == 0) {
+                rasterizer->FlushCommands();
+                renderer->SwapBuffers(&obj->second.frame_info);
+                frame_requests.erase(obj);
+            }
+            it = list.erase(it);
+            continue;
+        }
+        ++it;
     }
 }
 
@@ -399,7 +502,7 @@ void GPU::ProcessFenceActionMethod() {
         WaitFence(regs.fence_action.syncpoint_id, regs.fence_value);
         break;
     case FenceOperation::Increment:
-        IncrementSyncPoint(regs.fence_action.syncpoint_id);
+        rasterizer->SignalSyncPoint(regs.fence_action.syncpoint_id);
         break;
     default:
         UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value());
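Note: the former flush-only queue becomes a general work queue. Each WorkRequest (declared in gpu.h below) carries a RequestType tag plus a union payload, and TickWork() dispatches on the tag. The same shape can be expressed with std::variant; the following is an illustrative alternative with hypothetical names, not the patch's actual declaration:

    #include <cstddef>
    #include <cstdint>
    #include <type_traits>
    #include <variant>

    struct FlushPayload {
        std::uintptr_t addr;
        std::size_t size;
    };

    struct QueueFramePayload {
        std::uint64_t id;
    };

    struct WorkItem {
        std::uint64_t fence; // completion ticket published after servicing
        std::variant<FlushPayload, QueueFramePayload> payload;
    };

    // Dispatch with std::visit instead of a switch over a hand-written tag.
    void Process(const WorkItem& item) {
        std::visit(
            [](const auto& payload) {
                using T = std::decay_t<decltype(payload)>;
                if constexpr (std::is_same_v<T, FlushPayload>) {
                    // flush the range [payload.addr, payload.addr + payload.size)
                } else {
                    // look up the queued frame by payload.id and present it
                }
            },
            item.payload);
    }

The patch's hand-rolled tagged union keeps WorkRequest small and trivially copyable, which suits the copy made by TickWork() before the queue lock is dropped.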
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index a8e98e51b..8d50758d5 100755
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -159,11 +159,16 @@ public:
     void OnCommandListEnd();
 
     /// Request a host GPU memory flush from the CPU.
-    [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size);
+    u64 RequestFlush(VAddr addr, std::size_t size);
+
+    void WaitOnWorkRequest(u64 fence);
+
+    void QueueFrame(const Tegra::FramebufferConfig* framebuffer,
+                    const Service::Nvidia::MultiFence& fence);
 
     /// Obtains current flush request fence id.
-    [[nodiscard]] u64 CurrentFlushRequestFence() const {
-        return current_flush_fence.load(std::memory_order_relaxed);
+    [[nodiscard]] u64 CurrentWorkRequestFence() const {
+        return current_request_fence.load(std::memory_order_relaxed);
     }
 
     /// Tick pending requests within the GPU.
@@ -225,6 +230,7 @@ public:
     /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
     void WaitFence(u32 syncpoint_id, u32 value);
 
+    void IncrementSyncPointGuest(u32 syncpoint_id);
     void IncrementSyncPoint(u32 syncpoint_id);
 
     [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const;
@@ -365,6 +371,34 @@ private:
     /// Determines where the method should be executed.
     [[nodiscard]] bool ExecuteMethodOnEngine(u32 method);
 
+    struct FrameRequest {
+        Tegra::FramebufferConfig frame_info;
+        size_t count;
+        u64 id;
+    };
+
+    struct FrameTrigger {
+        u64 id;
+        u32 sync_point_value;
+    };
+
+    struct FrameQueue {
+        Tegra::FramebufferConfig frame_info;
+        u64 id;
+    };
+
+    /// Request a frame release on the GPU thread
+    u64 RequestQueueFrame(u64 id);
+
+    void ProcessFrameRequests(u32 syncpoint_id, u32 new_value);
+
+    std::mutex frame_requests_mutex;
+    std::unordered_map<u32, std::list<FrameTrigger>> frame_triggers;
+    std::unordered_map<u64, FrameRequest> frame_requests;
+    std::list<FrameQueue> frame_queue_items;
+    u64 frame_queue_ids{};
+    u64 frame_request_ids{};
+
 protected:
     Core::System& system;
     std::unique_ptr<Tegra::MemoryManager> memory_manager;
@@ -392,27 +426,50 @@ private:
     /// When true, we are about to shut down emulation session, so terminate outstanding tasks
     std::atomic_bool shutting_down{};
 
+    std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> pre_syncpoints{};
     std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
 
     std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
 
+    std::mutex pre_sync_mutex;
     std::mutex sync_mutex;
     std::mutex device_mutex;
 
     std::condition_variable sync_cv;
 
-    struct FlushRequest {
-        explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_)
-            : fence{fence_}, addr{addr_}, size{size_} {}
-        u64 fence;
-        VAddr addr;
-        std::size_t size;
+    enum class RequestType : u32 {
+        Flush = 0,
+        QueueFrame = 1,
     };
 
-    std::list<FlushRequest> flush_requests;
-    std::atomic<u64> current_flush_fence{};
-    u64 last_flush_fence{};
-    std::mutex flush_request_mutex;
+    struct WorkRequest {
+        explicit WorkRequest(u64 fence_, VAddr addr_, std::size_t size_)
+            : fence{fence_}, type{RequestType::Flush} {
+            flush.addr = addr_;
+            flush.size = size_;
+        }
+
+        explicit WorkRequest(u64 fence_, u64 id) : fence{fence_}, type{RequestType::QueueFrame} {
+            queue_frame.id = id;
+        }
+        u64 fence;
+        union {
+            struct {
+                VAddr addr;
+                std::size_t size;
+            } flush;
+            struct {
+                u64 id;
+            } queue_frame;
+        };
+        RequestType type;
+    };
+
+    std::list<WorkRequest> work_requests;
+    std::atomic<u64> current_request_fence{};
+    u64 last_request_fence{};
+    std::mutex work_request_mutex;
+    std::condition_variable request_cv;
 
     const bool is_async;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 46f642b19..3470064f3 100755
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -105,7 +105,7 @@ void ThreadManager::FlushRegion(VAddr addr, u64 size) {
     auto& gpu = system.GPU();
     u64 fence = gpu.RequestFlush(addr, size);
     PushCommand(GPUTickCommand(), true);
-    ASSERT(fence <= gpu.CurrentFlushRequestFence());
+    ASSERT(fence <= gpu.CurrentWorkRequestFence());
 }
 
 void ThreadManager::InvalidateRegion(VAddr addr, u64 size) {
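Note: RequestFlush()/RequestQueueFrame() hand out monotonically increasing fence tickets; TickWork() publishes the ticket it just completed into current_request_fence and notifies request_cv, on which WaitOnWorkRequest() blocks. The handshake reduced to its essentials, as a sketch with hypothetical names rather than the patch's code:

    #include <atomic>
    #include <condition_variable>
    #include <cstdint>
    #include <mutex>

    std::mutex work_mutex;
    std::condition_variable work_cv;
    std::atomic<std::uint64_t> completed_fence{0};
    std::uint64_t last_fence = 0;

    // Producer: take a ticket for a newly queued request.
    std::uint64_t Enqueue() {
        std::scoped_lock lock{work_mutex};
        return ++last_fence; // the queued work would carry this fence
    }

    // Worker: after servicing a request, publish its ticket and wake waiters.
    void MarkDone(std::uint64_t fence) {
        completed_fence.store(fence, std::memory_order_release);
        std::scoped_lock lock{work_mutex};
        work_cv.notify_all();
    }

    // Waiter: block until the worker has serviced ticket `fence`.
    void WaitDone(std::uint64_t fence) {
        std::unique_lock lock{work_mutex};
        work_cv.wait(lock, [&] {
            return completed_fence.load(std::memory_order_acquire) >= fence;
        });
    }

Note the wait predicate direction: a waiter is released once the completed fence has caught up to its ticket, matching the ASSERT in gpu_thread.cpp above.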
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index d5169e9c4..28d881e86 100755
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -214,6 +214,8 @@ void RasterizerOpenGL::Clear() {
 void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     MICROPROFILE_SCOPE(OpenGL_Drawing);
 
+    SCOPE_EXIT({ gpu.TickWork(); });
+
     query_cache.UpdateCounters();
 
     SyncState();
@@ -269,8 +271,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
 
     ++num_queued_commands;
     has_written_global_memory |= pipeline->WritesGlobalMemory();
-
-    gpu.TickWork();
 }
 
 void RasterizerOpenGL::DispatchCompute() {
@@ -421,6 +421,7 @@ void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) {
 }
 
 void RasterizerOpenGL::SignalSyncPoint(u32 value) {
+    gpu.IncrementSyncPointGuest(value);
     if (!gpu.IsAsync()) {
         gpu.IncrementSyncPoint(value);
         return;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index c6e093718..e7594796a 100755
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -412,6 +412,7 @@ void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) {
 }
 
 void RasterizerVulkan::SignalSyncPoint(u32 value) {
+    gpu.IncrementSyncPointGuest(value);
    if (!gpu.IsAsync()) {
        gpu.IncrementSyncPoint(value);
        return;
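Note: both rasterizer backends now advance a guest-visible syncpoint (pre_syncpoints) the moment the command stream signals, before the host-side increment that still happens when the backend work actually completes. Pending frames keyed on guest values can therefore be scheduled for presentation without waiting on host completion. A standalone sketch of the two-level counter idea, with hypothetical names:

    #include <array>
    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    constexpr std::size_t MaxSyncPoints = 192; // assumption: mirrors a fixed syncpoint table

    std::array<std::atomic<std::uint32_t>, MaxSyncPoints> guest_syncpoints{};
    std::array<std::atomic<std::uint32_t>, MaxSyncPoints> host_syncpoints{};

    // Guest view: advances as soon as the command stream signals, so frame
    // triggers keyed on this value can fire early.
    void SignalGuest(std::uint32_t id) {
        guest_syncpoints[id].fetch_add(1, std::memory_order_acq_rel);
    }

    // Host view: advances only once the backend has finished the work
    // (synchronously, or later through a deferred GPU-side increment).
    void SignalHost(std::uint32_t id) {
        host_syncpoints[id].fetch_add(1, std::memory_order_acq_rel);
    }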