diff --git a/README.md b/README.md index ac912eb77..bd7b781c7 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ yuzu emulator early access ============= -This is the source code for early-access 3829. +This is the source code for early-access 3830. ## Legal Notice diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 6af15298b..3a88bcd7f 100755 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -129,17 +129,13 @@ void LogSettings() { log_path("DataStorage_SDMCDir", Common::FS::GetYuzuPath(Common::FS::YuzuPath::SDMCDir)); } -void UpdateGPUAccuracy() { - values.current_gpu_accuracy = values.gpu_accuracy.GetValue(); -} - bool IsGPULevelExtreme() { - return values.current_gpu_accuracy == GpuAccuracy::Extreme; + return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme; } bool IsGPULevelHigh() { - return values.current_gpu_accuracy == GpuAccuracy::Extreme || - values.current_gpu_accuracy == GpuAccuracy::High; + return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme || + values.gpu_accuracy.GetValue() == GpuAccuracy::High; } bool IsFastmemEnabled() { diff --git a/src/common/settings.h b/src/common/settings.h index 6f37304f4..2b4d6a787 100755 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -307,7 +307,6 @@ struct Values { Specialization::Default, true, true}; - GpuAccuracy current_gpu_accuracy{GpuAccuracy::High}; SwitchableSetting max_anisotropy{ linkage, AnisotropyMode::Automatic, AnisotropyMode::Automatic, AnisotropyMode::X16, "max_anisotropy", Category::RendererAdvanced}; @@ -515,7 +514,6 @@ struct Values { extern Values values; -void UpdateGPUAccuracy(); bool IsGPULevelExtreme(); bool IsGPULevelHigh(); diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 28222509f..ba2ccfae9 100755 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -95,12 +95,6 @@ add_library(video_core STATIC memory_manager.h precompiled_headers.h pte_kind.h - query_cache/bank_base.h - query_cache/query_base.h - query_cache/query_cache_base.h - query_cache/query_cache.h - query_cache/query_stream.h - query_cache/types.h query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index d2ebc279b..54a145313 100755 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -272,19 +272,13 @@ std::pair BufferCache

::ObtainBuffer(GPUVAddr gpu_ad if (!cpu_addr) { return {&slot_buffers[NULL_BUFFER_ID], 0}; } - return ObtainCPUBuffer(*cpu_addr, size, sync_info, post_op); -} - -template -std::pair BufferCache

::ObtainCPUBuffer( - VAddr cpu_addr, u32 size, ObtainBufferSynchronize sync_info, ObtainBufferOperation post_op) { - const BufferId buffer_id = FindBuffer(cpu_addr, size); + const BufferId buffer_id = FindBuffer(*cpu_addr, size); Buffer& buffer = slot_buffers[buffer_id]; // synchronize op switch (sync_info) { case ObtainBufferSynchronize::FullSynchronize: - SynchronizeBuffer(buffer, cpu_addr, size); + SynchronizeBuffer(buffer, *cpu_addr, size); break; default: break; @@ -292,10 +286,10 @@ std::pair BufferCache

::ObtainCPUBuffer( switch (post_op) { case ObtainBufferOperation::MarkAsWritten: - MarkWrittenBuffer(buffer_id, cpu_addr, size); + MarkWrittenBuffer(buffer_id, *cpu_addr, size); break; case ObtainBufferOperation::DiscardWrite: { - IntervalType interval{cpu_addr, size}; + IntervalType interval{*cpu_addr, size}; ClearDownload(interval); break; } @@ -303,7 +297,7 @@ std::pair BufferCache

::ObtainCPUBuffer( break; } - return {&buffer, buffer.Offset(cpu_addr)}; + return {&buffer, buffer.Offset(*cpu_addr)}; } template diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index c4f6e8d12..0b7135d49 100755 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -295,10 +295,6 @@ public: [[nodiscard]] std::pair ObtainBuffer(GPUVAddr gpu_addr, u32 size, ObtainBufferSynchronize sync_info, ObtainBufferOperation post_op); - - [[nodiscard]] std::pair ObtainCPUBuffer(VAddr gpu_addr, u32 size, - ObtainBufferSynchronize sync_info, - ObtainBufferOperation post_op); void FlushCachedWrites(); /// Return true when there are uncommitted buffers to be downloaded @@ -339,14 +335,6 @@ public: [[nodiscard]] std::pair GetDrawIndirectBuffer(); - template - void BufferOperations(Func&& func) { - do { - channel_state->has_deleted_buffers = false; - func(); - } while (channel_state->has_deleted_buffers); - } - std::recursive_mutex mutex; Runtime& runtime; diff --git a/src/video_core/control/channel_state_cache.h b/src/video_core/control/channel_state_cache.h index d0aeb81de..cbfb32141 100755 --- a/src/video_core/control/channel_state_cache.h +++ b/src/video_core/control/channel_state_cache.h @@ -51,7 +51,7 @@ public: virtual void CreateChannel(Tegra::Control::ChannelState& channel); /// Bind a channel for execution. - virtual void BindToChannel(s32 id); + void BindToChannel(s32 id); /// Erase channel's state. void EraseChannel(s32 id); diff --git a/src/video_core/engines/draw_manager.h b/src/video_core/engines/draw_manager.h index 18d959143..7c22c49f1 100755 --- a/src/video_core/engines/draw_manager.h +++ b/src/video_core/engines/draw_manager.h @@ -46,7 +46,6 @@ public: }; struct IndirectParams { - bool is_byte_count; bool is_indexed; bool include_count; GPUVAddr count_start_address; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 81eeb3b00..0a0d1a3b0 100755 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -20,6 +20,8 @@ namespace Tegra::Engines { +using VideoCore::QueryType; + /// First register id that is actually a Macro call. constexpr u32 MacroRegistersStart = 0xE00; @@ -494,21 +496,27 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { } void Maxwell3D::ProcessQueryGet() { - VideoCommon::QueryPropertiesFlags flags{}; - if (regs.report_semaphore.query.short_query == 0) { - flags |= VideoCommon::QueryPropertiesFlags::HasTimeout; - } - const GPUVAddr sequence_address{regs.report_semaphore.Address()}; - const VideoCommon::QueryType query_type = - static_cast(regs.report_semaphore.query.report.Value()); - const u32 payload = regs.report_semaphore.payload; - const u32 subreport = regs.report_semaphore.query.sub_report; switch (regs.report_semaphore.query.operation) { case Regs::ReportSemaphore::Operation::Release: if (regs.report_semaphore.query.short_query != 0) { - flags |= VideoCommon::QueryPropertiesFlags::IsAFence; + const GPUVAddr sequence_address{regs.report_semaphore.Address()}; + const u32 payload = regs.report_semaphore.payload; + std::function operation([this, sequence_address, payload] { + memory_manager.Write(sequence_address, payload); + }); + rasterizer->SignalFence(std::move(operation)); + } else { + struct LongQueryResult { + u64_le value; + u64_le timestamp; + }; + const GPUVAddr sequence_address{regs.report_semaphore.Address()}; + const u32 payload = regs.report_semaphore.payload; + [this, sequence_address, payload] { + memory_manager.Write(sequence_address + sizeof(u64), system.GPU().GetTicks()); + memory_manager.Write(sequence_address, payload); + }(); } - rasterizer->Query(sequence_address, query_type, flags, payload, subreport); break; case Regs::ReportSemaphore::Operation::Acquire: // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that @@ -516,7 +524,11 @@ void Maxwell3D::ProcessQueryGet() { UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE"); break; case Regs::ReportSemaphore::Operation::ReportOnly: - rasterizer->Query(sequence_address, query_type, flags, payload, subreport); + if (const std::optional result = GetQueryResult()) { + // If the query returns an empty optional it means it's cached and deferred. + // In this case we have a non-empty result, so we stamp it immediately. + StampQueryResult(*result, regs.report_semaphore.query.short_query == 0); + } break; case Regs::ReportSemaphore::Operation::Trap: UNIMPLEMENTED_MSG("Unimplemented query operation TRAP"); @@ -528,10 +540,6 @@ void Maxwell3D::ProcessQueryGet() { } void Maxwell3D::ProcessQueryCondition() { - if (rasterizer->AccelerateConditionalRendering()) { - execute_on = true; - return; - } const GPUVAddr condition_address{regs.render_enable.Address()}; switch (regs.render_enable_override) { case Regs::RenderEnable::Override::AlwaysRender: @@ -541,6 +549,10 @@ void Maxwell3D::ProcessQueryCondition() { execute_on = false; break; case Regs::RenderEnable::Override::UseRenderEnable: { + if (rasterizer->AccelerateConditionalRendering()) { + execute_on = true; + return; + } switch (regs.render_enable.mode) { case Regs::RenderEnable::Mode::True: { execute_on = true; @@ -582,9 +594,15 @@ void Maxwell3D::ProcessQueryCondition() { } void Maxwell3D::ProcessCounterReset() { +#if ANDROID + if (!Settings::IsGPULevelHigh()) { + // This is problematic on Android, disable on GPU Normal. + return; + } +#endif switch (regs.clear_report_value) { case Regs::ClearReport::ZPassPixelCount: - rasterizer->ResetCounter(VideoCommon::QueryType::ZPassPixelCount64); + rasterizer->ResetCounter(QueryType::SamplesPassed); break; default: LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value); @@ -598,6 +616,28 @@ void Maxwell3D::ProcessSyncPoint() { rasterizer->SignalSyncPoint(sync_point); } +std::optional Maxwell3D::GetQueryResult() { + switch (regs.report_semaphore.query.report) { + case Regs::ReportSemaphore::Report::Payload: + return regs.report_semaphore.payload; + case Regs::ReportSemaphore::Report::ZPassPixelCount64: +#if ANDROID + if (!Settings::IsGPULevelHigh()) { + // This is problematic on Android, disable on GPU Normal. + return 120; + } +#endif + // Deferred. + rasterizer->Query(regs.report_semaphore.Address(), QueryType::SamplesPassed, + system.GPU().GetTicks()); + return std::nullopt; + default: + LOG_DEBUG(HW_GPU, "Unimplemented query report type {}", + regs.report_semaphore.query.report.Value()); + return 1; + } +} + void Maxwell3D::ProcessCBBind(size_t stage_index) { // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader // stage. diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index ffaab882f..488dfb018 100755 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -3182,6 +3182,9 @@ private: /// Handles writes to syncing register. void ProcessSyncPoint(); + /// Returns a query's value or an empty object if the value will be deferred through a cache. + std::optional GetQueryResult(); + void RefreshParametersImpl(); bool IsMethodExecutable(u32 method); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 82e87df0b..d73a91712 100755 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -361,17 +361,21 @@ void MaxwellDMA::ReleaseSemaphore() { const auto type = regs.launch_dma.semaphore_type; const GPUVAddr address = regs.semaphore.address; const u32 payload = regs.semaphore.payload; - VideoCommon::QueryPropertiesFlags flags{VideoCommon::QueryPropertiesFlags::IsAFence}; switch (type) { case LaunchDMA::SemaphoreType::NONE: break; case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: { - rasterizer->Query(address, VideoCommon::QueryType::Payload, flags, payload, 0); + std::function operation( + [this, address, payload] { memory_manager.Write(address, payload); }); + rasterizer->SignalFence(std::move(operation)); break; } case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: { - rasterizer->Query(address, VideoCommon::QueryType::Payload, - flags | VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0); + std::function operation([this, address, payload] { + memory_manager.Write(address + sizeof(u64), system.GPU().GetTicks()); + memory_manager.Write(address, payload); + }); + rasterizer->SignalFence(std::move(operation)); break; } default: diff --git a/src/video_core/engines/puller.cpp b/src/video_core/engines/puller.cpp index 3fe06cc88..8a989b4cb 100755 --- a/src/video_core/engines/puller.cpp +++ b/src/video_core/engines/puller.cpp @@ -77,8 +77,10 @@ void Puller::ProcessSemaphoreTriggerMethod() { if (op == GpuSemaphoreOperation::WriteLong) { const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; const u32 payload = regs.semaphore_sequence; - rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, - VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0); + [this, sequence_address, payload] { + memory_manager.Write(sequence_address + sizeof(u64), gpu.GetTicks()); + memory_manager.Write(sequence_address, payload); + }(); } else { do { const u32 word{memory_manager.Read(regs.semaphore_address.SemaphoreAddress())}; @@ -113,8 +115,10 @@ void Puller::ProcessSemaphoreTriggerMethod() { void Puller::ProcessSemaphoreRelease() { const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; const u32 payload = regs.semaphore_release; - rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload, - VideoCommon::QueryPropertiesFlags::IsAFence, payload, 0); + std::function operation([this, sequence_address, payload] { + memory_manager.Write(sequence_address, payload); + }); + rasterizer->SignalFence(std::move(operation)); } void Puller::ProcessSemaphoreAcquire() { @@ -123,6 +127,7 @@ void Puller::ProcessSemaphoreAcquire() { while (word != value) { regs.acquire_active = true; regs.acquire_value = value; + std::this_thread::sleep_for(std::chrono::milliseconds(1)); rasterizer->ReleaseFences(); word = memory_manager.Read(regs.semaphore_address.SemaphoreAddress()); // TODO(kemathe73) figure out how to do the acquire_timeout diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index 139c824d3..89ef61999 100755 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -55,9 +55,6 @@ public: // Unlike other fences, this one doesn't void SignalOrdering() { - if constexpr (!can_async_check) { - TryReleasePendingFences(); - } std::scoped_lock lock{buffer_cache.mutex}; buffer_cache.AccumulateFlushes(); } @@ -107,25 +104,9 @@ public: SignalFence(std::move(func)); } - void WaitPendingFences([[maybe_unused]] bool force) { + void WaitPendingFences() { if constexpr (!can_async_check) { TryReleasePendingFences(); - } else { - if (!force) { - return; - } - std::mutex wait_mutex; - std::condition_variable wait_cv; - std::atomic wait_finished{}; - std::function func([&] { - std::scoped_lock lk(wait_mutex); - wait_finished.store(true, std::memory_order_relaxed); - wait_cv.notify_all(); - }); - SignalFence(std::move(func)); - std::unique_lock lk(wait_mutex); - wait_cv.wait( - lk, [&wait_finished] { return wait_finished.load(std::memory_order_relaxed); }); } } diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 1df87ea7b..aa99c4941 100755 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -102,8 +102,7 @@ struct GPU::Impl { /// Signal the ending of command list. void OnCommandListEnd() { - rasterizer->ReleaseFences(false); - Settings::UpdateGPUAccuracy(); + rasterizer->ReleaseFences(); } /// Request a host GPU memory flush from the CPU. @@ -221,7 +220,6 @@ struct GPU::Impl { /// This can be used to launch any necessary threads and register any necessary /// core timing events. void Start() { - Settings::UpdateGPUAccuracy(); gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler); } diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index a06cde47c..7f30da79c 100755 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -41,9 +41,6 @@ set(SHADER_FILES pitch_unswizzle.comp present_bicubic.frag present_gaussian.frag - queries_prefix_scan_sum.comp - queries_prefix_scan_sum_nosubgroups.comp - resolve_conditional_render.comp smaa_edge_detection.vert smaa_edge_detection.frag smaa_blending_weight_calculation.vert @@ -73,7 +70,6 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND") endif() set(GLSL_FLAGS "") -set(SPIR_V_VERSION "spirv1.3") set(QUIET_FLAG "--quiet") set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) @@ -127,7 +123,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES}) OUTPUT ${SPIRV_HEADER_FILE} COMMAND - ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION} + ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} MAIN_DEPENDENCY ${SOURCE_FILE} ) diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 09f6e2d97..ae9e07af2 100755 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -67,7 +67,6 @@ public: } auto& params = maxwell3d.draw_manager->GetIndirectParams(); - params.is_byte_count = false; params.is_indexed = false; params.include_count = false; params.count_start_address = 0; @@ -162,7 +161,6 @@ public: 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); } auto& params = maxwell3d.draw_manager->GetIndirectParams(); - params.is_byte_count = false; params.is_indexed = true; params.include_count = false; params.count_start_address = 0; @@ -258,7 +256,6 @@ public: const u32 estimate = static_cast(maxwell3d.EstimateIndexBufferSize()); maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; auto& params = maxwell3d.draw_manager->GetIndirectParams(); - params.is_byte_count = false; params.is_indexed = true; params.include_count = true; params.count_start_address = maxwell3d.GetMacroAddress(4); @@ -322,47 +319,6 @@ private: } }; -class HLE_DrawIndirectByteCount final : public HLEMacroImpl { -public: - explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} - - void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override { - auto topology = static_cast(parameters[0] & 0xFFFFU); - if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) { - Fallback(parameters); - return; - } - - auto& params = maxwell3d.draw_manager->GetIndirectParams(); - params.is_byte_count = true; - params.is_indexed = false; - params.include_count = false; - params.count_start_address = 0; - params.indirect_start_address = maxwell3d.GetMacroAddress(2); - params.buffer_size = 4; - params.max_draw_counts = 1; - params.stride = parameters[1]; - maxwell3d.regs.draw.begin = parameters[0]; - maxwell3d.regs.draw_auto_stride = parameters[1]; - maxwell3d.regs.draw_auto_byte_count = parameters[2]; - - maxwell3d.draw_manager->DrawArrayIndirect(topology); - } - -private: - void Fallback(const std::vector& parameters) { - maxwell3d.RefreshParameters(); - - maxwell3d.regs.draw.begin = parameters[0]; - maxwell3d.regs.draw_auto_stride = parameters[1]; - maxwell3d.regs.draw_auto_byte_count = parameters[2]; - - maxwell3d.draw_manager->DrawArray( - maxwell3d.regs.draw.topology, 0, - maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1); - } -}; - class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl { public: explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} @@ -580,11 +536,6 @@ HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} { [](Maxwell3D& maxwell3d__) -> std::unique_ptr { return std::make_unique(maxwell3d__); })); - builders.emplace(0xB5F74EDB717278ECULL, - std::function(Maxwell3D&)>( - [](Maxwell3D& maxwell3d__) -> std::unique_ptr { - return std::make_unique(maxwell3d__); - })); } HLEMacro::~HLEMacro() = default; diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 450a75296..a363daab2 100755 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -25,13 +25,6 @@ #include "video_core/rasterizer_interface.h" #include "video_core/texture_cache/slot_vector.h" -namespace VideoCore { -enum class QueryType { - SamplesPassed, -}; -constexpr std::size_t NumQueryTypes = 1; -} // namespace VideoCore - namespace VideoCommon { using AsyncJobId = SlotId; @@ -105,10 +98,10 @@ private: }; template -class QueryCacheLegacy : public VideoCommon::ChannelSetupCaches { +class QueryCacheBase : public VideoCommon::ChannelSetupCaches { public: - explicit QueryCacheLegacy(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_) + explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_) : rasterizer{rasterizer_}, // Use reinterpret_cast instead of static_cast as workaround for // UBSan bug (https://github.com/llvm/llvm-project/issues/59060) diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 06b538b3b..53130d6bf 100755 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -12,7 +12,6 @@ #include "video_core/cache_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" -#include "video_core/query_cache/types.h" #include "video_core/rasterizer_download_area.h" namespace Tegra { @@ -27,6 +26,11 @@ struct ChannelState; namespace VideoCore { +enum class QueryType { + SamplesPassed, +}; +constexpr std::size_t NumQueryTypes = 1; + enum class LoadCallbackStage { Prepare, Build, @@ -54,11 +58,10 @@ public: virtual void DispatchCompute() = 0; /// Resets the counter of a query - virtual void ResetCounter(VideoCommon::QueryType type) = 0; + virtual void ResetCounter(QueryType type) = 0; /// Records a GPU query and caches it - virtual void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, - VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) = 0; + virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional timestamp) = 0; /// Signal an uniform buffer binding virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -80,7 +83,7 @@ public: virtual void SignalReference() = 0; /// Release all pending fences. - virtual void ReleaseFences(bool force = true) = 0; + virtual void ReleaseFences() = 0; /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; diff --git a/src/video_core/renderer_null/null_rasterizer.cpp b/src/video_core/renderer_null/null_rasterizer.cpp index 65cd5aa06..92ecf6682 100755 --- a/src/video_core/renderer_null/null_rasterizer.cpp +++ b/src/video_core/renderer_null/null_rasterizer.cpp @@ -26,18 +26,16 @@ void RasterizerNull::Draw(bool is_indexed, u32 instance_count) {} void RasterizerNull::DrawTexture() {} void RasterizerNull::Clear(u32 layer_count) {} void RasterizerNull::DispatchCompute() {} -void RasterizerNull::ResetCounter(VideoCommon::QueryType type) {} -void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, - VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { +void RasterizerNull::ResetCounter(VideoCore::QueryType type) {} +void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, + std::optional timestamp) { if (!gpu_memory) { return; } - if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { - u64 ticks = m_gpu.GetTicks(); - gpu_memory->Write(gpu_addr + 8, ticks); - gpu_memory->Write(gpu_addr, static_cast(payload)); - } else { - gpu_memory->Write(gpu_addr, payload); + + gpu_memory->Write(gpu_addr, u64{0}); + if (timestamp) { + gpu_memory->Write(gpu_addr + 8, *timestamp); } } void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -76,7 +74,7 @@ void RasterizerNull::SignalSyncPoint(u32 value) { syncpoint_manager.IncrementHost(value); } void RasterizerNull::SignalReference() {} -void RasterizerNull::ReleaseFences(bool) {} +void RasterizerNull::ReleaseFences() {} void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {} void RasterizerNull::WaitForIdle() {} void RasterizerNull::FragmentBarrier() {} diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h index 23001eeb8..93b9a6971 100755 --- a/src/video_core/renderer_null/null_rasterizer.h +++ b/src/video_core/renderer_null/null_rasterizer.h @@ -42,9 +42,8 @@ public: void DrawTexture() override; void Clear(u32 layer_count) override; void DispatchCompute() override; - void ResetCounter(VideoCommon::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, - VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; + void ResetCounter(VideoCore::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional timestamp) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; @@ -64,7 +63,7 @@ public: void SyncOperation(std::function&& func) override; void SignalSyncPoint(u32 value) override; void SignalReference() override; - void ReleaseFences(bool force) override; + void ReleaseFences() override; void FlushAndInvalidateRegion( VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void WaitForIdle() override; diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index 251bcb140..92ba43f25 100755 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -27,7 +27,7 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) - : QueryCacheLegacy(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} + : QueryCacheBase(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} QueryCache::~QueryCache() = default; diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index c69488421..5c127c6d1 100755 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -26,7 +26,7 @@ class RasterizerOpenGL; using CounterStream = VideoCommon::CounterStreamBase; class QueryCache final - : public VideoCommon::QueryCacheLegacy { + : public VideoCommon::QueryCacheBase { public: explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); ~QueryCache(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 2cc52003f..3659f729e 100755 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -385,39 +385,13 @@ void RasterizerOpenGL::DispatchCompute() { has_written_global_memory |= pipeline->WritesGlobalMemory(); } -void RasterizerOpenGL::ResetCounter(VideoCommon::QueryType type) { - if (type == VideoCommon::QueryType::ZPassPixelCount64) { - query_cache.ResetCounter(VideoCore::QueryType::SamplesPassed); - } +void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { + query_cache.ResetCounter(type); } -void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, - VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { - if (type == VideoCommon::QueryType::ZPassPixelCount64) { - if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { - query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, {gpu.GetTicks()}); - } else { - query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, std::nullopt); - } - return; - } - if (type != VideoCommon::QueryType::Payload) { - payload = 1u; - } - std::function func([this, gpu_addr, flags, memory_manager = gpu_memory, payload]() { - if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) { - u64 ticks = gpu.GetTicks(); - memory_manager->Write(gpu_addr + 8, ticks); - memory_manager->Write(gpu_addr, static_cast(payload)); - } else { - memory_manager->Write(gpu_addr, payload); - } - }); - if (True(flags & VideoCommon::QueryPropertiesFlags::IsAFence)) { - SignalFence(std::move(func)); - return; - } - func(); +void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, + std::optional timestamp) { + query_cache.Query(gpu_addr, type, timestamp); } void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -588,8 +562,8 @@ void RasterizerOpenGL::SignalReference() { fence_manager.SignalOrdering(); } -void RasterizerOpenGL::ReleaseFences(bool force) { - fence_manager.WaitPendingFences(force); +void RasterizerOpenGL::ReleaseFences() { + fence_manager.WaitPendingFences(); } void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size, diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index d7f1481d2..39ab86641 100755 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -86,9 +86,8 @@ public: void DrawTexture() override; void Clear(u32 layer_count) override; void DispatchCompute() override; - void ResetCounter(VideoCommon::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, - VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; + void ResetCounter(VideoCore::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional timestamp) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; @@ -108,7 +107,7 @@ public: void SyncOperation(std::function&& func) override; void SignalSyncPoint(u32 value) override; void SignalReference() override; - void ReleaseFences(bool force = true) override; + void ReleaseFences() override; void FlushAndInvalidateRegion( VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void WaitForIdle() override; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index a65a3ee72..da57ae66b 100755 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -61,9 +61,6 @@ vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allo if (device.IsExtTransformFeedbackSupported()) { flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } - if (device.IsExtConditionalRendering()) { - flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT; - } const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 381440c12..454418775 100755 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -12,9 +12,6 @@ #include "common/common_types.h" #include "common/div_ceil.h" #include "video_core/host_shaders/astc_decoder_comp_spv.h" -#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" -#include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h" -#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" @@ -60,30 +57,6 @@ constexpr std::array INPUT_OUTPUT_DESCRIPTOR_SE }, }}; -constexpr std::array QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{ - { - .binding = 0, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { - .binding = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, - { - .binding = 2, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, -}}; - constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .uniform_buffers = 0, .storage_buffers = 2, @@ -94,16 +67,6 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .score = 2, }; -constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{ - .uniform_buffers = 0, - .storage_buffers = 3, - .texture_buffers = 0, - .image_buffers = 0, - .textures = 0, - .images = 0, - .score = 3, -}; - constexpr std::array ASTC_DESCRIPTOR_SET_BINDINGS{{ { .binding = ASTC_BINDING_INPUT_BUFFER, @@ -140,15 +103,6 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT .stride = sizeof(DescriptorUpdateEntry), }; -constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{ - .dstBinding = 0, - .dstArrayElement = 0, - .descriptorCount = 3, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = 0, - .stride = sizeof(DescriptorUpdateEntry), -}; - constexpr std::array ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ { @@ -177,19 +131,13 @@ struct AstcPushConstants { u32 block_height; u32 block_height_mask; }; - -struct QueriesPrefixScanPushConstants { - u32 max_accumulation_base; - u32 accumulation_limit; -}; } // Anonymous namespace ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, vk::Span bindings, vk::Span templates, const DescriptorBankInfo& bank_info, - vk::Span push_constants, std::span code, - std::optional optional_subgroup_size) + vk::Span push_constants, std::span code) : device{device_} { descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, @@ -230,19 +178,13 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, .pCode = code.data(), }); device.SaveShader(code); - const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{ - .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, - .pNext = nullptr, - .requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U, - }; - bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size; pipeline = device.GetLogical().CreateComputePipeline({ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .pNext = nullptr, .flags = 0, .stage{ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, - .pNext = use_setup_size ? &subgroup_size_ci : nullptr, + .pNext = nullptr, .flags = 0, .stage = VK_SHADER_STAGE_COMPUTE_BIT, .module = *module, @@ -360,114 +302,6 @@ std::pair QuadIndexedPass::Assemble( return {staging.buffer, staging.offset}; } -ConditionalRenderingResolvePass::ConditionalRenderingResolvePass( - const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, - ComputePassDescriptorQueue& compute_pass_descriptor_queue_) - : ComputePass(device_, descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS, - INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO, nullptr, - RESOLVE_CONDITIONAL_RENDER_COMP_SPV), - scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} - -void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, - u32 src_offset, bool compare_to_zero) { - const size_t compare_size = compare_to_zero ? 8 : 24; - - compute_pass_descriptor_queue.Acquire(); - compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, compare_size); - compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, sizeof(u32)); - const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; - - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([this, descriptor_data](vk::CommandBuffer cmdbuf) { - static constexpr VkMemoryBarrier read_barrier{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, - }; - static constexpr VkMemoryBarrier write_barrier{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, - }; - const VkDescriptorSet set = descriptor_allocator.Commit(); - device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); - - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); - cmdbuf.Dispatch(1, 1, 1); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); - }); -} - -QueriesPrefixScanPass::QueriesPrefixScanPass( - const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, - ComputePassDescriptorQueue& compute_pass_descriptor_queue_) - : ComputePass( - device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, - QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, - COMPUTE_PUSH_CONSTANT_RANGE, - device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) && - device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) && - device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) && - device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) - ? std::span(QUERIES_PREFIX_SCAN_SUM_COMP_SPV) - : std::span(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV), - {32}), - scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} - -void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, - VkBuffer src_buffer, size_t number_of_sums, - size_t max_accumulation_limit) { - size_t aligned_runs = Common::AlignUp(number_of_sums, 32); - - compute_pass_descriptor_queue.Acquire(); - compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64)); - compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64)); - compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); - const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; - - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums, - aligned_runs](vk::CommandBuffer cmdbuf) { - static constexpr VkMemoryBarrier read_barrier{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, - }; - static constexpr VkMemoryBarrier write_barrier{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | - VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | - VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | - VK_ACCESS_UNIFORM_READ_BIT | - VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, - }; - const QueriesPrefixScanPushConstants uniforms{ - .max_accumulation_base = static_cast(max_accumulation_limit), - .accumulation_limit = static_cast(number_of_sums - 1), - }; - const VkDescriptorSet set = descriptor_allocator.Commit(); - device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); - - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); - cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); - cmdbuf.Dispatch(static_cast(aligned_runs / 32U), 1, 1); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); - }); -} - ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 95df5f894..045a3c941 100755 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -3,7 +3,6 @@ #pragma once -#include #include #include @@ -32,8 +31,7 @@ public: vk::Span bindings, vk::Span templates, const DescriptorBankInfo& bank_info, - vk::Span push_constants, std::span code, - std::optional optional_subgroup_size = std::nullopt); + vk::Span push_constants, std::span code); ~ComputePass(); protected: @@ -84,33 +82,6 @@ private: ComputePassDescriptorQueue& compute_pass_descriptor_queue; }; -class ConditionalRenderingResolvePass final : public ComputePass { -public: - explicit ConditionalRenderingResolvePass( - const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, - ComputePassDescriptorQueue& compute_pass_descriptor_queue_); - - void Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero); - -private: - Scheduler& scheduler; - ComputePassDescriptorQueue& compute_pass_descriptor_queue; -}; - -class QueriesPrefixScanPass final : public ComputePass { -public: - explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_, - DescriptorPool& descriptor_pool_, - ComputePassDescriptorQueue& compute_pass_descriptor_queue_); - - void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, - size_t number_of_sums, size_t max_accumulation_limit); - -private: - Scheduler& scheduler; - ComputePassDescriptorQueue& compute_pass_descriptor_queue; -}; - class ASTCDecoderPass final : public ComputePass { public: explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 594e798fa..65034ea34 100755 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -7,7 +7,6 @@ #include "video_core/fence_manager.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" namespace Core { @@ -21,6 +20,7 @@ class RasterizerInterface; namespace Vulkan { class Device; +class QueryCache; class Scheduler; class InnerFence : public VideoCommon::FenceBase { diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index d7861ba28..b6d604bd5 100755 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -1,1537 +1,139 @@ -// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project -// SPDX-License-Identifier: GPL-3.0-or-later +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later +#include #include -#include -#include -#include -#include -#include -#include #include #include -#include "common/bit_util.h" -#include "common/common_types.h" -#include "core/memory.h" -#include "video_core/engines/draw_manager.h" -#include "video_core/query_cache/query_cache.h" -#include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" -#include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/vulkan_common/vulkan_device.h" -#include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -using Tegra::Engines::Maxwell3D; -using VideoCommon::QueryType; +using VideoCore::QueryType; namespace { -class SamplesQueryBank : public VideoCommon::BankBase { -public: - static constexpr size_t BANK_SIZE = 256; - static constexpr size_t QUERY_SIZE = 8; - explicit SamplesQueryBank(const Device& device_, size_t index_) - : BankBase(BANK_SIZE), device{device_}, index{index_} { - const auto& dev = device.GetLogical(); - query_pool = dev.CreateQueryPool({ - .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .queryType = VK_QUERY_TYPE_OCCLUSION, - .queryCount = BANK_SIZE, - .pipelineStatistics = 0, - }); - Reset(); - } - ~SamplesQueryBank() = default; +constexpr std::array QUERY_TARGETS = {VK_QUERY_TYPE_OCCLUSION}; - void Reset() override { - ASSERT(references == 0); - VideoCommon::BankBase::Reset(); - const auto& dev = device.GetLogical(); - dev.ResetQueryPool(*query_pool, 0, BANK_SIZE); - host_results.fill(0ULL); - next_bank = 0; - } - - void Sync(size_t start, size_t size) { - const auto& dev = device.GetLogical(); - const VkResult query_result = dev.GetQueryResults( - *query_pool, static_cast(start), static_cast(size), sizeof(u64) * size, - &host_results[start], sizeof(u64), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); - switch (query_result) { - case VK_SUCCESS: - return; - case VK_ERROR_DEVICE_LOST: - device.ReportLoss(); - [[fallthrough]]; - default: - throw vk::Exception(query_result); - } - } - - VkQueryPool GetInnerPool() { - return *query_pool; - } - - size_t GetIndex() const { - return index; - } - - const std::array& GetResults() const { - return host_results; - } - - size_t next_bank; - -private: - const Device& device; - const size_t index; - vk::QueryPool query_pool; - std::array host_results; -}; - -using BaseStreamer = VideoCommon::SimpleStreamer; - -struct HostSyncValues { - VAddr address; - size_t size; - size_t offset; - - static constexpr bool GeneratesBaseBuffer = false; -}; - -class SamplesStreamer : public BaseStreamer { -public: - explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, - VideoCore::RasterizerInterface* rasterizer_, const Device& device_, - Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, - ComputePassDescriptorQueue& compute_pass_descriptor_queue, - DescriptorPool& descriptor_pool) - : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, - scheduler{scheduler_}, memory_allocator{memory_allocator_} { - current_bank = nullptr; - current_query = nullptr; - ammend_value = 0; - acumulation_value = 0; - queries_prefix_scan_pass = std::make_unique( - device, scheduler, descriptor_pool, compute_pass_descriptor_queue); - - const VkBufferCreateInfo buffer_ci = { - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .size = 8, - .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .pQueueFamilyIndices = nullptr, - }; - accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { - cmdbuf.FillBuffer(buffer, 0, 8, 0); - }); - } - - ~SamplesStreamer() = default; - - void StartCounter() override { - if (has_started) { - return; - } - ReserveHostQuery(); - scheduler.Record([query_pool = current_query_pool, - query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { - const bool use_precise = Settings::IsGPULevelHigh(); - cmdbuf.BeginQuery(query_pool, static_cast(query_index), - use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0); - }); - has_started = true; - } - - void PauseCounter() override { - if (!has_started) { - return; - } - scheduler.Record([query_pool = current_query_pool, - query_index = current_bank_slot](vk::CommandBuffer cmdbuf) { - cmdbuf.EndQuery(query_pool, static_cast(query_index)); - }); - has_started = false; - } - - void ResetCounter() override { - if (has_started) { - PauseCounter(); - } - AbandonCurrentQuery(); - std::function func([this, counts = pending_flush_queries.size()] { - ammend_value = 0; - acumulation_value = 0; - }); - rasterizer->SyncOperation(std::move(func)); - accumulation_since_last_sync = false; - last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used); - } - - void CloseCounter() override { - PauseCounter(); - } - - bool HasPendingSync() const override { - return !pending_sync.empty(); - } - - void SyncWrites() override { - if (sync_values_stash.empty()) { - return; - } - - for (size_t i = 0; i < sync_values_stash.size(); i++) { - runtime.template SyncValues(sync_values_stash[i], - *buffers[resolve_buffers[i]]); - } - - sync_values_stash.clear(); - } - - void PresyncWrites() override { - if (pending_sync.empty()) { - return; - } - PauseCounter(); - sync_values_stash.clear(); - sync_values_stash.emplace_back(); - std::vector* sync_values = &sync_values_stash.back(); - sync_values->reserve(num_slots_used); - std::unordered_map> offsets; - resolve_buffers.clear(); - size_t resolve_buffer_index = ObtainBuffer(num_slots_used); - resolve_buffers.push_back(resolve_buffer_index); - size_t base_offset = 0; - - ApplyBanksWideOp(pending_sync, [&](SamplesQueryBank* bank, size_t start, - size_t amount) { - size_t bank_id = bank->GetIndex(); - auto& resolve_buffer = buffers[resolve_buffer_index]; - VkQueryPool query_pool = bank->GetInnerPool(); - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([start, amount, base_offset, query_pool, - buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { - const VkBufferMemoryBarrier copy_query_pool_barrier{ - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = buffer, - .offset = base_offset, - .size = amount * SamplesQueryBank::QUERY_SIZE, - }; - - cmdbuf.CopyQueryPoolResults( - query_pool, static_cast(start), static_cast(amount), buffer, - static_cast(base_offset), SamplesQueryBank::QUERY_SIZE, - VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier); - }); - offsets[bank_id] = {start, base_offset}; - base_offset += amount * SamplesQueryBank::QUERY_SIZE; - }); - - // Convert queries - bool has_multi_queries = false; - for (auto q : pending_sync) { - auto* query = GetQuery(q); - size_t sync_value_slot = 0; - if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { - continue; - } - if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { - continue; - } - if (accumulation_since_last_sync || query->size_slots > 1) { - if (!has_multi_queries) { - has_multi_queries = true; - sync_values_stash.emplace_back(); - } - sync_value_slot = 1; - } - query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; - auto loc_data = offsets[query->start_bank_id]; - sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{ - .address = query->guest_address, - .size = SamplesQueryBank::QUERY_SIZE, - .offset = - loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) * - SamplesQueryBank::QUERY_SIZE, - }); - } - - if (has_multi_queries) { - size_t intermediary_buffer_index = ObtainBuffer(num_slots_used); - resolve_buffers.push_back(intermediary_buffer_index); - queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], - *buffers[resolve_buffer_index], num_slots_used, - std::min(last_accumulation_checkpoint, num_slots_used)); - } else { - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { - cmdbuf.FillBuffer(buffer, 0, 8, 0); - }); - } - - ReplicateCurrentQueryIfNeeded(); - std::function func([this] { ammend_value = acumulation_value; }); - rasterizer->SyncOperation(std::move(func)); - AbandonCurrentQuery(); - num_slots_used = 0; - last_accumulation_checkpoint = std::numeric_limits::max(); - accumulation_since_last_sync = has_multi_queries; - pending_sync.clear(); - } - - size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, - [[maybe_unused]] std::optional subreport) override { - PauseCounter(); - auto index = BuildQuery(); - auto* new_query = GetQuery(index); - new_query->guest_address = address; - new_query->value = 0; - new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; - if (has_timestamp) { - new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; - } - if (!current_query) { - new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; - return index; - } - new_query->start_bank_id = current_query->start_bank_id; - new_query->size_banks = current_query->size_banks; - new_query->start_slot = current_query->start_slot; - new_query->size_slots = current_query->size_slots; - ApplyBankOp(new_query, [](SamplesQueryBank* bank, size_t start, size_t amount) { - bank->AddReference(amount); - }); - pending_sync.push_back(index); - pending_flush_queries.push_back(index); - return index; - } - - bool HasUnsyncedQueries() const override { - return !pending_flush_queries.empty(); - } - - void PushUnsyncedQueries() override { - PauseCounter(); - current_bank->Close(); - { - std::scoped_lock lk(flush_guard); - pending_flush_sets.emplace_back(std::move(pending_flush_queries)); - } - } - - void PopUnsyncedQueries() override { - std::vector current_flush_queries; - { - std::scoped_lock lk(flush_guard); - current_flush_queries = std::move(pending_flush_sets.front()); - pending_flush_sets.pop_front(); - } - ApplyBanksWideOp( - current_flush_queries, - [](SamplesQueryBank* bank, size_t start, size_t amount) { bank->Sync(start, amount); }); - for (auto q : current_flush_queries) { - auto* query = GetQuery(q); - u64 total = 0; - ApplyBankOp(query, [&total](SamplesQueryBank* bank, size_t start, size_t amount) { - const auto& results = bank->GetResults(); - for (size_t i = 0; i < amount; i++) { - total += results[start + i]; - } - }); - query->value = total; - query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; - } - } - -private: - template - void ApplyBankOp(VideoCommon::HostQueryBase* query, Func&& func) { - size_t size_slots = query->size_slots; - if (size_slots == 0) { - return; - } - size_t bank_id = query->start_bank_id; - size_t banks_set = query->size_banks; - size_t start_slot = query->start_slot; - for (size_t i = 0; i < banks_set; i++) { - auto& the_bank = bank_pool.GetBank(bank_id); - size_t amount = std::min(the_bank.Size() - start_slot, size_slots); - func(&the_bank, start_slot, amount); - bank_id = the_bank.next_bank - 1; - start_slot = 0; - size_slots -= amount; - } - } - - template - void ApplyBanksWideOp(std::vector& queries, Func&& func) { - std::conditional_t>, - std::unordered_map>> - indexer; - for (auto q : queries) { - auto* query = GetQuery(q); - ApplyBankOp(query, [&indexer](SamplesQueryBank* bank, size_t start, size_t amount) { - auto id_ = bank->GetIndex(); - auto pair = indexer.try_emplace(id_, std::numeric_limits::max(), - std::numeric_limits::min()); - auto& current_pair = pair.first->second; - current_pair.first = std::min(current_pair.first, start); - current_pair.second = std::max(current_pair.second, amount + start); - }); - } - for (auto& cont : indexer) { - func(&bank_pool.GetBank(cont.first), cont.second.first, - cont.second.second - cont.second.first); - } - } - - void ReserveBank() { - current_bank_id = - bank_pool.ReserveBank([this](std::deque& queue, size_t index) { - queue.emplace_back(device, index); - }); - if (current_bank) { - current_bank->next_bank = current_bank_id + 1; - } - current_bank = &bank_pool.GetBank(current_bank_id); - current_query_pool = current_bank->GetInnerPool(); - } - - size_t ReserveBankSlot() { - if (!current_bank || current_bank->IsClosed()) { - ReserveBank(); - } - auto [built, index] = current_bank->Reserve(); - current_bank_slot = index; - return index; - } - - void ReserveHostQuery() { - size_t new_slot = ReserveBankSlot(); - current_bank->AddReference(1); - num_slots_used++; - if (current_query) { - size_t bank_id = current_query->start_bank_id; - size_t banks_set = current_query->size_banks - 1; - bool found = bank_id == current_bank_id; - while (!found && banks_set > 0) { - SamplesQueryBank& some_bank = bank_pool.GetBank(bank_id); - bank_id = some_bank.next_bank - 1; - found = bank_id == current_bank_id; - banks_set--; - } - if (!found) { - current_query->size_banks++; - } - current_query->size_slots++; - } else { - current_query_id = BuildQuery(); - current_query = GetQuery(current_query_id); - current_query->start_bank_id = static_cast(current_bank_id); - current_query->size_banks = 1; - current_query->start_slot = new_slot; - current_query->size_slots = 1; - } - } - - void Free(size_t query_id) override { - std::scoped_lock lk(guard); - auto* query = GetQuery(query_id); - ApplyBankOp(query, [](SamplesQueryBank* bank, size_t start, size_t amount) { - bank->CloseReference(amount); - }); - ReleaseQuery(query_id); - } - - void AbandonCurrentQuery() { - if (!current_query) { - return; - } - Free(current_query_id); - current_query = nullptr; - current_query_id = 0; - } - - void ReplicateCurrentQueryIfNeeded() { - if (pending_sync.empty()) { - return; - } - if (!current_query) { - return; - } - auto index = BuildQuery(); - auto* new_query = GetQuery(index); - new_query->guest_address = 0; - new_query->value = 0; - new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; - new_query->start_bank_id = current_query->start_bank_id; - new_query->size_banks = current_query->size_banks; - new_query->start_slot = current_query->start_slot; - new_query->size_slots = current_query->size_slots; - ApplyBankOp(new_query, [](SamplesQueryBank* bank, size_t start, size_t amount) { - bank->AddReference(amount); - }); - pending_flush_queries.push_back(index); - std::function func([this, index] { - auto* query = GetQuery(index); - query->value += GetAmmendValue(); - SetAccumulationValue(query->value); - Free(index); - }); - } - - template - size_t ObtainBuffer(size_t num_needed) { - const size_t log_2 = std::max(6U, Common::Log2Ceil64(num_needed)); - if constexpr (is_resolve) { - if (resolve_table[log_2] != 0) { - return resolve_table[log_2] - 1; - } - } else { - if (intermediary_table[log_2] != 0) { - return intermediary_table[log_2] - 1; - } - } - const VkBufferCreateInfo buffer_ci = { - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2), - .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .pQueueFamilyIndices = nullptr, - }; - buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); - if constexpr (is_resolve) { - resolve_table[log_2] = buffers.size(); - } else { - intermediary_table[log_2] = buffers.size(); - } - return buffers.size() - 1; - } - - QueryCacheRuntime& runtime; - VideoCore::RasterizerInterface* rasterizer; - const Device& device; - Scheduler& scheduler; - const MemoryAllocator& memory_allocator; - VideoCommon::BankPool bank_pool; - std::deque buffers; - std::array resolve_table{}; - std::array intermediary_table{}; - vk::Buffer accumulation_buffer; - std::deque> sync_values_stash; - std::vector resolve_buffers; - - // syncing queue - std::vector pending_sync; - - // flush levels - std::vector pending_flush_queries; - std::deque> pending_flush_sets; - - // State Machine - size_t current_bank_slot; - size_t current_bank_id; - SamplesQueryBank* current_bank; - VkQueryPool current_query_pool; - size_t current_query_id; - size_t num_slots_used{}; - size_t last_accumulation_checkpoint{}; - bool accumulation_since_last_sync{}; - VideoCommon::HostQueryBase* current_query; - bool has_started{}; - std::mutex flush_guard; - - std::unique_ptr queries_prefix_scan_pass; -}; - -// Transform feedback queries -class TFBQueryBank : public VideoCommon::BankBase { -public: - static constexpr size_t BANK_SIZE = 1024; - static constexpr size_t QUERY_SIZE = 4; - explicit TFBQueryBank(Scheduler& scheduler_, const MemoryAllocator& memory_allocator, - size_t index_) - : BankBase(BANK_SIZE), scheduler{scheduler_}, index{index_} { - const VkBufferCreateInfo buffer_ci = { - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .size = QUERY_SIZE * BANK_SIZE, - .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .pQueueFamilyIndices = nullptr, - }; - buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); - } - - ~TFBQueryBank() = default; - - void Reset() override { - ASSERT(references == 0); - VideoCommon::BankBase::Reset(); - } - - void Sync(StagingBufferRef& stagging_buffer, size_t extra_offset, size_t start, size_t size) { - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([this, dst_buffer = stagging_buffer.buffer, extra_offset, start, - size](vk::CommandBuffer cmdbuf) { - std::array copy{VkBufferCopy{ - .srcOffset = start * QUERY_SIZE, - .dstOffset = extra_offset, - .size = size * QUERY_SIZE, - }}; - cmdbuf.CopyBuffer(*buffer, dst_buffer, copy); - }); - } - - size_t GetIndex() const { - return index; - } - - VkBuffer GetBuffer() const { - return *buffer; - } - -private: - Scheduler& scheduler; - const size_t index; - vk::Buffer buffer; -}; - -class PrimitivesSucceededStreamer; - -class TFBCounterStreamer : public BaseStreamer { -public: - explicit TFBCounterStreamer(size_t id_, QueryCacheRuntime& runtime_, const Device& device_, - Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, - StagingBufferPool& staging_pool_) - : BaseStreamer(id_), runtime{runtime_}, device{device_}, scheduler{scheduler_}, - memory_allocator{memory_allocator_}, staging_pool{staging_pool_} { - buffers_count = 0; - current_bank = nullptr; - counter_buffers.fill(VK_NULL_HANDLE); - offsets.fill(0); - last_queries.fill(0); - last_queries_stride.fill(1); - const VkBufferCreateInfo buffer_ci = { - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .size = TFBQueryBank::QUERY_SIZE * NUM_STREAMS, - .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | - VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_COUNTER_BUFFER_BIT_EXT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .pQueueFamilyIndices = nullptr, - }; - - counters_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); - for (auto& c : counter_buffers) { - c = *counters_buffer; - } - size_t base_offset = 0; - for (auto& o : offsets) { - o = base_offset; - base_offset += TFBQueryBank::QUERY_SIZE; - } - } - - ~TFBCounterStreamer() = default; - - void StartCounter() override { - FlushBeginTFB(); - has_started = true; - } - - void PauseCounter() override { - CloseCounter(); - } - - void ResetCounter() override { - CloseCounter(); - } - - void CloseCounter() override { - if (has_flushed_end_pending) { - FlushEndTFB(); - } - runtime.View3DRegs([this](Maxwell3D& maxwell3d) { - if (maxwell3d.regs.transform_feedback_enabled == 0) { - streams_mask = 0; - has_started = false; - } - }); - } - - bool HasPendingSync() const override { - return !pending_sync.empty(); - } - - void SyncWrites() override { - CloseCounter(); - std::unordered_map> sync_values_stash; - for (auto q : pending_sync) { - auto* query = GetQuery(q); - if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { - continue; - } - if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { - continue; - } - query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; - sync_values_stash.try_emplace(query->start_bank_id); - sync_values_stash[query->start_bank_id].emplace_back(HostSyncValues{ - .address = query->guest_address, - .size = TFBQueryBank::QUERY_SIZE, - .offset = query->start_slot * TFBQueryBank::QUERY_SIZE, - }); - } - for (auto& p : sync_values_stash) { - auto& bank = bank_pool.GetBank(p.first); - runtime.template SyncValues(p.second, bank.GetBuffer()); - } - pending_sync.clear(); - } - - size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, - std::optional subreport_) override { - auto index = BuildQuery(); - auto* new_query = GetQuery(index); - new_query->guest_address = address; - new_query->value = 0; - new_query->flags &= ~VideoCommon::QueryFlagBits::IsOrphan; - if (has_timestamp) { - new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; - } - if (!subreport_) { - new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; - return index; - } - const size_t subreport = static_cast(*subreport_); - last_queries[subreport] = address; - if ((streams_mask & (1ULL << subreport)) == 0) { - new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; - return index; - } - CloseCounter(); - auto [bank_slot, data_slot] = ProduceCounterBuffer(subreport); - new_query->start_bank_id = static_cast(bank_slot); - new_query->size_banks = 1; - new_query->start_slot = static_cast(data_slot); - new_query->size_slots = 1; - pending_sync.push_back(index); - pending_flush_queries.push_back(index); - return index; - } - - std::optional> GetLastQueryStream(size_t stream) { - if (last_queries[stream] != 0) { - std::pair result(last_queries[stream], last_queries_stride[stream]); - return result; - } - return std::nullopt; - } - - Maxwell3D::Regs::PrimitiveTopology GetOutputTopology() const { - return out_topology; - } - - bool HasUnsyncedQueries() const override { - return !pending_flush_queries.empty(); - } - - void PushUnsyncedQueries() override { - CloseCounter(); - auto staging_ref = staging_pool.Request( - pending_flush_queries.size() * TFBQueryBank::QUERY_SIZE, MemoryUsage::Download, true); - size_t offset_base = staging_ref.offset; - for (auto q : pending_flush_queries) { - auto* query = GetQuery(q); - auto& bank = bank_pool.GetBank(query->start_bank_id); - bank.Sync(staging_ref, offset_base, query->start_slot, 1); - offset_base += TFBQueryBank::QUERY_SIZE; - bank.CloseReference(); - } - static constexpr VkMemoryBarrier WRITE_BARRIER{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, - }; - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([](vk::CommandBuffer cmdbuf) { - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); - }); - - std::scoped_lock lk(flush_guard); - for (auto& str : free_queue) { - staging_pool.FreeDeferred(str); - } - free_queue.clear(); - download_buffers.emplace_back(staging_ref); - pending_flush_sets.emplace_back(std::move(pending_flush_queries)); - } - - void PopUnsyncedQueries() override { - StagingBufferRef staging_ref; - std::vector flushed_queries; - { - std::scoped_lock lk(flush_guard); - staging_ref = download_buffers.front(); - flushed_queries = std::move(pending_flush_sets.front()); - download_buffers.pop_front(); - pending_flush_sets.pop_front(); - } - - size_t offset_base = staging_ref.offset; - for (auto q : flushed_queries) { - auto* query = GetQuery(q); - u32 result = 0; - std::memcpy(&result, staging_ref.mapped_span.data() + offset_base, sizeof(u32)); - query->value = static_cast(result); - query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; - offset_base += TFBQueryBank::QUERY_SIZE; - } - - { - std::scoped_lock lk(flush_guard); - free_queue.emplace_back(staging_ref); - } - } - -private: - void FlushBeginTFB() { - if (has_flushed_end_pending) [[unlikely]] { - return; - } - has_flushed_end_pending = true; - if (!has_started || buffers_count == 0) { - scheduler.Record([](vk::CommandBuffer cmdbuf) { - cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); - }); - UpdateBuffers(); - return; - } - scheduler.Record([this, total = static_cast(buffers_count)](vk::CommandBuffer cmdbuf) { - cmdbuf.BeginTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data()); - }); - UpdateBuffers(); - } - - void FlushEndTFB() { - if (!has_flushed_end_pending) [[unlikely]] { - UNREACHABLE(); - return; - } - has_flushed_end_pending = false; - - if (buffers_count == 0) { - scheduler.Record([](vk::CommandBuffer cmdbuf) { - cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); - }); - } else { - scheduler.Record([this, - total = static_cast(buffers_count)](vk::CommandBuffer cmdbuf) { - cmdbuf.EndTransformFeedbackEXT(0, total, counter_buffers.data(), offsets.data()); - }); - } - } - - void UpdateBuffers() { - last_queries.fill(0); - last_queries_stride.fill(1); - runtime.View3DRegs([this](Maxwell3D& maxwell3d) { - buffers_count = 0; - out_topology = maxwell3d.draw_manager->GetDrawState().topology; - for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) { - const auto& tf = maxwell3d.regs.transform_feedback; - if (tf.buffers[i].enable == 0) { - continue; - } - const size_t stream = tf.controls[i].stream; - last_queries_stride[stream] = tf.controls[i].stride; - streams_mask |= 1ULL << stream; - buffers_count = std::max(buffers_count, stream + 1); - } - }); - } - - std::pair ProduceCounterBuffer(size_t stream) { - if (current_bank == nullptr || current_bank->IsClosed()) { - current_bank_id = - bank_pool.ReserveBank([this](std::deque& queue, size_t index) { - queue.emplace_back(scheduler, memory_allocator, index); - }); - current_bank = &bank_pool.GetBank(current_bank_id); - } - auto [dont_care, other] = current_bank->Reserve(); - const size_t slot = other; // workaround to compile bug. - current_bank->AddReference(); - - static constexpr VkMemoryBarrier READ_BARRIER{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT, - .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, - }; - static constexpr VkMemoryBarrier WRITE_BARRIER{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT, - }; - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([dst_buffer = current_bank->GetBuffer(), - src_buffer = counter_buffers[stream], src_offset = offsets[stream], - slot](vk::CommandBuffer cmdbuf) { - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); - std::array copy{VkBufferCopy{ - .srcOffset = src_offset, - .dstOffset = slot * TFBQueryBank::QUERY_SIZE, - .size = TFBQueryBank::QUERY_SIZE, - }}; - cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - 0, WRITE_BARRIER); - }); - return {current_bank_id, slot}; - } - - friend class PrimitivesSucceededStreamer; - - static constexpr size_t NUM_STREAMS = 4; - - QueryCacheRuntime& runtime; - const Device& device; - Scheduler& scheduler; - const MemoryAllocator& memory_allocator; - StagingBufferPool& staging_pool; - VideoCommon::BankPool bank_pool; - size_t current_bank_id; - TFBQueryBank* current_bank; - vk::Buffer counters_buffer; - - // syncing queue - std::vector pending_sync; - - // flush levels - std::vector pending_flush_queries; - std::deque download_buffers; - std::deque> pending_flush_sets; - std::vector free_queue; - std::mutex flush_guard; - - // state machine - bool has_started{}; - bool has_flushed_end_pending{}; - size_t buffers_count{}; - std::array counter_buffers{}; - std::array offsets{}; - std::array last_queries; - std::array last_queries_stride; - Maxwell3D::Regs::PrimitiveTopology out_topology; - u64 streams_mask; -}; - -class PrimitivesQueryBase : public VideoCommon::QueryBase { -public: - // Default constructor - PrimitivesQueryBase() - : VideoCommon::QueryBase(0, VideoCommon::QueryFlagBits::IsHostManaged, 0) {} - - // Parameterized constructor - PrimitivesQueryBase(bool has_timestamp, VAddr address) - : VideoCommon::QueryBase(address, VideoCommon::QueryFlagBits::IsHostManaged, 0) { - if (has_timestamp) { - flags |= VideoCommon::QueryFlagBits::HasTimestamp; - } - } - - u64 stride{}; - VAddr dependant_address{}; - Maxwell3D::Regs::PrimitiveTopology topology{Maxwell3D::Regs::PrimitiveTopology::Points}; - size_t dependant_index{}; - bool dependant_manage{}; -}; - -class PrimitivesSucceededStreamer : public VideoCommon::SimpleStreamer { -public: - explicit PrimitivesSucceededStreamer(size_t id_, QueryCacheRuntime& runtime_, - TFBCounterStreamer& tfb_streamer_, - Core::Memory::Memory& cpu_memory_) - : VideoCommon::SimpleStreamer(id_), runtime{runtime_}, - tfb_streamer{tfb_streamer_}, cpu_memory{cpu_memory_} { - MakeDependent(&tfb_streamer); - } - - ~PrimitivesSucceededStreamer() = default; - - size_t WriteCounter(VAddr address, bool has_timestamp, u32 value, - std::optional subreport_) override { - auto index = BuildQuery(); - auto* new_query = GetQuery(index); - new_query->guest_address = address; - new_query->value = 0; - if (has_timestamp) { - new_query->flags |= VideoCommon::QueryFlagBits::HasTimestamp; - } - if (!subreport_) { - new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; - return index; - } - const size_t subreport = static_cast(*subreport_); - auto dependant_address_opt = tfb_streamer.GetLastQueryStream(subreport); - bool must_manage_dependance = false; - new_query->topology = tfb_streamer.GetOutputTopology(); - if (dependant_address_opt) { - auto [dep_address, stride] = *dependant_address_opt; - new_query->dependant_address = dep_address; - new_query->stride = stride; - } else { - new_query->dependant_index = - tfb_streamer.WriteCounter(address, has_timestamp, value, subreport_); - auto* dependant_query = tfb_streamer.GetQuery(new_query->dependant_index); - dependant_query->flags |= VideoCommon::QueryFlagBits::IsInvalidated; - must_manage_dependance = true; - if (True(dependant_query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) { - new_query->value = 0; - new_query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; - if (must_manage_dependance) { - tfb_streamer.Free(new_query->dependant_index); - } - return index; - } - new_query->stride = 1; - runtime.View3DRegs([new_query, subreport](Maxwell3D& maxwell3d) { - for (size_t i = 0; i < Maxwell3D::Regs::NumTransformFeedbackBuffers; i++) { - const auto& tf = maxwell3d.regs.transform_feedback; - if (tf.buffers[i].enable == 0) { - continue; - } - if (tf.controls[i].stream != subreport) { - continue; - } - new_query->stride = tf.controls[i].stride; - break; - } - }); - } - - new_query->dependant_manage = must_manage_dependance; - pending_flush_queries.push_back(index); - return index; - } - - bool HasUnsyncedQueries() const override { - return !pending_flush_queries.empty(); - } - - void PushUnsyncedQueries() override { - std::scoped_lock lk(flush_guard); - pending_flush_sets.emplace_back(std::move(pending_flush_queries)); - pending_flush_queries.clear(); - } - - void PopUnsyncedQueries() override { - std::vector flushed_queries; - { - std::scoped_lock lk(flush_guard); - flushed_queries = std::move(pending_flush_sets.front()); - pending_flush_sets.pop_front(); - } - - for (auto q : flushed_queries) { - auto* query = GetQuery(q); - if (True(query->flags & VideoCommon::QueryFlagBits::IsFinalValueSynced)) { - continue; - } - - query->flags |= VideoCommon::QueryFlagBits::IsFinalValueSynced; - u64 num_vertices = 0; - if (query->dependant_manage) { - auto* dependant_query = tfb_streamer.GetQuery(query->dependant_index); - num_vertices = dependant_query->value / query->stride; - tfb_streamer.Free(query->dependant_index); - } else { - u8* pointer = cpu_memory.GetPointer(query->dependant_address); - u32 result; - std::memcpy(&result, pointer, sizeof(u32)); - num_vertices = static_cast(result) / query->stride; - } - query->value = [&]() -> u64 { - switch (query->topology) { - case Maxwell3D::Regs::PrimitiveTopology::Points: - return num_vertices; - case Maxwell3D::Regs::PrimitiveTopology::Lines: - return num_vertices / 2; - case Maxwell3D::Regs::PrimitiveTopology::LineLoop: - return (num_vertices / 2) + 1; - case Maxwell3D::Regs::PrimitiveTopology::LineStrip: - return num_vertices - 1; - case Maxwell3D::Regs::PrimitiveTopology::Patches: - case Maxwell3D::Regs::PrimitiveTopology::Triangles: - case Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency: - return num_vertices / 3; - case Maxwell3D::Regs::PrimitiveTopology::TriangleFan: - case Maxwell3D::Regs::PrimitiveTopology::TriangleStrip: - case Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: - return num_vertices - 2; - case Maxwell3D::Regs::PrimitiveTopology::Quads: - return num_vertices / 4; - case Maxwell3D::Regs::PrimitiveTopology::Polygon: - return 1U; - default: - return num_vertices; - } - }(); - } - } - -private: - QueryCacheRuntime& runtime; - TFBCounterStreamer& tfb_streamer; - Core::Memory::Memory& cpu_memory; - - // syncing queue - std::vector pending_sync; - - // flush levels - std::vector pending_flush_queries; - std::deque> pending_flush_sets; - std::mutex flush_guard; -}; - -} // namespace - -struct QueryCacheRuntimeImpl { - QueryCacheRuntimeImpl(QueryCacheRuntime& runtime, VideoCore::RasterizerInterface* rasterizer_, - Core::Memory::Memory& cpu_memory_, Vulkan::BufferCache& buffer_cache_, - const Device& device_, const MemoryAllocator& memory_allocator_, - Scheduler& scheduler_, StagingBufferPool& staging_pool_, - ComputePassDescriptorQueue& compute_pass_descriptor_queue, - DescriptorPool& descriptor_pool) - : rasterizer{rasterizer_}, cpu_memory{cpu_memory_}, - buffer_cache{buffer_cache_}, device{device_}, - memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, - guest_streamer(0, runtime), - sample_streamer(static_cast(QueryType::ZPassPixelCount64), runtime, rasterizer, - device, scheduler, memory_allocator, compute_pass_descriptor_queue, - descriptor_pool), - tfb_streamer(static_cast(QueryType::StreamingByteCount), runtime, device, - scheduler, memory_allocator, staging_pool), - primitives_succeeded_streamer( - static_cast(QueryType::StreamingPrimitivesSucceeded), runtime, tfb_streamer, - cpu_memory_), - primitives_needed_minus_suceeded_streamer( - static_cast(QueryType::StreamingPrimitivesNeededMinusSucceeded), runtime, 0u), - hcr_setup{}, hcr_is_set{}, is_hcr_running{} { - - hcr_setup.sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT; - hcr_setup.pNext = nullptr; - hcr_setup.flags = 0; - - conditional_resolve_pass = std::make_unique( - device, scheduler, descriptor_pool, compute_pass_descriptor_queue); - - const VkBufferCreateInfo buffer_ci = { - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .size = sizeof(u32), - .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .pQueueFamilyIndices = nullptr, - }; - hcr_resolve_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); - } - - VideoCore::RasterizerInterface* rasterizer; - Core::Memory::Memory& cpu_memory; - Vulkan::BufferCache& buffer_cache; - - const Device& device; - const MemoryAllocator& memory_allocator; - Scheduler& scheduler; - StagingBufferPool& staging_pool; - - // Streamers - VideoCommon::GuestStreamer guest_streamer; - SamplesStreamer sample_streamer; - TFBCounterStreamer tfb_streamer; - PrimitivesSucceededStreamer primitives_succeeded_streamer; - VideoCommon::StubStreamer primitives_needed_minus_suceeded_streamer; - - std::vector> little_cache; - std::vector> buffers_to_upload_to; - std::vector redirect_cache; - std::vector> copies_setup; - - // Host conditional rendering data - std::unique_ptr conditional_resolve_pass; - vk::Buffer hcr_resolve_buffer; - VkConditionalRenderingBeginInfoEXT hcr_setup; - VkBuffer hcr_buffer; - size_t hcr_offset; - bool hcr_is_set; - bool is_hcr_running; - - // maxwell3d - Maxwell3D* maxwell3d; -}; - -QueryCacheRuntime::QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, - Core::Memory::Memory& cpu_memory_, - Vulkan::BufferCache& buffer_cache_, const Device& device_, - const MemoryAllocator& memory_allocator_, - Scheduler& scheduler_, StagingBufferPool& staging_pool_, - ComputePassDescriptorQueue& compute_pass_descriptor_queue, - DescriptorPool& descriptor_pool) { - impl = std::make_unique( - *this, rasterizer, cpu_memory_, buffer_cache_, device_, memory_allocator_, scheduler_, - staging_pool_, compute_pass_descriptor_queue, descriptor_pool); +constexpr VkQueryType GetTarget(QueryType type) { + return QUERY_TARGETS[static_cast(type)]; } -void QueryCacheRuntime::Bind3DEngine(Maxwell3D* maxwell3d) { - impl->maxwell3d = maxwell3d; +} // Anonymous namespace + +QueryPool::QueryPool(const Device& device_, Scheduler& scheduler, QueryType type_) + : ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {} + +QueryPool::~QueryPool() = default; + +std::pair QueryPool::Commit() { + std::size_t index; + do { + index = CommitResource(); + } while (usage[index]); + usage[index] = true; + + return {*pools[index / GROW_STEP], static_cast(index % GROW_STEP)}; } -template -void QueryCacheRuntime::View3DRegs(Func&& func) { - func(*impl->maxwell3d); +void QueryPool::Allocate(std::size_t begin, std::size_t end) { + usage.resize(end); + + pools.push_back(device.GetLogical().CreateQueryPool({ + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .queryType = GetTarget(type), + .queryCount = static_cast(end - begin), + .pipelineStatistics = 0, + })); } -void QueryCacheRuntime::EndHostConditionalRendering() { - PauseHostConditionalRendering(); - impl->hcr_is_set = false; - impl->is_hcr_running = false; - impl->hcr_buffer = nullptr; - impl->hcr_offset = 0; -} - -void QueryCacheRuntime::PauseHostConditionalRendering() { - if (!impl->hcr_is_set) { - return; - } - if (impl->is_hcr_running) { - impl->scheduler.Record( - [](vk::CommandBuffer cmdbuf) { cmdbuf.EndConditionalRenderingEXT(); }); - } - impl->is_hcr_running = false; -} - -void QueryCacheRuntime::ResumeHostConditionalRendering() { - if (!impl->hcr_is_set) { - return; - } - if (!impl->is_hcr_running) { - impl->scheduler.Record([hcr_setup = impl->hcr_setup](vk::CommandBuffer cmdbuf) { - cmdbuf.BeginConditionalRenderingEXT(hcr_setup); +void QueryPool::Reserve(std::pair query) { + const auto it = + std::find_if(pools.begin(), pools.end(), [query_pool = query.first](vk::QueryPool& pool) { + return query_pool == *pool; }); + + if (it != std::end(pools)) { + const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); + usage[pool_index * GROW_STEP + static_cast(query.second)] = false; } - impl->is_hcr_running = true; } -void QueryCacheRuntime::HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, - bool is_equal) { - { - std::scoped_lock lk(impl->buffer_cache.mutex); - static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; - const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; - const auto [buffer, offset] = - impl->buffer_cache.ObtainCPUBuffer(object.address, 8, sync_info, post_op); - impl->hcr_buffer = buffer->Handle(); - impl->hcr_offset = offset; +QueryCache::QueryCache(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, const Device& device_, + Scheduler& scheduler_) + : QueryCacheBase{rasterizer_, cpu_memory_}, device{device_}, scheduler{scheduler_}, + query_pools{ + QueryPool{device_, scheduler_, QueryType::SamplesPassed}, + } {} + +QueryCache::~QueryCache() { + // TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class + // destructor is called. The query cache should be redesigned to have a proper ownership model + // instead of using shared pointers. + for (size_t query_type = 0; query_type < VideoCore::NumQueryTypes; ++query_type) { + auto& stream = Stream(static_cast(query_type)); + stream.Update(false); + stream.Reset(); } - if (impl->hcr_is_set) { - if (impl->hcr_setup.buffer == impl->hcr_buffer && - impl->hcr_setup.offset == impl->hcr_offset) { - ResumeHostConditionalRendering(); - return; - } - PauseHostConditionalRendering(); - } - impl->hcr_setup.buffer = impl->hcr_buffer; - impl->hcr_setup.offset = impl->hcr_offset; - impl->hcr_setup.flags = is_equal ? VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT : 0; - impl->hcr_is_set = true; - impl->is_hcr_running = false; - ResumeHostConditionalRendering(); } -void QueryCacheRuntime::HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal) { - VkBuffer to_resolve; - u32 to_resolve_offset; - { - std::scoped_lock lk(impl->buffer_cache.mutex); - static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::NoSynchronize; - const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; - const auto [buffer, offset] = - impl->buffer_cache.ObtainCPUBuffer(address, 24, sync_info, post_op); - to_resolve = buffer->Handle(); - to_resolve_offset = static_cast(offset); - } - if (impl->is_hcr_running) { - PauseHostConditionalRendering(); - } - impl->conditional_resolve_pass->Resolve(*impl->hcr_resolve_buffer, to_resolve, - to_resolve_offset, false); - impl->hcr_setup.buffer = *impl->hcr_resolve_buffer; - impl->hcr_setup.offset = 0; - impl->hcr_setup.flags = is_equal ? 0 : VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; - impl->hcr_is_set = true; - impl->is_hcr_running = false; - ResumeHostConditionalRendering(); +std::pair QueryCache::AllocateQuery(QueryType type) { + return query_pools[static_cast(type)].Commit(); } -bool QueryCacheRuntime::HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, - [[maybe_unused]] bool qc_dirty) { - if (!impl->device.IsExtConditionalRendering()) { - return false; - } - HostConditionalRenderingCompareValueImpl(object_1, false); - return true; +void QueryCache::Reserve(QueryType type, std::pair query) { + query_pools[static_cast(type)].Reserve(query); } -bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, - VideoCommon::LookupData object_2, - bool qc_dirty, bool equal_check) { - if (!impl->device.IsExtConditionalRendering()) { - return false; - } - - const auto check_in_bc = [&](VAddr address) { - return impl->buffer_cache.IsRegionGpuModified(address, 8); - }; - const auto check_value = [&](VAddr address) { - u8* ptr = impl->cpu_memory.GetPointer(address); - u64 value{}; - std::memcpy(&value, ptr, sizeof(value)); - return value == 0; - }; - std::array objects{&object_1, &object_2}; - std::array is_in_bc{}; - std::array is_in_qc{}; - std::array is_in_ac{}; - std::array is_null{}; - { - std::scoped_lock lk(impl->buffer_cache.mutex); - for (size_t i = 0; i < 2; i++) { - is_in_qc[i] = objects[i]->found_query != nullptr; - is_in_bc[i] = !is_in_qc[i] && check_in_bc(objects[i]->address); - is_in_ac[i] = is_in_qc[i] || is_in_bc[i]; - } - } - - if (!is_in_ac[0] && !is_in_ac[1]) { - EndHostConditionalRendering(); - return false; - } - - if (!qc_dirty && !is_in_bc[0] && !is_in_bc[1]) { - EndHostConditionalRendering(); - return false; - } - - for (size_t i = 0; i < 2; i++) { - is_null[i] = !is_in_ac[i] && check_value(objects[i]->address); - } - - for (size_t i = 0; i < 2; i++) { - if (is_null[i]) { - size_t j = (i + 1) % 2; - HostConditionalRenderingCompareValueImpl(*objects[j], equal_check); - return true; - } - } - if (!is_in_bc[0] && !is_in_bc[1]) { - // Both queries are in query cache, it's best to just flush. - return true; - } - HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); - return true; +HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr dependency_, + QueryType type_) + : HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_}, + query{cache_.AllocateQuery(type_)}, tick{cache_.GetScheduler().CurrentTick()} { + const vk::Device* logical = &cache.GetDevice().GetLogical(); + cache.GetScheduler().Record([logical, query_ = query](vk::CommandBuffer cmdbuf) { + const bool use_precise = Settings::IsGPULevelHigh(); + logical->ResetQueryPool(query_.first, query_.second, 1); + cmdbuf.BeginQuery(query_.first, query_.second, + use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0); + }); } -QueryCacheRuntime::~QueryCacheRuntime() = default; +HostCounter::~HostCounter() { + cache.Reserve(type, query); +} -VideoCommon::StreamerInterface* QueryCacheRuntime::GetStreamerInterface(QueryType query_type) { - switch (query_type) { - case QueryType::Payload: - return &impl->guest_streamer; - case QueryType::ZPassPixelCount64: - return &impl->sample_streamer; - case QueryType::StreamingByteCount: - return &impl->tfb_streamer; - case QueryType::StreamingPrimitivesNeeded: - case QueryType::VtgPrimitivesOut: - case QueryType::StreamingPrimitivesSucceeded: - return &impl->primitives_succeeded_streamer; - case QueryType::StreamingPrimitivesNeededMinusSucceeded: - return &impl->primitives_needed_minus_suceeded_streamer; +void HostCounter::EndQuery() { + cache.GetScheduler().Record([query_ = query](vk::CommandBuffer cmdbuf) { + cmdbuf.EndQuery(query_.first, query_.second); + }); +} + +u64 HostCounter::BlockingQuery(bool async) const { + if (!async) { + cache.GetScheduler().Wait(tick); + } + u64 data; + const VkResult query_result = cache.GetDevice().GetLogical().GetQueryResults( + query.first, query.second, 1, sizeof(data), &data, sizeof(data), + VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + + switch (query_result) { + case VK_SUCCESS: + return data; + case VK_ERROR_DEVICE_LOST: + cache.GetDevice().ReportLoss(); + [[fallthrough]]; default: - return nullptr; + throw vk::Exception(query_result); } } -void QueryCacheRuntime::Barriers(bool is_prebarrier) { - static constexpr VkMemoryBarrier READ_BARRIER{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, - .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, - }; - static constexpr VkMemoryBarrier WRITE_BARRIER{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, - }; - if (is_prebarrier) { - impl->scheduler.Record([](vk::CommandBuffer cmdbuf) { - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, READ_BARRIER); - }); - } else { - impl->scheduler.Record([](vk::CommandBuffer cmdbuf) { - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER); - }); - } -} - -template -void QueryCacheRuntime::SyncValues(std::span values, VkBuffer base_src_buffer) { - if (values.size() == 0) { - return; - } - impl->redirect_cache.clear(); - impl->little_cache.clear(); - size_t total_size = 0; - for (auto& sync_val : values) { - total_size += sync_val.size; - bool found = false; - VAddr base = Common::AlignDown(sync_val.address, Core::Memory::YUZU_PAGESIZE); - VAddr base_end = base + Core::Memory::YUZU_PAGESIZE; - for (size_t i = 0; i < impl->little_cache.size(); i++) { - const auto set_found = [&] { - impl->redirect_cache.push_back(i); - found = true; - }; - auto& loc = impl->little_cache[i]; - if (base < loc.second && loc.first < base_end) { - set_found(); - break; - } - if (loc.first == base_end) { - loc.first = base; - set_found(); - break; - } - if (loc.second == base) { - loc.second = base_end; - set_found(); - break; - } - } - if (!found) { - impl->redirect_cache.push_back(impl->little_cache.size()); - impl->little_cache.emplace_back(base, base_end); - } - } - - // Vulkan part. - std::scoped_lock lk(impl->buffer_cache.mutex); - impl->buffer_cache.BufferOperations([&] { - impl->buffers_to_upload_to.clear(); - for (auto& pair : impl->little_cache) { - static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; - const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; - const auto [buffer, offset] = impl->buffer_cache.ObtainCPUBuffer( - pair.first, static_cast(pair.second - pair.first), sync_info, post_op); - impl->buffers_to_upload_to.emplace_back(buffer->Handle(), offset); - } - }); - - VkBuffer src_buffer; - [[maybe_unused]] StagingBufferRef ref; - impl->copies_setup.clear(); - impl->copies_setup.resize(impl->little_cache.size()); - if constexpr (SyncValuesType::GeneratesBaseBuffer) { - ref = impl->staging_pool.Request(total_size, MemoryUsage::Upload); - size_t current_offset = ref.offset; - size_t accumulated_size = 0; - for (size_t i = 0; i < values.size(); i++) { - size_t which_copy = impl->redirect_cache[i]; - impl->copies_setup[which_copy].emplace_back(VkBufferCopy{ - .srcOffset = current_offset + accumulated_size, - .dstOffset = impl->buffers_to_upload_to[which_copy].second + values[i].address - - impl->little_cache[which_copy].first, - .size = values[i].size, - }); - std::memcpy(ref.mapped_span.data() + accumulated_size, &values[i].value, - values[i].size); - accumulated_size += values[i].size; - } - src_buffer = ref.buffer; - } else { - for (size_t i = 0; i < values.size(); i++) { - size_t which_copy = impl->redirect_cache[i]; - impl->copies_setup[which_copy].emplace_back(VkBufferCopy{ - .srcOffset = values[i].offset, - .dstOffset = impl->buffers_to_upload_to[which_copy].second + values[i].address - - impl->little_cache[which_copy].first, - .size = values[i].size, - }); - } - src_buffer = base_src_buffer; - } - - impl->scheduler.RequestOutsideRenderPassOperationContext(); - impl->scheduler.Record([src_buffer, dst_buffers = std::move(impl->buffers_to_upload_to), - vk_copies = std::move(impl->copies_setup)](vk::CommandBuffer cmdbuf) { - size_t size = dst_buffers.size(); - for (size_t i = 0; i < size; i++) { - cmdbuf.CopyBuffer(src_buffer, dst_buffers[i].first, vk_copies[i]); - } - }); -} - } // namespace Vulkan - -namespace VideoCommon { - -template class QueryCacheBase; - -} // namespace VideoCommon diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index 2a1ba251b..18f562e43 100755 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -1,75 +1,101 @@ -// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project -// SPDX-License-Identifier: GPL-3.0-or-later +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later #pragma once +#include #include +#include +#include -#include "video_core/query_cache/query_cache_base.h" -#include "video_core/renderer_vulkan/vk_buffer_cache.h" +#include "common/common_types.h" +#include "video_core/query_cache.h" +#include "video_core/renderer_vulkan/vk_resource_pool.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace VideoCore { class RasterizerInterface; } -namespace VideoCommon { -class StreamerInterface; -} - namespace Vulkan { +class CachedQuery; class Device; +class HostCounter; +class QueryCache; class Scheduler; -class StagingBufferPool; -struct QueryCacheRuntimeImpl; +using CounterStream = VideoCommon::CounterStreamBase; -class QueryCacheRuntime { +class QueryPool final : public ResourcePool { public: - explicit QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer, - Core::Memory::Memory& cpu_memory_, - Vulkan::BufferCache& buffer_cache_, const Device& device_, - const MemoryAllocator& memory_allocator_, Scheduler& scheduler_, - StagingBufferPool& staging_pool_, - ComputePassDescriptorQueue& compute_pass_descriptor_queue, - DescriptorPool& descriptor_pool); - ~QueryCacheRuntime(); + explicit QueryPool(const Device& device, Scheduler& scheduler, VideoCore::QueryType type); + ~QueryPool() override; - template - void SyncValues(std::span values, VkBuffer base_src_buffer = nullptr); + std::pair Commit(); - void Barriers(bool is_prebarrier); + void Reserve(std::pair query); - void EndHostConditionalRendering(); - - void PauseHostConditionalRendering(); - - void ResumeHostConditionalRendering(); - - bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty); - - bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1, - VideoCommon::LookupData object_2, bool qc_dirty, - bool equal_check); - - VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type); - - void Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d); - - template - void View3DRegs(Func&& func); +protected: + void Allocate(std::size_t begin, std::size_t end) override; private: - void HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, bool is_equal); - void HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal); - friend struct QueryCacheRuntimeImpl; - std::unique_ptr impl; + static constexpr std::size_t GROW_STEP = 512; + + const Device& device; + const VideoCore::QueryType type; + + std::vector pools; + std::vector usage; }; -struct QueryCacheParams { - using RuntimeType = typename Vulkan::QueryCacheRuntime; +class QueryCache final + : public VideoCommon::QueryCacheBase { +public: + explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, const Device& device_, + Scheduler& scheduler_); + ~QueryCache(); + + std::pair AllocateQuery(VideoCore::QueryType type); + + void Reserve(VideoCore::QueryType type, std::pair query); + + const Device& GetDevice() const noexcept { + return device; + } + + Scheduler& GetScheduler() const noexcept { + return scheduler; + } + +private: + const Device& device; + Scheduler& scheduler; + std::array query_pools; }; -using QueryCache = VideoCommon::QueryCacheBase; +class HostCounter final : public VideoCommon::HostCounterBase { +public: + explicit HostCounter(QueryCache& cache_, std::shared_ptr dependency_, + VideoCore::QueryType type_); + ~HostCounter(); + + void EndQuery(); + +private: + u64 BlockingQuery(bool async = false) const override; + + QueryCache& cache; + const VideoCore::QueryType type; + const std::pair query; + const u64 tick; +}; + +class CachedQuery : public VideoCommon::CachedQueryBase { +public: + explicit CachedQuery(QueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_) + : CachedQueryBase{cpu_addr_, host_ptr_} {} +}; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 64cd81864..2eaafc00d 100755 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -24,7 +24,6 @@ #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" -#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -171,11 +170,9 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool), buffer_cache(*this, cpu_memory_, buffer_cache_runtime), - query_cache_runtime(this, cpu_memory_, buffer_cache, device, memory_allocator, scheduler, - staging_pool, compute_pass_descriptor_queue, descriptor_pool), - query_cache(gpu, *this, cpu_memory_, query_cache_runtime), pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue, render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), + query_cache{*this, cpu_memory_, device, scheduler}, accelerate_dma(buffer_cache, texture_cache, scheduler), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), wfi_event(device.GetLogical().CreateEvent()) { @@ -192,7 +189,14 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { FlushWork(); gpu_memory->FlushCaching(); - query_cache.NotifySegment(true); +#if ANDROID + if (Settings::IsGPULevelHigh()) { + // This is problematic on Android, disable on GPU Normal. + query_cache.UpdateCounters(); + } +#else + query_cache.UpdateCounters(); +#endif GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()}; if (!pipeline) { @@ -203,12 +207,13 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) { pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->Configure(is_indexed); + BeginTransformFeedback(); + UpdateDynamicStates(); - HandleTransformFeedback(); - query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, - maxwell3d->regs.zpass_pixel_count_enable); draw_func(); + + EndTransformFeedback(); } void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { @@ -236,14 +241,6 @@ void RasterizerVulkan::DrawIndirect() { const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer(); const auto& buffer = indirect_buffer.first; const auto& offset = indirect_buffer.second; - if (params.is_byte_count) { - scheduler.Record([buffer_obj = buffer->Handle(), offset, - stride = params.stride](vk::CommandBuffer cmdbuf) { - cmdbuf.DrawIndirectByteCountEXT(1, 0, buffer_obj, offset, 0, - static_cast(stride)); - }); - return; - } if (params.include_count) { const auto count = buffer_cache.GetDrawIndirectCount(); const auto& draw_buffer = count.first; @@ -283,15 +280,20 @@ void RasterizerVulkan::DrawTexture() { SCOPE_EXIT({ gpu.TickWork(); }); FlushWork(); - query_cache.NotifySegment(true); +#if ANDROID + if (Settings::IsGPULevelHigh()) { + // This is problematic on Android, disable on GPU Normal. + query_cache.UpdateCounters(); + } +#else + query_cache.UpdateCounters(); +#endif texture_cache.SynchronizeGraphicsDescriptors(); texture_cache.UpdateRenderTargets(false); UpdateDynamicStates(); - query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, - maxwell3d->regs.zpass_pixel_count_enable); const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState(); const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler); const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture); @@ -314,9 +316,14 @@ void RasterizerVulkan::Clear(u32 layer_count) { FlushWork(); gpu_memory->FlushCaching(); - query_cache.NotifySegment(true); - query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64, - maxwell3d->regs.zpass_pixel_count_enable); +#if ANDROID + if (Settings::IsGPULevelHigh()) { + // This is problematic on Android, disable on GPU Normal. + query_cache.UpdateCounters(); + } +#else + query_cache.UpdateCounters(); +#endif auto& regs = maxwell3d->regs; const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || @@ -461,13 +468,13 @@ void RasterizerVulkan::DispatchCompute() { scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); }); } -void RasterizerVulkan::ResetCounter(VideoCommon::QueryType type) { - query_cache.CounterReset(type); +void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { + query_cache.ResetCounter(type); } -void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, - VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { - query_cache.CounterReport(gpu_addr, type, flags, payload, subreport); +void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, + std::optional timestamp) { + query_cache.Query(gpu_addr, type, timestamp); } void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, @@ -648,8 +655,8 @@ void RasterizerVulkan::SignalReference() { fence_manager.SignalReference(); } -void RasterizerVulkan::ReleaseFences(bool force) { - fence_manager.WaitPendingFences(force); +void RasterizerVulkan::ReleaseFences() { + fence_manager.WaitPendingFences(); } void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size, @@ -673,8 +680,6 @@ void RasterizerVulkan::WaitForIdle() { flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; } - query_cache.NotifyWFI(); - scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { cmdbuf.SetEvent(event, flags); @@ -718,7 +723,19 @@ void RasterizerVulkan::TickFrame() { bool RasterizerVulkan::AccelerateConditionalRendering() { gpu_memory->FlushCaching(); - return query_cache.AccelerateHostConditionalRendering(); + if (Settings::IsGPULevelHigh()) { + // TODO(Blinkhawk): Reimplement Host conditional rendering. + return false; + } + // Medium / Low Hack: stub any checks on queries written into the buffer cache. + const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; + Maxwell::ReportSemaphore::Compare cmp; + if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), + VideoCommon::CacheType::BufferCache | + VideoCommon::CacheType::QueryCache)) { + return true; + } + return false; } bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, @@ -764,7 +781,6 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, if (!image_view) { return false; } - query_cache.NotifySegment(false); screen_info.image = image_view->ImageHandle(); screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D); screen_info.width = image_view->size.width; @@ -903,18 +919,31 @@ void RasterizerVulkan::UpdateDynamicStates() { } } -void RasterizerVulkan::HandleTransformFeedback() { +void RasterizerVulkan::BeginTransformFeedback() { const auto& regs = maxwell3d->regs; + if (regs.transform_feedback_enabled == 0) { + return; + } if (!device.IsExtTransformFeedbackSupported()) { LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); return; } - query_cache.CounterEnable(VideoCommon::QueryType::StreamingByteCount, - regs.transform_feedback_enabled); - if (regs.transform_feedback_enabled != 0) { - UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || - regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || + regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); + scheduler.Record( + [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); +} + +void RasterizerVulkan::EndTransformFeedback() { + const auto& regs = maxwell3d->regs; + if (regs.transform_feedback_enabled == 0) { + return; } + if (!device.IsExtTransformFeedbackSupported()) { + return; + } + scheduler.Record( + [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index b8c637c47..ac05126cd 100755 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -84,9 +84,8 @@ public: void DrawTexture() override; void Clear(u32 layer_count) override; void DispatchCompute() override; - void ResetCounter(VideoCommon::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, - VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; + void ResetCounter(VideoCore::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional timestamp) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void FlushAll() override; @@ -107,7 +106,7 @@ public: void SyncOperation(std::function&& func) override; void SignalSyncPoint(u32 value) override; void SignalReference() override; - void ReleaseFences(bool force = true) override; + void ReleaseFences() override; void FlushAndInvalidateRegion( VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void WaitForIdle() override; @@ -147,7 +146,9 @@ private: void UpdateDynamicStates(); - void HandleTransformFeedback(); + void BeginTransformFeedback(); + + void EndTransformFeedback(); void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); @@ -194,9 +195,8 @@ private: TextureCache texture_cache; BufferCacheRuntime buffer_cache_runtime; BufferCache buffer_cache; - QueryCacheRuntime query_cache_runtime; - QueryCache query_cache; PipelineCache pipeline_cache; + QueryCache query_cache; AccelerateDMA accelerate_dma; FenceManager fence_manager; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 3b7aaf006..34b655832 100755 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -243,10 +243,10 @@ void Scheduler::AllocateNewContext() { #if ANDROID if (Settings::IsGPULevelHigh()) { // This is problematic on Android, disable on GPU Normal. - query_cache->NotifySegment(true); + query_cache->UpdateCounters(); } #else - query_cache->NotifySegment(true); + query_cache->UpdateCounters(); #endif } } @@ -261,12 +261,11 @@ void Scheduler::EndPendingOperations() { #if ANDROID if (Settings::IsGPULevelHigh()) { // This is problematic on Android, disable on GPU Normal. - // query_cache->DisableStreams(); + query_cache->DisableStreams(); } #else - // query_cache->DisableStreams(); + query_cache->DisableStreams(); #endif - query_cache->NotifySegment(false); EndRenderPass(); } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 04692dbdf..074b4b4df 100755 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -17,11 +17,6 @@ #include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/vulkan_common/vulkan_wrapper.h" -namespace VideoCommon { -template -class QueryCacheBase; -} - namespace Vulkan { class CommandPool; @@ -29,8 +24,7 @@ class Device; class Framebuffer; class GraphicsPipeline; class StateTracker; - -struct QueryCacheParams; +class QueryCache; /// The scheduler abstracts command buffer and fence management with an interface that's able to do /// OpenGL-like operations on Vulkan command buffers. @@ -69,7 +63,7 @@ public: void InvalidateState(); /// Assigns the query cache. - void SetQueryCache(VideoCommon::QueryCacheBase& query_cache_) { + void SetQueryCache(QueryCache& query_cache_) { query_cache = &query_cache_; } @@ -225,7 +219,7 @@ private: std::unique_ptr master_semaphore; std::unique_ptr command_pool; - VideoCommon::QueryCacheBase* query_cache = nullptr; + QueryCache* query_cache = nullptr; vk::CommandBuffer current_cmdbuf; diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 81c56aac3..87d33081d 100755 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -60,7 +60,6 @@ VK_DEFINE_HANDLE(VmaAllocator) // Define miscellaneous extensions which may be used by the implementation here. #define FOR_EACH_VK_EXTENSION(EXTENSION) \ - EXTENSION(EXT, CONDITIONAL_RENDERING, conditional_rendering) \ EXTENSION(EXT, CONSERVATIVE_RASTERIZATION, conservative_rasterization) \ EXTENSION(EXT, DEPTH_RANGE_UNRESTRICTED, depth_range_unrestricted) \ EXTENSION(EXT, MEMORY_BUDGET, memory_budget) \ @@ -93,7 +92,6 @@ VK_DEFINE_HANDLE(VmaAllocator) // Define extensions where the absence of the extension may result in a degraded experience. #define FOR_EACH_VK_RECOMMENDED_EXTENSION(EXTENSION_NAME) \ - EXTENSION_NAME(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME) \ @@ -530,10 +528,6 @@ public: return extensions.shader_atomic_int64; } - bool IsExtConditionalRendering() const { - return extensions.conditional_rendering; - } - bool HasTimelineSemaphore() const; /// Returns the minimum supported version of SPIR-V. diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index bde632acb..ba6a51af6 100755 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -75,7 +75,6 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkBeginCommandBuffer); X(vkBindBufferMemory); X(vkBindImageMemory); - X(vkCmdBeginConditionalRenderingEXT); X(vkCmdBeginQuery); X(vkCmdBeginRenderPass); X(vkCmdBeginTransformFeedbackEXT); @@ -92,7 +91,6 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdCopyBufferToImage); X(vkCmdCopyImage); X(vkCmdCopyImageToBuffer); - X(vkCmdCopyQueryPoolResults); X(vkCmdDispatch); X(vkCmdDraw); X(vkCmdDrawIndexed); @@ -100,8 +98,6 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdDrawIndexedIndirect); X(vkCmdDrawIndirectCount); X(vkCmdDrawIndexedIndirectCount); - X(vkCmdDrawIndirectByteCountEXT); - X(vkCmdEndConditionalRenderingEXT); X(vkCmdEndQuery); X(vkCmdEndRenderPass); X(vkCmdEndTransformFeedbackEXT); diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index e848f8f7f..d97abee9e 100755 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -185,7 +185,6 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkBeginCommandBuffer vkBeginCommandBuffer{}; PFN_vkBindBufferMemory vkBindBufferMemory{}; PFN_vkBindImageMemory vkBindImageMemory{}; - PFN_vkCmdBeginConditionalRenderingEXT vkCmdBeginConditionalRenderingEXT{}; PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{}; PFN_vkCmdBeginQuery vkCmdBeginQuery{}; PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{}; @@ -203,7 +202,6 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{}; PFN_vkCmdCopyImage vkCmdCopyImage{}; PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{}; - PFN_vkCmdCopyQueryPoolResults vkCmdCopyQueryPoolResults{}; PFN_vkCmdDispatch vkCmdDispatch{}; PFN_vkCmdDraw vkCmdDraw{}; PFN_vkCmdDrawIndexed vkCmdDrawIndexed{}; @@ -211,8 +209,6 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{}; PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; - PFN_vkCmdDrawIndirectByteCountEXT vkCmdDrawIndirectByteCountEXT{}; - PFN_vkCmdEndConditionalRenderingEXT vkCmdEndConditionalRenderingEXT{}; PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; PFN_vkCmdEndQuery vkCmdEndQuery{}; PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; @@ -1185,13 +1181,6 @@ public: count_offset, draw_count, stride); } - void DrawIndirectByteCountEXT(u32 instance_count, u32 first_instance, VkBuffer counter_buffer, - VkDeviceSize counter_buffer_offset, u32 counter_offset, - u32 stride) { - dld->vkCmdDrawIndirectByteCountEXT(handle, instance_count, first_instance, counter_buffer, - counter_buffer_offset, counter_offset, stride); - } - void ClearAttachments(Span attachments, Span rects) const noexcept { dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(), @@ -1276,13 +1265,6 @@ public: regions.data()); } - void CopyQueryPoolResults(VkQueryPool query_pool, u32 first_query, u32 query_count, - VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize stride, - VkQueryResultFlags flags) const noexcept { - dld->vkCmdCopyQueryPoolResults(handle, query_pool, first_query, query_count, dst_buffer, - dst_offset, stride, flags); - } - void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, u32 data) const noexcept { dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data); @@ -1461,15 +1443,6 @@ public: counter_buffers, counter_buffer_offsets); } - void BeginConditionalRenderingEXT( - const VkConditionalRenderingBeginInfoEXT& info) const noexcept { - dld->vkCmdBeginConditionalRenderingEXT(handle, &info); - } - - void EndConditionalRenderingEXT() const noexcept { - dld->vkCmdEndConditionalRenderingEXT(handle); - } - void BeginDebugUtilsLabelEXT(const char* label, std::span color) const noexcept { const VkDebugUtilsLabelEXT label_info{ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,