From f97ec12f78bc5e1f8b9d25022ef74bf96a3861fb Mon Sep 17 00:00:00 2001 From: pineappleEA Date: Sun, 4 Jul 2021 23:34:44 +0200 Subject: [PATCH] early-access version 1843 --- README.md | 2 +- src/core/hle/service/nvdrv/nvdrv.cpp | 1 - src/tests/video_core/buffer_base.cpp | 2 +- src/video_core/buffer_cache/buffer_base.h | 14 +- src/video_core/buffer_cache/buffer_cache.h | 134 +++--- src/video_core/engines/fermi_2d.cpp | 15 +- src/video_core/fence_manager.h | 34 +- src/video_core/gpu_thread.cpp | 15 + src/video_core/gpu_thread.h | 5 + src/video_core/memory_manager.cpp | 109 ++++- src/video_core/memory_manager.h | 22 +- src/video_core/rasterizer_interface.h | 6 +- .../renderer_opengl/gl_rasterizer.cpp | 10 +- .../renderer_opengl/gl_rasterizer.h | 3 +- .../renderer_vulkan/vk_fence_manager.cpp | 4 + .../renderer_vulkan/vk_rasterizer.cpp | 12 +- .../renderer_vulkan/vk_rasterizer.h | 3 +- src/video_core/renderer_vulkan/vk_scheduler.h | 4 + src/video_core/texture_cache/image_base.cpp | 3 + src/video_core/texture_cache/image_base.h | 39 +- src/video_core/texture_cache/texture_cache.h | 435 +++++++++++++++--- src/video_core/texture_cache/types.h | 5 +- src/video_core/texture_cache/util.cpp | 24 +- src/video_core/texture_cache/util.h | 4 +- 24 files changed, 739 insertions(+), 166 deletions(-) diff --git a/README.md b/README.md index 2735b42e1..7e0b795e2 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ yuzu emulator early access ============= -This is the source code for early-access 1842. +This is the source code for early-access 1843. ## Legal Notice diff --git a/src/core/hle/service/nvdrv/nvdrv.cpp b/src/core/hle/service/nvdrv/nvdrv.cpp index 8d3841cf7..3de21a06d 100755 --- a/src/core/hle/service/nvdrv/nvdrv.cpp +++ b/src/core/hle/service/nvdrv/nvdrv.cpp @@ -41,7 +41,6 @@ void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger Module::Module(Core::System& system) : syncpoint_manager{system.GPU()}, service_context{system, "nvdrv"} { - auto& kernel = system.Kernel(); for (u32 i = 0; i < MaxNvEvents; i++) { events_interface.events[i].event = service_context.CreateEvent(fmt::format("NVDRV::NvEvent_{}", i)); diff --git a/src/tests/video_core/buffer_base.cpp b/src/tests/video_core/buffer_base.cpp index edced69bb..cfcdc2253 100755 --- a/src/tests/video_core/buffer_base.cpp +++ b/src/tests/video_core/buffer_base.cpp @@ -536,7 +536,7 @@ TEST_CASE("BufferBase: Cached write downloads") { REQUIRE(rasterizer.Count() == 63); buffer.MarkRegionAsGpuModified(c + PAGE, PAGE); int num = 0; - buffer.ForEachDownloadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); + buffer.ForEachDownloadRange(c, WORD, true, [&](u64 offset, u64 size) { ++num; }); buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); REQUIRE(num == 0); REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index b121d36a3..9e39858c8 100755 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -226,19 +226,19 @@ public: /// Call 'func' for each CPU modified range and unmark those pages as CPU modified template void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, func); + ForEachModifiedRange(query_cpu_range, size, true, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template - void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, func); + void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) { + ForEachModifiedRange(query_cpu_range, size, clear, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(Func&& func) { - ForEachModifiedRange(cpu_addr, SizeBytes(), func); + ForEachModifiedRange(cpu_addr, SizeBytes(), true, func); } /// Mark buffer as picked @@ -415,7 +415,7 @@ private: * @param func Function to call for each turned off region */ template - void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { static_assert(type != Type::Untracked); const s64 difference = query_cpu_range - cpu_addr; @@ -467,7 +467,9 @@ private: bits = (bits << left_offset) >> left_offset; const u64 current_word = state_words[word_index] & bits; - state_words[word_index] &= ~bits; + if (clear) { + state_words[word_index] &= ~bits; + } if constexpr (type == Type::CPU) { const u64 current_bits = untracked_words[word_index] & bits; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index cad7f902d..dc2b1f447 100755 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -15,6 +15,7 @@ #include #include +#include #include "common/common_types.h" #include "common/div_ceil.h" @@ -77,6 +78,9 @@ class BufferCache { using Runtime = typename P::Runtime; using Buffer = typename P::Buffer; + using IntervalSet = boost::icl::interval_set; + using IntervalType = typename IntervalSet::interval_type; + struct Empty {}; struct OverlapResult { @@ -153,6 +157,7 @@ public: /// Commit asynchronous downloads void CommitAsyncFlushes(); + void CommitAsyncFlushesHigh(); /// Pop asynchronous downloads void PopAsyncFlushes(); @@ -160,6 +165,9 @@ public: /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); + std::mutex mutex; private: @@ -272,8 +280,6 @@ private: void DeleteBuffer(BufferId buffer_id); - void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); - void NotifyBufferDeletion(); [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const; @@ -327,9 +333,7 @@ private: std::vector cached_write_buffer_ids; - // TODO: This data structure is not optimal and it should be reworked - std::vector uncommitted_downloads; - std::deque> committed_downloads; + IntervalSet uncommitted_ranges; size_t immediate_buffer_capacity = 0; std::unique_ptr immediate_buffer_alloc; @@ -547,29 +551,18 @@ void BufferCache

::FlushCachedWrites() { template bool BufferCache

::HasUncommittedFlushes() const noexcept { - return !uncommitted_downloads.empty(); + return !uncommitted_ranges.empty(); } template bool BufferCache

::ShouldWaitAsyncFlushes() const noexcept { - return !committed_downloads.empty() && !committed_downloads.front().empty(); + return false; } template -void BufferCache

::CommitAsyncFlushes() { - // This is intentionally passing the value by copy - committed_downloads.push_front(uncommitted_downloads); - uncommitted_downloads.clear(); -} - -template -void BufferCache

::PopAsyncFlushes() { - if (committed_downloads.empty()) { - return; - } - auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); }); - const std::span download_ids = committed_downloads.back(); - if (download_ids.empty()) { +void BufferCache

::CommitAsyncFlushesHigh() { + const IntervalSet& intervals = uncommitted_ranges; + if (intervals.empty()) { return; } MICROPROFILE_SCOPE(GPU_DownloadMemory); @@ -577,18 +570,35 @@ void BufferCache

::PopAsyncFlushes() { boost::container::small_vector, 1> downloads; u64 total_size_bytes = 0; u64 largest_copy = 0; - for (const BufferId buffer_id : download_ids) { - slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) { - downloads.push_back({ - BufferCopy{ - .src_offset = range_offset, - .dst_offset = total_size_bytes, - .size = range_size, - }, - buffer_id, - }); - total_size_bytes += range_size; - largest_copy = std::max(largest_copy, range_size); + for (auto& interval : intervals) { + const std::size_t size = interval.upper() - interval.lower(); + const VAddr cpu_addr = interval.lower(); + const VAddr cpu_addr_end = interval.upper(); + ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { + boost::container::small_vector copies; + buffer.ForEachDownloadRange( + cpu_addr, size, false, [&](u64 range_offset, u64 range_size) { + VAddr cpu_addr_base = buffer.CpuAddr() + range_offset; + VAddr cpu_addr_end2 = cpu_addr_base + range_size; + const s64 difference = s64(cpu_addr_end2 - cpu_addr_end); + cpu_addr_end2 -= u64(std::max(difference, 0)); + const s64 difference2 = s64(cpu_addr - cpu_addr_base); + cpu_addr_base += u64(std::max(difference2, 0)); + const u64 new_size = cpu_addr_end2 - cpu_addr_base; + const u64 new_offset = cpu_addr_base - buffer.CpuAddr(); + ASSERT(!IsRegionCpuModified(cpu_addr_base, new_size)); + downloads.push_back({ + BufferCopy{ + .src_offset = new_offset, + .dst_offset = total_size_bytes, + .size = new_size, + }, + buffer_id, + }); + total_size_bytes += new_size; + buffer.UnmarkRegionAsGpuModified(cpu_addr_base, new_size); + largest_copy = std::max(largest_copy, new_size); + }); }); } if (downloads.empty()) { @@ -622,6 +632,18 @@ void BufferCache

::PopAsyncFlushes() { } } +template +void BufferCache

::CommitAsyncFlushes() { + if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { + CommitAsyncFlushesHigh(); + } else { + uncommitted_ranges.clear(); + } +} + +template +void BufferCache

::PopAsyncFlushes() {} + template bool BufferCache

::IsRegionGpuModified(VAddr addr, size_t size) { const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); @@ -641,6 +663,25 @@ bool BufferCache

::IsRegionGpuModified(VAddr addr, size_t size) { return false; } +template +bool BufferCache

::IsRegionCpuModified(VAddr addr, size_t size) { + const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); + for (u64 page = addr >> PAGE_BITS; page < page_end;) { + const BufferId image_id = page_table[page]; + if (!image_id) { + ++page; + continue; + } + Buffer& buffer = slot_buffers[image_id]; + if (buffer.IsRegionCpuModified(addr, size)) { + return true; + } + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, PAGE_SIZE); + } + return false; +} + template void BufferCache

::BindHostIndexBuffer() { Buffer& buffer = slot_buffers[index_buffer.buffer_id]; @@ -1010,16 +1051,14 @@ void BufferCache

::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s Buffer& buffer = slot_buffers[buffer_id]; buffer.MarkRegionAsGpuModified(cpu_addr, size); - const bool is_accuracy_high = Settings::IsGPULevelHigh(); + const bool is_accuracy_high = + Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High; const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); - if (!is_accuracy_high || !is_async) { + if (!is_async && !is_accuracy_high) { return; } - if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) { - // Already inserted - return; - } - uncommitted_downloads.push_back(buffer_id); + const IntervalType base_interval{cpu_addr, cpu_addr + size}; + uncommitted_ranges.add(base_interval); } template @@ -1103,7 +1142,6 @@ void BufferCache

::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, if (!copies.empty()) { runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); } - ReplaceBufferDownloads(overlap_id, new_buffer_id); DeleteBuffer(overlap_id); } @@ -1244,7 +1282,7 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si boost::container::small_vector copies; u64 total_size_bytes = 0; u64 largest_copy = 0; - buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + buffer.ForEachDownloadRange(cpu_addr, size, true, [&](u64 range_offset, u64 range_size) { copies.push_back(BufferCopy{ .src_offset = range_offset, .dst_offset = total_size_bytes, @@ -1315,18 +1353,6 @@ void BufferCache

::DeleteBuffer(BufferId buffer_id) { NotifyBufferDeletion(); } -template -void BufferCache

::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) { - const auto replace = [old_buffer_id, new_buffer_id](std::vector& buffers) { - std::ranges::replace(buffers, old_buffer_id, new_buffer_id); - if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) { - buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end()); - } - }; - replace(uncommitted_downloads); - std::ranges::for_each(committed_downloads, replace); -} - template void BufferCache

::NotifyBufferDeletion() { if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 8107caad4..ec5cbc11f 100755 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -65,19 +65,20 @@ void Fermi2D::Blit() { .src_x1 = static_cast((args.du_dx * args.dst_width + args.src_x0) >> 32), .src_y1 = static_cast((args.dv_dy * args.dst_height + args.src_y0) >> 32), }; - Surface src = regs.src; - s32 src_address_offset = 0; const auto bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format)); - if (src.linear == Tegra::Engines::Fermi2D::MemoryLayout::Pitch && src.width == config.src_x1 && - config.src_x1 > static_cast(src.pitch / bytes_per_pixel) && config.src_x0 > 0) { - src_address_offset = config.src_x0 * bytes_per_pixel; + const auto is_copy_out_of_bound = + src.linear == Tegra::Engines::Fermi2D::MemoryLayout::Pitch && src.width == config.src_x1 && + config.src_x1 > static_cast(src.pitch / bytes_per_pixel) && config.src_x0 > 0; + if (is_copy_out_of_bound) { + auto address = src.Address() + config.src_x0 * bytes_per_pixel; + src.addr_upper = static_cast(address >> 32); + src.addr_lower = static_cast(address); src.width -= config.src_x0; config.src_x1 -= config.src_x0; config.src_x0 = 0; } - - if (!rasterizer->AccelerateSurfaceCopy(src, src_address_offset, regs.dst, config)) { + if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) { UNIMPLEMENTED(); } } diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index f055b61e9..696ea6dd9 100755 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -96,6 +96,23 @@ public: } } + void TryReleasePendingFences() { + while (!fences.empty()) { + TFence& current_fence = fences.front(); + if (ShouldWait() && !IsFenceSignaled(current_fence)) { + return; + } + PopAsyncFlushes(); + if (current_fence->IsSemaphore()) { + gpu_memory.template Write(current_fence->GetAddress(), + current_fence->GetPayload()); + } else { + gpu.IncrementSyncPoint(current_fence->GetPayload()); + } + PopFence(); + } + } + protected: explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, TTextureCache& texture_cache_, TTBufferCache& buffer_cache_, @@ -125,23 +142,6 @@ protected: TQueryCache& query_cache; private: - void TryReleasePendingFences() { - while (!fences.empty()) { - TFence& current_fence = fences.front(); - if (ShouldWait() && !IsFenceSignaled(current_fence)) { - return; - } - PopAsyncFlushes(); - if (current_fence->IsSemaphore()) { - gpu_memory.template Write(current_fence->GetAddress(), - current_fence->GetPayload()); - } else { - gpu.IncrementSyncPoint(current_fence->GetPayload()); - } - PopFence(); - } - } - bool ShouldWait() const { std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() || diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 46f642b19..25c0d30dd 100755 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -8,6 +8,7 @@ #include "common/settings.h" #include "common/thread.h" #include "core/core.h" +#include "core/core_timing.h" #include "core/frontend/emu_window.h" #include "video_core/dma_pusher.h" #include "video_core/gpu.h" @@ -83,6 +84,17 @@ void ThreadManager::StartThread(VideoCore::RendererBase& renderer, rasterizer = renderer.ReadRasterizer(); thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), std::ref(dma_pusher), std::ref(state)); + gpu_sync_event = Core::Timing::CreateEvent( + "GPUHostSyncCallback", [this](std::uintptr_t, std::chrono::nanoseconds) { + if (!state.is_running) { + return; + } + + OnCommandListEnd(); + const auto time_interval = std::chrono::nanoseconds{500 * 1000}; + system.CoreTiming().ScheduleEvent(time_interval, gpu_sync_event); + }); + system.CoreTiming().ScheduleEvent(std::chrono::nanoseconds{500 * 1000}, gpu_sync_event); } void ThreadManager::SubmitList(Tegra::CommandList&& entries) { @@ -128,6 +140,9 @@ void ThreadManager::ShutDown() { state.cv.notify_all(); } + system.CoreTiming().UnscheduleEvent(gpu_sync_event, 0); + system.CoreTiming().RemoveEvent(gpu_sync_event); + if (!thread.joinable()) { return; } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 11a648f38..ea98df3b1 100755 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -20,6 +20,10 @@ class DmaPusher; } // namespace Tegra namespace Core { +namespace Timing { +class CoreTiming; +struct EventType; +} // namespace Timing namespace Frontend { class GraphicsContext; } @@ -150,6 +154,7 @@ private: SynchState state; std::thread thread; + std::shared_ptr gpu_sync_event; }; } // namespace VideoCommon::GPUThread diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 02385384c..9deea9a26 100755 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -69,11 +69,16 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { } else { UNREACHABLE_MSG("Unmapping non-existent GPU address=0x{:x}", gpu_addr); } - // Flush and invalidate through the GPU interface, to be asynchronous if possible. - const std::optional cpu_addr = GpuToCpuAddress(gpu_addr); - ASSERT(cpu_addr); - rasterizer->UnmapMemory(*cpu_addr, size); + const auto submapped_ranges = GetSubmappedRange(gpu_addr, size); + + for (const auto& map : submapped_ranges) { + // Flush and invalidate through the GPU interface, to be asynchronous if possible. + const std::optional cpu_addr = GpuToCpuAddress(map.first); + ASSERT(cpu_addr); + + rasterizer->UnmapMemory(*cpu_addr, map.second); + } UpdateRange(gpu_addr, PageEntry::State::Unmapped, size); } @@ -146,8 +151,14 @@ void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::s //// Lock the new page // TryLockPage(page_entry, size); + auto& current_page = page_table[PageEntryIndex(gpu_addr)]; - page_table[PageEntryIndex(gpu_addr)] = page_entry; + if ((!current_page.IsValid() && page_entry.IsValid()) || + current_page.ToAddress() != page_entry.ToAddress()) { + rasterizer->ModifyGPUMemory(gpu_addr, size); + } + + current_page = page_entry; } std::optional MemoryManager::FindFreeRange(std::size_t size, std::size_t align, @@ -193,6 +204,19 @@ std::optional MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const { return page_entry.ToAddress() + (gpu_addr & page_mask); } +std::optional MemoryManager::GpuToCpuAddress(GPUVAddr addr, std::size_t size) const { + size_t page_index{addr >> page_bits}; + const size_t page_last{(addr + size + page_size - 1) >> page_bits}; + while (page_index < page_last) { + const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; + if (page_addr && *page_addr != 0) { + return page_addr; + } + ++page_index; + } + return std::nullopt; +} + template T MemoryManager::Read(GPUVAddr addr) const { if (auto page_pointer{GetPointer(addr)}; page_pointer) { @@ -389,4 +413,79 @@ bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const { return page <= Core::Memory::PAGE_SIZE; } +bool MemoryManager::IsContinousRange(GPUVAddr gpu_addr, std::size_t size) const { + size_t page_index{gpu_addr >> page_bits}; + const size_t page_last{(gpu_addr + size + page_size - 1) >> page_bits}; + std::optional old_page_addr{}; + while (page_index != page_last) { + const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; + if (!page_addr || *page_addr == 0) { + return false; + } + if (old_page_addr) { + if (*old_page_addr + page_size != *page_addr) { + return false; + } + } + old_page_addr = page_addr; + ++page_index; + } + return true; +} + +bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) const { + size_t page_index{gpu_addr >> page_bits}; + const size_t page_last{(gpu_addr + size + page_size - 1) >> page_bits}; + while (page_index < page_last) { + if (!page_table[page_index].IsValid() || page_table[page_index].ToAddress() == 0) { + return false; + } + ++page_index; + } + return true; +} + +std::vector> MemoryManager::GetSubmappedRange( + GPUVAddr gpu_addr, std::size_t size) const { + std::vector> result{}; + size_t page_index{gpu_addr >> page_bits}; + size_t remaining_size{size}; + size_t page_offset{gpu_addr & page_mask}; + std::optional> last_segment{}; + std::optional old_page_addr{}; + const auto extend_size = [this, &last_segment, &page_index](std::size_t bytes) { + if (!last_segment) { + GPUVAddr new_base_addr = page_index << page_bits; + last_segment = {new_base_addr, bytes}; + } else { + last_segment->second += bytes; + } + }; + const auto split = [this, &last_segment, &result] { + if (last_segment) { + result.push_back(*last_segment); + last_segment = std::nullopt; + } + }; + while (remaining_size > 0) { + const size_t num_bytes{std::min(page_size - page_offset, remaining_size)}; + const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; + if (!page_addr) { + split(); + } else if (old_page_addr) { + if (*old_page_addr + page_size != *page_addr) { + split(); + } + extend_size(num_bytes); + } else { + extend_size(num_bytes); + } + ++page_index; + page_offset = 0; + remaining_size -= num_bytes; + } + split(); + return result; +} + } // namespace Tegra diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 5d6c196fa..509f14f26 100755 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -76,6 +76,8 @@ public: [[nodiscard]] std::optional GpuToCpuAddress(GPUVAddr addr) const; + [[nodiscard]] std::optional GpuToCpuAddress(GPUVAddr addr, std::size_t size) const; + template [[nodiscard]] T Read(GPUVAddr addr) const; @@ -112,10 +114,28 @@ public: void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size); /** - * IsGranularRange checks if a gpu region can be simply read with a pointer. + * Checks if a gpu region can be simply read with a pointer. */ [[nodiscard]] bool IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const; + /** + * Checks if a gpu region is mapped by a single range of cpu addresses. + */ + [[nodiscard]] bool IsContinousRange(GPUVAddr gpu_addr, std::size_t size) const; + + /** + * Checks if a gpu region is mapped entirely. + */ + [[nodiscard]] bool IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) const; + + /** + * Returns a vector with all the subranges of cpu addresses mapped beneath. + * if the region is continous, a single pair will be returned. If it's unmapped, an empty vector + * will be returned; + */ + std::vector> GetSubmappedRange(GPUVAddr gpu_addr, + std::size_t size) const; + [[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size); [[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align); [[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size); diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index c1c636ceb..fcbdacae3 100755 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -93,6 +93,9 @@ public: /// Unmap memory range virtual void UnmapMemory(VAddr addr, u64 size) = 0; + /// Remap GPU memory range. This means underneath backing memory changed + virtual void ModifyGPUMemory(GPUVAddr addr, u64 size) = 0; + /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory /// and invalidated virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; @@ -114,8 +117,7 @@ public: /// Attempt to use a faster method to perform a surface copy [[nodiscard]] virtual bool AccelerateSurfaceCopy( - const Tegra::Engines::Fermi2D::Surface& src, s32 src_address_offset, - const Tegra::Engines::Fermi2D::Surface& dst, + const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) { return false; } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index ecf67e5ce..99bd7ac9c 100755 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -631,6 +631,13 @@ void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) { shader_cache.OnCPUWrite(addr, size); } +void RasterizerOpenGL::ModifyGPUMemory(GPUVAddr addr, u64 size) { + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.UnmapGPUMemory(addr, size); + } +} + void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) { if (!gpu.IsAsync()) { gpu_memory.Write(addr, value); @@ -698,12 +705,11 @@ void RasterizerOpenGL::TickFrame() { } bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, - s32 src_address_offset, const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) { MICROPROFILE_SCOPE(OpenGL_Blits); std::scoped_lock lock{texture_cache.mutex}; - texture_cache.BlitImage(dst, src, src_address_offset, copy_config); + texture_cache.BlitImage(dst, src, copy_config); return true; } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 7444ebd8d..ad7326ece 100755 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -82,6 +82,7 @@ public: void OnCPUWrite(VAddr addr, u64 size) override; void SyncGuestHost() override; void UnmapMemory(VAddr addr, u64 size) override; + void ModifyGPUMemory(GPUVAddr addr, u64 size) override; void SignalSemaphore(GPUVAddr addr, u32 value) override; void SignalSyncPoint(u32 value) override; void ReleaseFences() override; @@ -91,7 +92,7 @@ public: void TiledCacheBarrier() override; void FlushCommands() override; void TickFrame() override; - bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, s32 src_address_offset, + bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp index 3bec48d14..c2d6676e7 100755 --- a/src/video_core/renderer_vulkan/vk_fence_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp @@ -34,6 +34,10 @@ bool InnerFence::IsSignaled() const { if (is_stubbed) { return true; } + if (scheduler.IsFree(wait_tick)) { + return true; + } + scheduler.Refresh(); return scheduler.IsFree(wait_tick); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 524fb2ad4..2ce1be4af 100755 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -577,6 +577,13 @@ void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) { pipeline_cache.OnCPUWrite(addr, size); } +void RasterizerVulkan::ModifyGPUMemory(GPUVAddr addr, u64 size) { + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.UnmapGPUMemory(addr, size); + } +} + void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) { if (!gpu.IsAsync()) { gpu_memory.Write(addr, value); @@ -597,7 +604,7 @@ void RasterizerVulkan::ReleaseFences() { if (!gpu.IsAsync()) { return; } - fence_manager.WaitPendingFences(); + fence_manager.TryReleasePendingFences(); } void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size) { @@ -658,11 +665,10 @@ void RasterizerVulkan::TickFrame() { } bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, - s32 src_address_offset, const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) { std::scoped_lock lock{texture_cache.mutex}; - texture_cache.BlitImage(dst, src, src_address_offset, copy_config); + texture_cache.BlitImage(dst, src, copy_config); return true; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index d4a4ee58e..a29022a50 100755 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -74,6 +74,7 @@ public: void OnCPUWrite(VAddr addr, u64 size) override; void SyncGuestHost() override; void UnmapMemory(VAddr addr, u64 size) override; + void ModifyGPUMemory(GPUVAddr addr, u64 size) override; void SignalSemaphore(GPUVAddr addr, u32 value) override; void SignalSyncPoint(u32 value) override; void ReleaseFences() override; @@ -83,7 +84,7 @@ public: void TiledCacheBarrier() override; void FlushCommands() override; void TickFrame() override; - bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, s32 src_address_offset, + bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 3ce48e9d2..9e0a1d4e6 100755 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -83,6 +83,10 @@ public: return master_semaphore->IsFree(tick); } + void Refresh() const noexcept { + return master_semaphore->Refresh(); + } + /// Waits for the given tick to trigger on the GPU. void Wait(u64 tick) { master_semaphore->Wait(tick); diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp index ad69d32d1..2aae338b6 100755 --- a/src/video_core/texture_cache/image_base.cpp +++ b/src/video_core/texture_cache/image_base.cpp @@ -69,6 +69,9 @@ ImageBase::ImageBase(const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_ } } +ImageMapView::ImageMapView(GPUVAddr gpu_addr_, VAddr cpu_addr_, size_t size_, ImageId image_id_) + : gpu_addr{gpu_addr_}, cpu_addr{cpu_addr_}, size{size_}, image_id{image_id_} {} + std::optional ImageBase::TryFindBase(GPUVAddr other_addr) const noexcept { if (other_addr < gpu_addr) { // Subresource address can't be lower than the base diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index e326cab71..ff1feda9b 100755 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -25,12 +25,14 @@ enum class ImageFlagBits : u32 { Strong = 1 << 5, ///< Exists in the image table, the dimensions are can be trusted Registered = 1 << 6, ///< True when the image is registered Picked = 1 << 7, ///< Temporary flag to mark the image as picked + Remapped = 1 << 8, ///< Image has been remapped. + Sparse = 1 << 9, ///< Image has non continous submemory. // Garbage Collection Flags - BadOverlap = 1 << 8, ///< This image overlaps other but doesn't fit, has higher - ///< garbage collection priority - Alias = 1 << 9, ///< This image has aliases and has priority on garbage - ///< collection + BadOverlap = 1 << 10, ///< This image overlaps other but doesn't fit, has higher + ///< garbage collection priority + Alias = 1 << 11, ///< This image has aliases and has priority on garbage + ///< collection }; DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) @@ -57,6 +59,12 @@ struct ImageBase { return cpu_addr < overlap_end && overlap_cpu_addr < cpu_addr_end; } + [[nodiscard]] bool OverlapsGPU(GPUVAddr overlap_gpu_addr, size_t overlap_size) const noexcept { + const VAddr overlap_end = overlap_gpu_addr + overlap_size; + const GPUVAddr gpu_addr_end = gpu_addr + guest_size_bytes; + return gpu_addr < overlap_end && overlap_gpu_addr < gpu_addr_end; + } + void CheckBadOverlapState(); void CheckAliasState(); @@ -84,6 +92,29 @@ struct ImageBase { std::vector aliased_images; std::vector overlapping_images; + ImageMapId map_view_id{}; +}; + +struct ImageMapView { + explicit ImageMapView(GPUVAddr gpu_addr, VAddr cpu_addr, size_t size, ImageId image_id); + + [[nodiscard]] bool Overlaps(VAddr overlap_cpu_addr, size_t overlap_size) const noexcept { + const VAddr overlap_end = overlap_cpu_addr + overlap_size; + const VAddr cpu_addr_end = cpu_addr + size; + return cpu_addr < overlap_end && overlap_cpu_addr < cpu_addr_end; + } + + [[nodiscard]] bool OverlapsGPU(GPUVAddr overlap_gpu_addr, size_t overlap_size) const noexcept { + const GPUVAddr overlap_end = overlap_gpu_addr + overlap_size; + const GPUVAddr gpu_addr_end = gpu_addr + size; + return gpu_addr < overlap_end && overlap_gpu_addr < gpu_addr_end; + } + + GPUVAddr gpu_addr; + VAddr cpu_addr; + size_t size; + ImageId image_id; + bool picked{}; }; struct ImageAllocBase { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 917184df8..39a5347fa 100755 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -152,9 +153,12 @@ public: /// Remove images in a region void UnmapMemory(VAddr cpu_addr, size_t size); + /// Remove images in a region + void UnmapGPUMemory(GPUVAddr gpu_addr, size_t size); + /// Blit an image with the given parameters void BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, - const Tegra::Engines::Fermi2D::Surface& src, s32 src_address_offset, + const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Config& copy); /// Invalidate the contents of the color buffer index @@ -188,7 +192,22 @@ public: private: /// Iterate over all page indices in a range template - static void ForEachPage(VAddr addr, size_t size, Func&& func) { + static void ForEachCPUPage(VAddr addr, size_t size, Func&& func) { + static constexpr bool RETURNS_BOOL = std::is_same_v, bool>; + const u64 page_end = (addr + size - 1) >> PAGE_BITS; + for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) { + if constexpr (RETURNS_BOOL) { + if (func(page)) { + break; + } + } else { + func(page); + } + } + } + + template + static void ForEachGPUPage(GPUVAddr addr, size_t size, Func&& func) { static constexpr bool RETURNS_BOOL = std::is_same_v, bool>; const u64 page_end = (addr + size - 1) >> PAGE_BITS; for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) { @@ -218,7 +237,7 @@ private: FramebufferId GetFramebufferId(const RenderTargets& key); /// Refresh the contents (pixel data) of an image - void RefreshContents(Image& image); + void RefreshContents(Image& image, ImageId image_id); /// Upload data from guest to an image template @@ -248,8 +267,7 @@ private: /// Return a blit image pair from the given guest blit parameters [[nodiscard]] BlitImages GetBlitImages(const Tegra::Engines::Fermi2D::Surface& dst, - const Tegra::Engines::Fermi2D::Surface& src, - s32 src_address_offset); + const Tegra::Engines::Fermi2D::Surface& src); /// Find or create a sampler from a guest descriptor sampler [[nodiscard]] SamplerId FindSampler(const TSCEntry& config); @@ -268,6 +286,16 @@ private: template void ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& func); + template + void ForEachImageInRegionGPU(GPUVAddr gpu_addr, size_t size, Func&& func); + + template + void ForEachSparseImageInRegion(GPUVAddr gpu_addr, size_t size, Func&& func); + + /// Iterates over all the images in a region calling func + template + void ForEachSparseSegment(ImageBase& image, Func&& func); + /// Find or create an image view in the given image with the passed parameters [[nodiscard]] ImageViewId FindOrEmplaceImageView(ImageId image_id, const ImageViewInfo& info); @@ -278,10 +306,10 @@ private: void UnregisterImage(ImageId image); /// Track CPU reads and writes for image - void TrackImage(ImageBase& image); + void TrackImage(ImageBase& image, ImageId image_id); /// Stop tracking CPU reads and writes for image - void UntrackImage(ImageBase& image); + void UntrackImage(ImageBase& image, ImageId image_id); /// Delete image from the cache void DeleteImage(ImageId image); @@ -339,7 +367,13 @@ private: std::unordered_map samplers; std::unordered_map framebuffers; - std::unordered_map, IdentityHash> page_table; + std::unordered_map, IdentityHash> page_table; + std::unordered_map, IdentityHash> gpu_page_table; + std::unordered_map, IdentityHash> sparse_page_table; + + std::unordered_map> sparse_views; + + VAddr virtual_invalid_space{}; bool has_deleted_images = false; u64 total_used_memory = 0; @@ -348,6 +382,7 @@ private: u64 critical_memory; SlotVector slot_images; + SlotVector slot_map_views; SlotVector slot_image_views; SlotVector slot_image_allocs; SlotVector slot_samplers; @@ -458,7 +493,7 @@ void TextureCache

::RunGarbageCollector() { } } if (True(image->flags & ImageFlagBits::Tracked)) { - UntrackImage(*image); + UntrackImage(*image, image_id); } UnregisterImage(image_id); DeleteImage(image_id); @@ -657,7 +692,9 @@ void TextureCache

::WriteMemory(VAddr cpu_addr, size_t size) { return; } image.flags |= ImageFlagBits::CpuModified; - UntrackImage(image); + if (True(image.flags & ImageFlagBits::Tracked)) { + UntrackImage(image, image_id); + } }); } @@ -694,18 +731,35 @@ void TextureCache

::UnmapMemory(VAddr cpu_addr, size_t size) { for (const ImageId id : deleted_images) { Image& image = slot_images[id]; if (True(image.flags & ImageFlagBits::Tracked)) { - UntrackImage(image); + UntrackImage(image, id); } UnregisterImage(id); DeleteImage(id); } } +template +void TextureCache

::UnmapGPUMemory(GPUVAddr gpu_addr, size_t size) { + std::vector deleted_images; + ForEachImageInRegionGPU(gpu_addr, size, + [&](ImageId id, Image&) { deleted_images.push_back(id); }); + for (const ImageId id : deleted_images) { + Image& image = slot_images[id]; + if (True(image.flags & ImageFlagBits::Remapped)) { + continue; + } + image.flags |= ImageFlagBits::Remapped; + if (True(image.flags & ImageFlagBits::Tracked)) { + UntrackImage(image, id); + } + } +} + template void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, - const Tegra::Engines::Fermi2D::Surface& src, s32 src_address_offset, + const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Config& copy) { - const BlitImages images = GetBlitImages(dst, src, src_address_offset); + const BlitImages images = GetBlitImages(dst, src); const ImageId dst_id = images.dst_id; const ImageId src_id = images.src_id; PrepareImage(src_id, false, false); @@ -725,7 +779,7 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, Offset2D{.x = copy.src_x1 >> src_samples_x, .y = copy.src_y1 >> src_samples_y}, }; - const std::optional src_base = src_image.TryFindBase(src.Address() + src_address_offset); + const std::optional src_base = src_image.TryFindBase(src.Address()); const SubresourceRange src_range{.base = src_base.value(), .extent = {1, 1}}; const ImageViewInfo src_view_info(ImageViewType::e2D, images.src_format, src_range); const auto [src_framebuffer_id, src_view_id] = RenderTargetFromImage(src_id, src_view_info); @@ -793,9 +847,10 @@ typename P::ImageView* TextureCache

::TryFindFramebufferImageView(VAddr cpu_ad if (it == page_table.end()) { return nullptr; } - const auto& image_ids = it->second; - for (const ImageId image_id : image_ids) { - const ImageBase& image = slot_images[image_id]; + const auto& image_map_ids = it->second; + for (const ImageMapId map_id : image_map_ids) { + const ImageMapView& map = slot_map_views[map_id]; + const ImageBase& image = slot_images[map.image_id]; if (image.cpu_addr != cpu_addr) { continue; } @@ -875,13 +930,13 @@ bool TextureCache

::IsRegionGpuModified(VAddr addr, size_t size) { } template -void TextureCache

::RefreshContents(Image& image) { +void TextureCache

::RefreshContents(Image& image, ImageId image_id) { if (False(image.flags & ImageFlagBits::CpuModified)) { // Only upload modified images return; } image.flags &= ~ImageFlagBits::CpuModified; - TrackImage(image); + TrackImage(image, image_id); if (image.info.num_samples > 1) { LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); @@ -918,7 +973,7 @@ void TextureCache

::UploadImageContents(Image& image, StagingBuffer& staging) template ImageViewId TextureCache

::FindImageView(const TICEntry& config) { - if (!IsValidAddress(gpu_memory, config)) { + if (!IsValidEntry(gpu_memory, config)) { return NULL_IMAGE_VIEW_ID; } const auto [pair, is_new] = image_views.try_emplace(config); @@ -960,14 +1015,20 @@ ImageId TextureCache

::FindOrInsertImage(const ImageInfo& info, GPUVAddr gpu_a template ImageId TextureCache

::FindImage(const ImageInfo& info, GPUVAddr gpu_addr, RelaxedOptions options) { - const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); if (!cpu_addr) { - return ImageId{}; + cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr, CalculateGuestSizeInBytes(info)); + if (!cpu_addr) { + return ImageId{}; + } } const bool broken_views = runtime.HasBrokenTextureViewFormats(); const bool native_bgr = runtime.HasNativeBgr(); ImageId image_id; const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { + if (True(existing_image.flags & ImageFlagBits::Remapped)) { + return false; + } if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear) { const bool strict_size = False(options & RelaxedOptions::Size) && True(existing_image.flags & ImageFlagBits::Strong); @@ -993,7 +1054,16 @@ ImageId TextureCache

::FindImage(const ImageInfo& info, GPUVAddr gpu_addr, template ImageId TextureCache

::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr, RelaxedOptions options) { - const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + const auto size = CalculateGuestSizeInBytes(info); + cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr, size); + if (!cpu_addr) { + const VAddr fake_addr = ~(1ULL << 40ULL) + virtual_invalid_space; + virtual_invalid_space += Common::AlignUp(size, 32); + cpu_addr = std::optional(fake_addr); + } + } ASSERT_MSG(cpu_addr, "Tried to insert an image to an invalid gpu_addr=0x{:x}", gpu_addr); const ImageId image_id = JoinImages(info, gpu_addr, *cpu_addr); const Image& image = slot_images[image_id]; @@ -1013,10 +1083,16 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA const bool broken_views = runtime.HasBrokenTextureViewFormats(); const bool native_bgr = runtime.HasNativeBgr(); std::vector overlap_ids; + std::unordered_set overlaps_found; std::vector left_aliased_ids; std::vector right_aliased_ids; + std::unordered_set ignore_textures; std::vector bad_overlap_ids; - ForEachImageInRegion(cpu_addr, size_bytes, [&](ImageId overlap_id, ImageBase& overlap) { + const auto region_check = [&](ImageId overlap_id, ImageBase& overlap) { + if (True(overlap.flags & ImageFlagBits::Remapped)) { + ignore_textures.insert(overlap_id); + return; + } if (info.type == ImageType::Linear) { if (info.pitch == overlap.info.pitch && gpu_addr == overlap.gpu_addr) { // Alias linear images with the same pitch @@ -1024,6 +1100,7 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA } return; } + overlaps_found.insert(overlap_id); static constexpr bool strict_size = true; const std::optional solution = ResolveOverlap( new_info, gpu_addr, cpu_addr, overlap, strict_size, broken_views, native_bgr); @@ -1047,12 +1124,40 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA bad_overlap_ids.push_back(overlap_id); overlap.flags |= ImageFlagBits::BadOverlap; } - }); + }; + ForEachImageInRegion(cpu_addr, size_bytes, region_check); + const auto region_check_gpu = [&](ImageId overlap_id, ImageBase& overlap) { + if (!overlaps_found.contains(overlap_id)) { + if (True(overlap.flags & ImageFlagBits::Remapped)) { + ignore_textures.insert(overlap_id); + } + if (overlap.gpu_addr == gpu_addr && overlap.guest_size_bytes == size_bytes) { + ignore_textures.insert(overlap_id); + } + } + }; + ForEachSparseImageInRegion(gpu_addr, size_bytes, region_check_gpu); const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); Image& new_image = slot_images[new_image_id]; + if (!gpu_memory.IsContinousRange(new_image.gpu_addr, new_image.guest_size_bytes)) { + new_image.flags |= ImageFlagBits::Sparse; + } + + for (const ImageId overlap_id : ignore_textures) { + Image& overlap = slot_images[overlap_id]; + if (True(overlap.flags & ImageFlagBits::GpuModified)) { + UNIMPLEMENTED(); + } + if (True(overlap.flags & ImageFlagBits::Tracked)) { + UntrackImage(overlap, overlap_id); + } + UnregisterImage(overlap_id); + DeleteImage(overlap_id); + } + // TODO: Only upload what we need - RefreshContents(new_image); + RefreshContents(new_image, new_image_id); for (const ImageId overlap_id : overlap_ids) { Image& overlap = slot_images[overlap_id]; @@ -1064,7 +1169,7 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA runtime.CopyImage(new_image, overlap, copies); } if (True(overlap.flags & ImageFlagBits::Tracked)) { - UntrackImage(overlap); + UntrackImage(overlap, overlap_id); } UnregisterImage(overlap_id); DeleteImage(overlap_id); @@ -1092,11 +1197,10 @@ ImageId TextureCache

::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA template typename TextureCache

::BlitImages TextureCache

::GetBlitImages( - const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src, - s32 src_address_offset) { + const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src) { static constexpr auto FIND_OPTIONS = RelaxedOptions::Format | RelaxedOptions::Samples; const GPUVAddr dst_addr = dst.Address(); - const GPUVAddr src_addr = src.Address() + src_address_offset; + const GPUVAddr src_addr = src.Address(); ImageInfo dst_info(dst); ImageInfo src_info(src); ImageId dst_id; @@ -1200,7 +1304,8 @@ void TextureCache

::ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& f using FuncReturn = typename std::invoke_result::type; static constexpr bool BOOL_BREAK = std::is_same_v; boost::container::small_vector images; - ForEachPage(cpu_addr, size, [this, &images, cpu_addr, size, func](u64 page) { + boost::container::small_vector maps; + ForEachCPUPage(cpu_addr, size, [this, &images, &maps, cpu_addr, size, func](u64 page) { const auto it = page_table.find(page); if (it == page_table.end()) { if constexpr (BOOL_BREAK) { @@ -1209,12 +1314,63 @@ void TextureCache

::ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& f return; } } + for (const ImageMapId map_id : it->second) { + ImageMapView& map = slot_map_views[map_id]; + if (map.picked) { + continue; + } + if (!map.Overlaps(cpu_addr, size)) { + continue; + } + map.picked = true; + maps.push_back(map_id); + Image& image = slot_images[map.image_id]; + if (True(image.flags & ImageFlagBits::Picked)) { + continue; + } + image.flags |= ImageFlagBits::Picked; + images.push_back(map.image_id); + if constexpr (BOOL_BREAK) { + if (func(map.image_id, image)) { + return true; + } + } else { + func(map.image_id, image); + } + } + if constexpr (BOOL_BREAK) { + return false; + } + }); + for (const ImageId image_id : images) { + slot_images[image_id].flags &= ~ImageFlagBits::Picked; + } + for (const ImageMapId map_id : maps) { + slot_map_views[map_id].picked = false; + } +} + +template +template +void TextureCache

::ForEachImageInRegionGPU(GPUVAddr gpu_addr, size_t size, Func&& func) { + using FuncReturn = typename std::invoke_result::type; + static constexpr bool BOOL_BREAK = std::is_same_v; + boost::container::small_vector images; + ForEachGPUPage(gpu_addr, size, [this, &images, gpu_addr, size, func](u64 page) { + const auto it = gpu_page_table.find(page); + if (it == gpu_page_table.end()) { + if constexpr (BOOL_BREAK) { + return false; + } else { + return; + } + } for (const ImageId image_id : it->second) { Image& image = slot_images[image_id]; if (True(image.flags & ImageFlagBits::Picked)) { continue; } - if (!image.Overlaps(cpu_addr, size)) { + if (!image.OverlapsGPU(gpu_addr, size)) { continue; } image.flags |= ImageFlagBits::Picked; @@ -1236,6 +1392,69 @@ void TextureCache

::ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& f } } +template +template +void TextureCache

::ForEachSparseImageInRegion(GPUVAddr gpu_addr, size_t size, Func&& func) { + using FuncReturn = typename std::invoke_result::type; + static constexpr bool BOOL_BREAK = std::is_same_v; + boost::container::small_vector images; + ForEachGPUPage(gpu_addr, size, [this, &images, gpu_addr, size, func](u64 page) { + const auto it = sparse_page_table.find(page); + if (it == sparse_page_table.end()) { + if constexpr (BOOL_BREAK) { + return false; + } else { + return; + } + } + for (const ImageId image_id : it->second) { + Image& image = slot_images[image_id]; + if (True(image.flags & ImageFlagBits::Picked)) { + continue; + } + if (!image.OverlapsGPU(gpu_addr, size)) { + continue; + } + image.flags |= ImageFlagBits::Picked; + images.push_back(image_id); + if constexpr (BOOL_BREAK) { + if (func(image_id, image)) { + return true; + } + } else { + func(image_id, image); + } + } + if constexpr (BOOL_BREAK) { + return false; + } + }); + for (const ImageId image_id : images) { + slot_images[image_id].flags &= ~ImageFlagBits::Picked; + } +} + +template +template +void TextureCache

::ForEachSparseSegment(ImageBase& image, Func&& func) { + using FuncReturn = typename std::invoke_result::type; + static constexpr bool RETURNS_BOOL = std::is_same_v; + const auto segments = gpu_memory.GetSubmappedRange(image.gpu_addr, image.guest_size_bytes); + for (auto& segment : segments) { + const auto gpu_addr = segment.first; + const auto size = segment.second; + std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + ASSERT(cpu_addr); + if constexpr (RETURNS_BOOL) { + if (func(gpu_addr, *cpu_addr, size)) { + break; + } + } else { + func(gpu_addr, *cpu_addr, size); + } + } +} + template ImageViewId TextureCache

::FindOrEmplaceImageView(ImageId image_id, const ImageViewInfo& info) { Image& image = slot_images[image_id]; @@ -1253,8 +1472,6 @@ void TextureCache

::RegisterImage(ImageId image_id) { ASSERT_MSG(False(image.flags & ImageFlagBits::Registered), "Trying to register an already registered image"); image.flags |= ImageFlagBits::Registered; - ForEachPage(image.cpu_addr, image.guest_size_bytes, - [this, image_id](u64 page) { page_table[page].push_back(image_id); }); u64 tentative_size = std::max(image.guest_size_bytes, image.unswizzled_size_bytes); if ((IsPixelFormatASTC(image.info.format) && True(image.flags & ImageFlagBits::AcceleratedUpload)) || @@ -1262,6 +1479,27 @@ void TextureCache

::RegisterImage(ImageId image_id) { tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); } total_used_memory += Common::AlignUp(tentative_size, 1024); + ForEachGPUPage(image.gpu_addr, image.guest_size_bytes, + [this, image_id](u64 page) { gpu_page_table[page].push_back(image_id); }); + if (False(image.flags & ImageFlagBits::Sparse)) { + auto map_id = + slot_map_views.insert(image.gpu_addr, image.cpu_addr, image.guest_size_bytes, image_id); + ForEachCPUPage(image.cpu_addr, image.guest_size_bytes, + [this, map_id](u64 page) { page_table[page].push_back(map_id); }); + image.map_view_id = map_id; + return; + } + std::vector sparse_maps{}; + ForEachSparseSegment( + image, [this, image_id, &sparse_maps](GPUVAddr gpu_addr, VAddr cpu_addr, size_t size) { + auto map_id = slot_map_views.insert(gpu_addr, cpu_addr, size, image_id); + ForEachCPUPage(cpu_addr, size, + [this, map_id](u64 page) { page_table[page].push_back(map_id); }); + sparse_maps.push_back(map_id); + }); + sparse_views.emplace(image_id, std::move(sparse_maps)); + ForEachGPUPage(image.gpu_addr, image.guest_size_bytes, + [this, image_id](u64 page) { sparse_page_table[page].push_back(image_id); }); } template @@ -1278,34 +1516,125 @@ void TextureCache

::UnregisterImage(ImageId image_id) { tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); } total_used_memory -= Common::AlignUp(tentative_size, 1024); - ForEachPage(image.cpu_addr, image.guest_size_bytes, [this, image_id](u64 page) { - const auto page_it = page_table.find(page); - if (page_it == page_table.end()) { - UNREACHABLE_MSG("Unregistering unregistered page=0x{:x}", page << PAGE_BITS); - return; - } - std::vector& image_ids = page_it->second; - const auto vector_it = std::ranges::find(image_ids, image_id); - if (vector_it == image_ids.end()) { - UNREACHABLE_MSG("Unregistering unregistered image in page=0x{:x}", page << PAGE_BITS); - return; - } - image_ids.erase(vector_it); + const auto& clear_page_table = + [this, image_id]( + u64 page, + std::unordered_map, IdentityHash>& selected_page_table) { + const auto page_it = selected_page_table.find(page); + if (page_it == selected_page_table.end()) { + UNREACHABLE_MSG("Unregistering unregistered page=0x{:x}", page << PAGE_BITS); + return; + } + std::vector& image_ids = page_it->second; + const auto vector_it = std::ranges::find(image_ids, image_id); + if (vector_it == image_ids.end()) { + UNREACHABLE_MSG("Unregistering unregistered image in page=0x{:x}", + page << PAGE_BITS); + return; + } + image_ids.erase(vector_it); + }; + ForEachGPUPage(image.gpu_addr, image.guest_size_bytes, + [this, &clear_page_table](u64 page) { clear_page_table(page, gpu_page_table); }); + if (False(image.flags & ImageFlagBits::Sparse)) { + const auto map_id = image.map_view_id; + ForEachCPUPage(image.cpu_addr, image.guest_size_bytes, [this, map_id](u64 page) { + const auto page_it = page_table.find(page); + if (page_it == page_table.end()) { + UNREACHABLE_MSG("Unregistering unregistered page=0x{:x}", page << PAGE_BITS); + return; + } + std::vector& image_map_ids = page_it->second; + const auto vector_it = std::ranges::find(image_map_ids, map_id); + if (vector_it == image_map_ids.end()) { + UNREACHABLE_MSG("Unregistering unregistered image in page=0x{:x}", + page << PAGE_BITS); + return; + } + image_map_ids.erase(vector_it); + }); + slot_map_views.erase(map_id); + return; + } + ForEachGPUPage(image.gpu_addr, image.guest_size_bytes, [this, &clear_page_table](u64 page) { + clear_page_table(page, sparse_page_table); }); + auto it = sparse_views.find(image_id); + ASSERT(it != sparse_views.end()); + auto& sparse_maps = it->second; + for (auto& map_view_id : sparse_maps) { + const auto& map_range = slot_map_views[map_view_id]; + const VAddr cpu_addr = map_range.cpu_addr; + const std::size_t size = map_range.size; + ForEachCPUPage(cpu_addr, size, [this, image_id](u64 page) { + const auto page_it = page_table.find(page); + if (page_it == page_table.end()) { + UNREACHABLE_MSG("Unregistering unregistered page=0x{:x}", page << PAGE_BITS); + return; + } + std::vector& image_map_ids = page_it->second; + auto vector_it = image_map_ids.begin(); + while (vector_it != image_map_ids.end()) { + ImageMapView& map = slot_map_views[*vector_it]; + if (map.image_id != image_id) { + vector_it++; + continue; + } + if (!map.picked) { + map.picked = true; + } + vector_it = image_map_ids.erase(vector_it); + } + }); + slot_map_views.erase(map_view_id); + } + sparse_views.erase(it); } template -void TextureCache

::TrackImage(ImageBase& image) { +void TextureCache

::TrackImage(ImageBase& image, ImageId image_id) { ASSERT(False(image.flags & ImageFlagBits::Tracked)); image.flags |= ImageFlagBits::Tracked; - rasterizer.UpdatePagesCachedCount(image.cpu_addr, image.guest_size_bytes, 1); + if (False(image.flags & ImageFlagBits::Sparse)) { + rasterizer.UpdatePagesCachedCount(image.cpu_addr, image.guest_size_bytes, 1); + return; + } + if (True(image.flags & ImageFlagBits::Registered)) { + auto it = sparse_views.find(image_id); + ASSERT(it != sparse_views.end()); + auto& sparse_maps = it->second; + for (auto& map_view_id : sparse_maps) { + const auto& map = slot_map_views[map_view_id]; + const VAddr cpu_addr = map.cpu_addr; + const std::size_t size = map.size; + rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1); + } + return; + } + ForEachSparseSegment(image, + [this]([[maybe_unused]] GPUVAddr gpu_addr, VAddr cpu_addr, size_t size) { + rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1); + }); } template -void TextureCache

::UntrackImage(ImageBase& image) { +void TextureCache

::UntrackImage(ImageBase& image, ImageId image_id) { ASSERT(True(image.flags & ImageFlagBits::Tracked)); image.flags &= ~ImageFlagBits::Tracked; - rasterizer.UpdatePagesCachedCount(image.cpu_addr, image.guest_size_bytes, -1); + if (False(image.flags & ImageFlagBits::Sparse)) { + rasterizer.UpdatePagesCachedCount(image.cpu_addr, image.guest_size_bytes, -1); + return; + } + ASSERT(True(image.flags & ImageFlagBits::Registered)); + auto it = sparse_views.find(image_id); + ASSERT(it != sparse_views.end()); + auto& sparse_maps = it->second; + for (auto& map_view_id : sparse_maps) { + const auto& map = slot_map_views[map_view_id]; + const VAddr cpu_addr = map.cpu_addr; + const std::size_t size = map.size; + rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1); + } } template @@ -1447,10 +1776,10 @@ void TextureCache

::PrepareImage(ImageId image_id, bool is_modification, bool if (invalidate) { image.flags &= ~(ImageFlagBits::CpuModified | ImageFlagBits::GpuModified); if (False(image.flags & ImageFlagBits::Tracked)) { - TrackImage(image); + TrackImage(image, image_id); } } else { - RefreshContents(image); + RefreshContents(image, image_id); SynchronizeAliases(image_id); } if (is_modification) { diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h index c9571f7e4..47a11cb2f 100755 --- a/src/video_core/texture_cache/types.h +++ b/src/video_core/texture_cache/types.h @@ -16,6 +16,7 @@ constexpr size_t MAX_MIP_LEVELS = 14; constexpr SlotId CORRUPT_ID{0xfffffffe}; using ImageId = SlotId; +using ImageMapId = SlotId; using ImageViewId = SlotId; using ImageAllocId = SlotId; using SamplerId = SlotId; @@ -132,8 +133,8 @@ struct BufferImageCopy { }; struct BufferCopy { - size_t src_offset; - size_t dst_offset; + u64 src_offset; + u64 dst_offset; size_t size; }; diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 4efe042b6..10093a11d 100755 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -664,6 +664,16 @@ LevelArray CalculateMipLevelOffsets(const ImageInfo& info) noexcept { return offsets; } +LevelArray CalculateMipLevelSizes(const ImageInfo& info) noexcept { + const u32 num_levels = info.resources.levels; + const LevelInfo level_info = MakeLevelInfo(info); + LevelArray sizes{}; + for (u32 level = 0; level < num_levels; ++level) { + sizes[level] = CalculateLevelSize(level_info, level); + } + return sizes; +} + std::vector CalculateSliceOffsets(const ImageInfo& info) { ASSERT(info.type == ImageType::e3D); std::vector offsets; @@ -776,14 +786,20 @@ std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn return copies; } -bool IsValidAddress(const Tegra::MemoryManager& gpu_memory, const TICEntry& config) { - if (config.Address() == 0) { +bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config) { + const GPUVAddr address = config.Address(); + if (address == 0) { return false; } - if (config.Address() > (u64(1) << 48)) { + if (address > (1ULL << 48)) { return false; } - return gpu_memory.GpuToCpuAddress(config.Address()).has_value(); + if (gpu_memory.GpuToCpuAddress(address).has_value()) { + return true; + } + const ImageInfo info{config}; + const size_t guest_size_bytes = CalculateGuestSizeInBytes(info); + return gpu_memory.GpuToCpuAddress(address, guest_size_bytes).has_value(); } std::vector UnswizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h index cdc5cbc75..766502908 100755 --- a/src/video_core/texture_cache/util.h +++ b/src/video_core/texture_cache/util.h @@ -40,6 +40,8 @@ struct OverlapResult { [[nodiscard]] LevelArray CalculateMipLevelOffsets(const ImageInfo& info) noexcept; +[[nodiscard]] LevelArray CalculateMipLevelSizes(const ImageInfo& info) noexcept; + [[nodiscard]] std::vector CalculateSliceOffsets(const ImageInfo& info); [[nodiscard]] std::vector CalculateSliceSubresources(const ImageInfo& info); @@ -55,7 +57,7 @@ struct OverlapResult { const ImageInfo& src, SubresourceBase base); -[[nodiscard]] bool IsValidAddress(const Tegra::MemoryManager& gpu_memory, const TICEntry& config); +[[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config); [[nodiscard]] std::vector UnswizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info,