early-access version 2585

Author: pineappleEA
Date:   2022-03-21 16:53:01 +01:00
Parent: dd6ea95c54
Commit: 9c48e94f2d

31 changed files with 1072 additions and 483 deletions

View File

@@ -38,7 +38,7 @@ if (MSVC)
         /MP
         /Zf
         /Zi
-        /Zm200
+        /Zm300
         /Zo
         /permissive-
         /EHsc
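
Note: /Zm scales MSVC's precompiled-header memory allocation relative to the default, so bumping it from 200 to 300 presumably heads off C1076/C3859 "compiler limit" errors as yuzu's PCH grows; the motive is not stated in the commit.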

View File

@@ -322,7 +322,7 @@ struct Memory::Impl {
         }
         if (Settings::IsFastmemEnabled()) {
-            const bool is_read_enable = Settings::IsGPULevelHigh() || !cached;
+            const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached;
             system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached);
         }
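
This inverts when cached fastmem pages are read-protected: they now stay host-readable at every accuracy level except Extreme, whereas before they were readable only at High and above. That keeps the new host-pointer index reads below working at Normal accuracy. A minimal model of the new rule, assuming a simplified GpuAccuracy enum in place of the Settings queries (IsGPULevelHigh() is true at High and Extreme, IsGPULevelExtreme() only at Extreme):

enum class GpuAccuracy { Normal, High, Extreme };

// Old rule: level >= High || !cached
// New rule: cached pages stay readable unless accuracy is Extreme,
// reserving strict read-trapping for the level that pays for it.
bool IsReadEnabled(GpuAccuracy level, bool cached) {
    return level != GpuAccuracy::Extreme || !cached;
}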

View File

@@ -1495,15 +1495,13 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
         overlap_ids.push_back(overlap_id);
         overlap.Pick();
         const VAddr overlap_cpu_addr = overlap.CpuAddr();
-        bool goes_left = false;
-        if (overlap_cpu_addr < begin) {
-            goes_left = true;
+        const bool expands_left = overlap_cpu_addr < begin;
+        if (expands_left) {
             cpu_addr = begin = overlap_cpu_addr;
         }
         const VAddr overlap_end = overlap_cpu_addr + overlap.SizeBytes();
-        bool goes_right = false;
-        if (overlap_end > end) {
-            goes_right = true;
+        const bool expands_right = overlap_end > end;
+        if (expands_right) {
             end = overlap_end;
         }
         stream_score += overlap.StreamScore();
@@ -1511,11 +1509,11 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
             // When this memory region has been joined a bunch of times, we assume it's being used
             // as a stream buffer. Increase the size to skip constantly recreating buffers.
             has_stream_leap = true;
-            if (goes_right) {
+            if (expands_right) {
                 begin -= PAGE_SIZE * 256;
                 cpu_addr = begin;
             }
-            if (goes_left) {
+            if (expands_left) {
                 end += PAGE_SIZE * 256;
             }
         }
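
The rename to expands_left/expands_right makes the stream-leap padding legible: once a region has been joined often enough to look like a stream buffer, the merged range is padded by 256 pages. A self-contained sketch of the merge-and-pad step (the Range type and constants are illustrative, not the cache's; note the cross-over kept from the old goes_* logic, where right growth pads the left edge and vice versa):

#include <cstdint>

using VAddr = std::uint64_t;
constexpr VAddr PAGE_SIZE = 4096;

struct Range {
    VAddr begin;
    VAddr end;
};

// Merges one overlapping buffer into the range being resolved and, when the
// region behaves like a stream buffer, pads it by 256 pages so buffers are
// not constantly recreated.
Range MergeOverlap(Range r, Range overlap, bool has_stream_leap) {
    const bool expands_left = overlap.begin < r.begin;
    if (expands_left) {
        r.begin = overlap.begin;
    }
    const bool expands_right = overlap.end > r.end;
    if (expands_right) {
        r.end = overlap.end;
    }
    if (has_stream_leap) {
        if (expands_right) {
            r.begin -= PAGE_SIZE * 256;
        }
        if (expands_left) {
            r.end += PAGE_SIZE * 256;
        }
    }
    return r;
}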

View File

@@ -2,8 +2,11 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <algorithm>
 #include <cstring>
 #include <optional>
+#include "common/alignment.h"
 #include "common/assert.h"
+#include "common/settings.h"
 #include "core/core.h"
@@ -27,6 +30,7 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
       upload_state{memory_manager, regs.upload} {
     dirty.flags.flip();
     InitializeRegisterDefaults();
+    accelerated_reads = Settings::IsFastmemEnabled();
 }
 
 Maxwell3D::~Maxwell3D() = default;
@@ -210,28 +214,14 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
         return ProcessCBBind(4);
     case MAXWELL3D_REG_INDEX(draw.vertex_end_gl):
         return DrawArrays();
-    case MAXWELL3D_REG_INDEX(small_index): {
+    case MAXWELL3D_REG_INDEX(small_index):
         regs.index_array.count = regs.small_index.count;
         regs.index_array.first = regs.small_index.first;
         dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        bool is_extreme = Settings::IsGPULevelExtreme();
-        if (!is_extreme) {
-            for (size_t i = 0; i < Regs::NumVertexArrays; i++) {
-                if (!dirty.flags[VideoCommon::Dirty::VertexBuffer0 + i]) {
-                    continue;
-                }
-                const u32 stride = regs.vertex_array[i].stride;
-                const u32 num_vertices = regs.index_array.first + regs.index_array.count;
-                const GPUVAddr gpu_addr_begin =
-                    regs.vertex_array[i].StartAddress() + regs.index_array.first * stride;
-                const GPUVAddr gpu_addr_end = gpu_addr_begin + num_vertices * stride + 1;
-                regs.vertex_array_limit[i].SetAddress(gpu_addr_end);
-            }
+        if (!Settings::IsGPULevelExtreme()) {
+            RecalculateVertexArrayLimit();
         }
-        DrawArrays();
-        return;
-    }
+        return DrawArrays();
     case MAXWELL3D_REG_INDEX(topology_override):
         use_topology_override = true;
         return;
@@ -685,4 +675,71 @@ void Maxwell3D::ProcessClearBuffers() {
     rasterizer->Clear();
 }
 
+void Maxwell3D::RecalculateVertexArrayLimit() {
+    GPUVAddr start_address = regs.index_array.StartAddress();
+    auto& vn_state = vertex_num_approx_state;
+    if (start_address != vn_state.last_index_array_start ||
+        vn_state.current_min_index != regs.index_array.first) {
+        vn_state.last_index_array_start = start_address;
+        vn_state.current_max_index = regs.index_array.first;
+        vn_state.current_min_index = regs.index_array.first;
+        vn_state.current_num_vertices = 0;
+    }
+    const u32 index_count = regs.index_array.first + regs.index_array.count;
+    if (index_count <= vn_state.current_max_index) {
+        return;
+    }
+    const u32 max_base = std::max(regs.index_array.first, vn_state.current_max_index);
+    const u32 num_indices = index_count - max_base;
+    const size_t size_index = regs.index_array.FormatSizeInBytes();
+    const size_t expected_size = num_indices * size_index;
+    const size_t offset = max_base * size_index;
+    auto maybe_ptr = memory_manager.GpuToHostPointer(start_address + offset);
+    u8* ptr;
+    if (accelerated_reads && maybe_ptr) {
+        ptr = *maybe_ptr;
+    } else {
+        vn_state.index_buffer_cache.resize(Common::DivideUp(expected_size, sizeof(u32)));
+        ptr = reinterpret_cast<u8*>(vn_state.index_buffer_cache.data());
+        memory_manager.ReadBlockUnsafe(start_address + offset, ptr, expected_size);
+    }
+    vn_state.current_max_index = index_count;
+    u32 new_num_vertices{};
+    switch (regs.index_array.format) {
+    case Regs::IndexFormat::UnsignedByte: {
+        std::span<const u8> span{ptr, num_indices};
+        const auto max = std::max_element(span.begin(), span.end());
+        new_num_vertices = *max + 1;
+        break;
+    }
+    case Regs::IndexFormat::UnsignedShort: {
+        std::span<const u16> span{reinterpret_cast<const u16*>(ptr), num_indices};
+        const auto max = std::max_element(span.begin(), span.end());
+        new_num_vertices = *max + 1;
+        break;
+    }
+    case Regs::IndexFormat::UnsignedInt: {
+        std::span<const u32> span{reinterpret_cast<const u32*>(ptr), num_indices};
+        const auto max = std::max_element(span.begin(), span.end());
+        new_num_vertices = *max + 1;
+        break;
+    }
+    }
+    if (new_num_vertices > vn_state.current_num_vertices) {
+        vn_state.current_num_vertices = new_num_vertices;
+        for (size_t i = 0; i < Regs::NumVertexArrays; i++) {
+            if (!regs.vertex_array[i].enable) {
+                continue;
+            }
+            const u32 stride = regs.vertex_array[i].stride;
+            const GPUVAddr gpu_addr_begin = regs.vertex_array[i].StartAddress();
+            const GPUVAddr gpu_addr_end = gpu_addr_begin + new_num_vertices * stride - 1;
+            regs.vertex_array_limit[i].SetAddress(gpu_addr_end);
+            dirty.flags[VideoCommon::Dirty::VertexBuffer0 + i] = true;
+        }
+    }
+}
+
 } // namespace Tegra::Engines
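
The new function scans the index buffer for the highest vertex actually referenced, replacing the old worst-case first + count estimate when sizing vertex-array limits. The per-format scan distils to the following sketch (illustrative names, not the engine's):

#include <algorithm>
#include <cstdint>
#include <span>

// Returns the number of vertices a draw needs: the largest index it
// references, plus one. T is u8/u16/u32 depending on the index format.
template <typename T>
std::uint32_t RequiredVertices(std::span<const T> indices) {
    if (indices.empty()) {
        return 0;
    }
    return static_cast<std::uint32_t>(
               *std::max_element(indices.begin(), indices.end())) + 1;
}

// Example: indices {0, 5, 2} need 6 vertices, so a buffer with stride 16
// gets the inclusive end address start + 6 * 16 - 1, as in the diff above.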

View File

@@ -1498,6 +1498,16 @@ public:
         Tables tables{};
     } dirty;
 
+    struct VertexNumApproxState {
+        GPUVAddr last_index_array_start;
+        u32 current_max_index;
+        u32 current_min_index;
+        u32 current_num_vertices;
+        std::vector<u32> index_buffer_cache;
+    } vertex_num_approx_state;
+
+    bool accelerated_reads{};
+
 private:
     void InitializeRegisterDefaults();
@@ -1566,6 +1576,8 @@ private:
     // Handles a instance drawcall from MME
     void StepInstance(MMEDrawMode expected_mode, u32 count);
 
+    void RecalculateVertexArrayLimit();
+
     /// Returns a query's value or an empty object if the value will be deferred through a cache.
     std::optional<u64> GetQueryResult();
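
VertexNumApproxState memoizes the scan across draws: while consecutive draws stream from the same index buffer with the same base index, only indices past current_max_index are re-read, and index_buffer_cache is reused as staging whenever no host pointer is available. That incremental window is what keeps the per-draw recalculation affordable.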

View File

@@ -6,6 +6,7 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "common/host_memory.h"
#include "common/logging/log.h"
#include "core/core.h"
#include "core/hle/kernel/k_page_table.h"
@@ -186,6 +187,19 @@ std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const {
     return page_entry.ToAddress() + (gpu_addr & page_mask);
 }
 
+std::optional<u8*> MemoryManager::GpuToHostPointer(GPUVAddr gpu_addr) const {
+    auto cpu_addr = GpuToCpuAddress(gpu_addr);
+    if (!cpu_addr) {
+        return std::nullopt;
+    }
+    auto& device_memory = system.DeviceMemory();
+    auto base = device_memory.buffer.VirtualBasePointer();
+    if (!base) {
+        return std::nullopt;
+    }
+    return base + *cpu_addr;
+}
+
 std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr, std::size_t size) const {
     size_t page_index{addr >> page_bits};
     const size_t page_last{(addr + size + page_size - 1) >> page_bits};
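
GpuToHostPointer gives callers a zero-copy view into the fastmem arena; when translation fails or fastmem is off, they stage the data instead, as RecalculateVertexArrayLimit does above. A sketch of that caller pattern (gpu_addr, size and memory_manager are assumed to be in scope; staging stands in for index_buffer_cache):

std::vector<u32> staging;
u8* ptr = nullptr;
if (const auto host_ptr = memory_manager.GpuToHostPointer(gpu_addr)) {
    ptr = *host_ptr;  // read guest memory in place, no copy
} else {
    staging.resize(Common::DivideUp(size, sizeof(u32)));
    ptr = reinterpret_cast<u8*>(staging.data());
    memory_manager.ReadBlockUnsafe(gpu_addr, ptr, size);
}
// Either way, ptr now addresses `size` bytes of guest data.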

View File

@@ -76,6 +76,8 @@ public:
     [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const;
 
+    [[nodiscard]] std::optional<u8*> GpuToHostPointer(GPUVAddr addr) const;
+
     [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr, std::size_t size) const;
 
     template <typename T>