diff --git a/README.md b/README.md index 2b87f174e..cfc556c8b 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ yuzu emulator early access ============= -This is the source code for early-access 1855. +This is the source code for early-access 1857. ## Legal Notice diff --git a/src/common/thread_worker.h b/src/common/thread_worker.h index ddd73220a..8272985ff 100755 --- a/src/common/thread_worker.h +++ b/src/common/thread_worker.h @@ -81,7 +81,7 @@ public: { std::unique_lock lock{queue_mutex}; requests.emplace(std::move(work)); - ++work_scherduled; + ++work_scheduled; } condition.notify_one(); } @@ -94,7 +94,7 @@ public: }); std::unique_lock lock{queue_mutex}; wait_condition.wait(lock, [this] { - return workers_stopped >= workers_queued || work_done >= work_scherduled; + return workers_stopped >= workers_queued || work_done >= work_scheduled; }); } @@ -103,7 +103,7 @@ private: std::mutex queue_mutex; std::condition_variable_any condition; std::condition_variable wait_condition; - std::atomic work_scherduled{}; + std::atomic work_scheduled{}; std::atomic work_done{}; std::atomic workers_stopped{}; std::atomic workers_queued{}; diff --git a/src/common/unique_function.h b/src/common/unique_function.h index 0a2ee9bb5..5c3e39b03 100755 --- a/src/common/unique_function.h +++ b/src/common/unique_function.h @@ -14,7 +14,7 @@ class UniqueFunction { class CallableBase { public: virtual ~CallableBase() = default; - virtual ResultType operator()(Args...) = 0; + virtual ResultType operator()(Args&&...) = 0; }; template @@ -23,7 +23,7 @@ class UniqueFunction { Callable(Functor&& functor_) : functor{std::move(functor_)} {} ~Callable() override = default; - ResultType operator()(Args... args) override { + ResultType operator()(Args&&... args) override { return functor(std::forward(args)...); } @@ -38,25 +38,20 @@ public: UniqueFunction(Functor&& functor) : callable{std::make_unique>(std::move(functor))} {} - UniqueFunction& operator=(UniqueFunction&& rhs) noexcept { - callable = std::move(rhs.callable); - return *this; - } + UniqueFunction& operator=(UniqueFunction&& rhs) noexcept = default; + UniqueFunction(UniqueFunction&& rhs) noexcept = default; - UniqueFunction(UniqueFunction&& rhs) noexcept - : callable{std::move(rhs.callable)} {} + UniqueFunction& operator=(const UniqueFunction&) = delete; + UniqueFunction(const UniqueFunction&) = delete; - ResultType operator()(Args... args) const { + ResultType operator()(Args&&... args) const { return (*callable)(std::forward(args)...); } explicit operator bool() const noexcept { - return callable != nullptr; + return static_cast(callable); } - UniqueFunction& operator=(const UniqueFunction&) = delete; - UniqueFunction(const UniqueFunction&) = delete; - private: std::unique_ptr callable; }; diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 0f640fdae..f26530ede 100755 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -7,6 +7,10 @@ #include "video_core/engines/fermi_2d.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" +#include "video_core/surface.h" + +using VideoCore::Surface::BytesPerBlock; +using VideoCore::Surface::PixelFormatFromRenderTargetFormat; namespace Tegra::Engines { @@ -49,7 +53,7 @@ void Fermi2D::Blit() { UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled"); const auto& args = regs.pixels_from_memory; - const Config config{ + Config config{ .operation = regs.operation, .filter = args.sample_mode.filter, .dst_x0 = args.dst_x0, @@ -61,7 +65,21 @@ void Fermi2D::Blit() { .src_x1 = static_cast((args.du_dx * args.dst_width + args.src_x0) >> 32), .src_y1 = static_cast((args.dv_dy * args.dst_height + args.src_y0) >> 32), }; - if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, config)) { + Surface src = regs.src; + const auto bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format)); + const auto need_align_to_pitch = + src.linear == Tegra::Engines::Fermi2D::MemoryLayout::Pitch && + static_cast(src.width) == config.src_x1 && + config.src_x1 > static_cast(src.pitch / bytes_per_pixel) && config.src_x0 > 0; + if (need_align_to_pitch) { + auto address = src.Address() + config.src_x0 * bytes_per_pixel; + src.addr_upper = static_cast(address >> 32); + src.addr_lower = static_cast(address); + src.width -= config.src_x0; + config.src_x1 -= config.src_x0; + config.src_x0 = 0; + } + if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) { UNIMPLEMENTED(); } } diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index abaf1ee6a..8fb5be393 100755 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -261,9 +261,9 @@ void UtilShaders::CopyBC4(Image& dst_image, Image& src_image, std::span src_region_override = {}, - std::optional dst_region_override = {}); + const Tegra::Engines::Fermi2D::Config& copy); /// Invalidate the contents of the color buffer index /// These contents become unspecified, the cache can assume aggressive optimizations. @@ -760,9 +758,7 @@ void TextureCache

::UnmapGPUMemory(GPUVAddr gpu_addr, size_t size) { template void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src, - const Tegra::Engines::Fermi2D::Config& copy, - std::optional src_override, - std::optional dst_override) { + const Tegra::Engines::Fermi2D::Config& copy) { const BlitImages images = GetBlitImages(dst, src); const ImageId dst_id = images.dst_id; const ImageId src_id = images.src_id; @@ -773,47 +769,25 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, const ImageBase& src_image = slot_images[src_id]; // TODO: Deduplicate - const std::optional dst_base = dst_image.TryFindBase(dst.Address()); - const SubresourceRange dst_range{.base = dst_base.value(), .extent = {1, 1}}; - const ImageViewInfo dst_view_info(ImageViewType::e2D, images.dst_format, dst_range); - const auto [dst_framebuffer_id, dst_view_id] = RenderTargetFromImage(dst_id, dst_view_info); - const auto [src_samples_x, src_samples_y] = SamplesLog2(src_image.info.num_samples); - - // out of bounds texture blit checking - const bool use_override = src_override.has_value(); - const s32 src_x0 = copy.src_x0 >> src_samples_x; - s32 src_x1 = use_override ? src_override->end.x : copy.src_x1 >> src_samples_x; - const s32 src_y0 = copy.src_y0 >> src_samples_y; - const s32 src_y1 = copy.src_y1 >> src_samples_y; - - const auto src_width = static_cast(src_image.info.size.width); - const bool width_oob = src_x1 > src_width; - const auto width_diff = width_oob ? src_x1 - src_width : 0; - if (width_oob) { - src_x1 = src_width; - } - - const Region2D src_dimensions{ - Offset2D{.x = src_x0, .y = src_y0}, - Offset2D{.x = src_x1, .y = src_y1}, - }; - const auto src_region = use_override ? *src_override : src_dimensions; - const std::optional src_base = src_image.TryFindBase(src.Address()); const SubresourceRange src_range{.base = src_base.value(), .extent = {1, 1}}; const ImageViewInfo src_view_info(ImageViewType::e2D, images.src_format, src_range); const auto [src_framebuffer_id, src_view_id] = RenderTargetFromImage(src_id, src_view_info); - const auto [dst_samples_x, dst_samples_y] = SamplesLog2(dst_image.info.num_samples); - - const s32 dst_x0 = copy.dst_x0 >> dst_samples_x; - const s32 dst_x1 = copy.dst_x1 >> dst_samples_x; - const s32 dst_y0 = copy.dst_y0 >> dst_samples_y; - const s32 dst_y1 = copy.dst_y1 >> dst_samples_y; - const Region2D dst_dimensions{ - Offset2D{.x = dst_x0, .y = dst_y0}, - Offset2D{.x = dst_x1 - width_diff, .y = dst_y1}, + const auto [src_samples_x, src_samples_y] = SamplesLog2(src_image.info.num_samples); + const Region2D src_region{ + Offset2D{.x = copy.src_x0 >> src_samples_x, .y = copy.src_y0 >> src_samples_y}, + Offset2D{.x = copy.src_x1 >> src_samples_x, .y = copy.src_y1 >> src_samples_y}, + }; + + const std::optional dst_base = dst_image.TryFindBase(dst.Address()); + const SubresourceRange dst_range{.base = dst_base.value(), .extent = {1, 1}}; + const ImageViewInfo dst_view_info(ImageViewType::e2D, images.dst_format, dst_range); + const auto [dst_framebuffer_id, dst_view_id] = RenderTargetFromImage(dst_id, dst_view_info); + const auto [dst_samples_x, dst_samples_y] = SamplesLog2(dst_image.info.num_samples); + const Region2D dst_region{ + Offset2D{.x = copy.dst_x0 >> dst_samples_x, .y = copy.dst_y0 >> dst_samples_y}, + Offset2D{.x = copy.dst_x1 >> dst_samples_x, .y = copy.dst_y1 >> dst_samples_y}, }; - const auto dst_region = use_override ? *dst_override : dst_dimensions; // Always call this after src_framebuffer_id was queried, as the address might be invalidated. Framebuffer* const dst_framebuffer = &slot_framebuffers[dst_framebuffer_id]; @@ -830,21 +804,6 @@ void TextureCache

::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, runtime.BlitImage(dst_framebuffer, dst_view, src_view, dst_region, src_region, copy.filter, copy.operation); } - - if (width_oob) { - // Continue copy of the oob region of the texture on the next row - auto oob_src = src; - oob_src.height++; - const Region2D src_region_override{ - Offset2D{.x = 0, .y = src_y0 + 1}, - Offset2D{.x = width_diff, .y = src_y1 + 1}, - }; - const Region2D dst_region_override{ - Offset2D{.x = dst_x1 - width_diff, .y = dst_y0}, - Offset2D{.x = dst_x1, .y = dst_y1}, - }; - BlitImage(dst, oob_src, copy, src_region_override, dst_region_override); - } } template diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 6ae035638..3ab500760 100755 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -18,15 +18,11 @@ #include #include #include -#if __cpp_lib_parallel_algorithm -#include -#endif #include #include #include -#include "common/alignment.h" #include "common/common_types.h" #include "video_core/textures/astc.h" @@ -1554,87 +1550,30 @@ static void DecompressBlock(std::span inBuf, const u32 blockWidth, void Decompress(std::span data, uint32_t width, uint32_t height, uint32_t depth, uint32_t block_width, uint32_t block_height, std::span output) { - struct ASTCStrideInfo { - u32 z{}; - u32 index{}; - }; + u32 block_index = 0; + std::size_t depth_offset = 0; + for (u32 z = 0; z < depth; z++) { + for (u32 y = 0; y < height; y += block_height) { + for (u32 x = 0; x < width; x += block_width) { + const std::span blockPtr{data.subspan(block_index * 16, 16)}; - const u32 rows = Common::DivideUp(height, block_height); - const u32 cols = Common::DivideUp(width, block_width); + // Blocks can be at most 12x12 + std::array uncompData; + DecompressBlock(blockPtr, block_width, block_height, uncompData); - const u32 num_strides = depth * rows; - std::vector astc_strides(num_strides); + u32 decompWidth = std::min(block_width, width - x); + u32 decompHeight = std::min(block_height, height - y); - for (u32 z = 0; z < depth; ++z) { - for (u32 index = 0; index < rows; ++index) { - astc_strides.emplace_back(ASTCStrideInfo{ - .z{z}, - .index{index}, - }); - } - } - - auto decompress_stride = [&](const ASTCStrideInfo& stride) { - const u32 y = stride.index * block_height; - const u32 depth_offset = stride.z * height * width * 4; - for (u32 x_index = 0; x_index < cols; ++x_index) { - const u32 block_index = (stride.z * rows * cols) + (stride.index * cols) + x_index; - const u32 x = x_index * block_width; - - const std::span blockPtr{data.subspan(block_index * 16, 16)}; - - // Blocks can be at most 12x12 - std::array uncompData; - DecompressBlock(blockPtr, block_width, block_height, uncompData); - - const u32 decompWidth = std::min(block_width, width - x); - const u32 decompHeight = std::min(block_height, height - y); - - const std::span outRow = output.subspan(depth_offset + (y * width + x) * 4); - - for (u32 h = 0; h < decompHeight; ++h) { - std::memcpy(outRow.data() + h * width * 4, uncompData.data() + h * block_width, - decompWidth * 4); + const std::span outRow = output.subspan(depth_offset + (y * width + x) * 4); + for (u32 jj = 0; jj < decompHeight; jj++) { + std::memcpy(outRow.data() + jj * width * 4, + uncompData.data() + jj * block_width, decompWidth * 4); + } + ++block_index; } } - }; - -#if __cpp_lib_parallel_algorithm - std::for_each(std::execution::par, astc_strides.cbegin(), astc_strides.cend(), - decompress_stride); -#else - std::for_each(astc_strides.cbegin(), astc_strides.cend(), decompress_stride); -#endif - - // const u32 rows = Common::DivideUp(height, block_height); - // const u32 cols = Common::DivideUp(width, block_width); - - // for (u32 z = 0; z < depth; ++z) { - // const u32 depth_offset = z * height * width * 4; - // for (u32 y_index = 0; y_index < rows; ++y_index) { - // const u32 y = y_index * block_height; - // for (u32 x_index = 0; x_index < cols; ++x_index) { - // const u32 block_index = (z * rows * cols) + (y_index * cols) + x_index; - // const u32 x = x_index * block_width; - - // const std::span blockPtr{data.subspan(block_index * 16, 16)}; - - // // Blocks can be at most 12x12 - // std::array uncompData; - // DecompressBlock(blockPtr, block_width, block_height, uncompData); - - // u32 decompWidth = std::min(block_width, width - x); - // u32 decompHeight = std::min(block_height, height - y); - - // const std::span outRow = output.subspan(depth_offset + (y * width + x) * 4); - // for (u32 h = 0; h < decompHeight; ++h) { - // std::memcpy(outRow.data() + h * width * 4, uncompData.data() + h * - // block_width, - // decompWidth * 4); - // } - // } - // } - // } + depth_offset += height * width * 4; + } } } // namespace Tegra::Texture::ASTC