early-access version 3378

This commit is contained in:
pineappleEA
2023-02-09 23:19:39 +01:00
parent 9533c29326
commit 2e03ea5d50
73 changed files with 10408 additions and 3120 deletions

View File

@@ -430,7 +430,7 @@ private:
if (query_begin >= SizeBytes() || size < 0) {
return;
}
[[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>();
u64* const untracked_words = Array<Type::Untracked>();
u64* const state_words = Array<type>();
const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
@@ -483,7 +483,7 @@ private:
NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
}
// Exclude CPU modified pages when visiting GPU pages
const u64 word = current_word;
const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
u64 page = page_begin;
page_begin = 0;
@@ -531,7 +531,7 @@ private:
[[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
static_assert(type != Type::Untracked);
[[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
const u64* const untracked_words = Array<Type::Untracked>();
const u64* const state_words = Array<type>();
const u64 num_query_words = size / BYTES_PER_WORD + 1;
const u64 word_begin = offset / BYTES_PER_WORD;
@@ -539,7 +539,8 @@ private:
const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
const u64 word = state_words[word_index];
const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
const u64 word = state_words[word_index] & ~off_word;
if (word == 0) {
continue;
}
@@ -563,7 +564,7 @@ private:
[[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
static_assert(type != Type::Untracked);
[[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
const u64* const untracked_words = Array<Type::Untracked>();
const u64* const state_words = Array<type>();
const u64 num_query_words = size / BYTES_PER_WORD + 1;
const u64 word_begin = offset / BYTES_PER_WORD;
@@ -573,7 +574,8 @@ private:
u64 begin = std::numeric_limits<u64>::max();
u64 end = 0;
for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
const u64 word = state_words[word_index];
const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
const u64 word = state_words[word_index] & ~off_word;
if (word == 0) {
continue;
}

View File

@@ -99,7 +99,7 @@ struct GPU::Impl {
/// Signal the ending of command list.
void OnCommandListEnd() {
gpu_thread.OnCommandListEnd();
rasterizer->ReleaseFences();
}
/// Request a host GPU memory flush from the CPU.

View File

@@ -40,8 +40,6 @@ static void RunThread(std::stop_token stop_token, Core::System& system,
scheduler.Push(submit_list->channel, std::move(submit_list->entries));
} else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) {
renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
} else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {
rasterizer->ReleaseFences();
} else if (std::holds_alternative<GPUTickCommand>(next.data)) {
system.GPU().TickWork();
} else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) {
@@ -110,10 +108,6 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
rasterizer->OnCPUWrite(addr, size);
}
void ThreadManager::OnCommandListEnd() {
PushCommand(OnCommandListEndCommand());
}
u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) {
if (!is_async) {
// In synchronous GPU mode, block the caller until the command has executed

View File

@@ -77,16 +77,12 @@ struct FlushAndInvalidateRegionCommand final {
u64 size;
};
/// Command called within the gpu, to schedule actions after a command list end
struct OnCommandListEndCommand final {};
/// Command to make the gpu look into pending requests
struct GPUTickCommand final {};
using CommandData =
std::variant<std::monostate, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand,
GPUTickCommand>;
InvalidateRegionCommand, FlushAndInvalidateRegionCommand, GPUTickCommand>;
struct CommandDataContainer {
CommandDataContainer() = default;
@@ -134,8 +130,6 @@ public:
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
void FlushAndInvalidateRegion(VAddr addr, u64 size);
void OnCommandListEnd();
void TickGPU();
private:

View File

@@ -22,6 +22,8 @@ set(SHADER_FILES
convert_d24s8_to_abgr8.frag
convert_depth_to_float.frag
convert_float_to_depth.frag
convert_msaa_to_non_msaa.comp
convert_non_msaa_to_msaa.comp
convert_s8d24_to_abgr8.frag
full_screen_triangle.vert
fxaa.frag

View File

@@ -0,0 +1,30 @@
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#version 450 core
layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
layout (binding = 0, rgba8) uniform readonly restrict image2DMSArray msaa_in;
layout (binding = 1, rgba8) uniform writeonly restrict image2DArray output_img;
void main() {
const ivec3 coords = ivec3(gl_GlobalInvocationID);
if (any(greaterThanEqual(coords, imageSize(msaa_in)))) {
return;
}
// TODO: Specialization constants for num_samples?
const int num_samples = imageSamples(msaa_in);
for (int curr_sample = 0; curr_sample < num_samples; ++curr_sample) {
const vec4 pixel = imageLoad(msaa_in, coords, curr_sample);
const int single_sample_x = 2 * coords.x + (curr_sample & 1);
const int single_sample_y = 2 * coords.y + ((curr_sample / 2) & 1);
const ivec3 dest_coords = ivec3(single_sample_x, single_sample_y, coords.z);
if (any(greaterThanEqual(dest_coords, imageSize(output_img)))) {
continue;
}
imageStore(output_img, dest_coords, pixel);
}
}

View File

@@ -0,0 +1,29 @@
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#version 450 core
layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
layout (binding = 0, rgba8) uniform readonly restrict image2DArray img_in;
layout (binding = 1, rgba8) uniform writeonly restrict image2DMSArray output_msaa;
void main() {
const ivec3 coords = ivec3(gl_GlobalInvocationID);
if (any(greaterThanEqual(coords, imageSize(output_msaa)))) {
return;
}
// TODO: Specialization constants for num_samples?
const int num_samples = imageSamples(output_msaa);
for (int curr_sample = 0; curr_sample < num_samples; ++curr_sample) {
const int single_sample_x = 2 * coords.x + (curr_sample & 1);
const int single_sample_y = 2 * coords.y + ((curr_sample / 2) & 1);
const ivec3 single_coords = ivec3(single_sample_x, single_sample_y, coords.z);
if (any(greaterThanEqual(single_coords, imageSize(img_in)))) {
continue;
}
const vec4 pixel = imageLoad(img_in, single_coords);
imageStore(output_msaa, coords, curr_sample, pixel);
}
}

View File

@@ -557,6 +557,14 @@ void TextureCacheRuntime::CopyImage(Image& dst_image, Image& src_image,
}
}
void TextureCacheRuntime::CopyImageMSAA(Image& dst_image, Image& src_image,
std::span<const VideoCommon::ImageCopy> copies) {
LOG_DEBUG(Render_OpenGL, "Copying from {} samples to {} samples", src_image.info.num_samples,
dst_image.info.num_samples);
// TODO: Leverage the format conversion pass if possible/accurate.
util_shaders.CopyMSAA(dst_image, src_image, copies);
}
void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src,
std::span<const VideoCommon::ImageCopy> copies) {
LOG_DEBUG(Render_OpenGL, "Converting {} to {}", src.info.format, dst.info.format);

View File

@@ -93,12 +93,19 @@ public:
return device.CanReportMemoryUsage();
}
bool ShouldReinterpret([[maybe_unused]] Image& dst, [[maybe_unused]] Image& src) {
bool ShouldReinterpret([[maybe_unused]] Image& dst,
[[maybe_unused]] Image& src) const noexcept {
return true;
}
bool CanUploadMSAA() const noexcept {
return true;
}
void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
void CopyImageMSAA(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
void ReinterpretImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) {

View File

@@ -12,6 +12,8 @@
#include "video_core/host_shaders/astc_decoder_comp.h"
#include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h"
#include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h"
#include "video_core/host_shaders/convert_msaa_to_non_msaa_comp.h"
#include "video_core/host_shaders/convert_non_msaa_to_msaa_comp.h"
#include "video_core/host_shaders/opengl_convert_s8d24_comp.h"
#include "video_core/host_shaders/opengl_copy_bc4_comp.h"
#include "video_core/host_shaders/pitch_unswizzle_comp.h"
@@ -51,7 +53,9 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)),
pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)),
convert_s8d24_program(MakeProgram(OPENGL_CONVERT_S8D24_COMP)) {
convert_s8d24_program(MakeProgram(OPENGL_CONVERT_S8D24_COMP)),
convert_ms_to_nonms_program(MakeProgram(CONVERT_MSAA_TO_NON_MSAA_COMP)),
convert_nonms_to_ms_program(MakeProgram(CONVERT_NON_MSAA_TO_MSAA_COMP)) {
const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
swizzle_table_buffer.Create();
glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
@@ -269,6 +273,33 @@ void UtilShaders::ConvertS8D24(Image& dst_image, std::span<const ImageCopy> copi
program_manager.RestoreGuestCompute();
}
void UtilShaders::CopyMSAA(Image& dst_image, Image& src_image,
std::span<const VideoCommon::ImageCopy> copies) {
const bool is_ms_to_non_ms = src_image.info.num_samples > 1 && dst_image.info.num_samples == 1;
const auto program_handle =
is_ms_to_non_ms ? convert_ms_to_nonms_program.handle : convert_nonms_to_ms_program.handle;
program_manager.BindComputeProgram(program_handle);
for (const ImageCopy& copy : copies) {
ASSERT(copy.src_subresource.base_layer == 0);
ASSERT(copy.src_subresource.num_layers == 1);
ASSERT(copy.dst_subresource.base_layer == 0);
ASSERT(copy.dst_subresource.num_layers == 1);
glBindImageTexture(0, src_image.StorageHandle(), copy.src_subresource.base_level, GL_TRUE,
0, GL_READ_ONLY, GL_RGBA8);
glBindImageTexture(1, dst_image.StorageHandle(), copy.dst_subresource.base_level, GL_TRUE,
0, GL_WRITE_ONLY, GL_RGBA8);
const u32 num_dispatches_x = Common::DivCeil(copy.extent.width, 8U);
const u32 num_dispatches_y = Common::DivCeil(copy.extent.height, 8U);
const u32 num_dispatches_z = copy.extent.depth;
glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z);
}
program_manager.RestoreGuestCompute();
}
GLenum StoreFormat(u32 bytes_per_block) {
switch (bytes_per_block) {
case 1:

View File

@@ -40,6 +40,9 @@ public:
void ConvertS8D24(Image& dst_image, std::span<const VideoCommon::ImageCopy> copies);
void CopyMSAA(Image& dst_image, Image& src_image,
std::span<const VideoCommon::ImageCopy> copies);
private:
ProgramManager& program_manager;
@@ -51,6 +54,8 @@ private:
OGLProgram pitch_unswizzle_program;
OGLProgram copy_bc4_program;
OGLProgram convert_s8d24_program;
OGLProgram convert_ms_to_nonms_program;
OGLProgram convert_nonms_to_ms_program;
};
GLenum StoreFormat(u32 bytes_per_block);

View File

@@ -548,31 +548,7 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
static_vector<VkVertexInputBindingDescription, 32> vertex_bindings;
static_vector<VkVertexInputBindingDivisorDescriptionEXT, 32> vertex_binding_divisors;
static_vector<VkVertexInputAttributeDescription, 32> vertex_attributes;
if (key.state.dynamic_vertex_input) {
const size_t num_vertex_arrays = std::min(
key.state.attributes.size(), static_cast<size_t>(device.GetMaxVertexInputBindings()));
for (size_t index = 0; index < num_vertex_arrays; ++index) {
const u32 type = key.state.DynamicAttributeType(index);
if (!stage_infos[0].loads.Generic(index) || type == 0) {
continue;
}
vertex_attributes.push_back({
.location = static_cast<u32>(index),
.binding = 0,
.format = type == 1 ? VK_FORMAT_R32_SFLOAT
: type == 2 ? VK_FORMAT_R32_SINT
: VK_FORMAT_R32_UINT,
.offset = 0,
});
}
if (!vertex_attributes.empty()) {
vertex_bindings.push_back({
.binding = 0,
.stride = 4,
.inputRate = VK_VERTEX_INPUT_RATE_VERTEX,
});
}
} else {
if (!key.state.dynamic_vertex_input) {
const size_t num_vertex_arrays = std::min(
Maxwell::NumVertexArrays, static_cast<size_t>(device.GetMaxVertexInputBindings()));
for (size_t index = 0; index < num_vertex_arrays; ++index) {

View File

@@ -1230,6 +1230,11 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
});
}
void TextureCacheRuntime::CopyImageMSAA(Image& dst, Image& src,
std::span<const VideoCommon::ImageCopy> copies) {
UNIMPLEMENTED_MSG("Copying images with different samples is not implemented in Vulkan.");
}
u64 TextureCacheRuntime::GetDeviceLocalMemory() const {
return device.GetDeviceLocalMemory();
}

View File

@@ -70,6 +70,8 @@ public:
void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
void CopyImageMSAA(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
bool ShouldReinterpret(Image& dst, Image& src);
void ReinterpretImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
@@ -80,6 +82,11 @@ public:
return false;
}
bool CanUploadMSAA() const noexcept {
// TODO: Implement buffer to MSAA uploads
return false;
}
void AccelerateImageUpload(Image&, const StagingBufferRef&,
std::span<const VideoCommon::SwizzleParameters>);

View File

@@ -22,6 +22,9 @@ std::string Name(const ImageBase& image) {
const u32 num_layers = image.info.resources.layers;
const u32 num_levels = image.info.resources.levels;
std::string resource;
if (image.info.num_samples > 1) {
resource += fmt::format(":{}xMSAA", image.info.num_samples);
}
if (num_layers > 1) {
resource += fmt::format(":L{}", num_layers);
}

View File

@@ -773,7 +773,7 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
image.flags &= ~ImageFlagBits::CpuModified;
TrackImage(image, image_id);
if (image.info.num_samples > 1) {
if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
return;
}
@@ -1167,14 +1167,14 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
if (True(overlap.flags & ImageFlagBits::GpuModified)) {
new_image.flags |= ImageFlagBits::GpuModified;
}
const auto& resolution = Settings::values.resolution_info;
const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value();
const u32 up_scale = can_rescale ? resolution.up_scale : 1;
const u32 down_shift = can_rescale ? resolution.down_shift : 0;
auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift);
if (overlap.info.num_samples != new_image.info.num_samples) {
LOG_WARNING(HW_GPU, "Copying between images with different samples is not implemented");
runtime.CopyImageMSAA(new_image, overlap, std::move(copies));
} else {
const auto& resolution = Settings::values.resolution_info;
const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value();
const u32 up_scale = can_rescale ? resolution.up_scale : 1;
const u32 down_shift = can_rescale ? resolution.down_shift : 0;
auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift);
runtime.CopyImage(new_image, overlap, std::move(copies));
}
if (True(overlap.flags & ImageFlagBits::Tracked)) {

View File

@@ -573,10 +573,6 @@ u32 CalculateUnswizzledSizeBytes(const ImageInfo& info) noexcept {
if (info.type == ImageType::Buffer) {
return info.size.width * BytesPerBlock(info.format);
}
if (info.num_samples > 1) {
// Multisample images can't be uploaded or downloaded to the host
return 0;
}
if (info.type == ImageType::Linear) {
return info.pitch * Common::DivCeil(info.size.height, DefaultBlockHeight(info.format));
}
@@ -703,7 +699,7 @@ ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept {
std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src,
SubresourceBase base, u32 up_scale, u32 down_shift) {
ASSERT(dst.resources.levels >= src.resources.levels);
ASSERT(dst.num_samples == src.num_samples);
// ASSERT(dst.num_samples == src.num_samples);
const bool is_dst_3d = dst.type == ImageType::e3D;
if (is_dst_3d) {