early-access version 1335
@@ -239,8 +239,7 @@ private:
     void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
                                std::span<const BufferCopy> copies);
 
-    void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
-                            std::span<const BufferCopy> copies);
+    void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies);
 
     void DeleteBuffer(BufferId buffer_id);
 
@@ -362,11 +361,17 @@ void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
         auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
         const u8* const mapped_memory = download_staging.mapped_span.data();
         const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
+        for (BufferCopy& copy : copies) {
+            // Modify copies to have the staging offset in mind
+            copy.dst_offset += download_staging.offset;
+        }
         runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
         runtime.Finish();
         for (const BufferCopy& copy : copies) {
             const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
-            const u8* copy_mapped_memory = mapped_memory + copy.dst_offset;
+            // Undo the modified offset
+            const u64 dst_offset = copy.dst_offset - download_staging.offset;
+            const u8* copy_mapped_memory = mapped_memory + dst_offset;
             cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
         }
     } else {
@@ -554,7 +559,9 @@ void BufferCache<P>::PopAsyncFlushes() {
     }
     if constexpr (USE_MEMORY_MAPS) {
         auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
-        for (const auto [copy, buffer_id] : downloads) {
+        for (auto& [copy, buffer_id] : downloads) {
+            // Have in mind the staging buffer offset for the copy
+            copy.dst_offset += download_staging.offset;
             const std::array copies{copy};
             runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies);
         }
@@ -562,7 +569,9 @@ void BufferCache<P>::PopAsyncFlushes() {
         for (const auto [copy, buffer_id] : downloads) {
             const Buffer& buffer = slot_buffers[buffer_id];
             const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
-            const u8* read_mapped_memory = download_staging.mapped_span.data() + copy.dst_offset;
+            // Undo the modified offset
+            const u64 dst_offset = copy.dst_offset - download_staging.offset;
+            const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset;
             cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
         }
     } else {
@@ -651,24 +660,25 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
     const VAddr cpu_addr = binding.cpu_addr;
     const u32 size = binding.size;
     Buffer& buffer = slot_buffers[binding.buffer_id];
-    if constexpr (IS_OPENGL) {
-        if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) {
+    if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) {
+        if constexpr (IS_OPENGL) {
             if (runtime.HasFastBufferSubData()) {
                 // Fast path for Nvidia
                 if (!HasFastUniformBufferBound(stage, binding_index)) {
                     // We only have to bind when the currently bound buffer is not the fast version
                     fast_bound_uniform_buffers[stage] |= 1U << binding_index;
                     runtime.BindFastUniformBuffer(stage, binding_index, size);
                 }
                 const auto span = ImmediateBufferWithData(cpu_addr, size);
                 runtime.PushFastUniformBuffer(stage, binding_index, span);
-            } else {
-                // Stream buffer path to avoid stalling on non-Nvidia drivers
-                const auto span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
-                cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
+                return;
             }
-            return;
         }
+        fast_bound_uniform_buffers[stage] |= 1U << binding_index;
+
+        // Stream buffer path to avoid stalling on non-Nvidia drivers or Vulkan
+        const std::span<u8> span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
+        cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
+        return;
     }
     // Classic cached path
     SynchronizeBuffer(buffer, cpu_addr, size);
@@ -1117,13 +1127,16 @@ void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
 
 template <class P>
 void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
-                                        std::span<const BufferCopy> copies) {
+                                        std::span<BufferCopy> copies) {
     auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes);
     const std::span<u8> staging_pointer = upload_staging.mapped_span;
-    for (const BufferCopy& copy : copies) {
+    for (BufferCopy& copy : copies) {
         u8* const src_pointer = staging_pointer.data() + copy.src_offset;
         const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
         cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size);
 
+        // Apply the staging offset
+        copy.src_offset += upload_staging.offset;
     }
     runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
 }

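Note on the pattern in the buffer cache hunks above: staging allocations now come from a shared pool and carry an `offset` into it, so each `BufferCopy::dst_offset` is biased by that offset before the GPU copy is recorded and un-biased again when the CPU reads the results back. Mutating the copies in place is also why `MappedUploadMemory` loses the `const` in its `std::span<BufferCopy>` parameter. A minimal standalone sketch of the round trip, using plain types and hypothetical `Copy`/`Staging` structs rather than the emulator's real interfaces:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

struct Copy { uint64_t src_offset = 0, dst_offset = 0, size = 0; };
struct Staging { uint64_t offset = 0; std::vector<uint8_t> mapped; };

// Bias destinations into the staging suballocation for the GPU copy,
// then undo the bias when copying each region back to guest memory.
void DownloadRoundTrip(Staging& staging, std::vector<Copy>& copies, uint8_t* guest_base) {
    for (Copy& copy : copies) {
        copy.dst_offset += staging.offset; // applied before the GPU copy
    }
    // ... submit the GPU copies against the pooled buffer and wait here ...
    for (const Copy& copy : copies) {
        const uint64_t dst_offset = copy.dst_offset - staging.offset; // undo the bias
        std::memcpy(guest_base + copy.src_offset, staging.mapped.data() + dst_offset,
                    copy.size);
    }
}
```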
@@ -550,15 +550,14 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
 }
 
 void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map,
-                                                size_t buffer_offset,
                                                 std::span<const SwizzleParameters> swizzles) {
     switch (image.info.type) {
     case ImageType::e2D:
-        return util_shaders.BlockLinearUpload2D(image, map, buffer_offset, swizzles);
+        return util_shaders.BlockLinearUpload2D(image, map, swizzles);
     case ImageType::e3D:
-        return util_shaders.BlockLinearUpload3D(image, map, buffer_offset, swizzles);
+        return util_shaders.BlockLinearUpload3D(image, map, swizzles);
     case ImageType::Linear:
-        return util_shaders.PitchUpload(image, map, buffer_offset, swizzles);
+        return util_shaders.PitchUpload(image, map, swizzles);
     default:
         UNREACHABLE();
         break;
@@ -710,10 +709,10 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_,
     }
 }
 
-void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+void Image::UploadMemory(const ImageBufferMap& map,
                          std::span<const VideoCommon::BufferImageCopy> copies) {
     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer);
-    glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes);
+    glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes);
 
     glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
 
@@ -729,19 +728,19 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
             current_image_height = copy.buffer_image_height;
             glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height);
         }
-        CopyBufferToImage(copy, buffer_offset);
+        CopyBufferToImage(copy, map.offset);
     }
 }
 
-void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+void Image::UploadMemory(const ImageBufferMap& map,
                          std::span<const VideoCommon::BufferCopy> copies) {
     for (const VideoCommon::BufferCopy& copy : copies) {
-        glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + buffer_offset,
+        glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + map.offset,
                                  copy.dst_offset, copy.size);
     }
 }
 
-void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset,
+void Image::DownloadMemory(ImageBufferMap& map,
                            std::span<const VideoCommon::BufferImageCopy> copies) {
     glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API
 
@@ -760,7 +759,7 @@ void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset,
             current_image_height = copy.buffer_image_height;
             glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height);
         }
-        CopyImageToBuffer(copy, buffer_offset);
+        CopyImageToBuffer(copy, map.offset);
     }
 }

@@ -35,6 +35,7 @@ struct ImageBufferMap {
     ~ImageBufferMap();
 
     std::span<u8> mapped_span;
+    size_t offset = 0;
     OGLSync* sync;
     GLuint buffer;
 };
@@ -78,7 +79,7 @@ public:
                          Tegra::Engines::Fermi2D::Filter filter,
                          Tegra::Engines::Fermi2D::Operation operation);
 
-    void AccelerateImageUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+    void AccelerateImageUpload(Image& image, const ImageBufferMap& map,
                                std::span<const VideoCommon::SwizzleParameters> swizzles);
 
     void InsertUploadMemoryBarrier();
@@ -137,14 +138,12 @@ public:
     explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
                    VAddr cpu_addr);
 
-    void UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+    void UploadMemory(const ImageBufferMap& map,
                       std::span<const VideoCommon::BufferImageCopy> copies);
 
-    void UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
-                      std::span<const VideoCommon::BufferCopy> copies);
+    void UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferCopy> copies);
 
-    void DownloadMemory(ImageBufferMap& map, size_t buffer_offset,
-                        std::span<const VideoCommon::BufferImageCopy> copies);
+    void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies);
 
     GLuint Handle() const noexcept {
         return texture.handle;

@@ -63,7 +63,7 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
 
 UtilShaders::~UtilShaders() = default;
 
-void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
                                       std::span<const SwizzleParameters> swizzles) {
     static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1};
     static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0;
@@ -71,13 +71,13 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
     static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
 
     program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle);
-    glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
+    glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
     glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
 
     const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format));
     for (const SwizzleParameters& swizzle : swizzles) {
         const Extent3D num_tiles = swizzle.num_tiles;
-        const size_t input_offset = swizzle.buffer_offset + buffer_offset;
+        const size_t input_offset = swizzle.buffer_offset + map.offset;
 
         const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
         const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
@@ -100,7 +100,7 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
     program_manager.RestoreGuestCompute();
 }
 
-void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map,
                                       std::span<const SwizzleParameters> swizzles) {
     static constexpr Extent3D WORKGROUP_SIZE{16, 8, 8};
 
@@ -108,14 +108,14 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
     static constexpr GLuint BINDING_INPUT_BUFFER = 1;
     static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
 
-    glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
+    glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
     program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle);
     glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
 
     const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format));
     for (const SwizzleParameters& swizzle : swizzles) {
         const Extent3D num_tiles = swizzle.num_tiles;
-        const size_t input_offset = swizzle.buffer_offset + buffer_offset;
+        const size_t input_offset = swizzle.buffer_offset + map.offset;
 
         const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
         const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
@@ -141,7 +141,7 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
     program_manager.RestoreGuestCompute();
 }
 
-void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map,
                               std::span<const SwizzleParameters> swizzles) {
     static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1};
     static constexpr GLuint BINDING_INPUT_BUFFER = 0;
@@ -159,7 +159,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
                       "Non-power of two images are not implemented");
 
     program_manager.BindHostCompute(pitch_unswizzle_program.handle);
-    glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
+    glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
     glUniform2ui(LOC_ORIGIN, 0, 0);
     glUniform2i(LOC_DESTINATION, 0, 0);
     glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block);
@@ -167,7 +167,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
     glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), 0, GL_FALSE, 0, GL_WRITE_ONLY, format);
     for (const SwizzleParameters& swizzle : swizzles) {
         const Extent3D num_tiles = swizzle.num_tiles;
-        const size_t input_offset = swizzle.buffer_offset + buffer_offset;
+        const size_t input_offset = swizzle.buffer_offset + map.offset;
 
         const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
         const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);

@@ -24,13 +24,13 @@ public:
     explicit UtilShaders(ProgramManager& program_manager);
     ~UtilShaders();
 
-    void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+    void BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
                              std::span<const VideoCommon::SwizzleParameters> swizzles);
 
-    void BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+    void BlockLinearUpload3D(Image& image, const ImageBufferMap& map,
                              std::span<const VideoCommon::SwizzleParameters> swizzles);
 
-    void PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+    void PitchUpload(Image& image, const ImageBufferMap& map,
                      std::span<const VideoCommon::SwizzleParameters> swizzles);
 
     void CopyBC4(Image& dst_image, Image& src_image,

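The OpenGL hunks above are the same refactor applied to images: the loose `size_t buffer_offset` parameter disappears because `ImageBufferMap` now carries its suballocation start as `map.offset`, and every consumer derives its input offset from it. A tiny sketch of the resulting shape (a hypothetical mirror of the struct, not the real header):

```cpp
#include <cstddef>

// Hypothetical mirror of the new ImageBufferMap layout from the header hunk above.
struct ImageBufferMapSketch {
    unsigned char* mapped = nullptr; // host-visible mapping
    std::size_t offset = 0;          // where this request starts inside the pooled buffer
};

// Matches the diff's `swizzle.buffer_offset + map.offset` computations.
std::size_t InputOffset(const ImageBufferMapSketch& map, std::size_t swizzle_buffer_offset) {
    return swizzle_buffer_offset + map.offset;
}
```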
@@ -138,17 +138,18 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
 void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format,
                                          u32 base_vertex, u32 num_indices, VkBuffer buffer,
                                          u32 offset, [[maybe_unused]] u32 size) {
-    VkIndexType index_type = MaxwellToVK::IndexFormat(index_format);
+    VkIndexType vk_index_type = MaxwellToVK::IndexFormat(index_format);
+    VkDeviceSize vk_offset = offset;
     if (topology == PrimitiveTopology::Quads) {
-        index_type = VK_INDEX_TYPE_UINT32;
-        std::tie(buffer, offset) =
+        vk_index_type = VK_INDEX_TYPE_UINT32;
+        std::tie(buffer, vk_offset) =
             quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset);
-    } else if (index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) {
-        index_type = VK_INDEX_TYPE_UINT16;
-        std::tie(buffer, offset) = uint8_pass.Assemble(num_indices, buffer, offset);
+    } else if (vk_index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) {
+        vk_index_type = VK_INDEX_TYPE_UINT16;
+        std::tie(buffer, vk_offset) = uint8_pass.Assemble(num_indices, buffer, offset);
     }
-    scheduler.Record([buffer, offset, index_type](vk::CommandBuffer cmdbuf) {
-        cmdbuf.BindIndexBuffer(buffer, offset, index_type);
+    scheduler.Record([buffer, vk_offset, vk_index_type](vk::CommandBuffer cmdbuf) {
+        cmdbuf.BindIndexBuffer(buffer, vk_offset, vk_index_type);
     });
 }

@@ -69,6 +69,13 @@ public:
 
     void BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size);
 
+    std::span<u8> BindMappedUniformBuffer([[maybe_unused]] size_t stage,
+                                          [[maybe_unused]] u32 binding_index, u32 size) {
+        const StagingBufferRef ref = staging_pool.Request(size, MemoryUsage::Upload);
+        BindBuffer(ref.buffer, static_cast<u32>(ref.offset), size);
+        return ref.mapped_span;
+    }
+
     void BindUniformBuffer(VkBuffer buffer, u32 offset, u32 size) {
         BindBuffer(buffer, offset, size);
     }

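`BindMappedUniformBuffer` above is the Vulkan side of the new uniform-buffer stream path: request an upload slice from the staging pool, bind it at the staging offset, and hand the mapped span back so the caller can fill it with a single guest-memory read (the buffer-cache hunk does exactly `cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size)`). A self-contained toy model of that flow, with assumed names and no real Vulkan calls:

```cpp
#include <cstddef>
#include <cstdint>
#include <span>
#include <vector>

// Toy stand-in for the staging pool + BindBuffer pair used by the real code.
class ToyUniformStream {
public:
    explicit ToyUniformStream(std::size_t capacity) : memory_(capacity) {}

    // Analogue of BindMappedUniformBuffer: reserve a slice, "bind" it, return the span.
    std::span<std::uint8_t> BindMapped(std::size_t size) {
        const std::size_t offset = cursor_; // the real pool also aligns and recycles
        cursor_ += size;
        bound_offset_ = offset; // stands in for BindBuffer(buffer, offset, size)
        return std::span<std::uint8_t>(memory_.data() + offset, size);
    }

private:
    std::vector<std::uint8_t> memory_;
    std::size_t cursor_ = 0;
    std::size_t bound_offset_ = 0;
};

// Caller side then reduces to one copy into the returned span, as in
// BindHostGraphicsUniformBuffer earlier in this commit.
```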
@@ -10,6 +10,7 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/common_types.h"
+#include "common/div_ceil.h"
 #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
 #include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
 #include "video_core/renderer_vulkan/vk_compute_pass.h"
@@ -148,38 +149,33 @@ Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_,
 
 Uint8Pass::~Uint8Pass() = default;
 
-std::pair<VkBuffer, u32> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer,
-                                             u32 src_offset) {
+std::pair<VkBuffer, VkDeviceSize> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer,
+                                                      u32 src_offset) {
     const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16));
     const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
 
     update_descriptor_queue.Acquire();
     update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices);
-    update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size);
+    update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size);
     const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
 
     scheduler.RequestOutsideRenderPassOperationContext();
     scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
                       num_vertices](vk::CommandBuffer cmdbuf) {
-        constexpr u32 dispatch_size = 1024;
+        static constexpr u32 DISPATCH_SIZE = 1024;
+        static constexpr VkMemoryBarrier WRITE_BARRIER{
+            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT,
+        };
         cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
         cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
-        cmdbuf.Dispatch(Common::AlignUp(num_vertices, dispatch_size) / dispatch_size, 1, 1);
-
-        VkBufferMemoryBarrier barrier;
-        barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
-        barrier.pNext = nullptr;
-        barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
-        barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
-        barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
-        barrier.offset = 0;
-        barrier.size = static_cast<VkDeviceSize>(num_vertices * sizeof(u16));
+        cmdbuf.Dispatch(Common::DivCeil(num_vertices, DISPATCH_SIZE), 1, 1);
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                               VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {});
+                               VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER);
     });
-    return {staging.buffer, 0};
+    return {staging.buffer, staging.offset};
 }
 
 QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
@@ -194,7 +190,7 @@ QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
 
 QuadIndexedPass::~QuadIndexedPass() = default;
 
-std::pair<VkBuffer, u32> QuadIndexedPass::Assemble(
+std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
     Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex,
     VkBuffer src_buffer, u32 src_offset) {
     const u32 index_shift = [index_format] {
@@ -217,34 +213,29 @@ std::pair<VkBuffer, u32> QuadIndexedPass::Assemble(
 
     update_descriptor_queue.Acquire();
     update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size);
-    update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size);
+    update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size);
     const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
 
     scheduler.RequestOutsideRenderPassOperationContext();
     scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
                       num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) {
-        static constexpr u32 dispatch_size = 1024;
+        static constexpr u32 DISPATCH_SIZE = 1024;
+        static constexpr VkMemoryBarrier WRITE_BARRIER{
+            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT,
+        };
         const std::array push_constants = {base_vertex, index_shift};
         cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
         cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
         cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
                              &push_constants);
-        cmdbuf.Dispatch(Common::AlignUp(num_tri_vertices, dispatch_size) / dispatch_size, 1, 1);
-
-        VkBufferMemoryBarrier barrier;
-        barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
-        barrier.pNext = nullptr;
-        barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
-        barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
-        barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        barrier.buffer = buffer;
-        barrier.offset = 0;
-        barrier.size = static_cast<VkDeviceSize>(num_tri_vertices * sizeof(u32));
+        cmdbuf.Dispatch(Common::DivCeil(num_tri_vertices, DISPATCH_SIZE), 1, 1);
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                               VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {});
+                               VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER);
     });
-    return {staging.buffer, 0};
+    return {staging.buffer, staging.offset};
 }
 
 } // namespace Vulkan

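Two independent cleanups run through both compute passes above: the field-by-field `VkBufferMemoryBarrier` becomes a `static constexpr VkMemoryBarrier` with the same src/dst access masks, and the dispatch count `Common::AlignUp(n, size) / size` becomes `Common::DivCeil(n, size)`. The second is a pure rewrite; a quick standalone check with local reimplementations (assumed to match the `Common::` helpers for these inputs):

```cpp
#include <cassert>
#include <cstdint>

constexpr uint32_t AlignUp(uint32_t value, uint32_t align) {
    return ((value + align - 1) / align) * align;
}
constexpr uint32_t DivCeil(uint32_t num, uint32_t den) {
    return (num + den - 1) / den;
}

int main() {
    // Exhaustively confirm the two dispatch-count formulas agree.
    for (uint32_t n = 0; n < 65536u; ++n) {
        assert(AlignUp(n, 1024) / 1024 == DivCeil(n, 1024));
    }
}
```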
@@ -50,7 +50,8 @@ public:
 
     /// Assemble uint8 indices into an uint16 index buffer
     /// Returns a pair with the staging buffer, and the offset where the assembled data is
-    std::pair<VkBuffer, u32> Assemble(u32 num_vertices, VkBuffer src_buffer, u32 src_offset);
+    std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, VkBuffer src_buffer,
+                                               u32 src_offset);
 
 private:
     VKScheduler& scheduler;
@@ -66,9 +67,9 @@ public:
                              VKUpdateDescriptorQueue& update_descriptor_queue_);
     ~QuadIndexedPass();
 
-    std::pair<VkBuffer, u32> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format,
-                                      u32 num_vertices, u32 base_vertex, VkBuffer src_buffer,
-                                      u32 src_offset);
+    std::pair<VkBuffer, VkDeviceSize> Assemble(
+        Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices,
+        u32 base_vertex, VkBuffer src_buffer, u32 src_offset);
 
 private:
     VKScheduler& scheduler;

@@ -8,6 +8,7 @@
 
 #include <fmt/format.h>
 
+#include "common/alignment.h"
 #include "common/assert.h"
 #include "common/bit_util.h"
 #include "common/common_types.h"
@@ -17,14 +18,118 @@
 #include "video_core/vulkan_common/vulkan_wrapper.h"
 
 namespace Vulkan {
+namespace {
+// Maximum potential alignment of a Vulkan buffer
+constexpr VkDeviceSize MAX_ALIGNMENT = 256;
+// Maximum size to put elements in the stream buffer
+constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8 * 1024 * 1024;
+// Stream buffer size in bytes
+constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
+constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS;
+
+constexpr VkMemoryPropertyFlags HOST_FLAGS =
+    VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+constexpr VkMemoryPropertyFlags STREAM_FLAGS = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | HOST_FLAGS;
+
+bool IsStreamHeap(VkMemoryHeap heap) noexcept {
+    return STREAM_BUFFER_SIZE < (heap.size * 2) / 3;
+}
+
+std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask,
+                                       VkMemoryPropertyFlags flags) noexcept {
+    for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) {
+        if (((type_mask >> type_index) & 1) == 0) {
+            // Memory type is incompatible
+            continue;
+        }
+        const VkMemoryType& memory_type = props.memoryTypes[type_index];
+        if ((memory_type.propertyFlags & flags) != flags) {
+            // Memory type doesn't have the flags we want
+            continue;
+        }
+        if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex])) {
+            // Memory heap is not suitable for streaming
+            continue;
+        }
+        // Success!
+        return type_index;
+    }
+    return std::nullopt;
+}
+
+u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask) {
+    // Try to find a DEVICE_LOCAL_BIT type, Nvidia and AMD have a dedicated heap for this
+    std::optional<u32> type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS);
+    if (type) {
+        return *type;
+    }
+    // Otherwise try without the DEVICE_LOCAL_BIT
+    type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS);
+    if (type) {
+        return *type;
+    }
+    // This should never happen, and in case it does, signal it as an out of memory situation
+    throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+}
+
+size_t Region(size_t iterator) noexcept {
+    return iterator / REGION_SIZE;
+}
+} // Anonymous namespace
 
 StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_,
                                      VKScheduler& scheduler_)
-    : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} {}
+    : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} {
+    const vk::Device& dev = device.GetLogical();
+    stream_buffer = dev.CreateBuffer(VkBufferCreateInfo{
+        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .size = STREAM_BUFFER_SIZE,
+        .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
+                 VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
+        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 0,
+        .pQueueFamilyIndices = nullptr,
+    });
+    if (device.HasDebuggingToolAttached()) {
+        stream_buffer.SetObjectNameEXT("Stream Buffer");
+    }
+    VkMemoryDedicatedRequirements dedicated_reqs{
+        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS,
+        .pNext = nullptr,
+        .prefersDedicatedAllocation = VK_FALSE,
+        .requiresDedicatedAllocation = VK_FALSE,
+    };
+    const auto requirements = dev.GetBufferMemoryRequirements(*stream_buffer, &dedicated_reqs);
+    const bool make_dedicated = dedicated_reqs.prefersDedicatedAllocation == VK_TRUE ||
+                                dedicated_reqs.requiresDedicatedAllocation == VK_TRUE;
+    const VkMemoryDedicatedAllocateInfo dedicated_info{
+        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
+        .pNext = nullptr,
+        .image = nullptr,
+        .buffer = *stream_buffer,
+    };
+    const auto memory_properties = device.GetPhysical().GetMemoryProperties();
+    stream_memory = dev.AllocateMemory(VkMemoryAllocateInfo{
+        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .pNext = make_dedicated ? &dedicated_info : nullptr,
+        .allocationSize = requirements.size,
+        .memoryTypeIndex = FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits),
+    });
+    if (device.HasDebuggingToolAttached()) {
+        stream_memory.SetObjectNameEXT("Stream Buffer Memory");
+    }
+    stream_buffer.BindMemory(*stream_memory, 0);
+    stream_pointer = stream_memory.Map(0, STREAM_BUFFER_SIZE);
+}
 
 StagingBufferPool::~StagingBufferPool() = default;
 
 StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage) {
+    if (usage == MemoryUsage::Upload && size <= MAX_STREAM_BUFFER_REQUEST_SIZE) {
+        return GetStreamBuffer(size);
+    }
     if (const std::optional<StagingBufferRef> ref = TryGetReservedBuffer(size, usage)) {
         return *ref;
     }
@@ -39,6 +144,42 @@ void StagingBufferPool::TickFrame() {
     ReleaseCache(MemoryUsage::Download);
 }
 
+StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
+    for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end;
+         ++region) {
+        sync_ticks[region] = scheduler.CurrentTick();
+    }
+    used_iterator = iterator;
+
+    for (size_t region = Region(free_iterator) + 1,
+                region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS);
+         region < region_end; ++region) {
+        scheduler.Wait(sync_ticks[region]);
+    }
+    if (iterator + size > free_iterator) {
+        free_iterator = iterator + size;
+    }
+    if (iterator + size > STREAM_BUFFER_SIZE) {
+        for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) {
+            sync_ticks[region] = scheduler.CurrentTick();
+        }
+        used_iterator = 0;
+        iterator = 0;
+        free_iterator = size;
+
+        for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) {
+            scheduler.Wait(sync_ticks[region]);
+        }
+    }
+    const size_t offset = iterator;
+    iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT);
+    return StagingBufferRef{
+        .buffer = *stream_buffer,
+        .offset = static_cast<VkDeviceSize>(offset),
+        .mapped_span = std::span<u8>(stream_pointer + offset, size),
+    };
+}
+
 std::optional<StagingBufferRef> StagingBufferPool::TryGetReservedBuffer(size_t size,
                                                                         MemoryUsage usage) {
     StagingBuffers& cache_level = GetCache(usage)[Common::Log2Ceil64(size)];

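`GetStreamBuffer` above is the core of the commit: the 128 MiB stream buffer is treated as a ring split into `NUM_SYNCS` regions, each guarded by the fence tick of the last submission that touched it. Regions consumed since the previous request are stamped with the current tick; regions the new allocation is about to reuse are waited on first, and the whole thing restarts at offset zero when a request would run past the end. A standalone model of just that bookkeeping (toy `Wait`/tick plumbing; the sizes mirror the diff's constants):

```cpp
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>

constexpr std::size_t NUM_SYNCS = 16;
constexpr std::size_t BUFFER_SIZE = 128 << 20; // STREAM_BUFFER_SIZE in the diff
constexpr std::size_t REGION_SIZE = BUFFER_SIZE / NUM_SYNCS;

constexpr std::size_t Region(std::size_t offset) { return offset / REGION_SIZE; }

struct ToyStream {
    std::size_t iterator = 0, used_iterator = 0, free_iterator = 0;
    std::array<std::uint64_t, NUM_SYNCS> sync_ticks{};
    std::uint64_t current_tick = 1;

    void Wait(std::uint64_t /*tick*/) { /* block until the GPU passed this tick */ }

    std::size_t Allocate(std::size_t size) {
        // Stamp regions consumed since the last allocation with this tick.
        for (std::size_t r = Region(used_iterator); r < Region(iterator); ++r) {
            sync_ticks[r] = current_tick;
        }
        used_iterator = iterator;
        // Wait on regions the new allocation is about to reuse.
        for (std::size_t r = Region(free_iterator) + 1,
                         end = std::min(Region(iterator + size) + 1, NUM_SYNCS);
             r < end; ++r) {
            Wait(sync_ticks[r]);
        }
        free_iterator = std::max(free_iterator, iterator + size);
        if (iterator + size > BUFFER_SIZE) { // wrap around to the start
            for (std::size_t r = Region(used_iterator); r < NUM_SYNCS; ++r) {
                sync_ticks[r] = current_tick;
            }
            used_iterator = 0;
            iterator = 0;
            free_iterator = size;
            for (std::size_t r = 0; r <= Region(size); ++r) {
                Wait(sync_ticks[r]);
            }
        }
        const std::size_t offset = iterator;
        iterator = (iterator + size + 255) & ~std::size_t{255}; // MAX_ALIGNMENT = 256
        return offset;
    }
};
```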
@@ -19,11 +19,14 @@ class VKScheduler;
 
 struct StagingBufferRef {
     VkBuffer buffer;
+    VkDeviceSize offset;
     std::span<u8> mapped_span;
 };
 
 class StagingBufferPool {
 public:
+    static constexpr size_t NUM_SYNCS = 16;
+
     explicit StagingBufferPool(const Device& device, MemoryAllocator& memory_allocator,
                                VKScheduler& scheduler);
     ~StagingBufferPool();
@@ -33,6 +36,11 @@ public:
     void TickFrame();
 
 private:
+    struct StreamBufferCommit {
+        size_t upper_bound;
+        u64 tick;
+    };
+
     struct StagingBuffer {
         vk::Buffer buffer;
         MemoryCommit commit;
@@ -42,6 +50,7 @@ private:
     StagingBufferRef Ref() const noexcept {
         return {
             .buffer = *buffer,
+            .offset = 0,
            .mapped_span = mapped_span,
         };
     }
@@ -56,6 +65,8 @@ private:
     static constexpr size_t NUM_LEVELS = sizeof(size_t) * CHAR_BIT;
     using StagingBuffersCache = std::array<StagingBuffers, NUM_LEVELS>;
 
+    StagingBufferRef GetStreamBuffer(size_t size);
+
     std::optional<StagingBufferRef> TryGetReservedBuffer(size_t size, MemoryUsage usage);
 
     StagingBufferRef CreateStagingBuffer(size_t size, MemoryUsage usage);
@@ -70,6 +81,15 @@ private:
     MemoryAllocator& memory_allocator;
     VKScheduler& scheduler;
 
+    vk::Buffer stream_buffer;
+    vk::DeviceMemory stream_memory;
+    u8* stream_pointer = nullptr;
+
+    size_t iterator = 0;
+    size_t used_iterator = 0;
+    size_t free_iterator = 0;
+    std::array<u64, NUM_SYNCS> sync_ticks{};
+
     StagingBuffersCache device_local_cache;
     StagingBuffersCache upload_cache;
     StagingBuffersCache download_cache;

@@ -802,11 +802,10 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
     }
 }
 
-void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
-                         std::span<const BufferImageCopy> copies) {
+void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
     // TODO: Move this to another API
     scheduler->RequestOutsideRenderPassOperationContext();
-    std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask);
+    std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
     const VkBuffer src_buffer = map.buffer;
     const VkImage vk_image = *image;
     const VkImageAspectFlags vk_aspect_mask = aspect_mask;
@@ -817,11 +816,11 @@ void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
     });
 }
 
-void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
+void Image::UploadMemory(const StagingBufferRef& map,
                          std::span<const VideoCommon::BufferCopy> copies) {
     // TODO: Move this to another API
     scheduler->RequestOutsideRenderPassOperationContext();
-    std::vector vk_copies = TransformBufferCopies(copies, buffer_offset);
+    std::vector vk_copies = TransformBufferCopies(copies, map.offset);
     const VkBuffer src_buffer = map.buffer;
     const VkBuffer dst_buffer = *buffer;
     scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) {
@@ -830,9 +829,8 @@ void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
     });
 }
 
-void Image::DownloadMemory(const StagingBufferRef& map, size_t buffer_offset,
-                           std::span<const BufferImageCopy> copies) {
-    std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask);
+void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
+    std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
     scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask,
                        vk_copies](vk::CommandBuffer cmdbuf) {
         const VkImageMemoryBarrier read_barrier{

@@ -82,7 +82,7 @@ struct TextureCacheRuntime {
         return false;
     }
 
-    void AccelerateImageUpload(Image&, const StagingBufferRef&, size_t,
+    void AccelerateImageUpload(Image&, const StagingBufferRef&,
                                std::span<const VideoCommon::SwizzleParameters>) {
         UNREACHABLE();
     }
@@ -100,13 +100,12 @@ public:
     explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
                    VAddr cpu_addr);
 
-    void UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
+    void UploadMemory(const StagingBufferRef& map,
                       std::span<const VideoCommon::BufferImageCopy> copies);
 
-    void UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
-                      std::span<const VideoCommon::BufferCopy> copies);
+    void UploadMemory(const StagingBufferRef& map, std::span<const VideoCommon::BufferCopy> copies);
 
-    void DownloadMemory(const StagingBufferRef& map, size_t buffer_offset,
+    void DownloadMemory(const StagingBufferRef& map,
                         std::span<const VideoCommon::BufferImageCopy> copies);
 
     [[nodiscard]] VkImage Handle() const noexcept {

@@ -212,7 +212,7 @@ private:
 
     /// Upload data from guest to an image
     template <typename StagingBuffer>
-    void UploadImageContents(Image& image, StagingBuffer& staging_buffer, size_t buffer_offset);
+    void UploadImageContents(Image& image, StagingBuffer& staging_buffer);
 
     /// Find or create an image view from a guest descriptor
     [[nodiscard]] ImageViewId FindImageView(const TICEntry& config);
@@ -592,7 +592,7 @@ void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
         Image& image = slot_images[image_id];
         auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
         const auto copies = FullDownloadCopies(image.info);
-        image.DownloadMemory(map, 0, copies);
+        image.DownloadMemory(map, copies);
         runtime.Finish();
         SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span);
     }
@@ -750,24 +750,24 @@ void TextureCache<P>::PopAsyncFlushes() {
             total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
         }
         auto download_map = runtime.DownloadStagingBuffer(total_size_bytes);
-        size_t buffer_offset = 0;
+        const size_t original_offset = download_map.offset;
         for (const ImageId image_id : download_ids) {
             Image& image = slot_images[image_id];
             const auto copies = FullDownloadCopies(image.info);
-            image.DownloadMemory(download_map, buffer_offset, copies);
-            buffer_offset += image.unswizzled_size_bytes;
+            image.DownloadMemory(download_map, copies);
+            download_map.offset += image.unswizzled_size_bytes;
         }
         // Wait for downloads to finish
         runtime.Finish();
 
-        buffer_offset = 0;
-        const std::span<u8> download_span = download_map.mapped_span;
+        download_map.offset = original_offset;
+        std::span<u8> download_span = download_map.mapped_span;
         for (const ImageId image_id : download_ids) {
             const ImageBase& image = slot_images[image_id];
             const auto copies = FullDownloadCopies(image.info);
-            const std::span<u8> image_download_span = download_span.subspan(buffer_offset);
-            SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, image_download_span);
-            buffer_offset += image.unswizzled_size_bytes;
+            SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, download_span);
+            download_map.offset += image.unswizzled_size_bytes;
+            download_span = download_span.subspan(image.unswizzled_size_bytes);
         }
         committed_downloads.pop();
     }
@@ -798,32 +798,32 @@ void TextureCache<P>::RefreshContents(Image& image) {
         LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
         return;
     }
-    auto map = runtime.UploadStagingBuffer(MapSizeBytes(image));
-    UploadImageContents(image, map, 0);
+    auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
+    UploadImageContents(image, staging);
     runtime.InsertUploadMemoryBarrier();
 }
 
 template <class P>
-template <typename MapBuffer>
-void TextureCache<P>::UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset) {
-    const std::span<u8> mapped_span = map.mapped_span.subspan(buffer_offset);
+template <typename StagingBuffer>
+void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging) {
+    const std::span<u8> mapped_span = staging.mapped_span;
     const GPUVAddr gpu_addr = image.gpu_addr;
 
     if (True(image.flags & ImageFlagBits::AcceleratedUpload)) {
         gpu_memory.ReadBlockUnsafe(gpu_addr, mapped_span.data(), mapped_span.size_bytes());
         const auto uploads = FullUploadSwizzles(image.info);
-        runtime.AccelerateImageUpload(image, map, buffer_offset, uploads);
+        runtime.AccelerateImageUpload(image, staging, uploads);
     } else if (True(image.flags & ImageFlagBits::Converted)) {
         std::vector<u8> unswizzled_data(image.unswizzled_size_bytes);
         auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, unswizzled_data);
         ConvertImage(unswizzled_data, image.info, mapped_span, copies);
-        image.UploadMemory(map, buffer_offset, copies);
+        image.UploadMemory(staging, copies);
     } else if (image.info.type == ImageType::Buffer) {
         const std::array copies{UploadBufferCopy(gpu_memory, gpu_addr, image, mapped_span)};
-        image.UploadMemory(map, buffer_offset, copies);
+        image.UploadMemory(staging, copies);
     } else {
         const auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, mapped_span);
-        image.UploadMemory(map, buffer_offset, copies);
+        image.UploadMemory(staging, copies);
     }
 }

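In `PopAsyncFlushes` above, one staging map now serves every queued image download, so instead of a separate `buffer_offset` counter the code advances `download_map.offset` for the GPU side and shrinks the mapped span in lockstep for the CPU side. A small sketch of that dual bookkeeping (hypothetical `ToyMap`, not the real staging type):

```cpp
#include <cstddef>
#include <span>
#include <vector>

// Toy mirror of the PopAsyncFlushes bookkeeping: the offset (consumed by the
// GPU copies) and the mapped span (consumed by the CPU swizzle) move together.
struct ToyMap {
    std::size_t offset = 0;
    std::span<unsigned char> mapped_span;
};

void WalkDownloads(ToyMap map, const std::vector<std::size_t>& image_sizes) {
    std::span<unsigned char> span = map.mapped_span;
    for (const std::size_t size : image_sizes) {
        // ... swizzle `size` bytes out of span.first(size) here ...
        map.offset += size;        // where the next image's data was written
        span = span.subspan(size); // matching host-side view
    }
}
```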
@@ -85,11 +85,10 @@ public:
     MemoryAllocator(const MemoryAllocator&) = delete;
 
     /**
-     * Commits a memory with the specified requeriments.
+     * Commits a memory with the specified requirements.
      *
     * @param requirements Requirements returned from a Vulkan call.
-     * @param host_visible Signals the allocator that it *must* use host visible and coherent
-     *                     memory. When passing false, it will try to allocate device local memory.
+     * @param usage        Indicates how the memory will be used.
      *
     * @returns A memory commit.
     */

@@ -168,7 +168,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
     X(vkFreeCommandBuffers);
     X(vkFreeDescriptorSets);
     X(vkFreeMemory);
-    X(vkGetBufferMemoryRequirements);
+    X(vkGetBufferMemoryRequirements2);
     X(vkGetDeviceQueue);
     X(vkGetEventStatus);
     X(vkGetFenceStatus);
@@ -786,10 +786,20 @@ DeviceMemory Device::AllocateMemory(const VkMemoryAllocateInfo& ai) const {
     return DeviceMemory(memory, handle, *dld);
 }
 
-VkMemoryRequirements Device::GetBufferMemoryRequirements(VkBuffer buffer) const noexcept {
-    VkMemoryRequirements requirements;
-    dld->vkGetBufferMemoryRequirements(handle, buffer, &requirements);
-    return requirements;
+VkMemoryRequirements Device::GetBufferMemoryRequirements(VkBuffer buffer,
+                                                         void* pnext) const noexcept {
+    const VkBufferMemoryRequirementsInfo2 info{
+        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,
+        .pNext = nullptr,
+        .buffer = buffer,
+    };
+    VkMemoryRequirements2 requirements{
+        .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
+        .pNext = pnext,
+        .memoryRequirements{},
+    };
+    dld->vkGetBufferMemoryRequirements2(handle, &info, &requirements);
+    return requirements.memoryRequirements;
 }
 
 VkMemoryRequirements Device::GetImageMemoryRequirements(VkImage image) const noexcept {

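The wrapper change above switches the buffer-requirements query to `vkGetBufferMemoryRequirements2` purely so callers can chain extension structs through `pnext`; the staging pool uses that to ask for `VkMemoryDedicatedRequirements` before allocating the stream buffer's memory. A caller-side sketch of the same query against the raw Vulkan API (illustrative function name, not the wrapper's):

```cpp
#include <vulkan/vulkan.h>

VkMemoryRequirements QueryWithDedicated(VkDevice device, VkBuffer buffer,
                                        bool& out_wants_dedicated) {
    VkMemoryDedicatedRequirements dedicated{
        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS,
        .pNext = nullptr,
    };
    VkMemoryRequirements2 requirements{
        .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
        .pNext = &dedicated, // what the wrapper's new `pnext` parameter carries
    };
    const VkBufferMemoryRequirementsInfo2 info{
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,
        .pNext = nullptr,
        .buffer = buffer,
    };
    vkGetBufferMemoryRequirements2(device, &info, &requirements);
    // Mirrors the staging pool's `make_dedicated` decision.
    out_wants_dedicated = dedicated.prefersDedicatedAllocation == VK_TRUE ||
                          dedicated.requiresDedicatedAllocation == VK_TRUE;
    return requirements.memoryRequirements;
}
```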
@@ -283,7 +283,7 @@ struct DeviceDispatch : InstanceDispatch {
     PFN_vkFreeCommandBuffers vkFreeCommandBuffers{};
     PFN_vkFreeDescriptorSets vkFreeDescriptorSets{};
     PFN_vkFreeMemory vkFreeMemory{};
-    PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements{};
+    PFN_vkGetBufferMemoryRequirements2 vkGetBufferMemoryRequirements2{};
     PFN_vkGetDeviceQueue vkGetDeviceQueue{};
     PFN_vkGetEventStatus vkGetEventStatus{};
     PFN_vkGetFenceStatus vkGetFenceStatus{};
@@ -871,7 +871,8 @@ public:
 
     DeviceMemory AllocateMemory(const VkMemoryAllocateInfo& ai) const;
 
-    VkMemoryRequirements GetBufferMemoryRequirements(VkBuffer buffer) const noexcept;
+    VkMemoryRequirements GetBufferMemoryRequirements(VkBuffer buffer,
+                                                     void* pnext = nullptr) const noexcept;
 
     VkMemoryRequirements GetImageMemoryRequirements(VkImage image) const noexcept;