early-access version 1255

This commit is contained in:
pineappleEA
2020-12-28 15:15:37 +00:00
parent 84b39492d1
commit 78b48028e1
6254 changed files with 1868140 additions and 0 deletions

314
src/video_core/CMakeLists.txt Executable file
View File

@@ -0,0 +1,314 @@
add_subdirectory(host_shaders)
add_library(video_core STATIC
buffer_cache/buffer_block.h
buffer_cache/buffer_cache.h
buffer_cache/map_interval.cpp
buffer_cache/map_interval.h
cdma_pusher.cpp
cdma_pusher.h
command_classes/codecs/codec.cpp
command_classes/codecs/codec.h
command_classes/codecs/h264.cpp
command_classes/codecs/h264.h
command_classes/codecs/vp9.cpp
command_classes/codecs/vp9.h
command_classes/codecs/vp9_types.h
command_classes/host1x.cpp
command_classes/host1x.h
command_classes/nvdec.cpp
command_classes/nvdec.h
command_classes/nvdec_common.h
command_classes/sync_manager.cpp
command_classes/sync_manager.h
command_classes/vic.cpp
command_classes/vic.h
compatible_formats.cpp
compatible_formats.h
delayed_destruction_ring.h
dirty_flags.cpp
dirty_flags.h
dma_pusher.cpp
dma_pusher.h
engines/const_buffer_engine_interface.h
engines/const_buffer_info.h
engines/engine_interface.h
engines/engine_upload.cpp
engines/engine_upload.h
engines/fermi_2d.cpp
engines/fermi_2d.h
engines/kepler_compute.cpp
engines/kepler_compute.h
engines/kepler_memory.cpp
engines/kepler_memory.h
engines/maxwell_3d.cpp
engines/maxwell_3d.h
engines/maxwell_dma.cpp
engines/maxwell_dma.h
engines/shader_bytecode.h
engines/shader_header.h
engines/shader_type.h
framebuffer_config.h
macro/macro.cpp
macro/macro.h
macro/macro_hle.cpp
macro/macro_hle.h
macro/macro_interpreter.cpp
macro/macro_interpreter.h
macro/macro_jit_x64.cpp
macro/macro_jit_x64.h
fence_manager.h
gpu.cpp
gpu.h
gpu_thread.cpp
gpu_thread.h
guest_driver.cpp
guest_driver.h
memory_manager.cpp
memory_manager.h
morton.cpp
morton.h
query_cache.h
rasterizer_accelerated.cpp
rasterizer_accelerated.h
rasterizer_interface.h
renderer_base.cpp
renderer_base.h
renderer_opengl/gl_arb_decompiler.cpp
renderer_opengl/gl_arb_decompiler.h
renderer_opengl/gl_buffer_cache.cpp
renderer_opengl/gl_buffer_cache.h
renderer_opengl/gl_device.cpp
renderer_opengl/gl_device.h
renderer_opengl/gl_fence_manager.cpp
renderer_opengl/gl_fence_manager.h
renderer_opengl/gl_rasterizer.cpp
renderer_opengl/gl_rasterizer.h
renderer_opengl/gl_resource_manager.cpp
renderer_opengl/gl_resource_manager.h
renderer_opengl/gl_shader_cache.cpp
renderer_opengl/gl_shader_cache.h
renderer_opengl/gl_shader_decompiler.cpp
renderer_opengl/gl_shader_decompiler.h
renderer_opengl/gl_shader_disk_cache.cpp
renderer_opengl/gl_shader_disk_cache.h
renderer_opengl/gl_shader_manager.cpp
renderer_opengl/gl_shader_manager.h
renderer_opengl/gl_shader_util.cpp
renderer_opengl/gl_shader_util.h
renderer_opengl/gl_state_tracker.cpp
renderer_opengl/gl_state_tracker.h
renderer_opengl/gl_stream_buffer.cpp
renderer_opengl/gl_stream_buffer.h
renderer_opengl/gl_texture_cache.cpp
renderer_opengl/gl_texture_cache.h
renderer_opengl/gl_query_cache.cpp
renderer_opengl/gl_query_cache.h
renderer_opengl/maxwell_to_gl.h
renderer_opengl/renderer_opengl.cpp
renderer_opengl/renderer_opengl.h
renderer_opengl/util_shaders.cpp
renderer_opengl/util_shaders.h
renderer_vulkan/blit_image.cpp
renderer_vulkan/blit_image.h
renderer_vulkan/fixed_pipeline_state.cpp
renderer_vulkan/fixed_pipeline_state.h
renderer_vulkan/maxwell_to_vk.cpp
renderer_vulkan/maxwell_to_vk.h
renderer_vulkan/nsight_aftermath_tracker.cpp
renderer_vulkan/nsight_aftermath_tracker.h
renderer_vulkan/renderer_vulkan.h
renderer_vulkan/renderer_vulkan.cpp
renderer_vulkan/vk_blit_screen.cpp
renderer_vulkan/vk_blit_screen.h
renderer_vulkan/vk_buffer_cache.cpp
renderer_vulkan/vk_buffer_cache.h
renderer_vulkan/vk_command_pool.cpp
renderer_vulkan/vk_command_pool.h
renderer_vulkan/vk_compute_pass.cpp
renderer_vulkan/vk_compute_pass.h
renderer_vulkan/vk_compute_pipeline.cpp
renderer_vulkan/vk_compute_pipeline.h
renderer_vulkan/vk_descriptor_pool.cpp
renderer_vulkan/vk_descriptor_pool.h
renderer_vulkan/vk_device.cpp
renderer_vulkan/vk_device.h
renderer_vulkan/vk_fence_manager.cpp
renderer_vulkan/vk_fence_manager.h
renderer_vulkan/vk_graphics_pipeline.cpp
renderer_vulkan/vk_graphics_pipeline.h
renderer_vulkan/vk_master_semaphore.cpp
renderer_vulkan/vk_master_semaphore.h
renderer_vulkan/vk_memory_manager.cpp
renderer_vulkan/vk_memory_manager.h
renderer_vulkan/vk_pipeline_cache.cpp
renderer_vulkan/vk_pipeline_cache.h
renderer_vulkan/vk_query_cache.cpp
renderer_vulkan/vk_query_cache.h
renderer_vulkan/vk_rasterizer.cpp
renderer_vulkan/vk_rasterizer.h
renderer_vulkan/vk_resource_pool.cpp
renderer_vulkan/vk_resource_pool.h
renderer_vulkan/vk_scheduler.cpp
renderer_vulkan/vk_scheduler.h
renderer_vulkan/vk_shader_decompiler.cpp
renderer_vulkan/vk_shader_decompiler.h
renderer_vulkan/vk_shader_util.cpp
renderer_vulkan/vk_shader_util.h
renderer_vulkan/vk_staging_buffer_pool.cpp
renderer_vulkan/vk_staging_buffer_pool.h
renderer_vulkan/vk_state_tracker.cpp
renderer_vulkan/vk_state_tracker.h
renderer_vulkan/vk_stream_buffer.cpp
renderer_vulkan/vk_stream_buffer.h
renderer_vulkan/vk_swapchain.cpp
renderer_vulkan/vk_swapchain.h
renderer_vulkan/vk_texture_cache.cpp
renderer_vulkan/vk_texture_cache.h
renderer_vulkan/vk_update_descriptor.cpp
renderer_vulkan/vk_update_descriptor.h
renderer_vulkan/wrapper.cpp
renderer_vulkan/wrapper.h
shader_cache.h
shader_notify.cpp
shader_notify.h
shader/decode/arithmetic.cpp
shader/decode/arithmetic_immediate.cpp
shader/decode/bfe.cpp
shader/decode/bfi.cpp
shader/decode/shift.cpp
shader/decode/arithmetic_integer.cpp
shader/decode/arithmetic_integer_immediate.cpp
shader/decode/arithmetic_half.cpp
shader/decode/arithmetic_half_immediate.cpp
shader/decode/ffma.cpp
shader/decode/hfma2.cpp
shader/decode/conversion.cpp
shader/decode/memory.cpp
shader/decode/texture.cpp
shader/decode/image.cpp
shader/decode/float_set_predicate.cpp
shader/decode/integer_set_predicate.cpp
shader/decode/half_set_predicate.cpp
shader/decode/predicate_set_register.cpp
shader/decode/predicate_set_predicate.cpp
shader/decode/register_set_predicate.cpp
shader/decode/float_set.cpp
shader/decode/integer_set.cpp
shader/decode/half_set.cpp
shader/decode/video.cpp
shader/decode/warp.cpp
shader/decode/xmad.cpp
shader/decode/other.cpp
shader/ast.cpp
shader/ast.h
shader/async_shaders.cpp
shader/async_shaders.h
shader/compiler_settings.cpp
shader/compiler_settings.h
shader/control_flow.cpp
shader/control_flow.h
shader/decode.cpp
shader/expr.cpp
shader/expr.h
shader/memory_util.cpp
shader/memory_util.h
shader/node_helper.cpp
shader/node_helper.h
shader/node.h
shader/registry.cpp
shader/registry.h
shader/shader_ir.cpp
shader/shader_ir.h
shader/track.cpp
shader/transform_feedback.cpp
shader/transform_feedback.h
surface.cpp
surface.h
texture_cache/decode_bc4.cpp
texture_cache/decode_bc4.h
texture_cache/descriptor_table.h
texture_cache/formatter.cpp
texture_cache/formatter.h
texture_cache/format_lookup_table.cpp
texture_cache/format_lookup_table.h
texture_cache/image_base.cpp
texture_cache/image_base.h
texture_cache/image_info.cpp
texture_cache/image_info.h
texture_cache/image_view_base.cpp
texture_cache/image_view_base.h
texture_cache/image_view_info.cpp
texture_cache/image_view_info.h
texture_cache/render_targets.h
texture_cache/samples_helper.h
texture_cache/slot_vector.h
texture_cache/texture_cache.h
texture_cache/texture_cache.inl
texture_cache/types.h
texture_cache/util.cpp
texture_cache/util.h
textures/astc.cpp
textures/astc.h
textures/decoders.cpp
textures/decoders.h
textures/texture.cpp
textures/texture.h
video_core.cpp
video_core.h
)
create_target_directory_groups(video_core)
target_link_libraries(video_core PUBLIC common core)
target_link_libraries(video_core PRIVATE glad xbyak)
if (MSVC)
target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
target_link_libraries(video_core PUBLIC ${FFMPEG_LIBRARY_DIR}/swscale.lib ${FFMPEG_LIBRARY_DIR}/avcodec.lib ${FFMPEG_LIBRARY_DIR}/avutil.lib)
else()
target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR})
target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES})
endif()
add_dependencies(video_core host_shaders)
target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE})
target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
target_link_libraries(video_core PRIVATE sirit)
if (ENABLE_NSIGHT_AFTERMATH)
if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK})
message(ERROR "Environment variable NSIGHT_AFTERMATH_SDK has to be provided")
endif()
if (NOT WIN32)
message(ERROR "Nsight Aftermath doesn't support non-Windows platforms")
endif()
target_compile_definitions(video_core PRIVATE HAS_NSIGHT_AFTERMATH)
target_include_directories(video_core PRIVATE "$ENV{NSIGHT_AFTERMATH_SDK}/include")
endif()
if (MSVC)
target_compile_options(video_core PRIVATE
/we4267 # 'var' : conversion from 'size_t' to 'type', possible loss of data
/we4456 # Declaration of 'identifier' hides previous local declaration
/we4457 # Declaration of 'identifier' hides function parameter
/we4458 # Declaration of 'identifier' hides class member
/we4459 # Declaration of 'identifier' hides global declaration
/we4715 # 'function' : not all control paths return a value
)
else()
target_compile_options(video_core PRIVATE
-Werror=conversion
-Wno-error=sign-conversion
-Werror=pessimizing-move
-Werror=redundant-move
-Werror=shadow
-Werror=switch
-Werror=type-limits
-Werror=unused-variable
$<$<CXX_COMPILER_ID:GNU>:-Werror=class-memaccess>
$<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-parameter>
$<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-variable>
)
endif()

View File

@@ -0,0 +1,62 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_types.h"
namespace VideoCommon {
class BufferBlock {
public:
[[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
return (cpu_addr < end) && (cpu_addr_end > start);
}
[[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
return cpu_addr <= other_start && other_end <= cpu_addr_end;
}
[[nodiscard]] std::size_t Offset(VAddr in_addr) const {
return static_cast<std::size_t>(in_addr - cpu_addr);
}
[[nodiscard]] VAddr CpuAddr() const {
return cpu_addr;
}
[[nodiscard]] VAddr CpuAddrEnd() const {
return cpu_addr_end;
}
void SetCpuAddr(VAddr new_addr) {
cpu_addr = new_addr;
cpu_addr_end = new_addr + size;
}
[[nodiscard]] std::size_t Size() const {
return size;
}
[[nodiscard]] u64 Epoch() const {
return epoch;
}
void SetEpoch(u64 new_epoch) {
epoch = new_epoch;
}
protected:
explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
SetCpuAddr(cpu_addr_);
}
private:
VAddr cpu_addr{};
VAddr cpu_addr_end{};
std::size_t size{};
u64 epoch{};
};
} // namespace VideoCommon

View File

@@ -0,0 +1,594 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <list>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <boost/container/small_vector.hpp>
#include <boost/icl/interval_set.hpp>
#include <boost/intrusive/set.hpp>
#include "common/alignment.h"
#include "common/assert.h"
#include "common/common_types.h"
#include "common/logging/log.h"
#include "core/core.h"
#include "core/memory.h"
#include "core/settings.h"
#include "video_core/buffer_cache/buffer_block.h"
#include "video_core/buffer_cache/map_interval.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
namespace VideoCommon {
template <typename Buffer, typename BufferType, typename StreamBuffer>
class BufferCache {
using IntervalSet = boost::icl::interval_set<VAddr>;
using IntervalType = typename IntervalSet::interval_type;
using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
static constexpr u64 WRITE_PAGE_BIT = 11;
static constexpr u64 BLOCK_PAGE_BITS = 21;
static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
public:
struct BufferInfo {
BufferType handle;
u64 offset;
u64 address;
};
BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
bool is_written = false, bool use_fast_cbuf = false) {
std::lock_guard lock{mutex};
const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
if (!cpu_addr) {
return GetEmptyBuffer(size);
}
// Cache management is a big overhead, so only cache entries with a given size.
// TODO: Figure out which size is the best for given games.
constexpr std::size_t max_stream_size = 0x800;
if (use_fast_cbuf || size < max_stream_size) {
if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) {
const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size);
if (use_fast_cbuf) {
u8* dest;
if (is_granular) {
dest = gpu_memory.GetPointer(gpu_addr);
} else {
staging_buffer.resize(size);
dest = staging_buffer.data();
gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
}
return ConstBufferUpload(dest, size);
}
if (is_granular) {
u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
std::memcpy(dest, host_ptr, size);
});
} else {
return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) {
gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
});
}
}
}
Buffer* const block = GetBlock(*cpu_addr, size);
MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size);
if (!map) {
return GetEmptyBuffer(size);
}
if (is_written) {
map->MarkAsModified(true, GetModifiedTicks());
if (Settings::IsGPULevelHigh() &&
Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
MarkForAsyncFlush(map);
}
if (!map->is_written) {
map->is_written = true;
MarkRegionAsWritten(map->start, map->end - 1);
}
}
return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()};
}
/// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
std::size_t alignment = 4) {
std::lock_guard lock{mutex};
return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
std::memcpy(dest, raw_pointer, size);
});
}
/// Prepares the buffer cache for data uploading
/// @param max_size Maximum number of bytes that will be uploaded
/// @return True when a stream buffer invalidation was required, false otherwise
void Map(std::size_t max_size) {
std::lock_guard lock{mutex};
std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4);
buffer_offset = buffer_offset_base;
}
/// Finishes the upload stream
void Unmap() {
std::lock_guard lock{mutex};
stream_buffer.Unmap(buffer_offset - buffer_offset_base);
}
/// Function called at the end of each frame, inteded for deferred operations
void TickFrame() {
++epoch;
while (!pending_destruction.empty()) {
// Delay at least 4 frames before destruction.
// This is due to triple buffering happening on some drivers.
static constexpr u64 epochs_to_destroy = 5;
if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
break;
}
pending_destruction.pop();
}
}
/// Write any cached resources overlapping the specified region back to memory
void FlushRegion(VAddr addr, std::size_t size) {
std::lock_guard lock{mutex};
VectorMapInterval objects = GetMapsInRange(addr, size);
std::sort(objects.begin(), objects.end(),
[](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
for (MapInterval* object : objects) {
if (object->is_modified && object->is_registered) {
mutex.unlock();
FlushMap(object);
mutex.lock();
}
}
}
bool MustFlushRegion(VAddr addr, std::size_t size) {
std::lock_guard lock{mutex};
const VectorMapInterval objects = GetMapsInRange(addr, size);
return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
return map->is_modified && map->is_registered;
});
}
/// Mark the specified region as being invalidated
void InvalidateRegion(VAddr addr, u64 size) {
std::lock_guard lock{mutex};
for (auto& object : GetMapsInRange(addr, size)) {
if (object->is_registered) {
Unregister(object);
}
}
}
void OnCPUWrite(VAddr addr, std::size_t size) {
std::lock_guard lock{mutex};
for (MapInterval* object : GetMapsInRange(addr, size)) {
if (object->is_memory_marked && object->is_registered) {
UnmarkMemory(object);
object->is_sync_pending = true;
marked_for_unregister.emplace_back(object);
}
}
}
void SyncGuestHost() {
std::lock_guard lock{mutex};
for (auto& object : marked_for_unregister) {
if (object->is_registered) {
object->is_sync_pending = false;
Unregister(object);
}
}
marked_for_unregister.clear();
}
void CommitAsyncFlushes() {
if (uncommitted_flushes) {
auto commit_list = std::make_shared<std::list<MapInterval*>>();
for (MapInterval* map : *uncommitted_flushes) {
if (map->is_registered && map->is_modified) {
// TODO(Blinkhawk): Implement backend asynchronous flushing
// AsyncFlushMap(map)
commit_list->push_back(map);
}
}
if (!commit_list->empty()) {
committed_flushes.push_back(commit_list);
} else {
committed_flushes.emplace_back();
}
} else {
committed_flushes.emplace_back();
}
uncommitted_flushes.reset();
}
bool ShouldWaitAsyncFlushes() const {
return !committed_flushes.empty() && committed_flushes.front() != nullptr;
}
bool HasUncommittedFlushes() const {
return uncommitted_flushes != nullptr;
}
void PopAsyncFlushes() {
if (committed_flushes.empty()) {
return;
}
auto& flush_list = committed_flushes.front();
if (!flush_list) {
committed_flushes.pop_front();
return;
}
for (MapInterval* map : *flush_list) {
if (map->is_registered) {
// TODO(Blinkhawk): Replace this for reading the asynchronous flush
FlushMap(map);
}
}
committed_flushes.pop_front();
}
virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0;
protected:
explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
StreamBuffer& stream_buffer_)
: rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
stream_buffer{stream_buffer_} {}
~BufferCache() = default;
virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
return {};
}
/// Register an object into the cache
MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
const VAddr cpu_addr = new_map.start;
if (!cpu_addr) {
LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
new_map.gpu_addr);
return nullptr;
}
const std::size_t size = new_map.end - new_map.start;
new_map.is_registered = true;
rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
new_map.is_memory_marked = true;
if (inherit_written) {
MarkRegionAsWritten(new_map.start, new_map.end - 1);
new_map.is_written = true;
}
MapInterval* const storage = mapped_addresses_allocator.Allocate();
*storage = new_map;
mapped_addresses.insert(*storage);
return storage;
}
void UnmarkMemory(MapInterval* map) {
if (!map->is_memory_marked) {
return;
}
const std::size_t size = map->end - map->start;
rasterizer.UpdatePagesCachedCount(map->start, size, -1);
map->is_memory_marked = false;
}
/// Unregisters an object from the cache
void Unregister(MapInterval* map) {
UnmarkMemory(map);
map->is_registered = false;
if (map->is_sync_pending) {
map->is_sync_pending = false;
marked_for_unregister.remove(map);
}
if (map->is_written) {
UnmarkRegionAsWritten(map->start, map->end - 1);
}
const auto it = mapped_addresses.find(*map);
ASSERT(it != mapped_addresses.end());
mapped_addresses.erase(it);
mapped_addresses_allocator.Release(map);
}
private:
MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) {
const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
if (overlaps.empty()) {
const VAddr cpu_addr_end = cpu_addr + size;
if (gpu_memory.IsGranularRange(gpu_addr, size)) {
u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
block->Upload(block->Offset(cpu_addr), size, host_ptr);
} else {
staging_buffer.resize(size);
gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
}
return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
}
const VAddr cpu_addr_end = cpu_addr + size;
if (overlaps.size() == 1) {
MapInterval* const current_map = overlaps[0];
if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
return current_map;
}
}
VAddr new_start = cpu_addr;
VAddr new_end = cpu_addr_end;
bool write_inheritance = false;
bool modified_inheritance = false;
// Calculate new buffer parameters
for (MapInterval* overlap : overlaps) {
new_start = std::min(overlap->start, new_start);
new_end = std::max(overlap->end, new_end);
write_inheritance |= overlap->is_written;
modified_inheritance |= overlap->is_modified;
}
GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr;
for (auto& overlap : overlaps) {
Unregister(overlap);
}
UpdateBlock(block, new_start, new_end, overlaps);
const MapInterval new_map{new_start, new_end, new_gpu_addr};
MapInterval* const map = Register(new_map, write_inheritance);
if (!map) {
return nullptr;
}
if (modified_inheritance) {
map->MarkAsModified(true, GetModifiedTicks());
if (Settings::IsGPULevelHigh() &&
Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
MarkForAsyncFlush(map);
}
}
return map;
}
void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) {
const IntervalType base_interval{start, end};
IntervalSet interval_set{};
interval_set.add(base_interval);
for (auto& overlap : overlaps) {
const IntervalType subtract{overlap->start, overlap->end};
interval_set.subtract(subtract);
}
for (auto& interval : interval_set) {
const std::size_t size = interval.upper() - interval.lower();
if (size == 0) {
continue;
}
staging_buffer.resize(size);
cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
}
}
VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) {
VectorMapInterval result;
if (size == 0) {
return result;
}
const VAddr addr_end = addr + size;
auto it = mapped_addresses.lower_bound(addr);
if (it != mapped_addresses.begin()) {
--it;
}
while (it != mapped_addresses.end() && it->start < addr_end) {
if (it->Overlaps(addr, addr_end)) {
result.push_back(&*it);
}
++it;
}
return result;
}
/// Returns a ticks counter used for tracking when cached objects were last modified
u64 GetModifiedTicks() {
return ++modified_ticks;
}
void FlushMap(MapInterval* map) {
const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
ASSERT_OR_EXECUTE(it != blocks.end(), return;);
std::shared_ptr<Buffer> block = it->second;
const std::size_t size = map->end - map->start;
staging_buffer.resize(size);
block->Download(block->Offset(map->start), size, staging_buffer.data());
cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size);
map->MarkAsModified(false, 0);
}
template <typename Callable>
BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) {
AlignBuffer(alignment);
const std::size_t uploaded_offset = buffer_offset;
callable(buffer_ptr);
buffer_ptr += size;
buffer_offset += size;
return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
}
void AlignBuffer(std::size_t alignment) {
// Align the offset, not the mapped pointer
const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
buffer_ptr += offset_aligned - buffer_offset;
buffer_offset = offset_aligned;
}
std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
const std::size_t old_size = buffer->Size();
const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
const VAddr cpu_addr = buffer->CpuAddr();
std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
new_buffer->CopyFrom(*buffer, 0, 0, old_size);
QueueDestruction(std::move(buffer));
const VAddr cpu_addr_end = cpu_addr + new_size - 1;
const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
blocks.insert_or_assign(page_start, new_buffer);
}
return new_buffer;
}
std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
std::shared_ptr<Buffer> second) {
const std::size_t size_1 = first->Size();
const std::size_t size_2 = second->Size();
const VAddr first_addr = first->CpuAddr();
const VAddr second_addr = second->CpuAddr();
const VAddr new_addr = std::min(first_addr, second_addr);
const std::size_t new_size = size_1 + size_2;
std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
QueueDestruction(std::move(first));
QueueDestruction(std::move(second));
const VAddr cpu_addr_end = new_addr + new_size - 1;
const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
blocks.insert_or_assign(page_start, new_buffer);
}
return new_buffer;
}
Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
std::shared_ptr<Buffer> found;
const VAddr cpu_addr_end = cpu_addr + size - 1;
const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
auto it = blocks.find(page_start);
if (it == blocks.end()) {
if (found) {
found = EnlargeBlock(found);
continue;
}
const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
blocks.insert_or_assign(page_start, found);
continue;
}
if (!found) {
found = it->second;
continue;
}
if (found != it->second) {
found = MergeBlocks(std::move(found), it->second);
}
}
return found.get();
}
void MarkRegionAsWritten(VAddr start, VAddr end) {
const u64 page_end = end >> WRITE_PAGE_BIT;
for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) {
++it->second;
}
}
}
void UnmarkRegionAsWritten(VAddr start, VAddr end) {
const u64 page_end = end >> WRITE_PAGE_BIT;
for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
auto it = written_pages.find(page_start);
if (it != written_pages.end()) {
if (it->second > 1) {
--it->second;
} else {
written_pages.erase(it);
}
}
}
}
bool IsRegionWritten(VAddr start, VAddr end) const {
const u64 page_end = end >> WRITE_PAGE_BIT;
for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
if (written_pages.contains(page_start)) {
return true;
}
}
return false;
}
void QueueDestruction(std::shared_ptr<Buffer> buffer) {
buffer->SetEpoch(epoch);
pending_destruction.push(std::move(buffer));
}
void MarkForAsyncFlush(MapInterval* map) {
if (!uncommitted_flushes) {
uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
}
uncommitted_flushes->insert(map);
}
VideoCore::RasterizerInterface& rasterizer;
Tegra::MemoryManager& gpu_memory;
Core::Memory::Memory& cpu_memory;
StreamBuffer& stream_buffer;
u8* buffer_ptr = nullptr;
u64 buffer_offset = 0;
u64 buffer_offset_base = 0;
MapIntervalAllocator mapped_addresses_allocator;
boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
mapped_addresses;
std::unordered_map<u64, u32> written_pages;
std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
std::queue<std::shared_ptr<Buffer>> pending_destruction;
u64 epoch = 0;
u64 modified_ticks = 0;
std::vector<u8> staging_buffer;
std::list<MapInterval*> marked_for_unregister;
std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
std::recursive_mutex mutex;
};
} // namespace VideoCommon

View File

@@ -0,0 +1,33 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include <array>
#include <cstddef>
#include <memory>
#include "video_core/buffer_cache/map_interval.h"
namespace VideoCommon {
MapIntervalAllocator::MapIntervalAllocator() {
FillFreeList(first_chunk);
}
MapIntervalAllocator::~MapIntervalAllocator() = default;
void MapIntervalAllocator::AllocateNewChunk() {
*new_chunk = std::make_unique<Chunk>();
FillFreeList(**new_chunk);
new_chunk = &(*new_chunk)->next;
}
void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
const std::size_t old_size = free_list.size();
free_list.resize(old_size + chunk.data.size());
std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
[](MapInterval& interval) { return &interval; });
}
} // namespace VideoCommon

View File

@@ -0,0 +1,93 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstddef>
#include <memory>
#include <vector>
#include <boost/intrusive/set_hook.hpp>
#include "common/common_types.h"
#include "video_core/gpu.h"
namespace VideoCommon {
struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
MapInterval() = default;
/*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
: start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
return start <= other_start && other_end <= end;
}
bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
return start < other_end && other_start < end;
}
void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
is_modified = is_modified_;
ticks = ticks_;
}
boost::intrusive::set_member_hook<> member_hook_;
VAddr start = 0;
VAddr end = 0;
GPUVAddr gpu_addr = 0;
u64 ticks = 0;
bool is_written = false;
bool is_modified = false;
bool is_registered = false;
bool is_memory_marked = false;
bool is_sync_pending = false;
};
struct MapIntervalCompare {
constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
return lhs.start < rhs.start;
}
};
class MapIntervalAllocator {
public:
MapIntervalAllocator();
~MapIntervalAllocator();
MapInterval* Allocate() {
if (free_list.empty()) {
AllocateNewChunk();
}
MapInterval* const interval = free_list.back();
free_list.pop_back();
return interval;
}
void Release(MapInterval* interval) {
free_list.push_back(interval);
}
private:
struct Chunk {
std::unique_ptr<Chunk> next;
std::array<MapInterval, 0x8000> data;
};
void AllocateNewChunk();
void FillFreeList(Chunk& chunk);
std::vector<MapInterval*> free_list;
Chunk first_chunk;
std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
};
} // namespace VideoCommon

171
src/video_core/cdma_pusher.cpp Executable file
View File

@@ -0,0 +1,171 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include "command_classes/host1x.h"
#include "command_classes/nvdec.h"
#include "command_classes/vic.h"
#include "common/bit_util.h"
#include "video_core/cdma_pusher.h"
#include "video_core/command_classes/nvdec_common.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra {
CDmaPusher::CDmaPusher(GPU& gpu_)
: gpu{gpu_}, nvdec_processor(std::make_shared<Nvdec>(gpu)),
vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)),
host1x_processor(std::make_unique<Host1x>(gpu)),
nvdec_sync(std::make_unique<SyncptIncrManager>(gpu)),
vic_sync(std::make_unique<SyncptIncrManager>(gpu)) {}
CDmaPusher::~CDmaPusher() = default;
void CDmaPusher::Push(ChCommandHeaderList&& entries) {
cdma_queue.push(std::move(entries));
}
void CDmaPusher::DispatchCalls() {
while (!cdma_queue.empty()) {
Step();
}
}
void CDmaPusher::Step() {
const auto entries{cdma_queue.front()};
cdma_queue.pop();
std::vector<u32> values(entries.size());
std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32));
for (const u32 value : values) {
if (mask != 0) {
const u32 lbs = Common::CountTrailingZeroes32(mask);
mask &= ~(1U << lbs);
ExecuteCommand(static_cast<u32>(offset + lbs), value);
continue;
} else if (count != 0) {
--count;
ExecuteCommand(static_cast<u32>(offset), value);
if (incrementing) {
++offset;
}
continue;
}
const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf);
switch (mode) {
case ChSubmissionMode::SetClass: {
mask = value & 0x3f;
offset = (value >> 16) & 0xfff;
current_class = static_cast<ChClassId>((value >> 6) & 0x3ff);
break;
}
case ChSubmissionMode::Incrementing:
case ChSubmissionMode::NonIncrementing:
count = value & 0xffff;
offset = (value >> 16) & 0xfff;
incrementing = mode == ChSubmissionMode::Incrementing;
break;
case ChSubmissionMode::Mask:
mask = value & 0xffff;
offset = (value >> 16) & 0xfff;
break;
case ChSubmissionMode::Immediate: {
const u32 data = value & 0xfff;
offset = (value >> 16) & 0xfff;
ExecuteCommand(static_cast<u32>(offset), data);
break;
}
default:
UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode));
break;
}
}
}
void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) {
switch (current_class) {
case ChClassId::NvDec:
ThiStateWrite(nvdec_thi_state, state_offset, {data});
switch (static_cast<ThiMethod>(state_offset)) {
case ThiMethod::IncSyncpt: {
LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method");
const auto syncpoint_id = static_cast<u32>(data & 0xFF);
const auto cond = static_cast<u32>((data >> 8) & 0xFF);
if (cond == 0) {
nvdec_sync->Increment(syncpoint_id);
} else {
nvdec_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id);
nvdec_sync->SignalDone(syncpoint_id);
}
break;
}
case ThiMethod::SetMethod1:
LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}",
static_cast<u32>(nvdec_thi_state.method_0));
nvdec_processor->ProcessMethod(static_cast<Nvdec::Method>(nvdec_thi_state.method_0),
{data});
break;
default:
break;
}
break;
case ChClassId::GraphicsVic:
ThiStateWrite(vic_thi_state, static_cast<u32>(state_offset), {data});
switch (static_cast<ThiMethod>(state_offset)) {
case ThiMethod::IncSyncpt: {
LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method");
const auto syncpoint_id = static_cast<u32>(data & 0xFF);
const auto cond = static_cast<u32>((data >> 8) & 0xFF);
if (cond == 0) {
vic_sync->Increment(syncpoint_id);
} else {
vic_sync->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id);
vic_sync->SignalDone(syncpoint_id);
}
break;
}
case ThiMethod::SetMethod1:
LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})",
static_cast<u32>(vic_thi_state.method_0), data);
vic_processor->ProcessMethod(static_cast<Vic::Method>(vic_thi_state.method_0), {data});
break;
default:
break;
}
break;
case ChClassId::Host1x:
// This device is mainly for syncpoint synchronization
LOG_DEBUG(Service_NVDRV, "Host1X Class Method");
host1x_processor->ProcessMethod(static_cast<Host1x::Method>(state_offset), {data});
break;
default:
UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class));
break;
}
}
void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset,
const std::vector<u32>& arguments) {
u8* const state_offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset;
std::memcpy(state_offset_ptr, arguments.data(), sizeof(u32) * arguments.size());
}
} // namespace Tegra

138
src/video_core/cdma_pusher.h Executable file
View File

@@ -0,0 +1,138 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include <queue>
#include "common/bit_field.h"
#include "common/common_types.h"
#include "video_core/command_classes/sync_manager.h"
namespace Tegra {
class GPU;
class Nvdec;
class Vic;
class Host1x;
enum class ChSubmissionMode : u32 {
SetClass = 0,
Incrementing = 1,
NonIncrementing = 2,
Mask = 3,
Immediate = 4,
Restart = 5,
Gather = 6,
};
enum class ChClassId : u32 {
NoClass = 0x0,
Host1x = 0x1,
VideoEncodeMpeg = 0x20,
VideoEncodeNvEnc = 0x21,
VideoStreamingVi = 0x30,
VideoStreamingIsp = 0x32,
VideoStreamingIspB = 0x34,
VideoStreamingViI2c = 0x36,
GraphicsVic = 0x5d,
Graphics3D = 0x60,
GraphicsGpu = 0x61,
Tsec = 0xe0,
TsecB = 0xe1,
NvJpg = 0xc0,
NvDec = 0xf0
};
enum class ChMethod : u32 {
Empty = 0,
SetMethod = 0x10,
SetData = 0x11,
};
union ChCommandHeader {
u32 raw;
BitField<0, 16, u32> value;
BitField<16, 12, ChMethod> method_offset;
BitField<28, 4, ChSubmissionMode> submission_mode;
};
static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size");
struct ChCommand {
ChClassId class_id{};
int method_offset{};
std::vector<u32> arguments;
};
using ChCommandHeaderList = std::vector<ChCommandHeader>;
using ChCommandList = std::vector<ChCommand>;
struct ThiRegisters {
u32_le increment_syncpt{};
INSERT_PADDING_WORDS(1);
u32_le increment_syncpt_error{};
u32_le ctx_switch_incremement_syncpt{};
INSERT_PADDING_WORDS(4);
u32_le ctx_switch{};
INSERT_PADDING_WORDS(1);
u32_le ctx_syncpt_eof{};
INSERT_PADDING_WORDS(5);
u32_le method_0{};
u32_le method_1{};
INSERT_PADDING_WORDS(12);
u32_le int_status{};
u32_le int_mask{};
};
enum class ThiMethod : u32 {
IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32),
SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32),
SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32),
};
class CDmaPusher {
public:
explicit CDmaPusher(GPU& gpu_);
~CDmaPusher();
/// Push NVDEC command buffer entries into queue
void Push(ChCommandHeaderList&& entries);
/// Process queued command buffer entries
void DispatchCalls();
/// Process one queue element
void Step();
/// Invoke command class devices to execute the command based on the current state
void ExecuteCommand(u32 state_offset, u32 data);
private:
/// Write arguments value to the ThiRegisters member at the specified offset
void ThiStateWrite(ThiRegisters& state, u32 state_offset, const std::vector<u32>& arguments);
GPU& gpu;
std::shared_ptr<Nvdec> nvdec_processor;
std::unique_ptr<Vic> vic_processor;
std::unique_ptr<Host1x> host1x_processor;
std::unique_ptr<SyncptIncrManager> nvdec_sync;
std::unique_ptr<SyncptIncrManager> vic_sync;
ChClassId current_class{};
ThiRegisters vic_thi_state{};
ThiRegisters nvdec_thi_state{};
s32 count{};
s32 offset{};
s32 mask{};
bool incrementing{};
// Queue of command lists to be processed
std::queue<ChCommandHeaderList> cdma_queue;
};
} // namespace Tegra

View File

@@ -0,0 +1,129 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring>
#include <fstream>
#include <vector>
#include "common/assert.h"
#include "video_core/command_classes/codecs/codec.h"
#include "video_core/command_classes/codecs/h264.h"
#include "video_core/command_classes/codecs/vp9.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
extern "C" {
#include <libavutil/opt.h>
}
namespace Tegra {
void AVFrameDeleter(AVFrame* ptr) {
av_frame_unref(ptr);
av_free(ptr);
}
Codec::Codec(GPU& gpu_)
: gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)),
vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {}
Codec::~Codec() {
if (!initialized) {
return;
}
// Free libav memory
AVFrame* av_frame{nullptr};
avcodec_send_packet(av_codec_ctx, nullptr);
av_frame = av_frame_alloc();
avcodec_receive_frame(av_codec_ctx, av_frame);
avcodec_flush_buffers(av_codec_ctx);
av_frame_unref(av_frame);
av_free(av_frame);
avcodec_close(av_codec_ctx);
}
void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", codec);
current_codec = codec;
}
void Codec::StateWrite(u32 offset, u64 arguments) {
u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64);
std::memcpy(state_offset, &arguments, sizeof(u64));
}
void Codec::Decode() {
bool is_first_frame = false;
if (!initialized) {
if (current_codec == NvdecCommon::VideoCodec::H264) {
av_codec = avcodec_find_decoder(AV_CODEC_ID_H264);
} else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9);
} else {
LOG_ERROR(Service_NVDRV, "Unknown video codec {}", current_codec);
return;
}
av_codec_ctx = avcodec_alloc_context3(av_codec);
av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
// TODO(ameerj): libavcodec gpu hw acceleration
const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);
if (av_error < 0) {
LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");
avcodec_close(av_codec_ctx);
return;
}
initialized = true;
is_first_frame = true;
}
bool vp9_hidden_frame = false;
AVPacket packet{};
av_init_packet(&packet);
std::vector<u8> frame_data;
if (current_codec == NvdecCommon::VideoCodec::H264) {
frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame);
} else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
frame_data = vp9_decoder->ComposeFrameHeader(state);
vp9_hidden_frame = vp9_decoder->WasFrameHidden();
}
packet.data = frame_data.data();
packet.size = static_cast<int>(frame_data.size());
avcodec_send_packet(av_codec_ctx, &packet);
if (!vp9_hidden_frame) {
// Only receive/store visible frames
AVFramePtr frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter};
avcodec_receive_frame(av_codec_ctx, frame.get());
av_frames.push(std::move(frame));
// Limit queue to 10 frames. Workaround for ZLA decode and queue spam
if (av_frames.size() > 10) {
av_frames.pop();
}
}
}
AVFramePtr Codec::GetCurrentFrame() {
// Sometimes VIC will request more frames than have been decoded.
// in this case, return a nullptr and don't overwrite previous frame data
if (av_frames.empty()) {
return AVFramePtr{nullptr, AVFrameDeleter};
}
AVFramePtr frame = std::move(av_frames.front());
av_frames.pop();
return frame;
}
NvdecCommon::VideoCodec Codec::GetCurrentCodec() const {
return current_codec;
}
} // namespace Tegra

View File

@@ -0,0 +1,70 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <queue>
#include "common/common_types.h"
#include "video_core/command_classes/nvdec_common.h"
extern "C" {
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif
#include <libavcodec/avcodec.h>
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
namespace Tegra {
class GPU;
struct VicRegisters;
void AVFrameDeleter(AVFrame* ptr);
using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>;
namespace Decoder {
class H264;
class VP9;
} // namespace Decoder
class Codec {
public:
explicit Codec(GPU& gpu);
~Codec();
/// Sets NVDEC video stream codec
void SetTargetCodec(NvdecCommon::VideoCodec codec);
/// Populate NvdecRegisters state with argument value at the provided offset
void StateWrite(u32 offset, u64 arguments);
/// Call decoders to construct headers, decode AVFrame with ffmpeg
void Decode();
/// Returns next decoded frame
[[nodiscard]] AVFramePtr GetCurrentFrame();
/// Returns the value of current_codec
[[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const;
private:
bool initialized{};
NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None};
AVCodec* av_codec{nullptr};
AVCodecContext* av_codec_ctx{nullptr};
GPU& gpu;
std::unique_ptr<Decoder::H264> h264_decoder;
std::unique_ptr<Decoder::VP9> vp9_decoder;
NvdecCommon::NvdecRegisters state{};
std::queue<AVFramePtr> av_frames{};
};
} // namespace Tegra

View File

@@ -0,0 +1,293 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include <array>
#include "common/bit_util.h"
#include "video_core/command_classes/codecs/h264.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra::Decoder {
namespace {
// ZigZag LUTs from libavcodec.
constexpr std::array<u8, 64> zig_zag_direct{
0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48,
41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23,
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
};
constexpr std::array<u8, 16> zig_zag_scan{
0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4,
1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4,
};
} // Anonymous namespace
H264::H264(GPU& gpu_) : gpu(gpu_) {}
H264::~H264() = default;
const std::vector<u8>& H264::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state,
bool is_first_frame) {
H264DecoderContext context{};
gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext));
const s32 frame_number = static_cast<s32>((context.h264_parameter_set.flags >> 46) & 0x1ffff);
if (!is_first_frame && frame_number != 0) {
frame.resize(context.frame_data_size);
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
} else {
/// Encode header
H264BitWriter writer{};
writer.WriteU(1, 24);
writer.WriteU(0, 1);
writer.WriteU(3, 2);
writer.WriteU(7, 5);
writer.WriteU(100, 8);
writer.WriteU(0, 8);
writer.WriteU(31, 8);
writer.WriteUe(0);
const auto chroma_format_idc =
static_cast<u32>((context.h264_parameter_set.flags >> 12) & 3);
writer.WriteUe(chroma_format_idc);
if (chroma_format_idc == 3) {
writer.WriteBit(false);
}
writer.WriteUe(0);
writer.WriteUe(0);
writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag
writer.WriteBit(false); // Scaling matrix present flag
const auto order_cnt_type = static_cast<u32>((context.h264_parameter_set.flags >> 14) & 3);
writer.WriteUe(static_cast<u32>((context.h264_parameter_set.flags >> 8) & 0xf));
writer.WriteUe(order_cnt_type);
if (order_cnt_type == 0) {
writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt);
} else if (order_cnt_type == 1) {
writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0);
writer.WriteSe(0);
writer.WriteSe(0);
writer.WriteUe(0);
}
const s32 pic_height = context.h264_parameter_set.pic_height_in_map_units /
(context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
writer.WriteUe(16);
writer.WriteBit(false);
writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1);
writer.WriteUe(pic_height - 1);
writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0);
if (!context.h264_parameter_set.frame_mbs_only_flag) {
writer.WriteBit(((context.h264_parameter_set.flags >> 0) & 1) != 0);
}
writer.WriteBit(((context.h264_parameter_set.flags >> 1) & 1) != 0);
writer.WriteBit(false); // Frame cropping flag
writer.WriteBit(false); // VUI parameter present flag
writer.End();
// H264 PPS
writer.WriteU(1, 24);
writer.WriteU(0, 1);
writer.WriteU(3, 2);
writer.WriteU(8, 5);
writer.WriteUe(0);
writer.WriteUe(0);
writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0);
writer.WriteBit(false);
writer.WriteUe(0);
writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active);
writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active);
writer.WriteBit(((context.h264_parameter_set.flags >> 2) & 1) != 0);
writer.WriteU(static_cast<s32>((context.h264_parameter_set.flags >> 32) & 0x3), 2);
s32 pic_init_qp = static_cast<s32>((context.h264_parameter_set.flags >> 16) & 0x3f);
pic_init_qp = (pic_init_qp << 26) >> 26;
writer.WriteSe(pic_init_qp);
writer.WriteSe(0);
s32 chroma_qp_index_offset =
static_cast<s32>((context.h264_parameter_set.flags >> 22) & 0x1f);
chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27;
writer.WriteSe(chroma_qp_index_offset);
writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_flag != 0);
writer.WriteBit(((context.h264_parameter_set.flags >> 3) & 1) != 0);
writer.WriteBit(context.h264_parameter_set.redundant_pic_count_flag != 0);
writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0);
writer.WriteBit(true);
for (s32 index = 0; index < 6; index++) {
writer.WriteBit(true);
const auto matrix_x4 =
std::vector<u8>(context.scaling_matrix_4.begin(), context.scaling_matrix_4.end());
writer.WriteScalingList(matrix_x4, index * 16, 16);
}
if (context.h264_parameter_set.transform_8x8_mode_flag) {
for (s32 index = 0; index < 2; index++) {
writer.WriteBit(true);
const auto matrix_x8 = std::vector<u8>(context.scaling_matrix_8.begin(),
context.scaling_matrix_8.end());
writer.WriteScalingList(matrix_x8, index * 64, 64);
}
}
s32 chroma_qp_index_offset2 =
static_cast<s32>((context.h264_parameter_set.flags >> 27) & 0x1f);
chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27;
writer.WriteSe(chroma_qp_index_offset2);
writer.End();
const auto& encoded_header = writer.GetByteArray();
frame.resize(encoded_header.size() + context.frame_data_size);
std::memcpy(frame.data(), encoded_header.data(), encoded_header.size());
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset,
frame.data() + encoded_header.size(),
context.frame_data_size);
}
return frame;
}
H264BitWriter::H264BitWriter() = default;
H264BitWriter::~H264BitWriter() = default;
void H264BitWriter::WriteU(s32 value, s32 value_sz) {
WriteBits(value, value_sz);
}
void H264BitWriter::WriteSe(s32 value) {
WriteExpGolombCodedInt(value);
}
void H264BitWriter::WriteUe(u32 value) {
WriteExpGolombCodedUInt(value);
}
void H264BitWriter::End() {
WriteBit(true);
Flush();
}
void H264BitWriter::WriteBit(bool state) {
WriteBits(state ? 1 : 0, 1);
}
void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) {
std::vector<u8> scan(count);
if (count == 16) {
std::memcpy(scan.data(), zig_zag_scan.data(), scan.size());
} else {
std::memcpy(scan.data(), zig_zag_direct.data(), scan.size());
}
u8 last_scale = 8;
for (s32 index = 0; index < count; index++) {
const u8 value = list[start + scan[index]];
const s32 delta_scale = static_cast<s32>(value - last_scale);
WriteSe(delta_scale);
last_scale = value;
}
}
std::vector<u8>& H264BitWriter::GetByteArray() {
return byte_array;
}
const std::vector<u8>& H264BitWriter::GetByteArray() const {
return byte_array;
}
void H264BitWriter::WriteBits(s32 value, s32 bit_count) {
s32 value_pos = 0;
s32 remaining = bit_count;
while (remaining > 0) {
s32 copy_size = remaining;
const s32 free_bits = GetFreeBufferBits();
if (copy_size > free_bits) {
copy_size = free_bits;
}
const s32 mask = (1 << copy_size) - 1;
const s32 src_shift = (bit_count - value_pos) - copy_size;
const s32 dst_shift = (buffer_size - buffer_pos) - copy_size;
buffer |= ((value >> src_shift) & mask) << dst_shift;
value_pos += copy_size;
buffer_pos += copy_size;
remaining -= copy_size;
}
}
void H264BitWriter::WriteExpGolombCodedInt(s32 value) {
const s32 sign = value <= 0 ? 0 : 1;
if (value < 0) {
value = -value;
}
value = (value << 1) - sign;
WriteExpGolombCodedUInt(value);
}
void H264BitWriter::WriteExpGolombCodedUInt(u32 value) {
const s32 size = 32 - Common::CountLeadingZeroes32(static_cast<s32>(value + 1));
WriteBits(1, size);
value -= (1U << (size - 1)) - 1;
WriteBits(static_cast<s32>(value), size - 1);
}
s32 H264BitWriter::GetFreeBufferBits() {
if (buffer_pos == buffer_size) {
Flush();
}
return buffer_size - buffer_pos;
}
void H264BitWriter::Flush() {
if (buffer_pos == 0) {
return;
}
byte_array.push_back(static_cast<u8>(buffer));
buffer = 0;
buffer_pos = 0;
}
} // namespace Tegra::Decoder

View File

@@ -0,0 +1,118 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#pragma once
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/command_classes/nvdec_common.h"
namespace Tegra {
class GPU;
namespace Decoder {
class H264BitWriter {
public:
H264BitWriter();
~H264BitWriter();
/// The following Write methods are based on clause 9.1 in the H.264 specification.
/// WriteSe and WriteUe write in the Exp-Golomb-coded syntax
void WriteU(s32 value, s32 value_sz);
void WriteSe(s32 value);
void WriteUe(u32 value);
/// Finalize the bitstream
void End();
/// append a bit to the stream, equivalent value to the state parameter
void WriteBit(bool state);
/// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification
/// Writes the scaling matrices of the sream
void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count);
/// Return the bitstream as a vector.
[[nodiscard]] std::vector<u8>& GetByteArray();
[[nodiscard]] const std::vector<u8>& GetByteArray() const;
private:
void WriteBits(s32 value, s32 bit_count);
void WriteExpGolombCodedInt(s32 value);
void WriteExpGolombCodedUInt(u32 value);
[[nodiscard]] s32 GetFreeBufferBits();
void Flush();
s32 buffer_size{8};
s32 buffer{};
s32 buffer_pos{};
std::vector<u8> byte_array;
};
class H264 {
public:
explicit H264(GPU& gpu);
~H264();
/// Compose the H264 header of the frame for FFmpeg decoding
[[nodiscard]] const std::vector<u8>& ComposeFrameHeader(
const NvdecCommon::NvdecRegisters& state, bool is_first_frame = false);
private:
struct H264ParameterSet {
u32 log2_max_pic_order_cnt{};
u32 delta_pic_order_always_zero_flag{};
u32 frame_mbs_only_flag{};
u32 pic_width_in_mbs{};
u32 pic_height_in_map_units{};
INSERT_PADDING_WORDS(1);
u32 entropy_coding_mode_flag{};
u32 bottom_field_pic_order_flag{};
u32 num_refidx_l0_default_active{};
u32 num_refidx_l1_default_active{};
u32 deblocking_filter_control_flag{};
u32 redundant_pic_count_flag{};
u32 transform_8x8_mode_flag{};
INSERT_PADDING_WORDS(9);
u64 flags{};
u32 frame_number{};
u32 frame_number2{};
};
static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size");
struct H264DecoderContext {
INSERT_PADDING_BYTES(0x48);
u32 frame_data_size{};
INSERT_PADDING_BYTES(0xc);
H264ParameterSet h264_parameter_set{};
INSERT_PADDING_BYTES(0x100);
std::array<u8, 0x60> scaling_matrix_4;
std::array<u8, 0x80> scaling_matrix_8;
};
static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size");
std::vector<u8> frame;
GPU& gpu;
};
} // namespace Decoder
} // namespace Tegra

View File

@@ -0,0 +1,989 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring> // for std::memcpy
#include <numeric>
#include "video_core/command_classes/codecs/vp9.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra::Decoder {
namespace {
// Default compressed header probabilities once frame context resets
constexpr Vp9EntropyProbs default_probs{
.y_mode_prob{
65, 32, 18, 144, 162, 194, 41, 51, 98, 132, 68, 18, 165, 217, 196, 45, 40, 78,
173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29,
},
.partition_prob{
199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0,
174, 73, 87, 0, 92, 41, 83, 0, 82, 99, 50, 0, 53, 39, 39, 0,
177, 58, 59, 0, 68, 26, 63, 0, 52, 79, 25, 0, 17, 14, 12, 0,
222, 34, 30, 0, 72, 16, 44, 0, 58, 32, 12, 0, 10, 7, 6, 0,
},
.coef_probs{
195, 29, 183, 84, 49, 136, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0,
31, 107, 169, 35, 99, 159, 17, 82, 140, 8, 66, 114, 2, 44, 76, 1, 19, 32,
40, 132, 201, 29, 114, 187, 13, 91, 157, 7, 75, 127, 3, 58, 95, 1, 28, 47,
69, 142, 221, 42, 122, 201, 15, 91, 159, 6, 67, 121, 1, 42, 77, 1, 17, 31,
102, 148, 228, 67, 117, 204, 17, 82, 154, 6, 59, 114, 2, 39, 75, 1, 15, 29,
156, 57, 233, 119, 57, 212, 58, 48, 163, 29, 40, 124, 12, 30, 81, 3, 12, 31,
191, 107, 226, 124, 117, 204, 25, 99, 155, 0, 0, 0, 0, 0, 0, 0, 0, 0,
29, 148, 210, 37, 126, 194, 8, 93, 157, 2, 68, 118, 1, 39, 69, 1, 17, 33,
41, 151, 213, 27, 123, 193, 3, 82, 144, 1, 58, 105, 1, 32, 60, 1, 13, 26,
59, 159, 220, 23, 126, 198, 4, 88, 151, 1, 66, 114, 1, 38, 71, 1, 18, 34,
114, 136, 232, 51, 114, 207, 11, 83, 155, 3, 56, 105, 1, 33, 65, 1, 17, 34,
149, 65, 234, 121, 57, 215, 61, 49, 166, 28, 36, 114, 12, 25, 76, 3, 16, 42,
214, 49, 220, 132, 63, 188, 42, 65, 137, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85, 137, 221, 104, 131, 216, 49, 111, 192, 21, 87, 155, 2, 49, 87, 1, 16, 28,
89, 163, 230, 90, 137, 220, 29, 100, 183, 10, 70, 135, 2, 42, 81, 1, 17, 33,
108, 167, 237, 55, 133, 222, 15, 97, 179, 4, 72, 135, 1, 45, 85, 1, 19, 38,
124, 146, 240, 66, 124, 224, 17, 88, 175, 4, 58, 122, 1, 36, 75, 1, 18, 37,
141, 79, 241, 126, 70, 227, 66, 58, 182, 30, 44, 136, 12, 34, 96, 2, 20, 47,
229, 99, 249, 143, 111, 235, 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0,
82, 158, 236, 94, 146, 224, 25, 117, 191, 9, 87, 149, 3, 56, 99, 1, 33, 57,
83, 167, 237, 68, 145, 222, 10, 103, 177, 2, 72, 131, 1, 41, 79, 1, 20, 39,
99, 167, 239, 47, 141, 224, 10, 104, 178, 2, 73, 133, 1, 44, 85, 1, 22, 47,
127, 145, 243, 71, 129, 228, 17, 93, 177, 3, 61, 124, 1, 41, 84, 1, 21, 52,
157, 78, 244, 140, 72, 231, 69, 58, 184, 31, 44, 137, 14, 38, 105, 8, 23, 61,
125, 34, 187, 52, 41, 133, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0,
37, 109, 153, 51, 102, 147, 23, 87, 128, 8, 67, 101, 1, 41, 63, 1, 19, 29,
31, 154, 185, 17, 127, 175, 6, 96, 145, 2, 73, 114, 1, 51, 82, 1, 28, 45,
23, 163, 200, 10, 131, 185, 2, 93, 148, 1, 67, 111, 1, 41, 69, 1, 14, 24,
29, 176, 217, 12, 145, 201, 3, 101, 156, 1, 69, 111, 1, 39, 63, 1, 14, 23,
57, 192, 233, 25, 154, 215, 6, 109, 167, 3, 78, 118, 1, 48, 69, 1, 21, 29,
202, 105, 245, 108, 106, 216, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0,
33, 172, 219, 64, 149, 206, 14, 117, 177, 5, 90, 141, 2, 61, 95, 1, 37, 57,
33, 179, 220, 11, 140, 198, 1, 89, 148, 1, 60, 104, 1, 33, 57, 1, 12, 21,
30, 181, 221, 8, 141, 198, 1, 87, 145, 1, 58, 100, 1, 31, 55, 1, 12, 20,
32, 186, 224, 7, 142, 198, 1, 86, 143, 1, 58, 100, 1, 31, 55, 1, 12, 22,
57, 192, 227, 20, 143, 204, 3, 96, 154, 1, 68, 112, 1, 42, 69, 1, 19, 32,
212, 35, 215, 113, 47, 169, 29, 48, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0,
74, 129, 203, 106, 120, 203, 49, 107, 178, 19, 84, 144, 4, 50, 84, 1, 15, 25,
71, 172, 217, 44, 141, 209, 15, 102, 173, 6, 76, 133, 2, 51, 89, 1, 24, 42,
64, 185, 231, 31, 148, 216, 8, 103, 175, 3, 74, 131, 1, 46, 81, 1, 18, 30,
65, 196, 235, 25, 157, 221, 5, 105, 174, 1, 67, 120, 1, 38, 69, 1, 15, 30,
65, 204, 238, 30, 156, 224, 7, 107, 177, 2, 70, 124, 1, 42, 73, 1, 18, 34,
225, 86, 251, 144, 104, 235, 42, 99, 181, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85, 175, 239, 112, 165, 229, 29, 136, 200, 12, 103, 162, 6, 77, 123, 2, 53, 84,
75, 183, 239, 30, 155, 221, 3, 106, 171, 1, 74, 128, 1, 44, 76, 1, 17, 28,
73, 185, 240, 27, 159, 222, 2, 107, 172, 1, 75, 127, 1, 42, 73, 1, 17, 29,
62, 190, 238, 21, 159, 222, 2, 107, 172, 1, 72, 122, 1, 40, 71, 1, 18, 32,
61, 199, 240, 27, 161, 226, 4, 113, 180, 1, 76, 129, 1, 46, 80, 1, 23, 41,
7, 27, 153, 5, 30, 95, 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0,
50, 75, 127, 57, 75, 124, 27, 67, 108, 10, 54, 86, 1, 33, 52, 1, 12, 18,
43, 125, 151, 26, 108, 148, 7, 83, 122, 2, 59, 89, 1, 38, 60, 1, 17, 27,
23, 144, 163, 13, 112, 154, 2, 75, 117, 1, 50, 81, 1, 31, 51, 1, 14, 23,
18, 162, 185, 6, 123, 171, 1, 78, 125, 1, 51, 86, 1, 31, 54, 1, 14, 23,
15, 199, 227, 3, 150, 204, 1, 91, 146, 1, 55, 95, 1, 30, 53, 1, 11, 20,
19, 55, 240, 19, 59, 196, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0,
41, 166, 207, 104, 153, 199, 31, 123, 181, 14, 101, 152, 5, 72, 106, 1, 36, 52,
35, 176, 211, 12, 131, 190, 2, 88, 144, 1, 60, 101, 1, 36, 60, 1, 16, 28,
28, 183, 213, 8, 134, 191, 1, 86, 142, 1, 56, 96, 1, 30, 53, 1, 12, 20,
20, 190, 215, 4, 135, 192, 1, 84, 139, 1, 53, 91, 1, 28, 49, 1, 11, 20,
13, 196, 216, 2, 137, 192, 1, 86, 143, 1, 57, 99, 1, 32, 56, 1, 13, 24,
211, 29, 217, 96, 47, 156, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78, 120, 193, 111, 116, 186, 46, 102, 164, 15, 80, 128, 2, 49, 76, 1, 18, 28,
71, 161, 203, 42, 132, 192, 10, 98, 150, 3, 69, 109, 1, 44, 70, 1, 18, 29,
57, 186, 211, 30, 140, 196, 4, 93, 146, 1, 62, 102, 1, 38, 65, 1, 16, 27,
47, 199, 217, 14, 145, 196, 1, 88, 142, 1, 57, 98, 1, 36, 62, 1, 15, 26,
26, 219, 229, 5, 155, 207, 1, 94, 151, 1, 60, 104, 1, 36, 62, 1, 16, 28,
233, 29, 248, 146, 47, 220, 43, 52, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100, 163, 232, 179, 161, 222, 63, 142, 204, 37, 113, 174, 26, 89, 137, 18, 68, 97,
85, 181, 230, 32, 146, 209, 7, 100, 164, 3, 71, 121, 1, 45, 77, 1, 18, 30,
65, 187, 230, 20, 148, 207, 2, 97, 159, 1, 68, 116, 1, 40, 70, 1, 14, 29,
40, 194, 227, 8, 147, 204, 1, 94, 155, 1, 65, 112, 1, 39, 66, 1, 14, 26,
16, 208, 228, 3, 151, 207, 1, 98, 160, 1, 67, 117, 1, 41, 74, 1, 17, 31,
17, 38, 140, 7, 34, 80, 1, 17, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0,
37, 75, 128, 41, 76, 128, 26, 66, 116, 12, 52, 94, 2, 32, 55, 1, 10, 16,
50, 127, 154, 37, 109, 152, 16, 82, 121, 5, 59, 85, 1, 35, 54, 1, 13, 20,
40, 142, 167, 17, 110, 157, 2, 71, 112, 1, 44, 72, 1, 27, 45, 1, 11, 17,
30, 175, 188, 9, 124, 169, 1, 74, 116, 1, 48, 78, 1, 30, 49, 1, 11, 18,
10, 222, 223, 2, 150, 194, 1, 83, 128, 1, 48, 79, 1, 27, 45, 1, 11, 17,
36, 41, 235, 29, 36, 193, 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85, 165, 222, 177, 162, 215, 110, 135, 195, 57, 113, 168, 23, 83, 120, 10, 49, 61,
85, 190, 223, 36, 139, 200, 5, 90, 146, 1, 60, 103, 1, 38, 65, 1, 18, 30,
72, 202, 223, 23, 141, 199, 2, 86, 140, 1, 56, 97, 1, 36, 61, 1, 16, 27,
55, 218, 225, 13, 145, 200, 1, 86, 141, 1, 57, 99, 1, 35, 61, 1, 13, 22,
15, 235, 212, 1, 132, 184, 1, 84, 139, 1, 57, 97, 1, 34, 56, 1, 14, 23,
181, 21, 201, 61, 37, 123, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0,
47, 106, 172, 95, 104, 173, 42, 93, 159, 18, 77, 131, 4, 50, 81, 1, 17, 23,
62, 147, 199, 44, 130, 189, 28, 102, 154, 18, 75, 115, 2, 44, 65, 1, 12, 19,
55, 153, 210, 24, 130, 194, 3, 93, 146, 1, 61, 97, 1, 31, 50, 1, 10, 16,
49, 186, 223, 17, 148, 204, 1, 96, 142, 1, 53, 83, 1, 26, 44, 1, 11, 17,
13, 217, 212, 2, 136, 180, 1, 78, 124, 1, 50, 83, 1, 29, 49, 1, 14, 23,
197, 13, 247, 82, 17, 222, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0,
126, 186, 247, 234, 191, 243, 176, 177, 234, 104, 158, 220, 66, 128, 186, 55, 90, 137,
111, 197, 242, 46, 158, 219, 9, 104, 171, 2, 65, 125, 1, 44, 80, 1, 17, 91,
104, 208, 245, 39, 168, 224, 3, 109, 162, 1, 79, 124, 1, 50, 102, 1, 43, 102,
84, 220, 246, 31, 177, 231, 2, 115, 180, 1, 79, 134, 1, 55, 77, 1, 60, 79,
43, 243, 240, 8, 180, 217, 1, 115, 166, 1, 84, 121, 1, 51, 67, 1, 16, 6,
},
.switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144},
.inter_mode_prob{
2, 173, 34, 0, 7, 145, 85, 0, 7, 166, 63, 0, 7, 94,
66, 0, 8, 64, 46, 0, 17, 81, 31, 0, 25, 29, 30, 0,
},
.intra_inter_prob{9, 102, 187, 225},
.comp_inter_prob{9, 102, 187, 225, 0},
.single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247},
.comp_ref_prob{50, 126, 123, 221, 226},
.tx_32x32_prob{3, 136, 37, 5, 52, 13},
.tx_16x16_prob{20, 152, 15, 101},
.tx_8x8_prob{100, 66},
.skip_probs{192, 128, 64},
.joints{32, 64, 96},
.sign{128, 128},
.classes{
224, 144, 192, 168, 192, 176, 192, 198, 198, 245,
216, 128, 176, 160, 176, 176, 192, 198, 198, 208,
},
.class_0{216, 208},
.prob_bits{
136, 140, 148, 160, 176, 192, 224, 234, 234, 240,
136, 140, 148, 160, 176, 192, 224, 234, 234, 240,
},
.class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64},
.fr{64, 96, 64, 64, 96, 64},
.class_0_hp{160, 160},
.high_precision{128, 128},
};
constexpr std::array<s32, 256> norm_lut{
0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
constexpr std::array<s32, 254> map_lut{
20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, 89,
90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 7, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124,
125, 126, 127, 128, 129, 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142,
143, 144, 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176, 177,
178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14, 194,
195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212,
213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17,
230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18, 242, 243, 244, 245, 246, 247,
248, 249, 250, 251, 252, 253, 19,
};
// 6.2.14 Tile size calculation
[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) {
const s32 sb64_cols = (frame_width + 63) / 64;
s32 min_log2 = 0;
while ((64 << min_log2) < sb64_cols) {
min_log2++;
}
return min_log2;
}
[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) {
const s32 sb64_cols = (frame_width + 63) / 64;
s32 max_log2 = 1;
while ((sb64_cols >> max_log2) >= 4) {
max_log2++;
}
return max_log2 - 1;
}
// Recenters probability. Based on section 6.3.6 of VP9 Specification
[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) {
if (new_prob > old_prob * 2) {
return new_prob;
}
if (new_prob >= old_prob) {
return (new_prob - old_prob) * 2;
}
return (old_prob - new_prob) * 2 - 1;
}
// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification
[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) {
new_prob--;
old_prob--;
std::size_t index{};
if (old_prob * 2 <= 0xff) {
index = static_cast<std::size_t>(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1));
} else {
index = static_cast<std::size_t>(
std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1));
}
return map_lut[index];
}
} // Anonymous namespace
VP9::VP9(GPU& gpu_) : gpu{gpu_} {}
VP9::~VP9() = default;
void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
const bool update = new_prob != old_prob;
writer.Write(update, diff_update_probability);
if (update) {
WriteProbabilityDelta(writer, new_prob, old_prob);
}
}
template <typename T, std::size_t N>
void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob) {
for (std::size_t offset = 0; offset < new_prob.size(); ++offset) {
WriteProbabilityUpdate(writer, new_prob[offset], old_prob[offset]);
}
}
template <typename T, std::size_t N>
void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob) {
for (std::size_t offset = 0; offset < new_prob.size(); offset += 4) {
WriteProbabilityUpdate(writer, new_prob[offset + 0], old_prob[offset + 0]);
WriteProbabilityUpdate(writer, new_prob[offset + 1], old_prob[offset + 1]);
WriteProbabilityUpdate(writer, new_prob[offset + 2], old_prob[offset + 2]);
}
}
void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
const int delta = RemapProbability(new_prob, old_prob);
EncodeTermSubExp(writer, delta);
}
void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) {
if (WriteLessThan(writer, value, 16)) {
writer.Write(value, 4);
} else if (WriteLessThan(writer, value, 32)) {
writer.Write(value - 16, 4);
} else if (WriteLessThan(writer, value, 64)) {
writer.Write(value - 32, 5);
} else {
value -= 64;
constexpr s32 size = 8;
const s32 mask = (1 << size) - 191;
const s32 delta = value - mask;
if (delta < 0) {
writer.Write(value, size - 1);
} else {
writer.Write(delta / 2 + mask, size - 1);
writer.Write(delta & 1, 1);
}
}
}
bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) {
const bool is_lt = value < test;
writer.Write(!is_lt);
return is_lt;
}
void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
const std::array<u8, 1728>& new_prob,
const std::array<u8, 1728>& old_prob) {
constexpr u32 block_bytes = 2 * 2 * 6 * 6 * 3;
const auto needs_update = [&](u32 base_index) {
return !std::equal(new_prob.begin() + base_index,
new_prob.begin() + base_index + block_bytes,
old_prob.begin() + base_index);
};
for (u32 block_index = 0; block_index < 4; block_index++) {
const u32 base_index = block_index * block_bytes;
const bool update = needs_update(base_index);
writer.Write(update);
if (update) {
u32 index = base_index;
for (s32 i = 0; i < 2; i++) {
for (s32 j = 0; j < 2; j++) {
for (s32 k = 0; k < 6; k++) {
for (s32 l = 0; l < 6; l++) {
if (k != 0 || l < 3) {
WriteProbabilityUpdate(writer, new_prob[index + 0],
old_prob[index + 0]);
WriteProbabilityUpdate(writer, new_prob[index + 1],
old_prob[index + 1]);
WriteProbabilityUpdate(writer, new_prob[index + 2],
old_prob[index + 2]);
}
index += 3;
}
}
}
}
}
if (block_index == static_cast<u32>(tx_mode)) {
break;
}
}
}
void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
const bool update = new_prob != old_prob;
writer.Write(update, diff_update_probability);
if (update) {
writer.Write(new_prob >> 1, 7);
}
}
Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) {
PictureInfo picture_info{};
gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo));
Vp9PictureInfo vp9_info = picture_info.Convert();
InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy);
// surface_luma_offset[0:3] contains the address of the reference frame offsets in the following
// order: last, golden, altref, current. It may be worthwhile to track the updates done here
// to avoid buffering frame data needed for reference frame updating in the header composition.
std::memcpy(vp9_info.frame_offsets.data(), state.surface_luma_offset.data(), 4 * sizeof(u64));
return vp9_info;
}
void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) {
EntropyProbs entropy{};
gpu.MemoryManager().ReadBlock(offset, &entropy, sizeof(EntropyProbs));
entropy.Convert(dst);
}
Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) {
Vp9FrameContainer current_frame{};
{
gpu.SyncGuestHost();
current_frame.info = GetVp9PictureInfo(state);
current_frame.bit_stream.resize(current_frame.info.bitstream_size);
gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(),
current_frame.info.bitstream_size);
}
// Buffer two frames, saving the last show frame info
if (!next_next_frame.bit_stream.empty()) {
Vp9FrameContainer temp{
.info = current_frame.info,
.bit_stream = std::move(current_frame.bit_stream),
};
next_next_frame.info.show_frame = current_frame.info.last_frame_shown;
current_frame.info = next_next_frame.info;
current_frame.bit_stream = std::move(next_next_frame.bit_stream);
next_next_frame = std::move(temp);
if (!next_frame.bit_stream.empty()) {
Vp9FrameContainer temp2{
.info = current_frame.info,
.bit_stream = std::move(current_frame.bit_stream),
};
next_frame.info.show_frame = current_frame.info.last_frame_shown;
current_frame.info = next_frame.info;
current_frame.bit_stream = std::move(next_frame.bit_stream);
next_frame = std::move(temp2);
} else {
next_frame.info = current_frame.info;
next_frame.bit_stream = std::move(current_frame.bit_stream);
}
} else {
next_next_frame.info = current_frame.info;
next_next_frame.bit_stream = std::move(current_frame.bit_stream);
}
return current_frame;
}
std::vector<u8> VP9::ComposeCompressedHeader() {
VpxRangeEncoder writer{};
const bool update_probs = current_frame_info.show_frame && !current_frame_info.is_key_frame;
if (!current_frame_info.lossless) {
if (static_cast<u32>(current_frame_info.transform_mode) >= 3) {
writer.Write(3, 2);
writer.Write(current_frame_info.transform_mode == 4);
} else {
writer.Write(current_frame_info.transform_mode, 2);
}
}
if (current_frame_info.transform_mode == 4) {
// tx_mode_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob,
prev_frame_probs.tx_8x8_prob);
WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob,
prev_frame_probs.tx_16x16_prob);
WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob,
prev_frame_probs.tx_32x32_prob);
if (update_probs) {
prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob;
prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob;
prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob;
}
}
// read_coef_probs() in the spec
WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode,
current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs);
// read_skip_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs,
prev_frame_probs.skip_probs);
if (update_probs) {
prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs;
prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs;
}
if (!current_frame_info.intra_only) {
// read_inter_probs() in the spec
WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob,
prev_frame_probs.inter_mode_prob);
if (current_frame_info.interp_filter == 4) {
// read_interp_filter_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob,
prev_frame_probs.switchable_interp_prob);
if (update_probs) {
prev_frame_probs.switchable_interp_prob =
current_frame_info.entropy.switchable_interp_prob;
}
}
// read_is_inter_probs() in the spec
WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob,
prev_frame_probs.intra_inter_prob);
// frame_reference_mode() in the spec
if ((current_frame_info.ref_frame_sign_bias[1] & 1) !=
(current_frame_info.ref_frame_sign_bias[2] & 1) ||
(current_frame_info.ref_frame_sign_bias[1] & 1) !=
(current_frame_info.ref_frame_sign_bias[3] & 1)) {
if (current_frame_info.reference_mode >= 1) {
writer.Write(1, 1);
writer.Write(current_frame_info.reference_mode == 2);
} else {
writer.Write(0, 1);
}
}
// frame_reference_mode_probs() in the spec
if (current_frame_info.reference_mode == 2) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob,
prev_frame_probs.comp_inter_prob);
if (update_probs) {
prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob;
}
}
if (current_frame_info.reference_mode != 1) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob,
prev_frame_probs.single_ref_prob);
if (update_probs) {
prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob;
}
}
if (current_frame_info.reference_mode != 0) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob,
prev_frame_probs.comp_ref_prob);
if (update_probs) {
prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob;
}
}
// read_y_mode_probs
for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size();
++index) {
WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index],
prev_frame_probs.y_mode_prob[index]);
}
// read_partition_probs
WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob,
prev_frame_probs.partition_prob);
// mv_probs
for (s32 i = 0; i < 3; i++) {
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i],
prev_frame_probs.joints[i]);
}
if (update_probs) {
prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob;
prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob;
prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob;
prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob;
prev_frame_probs.joints = current_frame_info.entropy.joints;
}
for (s32 i = 0; i < 2; i++) {
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i],
prev_frame_probs.sign[i]);
for (s32 j = 0; j < 10; j++) {
const int index = i * 10 + j;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index],
prev_frame_probs.classes[index]);
}
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i],
prev_frame_probs.class_0[i]);
for (s32 j = 0; j < 10; j++) {
const int index = i * 10 + j;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index],
prev_frame_probs.prob_bits[index]);
}
}
for (s32 i = 0; i < 2; i++) {
for (s32 j = 0; j < 2; j++) {
for (s32 k = 0; k < 3; k++) {
const int index = i * 2 * 3 + j * 3 + k;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index],
prev_frame_probs.class_0_fr[index]);
}
}
for (s32 j = 0; j < 3; j++) {
const int index = i * 3 + j;
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index],
prev_frame_probs.fr[index]);
}
}
if (current_frame_info.allow_high_precision_mv) {
for (s32 index = 0; index < 2; index++) {
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index],
prev_frame_probs.class_0_hp[index]);
WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index],
prev_frame_probs.high_precision[index]);
}
}
// save previous probs
if (update_probs) {
prev_frame_probs.sign = current_frame_info.entropy.sign;
prev_frame_probs.classes = current_frame_info.entropy.classes;
prev_frame_probs.class_0 = current_frame_info.entropy.class_0;
prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits;
prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr;
prev_frame_probs.fr = current_frame_info.entropy.fr;
prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp;
prev_frame_probs.high_precision = current_frame_info.entropy.high_precision;
}
}
writer.End();
return writer.GetBuffer();
}
VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
VpxBitStreamWriter uncomp_writer{};
uncomp_writer.WriteU(2, 2); // Frame marker.
uncomp_writer.WriteU(0, 2); // Profile.
uncomp_writer.WriteBit(false); // Show existing frame.
uncomp_writer.WriteBit(!current_frame_info.is_key_frame); // is key frame?
uncomp_writer.WriteBit(current_frame_info.show_frame); // show frame?
uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // error reslience
if (current_frame_info.is_key_frame) {
uncomp_writer.WriteU(frame_sync_code, 24);
uncomp_writer.WriteU(0, 3); // Color space.
uncomp_writer.WriteU(0, 1); // Color range.
uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
uncomp_writer.WriteBit(false); // Render and frame size different.
// Reset context
prev_frame_probs = default_probs;
swap_next_golden = false;
loop_filter_ref_deltas.fill(0);
loop_filter_mode_deltas.fill(0);
// allow frames offsets to stabilize before checking for golden frames
grace_period = 4;
// On key frames, all frame slots are set to the current frame,
// so the value of the selected slot doesn't really matter.
frame_ctxs.fill({current_frame_number, false, default_probs});
// intra only, meaning the frame can be recreated with no other references
current_frame_info.intra_only = true;
} else {
if (!current_frame_info.show_frame) {
uncomp_writer.WriteBit(current_frame_info.intra_only);
if (!current_frame_info.last_frame_was_key) {
swap_next_golden = !swap_next_golden;
}
} else {
current_frame_info.intra_only = false;
}
if (!current_frame_info.error_resilient_mode) {
uncomp_writer.WriteU(0, 2); // Reset frame context.
}
// Last, Golden, Altref frames
std::array<s32, 3> ref_frame_index{0, 1, 2};
// Set when next frame is hidden
// altref and golden references are swapped
if (swap_next_golden) {
ref_frame_index = std::array<s32, 3>{0, 2, 1};
}
// update Last Frame
u64 refresh_frame_flags = 1;
// golden frame may refresh, determined if the next golden frame offset is changed
bool golden_refresh = false;
if (grace_period <= 0) {
for (s32 index = 1; index < 3; ++index) {
if (current_frame_info.frame_offsets[index] !=
next_frame.info.frame_offsets[index]) {
current_frame_info.refresh_frame[index] = true;
golden_refresh = true;
grace_period = 3;
}
}
}
if (current_frame_info.show_frame &&
(!next_frame.info.show_frame || next_frame.info.is_key_frame)) {
// Update golden frame
refresh_frame_flags = swap_next_golden ? 2 : 4;
}
if (!current_frame_info.show_frame) {
// Update altref
refresh_frame_flags = swap_next_golden ? 2 : 4;
} else if (golden_refresh) {
refresh_frame_flags = 3;
}
if (current_frame_info.intra_only) {
uncomp_writer.WriteU(frame_sync_code, 24);
uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
uncomp_writer.WriteBit(false); // Render and frame size different.
} else {
uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
for (s32 index = 1; index < 4; index++) {
uncomp_writer.WriteU(ref_frame_index[index - 1], 3);
uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1);
}
uncomp_writer.WriteBit(true); // Frame size with refs.
uncomp_writer.WriteBit(false); // Render and frame size different.
uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv);
uncomp_writer.WriteBit(current_frame_info.interp_filter == 4);
if (current_frame_info.interp_filter != 4) {
uncomp_writer.WriteU(current_frame_info.interp_filter, 2);
}
}
}
if (!current_frame_info.error_resilient_mode) {
uncomp_writer.WriteBit(true); // Refresh frame context. where do i get this info from?
uncomp_writer.WriteBit(true); // Frame parallel decoding mode.
}
int frame_ctx_idx = 0;
if (!current_frame_info.show_frame) {
frame_ctx_idx = 1;
}
uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index.
prev_frame_probs =
frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header
frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy};
uncomp_writer.WriteU(current_frame_info.first_level, 6);
uncomp_writer.WriteU(current_frame_info.sharpness_level, 3);
uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled);
if (current_frame_info.mode_ref_delta_enabled) {
// check if ref deltas are different, update accordingly
std::array<bool, 4> update_loop_filter_ref_deltas;
std::array<bool, 2> update_loop_filter_mode_deltas;
bool loop_filter_delta_update = false;
for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) {
const s8 old_deltas = loop_filter_ref_deltas[index];
const s8 new_deltas = current_frame_info.ref_deltas[index];
const bool differing_delta = old_deltas != new_deltas;
update_loop_filter_ref_deltas[index] = differing_delta;
loop_filter_delta_update |= differing_delta;
}
for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) {
const s8 old_deltas = loop_filter_mode_deltas[index];
const s8 new_deltas = current_frame_info.mode_deltas[index];
const bool differing_delta = old_deltas != new_deltas;
update_loop_filter_mode_deltas[index] = differing_delta;
loop_filter_delta_update |= differing_delta;
}
uncomp_writer.WriteBit(loop_filter_delta_update);
if (loop_filter_delta_update) {
for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) {
uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]);
if (update_loop_filter_ref_deltas[index]) {
uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6);
}
}
for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) {
uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]);
if (update_loop_filter_mode_deltas[index]) {
uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6);
}
}
// save new deltas
loop_filter_ref_deltas = current_frame_info.ref_deltas;
loop_filter_mode_deltas = current_frame_info.mode_deltas;
}
}
uncomp_writer.WriteU(current_frame_info.base_q_index, 8);
uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q);
uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q);
uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q);
uncomp_writer.WriteBit(false); // Segmentation enabled (TODO).
const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width);
const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width);
const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2;
const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1;
// If it's less than the maximum, we need to add an extra 0 on the bitstream
// to indicate that it should stop reading.
if (current_frame_info.log2_tile_cols < max_tile_cols_log2) {
uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1);
} else {
uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff);
}
const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0;
uncomp_writer.WriteBit(tile_rows_log2_is_nonzero);
if (tile_rows_log2_is_nonzero) {
uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1);
}
return uncomp_writer;
}
const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state) {
std::vector<u8> bitstream;
{
Vp9FrameContainer curr_frame = GetCurrentFrame(state);
current_frame_info = curr_frame.info;
bitstream = std::move(curr_frame.bit_stream);
}
// The uncompressed header routine sets PrevProb parameters needed for the compressed header
auto uncomp_writer = ComposeUncompressedHeader();
std::vector<u8> compressed_header = ComposeCompressedHeader();
uncomp_writer.WriteU(static_cast<s32>(compressed_header.size()), 16);
uncomp_writer.Flush();
std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray();
// Write headers and frame to buffer
frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size());
std::memcpy(frame.data(), uncompressed_header.data(), uncompressed_header.size());
std::memcpy(frame.data() + uncompressed_header.size(), compressed_header.data(),
compressed_header.size());
std::memcpy(frame.data() + uncompressed_header.size() + compressed_header.size(),
bitstream.data(), bitstream.size());
// keep track of frame number
current_frame_number++;
grace_period--;
// don't display hidden frames
hidden = !current_frame_info.show_frame;
return frame;
}
VpxRangeEncoder::VpxRangeEncoder() {
Write(false);
}
VpxRangeEncoder::~VpxRangeEncoder() = default;
void VpxRangeEncoder::Write(s32 value, s32 value_size) {
for (s32 bit = value_size - 1; bit >= 0; bit--) {
Write(((value >> bit) & 1) != 0);
}
}
void VpxRangeEncoder::Write(bool bit) {
Write(bit, half_probability);
}
void VpxRangeEncoder::Write(bool bit, s32 probability) {
u32 local_range = range;
const u32 split = 1 + (((local_range - 1) * static_cast<u32>(probability)) >> 8);
local_range = split;
if (bit) {
low_value += split;
local_range = range - split;
}
s32 shift = norm_lut[local_range];
local_range <<= shift;
count += shift;
if (count >= 0) {
const s32 offset = shift - count;
if (((low_value << (offset - 1)) >> 31) != 0) {
const s32 current_pos = static_cast<s32>(base_stream.GetPosition());
base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos);
while (PeekByte() == 0xff) {
base_stream.WriteByte(0);
base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos);
}
base_stream.WriteByte(static_cast<u8>((PeekByte() + 1)));
base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin);
}
base_stream.WriteByte(static_cast<u8>((low_value >> (24 - offset))));
low_value <<= offset;
shift = count;
low_value &= 0xffffff;
count -= 8;
}
low_value <<= shift;
range = local_range;
}
void VpxRangeEncoder::End() {
for (std::size_t index = 0; index < 32; ++index) {
Write(false);
}
}
u8 VpxRangeEncoder::PeekByte() {
const u8 value = base_stream.ReadByte();
base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos);
return value;
}
VpxBitStreamWriter::VpxBitStreamWriter() = default;
VpxBitStreamWriter::~VpxBitStreamWriter() = default;
void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) {
WriteBits(value, value_size);
}
void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) {
const bool sign = value < 0;
if (sign) {
value = -value;
}
WriteBits(static_cast<u32>(value << 1) | (sign ? 1 : 0), value_size + 1);
}
void VpxBitStreamWriter::WriteDeltaQ(u32 value) {
const bool delta_coded = value != 0;
WriteBit(delta_coded);
if (delta_coded) {
WriteBits(value, 4);
}
}
void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) {
s32 value_pos = 0;
s32 remaining = bit_count;
while (remaining > 0) {
s32 copy_size = remaining;
const s32 free = GetFreeBufferBits();
if (copy_size > free) {
copy_size = free;
}
const s32 mask = (1 << copy_size) - 1;
const s32 src_shift = (bit_count - value_pos) - copy_size;
const s32 dst_shift = (buffer_size - buffer_pos) - copy_size;
buffer |= ((value >> src_shift) & mask) << dst_shift;
value_pos += copy_size;
buffer_pos += copy_size;
remaining -= copy_size;
}
}
void VpxBitStreamWriter::WriteBit(bool state) {
WriteBits(state ? 1 : 0, 1);
}
s32 VpxBitStreamWriter::GetFreeBufferBits() {
if (buffer_pos == buffer_size) {
Flush();
}
return buffer_size - buffer_pos;
}
void VpxBitStreamWriter::Flush() {
if (buffer_pos == 0) {
return;
}
byte_array.push_back(static_cast<u8>(buffer));
buffer = 0;
buffer_pos = 0;
}
std::vector<u8>& VpxBitStreamWriter::GetByteArray() {
return byte_array;
}
const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const {
return byte_array;
}
} // namespace Tegra::Decoder

View File

@@ -0,0 +1,197 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <vector>
#include "common/common_types.h"
#include "common/stream.h"
#include "video_core/command_classes/codecs/vp9_types.h"
#include "video_core/command_classes/nvdec_common.h"
namespace Tegra {
class GPU;
enum class FrameType { KeyFrame = 0, InterFrame = 1 };
namespace Decoder {
/// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the
/// VP9 header bitstreams.
class VpxRangeEncoder {
public:
VpxRangeEncoder();
~VpxRangeEncoder();
VpxRangeEncoder(const VpxRangeEncoder&) = delete;
VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete;
VpxRangeEncoder(VpxRangeEncoder&&) = default;
VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default;
/// Writes the rightmost value_size bits from value into the stream
void Write(s32 value, s32 value_size);
/// Writes a single bit with half probability
void Write(bool bit);
/// Writes a bit to the base_stream encoded with probability
void Write(bool bit, s32 probability);
/// Signal the end of the bitstream
void End();
[[nodiscard]] std::vector<u8>& GetBuffer() {
return base_stream.GetBuffer();
}
[[nodiscard]] const std::vector<u8>& GetBuffer() const {
return base_stream.GetBuffer();
}
private:
u8 PeekByte();
Common::Stream base_stream{};
u32 low_value{};
u32 range{0xff};
s32 count{-24};
s32 half_probability{128};
};
class VpxBitStreamWriter {
public:
VpxBitStreamWriter();
~VpxBitStreamWriter();
VpxBitStreamWriter(const VpxBitStreamWriter&) = delete;
VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete;
VpxBitStreamWriter(VpxBitStreamWriter&&) = default;
VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default;
/// Write an unsigned integer value
void WriteU(u32 value, u32 value_size);
/// Write a signed integer value
void WriteS(s32 value, u32 value_size);
/// Based on 6.2.10 of VP9 Spec, writes a delta coded value
void WriteDeltaQ(u32 value);
/// Write a single bit.
void WriteBit(bool state);
/// Pushes current buffer into buffer_array, resets buffer
void Flush();
/// Returns byte_array
[[nodiscard]] std::vector<u8>& GetByteArray();
/// Returns const byte_array
[[nodiscard]] const std::vector<u8>& GetByteArray() const;
private:
/// Write bit_count bits from value into buffer
void WriteBits(u32 value, u32 bit_count);
/// Gets next available position in buffer, invokes Flush() if buffer is full
s32 GetFreeBufferBits();
s32 buffer_size{8};
s32 buffer{};
s32 buffer_pos{};
std::vector<u8> byte_array;
};
class VP9 {
public:
explicit VP9(GPU& gpu_);
~VP9();
VP9(const VP9&) = delete;
VP9& operator=(const VP9&) = delete;
VP9(VP9&&) = default;
VP9& operator=(VP9&&) = delete;
/// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec
/// documentation
[[nodiscard]] const std::vector<u8>& ComposeFrameHeader(
const NvdecCommon::NvdecRegisters& state);
/// Returns true if the most recent frame was a hidden frame.
[[nodiscard]] bool WasFrameHidden() const {
return hidden;
}
private:
/// Generates compressed header probability updates in the bitstream writer
template <typename T, std::size_t N>
void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob);
/// Generates compressed header probability updates in the bitstream writer
/// If probs are not equal, WriteProbabilityDelta is invoked
void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// Generates compressed header probability deltas in the bitstream writer
void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// Inverse of 6.3.4 Decode term subexp
void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value);
/// Writes if the value is less than the test value
bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test);
/// Writes probability updates for the Coef probabilities
void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
const std::array<u8, 1728>& new_prob,
const std::array<u8, 1728>& old_prob);
/// Write probabilities for 4-byte aligned structures
template <typename T, std::size_t N>
void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
const std::array<T, N>& old_prob);
/// Write motion vector probability updates. 6.3.17 in the spec
void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
/// Returns VP9 information from NVDEC provided offset and size
[[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state);
/// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct
void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);
/// Returns frame to be decoded after buffering
[[nodiscard]] Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state);
/// Use NVDEC providied information to compose the headers for the current frame
[[nodiscard]] std::vector<u8> ComposeCompressedHeader();
[[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader();
GPU& gpu;
std::vector<u8> frame;
std::array<s8, 4> loop_filter_ref_deltas{};
std::array<s8, 2> loop_filter_mode_deltas{};
bool hidden = false;
s64 current_frame_number = -2; // since we buffer 2 frames
s32 grace_period = 6; // frame offsets need to stabilize
std::array<FrameContexts, 4> frame_ctxs{};
Vp9FrameContainer next_frame{};
Vp9FrameContainer next_next_frame{};
bool swap_next_golden{};
Vp9PictureInfo current_frame_info{};
Vp9EntropyProbs prev_frame_probs{};
s32 diff_update_probability = 252;
s32 frame_sync_code = 0x498342;
};
} // namespace Decoder
} // namespace Tegra

View File

@@ -0,0 +1,302 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstring>
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra {
class GPU;
namespace Decoder {
struct Vp9FrameDimensions {
s16 width{};
s16 height{};
s16 luma_pitch{};
s16 chroma_pitch{};
};
static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size");
enum FrameFlags : u32 {
IsKeyFrame = 1 << 0,
LastFrameIsKeyFrame = 1 << 1,
FrameSizeChanged = 1 << 2,
ErrorResilientMode = 1 << 3,
LastShowFrame = 1 << 4,
IntraOnly = 1 << 5,
};
enum class TxSize {
Tx4x4 = 0, // 4x4 transform
Tx8x8 = 1, // 8x8 transform
Tx16x16 = 2, // 16x16 transform
Tx32x32 = 3, // 32x32 transform
TxSizes = 4
};
enum class TxMode {
Only4X4 = 0, // Only 4x4 transform used
Allow8X8 = 1, // Allow block transform size up to 8x8
Allow16X16 = 2, // Allow block transform size up to 16x16
Allow32X32 = 3, // Allow block transform size up to 32x32
TxModeSelect = 4, // Transform specified for each block
TxModes = 5
};
struct Segmentation {
u8 enabled{};
u8 update_map{};
u8 temporal_update{};
u8 abs_delta{};
std::array<u32, 8> feature_mask{};
std::array<std::array<s16, 4>, 8> feature_data{};
};
static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size");
struct LoopFilter {
u8 mode_ref_delta_enabled{};
std::array<s8, 4> ref_deltas{};
std::array<s8, 2> mode_deltas{};
};
static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size");
struct Vp9EntropyProbs {
std::array<u8, 36> y_mode_prob{};
std::array<u8, 64> partition_prob{};
std::array<u8, 1728> coef_probs{};
std::array<u8, 8> switchable_interp_prob{};
std::array<u8, 28> inter_mode_prob{};
std::array<u8, 4> intra_inter_prob{};
std::array<u8, 5> comp_inter_prob{};
std::array<u8, 10> single_ref_prob{};
std::array<u8, 5> comp_ref_prob{};
std::array<u8, 6> tx_32x32_prob{};
std::array<u8, 4> tx_16x16_prob{};
std::array<u8, 2> tx_8x8_prob{};
std::array<u8, 3> skip_probs{};
std::array<u8, 3> joints{};
std::array<u8, 2> sign{};
std::array<u8, 20> classes{};
std::array<u8, 2> class_0{};
std::array<u8, 20> prob_bits{};
std::array<u8, 12> class_0_fr{};
std::array<u8, 6> fr{};
std::array<u8, 2> class_0_hp{};
std::array<u8, 2> high_precision{};
};
static_assert(sizeof(Vp9EntropyProbs) == 0x7B4, "Vp9EntropyProbs is an invalid size");
struct Vp9PictureInfo {
bool is_key_frame{};
bool intra_only{};
bool last_frame_was_key{};
bool frame_size_changed{};
bool error_resilient_mode{};
bool last_frame_shown{};
bool show_frame{};
std::array<s8, 4> ref_frame_sign_bias{};
s32 base_q_index{};
s32 y_dc_delta_q{};
s32 uv_dc_delta_q{};
s32 uv_ac_delta_q{};
bool lossless{};
s32 transform_mode{};
bool allow_high_precision_mv{};
s32 interp_filter{};
s32 reference_mode{};
s8 comp_fixed_ref{};
std::array<s8, 2> comp_var_ref{};
s32 log2_tile_cols{};
s32 log2_tile_rows{};
bool segment_enabled{};
bool segment_map_update{};
bool segment_map_temporal_update{};
s32 segment_abs_delta{};
std::array<u32, 8> segment_feature_enable{};
std::array<std::array<s16, 4>, 8> segment_feature_data{};
bool mode_ref_delta_enabled{};
bool use_prev_in_find_mv_refs{};
std::array<s8, 4> ref_deltas{};
std::array<s8, 2> mode_deltas{};
Vp9EntropyProbs entropy{};
Vp9FrameDimensions frame_size{};
u8 first_level{};
u8 sharpness_level{};
u32 bitstream_size{};
std::array<u64, 4> frame_offsets{};
std::array<bool, 4> refresh_frame{};
};
struct Vp9FrameContainer {
Vp9PictureInfo info{};
std::vector<u8> bit_stream;
};
struct PictureInfo {
INSERT_PADDING_WORDS(12);
u32 bitstream_size{};
INSERT_PADDING_WORDS(5);
Vp9FrameDimensions last_frame_size{};
Vp9FrameDimensions golden_frame_size{};
Vp9FrameDimensions alt_frame_size{};
Vp9FrameDimensions current_frame_size{};
u32 vp9_flags{};
std::array<s8, 4> ref_frame_sign_bias{};
u8 first_level{};
u8 sharpness_level{};
u8 base_q_index{};
u8 y_dc_delta_q{};
u8 uv_ac_delta_q{};
u8 uv_dc_delta_q{};
u8 lossless{};
u8 tx_mode{};
u8 allow_high_precision_mv{};
u8 interp_filter{};
u8 reference_mode{};
s8 comp_fixed_ref{};
std::array<s8, 2> comp_var_ref{};
u8 log2_tile_cols{};
u8 log2_tile_rows{};
Segmentation segmentation{};
LoopFilter loop_filter{};
INSERT_PADDING_BYTES(5);
u32 surface_params{};
INSERT_PADDING_WORDS(3);
[[nodiscard]] Vp9PictureInfo Convert() const {
return {
.is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0,
.intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0,
.last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0,
.frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0,
.error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0,
.last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0,
.ref_frame_sign_bias = ref_frame_sign_bias,
.base_q_index = base_q_index,
.y_dc_delta_q = y_dc_delta_q,
.uv_dc_delta_q = uv_dc_delta_q,
.uv_ac_delta_q = uv_ac_delta_q,
.lossless = lossless != 0,
.transform_mode = tx_mode,
.allow_high_precision_mv = allow_high_precision_mv != 0,
.interp_filter = interp_filter,
.reference_mode = reference_mode,
.comp_fixed_ref = comp_fixed_ref,
.comp_var_ref = comp_var_ref,
.log2_tile_cols = log2_tile_cols,
.log2_tile_rows = log2_tile_rows,
.segment_enabled = segmentation.enabled != 0,
.segment_map_update = segmentation.update_map != 0,
.segment_map_temporal_update = segmentation.temporal_update != 0,
.segment_abs_delta = segmentation.abs_delta,
.segment_feature_enable = segmentation.feature_mask,
.segment_feature_data = segmentation.feature_data,
.mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0,
.use_prev_in_find_mv_refs = !(vp9_flags == (FrameFlags::ErrorResilientMode)) &&
!(vp9_flags == (FrameFlags::FrameSizeChanged)) &&
!(vp9_flags == (FrameFlags::IntraOnly)) &&
(vp9_flags == (FrameFlags::LastShowFrame)) &&
!(vp9_flags == (FrameFlags::LastFrameIsKeyFrame)),
.ref_deltas = loop_filter.ref_deltas,
.mode_deltas = loop_filter.mode_deltas,
.frame_size = current_frame_size,
.first_level = first_level,
.sharpness_level = sharpness_level,
.bitstream_size = bitstream_size,
};
}
};
static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size");
struct EntropyProbs {
INSERT_PADDING_BYTES(1024);
std::array<u8, 28> inter_mode_prob{};
std::array<u8, 4> intra_inter_prob{};
INSERT_PADDING_BYTES(80);
std::array<u8, 2> tx_8x8_prob{};
std::array<u8, 4> tx_16x16_prob{};
std::array<u8, 6> tx_32x32_prob{};
std::array<u8, 4> y_mode_prob_e8{};
std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{};
INSERT_PADDING_BYTES(64);
std::array<u8, 64> partition_prob{};
INSERT_PADDING_BYTES(10);
std::array<u8, 8> switchable_interp_prob{};
std::array<u8, 5> comp_inter_prob{};
std::array<u8, 3> skip_probs{};
INSERT_PADDING_BYTES(1);
std::array<u8, 3> joints{};
std::array<u8, 2> sign{};
std::array<u8, 2> class_0{};
std::array<u8, 6> fr{};
std::array<u8, 2> class_0_hp{};
std::array<u8, 2> high_precision{};
std::array<u8, 20> classes{};
std::array<u8, 12> class_0_fr{};
std::array<u8, 20> pred_bits{};
std::array<u8, 10> single_ref_prob{};
std::array<u8, 5> comp_ref_prob{};
INSERT_PADDING_BYTES(17);
std::array<u8, 2304> coef_probs{};
void Convert(Vp9EntropyProbs& fc) {
fc.inter_mode_prob = inter_mode_prob;
fc.intra_inter_prob = intra_inter_prob;
fc.tx_8x8_prob = tx_8x8_prob;
fc.tx_16x16_prob = tx_16x16_prob;
fc.tx_32x32_prob = tx_32x32_prob;
for (std::size_t i = 0; i < 4; i++) {
for (std::size_t j = 0; j < 9; j++) {
fc.y_mode_prob[j + 9 * i] = j < 8 ? y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i];
}
}
fc.partition_prob = partition_prob;
fc.switchable_interp_prob = switchable_interp_prob;
fc.comp_inter_prob = comp_inter_prob;
fc.skip_probs = skip_probs;
fc.joints = joints;
fc.sign = sign;
fc.class_0 = class_0;
fc.fr = fr;
fc.class_0_hp = class_0_hp;
fc.high_precision = high_precision;
fc.classes = classes;
fc.class_0_fr = class_0_fr;
fc.prob_bits = pred_bits;
fc.single_ref_prob = single_ref_prob;
fc.comp_ref_prob = comp_ref_prob;
// Skip the 4th element as it goes unused
for (std::size_t i = 0; i < coef_probs.size(); i += 4) {
const std::size_t j = i - i / 4;
fc.coef_probs[j] = coef_probs[i];
fc.coef_probs[j + 1] = coef_probs[i + 1];
fc.coef_probs[j + 2] = coef_probs[i + 2];
}
}
};
static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size");
enum class Ref { Last, Golden, AltRef };
struct RefPoolElement {
s64 frame{};
Ref ref{};
bool refresh{};
};
struct FrameContexts {
s64 from{};
bool adapted{};
Vp9EntropyProbs probs{};
};
}; // namespace Decoder
}; // namespace Tegra

View File

@@ -0,0 +1,39 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "video_core/command_classes/host1x.h"
#include "video_core/gpu.h"
Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {}
Tegra::Host1x::~Host1x() = default;
void Tegra::Host1x::StateWrite(u32 offset, u32 arguments) {
u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u32);
std::memcpy(state_offset, &arguments, sizeof(u32));
}
void Tegra::Host1x::ProcessMethod(Method method, const std::vector<u32>& arguments) {
StateWrite(static_cast<u32>(method), arguments[0]);
switch (method) {
case Method::WaitSyncpt:
Execute(arguments[0]);
break;
case Method::LoadSyncptPayload32:
syncpoint_value = arguments[0];
break;
case Method::WaitSyncpt32:
Execute(arguments[0]);
break;
default:
UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method));
break;
}
}
void Tegra::Host1x::Execute(u32 data) {
// This method waits on a valid syncpoint.
// TODO: Implement when proper Async is in place
}

View File

@@ -0,0 +1,78 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <vector>
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra {
class GPU;
class Nvdec;
class Host1x {
public:
struct Host1xClassRegisters {
u32 incr_syncpt{};
u32 incr_syncpt_ctrl{};
u32 incr_syncpt_error{};
INSERT_PADDING_WORDS(5);
u32 wait_syncpt{};
u32 wait_syncpt_base{};
u32 wait_syncpt_incr{};
u32 load_syncpt_base{};
u32 incr_syncpt_base{};
u32 clear{};
u32 wait{};
u32 wait_with_interrupt{};
u32 delay_use{};
u32 tick_count_high{};
u32 tick_count_low{};
u32 tick_ctrl{};
INSERT_PADDING_WORDS(23);
u32 ind_ctrl{};
u32 ind_off2{};
u32 ind_off{};
std::array<u32, 31> ind_data{};
INSERT_PADDING_WORDS(1);
u32 load_syncpoint_payload32{};
u32 stall_ctrl{};
u32 wait_syncpt32{};
u32 wait_syncpt_base32{};
u32 load_syncpt_base32{};
u32 incr_syncpt_base32{};
u32 stall_count_high{};
u32 stall_count_low{};
u32 xref_ctrl{};
u32 channel_xref_high{};
u32 channel_xref_low{};
};
static_assert(sizeof(Host1xClassRegisters) == 0x164, "Host1xClassRegisters is an invalid size");
enum class Method : u32 {
WaitSyncpt = offsetof(Host1xClassRegisters, wait_syncpt) / 4,
LoadSyncptPayload32 = offsetof(Host1xClassRegisters, load_syncpoint_payload32) / 4,
WaitSyncpt32 = offsetof(Host1xClassRegisters, wait_syncpt32) / 4,
};
explicit Host1x(GPU& gpu);
~Host1x();
/// Writes the method into the state, Invoke Execute() if encountered
void ProcessMethod(Method method, const std::vector<u32>& arguments);
private:
/// For Host1x, execute is waiting on a syncpoint previously written into the state
void Execute(u32 data);
/// Write argument into the provided offset
void StateWrite(u32 offset, u32 arguments);
u32 syncpoint_value{};
Host1xClassRegisters state{};
GPU& gpu;
};
} // namespace Tegra

View File

@@ -0,0 +1,48 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "video_core/command_classes/nvdec.h"
#include "video_core/gpu.h"
namespace Tegra {
Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {}
Nvdec::~Nvdec() = default;
void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) {
if (method == Method::SetVideoCodec) {
codec->StateWrite(static_cast<u32>(method), arguments[0]);
} else {
codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8);
}
switch (method) {
case Method::SetVideoCodec:
codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0]));
break;
case Method::Execute:
Execute();
break;
}
}
AVFramePtr Nvdec::GetFrame() {
return codec->GetCurrentFrame();
}
void Nvdec::Execute() {
switch (codec->GetCurrentCodec()) {
case NvdecCommon::VideoCodec::H264:
case NvdecCommon::VideoCodec::Vp9:
codec->Decode();
break;
default:
UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec()));
break;
}
}
} // namespace Tegra

View File

@@ -0,0 +1,38 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <vector>
#include "common/common_types.h"
#include "video_core/command_classes/codecs/codec.h"
namespace Tegra {
class GPU;
class Nvdec {
public:
enum class Method : u32 {
SetVideoCodec = 0x80,
Execute = 0xc0,
};
explicit Nvdec(GPU& gpu);
~Nvdec();
/// Writes the method into the state, Invoke Execute() if encountered
void ProcessMethod(Method method, const std::vector<u32>& arguments);
/// Return most recently decoded frame
[[nodiscard]] AVFramePtr GetFrame();
private:
/// Invoke codec to decode a frame
void Execute();
GPU& gpu;
std::unique_ptr<Codec> codec;
};
} // namespace Tegra

View File

@@ -0,0 +1,48 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra::NvdecCommon {
struct NvdecRegisters {
INSERT_PADDING_WORDS(256);
u64 set_codec_id{};
INSERT_PADDING_WORDS(254);
u64 set_platform_id{};
u64 picture_info_offset{};
u64 frame_bitstream_offset{};
u64 frame_number{};
u64 h264_slice_data_offsets{};
u64 h264_mv_dump_offset{};
INSERT_PADDING_WORDS(6);
u64 frame_stats_offset{};
u64 h264_last_surface_luma_offset{};
u64 h264_last_surface_chroma_offset{};
std::array<u64, 17> surface_luma_offset{};
std::array<u64, 17> surface_chroma_offset{};
INSERT_PADDING_WORDS(132);
u64 vp9_entropy_probs_offset{};
u64 vp9_backward_updates_offset{};
u64 vp9_last_frame_segmap_offset{};
u64 vp9_curr_frame_segmap_offset{};
INSERT_PADDING_WORDS(2);
u64 vp9_last_frame_mvs_offset{};
u64 vp9_curr_frame_mvs_offset{};
INSERT_PADDING_WORDS(2);
};
static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size");
enum class VideoCodec : u32 {
None = 0x0,
H264 = 0x3,
Vp8 = 0x5,
H265 = 0x7,
Vp9 = 0x9,
};
} // namespace Tegra::NvdecCommon

View File

@@ -0,0 +1,60 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include <algorithm>
#include "sync_manager.h"
#include "video_core/gpu.h"
namespace Tegra {
SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {}
SyncptIncrManager::~SyncptIncrManager() = default;
void SyncptIncrManager::Increment(u32 id) {
increments.emplace_back(0, 0, id, true);
IncrementAllDone();
}
u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) {
const u32 handle = current_id++;
increments.emplace_back(handle, class_id, id);
return handle;
}
void SyncptIncrManager::SignalDone(u32 handle) {
const auto done_incr =
std::find_if(increments.begin(), increments.end(),
[handle](const SyncptIncr& incr) { return incr.id == handle; });
if (done_incr != increments.cend()) {
done_incr->complete = true;
}
IncrementAllDone();
}
void SyncptIncrManager::IncrementAllDone() {
std::size_t done_count = 0;
for (; done_count < increments.size(); ++done_count) {
if (!increments[done_count].complete) {
break;
}
gpu.IncrementSyncPoint(increments[done_count].syncpt_id);
}
increments.erase(increments.begin(), increments.begin() + done_count);
}
} // namespace Tegra

View File

@@ -0,0 +1,64 @@
// MIT License
//
// Copyright (c) Ryujinx Team and Contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
// associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#pragma once
#include <mutex>
#include <vector>
#include "common/common_types.h"
namespace Tegra {
class GPU;
struct SyncptIncr {
u32 id;
u32 class_id;
u32 syncpt_id;
bool complete;
SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false)
: id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
};
class SyncptIncrManager {
public:
explicit SyncptIncrManager(GPU& gpu);
~SyncptIncrManager();
/// Add syncpoint id and increment all
void Increment(u32 id);
/// Returns a handle to increment later
u32 IncrementWhenDone(u32 class_id, u32 id);
/// IncrememntAllDone, including handle
void SignalDone(u32 handle);
/// Increment all sequential pending increments that are already done.
void IncrementAllDone();
private:
std::vector<SyncptIncr> increments;
std::mutex increment_lock;
u32 current_id{};
GPU& gpu;
};
} // namespace Tegra

View File

@@ -0,0 +1,175 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include "common/assert.h"
#include "video_core/command_classes/nvdec.h"
#include "video_core/command_classes/vic.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/textures/decoders.h"
extern "C" {
#include <libswscale/swscale.h>
}
namespace Tegra {
Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
: gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {}
Vic::~Vic() = default;
void Vic::VicStateWrite(u32 offset, u32 arguments) {
u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32);
std::memcpy(state_offset, &arguments, sizeof(u32));
}
void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) {
LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", method);
VicStateWrite(static_cast<u32>(method), arguments[0]);
const u64 arg = static_cast<u64>(arguments[0]) << 8;
switch (method) {
case Method::Execute:
Execute();
break;
case Method::SetConfigStructOffset:
config_struct_address = arg;
break;
case Method::SetOutputSurfaceLumaOffset:
output_surface_luma_address = arg;
break;
case Method::SetOutputSurfaceChromaUOffset:
output_surface_chroma_u_address = arg;
break;
case Method::SetOutputSurfaceChromaVOffset:
output_surface_chroma_v_address = arg;
break;
default:
break;
}
}
void Vic::Execute() {
if (output_surface_luma_address == 0) {
LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Recieved 0x{:X}",
vic_state.output_surface.luma_offset);
return;
}
const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
const AVFramePtr frame_ptr = nvdec_processor->GetFrame();
const auto* frame = frame_ptr.get();
if (!frame || frame->width == 0 || frame->height == 0) {
return;
}
const VideoPixelFormat pixel_format =
static_cast<VideoPixelFormat>(config.pixel_format.Value());
switch (pixel_format) {
case VideoPixelFormat::BGRA8:
case VideoPixelFormat::RGBA8: {
LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
if (scaler_ctx == nullptr || frame->width != scaler_width ||
frame->height != scaler_height) {
const AVPixelFormat target_format =
(pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA;
sws_freeContext(scaler_ctx);
scaler_ctx = nullptr;
// FFmpeg returns all frames in YUV420, convert it into expected format
scaler_ctx =
sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width,
frame->height, target_format, 0, nullptr, nullptr, nullptr);
scaler_width = frame->width;
scaler_height = frame->height;
}
// Get Converted frame
const std::size_t linear_size = frame->width * frame->height * 4;
using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free};
const int converted_stride{frame->width * 4};
u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height,
&converted_frame_buf_addr, &converted_stride);
const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
if (blk_kind != 0) {
// swizzle pitch linear to block linear
const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
block_height, 0);
std::vector<u8> swizzled_data(size);
Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4,
frame->width, 4, swizzled_data.data(),
converted_frame_buffer.get(), block_height, 0, 0);
gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
gpu.Maxwell3D().OnMemoryWrite();
} else {
// send pitch linear frame
gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
linear_size);
gpu.Maxwell3D().OnMemoryWrite();
}
break;
}
case VideoPixelFormat::Yuv420: {
LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
const std::size_t surface_width = config.surface_width_minus1 + 1;
const std::size_t surface_height = config.surface_height_minus1 + 1;
const std::size_t half_width = surface_width / 2;
const std::size_t half_height = config.surface_height_minus1 / 2;
const std::size_t aligned_width = (surface_width + 0xff) & ~0xff;
const auto* luma_ptr = frame->data[0];
const auto* chroma_b_ptr = frame->data[1];
const auto* chroma_r_ptr = frame->data[2];
const auto stride = frame->linesize[0];
const auto half_stride = frame->linesize[1];
std::vector<u8> luma_buffer(aligned_width * surface_height);
std::vector<u8> chroma_buffer(aligned_width * half_height);
// Populate luma buffer
for (std::size_t y = 0; y < surface_height - 1; ++y) {
std::size_t src = y * stride;
std::size_t dst = y * aligned_width;
std::size_t size = surface_width;
for (std::size_t offset = 0; offset < size; ++offset) {
luma_buffer[dst + offset] = luma_ptr[src + offset];
}
}
gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
luma_buffer.size());
// Populate chroma buffer from both channels with interleaving.
for (std::size_t y = 0; y < half_height; ++y) {
std::size_t src = y * half_stride;
std::size_t dst = y * aligned_width;
for (std::size_t x = 0; x < half_width; ++x) {
chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];
}
}
gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
chroma_buffer.size());
gpu.Maxwell3D().OnMemoryWrite();
break;
}
default:
UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value());
break;
}
}
} // namespace Tegra

View File

@@ -0,0 +1,110 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <vector>
#include "common/bit_field.h"
#include "common/common_types.h"
struct SwsContext;
namespace Tegra {
class GPU;
class Nvdec;
struct PlaneOffsets {
u32 luma_offset{};
u32 chroma_u_offset{};
u32 chroma_v_offset{};
};
struct VicRegisters {
INSERT_PADDING_WORDS(64);
u32 nop{};
INSERT_PADDING_WORDS(15);
u32 pm_trigger{};
INSERT_PADDING_WORDS(47);
u32 set_application_id{};
u32 set_watchdog_timer{};
INSERT_PADDING_WORDS(17);
u32 context_save_area{};
u32 context_switch{};
INSERT_PADDING_WORDS(43);
u32 execute{};
INSERT_PADDING_WORDS(63);
std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{};
u32 picture_index{};
u32 control_params{};
u32 config_struct_offset{};
u32 filter_struct_offset{};
u32 palette_offset{};
u32 hist_offset{};
u32 context_id{};
u32 fce_ucode_size{};
PlaneOffsets output_surface{};
u32 fce_ucode_offset{};
INSERT_PADDING_WORDS(4);
std::array<u32, 8> slot_context_id{};
INSERT_PADDING_WORDS(16);
};
static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size");
class Vic {
public:
enum class Method : u32 {
Execute = 0xc0,
SetControlParams = 0x1c1,
SetConfigStructOffset = 0x1c2,
SetOutputSurfaceLumaOffset = 0x1c8,
SetOutputSurfaceChromaUOffset = 0x1c9,
SetOutputSurfaceChromaVOffset = 0x1ca
};
explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor);
~Vic();
/// Write to the device state.
void ProcessMethod(Method method, const std::vector<u32>& arguments);
private:
void Execute();
void VicStateWrite(u32 offset, u32 arguments);
VicRegisters vic_state{};
enum class VideoPixelFormat : u64_le {
RGBA8 = 0x1f,
BGRA8 = 0x20,
Yuv420 = 0x44,
};
union VicConfig {
u64_le raw{};
BitField<0, 7, u64_le> pixel_format;
BitField<7, 2, u64_le> chroma_loc_horiz;
BitField<9, 2, u64_le> chroma_loc_vert;
BitField<11, 4, u64_le> block_linear_kind;
BitField<15, 4, u64_le> block_linear_height_log2;
BitField<19, 3, u64_le> reserved0;
BitField<22, 10, u64_le> reserved1;
BitField<32, 14, u64_le> surface_width_minus1;
BitField<46, 14, u64_le> surface_height_minus1;
};
GPU& gpu;
std::shared_ptr<Tegra::Nvdec> nvdec_processor;
GPUVAddr config_struct_address{};
GPUVAddr output_surface_luma_address{};
GPUVAddr output_surface_chroma_u_address{};
GPUVAddr output_surface_chroma_v_address{};
SwsContext* scaler_ctx{};
s32 scaler_width{};
s32 scaler_height{};
};
} // namespace Tegra

View File

@@ -0,0 +1,249 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include <cstddef>
#include "common/common_types.h"
#include "video_core/compatible_formats.h"
#include "video_core/surface.h"
namespace VideoCore::Surface {
namespace {
using Table = std::array<std::array<u64, 2>, MaxPixelFormat>;
// Compatibility table taken from Table 3.X.2 in:
// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt
constexpr std::array VIEW_CLASS_128_BITS{
PixelFormat::R32G32B32A32_FLOAT,
PixelFormat::R32G32B32A32_UINT,
PixelFormat::R32G32B32A32_SINT,
};
constexpr std::array VIEW_CLASS_96_BITS{
PixelFormat::R32G32B32_FLOAT,
};
// Missing formats:
// PixelFormat::RGB32UI,
// PixelFormat::RGB32I,
constexpr std::array VIEW_CLASS_64_BITS{
PixelFormat::R32G32_FLOAT, PixelFormat::R32G32_UINT,
PixelFormat::R32G32_SINT, PixelFormat::R16G16B16A16_FLOAT,
PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM,
PixelFormat::R16G16B16A16_UINT, PixelFormat::R16G16B16A16_SINT,
};
// TODO: How should we handle 48 bits?
constexpr std::array VIEW_CLASS_32_BITS{
PixelFormat::R16G16_FLOAT, PixelFormat::B10G11R11_FLOAT, PixelFormat::R32_FLOAT,
PixelFormat::A2B10G10R10_UNORM, PixelFormat::R16G16_UINT, PixelFormat::R32_UINT,
PixelFormat::R16G16_SINT, PixelFormat::R32_SINT, PixelFormat::A8B8G8R8_UNORM,
PixelFormat::R16G16_UNORM, PixelFormat::A8B8G8R8_SNORM, PixelFormat::R16G16_SNORM,
PixelFormat::A8B8G8R8_SRGB, PixelFormat::E5B9G9R9_FLOAT, PixelFormat::B8G8R8A8_UNORM,
PixelFormat::B8G8R8A8_SRGB, PixelFormat::A8B8G8R8_UINT, PixelFormat::A8B8G8R8_SINT,
PixelFormat::A2B10G10R10_UINT,
};
// TODO: How should we handle 24 bits?
constexpr std::array VIEW_CLASS_16_BITS{
PixelFormat::R16_FLOAT, PixelFormat::R8G8_UINT, PixelFormat::R16_UINT,
PixelFormat::R16_SINT, PixelFormat::R8G8_UNORM, PixelFormat::R16_UNORM,
PixelFormat::R8G8_SNORM, PixelFormat::R16_SNORM, PixelFormat::R8G8_SINT,
};
constexpr std::array VIEW_CLASS_8_BITS{
PixelFormat::R8_UINT,
PixelFormat::R8_UNORM,
PixelFormat::R8_SINT,
PixelFormat::R8_SNORM,
};
constexpr std::array VIEW_CLASS_RGTC1_RED{
PixelFormat::BC4_UNORM,
PixelFormat::BC4_SNORM,
};
constexpr std::array VIEW_CLASS_RGTC2_RG{
PixelFormat::BC5_UNORM,
PixelFormat::BC5_SNORM,
};
constexpr std::array VIEW_CLASS_BPTC_UNORM{
PixelFormat::BC7_UNORM,
PixelFormat::BC7_SRGB,
};
constexpr std::array VIEW_CLASS_BPTC_FLOAT{
PixelFormat::BC6H_SFLOAT,
PixelFormat::BC6H_UFLOAT,
};
constexpr std::array VIEW_CLASS_ASTC_4x4_RGBA{
PixelFormat::ASTC_2D_4X4_UNORM,
PixelFormat::ASTC_2D_4X4_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_5x4_RGBA{
PixelFormat::ASTC_2D_5X4_UNORM,
PixelFormat::ASTC_2D_5X4_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_5x5_RGBA{
PixelFormat::ASTC_2D_5X5_UNORM,
PixelFormat::ASTC_2D_5X5_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_6x5_RGBA{
PixelFormat::ASTC_2D_6X5_UNORM,
PixelFormat::ASTC_2D_6X5_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_6x6_RGBA{
PixelFormat::ASTC_2D_6X6_UNORM,
PixelFormat::ASTC_2D_6X6_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_8x5_RGBA{
PixelFormat::ASTC_2D_8X5_UNORM,
PixelFormat::ASTC_2D_8X5_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_8x8_RGBA{
PixelFormat::ASTC_2D_8X8_UNORM,
PixelFormat::ASTC_2D_8X8_SRGB,
};
// Missing formats:
// PixelFormat::ASTC_2D_10X5_UNORM
// PixelFormat::ASTC_2D_10X5_SRGB
// Missing formats:
// PixelFormat::ASTC_2D_10X6_UNORM
// PixelFormat::ASTC_2D_10X6_SRGB
constexpr std::array VIEW_CLASS_ASTC_10x8_RGBA{
PixelFormat::ASTC_2D_10X8_UNORM,
PixelFormat::ASTC_2D_10X8_SRGB,
};
constexpr std::array VIEW_CLASS_ASTC_10x10_RGBA{
PixelFormat::ASTC_2D_10X10_UNORM,
PixelFormat::ASTC_2D_10X10_SRGB,
};
// Missing formats
// ASTC_2D_12X10_UNORM,
// ASTC_2D_12X10_SRGB,
constexpr std::array VIEW_CLASS_ASTC_12x12_RGBA{
PixelFormat::ASTC_2D_12X12_UNORM,
PixelFormat::ASTC_2D_12X12_SRGB,
};
// Compatibility table taken from Table 4.X.1 in:
// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt
constexpr std::array COPY_CLASS_128_BITS{
PixelFormat::R32G32B32A32_UINT, PixelFormat::R32G32B32A32_FLOAT, PixelFormat::R32G32B32A32_SINT,
PixelFormat::BC2_UNORM, PixelFormat::BC2_SRGB, PixelFormat::BC3_UNORM,
PixelFormat::BC3_SRGB, PixelFormat::BC5_UNORM, PixelFormat::BC5_SNORM,
PixelFormat::BC7_UNORM, PixelFormat::BC7_SRGB, PixelFormat::BC6H_SFLOAT,
PixelFormat::BC6H_UFLOAT,
};
// Missing formats:
// PixelFormat::RGBA32I
// COMPRESSED_RG_RGTC2
constexpr std::array COPY_CLASS_64_BITS{
PixelFormat::R16G16B16A16_FLOAT, PixelFormat::R16G16B16A16_UINT,
PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM,
PixelFormat::R16G16B16A16_SINT, PixelFormat::R32G32_UINT,
PixelFormat::R32G32_FLOAT, PixelFormat::R32G32_SINT,
PixelFormat::BC1_RGBA_UNORM, PixelFormat::BC1_RGBA_SRGB,
};
// Missing formats:
// COMPRESSED_RGB_S3TC_DXT1_EXT
// COMPRESSED_SRGB_S3TC_DXT1_EXT
// COMPRESSED_RGBA_S3TC_DXT1_EXT
// COMPRESSED_SIGNED_RED_RGTC1
constexpr void Enable(Table& table, size_t format_a, size_t format_b) {
table[format_a][format_b / 64] |= u64(1) << (format_b % 64);
table[format_b][format_a / 64] |= u64(1) << (format_a % 64);
}
constexpr void Enable(Table& table, PixelFormat format_a, PixelFormat format_b) {
Enable(table, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
}
template <typename Range>
constexpr void EnableRange(Table& table, const Range& range) {
for (auto it_a = range.begin(); it_a != range.end(); ++it_a) {
for (auto it_b = it_a; it_b != range.end(); ++it_b) {
Enable(table, *it_a, *it_b);
}
}
}
constexpr bool IsSupported(const Table& table, PixelFormat format_a, PixelFormat format_b) {
const size_t a = static_cast<size_t>(format_a);
const size_t b = static_cast<size_t>(format_b);
return ((table[a][b / 64] >> (b % 64)) & 1) != 0;
}
constexpr Table MakeViewTable() {
Table view{};
for (size_t i = 0; i < MaxPixelFormat; ++i) {
// Identity is allowed
Enable(view, i, i);
}
EnableRange(view, VIEW_CLASS_128_BITS);
EnableRange(view, VIEW_CLASS_96_BITS);
EnableRange(view, VIEW_CLASS_64_BITS);
EnableRange(view, VIEW_CLASS_32_BITS);
EnableRange(view, VIEW_CLASS_16_BITS);
EnableRange(view, VIEW_CLASS_8_BITS);
EnableRange(view, VIEW_CLASS_RGTC1_RED);
EnableRange(view, VIEW_CLASS_RGTC2_RG);
EnableRange(view, VIEW_CLASS_BPTC_UNORM);
EnableRange(view, VIEW_CLASS_BPTC_FLOAT);
EnableRange(view, VIEW_CLASS_ASTC_4x4_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_5x4_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_5x5_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_6x5_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_6x6_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_8x5_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_8x8_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_10x8_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_10x10_RGBA);
EnableRange(view, VIEW_CLASS_ASTC_12x12_RGBA);
return view;
}
constexpr Table MakeCopyTable() {
Table copy = MakeViewTable();
EnableRange(copy, COPY_CLASS_128_BITS);
EnableRange(copy, COPY_CLASS_64_BITS);
return copy;
}
} // Anonymous namespace
bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b) {
static constexpr Table TABLE = MakeViewTable();
return IsSupported(TABLE, format_a, format_b);
}
bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b) {
static constexpr Table TABLE = MakeCopyTable();
return IsSupported(TABLE, format_a, format_b);
}
} // namespace VideoCore::Surface

View File

@@ -0,0 +1,15 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "video_core/surface.h"
namespace VideoCore::Surface {
bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b);
bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b);
} // namespace VideoCore::Surface

View File

@@ -0,0 +1,31 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstddef>
#include <utility>
#include <vector>
namespace VideoCommon {
template <typename T, size_t TICKS_TO_DESTROY>
class DelayedDestructionRing {
public:
void Tick() {
index = (index + 1) % TICKS_TO_DESTROY;
elements[index].clear();
}
void Push(T&& object) {
elements[index].push_back(std::move(object));
}
private:
size_t index = 0;
std::array<std::vector<T>, TICKS_TO_DESTROY> elements;
};
} // namespace VideoCommon

45
src/video_core/dirty_flags.cpp Executable file
View File

@@ -0,0 +1,45 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include <cstddef>
#include "common/common_types.h"
#include "video_core/dirty_flags.h"
#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name)
#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32)))
namespace VideoCommon::Dirty {
using Tegra::Engines::Maxwell3D;
void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) {
FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors);
FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors);
static constexpr std::size_t num_per_rt = NUM(rt[0]);
static constexpr std::size_t begin = OFF(rt);
static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets;
for (std::size_t rt = 0; rt < Maxwell3D::Regs::NumRenderTargets; ++rt) {
FillBlock(tables[0], begin + rt * num_per_rt, num_per_rt, ColorBuffer0 + rt);
}
FillBlock(tables[1], begin, num, RenderTargets);
FillBlock(tables[0], OFF(render_area), NUM(render_area), RenderTargets);
tables[0][OFF(rt_control)] = RenderTargets;
tables[1][OFF(rt_control)] = RenderTargetControl;
static constexpr std::array zeta_flags{ZetaBuffer, RenderTargets};
for (std::size_t i = 0; i < std::size(zeta_flags); ++i) {
const u8 flag = zeta_flags[i];
auto& table = tables[i];
table[OFF(zeta_enable)] = flag;
table[OFF(zeta_width)] = flag;
table[OFF(zeta_height)] = flag;
FillBlock(table, OFF(zeta), NUM(zeta), flag);
}
}
} // namespace VideoCommon::Dirty

52
src/video_core/dirty_flags.h Executable file
View File

@@ -0,0 +1,52 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <algorithm>
#include <cstddef>
#include <iterator>
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
namespace VideoCommon::Dirty {
enum : u8 {
NullEntry = 0,
Descriptors,
RenderTargets,
RenderTargetControl,
ColorBuffer0,
ColorBuffer1,
ColorBuffer2,
ColorBuffer3,
ColorBuffer4,
ColorBuffer5,
ColorBuffer6,
ColorBuffer7,
ZetaBuffer,
LastCommonEntry,
};
template <typename Integer>
void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Table& table, std::size_t begin,
std::size_t num, Integer dirty_index) {
const auto it = std::begin(table) + begin;
std::fill(it, it + num, static_cast<u8>(dirty_index));
}
template <typename Integer1, typename Integer2>
void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_t begin,
std::size_t num, Integer1 index_a, Integer2 index_b) {
FillBlock(tables[0], begin, num, index_a);
FillBlock(tables[1], begin, num, index_b);
}
void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables);
} // namespace VideoCommon::Dirty

177
src/video_core/dma_pusher.cpp Executable file
View File

@@ -0,0 +1,177 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/cityhash.h"
#include "common/microprofile.h"
#include "core/core.h"
#include "core/memory.h"
#include "video_core/dma_pusher.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
namespace Tegra {
DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_) : gpu{gpu_}, system{system_} {}
DmaPusher::~DmaPusher() = default;
MICROPROFILE_DEFINE(DispatchCalls, "GPU", "Execute command buffer", MP_RGB(128, 128, 192));
void DmaPusher::DispatchCalls() {
MICROPROFILE_SCOPE(DispatchCalls);
gpu.SyncGuestHost();
// On entering GPU code, assume all memory may be touched by the ARM core.
gpu.Maxwell3D().OnMemoryWrite();
dma_pushbuffer_subindex = 0;
dma_state.is_last_call = true;
while (system.IsPoweredOn()) {
if (!Step()) {
break;
}
}
gpu.FlushCommands();
gpu.SyncGuestHost();
gpu.OnCommandListEnd();
}
bool DmaPusher::Step() {
if (!ib_enable || dma_pushbuffer.empty()) {
// pushbuffer empty and IB empty or nonexistent - nothing to do
return false;
}
CommandList& command_list{dma_pushbuffer.front()};
ASSERT_OR_EXECUTE(
command_list.command_lists.size() || command_list.prefetch_command_list.size(), {
// Somehow the command_list is empty, in order to avoid a crash
// We ignore it and assume its size is 0.
dma_pushbuffer.pop();
dma_pushbuffer_subindex = 0;
return true;
});
if (command_list.prefetch_command_list.size()) {
// Prefetched command list from nvdrv, used for things like synchronization
command_headers = std::move(command_list.prefetch_command_list);
dma_pushbuffer.pop();
} else {
const CommandListHeader command_list_header{
command_list.command_lists[dma_pushbuffer_subindex++]};
const GPUVAddr dma_get = command_list_header.addr;
if (dma_pushbuffer_subindex >= command_list.command_lists.size()) {
// We've gone through the current list, remove it from the queue
dma_pushbuffer.pop();
dma_pushbuffer_subindex = 0;
}
if (command_list_header.size == 0) {
return true;
}
// Push buffer non-empty, read a word
command_headers.resize(command_list_header.size);
gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
command_list_header.size * sizeof(u32));
}
for (std::size_t index = 0; index < command_headers.size();) {
const CommandHeader& command_header = command_headers[index];
if (dma_state.method_count) {
// Data word of methods command
if (dma_state.non_incrementing) {
const u32 max_write = static_cast<u32>(
std::min<std::size_t>(index + dma_state.method_count, command_headers.size()) -
index);
CallMultiMethod(&command_header.argument, max_write);
dma_state.method_count -= max_write;
dma_state.is_last_call = true;
index += max_write;
continue;
} else {
dma_state.is_last_call = dma_state.method_count <= 1;
CallMethod(command_header.argument);
}
if (!dma_state.non_incrementing) {
dma_state.method++;
}
if (dma_increment_once) {
dma_state.non_incrementing = true;
}
dma_state.method_count--;
} else {
// No command active - this is the first word of a new one
switch (command_header.mode) {
case SubmissionMode::Increasing:
SetState(command_header);
dma_state.non_incrementing = false;
dma_increment_once = false;
break;
case SubmissionMode::NonIncreasing:
SetState(command_header);
dma_state.non_incrementing = true;
dma_increment_once = false;
break;
case SubmissionMode::Inline:
dma_state.method = command_header.method;
dma_state.subchannel = command_header.subchannel;
CallMethod(command_header.arg_count);
dma_state.non_incrementing = true;
dma_increment_once = false;
break;
case SubmissionMode::IncreaseOnce:
SetState(command_header);
dma_state.non_incrementing = false;
dma_increment_once = true;
break;
default:
break;
}
}
index++;
}
return true;
}
void DmaPusher::SetState(const CommandHeader& command_header) {
dma_state.method = command_header.method;
dma_state.subchannel = command_header.subchannel;
dma_state.method_count = command_header.method_count;
}
void DmaPusher::CallMethod(u32 argument) const {
if (dma_state.method < non_puller_methods) {
gpu.CallMethod(GPU::MethodCall{
dma_state.method,
argument,
dma_state.subchannel,
dma_state.method_count,
});
} else {
subchannels[dma_state.subchannel]->CallMethod(dma_state.method, argument,
dma_state.is_last_call);
}
}
void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const {
if (dma_state.method < non_puller_methods) {
gpu.CallMultiMethod(dma_state.method, dma_state.subchannel, base_start, num_methods,
dma_state.method_count);
} else {
subchannels[dma_state.subchannel]->CallMultiMethod(dma_state.method, base_start,
num_methods, dma_state.method_count);
}
}
} // namespace Tegra

154
src/video_core/dma_pusher.h Executable file
View File

@@ -0,0 +1,154 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <vector>
#include <queue>
#include "common/bit_field.h"
#include "common/common_types.h"
#include "video_core/engines/engine_interface.h"
namespace Core {
class System;
}
namespace Tegra {
class GPU;
enum class SubmissionMode : u32 {
IncreasingOld = 0,
Increasing = 1,
NonIncreasingOld = 2,
NonIncreasing = 3,
Inline = 4,
IncreaseOnce = 5
};
// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4.
// So the values you see in docs might be multiplied by 4.
enum class BufferMethods : u32 {
BindObject = 0x0,
Nop = 0x2,
SemaphoreAddressHigh = 0x4,
SemaphoreAddressLow = 0x5,
SemaphoreSequence = 0x6,
SemaphoreTrigger = 0x7,
NotifyIntr = 0x8,
WrcacheFlush = 0x9,
Unk28 = 0xA,
UnkCacheFlush = 0xB,
RefCnt = 0x14,
SemaphoreAcquire = 0x1A,
SemaphoreRelease = 0x1B,
FenceValue = 0x1C,
FenceAction = 0x1D,
WaitForInterrupt = 0x1E,
Unk7c = 0x1F,
Yield = 0x20,
NonPullerMethods = 0x40,
};
struct CommandListHeader {
union {
u64 raw;
BitField<0, 40, GPUVAddr> addr;
BitField<41, 1, u64> is_non_main;
BitField<42, 21, u64> size;
};
};
static_assert(sizeof(CommandListHeader) == sizeof(u64), "CommandListHeader is incorrect size");
union CommandHeader {
u32 argument;
BitField<0, 13, u32> method;
BitField<0, 24, u32> method_count_;
BitField<13, 3, u32> subchannel;
BitField<16, 13, u32> arg_count;
BitField<16, 13, u32> method_count;
BitField<29, 3, SubmissionMode> mode;
};
static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout");
static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");
inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) {
CommandHeader result{};
result.method.Assign(static_cast<u32>(method));
result.arg_count.Assign(arg_count);
result.mode.Assign(mode);
return result;
}
struct CommandList final {
CommandList() = default;
explicit CommandList(std::size_t size) : command_lists(size) {}
explicit CommandList(std::vector<CommandHeader>&& prefetch_command_list_)
: prefetch_command_list{std::move(prefetch_command_list_)} {}
std::vector<CommandListHeader> command_lists;
std::vector<CommandHeader> prefetch_command_list;
};
/**
* The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the
* emulated app fills with commands and tells PFIFO to process. The pushbuffers are then assembled
* into a "command stream" consisting of 32-bit words that make up "commands".
* See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for
* details on this implementation.
*/
class DmaPusher final {
public:
explicit DmaPusher(Core::System& system_, GPU& gpu_);
~DmaPusher();
void Push(CommandList&& entries) {
dma_pushbuffer.push(std::move(entries));
}
void DispatchCalls();
void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id) {
subchannels[subchannel_id] = engine;
}
private:
static constexpr u32 non_puller_methods = 0x40;
static constexpr u32 max_subchannels = 8;
bool Step();
void SetState(const CommandHeader& command_header);
void CallMethod(u32 argument) const;
void CallMultiMethod(const u32* base_start, u32 num_methods) const;
std::vector<CommandHeader> command_headers; ///< Buffer for list of commands fetched at once
std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed
std::size_t dma_pushbuffer_subindex{}; ///< Index within a command list within the pushbuffer
struct DmaState {
u32 method; ///< Current method
u32 subchannel; ///< Current subchannel
u32 method_count; ///< Current method count
u32 length_pending; ///< Large NI command length pending
bool non_incrementing; ///< Current command's NI flag
bool is_last_call;
};
DmaState dma_state{};
bool dma_increment_once{};
bool ib_enable{true}; ///< IB mode enabled
std::array<Engines::EngineInterface*, max_subchannels> subchannels{};
GPU& gpu;
Core::System& system;
};
} // namespace Tegra

View File

@@ -0,0 +1,103 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <type_traits>
#include "common/bit_field.h"
#include "common/common_types.h"
#include "video_core/engines/shader_bytecode.h"
#include "video_core/engines/shader_type.h"
#include "video_core/guest_driver.h"
#include "video_core/textures/texture.h"
namespace Tegra::Engines {
struct SamplerDescriptor {
union {
u32 raw = 0;
BitField<0, 2, Tegra::Shader::TextureType> texture_type;
BitField<2, 3, Tegra::Texture::ComponentType> r_type;
BitField<5, 1, u32> is_array;
BitField<6, 1, u32> is_buffer;
BitField<7, 1, u32> is_shadow;
BitField<8, 3, Tegra::Texture::ComponentType> g_type;
BitField<11, 3, Tegra::Texture::ComponentType> b_type;
BitField<14, 3, Tegra::Texture::ComponentType> a_type;
BitField<17, 7, Tegra::Texture::TextureFormat> format;
};
bool operator==(const SamplerDescriptor& rhs) const noexcept {
return raw == rhs.raw;
}
bool operator!=(const SamplerDescriptor& rhs) const noexcept {
return !operator==(rhs);
}
static SamplerDescriptor FromTIC(const Tegra::Texture::TICEntry& tic) {
using Tegra::Shader::TextureType;
SamplerDescriptor result;
result.format.Assign(tic.format.Value());
result.r_type.Assign(tic.r_type.Value());
result.g_type.Assign(tic.g_type.Value());
result.b_type.Assign(tic.b_type.Value());
result.a_type.Assign(tic.a_type.Value());
switch (tic.texture_type.Value()) {
case Tegra::Texture::TextureType::Texture1D:
result.texture_type.Assign(TextureType::Texture1D);
return result;
case Tegra::Texture::TextureType::Texture2D:
result.texture_type.Assign(TextureType::Texture2D);
return result;
case Tegra::Texture::TextureType::Texture3D:
result.texture_type.Assign(TextureType::Texture3D);
return result;
case Tegra::Texture::TextureType::TextureCubemap:
result.texture_type.Assign(TextureType::TextureCube);
return result;
case Tegra::Texture::TextureType::Texture1DArray:
result.texture_type.Assign(TextureType::Texture1D);
result.is_array.Assign(1);
return result;
case Tegra::Texture::TextureType::Texture2DArray:
result.texture_type.Assign(TextureType::Texture2D);
result.is_array.Assign(1);
return result;
case Tegra::Texture::TextureType::Texture1DBuffer:
result.texture_type.Assign(TextureType::Texture1D);
result.is_buffer.Assign(1);
return result;
case Tegra::Texture::TextureType::Texture2DNoMipmap:
result.texture_type.Assign(TextureType::Texture2D);
return result;
case Tegra::Texture::TextureType::TextureCubeArray:
result.texture_type.Assign(TextureType::TextureCube);
result.is_array.Assign(1);
return result;
default:
result.texture_type.Assign(TextureType::Texture2D);
return result;
}
}
};
static_assert(std::is_trivially_copyable_v<SamplerDescriptor>);
class ConstBufferEngineInterface {
public:
virtual ~ConstBufferEngineInterface() = default;
virtual u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const = 0;
virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0;
virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
u64 offset) const = 0;
virtual SamplerDescriptor AccessSampler(u32 handle) const = 0;
virtual u32 GetBoundBuffer() const = 0;
virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0;
virtual const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const = 0;
};
} // namespace Tegra::Engines

View File

@@ -0,0 +1,17 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_types.h"
namespace Tegra::Engines {
struct ConstBufferInfo {
GPUVAddr address;
u32 size;
bool enabled;
};
} // namespace Tegra::Engines

View File

@@ -0,0 +1,22 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <type_traits>
#include "common/common_types.h"
namespace Tegra::Engines {
class EngineInterface {
public:
/// Write the value to the register identified by method.
virtual void CallMethod(u32 method, u32 method_argument, bool is_last_call) = 0;
/// Write multiple values to the register identified by method.
virtual void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) = 0;
};
} // namespace Tegra::Engines

View File

@@ -0,0 +1,52 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring>
#include "common/assert.h"
#include "video_core/engines/engine_upload.h"
#include "video_core/memory_manager.h"
#include "video_core/textures/decoders.h"
namespace Tegra::Engines::Upload {
State::State(MemoryManager& memory_manager_, Registers& regs_)
: regs{regs_}, memory_manager{memory_manager_} {}
State::~State() = default;
void State::ProcessExec(const bool is_linear_) {
write_offset = 0;
copy_size = regs.line_length_in * regs.line_count;
inner_buffer.resize(copy_size);
is_linear = is_linear_;
}
void State::ProcessData(const u32 data, const bool is_last_call) {
const u32 sub_copy_size = std::min(4U, copy_size - write_offset);
std::memcpy(&inner_buffer[write_offset], &data, sub_copy_size);
write_offset += sub_copy_size;
if (!is_last_call) {
return;
}
const GPUVAddr address{regs.dest.Address()};
if (is_linear) {
memory_manager.WriteBlock(address, inner_buffer.data(), copy_size);
} else {
UNIMPLEMENTED_IF(regs.dest.z != 0);
UNIMPLEMENTED_IF(regs.dest.depth != 1);
UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 0);
UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 0);
const std::size_t dst_size = Tegra::Texture::CalculateSize(
true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 0);
tmp_buffer.resize(dst_size);
memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y,
regs.dest.BlockHeight(), copy_size, inner_buffer.data(),
tmp_buffer.data());
memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
}
}
} // namespace Tegra::Engines::Upload

View File

@@ -0,0 +1,73 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <vector>
#include "common/bit_field.h"
#include "common/common_types.h"
namespace Tegra {
class MemoryManager;
}
namespace Tegra::Engines::Upload {
struct Registers {
u32 line_length_in;
u32 line_count;
struct {
u32 address_high;
u32 address_low;
u32 pitch;
union {
BitField<0, 4, u32> block_width;
BitField<4, 4, u32> block_height;
BitField<8, 4, u32> block_depth;
};
u32 width;
u32 height;
u32 depth;
u32 z;
u32 x;
u32 y;
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low);
}
u32 BlockWidth() const {
return block_width.Value();
}
u32 BlockHeight() const {
return block_height.Value();
}
u32 BlockDepth() const {
return block_depth.Value();
}
} dest;
};
class State {
public:
explicit State(MemoryManager& memory_manager_, Registers& regs_);
~State();
void ProcessExec(bool is_linear_);
void ProcessData(u32 data, bool is_last_call);
private:
u32 write_offset = 0;
u32 copy_size = 0;
std::vector<u8> inner_buffer;
std::vector<u8> tmp_buffer;
bool is_linear = false;
Registers& regs;
MemoryManager& memory_manager;
};
} // namespace Tegra::Engines::Upload

View File

@@ -0,0 +1,69 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "common/logging/log.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
namespace Tegra::Engines {
Fermi2D::Fermi2D() {
// Nvidia's OpenGL driver seems to assume these values
regs.src.depth = 1;
regs.dst.depth = 1;
}
Fermi2D::~Fermi2D() = default;
void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
rasterizer = &rasterizer_;
}
void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < Regs::NUM_REGS,
"Invalid Fermi2D register, increase the size of the Regs structure");
regs.reg_array[method] = method_argument;
if (method == FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1) {
Blit();
}
}
void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) {
for (u32 i = 0; i < amount; ++i) {
CallMethod(method, base_start[i], methods_pending - i <= 1);
}
}
void Fermi2D::Blit() {
LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
regs.src.Address(), regs.dst.Address());
UNIMPLEMENTED_IF_MSG(regs.operation != Operation::SrcCopy, "Operation is not copy");
UNIMPLEMENTED_IF_MSG(regs.src.layer != 0, "Source layer is not zero");
UNIMPLEMENTED_IF_MSG(regs.dst.layer != 0, "Destination layer is not zero");
UNIMPLEMENTED_IF_MSG(regs.src.depth != 1, "Source depth is not one");
UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled");
const auto& args = regs.pixels_from_memory;
const Config config{
.operation = regs.operation,
.filter = args.sample_mode.filter,
.dst_x0 = args.dst_x0,
.dst_y0 = args.dst_y0,
.dst_x1 = args.dst_x0 + args.dst_width,
.dst_y1 = args.dst_y0 + args.dst_height,
.src_x0 = static_cast<s32>(args.src_x0 >> 32),
.src_y0 = static_cast<s32>(args.src_y0 >> 32),
.src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32),
.src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32),
};
if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, config)) {
UNIMPLEMENTED();
}
}
} // namespace Tegra::Engines

352
src/video_core/engines/fermi_2d.h Executable file
View File

@@ -0,0 +1,352 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstddef>
#include "common/bit_field.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "common/math_util.h"
#include "video_core/engines/engine_interface.h"
#include "video_core/gpu.h"
namespace Tegra {
class MemoryManager;
}
namespace VideoCore {
class RasterizerInterface;
}
namespace Tegra::Engines {
/**
* This Engine is known as G80_2D. Documentation can be found in:
* https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml
* https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h
*/
#define FERMI2D_REG_INDEX(field_name) \
(offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32))
class Fermi2D final : public EngineInterface {
public:
explicit Fermi2D();
~Fermi2D();
/// Binds a rasterizer to this engine.
void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
/// Write the value to the register identified by method.
void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
/// Write multiple values to the register identified by method.
void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) override;
enum class Origin : u32 {
Center = 0,
Corner = 1,
};
enum class Filter : u32 {
Point = 0,
Bilinear = 1,
};
enum class Operation : u32 {
SrcCopyAnd = 0,
ROPAnd = 1,
Blend = 2,
SrcCopy = 3,
ROP = 4,
SrcCopyPremult = 5,
BlendPremult = 6,
};
enum class MemoryLayout : u32 {
BlockLinear = 0,
Pitch = 1,
};
enum class CpuIndexWrap : u32 {
Wrap = 0,
NoWrap = 1,
};
struct Surface {
RenderTargetFormat format;
MemoryLayout linear;
union {
BitField<0, 4, u32> block_width;
BitField<4, 4, u32> block_height;
BitField<8, 4, u32> block_depth;
};
u32 depth;
u32 layer;
u32 pitch;
u32 width;
u32 height;
u32 addr_upper;
u32 addr_lower;
[[nodiscard]] constexpr GPUVAddr Address() const noexcept {
return (static_cast<GPUVAddr>(addr_upper) << 32) | static_cast<GPUVAddr>(addr_lower);
}
};
static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
enum class SectorPromotion : u32 {
NoPromotion = 0,
PromoteTo2V = 1,
PromoteTo2H = 2,
PromoteTo4 = 3,
};
enum class NumTpcs : u32 {
All = 0,
One = 1,
};
enum class RenderEnableMode : u32 {
False = 0,
True = 1,
Conditional = 2,
RenderIfEqual = 3,
RenderIfNotEqual = 4,
};
enum class ColorKeyFormat : u32 {
A16R56G6B5 = 0,
A1R5G55B5 = 1,
A8R8G8B8 = 2,
A2R10G10B10 = 3,
Y8 = 4,
Y16 = 5,
Y32 = 6,
};
union Beta4 {
BitField<0, 8, u32> b;
BitField<8, 8, u32> g;
BitField<16, 8, u32> r;
BitField<24, 8, u32> a;
};
struct Point {
u32 x;
u32 y;
};
enum class PatternSelect : u32 {
MonoChrome8x8 = 0,
MonoChrome64x1 = 1,
MonoChrome1x64 = 2,
Color = 3,
};
enum class NotifyType : u32 {
WriteOnly = 0,
WriteThenAwaken = 1,
};
enum class MonochromePatternColorFormat : u32 {
A8X8R8G6B5 = 0,
A1R5G5B5 = 1,
A8R8G8B8 = 2,
A8Y8 = 3,
A8X8Y16 = 4,
Y32 = 5,
};
enum class MonochromePatternFormat : u32 {
CGA6_M1 = 0,
LE_M1 = 1,
};
union Regs {
static constexpr std::size_t NUM_REGS = 0x258;
struct {
u32 object;
INSERT_UNION_PADDING_WORDS(0x3F);
u32 no_operation;
NotifyType notify;
INSERT_UNION_PADDING_WORDS(0x2);
u32 wait_for_idle;
INSERT_UNION_PADDING_WORDS(0xB);
u32 pm_trigger;
INSERT_UNION_PADDING_WORDS(0xF);
u32 context_dma_notify;
u32 dst_context_dma;
u32 src_context_dma;
u32 semaphore_context_dma;
INSERT_UNION_PADDING_WORDS(0x1C);
Surface dst;
CpuIndexWrap pixels_from_cpu_index_wrap;
u32 kind2d_check_enable;
Surface src;
SectorPromotion pixels_from_memory_sector_promotion;
INSERT_UNION_PADDING_WORDS(0x1);
NumTpcs num_tpcs;
u32 render_enable_addr_upper;
u32 render_enable_addr_lower;
RenderEnableMode render_enable_mode;
INSERT_UNION_PADDING_WORDS(0x4);
u32 clip_x0;
u32 clip_y0;
u32 clip_width;
u32 clip_height;
BitField<0, 1, u32> clip_enable;
BitField<0, 3, ColorKeyFormat> color_key_format;
u32 color_key;
BitField<0, 1, u32> color_key_enable;
BitField<0, 8, u32> rop;
u32 beta1;
Beta4 beta4;
Operation operation;
union {
BitField<0, 6, u32> x;
BitField<8, 6, u32> y;
} pattern_offset;
BitField<0, 2, PatternSelect> pattern_select;
INSERT_UNION_PADDING_WORDS(0xC);
struct {
BitField<0, 3, MonochromePatternColorFormat> color_format;
BitField<0, 1, MonochromePatternFormat> format;
u32 color0;
u32 color1;
u32 pattern0;
u32 pattern1;
} monochrome_pattern;
struct {
std::array<u32, 0x40> X8R8G8B8;
std::array<u32, 0x20> R5G6B5;
std::array<u32, 0x20> X1R5G5B5;
std::array<u32, 0x10> Y8;
} color_pattern;
INSERT_UNION_PADDING_WORDS(0x10);
struct {
u32 prim_mode;
u32 prim_color_format;
u32 prim_color;
u32 line_tie_break_bits;
INSERT_UNION_PADDING_WORDS(0x14);
u32 prim_point_xy;
INSERT_UNION_PADDING_WORDS(0x7);
std::array<Point, 0x40> prim_point;
} render_solid;
struct {
u32 data_type;
u32 color_format;
u32 index_format;
u32 mono_format;
u32 wrap;
u32 color0;
u32 color1;
u32 mono_opacity;
INSERT_UNION_PADDING_WORDS(0x6);
u32 src_width;
u32 src_height;
u32 dx_du_frac;
u32 dx_du_int;
u32 dx_dv_frac;
u32 dy_dv_int;
u32 dst_x0_frac;
u32 dst_x0_int;
u32 dst_y0_frac;
u32 dst_y0_int;
u32 data;
} pixels_from_cpu;
INSERT_UNION_PADDING_WORDS(0x3);
u32 big_endian_control;
INSERT_UNION_PADDING_WORDS(0x3);
struct {
BitField<0, 3, u32> block_shape;
BitField<0, 5, u32> corral_size;
BitField<0, 1, u32> safe_overlap;
union {
BitField<0, 1, Origin> origin;
BitField<4, 1, Filter> filter;
} sample_mode;
INSERT_UNION_PADDING_WORDS(0x8);
s32 dst_x0;
s32 dst_y0;
s32 dst_width;
s32 dst_height;
s64 du_dx;
s64 dv_dy;
s64 src_x0;
s64 src_y0;
} pixels_from_memory;
};
std::array<u32, NUM_REGS> reg_array;
} regs{};
struct Config {
Operation operation;
Filter filter;
s32 dst_x0;
s32 dst_y0;
s32 dst_x1;
s32 dst_y1;
s32 src_x0;
s32 src_y0;
s32 src_x1;
s32 src_y1;
};
private:
VideoCore::RasterizerInterface* rasterizer;
/// Performs the copy from the source surface to the destination surface as configured in the
/// registers.
void Blit();
};
#define ASSERT_REG_POSITION(field_name, position) \
static_assert(offsetof(Fermi2D::Regs, field_name) == position, \
"Field " #field_name " has invalid position")
ASSERT_REG_POSITION(object, 0x0);
ASSERT_REG_POSITION(no_operation, 0x100);
ASSERT_REG_POSITION(notify, 0x104);
ASSERT_REG_POSITION(wait_for_idle, 0x110);
ASSERT_REG_POSITION(pm_trigger, 0x140);
ASSERT_REG_POSITION(context_dma_notify, 0x180);
ASSERT_REG_POSITION(dst_context_dma, 0x184);
ASSERT_REG_POSITION(src_context_dma, 0x188);
ASSERT_REG_POSITION(semaphore_context_dma, 0x18C);
ASSERT_REG_POSITION(dst, 0x200);
ASSERT_REG_POSITION(pixels_from_cpu_index_wrap, 0x228);
ASSERT_REG_POSITION(kind2d_check_enable, 0x22C);
ASSERT_REG_POSITION(src, 0x230);
ASSERT_REG_POSITION(pixels_from_memory_sector_promotion, 0x258);
ASSERT_REG_POSITION(num_tpcs, 0x260);
ASSERT_REG_POSITION(render_enable_addr_upper, 0x264);
ASSERT_REG_POSITION(render_enable_addr_lower, 0x268);
ASSERT_REG_POSITION(clip_x0, 0x280);
ASSERT_REG_POSITION(clip_y0, 0x284);
ASSERT_REG_POSITION(clip_width, 0x288);
ASSERT_REG_POSITION(clip_height, 0x28c);
ASSERT_REG_POSITION(clip_enable, 0x290);
ASSERT_REG_POSITION(color_key_format, 0x294);
ASSERT_REG_POSITION(color_key, 0x298);
ASSERT_REG_POSITION(rop, 0x2A0);
ASSERT_REG_POSITION(beta1, 0x2A4);
ASSERT_REG_POSITION(beta4, 0x2A8);
ASSERT_REG_POSITION(operation, 0x2AC);
ASSERT_REG_POSITION(pattern_offset, 0x2B0);
ASSERT_REG_POSITION(pattern_select, 0x2B4);
ASSERT_REG_POSITION(monochrome_pattern, 0x2E8);
ASSERT_REG_POSITION(color_pattern, 0x300);
ASSERT_REG_POSITION(render_solid, 0x580);
ASSERT_REG_POSITION(pixels_from_cpu, 0x800);
ASSERT_REG_POSITION(big_endian_control, 0x870);
ASSERT_REG_POSITION(pixels_from_memory, 0x880);
#undef ASSERT_REG_POSITION
} // namespace Tegra::Engines

View File

@@ -0,0 +1,127 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <bitset>
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
#include "video_core/engines/kepler_compute.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/shader_type.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
#include "video_core/textures/decoders.h"
namespace Tegra::Engines {
KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manager_)
: system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} {}
KeplerCompute::~KeplerCompute() = default;
void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
rasterizer = &rasterizer_;
}
void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < Regs::NUM_REGS,
"Invalid KeplerCompute register, increase the size of the Regs structure");
regs.reg_array[method] = method_argument;
switch (method) {
case KEPLER_COMPUTE_REG_INDEX(exec_upload): {
upload_state.ProcessExec(regs.exec_upload.linear != 0);
break;
}
case KEPLER_COMPUTE_REG_INDEX(data_upload): {
upload_state.ProcessData(method_argument, is_last_call);
if (is_last_call) {
system.GPU().Maxwell3D().OnMemoryWrite();
}
break;
}
case KEPLER_COMPUTE_REG_INDEX(launch):
ProcessLaunch();
break;
default:
break;
}
}
void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) {
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
}
}
u32 KeplerCompute::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const {
ASSERT(stage == ShaderType::Compute);
const auto& buffer = launch_description.const_buffer_config[const_buffer];
u32 result;
std::memcpy(&result, memory_manager.GetPointer(buffer.Address() + offset), sizeof(u32));
return result;
}
SamplerDescriptor KeplerCompute::AccessBoundSampler(ShaderType stage, u64 offset) const {
return AccessBindlessSampler(stage, regs.tex_cb_index, offset * sizeof(Texture::TextureHandle));
}
SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 const_buffer,
u64 offset) const {
ASSERT(stage == ShaderType::Compute);
const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer];
const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset;
return AccessSampler(memory_manager.Read<u32>(tex_info_address));
}
SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
const Texture::TextureHandle tex_handle{handle};
const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id);
const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tic);
result.is_shadow.Assign(tsc.depth_compare_enabled.Value());
return result;
}
VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() {
return rasterizer->AccessGuestDriverProfile();
}
const VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() const {
return rasterizer->AccessGuestDriverProfile();
}
void KeplerCompute::ProcessLaunch() {
const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address();
memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description,
LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32));
const GPUVAddr code_addr = regs.code_loc.Address() + launch_description.program_start;
LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", code_addr);
rasterizer->DispatchCompute(code_addr);
}
Texture::TICEntry KeplerCompute::GetTICEntry(u32 tic_index) const {
const GPUVAddr tic_address_gpu{regs.tic.Address() + tic_index * sizeof(Texture::TICEntry)};
Texture::TICEntry tic_entry;
memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
return tic_entry;
}
Texture::TSCEntry KeplerCompute::GetTSCEntry(u32 tsc_index) const {
const GPUVAddr tsc_address_gpu{regs.tsc.Address() + tsc_index * sizeof(Texture::TSCEntry)};
Texture::TSCEntry tsc_entry;
memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
return tsc_entry;
}
} // namespace Tegra::Engines

View File

@@ -0,0 +1,269 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstddef>
#include <vector>
#include "common/bit_field.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/engines/const_buffer_engine_interface.h"
#include "video_core/engines/engine_interface.h"
#include "video_core/engines/engine_upload.h"
#include "video_core/engines/shader_type.h"
#include "video_core/gpu.h"
#include "video_core/textures/texture.h"
namespace Core {
class System;
}
namespace Tegra {
class MemoryManager;
}
namespace VideoCore {
class RasterizerInterface;
}
namespace Tegra::Engines {
/**
* This Engine is known as GK104_Compute. Documentation can be found in:
* https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_compute.xml
* https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h
*/
#define KEPLER_COMPUTE_REG_INDEX(field_name) \
(offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32))
class KeplerCompute final : public ConstBufferEngineInterface, public EngineInterface {
public:
explicit KeplerCompute(Core::System& system, MemoryManager& memory_manager);
~KeplerCompute();
/// Binds a rasterizer to this engine.
void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
static constexpr std::size_t NumConstBuffers = 8;
struct Regs {
static constexpr std::size_t NUM_REGS = 0xCF8;
union {
struct {
INSERT_UNION_PADDING_WORDS(0x60);
Upload::Registers upload;
struct {
union {
BitField<0, 1, u32> linear;
};
} exec_upload;
u32 data_upload;
INSERT_UNION_PADDING_WORDS(0x3F);
struct {
u32 address;
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address) << 8));
}
} launch_desc_loc;
INSERT_UNION_PADDING_WORDS(0x1);
u32 launch;
INSERT_UNION_PADDING_WORDS(0x4A7);
struct {
u32 address_high;
u32 address_low;
u32 limit;
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
} tsc;
INSERT_UNION_PADDING_WORDS(0x3);
struct {
u32 address_high;
u32 address_low;
u32 limit;
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
} tic;
INSERT_UNION_PADDING_WORDS(0x22);
struct {
u32 address_high;
u32 address_low;
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
} code_loc;
INSERT_UNION_PADDING_WORDS(0x3FE);
u32 tex_cb_index;
INSERT_UNION_PADDING_WORDS(0x374);
};
std::array<u32, NUM_REGS> reg_array;
};
} regs{};
struct LaunchParams {
static constexpr std::size_t NUM_LAUNCH_PARAMETERS = 0x40;
INSERT_PADDING_WORDS(0x8);
u32 program_start;
INSERT_PADDING_WORDS(0x2);
BitField<30, 1, u32> linked_tsc;
BitField<0, 31, u32> grid_dim_x;
union {
BitField<0, 16, u32> grid_dim_y;
BitField<16, 16, u32> grid_dim_z;
};
INSERT_PADDING_WORDS(0x3);
BitField<0, 18, u32> shared_alloc;
BitField<16, 16, u32> block_dim_x;
union {
BitField<0, 16, u32> block_dim_y;
BitField<16, 16, u32> block_dim_z;
};
union {
BitField<0, 8, u32> const_buffer_enable_mask;
BitField<29, 2, u32> cache_layout;
};
INSERT_PADDING_WORDS(0x8);
struct ConstBufferConfig {
u32 address_low;
union {
BitField<0, 8, u32> address_high;
BitField<15, 17, u32> size;
};
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) |
address_low);
}
};
std::array<ConstBufferConfig, NumConstBuffers> const_buffer_config;
union {
BitField<0, 20, u32> local_pos_alloc;
BitField<27, 5, u32> barrier_alloc;
};
union {
BitField<0, 20, u32> local_neg_alloc;
BitField<24, 5, u32> gpr_alloc;
};
union {
BitField<0, 20, u32> local_crs_alloc;
BitField<24, 5, u32> sass_version;
};
INSERT_PADDING_WORDS(0x10);
} launch_description{};
struct {
u32 write_offset = 0;
u32 copy_size = 0;
std::vector<u8> inner_buffer;
} state{};
static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32),
"KeplerCompute Regs has wrong size");
static_assert(sizeof(LaunchParams) == LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32),
"KeplerCompute LaunchParams has wrong size");
/// Write the value to the register identified by method.
void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
/// Write multiple values to the register identified by method.
void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) override;
u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override;
SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override;
SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
u64 offset) const override;
SamplerDescriptor AccessSampler(u32 handle) const override;
u32 GetBoundBuffer() const override {
return regs.tex_cb_index;
}
VideoCore::GuestDriverProfile& AccessGuestDriverProfile() override;
const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override;
private:
void ProcessLaunch();
/// Retrieves information about a specific TIC entry from the TIC buffer.
Texture::TICEntry GetTICEntry(u32 tic_index) const;
/// Retrieves information about a specific TSC entry from the TSC buffer.
Texture::TSCEntry GetTSCEntry(u32 tsc_index) const;
Core::System& system;
MemoryManager& memory_manager;
VideoCore::RasterizerInterface* rasterizer = nullptr;
Upload::State upload_state;
};
#define ASSERT_REG_POSITION(field_name, position) \
static_assert(offsetof(KeplerCompute::Regs, field_name) == position * 4, \
"Field " #field_name " has invalid position")
#define ASSERT_LAUNCH_PARAM_POSITION(field_name, position) \
static_assert(offsetof(KeplerCompute::LaunchParams, field_name) == position * 4, \
"Field " #field_name " has invalid position")
ASSERT_REG_POSITION(upload, 0x60);
ASSERT_REG_POSITION(exec_upload, 0x6C);
ASSERT_REG_POSITION(data_upload, 0x6D);
ASSERT_REG_POSITION(launch, 0xAF);
ASSERT_REG_POSITION(tsc, 0x557);
ASSERT_REG_POSITION(tic, 0x55D);
ASSERT_REG_POSITION(code_loc, 0x582);
ASSERT_REG_POSITION(tex_cb_index, 0x982);
ASSERT_LAUNCH_PARAM_POSITION(program_start, 0x8);
ASSERT_LAUNCH_PARAM_POSITION(grid_dim_x, 0xC);
ASSERT_LAUNCH_PARAM_POSITION(shared_alloc, 0x11);
ASSERT_LAUNCH_PARAM_POSITION(block_dim_x, 0x12);
ASSERT_LAUNCH_PARAM_POSITION(const_buffer_enable_mask, 0x14);
ASSERT_LAUNCH_PARAM_POSITION(const_buffer_config, 0x1D);
#undef ASSERT_REG_POSITION
} // namespace Tegra::Engines

View File

@@ -0,0 +1,50 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
#include "video_core/engines/kepler_memory.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
#include "video_core/textures/decoders.h"
namespace Tegra::Engines {
KeplerMemory::KeplerMemory(Core::System& system_, MemoryManager& memory_manager)
: system{system_}, upload_state{memory_manager, regs.upload} {}
KeplerMemory::~KeplerMemory() = default;
void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < Regs::NUM_REGS,
"Invalid KeplerMemory register, increase the size of the Regs structure");
regs.reg_array[method] = method_argument;
switch (method) {
case KEPLERMEMORY_REG_INDEX(exec): {
upload_state.ProcessExec(regs.exec.linear != 0);
break;
}
case KEPLERMEMORY_REG_INDEX(data): {
upload_state.ProcessData(method_argument, is_last_call);
if (is_last_call) {
system.GPU().Maxwell3D().OnMemoryWrite();
}
break;
}
}
}
void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) {
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
}
}
} // namespace Tegra::Engines

View File

@@ -0,0 +1,85 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstddef>
#include <vector>
#include "common/bit_field.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/engines/engine_interface.h"
#include "video_core/engines/engine_upload.h"
#include "video_core/gpu.h"
namespace Core {
class System;
}
namespace Tegra {
class MemoryManager;
}
namespace Tegra::Engines {
/**
* This Engine is known as P2MF. Documentation can be found in:
* https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_p2mf.xml
* https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h
*/
#define KEPLERMEMORY_REG_INDEX(field_name) \
(offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32))
class KeplerMemory final : public EngineInterface {
public:
explicit KeplerMemory(Core::System& system_, MemoryManager& memory_manager);
~KeplerMemory();
/// Write the value to the register identified by method.
void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
/// Write multiple values to the register identified by method.
void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) override;
struct Regs {
static constexpr size_t NUM_REGS = 0x7F;
union {
struct {
INSERT_UNION_PADDING_WORDS(0x60);
Upload::Registers upload;
struct {
union {
BitField<0, 1, u32> linear;
};
} exec;
u32 data;
INSERT_UNION_PADDING_WORDS(0x11);
};
std::array<u32, NUM_REGS> reg_array;
};
} regs{};
private:
Core::System& system;
Upload::State upload_state;
};
#define ASSERT_REG_POSITION(field_name, position) \
static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4, \
"Field " #field_name " has invalid position")
ASSERT_REG_POSITION(upload, 0x60);
ASSERT_REG_POSITION(exec, 0x6C);
ASSERT_REG_POSITION(data, 0x6D);
#undef ASSERT_REG_POSITION
} // namespace Tegra::Engines

View File

@@ -0,0 +1,708 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring>
#include <optional>
#include "common/assert.h"
#include "core/core.h"
#include "core/core_timing.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/shader_type.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/textures/texture.h"
namespace Tegra::Engines {
using VideoCore::QueryType;
/// First register id that is actually a Macro call.
constexpr u32 MacroRegistersStart = 0xE00;
Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
: system{system_}, memory_manager{memory_manager_}, macro_engine{GetMacroEngine(*this)},
upload_state{memory_manager, regs.upload} {
dirty.flags.flip();
InitializeRegisterDefaults();
}
Maxwell3D::~Maxwell3D() = default;
void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
rasterizer = &rasterizer_;
}
void Maxwell3D::InitializeRegisterDefaults() {
// Initializes registers to their default values - what games expect them to be at boot. This is
// for certain registers that may not be explicitly set by games.
// Reset all registers to zero
std::memset(&regs, 0, sizeof(regs));
// Depth range near/far is not always set, but is expected to be the default 0.0f, 1.0f. This is
// needed for ARMS.
for (auto& viewport : regs.viewports) {
viewport.depth_range_near = 0.0f;
viewport.depth_range_far = 1.0f;
}
for (auto& viewport : regs.viewport_transform) {
viewport.swizzle.x.Assign(Regs::ViewportSwizzle::PositiveX);
viewport.swizzle.y.Assign(Regs::ViewportSwizzle::PositiveY);
viewport.swizzle.z.Assign(Regs::ViewportSwizzle::PositiveZ);
viewport.swizzle.w.Assign(Regs::ViewportSwizzle::PositiveW);
}
// Doom and Bomberman seems to use the uninitialized registers and just enable blend
// so initialize blend registers with sane values
regs.blend.equation_rgb = Regs::Blend::Equation::Add;
regs.blend.factor_source_rgb = Regs::Blend::Factor::One;
regs.blend.factor_dest_rgb = Regs::Blend::Factor::Zero;
regs.blend.equation_a = Regs::Blend::Equation::Add;
regs.blend.factor_source_a = Regs::Blend::Factor::One;
regs.blend.factor_dest_a = Regs::Blend::Factor::Zero;
for (auto& blend : regs.independent_blend) {
blend.equation_rgb = Regs::Blend::Equation::Add;
blend.factor_source_rgb = Regs::Blend::Factor::One;
blend.factor_dest_rgb = Regs::Blend::Factor::Zero;
blend.equation_a = Regs::Blend::Equation::Add;
blend.factor_source_a = Regs::Blend::Factor::One;
blend.factor_dest_a = Regs::Blend::Factor::Zero;
}
regs.stencil_front_op_fail = Regs::StencilOp::Keep;
regs.stencil_front_op_zfail = Regs::StencilOp::Keep;
regs.stencil_front_op_zpass = Regs::StencilOp::Keep;
regs.stencil_front_func_func = Regs::ComparisonOp::Always;
regs.stencil_front_func_mask = 0xFFFFFFFF;
regs.stencil_front_mask = 0xFFFFFFFF;
regs.stencil_two_side_enable = 1;
regs.stencil_back_op_fail = Regs::StencilOp::Keep;
regs.stencil_back_op_zfail = Regs::StencilOp::Keep;
regs.stencil_back_op_zpass = Regs::StencilOp::Keep;
regs.stencil_back_func_func = Regs::ComparisonOp::Always;
regs.stencil_back_func_mask = 0xFFFFFFFF;
regs.stencil_back_mask = 0xFFFFFFFF;
regs.depth_test_func = Regs::ComparisonOp::Always;
regs.front_face = Regs::FrontFace::CounterClockWise;
regs.cull_face = Regs::CullFace::Back;
// TODO(Rodrigo): Most games do not set a point size. I think this is a case of a
// register carrying a default value. Assume it's OpenGL's default (1).
regs.point_size = 1.0f;
// TODO(bunnei): Some games do not initialize the color masks (e.g. Sonic Mania). Assuming a
// default of enabled fixes rendering here.
for (auto& color_mask : regs.color_mask) {
color_mask.R.Assign(1);
color_mask.G.Assign(1);
color_mask.B.Assign(1);
color_mask.A.Assign(1);
}
for (auto& format : regs.vertex_attrib_format) {
format.constant.Assign(1);
}
// NVN games expect these values to be enabled at boot
regs.rasterize_enable = 1;
regs.rt_separate_frag_data = 1;
regs.framebuffer_srgb = 1;
regs.line_width_aliased = 1.0f;
regs.line_width_smooth = 1.0f;
regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise;
regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill;
regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill;
shadow_state = regs;
mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_end_gl)] = true;
mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)] = true;
mme_inline[MAXWELL3D_REG_INDEX(vertex_buffer.count)] = true;
mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
}
void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
if (executing_macro == 0) {
// A macro call must begin by writing the macro method's register, not its argument.
ASSERT_MSG((method % 2) == 0,
"Can't start macro execution by writing to the ARGS register");
executing_macro = method;
}
macro_params.insert(macro_params.end(), base_start, base_start + amount);
// Call the macro when there are no more parameters in the command buffer
if (is_last_call) {
CallMacroMethod(executing_macro, macro_params);
macro_params.clear();
}
}
u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
// Keep track of the register value in shadow_state when requested.
const auto control = shadow_state.shadow_ram_control;
if (control == Regs::ShadowRamControl::Track ||
control == Regs::ShadowRamControl::TrackWithFilter) {
shadow_state.reg_array[method] = argument;
return argument;
}
if (control == Regs::ShadowRamControl::Replay) {
return shadow_state.reg_array[method];
}
return argument;
}
void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) {
if (regs.reg_array[method] == argument) {
return;
}
regs.reg_array[method] = argument;
for (const auto& table : dirty.tables) {
dirty.flags[table[method]] = true;
}
}
void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument,
bool is_last_call) {
switch (method) {
case MAXWELL3D_REG_INDEX(wait_for_idle):
return rasterizer->WaitForIdle();
case MAXWELL3D_REG_INDEX(shadow_ram_control):
shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(nonshadow_argument);
return;
case MAXWELL3D_REG_INDEX(macros.data):
return macro_engine->AddCode(regs.macros.upload_address, argument);
case MAXWELL3D_REG_INDEX(macros.bind):
return ProcessMacroBind(argument);
case MAXWELL3D_REG_INDEX(firmware[4]):
return ProcessFirmwareCall4();
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
return StartCBData(method);
case MAXWELL3D_REG_INDEX(cb_bind[0]):
return ProcessCBBind(0);
case MAXWELL3D_REG_INDEX(cb_bind[1]):
return ProcessCBBind(1);
case MAXWELL3D_REG_INDEX(cb_bind[2]):
return ProcessCBBind(2);
case MAXWELL3D_REG_INDEX(cb_bind[3]):
return ProcessCBBind(3);
case MAXWELL3D_REG_INDEX(cb_bind[4]):
return ProcessCBBind(4);
case MAXWELL3D_REG_INDEX(draw.vertex_end_gl):
return DrawArrays();
case MAXWELL3D_REG_INDEX(clear_buffers):
return ProcessClearBuffers();
case MAXWELL3D_REG_INDEX(query.query_get):
return ProcessQueryGet();
case MAXWELL3D_REG_INDEX(condition.mode):
return ProcessQueryCondition();
case MAXWELL3D_REG_INDEX(counter_reset):
return ProcessCounterReset();
case MAXWELL3D_REG_INDEX(sync_info):
return ProcessSyncPoint();
case MAXWELL3D_REG_INDEX(exec_upload):
return upload_state.ProcessExec(regs.exec_upload.linear != 0);
case MAXWELL3D_REG_INDEX(data_upload):
upload_state.ProcessData(argument, is_last_call);
if (is_last_call) {
OnMemoryWrite();
}
return;
case MAXWELL3D_REG_INDEX(fragment_barrier):
return rasterizer->FragmentBarrier();
case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
return rasterizer->TiledCacheBarrier();
}
}
void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
// Reset the current macro.
executing_macro = 0;
// Lookup the macro offset
const u32 entry =
((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
// Execute the current macro.
macro_engine->Execute(*this, macro_positions[entry], parameters);
if (mme_draw.current_mode != MMEDrawMode::Undefined) {
FlushMMEInlineDraw();
}
}
void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
if (method == cb_data_state.current) {
regs.reg_array[method] = method_argument;
ProcessCBData(method_argument);
return;
} else if (cb_data_state.current != null_cb_data) {
FinishCBData();
}
// It is an error to write to a register other than the current macro's ARG register before it
// has finished execution.
if (executing_macro != 0) {
ASSERT(method == executing_macro + 1);
}
// Methods after 0xE00 are special, they're actually triggers for some microcode that was
// uploaded to the GPU during initialization.
if (method >= MacroRegistersStart) {
ProcessMacro(method, &method_argument, 1, is_last_call);
return;
}
ASSERT_MSG(method < Regs::NUM_REGS,
"Invalid Maxwell3D register, increase the size of the Regs structure");
const u32 argument = ProcessShadowRam(method, method_argument);
ProcessDirtyRegisters(method, argument);
ProcessMethodCall(method, argument, method_argument, is_last_call);
}
void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) {
// Methods after 0xE00 are special, they're actually triggers for some microcode that was
// uploaded to the GPU during initialization.
if (method >= MacroRegistersStart) {
ProcessMacro(method, base_start, amount, amount == methods_pending);
return;
}
switch (method) {
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
ProcessCBMultiData(method, base_start, amount);
break;
default:
for (std::size_t i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
}
break;
}
}
void Maxwell3D::StepInstance(const MMEDrawMode expected_mode, const u32 count) {
if (mme_draw.current_mode == MMEDrawMode::Undefined) {
if (mme_draw.gl_begin_consume) {
mme_draw.current_mode = expected_mode;
mme_draw.current_count = count;
mme_draw.instance_count = 1;
mme_draw.gl_begin_consume = false;
mme_draw.gl_end_count = 0;
}
return;
} else {
if (mme_draw.current_mode == expected_mode && count == mme_draw.current_count &&
mme_draw.instance_mode && mme_draw.gl_begin_consume) {
mme_draw.instance_count++;
mme_draw.gl_begin_consume = false;
return;
} else {
FlushMMEInlineDraw();
}
}
// Tail call in case it needs to retry.
StepInstance(expected_mode, count);
}
void Maxwell3D::CallMethodFromMME(u32 method, u32 method_argument) {
if (mme_inline[method]) {
regs.reg_array[method] = method_argument;
if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count) ||
method == MAXWELL3D_REG_INDEX(index_array.count)) {
const MMEDrawMode expected_mode = method == MAXWELL3D_REG_INDEX(vertex_buffer.count)
? MMEDrawMode::Array
: MMEDrawMode::Indexed;
StepInstance(expected_mode, method_argument);
} else if (method == MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)) {
mme_draw.instance_mode =
(regs.draw.instance_next != 0) || (regs.draw.instance_cont != 0);
mme_draw.gl_begin_consume = true;
} else {
mme_draw.gl_end_count++;
}
} else {
if (mme_draw.current_mode != MMEDrawMode::Undefined) {
FlushMMEInlineDraw();
}
CallMethod(method, method_argument, true);
}
}
void Maxwell3D::FlushMMEInlineDraw() {
LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(),
regs.vertex_buffer.count);
ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
ASSERT(mme_draw.instance_count == mme_draw.gl_end_count);
// Both instance configuration registers can not be set at the same time.
ASSERT_MSG(!regs.draw.instance_next || !regs.draw.instance_cont,
"Illegal combination of instancing parameters");
const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed;
if (ShouldExecute()) {
rasterizer->Draw(is_indexed, true);
}
// TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if
// the game is trying to draw indexed or direct mode. This needs to be verified on HW still -
// it's possible that it is incorrect and that there is some other register used to specify the
// drawing mode.
if (is_indexed) {
regs.index_array.count = 0;
} else {
regs.vertex_buffer.count = 0;
}
mme_draw.current_mode = MMEDrawMode::Undefined;
mme_draw.current_count = 0;
mme_draw.instance_count = 0;
mme_draw.instance_mode = false;
mme_draw.gl_begin_consume = false;
mme_draw.gl_end_count = 0;
}
void Maxwell3D::ProcessMacroUpload(u32 data) {
macro_engine->AddCode(regs.macros.upload_address++, data);
}
void Maxwell3D::ProcessMacroBind(u32 data) {
macro_positions[regs.macros.entry++] = data;
}
void Maxwell3D::ProcessFirmwareCall4() {
LOG_WARNING(HW_GPU, "(STUBBED) called");
// Firmware call 4 is a blob that changes some registers depending on its parameters.
// These registers don't affect emulation and so are stubbed by setting 0xd00 to 1.
regs.reg_array[0xd00] = 1;
}
void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
struct LongQueryResult {
u64_le value;
u64_le timestamp;
};
static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size");
const GPUVAddr sequence_address{regs.query.QueryAddress()};
if (long_query) {
// Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast
// GPU, this command may actually take a while to complete in real hardware due to GPU
// wait queues.
LongQueryResult query_result{payload, system.GPU().GetTicks()};
memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
} else {
memory_manager.Write<u32>(sequence_address, static_cast<u32>(payload));
}
}
void Maxwell3D::ProcessQueryGet() {
// TODO(Subv): Support the other query units.
if (regs.query.query_get.unit != Regs::QueryUnit::Crop) {
LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented");
}
switch (regs.query.query_get.operation) {
case Regs::QueryOperation::Release:
if (regs.query.query_get.fence == 1) {
rasterizer->SignalSemaphore(regs.query.QueryAddress(), regs.query.query_sequence);
} else {
StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0);
}
break;
case Regs::QueryOperation::Acquire:
// TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that
// matches the current payload.
UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");
break;
case Regs::QueryOperation::Counter:
if (const std::optional<u64> result = GetQueryResult()) {
// If the query returns an empty optional it means it's cached and deferred.
// In this case we have a non-empty result, so we stamp it immediately.
StampQueryResult(*result, regs.query.query_get.short_query == 0);
}
break;
case Regs::QueryOperation::Trap:
UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");
break;
default:
UNIMPLEMENTED_MSG("Unknown query operation");
break;
}
}
void Maxwell3D::ProcessQueryCondition() {
const GPUVAddr condition_address{regs.condition.Address()};
switch (regs.condition.mode) {
case Regs::ConditionMode::Always: {
execute_on = true;
break;
}
case Regs::ConditionMode::Never: {
execute_on = false;
break;
}
case Regs::ConditionMode::ResNonZero: {
Regs::QueryCompare cmp;
memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
break;
}
case Regs::ConditionMode::Equal: {
Regs::QueryCompare cmp;
memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
execute_on =
cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
break;
}
case Regs::ConditionMode::NotEqual: {
Regs::QueryCompare cmp;
memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
execute_on =
cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
break;
}
default: {
UNIMPLEMENTED_MSG("Uninplemented Condition Mode!");
execute_on = true;
break;
}
}
}
void Maxwell3D::ProcessCounterReset() {
switch (regs.counter_reset) {
case Regs::CounterReset::SampleCnt:
rasterizer->ResetCounter(QueryType::SamplesPassed);
break;
default:
LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.counter_reset);
break;
}
}
void Maxwell3D::ProcessSyncPoint() {
const u32 sync_point = regs.sync_info.sync_point.Value();
const u32 increment = regs.sync_info.increment.Value();
[[maybe_unused]] const u32 cache_flush = regs.sync_info.unknown.Value();
if (increment) {
rasterizer->SignalSyncPoint(sync_point);
}
}
void Maxwell3D::DrawArrays() {
LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(),
regs.vertex_buffer.count);
ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
// Both instance configuration registers can not be set at the same time.
ASSERT_MSG(!regs.draw.instance_next || !regs.draw.instance_cont,
"Illegal combination of instancing parameters");
if (regs.draw.instance_next) {
// Increment the current instance *before* drawing.
state.current_instance += 1;
} else if (!regs.draw.instance_cont) {
// Reset the current instance to 0.
state.current_instance = 0;
}
const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count};
if (ShouldExecute()) {
rasterizer->Draw(is_indexed, false);
}
// TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if
// the game is trying to draw indexed or direct mode. This needs to be verified on HW still -
// it's possible that it is incorrect and that there is some other register used to specify the
// drawing mode.
if (is_indexed) {
regs.index_array.count = 0;
} else {
regs.vertex_buffer.count = 0;
}
}
std::optional<u64> Maxwell3D::GetQueryResult() {
switch (regs.query.query_get.select) {
case Regs::QuerySelect::Zero:
return 0;
case Regs::QuerySelect::SamplesPassed:
// Deferred.
rasterizer->Query(regs.query.QueryAddress(), QueryType::SamplesPassed,
system.GPU().GetTicks());
return std::nullopt;
default:
LOG_DEBUG(HW_GPU, "Unimplemented query select type {}",
regs.query.query_get.select.Value());
return 1;
}
}
void Maxwell3D::ProcessCBBind(std::size_t stage_index) {
// Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
auto& shader = state.shader_stages[stage_index];
auto& bind_data = regs.cb_bind[stage_index];
ASSERT(bind_data.index < Regs::MaxConstBuffers);
auto& buffer = shader.const_buffers[bind_data.index];
buffer.enabled = bind_data.valid.Value() != 0;
buffer.address = regs.const_buffer.BufferAddress();
buffer.size = regs.const_buffer.cb_size;
}
void Maxwell3D::ProcessCBData(u32 value) {
const u32 id = cb_data_state.id;
cb_data_state.buffer[id][cb_data_state.counter] = value;
// Increment the current buffer position.
regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
cb_data_state.counter++;
}
void Maxwell3D::StartCBData(u32 method) {
constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]);
cb_data_state.start_pos = regs.const_buffer.cb_pos;
cb_data_state.id = method - first_cb_data;
cb_data_state.current = method;
cb_data_state.counter = 0;
ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]);
}
void Maxwell3D::ProcessCBMultiData(u32 method, const u32* start_base, u32 amount) {
if (cb_data_state.current != method) {
if (cb_data_state.current != null_cb_data) {
FinishCBData();
}
constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]);
cb_data_state.start_pos = regs.const_buffer.cb_pos;
cb_data_state.id = method - first_cb_data;
cb_data_state.current = method;
cb_data_state.counter = 0;
}
const std::size_t id = cb_data_state.id;
const std::size_t size = amount;
std::size_t i = 0;
for (; i < size; i++) {
cb_data_state.buffer[id][cb_data_state.counter] = start_base[i];
cb_data_state.counter++;
}
// Increment the current buffer position.
regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4 * amount;
}
void Maxwell3D::FinishCBData() {
// Write the input value to the current const buffer at the current position.
const GPUVAddr buffer_address = regs.const_buffer.BufferAddress();
ASSERT(buffer_address != 0);
// Don't allow writing past the end of the buffer.
ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size);
const GPUVAddr address{buffer_address + cb_data_state.start_pos};
const std::size_t size = regs.const_buffer.cb_pos - cb_data_state.start_pos;
const u32 id = cb_data_state.id;
memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size);
OnMemoryWrite();
cb_data_state.id = null_cb_data;
cb_data_state.current = null_cb_data;
}
Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
const GPUVAddr tic_address_gpu{regs.tic.Address() + tic_index * sizeof(Texture::TICEntry)};
Texture::TICEntry tic_entry;
memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
return tic_entry;
}
Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
const GPUVAddr tsc_address_gpu{regs.tsc.Address() + tsc_index * sizeof(Texture::TSCEntry)};
Texture::TSCEntry tsc_entry;
memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
return tsc_entry;
}
u32 Maxwell3D::GetRegisterValue(u32 method) const {
ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register");
return regs.reg_array[method];
}
void Maxwell3D::ProcessClearBuffers() {
rasterizer->Clear();
}
u32 Maxwell3D::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const {
ASSERT(stage != ShaderType::Compute);
const auto& shader_stage = state.shader_stages[static_cast<std::size_t>(stage)];
const auto& buffer = shader_stage.const_buffers[const_buffer];
return memory_manager.Read<u32>(buffer.address + offset);
}
SamplerDescriptor Maxwell3D::AccessBoundSampler(ShaderType stage, u64 offset) const {
return AccessBindlessSampler(stage, regs.tex_cb_index, offset * sizeof(Texture::TextureHandle));
}
SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_buffer,
u64 offset) const {
ASSERT(stage != ShaderType::Compute);
const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
const auto& tex_info_buffer = shader.const_buffers[const_buffer];
const GPUVAddr tex_info_address = tex_info_buffer.address + offset;
return AccessSampler(memory_manager.Read<u32>(tex_info_address));
}
SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
const Texture::TextureHandle tex_handle{handle};
const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id);
const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tic);
result.is_shadow.Assign(tsc.depth_compare_enabled.Value());
return result;
}
VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() {
return rasterizer->AccessGuestDriverProfile();
}
const VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() const {
return rasterizer->AccessGuestDriverProfile();
}
} // namespace Tegra::Engines

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,219 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
#include "core/settings.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/maxwell_dma.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_base.h"
#include "video_core/textures/decoders.h"
namespace Tegra::Engines {
using namespace Texture;
MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_)
: system{system_}, memory_manager{memory_manager_} {}
MaxwellDMA::~MaxwellDMA() = default;
void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register");
regs.reg_array[method] = method_argument;
if (method == offsetof(Regs, launch_dma) / sizeof(u32)) {
Launch();
}
}
void MaxwellDMA::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) {
for (size_t i = 0; i < amount; ++i) {
CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
}
}
void MaxwellDMA::Launch() {
LOG_TRACE(Render_OpenGL, "DMA copy 0x{:x} -> 0x{:x}", static_cast<GPUVAddr>(regs.offset_in),
static_cast<GPUVAddr>(regs.offset_out));
// TODO(Subv): Perform more research and implement all features of this engine.
const LaunchDMA& launch = regs.launch_dma;
ASSERT(launch.remap_enable == 0);
ASSERT(launch.semaphore_type == LaunchDMA::SemaphoreType::NONE);
ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE);
ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED);
ASSERT(regs.dst_params.origin.x == 0);
ASSERT(regs.dst_params.origin.y == 0);
const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
if (!is_src_pitch && !is_dst_pitch) {
// If both the source and the destination are in block layout, assert.
UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented");
return;
}
// All copies here update the main memory, so mark all rasterizer states as invalid.
system.GPU().Maxwell3D().OnMemoryWrite();
if (is_src_pitch && is_dst_pitch) {
CopyPitchToPitch();
} else {
ASSERT(launch.multi_line_enable == 1);
if (!is_src_pitch && is_dst_pitch) {
CopyBlockLinearToPitch();
} else {
CopyPitchToBlockLinear();
}
}
}
void MaxwellDMA::CopyPitchToPitch() {
// When `multi_line_enable` bit is disabled the copy is performed as if we were copying a 1D
// buffer of length `line_length_in`.
// Otherwise we copy a 2D image of dimensions (line_length_in, line_count).
if (!regs.launch_dma.multi_line_enable) {
memory_manager.CopyBlock(regs.offset_out, regs.offset_in, regs.line_length_in);
return;
}
// Perform a line-by-line copy.
// We're going to take a subrect of size (line_length_in, line_count) from the source rectangle.
// There is no need to manually flush/invalidate the regions because CopyBlock does that for us.
for (u32 line = 0; line < regs.line_count; ++line) {
const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in;
const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out;
memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in);
}
}
void MaxwellDMA::CopyBlockLinearToPitch() {
UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0);
UNIMPLEMENTED_IF(regs.src_params.layer != 0);
// Optimized path for micro copies.
const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
if (dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X) {
FastCopyBlockLinearToPitch();
return;
}
// Deswizzle the input and copy it over.
const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in;
const Parameters& src_params = regs.src_params;
const u32 width = src_params.width;
const u32 height = src_params.height;
const u32 depth = src_params.depth;
const u32 block_height = src_params.block_size.height;
const u32 block_depth = src_params.block_size.depth;
const size_t src_size =
CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
if (read_buffer.size() < src_size) {
read_buffer.resize(src_size);
}
if (write_buffer.size() < dst_size) {
write_buffer.resize(dst_size);
}
memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel,
block_height, src_params.origin.x, src_params.origin.y, write_buffer.data(),
read_buffer.data());
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
}
void MaxwellDMA::CopyPitchToBlockLinear() {
UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
const auto& dst_params = regs.dst_params;
const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in;
const u32 width = dst_params.width;
const u32 height = dst_params.height;
const u32 depth = dst_params.depth;
const u32 block_height = dst_params.block_size.height;
const u32 block_depth = dst_params.block_size.depth;
const size_t dst_size =
CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
const size_t dst_layer_size =
CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth);
const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count;
if (read_buffer.size() < src_size) {
read_buffer.resize(src_size);
}
if (write_buffer.size() < dst_size) {
write_buffer.resize(dst_size);
}
if (Settings::IsGPULevelExtreme()) {
memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
} else {
memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size);
memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
}
// If the input is linear and the output is tiled, swizzle the input and copy it over.
if (regs.dst_params.block_size.depth > 0) {
ASSERT(dst_params.layer == 0);
SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height,
bytes_per_pixel, block_height, block_depth, dst_params.origin.x,
dst_params.origin.y, write_buffer.data(), read_buffer.data());
} else {
SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel,
write_buffer.data() + dst_layer_size * dst_params.layer, read_buffer.data(),
block_height, dst_params.origin.x, dst_params.origin.y);
}
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
}
void MaxwellDMA::FastCopyBlockLinearToPitch() {
const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in;
const size_t src_size = GOB_SIZE;
const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
u32 pos_x = regs.src_params.origin.x;
u32 pos_y = regs.src_params.origin.y;
const u64 offset = GetGOBOffset(regs.src_params.width, regs.src_params.height, pos_x, pos_y,
regs.src_params.block_size.height, bytes_per_pixel);
const u32 x_in_gob = 64 / bytes_per_pixel;
pos_x = pos_x % x_in_gob;
pos_y = pos_y % 8;
if (read_buffer.size() < src_size) {
read_buffer.resize(src_size);
}
if (write_buffer.size() < dst_size) {
write_buffer.resize(dst_size);
}
if (Settings::IsGPULevelExtreme()) {
memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), src_size);
memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
} else {
memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), src_size);
memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
}
UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, regs.src_params.width,
bytes_per_pixel, regs.src_params.block_size.height, pos_x, pos_y,
write_buffer.data(), read_buffer.data());
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
}
} // namespace Tegra::Engines

View File

@@ -0,0 +1,274 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstddef>
#include <vector>
#include "common/bit_field.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "video_core/engines/engine_interface.h"
#include "video_core/gpu.h"
namespace Core {
class System;
}
namespace Tegra {
class MemoryManager;
}
namespace Tegra::Engines {
/**
* This engine is known as gk104_copy. Documentation can be found in:
* https://github.com/NVIDIA/open-gpu-doc/blob/master/classes/dma-copy/clb0b5.h
* https://github.com/envytools/envytools/blob/master/rnndb/fifo/gk104_copy.xml
*/
class MaxwellDMA final : public EngineInterface {
public:
struct PackedGPUVAddr {
u32 upper;
u32 lower;
constexpr operator GPUVAddr() const noexcept {
return (static_cast<GPUVAddr>(upper & 0xff) << 32) | lower;
}
};
union BlockSize {
BitField<0, 4, u32> width;
BitField<4, 4, u32> height;
BitField<8, 4, u32> depth;
BitField<12, 4, u32> gob_height;
};
static_assert(sizeof(BlockSize) == 4);
union Origin {
BitField<0, 16, u32> x;
BitField<16, 16, u32> y;
};
static_assert(sizeof(Origin) == 4);
struct Parameters {
BlockSize block_size;
u32 width;
u32 height;
u32 depth;
u32 layer;
Origin origin;
};
static_assert(sizeof(Parameters) == 24);
struct Semaphore {
PackedGPUVAddr address;
u32 payload;
};
static_assert(sizeof(Semaphore) == 12);
struct RenderEnable {
enum class Mode : u32 {
// Note: This uses Pascal case in order to avoid the identifiers
// FALSE and TRUE, which are reserved on Darwin.
False = 0,
True = 1,
Conditional = 2,
RenderIfEqual = 3,
RenderIfNotEqual = 4,
};
PackedGPUVAddr address;
BitField<0, 3, Mode> mode;
};
static_assert(sizeof(RenderEnable) == 12);
enum class PhysModeTarget : u32 {
LOCAL_FB = 0,
COHERENT_SYSMEM = 1,
NONCOHERENT_SYSMEM = 2,
};
using PhysMode = BitField<0, 2, PhysModeTarget>;
union LaunchDMA {
enum class DataTransferType : u32 {
NONE = 0,
PIPELINED = 1,
NON_PIPELINED = 2,
};
enum class SemaphoreType : u32 {
NONE = 0,
RELEASE_ONE_WORD_SEMAPHORE = 1,
RELEASE_FOUR_WORD_SEMAPHORE = 2,
};
enum class InterruptType : u32 {
NONE = 0,
BLOCKING = 1,
NON_BLOCKING = 2,
};
enum class MemoryLayout : u32 {
BLOCKLINEAR = 0,
PITCH = 1,
};
enum class Type : u32 {
VIRTUAL = 0,
PHYSICAL = 1,
};
enum class SemaphoreReduction : u32 {
IMIN = 0,
IMAX = 1,
IXOR = 2,
IAND = 3,
IOR = 4,
IADD = 5,
INC = 6,
DEC = 7,
FADD = 0xA,
};
enum class SemaphoreReductionSign : u32 {
SIGNED = 0,
UNSIGNED = 1,
};
enum class BypassL2 : u32 {
USE_PTE_SETTING = 0,
FORCE_VOLATILE = 1,
};
BitField<0, 2, DataTransferType> data_transfer_type;
BitField<2, 1, u32> flush_enable;
BitField<3, 2, SemaphoreType> semaphore_type;
BitField<5, 2, InterruptType> interrupt_type;
BitField<7, 1, MemoryLayout> src_memory_layout;
BitField<8, 1, MemoryLayout> dst_memory_layout;
BitField<9, 1, u32> multi_line_enable;
BitField<10, 1, u32> remap_enable;
BitField<11, 1, u32> rmwdisable;
BitField<12, 1, Type> src_type;
BitField<13, 1, Type> dst_type;
BitField<14, 4, SemaphoreReduction> semaphore_reduction;
BitField<18, 1, SemaphoreReductionSign> semaphore_reduction_sign;
BitField<19, 1, u32> reduction_enable;
BitField<20, 1, BypassL2> bypass_l2;
};
static_assert(sizeof(LaunchDMA) == 4);
struct RemapConst {
enum Swizzle : u32 {
SRC_X = 0,
SRC_Y = 1,
SRC_Z = 2,
SRC_W = 3,
CONST_A = 4,
CONST_B = 5,
NO_WRITE = 6,
};
PackedGPUVAddr address;
union {
BitField<0, 3, Swizzle> dst_x;
BitField<4, 3, Swizzle> dst_y;
BitField<8, 3, Swizzle> dst_z;
BitField<12, 3, Swizzle> dst_w;
BitField<16, 2, u32> component_size_minus_one;
BitField<20, 2, u32> num_src_components_minus_one;
BitField<24, 2, u32> num_dst_components_minus_one;
};
};
static_assert(sizeof(RemapConst) == 12);
explicit MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_);
~MaxwellDMA();
/// Write the value to the register identified by method.
void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
/// Write multiple values to the register identified by method.
void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) override;
private:
/// Performs the copy from the source buffer to the destination buffer as configured in the
/// registers.
void Launch();
void CopyPitchToPitch();
void CopyBlockLinearToPitch();
void CopyPitchToBlockLinear();
void FastCopyBlockLinearToPitch();
Core::System& system;
MemoryManager& memory_manager;
std::vector<u8> read_buffer;
std::vector<u8> write_buffer;
static constexpr std::size_t NUM_REGS = 0x800;
struct Regs {
union {
struct {
u32 reserved[0x40];
u32 nop;
u32 reserved01[0xf];
u32 pm_trigger;
u32 reserved02[0x3f];
Semaphore semaphore;
u32 reserved03[0x2];
RenderEnable render_enable;
PhysMode src_phys_mode;
PhysMode dst_phys_mode;
u32 reserved04[0x26];
LaunchDMA launch_dma;
u32 reserved05[0x3f];
PackedGPUVAddr offset_in;
PackedGPUVAddr offset_out;
u32 pitch_in;
u32 pitch_out;
u32 line_length_in;
u32 line_count;
u32 reserved06[0xb8];
RemapConst remap_const;
Parameters dst_params;
u32 reserved07[0x1];
Parameters src_params;
u32 reserved08[0x275];
u32 pm_trigger_end;
u32 reserved09[0x3ba];
};
std::array<u32, NUM_REGS> reg_array;
};
} regs{};
#define ASSERT_REG_POSITION(field_name, position) \
static_assert(offsetof(MaxwellDMA::Regs, field_name) == position * 4, \
"Field " #field_name " has invalid position")
ASSERT_REG_POSITION(launch_dma, 0xC0);
ASSERT_REG_POSITION(offset_in, 0x100);
ASSERT_REG_POSITION(offset_out, 0x102);
ASSERT_REG_POSITION(pitch_in, 0x104);
ASSERT_REG_POSITION(pitch_out, 0x105);
ASSERT_REG_POSITION(line_length_in, 0x106);
ASSERT_REG_POSITION(line_count, 0x107);
ASSERT_REG_POSITION(remap_const, 0x1C0);
ASSERT_REG_POSITION(dst_params, 0x1C3);
ASSERT_REG_POSITION(src_params, 0x1CA);
#undef ASSERT_REG_POSITION
};
} // namespace Tegra::Engines

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,158 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <optional>
#include "common/bit_field.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
namespace Tegra::Shader {
enum class OutputTopology : u32 {
PointList = 1,
LineStrip = 6,
TriangleStrip = 7,
};
enum class PixelImap : u8 {
Unused = 0,
Constant = 1,
Perspective = 2,
ScreenLinear = 3,
};
// Documentation in:
// http://download.nvidia.com/open-gpu-doc/Shader-Program-Header/1/Shader-Program-Header.html
struct Header {
union {
BitField<0, 5, u32> sph_type;
BitField<5, 5, u32> version;
BitField<10, 4, u32> shader_type;
BitField<14, 1, u32> mrt_enable;
BitField<15, 1, u32> kills_pixels;
BitField<16, 1, u32> does_global_store;
BitField<17, 4, u32> sass_version;
BitField<21, 5, u32> reserved;
BitField<26, 1, u32> does_load_or_store;
BitField<27, 1, u32> does_fp64;
BitField<28, 4, u32> stream_out_mask;
} common0;
union {
BitField<0, 24, u32> shader_local_memory_low_size;
BitField<24, 8, u32> per_patch_attribute_count;
} common1;
union {
BitField<0, 24, u32> shader_local_memory_high_size;
BitField<24, 8, u32> threads_per_input_primitive;
} common2;
union {
BitField<0, 24, u32> shader_local_memory_crs_size;
BitField<24, 4, OutputTopology> output_topology;
BitField<28, 4, u32> reserved;
} common3;
union {
BitField<0, 12, u32> max_output_vertices;
BitField<12, 8, u32> store_req_start; // NOTE: not used by geometry shaders.
BitField<20, 4, u32> reserved;
BitField<24, 8, u32> store_req_end; // NOTE: not used by geometry shaders.
} common4;
union {
struct {
INSERT_UNION_PADDING_BYTES(3); // ImapSystemValuesA
INSERT_UNION_PADDING_BYTES(1); // ImapSystemValuesB
INSERT_UNION_PADDING_BYTES(16); // ImapGenericVector[32]
INSERT_UNION_PADDING_BYTES(2); // ImapColor
union {
BitField<0, 8, u16> clip_distances;
BitField<8, 1, u16> point_sprite_s;
BitField<9, 1, u16> point_sprite_t;
BitField<10, 1, u16> fog_coordinate;
BitField<12, 1, u16> tessellation_eval_point_u;
BitField<13, 1, u16> tessellation_eval_point_v;
BitField<14, 1, u16> instance_id;
BitField<15, 1, u16> vertex_id;
};
INSERT_UNION_PADDING_BYTES(5); // ImapFixedFncTexture[10]
INSERT_UNION_PADDING_BYTES(1); // ImapReserved
INSERT_UNION_PADDING_BYTES(3); // OmapSystemValuesA
INSERT_UNION_PADDING_BYTES(1); // OmapSystemValuesB
INSERT_UNION_PADDING_BYTES(16); // OmapGenericVector[32]
INSERT_UNION_PADDING_BYTES(2); // OmapColor
INSERT_UNION_PADDING_BYTES(2); // OmapSystemValuesC
INSERT_UNION_PADDING_BYTES(5); // OmapFixedFncTexture[10]
INSERT_UNION_PADDING_BYTES(1); // OmapReserved
} vtg;
struct {
INSERT_UNION_PADDING_BYTES(3); // ImapSystemValuesA
INSERT_UNION_PADDING_BYTES(1); // ImapSystemValuesB
union {
BitField<0, 2, PixelImap> x;
BitField<2, 2, PixelImap> y;
BitField<4, 2, PixelImap> z;
BitField<6, 2, PixelImap> w;
u8 raw;
} imap_generic_vector[32];
INSERT_UNION_PADDING_BYTES(2); // ImapColor
INSERT_UNION_PADDING_BYTES(2); // ImapSystemValuesC
INSERT_UNION_PADDING_BYTES(10); // ImapFixedFncTexture[10]
INSERT_UNION_PADDING_BYTES(2); // ImapReserved
struct {
u32 target;
union {
BitField<0, 1, u32> sample_mask;
BitField<1, 1, u32> depth;
BitField<2, 30, u32> reserved;
};
} omap;
bool IsColorComponentOutputEnabled(u32 render_target, u32 component) const {
const u32 bit = render_target * 4 + component;
return omap.target & (1 << bit);
}
PixelImap GetPixelImap(u32 attribute) const {
const auto get_index = [this, attribute](u32 index) {
return static_cast<PixelImap>(
(imap_generic_vector[attribute].raw >> (index * 2)) & 3);
};
std::optional<PixelImap> result;
for (u32 component = 0; component < 4; ++component) {
const PixelImap index = get_index(component);
if (index == PixelImap::Unused) {
continue;
}
if (result && result != index) {
LOG_CRITICAL(HW_GPU, "Generic attribute conflict in interpolation mode");
}
result = index;
}
return result.value_or(PixelImap::Unused);
}
} ps;
std::array<u32, 0xF> raw;
};
u64 GetLocalMemorySize() const {
return (common1.shader_local_memory_low_size |
(common2.shader_local_memory_high_size << 24));
}
};
static_assert(sizeof(Header) == 0x50, "Incorrect structure size");
} // namespace Tegra::Shader

View File

@@ -0,0 +1,21 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_types.h"
namespace Tegra::Engines {
enum class ShaderType : u32 {
Vertex = 0,
TesselationControl = 1,
TesselationEval = 2,
Geometry = 3,
Fragment = 4,
Compute = 5,
};
static constexpr std::size_t MaxShaderTypes = 6;
} // namespace Tegra::Engines

177
src/video_core/fence_manager.h Executable file
View File

@@ -0,0 +1,177 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <algorithm>
#include <queue>
#include "common/common_types.h"
#include "core/core.h"
#include "video_core/delayed_destruction_ring.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
namespace VideoCommon {
class FenceBase {
public:
explicit FenceBase(u32 payload_, bool is_stubbed_)
: address{}, payload{payload_}, is_semaphore{false}, is_stubbed{is_stubbed_} {}
explicit FenceBase(GPUVAddr address_, u32 payload_, bool is_stubbed_)
: address{address_}, payload{payload_}, is_semaphore{true}, is_stubbed{is_stubbed_} {}
GPUVAddr GetAddress() const {
return address;
}
u32 GetPayload() const {
return payload;
}
bool IsSemaphore() const {
return is_semaphore;
}
private:
GPUVAddr address;
u32 payload;
bool is_semaphore;
protected:
bool is_stubbed;
};
template <typename TFence, typename TTextureCache, typename TTBufferCache, typename TQueryCache>
class FenceManager {
public:
/// Notify the fence manager about a new frame
void TickFrame() {
delayed_destruction_ring.Tick();
}
void SignalSemaphore(GPUVAddr addr, u32 value) {
TryReleasePendingFences();
const bool should_flush = ShouldFlush();
CommitAsyncFlushes();
TFence new_fence = CreateFence(addr, value, !should_flush);
fences.push(new_fence);
QueueFence(new_fence);
if (should_flush) {
rasterizer.FlushCommands();
}
rasterizer.SyncGuestHost();
}
void SignalSyncPoint(u32 value) {
TryReleasePendingFences();
const bool should_flush = ShouldFlush();
CommitAsyncFlushes();
TFence new_fence = CreateFence(value, !should_flush);
fences.push(new_fence);
QueueFence(new_fence);
if (should_flush) {
rasterizer.FlushCommands();
}
rasterizer.SyncGuestHost();
}
void WaitPendingFences() {
while (!fences.empty()) {
TFence& current_fence = fences.front();
if (ShouldWait()) {
WaitFence(current_fence);
}
PopAsyncFlushes();
if (current_fence->IsSemaphore()) {
gpu_memory.template Write<u32>(current_fence->GetAddress(),
current_fence->GetPayload());
} else {
gpu.IncrementSyncPoint(current_fence->GetPayload());
}
PopFence();
}
}
protected:
explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
TTextureCache& texture_cache_, TTBufferCache& buffer_cache_,
TQueryCache& query_cache_)
: rasterizer{rasterizer_}, gpu{gpu_}, gpu_memory{gpu.MemoryManager()},
texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, query_cache{query_cache_} {}
virtual ~FenceManager() = default;
/// Creates a Sync Point Fence Interface, does not create a backend fence if 'is_stubbed' is
/// true
virtual TFence CreateFence(u32 value, bool is_stubbed) = 0;
/// Creates a Semaphore Fence Interface, does not create a backend fence if 'is_stubbed' is true
virtual TFence CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) = 0;
/// Queues a fence into the backend if the fence isn't stubbed.
virtual void QueueFence(TFence& fence) = 0;
/// Notifies that the backend fence has been signaled/reached in host GPU.
virtual bool IsFenceSignaled(TFence& fence) const = 0;
/// Waits until a fence has been signalled by the host GPU.
virtual void WaitFence(TFence& fence) = 0;
VideoCore::RasterizerInterface& rasterizer;
Tegra::GPU& gpu;
Tegra::MemoryManager& gpu_memory;
TTextureCache& texture_cache;
TTBufferCache& buffer_cache;
TQueryCache& query_cache;
private:
void TryReleasePendingFences() {
while (!fences.empty()) {
TFence& current_fence = fences.front();
if (ShouldWait() && !IsFenceSignaled(current_fence)) {
return;
}
PopAsyncFlushes();
if (current_fence->IsSemaphore()) {
gpu_memory.template Write<u32>(current_fence->GetAddress(),
current_fence->GetPayload());
} else {
gpu.IncrementSyncPoint(current_fence->GetPayload());
}
PopFence();
}
}
bool ShouldWait() const {
return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() ||
query_cache.ShouldWaitAsyncFlushes();
}
bool ShouldFlush() const {
return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() ||
query_cache.HasUncommittedFlushes();
}
void PopAsyncFlushes() {
texture_cache.PopAsyncFlushes();
buffer_cache.PopAsyncFlushes();
query_cache.PopAsyncFlushes();
}
void CommitAsyncFlushes() {
texture_cache.CommitAsyncFlushes();
buffer_cache.CommitAsyncFlushes();
query_cache.CommitAsyncFlushes();
}
void PopFence() {
delayed_destruction_ring.Push(std::move(fences.front()));
fences.pop();
}
std::queue<TFence> fences;
DelayedDestructionRing<TFence, 6> delayed_destruction_ring;
};
} // namespace VideoCommon

View File

@@ -0,0 +1,31 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
namespace Tegra {
/**
* Struct describing framebuffer configuration
*/
struct FramebufferConfig {
enum class PixelFormat : u32 {
A8B8G8R8_UNORM = 1,
RGB565_UNORM = 4,
B8G8R8A8_UNORM = 5,
};
VAddr address{};
u32 offset{};
u32 width{};
u32 height{};
u32 stride{};
PixelFormat pixel_format{};
using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
TransformFlags transform_flags{};
Common::Rectangle<int> crop_rect;
};
} // namespace Tegra

533
src/video_core/gpu.cpp Executable file
View File

@@ -0,0 +1,533 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <chrono>
#include "common/assert.h"
#include "common/microprofile.h"
#include "core/core.h"
#include "core/core_timing.h"
#include "core/core_timing_util.h"
#include "core/frontend/emu_window.h"
#include "core/hardware_interrupt_manager.h"
#include "core/memory.h"
#include "core/settings.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/engines/kepler_compute.h"
#include "video_core/engines/kepler_memory.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/maxwell_dma.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_base.h"
#include "video_core/shader_notify.h"
#include "video_core/video_core.h"
namespace Tegra {
MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
: system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)},
cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_},
maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
fermi_2d{std::make_unique<Engines::Fermi2D>()},
kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
gpu_thread{system_, is_async_} {}
GPU::~GPU() = default;
void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) {
renderer = std::move(renderer_);
VideoCore::RasterizerInterface& rasterizer = renderer->Rasterizer();
memory_manager->BindRasterizer(rasterizer);
maxwell_3d->BindRasterizer(rasterizer);
fermi_2d->BindRasterizer(rasterizer);
kepler_compute->BindRasterizer(rasterizer);
}
Engines::Maxwell3D& GPU::Maxwell3D() {
return *maxwell_3d;
}
const Engines::Maxwell3D& GPU::Maxwell3D() const {
return *maxwell_3d;
}
Engines::KeplerCompute& GPU::KeplerCompute() {
return *kepler_compute;
}
const Engines::KeplerCompute& GPU::KeplerCompute() const {
return *kepler_compute;
}
MemoryManager& GPU::MemoryManager() {
return *memory_manager;
}
const MemoryManager& GPU::MemoryManager() const {
return *memory_manager;
}
DmaPusher& GPU::DmaPusher() {
return *dma_pusher;
}
Tegra::CDmaPusher& GPU::CDmaPusher() {
return *cdma_pusher;
}
const DmaPusher& GPU::DmaPusher() const {
return *dma_pusher;
}
const Tegra::CDmaPusher& GPU::CDmaPusher() const {
return *cdma_pusher;
}
void GPU::WaitFence(u32 syncpoint_id, u32 value) {
// Synced GPU, is always in sync
if (!is_async) {
return;
}
if (syncpoint_id == UINT32_MAX) {
// TODO: Research what this does.
LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented");
return;
}
MICROPROFILE_SCOPE(GPU_wait);
std::unique_lock lock{sync_mutex};
sync_cv.wait(lock, [=, this] { return syncpoints.at(syncpoint_id).load() >= value; });
}
void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
auto& syncpoint = syncpoints.at(syncpoint_id);
syncpoint++;
std::lock_guard lock{sync_mutex};
sync_cv.notify_all();
auto& interrupt = syncpt_interrupts.at(syncpoint_id);
if (!interrupt.empty()) {
u32 value = syncpoint.load();
auto it = interrupt.begin();
while (it != interrupt.end()) {
if (value >= *it) {
TriggerCpuInterrupt(syncpoint_id, *it);
it = interrupt.erase(it);
continue;
}
it++;
}
}
}
u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
return syncpoints.at(syncpoint_id).load();
}
void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
auto& interrupt = syncpt_interrupts.at(syncpoint_id);
bool contains = std::any_of(interrupt.begin(), interrupt.end(),
[value](u32 in_value) { return in_value == value; });
if (contains) {
return;
}
interrupt.emplace_back(value);
}
bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
std::lock_guard lock{sync_mutex};
auto& interrupt = syncpt_interrupts.at(syncpoint_id);
const auto iter =
std::find_if(interrupt.begin(), interrupt.end(),
[value](u32 interrupt_value) { return value == interrupt_value; });
if (iter == interrupt.end()) {
return false;
}
interrupt.erase(iter);
return true;
}
u64 GPU::RequestFlush(VAddr addr, std::size_t size) {
std::unique_lock lck{flush_request_mutex};
const u64 fence = ++last_flush_fence;
flush_requests.emplace_back(fence, addr, size);
return fence;
}
void GPU::TickWork() {
std::unique_lock lck{flush_request_mutex};
while (!flush_requests.empty()) {
auto& request = flush_requests.front();
const u64 fence = request.fence;
const VAddr addr = request.addr;
const std::size_t size = request.size;
flush_requests.pop_front();
flush_request_mutex.unlock();
renderer->Rasterizer().FlushRegion(addr, size);
current_flush_fence.store(fence);
flush_request_mutex.lock();
}
}
u64 GPU::GetTicks() const {
// This values were reversed engineered by fincs from NVN
// The gpu clock is reported in units of 385/625 nanoseconds
constexpr u64 gpu_ticks_num = 384;
constexpr u64 gpu_ticks_den = 625;
u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
if (Settings::values.use_fast_gpu_time.GetValue()) {
nanoseconds /= 256;
}
const u64 nanoseconds_num = nanoseconds / gpu_ticks_den;
const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den;
return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den;
}
void GPU::FlushCommands() {
renderer->Rasterizer().FlushCommands();
}
void GPU::SyncGuestHost() {
renderer->Rasterizer().SyncGuestHost();
}
enum class GpuSemaphoreOperation {
AcquireEqual = 0x1,
WriteLong = 0x2,
AcquireGequal = 0x4,
AcquireMask = 0x8,
};
void GPU::CallMethod(const MethodCall& method_call) {
LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method,
method_call.subchannel);
ASSERT(method_call.subchannel < bound_engines.size());
if (ExecuteMethodOnEngine(method_call.method)) {
CallEngineMethod(method_call);
} else {
CallPullerMethod(method_call);
}
}
void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
u32 methods_pending) {
LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel);
ASSERT(subchannel < bound_engines.size());
if (ExecuteMethodOnEngine(method)) {
CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending);
} else {
for (std::size_t i = 0; i < amount; i++) {
CallPullerMethod(MethodCall{
method,
base_start[i],
subchannel,
methods_pending - static_cast<u32>(i),
});
}
}
}
bool GPU::ExecuteMethodOnEngine(u32 method) {
const auto buffer_method = static_cast<BufferMethods>(method);
return buffer_method >= BufferMethods::NonPullerMethods;
}
void GPU::CallPullerMethod(const MethodCall& method_call) {
regs.reg_array[method_call.method] = method_call.argument;
const auto method = static_cast<BufferMethods>(method_call.method);
switch (method) {
case BufferMethods::BindObject: {
ProcessBindMethod(method_call);
break;
}
case BufferMethods::Nop:
case BufferMethods::SemaphoreAddressHigh:
case BufferMethods::SemaphoreAddressLow:
case BufferMethods::SemaphoreSequence:
case BufferMethods::RefCnt:
case BufferMethods::UnkCacheFlush:
case BufferMethods::WrcacheFlush:
case BufferMethods::FenceValue:
break;
case BufferMethods::FenceAction:
ProcessFenceActionMethod();
break;
case BufferMethods::WaitForInterrupt:
ProcessWaitForInterruptMethod();
break;
case BufferMethods::SemaphoreTrigger: {
ProcessSemaphoreTriggerMethod();
break;
}
case BufferMethods::NotifyIntr: {
// TODO(Kmather73): Research and implement this method.
LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented");
break;
}
case BufferMethods::Unk28: {
// TODO(Kmather73): Research and implement this method.
LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented");
break;
}
case BufferMethods::SemaphoreAcquire: {
ProcessSemaphoreAcquire();
break;
}
case BufferMethods::SemaphoreRelease: {
ProcessSemaphoreRelease();
break;
}
case BufferMethods::Yield: {
// TODO(Kmather73): Research and implement this method.
LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented");
break;
}
default:
LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method);
break;
}
}
void GPU::CallEngineMethod(const MethodCall& method_call) {
const EngineID engine = bound_engines[method_call.subchannel];
switch (engine) {
case EngineID::FERMI_TWOD_A:
fermi_2d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall());
break;
case EngineID::MAXWELL_B:
maxwell_3d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall());
break;
case EngineID::KEPLER_COMPUTE_B:
kepler_compute->CallMethod(method_call.method, method_call.argument,
method_call.IsLastCall());
break;
case EngineID::MAXWELL_DMA_COPY_A:
maxwell_dma->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall());
break;
case EngineID::KEPLER_INLINE_TO_MEMORY_B:
kepler_memory->CallMethod(method_call.method, method_call.argument,
method_call.IsLastCall());
break;
default:
UNIMPLEMENTED_MSG("Unimplemented engine");
}
}
void GPU::CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
u32 methods_pending) {
const EngineID engine = bound_engines[subchannel];
switch (engine) {
case EngineID::FERMI_TWOD_A:
fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending);
break;
case EngineID::MAXWELL_B:
maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending);
break;
case EngineID::KEPLER_COMPUTE_B:
kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending);
break;
case EngineID::MAXWELL_DMA_COPY_A:
maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending);
break;
case EngineID::KEPLER_INLINE_TO_MEMORY_B:
kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented engine");
}
}
void GPU::ProcessBindMethod(const MethodCall& method_call) {
// Bind the current subchannel to the desired engine id.
LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel,
method_call.argument);
const auto engine_id = static_cast<EngineID>(method_call.argument);
bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id);
switch (engine_id) {
case EngineID::FERMI_TWOD_A:
dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel);
break;
case EngineID::MAXWELL_B:
dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel);
break;
case EngineID::KEPLER_COMPUTE_B:
dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel);
break;
case EngineID::MAXWELL_DMA_COPY_A:
dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel);
break;
case EngineID::KEPLER_INLINE_TO_MEMORY_B:
dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);
}
}
void GPU::ProcessFenceActionMethod() {
switch (regs.fence_action.op) {
case FenceOperation::Acquire:
WaitFence(regs.fence_action.syncpoint_id, regs.fence_value);
break;
case FenceOperation::Increment:
IncrementSyncPoint(regs.fence_action.syncpoint_id);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value());
}
}
void GPU::ProcessWaitForInterruptMethod() {
// TODO(bunnei) ImplementMe
LOG_WARNING(HW_GPU, "(STUBBED) called");
}
void GPU::ProcessSemaphoreTriggerMethod() {
const auto semaphoreOperationMask = 0xF;
const auto op =
static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask);
if (op == GpuSemaphoreOperation::WriteLong) {
struct Block {
u32 sequence;
u32 zeros = 0;
u64 timestamp;
};
Block block{};
block.sequence = regs.semaphore_sequence;
// TODO(Kmather73): Generate a real GPU timestamp and write it here instead of
// CoreTiming
block.timestamp = GetTicks();
memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block,
sizeof(block));
} else {
const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())};
if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) ||
(op == GpuSemaphoreOperation::AcquireGequal &&
static_cast<s32>(word - regs.semaphore_sequence) > 0) ||
(op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) {
// Nothing to do in this case
} else {
regs.acquire_source = true;
regs.acquire_value = regs.semaphore_sequence;
if (op == GpuSemaphoreOperation::AcquireEqual) {
regs.acquire_active = true;
regs.acquire_mode = false;
} else if (op == GpuSemaphoreOperation::AcquireGequal) {
regs.acquire_active = true;
regs.acquire_mode = true;
} else if (op == GpuSemaphoreOperation::AcquireMask) {
// TODO(kemathe) The acquire mask operation waits for a value that, ANDed with
// semaphore_sequence, gives a non-0 result
LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented");
} else {
LOG_ERROR(HW_GPU, "Invalid semaphore operation");
}
}
}
}
void GPU::ProcessSemaphoreRelease() {
memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(), regs.semaphore_release);
}
void GPU::ProcessSemaphoreAcquire() {
const u32 word = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress());
const auto value = regs.semaphore_acquire;
if (word != value) {
regs.acquire_active = true;
regs.acquire_value = value;
// TODO(kemathe73) figure out how to do the acquire_timeout
regs.acquire_mode = false;
regs.acquire_source = false;
}
}
void GPU::Start() {
gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher);
cpu_context = renderer->GetRenderWindow().CreateSharedContext();
cpu_context->MakeCurrent();
}
void GPU::ObtainContext() {
cpu_context->MakeCurrent();
}
void GPU::ReleaseContext() {
cpu_context->DoneCurrent();
}
void GPU::PushGPUEntries(Tegra::CommandList&& entries) {
gpu_thread.SubmitList(std::move(entries));
}
void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
if (!use_nvdec) {
return;
}
// This condition fires when a video stream ends, clear all intermediary data
if (entries[0].raw == 0xDEADB33F) {
cdma_pusher.reset();
return;
}
if (!cdma_pusher) {
cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
}
// SubmitCommandBuffer would make the nvdec operations async, this is not currently working
// TODO(ameerj): RE proper async nvdec operation
// gpu_thread.SubmitCommandBuffer(std::move(entries));
cdma_pusher->Push(std::move(entries));
cdma_pusher->DispatchCalls();
}
void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
gpu_thread.SwapBuffers(framebuffer);
}
void GPU::FlushRegion(VAddr addr, u64 size) {
gpu_thread.FlushRegion(addr, size);
}
void GPU::InvalidateRegion(VAddr addr, u64 size) {
gpu_thread.InvalidateRegion(addr, size);
}
void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) {
gpu_thread.FlushAndInvalidateRegion(addr, size);
}
void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
auto& interrupt_manager = system.InterruptManager();
interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
}
void GPU::WaitIdle() const {
gpu_thread.WaitIdle();
}
void GPU::OnCommandListEnd() {
if (is_async) {
// This command only applies to asynchronous GPU mode
gpu_thread.OnCommandListEnd();
}
}
} // namespace Tegra

436
src/video_core/gpu.h Executable file
View File

@@ -0,0 +1,436 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <atomic>
#include <condition_variable>
#include <list>
#include <memory>
#include <mutex>
#include "common/common_types.h"
#include "core/hle/service/nvdrv/nvdata.h"
#include "core/hle/service/nvflinger/buffer_queue.h"
#include "video_core/cdma_pusher.h"
#include "video_core/dma_pusher.h"
#include "video_core/framebuffer_config.h"
#include "video_core/gpu_thread.h"
using CacheAddr = std::uintptr_t;
[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) {
return reinterpret_cast<CacheAddr>(host_ptr);
}
[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) {
return reinterpret_cast<u8*>(cache_addr);
}
namespace Core {
namespace Frontend {
class EmuWindow;
}
class System;
} // namespace Core
namespace VideoCore {
class RendererBase;
class ShaderNotify;
} // namespace VideoCore
namespace Tegra {
enum class RenderTargetFormat : u32 {
NONE = 0x0,
R32B32G32A32_FLOAT = 0xC0,
R32G32B32A32_SINT = 0xC1,
R32G32B32A32_UINT = 0xC2,
R16G16B16A16_UNORM = 0xC6,
R16G16B16A16_SNORM = 0xC7,
R16G16B16A16_SINT = 0xC8,
R16G16B16A16_UINT = 0xC9,
R16G16B16A16_FLOAT = 0xCA,
R32G32_FLOAT = 0xCB,
R32G32_SINT = 0xCC,
R32G32_UINT = 0xCD,
R16G16B16X16_FLOAT = 0xCE,
B8G8R8A8_UNORM = 0xCF,
B8G8R8A8_SRGB = 0xD0,
A2B10G10R10_UNORM = 0xD1,
A2B10G10R10_UINT = 0xD2,
A8B8G8R8_UNORM = 0xD5,
A8B8G8R8_SRGB = 0xD6,
A8B8G8R8_SNORM = 0xD7,
A8B8G8R8_SINT = 0xD8,
A8B8G8R8_UINT = 0xD9,
R16G16_UNORM = 0xDA,
R16G16_SNORM = 0xDB,
R16G16_SINT = 0xDC,
R16G16_UINT = 0xDD,
R16G16_FLOAT = 0xDE,
B10G11R11_FLOAT = 0xE0,
R32_SINT = 0xE3,
R32_UINT = 0xE4,
R32_FLOAT = 0xE5,
R5G6B5_UNORM = 0xE8,
A1R5G5B5_UNORM = 0xE9,
R8G8_UNORM = 0xEA,
R8G8_SNORM = 0xEB,
R8G8_SINT = 0xEC,
R8G8_UINT = 0xED,
R16_UNORM = 0xEE,
R16_SNORM = 0xEF,
R16_SINT = 0xF0,
R16_UINT = 0xF1,
R16_FLOAT = 0xF2,
R8_UNORM = 0xF3,
R8_SNORM = 0xF4,
R8_SINT = 0xF5,
R8_UINT = 0xF6,
};
enum class DepthFormat : u32 {
D32_FLOAT = 0xA,
D16_UNORM = 0x13,
S8_UINT_Z24_UNORM = 0x14,
D24X8_UNORM = 0x15,
D24S8_UNORM = 0x16,
D24C8_UNORM = 0x18,
D32_FLOAT_S8X24_UINT = 0x19,
};
struct CommandListHeader;
class DebugContext;
namespace Engines {
class Fermi2D;
class Maxwell3D;
class MaxwellDMA;
class KeplerCompute;
class KeplerMemory;
} // namespace Engines
enum class EngineID {
FERMI_TWOD_A = 0x902D, // 2D Engine
MAXWELL_B = 0xB197, // 3D Engine
KEPLER_COMPUTE_B = 0xB1C0,
KEPLER_INLINE_TO_MEMORY_B = 0xA140,
MAXWELL_DMA_COPY_A = 0xB0B5,
};
class MemoryManager;
class GPU final {
public:
struct MethodCall {
u32 method{};
u32 argument{};
u32 subchannel{};
u32 method_count{};
explicit MethodCall(u32 method_, u32 argument_, u32 subchannel_ = 0, u32 method_count_ = 0)
: method(method_), argument(argument_), subchannel(subchannel_),
method_count(method_count_) {}
[[nodiscard]] bool IsLastCall() const {
return method_count <= 1;
}
};
explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_);
~GPU();
/// Binds a renderer to the GPU.
void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer);
/// Calls a GPU method.
void CallMethod(const MethodCall& method_call);
/// Calls a GPU multivalue method.
void CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
u32 methods_pending);
/// Flush all current written commands into the host GPU for execution.
void FlushCommands();
/// Synchronizes CPU writes with Host GPU memory.
void SyncGuestHost();
/// Signal the ending of command list.
void OnCommandListEnd();
/// Request a host GPU memory flush from the CPU.
[[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size);
/// Obtains current flush request fence id.
[[nodiscard]] u64 CurrentFlushRequestFence() const {
return current_flush_fence.load(std::memory_order_relaxed);
}
/// Tick pending requests within the GPU.
void TickWork();
/// Returns a reference to the Maxwell3D GPU engine.
[[nodiscard]] Engines::Maxwell3D& Maxwell3D();
/// Returns a const reference to the Maxwell3D GPU engine.
[[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const;
/// Returns a reference to the KeplerCompute GPU engine.
[[nodiscard]] Engines::KeplerCompute& KeplerCompute();
/// Returns a reference to the KeplerCompute GPU engine.
[[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const;
/// Returns a reference to the GPU memory manager.
[[nodiscard]] Tegra::MemoryManager& MemoryManager();
/// Returns a const reference to the GPU memory manager.
[[nodiscard]] const Tegra::MemoryManager& MemoryManager() const;
/// Returns a reference to the GPU DMA pusher.
[[nodiscard]] Tegra::DmaPusher& DmaPusher();
/// Returns a const reference to the GPU DMA pusher.
[[nodiscard]] const Tegra::DmaPusher& DmaPusher() const;
/// Returns a reference to the GPU CDMA pusher.
[[nodiscard]] Tegra::CDmaPusher& CDmaPusher();
/// Returns a const reference to the GPU CDMA pusher.
[[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const;
/// Returns a reference to the underlying renderer.
[[nodiscard]] VideoCore::RendererBase& Renderer() {
return *renderer;
}
/// Returns a const reference to the underlying renderer.
[[nodiscard]] const VideoCore::RendererBase& Renderer() const {
return *renderer;
}
/// Returns a reference to the shader notifier.
[[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() {
return *shader_notify;
}
/// Returns a const reference to the shader notifier.
[[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const {
return *shader_notify;
}
// Waits for the GPU to finish working
void WaitIdle() const;
/// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
void WaitFence(u32 syncpoint_id, u32 value);
void IncrementSyncPoint(u32 syncpoint_id);
[[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const;
void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
[[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
[[nodiscard]] u64 GetTicks() const;
[[nodiscard]] std::unique_lock<std::mutex> LockSync() {
return std::unique_lock{sync_mutex};
}
[[nodiscard]] bool IsAsync() const {
return is_async;
}
[[nodiscard]] bool UseNvdec() const {
return use_nvdec;
}
enum class FenceOperation : u32 {
Acquire = 0,
Increment = 1,
};
union FenceAction {
u32 raw;
BitField<0, 1, FenceOperation> op;
BitField<8, 24, u32> syncpoint_id;
[[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
FenceAction result{};
result.op.Assign(op);
result.syncpoint_id.Assign(syncpoint_id);
return {result.raw};
}
};
struct Regs {
static constexpr size_t NUM_REGS = 0x40;
union {
struct {
INSERT_UNION_PADDING_WORDS(0x4);
struct {
u32 address_high;
u32 address_low;
[[nodiscard]] GPUVAddr SemaphoreAddress() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
} semaphore_address;
u32 semaphore_sequence;
u32 semaphore_trigger;
INSERT_UNION_PADDING_WORDS(0xC);
// The pusher and the puller share the reference counter, the pusher only has read
// access
u32 reference_count;
INSERT_UNION_PADDING_WORDS(0x5);
u32 semaphore_acquire;
u32 semaphore_release;
u32 fence_value;
FenceAction fence_action;
INSERT_UNION_PADDING_WORDS(0xE2);
// Puller state
u32 acquire_mode;
u32 acquire_source;
u32 acquire_active;
u32 acquire_timeout;
u32 acquire_value;
};
std::array<u32, NUM_REGS> reg_array;
};
} regs{};
/// Performs any additional setup necessary in order to begin GPU emulation.
/// This can be used to launch any necessary threads and register any necessary
/// core timing events.
void Start();
/// Obtain the CPU Context
void ObtainContext();
/// Release the CPU Context
void ReleaseContext();
/// Push GPU command entries to be processed
void PushGPUEntries(Tegra::CommandList&& entries);
/// Push GPU command buffer entries to be processed
void PushCommandBuffer(Tegra::ChCommandHeaderList& entries);
/// Swap buffers (render frame)
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
void FlushRegion(VAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be invalidated
void InvalidateRegion(VAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
void FlushAndInvalidateRegion(VAddr addr, u64 size);
protected:
void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const;
private:
void ProcessBindMethod(const MethodCall& method_call);
void ProcessFenceActionMethod();
void ProcessWaitForInterruptMethod();
void ProcessSemaphoreTriggerMethod();
void ProcessSemaphoreRelease();
void ProcessSemaphoreAcquire();
/// Calls a GPU puller method.
void CallPullerMethod(const MethodCall& method_call);
/// Calls a GPU engine method.
void CallEngineMethod(const MethodCall& method_call);
/// Calls a GPU engine multivalue method.
void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
u32 methods_pending);
/// Determines where the method should be executed.
[[nodiscard]] bool ExecuteMethodOnEngine(u32 method);
protected:
Core::System& system;
std::unique_ptr<Tegra::MemoryManager> memory_manager;
std::unique_ptr<Tegra::DmaPusher> dma_pusher;
std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
std::unique_ptr<VideoCore::RendererBase> renderer;
const bool use_nvdec;
private:
/// Mapping of command subchannels to their bound engine ids
std::array<EngineID, 8> bound_engines = {};
/// 3D engine
std::unique_ptr<Engines::Maxwell3D> maxwell_3d;
/// 2D engine
std::unique_ptr<Engines::Fermi2D> fermi_2d;
/// Compute engine
std::unique_ptr<Engines::KeplerCompute> kepler_compute;
/// DMA engine
std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
/// Inline memory engine
std::unique_ptr<Engines::KeplerMemory> kepler_memory;
/// Shader build notifier
std::unique_ptr<VideoCore::ShaderNotify> shader_notify;
std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
std::mutex sync_mutex;
std::mutex device_mutex;
std::condition_variable sync_cv;
struct FlushRequest {
explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_)
: fence{fence_}, addr{addr_}, size{size_} {}
u64 fence;
VAddr addr;
std::size_t size;
};
std::list<FlushRequest> flush_requests;
std::atomic<u64> current_flush_fence{};
u64 last_flush_fence{};
std::mutex flush_request_mutex;
const bool is_async;
VideoCommon::GPUThread::ThreadManager gpu_thread;
std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
};
#define ASSERT_REG_POSITION(field_name, position) \
static_assert(offsetof(GPU::Regs, field_name) == position * 4, \
"Field " #field_name " has invalid position")
ASSERT_REG_POSITION(semaphore_address, 0x4);
ASSERT_REG_POSITION(semaphore_sequence, 0x6);
ASSERT_REG_POSITION(semaphore_trigger, 0x7);
ASSERT_REG_POSITION(reference_count, 0x14);
ASSERT_REG_POSITION(semaphore_acquire, 0x1A);
ASSERT_REG_POSITION(semaphore_release, 0x1B);
ASSERT_REG_POSITION(fence_value, 0x1C);
ASSERT_REG_POSITION(fence_action, 0x1D);
ASSERT_REG_POSITION(acquire_mode, 0x100);
ASSERT_REG_POSITION(acquire_source, 0x101);
ASSERT_REG_POSITION(acquire_active, 0x102);
ASSERT_REG_POSITION(acquire_timeout, 0x103);
ASSERT_REG_POSITION(acquire_value, 0x104);
#undef ASSERT_REG_POSITION
} // namespace Tegra

162
src/video_core/gpu_thread.cpp Executable file
View File

@@ -0,0 +1,162 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "common/microprofile.h"
#include "common/scope_exit.h"
#include "common/thread.h"
#include "core/core.h"
#include "core/frontend/emu_window.h"
#include "core/settings.h"
#include "video_core/dma_pusher.h"
#include "video_core/gpu.h"
#include "video_core/gpu_thread.h"
#include "video_core/renderer_base.h"
namespace VideoCommon::GPUThread {
/// Runs the GPU thread
static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
SynchState& state, Tegra::CDmaPusher& cdma_pusher) {
std::string name = "yuzu:GPU";
MicroProfileOnThreadCreate(name.c_str());
SCOPE_EXIT({ MicroProfileOnThreadExit(); });
Common::SetCurrentThreadName(name.c_str());
Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
system.RegisterHostThread();
// Wait for first GPU command before acquiring the window context
while (state.queue.Empty())
;
// If emulation was stopped during disk shader loading, abort before trying to acquire context
if (!state.is_running) {
return;
}
auto current_context = context.Acquire();
CommandDataContainer next;
while (state.is_running) {
next = state.queue.PopWait();
if (auto* submit_list = std::get_if<SubmitListCommand>(&next.data)) {
dma_pusher.Push(std::move(submit_list->entries));
dma_pusher.DispatchCalls();
} else if (auto* command_list = std::get_if<SubmitChCommandEntries>(&next.data)) {
// NVDEC
cdma_pusher.Push(std::move(command_list->entries));
cdma_pusher.DispatchCalls();
} else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) {
renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
} else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {
renderer.Rasterizer().ReleaseFences();
} else if (std::holds_alternative<GPUTickCommand>(next.data)) {
system.GPU().TickWork();
} else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) {
renderer.Rasterizer().FlushRegion(flush->addr, flush->size);
} else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
renderer.Rasterizer().OnCPUWrite(invalidate->addr, invalidate->size);
} else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
return;
} else {
UNREACHABLE();
}
state.signaled_fence.store(next.fence);
}
}
ThreadManager::ThreadManager(Core::System& system_, bool is_async_)
: system{system_}, is_async{is_async_} {}
ThreadManager::~ThreadManager() {
if (!thread.joinable()) {
return;
}
// Notify GPU thread that a shutdown is pending
PushCommand(EndProcessingCommand());
thread.join();
}
void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
Core::Frontend::GraphicsContext& context,
Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) {
thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher));
}
void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
PushCommand(SubmitListCommand(std::move(entries)));
}
void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) {
PushCommand(SubmitChCommandEntries(std::move(entries)));
}
void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt));
}
void ThreadManager::FlushRegion(VAddr addr, u64 size) {
if (!is_async) {
// Always flush with synchronous GPU mode
PushCommand(FlushRegionCommand(addr, size));
return;
}
// Asynchronous GPU mode
switch (Settings::values.gpu_accuracy.GetValue()) {
case Settings::GPUAccuracy::Normal:
PushCommand(FlushRegionCommand(addr, size));
break;
case Settings::GPUAccuracy::High:
// TODO(bunnei): Is this right? Preserving existing behavior for now
break;
case Settings::GPUAccuracy::Extreme: {
auto& gpu = system.GPU();
u64 fence = gpu.RequestFlush(addr, size);
PushCommand(GPUTickCommand());
while (fence > gpu.CurrentFlushRequestFence()) {
}
break;
}
default:
UNIMPLEMENTED_MSG("Unsupported gpu_accuracy {}", Settings::values.gpu_accuracy.GetValue());
}
}
void ThreadManager::InvalidateRegion(VAddr addr, u64 size) {
system.Renderer().Rasterizer().OnCPUWrite(addr, size);
}
void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
// Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important
system.Renderer().Rasterizer().OnCPUWrite(addr, size);
}
void ThreadManager::WaitIdle() const {
while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) &&
system.IsPoweredOn()) {
}
}
void ThreadManager::OnCommandListEnd() {
PushCommand(OnCommandListEndCommand());
}
u64 ThreadManager::PushCommand(CommandData&& command_data) {
const u64 fence{++state.last_fence};
state.queue.Push(CommandDataContainer(std::move(command_data), fence));
if (!is_async) {
// In synchronous GPU mode, block the caller until the command has executed
WaitIdle();
}
return fence;
}
} // namespace VideoCommon::GPUThread

161
src/video_core/gpu_thread.h Executable file
View File

@@ -0,0 +1,161 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <atomic>
#include <condition_variable>
#include <mutex>
#include <optional>
#include <thread>
#include <variant>
#include "common/threadsafe_queue.h"
#include "video_core/framebuffer_config.h"
namespace Tegra {
struct FramebufferConfig;
class DmaPusher;
} // namespace Tegra
namespace Core {
namespace Frontend {
class GraphicsContext;
}
class System;
} // namespace Core
namespace VideoCore {
class RendererBase;
} // namespace VideoCore
namespace VideoCommon::GPUThread {
/// Command to signal to the GPU thread that processing has ended
struct EndProcessingCommand final {};
/// Command to signal to the GPU thread that a command list is ready for processing
struct SubmitListCommand final {
explicit SubmitListCommand(Tegra::CommandList&& entries_) : entries{std::move(entries_)} {}
Tegra::CommandList entries;
};
/// Command to signal to the GPU thread that a cdma command list is ready for processing
struct SubmitChCommandEntries final {
explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries_)
: entries{std::move(entries_)} {}
Tegra::ChCommandHeaderList entries;
};
/// Command to signal to the GPU thread that a swap buffers is pending
struct SwapBuffersCommand final {
explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer_)
: framebuffer{std::move(framebuffer_)} {}
std::optional<Tegra::FramebufferConfig> framebuffer;
};
/// Command to signal to the GPU thread to flush a region
struct FlushRegionCommand final {
explicit constexpr FlushRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {}
VAddr addr;
u64 size;
};
/// Command to signal to the GPU thread to invalidate a region
struct InvalidateRegionCommand final {
explicit constexpr InvalidateRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {}
VAddr addr;
u64 size;
};
/// Command to signal to the GPU thread to flush and invalidate a region
struct FlushAndInvalidateRegionCommand final {
explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr_, u64 size_)
: addr{addr_}, size{size_} {}
VAddr addr;
u64 size;
};
/// Command called within the gpu, to schedule actions after a command list end
struct OnCommandListEndCommand final {};
/// Command to make the gpu look into pending requests
struct GPUTickCommand final {};
using CommandData =
std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries,
SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand,
FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>;
struct CommandDataContainer {
CommandDataContainer() = default;
explicit CommandDataContainer(CommandData&& data_, u64 next_fence_)
: data{std::move(data_)}, fence{next_fence_} {}
CommandData data;
u64 fence{};
};
/// Struct used to synchronize the GPU thread
struct SynchState final {
std::atomic_bool is_running{true};
using CommandQueue = Common::MPSCQueue<CommandDataContainer>;
CommandQueue queue;
u64 last_fence{};
std::atomic<u64> signaled_fence{};
};
/// Class used to manage the GPU thread
class ThreadManager final {
public:
explicit ThreadManager(Core::System& system_, bool is_async_);
~ThreadManager();
/// Creates and starts the GPU thread.
void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context,
Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher);
/// Push GPU command entries to be processed
void SubmitList(Tegra::CommandList&& entries);
/// Push GPU CDMA command buffer entries to be processed
void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries);
/// Swap buffers (render frame)
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
void FlushRegion(VAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be invalidated
void InvalidateRegion(VAddr addr, u64 size);
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
void FlushAndInvalidateRegion(VAddr addr, u64 size);
// Wait until the gpu thread is idle.
void WaitIdle() const;
void OnCommandListEnd();
private:
/// Pushes a command to be executed by the GPU thread
u64 PushCommand(CommandData&& command_data);
SynchState state;
Core::System& system;
std::thread thread;
std::thread::id thread_id;
const bool is_async;
};
} // namespace VideoCommon::GPUThread

37
src/video_core/guest_driver.cpp Executable file
View File

@@ -0,0 +1,37 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include <limits>
#include <vector>
#include "common/common_types.h"
#include "video_core/guest_driver.h"
namespace VideoCore {
void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32> bound_offsets) {
if (texture_handler_size) {
return;
}
const std::size_t size = bound_offsets.size();
if (size < 2) {
return;
}
std::sort(bound_offsets.begin(), bound_offsets.end(), std::less{});
u32 min_val = std::numeric_limits<u32>::max();
for (std::size_t i = 1; i < size; ++i) {
if (bound_offsets[i] == bound_offsets[i - 1]) {
continue;
}
const u32 new_min = bound_offsets[i] - bound_offsets[i - 1];
min_val = std::min(min_val, new_min);
}
if (min_val > 2) {
return;
}
texture_handler_size = min_texture_handler_size * min_val;
}
} // namespace VideoCore

46
src/video_core/guest_driver.h Executable file
View File

@@ -0,0 +1,46 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <optional>
#include <vector>
#include "common/common_types.h"
namespace VideoCore {
/**
* The GuestDriverProfile class is used to learn about the GPU drivers behavior and collect
* information necessary for impossible to avoid HLE methods like shader tracks as they are
* Entscheidungsproblems.
*/
class GuestDriverProfile {
public:
explicit GuestDriverProfile() = default;
explicit GuestDriverProfile(std::optional<u32> texture_handler_size_)
: texture_handler_size{texture_handler_size_} {}
void DeduceTextureHandlerSize(std::vector<u32> bound_offsets);
u32 GetTextureHandlerSize() const {
return texture_handler_size.value_or(default_texture_handler_size);
}
bool IsTextureHandlerSizeKnown() const {
return texture_handler_size.has_value();
}
private:
// Minimum size of texture handler any driver can use.
static constexpr u32 min_texture_handler_size = 4;
// This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily use 4 bytes instead.
// Thus, certain drivers may squish the size.
static constexpr u32 default_texture_handler_size = 8;
std::optional<u32> texture_handler_size = default_texture_handler_size;
};
} // namespace VideoCore

View File

@@ -0,0 +1,71 @@
set(SHADER_FILES
block_linear_unswizzle_2d.comp
block_linear_unswizzle_3d.comp
convert_depth_to_float.frag
convert_float_to_depth.frag
full_screen_triangle.vert
opengl_copy_bc4.comp
opengl_present.frag
opengl_present.vert
pitch_unswizzle.comp
vulkan_blit_color_float.frag
vulkan_blit_depth_stencil.frag
vulkan_present.frag
vulkan_present.vert
vulkan_quad_array.comp
vulkan_quad_indexed.comp
vulkan_uint8.comp
)
find_program(GLSLANGVALIDATOR "glslangValidator" REQUIRED)
set(GLSL_FLAGS "")
set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders)
set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE)
set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in)
set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake)
foreach(FILENAME IN ITEMS ${SHADER_FILES})
string(REPLACE "." "_" SHADER_NAME ${FILENAME})
set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME})
# Skip generating source headers on Vulkan exclusive files
if (NOT ${FILENAME} MATCHES "vulkan.*")
set(SOURCE_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h)
add_custom_command(
OUTPUT
${SOURCE_HEADER_FILE}
COMMAND
${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${SOURCE_HEADER_FILE} ${INPUT_FILE}
MAIN_DEPENDENCY
${SOURCE_FILE}
DEPENDS
${INPUT_FILE}
# HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified
)
set(SHADER_HEADERS ${SHADER_HEADERS} ${SOURCE_HEADER_FILE})
endif()
# Skip compiling to SPIR-V OpenGL exclusive files
if (NOT ${FILENAME} MATCHES "opengl.*")
string(TOUPPER ${SHADER_NAME}_SPV SPIRV_VARIABLE_NAME)
set(SPIRV_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}_spv.h)
add_custom_command(
OUTPUT
${SPIRV_HEADER_FILE}
COMMAND
${GLSLANGVALIDATOR} -V ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE}
MAIN_DEPENDENCY
${SOURCE_FILE}
)
set(SHADER_HEADERS ${SHADER_HEADERS} ${SPIRV_HEADER_FILE})
endif()
endforeach()
add_custom_target(host_shaders
DEPENDS
${SHADER_HEADERS}
SOURCES
${SHADER_FILES}
)

View File

@@ -0,0 +1,13 @@
set(SOURCE_FILE ${CMAKE_ARGV3})
set(HEADER_FILE ${CMAKE_ARGV4})
set(INPUT_FILE ${CMAKE_ARGV5})
get_filename_component(CONTENTS_NAME ${SOURCE_FILE} NAME)
string(REPLACE "." "_" CONTENTS_NAME ${CONTENTS_NAME})
string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME)
file(READ ${SOURCE_FILE} CONTENTS)
get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY)
make_directory(${OUTPUT_DIR})
configure_file(${INPUT_FILE} ${HEADER_FILE} @ONLY)

View File

@@ -0,0 +1,122 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 2
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 0
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint layer_stride;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
END_PUSH_CONSTANTS
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage2DArray output_image;
layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
uint SwizzleOffset(uvec2 pos) {
pos = pos & SWIZZLE_MASK;
return swizzle_table[pos.y * 64 + pos.x];
}
uvec4 ReadTexel(uint offset) {
switch (bytes_per_block_log2) {
#if HAS_EXTENDED_TYPES
case 0:
return uvec4(u8data[offset], 0, 0, 0);
case 1:
return uvec4(u16data[offset / 2], 0, 0, 0);
#else
case 0:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
case 1:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
#endif
case 2:
return uvec4(u32data[offset / 4], 0, 0, 0);
case 3:
return uvec4(u64data[offset / 8], 0, 0);
case 4:
return u128data[offset / 16];
}
return uvec4(0);
}
void main() {
uvec3 pos = gl_GlobalInvocationID + origin;
pos.x <<= bytes_per_block_log2;
// Read as soon as possible due to its latency
const uint swizzle = SwizzleOffset(pos.xy);
const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
uint offset = 0;
offset += pos.z * layer_stride;
offset += (block_y >> block_height) * block_size;
offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
offset += swizzle;
const uvec4 texel = ReadTexel(offset);
const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination;
imageStore(output_image, coord, texel);
}

View File

@@ -0,0 +1,125 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 2
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_IMAGE 0
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint slice_size;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
UNIFORM(8) uint block_depth;
UNIFORM(9) uint block_depth_mask;
END_PUSH_CONSTANTS
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage3D output_image;
layout(local_size_x = 16, local_size_y = 8, local_size_z = 8) in;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
uint SwizzleOffset(uvec2 pos) {
pos = pos & SWIZZLE_MASK;
return swizzle_table[pos.y * 64 + pos.x];
}
uvec4 ReadTexel(uint offset) {
switch (bytes_per_block_log2) {
#if HAS_EXTENDED_TYPES
case 0:
return uvec4(u8data[offset], 0, 0, 0);
case 1:
return uvec4(u16data[offset / 2], 0, 0, 0);
#else
case 0:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
case 1:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
#endif
case 2:
return uvec4(u32data[offset / 4], 0, 0, 0);
case 3:
return uvec4(u64data[offset / 8], 0, 0);
case 4:
return u128data[offset / 16];
}
return uvec4(0);
}
void main() {
uvec3 pos = gl_GlobalInvocationID + origin;
pos.x <<= bytes_per_block_log2;
// Read as soon as possible due to its latency
const uint swizzle = SwizzleOffset(pos.xy);
const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
uint offset = 0;
offset += (pos.z >> block_depth) * slice_size;
offset += (pos.z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height);
offset += (block_y >> block_height) * block_size;
offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
offset += swizzle;
const uvec4 texel = ReadTexel(offset);
const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination;
imageStore(output_image, coord, texel);
}

View File

@@ -0,0 +1,13 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
layout(binding = 0) uniform sampler2D depth_texture;
layout(location = 0) out float output_color;
void main() {
ivec2 coord = ivec2(gl_FragCoord.xy);
output_color = texelFetch(depth_texture, coord, 0).r;
}

View File

@@ -0,0 +1,13 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
layout(binding = 0) uniform sampler2D color_texture;
void main() {
ivec2 coord = ivec2(gl_FragCoord.xy);
float color = texelFetch(color_texture, coord, 0).r;
gl_FragDepth = color;
}

View File

@@ -0,0 +1,29 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
#ifdef VULKAN
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) vec2 tex_scale;
UNIFORM(1) vec2 tex_offset;
END_PUSH_CONSTANTS
layout(location = 0) out vec2 texcoord;
void main() {
float x = float((gl_VertexIndex & 1) << 2);
float y = float((gl_VertexIndex & 2) << 1);
gl_Position = vec4(x - 1.0, y - 1.0, 0.0, 1.0);
texcoord = fma(vec2(x, y) / 2.0, tex_scale, tex_offset);
}

View File

@@ -0,0 +1,70 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430 core
#extension GL_ARB_gpu_shader_int64 : require
layout (local_size_x = 4, local_size_y = 4) in;
layout(binding = 0, rg32ui) readonly uniform uimage3D bc4_input;
layout(binding = 1, rgba8ui) writeonly uniform uimage3D bc4_output;
layout(location = 0) uniform uvec3 src_offset;
layout(location = 1) uniform uvec3 dst_offset;
// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt
uint DecompressBlock(uint64_t bits, uvec2 coord) {
const uint code_offset = 16 + 3 * (4 * coord.y + coord.x);
const uint code = uint(bits >> code_offset) & 7;
const uint red0 = uint(bits >> 0) & 0xff;
const uint red1 = uint(bits >> 8) & 0xff;
if (red0 > red1) {
switch (code) {
case 0:
return red0;
case 1:
return red1;
case 2:
return (6 * red0 + 1 * red1) / 7;
case 3:
return (5 * red0 + 2 * red1) / 7;
case 4:
return (4 * red0 + 3 * red1) / 7;
case 5:
return (3 * red0 + 4 * red1) / 7;
case 6:
return (2 * red0 + 5 * red1) / 7;
case 7:
return (1 * red0 + 6 * red1) / 7;
}
} else {
switch (code) {
case 0:
return red0;
case 1:
return red1;
case 2:
return (4 * red0 + 1 * red1) / 5;
case 3:
return (3 * red0 + 2 * red1) / 5;
case 4:
return (2 * red0 + 3 * red1) / 5;
case 5:
return (1 * red0 + 4 * red1) / 5;
case 6:
return 0;
case 7:
return 0xff;
}
}
return 0;
}
void main() {
uvec2 packed_bits = imageLoad(bc4_input, ivec3(gl_WorkGroupID + src_offset)).rg;
uint64_t bits = packUint2x32(packed_bits);
uint red = DecompressBlock(bits, gl_LocalInvocationID.xy);
uvec4 color = uvec4(red & 0xff, 0, 0, 0xff);
imageStore(bc4_output, ivec3(gl_GlobalInvocationID + dst_offset), color);
}

View File

@@ -0,0 +1,14 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430 core
layout (location = 0) in vec2 frag_tex_coord;
layout (location = 0) out vec4 color;
layout (binding = 0) uniform sampler2D color_texture;
void main() {
color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f);
}

View File

@@ -0,0 +1,28 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430 core
out gl_PerVertex {
vec4 gl_Position;
};
layout (location = 0) in vec2 vert_position;
layout (location = 1) in vec2 vert_tex_coord;
layout (location = 0) out vec2 frag_tex_coord;
// This is a truncated 3x3 matrix for 2D transformations:
// The upper-left 2x2 submatrix performs scaling/rotation/mirroring.
// The third column performs translation.
// The third row could be used for projection, which we don't need in 2D. It hence is assumed to
// implicitly be [0, 0, 1]
layout (location = 0) uniform mat3x2 modelview_matrix;
void main() {
// Multiply input position by the rotscale part of the matrix and then manually translate by
// the last column. This is equivalent to using a full 3x3 matrix and expanding the vector
// to `vec3(vert_position.xy, 1.0)`
gl_Position = vec4(mat2(modelview_matrix) * vert_position + modelview_matrix[2], 0.0, 1.0);
frag_tex_coord = vert_tex_coord;
}

View File

@@ -0,0 +1,86 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout (location = n) uniform
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0
#endif
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec2 origin;
UNIFORM(1) ivec2 destination;
UNIFORM(2) uint bytes_per_block;
UNIFORM(3) uint pitch;
END_PUSH_CONSTANTS
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_IMAGE) writeonly uniform uimage2D output_image;
layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
uvec4 ReadTexel(uint offset) {
switch (bytes_per_block) {
#if HAS_EXTENDED_TYPES
case 1:
return uvec4(u8data[offset], 0, 0, 0);
case 2:
return uvec4(u16data[offset / 2], 0, 0, 0);
#else
case 1:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
case 2:
return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
#endif
case 4:
return uvec4(u32data[offset / 4], 0, 0, 0);
case 8:
return uvec4(u64data[offset / 8], 0, 0);
case 16:
return u128data[offset / 16];
}
return uvec4(0);
}
void main() {
uvec2 pos = gl_GlobalInvocationID.xy + origin;
uint offset = 0;
offset += pos.x * bytes_per_block;
offset += pos.y * pitch;
const uvec4 texel = ReadTexel(offset);
const ivec2 coord = ivec2(gl_GlobalInvocationID.xy) + destination;
imageStore(output_image, coord, texel);
}

View File

@@ -0,0 +1,9 @@
#pragma once
#include <string_view>
namespace HostShaders {
constexpr std::string_view @CONTENTS_NAME@ = R"(@CONTENTS@)";
} // namespace HostShaders

View File

@@ -0,0 +1,14 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
layout(binding = 0) uniform sampler2D tex;
layout(location = 0) in vec2 texcoord;
layout(location = 0) out vec4 color;
void main() {
color = textureLod(tex, texcoord, 0);
}

View File

@@ -0,0 +1,16 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 450
#extension GL_ARB_shader_stencil_export : require
layout(binding = 0) uniform sampler2D depth_tex;
layout(binding = 1) uniform isampler2D stencil_tex;
layout(location = 0) in vec2 texcoord;
void main() {
gl_FragDepth = textureLod(depth_tex, texcoord, 0).r;
gl_FragStencilRefARB = textureLod(stencil_tex, texcoord, 0).r;
}

View File

@@ -0,0 +1,15 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 460 core
layout (location = 0) in vec2 frag_tex_coord;
layout (location = 0) out vec4 color;
layout (binding = 1) uniform sampler2D color_texture;
void main() {
color = texture(color_texture, frag_tex_coord);
}

View File

@@ -0,0 +1,19 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 460 core
layout (location = 0) in vec2 vert_position;
layout (location = 1) in vec2 vert_tex_coord;
layout (location = 0) out vec2 frag_tex_coord;
layout (set = 0, binding = 0) uniform MatrixBlock {
mat4 modelview_matrix;
};
void main() {
gl_Position = modelview_matrix * vec4(vert_position, 0.0, 1.0);
frag_tex_coord = vert_tex_coord;
}

View File

@@ -0,0 +1,28 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 460 core
layout (local_size_x = 1024) in;
layout (std430, set = 0, binding = 0) buffer OutputBuffer {
uint output_indexes[];
};
layout (push_constant) uniform PushConstants {
uint first;
};
void main() {
uint primitive = gl_GlobalInvocationID.x;
if (primitive * 6 >= output_indexes.length()) {
return;
}
const uint quad_map[6] = uint[](0, 1, 2, 0, 2, 3);
for (uint vertex = 0; vertex < 6; ++vertex) {
uint index = first + primitive * 4 + quad_map[vertex];
output_indexes[primitive * 6 + vertex] = index;
}
}

View File

@@ -0,0 +1,41 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 460 core
layout (local_size_x = 1024) in;
layout (std430, set = 0, binding = 0) readonly buffer InputBuffer {
uint input_indexes[];
};
layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer {
uint output_indexes[];
};
layout (push_constant) uniform PushConstants {
uint base_vertex;
int index_shift; // 0: uint8, 1: uint16, 2: uint32
};
void main() {
int primitive = int(gl_GlobalInvocationID.x);
if (primitive * 6 >= output_indexes.length()) {
return;
}
int index_size = 8 << index_shift;
int flipped_shift = 2 - index_shift;
int mask = (1 << flipped_shift) - 1;
const int quad_swizzle[6] = int[](0, 1, 2, 0, 2, 3);
for (uint vertex = 0; vertex < 6; ++vertex) {
int offset = primitive * 4 + quad_swizzle[vertex];
int int_offset = offset >> flipped_shift;
int bit_offset = (offset & mask) * index_size;
uint packed_input = input_indexes[int_offset];
uint index = bitfieldExtract(packed_input, bit_offset, index_size);
output_indexes[primitive * 6 + vertex] = index + base_vertex;
}
}

View File

@@ -0,0 +1,24 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 460 core
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
layout (local_size_x = 1024) in;
layout (std430, set = 0, binding = 0) readonly buffer InputBuffer {
uint8_t input_indexes[];
};
layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer {
uint16_t output_indexes[];
};
void main() {
uint id = gl_GlobalInvocationID.x;
if (id < input_indexes.length()) {
output_indexes[id] = uint16_t(input_indexes[id]);
}
}

91
src/video_core/macro/macro.cpp Executable file
View File

@@ -0,0 +1,91 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <optional>
#include <boost/container_hash/hash.hpp>
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/settings.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro/macro.h"
#include "video_core/macro/macro_hle.h"
#include "video_core/macro/macro_interpreter.h"
#include "video_core/macro/macro_jit_x64.h"
namespace Tegra {
MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
: hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {}
MacroEngine::~MacroEngine() = default;
void MacroEngine::AddCode(u32 method, u32 data) {
uploaded_macro_code[method].push_back(data);
}
void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method,
const std::vector<u32>& parameters) {
auto compiled_macro = macro_cache.find(method);
if (compiled_macro != macro_cache.end()) {
const auto& cache_info = compiled_macro->second;
if (cache_info.has_hle_program) {
cache_info.hle_program->Execute(parameters, method);
} else {
cache_info.lle_program->Execute(parameters, method);
}
} else {
// Macro not compiled, check if it's uploaded and if so, compile it
std::optional<u32> mid_method;
const auto macro_code = uploaded_macro_code.find(method);
if (macro_code == uploaded_macro_code.end()) {
for (const auto& [method_base, code] : uploaded_macro_code) {
if (method >= method_base && (method - method_base) < code.size()) {
mid_method = method_base;
break;
}
}
if (!mid_method.has_value()) {
UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
return;
}
}
auto& cache_info = macro_cache[method];
if (!mid_method.has_value()) {
cache_info.lle_program = Compile(macro_code->second);
cache_info.hash = boost::hash_value(macro_code->second);
} else {
const auto& macro_cached = uploaded_macro_code[mid_method.value()];
const auto rebased_method = method - mid_method.value();
auto& code = uploaded_macro_code[method];
code.resize(macro_cached.size() - rebased_method);
std::memcpy(code.data(), macro_cached.data() + rebased_method,
code.size() * sizeof(u32));
cache_info.hash = boost::hash_value(code);
cache_info.lle_program = Compile(code);
}
auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
if (hle_program.has_value()) {
cache_info.has_hle_program = true;
cache_info.hle_program = std::move(hle_program.value());
cache_info.hle_program->Execute(parameters, method);
} else {
cache_info.lle_program->Execute(parameters, method);
}
}
}
std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
if (Settings::values.disable_macro_jit) {
return std::make_unique<MacroInterpreter>(maxwell3d);
}
#ifdef ARCHITECTURE_x86_64
return std::make_unique<MacroJITx64>(maxwell3d);
#else
return std::make_unique<MacroInterpreter>(maxwell3d);
#endif
}
} // namespace Tegra

142
src/video_core/macro/macro.h Executable file
View File

@@ -0,0 +1,142 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include "common/bit_field.h"
#include "common/common_types.h"
namespace Tegra {
namespace Engines {
class Maxwell3D;
}
namespace Macro {
constexpr std::size_t NUM_MACRO_REGISTERS = 8;
enum class Operation : u32 {
ALU = 0,
AddImmediate = 1,
ExtractInsert = 2,
ExtractShiftLeftImmediate = 3,
ExtractShiftLeftRegister = 4,
Read = 5,
Unused = 6, // This operation doesn't seem to be a valid encoding.
Branch = 7,
};
enum class ALUOperation : u32 {
Add = 0,
AddWithCarry = 1,
Subtract = 2,
SubtractWithBorrow = 3,
// Operations 4-7 don't seem to be valid encodings.
Xor = 8,
Or = 9,
And = 10,
AndNot = 11,
Nand = 12
};
enum class ResultOperation : u32 {
IgnoreAndFetch = 0,
Move = 1,
MoveAndSetMethod = 2,
FetchAndSend = 3,
MoveAndSend = 4,
FetchAndSetMethod = 5,
MoveAndSetMethodFetchAndSend = 6,
MoveAndSetMethodSend = 7
};
enum class BranchCondition : u32 {
Zero = 0,
NotZero = 1,
};
union Opcode {
u32 raw;
BitField<0, 3, Operation> operation;
BitField<4, 3, ResultOperation> result_operation;
BitField<4, 1, BranchCondition> branch_condition;
// If set on a branch, then the branch doesn't have a delay slot.
BitField<5, 1, u32> branch_annul;
BitField<7, 1, u32> is_exit;
BitField<8, 3, u32> dst;
BitField<11, 3, u32> src_a;
BitField<14, 3, u32> src_b;
// The signed immediate overlaps the second source operand and the alu operation.
BitField<14, 18, s32> immediate;
BitField<17, 5, ALUOperation> alu_operation;
// Bitfield instructions data
BitField<17, 5, u32> bf_src_bit;
BitField<22, 5, u32> bf_size;
BitField<27, 5, u32> bf_dst_bit;
u32 GetBitfieldMask() const {
return (1 << bf_size) - 1;
}
s32 GetBranchTarget() const {
return static_cast<s32>(immediate * sizeof(u32));
}
};
union MethodAddress {
u32 raw;
BitField<0, 12, u32> address;
BitField<12, 6, u32> increment;
};
} // namespace Macro
class HLEMacro;
class CachedMacro {
public:
virtual ~CachedMacro() = default;
/**
* Executes the macro code with the specified input parameters.
*
* @param parameters The parameters of the macro
* @param method The method to execute
*/
virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
};
class MacroEngine {
public:
explicit MacroEngine(Engines::Maxwell3D& maxwell3d);
virtual ~MacroEngine();
// Store the uploaded macro code to compile them when they're called.
void AddCode(u32 method, u32 data);
// Compiles the macro if its not in the cache, and executes the compiled macro
void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters);
protected:
virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
private:
struct CacheInfo {
std::unique_ptr<CachedMacro> lle_program{};
std::unique_ptr<CachedMacro> hle_program{};
u64 hash{};
bool has_hle_program{};
};
std::unordered_map<u32, CacheInfo> macro_cache;
std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
std::unique_ptr<HLEMacro> hle_macros;
};
std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
} // namespace Tegra

View File

@@ -0,0 +1,109 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include <vector>
#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro/macro_hle.h"
#include "video_core/rasterizer_interface.h"
namespace Tegra {
namespace {
// HLE'd functions
void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
maxwell3d.regs.draw.topology.Assign(
static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0x3ffffff));
maxwell3d.regs.vb_base_instance = parameters[5];
maxwell3d.mme_draw.instance_count = instance_count;
maxwell3d.regs.vb_element_base = parameters[3];
maxwell3d.regs.index_array.count = parameters[1];
maxwell3d.regs.index_array.first = parameters[4];
if (maxwell3d.ShouldExecute()) {
maxwell3d.Rasterizer().Draw(true, true);
}
maxwell3d.regs.index_array.count = 0;
maxwell3d.mme_draw.instance_count = 0;
maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
}
void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
maxwell3d.regs.vertex_buffer.first = parameters[3];
maxwell3d.regs.vertex_buffer.count = parameters[1];
maxwell3d.regs.vb_base_instance = parameters[4];
maxwell3d.regs.draw.topology.Assign(
static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
maxwell3d.mme_draw.instance_count = count;
if (maxwell3d.ShouldExecute()) {
maxwell3d.Rasterizer().Draw(false, true);
}
maxwell3d.regs.vertex_buffer.count = 0;
maxwell3d.mme_draw.instance_count = 0;
maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
}
void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
const u32 element_base = parameters[4];
const u32 base_instance = parameters[5];
maxwell3d.regs.index_array.first = parameters[3];
maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base?
maxwell3d.regs.index_array.count = parameters[1];
maxwell3d.regs.vb_element_base = element_base;
maxwell3d.regs.vb_base_instance = base_instance;
maxwell3d.mme_draw.instance_count = instance_count;
maxwell3d.CallMethodFromMME(0x8e3, 0x640);
maxwell3d.CallMethodFromMME(0x8e4, element_base);
maxwell3d.CallMethodFromMME(0x8e5, base_instance);
maxwell3d.regs.draw.topology.Assign(
static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
if (maxwell3d.ShouldExecute()) {
maxwell3d.Rasterizer().Draw(true, true);
}
maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base?
maxwell3d.regs.index_array.count = 0;
maxwell3d.regs.vb_element_base = 0x0;
maxwell3d.regs.vb_base_instance = 0x0;
maxwell3d.mme_draw.instance_count = 0;
maxwell3d.CallMethodFromMME(0x8e3, 0x640);
maxwell3d.CallMethodFromMME(0x8e4, 0x0);
maxwell3d.CallMethodFromMME(0x8e5, 0x0);
maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
}
} // Anonymous namespace
constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
{0x771BB18C62444DA0, &HLE_771BB18C62444DA0},
{0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD},
{0x0217920100488FF7, &HLE_0217920100488FF7},
}};
HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {}
HLEMacro::~HLEMacro() = default;
std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(),
[hash](const auto& pair) { return pair.first == hash; });
if (it == hle_funcs.end()) {
return std::nullopt;
}
return std::make_unique<HLEMacroImpl>(maxwell3d, it->second);
}
HLEMacroImpl::~HLEMacroImpl() = default;
HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d_, HLEFunction func_)
: maxwell3d{maxwell3d_}, func{func_} {}
void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) {
func(maxwell3d, parameters);
}
} // namespace Tegra

View File

@@ -0,0 +1,44 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <optional>
#include <vector>
#include "common/common_types.h"
#include "video_core/macro/macro.h"
namespace Tegra {
namespace Engines {
class Maxwell3D;
}
using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
class HLEMacro {
public:
explicit HLEMacro(Engines::Maxwell3D& maxwell3d_);
~HLEMacro();
std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;
private:
Engines::Maxwell3D& maxwell3d;
};
class HLEMacroImpl : public CachedMacro {
public:
explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func);
~HLEMacroImpl();
void Execute(const std::vector<u32>& parameters, u32 method) override;
private:
Engines::Maxwell3D& maxwell3d;
HLEFunction func;
};
} // namespace Tegra

View File

@@ -0,0 +1,287 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro/macro_interpreter.h"
MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
namespace Tegra {
MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d_)
: MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
}
MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_,
const std::vector<u32>& code_)
: maxwell3d{maxwell3d_}, code{code_} {}
void MacroInterpreterImpl::Execute(const std::vector<u32>& params, u32 method) {
MICROPROFILE_SCOPE(MacroInterp);
Reset();
registers[1] = params[0];
num_parameters = params.size();
if (num_parameters > parameters_capacity) {
parameters_capacity = num_parameters;
parameters = std::make_unique<u32[]>(num_parameters);
}
std::memcpy(parameters.get(), params.data(), num_parameters * sizeof(u32));
// Execute the code until we hit an exit condition.
bool keep_executing = true;
while (keep_executing) {
keep_executing = Step(false);
}
// Assert the the macro used all the input parameters
ASSERT(next_parameter_index == num_parameters);
}
void MacroInterpreterImpl::Reset() {
registers = {};
pc = 0;
delayed_pc = {};
method_address.raw = 0;
num_parameters = 0;
// The next parameter index starts at 1, because $r1 already has the value of the first
// parameter.
next_parameter_index = 1;
carry_flag = false;
}
bool MacroInterpreterImpl::Step(bool is_delay_slot) {
u32 base_address = pc;
Macro::Opcode opcode = GetOpcode();
pc += 4;
// Update the program counter if we were delayed
if (delayed_pc) {
ASSERT(is_delay_slot);
pc = *delayed_pc;
delayed_pc = {};
}
switch (opcode.operation) {
case Macro::Operation::ALU: {
u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
GetRegister(opcode.src_b));
ProcessResult(opcode.result_operation, opcode.dst, result);
break;
}
case Macro::Operation::AddImmediate: {
ProcessResult(opcode.result_operation, opcode.dst,
GetRegister(opcode.src_a) + opcode.immediate);
break;
}
case Macro::Operation::ExtractInsert: {
u32 dst = GetRegister(opcode.src_a);
u32 src = GetRegister(opcode.src_b);
src = (src >> opcode.bf_src_bit) & opcode.GetBitfieldMask();
dst &= ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
dst |= src << opcode.bf_dst_bit;
ProcessResult(opcode.result_operation, opcode.dst, dst);
break;
}
case Macro::Operation::ExtractShiftLeftImmediate: {
u32 dst = GetRegister(opcode.src_a);
u32 src = GetRegister(opcode.src_b);
u32 result = ((src >> dst) & opcode.GetBitfieldMask()) << opcode.bf_dst_bit;
ProcessResult(opcode.result_operation, opcode.dst, result);
break;
}
case Macro::Operation::ExtractShiftLeftRegister: {
u32 dst = GetRegister(opcode.src_a);
u32 src = GetRegister(opcode.src_b);
u32 result = ((src >> opcode.bf_src_bit) & opcode.GetBitfieldMask()) << dst;
ProcessResult(opcode.result_operation, opcode.dst, result);
break;
}
case Macro::Operation::Read: {
u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
ProcessResult(opcode.result_operation, opcode.dst, result);
break;
}
case Macro::Operation::Branch: {
ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
u32 value = GetRegister(opcode.src_a);
bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
if (taken) {
// Ignore the delay slot if the branch has the annul bit.
if (opcode.branch_annul) {
pc = base_address + opcode.GetBranchTarget();
return true;
}
delayed_pc = base_address + opcode.GetBranchTarget();
// Execute one more instruction due to the delay slot.
return Step(true);
}
break;
}
default:
UNIMPLEMENTED_MSG("Unimplemented macro operation {}", opcode.operation.Value());
}
// An instruction with the Exit flag will not actually
// cause an exit if it's executed inside a delay slot.
if (opcode.is_exit && !is_delay_slot) {
// Exit has a delay slot, execute the next instruction
Step(true);
return false;
}
return true;
}
u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
switch (operation) {
case Macro::ALUOperation::Add: {
const u64 result{static_cast<u64>(src_a) + src_b};
carry_flag = result > 0xffffffff;
return static_cast<u32>(result);
}
case Macro::ALUOperation::AddWithCarry: {
const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
carry_flag = result > 0xffffffff;
return static_cast<u32>(result);
}
case Macro::ALUOperation::Subtract: {
const u64 result{static_cast<u64>(src_a) - src_b};
carry_flag = result < 0x100000000;
return static_cast<u32>(result);
}
case Macro::ALUOperation::SubtractWithBorrow: {
const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
carry_flag = result < 0x100000000;
return static_cast<u32>(result);
}
case Macro::ALUOperation::Xor:
return src_a ^ src_b;
case Macro::ALUOperation::Or:
return src_a | src_b;
case Macro::ALUOperation::And:
return src_a & src_b;
case Macro::ALUOperation::AndNot:
return src_a & ~src_b;
case Macro::ALUOperation::Nand:
return ~(src_a & src_b);
default:
UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", operation);
return 0;
}
}
void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
switch (operation) {
case Macro::ResultOperation::IgnoreAndFetch:
// Fetch parameter and ignore result.
SetRegister(reg, FetchParameter());
break;
case Macro::ResultOperation::Move:
// Move result.
SetRegister(reg, result);
break;
case Macro::ResultOperation::MoveAndSetMethod:
// Move result and use as Method Address.
SetRegister(reg, result);
SetMethodAddress(result);
break;
case Macro::ResultOperation::FetchAndSend:
// Fetch parameter and send result.
SetRegister(reg, FetchParameter());
Send(result);
break;
case Macro::ResultOperation::MoveAndSend:
// Move and send result.
SetRegister(reg, result);
Send(result);
break;
case Macro::ResultOperation::FetchAndSetMethod:
// Fetch parameter and use result as Method Address.
SetRegister(reg, FetchParameter());
SetMethodAddress(result);
break;
case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
// Move result and use as Method Address, then fetch and send parameter.
SetRegister(reg, result);
SetMethodAddress(result);
Send(FetchParameter());
break;
case Macro::ResultOperation::MoveAndSetMethodSend:
// Move result and use as Method Address, then send bits 12:17 of result.
SetRegister(reg, result);
SetMethodAddress(result);
Send((result >> 12) & 0b111111);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented result operation {}", operation);
}
}
bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
switch (cond) {
case Macro::BranchCondition::Zero:
return value == 0;
case Macro::BranchCondition::NotZero:
return value != 0;
}
UNREACHABLE();
return true;
}
Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
ASSERT((pc % sizeof(u32)) == 0);
ASSERT(pc < code.size() * sizeof(u32));
return {code[pc / sizeof(u32)]};
}
u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
return registers.at(register_id);
}
void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
// Register 0 is hardwired as the zero register.
// Ensure no writes to it actually occur.
if (register_id == 0) {
return;
}
registers.at(register_id) = value;
}
void MacroInterpreterImpl::SetMethodAddress(u32 address) {
method_address.raw = address;
}
void MacroInterpreterImpl::Send(u32 value) {
maxwell3d.CallMethodFromMME(method_address.address, value);
// Increment the method address by the method increment.
method_address.address.Assign(method_address.address.Value() +
method_address.increment.Value());
}
u32 MacroInterpreterImpl::Read(u32 method) const {
return maxwell3d.GetRegisterValue(method);
}
u32 MacroInterpreterImpl::FetchParameter() {
ASSERT(next_parameter_index < num_parameters);
return parameters[next_parameter_index++];
}
} // namespace Tegra

View File

@@ -0,0 +1,102 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <optional>
#include <vector>
#include "common/bit_field.h"
#include "common/common_types.h"
#include "video_core/macro/macro.h"
namespace Tegra {
namespace Engines {
class Maxwell3D;
}
class MacroInterpreter final : public MacroEngine {
public:
explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d_);
protected:
std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
private:
Engines::Maxwell3D& maxwell3d;
};
class MacroInterpreterImpl : public CachedMacro {
public:
explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_);
void Execute(const std::vector<u32>& params, u32 method) override;
private:
/// Resets the execution engine state, zeroing registers, etc.
void Reset();
/**
* Executes a single macro instruction located at the current program counter. Returns whether
* the interpreter should keep running.
*
* @param is_delay_slot Whether the current step is being executed due to a delay slot in a
* previous instruction.
*/
bool Step(bool is_delay_slot);
/// Calculates the result of an ALU operation. src_a OP src_b;
u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
/// Performs the result operation on the input result and stores it in the specified register
/// (if necessary).
void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
/// Evaluates the branch condition and returns whether the branch should be taken or not.
bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
/// Reads an opcode at the current program counter location.
Macro::Opcode GetOpcode() const;
/// Returns the specified register's value. Register 0 is hardcoded to always return 0.
u32 GetRegister(u32 register_id) const;
/// Sets the register to the input value.
void SetRegister(u32 register_id, u32 value);
/// Sets the method address to use for the next Send instruction.
void SetMethodAddress(u32 address);
/// Calls a GPU Engine method with the input parameter.
void Send(u32 value);
/// Reads a GPU register located at the method address.
u32 Read(u32 method) const;
/// Returns the next parameter in the parameter queue.
u32 FetchParameter();
Engines::Maxwell3D& maxwell3d;
/// Current program counter
u32 pc;
/// Program counter to execute at after the delay slot is executed.
std::optional<u32> delayed_pc;
/// General purpose macro registers.
std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
/// Method address to use for the next Send instruction.
Macro::MethodAddress method_address = {};
/// Input parameters of the current macro.
std::unique_ptr<u32[]> parameters;
std::size_t num_parameters = 0;
std::size_t parameters_capacity = 0;
/// Index of the next parameter that will be fetched by the 'parm' instruction.
u32 next_parameter_index = 0;
bool carry_flag = false;
const std::vector<u32>& code;
};
} // namespace Tegra

View File

@@ -0,0 +1,619 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "common/x64/xbyak_util.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro/macro_interpreter.h"
#include "video_core/macro/macro_jit_x64.h"
MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47));
MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
namespace Tegra {
constexpr Xbyak::Reg64 STATE = Xbyak::util::rbx;
constexpr Xbyak::Reg32 RESULT = Xbyak::util::ebp;
constexpr Xbyak::Reg64 PARAMETERS = Xbyak::util::r12;
constexpr Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
constexpr Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
STATE,
RESULT,
PARAMETERS,
METHOD_ADDRESS,
BRANCH_HOLDER,
});
MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d_)
: MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
}
MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_)
: CodeGenerator{MAX_CODE_SIZE}, code{code_}, maxwell3d{maxwell3d_} {
Compile();
}
MacroJITx64Impl::~MacroJITx64Impl() = default;
void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
MICROPROFILE_SCOPE(MacroJitExecute);
ASSERT_OR_EXECUTE(program != nullptr, { return; });
JITState state{};
state.maxwell3d = &maxwell3d;
state.registers = {};
program(&state, parameters.data());
}
void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
const bool is_a_zero = opcode.src_a == 0;
const bool is_b_zero = opcode.src_b == 0;
const bool valid_operation = !is_a_zero && !is_b_zero;
[[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero;
const bool has_zero_register = is_a_zero || is_b_zero;
const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry ||
opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow;
Xbyak::Reg32 src_a;
Xbyak::Reg32 src_b;
if (!optimizer.zero_reg_skip || no_zero_reg_skip) {
src_a = Compile_GetRegister(opcode.src_a, RESULT);
src_b = Compile_GetRegister(opcode.src_b, eax);
} else {
if (!is_a_zero) {
src_a = Compile_GetRegister(opcode.src_a, RESULT);
}
if (!is_b_zero) {
src_b = Compile_GetRegister(opcode.src_b, eax);
}
}
bool has_emitted = false;
switch (opcode.alu_operation) {
case Macro::ALUOperation::Add:
if (optimizer.zero_reg_skip) {
if (valid_operation) {
add(src_a, src_b);
}
} else {
add(src_a, src_b);
}
if (!optimizer.can_skip_carry) {
setc(byte[STATE + offsetof(JITState, carry_flag)]);
}
break;
case Macro::ALUOperation::AddWithCarry:
bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
adc(src_a, src_b);
setc(byte[STATE + offsetof(JITState, carry_flag)]);
break;
case Macro::ALUOperation::Subtract:
if (optimizer.zero_reg_skip) {
if (valid_operation) {
sub(src_a, src_b);
has_emitted = true;
}
} else {
sub(src_a, src_b);
has_emitted = true;
}
if (!optimizer.can_skip_carry && has_emitted) {
setc(byte[STATE + offsetof(JITState, carry_flag)]);
}
break;
case Macro::ALUOperation::SubtractWithBorrow:
bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
sbb(src_a, src_b);
setc(byte[STATE + offsetof(JITState, carry_flag)]);
break;
case Macro::ALUOperation::Xor:
if (optimizer.zero_reg_skip) {
if (valid_operation) {
xor_(src_a, src_b);
}
} else {
xor_(src_a, src_b);
}
break;
case Macro::ALUOperation::Or:
if (optimizer.zero_reg_skip) {
if (valid_operation) {
or_(src_a, src_b);
}
} else {
or_(src_a, src_b);
}
break;
case Macro::ALUOperation::And:
if (optimizer.zero_reg_skip) {
if (!has_zero_register) {
and_(src_a, src_b);
}
} else {
and_(src_a, src_b);
}
break;
case Macro::ALUOperation::AndNot:
if (optimizer.zero_reg_skip) {
if (!is_a_zero) {
not_(src_b);
and_(src_a, src_b);
}
} else {
not_(src_b);
and_(src_a, src_b);
}
break;
case Macro::ALUOperation::Nand:
if (optimizer.zero_reg_skip) {
if (!is_a_zero) {
and_(src_a, src_b);
not_(src_a);
}
} else {
and_(src_a, src_b);
not_(src_a);
}
break;
default:
UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", opcode.alu_operation.Value());
break;
}
Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
if (optimizer.skip_dummy_addimmediate) {
// Games tend to use this as an exit instruction placeholder. It's to encode an instruction
// without doing anything. In our case we can just not emit anything.
if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
return;
}
}
// Check for redundant moves
if (optimizer.optimize_for_method_move &&
opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
if (next_opcode.has_value()) {
const auto next = *next_opcode;
if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod &&
opcode.dst == next.dst) {
return;
}
}
}
if (optimizer.zero_reg_skip && opcode.src_a == 0) {
if (opcode.immediate == 0) {
xor_(RESULT, RESULT);
} else {
mov(RESULT, opcode.immediate);
}
} else {
auto result = Compile_GetRegister(opcode.src_a, RESULT);
if (opcode.immediate > 2) {
add(result, opcode.immediate);
} else if (opcode.immediate == 1) {
inc(result);
} else if (opcode.immediate < 0) {
sub(result, opcode.immediate * -1);
}
}
Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
auto dst = Compile_GetRegister(opcode.src_a, RESULT);
auto src = Compile_GetRegister(opcode.src_b, eax);
if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) {
shr(src, opcode.bf_src_bit);
} else if (opcode.bf_src_bit == 31) {
xor_(src, src);
}
// Don't bother masking the whole register since we're using a 32 bit register
if (opcode.bf_size != 31 && opcode.bf_size != 0) {
and_(src, opcode.GetBitfieldMask());
} else if (opcode.bf_size == 0) {
xor_(src, src);
}
if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) {
shl(src, opcode.bf_dst_bit);
} else if (opcode.bf_dst_bit == 31) {
xor_(src, src);
}
const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
if (mask != 0xffffffff) {
and_(dst, mask);
}
or_(dst, src);
Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
const auto dst = Compile_GetRegister(opcode.src_a, ecx);
const auto src = Compile_GetRegister(opcode.src_b, RESULT);
shr(src, dst.cvt8());
if (opcode.bf_size != 0 && opcode.bf_size != 31) {
and_(src, opcode.GetBitfieldMask());
} else if (opcode.bf_size == 0) {
xor_(src, src);
}
if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) {
shl(src, opcode.bf_dst_bit);
} else if (opcode.bf_dst_bit == 31) {
xor_(src, src);
}
Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
const auto dst = Compile_GetRegister(opcode.src_a, ecx);
const auto src = Compile_GetRegister(opcode.src_b, RESULT);
if (opcode.bf_src_bit != 0) {
shr(src, opcode.bf_src_bit);
}
if (opcode.bf_size != 31) {
and_(src, opcode.GetBitfieldMask());
}
shl(src, dst.cvt8());
Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
if (optimizer.zero_reg_skip && opcode.src_a == 0) {
if (opcode.immediate == 0) {
xor_(RESULT, RESULT);
} else {
mov(RESULT, opcode.immediate);
}
} else {
auto result = Compile_GetRegister(opcode.src_a, RESULT);
if (opcode.immediate > 2) {
add(result, opcode.immediate);
} else if (opcode.immediate == 1) {
inc(result);
} else if (opcode.immediate < 0) {
sub(result, opcode.immediate * -1);
}
}
// Equivalent to Engines::Maxwell3D::GetRegisterValue:
if (optimizer.enable_asserts) {
Xbyak::Label pass_range_check;
cmp(RESULT, static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS));
jb(pass_range_check);
int3();
L(pass_range_check);
}
mov(rax, qword[STATE]);
mov(RESULT,
dword[rax + offsetof(Engines::Maxwell3D, regs) +
offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]);
Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
maxwell3d->CallMethodFromMME(method_address.address, value);
}
void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
mov(Common::X64::ABI_PARAM1, qword[STATE]);
mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
mov(Common::X64::ABI_PARAM3, value);
Common::X64::CallFarFunction(*this, &Send);
Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
Xbyak::Label dont_process{};
// Get increment
test(METHOD_ADDRESS, 0x3f000);
// If zero, method address doesn't update
je(dont_process);
mov(ecx, METHOD_ADDRESS);
and_(METHOD_ADDRESS, 0xfff);
shr(ecx, 12);
and_(ecx, 0x3f);
lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]);
sal(ecx, 12);
or_(eax, ecx);
mov(METHOD_ADDRESS, eax);
L(dont_process);
}
void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
const s32 jump_address =
static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32));
Xbyak::Label end;
auto value = Compile_GetRegister(opcode.src_a, eax);
test(value, value);
if (optimizer.has_delayed_pc) {
switch (opcode.branch_condition) {
case Macro::BranchCondition::Zero:
jne(end, T_NEAR);
break;
case Macro::BranchCondition::NotZero:
je(end, T_NEAR);
break;
}
if (opcode.branch_annul) {
xor_(BRANCH_HOLDER, BRANCH_HOLDER);
jmp(labels[jump_address], T_NEAR);
} else {
Xbyak::Label handle_post_exit{};
Xbyak::Label skip{};
jmp(skip, T_NEAR);
if (opcode.is_exit) {
L(handle_post_exit);
// Execute 1 instruction
mov(BRANCH_HOLDER, end_of_code);
// Jump to next instruction to skip delay slot check
jmp(labels[jump_address], T_NEAR);
} else {
L(handle_post_exit);
xor_(BRANCH_HOLDER, BRANCH_HOLDER);
jmp(labels[jump_address], T_NEAR);
}
L(skip);
mov(BRANCH_HOLDER, handle_post_exit);
jmp(delay_skip[pc], T_NEAR);
}
} else {
switch (opcode.branch_condition) {
case Macro::BranchCondition::Zero:
je(labels[jump_address], T_NEAR);
break;
case Macro::BranchCondition::NotZero:
jne(labels[jump_address], T_NEAR);
break;
}
}
L(end);
}
void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() {
optimizer.can_skip_carry = true;
optimizer.has_delayed_pc = false;
for (auto raw_op : code) {
Macro::Opcode op{};
op.raw = raw_op;
if (op.operation == Macro::Operation::ALU) {
// Scan for any ALU operations which actually use the carry flag, if they don't exist in
// our current code we can skip emitting the carry flag handling operations
if (op.alu_operation == Macro::ALUOperation::AddWithCarry ||
op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) {
optimizer.can_skip_carry = false;
}
}
if (op.operation == Macro::Operation::Branch) {
if (!op.branch_annul) {
optimizer.has_delayed_pc = true;
}
}
}
}
void MacroJITx64Impl::Compile() {
MICROPROFILE_SCOPE(MacroJitCompile);
labels.fill(Xbyak::Label());
Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
// JIT state
mov(STATE, Common::X64::ABI_PARAM1);
mov(PARAMETERS, Common::X64::ABI_PARAM2);
xor_(RESULT, RESULT);
xor_(METHOD_ADDRESS, METHOD_ADDRESS);
xor_(BRANCH_HOLDER, BRANCH_HOLDER);
mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter());
// Track get register for zero registers and mark it as no-op
optimizer.zero_reg_skip = true;
// AddImmediate tends to be used as a NOP instruction, if we detect this we can
// completely skip the entire code path and no emit anything
optimizer.skip_dummy_addimmediate = true;
// SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting
// one if our register isn't "dirty"
optimizer.optimize_for_method_move = true;
// Enable run-time assertions in JITted code
optimizer.enable_asserts = false;
// Check to see if we can skip emitting certain instructions
Optimizer_ScanFlags();
const u32 op_count = static_cast<u32>(code.size());
for (u32 i = 0; i < op_count; i++) {
if (i < op_count - 1) {
pc = i + 1;
next_opcode = GetOpCode();
} else {
next_opcode = {};
}
pc = i;
Compile_NextInstruction();
}
L(end_of_code);
Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
ret();
ready();
program = getCode<ProgramType>();
}
bool MacroJITx64Impl::Compile_NextInstruction() {
const auto opcode = GetOpCode();
if (labels[pc].getAddress()) {
return false;
}
L(labels[pc]);
switch (opcode.operation) {
case Macro::Operation::ALU:
Compile_ALU(opcode);
break;
case Macro::Operation::AddImmediate:
Compile_AddImmediate(opcode);
break;
case Macro::Operation::ExtractInsert:
Compile_ExtractInsert(opcode);
break;
case Macro::Operation::ExtractShiftLeftImmediate:
Compile_ExtractShiftLeftImmediate(opcode);
break;
case Macro::Operation::ExtractShiftLeftRegister:
Compile_ExtractShiftLeftRegister(opcode);
break;
case Macro::Operation::Read:
Compile_Read(opcode);
break;
case Macro::Operation::Branch:
Compile_Branch(opcode);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
break;
}
if (optimizer.has_delayed_pc) {
if (opcode.is_exit) {
mov(rax, end_of_code);
test(BRANCH_HOLDER, BRANCH_HOLDER);
cmove(BRANCH_HOLDER, rax);
// Jump to next instruction to skip delay slot check
je(labels[pc + 1], T_NEAR);
} else {
// TODO(ogniK): Optimize delay slot branching
Xbyak::Label no_delay_slot{};
test(BRANCH_HOLDER, BRANCH_HOLDER);
je(no_delay_slot, T_NEAR);
mov(rax, BRANCH_HOLDER);
xor_(BRANCH_HOLDER, BRANCH_HOLDER);
jmp(rax);
L(no_delay_slot);
}
L(delay_skip[pc]);
if (opcode.is_exit) {
return false;
}
} else {
test(BRANCH_HOLDER, BRANCH_HOLDER);
jne(end_of_code, T_NEAR);
if (opcode.is_exit) {
inc(BRANCH_HOLDER);
return false;
}
}
return true;
}
Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
mov(eax, dword[PARAMETERS]);
add(PARAMETERS, sizeof(u32));
return eax;
}
Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
if (index == 0) {
// Register 0 is always zero
xor_(dst, dst);
} else {
mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]);
}
return dst;
}
void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
const auto SetRegister = [this](u32 reg_index, const Xbyak::Reg32& result) {
// Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
// register.
if (reg_index == 0) {
return;
}
mov(dword[STATE + offsetof(JITState, registers) + reg_index * sizeof(u32)], result);
};
const auto SetMethodAddress = [this](const Xbyak::Reg32& reg32) { mov(METHOD_ADDRESS, reg32); };
switch (operation) {
case Macro::ResultOperation::IgnoreAndFetch:
SetRegister(reg, Compile_FetchParameter());
break;
case Macro::ResultOperation::Move:
SetRegister(reg, RESULT);
break;
case Macro::ResultOperation::MoveAndSetMethod:
SetRegister(reg, RESULT);
SetMethodAddress(RESULT);
break;
case Macro::ResultOperation::FetchAndSend:
// Fetch parameter and send result.
SetRegister(reg, Compile_FetchParameter());
Compile_Send(RESULT);
break;
case Macro::ResultOperation::MoveAndSend:
// Move and send result.
SetRegister(reg, RESULT);
Compile_Send(RESULT);
break;
case Macro::ResultOperation::FetchAndSetMethod:
// Fetch parameter and use result as Method Address.
SetRegister(reg, Compile_FetchParameter());
SetMethodAddress(RESULT);
break;
case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
// Move result and use as Method Address, then fetch and send parameter.
SetRegister(reg, RESULT);
SetMethodAddress(RESULT);
Compile_Send(Compile_FetchParameter());
break;
case Macro::ResultOperation::MoveAndSetMethodSend:
// Move result and use as Method Address, then send bits 12:17 of result.
SetRegister(reg, RESULT);
SetMethodAddress(RESULT);
shr(RESULT, 12);
and_(RESULT, 0b111111);
Compile_Send(RESULT);
break;
default:
UNIMPLEMENTED_MSG("Unimplemented macro operation {}", operation);
}
}
Macro::Opcode MacroJITx64Impl::GetOpCode() const {
ASSERT(pc < code.size());
return {code[pc]};
}
std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const {
return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED;
}
} // namespace Tegra

View File

@@ -0,0 +1,98 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <bitset>
#include <xbyak.h>
#include "common/bit_field.h"
#include "common/common_types.h"
#include "common/x64/xbyak_abi.h"
#include "video_core/macro/macro.h"
namespace Tegra {
namespace Engines {
class Maxwell3D;
}
/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games
constexpr size_t MAX_CODE_SIZE = 0x10000;
class MacroJITx64 final : public MacroEngine {
public:
explicit MacroJITx64(Engines::Maxwell3D& maxwell3d_);
protected:
std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
private:
Engines::Maxwell3D& maxwell3d;
};
class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
public:
explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_);
~MacroJITx64Impl();
void Execute(const std::vector<u32>& parameters, u32 method) override;
void Compile_ALU(Macro::Opcode opcode);
void Compile_AddImmediate(Macro::Opcode opcode);
void Compile_ExtractInsert(Macro::Opcode opcode);
void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
void Compile_Read(Macro::Opcode opcode);
void Compile_Branch(Macro::Opcode opcode);
private:
void Optimizer_ScanFlags();
void Compile();
bool Compile_NextInstruction();
Xbyak::Reg32 Compile_FetchParameter();
Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
void Compile_Send(Xbyak::Reg32 value);
Macro::Opcode GetOpCode() const;
std::bitset<32> PersistentCallerSavedRegs() const;
struct JITState {
Engines::Maxwell3D* maxwell3d{};
std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
u32 carry_flag{};
};
static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
using ProgramType = void (*)(JITState*, const u32*);
struct OptimizerState {
bool can_skip_carry{};
bool has_delayed_pc{};
bool zero_reg_skip{};
bool skip_dummy_addimmediate{};
bool optimize_for_method_move{};
bool enable_asserts{};
};
OptimizerState optimizer{};
std::optional<Macro::Opcode> next_opcode{};
ProgramType program{nullptr};
std::array<Xbyak::Label, MAX_CODE_SIZE> labels;
std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip;
Xbyak::Label end_of_code{};
bool is_delay_slot{};
u32 pc{};
std::optional<u32> delayed_pc;
const std::vector<u32>& code;
Engines::Maxwell3D& maxwell3d;
};
} // namespace Tegra

358
src/video_core/memory_manager.cpp Executable file
View File

@@ -0,0 +1,358 @@
// Copyright 2018 yuzu emulator team
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/alignment.h"
#include "common/assert.h"
#include "core/core.h"
#include "core/hle/kernel/memory/page_table.h"
#include "core/hle/kernel/process.h"
#include "core/memory.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_base.h"
namespace Tegra {
MemoryManager::MemoryManager(Core::System& system_)
: system{system_}, page_table(page_table_size) {}
MemoryManager::~MemoryManager() = default;
void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
rasterizer = &rasterizer_;
}
GPUVAddr MemoryManager::UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) {
u64 remaining_size{size};
for (u64 offset{}; offset < size; offset += page_size) {
if (remaining_size < page_size) {
SetPageEntry(gpu_addr + offset, page_entry + offset, remaining_size);
} else {
SetPageEntry(gpu_addr + offset, page_entry + offset);
}
remaining_size -= page_size;
}
return gpu_addr;
}
GPUVAddr MemoryManager::Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size) {
return UpdateRange(gpu_addr, cpu_addr, size);
}
GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align) {
return Map(cpu_addr, *FindFreeRange(size, align), size);
}
GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) {
const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, 1, true);
ASSERT(gpu_addr);
return Map(cpu_addr, *gpu_addr, size);
}
void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
if (!size) {
return;
}
// Flush and invalidate through the GPU interface, to be asynchronous if possible.
const std::optional<VAddr> cpu_addr = GpuToCpuAddress(gpu_addr);
ASSERT(cpu_addr);
rasterizer->UnmapMemory(*cpu_addr, size);
UpdateRange(gpu_addr, PageEntry::State::Unmapped, size);
}
std::optional<GPUVAddr> MemoryManager::AllocateFixed(GPUVAddr gpu_addr, std::size_t size) {
for (u64 offset{}; offset < size; offset += page_size) {
if (!GetPageEntry(gpu_addr + offset).IsUnmapped()) {
return std::nullopt;
}
}
return UpdateRange(gpu_addr, PageEntry::State::Allocated, size);
}
GPUVAddr MemoryManager::Allocate(std::size_t size, std::size_t align) {
return *AllocateFixed(*FindFreeRange(size, align), size);
}
void MemoryManager::TryLockPage(PageEntry page_entry, std::size_t size) {
if (!page_entry.IsValid()) {
return;
}
ASSERT(system.CurrentProcess()
->PageTable()
.LockForDeviceAddressSpace(page_entry.ToAddress(), size)
.IsSuccess());
}
void MemoryManager::TryUnlockPage(PageEntry page_entry, std::size_t size) {
if (!page_entry.IsValid()) {
return;
}
ASSERT(system.CurrentProcess()
->PageTable()
.UnlockForDeviceAddressSpace(page_entry.ToAddress(), size)
.IsSuccess());
}
void MemoryManager::UnmapVicFrame(GPUVAddr gpu_addr, std::size_t size) {
if (!size) {
return;
}
const std::optional<VAddr> cpu_addr = GpuToCpuAddress(gpu_addr);
ASSERT(cpu_addr);
system.GPU().Renderer().Rasterizer().InvalidateExceptTextureCache(*cpu_addr, size);
cache_invalidate_queue.push_back({*cpu_addr, size});
UpdateRange(gpu_addr, PageEntry::State::Unmapped, size);
}
void MemoryManager::InvalidateQueuedCaches() {
for (const auto& entry : cache_invalidate_queue) {
rasterizer->InvalidateTextureCache(entry.first, entry.second);
}
cache_invalidate_queue.clear();
}
PageEntry MemoryManager::GetPageEntry(GPUVAddr gpu_addr) const {
return page_table[PageEntryIndex(gpu_addr)];
}
void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) {
// TODO(bunnei): We should lock/unlock device regions. This currently causes issues due to
// improper tracking, but should be fixed in the future.
//// Unlock the old page
// TryUnlockPage(page_table[PageEntryIndex(gpu_addr)], size);
//// Lock the new page
// TryLockPage(page_entry, size);
page_table[PageEntryIndex(gpu_addr)] = page_entry;
}
std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align,
bool start_32bit_address) const {
if (!align) {
align = page_size;
} else {
align = Common::AlignUp(align, page_size);
}
u64 available_size{};
GPUVAddr gpu_addr{start_32bit_address ? address_space_start_low : address_space_start};
while (gpu_addr + available_size < address_space_size) {
if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) {
available_size += page_size;
if (available_size >= size) {
return gpu_addr;
}
} else {
gpu_addr += available_size + page_size;
available_size = 0;
const auto remainder{gpu_addr % align};
if (remainder) {
gpu_addr = (gpu_addr - remainder) + align;
}
}
}
return std::nullopt;
}
std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const {
const auto page_entry{GetPageEntry(gpu_addr)};
if (!page_entry.IsValid()) {
return std::nullopt;
}
return page_entry.ToAddress() + (gpu_addr & page_mask);
}
template <typename T>
T MemoryManager::Read(GPUVAddr addr) const {
if (auto page_pointer{GetPointer(addr)}; page_pointer) {
// NOTE: Avoid adding any extra logic to this fast-path block
T value;
std::memcpy(&value, page_pointer, sizeof(T));
return value;
}
UNREACHABLE();
return {};
}
template <typename T>
void MemoryManager::Write(GPUVAddr addr, T data) {
if (auto page_pointer{GetPointer(addr)}; page_pointer) {
// NOTE: Avoid adding any extra logic to this fast-path block
std::memcpy(page_pointer, &data, sizeof(T));
return;
}
UNREACHABLE();
}
template u8 MemoryManager::Read<u8>(GPUVAddr addr) const;
template u16 MemoryManager::Read<u16>(GPUVAddr addr) const;
template u32 MemoryManager::Read<u32>(GPUVAddr addr) const;
template u64 MemoryManager::Read<u64>(GPUVAddr addr) const;
template void MemoryManager::Write<u8>(GPUVAddr addr, u8 data);
template void MemoryManager::Write<u16>(GPUVAddr addr, u16 data);
template void MemoryManager::Write<u32>(GPUVAddr addr, u32 data);
template void MemoryManager::Write<u64>(GPUVAddr addr, u64 data);
u8* MemoryManager::GetPointer(GPUVAddr gpu_addr) {
if (!GetPageEntry(gpu_addr).IsValid()) {
return {};
}
const auto address{GpuToCpuAddress(gpu_addr)};
if (!address) {
return {};
}
return system.Memory().GetPointer(*address);
}
const u8* MemoryManager::GetPointer(GPUVAddr gpu_addr) const {
if (!GetPageEntry(gpu_addr).IsValid()) {
return {};
}
const auto address{GpuToCpuAddress(gpu_addr)};
if (!address) {
return {};
}
return system.Memory().GetPointer(*address);
}
void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const {
std::size_t remaining_size{size};
std::size_t page_index{gpu_src_addr >> page_bits};
std::size_t page_offset{gpu_src_addr & page_mask};
while (remaining_size > 0) {
const std::size_t copy_amount{
std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) {
const auto src_addr{*page_addr + page_offset};
// Flush must happen on the rasterizer interface, such that memory is always synchronous
// when it is read (even when in asynchronous GPU mode). Fixes Dead Cells title menu.
rasterizer->FlushRegion(src_addr, copy_amount);
system.Memory().ReadBlockUnsafe(src_addr, dest_buffer, copy_amount);
}
page_index++;
page_offset = 0;
dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
remaining_size -= copy_amount;
}
}
void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
const std::size_t size) const {
std::size_t remaining_size{size};
std::size_t page_index{gpu_src_addr >> page_bits};
std::size_t page_offset{gpu_src_addr & page_mask};
while (remaining_size > 0) {
const std::size_t copy_amount{
std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) {
const auto src_addr{*page_addr + page_offset};
system.Memory().ReadBlockUnsafe(src_addr, dest_buffer, copy_amount);
} else {
std::memset(dest_buffer, 0, copy_amount);
}
page_index++;
page_offset = 0;
dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
remaining_size -= copy_amount;
}
}
void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size) {
std::size_t remaining_size{size};
std::size_t page_index{gpu_dest_addr >> page_bits};
std::size_t page_offset{gpu_dest_addr & page_mask};
while (remaining_size > 0) {
const std::size_t copy_amount{
std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) {
const auto dest_addr{*page_addr + page_offset};
// Invalidate must happen on the rasterizer interface, such that memory is always
// synchronous when it is written (even when in asynchronous GPU mode).
rasterizer->InvalidateRegion(dest_addr, copy_amount);
system.Memory().WriteBlockUnsafe(dest_addr, src_buffer, copy_amount);
}
page_index++;
page_offset = 0;
src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
remaining_size -= copy_amount;
}
}
void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer,
std::size_t size) {
std::size_t remaining_size{size};
std::size_t page_index{gpu_dest_addr >> page_bits};
std::size_t page_offset{gpu_dest_addr & page_mask};
while (remaining_size > 0) {
const std::size_t copy_amount{
std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) {
const auto dest_addr{*page_addr + page_offset};
system.Memory().WriteBlockUnsafe(dest_addr, src_buffer, copy_amount);
}
page_index++;
page_offset = 0;
src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
remaining_size -= copy_amount;
}
}
void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) {
std::vector<u8> tmp_buffer(size);
ReadBlock(gpu_src_addr, tmp_buffer.data(), size);
WriteBlock(gpu_dest_addr, tmp_buffer.data(), size);
}
void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
std::size_t size) {
std::vector<u8> tmp_buffer(size);
ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size);
WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size);
}
bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const {
const auto cpu_addr{GpuToCpuAddress(gpu_addr)};
if (!cpu_addr) {
return false;
}
const std::size_t page{(*cpu_addr & Core::Memory::PAGE_MASK) + size};
return page <= Core::Memory::PAGE_SIZE;
}
} // namespace Tegra

164
src/video_core/memory_manager.h Executable file
View File

@@ -0,0 +1,164 @@
// Copyright 2018 yuzu emulator team
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <map>
#include <optional>
#include <vector>
#include "common/common_types.h"
namespace VideoCore {
class RasterizerInterface;
}
namespace Core {
class System;
}
namespace Tegra {
class PageEntry final {
public:
enum class State : u32 {
Unmapped = static_cast<u32>(-1),
Allocated = static_cast<u32>(-2),
};
constexpr PageEntry() = default;
constexpr PageEntry(State state_) : state{state_} {}
constexpr PageEntry(VAddr addr) : state{static_cast<State>(addr >> ShiftBits)} {}
[[nodiscard]] constexpr bool IsUnmapped() const {
return state == State::Unmapped;
}
[[nodiscard]] constexpr bool IsAllocated() const {
return state == State::Allocated;
}
[[nodiscard]] constexpr bool IsValid() const {
return !IsUnmapped() && !IsAllocated();
}
[[nodiscard]] constexpr VAddr ToAddress() const {
if (!IsValid()) {
return {};
}
return static_cast<VAddr>(state) << ShiftBits;
}
[[nodiscard]] constexpr PageEntry operator+(u64 offset) const {
// If this is a reserved value, offsets do not apply
if (!IsValid()) {
return *this;
}
return PageEntry{(static_cast<VAddr>(state) << ShiftBits) + offset};
}
private:
static constexpr std::size_t ShiftBits{12};
State state{State::Unmapped};
};
static_assert(sizeof(PageEntry) == 4, "PageEntry is too large");
class MemoryManager final {
public:
explicit MemoryManager(Core::System& system_);
~MemoryManager();
/// Binds a renderer to the memory manager.
void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
[[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const;
template <typename T>
[[nodiscard]] T Read(GPUVAddr addr) const;
template <typename T>
void Write(GPUVAddr addr, T data);
[[nodiscard]] u8* GetPointer(GPUVAddr addr);
[[nodiscard]] const u8* GetPointer(GPUVAddr addr) const;
/**
* ReadBlock and WriteBlock are full read and write operations over virtual
* GPU Memory. It's important to use these when GPU memory may not be continuous
* in the Host Memory counterpart. Note: This functions cause Host GPU Memory
* Flushes and Invalidations, respectively to each operation.
*/
void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
/**
* ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
* WriteBlock respectively. In this versions, no flushing or invalidation is actually
* done and their performance is similar to a memcpy. This functions can be used
* on either of this 2 scenarios instead of their safe counterpart:
* - Memory which is sure to never be represented in the Host GPU.
* - Memory Managed by a Cache Manager. Example: Texture Flushing should use
* WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture
* being flushed.
*/
void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
/**
* IsGranularRange checks if a gpu region can be simply read with a pointer.
*/
[[nodiscard]] bool IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const;
[[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size);
[[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align);
[[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size);
[[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size);
[[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align);
void Unmap(GPUVAddr gpu_addr, std::size_t size);
/**
* Some Decoded NVDEC frames require that texture cache does not get invalidated.
* UnmapVicFrame defers the texture cache invalidation until the stream ends
* by invoking InvalidateQueuedCaches to invalidate all frame texture caches.
*/
void UnmapVicFrame(GPUVAddr gpu_addr, std::size_t size);
void InvalidateQueuedCaches();
private:
[[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const;
void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size);
GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size);
[[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align,
bool start_32bit_address = false) const;
void TryLockPage(PageEntry page_entry, std::size_t size);
void TryUnlockPage(PageEntry page_entry, std::size_t size);
[[nodiscard]] static constexpr std::size_t PageEntryIndex(GPUVAddr gpu_addr) {
return (gpu_addr >> page_bits) & page_table_mask;
}
static constexpr u64 address_space_size = 1ULL << 40;
static constexpr u64 address_space_start = 1ULL << 32;
static constexpr u64 address_space_start_low = 1ULL << 16;
static constexpr u64 page_bits{16};
static constexpr u64 page_size{1 << page_bits};
static constexpr u64 page_mask{page_size - 1};
static constexpr u64 page_table_bits{24};
static constexpr u64 page_table_size{1 << page_table_bits};
static constexpr u64 page_table_mask{page_table_size - 1};
Core::System& system;
VideoCore::RasterizerInterface* rasterizer = nullptr;
std::vector<PageEntry> page_table;
std::vector<std::pair<VAddr, std::size_t>> cache_invalidate_queue;
};
} // namespace Tegra

0
src/video_core/morton.cpp Executable file
View File

0
src/video_core/morton.h Executable file
View File

399
src/video_core/query_cache.h Executable file
View File

@@ -0,0 +1,399 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <algorithm>
#include <array>
#include <cstring>
#include <iterator>
#include <memory>
#include <mutex>
#include <optional>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "common/assert.h"
#include "core/core.h"
#include "core/settings.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
namespace VideoCommon {
template <class QueryCache, class HostCounter>
class CounterStreamBase {
public:
explicit CounterStreamBase(QueryCache& cache_, VideoCore::QueryType type_)
: cache{cache_}, type{type_} {}
/// Updates the state of the stream, enabling or disabling as needed.
void Update(bool enabled) {
if (enabled) {
Enable();
} else {
Disable();
}
}
/// Resets the stream to zero. It doesn't disable the query after resetting.
void Reset() {
if (current) {
current->EndQuery();
// Immediately start a new query to avoid disabling its state.
current = cache.Counter(nullptr, type);
}
last = nullptr;
}
/// Returns the current counter slicing as needed.
std::shared_ptr<HostCounter> Current() {
if (!current) {
return nullptr;
}
current->EndQuery();
last = std::move(current);
current = cache.Counter(last, type);
return last;
}
/// Returns true when the counter stream is enabled.
bool IsEnabled() const {
return current != nullptr;
}
private:
/// Enables the stream.
void Enable() {
if (current) {
return;
}
current = cache.Counter(last, type);
}
// Disables the stream.
void Disable() {
if (current) {
current->EndQuery();
}
last = std::exchange(current, nullptr);
}
QueryCache& cache;
const VideoCore::QueryType type;
std::shared_ptr<HostCounter> current;
std::shared_ptr<HostCounter> last;
};
template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter>
class QueryCacheBase {
public:
explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_,
Tegra::Engines::Maxwell3D& maxwell3d_,
Tegra::MemoryManager& gpu_memory_)
: rasterizer{rasterizer_}, maxwell3d{maxwell3d_},
gpu_memory{gpu_memory_}, streams{{CounterStream{static_cast<QueryCache&>(*this),
VideoCore::QueryType::SamplesPassed}}} {}
void InvalidateRegion(VAddr addr, std::size_t size) {
std::unique_lock lock{mutex};
FlushAndRemoveRegion(addr, size);
}
void FlushRegion(VAddr addr, std::size_t size) {
std::unique_lock lock{mutex};
FlushAndRemoveRegion(addr, size);
}
/**
* Records a query in GPU mapped memory, potentially marked with a timestamp.
* @param gpu_addr GPU address to flush to when the mapped memory is read.
* @param type Query type, e.g. SamplesPassed.
* @param timestamp Timestamp, when empty the flushed query is assumed to be short.
*/
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) {
std::unique_lock lock{mutex};
const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
ASSERT(cpu_addr);
CachedQuery* query = TryGet(*cpu_addr);
if (!query) {
ASSERT_OR_EXECUTE(cpu_addr, return;);
u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
query = Register(type, *cpu_addr, host_ptr, timestamp.has_value());
}
query->BindCounter(Stream(type).Current(), timestamp);
if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
AsyncFlushQuery(*cpu_addr);
}
}
/// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch.
void UpdateCounters() {
std::unique_lock lock{mutex};
const auto& regs = maxwell3d.regs;
Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable);
}
/// Resets a counter to zero. It doesn't disable the query after resetting.
void ResetCounter(VideoCore::QueryType type) {
std::unique_lock lock{mutex};
Stream(type).Reset();
}
/// Disable all active streams. Expected to be called at the end of a command buffer.
void DisableStreams() {
std::unique_lock lock{mutex};
for (auto& stream : streams) {
stream.Update(false);
}
}
/// Returns a new host counter.
std::shared_ptr<HostCounter> Counter(std::shared_ptr<HostCounter> dependency,
VideoCore::QueryType type) {
return std::make_shared<HostCounter>(static_cast<QueryCache&>(*this), std::move(dependency),
type);
}
/// Returns the counter stream of the specified type.
CounterStream& Stream(VideoCore::QueryType type) {
return streams[static_cast<std::size_t>(type)];
}
/// Returns the counter stream of the specified type.
const CounterStream& Stream(VideoCore::QueryType type) const {
return streams[static_cast<std::size_t>(type)];
}
void CommitAsyncFlushes() {
committed_flushes.push_back(uncommitted_flushes);
uncommitted_flushes.reset();
}
bool HasUncommittedFlushes() const {
return uncommitted_flushes != nullptr;
}
bool ShouldWaitAsyncFlushes() const {
if (committed_flushes.empty()) {
return false;
}
return committed_flushes.front() != nullptr;
}
void PopAsyncFlushes() {
if (committed_flushes.empty()) {
return;
}
auto& flush_list = committed_flushes.front();
if (!flush_list) {
committed_flushes.pop_front();
return;
}
for (VAddr query_address : *flush_list) {
FlushAndRemoveRegion(query_address, 4);
}
committed_flushes.pop_front();
}
private:
/// Flushes a memory range to guest memory and removes it from the cache.
void FlushAndRemoveRegion(VAddr addr, std::size_t size) {
const u64 addr_begin = static_cast<u64>(addr);
const u64 addr_end = addr_begin + static_cast<u64>(size);
const auto in_range = [addr_begin, addr_end](CachedQuery& query) {
const u64 cache_begin = query.GetCpuAddr();
const u64 cache_end = cache_begin + query.SizeInBytes();
return cache_begin < addr_end && addr_begin < cache_end;
};
const u64 page_end = addr_end >> PAGE_BITS;
for (u64 page = addr_begin >> PAGE_BITS; page <= page_end; ++page) {
const auto& it = cached_queries.find(page);
if (it == std::end(cached_queries)) {
continue;
}
auto& contents = it->second;
for (auto& query : contents) {
if (!in_range(query)) {
continue;
}
rasterizer.UpdatePagesCachedCount(query.GetCpuAddr(), query.SizeInBytes(), -1);
query.Flush();
}
contents.erase(std::remove_if(std::begin(contents), std::end(contents), in_range),
std::end(contents));
}
}
/// Registers the passed parameters as cached and returns a pointer to the stored cached query.
CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) {
rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1);
const u64 page = static_cast<u64>(cpu_addr) >> PAGE_BITS;
return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr,
host_ptr);
}
/// Tries to a get a cached query. Returns nullptr on failure.
CachedQuery* TryGet(VAddr addr) {
const u64 page = static_cast<u64>(addr) >> PAGE_BITS;
const auto it = cached_queries.find(page);
if (it == std::end(cached_queries)) {
return nullptr;
}
auto& contents = it->second;
const auto found = std::find_if(std::begin(contents), std::end(contents),
[addr](auto& query) { return query.GetCpuAddr() == addr; });
return found != std::end(contents) ? &*found : nullptr;
}
void AsyncFlushQuery(VAddr addr) {
if (!uncommitted_flushes) {
uncommitted_flushes = std::make_shared<std::unordered_set<VAddr>>();
}
uncommitted_flushes->insert(addr);
}
static constexpr std::uintptr_t PAGE_SIZE = 4096;
static constexpr unsigned PAGE_BITS = 12;
VideoCore::RasterizerInterface& rasterizer;
Tegra::Engines::Maxwell3D& maxwell3d;
Tegra::MemoryManager& gpu_memory;
std::recursive_mutex mutex;
std::unordered_map<u64, std::vector<CachedQuery>> cached_queries;
std::array<CounterStream, VideoCore::NumQueryTypes> streams;
std::shared_ptr<std::unordered_set<VAddr>> uncommitted_flushes{};
std::list<std::shared_ptr<std::unordered_set<VAddr>>> committed_flushes;
};
template <class QueryCache, class HostCounter>
class HostCounterBase {
public:
explicit HostCounterBase(std::shared_ptr<HostCounter> dependency_)
: dependency{std::move(dependency_)}, depth{dependency ? (dependency->Depth() + 1) : 0} {
// Avoid nesting too many dependencies to avoid a stack overflow when these are deleted.
constexpr u64 depth_threshold = 96;
if (depth > depth_threshold) {
depth = 0;
base_result = dependency->Query();
dependency = nullptr;
}
}
virtual ~HostCounterBase() = default;
/// Returns the current value of the query.
u64 Query() {
if (result) {
return *result;
}
u64 value = BlockingQuery() + base_result;
if (dependency) {
value += dependency->Query();
dependency = nullptr;
}
result = value;
return *result;
}
/// Returns true when flushing this query will potentially wait.
bool WaitPending() const noexcept {
return result.has_value();
}
u64 Depth() const noexcept {
return depth;
}
protected:
/// Returns the value of query from the backend API blocking as needed.
virtual u64 BlockingQuery() const = 0;
private:
std::shared_ptr<HostCounter> dependency; ///< Counter to add to this value.
std::optional<u64> result; ///< Filled with the already returned value.
u64 depth; ///< Number of nested dependencies.
u64 base_result = 0; ///< Equivalent to nested dependencies value.
};
template <class HostCounter>
class CachedQueryBase {
public:
explicit CachedQueryBase(VAddr cpu_addr_, u8* host_ptr_)
: cpu_addr{cpu_addr_}, host_ptr{host_ptr_} {}
virtual ~CachedQueryBase() = default;
CachedQueryBase(CachedQueryBase&&) noexcept = default;
CachedQueryBase(const CachedQueryBase&) = delete;
CachedQueryBase& operator=(CachedQueryBase&&) noexcept = default;
CachedQueryBase& operator=(const CachedQueryBase&) = delete;
/// Flushes the query to guest memory.
virtual void Flush() {
// When counter is nullptr it means that it's just been reseted. We are supposed to write a
// zero in these cases.
const u64 value = counter ? counter->Query() : 0;
std::memcpy(host_ptr, &value, sizeof(u64));
if (timestamp) {
std::memcpy(host_ptr + TIMESTAMP_OFFSET, &*timestamp, sizeof(u64));
}
}
/// Binds a counter to this query.
void BindCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) {
if (counter) {
// If there's an old counter set it means the query is being rewritten by the game.
// To avoid losing the data forever, flush here.
Flush();
}
counter = std::move(counter_);
timestamp = timestamp_;
}
VAddr GetCpuAddr() const noexcept {
return cpu_addr;
}
u64 SizeInBytes() const noexcept {
return SizeInBytes(timestamp.has_value());
}
static constexpr u64 SizeInBytes(bool with_timestamp) noexcept {
return with_timestamp ? LARGE_QUERY_SIZE : SMALL_QUERY_SIZE;
}
protected:
/// Returns true when querying the counter may potentially block.
bool WaitPending() const noexcept {
return counter && counter->WaitPending();
}
private:
static constexpr std::size_t SMALL_QUERY_SIZE = 8; // Query size without timestamp.
static constexpr std::size_t LARGE_QUERY_SIZE = 16; // Query size with timestamp.
static constexpr std::intptr_t TIMESTAMP_OFFSET = 8; // Timestamp offset in a large query.
VAddr cpu_addr; ///< Guest CPU address.
u8* host_ptr; ///< Writable host pointer.
std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree.
std::optional<u64> timestamp; ///< Timestamp to flush to guest memory.
};
} // namespace VideoCommon

View File

@@ -0,0 +1,65 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <mutex>
#include <boost/icl/interval_map.hpp>
#include <boost/range/iterator_range.hpp>
#include "common/assert.h"
#include "common/common_types.h"
#include "core/memory.h"
#include "video_core/rasterizer_accelerated.h"
namespace VideoCore {
namespace {
template <typename Map, typename Interval>
constexpr auto RangeFromInterval(Map& map, const Interval& interval) {
return boost::make_iterator_range(map.equal_range(interval));
}
} // Anonymous namespace
RasterizerAccelerated::RasterizerAccelerated(Core::Memory::Memory& cpu_memory_)
: cpu_memory{cpu_memory_} {}
RasterizerAccelerated::~RasterizerAccelerated() = default;
void RasterizerAccelerated::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
std::lock_guard lock{pages_mutex};
const u64 page_start{addr >> Core::Memory::PAGE_BITS};
const u64 page_end{(addr + size + Core::Memory::PAGE_SIZE - 1) >> Core::Memory::PAGE_BITS};
// Interval maps will erase segments if count reaches 0, so if delta is negative we have to
// subtract after iterating
const auto pages_interval = CachedPageMap::interval_type::right_open(page_start, page_end);
if (delta > 0) {
cached_pages.add({pages_interval, delta});
}
for (const auto& pair : RangeFromInterval(cached_pages, pages_interval)) {
const auto interval = pair.first & pages_interval;
const int count = pair.second;
const VAddr interval_start_addr = boost::icl::first(interval) << Core::Memory::PAGE_BITS;
const VAddr interval_end_addr = boost::icl::last_next(interval) << Core::Memory::PAGE_BITS;
const u64 interval_size = interval_end_addr - interval_start_addr;
if (delta > 0 && count == delta) {
cpu_memory.RasterizerMarkRegionCached(interval_start_addr, interval_size, true);
} else if (delta < 0 && count == -delta) {
cpu_memory.RasterizerMarkRegionCached(interval_start_addr, interval_size, false);
} else {
ASSERT(count >= 0);
}
}
if (delta < 0) {
cached_pages.add({pages_interval, delta});
}
}
} // namespace VideoCore

View File

@@ -0,0 +1,36 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <mutex>
#include <boost/icl/interval_map.hpp>
#include "common/common_types.h"
#include "video_core/rasterizer_interface.h"
namespace Core::Memory {
class Memory;
}
namespace VideoCore {
/// Implements the shared part in GPU accelerated rasterizers in RasterizerInterface.
class RasterizerAccelerated : public RasterizerInterface {
public:
explicit RasterizerAccelerated(Core::Memory::Memory& cpu_memory_);
~RasterizerAccelerated() override;
void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) override;
private:
using CachedPageMap = boost::icl::interval_map<u64, int>;
CachedPageMap cached_pages;
std::mutex pages_mutex;
Core::Memory::Memory& cpu_memory;
};
} // namespace VideoCore

View File

@@ -0,0 +1,140 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <atomic>
#include <functional>
#include <optional>
#include "common/common_types.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/gpu.h"
#include "video_core/guest_driver.h"
namespace Tegra {
class MemoryManager;
}
namespace VideoCore {
enum class QueryType {
SamplesPassed,
};
constexpr std::size_t NumQueryTypes = 1;
enum class LoadCallbackStage {
Prepare,
Build,
Complete,
};
using DiskResourceLoadCallback = std::function<void(LoadCallbackStage, std::size_t, std::size_t)>;
class RasterizerInterface {
public:
virtual ~RasterizerInterface() = default;
/// Dispatches a draw invocation
virtual void Draw(bool is_indexed, bool is_instanced) = 0;
/// Clear the current framebuffer
virtual void Clear() = 0;
/// Dispatches a compute shader invocation
virtual void DispatchCompute(GPUVAddr code_addr) = 0;
/// Resets the counter of a query
virtual void ResetCounter(QueryType type) = 0;
/// Records a GPU query and caches it
virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0;
/// Signal a GPU based semaphore as a fence
virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0;
/// Signal a GPU based syncpoint as a fence
virtual void SignalSyncPoint(u32 value) = 0;
/// Release all pending fences.
virtual void ReleaseFences() = 0;
/// Notify rasterizer that all caches should be flushed to Switch memory
virtual void FlushAll() = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
virtual void FlushRegion(VAddr addr, u64 size) = 0;
/// Notify rasterizer to flush the texture cache to Switch memory
virtual void InvalidateExceptTextureCache(VAddr addr, u64 size) = 0;
/// Notify rasterizer to invalidate the texture cache
virtual void InvalidateTextureCache(VAddr addr, u64 size) = 0;
/// Check if the the specified memory area requires flushing to CPU Memory.
virtual bool MustFlushRegion(VAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be invalidated
virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region are desync with guest
virtual void OnCPUWrite(VAddr addr, u64 size) = 0;
/// Sync memory between guest and host.
virtual void SyncGuestHost() = 0;
/// Unmap memory range
virtual void UnmapMemory(VAddr addr, u64 size) = 0;
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
/// and invalidated
virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
/// Notify the host renderer to wait for previous primitive and compute operations.
virtual void WaitForIdle() = 0;
/// Notify the host renderer to wait for reads and writes to render targets and flush caches.
virtual void FragmentBarrier() = 0;
/// Notify the host renderer to make available previous render target writes.
virtual void TiledCacheBarrier() = 0;
/// Notify the rasterizer to send all written commands to the host GPU.
virtual void FlushCommands() = 0;
/// Notify rasterizer that a frame is about to finish
virtual void TickFrame() = 0;
/// Attempt to use a faster method to perform a surface copy
[[nodiscard]] virtual bool AccelerateSurfaceCopy(
const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
return false;
}
/// Attempt to use a faster method to display the framebuffer to screen
[[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
VAddr framebuffer_addr, u32 pixel_stride) {
return false;
}
/// Increase/decrease the number of object in pages touching the specified region
virtual void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {}
/// Initialize disk cached resources for the game being emulated
virtual void LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading,
const DiskResourceLoadCallback& callback) {}
/// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver.
[[nodiscard]] GuestDriverProfile& AccessGuestDriverProfile() {
return guest_driver_profile;
}
/// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver.
[[nodiscard]] const GuestDriverProfile& AccessGuestDriverProfile() const {
return guest_driver_profile;
}
private:
GuestDriverProfile guest_driver_profile{};
};
} // namespace VideoCore

View File

@@ -0,0 +1,45 @@
// Copyright 2015 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/logging/log.h"
#include "core/frontend/emu_window.h"
#include "core/settings.h"
#include "video_core/renderer_base.h"
namespace VideoCore {
RendererBase::RendererBase(Core::Frontend::EmuWindow& window_,
std::unique_ptr<Core::Frontend::GraphicsContext> context_)
: render_window{window_}, context{std::move(context_)} {
RefreshBaseSettings();
}
RendererBase::~RendererBase() = default;
void RendererBase::RefreshBaseSettings() {
UpdateCurrentFramebufferLayout();
renderer_settings.use_framelimiter = Settings::values.use_frame_limit.GetValue();
renderer_settings.set_background_color = true;
}
void RendererBase::UpdateCurrentFramebufferLayout() {
const Layout::FramebufferLayout& layout = render_window.GetFramebufferLayout();
render_window.UpdateCurrentFramebufferLayout(layout.width, layout.height);
}
void RendererBase::RequestScreenshot(void* data, std::function<void()> callback,
const Layout::FramebufferLayout& layout) {
if (renderer_settings.screenshot_requested) {
LOG_ERROR(Render, "A screenshot is already requested or in progress, ignoring the request");
return;
}
renderer_settings.screenshot_bits = data;
renderer_settings.screenshot_complete_callback = std::move(callback);
renderer_settings.screenshot_framebuffer_layout = layout;
renderer_settings.screenshot_requested = true;
}
} // namespace VideoCore

113
src/video_core/renderer_base.h Executable file
View File

@@ -0,0 +1,113 @@
// Copyright 2014 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <atomic>
#include <memory>
#include <optional>
#include "common/common_types.h"
#include "core/frontend/emu_window.h"
#include "video_core/gpu.h"
#include "video_core/rasterizer_interface.h"
namespace Core::Frontend {
class EmuWindow;
class GraphicsContext;
} // namespace Core::Frontend
namespace VideoCore {
struct RendererSettings {
std::atomic_bool use_framelimiter{false};
std::atomic_bool set_background_color{false};
// Screenshot
std::atomic<bool> screenshot_requested{false};
void* screenshot_bits{};
std::function<void()> screenshot_complete_callback;
Layout::FramebufferLayout screenshot_framebuffer_layout;
};
class RendererBase : NonCopyable {
public:
explicit RendererBase(Core::Frontend::EmuWindow& window,
std::unique_ptr<Core::Frontend::GraphicsContext> context);
virtual ~RendererBase();
/// Initialize the renderer
[[nodiscard]] virtual bool Init() = 0;
/// Shutdown the renderer
virtual void ShutDown() = 0;
/// Finalize rendering the guest frame and draw into the presentation texture
virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
// Getter/setter functions:
// ------------------------
[[nodiscard]] f32 GetCurrentFPS() const {
return m_current_fps;
}
[[nodiscard]] int GetCurrentFrame() const {
return m_current_frame;
}
[[nodiscard]] RasterizerInterface& Rasterizer() {
return *rasterizer;
}
[[nodiscard]] const RasterizerInterface& Rasterizer() const {
return *rasterizer;
}
[[nodiscard]] Core::Frontend::GraphicsContext& Context() {
return *context;
}
[[nodiscard]] const Core::Frontend::GraphicsContext& Context() const {
return *context;
}
[[nodiscard]] Core::Frontend::EmuWindow& GetRenderWindow() {
return render_window;
}
[[nodiscard]] const Core::Frontend::EmuWindow& GetRenderWindow() const {
return render_window;
}
[[nodiscard]] RendererSettings& Settings() {
return renderer_settings;
}
[[nodiscard]] const RendererSettings& Settings() const {
return renderer_settings;
}
/// Refreshes the settings common to all renderers
void RefreshBaseSettings();
/// Request a screenshot of the next frame
void RequestScreenshot(void* data, std::function<void()> callback,
const Layout::FramebufferLayout& layout);
protected:
Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle.
std::unique_ptr<RasterizerInterface> rasterizer;
std::unique_ptr<Core::Frontend::GraphicsContext> context;
f32 m_current_fps = 0.0f; ///< Current framerate, should be set by the renderer
int m_current_frame = 0; ///< Current frame, should be set by the renderer
RendererSettings renderer_settings;
private:
/// Updates the framebuffer layout of the contained render window handle.
void UpdateCurrentFramebufferLayout();
};
} // namespace VideoCore

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,29 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <string>
#include <string_view>
#include "common/common_types.h"
namespace Tegra::Engines {
enum class ShaderType : u32;
}
namespace VideoCommon::Shader {
class ShaderIR;
class Registry;
} // namespace VideoCommon::Shader
namespace OpenGL {
class Device;
std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
const VideoCommon::Shader::Registry& registry,
Tegra::Engines::ShaderType stage, std::string_view identifier);
} // namespace OpenGL

View File

@@ -0,0 +1,99 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <memory>
#include <glad/glad.h>
#include "common/assert.h"
#include "common/microprofile.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
namespace OpenGL {
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
Buffer::Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_)
: BufferBlock{cpu_addr_, size_} {
gl_buffer.Create();
glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size_), nullptr, GL_DYNAMIC_DRAW);
if (device_.UseAssemblyShaders() || device_.HasVertexBufferUnifiedMemory()) {
glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
}
}
Buffer::~Buffer() = default;
void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) {
glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset),
static_cast<GLsizeiptr>(data_size), data);
}
void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) {
MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
const GLsizeiptr gl_size = static_cast<GLsizeiptr>(data_size);
const GLintptr gl_offset = static_cast<GLintptr>(offset);
if (read_buffer.handle == 0) {
read_buffer.Create();
glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr,
GL_STREAM_READ);
}
glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size);
glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data);
}
void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
std::size_t copy_size) {
glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(copy_size));
}
OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer_,
Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
const Device& device_, OGLStreamBuffer& stream_buffer_,
StateTracker& state_tracker)
: GenericBufferCache{rasterizer_, gpu_memory_, cpu_memory_, stream_buffer_}, device{device_} {
if (!device.HasFastBufferSubData()) {
return;
}
static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
for (const GLuint cbuf : cbufs) {
glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
}
}
OGLBufferCache::~OGLBufferCache() {
glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
}
std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
return std::make_shared<Buffer>(device, cpu_addr, size);
}
OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
return {0, 0, 0};
}
OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
std::size_t size) {
DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
const GLuint cbuf = cbufs[cbuf_cursor++];
glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
return {cbuf, 0, 0};
}
} // namespace OpenGL

View File

@@ -0,0 +1,83 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <memory>
#include "common/common_types.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
namespace Core {
class System;
}
namespace OpenGL {
class Device;
class OGLStreamBuffer;
class RasterizerOpenGL;
class StateTracker;
class Buffer : public VideoCommon::BufferBlock {
public:
explicit Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_);
~Buffer();
void Upload(std::size_t offset, std::size_t data_size, const u8* data);
void Download(std::size_t offset, std::size_t data_size, u8* data);
void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
std::size_t copy_size);
GLuint Handle() const noexcept {
return gl_buffer.handle;
}
u64 Address() const noexcept {
return gpu_address;
}
private:
OGLBuffer gl_buffer;
OGLBuffer read_buffer;
u64 gpu_address = 0;
};
using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
class OGLBufferCache final : public GenericBufferCache {
public:
explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer,
Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
const Device& device, OGLStreamBuffer& stream_buffer,
StateTracker& state_tracker);
~OGLBufferCache();
BufferInfo GetEmptyBuffer(std::size_t) override;
void Acquire() noexcept {
cbuf_cursor = 0;
}
protected:
std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
private:
static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
const Device& device;
std::size_t cbuf_cursor = 0;
std::array<GLuint, NUM_CBUFS> cbufs{};
};
} // namespace OpenGL

View File

@@ -0,0 +1,294 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <optional>
#include <span>
#include <vector>
#include <glad/glad.h>
#include "common/logging/log.h"
#include "common/scope_exit.h"
#include "core/settings.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
namespace OpenGL {
namespace {
// One uniform block is reserved for emulation purposes
constexpr u32 ReservedUniformBlocks = 1;
constexpr u32 NumStages = 5;
constexpr std::array LIMIT_UBOS = {
GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS,
};
constexpr std::array LIMIT_SSBOS = {
GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS,
};
constexpr std::array LIMIT_SAMPLERS = {
GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
GL_MAX_TEXTURE_IMAGE_UNITS,
GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS,
};
constexpr std::array LIMIT_IMAGES = {
GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS,
};
template <typename T>
T GetInteger(GLenum pname) {
GLint temporary;
glGetIntegerv(pname, &temporary);
return static_cast<T>(temporary);
}
bool TestProgram(const GLchar* glsl) {
const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &glsl)};
GLint link_status;
glGetProgramiv(shader, GL_LINK_STATUS, &link_status);
glDeleteProgram(shader);
return link_status == GL_TRUE;
}
std::vector<std::string_view> GetExtensions() {
GLint num_extensions;
glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions);
std::vector<std::string_view> extensions;
extensions.reserve(num_extensions);
for (GLint index = 0; index < num_extensions; ++index) {
extensions.push_back(
reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, static_cast<GLuint>(index))));
}
return extensions;
}
bool HasExtension(std::span<const std::string_view> extensions, std::string_view extension) {
return std::ranges::find(extensions, extension) != extensions.end();
}
u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
ASSERT(num >= amount);
if (limit) {
amount = std::min(amount, GetInteger<u32>(*limit));
}
num -= amount;
return std::exchange(base, base + amount);
}
std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
std::array<u32, Tegra::Engines::MaxShaderTypes> max;
std::ranges::transform(LIMIT_UBOS, max.begin(),
[](GLenum pname) { return GetInteger<u32>(pname); });
return max;
}
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
static constexpr std::array<std::size_t, 5> stage_swizzle{0, 1, 2, 3, 4};
const u32 total_ubos = GetInteger<u32>(GL_MAX_UNIFORM_BUFFER_BINDINGS);
const u32 total_ssbos = GetInteger<u32>(GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS);
const u32 total_samplers = GetInteger<u32>(GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS);
u32 num_ubos = total_ubos - ReservedUniformBlocks;
u32 num_ssbos = total_ssbos;
u32 num_samplers = total_samplers;
u32 base_ubo = ReservedUniformBlocks;
u32 base_ssbo = 0;
u32 base_samplers = 0;
for (std::size_t i = 0; i < NumStages; ++i) {
const std::size_t stage = stage_swizzle[i];
bindings[stage] = {
Extract(base_ubo, num_ubos, total_ubos / NumStages, LIMIT_UBOS[stage]),
Extract(base_ssbo, num_ssbos, total_ssbos / NumStages, LIMIT_SSBOS[stage]),
Extract(base_samplers, num_samplers, total_samplers / NumStages,
LIMIT_SAMPLERS[stage])};
}
u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
u32 base_images = 0;
// GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8.
// Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the
// fragment stage, and at least 1 for the rest of the stages.
// So far games are observed to use 1 image binding on vertex and 4 on fragment stages.
// Reserve at least 4 image bindings on the fragment stage.
bindings[4].image =
Extract(base_images, num_images, std::max(4U, num_images / NumStages), LIMIT_IMAGES[4]);
// This is guaranteed to be at least 1.
const u32 total_extracted_images = num_images / (NumStages - 1);
// Reserve the other image bindings.
for (std::size_t i = 0; i < NumStages; ++i) {
const std::size_t stage = stage_swizzle[i];
if (stage == 4) {
continue;
}
bindings[stage].image =
Extract(base_images, num_images, total_extracted_images, LIMIT_IMAGES[stage]);
}
// Compute doesn't care about any of this.
bindings[5] = {0, 0, 0, 0};
return bindings;
}
bool IsASTCSupported() {
static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY};
static constexpr std::array formats = {
GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR,
GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR,
GL_COMPRESSED_RGBA_ASTC_6x6_KHR, GL_COMPRESSED_RGBA_ASTC_8x5_KHR,
GL_COMPRESSED_RGBA_ASTC_8x6_KHR, GL_COMPRESSED_RGBA_ASTC_8x8_KHR,
GL_COMPRESSED_RGBA_ASTC_10x5_KHR, GL_COMPRESSED_RGBA_ASTC_10x6_KHR,
GL_COMPRESSED_RGBA_ASTC_10x8_KHR, GL_COMPRESSED_RGBA_ASTC_10x10_KHR,
GL_COMPRESSED_RGBA_ASTC_12x10_KHR, GL_COMPRESSED_RGBA_ASTC_12x12_KHR,
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR,
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR,
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR,
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR,
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR,
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR,
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR,
};
static constexpr std::array required_support = {
GL_VERTEX_TEXTURE, GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE,
GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE, GL_COMPUTE_TEXTURE,
};
for (const GLenum target : targets) {
for (const GLenum format : formats) {
for (const GLenum support : required_support) {
GLint value;
glGetInternalformativ(target, format, support, 1, &value);
if (value != GL_FULL_SUPPORT) {
return false;
}
}
}
}
return true;
}
[[nodiscard]] bool IsDebugToolAttached(std::span<const std::string_view> extensions) {
const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
return nsight || HasExtension(extensions, "GL_EXT_debug_tool");
}
} // Anonymous namespace
Device::Device()
: max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
const std::vector extensions = GetExtensions();
const bool is_nvidia = vendor == "NVIDIA Corporation";
const bool is_amd = vendor == "ATI Technologies Inc.";
bool disable_fast_buffer_sub_data = false;
if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
LOG_WARNING(
Render_OpenGL,
"Beta driver 443.24 is known to have issues. There might be performance issues.");
disable_fast_buffer_sub_data = true;
}
uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
max_compute_shared_memory_size = GetInteger<u32>(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE);
has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group &&
GLAD_GL_NV_shader_thread_shuffle;
has_shader_ballot = GLAD_GL_ARB_shader_ballot;
has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted");
has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod");
has_astc = IsASTCSupported();
has_variable_aoffi = TestVariableAoffi();
has_component_indexing_bug = is_amd;
has_precise_bug = TestPreciseBug();
has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
has_debugging_tool_attached = IsDebugToolAttached(extensions);
// At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
// uniform buffers as "push constants"
has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() &&
GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 &&
GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2;
use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue();
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) {
LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
}
}
Device::Device(std::nullptr_t) {
max_uniform_buffers.fill(std::numeric_limits<u32>::max());
uniform_buffer_alignment = 4;
shader_storage_alignment = 4;
max_vertex_attributes = 16;
max_varyings = 15;
max_compute_shared_memory_size = 0x10000;
has_warp_intrinsics = true;
has_shader_ballot = true;
has_vertex_viewport_layer = true;
has_image_load_formatted = true;
has_texture_shadow_lod = true;
has_variable_aoffi = true;
}
bool Device::TestVariableAoffi() {
return TestProgram(R"(#version 430 core
// This is a unit test, please ignore me on apitrace bug reports.
uniform sampler2D tex;
uniform ivec2 variable_offset;
out vec4 output_attribute;
void main() {
output_attribute = textureOffset(tex, vec2(0), variable_offset);
})");
}
bool Device::TestPreciseBug() {
return !TestProgram(R"(#version 430 core
in vec3 coords;
out float out_value;
uniform sampler2DShadow tex;
void main() {
precise float tmp_value = vec4(texture(tex, coords)).x;
out_value = tmp_value;
})");
}
} // namespace OpenGL

View File

@@ -0,0 +1,147 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <cstddef>
#include "common/common_types.h"
#include "video_core/engines/shader_type.h"
namespace OpenGL {
static constexpr u32 EmulationUniformBlockBinding = 0;
class Device final {
public:
struct BaseBindings final {
u32 uniform_buffer{};
u32 shader_storage_buffer{};
u32 sampler{};
u32 image{};
};
explicit Device();
explicit Device(std::nullptr_t);
u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
}
const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
return base_bindings[stage_index];
}
const BaseBindings& GetBaseBindings(Tegra::Engines::ShaderType shader_type) const noexcept {
return GetBaseBindings(static_cast<std::size_t>(shader_type));
}
size_t GetUniformBufferAlignment() const {
return uniform_buffer_alignment;
}
size_t GetShaderStorageBufferAlignment() const {
return shader_storage_alignment;
}
u32 GetMaxVertexAttributes() const {
return max_vertex_attributes;
}
u32 GetMaxVaryings() const {
return max_varyings;
}
u32 GetMaxComputeSharedMemorySize() const {
return max_compute_shared_memory_size;
}
bool HasWarpIntrinsics() const {
return has_warp_intrinsics;
}
bool HasShaderBallot() const {
return has_shader_ballot;
}
bool HasVertexViewportLayer() const {
return has_vertex_viewport_layer;
}
bool HasImageLoadFormatted() const {
return has_image_load_formatted;
}
bool HasTextureShadowLod() const {
return has_texture_shadow_lod;
}
bool HasVertexBufferUnifiedMemory() const {
return has_vertex_buffer_unified_memory;
}
bool HasASTC() const {
return has_astc;
}
bool HasVariableAoffi() const {
return has_variable_aoffi;
}
bool HasComponentIndexingBug() const {
return has_component_indexing_bug;
}
bool HasPreciseBug() const {
return has_precise_bug;
}
bool HasFastBufferSubData() const {
return has_fast_buffer_sub_data;
}
bool HasNvViewportArray2() const {
return has_nv_viewport_array2;
}
bool HasDebuggingToolAttached() const {
return has_debugging_tool_attached;
}
bool UseAssemblyShaders() const {
return use_assembly_shaders;
}
bool UseAsynchronousShaders() const {
return use_asynchronous_shaders;
}
private:
static bool TestVariableAoffi();
static bool TestPreciseBug();
std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
size_t uniform_buffer_alignment{};
size_t shader_storage_alignment{};
u32 max_vertex_attributes{};
u32 max_varyings{};
u32 max_compute_shared_memory_size{};
bool has_warp_intrinsics{};
bool has_shader_ballot{};
bool has_vertex_viewport_layer{};
bool has_image_load_formatted{};
bool has_texture_shadow_lod{};
bool has_vertex_buffer_unified_memory{};
bool has_astc{};
bool has_variable_aoffi{};
bool has_component_indexing_bug{};
bool has_precise_bug{};
bool has_fast_buffer_sub_data{};
bool has_nv_viewport_array2{};
bool has_debugging_tool_attached{};
bool use_assembly_shaders{};
bool use_asynchronous_shaders{};
};
} // namespace OpenGL

View File

@@ -0,0 +1,73 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/assert.h"
#include <glad/glad.h>
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_fence_manager.h"
namespace OpenGL {
GLInnerFence::GLInnerFence(u32 payload_, bool is_stubbed_) : FenceBase{payload_, is_stubbed_} {}
GLInnerFence::GLInnerFence(GPUVAddr address_, u32 payload_, bool is_stubbed_)
: FenceBase{address_, payload_, is_stubbed_} {}
GLInnerFence::~GLInnerFence() = default;
void GLInnerFence::Queue() {
if (is_stubbed) {
return;
}
ASSERT(sync_object.handle == 0);
sync_object.Create();
}
bool GLInnerFence::IsSignaled() const {
if (is_stubbed) {
return true;
}
ASSERT(sync_object.handle != 0);
GLsizei length;
GLint sync_status;
glGetSynciv(sync_object.handle, GL_SYNC_STATUS, sizeof(GLint), &length, &sync_status);
return sync_status == GL_SIGNALED;
}
void GLInnerFence::Wait() {
if (is_stubbed) {
return;
}
ASSERT(sync_object.handle != 0);
glClientWaitSync(sync_object.handle, 0, GL_TIMEOUT_IGNORED);
}
FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_,
Tegra::GPU& gpu_, TextureCache& texture_cache_,
OGLBufferCache& buffer_cache_, QueryCache& query_cache_)
: GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {}
Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) {
return std::make_shared<GLInnerFence>(value, is_stubbed);
}
Fence FenceManagerOpenGL::CreateFence(GPUVAddr addr, u32 value, bool is_stubbed) {
return std::make_shared<GLInnerFence>(addr, value, is_stubbed);
}
void FenceManagerOpenGL::QueueFence(Fence& fence) {
fence->Queue();
}
bool FenceManagerOpenGL::IsFenceSignaled(Fence& fence) const {
return fence->IsSignaled();
}
void FenceManagerOpenGL::WaitFence(Fence& fence) {
fence->Wait();
}
} // namespace OpenGL

Some files were not shown because too many files have changed in this diff Show More