early-access version 1866

This commit is contained in:
pineappleEA
2021-07-09 23:54:15 +02:00
parent 335eeff822
commit 7d21887d40
469 changed files with 201995 additions and 78488 deletions

View File

@@ -0,0 +1,928 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/alignment.h"
#include "shader_recompiler/environment.h"
#include "shader_recompiler/frontend/ir/modifiers.h"
#include "shader_recompiler/frontend/ir/program.h"
#include "shader_recompiler/frontend/ir/value.h"
#include "shader_recompiler/ir_opt/passes.h"
#include "shader_recompiler/shader_info.h"
namespace Shader::Optimization {
namespace {
void AddConstantBufferDescriptor(Info& info, u32 index, u32 count) {
if (count != 1) {
throw NotImplementedException("Constant buffer descriptor indexing");
}
if ((info.constant_buffer_mask & (1U << index)) != 0) {
return;
}
info.constant_buffer_mask |= 1U << index;
auto& cbufs{info.constant_buffer_descriptors};
cbufs.insert(std::ranges::lower_bound(cbufs, index, {}, &ConstantBufferDescriptor::index),
ConstantBufferDescriptor{
.index = index,
.count = 1,
});
}
void GetPatch(Info& info, IR::Patch patch) {
if (!IR::IsGeneric(patch)) {
throw NotImplementedException("Reading non-generic patch {}", patch);
}
info.uses_patches.at(IR::GenericPatchIndex(patch)) = true;
}
void SetPatch(Info& info, IR::Patch patch) {
if (IR::IsGeneric(patch)) {
info.uses_patches.at(IR::GenericPatchIndex(patch)) = true;
return;
}
switch (patch) {
case IR::Patch::TessellationLodLeft:
case IR::Patch::TessellationLodTop:
case IR::Patch::TessellationLodRight:
case IR::Patch::TessellationLodBottom:
info.stores_tess_level_outer = true;
break;
case IR::Patch::TessellationLodInteriorU:
case IR::Patch::TessellationLodInteriorV:
info.stores_tess_level_inner = true;
break;
default:
throw NotImplementedException("Set patch {}", patch);
}
}
void CheckCBufNVN(Info& info, IR::Inst& inst) {
const IR::Value cbuf_index{inst.Arg(0)};
if (!cbuf_index.IsImmediate()) {
info.nvn_buffer_used.set();
return;
}
const u32 index{cbuf_index.U32()};
if (index != 0) {
return;
}
const IR::Value cbuf_offset{inst.Arg(1)};
if (!cbuf_offset.IsImmediate()) {
info.nvn_buffer_used.set();
return;
}
const u32 offset{cbuf_offset.U32()};
const u32 descriptor_size{0x10};
const u32 upper_limit{info.nvn_buffer_base + descriptor_size * 16};
if (offset >= info.nvn_buffer_base && offset < upper_limit) {
const std::size_t nvn_index{(offset - info.nvn_buffer_base) / descriptor_size};
info.nvn_buffer_used.set(nvn_index, true);
}
}
void VisitUsages(Info& info, IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::CompositeConstructF16x2:
case IR::Opcode::CompositeConstructF16x3:
case IR::Opcode::CompositeConstructF16x4:
case IR::Opcode::CompositeExtractF16x2:
case IR::Opcode::CompositeExtractF16x3:
case IR::Opcode::CompositeExtractF16x4:
case IR::Opcode::CompositeInsertF16x2:
case IR::Opcode::CompositeInsertF16x3:
case IR::Opcode::CompositeInsertF16x4:
case IR::Opcode::SelectF16:
case IR::Opcode::BitCastU16F16:
case IR::Opcode::BitCastF16U16:
case IR::Opcode::PackFloat2x16:
case IR::Opcode::UnpackFloat2x16:
case IR::Opcode::ConvertS16F16:
case IR::Opcode::ConvertS32F16:
case IR::Opcode::ConvertS64F16:
case IR::Opcode::ConvertU16F16:
case IR::Opcode::ConvertU32F16:
case IR::Opcode::ConvertU64F16:
case IR::Opcode::ConvertF16S8:
case IR::Opcode::ConvertF16S16:
case IR::Opcode::ConvertF16S32:
case IR::Opcode::ConvertF16S64:
case IR::Opcode::ConvertF16U8:
case IR::Opcode::ConvertF16U16:
case IR::Opcode::ConvertF16U32:
case IR::Opcode::ConvertF16U64:
case IR::Opcode::FPAbs16:
case IR::Opcode::FPAdd16:
case IR::Opcode::FPCeil16:
case IR::Opcode::FPFloor16:
case IR::Opcode::FPFma16:
case IR::Opcode::FPMul16:
case IR::Opcode::FPNeg16:
case IR::Opcode::FPRoundEven16:
case IR::Opcode::FPSaturate16:
case IR::Opcode::FPClamp16:
case IR::Opcode::FPTrunc16:
case IR::Opcode::FPOrdEqual16:
case IR::Opcode::FPUnordEqual16:
case IR::Opcode::FPOrdNotEqual16:
case IR::Opcode::FPUnordNotEqual16:
case IR::Opcode::FPOrdLessThan16:
case IR::Opcode::FPUnordLessThan16:
case IR::Opcode::FPOrdGreaterThan16:
case IR::Opcode::FPUnordGreaterThan16:
case IR::Opcode::FPOrdLessThanEqual16:
case IR::Opcode::FPUnordLessThanEqual16:
case IR::Opcode::FPOrdGreaterThanEqual16:
case IR::Opcode::FPUnordGreaterThanEqual16:
case IR::Opcode::FPIsNan16:
case IR::Opcode::GlobalAtomicAddF16x2:
case IR::Opcode::GlobalAtomicMinF16x2:
case IR::Opcode::GlobalAtomicMaxF16x2:
case IR::Opcode::StorageAtomicAddF16x2:
case IR::Opcode::StorageAtomicMinF16x2:
case IR::Opcode::StorageAtomicMaxF16x2:
info.uses_fp16 = true;
break;
case IR::Opcode::CompositeConstructF64x2:
case IR::Opcode::CompositeConstructF64x3:
case IR::Opcode::CompositeConstructF64x4:
case IR::Opcode::CompositeExtractF64x2:
case IR::Opcode::CompositeExtractF64x3:
case IR::Opcode::CompositeExtractF64x4:
case IR::Opcode::CompositeInsertF64x2:
case IR::Opcode::CompositeInsertF64x3:
case IR::Opcode::CompositeInsertF64x4:
case IR::Opcode::SelectF64:
case IR::Opcode::BitCastU64F64:
case IR::Opcode::BitCastF64U64:
case IR::Opcode::PackDouble2x32:
case IR::Opcode::UnpackDouble2x32:
case IR::Opcode::FPAbs64:
case IR::Opcode::FPAdd64:
case IR::Opcode::FPCeil64:
case IR::Opcode::FPFloor64:
case IR::Opcode::FPFma64:
case IR::Opcode::FPMax64:
case IR::Opcode::FPMin64:
case IR::Opcode::FPMul64:
case IR::Opcode::FPNeg64:
case IR::Opcode::FPRecip64:
case IR::Opcode::FPRecipSqrt64:
case IR::Opcode::FPRoundEven64:
case IR::Opcode::FPSaturate64:
case IR::Opcode::FPClamp64:
case IR::Opcode::FPTrunc64:
case IR::Opcode::FPOrdEqual64:
case IR::Opcode::FPUnordEqual64:
case IR::Opcode::FPOrdNotEqual64:
case IR::Opcode::FPUnordNotEqual64:
case IR::Opcode::FPOrdLessThan64:
case IR::Opcode::FPUnordLessThan64:
case IR::Opcode::FPOrdGreaterThan64:
case IR::Opcode::FPUnordGreaterThan64:
case IR::Opcode::FPOrdLessThanEqual64:
case IR::Opcode::FPUnordLessThanEqual64:
case IR::Opcode::FPOrdGreaterThanEqual64:
case IR::Opcode::FPUnordGreaterThanEqual64:
case IR::Opcode::FPIsNan64:
case IR::Opcode::ConvertS16F64:
case IR::Opcode::ConvertS32F64:
case IR::Opcode::ConvertS64F64:
case IR::Opcode::ConvertU16F64:
case IR::Opcode::ConvertU32F64:
case IR::Opcode::ConvertU64F64:
case IR::Opcode::ConvertF32F64:
case IR::Opcode::ConvertF64F32:
case IR::Opcode::ConvertF64S8:
case IR::Opcode::ConvertF64S16:
case IR::Opcode::ConvertF64S32:
case IR::Opcode::ConvertF64S64:
case IR::Opcode::ConvertF64U8:
case IR::Opcode::ConvertF64U16:
case IR::Opcode::ConvertF64U32:
case IR::Opcode::ConvertF64U64:
info.uses_fp64 = true;
break;
default:
break;
}
switch (inst.GetOpcode()) {
case IR::Opcode::GetCbufU8:
case IR::Opcode::GetCbufS8:
case IR::Opcode::UndefU8:
case IR::Opcode::LoadGlobalU8:
case IR::Opcode::LoadGlobalS8:
case IR::Opcode::WriteGlobalU8:
case IR::Opcode::WriteGlobalS8:
case IR::Opcode::LoadStorageU8:
case IR::Opcode::LoadStorageS8:
case IR::Opcode::WriteStorageU8:
case IR::Opcode::WriteStorageS8:
case IR::Opcode::LoadSharedU8:
case IR::Opcode::LoadSharedS8:
case IR::Opcode::WriteSharedU8:
case IR::Opcode::SelectU8:
case IR::Opcode::ConvertF16S8:
case IR::Opcode::ConvertF16U8:
case IR::Opcode::ConvertF32S8:
case IR::Opcode::ConvertF32U8:
case IR::Opcode::ConvertF64S8:
case IR::Opcode::ConvertF64U8:
info.uses_int8 = true;
break;
default:
break;
}
switch (inst.GetOpcode()) {
case IR::Opcode::GetCbufU16:
case IR::Opcode::GetCbufS16:
case IR::Opcode::UndefU16:
case IR::Opcode::LoadGlobalU16:
case IR::Opcode::LoadGlobalS16:
case IR::Opcode::WriteGlobalU16:
case IR::Opcode::WriteGlobalS16:
case IR::Opcode::LoadStorageU16:
case IR::Opcode::LoadStorageS16:
case IR::Opcode::WriteStorageU16:
case IR::Opcode::WriteStorageS16:
case IR::Opcode::LoadSharedU16:
case IR::Opcode::LoadSharedS16:
case IR::Opcode::WriteSharedU16:
case IR::Opcode::SelectU16:
case IR::Opcode::BitCastU16F16:
case IR::Opcode::BitCastF16U16:
case IR::Opcode::ConvertS16F16:
case IR::Opcode::ConvertS16F32:
case IR::Opcode::ConvertS16F64:
case IR::Opcode::ConvertU16F16:
case IR::Opcode::ConvertU16F32:
case IR::Opcode::ConvertU16F64:
case IR::Opcode::ConvertF16S16:
case IR::Opcode::ConvertF16U16:
case IR::Opcode::ConvertF32S16:
case IR::Opcode::ConvertF32U16:
case IR::Opcode::ConvertF64S16:
case IR::Opcode::ConvertF64U16:
info.uses_int16 = true;
break;
default:
break;
}
switch (inst.GetOpcode()) {
case IR::Opcode::UndefU64:
case IR::Opcode::LoadGlobalU8:
case IR::Opcode::LoadGlobalS8:
case IR::Opcode::LoadGlobalU16:
case IR::Opcode::LoadGlobalS16:
case IR::Opcode::LoadGlobal32:
case IR::Opcode::LoadGlobal64:
case IR::Opcode::LoadGlobal128:
case IR::Opcode::WriteGlobalU8:
case IR::Opcode::WriteGlobalS8:
case IR::Opcode::WriteGlobalU16:
case IR::Opcode::WriteGlobalS16:
case IR::Opcode::WriteGlobal32:
case IR::Opcode::WriteGlobal64:
case IR::Opcode::WriteGlobal128:
case IR::Opcode::SelectU64:
case IR::Opcode::BitCastU64F64:
case IR::Opcode::BitCastF64U64:
case IR::Opcode::PackUint2x32:
case IR::Opcode::UnpackUint2x32:
case IR::Opcode::IAdd64:
case IR::Opcode::ISub64:
case IR::Opcode::INeg64:
case IR::Opcode::ShiftLeftLogical64:
case IR::Opcode::ShiftRightLogical64:
case IR::Opcode::ShiftRightArithmetic64:
case IR::Opcode::ConvertS64F16:
case IR::Opcode::ConvertS64F32:
case IR::Opcode::ConvertS64F64:
case IR::Opcode::ConvertU64F16:
case IR::Opcode::ConvertU64F32:
case IR::Opcode::ConvertU64F64:
case IR::Opcode::ConvertU64U32:
case IR::Opcode::ConvertU32U64:
case IR::Opcode::ConvertF16U64:
case IR::Opcode::ConvertF32U64:
case IR::Opcode::ConvertF64U64:
case IR::Opcode::SharedAtomicExchange64:
case IR::Opcode::GlobalAtomicIAdd64:
case IR::Opcode::GlobalAtomicSMin64:
case IR::Opcode::GlobalAtomicUMin64:
case IR::Opcode::GlobalAtomicSMax64:
case IR::Opcode::GlobalAtomicUMax64:
case IR::Opcode::GlobalAtomicAnd64:
case IR::Opcode::GlobalAtomicOr64:
case IR::Opcode::GlobalAtomicXor64:
case IR::Opcode::GlobalAtomicExchange64:
case IR::Opcode::StorageAtomicIAdd64:
case IR::Opcode::StorageAtomicSMin64:
case IR::Opcode::StorageAtomicUMin64:
case IR::Opcode::StorageAtomicSMax64:
case IR::Opcode::StorageAtomicUMax64:
case IR::Opcode::StorageAtomicAnd64:
case IR::Opcode::StorageAtomicOr64:
case IR::Opcode::StorageAtomicXor64:
case IR::Opcode::StorageAtomicExchange64:
info.uses_int64 = true;
break;
default:
break;
}
switch (inst.GetOpcode()) {
case IR::Opcode::WriteGlobalU8:
case IR::Opcode::WriteGlobalS8:
case IR::Opcode::WriteGlobalU16:
case IR::Opcode::WriteGlobalS16:
case IR::Opcode::WriteGlobal32:
case IR::Opcode::WriteGlobal64:
case IR::Opcode::WriteGlobal128:
case IR::Opcode::GlobalAtomicIAdd32:
case IR::Opcode::GlobalAtomicSMin32:
case IR::Opcode::GlobalAtomicUMin32:
case IR::Opcode::GlobalAtomicSMax32:
case IR::Opcode::GlobalAtomicUMax32:
case IR::Opcode::GlobalAtomicInc32:
case IR::Opcode::GlobalAtomicDec32:
case IR::Opcode::GlobalAtomicAnd32:
case IR::Opcode::GlobalAtomicOr32:
case IR::Opcode::GlobalAtomicXor32:
case IR::Opcode::GlobalAtomicExchange32:
case IR::Opcode::GlobalAtomicIAdd64:
case IR::Opcode::GlobalAtomicSMin64:
case IR::Opcode::GlobalAtomicUMin64:
case IR::Opcode::GlobalAtomicSMax64:
case IR::Opcode::GlobalAtomicUMax64:
case IR::Opcode::GlobalAtomicAnd64:
case IR::Opcode::GlobalAtomicOr64:
case IR::Opcode::GlobalAtomicXor64:
case IR::Opcode::GlobalAtomicExchange64:
case IR::Opcode::GlobalAtomicAddF32:
case IR::Opcode::GlobalAtomicAddF16x2:
case IR::Opcode::GlobalAtomicAddF32x2:
case IR::Opcode::GlobalAtomicMinF16x2:
case IR::Opcode::GlobalAtomicMinF32x2:
case IR::Opcode::GlobalAtomicMaxF16x2:
case IR::Opcode::GlobalAtomicMaxF32x2:
info.stores_global_memory = true;
[[fallthrough]];
case IR::Opcode::LoadGlobalU8:
case IR::Opcode::LoadGlobalS8:
case IR::Opcode::LoadGlobalU16:
case IR::Opcode::LoadGlobalS16:
case IR::Opcode::LoadGlobal32:
case IR::Opcode::LoadGlobal64:
case IR::Opcode::LoadGlobal128:
info.uses_int64 = true;
info.uses_global_memory = true;
info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2;
info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4;
break;
default:
break;
}
switch (inst.GetOpcode()) {
case IR::Opcode::DemoteToHelperInvocation:
info.uses_demote_to_helper_invocation = true;
break;
case IR::Opcode::GetAttribute:
info.loads.mask[static_cast<size_t>(inst.Arg(0).Attribute())] = true;
break;
case IR::Opcode::SetAttribute:
info.stores.mask[static_cast<size_t>(inst.Arg(0).Attribute())] = true;
break;
case IR::Opcode::GetPatch:
GetPatch(info, inst.Arg(0).Patch());
break;
case IR::Opcode::SetPatch:
SetPatch(info, inst.Arg(0).Patch());
break;
case IR::Opcode::GetAttributeIndexed:
info.loads_indexed_attributes = true;
break;
case IR::Opcode::SetAttributeIndexed:
info.stores_indexed_attributes = true;
break;
case IR::Opcode::SetFragColor:
info.stores_frag_color[inst.Arg(0).U32()] = true;
break;
case IR::Opcode::SetSampleMask:
info.stores_sample_mask = true;
break;
case IR::Opcode::SetFragDepth:
info.stores_frag_depth = true;
break;
case IR::Opcode::WorkgroupId:
info.uses_workgroup_id = true;
break;
case IR::Opcode::LocalInvocationId:
info.uses_local_invocation_id = true;
break;
case IR::Opcode::InvocationId:
info.uses_invocation_id = true;
break;
case IR::Opcode::SampleId:
info.uses_sample_id = true;
break;
case IR::Opcode::IsHelperInvocation:
info.uses_is_helper_invocation = true;
break;
case IR::Opcode::LaneId:
info.uses_subgroup_invocation_id = true;
break;
case IR::Opcode::ShuffleIndex:
case IR::Opcode::ShuffleUp:
case IR::Opcode::ShuffleDown:
case IR::Opcode::ShuffleButterfly:
info.uses_subgroup_shuffles = true;
break;
case IR::Opcode::GetCbufU8:
case IR::Opcode::GetCbufS8:
case IR::Opcode::GetCbufU16:
case IR::Opcode::GetCbufS16:
case IR::Opcode::GetCbufU32:
case IR::Opcode::GetCbufF32:
case IR::Opcode::GetCbufU32x2: {
const IR::Value index{inst.Arg(0)};
const IR::Value offset{inst.Arg(1)};
if (!index.IsImmediate()) {
throw NotImplementedException("Constant buffer with non-immediate index");
}
AddConstantBufferDescriptor(info, index.U32(), 1);
u32 element_size{};
switch (inst.GetOpcode()) {
case IR::Opcode::GetCbufU8:
case IR::Opcode::GetCbufS8:
info.used_constant_buffer_types |= IR::Type::U8;
element_size = 1;
break;
case IR::Opcode::GetCbufU16:
case IR::Opcode::GetCbufS16:
info.used_constant_buffer_types |= IR::Type::U16;
element_size = 2;
break;
case IR::Opcode::GetCbufU32:
info.used_constant_buffer_types |= IR::Type::U32;
element_size = 4;
break;
case IR::Opcode::GetCbufF32:
info.used_constant_buffer_types |= IR::Type::F32;
element_size = 4;
break;
case IR::Opcode::GetCbufU32x2:
info.used_constant_buffer_types |= IR::Type::U32x2;
element_size = 8;
break;
default:
break;
}
u32& size{info.constant_buffer_used_sizes[index.U32()]};
if (offset.IsImmediate()) {
size = Common::AlignUp(std::max(size, offset.U32() + element_size), 16u);
} else {
size = 0x10'000;
}
break;
}
case IR::Opcode::BindlessImageSampleImplicitLod:
case IR::Opcode::BindlessImageSampleExplicitLod:
case IR::Opcode::BindlessImageSampleDrefImplicitLod:
case IR::Opcode::BindlessImageSampleDrefExplicitLod:
case IR::Opcode::BindlessImageGather:
case IR::Opcode::BindlessImageGatherDref:
case IR::Opcode::BindlessImageFetch:
case IR::Opcode::BindlessImageQueryDimensions:
case IR::Opcode::BindlessImageQueryLod:
case IR::Opcode::BindlessImageGradient:
case IR::Opcode::BoundImageSampleImplicitLod:
case IR::Opcode::BoundImageSampleExplicitLod:
case IR::Opcode::BoundImageSampleDrefImplicitLod:
case IR::Opcode::BoundImageSampleDrefExplicitLod:
case IR::Opcode::BoundImageGather:
case IR::Opcode::BoundImageGatherDref:
case IR::Opcode::BoundImageFetch:
case IR::Opcode::BoundImageQueryDimensions:
case IR::Opcode::BoundImageQueryLod:
case IR::Opcode::BoundImageGradient:
case IR::Opcode::ImageGather:
case IR::Opcode::ImageGatherDref:
case IR::Opcode::ImageFetch:
case IR::Opcode::ImageQueryDimensions:
case IR::Opcode::ImageGradient: {
const TextureType type{inst.Flags<IR::TextureInstInfo>().type};
info.uses_sampled_1d |= type == TextureType::Color1D || type == TextureType::ColorArray1D;
info.uses_sparse_residency |=
inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr;
break;
}
case IR::Opcode::ImageSampleImplicitLod:
case IR::Opcode::ImageSampleExplicitLod:
case IR::Opcode::ImageSampleDrefImplicitLod:
case IR::Opcode::ImageSampleDrefExplicitLod:
case IR::Opcode::ImageQueryLod: {
const auto flags{inst.Flags<IR::TextureInstInfo>()};
const TextureType type{flags.type};
info.uses_sampled_1d |= type == TextureType::Color1D || type == TextureType::ColorArray1D;
info.uses_shadow_lod |= flags.is_depth != 0;
info.uses_sparse_residency |=
inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr;
break;
}
case IR::Opcode::ImageRead: {
const auto flags{inst.Flags<IR::TextureInstInfo>()};
info.uses_typeless_image_reads |= flags.image_format == ImageFormat::Typeless;
info.uses_sparse_residency |=
inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr;
break;
}
case IR::Opcode::ImageWrite: {
const auto flags{inst.Flags<IR::TextureInstInfo>()};
info.uses_typeless_image_writes |= flags.image_format == ImageFormat::Typeless;
info.uses_image_buffers |= flags.type == TextureType::Buffer;
break;
}
case IR::Opcode::SubgroupEqMask:
case IR::Opcode::SubgroupLtMask:
case IR::Opcode::SubgroupLeMask:
case IR::Opcode::SubgroupGtMask:
case IR::Opcode::SubgroupGeMask:
info.uses_subgroup_mask = true;
break;
case IR::Opcode::VoteAll:
case IR::Opcode::VoteAny:
case IR::Opcode::VoteEqual:
case IR::Opcode::SubgroupBallot:
info.uses_subgroup_vote = true;
break;
case IR::Opcode::FSwizzleAdd:
info.uses_fswzadd = true;
break;
case IR::Opcode::DPdxFine:
case IR::Opcode::DPdyFine:
case IR::Opcode::DPdxCoarse:
case IR::Opcode::DPdyCoarse:
info.uses_derivatives = true;
break;
case IR::Opcode::LoadStorageU8:
case IR::Opcode::LoadStorageS8:
case IR::Opcode::WriteStorageU8:
case IR::Opcode::WriteStorageS8:
info.used_storage_buffer_types |= IR::Type::U8;
break;
case IR::Opcode::LoadStorageU16:
case IR::Opcode::LoadStorageS16:
case IR::Opcode::WriteStorageU16:
case IR::Opcode::WriteStorageS16:
info.used_storage_buffer_types |= IR::Type::U16;
break;
case IR::Opcode::LoadStorage32:
case IR::Opcode::WriteStorage32:
case IR::Opcode::StorageAtomicIAdd32:
case IR::Opcode::StorageAtomicUMin32:
case IR::Opcode::StorageAtomicUMax32:
case IR::Opcode::StorageAtomicAnd32:
case IR::Opcode::StorageAtomicOr32:
case IR::Opcode::StorageAtomicXor32:
case IR::Opcode::StorageAtomicExchange32:
info.used_storage_buffer_types |= IR::Type::U32;
break;
case IR::Opcode::LoadStorage64:
case IR::Opcode::WriteStorage64:
info.used_storage_buffer_types |= IR::Type::U32x2;
break;
case IR::Opcode::LoadStorage128:
case IR::Opcode::WriteStorage128:
info.used_storage_buffer_types |= IR::Type::U32x4;
break;
case IR::Opcode::SharedAtomicSMin32:
info.uses_atomic_s32_min = true;
break;
case IR::Opcode::SharedAtomicSMax32:
info.uses_atomic_s32_max = true;
break;
case IR::Opcode::SharedAtomicInc32:
info.uses_shared_increment = true;
break;
case IR::Opcode::SharedAtomicDec32:
info.uses_shared_decrement = true;
break;
case IR::Opcode::SharedAtomicExchange64:
info.uses_int64_bit_atomics = true;
break;
case IR::Opcode::GlobalAtomicInc32:
case IR::Opcode::StorageAtomicInc32:
info.used_storage_buffer_types |= IR::Type::U32;
info.uses_global_increment = true;
break;
case IR::Opcode::GlobalAtomicDec32:
case IR::Opcode::StorageAtomicDec32:
info.used_storage_buffer_types |= IR::Type::U32;
info.uses_global_decrement = true;
break;
case IR::Opcode::GlobalAtomicAddF32:
case IR::Opcode::StorageAtomicAddF32:
info.used_storage_buffer_types |= IR::Type::U32;
info.uses_atomic_f32_add = true;
break;
case IR::Opcode::GlobalAtomicAddF16x2:
case IR::Opcode::StorageAtomicAddF16x2:
info.used_storage_buffer_types |= IR::Type::U32;
info.uses_atomic_f16x2_add = true;
break;
case IR::Opcode::GlobalAtomicAddF32x2:
case IR::Opcode::StorageAtomicAddF32x2:
info.used_storage_buffer_types |= IR::Type::U32;
info.uses_atomic_f32x2_add = true;
break;
case IR::Opcode::GlobalAtomicMinF16x2:
case IR::Opcode::StorageAtomicMinF16x2:
info.used_storage_buffer_types |= IR::Type::U32;
info.uses_atomic_f16x2_min = true;
break;
case IR::Opcode::GlobalAtomicMinF32x2:
case IR::Opcode::StorageAtomicMinF32x2:
info.used_storage_buffer_types |= IR::Type::U32;
info.uses_atomic_f32x2_min = true;
break;
case IR::Opcode::GlobalAtomicMaxF16x2:
case IR::Opcode::StorageAtomicMaxF16x2:
info.used_storage_buffer_types |= IR::Type::U32;
info.uses_atomic_f16x2_max = true;
break;
case IR::Opcode::GlobalAtomicMaxF32x2:
case IR::Opcode::StorageAtomicMaxF32x2:
info.used_storage_buffer_types |= IR::Type::U32;
info.uses_atomic_f32x2_max = true;
break;
case IR::Opcode::StorageAtomicSMin32:
info.used_storage_buffer_types |= IR::Type::U32;
info.uses_atomic_s32_min = true;
break;
case IR::Opcode::StorageAtomicSMax32:
info.used_storage_buffer_types |= IR::Type::U32;
info.uses_atomic_s32_max = true;
break;
case IR::Opcode::GlobalAtomicIAdd64:
case IR::Opcode::GlobalAtomicSMin64:
case IR::Opcode::GlobalAtomicUMin64:
case IR::Opcode::GlobalAtomicSMax64:
case IR::Opcode::GlobalAtomicUMax64:
case IR::Opcode::GlobalAtomicAnd64:
case IR::Opcode::GlobalAtomicOr64:
case IR::Opcode::GlobalAtomicXor64:
case IR::Opcode::GlobalAtomicExchange64:
case IR::Opcode::StorageAtomicIAdd64:
case IR::Opcode::StorageAtomicSMin64:
case IR::Opcode::StorageAtomicUMin64:
case IR::Opcode::StorageAtomicSMax64:
case IR::Opcode::StorageAtomicUMax64:
case IR::Opcode::StorageAtomicAnd64:
case IR::Opcode::StorageAtomicOr64:
case IR::Opcode::StorageAtomicXor64:
info.used_storage_buffer_types |= IR::Type::U64;
info.uses_int64_bit_atomics = true;
break;
case IR::Opcode::BindlessImageAtomicIAdd32:
case IR::Opcode::BindlessImageAtomicSMin32:
case IR::Opcode::BindlessImageAtomicUMin32:
case IR::Opcode::BindlessImageAtomicSMax32:
case IR::Opcode::BindlessImageAtomicUMax32:
case IR::Opcode::BindlessImageAtomicInc32:
case IR::Opcode::BindlessImageAtomicDec32:
case IR::Opcode::BindlessImageAtomicAnd32:
case IR::Opcode::BindlessImageAtomicOr32:
case IR::Opcode::BindlessImageAtomicXor32:
case IR::Opcode::BindlessImageAtomicExchange32:
case IR::Opcode::BoundImageAtomicIAdd32:
case IR::Opcode::BoundImageAtomicSMin32:
case IR::Opcode::BoundImageAtomicUMin32:
case IR::Opcode::BoundImageAtomicSMax32:
case IR::Opcode::BoundImageAtomicUMax32:
case IR::Opcode::BoundImageAtomicInc32:
case IR::Opcode::BoundImageAtomicDec32:
case IR::Opcode::BoundImageAtomicAnd32:
case IR::Opcode::BoundImageAtomicOr32:
case IR::Opcode::BoundImageAtomicXor32:
case IR::Opcode::BoundImageAtomicExchange32:
case IR::Opcode::ImageAtomicIAdd32:
case IR::Opcode::ImageAtomicSMin32:
case IR::Opcode::ImageAtomicUMin32:
case IR::Opcode::ImageAtomicSMax32:
case IR::Opcode::ImageAtomicUMax32:
case IR::Opcode::ImageAtomicInc32:
case IR::Opcode::ImageAtomicDec32:
case IR::Opcode::ImageAtomicAnd32:
case IR::Opcode::ImageAtomicOr32:
case IR::Opcode::ImageAtomicXor32:
case IR::Opcode::ImageAtomicExchange32:
info.uses_atomic_image_u32 = true;
break;
default:
break;
}
}
void VisitFpModifiers(Info& info, IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::FPAdd16:
case IR::Opcode::FPFma16:
case IR::Opcode::FPMul16:
case IR::Opcode::FPRoundEven16:
case IR::Opcode::FPFloor16:
case IR::Opcode::FPCeil16:
case IR::Opcode::FPTrunc16: {
const auto control{inst.Flags<IR::FpControl>()};
switch (control.fmz_mode) {
case IR::FmzMode::DontCare:
break;
case IR::FmzMode::FTZ:
case IR::FmzMode::FMZ:
info.uses_fp16_denorms_flush = true;
break;
case IR::FmzMode::None:
info.uses_fp16_denorms_preserve = true;
break;
}
break;
}
case IR::Opcode::FPAdd32:
case IR::Opcode::FPFma32:
case IR::Opcode::FPMul32:
case IR::Opcode::FPRoundEven32:
case IR::Opcode::FPFloor32:
case IR::Opcode::FPCeil32:
case IR::Opcode::FPTrunc32:
case IR::Opcode::FPOrdEqual32:
case IR::Opcode::FPUnordEqual32:
case IR::Opcode::FPOrdNotEqual32:
case IR::Opcode::FPUnordNotEqual32:
case IR::Opcode::FPOrdLessThan32:
case IR::Opcode::FPUnordLessThan32:
case IR::Opcode::FPOrdGreaterThan32:
case IR::Opcode::FPUnordGreaterThan32:
case IR::Opcode::FPOrdLessThanEqual32:
case IR::Opcode::FPUnordLessThanEqual32:
case IR::Opcode::FPOrdGreaterThanEqual32:
case IR::Opcode::FPUnordGreaterThanEqual32:
case IR::Opcode::ConvertF16F32:
case IR::Opcode::ConvertF64F32: {
const auto control{inst.Flags<IR::FpControl>()};
switch (control.fmz_mode) {
case IR::FmzMode::DontCare:
break;
case IR::FmzMode::FTZ:
case IR::FmzMode::FMZ:
info.uses_fp32_denorms_flush = true;
break;
case IR::FmzMode::None:
info.uses_fp32_denorms_preserve = true;
break;
}
break;
}
default:
break;
}
}
void VisitCbufs(Info& info, IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::GetCbufU8:
case IR::Opcode::GetCbufS8:
case IR::Opcode::GetCbufU16:
case IR::Opcode::GetCbufS16:
case IR::Opcode::GetCbufU32:
case IR::Opcode::GetCbufF32:
case IR::Opcode::GetCbufU32x2: {
CheckCBufNVN(info, inst);
break;
}
default:
break;
}
}
void Visit(Info& info, IR::Inst& inst) {
VisitUsages(info, inst);
VisitFpModifiers(info, inst);
VisitCbufs(info, inst);
}
void GatherInfoFromHeader(Environment& env, Info& info) {
Stage stage{env.ShaderStage()};
if (stage == Stage::Compute) {
return;
}
const auto& header{env.SPH()};
if (stage == Stage::Fragment) {
if (!info.loads_indexed_attributes) {
return;
}
for (size_t index = 0; index < IR::NUM_GENERICS; ++index) {
const size_t offset{static_cast<size_t>(IR::Attribute::Generic0X) + index * 4};
const auto vector{header.ps.imap_generic_vector[index]};
info.loads.mask[offset + 0] = vector.x != PixelImap::Unused;
info.loads.mask[offset + 1] = vector.y != PixelImap::Unused;
info.loads.mask[offset + 2] = vector.z != PixelImap::Unused;
info.loads.mask[offset + 3] = vector.w != PixelImap::Unused;
}
return;
}
if (info.loads_indexed_attributes) {
for (size_t index = 0; index < IR::NUM_GENERICS; ++index) {
const IR::Attribute attribute{IR::Attribute::Generic0X + index * 4};
const auto mask = header.vtg.InputGeneric(index);
for (size_t i = 0; i < 4; ++i) {
info.loads.Set(attribute + i, mask[i]);
}
}
for (size_t index = 0; index < 8; ++index) {
const u16 mask{header.vtg.clip_distances};
info.loads.Set(IR::Attribute::ClipDistance0 + index, ((mask >> index) & 1) != 0);
}
info.loads.Set(IR::Attribute::PrimitiveId, header.vtg.imap_systemb.primitive_array_id != 0);
info.loads.Set(IR::Attribute::Layer, header.vtg.imap_systemb.rt_array_index != 0);
info.loads.Set(IR::Attribute::ViewportIndex, header.vtg.imap_systemb.viewport_index != 0);
info.loads.Set(IR::Attribute::PointSize, header.vtg.imap_systemb.point_size != 0);
info.loads.Set(IR::Attribute::PositionX, header.vtg.imap_systemb.position_x != 0);
info.loads.Set(IR::Attribute::PositionY, header.vtg.imap_systemb.position_y != 0);
info.loads.Set(IR::Attribute::PositionZ, header.vtg.imap_systemb.position_z != 0);
info.loads.Set(IR::Attribute::PositionW, header.vtg.imap_systemb.position_w != 0);
info.loads.Set(IR::Attribute::PointSpriteS, header.vtg.point_sprite_s != 0);
info.loads.Set(IR::Attribute::PointSpriteT, header.vtg.point_sprite_t != 0);
info.loads.Set(IR::Attribute::FogCoordinate, header.vtg.fog_coordinate != 0);
info.loads.Set(IR::Attribute::TessellationEvaluationPointU,
header.vtg.tessellation_eval_point_u != 0);
info.loads.Set(IR::Attribute::TessellationEvaluationPointV,
header.vtg.tessellation_eval_point_v != 0);
info.loads.Set(IR::Attribute::InstanceId, header.vtg.instance_id != 0);
info.loads.Set(IR::Attribute::VertexId, header.vtg.vertex_id != 0);
// TODO: Legacy varyings
}
if (info.stores_indexed_attributes) {
for (size_t index = 0; index < IR::NUM_GENERICS; ++index) {
const IR::Attribute attribute{IR::Attribute::Generic0X + index * 4};
const auto mask{header.vtg.OutputGeneric(index)};
for (size_t i = 0; i < 4; ++i) {
info.stores.Set(attribute + i, mask[i]);
}
}
for (size_t index = 0; index < 8; ++index) {
const u16 mask{header.vtg.omap_systemc.clip_distances};
info.stores.Set(IR::Attribute::ClipDistance0 + index, ((mask >> index) & 1) != 0);
}
info.stores.Set(IR::Attribute::PrimitiveId,
header.vtg.omap_systemb.primitive_array_id != 0);
info.stores.Set(IR::Attribute::Layer, header.vtg.omap_systemb.rt_array_index != 0);
info.stores.Set(IR::Attribute::ViewportIndex, header.vtg.omap_systemb.viewport_index != 0);
info.stores.Set(IR::Attribute::PointSize, header.vtg.omap_systemb.point_size != 0);
info.stores.Set(IR::Attribute::PositionX, header.vtg.omap_systemb.position_x != 0);
info.stores.Set(IR::Attribute::PositionY, header.vtg.omap_systemb.position_y != 0);
info.stores.Set(IR::Attribute::PositionZ, header.vtg.omap_systemb.position_z != 0);
info.stores.Set(IR::Attribute::PositionW, header.vtg.omap_systemb.position_w != 0);
info.stores.Set(IR::Attribute::PointSpriteS, header.vtg.omap_systemc.point_sprite_s != 0);
info.stores.Set(IR::Attribute::PointSpriteT, header.vtg.omap_systemc.point_sprite_t != 0);
info.stores.Set(IR::Attribute::FogCoordinate, header.vtg.omap_systemc.fog_coordinate != 0);
info.stores.Set(IR::Attribute::TessellationEvaluationPointU,
header.vtg.omap_systemc.tessellation_eval_point_u != 0);
info.stores.Set(IR::Attribute::TessellationEvaluationPointV,
header.vtg.omap_systemc.tessellation_eval_point_v != 0);
info.stores.Set(IR::Attribute::InstanceId, header.vtg.omap_systemc.instance_id != 0);
info.stores.Set(IR::Attribute::VertexId, header.vtg.omap_systemc.vertex_id != 0);
// TODO: Legacy varyings
}
}
} // Anonymous namespace
void CollectShaderInfoPass(Environment& env, IR::Program& program) {
Info& info{program.info};
const u32 base{[&] {
switch (program.stage) {
case Stage::VertexA:
case Stage::VertexB:
return 0x110u;
case Stage::TessellationControl:
return 0x210u;
case Stage::TessellationEval:
return 0x310u;
case Stage::Geometry:
return 0x410u;
case Stage::Fragment:
return 0x510u;
case Stage::Compute:
return 0x310u;
}
throw InvalidArgument("Invalid stage {}", program.stage);
}()};
info.nvn_buffer_base = base;
for (IR::Block* const block : program.post_order_blocks) {
for (IR::Inst& inst : block->Instructions()) {
Visit(info, inst);
}
}
GatherInfoFromHeader(env, info);
}
} // namespace Shader::Optimization

View File

@@ -0,0 +1,609 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include <ranges>
#include <tuple>
#include <type_traits>
#include "common/bit_cast.h"
#include "common/bit_util.h"
#include "shader_recompiler/exception.h"
#include "shader_recompiler/frontend/ir/ir_emitter.h"
#include "shader_recompiler/frontend/ir/value.h"
#include "shader_recompiler/ir_opt/passes.h"
namespace Shader::Optimization {
namespace {
// Metaprogramming stuff to get arguments information out of a lambda
template <typename Func>
struct LambdaTraits : LambdaTraits<decltype(&std::remove_reference_t<Func>::operator())> {};
template <typename ReturnType, typename LambdaType, typename... Args>
struct LambdaTraits<ReturnType (LambdaType::*)(Args...) const> {
template <size_t I>
using ArgType = std::tuple_element_t<I, std::tuple<Args...>>;
static constexpr size_t NUM_ARGS{sizeof...(Args)};
};
template <typename T>
[[nodiscard]] T Arg(const IR::Value& value) {
if constexpr (std::is_same_v<T, bool>) {
return value.U1();
} else if constexpr (std::is_same_v<T, u32>) {
return value.U32();
} else if constexpr (std::is_same_v<T, s32>) {
return static_cast<s32>(value.U32());
} else if constexpr (std::is_same_v<T, f32>) {
return value.F32();
} else if constexpr (std::is_same_v<T, u64>) {
return value.U64();
}
}
template <typename T, typename ImmFn>
bool FoldCommutative(IR::Inst& inst, ImmFn&& imm_fn) {
const IR::Value lhs{inst.Arg(0)};
const IR::Value rhs{inst.Arg(1)};
const bool is_lhs_immediate{lhs.IsImmediate()};
const bool is_rhs_immediate{rhs.IsImmediate()};
if (is_lhs_immediate && is_rhs_immediate) {
const auto result{imm_fn(Arg<T>(lhs), Arg<T>(rhs))};
inst.ReplaceUsesWith(IR::Value{result});
return false;
}
if (is_lhs_immediate && !is_rhs_immediate) {
IR::Inst* const rhs_inst{rhs.InstRecursive()};
if (rhs_inst->GetOpcode() == inst.GetOpcode() && rhs_inst->Arg(1).IsImmediate()) {
const auto combined{imm_fn(Arg<T>(lhs), Arg<T>(rhs_inst->Arg(1)))};
inst.SetArg(0, rhs_inst->Arg(0));
inst.SetArg(1, IR::Value{combined});
} else {
// Normalize
inst.SetArg(0, rhs);
inst.SetArg(1, lhs);
}
}
if (!is_lhs_immediate && is_rhs_immediate) {
const IR::Inst* const lhs_inst{lhs.InstRecursive()};
if (lhs_inst->GetOpcode() == inst.GetOpcode() && lhs_inst->Arg(1).IsImmediate()) {
const auto combined{imm_fn(Arg<T>(rhs), Arg<T>(lhs_inst->Arg(1)))};
inst.SetArg(0, lhs_inst->Arg(0));
inst.SetArg(1, IR::Value{combined});
}
}
return true;
}
template <typename Func>
bool FoldWhenAllImmediates(IR::Inst& inst, Func&& func) {
if (!inst.AreAllArgsImmediates() || inst.HasAssociatedPseudoOperation()) {
return false;
}
using Indices = std::make_index_sequence<LambdaTraits<decltype(func)>::NUM_ARGS>;
inst.ReplaceUsesWith(EvalImmediates(inst, func, Indices{}));
return true;
}
void FoldGetRegister(IR::Inst& inst) {
if (inst.Arg(0).Reg() == IR::Reg::RZ) {
inst.ReplaceUsesWith(IR::Value{u32{0}});
}
}
void FoldGetPred(IR::Inst& inst) {
if (inst.Arg(0).Pred() == IR::Pred::PT) {
inst.ReplaceUsesWith(IR::Value{true});
}
}
/// Replaces the pattern generated by two XMAD multiplications
bool FoldXmadMultiply(IR::Block& block, IR::Inst& inst) {
/*
* We are looking for this pattern:
* %rhs_bfe = BitFieldUExtract %factor_a, #0, #16
* %rhs_mul = IMul32 %rhs_bfe, %factor_b
* %lhs_bfe = BitFieldUExtract %factor_a, #16, #16
* %rhs_mul = IMul32 %lhs_bfe, %factor_b
* %lhs_shl = ShiftLeftLogical32 %rhs_mul, #16
* %result = IAdd32 %lhs_shl, %rhs_mul
*
* And replacing it with
* %result = IMul32 %factor_a, %factor_b
*
* This optimization has been proven safe by LLVM and MSVC.
*/
const IR::Value lhs_arg{inst.Arg(0)};
const IR::Value rhs_arg{inst.Arg(1)};
if (lhs_arg.IsImmediate() || rhs_arg.IsImmediate()) {
return false;
}
IR::Inst* const lhs_shl{lhs_arg.InstRecursive()};
if (lhs_shl->GetOpcode() != IR::Opcode::ShiftLeftLogical32 ||
lhs_shl->Arg(1) != IR::Value{16U}) {
return false;
}
if (lhs_shl->Arg(0).IsImmediate()) {
return false;
}
IR::Inst* const lhs_mul{lhs_shl->Arg(0).InstRecursive()};
IR::Inst* const rhs_mul{rhs_arg.InstRecursive()};
if (lhs_mul->GetOpcode() != IR::Opcode::IMul32 || rhs_mul->GetOpcode() != IR::Opcode::IMul32) {
return false;
}
if (lhs_mul->Arg(1).Resolve() != rhs_mul->Arg(1).Resolve()) {
return false;
}
const IR::U32 factor_b{lhs_mul->Arg(1)};
if (lhs_mul->Arg(0).IsImmediate() || rhs_mul->Arg(0).IsImmediate()) {
return false;
}
IR::Inst* const lhs_bfe{lhs_mul->Arg(0).InstRecursive()};
IR::Inst* const rhs_bfe{rhs_mul->Arg(0).InstRecursive()};
if (lhs_bfe->GetOpcode() != IR::Opcode::BitFieldUExtract) {
return false;
}
if (rhs_bfe->GetOpcode() != IR::Opcode::BitFieldUExtract) {
return false;
}
if (lhs_bfe->Arg(1) != IR::Value{16U} || lhs_bfe->Arg(2) != IR::Value{16U}) {
return false;
}
if (rhs_bfe->Arg(1) != IR::Value{0U} || rhs_bfe->Arg(2) != IR::Value{16U}) {
return false;
}
if (lhs_bfe->Arg(0).Resolve() != rhs_bfe->Arg(0).Resolve()) {
return false;
}
const IR::U32 factor_a{lhs_bfe->Arg(0)};
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
inst.ReplaceUsesWith(ir.IMul(factor_a, factor_b));
return true;
}
template <typename T>
void FoldAdd(IR::Block& block, IR::Inst& inst) {
if (inst.HasAssociatedPseudoOperation()) {
return;
}
if (!FoldCommutative<T>(inst, [](T a, T b) { return a + b; })) {
return;
}
const IR::Value rhs{inst.Arg(1)};
if (rhs.IsImmediate() && Arg<T>(rhs) == 0) {
inst.ReplaceUsesWith(inst.Arg(0));
return;
}
if constexpr (std::is_same_v<T, u32>) {
if (FoldXmadMultiply(block, inst)) {
return;
}
}
}
void FoldISub32(IR::Inst& inst) {
if (FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a - b; })) {
return;
}
if (inst.Arg(0).IsImmediate() || inst.Arg(1).IsImmediate()) {
return;
}
// ISub32 is generally used to subtract two constant buffers, compare and replace this with
// zero if they equal.
const auto equal_cbuf{[](IR::Inst* a, IR::Inst* b) {
return a->GetOpcode() == IR::Opcode::GetCbufU32 &&
b->GetOpcode() == IR::Opcode::GetCbufU32 && a->Arg(0) == b->Arg(0) &&
a->Arg(1) == b->Arg(1);
}};
IR::Inst* op_a{inst.Arg(0).InstRecursive()};
IR::Inst* op_b{inst.Arg(1).InstRecursive()};
if (equal_cbuf(op_a, op_b)) {
inst.ReplaceUsesWith(IR::Value{u32{0}});
return;
}
// It's also possible a value is being added to a cbuf and then subtracted
if (op_b->GetOpcode() == IR::Opcode::IAdd32) {
// Canonicalize local variables to simplify the following logic
std::swap(op_a, op_b);
}
if (op_b->GetOpcode() != IR::Opcode::GetCbufU32) {
return;
}
IR::Inst* const inst_cbuf{op_b};
if (op_a->GetOpcode() != IR::Opcode::IAdd32) {
return;
}
IR::Value add_op_a{op_a->Arg(0)};
IR::Value add_op_b{op_a->Arg(1)};
if (add_op_b.IsImmediate()) {
// Canonicalize
std::swap(add_op_a, add_op_b);
}
if (add_op_b.IsImmediate()) {
return;
}
IR::Inst* const add_cbuf{add_op_b.InstRecursive()};
if (equal_cbuf(add_cbuf, inst_cbuf)) {
inst.ReplaceUsesWith(add_op_a);
}
}
void FoldSelect(IR::Inst& inst) {
const IR::Value cond{inst.Arg(0)};
if (cond.IsImmediate()) {
inst.ReplaceUsesWith(cond.U1() ? inst.Arg(1) : inst.Arg(2));
}
}
void FoldFPMul32(IR::Inst& inst) {
const auto control{inst.Flags<IR::FpControl>()};
if (control.no_contraction) {
return;
}
// Fold interpolation operations
const IR::Value lhs_value{inst.Arg(0)};
const IR::Value rhs_value{inst.Arg(1)};
if (lhs_value.IsImmediate() || rhs_value.IsImmediate()) {
return;
}
IR::Inst* const lhs_op{lhs_value.InstRecursive()};
IR::Inst* const rhs_op{rhs_value.InstRecursive()};
if (lhs_op->GetOpcode() != IR::Opcode::FPMul32 ||
rhs_op->GetOpcode() != IR::Opcode::FPRecip32) {
return;
}
const IR::Value recip_source{rhs_op->Arg(0)};
const IR::Value lhs_mul_source{lhs_op->Arg(1).Resolve()};
if (recip_source.IsImmediate() || lhs_mul_source.IsImmediate()) {
return;
}
IR::Inst* const attr_a{recip_source.InstRecursive()};
IR::Inst* const attr_b{lhs_mul_source.InstRecursive()};
if (attr_a->GetOpcode() != IR::Opcode::GetAttribute ||
attr_b->GetOpcode() != IR::Opcode::GetAttribute) {
return;
}
if (attr_a->Arg(0).Attribute() == attr_b->Arg(0).Attribute()) {
inst.ReplaceUsesWith(lhs_op->Arg(0));
}
}
void FoldLogicalAnd(IR::Inst& inst) {
if (!FoldCommutative<bool>(inst, [](bool a, bool b) { return a && b; })) {
return;
}
const IR::Value rhs{inst.Arg(1)};
if (rhs.IsImmediate()) {
if (rhs.U1()) {
inst.ReplaceUsesWith(inst.Arg(0));
} else {
inst.ReplaceUsesWith(IR::Value{false});
}
}
}
void FoldLogicalOr(IR::Inst& inst) {
if (!FoldCommutative<bool>(inst, [](bool a, bool b) { return a || b; })) {
return;
}
const IR::Value rhs{inst.Arg(1)};
if (rhs.IsImmediate()) {
if (rhs.U1()) {
inst.ReplaceUsesWith(IR::Value{true});
} else {
inst.ReplaceUsesWith(inst.Arg(0));
}
}
}
void FoldLogicalNot(IR::Inst& inst) {
const IR::U1 value{inst.Arg(0)};
if (value.IsImmediate()) {
inst.ReplaceUsesWith(IR::Value{!value.U1()});
return;
}
IR::Inst* const arg{value.InstRecursive()};
if (arg->GetOpcode() == IR::Opcode::LogicalNot) {
inst.ReplaceUsesWith(arg->Arg(0));
}
}
template <IR::Opcode op, typename Dest, typename Source>
void FoldBitCast(IR::Inst& inst, IR::Opcode reverse) {
const IR::Value value{inst.Arg(0)};
if (value.IsImmediate()) {
inst.ReplaceUsesWith(IR::Value{Common::BitCast<Dest>(Arg<Source>(value))});
return;
}
IR::Inst* const arg_inst{value.InstRecursive()};
if (arg_inst->GetOpcode() == reverse) {
inst.ReplaceUsesWith(arg_inst->Arg(0));
return;
}
if constexpr (op == IR::Opcode::BitCastF32U32) {
if (arg_inst->GetOpcode() == IR::Opcode::GetCbufU32) {
// Replace the bitcast with a typed constant buffer read
inst.ReplaceOpcode(IR::Opcode::GetCbufF32);
inst.SetArg(0, arg_inst->Arg(0));
inst.SetArg(1, arg_inst->Arg(1));
return;
}
}
}
void FoldInverseFunc(IR::Inst& inst, IR::Opcode reverse) {
const IR::Value value{inst.Arg(0)};
if (value.IsImmediate()) {
return;
}
IR::Inst* const arg_inst{value.InstRecursive()};
if (arg_inst->GetOpcode() == reverse) {
inst.ReplaceUsesWith(arg_inst->Arg(0));
return;
}
}
template <typename Func, size_t... I>
IR::Value EvalImmediates(const IR::Inst& inst, Func&& func, std::index_sequence<I...>) {
using Traits = LambdaTraits<decltype(func)>;
return IR::Value{func(Arg<typename Traits::template ArgType<I>>(inst.Arg(I))...)};
}
std::optional<IR::Value> FoldCompositeExtractImpl(IR::Value inst_value, IR::Opcode insert,
IR::Opcode construct, u32 first_index) {
IR::Inst* const inst{inst_value.InstRecursive()};
if (inst->GetOpcode() == construct) {
return inst->Arg(first_index);
}
if (inst->GetOpcode() != insert) {
return std::nullopt;
}
IR::Value value_index{inst->Arg(2)};
if (!value_index.IsImmediate()) {
return std::nullopt;
}
const u32 second_index{value_index.U32()};
if (first_index != second_index) {
IR::Value value_composite{inst->Arg(0)};
if (value_composite.IsImmediate()) {
return std::nullopt;
}
return FoldCompositeExtractImpl(value_composite, insert, construct, first_index);
}
return inst->Arg(1);
}
void FoldCompositeExtract(IR::Inst& inst, IR::Opcode construct, IR::Opcode insert) {
const IR::Value value_1{inst.Arg(0)};
const IR::Value value_2{inst.Arg(1)};
if (value_1.IsImmediate()) {
return;
}
if (!value_2.IsImmediate()) {
return;
}
const u32 first_index{value_2.U32()};
const std::optional result{FoldCompositeExtractImpl(value_1, insert, construct, first_index)};
if (!result) {
return;
}
inst.ReplaceUsesWith(*result);
}
IR::Value GetThroughCast(IR::Value value, IR::Opcode expected_cast) {
if (value.IsImmediate()) {
return value;
}
IR::Inst* const inst{value.InstRecursive()};
if (inst->GetOpcode() == expected_cast) {
return inst->Arg(0).Resolve();
}
return value;
}
void FoldFSwizzleAdd(IR::Block& block, IR::Inst& inst) {
const IR::Value swizzle{inst.Arg(2)};
if (!swizzle.IsImmediate()) {
return;
}
const IR::Value value_1{GetThroughCast(inst.Arg(0).Resolve(), IR::Opcode::BitCastF32U32)};
const IR::Value value_2{GetThroughCast(inst.Arg(1).Resolve(), IR::Opcode::BitCastF32U32)};
if (value_1.IsImmediate()) {
return;
}
const u32 swizzle_value{swizzle.U32()};
if (swizzle_value != 0x99 && swizzle_value != 0xA5) {
return;
}
IR::Inst* const inst2{value_1.InstRecursive()};
if (inst2->GetOpcode() != IR::Opcode::ShuffleButterfly) {
return;
}
const IR::Value value_3{GetThroughCast(inst2->Arg(0).Resolve(), IR::Opcode::BitCastU32F32)};
if (value_2 != value_3) {
return;
}
const IR::Value index{inst2->Arg(1)};
const IR::Value clamp{inst2->Arg(2)};
const IR::Value segmentation_mask{inst2->Arg(3)};
if (!index.IsImmediate() || !clamp.IsImmediate() || !segmentation_mask.IsImmediate()) {
return;
}
if (clamp.U32() != 3 || segmentation_mask.U32() != 28) {
return;
}
if (swizzle_value == 0x99) {
// DPdxFine
if (index.U32() == 1) {
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
inst.ReplaceUsesWith(ir.DPdxFine(IR::F32{inst.Arg(1)}));
}
} else if (swizzle_value == 0xA5) {
// DPdyFine
if (index.U32() == 2) {
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
inst.ReplaceUsesWith(ir.DPdyFine(IR::F32{inst.Arg(1)}));
}
}
}
void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::GetRegister:
return FoldGetRegister(inst);
case IR::Opcode::GetPred:
return FoldGetPred(inst);
case IR::Opcode::IAdd32:
return FoldAdd<u32>(block, inst);
case IR::Opcode::ISub32:
return FoldISub32(inst);
case IR::Opcode::IMul32:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; });
return;
case IR::Opcode::ShiftRightArithmetic32:
FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return static_cast<u32>(a >> b); });
return;
case IR::Opcode::BitCastF32U32:
return FoldBitCast<IR::Opcode::BitCastF32U32, f32, u32>(inst, IR::Opcode::BitCastU32F32);
case IR::Opcode::BitCastU32F32:
return FoldBitCast<IR::Opcode::BitCastU32F32, u32, f32>(inst, IR::Opcode::BitCastF32U32);
case IR::Opcode::IAdd64:
return FoldAdd<u64>(block, inst);
case IR::Opcode::PackHalf2x16:
return FoldInverseFunc(inst, IR::Opcode::UnpackHalf2x16);
case IR::Opcode::UnpackHalf2x16:
return FoldInverseFunc(inst, IR::Opcode::PackHalf2x16);
case IR::Opcode::SelectU1:
case IR::Opcode::SelectU8:
case IR::Opcode::SelectU16:
case IR::Opcode::SelectU32:
case IR::Opcode::SelectU64:
case IR::Opcode::SelectF16:
case IR::Opcode::SelectF32:
case IR::Opcode::SelectF64:
return FoldSelect(inst);
case IR::Opcode::FPMul32:
return FoldFPMul32(inst);
case IR::Opcode::LogicalAnd:
return FoldLogicalAnd(inst);
case IR::Opcode::LogicalOr:
return FoldLogicalOr(inst);
case IR::Opcode::LogicalNot:
return FoldLogicalNot(inst);
case IR::Opcode::SLessThan:
FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a < b; });
return;
case IR::Opcode::ULessThan:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a < b; });
return;
case IR::Opcode::SLessThanEqual:
FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a <= b; });
return;
case IR::Opcode::ULessThanEqual:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a <= b; });
return;
case IR::Opcode::SGreaterThan:
FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a > b; });
return;
case IR::Opcode::UGreaterThan:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a > b; });
return;
case IR::Opcode::SGreaterThanEqual:
FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a >= b; });
return;
case IR::Opcode::UGreaterThanEqual:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a >= b; });
return;
case IR::Opcode::IEqual:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a == b; });
return;
case IR::Opcode::INotEqual:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a != b; });
return;
case IR::Opcode::BitwiseAnd32:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a & b; });
return;
case IR::Opcode::BitwiseOr32:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a | b; });
return;
case IR::Opcode::BitwiseXor32:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a ^ b; });
return;
case IR::Opcode::BitFieldUExtract:
FoldWhenAllImmediates(inst, [](u32 base, u32 shift, u32 count) {
if (static_cast<size_t>(shift) + static_cast<size_t>(count) > 32) {
throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldUExtract,
base, shift, count);
}
return (base >> shift) & ((1U << count) - 1);
});
return;
case IR::Opcode::BitFieldSExtract:
FoldWhenAllImmediates(inst, [](s32 base, u32 shift, u32 count) {
const size_t back_shift{static_cast<size_t>(shift) + static_cast<size_t>(count)};
const size_t left_shift{32 - back_shift};
const size_t right_shift{static_cast<size_t>(32 - count)};
if (back_shift > 32 || left_shift >= 32 || right_shift >= 32) {
throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldSExtract,
base, shift, count);
}
return static_cast<u32>((base << left_shift) >> right_shift);
});
return;
case IR::Opcode::BitFieldInsert:
FoldWhenAllImmediates(inst, [](u32 base, u32 insert, u32 offset, u32 bits) {
if (bits >= 32 || offset >= 32) {
throw LogicError("Undefined result in {}({}, {}, {}, {})",
IR::Opcode::BitFieldInsert, base, insert, offset, bits);
}
return (base & ~(~(~0u << bits) << offset)) | (insert << offset);
});
return;
case IR::Opcode::CompositeExtractU32x2:
return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x2,
IR::Opcode::CompositeInsertU32x2);
case IR::Opcode::CompositeExtractU32x3:
return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x3,
IR::Opcode::CompositeInsertU32x3);
case IR::Opcode::CompositeExtractU32x4:
return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x4,
IR::Opcode::CompositeInsertU32x4);
case IR::Opcode::CompositeExtractF32x2:
return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x2,
IR::Opcode::CompositeInsertF32x2);
case IR::Opcode::CompositeExtractF32x3:
return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x3,
IR::Opcode::CompositeInsertF32x3);
case IR::Opcode::CompositeExtractF32x4:
return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x4,
IR::Opcode::CompositeInsertF32x4);
case IR::Opcode::CompositeExtractF16x2:
return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x2,
IR::Opcode::CompositeInsertF16x2);
case IR::Opcode::CompositeExtractF16x3:
return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x3,
IR::Opcode::CompositeInsertF16x3);
case IR::Opcode::CompositeExtractF16x4:
return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x4,
IR::Opcode::CompositeInsertF16x4);
case IR::Opcode::FSwizzleAdd:
return FoldFSwizzleAdd(block, inst);
default:
break;
}
}
} // Anonymous namespace
void ConstantPropagationPass(IR::Program& program) {
for (IR::Block* const block : program.post_order_blocks | std::views::reverse) {
for (IR::Inst& inst : block->Instructions()) {
ConstantPropagation(*block, inst);
}
}
}
} // namespace Shader::Optimization

View File

@@ -0,0 +1,28 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <ranges>
#include "shader_recompiler/frontend/ir/basic_block.h"
#include "shader_recompiler/frontend/ir/value.h"
#include "shader_recompiler/ir_opt/passes.h"
namespace Shader::Optimization {
void DeadCodeEliminationPass(IR::Program& program) {
// We iterate over the instructions in reverse order.
// This is because removing an instruction reduces the number of uses for earlier instructions.
for (IR::Block* const block : program.post_order_blocks) {
auto it{block->end()};
while (it != block->begin()) {
--it;
if (!it->HasUses() && !it->MayHaveSideEffects()) {
it->Invalidate();
it = block->Instructions().erase(it);
}
}
}
}
} // namespace Shader::Optimization

View File

@@ -0,0 +1,36 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include <ranges>
#include "common/bit_cast.h"
#include "common/bit_util.h"
#include "shader_recompiler/exception.h"
#include "shader_recompiler/frontend/ir/ir_emitter.h"
#include "shader_recompiler/ir_opt/passes.h"
namespace Shader::Optimization {
void VertexATransformPass(IR::Program& program) {
for (IR::Block* const block : program.blocks) {
for (IR::Inst& inst : block->Instructions()) {
if (inst.GetOpcode() == IR::Opcode::Epilogue) {
return inst.Invalidate();
}
}
}
}
void VertexBTransformPass(IR::Program& program) {
for (IR::Block* const block : program.blocks) {
for (IR::Inst& inst : block->Instructions()) {
if (inst.GetOpcode() == IR::Opcode::Prologue) {
return inst.Invalidate();
}
}
}
}
} // namespace Shader::Optimization

View File

@@ -0,0 +1,527 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include <compare>
#include <optional>
#include <ranges>
#include <queue>
#include <boost/container/flat_set.hpp>
#include <boost/container/small_vector.hpp>
#include "common/alignment.h"
#include "shader_recompiler/frontend/ir/basic_block.h"
#include "shader_recompiler/frontend/ir/breadth_first_search.h"
#include "shader_recompiler/frontend/ir/ir_emitter.h"
#include "shader_recompiler/frontend/ir/value.h"
#include "shader_recompiler/ir_opt/passes.h"
namespace Shader::Optimization {
namespace {
/// Address in constant buffers to the storage buffer descriptor
struct StorageBufferAddr {
auto operator<=>(const StorageBufferAddr&) const noexcept = default;
u32 index;
u32 offset;
};
/// Block iterator to a global memory instruction and the storage buffer it uses
struct StorageInst {
StorageBufferAddr storage_buffer;
IR::Inst* inst;
IR::Block* block;
};
/// Bias towards a certain range of constant buffers when looking for storage buffers
struct Bias {
u32 index;
u32 offset_begin;
u32 offset_end;
};
using boost::container::flat_set;
using boost::container::small_vector;
using StorageBufferSet =
flat_set<StorageBufferAddr, std::less<StorageBufferAddr>, small_vector<StorageBufferAddr, 16>>;
using StorageInstVector = small_vector<StorageInst, 24>;
using StorageWritesSet =
flat_set<StorageBufferAddr, std::less<StorageBufferAddr>, small_vector<StorageBufferAddr, 16>>;
struct StorageInfo {
StorageBufferSet set;
StorageInstVector to_replace;
StorageWritesSet writes;
};
/// Returns true when the instruction is a global memory instruction
bool IsGlobalMemory(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::LoadGlobalS8:
case IR::Opcode::LoadGlobalU8:
case IR::Opcode::LoadGlobalS16:
case IR::Opcode::LoadGlobalU16:
case IR::Opcode::LoadGlobal32:
case IR::Opcode::LoadGlobal64:
case IR::Opcode::LoadGlobal128:
case IR::Opcode::WriteGlobalS8:
case IR::Opcode::WriteGlobalU8:
case IR::Opcode::WriteGlobalS16:
case IR::Opcode::WriteGlobalU16:
case IR::Opcode::WriteGlobal32:
case IR::Opcode::WriteGlobal64:
case IR::Opcode::WriteGlobal128:
case IR::Opcode::GlobalAtomicIAdd32:
case IR::Opcode::GlobalAtomicSMin32:
case IR::Opcode::GlobalAtomicUMin32:
case IR::Opcode::GlobalAtomicSMax32:
case IR::Opcode::GlobalAtomicUMax32:
case IR::Opcode::GlobalAtomicInc32:
case IR::Opcode::GlobalAtomicDec32:
case IR::Opcode::GlobalAtomicAnd32:
case IR::Opcode::GlobalAtomicOr32:
case IR::Opcode::GlobalAtomicXor32:
case IR::Opcode::GlobalAtomicExchange32:
case IR::Opcode::GlobalAtomicIAdd64:
case IR::Opcode::GlobalAtomicSMin64:
case IR::Opcode::GlobalAtomicUMin64:
case IR::Opcode::GlobalAtomicSMax64:
case IR::Opcode::GlobalAtomicUMax64:
case IR::Opcode::GlobalAtomicAnd64:
case IR::Opcode::GlobalAtomicOr64:
case IR::Opcode::GlobalAtomicXor64:
case IR::Opcode::GlobalAtomicExchange64:
case IR::Opcode::GlobalAtomicAddF32:
case IR::Opcode::GlobalAtomicAddF16x2:
case IR::Opcode::GlobalAtomicAddF32x2:
case IR::Opcode::GlobalAtomicMinF16x2:
case IR::Opcode::GlobalAtomicMinF32x2:
case IR::Opcode::GlobalAtomicMaxF16x2:
case IR::Opcode::GlobalAtomicMaxF32x2:
return true;
default:
return false;
}
}
/// Returns true when the instruction is a global memory instruction
bool IsGlobalMemoryWrite(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::WriteGlobalS8:
case IR::Opcode::WriteGlobalU8:
case IR::Opcode::WriteGlobalS16:
case IR::Opcode::WriteGlobalU16:
case IR::Opcode::WriteGlobal32:
case IR::Opcode::WriteGlobal64:
case IR::Opcode::WriteGlobal128:
case IR::Opcode::GlobalAtomicIAdd32:
case IR::Opcode::GlobalAtomicSMin32:
case IR::Opcode::GlobalAtomicUMin32:
case IR::Opcode::GlobalAtomicSMax32:
case IR::Opcode::GlobalAtomicUMax32:
case IR::Opcode::GlobalAtomicInc32:
case IR::Opcode::GlobalAtomicDec32:
case IR::Opcode::GlobalAtomicAnd32:
case IR::Opcode::GlobalAtomicOr32:
case IR::Opcode::GlobalAtomicXor32:
case IR::Opcode::GlobalAtomicExchange32:
case IR::Opcode::GlobalAtomicIAdd64:
case IR::Opcode::GlobalAtomicSMin64:
case IR::Opcode::GlobalAtomicUMin64:
case IR::Opcode::GlobalAtomicSMax64:
case IR::Opcode::GlobalAtomicUMax64:
case IR::Opcode::GlobalAtomicAnd64:
case IR::Opcode::GlobalAtomicOr64:
case IR::Opcode::GlobalAtomicXor64:
case IR::Opcode::GlobalAtomicExchange64:
case IR::Opcode::GlobalAtomicAddF32:
case IR::Opcode::GlobalAtomicAddF16x2:
case IR::Opcode::GlobalAtomicAddF32x2:
case IR::Opcode::GlobalAtomicMinF16x2:
case IR::Opcode::GlobalAtomicMinF32x2:
case IR::Opcode::GlobalAtomicMaxF16x2:
case IR::Opcode::GlobalAtomicMaxF32x2:
return true;
default:
return false;
}
}
/// Converts a global memory opcode to its storage buffer equivalent
IR::Opcode GlobalToStorage(IR::Opcode opcode) {
switch (opcode) {
case IR::Opcode::LoadGlobalS8:
return IR::Opcode::LoadStorageS8;
case IR::Opcode::LoadGlobalU8:
return IR::Opcode::LoadStorageU8;
case IR::Opcode::LoadGlobalS16:
return IR::Opcode::LoadStorageS16;
case IR::Opcode::LoadGlobalU16:
return IR::Opcode::LoadStorageU16;
case IR::Opcode::LoadGlobal32:
return IR::Opcode::LoadStorage32;
case IR::Opcode::LoadGlobal64:
return IR::Opcode::LoadStorage64;
case IR::Opcode::LoadGlobal128:
return IR::Opcode::LoadStorage128;
case IR::Opcode::WriteGlobalS8:
return IR::Opcode::WriteStorageS8;
case IR::Opcode::WriteGlobalU8:
return IR::Opcode::WriteStorageU8;
case IR::Opcode::WriteGlobalS16:
return IR::Opcode::WriteStorageS16;
case IR::Opcode::WriteGlobalU16:
return IR::Opcode::WriteStorageU16;
case IR::Opcode::WriteGlobal32:
return IR::Opcode::WriteStorage32;
case IR::Opcode::WriteGlobal64:
return IR::Opcode::WriteStorage64;
case IR::Opcode::WriteGlobal128:
return IR::Opcode::WriteStorage128;
case IR::Opcode::GlobalAtomicIAdd32:
return IR::Opcode::StorageAtomicIAdd32;
case IR::Opcode::GlobalAtomicSMin32:
return IR::Opcode::StorageAtomicSMin32;
case IR::Opcode::GlobalAtomicUMin32:
return IR::Opcode::StorageAtomicUMin32;
case IR::Opcode::GlobalAtomicSMax32:
return IR::Opcode::StorageAtomicSMax32;
case IR::Opcode::GlobalAtomicUMax32:
return IR::Opcode::StorageAtomicUMax32;
case IR::Opcode::GlobalAtomicInc32:
return IR::Opcode::StorageAtomicInc32;
case IR::Opcode::GlobalAtomicDec32:
return IR::Opcode::StorageAtomicDec32;
case IR::Opcode::GlobalAtomicAnd32:
return IR::Opcode::StorageAtomicAnd32;
case IR::Opcode::GlobalAtomicOr32:
return IR::Opcode::StorageAtomicOr32;
case IR::Opcode::GlobalAtomicXor32:
return IR::Opcode::StorageAtomicXor32;
case IR::Opcode::GlobalAtomicIAdd64:
return IR::Opcode::StorageAtomicIAdd64;
case IR::Opcode::GlobalAtomicSMin64:
return IR::Opcode::StorageAtomicSMin64;
case IR::Opcode::GlobalAtomicUMin64:
return IR::Opcode::StorageAtomicUMin64;
case IR::Opcode::GlobalAtomicSMax64:
return IR::Opcode::StorageAtomicSMax64;
case IR::Opcode::GlobalAtomicUMax64:
return IR::Opcode::StorageAtomicUMax64;
case IR::Opcode::GlobalAtomicAnd64:
return IR::Opcode::StorageAtomicAnd64;
case IR::Opcode::GlobalAtomicOr64:
return IR::Opcode::StorageAtomicOr64;
case IR::Opcode::GlobalAtomicXor64:
return IR::Opcode::StorageAtomicXor64;
case IR::Opcode::GlobalAtomicExchange32:
return IR::Opcode::StorageAtomicExchange32;
case IR::Opcode::GlobalAtomicExchange64:
return IR::Opcode::StorageAtomicExchange64;
case IR::Opcode::GlobalAtomicAddF32:
return IR::Opcode::StorageAtomicAddF32;
case IR::Opcode::GlobalAtomicAddF16x2:
return IR::Opcode::StorageAtomicAddF16x2;
case IR::Opcode::GlobalAtomicMinF16x2:
return IR::Opcode::StorageAtomicMinF16x2;
case IR::Opcode::GlobalAtomicMaxF16x2:
return IR::Opcode::StorageAtomicMaxF16x2;
case IR::Opcode::GlobalAtomicAddF32x2:
return IR::Opcode::StorageAtomicAddF32x2;
case IR::Opcode::GlobalAtomicMinF32x2:
return IR::Opcode::StorageAtomicMinF32x2;
case IR::Opcode::GlobalAtomicMaxF32x2:
return IR::Opcode::StorageAtomicMaxF32x2;
default:
throw InvalidArgument("Invalid global memory opcode {}", opcode);
}
}
/// Returns true when a storage buffer address satisfies a bias
bool MeetsBias(const StorageBufferAddr& storage_buffer, const Bias& bias) noexcept {
return storage_buffer.index == bias.index && storage_buffer.offset >= bias.offset_begin &&
storage_buffer.offset < bias.offset_end;
}
struct LowAddrInfo {
IR::U32 value;
s32 imm_offset;
};
/// Tries to track the first 32-bits of a global memory instruction
std::optional<LowAddrInfo> TrackLowAddress(IR::Inst* inst) {
// The first argument is the low level GPU pointer to the global memory instruction
const IR::Value addr{inst->Arg(0)};
if (addr.IsImmediate()) {
// Not much we can do if it's an immediate
return std::nullopt;
}
// This address is expected to either be a PackUint2x32, a IAdd64, or a CompositeConstructU32x2
IR::Inst* addr_inst{addr.InstRecursive()};
s32 imm_offset{0};
if (addr_inst->GetOpcode() == IR::Opcode::IAdd64) {
// If it's an IAdd64, get the immediate offset it is applying and grab the address
// instruction. This expects for the instruction to be canonicalized having the address on
// the first argument and the immediate offset on the second one.
const IR::U64 imm_offset_value{addr_inst->Arg(1)};
if (!imm_offset_value.IsImmediate()) {
return std::nullopt;
}
imm_offset = static_cast<s32>(static_cast<s64>(imm_offset_value.U64()));
const IR::U64 iadd_addr{addr_inst->Arg(0)};
if (iadd_addr.IsImmediate()) {
return std::nullopt;
}
addr_inst = iadd_addr.InstRecursive();
}
// With IAdd64 handled, now PackUint2x32 is expected
if (addr_inst->GetOpcode() == IR::Opcode::PackUint2x32) {
// PackUint2x32 is expected to be generated from a vector
const IR::Value vector{addr_inst->Arg(0)};
if (vector.IsImmediate()) {
return std::nullopt;
}
addr_inst = vector.InstRecursive();
}
// The vector is expected to be a CompositeConstructU32x2
if (addr_inst->GetOpcode() != IR::Opcode::CompositeConstructU32x2) {
return std::nullopt;
}
// Grab the first argument from the CompositeConstructU32x2, this is the low address.
return LowAddrInfo{
.value{IR::U32{addr_inst->Arg(0)}},
.imm_offset = imm_offset,
};
}
/// Tries to track the storage buffer address used by a global memory instruction
std::optional<StorageBufferAddr> Track(const IR::Value& value, const Bias* bias) {
const auto pred{[bias](const IR::Inst* inst) -> std::optional<StorageBufferAddr> {
if (inst->GetOpcode() != IR::Opcode::GetCbufU32) {
return std::nullopt;
}
const IR::Value index{inst->Arg(0)};
const IR::Value offset{inst->Arg(1)};
if (!index.IsImmediate()) {
// Definitely not a storage buffer if it's read from a
// non-immediate index
return std::nullopt;
}
if (!offset.IsImmediate()) {
// TODO: Support SSBO arrays
return std::nullopt;
}
const StorageBufferAddr storage_buffer{
.index{index.U32()},
.offset{offset.U32()},
};
if (!Common::IsAligned(storage_buffer.offset, 16)) {
// The SSBO pointer has to be aligned
return std::nullopt;
}
if (bias && !MeetsBias(storage_buffer, *bias)) {
// We have to blacklist some addresses in case we wrongly
// point to them
return std::nullopt;
}
return storage_buffer;
}};
return BreadthFirstSearch(value, pred);
}
/// Collects the storage buffer used by a global memory instruction and the instruction itself
void CollectStorageBuffers(IR::Block& block, IR::Inst& inst, StorageInfo& info) {
// NVN puts storage buffers in a specific range, we have to bias towards these addresses to
// avoid getting false positives
static constexpr Bias nvn_bias{
.index = 0,
.offset_begin = 0x110,
.offset_end = 0x610,
};
// Track the low address of the instruction
const std::optional<LowAddrInfo> low_addr_info{TrackLowAddress(&inst)};
if (!low_addr_info) {
// Failed to track the low address, use NVN fallbacks
return;
}
// First try to find storage buffers in the NVN address
const IR::U32 low_addr{low_addr_info->value};
std::optional<StorageBufferAddr> storage_buffer{Track(low_addr, &nvn_bias)};
if (!storage_buffer) {
// If it fails, track without a bias
storage_buffer = Track(low_addr, nullptr);
if (!storage_buffer) {
// If that also fails, use NVN fallbacks
return;
}
}
// Collect storage buffer and the instruction
if (IsGlobalMemoryWrite(inst)) {
info.writes.insert(*storage_buffer);
}
info.set.insert(*storage_buffer);
info.to_replace.push_back(StorageInst{
.storage_buffer{*storage_buffer},
.inst = &inst,
.block = &block,
});
}
/// Returns the offset in indices (not bytes) for an equivalent storage instruction
IR::U32 StorageOffset(IR::Block& block, IR::Inst& inst, StorageBufferAddr buffer) {
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
IR::U32 offset;
if (const std::optional<LowAddrInfo> low_addr{TrackLowAddress(&inst)}) {
offset = low_addr->value;
if (low_addr->imm_offset != 0) {
offset = ir.IAdd(offset, ir.Imm32(low_addr->imm_offset));
}
} else {
offset = ir.UConvert(32, IR::U64{inst.Arg(0)});
}
// Subtract the least significant 32 bits from the guest offset. The result is the storage
// buffer offset in bytes.
const IR::U32 low_cbuf{ir.GetCbuf(ir.Imm32(buffer.index), ir.Imm32(buffer.offset))};
return ir.ISub(offset, low_cbuf);
}
/// Replace a global memory load instruction with its storage buffer equivalent
void ReplaceLoad(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
const IR::U32& offset) {
const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())};
const auto it{IR::Block::InstructionList::s_iterator_to(inst)};
const IR::Value value{&*block.PrependNewInst(it, new_opcode, {storage_index, offset})};
inst.ReplaceUsesWith(value);
}
/// Replace a global memory write instruction with its storage buffer equivalent
void ReplaceWrite(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
const IR::U32& offset) {
const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())};
const auto it{IR::Block::InstructionList::s_iterator_to(inst)};
block.PrependNewInst(it, new_opcode, {storage_index, offset, inst.Arg(1)});
inst.Invalidate();
}
/// Replace an atomic operation on global memory instruction with its storage buffer equivalent
void ReplaceAtomic(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
const IR::U32& offset) {
const IR::Opcode new_opcode{GlobalToStorage(inst.GetOpcode())};
const auto it{IR::Block::InstructionList::s_iterator_to(inst)};
const IR::Value value{
&*block.PrependNewInst(it, new_opcode, {storage_index, offset, inst.Arg(1)})};
inst.ReplaceUsesWith(value);
}
/// Replace a global memory instruction with its storage buffer equivalent
void Replace(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
const IR::U32& offset) {
switch (inst.GetOpcode()) {
case IR::Opcode::LoadGlobalS8:
case IR::Opcode::LoadGlobalU8:
case IR::Opcode::LoadGlobalS16:
case IR::Opcode::LoadGlobalU16:
case IR::Opcode::LoadGlobal32:
case IR::Opcode::LoadGlobal64:
case IR::Opcode::LoadGlobal128:
return ReplaceLoad(block, inst, storage_index, offset);
case IR::Opcode::WriteGlobalS8:
case IR::Opcode::WriteGlobalU8:
case IR::Opcode::WriteGlobalS16:
case IR::Opcode::WriteGlobalU16:
case IR::Opcode::WriteGlobal32:
case IR::Opcode::WriteGlobal64:
case IR::Opcode::WriteGlobal128:
return ReplaceWrite(block, inst, storage_index, offset);
case IR::Opcode::GlobalAtomicIAdd32:
case IR::Opcode::GlobalAtomicSMin32:
case IR::Opcode::GlobalAtomicUMin32:
case IR::Opcode::GlobalAtomicSMax32:
case IR::Opcode::GlobalAtomicUMax32:
case IR::Opcode::GlobalAtomicInc32:
case IR::Opcode::GlobalAtomicDec32:
case IR::Opcode::GlobalAtomicAnd32:
case IR::Opcode::GlobalAtomicOr32:
case IR::Opcode::GlobalAtomicXor32:
case IR::Opcode::GlobalAtomicExchange32:
case IR::Opcode::GlobalAtomicIAdd64:
case IR::Opcode::GlobalAtomicSMin64:
case IR::Opcode::GlobalAtomicUMin64:
case IR::Opcode::GlobalAtomicSMax64:
case IR::Opcode::GlobalAtomicUMax64:
case IR::Opcode::GlobalAtomicAnd64:
case IR::Opcode::GlobalAtomicOr64:
case IR::Opcode::GlobalAtomicXor64:
case IR::Opcode::GlobalAtomicExchange64:
case IR::Opcode::GlobalAtomicAddF32:
case IR::Opcode::GlobalAtomicAddF16x2:
case IR::Opcode::GlobalAtomicAddF32x2:
case IR::Opcode::GlobalAtomicMinF16x2:
case IR::Opcode::GlobalAtomicMinF32x2:
case IR::Opcode::GlobalAtomicMaxF16x2:
case IR::Opcode::GlobalAtomicMaxF32x2:
return ReplaceAtomic(block, inst, storage_index, offset);
default:
throw InvalidArgument("Invalid global memory opcode {}", inst.GetOpcode());
}
}
} // Anonymous namespace
void GlobalMemoryToStorageBufferPass(IR::Program& program) {
StorageInfo info;
for (IR::Block* const block : program.post_order_blocks) {
for (IR::Inst& inst : block->Instructions()) {
if (!IsGlobalMemory(inst)) {
continue;
}
CollectStorageBuffers(*block, inst, info);
}
}
for (const StorageBufferAddr& storage_buffer : info.set) {
program.info.storage_buffers_descriptors.push_back({
.cbuf_index = storage_buffer.index,
.cbuf_offset = storage_buffer.offset,
.count = 1,
.is_written{info.writes.contains(storage_buffer)},
});
}
for (const StorageInst& storage_inst : info.to_replace) {
const StorageBufferAddr storage_buffer{storage_inst.storage_buffer};
const auto it{info.set.find(storage_inst.storage_buffer)};
const IR::U32 index{IR::Value{static_cast<u32>(info.set.index_of(it))}};
IR::Block* const block{storage_inst.block};
IR::Inst* const inst{storage_inst.inst};
const IR::U32 offset{StorageOffset(*block, *inst, storage_buffer)};
Replace(*block, *inst, index, offset);
}
}
template <typename Descriptors, typename Descriptor, typename Func>
static u32 Add(Descriptors& descriptors, const Descriptor& desc, Func&& pred) {
// TODO: Handle arrays
const auto it{std::ranges::find_if(descriptors, pred)};
if (it != descriptors.end()) {
return static_cast<u32>(std::distance(descriptors.begin(), it));
}
descriptors.push_back(desc);
return static_cast<u32>(descriptors.size()) - 1;
}
void JoinStorageInfo(Info& base, Info& source) {
auto& descriptors = base.storage_buffers_descriptors;
for (auto& desc : source.storage_buffers_descriptors) {
auto it{std::ranges::find_if(descriptors, [&desc](const auto& existing) {
return desc.cbuf_index == existing.cbuf_index &&
desc.cbuf_offset == existing.cbuf_offset && desc.count == existing.count;
})};
if (it != descriptors.end()) {
it->is_written |= desc.is_written;
continue;
}
descriptors.push_back(desc);
}
}
} // namespace Shader::Optimization

View File

@@ -0,0 +1,38 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <vector>
#include "shader_recompiler/frontend/ir/basic_block.h"
#include "shader_recompiler/frontend/ir/value.h"
#include "shader_recompiler/ir_opt/passes.h"
namespace Shader::Optimization {
void IdentityRemovalPass(IR::Program& program) {
std::vector<IR::Inst*> to_invalidate;
for (IR::Block* const block : program.blocks) {
for (auto inst = block->begin(); inst != block->end();) {
const size_t num_args{inst->NumArgs()};
for (size_t i = 0; i < num_args; ++i) {
IR::Value arg;
while ((arg = inst->Arg(i)).IsIdentity()) {
inst->SetArg(i, arg.Inst()->Arg(0));
}
}
if (inst->GetOpcode() == IR::Opcode::Identity ||
inst->GetOpcode() == IR::Opcode::Void) {
to_invalidate.push_back(&*inst);
inst = block->Instructions().erase(inst);
} else {
++inst;
}
}
}
for (IR::Inst* const inst : to_invalidate) {
inst->Invalidate();
}
}
} // namespace Shader::Optimization

View File

@@ -0,0 +1,143 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include "shader_recompiler/frontend/ir/ir_emitter.h"
#include "shader_recompiler/frontend/ir/value.h"
#include "shader_recompiler/ir_opt/passes.h"
namespace Shader::Optimization {
namespace {
IR::Opcode Replace(IR::Opcode op) {
switch (op) {
case IR::Opcode::FPAbs16:
return IR::Opcode::FPAbs32;
case IR::Opcode::FPAdd16:
return IR::Opcode::FPAdd32;
case IR::Opcode::FPCeil16:
return IR::Opcode::FPCeil32;
case IR::Opcode::FPFloor16:
return IR::Opcode::FPFloor32;
case IR::Opcode::FPFma16:
return IR::Opcode::FPFma32;
case IR::Opcode::FPMul16:
return IR::Opcode::FPMul32;
case IR::Opcode::FPNeg16:
return IR::Opcode::FPNeg32;
case IR::Opcode::FPRoundEven16:
return IR::Opcode::FPRoundEven32;
case IR::Opcode::FPSaturate16:
return IR::Opcode::FPSaturate32;
case IR::Opcode::FPClamp16:
return IR::Opcode::FPClamp32;
case IR::Opcode::FPTrunc16:
return IR::Opcode::FPTrunc32;
case IR::Opcode::CompositeConstructF16x2:
return IR::Opcode::CompositeConstructF32x2;
case IR::Opcode::CompositeConstructF16x3:
return IR::Opcode::CompositeConstructF32x3;
case IR::Opcode::CompositeConstructF16x4:
return IR::Opcode::CompositeConstructF32x4;
case IR::Opcode::CompositeExtractF16x2:
return IR::Opcode::CompositeExtractF32x2;
case IR::Opcode::CompositeExtractF16x3:
return IR::Opcode::CompositeExtractF32x3;
case IR::Opcode::CompositeExtractF16x4:
return IR::Opcode::CompositeExtractF32x4;
case IR::Opcode::CompositeInsertF16x2:
return IR::Opcode::CompositeInsertF32x2;
case IR::Opcode::CompositeInsertF16x3:
return IR::Opcode::CompositeInsertF32x3;
case IR::Opcode::CompositeInsertF16x4:
return IR::Opcode::CompositeInsertF32x4;
case IR::Opcode::FPOrdEqual16:
return IR::Opcode::FPOrdEqual32;
case IR::Opcode::FPUnordEqual16:
return IR::Opcode::FPUnordEqual32;
case IR::Opcode::FPOrdNotEqual16:
return IR::Opcode::FPOrdNotEqual32;
case IR::Opcode::FPUnordNotEqual16:
return IR::Opcode::FPUnordNotEqual32;
case IR::Opcode::FPOrdLessThan16:
return IR::Opcode::FPOrdLessThan32;
case IR::Opcode::FPUnordLessThan16:
return IR::Opcode::FPUnordLessThan32;
case IR::Opcode::FPOrdGreaterThan16:
return IR::Opcode::FPOrdGreaterThan32;
case IR::Opcode::FPUnordGreaterThan16:
return IR::Opcode::FPUnordGreaterThan32;
case IR::Opcode::FPOrdLessThanEqual16:
return IR::Opcode::FPOrdLessThanEqual32;
case IR::Opcode::FPUnordLessThanEqual16:
return IR::Opcode::FPUnordLessThanEqual32;
case IR::Opcode::FPOrdGreaterThanEqual16:
return IR::Opcode::FPOrdGreaterThanEqual32;
case IR::Opcode::FPUnordGreaterThanEqual16:
return IR::Opcode::FPUnordGreaterThanEqual32;
case IR::Opcode::FPIsNan16:
return IR::Opcode::FPIsNan32;
case IR::Opcode::ConvertS16F16:
return IR::Opcode::ConvertS16F32;
case IR::Opcode::ConvertS32F16:
return IR::Opcode::ConvertS32F32;
case IR::Opcode::ConvertS64F16:
return IR::Opcode::ConvertS64F32;
case IR::Opcode::ConvertU16F16:
return IR::Opcode::ConvertU16F32;
case IR::Opcode::ConvertU32F16:
return IR::Opcode::ConvertU32F32;
case IR::Opcode::ConvertU64F16:
return IR::Opcode::ConvertU64F32;
case IR::Opcode::PackFloat2x16:
return IR::Opcode::PackHalf2x16;
case IR::Opcode::UnpackFloat2x16:
return IR::Opcode::UnpackHalf2x16;
case IR::Opcode::ConvertF32F16:
return IR::Opcode::Identity;
case IR::Opcode::ConvertF16F32:
return IR::Opcode::Identity;
case IR::Opcode::ConvertF16S8:
return IR::Opcode::ConvertF32S8;
case IR::Opcode::ConvertF16S16:
return IR::Opcode::ConvertF32S16;
case IR::Opcode::ConvertF16S32:
return IR::Opcode::ConvertF32S32;
case IR::Opcode::ConvertF16S64:
return IR::Opcode::ConvertF32S64;
case IR::Opcode::ConvertF16U8:
return IR::Opcode::ConvertF32U8;
case IR::Opcode::ConvertF16U16:
return IR::Opcode::ConvertF32U16;
case IR::Opcode::ConvertF16U32:
return IR::Opcode::ConvertF32U32;
case IR::Opcode::ConvertF16U64:
return IR::Opcode::ConvertF32U64;
case IR::Opcode::GlobalAtomicAddF16x2:
return IR::Opcode::GlobalAtomicAddF32x2;
case IR::Opcode::StorageAtomicAddF16x2:
return IR::Opcode::StorageAtomicAddF32x2;
case IR::Opcode::GlobalAtomicMinF16x2:
return IR::Opcode::GlobalAtomicMinF32x2;
case IR::Opcode::StorageAtomicMinF16x2:
return IR::Opcode::StorageAtomicMinF32x2;
case IR::Opcode::GlobalAtomicMaxF16x2:
return IR::Opcode::GlobalAtomicMaxF32x2;
case IR::Opcode::StorageAtomicMaxF16x2:
return IR::Opcode::StorageAtomicMaxF32x2;
default:
return op;
}
}
} // Anonymous namespace
void LowerFp16ToFp32(IR::Program& program) {
for (IR::Block* const block : program.blocks) {
for (IR::Inst& inst : block->Instructions()) {
inst.ReplaceOpcode(Replace(inst.GetOpcode()));
}
}
}
} // namespace Shader::Optimization

View File

@@ -0,0 +1,217 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <ranges>
#include <utility>
#include "shader_recompiler/exception.h"
#include "shader_recompiler/frontend/ir/basic_block.h"
#include "shader_recompiler/frontend/ir/ir_emitter.h"
#include "shader_recompiler/frontend/ir/program.h"
#include "shader_recompiler/frontend/ir/value.h"
#include "shader_recompiler/ir_opt/passes.h"
namespace Shader::Optimization {
namespace {
std::pair<IR::U32, IR::U32> Unpack(IR::IREmitter& ir, const IR::Value& packed) {
if (packed.IsImmediate()) {
const u64 value{packed.U64()};
return {
ir.Imm32(static_cast<u32>(value)),
ir.Imm32(static_cast<u32>(value >> 32)),
};
} else {
return std::pair<IR::U32, IR::U32>{
ir.CompositeExtract(packed, 0u),
ir.CompositeExtract(packed, 1u),
};
}
}
void IAdd64To32(IR::Block& block, IR::Inst& inst) {
if (inst.HasAssociatedPseudoOperation()) {
throw NotImplementedException("IAdd64 emulation with pseudo instructions");
}
IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
const auto [a_lo, a_hi]{Unpack(ir, inst.Arg(0))};
const auto [b_lo, b_hi]{Unpack(ir, inst.Arg(1))};
const IR::U32 ret_lo{ir.IAdd(a_lo, b_lo)};
const IR::U32 carry{ir.Select(ir.GetCarryFromOp(ret_lo), ir.Imm32(1u), ir.Imm32(0u))};
const IR::U32 ret_hi{ir.IAdd(ir.IAdd(a_hi, b_hi), carry)};
inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi));
}
void ISub64To32(IR::Block& block, IR::Inst& inst) {
if (inst.HasAssociatedPseudoOperation()) {
throw NotImplementedException("ISub64 emulation with pseudo instructions");
}
IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
const auto [a_lo, a_hi]{Unpack(ir, inst.Arg(0))};
const auto [b_lo, b_hi]{Unpack(ir, inst.Arg(1))};
const IR::U32 ret_lo{ir.ISub(a_lo, b_lo)};
const IR::U1 underflow{ir.IGreaterThan(ret_lo, a_lo, false)};
const IR::U32 underflow_bit{ir.Select(underflow, ir.Imm32(1u), ir.Imm32(0u))};
const IR::U32 ret_hi{ir.ISub(ir.ISub(a_hi, b_hi), underflow_bit)};
inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi));
}
void INeg64To32(IR::Block& block, IR::Inst& inst) {
if (inst.HasAssociatedPseudoOperation()) {
throw NotImplementedException("INeg64 emulation with pseudo instructions");
}
IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
auto [lo, hi]{Unpack(ir, inst.Arg(0))};
lo = ir.BitwiseNot(lo);
hi = ir.BitwiseNot(hi);
lo = ir.IAdd(lo, ir.Imm32(1));
const IR::U32 carry{ir.Select(ir.GetCarryFromOp(lo), ir.Imm32(1u), ir.Imm32(0u))};
hi = ir.IAdd(hi, carry);
inst.ReplaceUsesWith(ir.CompositeConstruct(lo, hi));
}
void ShiftLeftLogical64To32(IR::Block& block, IR::Inst& inst) {
if (inst.HasAssociatedPseudoOperation()) {
throw NotImplementedException("ShiftLeftLogical64 emulation with pseudo instructions");
}
IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
const auto [lo, hi]{Unpack(ir, inst.Arg(0))};
const IR::U32 shift{inst.Arg(1)};
const IR::U32 shifted_lo{ir.ShiftLeftLogical(lo, shift)};
const IR::U32 shifted_hi{ir.ShiftLeftLogical(hi, shift)};
const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))};
const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), true)};
const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))};
const IR::U32 long_ret_lo{ir.Imm32(0)};
const IR::U32 long_ret_hi{ir.ShiftLeftLogical(lo, inv_shift)};
const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)};
const IR::U32 lo_extract{ir.BitFieldExtract(lo, shift_complement, shift, false)};
const IR::U32 short_ret_lo{shifted_lo};
const IR::U32 short_ret_hi{ir.BitwiseOr(shifted_hi, lo_extract)};
const IR::U32 zero_ret_lo{lo};
const IR::U32 zero_ret_hi{hi};
const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)};
const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)};
const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)};
const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)};
inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi));
}
void ShiftRightLogical64To32(IR::Block& block, IR::Inst& inst) {
if (inst.HasAssociatedPseudoOperation()) {
throw NotImplementedException("ShiftRightLogical64 emulation with pseudo instructions");
}
IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
const auto [lo, hi]{Unpack(ir, inst.Arg(0))};
const IR::U32 shift{inst.Arg(1)};
const IR::U32 shifted_lo{ir.ShiftRightLogical(lo, shift)};
const IR::U32 shifted_hi{ir.ShiftRightLogical(hi, shift)};
const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))};
const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), true)};
const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))};
const IR::U32 long_ret_hi{ir.Imm32(0)};
const IR::U32 long_ret_lo{ir.ShiftRightLogical(hi, inv_shift)};
const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)};
const IR::U32 short_hi_extract{ir.BitFieldExtract(hi, ir.Imm32(0), shift)};
const IR::U32 short_ret_hi{shifted_hi};
const IR::U32 short_ret_lo{
ir.BitFieldInsert(shifted_lo, short_hi_extract, shift_complement, shift)};
const IR::U32 zero_ret_lo{lo};
const IR::U32 zero_ret_hi{hi};
const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)};
const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)};
const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)};
const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)};
inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi));
}
void ShiftRightArithmetic64To32(IR::Block& block, IR::Inst& inst) {
if (inst.HasAssociatedPseudoOperation()) {
throw NotImplementedException("ShiftRightArithmetic64 emulation with pseudo instructions");
}
IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
const auto [lo, hi]{Unpack(ir, inst.Arg(0))};
const IR::U32 shift{inst.Arg(1)};
const IR::U32 shifted_lo{ir.ShiftRightLogical(lo, shift)};
const IR::U32 shifted_hi{ir.ShiftRightArithmetic(hi, shift)};
const IR::U32 sign_extension{ir.ShiftRightArithmetic(hi, ir.Imm32(31))};
const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))};
const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), true)};
const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))};
const IR::U32 long_ret_hi{sign_extension};
const IR::U32 long_ret_lo{ir.ShiftRightArithmetic(hi, inv_shift)};
const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)};
const IR::U32 short_hi_extract(ir.BitFieldExtract(hi, ir.Imm32(0), shift));
const IR::U32 short_ret_hi{shifted_hi};
const IR::U32 short_ret_lo{
ir.BitFieldInsert(shifted_lo, short_hi_extract, shift_complement, shift)};
const IR::U32 zero_ret_lo{lo};
const IR::U32 zero_ret_hi{hi};
const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)};
const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)};
const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)};
const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)};
inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi));
}
void Lower(IR::Block& block, IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::PackUint2x32:
case IR::Opcode::UnpackUint2x32:
return inst.ReplaceOpcode(IR::Opcode::Identity);
case IR::Opcode::IAdd64:
return IAdd64To32(block, inst);
case IR::Opcode::ISub64:
return ISub64To32(block, inst);
case IR::Opcode::INeg64:
return INeg64To32(block, inst);
case IR::Opcode::ShiftLeftLogical64:
return ShiftLeftLogical64To32(block, inst);
case IR::Opcode::ShiftRightLogical64:
return ShiftRightLogical64To32(block, inst);
case IR::Opcode::ShiftRightArithmetic64:
return ShiftRightArithmetic64To32(block, inst);
default:
break;
}
}
} // Anonymous namespace
void LowerInt64ToInt32(IR::Program& program) {
for (IR::Block* const block : program.post_order_blocks | std::views::reverse) {
for (IR::Inst& inst : block->Instructions()) {
Lower(*block, inst);
}
}
}
} // namespace Shader::Optimization

View File

@@ -0,0 +1,32 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <span>
#include "shader_recompiler/environment.h"
#include "shader_recompiler/frontend/ir/basic_block.h"
#include "shader_recompiler/frontend/ir/program.h"
namespace Shader::Optimization {
void CollectShaderInfoPass(Environment& env, IR::Program& program);
void ConstantPropagationPass(IR::Program& program);
void DeadCodeEliminationPass(IR::Program& program);
void GlobalMemoryToStorageBufferPass(IR::Program& program);
void IdentityRemovalPass(IR::Program& program);
void LowerFp16ToFp32(IR::Program& program);
void LowerInt64ToInt32(IR::Program& program);
void SsaRewritePass(IR::Program& program);
void TexturePass(Environment& env, IR::Program& program);
void VerificationPass(const IR::Program& program);
// Dual Vertex
void VertexATransformPass(IR::Program& program);
void VertexBTransformPass(IR::Program& program);
void JoinTextureInfo(Info& base, Info& source);
void JoinStorageInfo(Info& base, Info& source);
} // namespace Shader::Optimization

View File

@@ -0,0 +1,381 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
// This file implements the SSA rewriting algorithm proposed in
//
// Simple and Efficient Construction of Static Single Assignment Form.
// Braun M., Buchwald S., Hack S., Leiba R., Mallon C., Zwinkau A. (2013)
// In: Jhala R., De Bosschere K. (eds)
// Compiler Construction. CC 2013.
// Lecture Notes in Computer Science, vol 7791.
// Springer, Berlin, Heidelberg
//
// https://link.springer.com/chapter/10.1007/978-3-642-37051-9_6
//
#include <ranges>
#include <span>
#include <variant>
#include <vector>
#include <boost/container/flat_map.hpp>
#include <boost/container/flat_set.hpp>
#include "shader_recompiler/frontend/ir/basic_block.h"
#include "shader_recompiler/frontend/ir/opcodes.h"
#include "shader_recompiler/frontend/ir/pred.h"
#include "shader_recompiler/frontend/ir/reg.h"
#include "shader_recompiler/frontend/ir/value.h"
#include "shader_recompiler/ir_opt/passes.h"
namespace Shader::Optimization {
namespace {
struct FlagTag {
auto operator<=>(const FlagTag&) const noexcept = default;
};
struct ZeroFlagTag : FlagTag {};
struct SignFlagTag : FlagTag {};
struct CarryFlagTag : FlagTag {};
struct OverflowFlagTag : FlagTag {};
struct GotoVariable : FlagTag {
GotoVariable() = default;
explicit GotoVariable(u32 index_) : index{index_} {}
auto operator<=>(const GotoVariable&) const noexcept = default;
u32 index;
};
struct IndirectBranchVariable {
auto operator<=>(const IndirectBranchVariable&) const noexcept = default;
};
using Variant = std::variant<IR::Reg, IR::Pred, ZeroFlagTag, SignFlagTag, CarryFlagTag,
OverflowFlagTag, GotoVariable, IndirectBranchVariable>;
using ValueMap = boost::container::flat_map<IR::Block*, IR::Value>;
struct DefTable {
const IR::Value& Def(IR::Block* block, IR::Reg variable) {
return block->SsaRegValue(variable);
}
void SetDef(IR::Block* block, IR::Reg variable, const IR::Value& value) {
block->SetSsaRegValue(variable, value);
}
const IR::Value& Def(IR::Block* block, IR::Pred variable) {
return preds[IR::PredIndex(variable)][block];
}
void SetDef(IR::Block* block, IR::Pred variable, const IR::Value& value) {
preds[IR::PredIndex(variable)].insert_or_assign(block, value);
}
const IR::Value& Def(IR::Block* block, GotoVariable variable) {
return goto_vars[variable.index][block];
}
void SetDef(IR::Block* block, GotoVariable variable, const IR::Value& value) {
goto_vars[variable.index].insert_or_assign(block, value);
}
const IR::Value& Def(IR::Block* block, IndirectBranchVariable) {
return indirect_branch_var[block];
}
void SetDef(IR::Block* block, IndirectBranchVariable, const IR::Value& value) {
indirect_branch_var.insert_or_assign(block, value);
}
const IR::Value& Def(IR::Block* block, ZeroFlagTag) {
return zero_flag[block];
}
void SetDef(IR::Block* block, ZeroFlagTag, const IR::Value& value) {
zero_flag.insert_or_assign(block, value);
}
const IR::Value& Def(IR::Block* block, SignFlagTag) {
return sign_flag[block];
}
void SetDef(IR::Block* block, SignFlagTag, const IR::Value& value) {
sign_flag.insert_or_assign(block, value);
}
const IR::Value& Def(IR::Block* block, CarryFlagTag) {
return carry_flag[block];
}
void SetDef(IR::Block* block, CarryFlagTag, const IR::Value& value) {
carry_flag.insert_or_assign(block, value);
}
const IR::Value& Def(IR::Block* block, OverflowFlagTag) {
return overflow_flag[block];
}
void SetDef(IR::Block* block, OverflowFlagTag, const IR::Value& value) {
overflow_flag.insert_or_assign(block, value);
}
std::array<ValueMap, IR::NUM_USER_PREDS> preds;
boost::container::flat_map<u32, ValueMap> goto_vars;
ValueMap indirect_branch_var;
ValueMap zero_flag;
ValueMap sign_flag;
ValueMap carry_flag;
ValueMap overflow_flag;
};
IR::Opcode UndefOpcode(IR::Reg) noexcept {
return IR::Opcode::UndefU32;
}
IR::Opcode UndefOpcode(IR::Pred) noexcept {
return IR::Opcode::UndefU1;
}
IR::Opcode UndefOpcode(const FlagTag&) noexcept {
return IR::Opcode::UndefU1;
}
IR::Opcode UndefOpcode(IndirectBranchVariable) noexcept {
return IR::Opcode::UndefU32;
}
enum class Status {
Start,
SetValue,
PreparePhiArgument,
PushPhiArgument,
};
template <typename Type>
struct ReadState {
ReadState(IR::Block* block_) : block{block_} {}
ReadState() = default;
IR::Block* block{};
IR::Value result{};
IR::Inst* phi{};
IR::Block* const* pred_it{};
IR::Block* const* pred_end{};
Status pc{Status::Start};
};
class Pass {
public:
template <typename Type>
void WriteVariable(Type variable, IR::Block* block, const IR::Value& value) {
current_def.SetDef(block, variable, value);
}
template <typename Type>
IR::Value ReadVariable(Type variable, IR::Block* root_block) {
boost::container::small_vector<ReadState<Type>, 64> stack{
ReadState<Type>(nullptr),
ReadState<Type>(root_block),
};
const auto prepare_phi_operand{[&] {
if (stack.back().pred_it == stack.back().pred_end) {
IR::Inst* const phi{stack.back().phi};
IR::Block* const block{stack.back().block};
const IR::Value result{TryRemoveTrivialPhi(*phi, block, UndefOpcode(variable))};
stack.pop_back();
stack.back().result = result;
WriteVariable(variable, block, result);
} else {
IR::Block* const imm_pred{*stack.back().pred_it};
stack.back().pc = Status::PushPhiArgument;
stack.emplace_back(imm_pred);
}
}};
do {
IR::Block* const block{stack.back().block};
switch (stack.back().pc) {
case Status::Start: {
if (const IR::Value& def = current_def.Def(block, variable); !def.IsEmpty()) {
stack.back().result = def;
} else if (!block->IsSsaSealed()) {
// Incomplete CFG
IR::Inst* phi{&*block->PrependNewInst(block->begin(), IR::Opcode::Phi)};
phi->SetFlags(IR::TypeOf(UndefOpcode(variable)));
incomplete_phis[block].insert_or_assign(variable, phi);
stack.back().result = IR::Value{&*phi};
} else if (const std::span imm_preds = block->ImmPredecessors();
imm_preds.size() == 1) {
// Optimize the common case of one predecessor: no phi needed
stack.back().pc = Status::SetValue;
stack.emplace_back(imm_preds.front());
break;
} else {
// Break potential cycles with operandless phi
IR::Inst* const phi{&*block->PrependNewInst(block->begin(), IR::Opcode::Phi)};
phi->SetFlags(IR::TypeOf(UndefOpcode(variable)));
WriteVariable(variable, block, IR::Value{phi});
stack.back().phi = phi;
stack.back().pred_it = imm_preds.data();
stack.back().pred_end = imm_preds.data() + imm_preds.size();
prepare_phi_operand();
break;
}
}
[[fallthrough]];
case Status::SetValue: {
const IR::Value result{stack.back().result};
WriteVariable(variable, block, result);
stack.pop_back();
stack.back().result = result;
break;
}
case Status::PushPhiArgument: {
IR::Inst* const phi{stack.back().phi};
phi->AddPhiOperand(*stack.back().pred_it, stack.back().result);
++stack.back().pred_it;
}
[[fallthrough]];
case Status::PreparePhiArgument:
prepare_phi_operand();
break;
}
} while (stack.size() > 1);
return stack.back().result;
}
void SealBlock(IR::Block* block) {
const auto it{incomplete_phis.find(block)};
if (it != incomplete_phis.end()) {
for (auto& [variant, phi] : it->second) {
std::visit([&](auto& variable) { AddPhiOperands(variable, *phi, block); }, variant);
}
}
block->SsaSeal();
}
private:
template <typename Type>
IR::Value AddPhiOperands(Type variable, IR::Inst& phi, IR::Block* block) {
for (IR::Block* const imm_pred : block->ImmPredecessors()) {
phi.AddPhiOperand(imm_pred, ReadVariable(variable, imm_pred));
}
return TryRemoveTrivialPhi(phi, block, UndefOpcode(variable));
}
IR::Value TryRemoveTrivialPhi(IR::Inst& phi, IR::Block* block, IR::Opcode undef_opcode) {
IR::Value same;
const size_t num_args{phi.NumArgs()};
for (size_t arg_index = 0; arg_index < num_args; ++arg_index) {
const IR::Value& op{phi.Arg(arg_index)};
if (op.Resolve() == same.Resolve() || op == IR::Value{&phi}) {
// Unique value or self-reference
continue;
}
if (!same.IsEmpty()) {
// The phi merges at least two values: not trivial
return IR::Value{&phi};
}
same = op;
}
// Remove the phi node from the block, it will be reinserted
IR::Block::InstructionList& list{block->Instructions()};
list.erase(IR::Block::InstructionList::s_iterator_to(phi));
// Find the first non-phi instruction and use it as an insertion point
IR::Block::iterator reinsert_point{std::ranges::find_if_not(list, IR::IsPhi)};
if (same.IsEmpty()) {
// The phi is unreachable or in the start block
// Insert an undefined instruction and make it the phi node replacement
// The "phi" node reinsertion point is specified after this instruction
reinsert_point = block->PrependNewInst(reinsert_point, undef_opcode);
same = IR::Value{&*reinsert_point};
++reinsert_point;
}
// Reinsert the phi node and reroute all its uses to the "same" value
list.insert(reinsert_point, phi);
phi.ReplaceUsesWith(same);
// TODO: Try to recursively remove all phi users, which might have become trivial
return same;
}
boost::container::flat_map<IR::Block*, boost::container::flat_map<Variant, IR::Inst*>>
incomplete_phis;
DefTable current_def;
};
void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::SetRegister:
if (const IR::Reg reg{inst.Arg(0).Reg()}; reg != IR::Reg::RZ) {
pass.WriteVariable(reg, block, inst.Arg(1));
}
break;
case IR::Opcode::SetPred:
if (const IR::Pred pred{inst.Arg(0).Pred()}; pred != IR::Pred::PT) {
pass.WriteVariable(pred, block, inst.Arg(1));
}
break;
case IR::Opcode::SetGotoVariable:
pass.WriteVariable(GotoVariable{inst.Arg(0).U32()}, block, inst.Arg(1));
break;
case IR::Opcode::SetIndirectBranchVariable:
pass.WriteVariable(IndirectBranchVariable{}, block, inst.Arg(0));
break;
case IR::Opcode::SetZFlag:
pass.WriteVariable(ZeroFlagTag{}, block, inst.Arg(0));
break;
case IR::Opcode::SetSFlag:
pass.WriteVariable(SignFlagTag{}, block, inst.Arg(0));
break;
case IR::Opcode::SetCFlag:
pass.WriteVariable(CarryFlagTag{}, block, inst.Arg(0));
break;
case IR::Opcode::SetOFlag:
pass.WriteVariable(OverflowFlagTag{}, block, inst.Arg(0));
break;
case IR::Opcode::GetRegister:
if (const IR::Reg reg{inst.Arg(0).Reg()}; reg != IR::Reg::RZ) {
inst.ReplaceUsesWith(pass.ReadVariable(reg, block));
}
break;
case IR::Opcode::GetPred:
if (const IR::Pred pred{inst.Arg(0).Pred()}; pred != IR::Pred::PT) {
inst.ReplaceUsesWith(pass.ReadVariable(pred, block));
}
break;
case IR::Opcode::GetGotoVariable:
inst.ReplaceUsesWith(pass.ReadVariable(GotoVariable{inst.Arg(0).U32()}, block));
break;
case IR::Opcode::GetIndirectBranchVariable:
inst.ReplaceUsesWith(pass.ReadVariable(IndirectBranchVariable{}, block));
break;
case IR::Opcode::GetZFlag:
inst.ReplaceUsesWith(pass.ReadVariable(ZeroFlagTag{}, block));
break;
case IR::Opcode::GetSFlag:
inst.ReplaceUsesWith(pass.ReadVariable(SignFlagTag{}, block));
break;
case IR::Opcode::GetCFlag:
inst.ReplaceUsesWith(pass.ReadVariable(CarryFlagTag{}, block));
break;
case IR::Opcode::GetOFlag:
inst.ReplaceUsesWith(pass.ReadVariable(OverflowFlagTag{}, block));
break;
default:
break;
}
}
void VisitBlock(Pass& pass, IR::Block* block) {
for (IR::Inst& inst : block->Instructions()) {
VisitInst(pass, block, inst);
}
pass.SealBlock(block);
}
} // Anonymous namespace
void SsaRewritePass(IR::Program& program) {
Pass pass;
for (IR::Block* const block : program.post_order_blocks | std::views::reverse) {
VisitBlock(pass, block);
}
}
} // namespace Shader::Optimization

View File

@@ -0,0 +1,523 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include <bit>
#include <optional>
#include <boost/container/small_vector.hpp>
#include "shader_recompiler/environment.h"
#include "shader_recompiler/frontend/ir/basic_block.h"
#include "shader_recompiler/frontend/ir/breadth_first_search.h"
#include "shader_recompiler/frontend/ir/ir_emitter.h"
#include "shader_recompiler/ir_opt/passes.h"
#include "shader_recompiler/shader_info.h"
namespace Shader::Optimization {
namespace {
struct ConstBufferAddr {
u32 index;
u32 offset;
u32 secondary_index;
u32 secondary_offset;
IR::U32 dynamic_offset;
u32 count;
bool has_secondary;
};
struct TextureInst {
ConstBufferAddr cbuf;
IR::Inst* inst;
IR::Block* block;
};
using TextureInstVector = boost::container::small_vector<TextureInst, 24>;
constexpr u32 DESCRIPTOR_SIZE = 8;
constexpr u32 DESCRIPTOR_SIZE_SHIFT = static_cast<u32>(std::countr_zero(DESCRIPTOR_SIZE));
IR::Opcode IndexedInstruction(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::BindlessImageSampleImplicitLod:
case IR::Opcode::BoundImageSampleImplicitLod:
return IR::Opcode::ImageSampleImplicitLod;
case IR::Opcode::BoundImageSampleExplicitLod:
case IR::Opcode::BindlessImageSampleExplicitLod:
return IR::Opcode::ImageSampleExplicitLod;
case IR::Opcode::BoundImageSampleDrefImplicitLod:
case IR::Opcode::BindlessImageSampleDrefImplicitLod:
return IR::Opcode::ImageSampleDrefImplicitLod;
case IR::Opcode::BoundImageSampleDrefExplicitLod:
case IR::Opcode::BindlessImageSampleDrefExplicitLod:
return IR::Opcode::ImageSampleDrefExplicitLod;
case IR::Opcode::BindlessImageGather:
case IR::Opcode::BoundImageGather:
return IR::Opcode::ImageGather;
case IR::Opcode::BindlessImageGatherDref:
case IR::Opcode::BoundImageGatherDref:
return IR::Opcode::ImageGatherDref;
case IR::Opcode::BindlessImageFetch:
case IR::Opcode::BoundImageFetch:
return IR::Opcode::ImageFetch;
case IR::Opcode::BoundImageQueryDimensions:
case IR::Opcode::BindlessImageQueryDimensions:
return IR::Opcode::ImageQueryDimensions;
case IR::Opcode::BoundImageQueryLod:
case IR::Opcode::BindlessImageQueryLod:
return IR::Opcode::ImageQueryLod;
case IR::Opcode::BoundImageGradient:
case IR::Opcode::BindlessImageGradient:
return IR::Opcode::ImageGradient;
case IR::Opcode::BoundImageRead:
case IR::Opcode::BindlessImageRead:
return IR::Opcode::ImageRead;
case IR::Opcode::BoundImageWrite:
case IR::Opcode::BindlessImageWrite:
return IR::Opcode::ImageWrite;
case IR::Opcode::BoundImageAtomicIAdd32:
case IR::Opcode::BindlessImageAtomicIAdd32:
return IR::Opcode::ImageAtomicIAdd32;
case IR::Opcode::BoundImageAtomicSMin32:
case IR::Opcode::BindlessImageAtomicSMin32:
return IR::Opcode::ImageAtomicSMin32;
case IR::Opcode::BoundImageAtomicUMin32:
case IR::Opcode::BindlessImageAtomicUMin32:
return IR::Opcode::ImageAtomicUMin32;
case IR::Opcode::BoundImageAtomicSMax32:
case IR::Opcode::BindlessImageAtomicSMax32:
return IR::Opcode::ImageAtomicSMax32;
case IR::Opcode::BoundImageAtomicUMax32:
case IR::Opcode::BindlessImageAtomicUMax32:
return IR::Opcode::ImageAtomicUMax32;
case IR::Opcode::BoundImageAtomicInc32:
case IR::Opcode::BindlessImageAtomicInc32:
return IR::Opcode::ImageAtomicInc32;
case IR::Opcode::BoundImageAtomicDec32:
case IR::Opcode::BindlessImageAtomicDec32:
return IR::Opcode::ImageAtomicDec32;
case IR::Opcode::BoundImageAtomicAnd32:
case IR::Opcode::BindlessImageAtomicAnd32:
return IR::Opcode::ImageAtomicAnd32;
case IR::Opcode::BoundImageAtomicOr32:
case IR::Opcode::BindlessImageAtomicOr32:
return IR::Opcode::ImageAtomicOr32;
case IR::Opcode::BoundImageAtomicXor32:
case IR::Opcode::BindlessImageAtomicXor32:
return IR::Opcode::ImageAtomicXor32;
case IR::Opcode::BoundImageAtomicExchange32:
case IR::Opcode::BindlessImageAtomicExchange32:
return IR::Opcode::ImageAtomicExchange32;
default:
return IR::Opcode::Void;
}
}
bool IsBindless(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::BindlessImageSampleImplicitLod:
case IR::Opcode::BindlessImageSampleExplicitLod:
case IR::Opcode::BindlessImageSampleDrefImplicitLod:
case IR::Opcode::BindlessImageSampleDrefExplicitLod:
case IR::Opcode::BindlessImageGather:
case IR::Opcode::BindlessImageGatherDref:
case IR::Opcode::BindlessImageFetch:
case IR::Opcode::BindlessImageQueryDimensions:
case IR::Opcode::BindlessImageQueryLod:
case IR::Opcode::BindlessImageGradient:
case IR::Opcode::BindlessImageRead:
case IR::Opcode::BindlessImageWrite:
case IR::Opcode::BindlessImageAtomicIAdd32:
case IR::Opcode::BindlessImageAtomicSMin32:
case IR::Opcode::BindlessImageAtomicUMin32:
case IR::Opcode::BindlessImageAtomicSMax32:
case IR::Opcode::BindlessImageAtomicUMax32:
case IR::Opcode::BindlessImageAtomicInc32:
case IR::Opcode::BindlessImageAtomicDec32:
case IR::Opcode::BindlessImageAtomicAnd32:
case IR::Opcode::BindlessImageAtomicOr32:
case IR::Opcode::BindlessImageAtomicXor32:
case IR::Opcode::BindlessImageAtomicExchange32:
return true;
case IR::Opcode::BoundImageSampleImplicitLod:
case IR::Opcode::BoundImageSampleExplicitLod:
case IR::Opcode::BoundImageSampleDrefImplicitLod:
case IR::Opcode::BoundImageSampleDrefExplicitLod:
case IR::Opcode::BoundImageGather:
case IR::Opcode::BoundImageGatherDref:
case IR::Opcode::BoundImageFetch:
case IR::Opcode::BoundImageQueryDimensions:
case IR::Opcode::BoundImageQueryLod:
case IR::Opcode::BoundImageGradient:
case IR::Opcode::BoundImageRead:
case IR::Opcode::BoundImageWrite:
case IR::Opcode::BoundImageAtomicIAdd32:
case IR::Opcode::BoundImageAtomicSMin32:
case IR::Opcode::BoundImageAtomicUMin32:
case IR::Opcode::BoundImageAtomicSMax32:
case IR::Opcode::BoundImageAtomicUMax32:
case IR::Opcode::BoundImageAtomicInc32:
case IR::Opcode::BoundImageAtomicDec32:
case IR::Opcode::BoundImageAtomicAnd32:
case IR::Opcode::BoundImageAtomicOr32:
case IR::Opcode::BoundImageAtomicXor32:
case IR::Opcode::BoundImageAtomicExchange32:
return false;
default:
throw InvalidArgument("Invalid opcode {}", inst.GetOpcode());
}
}
bool IsTextureInstruction(const IR::Inst& inst) {
return IndexedInstruction(inst) != IR::Opcode::Void;
}
std::optional<ConstBufferAddr> TryGetConstBuffer(const IR::Inst* inst);
std::optional<ConstBufferAddr> Track(const IR::Value& value) {
return IR::BreadthFirstSearch(value, TryGetConstBuffer);
}
std::optional<ConstBufferAddr> TryGetConstBuffer(const IR::Inst* inst) {
switch (inst->GetOpcode()) {
default:
return std::nullopt;
case IR::Opcode::BitwiseOr32: {
std::optional lhs{Track(inst->Arg(0))};
std::optional rhs{Track(inst->Arg(1))};
if (!lhs || !rhs) {
return std::nullopt;
}
if (lhs->has_secondary || rhs->has_secondary) {
return std::nullopt;
}
if (lhs->count > 1 || rhs->count > 1) {
return std::nullopt;
}
if (lhs->index > rhs->index || lhs->offset > rhs->offset) {
std::swap(lhs, rhs);
}
return ConstBufferAddr{
.index = lhs->index,
.offset = lhs->offset,
.secondary_index = rhs->index,
.secondary_offset = rhs->offset,
.dynamic_offset = {},
.count = 1,
.has_secondary = true,
};
}
case IR::Opcode::GetCbufU32x2:
case IR::Opcode::GetCbufU32:
break;
}
const IR::Value index{inst->Arg(0)};
const IR::Value offset{inst->Arg(1)};
if (!index.IsImmediate()) {
// Reading a bindless texture from variable indices is valid
// but not supported here at the moment
return std::nullopt;
}
if (offset.IsImmediate()) {
return ConstBufferAddr{
.index = index.U32(),
.offset = offset.U32(),
.secondary_index = 0,
.secondary_offset = 0,
.dynamic_offset = {},
.count = 1,
.has_secondary = false,
};
}
IR::Inst* const offset_inst{offset.InstRecursive()};
if (offset_inst->GetOpcode() != IR::Opcode::IAdd32) {
return std::nullopt;
}
u32 base_offset{};
IR::U32 dynamic_offset;
if (offset_inst->Arg(0).IsImmediate()) {
base_offset = offset_inst->Arg(0).U32();
dynamic_offset = IR::U32{offset_inst->Arg(1)};
} else if (offset_inst->Arg(1).IsImmediate()) {
base_offset = offset_inst->Arg(1).U32();
dynamic_offset = IR::U32{offset_inst->Arg(0)};
} else {
return std::nullopt;
}
return ConstBufferAddr{
.index = index.U32(),
.offset = base_offset,
.secondary_index = 0,
.secondary_offset = 0,
.dynamic_offset = dynamic_offset,
.count = 8,
.has_secondary = false,
};
}
TextureInst MakeInst(Environment& env, IR::Block* block, IR::Inst& inst) {
ConstBufferAddr addr;
if (IsBindless(inst)) {
const std::optional<ConstBufferAddr> track_addr{Track(inst.Arg(0))};
if (!track_addr) {
throw NotImplementedException("Failed to track bindless texture constant buffer");
}
addr = *track_addr;
} else {
addr = ConstBufferAddr{
.index = env.TextureBoundBuffer(),
.offset = inst.Arg(0).U32(),
.secondary_index = 0,
.secondary_offset = 0,
.dynamic_offset = {},
.count = 1,
.has_secondary = false,
};
}
return TextureInst{
.cbuf = addr,
.inst = &inst,
.block = block,
};
}
TextureType ReadTextureType(Environment& env, const ConstBufferAddr& cbuf) {
const u32 secondary_index{cbuf.has_secondary ? cbuf.secondary_index : cbuf.index};
const u32 secondary_offset{cbuf.has_secondary ? cbuf.secondary_offset : cbuf.offset};
const u32 lhs_raw{env.ReadCbufValue(cbuf.index, cbuf.offset)};
const u32 rhs_raw{env.ReadCbufValue(secondary_index, secondary_offset)};
return env.ReadTextureType(lhs_raw | rhs_raw);
}
class Descriptors {
public:
explicit Descriptors(TextureBufferDescriptors& texture_buffer_descriptors_,
ImageBufferDescriptors& image_buffer_descriptors_,
TextureDescriptors& texture_descriptors_,
ImageDescriptors& image_descriptors_)
: texture_buffer_descriptors{texture_buffer_descriptors_},
image_buffer_descriptors{image_buffer_descriptors_},
texture_descriptors{texture_descriptors_}, image_descriptors{image_descriptors_} {}
u32 Add(const TextureBufferDescriptor& desc) {
return Add(texture_buffer_descriptors, desc, [&desc](const auto& existing) {
return desc.cbuf_index == existing.cbuf_index &&
desc.cbuf_offset == existing.cbuf_offset &&
desc.secondary_cbuf_index == existing.secondary_cbuf_index &&
desc.secondary_cbuf_offset == existing.secondary_cbuf_offset &&
desc.count == existing.count && desc.size_shift == existing.size_shift &&
desc.has_secondary == existing.has_secondary;
});
}
u32 Add(const ImageBufferDescriptor& desc) {
const u32 index{Add(image_buffer_descriptors, desc, [&desc](const auto& existing) {
return desc.format == existing.format && desc.cbuf_index == existing.cbuf_index &&
desc.cbuf_offset == existing.cbuf_offset && desc.count == existing.count &&
desc.size_shift == existing.size_shift;
})};
image_buffer_descriptors[index].is_written |= desc.is_written;
image_buffer_descriptors[index].is_read |= desc.is_read;
return index;
}
u32 Add(const TextureDescriptor& desc) {
return Add(texture_descriptors, desc, [&desc](const auto& existing) {
return desc.type == existing.type && desc.is_depth == existing.is_depth &&
desc.has_secondary == existing.has_secondary &&
desc.cbuf_index == existing.cbuf_index &&
desc.cbuf_offset == existing.cbuf_offset &&
desc.secondary_cbuf_index == existing.secondary_cbuf_index &&
desc.secondary_cbuf_offset == existing.secondary_cbuf_offset &&
desc.count == existing.count && desc.size_shift == existing.size_shift;
});
}
u32 Add(const ImageDescriptor& desc) {
const u32 index{Add(image_descriptors, desc, [&desc](const auto& existing) {
return desc.type == existing.type && desc.format == existing.format &&
desc.cbuf_index == existing.cbuf_index &&
desc.cbuf_offset == existing.cbuf_offset && desc.count == existing.count &&
desc.size_shift == existing.size_shift;
})};
image_descriptors[index].is_written |= desc.is_written;
image_descriptors[index].is_read |= desc.is_read;
return index;
}
private:
template <typename Descriptors, typename Descriptor, typename Func>
static u32 Add(Descriptors& descriptors, const Descriptor& desc, Func&& pred) {
// TODO: Handle arrays
const auto it{std::ranges::find_if(descriptors, pred)};
if (it != descriptors.end()) {
return static_cast<u32>(std::distance(descriptors.begin(), it));
}
descriptors.push_back(desc);
return static_cast<u32>(descriptors.size()) - 1;
}
TextureBufferDescriptors& texture_buffer_descriptors;
ImageBufferDescriptors& image_buffer_descriptors;
TextureDescriptors& texture_descriptors;
ImageDescriptors& image_descriptors;
};
} // Anonymous namespace
void TexturePass(Environment& env, IR::Program& program) {
TextureInstVector to_replace;
for (IR::Block* const block : program.post_order_blocks) {
for (IR::Inst& inst : block->Instructions()) {
if (!IsTextureInstruction(inst)) {
continue;
}
to_replace.push_back(MakeInst(env, block, inst));
}
}
// Sort instructions to visit textures by constant buffer index, then by offset
std::ranges::sort(to_replace, [](const auto& lhs, const auto& rhs) {
return lhs.cbuf.offset < rhs.cbuf.offset;
});
std::stable_sort(to_replace.begin(), to_replace.end(), [](const auto& lhs, const auto& rhs) {
return lhs.cbuf.index < rhs.cbuf.index;
});
Descriptors descriptors{
program.info.texture_buffer_descriptors,
program.info.image_buffer_descriptors,
program.info.texture_descriptors,
program.info.image_descriptors,
};
for (TextureInst& texture_inst : to_replace) {
// TODO: Handle arrays
IR::Inst* const inst{texture_inst.inst};
inst->ReplaceOpcode(IndexedInstruction(*inst));
const auto& cbuf{texture_inst.cbuf};
auto flags{inst->Flags<IR::TextureInstInfo>()};
switch (inst->GetOpcode()) {
case IR::Opcode::ImageQueryDimensions:
flags.type.Assign(ReadTextureType(env, cbuf));
inst->SetFlags(flags);
break;
case IR::Opcode::ImageFetch:
if (flags.type != TextureType::Color1D) {
break;
}
if (ReadTextureType(env, cbuf) == TextureType::Buffer) {
// Replace with the bound texture type only when it's a texture buffer
// If the instruction is 1D and the bound type is 2D, don't change the code and let
// the rasterizer robustness handle it
// This happens on Fire Emblem: Three Houses
flags.type.Assign(TextureType::Buffer);
}
break;
default:
break;
}
u32 index;
switch (inst->GetOpcode()) {
case IR::Opcode::ImageRead:
case IR::Opcode::ImageAtomicIAdd32:
case IR::Opcode::ImageAtomicSMin32:
case IR::Opcode::ImageAtomicUMin32:
case IR::Opcode::ImageAtomicSMax32:
case IR::Opcode::ImageAtomicUMax32:
case IR::Opcode::ImageAtomicInc32:
case IR::Opcode::ImageAtomicDec32:
case IR::Opcode::ImageAtomicAnd32:
case IR::Opcode::ImageAtomicOr32:
case IR::Opcode::ImageAtomicXor32:
case IR::Opcode::ImageAtomicExchange32:
case IR::Opcode::ImageWrite: {
if (cbuf.has_secondary) {
throw NotImplementedException("Unexpected separate sampler");
}
const bool is_written{inst->GetOpcode() != IR::Opcode::ImageRead};
const bool is_read{inst->GetOpcode() != IR::Opcode::ImageWrite};
if (flags.type == TextureType::Buffer) {
index = descriptors.Add(ImageBufferDescriptor{
.format = flags.image_format,
.is_written = is_written,
.is_read = is_read,
.cbuf_index = cbuf.index,
.cbuf_offset = cbuf.offset,
.count = cbuf.count,
.size_shift = DESCRIPTOR_SIZE_SHIFT,
});
} else {
index = descriptors.Add(ImageDescriptor{
.type = flags.type,
.format = flags.image_format,
.is_written = is_written,
.is_read = is_read,
.cbuf_index = cbuf.index,
.cbuf_offset = cbuf.offset,
.count = cbuf.count,
.size_shift = DESCRIPTOR_SIZE_SHIFT,
});
}
break;
}
default:
if (flags.type == TextureType::Buffer) {
index = descriptors.Add(TextureBufferDescriptor{
.has_secondary = cbuf.has_secondary,
.cbuf_index = cbuf.index,
.cbuf_offset = cbuf.offset,
.secondary_cbuf_index = cbuf.secondary_index,
.secondary_cbuf_offset = cbuf.secondary_offset,
.count = cbuf.count,
.size_shift = DESCRIPTOR_SIZE_SHIFT,
});
} else {
index = descriptors.Add(TextureDescriptor{
.type = flags.type,
.is_depth = flags.is_depth != 0,
.has_secondary = cbuf.has_secondary,
.cbuf_index = cbuf.index,
.cbuf_offset = cbuf.offset,
.secondary_cbuf_index = cbuf.secondary_index,
.secondary_cbuf_offset = cbuf.secondary_offset,
.count = cbuf.count,
.size_shift = DESCRIPTOR_SIZE_SHIFT,
});
}
break;
}
flags.descriptor_index.Assign(index);
inst->SetFlags(flags);
if (cbuf.count > 1) {
const auto insert_point{IR::Block::InstructionList::s_iterator_to(*inst)};
IR::IREmitter ir{*texture_inst.block, insert_point};
const IR::U32 shift{ir.Imm32(std::countr_zero(DESCRIPTOR_SIZE))};
inst->SetArg(0, ir.ShiftRightArithmetic(cbuf.dynamic_offset, shift));
} else {
inst->SetArg(0, IR::Value{});
}
}
}
void JoinTextureInfo(Info& base, Info& source) {
Descriptors descriptors{
base.texture_buffer_descriptors,
base.image_buffer_descriptors,
base.texture_descriptors,
base.image_descriptors,
};
for (auto& desc : source.texture_buffer_descriptors) {
descriptors.Add(desc);
}
for (auto& desc : source.image_buffer_descriptors) {
descriptors.Add(desc);
}
for (auto& desc : source.texture_descriptors) {
descriptors.Add(desc);
}
for (auto& desc : source.image_descriptors) {
descriptors.Add(desc);
}
}
} // namespace Shader::Optimization

View File

@@ -0,0 +1,98 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <map>
#include <set>
#include "shader_recompiler/exception.h"
#include "shader_recompiler/frontend/ir/basic_block.h"
#include "shader_recompiler/frontend/ir/value.h"
#include "shader_recompiler/ir_opt/passes.h"
namespace Shader::Optimization {
static void ValidateTypes(const IR::Program& program) {
for (const auto& block : program.blocks) {
for (const IR::Inst& inst : *block) {
if (inst.GetOpcode() == IR::Opcode::Phi) {
// Skip validation on phi nodes
continue;
}
const size_t num_args{inst.NumArgs()};
for (size_t i = 0; i < num_args; ++i) {
const IR::Type t1{inst.Arg(i).Type()};
const IR::Type t2{IR::ArgTypeOf(inst.GetOpcode(), i)};
if (!IR::AreTypesCompatible(t1, t2)) {
throw LogicError("Invalid types in block:\n{}", IR::DumpBlock(*block));
}
}
}
}
}
static void ValidateUses(const IR::Program& program) {
std::map<IR::Inst*, int> actual_uses;
for (const auto& block : program.blocks) {
for (const IR::Inst& inst : *block) {
const size_t num_args{inst.NumArgs()};
for (size_t i = 0; i < num_args; ++i) {
const IR::Value arg{inst.Arg(i)};
if (!arg.IsImmediate()) {
++actual_uses[arg.Inst()];
}
}
}
}
for (const auto [inst, uses] : actual_uses) {
if (inst->UseCount() != uses) {
throw LogicError("Invalid uses in block: {}", IR::DumpProgram(program));
}
}
}
static void ValidateForwardDeclarations(const IR::Program& program) {
std::set<const IR::Inst*> definitions;
for (const IR::Block* const block : program.blocks) {
for (const IR::Inst& inst : *block) {
definitions.emplace(&inst);
if (inst.GetOpcode() == IR::Opcode::Phi) {
// Phi nodes can have forward declarations
continue;
}
const size_t num_args{inst.NumArgs()};
for (size_t arg = 0; arg < num_args; ++arg) {
if (inst.Arg(arg).IsImmediate()) {
continue;
}
if (!definitions.contains(inst.Arg(arg).Inst())) {
throw LogicError("Forward declaration in block: {}", IR::DumpBlock(*block));
}
}
}
}
}
static void ValidatePhiNodes(const IR::Program& program) {
for (const IR::Block* const block : program.blocks) {
bool no_more_phis{false};
for (const IR::Inst& inst : *block) {
if (inst.GetOpcode() == IR::Opcode::Phi) {
if (no_more_phis) {
throw LogicError("Interleaved phi nodes: {}", IR::DumpBlock(*block));
}
} else {
no_more_phis = true;
}
}
}
}
void VerificationPass(const IR::Program& program) {
ValidateTypes(program);
ValidateUses(program);
ValidateForwardDeclarations(program);
ValidatePhiNodes(program);
}
} // namespace Shader::Optimization