early-access version 1255

2020-12-28 15:15:37 +00:00
parent 84b39492d1
commit 78b48028e1
6254 changed files with 1868140 additions and 0 deletions
--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -0,0 +1,103 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <type_traits>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/guest_driver.h"
+#include "video_core/textures/texture.h"
+
+namespace Tegra::Engines {
+
+struct SamplerDescriptor {
+    union {
+        u32 raw = 0;
+        BitField<0, 2, Tegra::Shader::TextureType> texture_type;
+        BitField<2, 3, Tegra::Texture::ComponentType> r_type;
+        BitField<5, 1, u32> is_array;
+        BitField<6, 1, u32> is_buffer;
+        BitField<7, 1, u32> is_shadow;
+        BitField<8, 3, Tegra::Texture::ComponentType> g_type;
+        BitField<11, 3, Tegra::Texture::ComponentType> b_type;
+        BitField<14, 3, Tegra::Texture::ComponentType> a_type;
+        BitField<17, 7, Tegra::Texture::TextureFormat> format;
+    };
+
+    /// Descriptors compare equal when their packed raw words match.
+    bool operator==(const SamplerDescriptor& rhs) const noexcept {
+        return rhs.raw == raw;
+    }
+
+    bool operator!=(const SamplerDescriptor& rhs) const noexcept {
+        return raw != rhs.raw;
+    }
+
+    /// Builds a descriptor from a TIC entry: copies the format and component types, then maps the
+    /// TIC texture type to a shader texture type plus array/buffer flags (2D when unrecognized).
+    static SamplerDescriptor FromTIC(const Tegra::Texture::TICEntry& tic) {
+        using ShaderTextureType = Tegra::Shader::TextureType;
+        using TICTextureType = Tegra::Texture::TextureType;
+        SamplerDescriptor descriptor;
+        descriptor.format.Assign(tic.format.Value());
+        descriptor.r_type.Assign(tic.r_type.Value());
+        descriptor.g_type.Assign(tic.g_type.Value());
+        descriptor.b_type.Assign(tic.b_type.Value());
+        descriptor.a_type.Assign(tic.a_type.Value());
+
+        switch (tic.texture_type.Value()) {
+        case TICTextureType::Texture1D:
+            descriptor.texture_type.Assign(ShaderTextureType::Texture1D);
+            break;
+        case TICTextureType::Texture3D:
+            descriptor.texture_type.Assign(ShaderTextureType::Texture3D);
+            break;
+        case TICTextureType::TextureCubemap:
+            descriptor.texture_type.Assign(ShaderTextureType::TextureCube);
+            break;
+        case TICTextureType::Texture1DArray:
+            descriptor.texture_type.Assign(ShaderTextureType::Texture1D);
+            descriptor.is_array.Assign(1);
+            break;
+        case TICTextureType::Texture2DArray:
+            descriptor.texture_type.Assign(ShaderTextureType::Texture2D);
+            descriptor.is_array.Assign(1);
+            break;
+        case TICTextureType::Texture1DBuffer:
+            descriptor.texture_type.Assign(ShaderTextureType::Texture1D);
+            descriptor.is_buffer.Assign(1);
+            break;
+        case TICTextureType::TextureCubeArray:
+            descriptor.texture_type.Assign(ShaderTextureType::TextureCube);
+            descriptor.is_array.Assign(1);
+            break;
+        case TICTextureType::Texture2D:
+        case TICTextureType::Texture2DNoMipmap:
+        default:
+            descriptor.texture_type.Assign(ShaderTextureType::Texture2D);
+            break;
+        }
+        return descriptor;
+    }
+};
+static_assert(std::is_trivially_copyable_v<SamplerDescriptor>);
+/// Abstract query interface over an engine's const buffers, samplers and guest driver profile.
+class ConstBufferEngineInterface {
+public:
+    virtual ~ConstBufferEngineInterface() = default;
+    virtual u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const = 0;
+    virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0;
+    virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                                    u64 offset) const = 0;
+    virtual SamplerDescriptor AccessSampler(u32 handle) const = 0;
+    virtual u32 GetBoundBuffer() const = 0;
+    // Mutable and read-only accessors for the guest driver profile.
+    virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0;
+    virtual const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const = 0;
+};
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/const_buffer_info.h
+++ b/src/video_core/engines/const_buffer_info.h
@@ -0,0 +1,17 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+/// Describes one const buffer: its GPU virtual address, its size and whether it is enabled.
+struct ConstBufferInfo {
+    GPUVAddr address; // No default initializers here: value-initialize before reading fields.
+    u32 size;         // NOTE(review): presumably a byte count - confirm against the writer.
+    bool enabled;
+};
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/engine_interface.h
+++ b/src/video_core/engines/engine_interface.h
@@ -0,0 +1,22 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <type_traits>
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+
+class EngineInterface {
+public:
+    virtual ~EngineInterface() = default; ///< Lets engines be destroyed through this base.
+    /// Write the value to the register identified by method.
+    virtual void CallMethod(u32 method, u32 method_argument, bool is_last_call) = 0;
+    /// Write multiple values to the register identified by method.
+    virtual void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                                 u32 methods_pending) = 0;
+};
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -0,0 +1,52 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+
+#include "common/assert.h"
+#include "video_core/engines/engine_upload.h"
+#include "video_core/memory_manager.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra::Engines::Upload {
+
+State::State(MemoryManager& memory_manager_, Registers& regs_)
+    : regs(regs_), memory_manager(memory_manager_) {}
+// Defined out of line; nothing beyond member destruction happens here.
+State::~State() = default;
+/// Begins an upload: rewinds the write cursor and sizes the staging buffer to lines * length.
+void State::ProcessExec(const bool is_linear_) {
+    copy_size = regs.line_count * regs.line_length_in;
+    write_offset = 0;
+    inner_buffer.resize(copy_size);
+    is_linear = is_linear_;
+}
+/// Appends one data word to the staging buffer and, on the last call, writes it to GPU memory.
+void State::ProcessData(const u32 data, const bool is_last_call) {
+    const u32 sub_copy_size = std::min(4U, copy_size - write_offset); // 0 once the buffer is full
+    std::memcpy(inner_buffer.data() + write_offset, &data, sub_copy_size); // [size()] would be UB
+    write_offset += sub_copy_size;
+    if (!is_last_call) {
+        return;
+    }
+    const GPUVAddr address{regs.dest.Address()};
+    if (is_linear) {
+        memory_manager.WriteBlock(address, inner_buffer.data(), copy_size);
+    } else {
+        UNIMPLEMENTED_IF(regs.dest.z != 0);
+        UNIMPLEMENTED_IF(regs.dest.depth != 1);
+        UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 0);
+        UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 0);
+        const std::size_t dst_size = Tegra::Texture::CalculateSize(
+            true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 0);
+        tmp_buffer.resize(dst_size);
+        memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
+        Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y,
+                                      regs.dest.BlockHeight(), copy_size, inner_buffer.data(),
+                                      tmp_buffer.data());
+        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+    }
+}
+
+} // namespace Tegra::Engines::Upload
--- a/src/video_core/engines/engine_upload.h
+++ b/src/video_core/engines/engine_upload.h
@@ -0,0 +1,73 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace Tegra::Engines::Upload {
+
+struct Registers { // Upload parameters that Upload::State reads through a reference.
+    u32 line_length_in; // Multiplied by line_count to size the upload (State::ProcessExec).
+    u32 line_count;
+    // Description of the destination the uploaded data is written to.
+    struct {
+        u32 address_high;
+        u32 address_low;
+        u32 pitch;
+        union {
+            BitField<0, 4, u32> block_width;
+            BitField<4, 4, u32> block_height;
+            BitField<8, 4, u32> block_depth;
+        };
+        u32 width;
+        u32 height;
+        u32 depth;
+        u32 z;
+        u32 x;
+        u32 y;
+        /// Joins address_high (upper 32 bits) and address_low into one GPU virtual address.
+        GPUVAddr Address() const {
+            return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low);
+        }
+
+        u32 BlockWidth() const {
+            return block_width.Value();
+        }
+
+        u32 BlockHeight() const {
+            return block_height.Value();
+        }
+
+        u32 BlockDepth() const {
+            return block_depth.Value();
+        }
+    } dest;
+};
+/// Accumulates inline upload data word by word and writes it to GPU memory on the last word.
+class State {
+public:
+    explicit State(MemoryManager& memory_manager_, Registers& regs_);
+    ~State();
+    /// Begins a new upload; is_linear_ selects the plain copy over the swizzled path.
+    void ProcessExec(bool is_linear_);
+    void ProcessData(u32 data, bool is_last_call); ///< Feeds one word; flushes on the last call.
+
+private:
+    u32 write_offset = 0; // Bytes of inner_buffer filled so far.
+    u32 copy_size = 0; // line_length_in * line_count, computed by ProcessExec.
+    std::vector<u8> inner_buffer; // Staging area for the incoming words.
+    std::vector<u8> tmp_buffer; // Scratch copy of the destination used on the non-linear path.
+    bool is_linear = false;
+    Registers& regs;
+    MemoryManager& memory_manager;
+};
+
+} // namespace Tegra::Engines::Upload
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -0,0 +1,69 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "video_core/engines/fermi_2d.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace Tegra::Engines {
+
+Fermi2D::Fermi2D() {
+    // A depth of one on both surfaces is the default that Nvidia's OpenGL driver appears to
+    // rely on.
+    regs.src.depth = regs.dst.depth = 1;
+}
+
+Fermi2D::~Fermi2D() = default;
+/// Stores the address of the rasterizer that Blit() forwards surface copies to.
+void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
+    rasterizer = &rasterizer_;
+}
+/// Stores method_argument in the addressed register; the second word of src_y0 starts a blit.
+void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid Fermi2D register, increase the size of the Regs structure");
+    regs.reg_array[method] = method_argument;
+    if (method != FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1) {
+        return;
+    }
+    Blit();
+}
+/// Replays CallMethod for every word; a call is last once at most one pending method remains.
+void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) {
+    for (u32 index = 0; index < amount; ++index, --methods_pending) {
+        CallMethod(method, base_start[index], methods_pending <= 1);
+    }
+}
+/// Builds a Config from the pixels_from_memory registers and hands the copy to the rasterizer.
+void Fermi2D::Blit() {
+    LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
+              regs.src.Address(), regs.dst.Address());
+    // Report register configurations this path does not support.
+    UNIMPLEMENTED_IF_MSG(regs.operation != Operation::SrcCopy, "Operation is not copy");
+    UNIMPLEMENTED_IF_MSG(regs.src.layer != 0, "Source layer is not zero");
+    UNIMPLEMENTED_IF_MSG(regs.dst.layer != 0, "Destination layer is not zero");
+    UNIMPLEMENTED_IF_MSG(regs.src.depth != 1, "Source depth is not one");
+    UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled");
+    const auto& params = regs.pixels_from_memory;
+    const Config config{
+        .operation = regs.operation,
+        .filter = params.sample_mode.filter,
+        .dst_x0 = params.dst_x0,
+        .dst_y0 = params.dst_y0,
+        .dst_x1 = params.dst_x0 + params.dst_width,
+        .dst_y1 = params.dst_y0 + params.dst_height,
+        .src_x0 = static_cast<s32>(params.src_x0 >> 32),
+        .src_y0 = static_cast<s32>(params.src_y0 >> 32),
+        .src_x1 = static_cast<s32>((params.src_x0 + params.du_dx * params.dst_width) >> 32),
+        .src_y1 = static_cast<s32>((params.src_y0 + params.dv_dy * params.dst_height) >> 32),
+    };
+    const bool accelerated = rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, config);
+    if (!accelerated) {
+        UNIMPLEMENTED();
+    }
+}
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -0,0 +1,352 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "common/math_util.h"
+#include "video_core/engines/engine_interface.h"
+#include "video_core/gpu.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace VideoCore {
+class RasterizerInterface;
+}
+
+namespace Tegra::Engines {
+
+/**
+ * This Engine is known as G80_2D. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h
+ */
+/// Word index of a Regs field: its byte offset divided by the size of one 32-bit register.
+#define FERMI2D_REG_INDEX(field_name)                                                              \
+    (offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32))
+
+class Fermi2D final : public EngineInterface {
+public:
+    explicit Fermi2D(); ///< Presets the depth of the src and dst surfaces to one.
+    ~Fermi2D();
+
+    /// Binds a rasterizer to this engine; Blit() dereferences it, so bind before any blit.
+    void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
+
+    /// Write the value to the register identified by method; some writes start a blit.
+    void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
+
+    /// Write multiple values to the register identified by method, one CallMethod per value.
+    void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                         u32 methods_pending) override;
+    /// Sample origin selector; value type of Regs::pixels_from_memory.sample_mode.origin.
+    enum class Origin : u32 {
+        Center = 0,
+        Corner = 1,
+    };
+    /// Sampling filter; value type of sample_mode.filter and of Config::filter.
+    enum class Filter : u32 {
+        Point = 0,
+        Bilinear = 1,
+    };
+    /// Value type of Regs::operation; Blit() reports everything except SrcCopy as unimplemented.
+    enum class Operation : u32 {
+        SrcCopyAnd = 0,
+        ROPAnd = 1,
+        Blend = 2,
+        SrcCopy = 3,
+        ROP = 4,
+        SrcCopyPremult = 5,
+        BlendPremult = 6,
+    };
+    /// Memory layout of a surface; value type of Surface::linear.
+    enum class MemoryLayout : u32 {
+        BlockLinear = 0,
+        Pitch = 1,
+    };
+    /// Value type of Regs::pixels_from_cpu_index_wrap.
+    enum class CpuIndexWrap : u32 {
+        Wrap = 0,
+        NoWrap = 1,
+    };
+    /// Ten-word description of a source or destination surface (see Regs::src and Regs::dst).
+    struct Surface {
+        RenderTargetFormat format;
+        MemoryLayout linear;
+        union {
+            BitField<0, 4, u32> block_width;
+            BitField<4, 4, u32> block_height;
+            BitField<8, 4, u32> block_depth;
+        };
+        u32 depth;
+        u32 layer;
+        u32 pitch;
+        u32 width;
+        u32 height;
+        u32 addr_upper;
+        u32 addr_lower;
+        /// Joins addr_upper (upper 32 bits) and addr_lower into one GPU virtual address.
+        [[nodiscard]] constexpr GPUVAddr Address() const noexcept {
+            return (GPUVAddr{addr_upper} << 32) | GPUVAddr{addr_lower};
+        }
+    };
+    static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
+    /// Value type of Regs::pixels_from_memory_sector_promotion.
+    enum class SectorPromotion : u32 {
+        NoPromotion = 0,
+        PromoteTo2V = 1,
+        PromoteTo2H = 2,
+        PromoteTo4 = 3,
+    };
+    /// Value type of Regs::num_tpcs.
+    enum class NumTpcs : u32 {
+        All = 0,
+        One = 1,
+    };
+    /// Value type of Regs::render_enable_mode.
+    enum class RenderEnableMode : u32 {
+        False = 0,
+        True = 1,
+        Conditional = 2,
+        RenderIfEqual = 3,
+        RenderIfNotEqual = 4,
+    };
+    /// Value type of Regs::color_key_format. NOTE(review): A16R56G6B5/A1R5G55B5 look like typos.
+    enum class ColorKeyFormat : u32 {
+        A16R56G6B5 = 0,
+        A1R5G55B5 = 1,
+        A8R8G8B8 = 2,
+        A2R10G10B10 = 3,
+        Y8 = 4,
+        Y16 = 5,
+        Y32 = 6,
+    };
+    /// Four 8-bit channels packed into one word; value type of Regs::beta4.
+    union Beta4 {
+        BitField<0, 8, u32> b;
+        BitField<8, 8, u32> g;
+        BitField<16, 8, u32> r;
+        BitField<24, 8, u32> a;
+    };
+    /// Pair of 32-bit coordinates; element type of Regs::render_solid.prim_point.
+    struct Point {
+        u32 x;
+        u32 y;
+    };
+    /// Value type of Regs::pattern_select.
+    enum class PatternSelect : u32 {
+        MonoChrome8x8 = 0,
+        MonoChrome64x1 = 1,
+        MonoChrome1x64 = 2,
+        Color = 3,
+    };
+    /// Value type of Regs::notify.
+    enum class NotifyType : u32 {
+        WriteOnly = 0,
+        WriteThenAwaken = 1,
+    };
+    /// Value type of monochrome_pattern.color_format. NOTE(review): A8X8R8G6B5 may be a typo.
+    enum class MonochromePatternColorFormat : u32 {
+        A8X8R8G6B5 = 0,
+        A1R5G5B5 = 1,
+        A8R8G8B8 = 2,
+        A8Y8 = 3,
+        A8X8Y16 = 4,
+        Y32 = 5,
+    };
+    /// Value type of Regs::monochrome_pattern.format.
+    enum class MonochromePatternFormat : u32 {
+        CGA6_M1 = 0,
+        LE_M1 = 1,
+    };
+    /// Register file of the engine, addressable by field name or by word index via reg_array.
+    union Regs {
+        static constexpr std::size_t NUM_REGS = 0x258; // Number of 32-bit registers.
+        struct {
+            u32 object;
+            INSERT_UNION_PADDING_WORDS(0x3F);
+            u32 no_operation;
+            NotifyType notify;
+            INSERT_UNION_PADDING_WORDS(0x2);
+            u32 wait_for_idle;
+            INSERT_UNION_PADDING_WORDS(0xB);
+            u32 pm_trigger;
+            INSERT_UNION_PADDING_WORDS(0xF);
+            u32 context_dma_notify;
+            u32 dst_context_dma;
+            u32 src_context_dma;
+            u32 semaphore_context_dma;
+            INSERT_UNION_PADDING_WORDS(0x1C);
+            Surface dst;
+            CpuIndexWrap pixels_from_cpu_index_wrap;
+            u32 kind2d_check_enable;
+            Surface src;
+            SectorPromotion pixels_from_memory_sector_promotion;
+            INSERT_UNION_PADDING_WORDS(0x1);
+            NumTpcs num_tpcs;
+            u32 render_enable_addr_upper;
+            u32 render_enable_addr_lower;
+            RenderEnableMode render_enable_mode;
+            INSERT_UNION_PADDING_WORDS(0x4);
+            u32 clip_x0;
+            u32 clip_y0;
+            u32 clip_width;
+            u32 clip_height;
+            BitField<0, 1, u32> clip_enable;
+            BitField<0, 3, ColorKeyFormat> color_key_format;
+            u32 color_key;
+            BitField<0, 1, u32> color_key_enable;
+            BitField<0, 8, u32> rop;
+            u32 beta1;
+            Beta4 beta4;
+            Operation operation;
+            union {
+                BitField<0, 6, u32> x;
+                BitField<8, 6, u32> y;
+            } pattern_offset;
+            BitField<0, 2, PatternSelect> pattern_select;
+            INSERT_UNION_PADDING_WORDS(0xC);
+            struct {
+                BitField<0, 3, MonochromePatternColorFormat> color_format;
+                BitField<0, 1, MonochromePatternFormat> format;
+                u32 color0;
+                u32 color1;
+                u32 pattern0;
+                u32 pattern1;
+            } monochrome_pattern;
+            struct {
+                std::array<u32, 0x40> X8R8G8B8;
+                std::array<u32, 0x20> R5G6B5;
+                std::array<u32, 0x20> X1R5G5B5;
+                std::array<u32, 0x10> Y8;
+            } color_pattern;
+            INSERT_UNION_PADDING_WORDS(0x10);
+            struct {
+                u32 prim_mode;
+                u32 prim_color_format;
+                u32 prim_color;
+                u32 line_tie_break_bits;
+                INSERT_UNION_PADDING_WORDS(0x14);
+                u32 prim_point_xy;
+                INSERT_UNION_PADDING_WORDS(0x7);
+                std::array<Point, 0x40> prim_point;
+            } render_solid;
+            struct {
+                u32 data_type;
+                u32 color_format;
+                u32 index_format;
+                u32 mono_format;
+                u32 wrap;
+                u32 color0;
+                u32 color1;
+                u32 mono_opacity;
+                INSERT_UNION_PADDING_WORDS(0x6);
+                u32 src_width;
+                u32 src_height;
+                u32 dx_du_frac;
+                u32 dx_du_int;
+                u32 dx_dv_frac; // NOTE(review): name breaks the dx_du/dy_dv pairing - typo?
+                u32 dy_dv_int;
+                u32 dst_x0_frac;
+                u32 dst_x0_int;
+                u32 dst_y0_frac;
+                u32 dst_y0_int;
+                u32 data;
+            } pixels_from_cpu;
+            INSERT_UNION_PADDING_WORDS(0x3);
+            u32 big_endian_control;
+            INSERT_UNION_PADDING_WORDS(0x3);
+            struct {
+                BitField<0, 3, u32> block_shape;
+                BitField<0, 5, u32> corral_size;
+                BitField<0, 1, u32> safe_overlap;
+                union {
+                    BitField<0, 1, Origin> origin;
+                    BitField<4, 1, Filter> filter;
+                } sample_mode;
+                INSERT_UNION_PADDING_WORDS(0x8);
+                s32 dst_x0;
+                s32 dst_y0;
+                s32 dst_width;
+                s32 dst_height;
+                s64 du_dx;
+                s64 dv_dy;
+                s64 src_x0;
+                s64 src_y0; // Writing its second word starts the blit (Fermi2D::CallMethod).
+            } pixels_from_memory;
+        };
+        std::array<u32, NUM_REGS> reg_array; // Word-indexed alias of the fields above.
+    } regs{};
+    /// Blit parameters that Blit() passes to RasterizerInterface::AccelerateSurfaceCopy.
+    struct Config {
+        Operation operation;
+        Filter filter;
+        s32 dst_x0;
+        s32 dst_y0;
+        s32 dst_x1; // dst_x0 + dst_width
+        s32 dst_y1; // dst_y0 + dst_height
+        s32 src_x0; // Upper 32 bits of the 64-bit src_x0 register value.
+        s32 src_y0; // Upper 32 bits of the 64-bit src_y0 register value.
+        s32 src_x1; // Upper 32 bits of src_x0 + du_dx * dst_width.
+        s32 src_y1; // Upper 32 bits of src_y0 + dv_dy * dst_height.
+    };
+
+private:
+    VideoCore::RasterizerInterface* rasterizer = nullptr; ///< Set by BindRasterizer(); not owned.
+
+    /// Performs the copy from the source surface to the destination surface as configured in the
+    /// registers. Dereferences rasterizer, so a rasterizer has to be bound first.
+    void Blit();
+};
+
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(Fermi2D::Regs, field_name) == position,                                 \
+                  "Field " #field_name " has invalid position")
+// Each entry pins the byte offset of a Regs field; a mismatch is a compile-time error.
+ASSERT_REG_POSITION(object, 0x0);
+ASSERT_REG_POSITION(no_operation, 0x100);
+ASSERT_REG_POSITION(notify, 0x104);
+ASSERT_REG_POSITION(wait_for_idle, 0x110);
+ASSERT_REG_POSITION(pm_trigger, 0x140);
+ASSERT_REG_POSITION(context_dma_notify, 0x180);
+ASSERT_REG_POSITION(dst_context_dma, 0x184);
+ASSERT_REG_POSITION(src_context_dma, 0x188);
+ASSERT_REG_POSITION(semaphore_context_dma, 0x18C);
+ASSERT_REG_POSITION(dst, 0x200);
+ASSERT_REG_POSITION(pixels_from_cpu_index_wrap, 0x228);
+ASSERT_REG_POSITION(kind2d_check_enable, 0x22C);
+ASSERT_REG_POSITION(src, 0x230);
+ASSERT_REG_POSITION(pixels_from_memory_sector_promotion, 0x258);
+ASSERT_REG_POSITION(num_tpcs, 0x260);
+ASSERT_REG_POSITION(render_enable_addr_upper, 0x264);
+ASSERT_REG_POSITION(render_enable_addr_lower, 0x268);
+ASSERT_REG_POSITION(clip_x0, 0x280);
+ASSERT_REG_POSITION(clip_y0, 0x284);
+ASSERT_REG_POSITION(clip_width, 0x288);
+ASSERT_REG_POSITION(clip_height, 0x28c);
+ASSERT_REG_POSITION(clip_enable, 0x290);
+ASSERT_REG_POSITION(color_key_format, 0x294);
+ASSERT_REG_POSITION(color_key, 0x298);
+ASSERT_REG_POSITION(rop, 0x2A0);
+ASSERT_REG_POSITION(beta1, 0x2A4);
+ASSERT_REG_POSITION(beta4, 0x2A8);
+ASSERT_REG_POSITION(operation, 0x2AC);
+ASSERT_REG_POSITION(pattern_offset, 0x2B0);
+ASSERT_REG_POSITION(pattern_select, 0x2B4);
+ASSERT_REG_POSITION(monochrome_pattern, 0x2E8);
+ASSERT_REG_POSITION(color_pattern, 0x300);
+ASSERT_REG_POSITION(render_solid, 0x580);
+ASSERT_REG_POSITION(pixels_from_cpu, 0x800);
+ASSERT_REG_POSITION(big_endian_control, 0x870);
+ASSERT_REG_POSITION(pixels_from_memory, 0x880);
+
+#undef ASSERT_REG_POSITION
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -0,0 +1,127 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <bitset>
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "video_core/engines/kepler_compute.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra::Engines {
+
+KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manager_)
+    : system(system_), memory_manager(memory_manager_), upload_state(memory_manager, regs.upload) {}
+// Defined out of line; nothing beyond member destruction happens here.
+KeplerCompute::~KeplerCompute() = default;
+/// Stores the address of the rasterizer used for compute dispatch and driver-profile queries.
+void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
+    rasterizer = &rasterizer_;
+}
+/// Stores method_argument in the addressed register, then performs the action tied to that
+/// register: arming an upload, feeding upload data or launching a compute program.
+void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid KeplerCompute register, increase the size of the Regs structure");
+
+    regs.reg_array[method] = method_argument;
+
+    // Registers whose writes do more than store a value.
+    constexpr std::size_t exec_upload_index = KEPLER_COMPUTE_REG_INDEX(exec_upload);
+    constexpr std::size_t data_upload_index = KEPLER_COMPUTE_REG_INDEX(data_upload);
+    constexpr std::size_t launch_index = KEPLER_COMPUTE_REG_INDEX(launch);
+
+    if (method == exec_upload_index) {
+        // Bit 0 of the value just written selects linear mode.
+        upload_state.ProcessExec(regs.exec_upload.linear != 0);
+    } else if (method == data_upload_index) {
+        upload_state.ProcessData(method_argument, is_last_call);
+        if (is_last_call) {
+            // The final word flushes the upload, so report the memory write to Maxwell3D.
+            system.GPU().Maxwell3D().OnMemoryWrite();
+        }
+    } else if (method == launch_index) {
+        ProcessLaunch();
+    }
+}
+/// Replays CallMethod for every word; a call is last once at most one pending method remains.
+void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                                    u32 methods_pending) {
+    for (u32 index = 0; index < amount; ++index) {
+        CallMethod(method, base_start[index], methods_pending - index <= 1);
+    }
+}
+/// Reads the 32-bit word located `offset` bytes into compute const buffer `const_buffer`.
+u32 KeplerCompute::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const {
+    ASSERT(stage == ShaderType::Compute);
+    const auto& buffer = launch_description.const_buffer_config[const_buffer];
+    // Go through Read<u32>() like AccessBindlessSampler() does, instead of copying straight out of
+    // GetPointer(), whose result was never checked before being handed to memcpy.
+    return memory_manager.Read<u32>(buffer.Address() + offset);
+}
+/// Looks up handle number `offset` in const buffer regs.tex_cb_index and resolves its sampler.
+SamplerDescriptor KeplerCompute::AccessBoundSampler(ShaderType stage, u64 offset) const {
+    return AccessBindlessSampler(stage, regs.tex_cb_index, offset * sizeof(Texture::TextureHandle));
+}
+/// Reads a texture handle `offset` bytes into const buffer `const_buffer` and resolves it.
+SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                                       u64 offset) const {
+    ASSERT(stage == ShaderType::Compute);
+    const GPUVAddr handle_address =
+        launch_description.const_buffer_config[const_buffer].Address() + offset;
+    return AccessSampler(memory_manager.Read<u32>(handle_address));
+}
+/// Resolves a packed texture handle into a descriptor built from its TIC and TSC entries.
+SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
+    const Texture::TextureHandle ids{handle};
+    SamplerDescriptor descriptor = SamplerDescriptor::FromTIC(GetTICEntry(ids.tic_id));
+
+    // The shadow flag comes from the TSC entry rather than from the TIC entry.
+    const Texture::TSCEntry sampler_state = GetTSCEntry(ids.tsc_id);
+    descriptor.is_shadow.Assign(sampler_state.depth_compare_enabled.Value());
+    return descriptor;
+}
+/// Forwards to the bound rasterizer's guest driver profile (mutable access).
+VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() {
+    return rasterizer->AccessGuestDriverProfile();
+}
+/// Forwards to the bound rasterizer's guest driver profile (read-only access).
+const VideoCore::GuestDriverProfile& KeplerCompute::AccessGuestDriverProfile() const {
+    return rasterizer->AccessGuestDriverProfile();
+}
+/// Fetches the launch descriptor from GPU memory and dispatches the compute program it names.
+void KeplerCompute::ProcessLaunch() {
+    const std::size_t descriptor_size = LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32);
+    memory_manager.ReadBlockUnsafe(regs.launch_desc_loc.Address(), &launch_description,
+                                   descriptor_size);
+
+    // The program start stored in the descriptor is added to the code base address.
+    const GPUVAddr entry_point = regs.code_loc.Address() + launch_description.program_start;
+    LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", entry_point);
+    rasterizer->DispatchCompute(entry_point);
+}
+/// Reads entry `tic_index` of the TIC table that starts at regs.tic.Address().
+Texture::TICEntry KeplerCompute::GetTICEntry(u32 tic_index) const {
+    Texture::TICEntry entry;
+    const GPUVAddr entry_address = regs.tic.Address() + tic_index * sizeof(entry);
+
+    // Copy the raw entry out of GPU memory.
+    memory_manager.ReadBlockUnsafe(entry_address, &entry, sizeof(entry));
+    return entry;
+}
+/// Reads entry `tsc_index` of the TSC table that starts at regs.tsc.Address().
+Texture::TSCEntry KeplerCompute::GetTSCEntry(u32 tsc_index) const {
+    Texture::TSCEntry entry;
+    const GPUVAddr entry_address = regs.tsc.Address() + tsc_index * sizeof(entry);
+    // Copy the raw entry out of GPU memory.
+    memory_manager.ReadBlockUnsafe(entry_address, &entry, sizeof(entry));
+    return entry;
+}
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -0,0 +1,269 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/engines/const_buffer_engine_interface.h"
+#include "video_core/engines/engine_interface.h"
+#include "video_core/engines/engine_upload.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/gpu.h"
+#include "video_core/textures/texture.h"
+
+namespace Core {
+class System;
+}
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace VideoCore {
+class RasterizerInterface;
+}
+
+namespace Tegra::Engines {
+
+/**
+ * This Engine is known as GK104_Compute. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_compute.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h
+ */
+
+/// Converts a KeplerCompute::Regs field name into its register index, i.e. the field's byte
+/// offset divided by the size of a 32-bit word.
+#define KEPLER_COMPUTE_REG_INDEX(field_name)                                                       \
+    (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32))
+
+/// Compute engine front end. Holds the engine's register file and the launch description read
+/// by ProcessLaunch(), and implements both ConstBufferEngineInterface and EngineInterface.
+class KeplerCompute final : public ConstBufferEngineInterface, public EngineInterface {
+public:
+    explicit KeplerCompute(Core::System& system, MemoryManager& memory_manager);
+    ~KeplerCompute();
+
+    /// Binds a rasterizer to this engine.
+    void BindRasterizer(VideoCore::RasterizerInterface& rasterizer);
+
+    /// Number of entries in LaunchParams::const_buffer_config.
+    static constexpr std::size_t NumConstBuffers = 8;
+
+    /// Register file. The same storage is reachable through the named fields below or as a flat
+    /// array of NUM_REGS 32-bit words (reg_array). The padding macros place each named field at
+    /// the word offset checked by the ASSERT_REG_POSITION static_asserts after the class.
+    struct Regs {
+        static constexpr std::size_t NUM_REGS = 0xCF8;
+
+        union {
+            struct {
+                INSERT_UNION_PADDING_WORDS(0x60);
+
+                Upload::Registers upload;
+
+                struct {
+                    union {
+                        BitField<0, 1, u32> linear;
+                    };
+                } exec_upload;
+
+                u32 data_upload;
+
+                INSERT_UNION_PADDING_WORDS(0x3F);
+
+                struct {
+                    u32 address;
+                    /// Returns the stored value shifted left by 8 bits as a GPU virtual address.
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address) << 8));
+                    }
+                } launch_desc_loc;
+
+                INSERT_UNION_PADDING_WORDS(0x1);
+
+                u32 launch;
+
+                INSERT_UNION_PADDING_WORDS(0x4A7);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    u32 limit;
+                    /// Combines address_high (upper 32 bits) and address_low into one address.
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } tsc;
+
+                INSERT_UNION_PADDING_WORDS(0x3);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    u32 limit;
+                    /// Combines address_high (upper 32 bits) and address_low into one address.
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } tic;
+
+                INSERT_UNION_PADDING_WORDS(0x22);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    /// Combines address_high (upper 32 bits) and address_low into one address.
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } code_loc;
+
+                INSERT_UNION_PADDING_WORDS(0x3FE);
+
+                u32 tex_cb_index;
+
+                INSERT_UNION_PADDING_WORDS(0x374);
+            };
+            std::array<u32, NUM_REGS> reg_array;
+        };
+    } regs{};
+
+    /// Launch description block of NUM_LAUNCH_PARAMETERS 32-bit words. ProcessLaunch() fills it
+    /// from guest memory at regs.launch_desc_loc.Address(). Field positions are checked by the
+    /// ASSERT_LAUNCH_PARAM_POSITION static_asserts after the class.
+    struct LaunchParams {
+        static constexpr std::size_t NUM_LAUNCH_PARAMETERS = 0x40;
+
+        INSERT_PADDING_WORDS(0x8);
+
+        u32 program_start;
+
+        INSERT_PADDING_WORDS(0x2);
+
+        BitField<30, 1, u32> linked_tsc;
+
+        BitField<0, 31, u32> grid_dim_x;
+        union {
+            BitField<0, 16, u32> grid_dim_y;
+            BitField<16, 16, u32> grid_dim_z;
+        };
+
+        INSERT_PADDING_WORDS(0x3);
+
+        BitField<0, 18, u32> shared_alloc;
+
+        BitField<16, 16, u32> block_dim_x;
+        union {
+            BitField<0, 16, u32> block_dim_y;
+            BitField<16, 16, u32> block_dim_z;
+        };
+
+        union {
+            BitField<0, 8, u32> const_buffer_enable_mask;
+            BitField<29, 2, u32> cache_layout;
+        };
+
+        INSERT_PADDING_WORDS(0x8);
+
+        /// Two-word const buffer slot: the low address word, then a word packing the high
+        /// address bits (bits 0-7) and the size (bits 15-31).
+        struct ConstBufferConfig {
+            u32 address_low;
+            union {
+                BitField<0, 8, u32> address_high;
+                BitField<15, 17, u32> size;
+            };
+            GPUVAddr Address() const {
+                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) |
+                                             address_low);
+            }
+        };
+        std::array<ConstBufferConfig, NumConstBuffers> const_buffer_config;
+
+        union {
+            BitField<0, 20, u32> local_pos_alloc;
+            BitField<27, 5, u32> barrier_alloc;
+        };
+
+        union {
+            BitField<0, 20, u32> local_neg_alloc;
+            BitField<24, 5, u32> gpr_alloc;
+        };
+
+        union {
+            BitField<0, 20, u32> local_crs_alloc;
+            BitField<24, 5, u32> sass_version;
+        };
+
+        INSERT_PADDING_WORDS(0x10);
+    } launch_description{};
+
+    struct {
+        u32 write_offset = 0;
+        u32 copy_size = 0;
+        std::vector<u8> inner_buffer;
+    } state{};
+
+    // Both structures must be exactly as large as their declared word counts.
+    static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32),
+                  "KeplerCompute Regs has wrong size");
+
+    static_assert(sizeof(LaunchParams) == LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32),
+                  "KeplerCompute LaunchParams has wrong size");
+
+    /// Write the value to the register identified by method.
+    void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
+
+    /// Write multiple values to the register identified by method.
+    void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                         u32 methods_pending) override;
+
+    u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override;
+
+    SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override;
+
+    SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                            u64 offset) const override;
+
+    /// Builds a sampler descriptor from a packed texture handle (TIC and TSC ids).
+    SamplerDescriptor AccessSampler(u32 handle) const override;
+
+    /// Returns the value of the tex_cb_index register.
+    u32 GetBoundBuffer() const override {
+        return regs.tex_cb_index;
+    }
+
+    VideoCore::GuestDriverProfile& AccessGuestDriverProfile() override;
+
+    const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override;
+
+private:
+    /// Reads the launch description from guest memory and dispatches the compute program.
+    void ProcessLaunch();
+
+    /// Retrieves information about a specific TIC entry from the TIC buffer.
+    Texture::TICEntry GetTICEntry(u32 tic_index) const;
+
+    /// Retrieves information about a specific TSC entry from the TSC buffer.
+    Texture::TSCEntry GetTSCEntry(u32 tsc_index) const;
+
+    Core::System& system;
+    MemoryManager& memory_manager;
+    // Defaults to null; expected to be set through BindRasterizer() before it is dereferenced.
+    VideoCore::RasterizerInterface* rasterizer = nullptr;
+    Upload::State upload_state;
+};
+
+// Compile-time layout checks. Each macro asserts that a field sits at the given 32-bit word
+// offset inside its structure. Both helpers are local to this header and are undefined again
+// once the checks are done, so they do not leak into files that include it.
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(KeplerCompute::Regs, field_name) == position * 4,                       \
+                  "Field " #field_name " has invalid position")
+
+#define ASSERT_LAUNCH_PARAM_POSITION(field_name, position)                                         \
+    static_assert(offsetof(KeplerCompute::LaunchParams, field_name) == position * 4,               \
+                  "Field " #field_name " has invalid position")
+
+ASSERT_REG_POSITION(upload, 0x60);
+ASSERT_REG_POSITION(exec_upload, 0x6C);
+ASSERT_REG_POSITION(data_upload, 0x6D);
+ASSERT_REG_POSITION(launch, 0xAF);
+ASSERT_REG_POSITION(tsc, 0x557);
+ASSERT_REG_POSITION(tic, 0x55D);
+ASSERT_REG_POSITION(code_loc, 0x582);
+ASSERT_REG_POSITION(tex_cb_index, 0x982);
+ASSERT_LAUNCH_PARAM_POSITION(program_start, 0x8);
+ASSERT_LAUNCH_PARAM_POSITION(grid_dim_x, 0xC);
+ASSERT_LAUNCH_PARAM_POSITION(shared_alloc, 0x11);
+ASSERT_LAUNCH_PARAM_POSITION(block_dim_x, 0x12);
+ASSERT_LAUNCH_PARAM_POSITION(const_buffer_enable_mask, 0x14);
+ASSERT_LAUNCH_PARAM_POSITION(const_buffer_config, 0x1D);
+
+#undef ASSERT_REG_POSITION
+#undef ASSERT_LAUNCH_PARAM_POSITION
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -0,0 +1,50 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "video_core/engines/kepler_memory.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra::Engines {
+
+/// Stores the system reference and wires the upload state to this engine's upload registers.
+KeplerMemory::KeplerMemory(Core::System& system_, MemoryManager& memory_manager)
+    : system(system_), upload_state(memory_manager, regs.upload) {}
+
+/// Defaulted destructor, defined out of line in this translation unit.
+KeplerMemory::~KeplerMemory() = default;
+
+/// Stores the argument in the register selected by method, then runs the side effect attached
+/// to that register: exec starts an upload, data feeds one word into it.
+void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid KeplerMemory register, increase the size of the Regs structure");
+
+    regs.reg_array[method] = method_argument;
+
+    if (method == KEPLERMEMORY_REG_INDEX(exec)) {
+        const bool is_linear = regs.exec.linear != 0;
+        upload_state.ProcessExec(is_linear);
+        return;
+    }
+
+    if (method == KEPLERMEMORY_REG_INDEX(data)) {
+        upload_state.ProcessData(method_argument, is_last_call);
+        // Maxwell3D is notified once the final data word has been handed over.
+        if (is_last_call) {
+            system.GPU().Maxwell3D().OnMemoryWrite();
+        }
+    }
+}
+
+/// Writes `amount` consecutive values to the same register, one CallMethod per value. A write
+/// is flagged as the last call when at most one method remains pending at that point.
+void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                                   u32 methods_pending) {
+    for (u32 index = 0; index < amount; ++index) {
+        const bool is_last_call = methods_pending - index <= 1;
+        CallMethod(method, base_start[index], is_last_call);
+    }
+}
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -0,0 +1,85 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/engines/engine_interface.h"
+#include "video_core/engines/engine_upload.h"
+#include "video_core/gpu.h"
+
+namespace Core {
+class System;
+}
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace Tegra::Engines {
+
+/**
+ * This Engine is known as P2MF. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_p2mf.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h
+ */
+
+/// Converts a KeplerMemory::Regs field name into its register index, i.e. the field's byte
+/// offset divided by the size of a 32-bit word.
+#define KEPLERMEMORY_REG_INDEX(field_name)                                                         \
+    (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32))
+
+/// Engine that owns a small register file and an Upload::State driven by its exec and data
+/// registers (see CallMethod).
+class KeplerMemory final : public EngineInterface {
+public:
+    explicit KeplerMemory(Core::System& system_, MemoryManager& memory_manager);
+    ~KeplerMemory();
+
+    /// Write the value to the register identified by method.
+    void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
+
+    /// Write multiple values to the register identified by method.
+    void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                         u32 methods_pending) override;
+
+    /// Register file. The same storage is reachable through the named fields or as a flat array
+    /// of NUM_REGS 32-bit words (reg_array). Padding places the named fields at the word
+    /// offsets checked by the static_asserts after the class.
+    struct Regs {
+        static constexpr size_t NUM_REGS = 0x7F;
+
+        union {
+            struct {
+                INSERT_UNION_PADDING_WORDS(0x60);
+
+                Upload::Registers upload;
+
+                struct {
+                    union {
+                        BitField<0, 1, u32> linear;
+                    };
+                } exec;
+
+                u32 data;
+
+                INSERT_UNION_PADDING_WORDS(0x11);
+            };
+            std::array<u32, NUM_REGS> reg_array;
+        };
+    } regs{};
+
+private:
+    Core::System& system;
+    // Constructed with a reference to regs.upload (see the constructor).
+    Upload::State upload_state;
+};
+
+// Compile-time layout checks: each use asserts that a KeplerMemory::Regs field sits at the
+// given 32-bit word offset. The helper macro is undefined again right after its last use.
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4,                        \
+                  "Field " #field_name " has invalid position")
+
+ASSERT_REG_POSITION(upload, 0x60);
+ASSERT_REG_POSITION(exec, 0x6C);
+ASSERT_REG_POSITION(data, 0x6D);
+#undef ASSERT_REG_POSITION
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -0,0 +1,708 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+#include <optional>
+#include "common/assert.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/textures/texture.h"
+
+namespace Tegra::Engines {
+
+using VideoCore::QueryType;
+
+/// First register id that is actually a Macro call.
+constexpr u32 MacroRegistersStart = 0xE00;
+
+/// Wires up the engine's collaborators, inverts the dirty flags and loads register defaults.
+Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
+    : system(system_), memory_manager(memory_manager_), macro_engine(GetMacroEngine(*this)),
+      upload_state(memory_manager, regs.upload) {
+    // Inverts every dirty flag (presumably from all-clear to all-set so that all state is
+    // treated as dirty at start-up -- TODO confirm against the flags' initial value).
+    dirty.flags.flip();
+    InitializeRegisterDefaults();
+}
+
+/// Defaulted destructor, defined out of line in this translation unit.
+Maxwell3D::~Maxwell3D() = default;
+
+/// Remembers the rasterizer this engine forwards draw, clear and query work to.
+void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
+    this->rasterizer = &rasterizer_;
+}
+
+/// Resets the register file to zero and then applies the boot-time defaults that games rely on
+/// without setting them explicitly. The result is also copied into shadow_state, and the
+/// registers that the macro engine handles inline are flagged in mme_inline.
+void Maxwell3D::InitializeRegisterDefaults() {
+    // Initializes registers to their default values - what games expect them to be at boot. This is
+    // for certain registers that may not be explicitly set by games.
+
+    // Reset all registers to zero
+    std::memset(&regs, 0, sizeof(regs));
+
+    // Depth range near/far is not always set, but is expected to be the default 0.0f, 1.0f. This is
+    // needed for ARMS.
+    for (auto& viewport : regs.viewports) {
+        viewport.depth_range_near = 0.0f;
+        viewport.depth_range_far = 1.0f;
+    }
+    for (auto& viewport : regs.viewport_transform) {
+        viewport.swizzle.x.Assign(Regs::ViewportSwizzle::PositiveX);
+        viewport.swizzle.y.Assign(Regs::ViewportSwizzle::PositiveY);
+        viewport.swizzle.z.Assign(Regs::ViewportSwizzle::PositiveZ);
+        viewport.swizzle.w.Assign(Regs::ViewportSwizzle::PositiveW);
+    }
+
+    // Doom and Bomberman seems to use the uninitialized registers and just enable blend
+    // so initialize blend registers with sane values
+    regs.blend.equation_rgb = Regs::Blend::Equation::Add;
+    regs.blend.factor_source_rgb = Regs::Blend::Factor::One;
+    regs.blend.factor_dest_rgb = Regs::Blend::Factor::Zero;
+    regs.blend.equation_a = Regs::Blend::Equation::Add;
+    regs.blend.factor_source_a = Regs::Blend::Factor::One;
+    regs.blend.factor_dest_a = Regs::Blend::Factor::Zero;
+    for (auto& blend : regs.independent_blend) {
+        blend.equation_rgb = Regs::Blend::Equation::Add;
+        blend.factor_source_rgb = Regs::Blend::Factor::One;
+        blend.factor_dest_rgb = Regs::Blend::Factor::Zero;
+        blend.equation_a = Regs::Blend::Equation::Add;
+        blend.factor_source_a = Regs::Blend::Factor::One;
+        blend.factor_dest_a = Regs::Blend::Factor::Zero;
+    }
+    regs.stencil_front_op_fail = Regs::StencilOp::Keep;
+    regs.stencil_front_op_zfail = Regs::StencilOp::Keep;
+    regs.stencil_front_op_zpass = Regs::StencilOp::Keep;
+    regs.stencil_front_func_func = Regs::ComparisonOp::Always;
+    regs.stencil_front_func_mask = 0xFFFFFFFF;
+    regs.stencil_front_mask = 0xFFFFFFFF;
+    regs.stencil_two_side_enable = 1;
+    regs.stencil_back_op_fail = Regs::StencilOp::Keep;
+    regs.stencil_back_op_zfail = Regs::StencilOp::Keep;
+    regs.stencil_back_op_zpass = Regs::StencilOp::Keep;
+    regs.stencil_back_func_func = Regs::ComparisonOp::Always;
+    regs.stencil_back_func_mask = 0xFFFFFFFF;
+    regs.stencil_back_mask = 0xFFFFFFFF;
+
+    // front_face is deliberately not assigned here: its effective default (ClockWise) is set
+    // once, together with the other NVN boot values further down.
+    regs.depth_test_func = Regs::ComparisonOp::Always;
+    regs.cull_face = Regs::CullFace::Back;
+
+    // TODO(Rodrigo): Most games do not set a point size. I think this is a case of a
+    // register carrying a default value. Assume it's OpenGL's default (1).
+    regs.point_size = 1.0f;
+
+    // TODO(bunnei): Some games do not initialize the color masks (e.g. Sonic Mania). Assuming a
+    // default of enabled fixes rendering here.
+    for (auto& color_mask : regs.color_mask) {
+        color_mask.R.Assign(1);
+        color_mask.G.Assign(1);
+        color_mask.B.Assign(1);
+        color_mask.A.Assign(1);
+    }
+
+    for (auto& format : regs.vertex_attrib_format) {
+        format.constant.Assign(1);
+    }
+
+    // NVN games expect these values to be enabled at boot
+    regs.rasterize_enable = 1;
+    regs.rt_separate_frag_data = 1;
+    regs.framebuffer_srgb = 1;
+    regs.line_width_aliased = 1.0f;
+    regs.line_width_smooth = 1.0f;
+    regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise;
+    regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill;
+    regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill;
+
+    // The shadow copy starts out identical to the freshly initialized registers.
+    shadow_state = regs;
+
+    // Registers flagged here take the inline path in CallMethodFromMME instead of CallMethod.
+    mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_end_gl)] = true;
+    mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)] = true;
+    mme_inline[MAXWELL3D_REG_INDEX(vertex_buffer.count)] = true;
+    mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
+}
+
+/// Collects macro parameters and runs the macro once the last parameter has arrived.
+/// The first write of a sequence selects which macro is being executed.
+void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
+    const bool starts_new_macro = executing_macro == 0;
+    if (starts_new_macro) {
+        // A macro call must begin by writing the macro method's register, not its argument.
+        ASSERT_MSG((method % 2) == 0,
+                   "Can't start macro execution by writing to the ARGS register");
+        executing_macro = method;
+    }
+
+    macro_params.insert(macro_params.end(), base_start, base_start + amount);
+
+    // More parameters are still to come in the command buffer: keep accumulating.
+    if (!is_last_call) {
+        return;
+    }
+
+    CallMacroMethod(executing_macro, macro_params);
+    macro_params.clear();
+}
+
+/// Applies the current shadow RAM mode to a register write and returns the value to use.
+/// Tracking modes record the argument in shadow_state; Replay substitutes the recorded value;
+/// every other mode passes the argument through untouched.
+u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
+    const auto mode = shadow_state.shadow_ram_control;
+    switch (mode) {
+    case Regs::ShadowRamControl::Track:
+    case Regs::ShadowRamControl::TrackWithFilter:
+        // Keep track of the register value in shadow_state when requested.
+        shadow_state.reg_array[method] = argument;
+        return argument;
+    case Regs::ShadowRamControl::Replay:
+        return shadow_state.reg_array[method];
+    default:
+        return argument;
+    }
+}
+
+/// Stores a register value and, only when the value actually changed, raises the dirty flag
+/// that each dirty table associates with that register.
+void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) {
+    auto& current_value = regs.reg_array[method];
+    if (current_value == argument) {
+        return;
+    }
+    current_value = argument;
+
+    for (const auto& dirty_table : dirty.tables) {
+        const auto flag_index = dirty_table[method];
+        dirty.flags[flag_index] = true;
+    }
+}
+
+/// Runs the side effect attached to a register write, if the register has one.
+/// `argument` is the value after shadow RAM processing, `nonshadow_argument` the raw value.
+void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument,
+                                  bool is_last_call) {
+    switch (method) {
+    case MAXWELL3D_REG_INDEX(wait_for_idle):
+        rasterizer->WaitForIdle();
+        break;
+    case MAXWELL3D_REG_INDEX(shadow_ram_control):
+        // The shadow mode itself always takes the raw, non-shadowed argument.
+        shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(nonshadow_argument);
+        break;
+    case MAXWELL3D_REG_INDEX(macros.data):
+        macro_engine->AddCode(regs.macros.upload_address, argument);
+        break;
+    case MAXWELL3D_REG_INDEX(macros.bind):
+        ProcessMacroBind(argument);
+        break;
+    case MAXWELL3D_REG_INDEX(firmware[4]):
+        ProcessFirmwareCall4();
+        break;
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
+        StartCBData(method);
+        break;
+    case MAXWELL3D_REG_INDEX(cb_bind[0]):
+        ProcessCBBind(0);
+        break;
+    case MAXWELL3D_REG_INDEX(cb_bind[1]):
+        ProcessCBBind(1);
+        break;
+    case MAXWELL3D_REG_INDEX(cb_bind[2]):
+        ProcessCBBind(2);
+        break;
+    case MAXWELL3D_REG_INDEX(cb_bind[3]):
+        ProcessCBBind(3);
+        break;
+    case MAXWELL3D_REG_INDEX(cb_bind[4]):
+        ProcessCBBind(4);
+        break;
+    case MAXWELL3D_REG_INDEX(draw.vertex_end_gl):
+        DrawArrays();
+        break;
+    case MAXWELL3D_REG_INDEX(clear_buffers):
+        ProcessClearBuffers();
+        break;
+    case MAXWELL3D_REG_INDEX(query.query_get):
+        ProcessQueryGet();
+        break;
+    case MAXWELL3D_REG_INDEX(condition.mode):
+        ProcessQueryCondition();
+        break;
+    case MAXWELL3D_REG_INDEX(counter_reset):
+        ProcessCounterReset();
+        break;
+    case MAXWELL3D_REG_INDEX(sync_info):
+        ProcessSyncPoint();
+        break;
+    case MAXWELL3D_REG_INDEX(exec_upload):
+        upload_state.ProcessExec(regs.exec_upload.linear != 0);
+        break;
+    case MAXWELL3D_REG_INDEX(data_upload):
+        upload_state.ProcessData(argument, is_last_call);
+        if (is_last_call) {
+            OnMemoryWrite();
+        }
+        break;
+    case MAXWELL3D_REG_INDEX(fragment_barrier):
+        rasterizer->FragmentBarrier();
+        break;
+    case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
+        rasterizer->TiledCacheBarrier();
+        break;
+    default:
+        // Plain register write with no side effect.
+        break;
+    }
+}
+
+/// Executes the macro bound to the given macro method with the collected parameters, then
+/// flushes any inline draw the macro left pending.
+void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
+    // Reset the current macro.
+    executing_macro = 0;
+
+    // Each macro owns two consecutive methods, so halve the distance from the first macro
+    // register and wrap it into the position table.
+    const u32 macro_index = (method - MacroRegistersStart) >> 1;
+    const u32 entry = macro_index % static_cast<u32>(macro_positions.size());
+
+    macro_engine->Execute(*this, macro_positions[entry], parameters);
+
+    if (mme_draw.current_mode == MMEDrawMode::Undefined) {
+        return;
+    }
+    FlushMMEInlineDraw();
+}
+
+/// Entry point for a single register write. Handles, in order: an in-progress const buffer
+/// data stream, macro methods, and finally ordinary registers (shadow RAM, dirty tracking and
+/// the register's side effect).
+void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
+    if (method == cb_data_state.current) {
+        // Still streaming into the same const buffer data register.
+        regs.reg_array[method] = method_argument;
+        ProcessCBData(method_argument);
+        return;
+    }
+    if (cb_data_state.current != null_cb_data) {
+        // A different register was written: close the pending const buffer stream first.
+        FinishCBData();
+    }
+
+    // It is an error to write to a register other than the current macro's ARG register before it
+    // has finished execution.
+    if (executing_macro != 0) {
+        ASSERT(method == executing_macro + 1);
+    }
+
+    // Methods after 0xE00 are special, they're actually triggers for some microcode that was
+    // uploaded to the GPU during initialization.
+    const bool is_macro_method = method >= MacroRegistersStart;
+    if (is_macro_method) {
+        ProcessMacro(method, &method_argument, 1, is_last_call);
+        return;
+    }
+
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid Maxwell3D register, increase the size of the Regs structure");
+
+    const u32 effective_argument = ProcessShadowRam(method, method_argument);
+    ProcessDirtyRegisters(method, effective_argument);
+    ProcessMethodCall(method, effective_argument, method_argument, is_last_call);
+}
+
+/// Entry point for a burst of writes to one register. Macro methods and const buffer data
+/// registers consume the whole burst at once; everything else is replayed through CallMethod.
+void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                                u32 methods_pending) {
+    // Methods after 0xE00 are special, they're actually triggers for some microcode that was
+    // uploaded to the GPU during initialization.
+    if (method >= MacroRegistersStart) {
+        const bool is_last_call = amount == methods_pending;
+        ProcessMacro(method, base_start, amount, is_last_call);
+        return;
+    }
+
+    switch (method) {
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
+    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
+        ProcessCBMultiData(method, base_start, amount);
+        return;
+    default:
+        break;
+    }
+
+    // Generic path: a write is the last call when at most one method remains pending.
+    for (u32 index = 0; index < amount; ++index) {
+        const bool is_last_call = methods_pending - index <= 1;
+        CallMethod(method, base_start[index], is_last_call);
+    }
+}
+
+/// Advances the inline-draw state machine for a vertex/index count write coming from a macro.
+/// Starts a new batched draw, adds an instance to a matching one, or flushes the pending draw
+/// and retries.
+void Maxwell3D::StepInstance(const MMEDrawMode expected_mode, const u32 count) {
+    if (mme_draw.current_mode == MMEDrawMode::Undefined) {
+        // No draw is pending: one can only be started after a begin has been seen.
+        if (!mme_draw.gl_begin_consume) {
+            return;
+        }
+        mme_draw.current_mode = expected_mode;
+        mme_draw.current_count = count;
+        mme_draw.instance_count = 1;
+        mme_draw.gl_begin_consume = false;
+        mme_draw.gl_end_count = 0;
+        return;
+    }
+
+    const bool extends_pending_draw =
+        mme_draw.current_mode == expected_mode && count == mme_draw.current_count &&
+        mme_draw.instance_mode && mme_draw.gl_begin_consume;
+    if (extends_pending_draw) {
+        mme_draw.instance_count++;
+        mme_draw.gl_begin_consume = false;
+        return;
+    }
+
+    // The pending draw does not match this one: emit it and start over.
+    FlushMMEInlineDraw();
+    // Tail call in case it needs to retry.
+    StepInstance(expected_mode, count);
+}
+
+/// Register write issued by the macro engine. Registers flagged in mme_inline feed the inline
+/// draw state machine; all others flush any pending inline draw and go through CallMethod.
+void Maxwell3D::CallMethodFromMME(u32 method, u32 method_argument) {
+    if (!mme_inline[method]) {
+        if (mme_draw.current_mode != MMEDrawMode::Undefined) {
+            FlushMMEInlineDraw();
+        }
+        CallMethod(method, method_argument, true);
+        return;
+    }
+
+    regs.reg_array[method] = method_argument;
+
+    if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count)) {
+        StepInstance(MMEDrawMode::Array, method_argument);
+    } else if (method == MAXWELL3D_REG_INDEX(index_array.count)) {
+        StepInstance(MMEDrawMode::Indexed, method_argument);
+    } else if (method == MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)) {
+        const bool uses_instancing =
+            (regs.draw.instance_next != 0) || (regs.draw.instance_cont != 0);
+        mme_draw.instance_mode = uses_instancing;
+        mme_draw.gl_begin_consume = true;
+    } else {
+        // Any other inline register is counted as a draw end.
+        mme_draw.gl_end_count++;
+    }
+}
+
+/// Emits the draw accumulated by the inline-draw state machine and resets that state.
+void Maxwell3D::FlushMMEInlineDraw() {
+    LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(),
+              regs.vertex_buffer.count);
+    ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
+    ASSERT(mme_draw.instance_count == mme_draw.gl_end_count);
+
+    // Both instance configuration registers can not be set at the same time.
+    ASSERT_MSG(!regs.draw.instance_next || !regs.draw.instance_cont,
+               "Illegal combination of instancing parameters");
+
+    const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed;
+    if (ShouldExecute()) {
+        rasterizer->Draw(is_indexed, true);
+    }
+
+    // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if
+    // the game is trying to draw indexed or direct mode. This needs to be verified on HW still -
+    // it's possible that it is incorrect and that there is some other register used to specify the
+    // drawing mode.
+    if (!is_indexed) {
+        regs.vertex_buffer.count = 0;
+    } else {
+        regs.index_array.count = 0;
+    }
+
+    // Return the state machine to idle.
+    mme_draw.current_mode = MMEDrawMode::Undefined;
+    mme_draw.instance_mode = false;
+    mme_draw.gl_begin_consume = false;
+    mme_draw.current_count = 0;
+    mme_draw.instance_count = 0;
+    mme_draw.gl_end_count = 0;
+}
+
+/// Appends one word of macro code at the current upload address, then advances the address.
+void Maxwell3D::ProcessMacroUpload(u32 data) {
+    const auto code_address = regs.macros.upload_address++;
+    macro_engine->AddCode(code_address, data);
+}
+
+/// Records `data` as the position of the macro in the current entry slot, then advances the
+/// slot.
+void Maxwell3D::ProcessMacroBind(u32 data) {
+    // NOTE(review): the slot index comes straight from a register and is not range-checked
+    // against macro_positions here -- confirm that callers keep it in bounds.
+    const auto slot = regs.macros.entry++;
+    macro_positions[slot] = data;
+}
+
+/// Stub for firmware call 4: logs the call and writes the value 1 to register 0xd00.
+void Maxwell3D::ProcessFirmwareCall4() {
+    LOG_WARNING(HW_GPU, "(STUBBED) called");
+
+    // Firmware call 4 is a blob that changes some registers depending on its parameters.
+    // These registers don't affect emulation and so are stubbed by setting 0xd00 to 1.
+    constexpr u32 stub_register = 0xd00;
+    regs.reg_array[stub_register] = 1;
+}
+
+/// Writes a query result to the query address: either a 32-bit payload (short query) or a
+/// 16-byte record holding the payload and the current GPU tick count (long query).
+void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
+    struct LongQueryResult {
+        u64_le value;
+        u64_le timestamp;
+    };
+    static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size");
+
+    const GPUVAddr result_address{regs.query.QueryAddress()};
+    if (!long_query) {
+        memory_manager.Write<u32>(result_address, static_cast<u32>(payload));
+        return;
+    }
+
+    // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast
+    // GPU, this command may actually take a while to complete in real hardware due to GPU
+    // wait queues.
+    LongQueryResult stamped_result{payload, system.GPU().GetTicks()};
+    memory_manager.WriteBlock(result_address, &stamped_result, sizeof(stamped_result));
+}
+
+/// Handles a write to the query_get register by dispatching on the requested query operation.
+void Maxwell3D::ProcessQueryGet() {
+    // TODO(Subv): Support the other query units.
+    if (regs.query.query_get.unit != Regs::QueryUnit::Crop) {
+        LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented");
+    }
+
+    switch (regs.query.query_get.operation) {
+    case Regs::QueryOperation::Release: {
+        const bool uses_fence = regs.query.query_get.fence == 1;
+        if (!uses_fence) {
+            const bool long_query = regs.query.query_get.short_query == 0;
+            StampQueryResult(regs.query.query_sequence, long_query);
+            break;
+        }
+        rasterizer->SignalSemaphore(regs.query.QueryAddress(), regs.query.query_sequence);
+        break;
+    }
+    case Regs::QueryOperation::Acquire:
+        // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that
+        // matches the current payload.
+        UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");
+        break;
+    case Regs::QueryOperation::Counter: {
+        const std::optional<u64> counter_value = GetQueryResult();
+        // If the query returns an empty optional it means it's cached and deferred.
+        // Only a non-empty result is stamped immediately.
+        if (counter_value.has_value()) {
+            const bool long_query = regs.query.query_get.short_query == 0;
+            StampQueryResult(*counter_value, long_query);
+        }
+        break;
+    }
+    case Regs::QueryOperation::Trap:
+        UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unknown query operation");
+        break;
+    }
+}
+
+void Maxwell3D::ProcessQueryCondition() {
+    // Updates execute_on according to the configured condition mode. The comparing modes read
+    // a QueryCompare block from the condition address in guest memory.
+    const GPUVAddr condition_address{regs.condition.Address()};
+    const auto read_compare = [this, condition_address] {
+        Regs::QueryCompare compare;
+        memory_manager.ReadBlock(condition_address, &compare, sizeof(compare));
+        return compare;
+    };
+
+    switch (regs.condition.mode) {
+    case Regs::ConditionMode::Always:
+        execute_on = true;
+        break;
+    case Regs::ConditionMode::Never:
+        execute_on = false;
+        break;
+    case Regs::ConditionMode::ResNonZero: {
+        const Regs::QueryCompare compare = read_compare();
+        const bool sequence_set = compare.initial_sequence != 0U;
+        const bool mode_set = compare.initial_mode != 0U;
+        execute_on = sequence_set && mode_set;
+        break;
+    }
+    case Regs::ConditionMode::Equal: {
+        const Regs::QueryCompare compare = read_compare();
+        const bool same_sequence = compare.initial_sequence == compare.current_sequence;
+        const bool same_mode = compare.initial_mode == compare.current_mode;
+        execute_on = same_sequence && same_mode;
+        break;
+    }
+    case Regs::ConditionMode::NotEqual: {
+        const Regs::QueryCompare compare = read_compare();
+        const bool same_sequence = compare.initial_sequence == compare.current_sequence;
+        const bool same_mode = compare.initial_mode == compare.current_mode;
+        // Exact negation of the Equal case.
+        execute_on = !(same_sequence && same_mode);
+        break;
+    }
+    default:
+        // Unknown modes fall back to executing.
+        UNIMPLEMENTED_MSG("Uninplemented Condition Mode!");
+        execute_on = true;
+        break;
+    }
+}
+
+void Maxwell3D::ProcessCounterReset() {
+    // Only the sample counter reset is forwarded to the rasterizer; every other value is logged.
+    if (regs.counter_reset == Regs::CounterReset::SampleCnt) {
+        rasterizer->ResetCounter(QueryType::SamplesPassed);
+    } else {
+        LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.counter_reset);
+    }
+}
+
+void Maxwell3D::ProcessSyncPoint() {
+    // Signals the sync point named in sync_info, but only when its increment field is non-zero.
+    [[maybe_unused]] const u32 cache_flush = regs.sync_info.unknown.Value();
+    const u32 increment = regs.sync_info.increment.Value();
+    const u32 sync_point = regs.sync_info.sync_point.Value();
+    if (increment == 0) {
+        return;
+    }
+    rasterizer->SignalSyncPoint(sync_point);
+}
+
+void Maxwell3D::DrawArrays() {
+    LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(),
+              regs.vertex_buffer.count);
+    ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
+
+    // The two instancing control bits are mutually exclusive.
+    ASSERT_MSG(!regs.draw.instance_next || !regs.draw.instance_cont,
+               "Illegal combination of instancing parameters");
+
+    const bool advance_instance = regs.draw.instance_next != 0;
+    const bool continue_instance = regs.draw.instance_cont != 0;
+    if (advance_instance) {
+        // The instance counter is bumped *before* the draw is issued.
+        ++state.current_instance;
+    } else if (!continue_instance) {
+        // Neither "next" nor "continue" is set: start over from instance 0.
+        state.current_instance = 0;
+    }
+
+    // A draw is indexed when only the index array carries a count.
+    const bool is_indexed = regs.index_array.count && !regs.vertex_buffer.count;
+    if (ShouldExecute()) {
+        rasterizer->Draw(is_indexed, false);
+    }
+
+    // TODO(bunnei): The consumed count is cleared below so that these registers can be used to
+    // tell indexed from direct draws. This still needs to be verified on HW - it may be incorrect
+    // and some other register may be what selects the drawing mode.
+    if (!is_indexed) {
+        regs.vertex_buffer.count = 0;
+    } else {
+        regs.index_array.count = 0;
+    }
+}
+
+std::optional<u64> Maxwell3D::GetQueryResult() {
+    // Returns the value to stamp for the selected query, or an empty optional when the query
+    // has been handed to the rasterizer and will be resolved later.
+    const auto select = regs.query.query_get.select.Value();
+
+    if (select == Regs::QuerySelect::Zero) {
+        return u64{0};
+    }
+
+    if (select == Regs::QuerySelect::SamplesPassed) {
+        // Deferred.
+        rasterizer->Query(regs.query.QueryAddress(), QueryType::SamplesPassed,
+                          system.GPU().GetTicks());
+        return std::nullopt;
+    }
+
+    // Any other select type is unimplemented and reports 1.
+    LOG_DEBUG(HW_GPU, "Unimplemented query select type {}", select);
+    return u64{1};
+}
+
+void Maxwell3D::ProcessCBBind(std::size_t stage_index) {
+    // Binds the buffer currently described by the const_buffer registers to the slot that
+    // cb_bind[stage_index] selects in the given shader stage.
+    const auto& binding = regs.cb_bind[stage_index];
+    ASSERT(binding.index < Regs::MaxConstBuffers);
+
+    auto& slot = state.shader_stages[stage_index].const_buffers[binding.index];
+    slot.address = regs.const_buffer.BufferAddress();
+    slot.size = regs.const_buffer.cb_size;
+    slot.enabled = binding.valid.Value() != 0;
+}
+
+void Maxwell3D::ProcessCBData(u32 value) {
+    // Stages one word in the pending-upload buffer of the active cb_data slot.
+    auto& staged_words = cb_data_state.buffer[cb_data_state.id];
+    staged_words[cb_data_state.counter++] = value;
+
+    // Each staged word advances the const buffer write position by four bytes.
+    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
+}
+
+void Maxwell3D::StartCBData(u32 method) {
+    // Opens a new upload sequence for the cb_data register `method`, then stages the word that
+    // was just written to that register.
+    constexpr u32 cb_data_base = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]);
+
+    cb_data_state.current = method;
+    cb_data_state.id = method - cb_data_base;
+    cb_data_state.counter = 0;
+    // Remember where the sequence began; FinishCBData computes the upload range from it.
+    cb_data_state.start_pos = regs.const_buffer.cb_pos;
+
+    ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]);
+}
+
+void Maxwell3D::ProcessCBMultiData(u32 method, const u32* start_base, u32 amount) {
+    // Stages `amount` words from `start_base` for the cb_data register `method`. If a different
+    // register was being filled, its pending sequence is flushed first and a new one is opened.
+    const bool switching_register = cb_data_state.current != method;
+    if (switching_register) {
+        if (cb_data_state.current != null_cb_data) {
+            FinishCBData();
+        }
+        constexpr u32 cb_data_base = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]);
+        cb_data_state.current = method;
+        cb_data_state.id = method - cb_data_base;
+        cb_data_state.counter = 0;
+        cb_data_state.start_pos = regs.const_buffer.cb_pos;
+    }
+
+    auto& staged_words = cb_data_state.buffer[cb_data_state.id];
+    for (std::size_t word = 0; word < amount; ++word) {
+        staged_words[cb_data_state.counter++] = start_base[word];
+    }
+
+    // Every staged word advances the const buffer write position by four bytes.
+    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4 * amount;
+}
+
+void Maxwell3D::FinishCBData() {
+    // Flushes the words staged since the sequence started into the bound const buffer in guest
+    // memory, then marks the cb_data state as idle.
+    const GPUVAddr base_address = regs.const_buffer.BufferAddress();
+    ASSERT(base_address != 0);
+
+    // The write position must not have run past the end of the buffer.
+    ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size);
+
+    // The upload covers [start_pos, cb_pos) relative to the buffer base.
+    const std::size_t byte_count = regs.const_buffer.cb_pos - cb_data_state.start_pos;
+    const GPUVAddr write_address{base_address + cb_data_state.start_pos};
+
+    auto& staged_words = cb_data_state.buffer[cb_data_state.id];
+    memory_manager.WriteBlock(write_address, staged_words.data(), byte_count);
+    OnMemoryWrite();
+
+    cb_data_state.current = null_cb_data;
+    cb_data_state.id = null_cb_data;
+}
+
+Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
+    // Reads entry `tic_index` from the table that starts at regs.tic.Address().
+    const std::size_t entry_offset = tic_index * sizeof(Texture::TICEntry);
+    const GPUVAddr entry_address{regs.tic.Address() + entry_offset};
+
+    Texture::TICEntry entry;
+    memory_manager.ReadBlockUnsafe(entry_address, &entry, sizeof(entry));
+    return entry;
+}
+
+Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
+    // Reads entry `tsc_index` from the table that starts at regs.tsc.Address().
+    const std::size_t entry_offset = tsc_index * sizeof(Texture::TSCEntry);
+    const GPUVAddr entry_address{regs.tsc.Address() + entry_offset};
+
+    Texture::TSCEntry entry;
+    memory_manager.ReadBlockUnsafe(entry_address, &entry, sizeof(entry));
+    return entry;
+}
+
+u32 Maxwell3D::GetRegisterValue(u32 method) const {
+    // Returns the raw value of the register at index `method`, or 0 for an invalid index.
+    ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register");
+    if (method >= Regs::NUM_REGS) {
+        // A failed assertion is not relied upon to stop execution (other handlers in this file
+        // run fallback code after one), so never index past the end of the register array.
+        return 0;
+    }
+    return regs.reg_array[method];
+}
+
+void Maxwell3D::ProcessClearBuffers() {
+    // Clearing is delegated entirely to the rasterizer.
+    rasterizer->Clear();
+}
+
+u32 Maxwell3D::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const {
+    // Reads a 32-bit word at `offset` bytes into const buffer `const_buffer` of a graphics
+    // stage. Compute is not a valid stage for this engine.
+    ASSERT(stage != ShaderType::Compute);
+    const std::size_t stage_index = static_cast<std::size_t>(stage);
+    const GPUVAddr read_address =
+        state.shader_stages[stage_index].const_buffers[const_buffer].address + offset;
+    return memory_manager.Read<u32>(read_address);
+}
+
+SamplerDescriptor Maxwell3D::AccessBoundSampler(ShaderType stage, u64 offset) const {
+    // `offset` counts texture handles; convert it to bytes and look it up in the const buffer
+    // selected by regs.tex_cb_index.
+    const u64 byte_offset = offset * sizeof(Texture::TextureHandle);
+    return AccessBindlessSampler(stage, regs.tex_cb_index, byte_offset);
+}
+
+SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                                   u64 offset) const {
+    // Reads a 32-bit texture handle at `offset` bytes into const buffer `const_buffer` of the
+    // given graphics stage and resolves it to a sampler descriptor.
+    ASSERT(stage != ShaderType::Compute);
+    const std::size_t stage_index = static_cast<std::size_t>(stage);
+    const auto& handle_buffer = state.shader_stages[stage_index].const_buffers[const_buffer];
+    const GPUVAddr handle_address = handle_buffer.address + offset;
+    const u32 raw_handle = memory_manager.Read<u32>(handle_address);
+    return AccessSampler(raw_handle);
+}
+
+SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
+    // Splits the handle into its TIC and TSC ids and fetches both entries.
+    const Texture::TextureHandle unpacked{handle};
+    const Texture::TICEntry image_entry = GetTICEntry(unpacked.tic_id);
+    const Texture::TSCEntry sampler_entry = GetTSCEntry(unpacked.tsc_id);
+
+    // The descriptor is built from the TIC; the shadow flag is the only part taken from the TSC.
+    SamplerDescriptor descriptor = SamplerDescriptor::FromTIC(image_entry);
+    descriptor.is_shadow.Assign(sampler_entry.depth_compare_enabled.Value());
+    return descriptor;
+}
+
+VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() {
+    // The guest driver profile is owned by the rasterizer; this simply forwards to it.
+    VideoCore::GuestDriverProfile& profile = rasterizer->AccessGuestDriverProfile();
+    return profile;
+}
+
+const VideoCore::GuestDriverProfile& Maxwell3D::AccessGuestDriverProfile() const {
+    // Read-only counterpart of the overload above.
+    const VideoCore::GuestDriverProfile& profile = rasterizer->AccessGuestDriverProfile();
+    return profile;
+}
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -0,0 +1,219 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "core/settings.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/maxwell_dma.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_base.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra::Engines {
+
+using namespace Texture;
+
+// Stores references to the system and the GPU memory manager; neither is owned by the engine.
+MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_)
+    : system{system_}, memory_manager{memory_manager_} {
+}
+
+// Nothing to release explicitly.
+MaxwellDMA::~MaxwellDMA() = default;
+
+void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
+    // Stores `method_argument` in register `method`; a write to launch_dma starts the copy.
+    // `is_last_call` is not used by this engine.
+    ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register");
+    if (method >= NUM_REGS) {
+        // A failed assertion is not relied upon to stop execution (Launch() below returns
+        // explicitly after UNREACHABLE_MSG), so refuse to write outside the register array.
+        return;
+    }
+
+    regs.reg_array[method] = method_argument;
+
+    // Register indices are in 32-bit words, hence the division of the byte offset.
+    if (method == offsetof(Regs, launch_dma) / sizeof(u32)) {
+        Launch();
+    }
+}
+
+void MaxwellDMA::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                                 u32 methods_pending) {
+    // Replays `amount` consecutive writes to the same register through CallMethod. A write is
+    // flagged as the last call once at most one method remains pending.
+    for (u32 index = 0; index < amount; ++index) {
+        const u32 remaining = methods_pending - index;
+        const bool is_last = remaining <= 1;
+        CallMethod(method, base_start[index], is_last);
+    }
+}
+
+void MaxwellDMA::Launch() {
+    LOG_TRACE(Render_OpenGL, "DMA copy 0x{:x} -> 0x{:x}", static_cast<GPUVAddr>(regs.offset_in),
+              static_cast<GPUVAddr>(regs.offset_out));
+
+    // TODO(Subv): Perform more research and implement all features of this engine.
+    // The asserts below list the launch options that are not supported.
+    const LaunchDMA& launch = regs.launch_dma;
+    ASSERT(launch.remap_enable == 0);
+    ASSERT(launch.semaphore_type == LaunchDMA::SemaphoreType::NONE);
+    ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE);
+    ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED);
+    ASSERT(regs.dst_params.origin.x == 0);
+    ASSERT(regs.dst_params.origin.y == 0);
+
+    const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
+    const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
+
+    const bool both_block_linear = !is_src_pitch && !is_dst_pitch;
+    if (both_block_linear) {
+        // Block layout on both sides is not handled at all.
+        UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented");
+        return;
+    }
+
+    // All copies here update the main memory, so mark all rasterizer states as invalid.
+    system.GPU().Maxwell3D().OnMemoryWrite();
+
+    if (is_src_pitch && is_dst_pitch) {
+        CopyPitchToPitch();
+        return;
+    }
+
+    // From here on exactly one side is pitch linear.
+    ASSERT(launch.multi_line_enable == 1);
+    if (is_dst_pitch) {
+        CopyBlockLinearToPitch();
+    } else {
+        CopyPitchToBlockLinear();
+    }
+}
+
+void MaxwellDMA::CopyPitchToPitch() {
+    const GPUVAddr src_base = regs.offset_in;
+    const GPUVAddr dst_base = regs.offset_out;
+
+    // With `multi_line_enable` cleared the transfer is a flat 1D buffer of `line_length_in`
+    // bytes; with it set, a 2D image of dimensions (line_length_in, line_count) is copied.
+    const bool single_line = !regs.launch_dma.multi_line_enable;
+    if (single_line) {
+        memory_manager.CopyBlock(dst_base, src_base, regs.line_length_in);
+        return;
+    }
+
+    // Copy a subrect of size (line_length_in, line_count) one line at a time, stepping each
+    // side by its own pitch. CopyBlock takes care of flushing/invalidating the regions.
+    for (u32 row = 0; row < regs.line_count; ++row) {
+        const size_t row_index = static_cast<size_t>(row);
+        const GPUVAddr src_row = src_base + row_index * regs.pitch_in;
+        const GPUVAddr dst_row = dst_base + row_index * regs.pitch_out;
+        memory_manager.CopyBlock(dst_row, src_row, regs.line_length_in);
+    }
+}
+
+void MaxwellDMA::CopyBlockLinearToPitch() {
+    const Parameters& src_params = regs.src_params;
+    UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
+    UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0);
+    UNIMPLEMENTED_IF(regs.src_params.layer != 0);
+
+    // Micro copies (destination smaller than one GOB, pitch within one GOB row) take the
+    // optimized path.
+    const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
+    const bool is_micro_copy = dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X;
+    if (is_micro_copy) {
+        FastCopyBlockLinearToPitch();
+        return;
+    }
+
+    // NOTE(review): line_length_in comes straight from a register; a value of zero would
+    // divide by zero here - confirm callers can never launch with it.
+    const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in;
+    const u32 width = src_params.width;
+    const u32 height = src_params.height;
+    const u32 depth = src_params.depth;
+    const u32 block_height = src_params.block_size.height;
+    const u32 block_depth = src_params.block_size.depth;
+    const size_t src_size =
+        CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
+
+    // The scratch buffers are only ever grown so they can be reused across copies.
+    const auto ensure_capacity = [](std::vector<u8>& buffer, size_t required) {
+        if (buffer.size() < required) {
+            buffer.resize(required);
+        }
+    };
+    ensure_capacity(read_buffer, src_size);
+    ensure_capacity(write_buffer, dst_size);
+
+    // The destination is read back first so bytes outside the copied subrect are preserved
+    // when the whole range is written again below.
+    memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
+    memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+
+    // Deswizzle the source subrect into the pitch-linear destination image.
+    UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel,
+                     block_height, src_params.origin.x, src_params.origin.y, write_buffer.data(),
+                     read_buffer.data());
+
+    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+}
+
+void MaxwellDMA::CopyPitchToBlockLinear() {
+    UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
+
+    const auto& dst_params = regs.dst_params;
+    const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in;
+    const u32 width = dst_params.width;
+    const u32 height = dst_params.height;
+    const u32 depth = dst_params.depth;
+    const u32 block_height = dst_params.block_size.height;
+    const u32 block_depth = dst_params.block_size.depth;
+
+    // Size of the whole destination surface and of a single layer of it.
+    const size_t dst_size =
+        CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
+    const size_t dst_layer_size =
+        CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth);
+    const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count;
+
+    // The scratch buffers are only ever grown so they can be reused across copies.
+    const auto ensure_capacity = [](std::vector<u8>& buffer, size_t required) {
+        if (buffer.size() < required) {
+            buffer.resize(required);
+        }
+    };
+    ensure_capacity(read_buffer, src_size);
+    ensure_capacity(write_buffer, dst_size);
+
+    // The extreme GPU accuracy level uses the regular block reads; otherwise the unsafe
+    // variants are used.
+    const bool use_safe_reads = Settings::IsGPULevelExtreme();
+    if (use_safe_reads) {
+        memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
+        memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+    } else {
+        memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size);
+        memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
+    }
+
+    // The input is linear and the output is tiled: swizzle the input into the destination.
+    if (block_depth == 0) {
+        // No block depth: swizzle into the layer selected by dst_params.layer.
+        u8* const layer_base = write_buffer.data() + dst_layer_size * dst_params.layer;
+        SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel,
+                       layer_base, read_buffer.data(), block_height, dst_params.origin.x,
+                       dst_params.origin.y);
+    } else {
+        ASSERT(dst_params.layer == 0);
+        SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height,
+                            bytes_per_pixel, block_height, block_depth, dst_params.origin.x,
+                            dst_params.origin.y, write_buffer.data(), read_buffer.data());
+    }
+
+    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+}
+
+void MaxwellDMA::FastCopyBlockLinearToPitch() {
+    // Micro-copy path: only the single GOB containing the source origin is read and
+    // deswizzled.
+    const auto& src_params = regs.src_params;
+    const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in;
+    const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
+    const size_t src_size = GOB_SIZE;
+
+    u32 pos_x = src_params.origin.x;
+    u32 pos_y = src_params.origin.y;
+    // Offset of the GOB that holds the origin, relative to the start of the source surface.
+    const u64 gob_offset = GetGOBOffset(src_params.width, src_params.height, pos_x, pos_y,
+                                        src_params.block_size.height, bytes_per_pixel);
+
+    // Re-express the origin relative to that GOB.
+    const u32 x_in_gob = 64 / bytes_per_pixel;
+    pos_x %= x_in_gob;
+    pos_y %= 8;
+
+    // The scratch buffers are only ever grown so they can be reused across copies.
+    const auto ensure_capacity = [](std::vector<u8>& buffer, size_t required) {
+        if (buffer.size() < required) {
+            buffer.resize(required);
+        }
+    };
+    ensure_capacity(read_buffer, src_size);
+    ensure_capacity(write_buffer, dst_size);
+
+    const GPUVAddr gob_address = regs.offset_in + gob_offset;
+    if (Settings::IsGPULevelExtreme()) {
+        memory_manager.ReadBlock(gob_address, read_buffer.data(), src_size);
+        memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+    } else {
+        memory_manager.ReadBlockUnsafe(gob_address, read_buffer.data(), src_size);
+        memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
+    }
+
+    UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, src_params.width,
+                     bytes_per_pixel, src_params.block_size.height, pos_x, pos_y,
+                     write_buffer.data(), read_buffer.data());
+
+    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+}
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -0,0 +1,274 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/engines/engine_interface.h"
+#include "video_core/gpu.h"
+
+namespace Core {
+class System;
+}
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace Tegra::Engines {
+
+/**
+ * This engine is known as gk104_copy. Documentation can be found in:
+ * https://github.com/NVIDIA/open-gpu-doc/blob/master/classes/dma-copy/clb0b5.h
+ * https://github.com/envytools/envytools/blob/master/rnndb/fifo/gk104_copy.xml
+ */
+
+class MaxwellDMA final : public EngineInterface { // Copy engine: register writes are stored in `regs`; writing launch_dma starts a copy.
+public:
+    struct PackedGPUVAddr { // GPU virtual address split across two consecutive 32-bit registers.
+        u32 upper;
+        u32 lower;
+
+        constexpr operator GPUVAddr() const noexcept {
+            return (static_cast<GPUVAddr>(upper & 0xff) << 32) | lower; // only the low 8 bits of `upper` are used
+        }
+    };
+
+    union BlockSize {
+        BitField<0, 4, u32> width;
+        BitField<4, 4, u32> height;
+        BitField<8, 4, u32> depth;
+        BitField<12, 4, u32> gob_height;
+    };
+    static_assert(sizeof(BlockSize) == 4);
+
+    union Origin { // x in the low 16 bits, y in the high 16 bits
+        BitField<0, 16, u32> x;
+        BitField<16, 16, u32> y;
+    };
+    static_assert(sizeof(Origin) == 4);
+
+    struct Parameters { // Surface description; one instance each for the source and the destination (see Regs).
+        BlockSize block_size;
+        u32 width;
+        u32 height;
+        u32 depth;
+        u32 layer;
+        Origin origin;
+    };
+    static_assert(sizeof(Parameters) == 24);
+
+    struct Semaphore {
+        PackedGPUVAddr address;
+        u32 payload;
+    };
+    static_assert(sizeof(Semaphore) == 12);
+
+    struct RenderEnable {
+        enum class Mode : u32 {
+            // Note: This uses Pascal case in order to avoid the identifiers
+            // FALSE and TRUE, which are reserved on Darwin.
+            False = 0,
+            True = 1,
+            Conditional = 2,
+            RenderIfEqual = 3,
+            RenderIfNotEqual = 4,
+        };
+
+        PackedGPUVAddr address;
+        BitField<0, 3, Mode> mode;
+    };
+    static_assert(sizeof(RenderEnable) == 12);
+
+    enum class PhysModeTarget : u32 {
+        LOCAL_FB = 0,
+        COHERENT_SYSMEM = 1,
+        NONCOHERENT_SYSMEM = 2,
+    };
+    using PhysMode = BitField<0, 2, PhysModeTarget>;
+
+    union LaunchDMA { // Bit layout of the launch_dma register; all fields share one 32-bit word.
+        enum class DataTransferType : u32 {
+            NONE = 0,
+            PIPELINED = 1,
+            NON_PIPELINED = 2,
+        };
+
+        enum class SemaphoreType : u32 {
+            NONE = 0,
+            RELEASE_ONE_WORD_SEMAPHORE = 1,
+            RELEASE_FOUR_WORD_SEMAPHORE = 2,
+        };
+
+        enum class InterruptType : u32 {
+            NONE = 0,
+            BLOCKING = 1,
+            NON_BLOCKING = 2,
+        };
+
+        enum class MemoryLayout : u32 {
+            BLOCKLINEAR = 0,
+            PITCH = 1,
+        };
+
+        enum class Type : u32 {
+            VIRTUAL = 0,
+            PHYSICAL = 1,
+        };
+
+        enum class SemaphoreReduction : u32 {
+            IMIN = 0,
+            IMAX = 1,
+            IXOR = 2,
+            IAND = 3,
+            IOR = 4,
+            IADD = 5,
+            INC = 6,
+            DEC = 7,
+            FADD = 0xA,
+        };
+
+        enum class SemaphoreReductionSign : u32 {
+            SIGNED = 0,
+            UNSIGNED = 1,
+        };
+
+        enum class BypassL2 : u32 {
+            USE_PTE_SETTING = 0,
+            FORCE_VOLATILE = 1,
+        };
+
+        BitField<0, 2, DataTransferType> data_transfer_type;
+        BitField<2, 1, u32> flush_enable;
+        BitField<3, 2, SemaphoreType> semaphore_type;
+        BitField<5, 2, InterruptType> interrupt_type;
+        BitField<7, 1, MemoryLayout> src_memory_layout;
+        BitField<8, 1, MemoryLayout> dst_memory_layout;
+        BitField<9, 1, u32> multi_line_enable;
+        BitField<10, 1, u32> remap_enable;
+        BitField<11, 1, u32> rmwdisable;
+        BitField<12, 1, Type> src_type;
+        BitField<13, 1, Type> dst_type;
+        BitField<14, 4, SemaphoreReduction> semaphore_reduction;
+        BitField<18, 1, SemaphoreReductionSign> semaphore_reduction_sign;
+        BitField<19, 1, u32> reduction_enable;
+        BitField<20, 1, BypassL2> bypass_l2;
+    };
+    static_assert(sizeof(LaunchDMA) == 4);
+
+    struct RemapConst {
+        enum Swizzle : u32 {
+            SRC_X = 0,
+            SRC_Y = 1,
+            SRC_Z = 2,
+            SRC_W = 3,
+            CONST_A = 4,
+            CONST_B = 5,
+            NO_WRITE = 6,
+        };
+
+        PackedGPUVAddr address;
+
+        union {
+            BitField<0, 3, Swizzle> dst_x;
+            BitField<4, 3, Swizzle> dst_y;
+            BitField<8, 3, Swizzle> dst_z;
+            BitField<12, 3, Swizzle> dst_w;
+            BitField<16, 2, u32> component_size_minus_one;
+            BitField<20, 2, u32> num_src_components_minus_one;
+            BitField<24, 2, u32> num_dst_components_minus_one;
+        };
+    };
+    static_assert(sizeof(RemapConst) == 12);
+
+    explicit MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_); // both are kept as references
+    ~MaxwellDMA();
+
+    /// Write the value to the register identified by method. Writing the launch_dma register starts the copy.
+    void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
+
+    /// Write multiple values to the register identified by method, one CallMethod per value.
+    void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
+                         u32 methods_pending) override;
+
+private:
+    /// Performs the copy from the source buffer to the destination buffer as configured in the
+    /// registers. Dispatches to one of the Copy* helpers based on the source/destination layouts.
+    void Launch();
+
+    void CopyPitchToPitch(); // pitch-linear source and destination
+
+    void CopyBlockLinearToPitch(); // block-linear source, pitch-linear destination
+
+    void CopyPitchToBlockLinear(); // pitch-linear source, block-linear destination
+
+    void FastCopyBlockLinearToPitch(); // small-copy path used by CopyBlockLinearToPitch
+
+    Core::System& system;
+
+    MemoryManager& memory_manager;
+
+    std::vector<u8> read_buffer; // scratch for source data; grown on demand by the copy helpers
+    std::vector<u8> write_buffer; // scratch for destination data; grown on demand by the copy helpers
+
+    static constexpr std::size_t NUM_REGS = 0x800; // number of 32-bit registers in reg_array
+    struct Regs {
+        union { // named fields and reg_array alias the same storage
+            struct {
+                u32 reserved[0x40];
+                u32 nop;
+                u32 reserved01[0xf];
+                u32 pm_trigger;
+                u32 reserved02[0x3f];
+                Semaphore semaphore;
+                u32 reserved03[0x2];
+                RenderEnable render_enable;
+                PhysMode src_phys_mode;
+                PhysMode dst_phys_mode;
+                u32 reserved04[0x26];
+                LaunchDMA launch_dma;
+                u32 reserved05[0x3f];
+                PackedGPUVAddr offset_in;
+                PackedGPUVAddr offset_out;
+                u32 pitch_in;
+                u32 pitch_out;
+                u32 line_length_in;
+                u32 line_count;
+                u32 reserved06[0xb8];
+                RemapConst remap_const;
+                Parameters dst_params;
+                u32 reserved07[0x1];
+                Parameters src_params;
+                u32 reserved08[0x275];
+                u32 pm_trigger_end;
+                u32 reserved09[0x3ba];
+            };
+            std::array<u32, NUM_REGS> reg_array;
+        };
+    } regs{};
+
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(MaxwellDMA::Regs, field_name) == position * 4,                          \
+                  "Field " #field_name " has invalid position")
+
+    ASSERT_REG_POSITION(launch_dma, 0xC0); // positions are register indices (32-bit words), hence the `* 4` in the macro
+    ASSERT_REG_POSITION(offset_in, 0x100);
+    ASSERT_REG_POSITION(offset_out, 0x102);
+    ASSERT_REG_POSITION(pitch_in, 0x104);
+    ASSERT_REG_POSITION(pitch_out, 0x105);
+    ASSERT_REG_POSITION(line_length_in, 0x106);
+    ASSERT_REG_POSITION(line_count, 0x107);
+    ASSERT_REG_POSITION(remap_const, 0x1C0);
+    ASSERT_REG_POSITION(dst_params, 0x1C3);
+    ASSERT_REG_POSITION(src_params, 0x1CA);
+
+#undef ASSERT_REG_POSITION
+};
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
--- a/src/video_core/engines/shader_header.h
+++ b/src/video_core/engines/shader_header.h
@@ -0,0 +1,158 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <optional>
+
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra::Shader {
+
+enum class OutputTopology : u32 { // Values of the 4-bit output_topology field in Header::common3.
+    PointList = 1,
+    LineStrip = 6,
+    TriangleStrip = 7,
+};
+
+enum class PixelImap : u8 { // Per-component 2-bit value stored in Header::ps.imap_generic_vector.
+    Unused = 0, // components with this value are skipped by Header::ps.GetPixelImap
+    Constant = 1,
+    Perspective = 2,
+    ScreenLinear = 3,
+};
+
+// Documentation in:
+// http://download.nvidia.com/open-gpu-doc/Shader-Program-Header/1/Shader-Program-Header.html
+/// Shader program header. The layout is fixed at 0x50 bytes: five common words followed by a
+/// stage-dependent area viewed either as `vtg`, as `ps` or as raw words.
+struct Header {
+    union {
+        BitField<0, 5, u32> sph_type;
+        BitField<5, 5, u32> version;
+        BitField<10, 4, u32> shader_type;
+        BitField<14, 1, u32> mrt_enable;
+        BitField<15, 1, u32> kills_pixels;
+        BitField<16, 1, u32> does_global_store;
+        BitField<17, 4, u32> sass_version;
+        BitField<21, 5, u32> reserved;
+        BitField<26, 1, u32> does_load_or_store;
+        BitField<27, 1, u32> does_fp64;
+        BitField<28, 4, u32> stream_out_mask;
+    } common0;
+
+    union {
+        BitField<0, 24, u32> shader_local_memory_low_size;
+        BitField<24, 8, u32> per_patch_attribute_count;
+    } common1;
+
+    union {
+        BitField<0, 24, u32> shader_local_memory_high_size;
+        BitField<24, 8, u32> threads_per_input_primitive;
+    } common2;
+
+    union {
+        BitField<0, 24, u32> shader_local_memory_crs_size;
+        BitField<24, 4, OutputTopology> output_topology;
+        BitField<28, 4, u32> reserved;
+    } common3;
+
+    union {
+        BitField<0, 12, u32> max_output_vertices;
+        BitField<12, 8, u32> store_req_start; // NOTE: not used by geometry shaders.
+        BitField<20, 4, u32> reserved;
+        BitField<24, 8, u32> store_req_end; // NOTE: not used by geometry shaders.
+    } common4;
+
+    union {
+        struct {
+            INSERT_UNION_PADDING_BYTES(3);  // ImapSystemValuesA
+            INSERT_UNION_PADDING_BYTES(1);  // ImapSystemValuesB
+            INSERT_UNION_PADDING_BYTES(16); // ImapGenericVector[32]
+            INSERT_UNION_PADDING_BYTES(2);  // ImapColor
+            union {
+                BitField<0, 8, u16> clip_distances;
+                BitField<8, 1, u16> point_sprite_s;
+                BitField<9, 1, u16> point_sprite_t;
+                BitField<10, 1, u16> fog_coordinate;
+                BitField<12, 1, u16> tessellation_eval_point_u;
+                BitField<13, 1, u16> tessellation_eval_point_v;
+                BitField<14, 1, u16> instance_id;
+                BitField<15, 1, u16> vertex_id;
+            };
+            INSERT_UNION_PADDING_BYTES(5);  // ImapFixedFncTexture[10]
+            INSERT_UNION_PADDING_BYTES(1);  // ImapReserved
+            INSERT_UNION_PADDING_BYTES(3);  // OmapSystemValuesA
+            INSERT_UNION_PADDING_BYTES(1);  // OmapSystemValuesB
+            INSERT_UNION_PADDING_BYTES(16); // OmapGenericVector[32]
+            INSERT_UNION_PADDING_BYTES(2);  // OmapColor
+            INSERT_UNION_PADDING_BYTES(2);  // OmapSystemValuesC
+            INSERT_UNION_PADDING_BYTES(5);  // OmapFixedFncTexture[10]
+            INSERT_UNION_PADDING_BYTES(1);  // OmapReserved
+        } vtg;
+
+        struct {
+            INSERT_UNION_PADDING_BYTES(3); // ImapSystemValuesA
+            INSERT_UNION_PADDING_BYTES(1); // ImapSystemValuesB
+
+            union {
+                BitField<0, 2, PixelImap> x;
+                BitField<2, 2, PixelImap> y;
+                BitField<4, 2, PixelImap> z;
+                BitField<6, 2, PixelImap> w;
+                u8 raw;
+            } imap_generic_vector[32];
+
+            INSERT_UNION_PADDING_BYTES(2);  // ImapColor
+            INSERT_UNION_PADDING_BYTES(2);  // ImapSystemValuesC
+            INSERT_UNION_PADDING_BYTES(10); // ImapFixedFncTexture[10]
+            INSERT_UNION_PADDING_BYTES(2);  // ImapReserved
+
+            struct {
+                u32 target;
+                union {
+                    BitField<0, 1, u32> sample_mask;
+                    BitField<1, 1, u32> depth;
+                    BitField<2, 30, u32> reserved;
+                };
+            } omap;
+
+            /// Returns whether the given component (0-3) of the given render target is flagged
+            /// in omap.target. Each render target owns four consecutive bits of that mask.
+            bool IsColorComponentOutputEnabled(u32 render_target, u32 component) const {
+                const u32 bit = render_target * 4 + component;
+                // Shift an unsigned literal: bit 31 is a valid position and must not go through
+                // a signed shift.
+                return (omap.target & (1U << bit)) != 0;
+            }
+
+            /// Returns the interpolation mode shared by the used components of a generic
+            /// attribute, or PixelImap::Unused when no component is used. If components
+            /// disagree, the conflict is logged and the last used component wins.
+            PixelImap GetPixelImap(u32 attribute) const {
+                const auto get_index = [this, attribute](u32 index) {
+                    return static_cast<PixelImap>(
+                        (imap_generic_vector[attribute].raw >> (index * 2)) & 3);
+                };
+
+                std::optional<PixelImap> result;
+                for (u32 component = 0; component < 4; ++component) {
+                    const PixelImap index = get_index(component);
+                    if (index == PixelImap::Unused) {
+                        continue;
+                    }
+                    if (result && result != index) {
+                        LOG_CRITICAL(HW_GPU, "Generic attribute conflict in interpolation mode");
+                    }
+                    result = index;
+                }
+                return result.value_or(PixelImap::Unused);
+            }
+        } ps;
+
+        std::array<u32, 0xF> raw;
+    };
+
+    /// Combines the two 24-bit local memory size fields into one 48-bit value, with the low
+    /// size in bits 0-23 and the high size in bits 24-47.
+    u64 GetLocalMemorySize() const {
+        // Widen before shifting: a 32-bit shift would discard the upper 16 bits of the high size.
+        const u64 low_size = static_cast<u64>(common1.shader_local_memory_low_size.Value());
+        const u64 high_size = static_cast<u64>(common2.shader_local_memory_high_size.Value());
+        return low_size | (high_size << 24);
+    }
+};
+static_assert(sizeof(Header) == 0x50, "Incorrect structure size");
+
+} // namespace Tegra::Shader
--- a/src/video_core/engines/shader_type.h
+++ b/src/video_core/engines/shader_type.h
@@ -0,0 +1,21 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+
+enum class ShaderType : u32 { // Shader stage identifier; Maxwell3D casts it to an index into state.shader_stages.
+    Vertex = 0,
+    TesselationControl = 1,
+    TesselationEval = 2,
+    Geometry = 3,
+    Fragment = 4,
+    Compute = 5, // rejected by the Maxwell3D const buffer / sampler accessors
+};
+static constexpr std::size_t MaxShaderTypes = 6; // number of enumerators in ShaderType
+
+} // namespace Tegra::Engines