Merge pull request #3612 from ReinUsesLisp/red

shader/memory: Implement RED.E.ADD and minor changes to ATOM
2025-11-29 12:45:48 -08:00 · 2020-04-15 15:03:49 -04:00
parent 4398bdb4c7 3185245845
commit e33196d4e7
5 changed files with 140 additions and 69 deletions
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -1005,6 +1005,12 @@ union Instruction {
        BitField<46, 2, u64> cache_mode;
    } stg;

+    union {
+        BitField<23, 3, AtomicOp> operation;
+        BitField<48, 1, u64> extended;
+        BitField<20, 3, GlobalAtomicType> type;
+    } red;
+
    union {
        BitField<52, 4, AtomicOp> operation;
        BitField<49, 3, GlobalAtomicType> type;
@@ -1787,6 +1793,7 @@ public:
        ST_S,
        ST,    // Store in generic memory
        STG,   // Store in global memory
+        RED,   // Reduction operation
        ATOM,  // Atomic operation on global memory
        ATOMS, // Atomic operation on shared memory
        AL2P,  // Transforms attribute memory into physical memory
@@ -2097,6 +2104,7 @@ private:
            INST("1110111101010---", Id::ST_L, Type::Memory, "ST_L"),
            INST("101-------------", Id::ST, Type::Memory, "ST"),
            INST("1110111011011---", Id::STG, Type::Memory, "STG"),
+            INST("1110101111111---", Id::RED, Type::Memory, "RED"),
            INST("11101101--------", Id::ATOM, Type::Memory, "ATOM"),
            INST("11101100--------", Id::ATOMS, Type::Memory, "ATOMS"),
            INST("1110111110100---", Id::AL2P, Type::Memory, "AL2P"),
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -2119,8 +2119,14 @@ private:
            return {};
        }
        return {fmt::format("atomic{}({}, {})", opname, Visit(operation[0]).GetCode(),
-                            Visit(operation[1]).As(type)),
-                type};
+                            Visit(operation[1]).AsUint()),
+                Type::Uint};
+    }
+
+    template <const std::string_view& opname, Type type>
+    Expression Reduce(Operation operation) {
+        code.AddLine("{};", Atomic<opname, type>(operation).GetCode());
+        return {};
    }

    Expression Branch(Operation operation) {
@@ -2479,6 +2485,20 @@ private:
        &GLSLDecompiler::Atomic<Func::Or, Type::Int>,
        &GLSLDecompiler::Atomic<Func::Xor, Type::Int>,

+        &GLSLDecompiler::Reduce<Func::Add, Type::Uint>,
+        &GLSLDecompiler::Reduce<Func::Min, Type::Uint>,
+        &GLSLDecompiler::Reduce<Func::Max, Type::Uint>,
+        &GLSLDecompiler::Reduce<Func::And, Type::Uint>,
+        &GLSLDecompiler::Reduce<Func::Or, Type::Uint>,
+        &GLSLDecompiler::Reduce<Func::Xor, Type::Uint>,
+
+        &GLSLDecompiler::Reduce<Func::Add, Type::Int>,
+        &GLSLDecompiler::Reduce<Func::Min, Type::Int>,
+        &GLSLDecompiler::Reduce<Func::Max, Type::Int>,
+        &GLSLDecompiler::Reduce<Func::And, Type::Int>,
+        &GLSLDecompiler::Reduce<Func::Or, Type::Int>,
+        &GLSLDecompiler::Reduce<Func::Xor, Type::Int>,
+
        &GLSLDecompiler::Branch,
        &GLSLDecompiler::BranchIndirect,
        &GLSLDecompiler::PushFlowStack,
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -1938,11 +1938,8 @@ private:
        return {};
    }

-    template <Id (Module::*func)(Id, Id, Id, Id, Id), Type result_type,
-              Type value_type = result_type>
+    template <Id (Module::*func)(Id, Id, Id, Id, Id)>
    Expression Atomic(Operation operation) {
-        const Id type_def = GetTypeDefinition(result_type);
-
        Id pointer;
        if (const auto smem = std::get_if<SmemNode>(&*operation[0])) {
            pointer = GetSharedMemoryPointer(*smem);
@@ -1950,15 +1947,19 @@ private:
            pointer = GetGlobalMemoryPointer(*gmem);
        } else {
            UNREACHABLE();
-            return {Constant(type_def, 0), result_type};
+            return {v_float_zero, Type::Float};
        }
-
-        const Id value = As(Visit(operation[1]), value_type);
-
        const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
-        const Id semantics = Constant(type_def, 0);
+        const Id semantics = Constant(t_uint, 0);
+        const Id value = AsUint(Visit(operation[1]));

-        return {(this->*func)(type_def, pointer, scope, semantics, value), result_type};
+        return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
+    }
+
+    template <Id (Module::*func)(Id, Id, Id, Id, Id)>
+    Expression Reduce(Operation operation) {
+        Atomic<func>(operation);
+        return {};
    }

    Expression Branch(Operation operation) {
@@ -2547,21 +2548,35 @@ private:
        &SPIRVDecompiler::AtomicImageXor,
        &SPIRVDecompiler::AtomicImageExchange,

-        &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicUMin, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicUMax, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicAnd, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicOr, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicXor, Type::Uint>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicUMin>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicUMax>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicAnd>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicOr>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicXor>,

-        &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicSMin, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicSMax, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicAnd, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicOr, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicXor, Type::Int>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicSMin>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicSMax>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicAnd>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicOr>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicXor>,
+
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicIAdd>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicUMin>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicUMax>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicAnd>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicOr>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicXor>,
+
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicIAdd>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicSMin>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicSMax>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicAnd>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicOr>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicXor>,

        &SPIRVDecompiler::Branch,
        &SPIRVDecompiler::BranchIndirect,
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -3,7 +3,9 @@
 // Refer to the license.txt file included.

 #include <algorithm>
+#include <utility>
 #include <vector>
+
 #include <fmt/format.h>

 #include "common/alignment.h"
@@ -16,6 +18,7 @@

 namespace VideoCommon::Shader {

+using std::move;
 using Tegra::Shader::AtomicOp;
 using Tegra::Shader::AtomicType;
 using Tegra::Shader::Attribute;
@@ -27,29 +30,26 @@ using Tegra::Shader::StoreType;

 namespace {

-Node GetAtomOperation(AtomicOp op, bool is_signed, Node memory, Node data) {
-    const OperationCode operation_code = [op] {
-        switch (op) {
-        case AtomicOp::Add:
-            return OperationCode::AtomicIAdd;
-        case AtomicOp::Min:
-            return OperationCode::AtomicIMin;
-        case AtomicOp::Max:
-            return OperationCode::AtomicIMax;
-        case AtomicOp::And:
-            return OperationCode::AtomicIAnd;
-        case AtomicOp::Or:
-            return OperationCode::AtomicIOr;
-        case AtomicOp::Xor:
-            return OperationCode::AtomicIXor;
-        case AtomicOp::Exch:
-            return OperationCode::AtomicIExchange;
-        default:
-            UNIMPLEMENTED_MSG("op={}", static_cast<int>(op));
-            return OperationCode::AtomicIAdd;
-        }
-    }();
-    return SignedOperation(operation_code, is_signed, std::move(memory), std::move(data));
+OperationCode GetAtomOperation(AtomicOp op) {
+    switch (op) {
+    case AtomicOp::Add:
+        return OperationCode::AtomicIAdd;
+    case AtomicOp::Min:
+        return OperationCode::AtomicIMin;
+    case AtomicOp::Max:
+        return OperationCode::AtomicIMax;
+    case AtomicOp::And:
+        return OperationCode::AtomicIAnd;
+    case AtomicOp::Or:
+        return OperationCode::AtomicIOr;
+    case AtomicOp::Xor:
+        return OperationCode::AtomicIXor;
+    case AtomicOp::Exch:
+        return OperationCode::AtomicIExchange;
+    default:
+        UNIMPLEMENTED_MSG("op={}", static_cast<int>(op));
+        return OperationCode::AtomicIAdd;
+    }
 }

 bool IsUnaligned(Tegra::Shader::UniformType uniform_type) {
@@ -90,23 +90,22 @@ u32 GetMemorySize(Tegra::Shader::UniformType uniform_type) {

 Node ExtractUnaligned(Node value, Node address, u32 mask, u32 size) {
    Node offset = Operation(OperationCode::UBitwiseAnd, address, Immediate(mask));
-    offset = Operation(OperationCode::ULogicalShiftLeft, std::move(offset), Immediate(3));
-    return Operation(OperationCode::UBitfieldExtract, std::move(value), std::move(offset),
-                     Immediate(size));
+    offset = Operation(OperationCode::ULogicalShiftLeft, move(offset), Immediate(3));
+    return Operation(OperationCode::UBitfieldExtract, move(value), move(offset), Immediate(size));
 }

 Node InsertUnaligned(Node dest, Node value, Node address, u32 mask, u32 size) {
-    Node offset = Operation(OperationCode::UBitwiseAnd, std::move(address), Immediate(mask));
-    offset = Operation(OperationCode::ULogicalShiftLeft, std::move(offset), Immediate(3));
-    return Operation(OperationCode::UBitfieldInsert, std::move(dest), std::move(value),
-                     std::move(offset), Immediate(size));
+    Node offset = Operation(OperationCode::UBitwiseAnd, move(address), Immediate(mask));
+    offset = Operation(OperationCode::ULogicalShiftLeft, move(offset), Immediate(3));
+    return Operation(OperationCode::UBitfieldInsert, move(dest), move(value), move(offset),
+                     Immediate(size));
 }

 Node Sign16Extend(Node value) {
    Node sign = Operation(OperationCode::UBitwiseAnd, value, Immediate(1U << 15));
-    Node is_sign = Operation(OperationCode::LogicalUEqual, std::move(sign), Immediate(1U << 15));
+    Node is_sign = Operation(OperationCode::LogicalUEqual, move(sign), Immediate(1U << 15));
    Node extend = Operation(OperationCode::Select, is_sign, Immediate(0xFFFF0000), Immediate(0));
-    return Operation(OperationCode::UBitwiseOr, std::move(value), std::move(extend));
+    return Operation(OperationCode::UBitwiseOr, move(value), move(extend));
 }

 } // Anonymous namespace
@@ -379,20 +378,36 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {

            if (IsUnaligned(type)) {
                const u32 mask = GetUnalignedMask(type);
-                value = InsertUnaligned(gmem, std::move(value), real_address, mask, size);
+                value = InsertUnaligned(gmem, move(value), real_address, mask, size);
            }

            bb.push_back(Operation(OperationCode::Assign, gmem, value));
        }
        break;
    }
+    case OpCode::Id::RED: {
+        UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32);
+        UNIMPLEMENTED_IF_MSG(instr.red.operation != AtomicOp::Add);
+        const auto [real_address, base_address, descriptor] =
+            TrackGlobalMemory(bb, instr, true, true);
+        if (!real_address || !base_address) {
+            // Tracking failed, skip atomic.
+            break;
+        }
+        Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
+        Node value = GetRegister(instr.gpr0);
+        bb.push_back(Operation(OperationCode::ReduceIAdd, move(gmem), move(value)));
+        break;
+    }
    case OpCode::Id::ATOM: {
        UNIMPLEMENTED_IF_MSG(instr.atom.operation == AtomicOp::Inc ||
                                 instr.atom.operation == AtomicOp::Dec ||
                                 instr.atom.operation == AtomicOp::SafeAdd,
                             "operation={}", static_cast<int>(instr.atom.operation.Value()));
        UNIMPLEMENTED_IF_MSG(instr.atom.type == GlobalAtomicType::S64 ||
-                                 instr.atom.type == GlobalAtomicType::U64,
+                                 instr.atom.type == GlobalAtomicType::U64 ||
+                                 instr.atom.type == GlobalAtomicType::F16x2_FTZ_RN ||
+                                 instr.atom.type == GlobalAtomicType::F32_FTZ_RN,
                             "type={}", static_cast<int>(instr.atom.type.Value()));

        const auto [real_address, base_address, descriptor] =
@@ -403,11 +418,11 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
        }

        const bool is_signed =
-            instr.atoms.type == AtomicType::S32 || instr.atoms.type == AtomicType::S64;
+            instr.atom.type == GlobalAtomicType::S32 || instr.atom.type == GlobalAtomicType::S64;
        Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
-        Node value = GetAtomOperation(static_cast<AtomicOp>(instr.atom.operation), is_signed, gmem,
-                                      GetRegister(instr.gpr20));
-        SetRegister(bb, instr.gpr0, std::move(value));
+        SetRegister(bb, instr.gpr0,
+                    SignedOperation(GetAtomOperation(instr.atom.operation), is_signed, gmem,
+                                    GetRegister(instr.gpr20)));
        break;
    }
    case OpCode::Id::ATOMS: {
@@ -421,11 +436,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
            instr.atoms.type == AtomicType::S32 || instr.atoms.type == AtomicType::S64;
        const s32 offset = instr.atoms.GetImmediateOffset();
        Node address = GetRegister(instr.gpr8);
-        address = Operation(OperationCode::IAdd, std::move(address), Immediate(offset));
-        Node value =
-            GetAtomOperation(static_cast<AtomicOp>(instr.atoms.operation), is_signed,
-                             GetSharedMemory(std::move(address)), GetRegister(instr.gpr20));
-        SetRegister(bb, instr.gpr0, std::move(value));
+        address = Operation(OperationCode::IAdd, move(address), Immediate(offset));
+        SetRegister(bb, instr.gpr0,
+                    SignedOperation(GetAtomOperation(instr.atoms.operation), is_signed,
+                                    GetSharedMemory(move(address)), GetRegister(instr.gpr20)));
        break;
    }
    case OpCode::Id::AL2P: {
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -178,6 +178,20 @@ enum class OperationCode {
    AtomicIOr,       /// (memory, int) -> int
    AtomicIXor,      /// (memory, int) -> int

+    ReduceUAdd, /// (memory, uint) -> void
+    ReduceUMin, /// (memory, uint) -> void
+    ReduceUMax, /// (memory, uint) -> void
+    ReduceUAnd, /// (memory, uint) -> void
+    ReduceUOr,  /// (memory, uint) -> void
+    ReduceUXor, /// (memory, uint) -> void
+
+    ReduceIAdd, /// (memory, int) -> void
+    ReduceIMin, /// (memory, int) -> void
+    ReduceIMax, /// (memory, int) -> void
+    ReduceIAnd, /// (memory, int) -> void
+    ReduceIOr,  /// (memory, int) -> void
+    ReduceIXor, /// (memory, int) -> void
+
    Branch,         /// (uint branch_target) -> void
    BranchIndirect, /// (uint branch_target) -> void
    PushFlowStack,  /// (uint branch_target) -> void