Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compile pass for hipBLASLt #3594

Merged
merged 15 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/targets/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ add_library(migraphx_gpu
compile_gen.cpp
compile_hip.cpp
compile_hip_code_object.cpp
compile_hipblaslt.cpp
compile_miopen.cpp
compile_pointwise.cpp
compiler.cpp
Expand Down
78 changes: 78 additions & 0 deletions src/targets/gpu/compile_hipblaslt.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

#if MIGRAPHX_USE_HIPBLASLT
#include <migraphx/gpu/compile_hipblaslt.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/module.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

// Compile the wrapped operation for the given instruction and report the
// workspace size (in bytes) its compiled value carries; defaults to 0 when
// no "workspace" entry is present.
static size_t compile(migraphx::context& ctx, operation& op, instruction_ref ins)
{
    const auto compiled = op.compile(ctx, ins->get_shape(), to_shapes(ins->inputs()));
    return compiled.get<std::size_t>("workspace", 0);
}

// Finalize every gpu::hipblaslt_op in the module: compile the wrapped op to
// learn its actual workspace requirement, then replace the instruction with
// the wrapped op plus a correctly sized workspace allocation.
void compile_hipblaslt::apply(module& m) const
{
    assert(ctx);
    for(auto ins : iterator_for(m))
    {
        if(ins->name() != "gpu::hipblaslt_op")
            continue;
        auto op     = any_cast<hipblaslt_op>(ins->get_operator()).op;
        auto inputs = ins->inputs();

        // Start with the default (maximum) workspace size so the op can be
        // compiled with a valid workspace argument in place.
        std::size_t ws = hipblaslt_workspace_size;

        auto alloc = m.insert_instruction(
            ins, make_op("allocate", {{"shape", to_value(shape{shape::uint8_type, {ws}})}}));
        // The workspace goes just before the output argument (the last input).
        inputs.insert(std::prev(inputs.end()), alloc);
        m.replace_instruction(ins, op, inputs);

        // Calculate the actual workspace size needed by the compiled kernel
        ws = compile(*ctx, op, ins);
        auto alloc_after = m.insert_instruction(
            ins, make_op("allocate", {{"shape", to_value(shape{shape::uint8_type, {ws}})}}));

        // Replace the default-sized workspace allocation with the actual
        // workspace size needed.
        auto it = std::find(inputs.begin(), inputs.end(), alloc);
        if(it != inputs.end())
        {
            *it = alloc_after; // Replace `alloc` with `alloc_after`
        }
        m.replace_instruction(ins, op, inputs);
    }
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_USE_HIPBLASLT
70 changes: 68 additions & 2 deletions src/targets/gpu/hip_gemm_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,8 @@ struct hip_gemm_impl
const std::vector<argument>& args,
int32_t solution_idx)
{
auto* algo = &solution.get_result(ctx, *this, solution_idx)[0].algo;
auto* algo = &solution.get_result(ctx, *this, solution_idx)[0].algo;
size_t workspace_size = ((is_3inputs ? args[3] : args[2]).get_shape()).bytes();
return pack(ctx.get_stream().get_hipblaslt(),
hipblaslt_desc,
get_alpha(), // alpha
Expand All @@ -414,7 +415,7 @@ struct hip_gemm_impl
is_3inputs ? mat_d : mat_c, // Ddesc
algo, // algo
is_3inputs ? args[3].data() : args[2].data(), // workspace
algo->max_workspace_bytes, // workspaceSizeInBytes
workspace_size, // workspaceSizeInBytes
ctx.get_stream().get() // stream
);
}
Expand Down Expand Up @@ -478,6 +479,53 @@ struct hip_gemm_impl
return solution_idx;
}

/**
* Get workspace size for the solution index: Gets algo from the solution index,
* and calls matmulIsAlgoSupported() to get the workspace size.
*/

size_t get_workspace_size(context& ctx,
                          const std::vector<shape>& input_shapes,
                          int32_t solution_idx) const
{
    // Fall back to the default workspace size unless the solution reports a
    // usable (non-zero) size below.
    size_t workspace_size = hipblaslt_workspace_size;
    std::vector<argument> input_args;
    std::transform(input_shapes.begin(),
                   input_shapes.end(),
                   std::back_inserter(input_args),
                   [](const shape& x) { return to_gpu(generate_argument(x)); });

    // Recover the algo handle corresponding to this solution index.
    std::vector<int32_t> algo_index = {solution_idx};
    std::vector<hipblasLtMatmulHeuristicResult_t> heuristic_result;

    hipblaslt_invoke([&]() {
        return hipblaslt_ext::getAlgosFromIndex(
            ctx.get_stream().get_hipblaslt(), algo_index, heuristic_result);
    });
    assert(heuristic_result.size() == 1);

    auto algo                 = heuristic_result[0].algo;
    size_t ret_workspace_size = 0;
    auto supporting_args =
        create_hipblaslt_supporting_args_common(ctx, input_args, algo, ret_workspace_size);

    auto status =
        hipblaslt_invoke(&hipblaslt_ext::matmulIsAlgoSupported, supporting_args, false);

    // If algo is supported, update the workspace size to the actual size needed.
    // Otherwise, use the default workspace size.
    if(status == HIPBLAS_STATUS_SUCCESS)
    {
        // TODO: Remove this check once issues with '0' workspace size are resolved.
        // Temporarily, we use the approach where, if the returned workspace size
        // is '0', we use the default workspace size.
        // Otherwise, we use the returned workspace size.
        if(ret_workspace_size != 0)
            workspace_size = ret_workspace_size;
    }
    return workspace_size;
}

/**
* Find best hipBLASLt solution: Get list of solutions and try them all, returning the index
* of the fastest one.
Expand Down Expand Up @@ -531,6 +579,13 @@ struct hip_gemm_impl

// Initialize to default solution index
int32_t best_sol = 0;
// If no valid/supported solution is returned, use hipblasLtMatmulAlgoGetHeuristic
// to get an algo and use solution index from that algo.
if(solution_indices.empty())
{
auto algo = solution.get_result(ctx, *this, 0)[0].algo;
solution_indices.push_back(hipblaslt_ext::getIndexFromAlgo(algo));
}
for(auto sol : solution_indices)
{
// Warmup: the first call to an op. may not be representative since there is
Expand Down Expand Up @@ -664,6 +719,17 @@ int32_t hip_gemm_default_solution(context& ctx,
return 0;
}

// Compute the hipBLASLt workspace size (in bytes) required by the gemm
// described by the given output/input shapes for the given solution index.
size_t hip_gemm_workspace_size(context& ctx,
                               const shape& output_shape,
                               const std::vector<shape>& input_shapes,
                               float alpha,
                               float beta,
                               int32_t solution_idx)
{
    hip_gemm_impl gemm{output_shape, input_shapes, alpha, beta};
    return gemm.get_workspace_size(ctx, input_shapes, solution_idx);
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
Expand Down
77 changes: 77 additions & 0 deletions src/targets/gpu/include/migraphx/gpu/compile_hipblaslt.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_HIPBLASLT_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_HIPBLASLT_HPP

#include <migraphx/config.hpp>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/register_op.hpp>
#include <string>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

struct module;
struct context;
struct operation;

namespace gpu {

// Wrapper operation that carries a hipBLASLt-backed op (e.g. gpu::hip_gemm)
// through lowering until the compile_hipblaslt pass materializes its
// workspace allocation.
struct hipblaslt_op
{
// The wrapped operation; defaults to identity until set during lowering.
operation op = op::identity{};

// Reflection hook for serialization: exposes the wrapped op under "op".
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.op, "op"));
}

std::string name() const { return "gpu::hipblaslt_op"; }

// The wrapped op expects one more argument than this wrapper receives
// (the workspace buffer is inserted later by compile_hipblaslt), so the
// last input is duplicated as a stand-in before delegating.
// NOTE(review): assumes the wrapped op's compute_shape only checks the
// argument count/shape of the extra slot -- confirm against hip_gemm.
shape compute_shape(std::vector<shape> inputs) const
{
inputs.push_back(inputs.back());
return op.compute_shape(inputs);
}

// The output aliases the last input (the preallocated output buffer).
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
MIGRAPHX_REGISTER_OP(hipblaslt_op);

// Compiler pass that finalizes gpu::hipblaslt_op instructions: compiles the
// wrapped op and inserts a correctly sized workspace allocation.
struct compile_hipblaslt
{
// Non-owning pointer to the GPU compile context; must be set before apply().
context* ctx = nullptr;
std::string name() const { return "gpu::compile_hipblaslt"; }
void apply(module& m) const;
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_HIPBLASLT_HPP
9 changes: 9 additions & 0 deletions src/targets/gpu/include/migraphx/gpu/hip_gemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,15 @@ struct hip_gemm
hip_gemm_finalize(ctx, output_shape, input_shapes, alpha, beta, solution_idx);
}
}

// Finalize (tune) the gemm for the given shapes, then report the workspace
// size required by the chosen solution as a compile-time value.
value
compile(migraphx::context& ctx, const shape& output, const std::vector<shape>& input_shapes)
{
    auto& gctx = any_cast<migraphx::gpu::context>(ctx);
    finalize(gctx, output, input_shapes);
    const size_t ws =
        hip_gemm_workspace_size(gctx, output, input_shapes, alpha, beta, solution_idx);
    return {{"workspace", ws}};
}
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
Expand Down
7 changes: 7 additions & 0 deletions src/targets/gpu/include/migraphx/gpu/hip_gemm_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@ int32_t hip_gemm_default_solution(context& ctx,
const shape& output_shape,
const std::vector<shape>& input_shapes);

// Returns the hipBLASLt workspace size (in bytes) required by the gemm
// described by the given output/input shapes for the given solution index.
size_t hip_gemm_workspace_size(context& ctx,
                               const shape& output_shape,
                               const std::vector<shape>& input_shapes,
                               float alpha,
                               float beta,
                               int32_t solution_idx);

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
Expand Down
21 changes: 12 additions & 9 deletions src/targets/gpu/lowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,14 +251,6 @@ struct miopen_apply
apply_map.emplace(name, [=](instruction_ref ins) {
std::vector<instruction_ref> refs = ins->inputs();
assert(refs.size() == 2);
#if MIGRAPHX_USE_HIPBLASLT
if(enabled(MIGRAPHX_ENABLE_HIPBLASLT_GEMM{}))
{
shape workspace_shape{shape::uint8_type, {hipblaslt_workspace_size}};
auto workspace = insert_allocation(ins, workspace_shape);
refs.push_back(workspace);
}
#endif
auto output = insert_allocation(ins, ins->get_shape());
refs.push_back(output);
#if MIGRAPHX_USE_HIPBLASLT
Expand All @@ -269,7 +261,18 @@ struct miopen_apply
ins, rocblas_gemm<Op>{Op{}, 1, 0, compute_fp32}, refs);
#if MIGRAPHX_USE_HIPBLASLT
}
return mod->replace_instruction(ins, hip_gemm<Op>{Op{}, 1, 0}, refs);
std::string op_name = "gpu::hip_gemm";
if(contains(name, "quant_"))
{
op_name = "gpu::hip_quant_gemm";
}
operation gemm_op = make_op(op_name);
return mod->replace_instruction(
ins,
make_op("gpu::hipblaslt_op", {{"op", to_value(gemm_op)}}),
ins->inputs().at(0),
ins->inputs().at(1),
output);
#endif
});
}
Expand Down
5 changes: 5 additions & 0 deletions src/targets/gpu/target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
#include <migraphx/split_reduce.hpp>
#include <migraphx/split_single_dyn_dim.hpp>
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/gpu/compile_hipblaslt.hpp>
#include <migraphx/gpu/compile_miopen.hpp>
#include <migraphx/gpu/compile_ops.hpp>
#include <migraphx/gpu/concat_gpu_opt.hpp>
Expand Down Expand Up @@ -224,6 +225,10 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
fuse_ops{&ctx, options.fast_math},
dead_code_elimination{},
#if MIGRAPHX_USE_HIPBLASLT
compile_hipblaslt{&gctx},
dead_code_elimination{},
#endif
replace_allocate{gpu_allocation_model{}, options.offload_copy},
dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
Expand Down
Loading
Loading