2 changes: 1 addition & 1 deletion src/nnfusion/common/type/element_type.cpp
@@ -59,7 +59,7 @@ bool element::Type::nnfusion_element_type_to_dtype_string(const element::Type& ng_et,
                                                            std::string& dtype)
 {
     if (ng_et == element::boolean)
-        dtype = "char";
+        dtype = "int";
     else if (ng_et == element::character)
         dtype = "char";
     else if (ng_et == element::f16)
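
A minimal usage sketch (the caller and the success return value are assumptions, not part of this PR): after the change, element::boolean lowers to the C dtype string "int", while the element::character type keeps "char".

    std::string dtype;
    // boolean now emits "int" in generated source
    element::Type::nnfusion_element_type_to_dtype_string(element::boolean, dtype);   // dtype == "int"
    // the character type retains the old "char" mapping
    element::Type::nnfusion_element_type_to_dtype_string(element::character, dtype); // dtype == "char"
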
25 changes: 25 additions & 0 deletions src/nnfusion/core/graph/gnode.cpp
@@ -416,6 +416,7 @@ void FusedGNode::set_inputs_and_outputs(std::shared_ptr<Graph> graph)
         m_op_ctxs.push_back(ctx);
     }
 
+    std::unordered_map<std::shared_ptr<GNode>, std::unordered_map<size_t, size_t>> input_id_map;
     // Register input tensors
     for (const auto& m_node : m_order_nodes)
     {
@@ -430,6 +431,7 @@ void FusedGNode::set_inputs_and_outputs(std::shared_ptr<Graph> graph)
             set_input(input_id, m_node->get_inputs().at(in_edge->get_dst_input()));
             graph->add_edge(
                 in_edge->get_src(), in_edge->get_src_output(), shared_from_this(), input_id);
+            input_id_map[m_node][in_edge->get_dst_input()] = input_id;
         }
     }
     // Add control-edges as inputs of fused node
@@ -461,6 +463,29 @@ void FusedGNode::set_inputs_and_outputs(std::shared_ptr<Graph> graph)
                 has_output = true;
                 set_output(get_output_size(),
                            m_node->get_outputs().at(out_edge->get_src_output()));
+
+                // get inplace annotation
+                auto op = std::dynamic_pointer_cast<Op>(m_node->get_op_ptr());
+                auto op_annotations = op->get_op_annotations();
+                if (op_annotations)
+                {
+                    auto oi_pairs = op_annotations->get_in_place_oi_pairs();
+                    for (auto oi_pair : oi_pairs)
+                    {
+                        auto iter = input_id_map.find(m_node);
+                        if (iter != input_id_map.end() && iter->second.count(oi_pair.input) > 0)
+                        {
+                            auto fused_op =
+                                std::dynamic_pointer_cast<Op>(shared_from_this()->get_op_ptr());
+                            AddInplace(fused_op,
+                                       get_output_size() - 1,
+                                       iter->second[oi_pair.input],
+                                       oi_pair.destructive,
+                                       oi_pair.force_inplace);
+                            //NNFUSION_LOG(INFO) << "========================: node=" << m_node->get_op_type() << ", oi: <" << oi_pair.output << ", " << oi_pair.input << ">";
+                        }
+                    }
+                }
             }
             graph->add_edge(shared_from_this(),
                             get_output_size() - 1,
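
The index translation above, in isolation: an inner node's in-place pair <output, input> is rebased onto the fused node, with the output becoming the fused output just registered (get_output_size() - 1) and the input looked up through input_id_map. A simplified sketch with stand-in types (OiPair and the free function are illustrative, not the real NNFusion API):

    #include <cstddef>
    #include <unordered_map>

    struct OiPair { size_t output; size_t input; bool destructive; bool force_inplace; };

    // node_map corresponds to input_id_map[m_node] above: inner input index -> fused input id.
    // Callers check membership first, mirroring the count() guard in the diff.
    size_t remap_to_fused_input(const std::unordered_map<size_t, size_t>& node_map,
                                const OiPair& oi_pair)
    {
        return node_map.at(oi_pair.input);
    }
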
2 changes: 1 addition & 1 deletion src/nnfusion/core/kernels/common_langunit.cpp
@@ -23,7 +23,7 @@ LU_DEFINE(header::chrono, "#include <chrono>\n");
 LU_DEFINE(header::ctime, "#include <ctime>\n");
 LU_DEFINE(header::limits, "#include <limits>\n");
 LU_DEFINE(header::iostream, "#include <iostream>\n");
-LU_DEFINE(header::windows, "#include <windows.h>\n");
+LU_DEFINE(header::windows, "#define NOMINMAX\n#include <windows.h>\n");
 LU_DEFINE(header::unordered_map, "#include <unordered_map>\n");
 LU_DEFINE(header::torch_extension, "#include <torch/extension.h>\n");

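Background on the NOMINMAX change: <windows.h> otherwise defines function-like min/max macros that break the standard library in the generated code. A minimal Windows-only repro sketch (not from the PR):

    #define NOMINMAX  // must precede windows.h, exactly as the emitted header now does
    #include <windows.h>
    #include <algorithm>
    #include <limits>

    int lowest = std::numeric_limits<int>::min(); // fails to compile if min is a macro
    int m = std::min(1, 2);                       // resolves to std::min, not the Windows macro
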
7 changes: 4 additions & 3 deletions src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp
@@ -201,11 +201,12 @@ LanguageUnit_p cuda::get_cudnn_convolution_descriptor(const Shape& padding,
            << "window_dilation_strides_int, CUDNN_CROSS_CORRELATION, " << data_type << "));\n";
     }
 
-    if(type == nnfusion::element::f16){
+    if (type == nnfusion::element::f16)
+    {
         // half precision, use tensor core
         lu << "CUDNN_SAFE_CALL(cudnnSetConvolutionMathType(" << desc << ", "
-           << "CUDNN_TENSOR_OP_MATH"
-           << "));\n";
+           << "CUDNN_TENSOR_OP_MATH"
+           << "));\n";
     }
 
     return _lu;
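
For reference, the string built by the f16 branch expands at runtime to a call of this shape (conv_desc stands in for whatever name desc holds; the descriptor setup is assumed to come from the surrounding emitted code):

    // CUDNN_TENSOR_OP_MATH allows cuDNN to select Tensor Core algorithms
    // for the half-precision convolution bound to this descriptor.
    CUDNN_SAFE_CALL(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
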
9 changes: 6 additions & 3 deletions src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
@@ -103,8 +103,9 @@ namespace nnfusion
                     @hCublas@, @transA@, @transB@, @m@, @n@, @k@,
                     &alpha, input1, @lda@, @stride_a@, input0, @ldb@, @stride_b@,
                     &beta, output0, @ldc@, @stride_c@, @batch@));
-                )" :
-                R"(
+                )"
+                    :
+                    R"(
     static const float alpha = @alpha@F, beta = @beta@F;
     // if (!@hCublas@)
     //     CUBLAS_SAFE_CALL(@api_create@(&@hCublas@));
@@ -116,7 +117,9 @@ namespace nnfusion
                 {
                     {"hCublas", "cublas_handle"},
                     {"api_create", "cublasCreate"},
-                    {"api_exec", dtype == nnfusion::element::f16 ? "cublasHgemmStridedBatched" : "cublasSgemmStridedBatched"},
+                    {"api_exec",
+                     dtype == nnfusion::element::f16 ? "cublasHgemmStridedBatched"
+                                                     : "cublasSgemmStridedBatched"},
                     {"transA", transB ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
                     {"transB", transA ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
                     {"alpha", alpha},
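
After placeholder substitution, the f16 path emits a call of roughly this shape (a sketch: m/n/k, leading dimensions, strides, and batch become per-kernel literals, and alpha/beta are assumed to be declared as half in the part of the f16 template above this hunk):

    // Half-precision strided-batched GEMM; the f32 path is identical except it
    // calls cublasSgemmStridedBatched with the float alpha/beta shown in the diff.
    CUBLAS_SAFE_CALL(cublasHgemmStridedBatched(
        cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
        &alpha, input1, lda, stride_a, input0, ldb, stride_b,
        &beta, output0, ldc, stride_c, batch));
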
10 changes: 7 additions & 3 deletions src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp
@@ -171,10 +171,14 @@ LanguageUnit_p cuda::BatchNormNCHW::emit_function_body()
     /*
      * todo: may have better solution, details in https://github.com/microsoft/nnfusion/issues/434
      * */
-    if(dtype == nnfusion::element::f16){
-        lu << "output0[st + i] = __hadd(input1[c_id] , __hdiv(__hmul(input0[c_id], __hsub(input2[st + i], input3[c_id])), sqrtf(__hadd(__float2half("
+    if (dtype == nnfusion::element::f16)
+    {
+        lu << "output0[st + i] = __hadd(input1[c_id] , __hdiv(__hmul(input0[c_id], "
+              "__hsub(input2[st + i], input3[c_id])), sqrtf(__hadd(__float2half("
            << epsilon << "), input4[c_id]))));\n";
-    }else{
+    }
+    else
+    {
         lu << "(input1[c_id] + (input0[c_id] * "
            "(input2[st + i] - input3[c_id]) / sqrtf("
            << epsilon << " + input4[c_id])));\n";
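
Both branches emit the same per-element inference formula, out = beta + gamma * (x - mean) / sqrt(eps + var); the f16 branch just spells it with half intrinsics. A standalone sketch of the f16 expression (the function and parameter names are illustrative; the generated kernel indexes input0..input4 directly):

    #include <cuda_fp16.h>
    #include <math.h>

    // gamma = input0, beta = input1, x = input2, mean = input3, var = input4
    __device__ __half bn_inference_f16(__half gamma, __half beta, __half x,
                                       __half mean, __half var, float eps)
    {
        // sqrtf works in float, matching the emitted sqrtf(__hadd(...)) form.
        float denom = sqrtf(eps + __half2float(var));
        return __hadd(beta, __hdiv(__hmul(gamma, __hsub(x, mean)), __float2half(denom)));
    }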