Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/compiler-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ jobs:
working-directory: ${{github.workspace}}
run: |
dotnet tool install --global dotnet-coverage
dotnet-coverage collect -s tools/dotnet_coverage.settings.xml -f cobertura -o coverage/unit.xml "dotnet test -c ${{matrix.config.buildType}} -s test.runsettings --no-build --verbosity normal --blame"
dotnet-coverage collect -s tools/dotnet_coverage.settings.xml -f cobertura -o coverage/unit.xml "dotnet test -c ${{matrix.config.buildType}} -s test.runsettings --no-build --verbosity normal --filter FullyQualifiedName!~Nncase.Tests.TargetTest.UnitTestCUDAKernels --blame"
dotnet-coverage merge -o coverage.unit.xml -f cobertura -r coverage/*.xml

- name: Upload Coverage
Expand Down
12 changes: 12 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,18 @@ option(BUILD_TESTING "Build test programs" OFF)
option(ENABLE_OP_PROFILE "Profile ops cast time" OFF)
option(ENABLE_DUMP_MANAGER "Enable dump manager" OFF)
option(ENABLE_DUMP_MEM "Dump mem usage" OFF)
option(ENABLE_CUDA_RUNTIME "Enable CUDA runtime" OFF)

# A non-empty CMAKE_CUDA_COMPILER turns the CUDA runtime on and overwrites any
# cached ENABLE_CUDA_RUNTIME value. An undefined variable expands to "", so the
# quoted comparison below also rejects the "not defined" case.
if(NOT "${CMAKE_CUDA_COMPILER}" STREQUAL "")
  set(ENABLE_CUDA_RUNTIME ON CACHE BOOL "Enable CUDA runtime" FORCE)
endif()

if(ENABLE_CUDA_RUNTIME)
  # Use architecture 120 only when no value has been defined yet; the
  # variable is set before the CUDA language is enabled.
  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES 120)
  endif()
  enable_language(CUDA)
endif()

if (BUILDING_RUNTIME)
# option(ENABLE_VULKAN_RUNTIME "Enable Vulkan runtime" OFF)
Expand Down
7 changes: 6 additions & 1 deletion cmake/compile_flags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ if (MSVC)
set(PYBIND11_CPP_STANDARD "/std:c++latest")
else()
add_compile_options(-fvisibility=hidden)
add_compile_options(-Wall -Wextra -pedantic -Werror -Wno-multichar -Wno-missing-field-initializers -Wno-unused-function -Wno-type-limits -Wno-unused-local-typedefs -Wno-sign-compare)
add_compile_options(-Wall -Wextra -Wno-missing-field-initializers -Wno-unused-function -Wno-type-limits -Wno-unused-local-typedefs -Wno-sign-compare)
if (APPLE)
add_compile_options(-Wno-four-char-constants -Wno-sometimes-uninitialized -Wno-deprecated -Wno-braced-scalar-init)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
Expand All @@ -15,6 +15,11 @@ else()
endif()
endif()

# Print a configure-time status line when CMAKE_CUDA_COMPILER evaluates true.
if (CMAKE_CUDA_COMPILER)
message(STATUS "Configuring for CUDA")
# Disabled debugging aid: uncomment to add -save-temps to the compile options.
#add_compile_options(-save-temps)
endif()

if(${CMAKE_SYSTEM_PROCESSOR} MATCHES
"(x86)|(X86)|(amd64)|(AMD64)|(x86_64)|(X86_64)")
if (MSVC)
Expand Down
5 changes: 5 additions & 0 deletions conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class nncaseConan(ConanFile):
"k230_runtime": [True, False],
"k80_runtime": [True, False],
"vulkan_runtime": [True, False],
"cuda_runtime": [True, False],
"tests": [True, False],
"python": [True, False],
"python_root": ["ANY"]
Expand All @@ -40,6 +41,7 @@ class nncaseConan(ConanFile):
"k230_runtime": False,
"k80_runtime": False,
"vulkan_runtime": False,
"cuda_runtime": False,
"tests": False,
"python": True,
"python_root": ""
Expand Down Expand Up @@ -88,8 +90,11 @@ def generate(self):
tc.variables['ENABLE_K230_RUNTIME'] = self.options.k230_runtime
tc.variables['ENABLE_K80_RUNTIME'] = self.options.k80_runtime
tc.variables['ENABLE_VULKAN_RUNTIME'] = self.options.vulkan_runtime
tc.variables['ENABLE_CUDA_RUNTIME'] = self.options.cuda_runtime
tc.variables['BUILD_PYTHON_BINDING'] = self.options.python
tc.variables['BUILD_TESTING'] = self.options.tests
if self.options.cuda_runtime:
tc.variables['CMAKE_CUDA_ARCHITECTURES'] = "120"
if self.options.get_safe("python_root", default="") != "":
tc.variables['Python3_ROOT_DIR'] = self.options.python_root
if self.options.runtime:
Expand Down
8 changes: 4 additions & 4 deletions modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceBuiltn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,16 @@ public static string TopoAwareRuntimeDef(NTTTargetOptions options, ulong dataAli
return content;
}

public static string ModuleTopologyDef(NTTTargetOptions options)
public static string ModuleTopologyDef(NTTTargetOptions options, bool isCUDA)
{
var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/module_topology_def.h.cshtml", options).Result;
var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/module_topology_def.h.cshtml", new { Hierarchies = options.Hierarchies[0], IsCUDA = isCUDA }).Result;
return content;
}

public static string CMakeDef()
public static string CMakeDef(bool isCUDA)
{
var cmakePath = CMakePath(Path.Combine(Path.GetDirectoryName(typeof(CSourceBuiltn).Assembly.Location)!, "Runtime", "cmake", "ntt_module.cmake"));
var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/CMakeLists.txt.cshtml", new { CMakePath = cmakePath }).Result;
var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/CMakeLists.txt.cshtml", new { CMakePath = cmakePath, IsCUDA = isCUDA }).Result;
return content;
}

Expand Down
17 changes: 14 additions & 3 deletions modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceCompiler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ public class CSourceCompiler
{
private static string? _vcVarPath;

private readonly bool _isCUDA;

/// <summary>
/// compiler exe name.
/// </summary>
Expand All @@ -37,8 +39,9 @@ public class CSourceCompiler
/// </summary>
private string _ext = string.Empty;

public CSourceCompiler()
public CSourceCompiler(bool isCUDA)
{
_isCUDA = isCUDA;
PlatformSpecific();
ArchSpecific();
}
Expand Down Expand Up @@ -186,8 +189,16 @@ private void ArchSpecific()

private string ArgumentsSpecific(string sourcePath, string outPath)
{
var archConfig = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ?
"-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl" : string.Empty;
string archConfig = string.Empty;
if (_isCUDA)
{
archConfig = $"-DCMAKE_CUDA_ARCHITECTURES=120 -DCMAKE_CUDA_COMPILER=clang++";
}
else
{
archConfig = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ?
"-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl" : string.Empty;
}

#if DEBUG
var config = "Release";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public static void WriteWithProfiler(string functionName, string tagName = "")
IndentScope.Writer.IndWrite("{\n");
#if false // Disable device profiling for now.
IndentScope.Writer.Write($"constexpr std::string_view function_name = \"{tagName}\";\n");
IndentScope.Writer.Write($"auto_profiler profiler(function_name, runtime::profiling_level::device);\n");
IndentScope.Writer.Write($"profile_scope profiler(function_name, profile_level::device);\n");
#endif
IndentScope.Writer.Write($"{functionName};\n");
IndentScope.Writer.IndWrite("}\n");
Expand All @@ -69,7 +69,7 @@ public static void WriteIndWithProfiler(string functionName, string tagName = ""
IndentScope.Writer.IndWrite("{\n");
#if false // Disable device profiling for now.
IndentScope.Writer.IndWrite($"constexpr std::string_view function_name = \"{tagName}\";\n");
IndentScope.Writer.IndWrite($"auto_profiler profiler(function_name, runtime::profiling_level::device);\n");
IndentScope.Writer.IndWrite($"profile_scope profiler(function_name, profile_level::device);\n");
#endif
IndentScope.Writer.IndWrite($"{functionName};\n");
IndentScope.Writer.IndWrite("}\n");
Expand All @@ -94,7 +94,7 @@ protected override CSymbol VisitPrimFunction(PrimFunction expr)
}

var ctype = $"template<{string.Join(", ", Enumerable.Range(0, expr.Parameters.Length).Select(x => $"class T{x}"))}>" + Environment.NewLine +
$"void {expr.Name}({string.Join(", ", expr.Parameters.AsValueEnumerable().Select(Visit).Select((s, i) => $"T{i} &&{s.Name}").ToArray())})";
$"NTT_DEVICE void {expr.Name}({string.Join(", ", expr.Parameters.AsValueEnumerable().Select(Visit).Select((s, i) => $"T{i} &&{s.Name}").ToArray())})";

using (var scope = new IndentScope(_deviceBuilder))
{
Expand Down Expand Up @@ -192,7 +192,7 @@ protected override CSymbol VisitPhysicalBuffer(PhysicalBuffer expr)
_ => throw new NotSupportedException(expr.Location.ToString()),
};

var str = $"std::span<std::byte, {size.Name}>({name} + {start.Name}, {size.Name})";
var str = $"ntt::span<std::byte, {size.Name}>({name} + {start.Name}, {size.Name})";
symbol = new(start.Type, str);
_exprMemo.Add(expr, symbol);
return symbol;
Expand Down
113 changes: 57 additions & 56 deletions modules/Nncase.Modules.NTT/CodeGen/CPU/FunctionBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,17 @@ internal class FunctionBuilder
private readonly BinaryWriter _textWriter;
private readonly BinaryWriter _rdataWriter;
private readonly IReadOnlyList<BinaryWriter> _threadLocalRdataWriters;
private readonly IReadOnlyList<BinaryWriter> _warpLocalRdataWriters;
private readonly IReadOnlyList<BinaryWriter> _blockLocalRdataWriters;

public FunctionBuilder(uint id, BinaryWriter rdataWriter, IReadOnlyList<BinaryWriter> threadLocalRdataWriters, IReadOnlyList<BinaryWriter> blockLocalRdataWriters, Targets.NTTTargetOptions targetOptions)
public FunctionBuilder(uint id, BinaryWriter rdataWriter, IReadOnlyList<BinaryWriter> threadLocalRdataWriters, IReadOnlyList<BinaryWriter> warpLocalRdataWriters, IReadOnlyList<BinaryWriter> blockLocalRdataWriters, Targets.NTTTargetOptions targetOptions)
{
_id = id;
_sectionManager = new();
_textWriter = _sectionManager.GetWriter(WellknownSectionNames.Text);
_rdataWriter = rdataWriter;
_threadLocalRdataWriters = threadLocalRdataWriters;
_warpLocalRdataWriters = warpLocalRdataWriters;
_blockLocalRdataWriters = blockLocalRdataWriters;
TargetOptions = targetOptions;
}
Expand Down Expand Up @@ -58,80 +60,33 @@ public unsafe ILinkableFunction Build(BaseFunction baseFunc)
tensor.Serialize(_rdataWriter.BaseStream);
}

// 2. write the thread local rdata
ulong threadLocalRdataPoolSize = ulong.MinValue;
foreach (var (@const, range) in primFunc.SchedResult.ThreadLocalRdatas)
{
var tensor = ((TensorConst)@const).Value;
var distributedType = (DistributedType)@const.CheckedType;
var size = range.Max - range.Min;
threadLocalRdataPoolSize = System.Math.Max(range.Max, threadLocalRdataPoolSize);
var dividedDims = DistributedUtility.GetDividedTensorType(distributedType).Shape.ToValueArray();
var localStrides = TensorUtilities.GetDefaultStrides(dividedDims);
for (int i = 0; i < _threadLocalRdataWriters.Count; i++)
{
var threadLocalRdataWriter = _threadLocalRdataWriters[i];
var shardIndex = DistributedUtility.GetUnraveledIndex(i, TargetOptions.Hierarchies[0]);
(var localOffset, var localShape) = DistributedUtility.GetLocalOffsetAndShape(distributedType, shardIndex);
var linearOffset = TensorUtilities.GetLinearOffset(tensor.Strides, localOffset);

if ((ulong)TensorUtilities.GetProduct(localShape) * (ulong)tensor.ElementType.SizeInBytes > size)
{
throw new InvalidDataException("The Buffer Size Not Equal!");
}

threadLocalRdataWriter.Position(checked((long)range.Min));
tensor.Serialize(threadLocalRdataWriter.BaseStream, linearOffset, localShape, localStrides);
}
}

// 2. write the block local rdata
ulong blockLocalRdataPoolSize = ulong.MinValue;
foreach (var (@const, range) in primFunc.SchedResult.BlockLocalRdatas)
{
var tensor = ((TensorConst)@const).Value;
var distributedType = (DistributedType)@const.CheckedType;
var size = range.Max - range.Min;
blockLocalRdataPoolSize = System.Math.Max(range.Max, blockLocalRdataPoolSize);
var dividedDims = DistributedUtility.GetDividedTensorType(distributedType).Shape.ToValueArray();
var localStrides = TensorUtilities.GetDefaultStrides(dividedDims);
for (int i = 0; i < _blockLocalRdataWriters.Count; i++)
{
var blockLocalRdataWriter = _blockLocalRdataWriters[i];
var shardIndex = DistributedUtility.GetUnraveledIndex(i, TargetOptions.Hierarchies[0][..^1]).Concat([0]).ToArray();
(var localOffset, var localShape) = DistributedUtility.GetLocalOffsetAndShape(distributedType, shardIndex);
var linearOffset = TensorUtilities.GetLinearOffset(tensor.Strides, localOffset);

if ((ulong)TensorUtilities.GetProduct(localShape) * (ulong)tensor.ElementType.SizeInBytes > size)
{
throw new InvalidDataException("The Buffer Size Not Equal!");
}

blockLocalRdataWriter.Position(checked((long)range.Min));
tensor.Serialize(blockLocalRdataWriter.BaseStream, linearOffset, localShape, localStrides);
}
}
// 2. write the local rdatas
var threadLocalRdataPoolSize = SerializeLocalRdata(primFunc.SchedResult.ThreadLocalRdatas, _threadLocalRdataWriters, "t");
var warpLocalRdataPoolSize = SerializeLocalRdata(primFunc.SchedResult.WarpLocalRdatas, _warpLocalRdataWriters, "w");
var blockLocalRdataPoolSize = SerializeLocalRdata(primFunc.SchedResult.BlockLocalRdatas, _blockLocalRdataWriters, "b");

// 4. build function.
// 3. build function.
var visitor = new KernelCSourceConvertVisitor(TargetOptions);
visitor.Visit(primFunc);
var functionCSource = visitor.GetCSource();

// 5. write the kernel desc
// 4. write the kernel desc
using (var writer = _sectionManager.GetWriter(LinkableKernelFunction.KernelHeaderSectionName))
{
var header = default(KernelDescHeader);
header.OutputAlign = (uint)primFunc.SchedResult.OutputAlign;
header.LocalDataAlign = (uint)primFunc.SchedResult.DataAlign;
header.OutputPoolSize = primFunc.SchedResult.OutputUsage;
header.LocalDataPoolSize = primFunc.SchedResult.DataUsage;
header.WarpLocalDataPoolSize = primFunc.SchedResult.WarpLocalDataPoolSize;
header.BlockLocalDataPoolSize = primFunc.SchedResult.BlockLocalDataPoolSize;
writer.Write(ref header);
}

var memoryPoolDesc = new KernelMemoryPoolDesc(
rdataPoolSize,
threadLocalRdataPoolSize,
warpLocalRdataPoolSize,
blockLocalRdataPoolSize);
var kernelDescSection = new LinkedSection(_sectionManager.GetContent(LinkableKernelFunction.KernelHeaderSectionName)!, ".desc", 0, 8, (uint)sizeof(KernelDescHeader));
return new LinkableKernelFunction(_id, primFunc, functionCSource, memoryPoolDesc, _sectionManager.GetContent(WellknownSectionNames.Text)!, kernelDescSection);
Expand All @@ -154,4 +109,50 @@ public unsafe ILinkableFunction Build(BaseFunction baseFunc)

throw new NotSupportedException($"the {baseFunc.GetType()} {baseFunc.Name} is notsupport for codegen!");
}

/// <summary>
/// Serializes every constant in <paramref name="localRdatas"/> into each of the
/// given writers, one shard per writer, at the offset given by the constant's range.
/// </summary>
/// <param name="localRdatas">Constants paired with the byte range reserved for them.</param>
/// <param name="localRdataWriters">One writer per shard; the writer's position in the list is used as its shard number.</param>
/// <param name="scopeName">Hierarchy scope name forwarded to <see cref="GetScopedShardIndex"/>.</param>
/// <returns>The largest <c>range.Max</c> seen, or 0 when there are no constants.</returns>
/// <exception cref="InvalidDataException">A shard needs more bytes than its range provides.</exception>
private ulong SerializeLocalRdata(IReadOnlyDictionary<Const, ValueRange<ulong>> localRdatas, IReadOnlyList<BinaryWriter> localRdataWriters, string scopeName)
{
    ulong poolSize = 0;
    foreach (var (@const, range) in localRdatas)
    {
        var tensor = ((TensorConst)@const).Value;
        var distributedType = (DistributedType)@const.CheckedType;
        var reservedBytes = range.Max - range.Min;

        // The pool has to reach the end of the furthest range.
        poolSize = System.Math.Max(range.Max, poolSize);

        var shardDims = DistributedUtility.GetDividedTensorType(distributedType).Shape.ToValueArray();
        var shardStrides = TensorUtilities.GetDefaultStrides(shardDims);

        for (int writerIndex = 0; writerIndex < localRdataWriters.Count; writerIndex++)
        {
            var writer = localRdataWriters[writerIndex];
            var shardIndex = GetScopedShardIndex(writerIndex, scopeName);
            (var shardOffset, var shardShape) = DistributedUtility.GetLocalOffsetAndShape(distributedType, shardIndex);
            var sourceOffset = TensorUtilities.GetLinearOffset(tensor.Strides, shardOffset);

            // Reject a shard that would not fit into the range reserved for it.
            var requiredBytes = (ulong)TensorUtilities.GetProduct(shardShape) * (ulong)tensor.ElementType.SizeInBytes;
            if (requiredBytes > reservedBytes)
            {
                throw new InvalidDataException("The Buffer Size Not Equal!");
            }

            writer.Position(checked((long)range.Min));
            tensor.Serialize(writer.BaseStream, sourceOffset, shardShape, shardStrides);
        }
    }

    return poolSize;
}

/// <summary>
/// Converts a flat writer index into a shard index that has one entry per level
/// of the first hierarchy in <c>TargetOptions.Hierarchies</c>.
/// </summary>
/// <param name="writerIndex">Flat index of the writer within its scope.</param>
/// <param name="scopeName">Name looked up in <c>TargetOptions.HierarchyNames</c> with an ordinal comparison.</param>
/// <returns>
/// When the name is found, the index unraveled over the levels up to and including
/// the found position, padded with zeros for the remaining levels. Otherwise the
/// index unraveled over the whole hierarchy.
/// </returns>
private int[] GetScopedShardIndex(int writerIndex, string scopeName)
{
    var fullHierarchy = TargetOptions.Hierarchies[0];
    var scopeLevel = TargetOptions.HierarchyNames.IndexOf(scopeName, StringComparison.Ordinal);

    if (scopeLevel >= 0)
    {
        // Unravel over the levels down to the scope, then zero-fill the levels below it.
        var outerLevels = fullHierarchy[..(scopeLevel + 1)];
        return DistributedUtility.GetUnraveledIndex(writerIndex, outerLevels)
            .Concat(Enumerable.Repeat(0, fullHierarchy.Length - outerLevels.Length))
            .ToArray();
    }

    // Name not found: unravel over every level.
    return DistributedUtility.GetUnraveledIndex(writerIndex, fullHierarchy);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ protected override CSymbol VisitFusion(Fusion expr)
IndentScope.Writer.IndWrite($"template<{string.Join(", ", Enumerable.Range(0, expr.Parameters.Length).Select(x => $"class T{x}"))}> struct {expr.Name} {{\n");
using (_ = new IndentScope())
{
IndentScope.Writer.IndWrite($"auto operator()({string.Join(", ", expr.Parameters.AsValueEnumerable().Select(Visit).Select((s, i) => $"const T{i} &{s.Name}").ToArray())}) const noexcept {{\n");
IndentScope.Writer.IndWrite($"constexpr auto operator()({string.Join(", ", expr.Parameters.AsValueEnumerable().Select(Visit).Select((s, i) => $"const T{i} &{s.Name}").ToArray())}) const noexcept {{\n");

// 2. Function body
using (_ = new IndentScope())
Expand Down
Loading
Loading